From 164166558aacea01b99c8c8ffb710d930405ba69 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 5 Dec 2019 13:35:11 +0100 Subject: netfilter: uapi: Avoid undefined left-shift in xt_sctp.h With 'bytes(__u32)' being 32, a left-shift of 31 may happen which is undefined for the signed 32-bit value 1. Avoid this by declaring 1 as unsigned. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_sctp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/xt_sctp.h b/include/uapi/linux/netfilter/xt_sctp.h index 4bc6d1a08781..b4d804a9fccb 100644 --- a/include/uapi/linux/netfilter/xt_sctp.h +++ b/include/uapi/linux/netfilter/xt_sctp.h @@ -41,19 +41,19 @@ struct xt_sctp_info { #define SCTP_CHUNKMAP_SET(chunkmap, type) \ do { \ (chunkmap)[type / bytes(__u32)] |= \ - 1 << (type % bytes(__u32)); \ + 1u << (type % bytes(__u32)); \ } while (0) #define SCTP_CHUNKMAP_CLEAR(chunkmap, type) \ do { \ (chunkmap)[type / bytes(__u32)] &= \ - ~(1 << (type % bytes(__u32))); \ + ~(1u << (type % bytes(__u32))); \ } while (0) #define SCTP_CHUNKMAP_IS_SET(chunkmap, type) \ ({ \ ((chunkmap)[type / bytes (__u32)] & \ - (1 << (type % bytes (__u32)))) ? 1: 0; \ + (1u << (type % bytes (__u32)))) ? 1: 0; \ }) #define SCTP_CHUNKMAP_RESET(chunkmap) \ -- cgit v1.2.3-59-g8ed1b From f394722fb0d0f701119368959d7cd0ecbc46363a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Dec 2019 12:23:21 -0800 Subject: neighbour: remove neigh_cleanup() method neigh_cleanup() has not been used for seven years, and was a wrong design. Messing with shared pointer in bond_neigh_init() without proper memory barriers would at least trigger syzbot complains eventually. It is time to remove this stuff. Fixes: b63b70d87741 ("IPoIB: Use a private hash table for path lookup in xmit path") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 8 -------- include/net/neighbour.h | 1 - net/core/neighbour.c | 3 --- 3 files changed, 12 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index fcb7c2f7f001..6c72623e48e5 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3712,18 +3712,10 @@ static int bond_neigh_init(struct neighbour *n) return 0; parms.neigh_setup = NULL; - parms.neigh_cleanup = NULL; ret = slave_ops->ndo_neigh_setup(slave->dev, &parms); if (ret) return ret; - /* Assign slave's neigh_cleanup to neighbour in case cleanup is called - * after the last slave has been detached. Assumes that all slaves - * utilize the same neigh_cleanup (true at this writing as only user - * is ipoib). - */ - n->parms->neigh_cleanup = parms.neigh_cleanup; - if (!parms.neigh_setup) return 0; diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 6ad9ad47a9c5..8ec77bfdc1a4 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -72,7 +72,6 @@ struct neigh_parms { struct net_device *dev; struct list_head list; int (*neigh_setup)(struct neighbour *); - void (*neigh_cleanup)(struct neighbour *); struct neigh_table *tbl; void *sysctl_table; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 652da6369037..920784a9b7ff 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -98,9 +98,6 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) static void neigh_cleanup_and_release(struct neighbour *neigh) { - if (neigh->parms->neigh_cleanup) - neigh->parms->neigh_cleanup(neigh); - trace_neigh_cleanup_and_release(neigh, 0); __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); -- cgit v1.2.3-59-g8ed1b From f8fc57e8d7c5d95f4180b127d3b167de403557c0 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Mon, 9 Dec 2019 08:21:34 +0100 Subject: net/x25: add new state X25_STATE_5 This is needed, because if the flag X25_ACCPT_APPRV_FLAG is not set on a socket (manual call confirmation) and the channel is cleared by remote before the manual call confirmation was sent, this situation needs to be handled. Signed-off-by: Martin Schiller Signed-off-by: David S. Miller --- include/net/x25.h | 3 ++- net/x25/af_x25.c | 8 ++++++++ net/x25/x25_in.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/x25.h b/include/net/x25.h index ed1acc3044ac..d7d6c2b4ffa7 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -62,7 +62,8 @@ enum { X25_STATE_1, /* Awaiting Call Accepted */ X25_STATE_2, /* Awaiting Clear Confirmation */ X25_STATE_3, /* Data Transfer */ - X25_STATE_4 /* Awaiting Reset Confirmation */ + X25_STATE_4, /* Awaiting Reset Confirmation */ + X25_STATE_5 /* Call Accepted / Call Connected pending */ }; enum { diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index c34f7d077604..2efe44a34644 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -659,6 +659,12 @@ static int x25_release(struct socket *sock) sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; + + case X25_STATE_5: + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25_disconnect(sk, 0, 0, 0); + __x25_destroy_socket(sk); + goto out; } sock_orphan(sk); @@ -1054,6 +1060,8 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) { x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; + } else { + makex25->state = X25_STATE_5; } /* diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index f97c43344e95..4d3bb46aaae0 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -382,6 +382,35 @@ out_clear: return 0; } +/* + * State machine for state 5, Call Accepted / Call Connected pending (X25_ACCPT_APPRV_FLAG). + * The handling of the timer(s) is in file x25_timer.c + * Handling of state 0 and connection release is in af_x25.c. + */ +static int x25_state5_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct x25_sock *x25 = x25_sk(sk); + + switch (frametype) { + case X25_CLEAR_REQUEST: + if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2)) { + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25->state = X25_STATE_2; + x25_start_t23timer(sk); + return 0; + } + + x25_write_internal(sk, X25_CLEAR_CONFIRMATION); + x25_disconnect(sk, 0, skb->data[3], skb->data[4]); + break; + + default: + break; + } + + return 0; +} + /* Higher level upcall for a LAPB frame */ int x25_process_rx_frame(struct sock *sk, struct sk_buff *skb) { @@ -406,6 +435,9 @@ int x25_process_rx_frame(struct sock *sk, struct sk_buff *skb) case X25_STATE_4: queued = x25_state4_machine(sk, skb, frametype); break; + case X25_STATE_5: + queued = x25_state5_machine(sk, skb, frametype); + break; } x25_kick(sk); -- cgit v1.2.3-59-g8ed1b From b91e014f078e2e4f24778680e28dbbdecc7f0eb9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 8 Dec 2019 16:01:13 -0800 Subject: bpf: Make BPF trampoline use register_ftrace_direct() API Make BPF trampoline attach its generated assembly code to kernel functions via register_ftrace_direct() API. It helps ftrace-based tracers co-exist with BPF trampoline on the same kernel function. It also switches attaching logic from arch specific text_poke to generic ftrace that is available on many architectures. text_poke is still necessary for bpf-to-bpf attach and for bpf_tail_call optimization. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191209000114.1876138-3-ast@kernel.org --- include/linux/bpf.h | 1 + kernel/bpf/trampoline.c | 64 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 59 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 35903f148be5..ac7de5291509 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -461,6 +461,7 @@ struct bpf_trampoline { struct { struct btf_func_model model; void *addr; + bool ftrace_managed; } func; /* list of BPF programs using this trampoline */ struct hlist_head progs_hlist[BPF_TRAMP_MAX]; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7e89f1f49d77..23b0d5cfd47e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -3,6 +3,7 @@ #include #include #include +#include /* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ #define TRAMPOLINE_HASH_BITS 10 @@ -59,6 +60,60 @@ out: return tr; } +static int is_ftrace_location(void *ip) +{ + long addr; + + addr = ftrace_location((long)ip); + if (!addr) + return 0; + if (WARN_ON_ONCE(addr != (long)ip)) + return -EFAULT; + return 1; +} + +static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) +{ + void *ip = tr->func.addr; + int ret; + + if (tr->func.ftrace_managed) + ret = unregister_ftrace_direct((long)ip, (long)old_addr); + else + ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); + return ret; +} + +static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr) +{ + void *ip = tr->func.addr; + int ret; + + if (tr->func.ftrace_managed) + ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr); + else + ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); + return ret; +} + +/* first time registering */ +static int register_fentry(struct bpf_trampoline *tr, void *new_addr) +{ + void *ip = tr->func.addr; + int ret; + + ret = is_ftrace_location(ip); + if (ret < 0) + return ret; + tr->func.ftrace_managed = ret; + + if (tr->func.ftrace_managed) + ret = register_ftrace_direct((long)ip, (long)new_addr); + else + ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + return ret; +} + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into PAGE_SIZE / 2 */ @@ -77,8 +132,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) int err; if (fentry_cnt + fexit_cnt == 0) { - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, - old_image, NULL); + err = unregister_fentry(tr, old_image); tr->selector = 0; goto out; } @@ -105,12 +159,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (tr->selector) /* progs already running at this address */ - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, - old_image, new_image); + err = modify_fentry(tr, old_image, new_image); else /* first time registering */ - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, NULL, - new_image); + err = register_fentry(tr, new_image); if (err) goto out; tr->selector++; -- cgit v1.2.3-59-g8ed1b From 911bde0fe5ccd7e55760be9d6dcc67a8850fcc12 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 12 Dec 2019 12:14:37 +0100 Subject: mac80211: Turn AQL into an NL80211_EXT_FEATURE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of just having an airtime flag in debugfs, turn AQL into a proper NL80211_EXT_FEATURE, so drivers can turn it on when they are ready, and so we also expose the presence of the feature to userspace. This also has the effect of flipping the default, so drivers have to opt in to using AQL instead of getting it by default with TXQs. To keep functionality the same as pre-patch, we set this feature for ath10k (which is where it is needed the most). While we're at it, split out the debugfs interface so AQL gets its own per-station debugfs file instead of using the 'airtime' file. [Johannes:] This effectively disables AQL for iwlwifi, where it fixes a number of issues: * TSO in iwlwifi is causing underflows and associated warnings in AQL * HE (802.11ax) rates aren't reported properly so at HE rates, AQL could never have a valid estimate (it'd use 6 Mbps instead of up to 2400!) Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20191212111437.224294-1-toke@redhat.com Fixes: 3ace10f5b5ad ("mac80211: Implement Airtime-based Queue Limit (AQL)") Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath10k/mac.c | 1 + include/uapi/linux/nl80211.h | 5 +++ net/mac80211/debugfs_sta.c | 76 ++++++++++++++++++++++++++--------- net/mac80211/main.c | 4 +- net/mac80211/sta_info.c | 3 ++ net/mac80211/sta_info.h | 1 - net/mac80211/tx.c | 4 +- 7 files changed, 70 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 83cc8778ca1e..978f0037ed52 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -8958,6 +8958,7 @@ int ath10k_mac_register(struct ath10k *ar) wiphy_ext_feature_set(ar->hw->wiphy, NL80211_EXT_FEATURE_VHT_IBSS); wiphy_ext_feature_set(ar->hw->wiphy, NL80211_EXT_FEATURE_SET_SCAN_DWELL); + wiphy_ext_feature_set(ar->hw->wiphy, NL80211_EXT_FEATURE_AQL); if (test_bit(WMI_SERVICE_TX_DATA_ACK_RSSI, ar->wmi.svc_map) || test_bit(WMI_SERVICE_HTT_MGMT_TX_COMP_VALID_FLAGS, ar->wmi.svc_map)) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 341e0e8cae46..5eab191607f8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5517,6 +5517,10 @@ enum nl80211_feature_flags { * with VLAN tagged frames and separate VLAN-specific netdevs added using * vconfig similarly to the Ethernet case. * + * @NL80211_EXT_FEATURE_AQL: The driver supports the Airtime Queue Limit (AQL) + * feature, which prevents bufferbloat by using the expected transmission + * time to limit the amount of data buffered in the hardware. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5563,6 +5567,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_STA_TX_PWR, NL80211_EXT_FEATURE_SAE_OFFLOAD, NL80211_EXT_FEATURE_VLAN_OFFLOAD, + NL80211_EXT_FEATURE_AQL, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index b3c9001d1f43..c80b1e163ea4 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -201,8 +201,6 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; u64 rx_airtime = 0, tx_airtime = 0; s64 deficit[IEEE80211_NUM_ACS]; - u32 q_depth[IEEE80211_NUM_ACS]; - u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS]; ssize_t rv; int ac; @@ -214,6 +212,56 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, rx_airtime += sta->airtime[ac].rx_airtime; tx_airtime += sta->airtime[ac].tx_airtime; deficit[ac] = sta->airtime[ac].deficit; + spin_unlock_bh(&local->active_txq_lock[ac]); + } + + p += scnprintf(p, bufsz + buf - p, + "RX: %llu us\nTX: %llu us\nWeight: %u\n" + "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n", + rx_airtime, tx_airtime, sta->airtime_weight, + deficit[0], deficit[1], deficit[2], deficit[3]); + + rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + kfree(buf); + return rv; +} + +static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct sta_info *sta = file->private_data; + struct ieee80211_local *local = sta->sdata->local; + int ac; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + spin_lock_bh(&local->active_txq_lock[ac]); + sta->airtime[ac].rx_airtime = 0; + sta->airtime[ac].tx_airtime = 0; + sta->airtime[ac].deficit = sta->airtime_weight; + spin_unlock_bh(&local->active_txq_lock[ac]); + } + + return count; +} +STA_OPS_RW(airtime); + +static ssize_t sta_aql_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct sta_info *sta = file->private_data; + struct ieee80211_local *local = sta->sdata->local; + size_t bufsz = 400; + char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; + u32 q_depth[IEEE80211_NUM_ACS]; + u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS]; + ssize_t rv; + int ac; + + if (!buf) + return -ENOMEM; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + spin_lock_bh(&local->active_txq_lock[ac]); q_limit_l[ac] = sta->airtime[ac].aql_limit_low; q_limit_h[ac] = sta->airtime[ac].aql_limit_high; spin_unlock_bh(&local->active_txq_lock[ac]); @@ -221,12 +269,8 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, } p += scnprintf(p, bufsz + buf - p, - "RX: %llu us\nTX: %llu us\nWeight: %u\n" - "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n" "Q depth: VO: %u us VI: %u us BE: %u us BK: %u us\n" "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n", - rx_airtime, tx_airtime, sta->airtime_weight, - deficit[0], deficit[1], deficit[2], deficit[3], q_depth[0], q_depth[1], q_depth[2], q_depth[3], q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1], q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]), @@ -236,11 +280,10 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, return rv; } -static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, +static ssize_t sta_aql_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; - struct ieee80211_local *local = sta->sdata->local; u32 ac, q_limit_l, q_limit_h; char _buf[100] = {}, *buf = _buf; @@ -251,7 +294,7 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, return -EFAULT; buf[sizeof(_buf) - 1] = '\0'; - if (sscanf(buf, "queue limit %u %u %u", &ac, &q_limit_l, &q_limit_h) + if (sscanf(buf, "limit %u %u %u", &ac, &q_limit_l, &q_limit_h) != 3) return -EINVAL; @@ -261,17 +304,10 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, sta->airtime[ac].aql_limit_low = q_limit_l; sta->airtime[ac].aql_limit_high = q_limit_h; - for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { - spin_lock_bh(&local->active_txq_lock[ac]); - sta->airtime[ac].rx_airtime = 0; - sta->airtime[ac].tx_airtime = 0; - sta->airtime[ac].deficit = sta->airtime_weight; - spin_unlock_bh(&local->active_txq_lock[ac]); - } - return count; } -STA_OPS_RW(airtime); +STA_OPS_RW(aql); + static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) @@ -996,6 +1032,10 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) DEBUGFS_ADD(airtime); + if (wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_AQL)) + DEBUGFS_ADD(aql); + debugfs_create_xul("driver_buffered_tids", 0400, sta->debugfs_dir, &sta->driver_buffered_tids); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 6cca0853f183..4c2b5ba3ac09 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -672,9 +672,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H; } - local->airtime_flags = AIRTIME_USE_TX | - AIRTIME_USE_RX | - AIRTIME_USE_AQL; + local->airtime_flags = AIRTIME_USE_TX | AIRTIME_USE_RX; local->aql_threshold = IEEE80211_AQL_THRESHOLD; atomic_set(&local->aql_total_pending_airtime, 0); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 8eafd81e97b4..0f5f40678885 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1916,6 +1916,9 @@ void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, { int tx_pending; + if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) + return; + if (!tx_completed) { if (sta) atomic_add(tx_airtime, diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index ad5d8a4ae56d..c00e28585f9d 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -127,7 +127,6 @@ enum ieee80211_agg_stop_reason { /* Debugfs flags to enable/disable use of RX/TX airtime in scheduler */ #define AIRTIME_USE_TX BIT(0) #define AIRTIME_USE_RX BIT(1) -#define AIRTIME_USE_AQL BIT(2) struct airtime_info { u64 rx_airtime; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 38b58a00db46..a8a7306a1f56 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3677,7 +3677,7 @@ begin: IEEE80211_SKB_CB(skb)->control.vif = vif; - if (local->airtime_flags & AIRTIME_USE_AQL) { + if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) { u32 airtime; airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta, @@ -3799,7 +3799,7 @@ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct sta_info *sta; struct ieee80211_local *local = hw_to_local(hw); - if (!(local->airtime_flags & AIRTIME_USE_AQL)) + if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return true; if (!txq->sta) -- cgit v1.2.3-59-g8ed1b From 8dbd76e79a16b45b2ccb01d2f2e08dbf64e71e40 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Dec 2019 18:20:41 -0800 Subject: tcp/dccp: fix possible race __inet_lookup_established() Michal Kubecek and Firo Yang did a very nice analysis of crashes happening in __inet_lookup_established(). Since a TCP socket can go from TCP_ESTABLISH to TCP_LISTEN (via a close()/socket()/listen() cycle) without a RCU grace period, I should not have changed listeners linkage in their hash table. They must use the nulls protocol (Documentation/RCU/rculist_nulls.txt), so that a lookup can detect a socket in a hash list was moved in another one. Since we added code in commit d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6 ordering fix"), we have to add hlist_nulls_add_tail_rcu() helper. Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood") Signed-off-by: Eric Dumazet Reported-by: Michal Kubecek Reported-by: Firo Yang Reviewed-by: Michal Kubecek Link: https://lore.kernel.org/netdev/20191120083919.GH27852@unicorn.suse.cz/ Signed-off-by: Jakub Kicinski --- include/linux/rculist_nulls.h | 37 +++++++++++++++++++++++++++++++++++++ include/net/inet_hashtables.h | 12 +++++++++--- include/net/sock.h | 5 +++++ net/ipv4/inet_diag.c | 3 ++- net/ipv4/inet_hashtables.c | 16 ++++++++-------- net/ipv4/tcp_ipv4.c | 7 ++++--- 6 files changed, 65 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index bc8206a8f30e..61974c4c566b 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -100,6 +100,43 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, first->pprev = &n->next; } +/** + * hlist_nulls_add_tail_rcu + * @n: the element to add to the hash list. + * @h: the list to add to. + * + * Description: + * Adds the specified element to the specified hlist_nulls, + * while permitting racing traversals. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() + * or hlist_nulls_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. Regardless of the type of CPU, the + * list-traversal primitive must be guarded by rcu_read_lock(). + */ +static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, + struct hlist_nulls_head *h) +{ + struct hlist_nulls_node *i, *last = NULL; + + /* Note: write side code, so rcu accessors are not needed. */ + for (i = h->first; !is_a_nulls(i); i = i->next) + last = i; + + if (last) { + n->next = last->next; + n->pprev = &last->next; + rcu_assign_pointer(hlist_next_rcu(last), n); + } else { + hlist_nulls_add_head_rcu(n, h); + } +} + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index af2b4c065a04..d0019d3395cf 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -103,13 +103,19 @@ struct inet_bind_hashbucket { struct hlist_head chain; }; -/* - * Sockets can be hashed in established or listening table +/* Sockets can be hashed in established or listening table. + * We must use different 'nulls' end-of-chain value for all hash buckets : + * A socket might transition from ESTABLISH to LISTEN state without + * RCU grace period. A lookup in ehash table needs to handle this case. */ +#define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; unsigned int count; - struct hlist_head head; + union { + struct hlist_head head; + struct hlist_nulls_head nulls_head; + }; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ diff --git a/include/net/sock.h b/include/net/sock.h index 87d54ef57f00..04c274a20620 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -722,6 +722,11 @@ static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_h hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } +static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) +{ + hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); +} + static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index af154977904c..f11e997e517b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -911,11 +911,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; num = 0; ilb = &hashinfo->listening_hash[i]; spin_lock(&ilb->lock); - sk_for_each(sk, &ilb->head) { + sk_nulls_for_each(sk, node, &ilb->nulls_head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 83fb00153018..2bbaaf0c7176 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -516,10 +516,11 @@ static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; + const struct hlist_nulls_node *node; struct sock *sk2; kuid_t uid = sock_i_uid(sk); - sk_for_each_rcu(sk2, &ilb->head) { + sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && @@ -555,9 +556,9 @@ int __inet_hash(struct sock *sk, struct sock *osk) } if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) - hlist_add_tail_rcu(&sk->sk_node, &ilb->head); + __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); else - hlist_add_head_rcu(&sk->sk_node, &ilb->head); + __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); inet_hash2(hashinfo, sk); ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); @@ -606,11 +607,9 @@ void inet_unhash(struct sock *sk) reuseport_detach_sock(sk); if (ilb) { inet_unhash2(hashinfo, sk); - __sk_del_node_init(sk); - ilb->count--; - } else { - __sk_nulls_del_node_init_rcu(sk); + ilb->count--; } + __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); unlock: spin_unlock_bh(lock); @@ -750,7 +749,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_HEAD(&h->listening_hash[i].head); + INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, + i + LISTENING_NULLS_BASE); h->listening_hash[i].count = 0; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 92282f98dc82..1c7326e04f9b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2147,13 +2147,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; struct sock *sk = cur; if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock(&ilb->lock); - sk = sk_head(&ilb->head); + sk = sk_nulls_head(&ilb->nulls_head); st->offset = 0; goto get_sk; } @@ -2161,9 +2162,9 @@ get_head: ++st->num; ++st->offset; - sk = sk_next(sk); + sk = sk_nulls_next(sk); get_sk: - sk_for_each_from(sk) { + sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == afinfo->family) -- cgit v1.2.3-59-g8ed1b From ee2aabd3fc2eef4c1a0ebdadccc76fbff74b94fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Dec 2019 12:55:30 -0800 Subject: tcp: refine tcp_write_queue_empty() implementation Due to how tcp_sendmsg() is implemented, we can have an empty skb at the tail of the write queue. Most [1] tcp_write_queue_empty() callers want to know if there is anything to send (payload and/or FIN) Instead of checking if the sk_write_queue is empty, we need to test if tp->write_seq == tp->snd_nxt [1] tcp_send_fin() was the only caller that expected to see if an skb was in the write queue, I have changed the code to reuse the tcp_write_queue_tail() result. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 11 ++++++++++- net/ipv4/tcp_output.c | 5 +++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 86b9a8766648..e460ea7f767b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1766,9 +1766,18 @@ static inline bool tcp_skb_is_last(const struct sock *sk, return skb_queue_is_last(&sk->sk_write_queue, skb); } +/** + * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue + * @sk: socket + * + * Since the write queue can have a temporary empty skb in it, + * we must not use "return skb_queue_empty(&sk->sk_write_queue)" + */ static inline bool tcp_write_queue_empty(const struct sock *sk) { - return skb_queue_empty(&sk->sk_write_queue); + const struct tcp_sock *tp = tcp_sk(sk); + + return tp->write_seq == tp->snd_nxt; } static inline bool tcp_rtx_queue_empty(const struct sock *sk) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 57f434a8e41f..36902d08473e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3129,7 +3129,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) */ void tcp_send_fin(struct sock *sk) { - struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); + struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); /* Optimization, tack on the FIN if we have one skb in write queue and @@ -3137,6 +3137,7 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ + tskb = tail; if (!tskb && tcp_under_memory_pressure(sk)) tskb = skb_rb_last(&sk->tcp_rtx_queue); @@ -3144,7 +3145,7 @@ void tcp_send_fin(struct sock *sk) TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (tcp_write_queue_empty(sk)) { + if (!tail) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have -- cgit v1.2.3-59-g8ed1b From a2ea07465c8d7984cc6b8b1f0b3324f9b138094a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 16 Dec 2019 17:49:00 +0100 Subject: bpf: Fix missing prog untrack in release_maps Commit da765a2f5993 ("bpf: Add poke dependency tracking for prog array maps") wrongly assumed that in case of prog load errors, we're cleaning up all program tracking via bpf_free_used_maps(). However, it can happen that we're still at the point where we didn't copy map pointers into the prog's aux section such that env->prog->aux->used_maps is still zero, running into a UAF. In such case, the verifier has similar release_maps() helper that drops references to used maps from its env. Consolidate the release code into __bpf_free_used_maps() and call it from all sides to fix it. Fixes: da765a2f5993 ("bpf: Add poke dependency tracking for prog array maps") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/1c2909484ca524ae9f55109b06f22b6213e76376.1576514756.git.daniel@iogearbox.net --- include/linux/bpf.h | 2 ++ kernel/bpf/core.c | 14 ++++++++++---- kernel/bpf/verifier.c | 14 ++------------ 3 files changed, 14 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ac7de5291509..085a59afba85 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -818,6 +818,8 @@ struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); +void __bpf_free_used_maps(struct bpf_prog_aux *aux, + struct bpf_map **used_maps, u32 len); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 49e32acad7d8..6231858df723 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2048,18 +2048,24 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) } } -static void bpf_free_used_maps(struct bpf_prog_aux *aux) +void __bpf_free_used_maps(struct bpf_prog_aux *aux, + struct bpf_map **used_maps, u32 len) { struct bpf_map *map; - int i; + u32 i; bpf_free_cgroup_storage(aux); - for (i = 0; i < aux->used_map_cnt; i++) { - map = aux->used_maps[i]; + for (i = 0; i < len; i++) { + map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); bpf_map_put(map); } +} + +static void bpf_free_used_maps(struct bpf_prog_aux *aux) +{ + __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt); kfree(aux->used_maps); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 034ef81f935b..a1acdce77070 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8298,18 +8298,8 @@ next_insn: /* drop refcnt of maps used by the rejected program */ static void release_maps(struct bpf_verifier_env *env) { - enum bpf_cgroup_storage_type stype; - int i; - - for_each_cgroup_storage_type(stype) { - if (!env->prog->aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(env->prog, - env->prog->aux->cgroup_storage[stype]); - } - - for (i = 0; i < env->used_map_cnt; i++) - bpf_map_put(env->used_maps[i]); + __bpf_free_used_maps(env->prog->aux, env->used_maps, + env->used_map_cnt); } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ -- cgit v1.2.3-59-g8ed1b From e47304232b373362228bf233f17bd12b11c9aafc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 17 Dec 2019 13:28:16 +0100 Subject: bpf: Fix cgroup local storage prog tracking Recently noticed that we're tracking programs related to local storage maps through their prog pointer. This is a wrong assumption since the prog pointer can still change throughout the verification process, for example, whenever bpf_patch_insn_single() is called. Therefore, the prog pointer that was assigned via bpf_cgroup_storage_assign() is not guaranteed to be the same as we pass in bpf_cgroup_storage_release() and the map would therefore remain in busy state forever. Fix this by using the prog's aux pointer which is stable throughout verification and beyond. Fixes: de9cbbaadba5 ("bpf: introduce cgroup storage maps") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Roman Gushchin Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/1471c69eca3022218666f909bc927a92388fd09e.1576580332.git.daniel@iogearbox.net --- include/linux/bpf-cgroup.h | 8 ++++---- kernel/bpf/core.c | 3 +-- kernel/bpf/local_storage.c | 24 ++++++++++++------------ kernel/bpf/verifier.c | 2 +- 4 files changed, 18 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 169fd25f6bc2..9be71c195d74 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -157,8 +157,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, struct cgroup *cgroup, enum bpf_attach_type type); void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); -int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); -void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); +int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); +void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map); int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, @@ -360,9 +360,9 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, static inline void bpf_cgroup_storage_set( struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} -static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, +static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } -static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, +static inline void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map) {} static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6231858df723..af6b738cf435 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2043,8 +2043,7 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) for_each_cgroup_storage_type(stype) { if (!aux->cgroup_storage[stype]) continue; - bpf_cgroup_storage_release(aux->prog, - aux->cgroup_storage[stype]); + bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]); } } diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 2ba750725cb2..6bf605dd4b94 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -20,7 +20,7 @@ struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; - struct bpf_prog *prog; + struct bpf_prog_aux *aux; struct rb_root root; struct list_head list; }; @@ -420,7 +420,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_seq_show_elem = cgroup_storage_seq_show_elem, }; -int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); @@ -428,14 +428,14 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) spin_lock_bh(&map->lock); - if (map->prog && map->prog != prog) + if (map->aux && map->aux != aux) goto unlock; - if (prog->aux->cgroup_storage[stype] && - prog->aux->cgroup_storage[stype] != _map) + if (aux->cgroup_storage[stype] && + aux->cgroup_storage[stype] != _map) goto unlock; - map->prog = prog; - prog->aux->cgroup_storage[stype] = _map; + map->aux = aux; + aux->cgroup_storage[stype] = _map; ret = 0; unlock: spin_unlock_bh(&map->lock); @@ -443,16 +443,16 @@ unlock: return ret; } -void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); spin_lock_bh(&map->lock); - if (map->prog == prog) { - WARN_ON(prog->aux->cgroup_storage[stype] != _map); - map->prog = NULL; - prog->aux->cgroup_storage[stype] = NULL; + if (map->aux == aux) { + WARN_ON(aux->cgroup_storage[stype] != _map); + map->aux = NULL; + aux->cgroup_storage[stype] = NULL; } spin_unlock_bh(&map->lock); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a1acdce77070..6ef71429d997 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8268,7 +8268,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && - bpf_cgroup_storage_assign(env->prog, map)) { + bpf_cgroup_storage_assign(env->prog->aux, map)) { verbose(env, "only one cgroup storage of each type is allowed\n"); fdput(f); return -EBUSY; -- cgit v1.2.3-59-g8ed1b From 7c68fa2bddda6d942bd387c9ba5b4300737fd991 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Dec 2019 18:51:03 -0800 Subject: net: annotate lockless accesses to sk->sk_pacing_shift sk->sk_pacing_shift can be read and written without lock synchronization. This patch adds annotations to document this fact and avoid future syzbot complains. This might also avoid unexpected false sharing in sk_pacing_shift_update(), as the compiler could remove the conditional check and always write over sk->sk_pacing_shift : if (sk->sk_pacing_shift != val) sk->sk_pacing_shift = val; Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- net/core/sock.c | 2 +- net/ipv4/tcp_bbr.c | 3 ++- net/ipv4/tcp_output.c | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 04c274a20620..22be668457bf 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2588,9 +2588,9 @@ static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) */ static inline void sk_pacing_shift_update(struct sock *sk, int val) { - if (!sk || !sk_fullsock(sk) || sk->sk_pacing_shift == val) + if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val) return; - sk->sk_pacing_shift = val; + WRITE_ONCE(sk->sk_pacing_shift, val); } /* if a socket is bound to a device, check that the given device diff --git a/net/core/sock.c b/net/core/sock.c index 043db3ce023e..8459ad579f73 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2916,7 +2916,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL; - sk->sk_pacing_shift = 10; + WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; sk_rx_queue_clear(sk); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 32772d6ded4e..a6545ef0d27b 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -306,7 +306,8 @@ static u32 bbr_tso_segs_goal(struct sock *sk) /* Sort of tcp_tso_autosize() but ignoring * driver provided sk_gso_max_size. */ - bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift, + bytes = min_t(unsigned long, + sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 36902d08473e..1f7735ca8f22 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1725,7 +1725,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, u32 bytes, segs; bytes = min_t(unsigned long, - sk->sk_pacing_rate >> sk->sk_pacing_shift, + sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); /* Goal is to send at least one packet per ms, @@ -2260,7 +2260,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit = max_t(unsigned long, 2 * skb->truesize, - sk->sk_pacing_rate >> sk->sk_pacing_shift); + sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift)); if (sk->sk_pacing_status == SK_PACING_NONE) limit = min_t(unsigned long, limit, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); -- cgit v1.2.3-59-g8ed1b From 1f26c0d3d24125992ab0026b0dab16c08df947c7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Dec 2019 18:52:45 -0800 Subject: net: fix kernel-doc warning in Fix missing '*' kernel-doc notation that causes this warning: ../include/linux/netdevice.h:1779: warning: bad line: spinlock Fixes: ab92d68fc22f ("net: core: add generic lockdep keys") Signed-off-by: Randy Dunlap Cc: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9ef20389622d..ae5e260911e2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1775,7 +1775,7 @@ enum netdev_priv_flags { * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. * @qdisc_tx_busylock_key: lockdep class annotating Qdisc->busylock - spinlock + * spinlock * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * @qdisc_xmit_lock_key: lockdep class annotating * netdev_queue->_xmit_lock spinlock -- cgit v1.2.3-59-g8ed1b From 0aa4d016c043d16a282e7e93edf6213a7b954c90 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 17 Dec 2019 18:07:41 +0100 Subject: of: mdio: export of_mdiobus_child_is_phy This patch exports of_mdiobus_child_is_phy, allowing to check if a child node is a network PHY. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/of/of_mdio.c | 3 ++- include/linux/of_mdio.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index c6b87ce2b0cc..fc757ef6eadc 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -162,7 +162,7 @@ static const struct of_device_id whitelist_phys[] = { * A device which is not a phy is expected to have a compatible string * indicating what sort of device it is. */ -static bool of_mdiobus_child_is_phy(struct device_node *child) +bool of_mdiobus_child_is_phy(struct device_node *child) { u32 phy_id; @@ -187,6 +187,7 @@ static bool of_mdiobus_child_is_phy(struct device_node *child) return false; } +EXPORT_SYMBOL(of_mdiobus_child_is_phy); /** * of_mdiobus_register - Register mii_bus and create PHYs from the device tree diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 99cefe6f5edb..79bc82e30c02 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -12,6 +12,7 @@ #include #if IS_ENABLED(CONFIG_OF_MDIO) +extern bool of_mdiobus_child_is_phy(struct device_node *child); extern int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np); extern struct phy_device *of_phy_find_device(struct device_node *phy_np); extern struct phy_device *of_phy_connect(struct net_device *dev, @@ -54,6 +55,11 @@ static inline int of_mdio_parse_addr(struct device *dev, } #else /* CONFIG_OF_MDIO */ +static bool of_mdiobus_child_is_phy(struct device_node *child) +{ + return false; +} + static inline int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) { /* -- cgit v1.2.3-59-g8ed1b From d2ed49cf6c13e379c5819aa5ac20e1f9674ebc89 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 19 Dec 2019 23:24:47 +0000 Subject: mod_devicetable: fix PHY module format When a PHY is probed, if the top bit is set, we end up requesting a module with the string "mdio:-10101110000000100101000101010001" - the top bit is printed to a signed -1 value. This leads to the module not being loaded. Fix the module format string and the macro generating the values for it to ensure that we only print unsigned types and the top bit is always 0/1. We correctly end up with "mdio:10101110000000100101000101010001". Fixes: 8626d3b43280 ("phylib: Support phy module autoloading") Reviewed-by: Andrew Lunn Signed-off-by: Russell King Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mod_devicetable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 5714fd35a83c..e3596db077dc 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -587,9 +587,9 @@ struct platform_device_id { #define MDIO_NAME_SIZE 32 #define MDIO_MODULE_PREFIX "mdio:" -#define MDIO_ID_FMT "%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d" +#define MDIO_ID_FMT "%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u%u" #define MDIO_ID_ARGS(_id) \ - (_id)>>31, ((_id)>>30) & 1, ((_id)>>29) & 1, ((_id)>>28) & 1, \ + ((_id)>>31) & 1, ((_id)>>30) & 1, ((_id)>>29) & 1, ((_id)>>28) & 1, \ ((_id)>>27) & 1, ((_id)>>26) & 1, ((_id)>>25) & 1, ((_id)>>24) & 1, \ ((_id)>>23) & 1, ((_id)>>22) & 1, ((_id)>>21) & 1, ((_id)>>20) & 1, \ ((_id)>>19) & 1, ((_id)>>18) & 1, ((_id)>>17) & 1, ((_id)>>16) & 1, \ -- cgit v1.2.3-59-g8ed1b From 7d49a32a66d2215c5b3bf9bc67c9036ea9904111 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 19 Dec 2019 23:24:52 +0000 Subject: net: phy: ensure that phy IDs are correctly typed PHY IDs are 32-bit unsigned quantities. Ensure that they are always treated as such, and not passed around as "int"s. Fixes: 13d0ab6750b2 ("net: phy: check return code when requesting PHY driver module") Signed-off-by: Russell King Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 8 ++++---- include/linux/phy.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 0887ed2bb050..b13c52873ef5 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -553,7 +553,7 @@ static const struct device_type mdio_bus_phy_type = { .pm = MDIO_BUS_PHY_PM_OPS, }; -static int phy_request_driver_module(struct phy_device *dev, int phy_id) +static int phy_request_driver_module(struct phy_device *dev, u32 phy_id) { int ret; @@ -565,15 +565,15 @@ static int phy_request_driver_module(struct phy_device *dev, int phy_id) * then modprobe isn't available. */ if (IS_ENABLED(CONFIG_MODULES) && ret < 0 && ret != -ENOENT) { - phydev_err(dev, "error %d loading PHY driver module for ID 0x%08x\n", - ret, phy_id); + phydev_err(dev, "error %d loading PHY driver module for ID 0x%08lx\n", + ret, (unsigned long)phy_id); return ret; } return 0; } -struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, +struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids) { diff --git a/include/linux/phy.h b/include/linux/phy.h index 5032d453ac66..dd4a91f1feaa 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1000,7 +1000,7 @@ int phy_modify_paged_changed(struct phy_device *phydev, int page, u32 regnum, int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set); -struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, +struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); #if IS_ENABLED(CONFIG_PHYLIB) -- cgit v1.2.3-59-g8ed1b From 258a980d1ec23e2c786e9536a7dd260bea74bae6 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 20 Dec 2019 14:31:40 +0100 Subject: net: dst: Force 4-byte alignment of dst_metrics When storing a pointer to a dst_metrics structure in dst_entry._metrics, two flags are added in the least significant bits of the pointer value. Hence this assumes all pointers to dst_metrics structures have at least 4-byte alignment. However, on m68k, the minimum alignment of 32-bit values is 2 bytes, not 4 bytes. Hence in some kernel builds, dst_default_metrics may be only 2-byte aligned, leading to obscure boot warnings like: WARNING: CPU: 0 PID: 7 at lib/refcount.c:28 refcount_warn_saturate+0x44/0x9a refcount_t: underflow; use-after-free. Modules linked in: CPU: 0 PID: 7 Comm: ksoftirqd/0 Tainted: G W 5.5.0-rc2-atari-01448-g114a1a1038af891d-dirty #261 Stack from 10835e6c: 10835e6c 0038134f 00023fa6 00394b0f 0000001c 00000009 00321560 00023fea 00394b0f 0000001c 001a70f8 00000009 00000000 10835eb4 00000001 00000000 04208040 0000000a 00394b4a 10835ed4 00043aa8 001a70f8 00394b0f 0000001c 00000009 00394b4a 0026aba8 003215a4 00000003 00000000 0026d5a8 00000001 003215a4 003a4361 003238d6 000001f0 00000000 003215a4 10aa3b00 00025e84 003ddb00 10834000 002416a8 10aa3b00 00000000 00000080 000aa038 0004854a Call Trace: [<00023fa6>] __warn+0xb2/0xb4 [<00023fea>] warn_slowpath_fmt+0x42/0x64 [<001a70f8>] refcount_warn_saturate+0x44/0x9a [<00043aa8>] printk+0x0/0x18 [<001a70f8>] refcount_warn_saturate+0x44/0x9a [<0026aba8>] refcount_sub_and_test.constprop.73+0x38/0x3e [<0026d5a8>] ipv4_dst_destroy+0x5e/0x7e [<00025e84>] __local_bh_enable_ip+0x0/0x8e [<002416a8>] dst_destroy+0x40/0xae Fix this by forcing 4-byte alignment of all dst_metrics structures. Fixes: e5fd387ad5b30ca3 ("ipv6: do not overwrite inetpeer metrics prematurely") Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index fe62fe2eb781..8224dad2ae94 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -82,7 +82,7 @@ struct dst_entry { struct dst_metrics { u32 metrics[RTAX_MAX]; refcount_t refcnt; -}; +} __aligned(4); /* Low pointer bits contain DST_METRICS_FLAGS */ extern const struct dst_metrics dst_default_metrics; u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); -- cgit v1.2.3-59-g8ed1b