Diffstat (limited to 'net')
172 files changed, 1931 insertions, 1023 deletions
diff --git a/net/atm/common.c b/net/atm/common.c
index 0ce530af534d..8575f5d52087 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -177,18 +177,18 @@ static void vcc_destroy_socket(struct sock *sk)
 	set_bit(ATM_VF_CLOSE, &vcc->flags);
 	clear_bit(ATM_VF_READY, &vcc->flags);
-	if (vcc->dev) {
-		if (vcc->dev->ops->close)
-			vcc->dev->ops->close(vcc);
-		if (vcc->push)
-			vcc->push(vcc, NULL); /* atmarpd has no push */
-		module_put(vcc->owner);
-
-		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
-			atm_return(vcc, skb->truesize);
-			kfree_skb(skb);
-		}
+	if (vcc->dev && vcc->dev->ops->close)
+		vcc->dev->ops->close(vcc);
+	if (vcc->push)
+		vcc->push(vcc, NULL); /* atmarpd has no push */
+	module_put(vcc->owner);
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		atm_return(vcc, skb->truesize);
+		kfree_skb(skb);
+	}
+	if (vcc->dev && vcc->dev->ops->owner) {
 		module_put(vcc->dev->ops->owner);
 		atm_dev_put(vcc->dev);
 	}
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 25fa3a7b72bd..ca37f5a71f5e 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -1264,6 +1264,12 @@ static void lec_arp_clear_vccs(struct lec_arp_table *entry)
 		entry->vcc = NULL;
 	}
 	if (entry->recv_vcc) {
+		struct atm_vcc *vcc = entry->recv_vcc;
+		struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc);
+
+		kfree(vpriv);
+		vcc->user_back = NULL;
+
 		entry->recv_vcc->push = entry->old_recv_push;
 		vcc_release_async(entry->recv_vcc, -EPIPE);
 		entry->recv_vcc = NULL;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index ff57ea89c27e..fd91cd34f25e 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -635,8 +635,10 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
 		break;

 	case SO_BINDTODEVICE:
-		if (optlen > IFNAMSIZ)
-			optlen = IFNAMSIZ;
+		if (optlen > IFNAMSIZ - 1)
+			optlen = IFNAMSIZ - 1;
+
+		memset(devname, 0, sizeof(devname));

 		if (copy_from_user(devname, optval, optlen)) {
 			res = -EFAULT;
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 969466218999..80b87b1f4e3a 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -893,7 +893,7 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
 	orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig);
 	if (!orig_node)
-		return;
+		goto out;

 	neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming,
 						     ethhdr->h_source);
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 8f0717c3f7b5..b0469d15da0e 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1009,15 +1009,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
  */
 static u8 batadv_nc_random_weight_tq(u8 tq)
 {
-	u8 rand_val, rand_tq;
-
-	get_random_bytes(&rand_val, sizeof(rand_val));
-
 	/* randomize the estimated packet loss (max TQ - estimated TQ) */
-	rand_tq = rand_val * (BATADV_TQ_MAX_VALUE - tq);
-
-	/* normalize the randomized packet loss */
-	rand_tq /= BATADV_TQ_MAX_VALUE;
+	u8 rand_tq = prandom_u32_max(BATADV_TQ_MAX_VALUE + 1 - tq);

 	/* convert to (randomized) estimated tq again */
 	return BATADV_TQ_MAX_VALUE - rand_tq;
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index c45962d8527b..0f962dcd239e 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1150,7 +1150,7 @@ static ssize_t batadv_store_throughput_override(struct kobject *kobj,
 	ret = batadv_parse_throughput(net_dev, buff, "throughput_override",
 				      &tp_override);
 	if (!ret)
-		return count;
+		goto out;

 	old_tp_override = atomic_read(&hard_iface->bat_v.throughput_override);
 	if (old_tp_override == tp_override)
@@ -1190,6 +1190,7 @@ static ssize_t batadv_show_throughput_override(struct kobject *kobj,
 	tp_override = atomic_read(&hard_iface->bat_v.throughput_override);
+	batadv_hardif_put(hard_iface);

 	return sprintf(buff, "%u.%u MBit\n", tp_override / 10,
 		       tp_override % 10);
 }
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 1476a91ce935..d022f126eb02 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -170,7 +170,6 @@ static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m,
 		    size_t len, u8 mac[16])
 {
 	uint8_t tmp[16], mac_msb[16], msg_msb[CMAC_MSG_MAX];
-	SHASH_DESC_ON_STACK(desc, tfm);
 	int err;

 	if (len > CMAC_MSG_MAX)
@@ -181,8 +180,6 @@ static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m,
 		return -EINVAL;
 	}

-	desc->tfm = tfm;
-
 	/* Swap key and message from LSB to MSB */
 	swap_buf(k, tmp, 16);
 	swap_buf(m, msg_msb, len);
@@ -196,8 +193,7 @@ static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m,
 		return err;
 	}

-	err = crypto_shash_digest(desc, msg_msb, len, mac_msb);
-	shash_desc_zero(desc);
+	err = crypto_shash_tfm_digest(tfm, msg_msb, len, mac_msb);
 	if (err) {
 		BT_ERR("Hash computation error %d", err);
 		return err;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index ad12fe3fca8c..83490bf73a13 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -2413,7 +2413,8 @@ void br_multicast_uninit_stats(struct net_bridge *br)
 	free_percpu(br->mcast_stats);
 }

-static void mcast_stats_add_dir(u64 *dst, u64 *src)
+/* noinline for https://bugs.llvm.org/show_bug.cgi?id=45802#c9 */
+static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src)
 {
 	dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
 	dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX];
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 43dab4066f91..a0f5dbee8f9c 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -612,6 +612,7 @@ int br_process_vlan_info(struct net_bridge *br,
 					       v - 1, rtm_cmd);
 				v_change_start = 0;
 			}
+			cond_resched();
 		}
 		/* v_change_start is set only if the last/whole range changed */
 		if (v_change_start)
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index b325b569e761..f48cf4cfb80f 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -31,6 +31,12 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
 	ether_addr_copy(eth->h_dest, eth_hdr(oldskb)->h_source);
 	eth->h_proto = eth_hdr(oldskb)->h_proto;
 	skb_pull(nskb, ETH_HLEN);
+
+	if (skb_vlan_tag_present(oldskb)) {
+		u16 vid = skb_vlan_tag_get(oldskb);
+
+		__vlan_hwaccel_put_tag(nskb, oldskb->vlan_proto, vid);
+	}
 }

 static int nft_bridge_iphdr_validate(struct sk_buff *skb)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 998e26b75a78..1d4973f8cd7a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -3649,7 +3649,9 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 		 * supported.
 		 */
 		req->r_t.target_oloc.pool = m.redirect.oloc.pool;
-		req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
+		req->r_flags |= CEPH_OSD_FLAG_REDIRECTED |
+				CEPH_OSD_FLAG_IGNORE_OVERLAY |
+				CEPH_OSD_FLAG_IGNORE_CACHE;
 		req->r_tid = 0;
 		__submit_request(req, false);
 		goto out_unlock_osdc;
diff --git a/net/core/dev.c b/net/core/dev.c
index 9c9e763bfe0e..2d8aceee4284 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4140,7 +4140,8 @@ EXPORT_SYMBOL(netdev_max_backlog);
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
-unsigned int __read_mostly netdev_budget_usecs = 2000;
+/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
+unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
 int weight_p __read_mostly = 64;           /* old backlog weight */
 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
@@ -4987,11 +4988,12 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 	return 0;
 }

-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 				    struct packet_type **ppt_prev)
 {
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
+	struct sk_buff *skb = *pskb;
 	struct net_device *orig_dev;
 	bool deliver_exact = false;
 	int ret = NET_RX_DROP;
@@ -5022,8 +5024,10 @@ another_round:
 		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 		preempt_enable();

-		if (ret2 != XDP_PASS)
-			return NET_RX_DROP;
+		if (ret2 != XDP_PASS) {
+			ret = NET_RX_DROP;
+			goto out;
+		}
 		skb_reset_mac_len(skb);
 	}
@@ -5173,6 +5177,13 @@ drop:
 	}

 out:
+	/* The invariant here is that if *ppt_prev is not NULL
+	 * then skb should also be non-NULL.
+	 *
+	 * Apparently *ppt_prev assignment above holds this invariant due to
+	 * skb dereferencing near it.
+	 */
+	*pskb = skb;
 	return ret;
 }

@@ -5182,7 +5193,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 	struct packet_type *pt_prev = NULL;
 	int ret;

-	ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 	if (pt_prev)
 		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 					 skb->dev, pt_prev, orig_dev);
@@ -5260,7 +5271,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 		struct packet_type *pt_prev = NULL;

 		skb_list_del_init(skb);
-		__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 		if (!pt_prev)
 			continue;
 		if (pt_curr != pt_prev || od_curr != orig_dev) {
@@ -8666,8 +8677,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 	const struct net_device_ops *ops = dev->netdev_ops;
 	enum bpf_netdev_command query;
 	u32 prog_id, expected_id = 0;
-	struct bpf_prog *prog = NULL;
 	bpf_op_t bpf_op, bpf_chk;
+	struct bpf_prog *prog;
 	bool offload;
 	int err;
@@ -8733,6 +8744,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 	} else {
 		if (!prog_id)
 			return 0;
+		prog = NULL;
 	}

 	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
@@ -8905,11 +8917,13 @@ static void netdev_sync_lower_features(struct net_device *upper,
 			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 				   &feature, lower->name);
 			lower->wanted_features &= ~feature;
-			netdev_update_features(lower);
+			__netdev_update_features(lower);

 			if (unlikely(lower->features & feature))
 				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 					    &feature, lower->name);
+			else
+				netdev_features_change(lower);
 		}
 	}
 }
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 80f97722f31f..899edcee7dab 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4283,6 +4283,11 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
 		end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
 		end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
 		dump = false;
+
+		if (start_offset == end_offset) {
+			err = 0;
+			goto nla_put_failure;
+		}
 	}

 	err = devlink_nl_region_read_snapshot_fill(skb, devlink,
@@ -5363,6 +5368,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
 {
 	enum devlink_health_reporter_state prev_health_state;
 	struct devlink *devlink = reporter->devlink;
+	unsigned long recover_ts_threshold;

 	/* write a log message of the current error */
 	WARN_ON(!msg);
@@ -5373,10 +5379,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
 	devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);

 	/* abort if the previous error wasn't recovered */
+	recover_ts_threshold = reporter->last_recovery_ts +
+			       msecs_to_jiffies(reporter->graceful_period);
 	if (reporter->auto_recover &&
 	    (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
-	     jiffies - reporter->last_recovery_ts <
-	     msecs_to_jiffies(reporter->graceful_period))) {
+	     (reporter->last_recovery_ts && reporter->recovery_count &&
+	      time_is_after_jiffies(recover_ts_threshold)))) {
 		trace_devlink_health_recover_aborted(devlink,
 						     reporter->ops->name,
 						     reporter->health_state,
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 8e33cec9fc4e..2ee7bc4c9e03 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -213,6 +213,7 @@ static void sched_send_work(struct timer_list *t)
 static void trace_drop_common(struct sk_buff *skb, void *location)
 {
 	struct net_dm_alert_msg *msg;
+	struct net_dm_drop_point *point;
 	struct nlmsghdr *nlh;
 	struct nlattr *nla;
 	int i;
@@ -231,11 +232,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
 	nlh = (struct nlmsghdr *)dskb->data;
 	nla = genlmsg_data(nlmsg_data(nlh));
 	msg = nla_data(nla);
+	point = msg->points;
 	for (i = 0; i < msg->entries; i++) {
-		if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
-			msg->points[i].count++;
+		if (!memcmp(&location, &point->pc, sizeof(void *))) {
+			point->count++;
 			goto out;
 		}
+		point++;
 	}
 	if (msg->entries == dm_hit_limit)
 		goto out;
@@ -244,8 +247,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
 	 */
 	__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
 	nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
-	memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
-	msg->points[msg->entries].count = 1;
+	memcpy(point->pc, &location, sizeof(void *));
+	point->count = 1;
 	msg->entries++;

 	if (!timer_pending(&data->send_timer)) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 7628b947dbc3..5cc9276f1023 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2590,8 +2590,8 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
 			}
 			pop = 0;
 		} else if (pop >= sge->length - a) {
-			sge->length = a;
 			pop -= (sge->length - a);
+			sge->length = a;
 		}
 	}
@@ -5925,7 +5925,7 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
 		return -EOPNOTSUPP;
 	if (unlikely(dev_net(skb->dev) != sock_net(sk)))
 		return -ENETUNREACH;
-	if (unlikely(sk->sk_reuseport))
+	if (unlikely(sk_fullsock(sk) && sk->sk_reuseport))
 		return -ESOCKTNOSUPPORT;
 	if (sk_is_refcounted(sk) &&
 	    unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3eff84824c8b..5dceed467f64 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -160,12 +160,10 @@ out:
 	return ret;
 }

-int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+static int flow_dissector_bpf_prog_detach(struct net *net)
 {
 	struct bpf_prog *attached;
-	struct net *net;

-	net = current->nsproxy->net_ns;
 	mutex_lock(&flow_dissector_mutex);
 	attached = rcu_dereference_protected(net->flow_dissector_prog,
 					     lockdep_is_held(&flow_dissector_mutex));
@@ -179,6 +177,24 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
 	return 0;
 }

+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+{
+	return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns);
+}
+
+static void __net_exit flow_dissector_pernet_pre_exit(struct net *net)
+{
+	/* We're not racing with attach/detach because there are no
+	 * references to netns left when pre_exit gets called.
+	 */
+	if (rcu_access_pointer(net->flow_dissector_prog))
+		flow_dissector_bpf_prog_detach(net);
+}
+
+static struct pernet_operations flow_dissector_pernet_ops __net_initdata = {
+	.pre_exit = flow_dissector_pernet_pre_exit,
+};
+
 /**
  * __skb_flow_get_ports - extract the upper layer ports and return them
  * @skb: sk_buff to extract the ports from
@@ -1836,7 +1852,7 @@ static int __init init_default_flow_dissectors(void)
 	skb_flow_dissector_init(&flow_keys_basic_dissector,
 				flow_keys_basic_dissector_keys,
 				ARRAY_SIZE(flow_keys_basic_dissector_keys));
-	return 0;
-}
+	return register_pernet_subsys(&flow_dissector_pernet_ops);
+}

 core_initcall(init_default_flow_dissectors);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 39d37d0ef575..dbe0c6ead773 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1082,8 +1082,8 @@ static void neigh_timer_handler(struct timer_list *t)
 	}

 	if (neigh->nud_state & NUD_IN_TIMER) {
-		if (time_before(next, jiffies + HZ/2))
-			next = jiffies + HZ/2;
+		if (time_before(next, jiffies + HZ/100))
+			next = jiffies + HZ/100;
 		if (!mod_timer(&neigh->timer, next))
 			neigh_hold(neigh);
 	}
@@ -1956,6 +1956,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 				   NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
 	}

+	if (protocol)
+		neigh->protocol = protocol;
+
 	if (ndm->ndm_flags & NTF_EXT_LEARNED)
 		flags |= NEIGH_UPDATE_F_EXT_LEARNED;
@@ -1969,9 +1972,6 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 		err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
 				     NETLINK_CB(skb).portid, extack);

-	if (protocol)
-		neigh->protocol = protocol;
-
 	neigh_release(neigh);
 out:
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index cf0215734ceb..4773ad6ec111 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -80,7 +80,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
 	struct net_device *netdev = to_net_dev(dev);
 	struct net *net = dev_net(netdev);
 	unsigned long new;
-	int ret = -EINVAL;
+	int ret;

 	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index b4c87fe31be2..41b24cd31562 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -127,10 +127,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
 	cs->classid = (u32)value;

 	css_task_iter_start(css, 0, &it);
-	while ((p = css_task_iter_next(&it))) {
+	while ((p = css_task_iter_next(&it)))
 		update_classid_task(p, cs->classid);
-		cond_resched();
-	}
 	css_task_iter_end(&it);

 	return 0;
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 8881dd943dd0..9bd4cab7d510 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -236,6 +236,8 @@ static void net_prio_attach(struct cgroup_taskset *tset)
 	struct task_struct *p;
 	struct cgroup_subsys_state *css;

+	cgroup_sk_alloc_disable();
+
 	cgroup_taskset_for_each(p, css, tset) {
 		void *v = (void *)(unsigned long)css->id;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 7b6b1d2c3d10..b5bc680d4755 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -5,7 +5,6 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <linux/cryptohash.h>
 #include <linux/module.h>
 #include <linux/cache.h>
 #include <linux/random.h>
diff --git a/net/core/sock.c b/net/core/sock.c
index ce1d8dce9b7a..b714162213ae 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1872,7 +1872,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		 * as not suitable for copying when cloning.
 		 */
 		if (sk_user_data_is_nocopy(newsk))
-			RCU_INIT_POINTER(newsk->sk_user_data, NULL);
+			newsk->sk_user_data = NULL;

 		newsk->sk_err	   = 0;
 		newsk->sk_err_soft = 0;
@@ -2364,7 +2364,6 @@ static void sk_leave_memory_pressure(struct sock *sk)
 	}
 }

-/* On 32bit arches, an skb frag is limited to 2^15 */
 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 9a271a58a41d..d90665b465b8 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -459,7 +459,7 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
 	list_for_each_entry(dp, &dst->ports, list) {
 		err = dsa_port_setup(dp);
 		if (err)
-			goto teardown;
+			continue;
 	}

 	return 0;
diff --git a/net/dsa/master.c b/net/dsa/master.c
index b5c535af63a3..a621367c6e8c 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -289,7 +289,8 @@ static void dsa_master_ndo_teardown(struct net_device *dev)
 {
 	struct dsa_port *cpu_dp = dev->dsa_ptr;

-	dev->netdev_ops = cpu_dp->orig_ndo_ops;
+	if (cpu_dp->orig_ndo_ops)
+		dev->netdev_ops = cpu_dp->orig_ndo_ops;
 	cpu_dp->orig_ndo_ops = NULL;
 }
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 231b2d494f1c..a58fdd362574 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -670,11 +670,16 @@ int dsa_port_link_register_of(struct dsa_port *dp)
 {
 	struct dsa_switch *ds = dp->ds;
 	struct device_node *phy_np;
+	int port = dp->index;

 	if (!ds->ops->adjust_link) {
 		phy_np = of_parse_phandle(dp->dn, "phy-handle", 0);
-		if (of_phy_is_fixed_link(dp->dn) || phy_np)
+		if (of_phy_is_fixed_link(dp->dn) || phy_np) {
+			if (ds->ops->phylink_mac_link_down)
+				ds->ops->phylink_mac_link_down(ds, port,
+					MLO_AN_FIXED, PHY_INTERFACE_MODE_NA);
 			return dsa_port_phylink_register(dp);
+		}
 		return 0;
 	}
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index e94eb1aac602..d3bcb9afa795 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -856,20 +856,18 @@ dsa_slave_add_cls_matchall_mirred(struct net_device *dev,
 	struct dsa_port *to_dp;
 	int err;

-	act = &cls->rule->action.entries[0];
-
 	if (!ds->ops->port_mirror_add)
 		return -EOPNOTSUPP;

-	if (!act->dev)
-		return -EINVAL;
-
 	if (!flow_action_basic_hw_stats_check(&cls->rule->action,
 					      cls->common.extack))
 		return -EOPNOTSUPP;

 	act = &cls->rule->action.entries[0];
+	if (!act->dev)
+		return -EINVAL;
+
 	if (!dsa_slave_dev_check(act->dev))
 		return -EOPNOTSUPP;
@@ -1738,6 +1736,7 @@ int dsa_slave_create(struct dsa_port *port)
 	if (ds->ops->port_vlan_add && ds->ops->port_vlan_del)
 		slave_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
 	slave_dev->hw_features |= NETIF_F_HW_TC;
+	slave_dev->features |= NETIF_F_LLTX;
 	slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
 	if (!IS_ERR_OR_NULL(port->mac))
 		ether_addr_copy(slave_dev->dev_addr, port->mac);
@@ -1770,11 +1769,9 @@ int dsa_slave_create(struct dsa_port *port)
 	rtnl_lock();
 	ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN);
 	rtnl_unlock();
-	if (ret && ret != -EOPNOTSUPP) {
-		dev_err(ds->dev, "error %d setting MTU on port %d\n",
-			ret, port->index);
-		goto out_free;
-	}
+	if (ret)
+		dev_warn(ds->dev, "nonfatal error %d setting MTU on port %d\n",
+			 ret, port->index);

 	netif_carrier_off(slave_dev);
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index b5705cba8318..d6619edd53e5 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -15,6 +15,7 @@
 #define MTK_HDR_XMIT_TAGGED_TPID_8100	1
 #define MTK_HDR_RECV_SOURCE_PORT_MASK	GENMASK(2, 0)
 #define MTK_HDR_XMIT_DP_BIT_MASK	GENMASK(5, 0)
+#define MTK_HDR_XMIT_SA_DIS		BIT(6)

 static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 				    struct net_device *dev)
@@ -22,6 +23,9 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 	u8 *mtk_tag;
 	bool is_vlan_skb = true;
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+	bool is_multicast_skb = is_multicast_ether_addr(dest) &&
+				!is_broadcast_ether_addr(dest);

 	/* Build the special tag after the MAC Source Address. If VLAN header
 	 * is present, it's required that VLAN header and special tag is
@@ -47,6 +51,10 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 		     MTK_HDR_XMIT_UNTAGGED;
 	mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK;

+	/* Disable SA learning for multicast frames */
+	if (unlikely(is_multicast_skb))
+		mtk_tag[1] |= MTK_HDR_XMIT_SA_DIS;
+
 	/* Tag control information is kept for 802.1Q */
 	if (!is_vlan_skb) {
 		mtk_tag[2] = 0;
@@ -61,6 +69,9 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 {
 	int port;
 	__be16 *phdr, hdr;
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+	bool is_multicast_skb = is_multicast_ether_addr(dest) &&
+				!is_broadcast_ether_addr(dest);

 	if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN)))
 		return NULL;
@@ -86,6 +97,10 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!skb->dev)
 		return NULL;

+	/* Only unicast or broadcast frames are offloaded */
+	if (likely(!is_multicast_skb))
+		skb->offload_fwd_mark = 1;
+
 	return skb;
 }
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 0c772318c023..ed5357210193 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -342,7 +342,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info)
 	ret = ops->reply_size(req_info, reply_data);
 	if (ret < 0)
 		goto err_cleanup;
-	reply_len = ret;
+	reply_len = ret + ethnl_reply_header_size();
 	ret = -ENOMEM;
 	rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd,
 				ops->hdr_attr, info, &reply_payload);
@@ -588,7 +588,7 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd,
 	ret = ops->reply_size(req_info, reply_data);
 	if (ret < 0)
 		goto err_cleanup;
-	reply_len = ret;
+	reply_len = ret + ethnl_reply_header_size();
 	ret = -ENOMEM;
 	skb = genlmsg_new(reply_len, GFP_KERNEL);
 	if (!skb)
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index 95eae5c68a52..0eed4e4909ab 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -324,7 +324,6 @@ static int strset_reply_size(const struct ethnl_req_info *req_base,
 	int len = 0;
 	int ret;

-	len += ethnl_reply_header_size();
 	for (i = 0; i < ETH_SS_COUNT; i++) {
 		const struct strset_info *set_info = &data->sets[i];
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 5465a395da04..1decb25f6764 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -69,10 +69,16 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
 	else
 		multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]);

-	if (!data[IFLA_HSR_VERSION])
+	if (!data[IFLA_HSR_VERSION]) {
 		hsr_version = 0;
-	else
+	} else {
 		hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]);
+		if (hsr_version > 1) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Only versions 0..1 are supported");
+			return -EINVAL;
+		}
+	}

 	return hsr_dev_finalize(dev, link, multicast_spec, hsr_version, extack);
 }
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index f4b9f7a3ce51..25b6ffba26cd 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -18,7 +18,7 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct hsr_port *port;
-	u16 protocol;
+	__be16 protocol;

 	if (!skb_mac_header_was_set(skb)) {
 		WARN_ONCE(1, "%s: skb invalid", __func__);
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 0bd10a1f477f..a23094b050f8 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1258,7 +1258,8 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
 			return ret_val;
 		}

-		secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+		if (secattr->attr.mls.cat)
+			secattr->flags |= NETLBL_SECATTR_MLS_CAT;
 	}

 	return 0;
@@ -1439,7 +1440,8 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
 			return ret_val;
 		}

-		secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+		if (secattr->attr.mls.cat)
+			secattr->flags |= NETLBL_SECATTR_MLS_CAT;
 	}

 	return 0;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 30fa42f5997d..5267b6b191eb 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -276,6 +276,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
 	err = devinet_sysctl_register(in_dev);
 	if (err) {
 		in_dev->dead = 1;
+		neigh_parms_release(&arp_tbl, in_dev->arp_parms);
 		in_dev_put(in_dev);
 		in_dev = NULL;
 		goto out;
@@ -614,12 +615,15 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
 	return NULL;
 }

-static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa)
+static int ip_mc_autojoin_config(struct net *net, bool join,
+				 const struct in_ifaddr *ifa)
 {
+#if defined(CONFIG_IP_MULTICAST)
 	struct ip_mreqn mreq = {
 		.imr_multiaddr.s_addr = ifa->ifa_address,
 		.imr_ifindex = ifa->ifa_dev->dev->ifindex,
 	};
+	struct sock *sk = net->ipv4.mc_autojoin_sk;
 	int ret;

 	ASSERT_RTNL();
@@ -632,6 +636,9 @@ static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa)
 	release_sock(sk);

 	return ret;
+#else
+	return -EOPNOTSUPP;
+#endif
 }

 static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -675,7 +682,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 			continue;

 		if (ipv4_is_multicast(ifa->ifa_address))
-			ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa);
+			ip_mc_autojoin_config(net, false, ifa);
 		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
 		return 0;
 	}
@@ -940,8 +947,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 		 */
 		set_ifa_lifetime(ifa, valid_lft, prefered_lft);
 		if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) {
-			int ret = ip_mc_config(net->ipv4.mc_autojoin_sk,
-					       true, ifa);
+			int ret = ip_mc_autojoin_config(net, true, ifa);

 			if (ret < 0) {
 				inet_free_ifa(ifa);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 731022cff600..d14133eac476 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -63,10 +63,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
 		sp->olen++;

 		xo = xfrm_offload(skb);
-		if (!xo) {
-			xfrm_state_put(x);
+		if (!xo)
 			goto out_reset;
-		}
 	}

 	xo->flags |= XFRM_GRO;
@@ -139,19 +137,27 @@ static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x,
 	struct xfrm_offload *xo = xfrm_offload(skb);
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
-	int proto = xo->proto;
+	u8 proto = xo->proto;

 	skb->transport_header += x->props.header_len;

-	if (proto == IPPROTO_BEETPH) {
-		struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data;
+	if (x->sel.family != AF_INET6) {
+		if (proto == IPPROTO_BEETPH) {
+			struct ip_beet_phdr *ph =
+				(struct ip_beet_phdr *)skb->data;
+
+			skb->transport_header += ph->hdrlen * 8;
+			proto = ph->nexthdr;
+		} else {
+			skb->transport_header -= IPV4_BEET_PHMAXLEN;
+		}
+	} else {
+		__be16 frag;

-		skb->transport_header += ph->hdrlen * 8;
-		proto = ph->nexthdr;
-	} else if (x->sel.family != AF_INET6) {
-		skb->transport_header -= IPV4_BEET_PHMAXLEN;
-	} else if (proto == IPPROTO_TCP) {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+		skb->transport_header +=
+			ipv6_skip_exthdr(skb, 0, &proto, &frag);
+		if (proto == IPPROTO_TCP)
+			skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
 	}

 	__skb_pull(skb, skb_transport_offset(skb));
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 213be9c050ad..41079490a118 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -309,17 +309,18 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
 {
 	bool dev_match = false;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	int ret;
+	if (unlikely(fi->nh)) {
+		dev_match = nexthop_uses_dev(fi->nh, dev);
+	} else {
+		int ret;

-	for (ret = 0; ret < fib_info_num_path(fi); ret++) {
-		const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
+		for (ret = 0; ret < fib_info_num_path(fi); ret++) {
+			const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);

-		if (nhc->nhc_dev == dev) {
-			dev_match = true;
-			break;
-		} else if (l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) {
-			dev_match = true;
-			break;
+			if (nhc_l3mdev_matches_dev(nhc, dev)) {
+				dev_match = true;
+				break;
+			}
 		}
 	}
 #else
@@ -918,7 +919,6 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 	else
 		filter->dump_exceptions = false;

-	filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC);
 	filter->flags    = rtm->rtm_flags;
 	filter->protocol = rtm->rtm_protocol;
 	filter->rt_type  = rtm->rtm_type;
@@ -990,7 +990,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	if (filter.table_id) {
 		tb = fib_get_table(net, filter.table_id);
 		if (!tb) {
-			if (filter.dump_all_families)
+			if (rtnl_msg_family(cb->nlh) != PF_INET)
 				return skb->len;

 			NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6ed8c9317179..55ca2e521828 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -2014,7 +2014,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
-		struct fib_nh *nh;
+		struct fib_nh_common *nhc;

 		if (fa->fa_slen != slen)
 			continue;
@@ -2037,8 +2037,8 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 		    fa->fa_type != RTN_UNICAST)
 			continue;

-		nh = fib_info_nh(next_fi, 0);
-		if (!nh->fib_nh_gw4 || nh->fib_nh_scope != RT_SCOPE_LINK)
+		nhc = fib_info_nhc(next_fi, 0);
+		if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK)
 			continue;

 		fib_alias_accessed(fa);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4f334b425538..248f1c1959a6 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1371,6 +1371,26 @@ static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
 	return (key ^ prefix) & (prefix | -prefix);
 }

+bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
+			 const struct flowi4 *flp)
+{
+	if (nhc->nhc_flags & RTNH_F_DEAD)
+		return false;
+
+	if (ip_ignore_linkdown(nhc->nhc_dev) &&
+	    nhc->nhc_flags & RTNH_F_LINKDOWN &&
+	    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
+		return false;
+
+	if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
+		if (flp->flowi4_oif &&
+		    flp->flowi4_oif != nhc->nhc_oif)
+			return false;
+	}
+
+	return true;
+}
+
 /* should be called with rcu_read_lock */
 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		     struct fib_result *res, int fib_flags)
@@ -1503,6 +1523,7 @@ found:
 	/* Step 3: Process the leaf, if that fails fall back to backtracing */
 	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
 		struct fib_info *fi = fa->fa_info;
+		struct fib_nh_common *nhc;
 		int nhsel, err;

 		if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
@@ -1528,26 +1549,25 @@ out_reject:
 		if (fi->fib_flags & RTNH_F_DEAD)
 			continue;

-		if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) {
-			err = fib_props[RTN_BLACKHOLE].error;
-			goto out_reject;
+		if (unlikely(fi->nh)) {
+			if (nexthop_is_blackhole(fi->nh)) {
+				err = fib_props[RTN_BLACKHOLE].error;
+				goto out_reject;
+			}
+
+			nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
+						     &nhsel);
+			if (nhc)
+				goto set_result;
+			goto miss;
 		}

 		for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
-			struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
+			nhc = fib_info_nhc(fi, nhsel);

-			if (nhc->nhc_flags & RTNH_F_DEAD)
+			if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
 				continue;
-			if (ip_ignore_linkdown(nhc->nhc_dev) &&
-			    nhc->nhc_flags & RTNH_F_LINKDOWN &&
-			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
-				continue;
-			if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
-				if (flp->flowi4_oif &&
-				    flp->flowi4_oif != nhc->nhc_oif)
-					continue;
-			}
-
+set_result:
 			if (!(fib_flags & FIB_LOOKUP_NOREF))
 				refcount_inc(&fi->fib_clntref);
@@ -1568,6 +1588,7 @@ out_reject:
 			return err;
 		}
 	}
+miss:
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	this_cpu_inc(stats->semantic_match_miss);
 #endif
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 5f34eb951627..65c29f2bd89f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -24,17 +24,19 @@
 #include <net/addrconf.h>

 #if IS_ENABLED(CONFIG_IPV6)
-/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
- *                          only, and any IPv4 addresses if not IPv6 only
- * match_wildcard == false: addresses must be exactly the same, i.e.
- *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
- *                          and 0.0.0.0 equals to 0.0.0.0 only
+/* match_sk*_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses
+ *				if IPv6 only, and any IPv4 addresses
+ *				if not IPv6 only
+ * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
+ *				IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ *				and 0.0.0.0 equals to 0.0.0.0 only
  */
 static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
 				 const struct in6_addr *sk2_rcv_saddr6,
 				 __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
 				 bool sk1_ipv6only, bool sk2_ipv6only,
-				 bool match_wildcard)
+				 bool match_sk1_wildcard,
+				 bool match_sk2_wildcard)
 {
 	int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
 	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
@@ -44,8 +46,8 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
 		if (!sk2_ipv6only) {
 			if (sk1_rcv_saddr == sk2_rcv_saddr)
 				return true;
-			if (!sk1_rcv_saddr || !sk2_rcv_saddr)
-				return match_wildcard;
+			return (match_sk1_wildcard && !sk1_rcv_saddr) ||
+				(match_sk2_wildcard && !sk2_rcv_saddr);
 		}
 		return false;
 	}
@@ -53,11 +55,11 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
 	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
 		return true;

-	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+	if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard &&
 	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
 		return true;

-	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+	if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard &&
 	    !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
 		return true;
@@ -69,18 +71,19 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
 }
 #endif

-/* match_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
- * match_wildcard == false: addresses must be exactly the same, i.e.
- *                          0.0.0.0 only equals to 0.0.0.0
+/* match_sk*_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
+ * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
+ *				0.0.0.0 only equals to 0.0.0.0
  */
 static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
-				 bool sk2_ipv6only, bool match_wildcard)
+				 bool sk2_ipv6only, bool match_sk1_wildcard,
+				 bool match_sk2_wildcard)
 {
 	if (!sk2_ipv6only) {
 		if (sk1_rcv_saddr == sk2_rcv_saddr)
 			return true;
-		if (!sk1_rcv_saddr || !sk2_rcv_saddr)
-			return match_wildcard;
+		return (match_sk1_wildcard && !sk1_rcv_saddr) ||
+			(match_sk2_wildcard && !sk2_rcv_saddr);
 	}
 	return false;
 }
@@ -96,10 +99,12 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 					    sk2->sk_rcv_saddr,
 					    ipv6_only_sock(sk),
 					    ipv6_only_sock(sk2),
+					    match_wildcard,
 					    match_wildcard);
 #endif
 	return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
-				    ipv6_only_sock(sk2), match_wildcard);
+				    ipv6_only_sock(sk2), match_wildcard,
+				    match_wildcard);
 }
 EXPORT_SYMBOL(inet_rcv_saddr_equal);
@@ -285,10 +290,10 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
 					    tb->fast_rcv_saddr,
 					    sk->sk_rcv_saddr,
 					    tb->fast_ipv6_only,
-					    ipv6_only_sock(sk), true);
+					    ipv6_only_sock(sk), true, false);
 #endif
 	return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
-				    ipv6_only_sock(sk), true);
+				    ipv6_only_sock(sk), true, false);
 }

 /* Obtain a reference to a local port for the given sock,
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 1b4e6f298648..1dda7c155c48 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -93,7 +93,28 @@ static int vti_rcv_proto(struct sk_buff *skb)

 static int vti_rcv_tunnel(struct sk_buff *skb)
 {
-	return vti_rcv(skb, ip_hdr(skb)->saddr, true);
+	struct ip_tunnel_net *itn = net_generic(dev_net(skb->dev), vti_net_id);
+	const struct iphdr *iph = ip_hdr(skb);
+	struct ip_tunnel *tunnel;
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->saddr, iph->daddr, 0);
+	if (tunnel) {
+		struct tnl_ptk_info tpi = {
+			.proto = htons(ETH_P_IP),
+		};
+
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto drop;
+		if (iptunnel_pull_header(skb, 0, tpi.proto, false))
+			goto drop;
+		return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, false);
+	}
+
+	return -EINVAL;
+drop:
+	kfree_skb(skb);
+	return 0;
 }

 static int vti_rcv_cb(struct sk_buff *skb, int err)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2f01cf6fa0de..678575adaf3b 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -698,7 +698,7 @@ out:

 rtnl_link_failed:
 #if IS_ENABLED(CONFIG_MPLS)
-	xfrm4_tunnel_deregister(&mplsip_handler, AF_INET);
+	xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
 xfrm_tunnel_mplsip_failed:
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9cf83cc85e4a..b2363b82b48d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -109,8 +109,10 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags);
 static void ipmr_expire_process(struct timer_list *t);

 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
-#define ipmr_for_each_table(mrt, net) \
-	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
+#define ipmr_for_each_table(mrt, net)					\
+	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list,	\
+				lockdep_rtnl_is_held() ||		\
+				list_empty(&net->ipv4.mr_tables))

 static struct mr_table *ipmr_mr_table_iter(struct net *net,
 					   struct mr_table *mrt)
@@ -2611,7 +2613,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 		mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id);
 		if (!mrt) {
-			if (filter.dump_all_families)
+			if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR)
 				return skb->len;

 			NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 3c25a467b3ef..7afde8828b4c 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -166,8 +166,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
 		break;
 	default:
 		pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
-			 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
-					       pptp_msg_name[0]);
+			 pptp_msg_name(msg));
 		fallthrough;
 	case PPTP_SET_LINK_INFO:
 		/* only need to NAT in case PAC is behind NAT box */
@@ -268,9 +267,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
 		pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
 		break;
 	default:
-		pr_debug("unknown inbound packet %s\n",
-			 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
-					       pptp_msg_name[0]);
+		pr_debug("unknown inbound packet %s\n", pptp_msg_name(msg));
 		fallthrough;
 	case PPTP_START_SESSION_REQUEST:
 	case PPTP_START_SESSION_REPLY:
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index fdfca534d094..563f71bcb2d7 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -63,9 +63,16 @@ static void nexthop_free_mpath(struct nexthop *nh)
 	int i;

 	nhg = rcu_dereference_raw(nh->nh_grp);
-	for (i = 0; i < nhg->num_nh; ++i)
-		WARN_ON(nhg->nh_entries[i].nh);
+	for (i = 0; i < nhg->num_nh; ++i) {
+		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+		WARN_ON(!list_empty(&nhge->nh_list));
+		nexthop_put(nhge->nh);
+	}
+
+	WARN_ON(nhg->spare == nhg);
+	kfree(nhg->spare);
 	kfree(nhg);
 }
@@ -276,6 +283,7 @@ out:
 	return 0;

 nla_put_failure:
+	nlmsg_cancel(skb, nlh);
 	return -EMSGSIZE;
 }
@@ -433,7 +441,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
 		if (!valid_group_nh(nh, len, extack))
 			return -EINVAL;
 	}
-	for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
+	for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) {
 		if (!tb[i])
 			continue;
@@ -693,41 +701,56 @@ static void nh_group_rebalance(struct nh_group *nhg)
 	}
 }

-static void remove_nh_grp_entry(struct nh_grp_entry *nhge,
-				struct nh_group *nhg,
+static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
 				struct nl_info *nlinfo)
 {
+	struct nh_grp_entry *nhges, *new_nhges;
+	struct nexthop *nhp = nhge->nh_parent;
 	struct nexthop *nh = nhge->nh;
-	struct nh_grp_entry *nhges;
-	bool found = false;
-	int i;
+	struct nh_group *nhg, *newg;
+	int i, j;

 	WARN_ON(!nh);

-	nhges = nhg->nh_entries;
-	for (i = 0; i < nhg->num_nh; ++i) {
-		if (found) {
-			nhges[i-1].nh = nhges[i].nh;
-			nhges[i-1].weight = nhges[i].weight;
-			list_del(&nhges[i].nh_list);
-			list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list);
-		} else if (nhg->nh_entries[i].nh == nh) {
-			found = true;
-		}
-	}
+	nhg = rtnl_dereference(nhp->nh_grp);
+	newg = nhg->spare;

-	if (WARN_ON(!found))
+	/* last entry, keep it visible and remove the parent */
+	if (nhg->num_nh == 1) {
+		remove_nexthop(net, nhp, nlinfo);
 		return;
+	}
+
+	newg->has_v4 = nhg->has_v4;
+	newg->mpath = nhg->mpath;
+	newg->num_nh = nhg->num_nh;

-	nhg->num_nh--;
-	nhg->nh_entries[nhg->num_nh].nh = NULL;
+	/* copy old entries to new except the one getting removed */
+	nhges = nhg->nh_entries;
+	new_nhges = newg->nh_entries;
+	for (i = 0, j = 0; i < nhg->num_nh; ++i) {
+		/* current nexthop getting removed */
+		if (nhg->nh_entries[i].nh == nh) {
+			newg->num_nh--;
+			continue;
+		}

-	nh_group_rebalance(nhg);
+		list_del(&nhges[i].nh_list);
+		new_nhges[j].nh_parent = nhges[i].nh_parent;
+		new_nhges[j].nh = nhges[i].nh;
+		new_nhges[j].weight = nhges[i].weight;
+		list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
+		j++;
+	}

-	nexthop_put(nh);
+	nh_group_rebalance(newg);
+	rcu_assign_pointer(nhp->nh_grp, newg);
+
+	list_del(&nhge->nh_list);
+	nexthop_put(nhge->nh);

 	if (nlinfo)
-		nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo);
+		nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
 }

 static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
@@ -735,17 +758,11 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
 {
 	struct nh_grp_entry *nhge, *tmp;

-	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) {
-		struct nh_group *nhg;
-
-		list_del(&nhge->nh_list);
-		nhg = rtnl_dereference(nhge->nh_parent->nh_grp);
-		remove_nh_grp_entry(nhge, nhg, nlinfo);
+	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
+		remove_nh_grp_entry(net, nhge, nlinfo);

-		/* if this group has no more entries then remove it */
-		if (!nhg->num_nh)
-			remove_nexthop(net, nhge->nh_parent, nlinfo);
-	}
+	/* make sure all see the newly published array before releasing rtnl */
+	synchronize_rcu();
 }

 static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
@@ -759,10 +776,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
 		if (WARN_ON(!nhge->nh))
 			continue;

-		list_del(&nhge->nh_list);
-		nexthop_put(nhge->nh);
-		nhge->nh = NULL;
-		nhg->num_nh--;
+		list_del_init(&nhge->nh_list);
 	}
 }
@@ -1085,6 +1099,7 @@ static struct nexthop *nexthop_create_group(struct net *net,
 {
 	struct nlattr *grps_attr = cfg->nh_grp;
 	struct nexthop_grp *entry = nla_data(grps_attr);
+	u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
 	struct nh_group *nhg;
 	struct nexthop *nh;
 	int i;
@@ -1095,12 +1110,21 @@ static struct nexthop *nexthop_create_group(struct net *net,
 	nh->is_group = 1;

-	nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry));
+	nhg = nexthop_grp_alloc(num_nh);
 	if (!nhg) {
 		kfree(nh);
 		return ERR_PTR(-ENOMEM);
 	}

+	/* spare group used for removals */
+	nhg->spare = nexthop_grp_alloc(num_nh);
+	if (!nhg) {
+		kfree(nhg);
+		kfree(nh);
+		return NULL;
+	}
+	nhg->spare->spare = nhg;
+
 	for (i = 0; i < nhg->num_nh; ++i) {
 		struct nexthop *nhe;
 		struct nh_info *nhi;
@@ -1132,6 +1156,7 @@ out_no_nh:
 	for (; i >= 0; --i)
 		nexthop_put(nhg->nh_entries[i].nh);

+	kfree(nhg->spare);
 	kfree(nhg);
 	kfree(nh);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 788c69d9bfe0..b73f540fa19b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -491,18 +491,16 @@ u32 ip_idents_reserve(u32 hash, int segs)
 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 	u32 old = READ_ONCE(*p_tstamp);
 	u32 now = (u32)jiffies;
-	u32 new, delta = 0;
+	u32 delta = 0;

 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 		delta = prandom_u32_max(now - old);

-	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
-	do {
-		old = (u32)atomic_read(p_id);
-		new = old + delta + segs;
-	} while (atomic_cmpxchg(p_id, old, new) != old);
-
-	return new - segs;
+	/* If UBSAN reports an error there, please make sure your compiler
+	 * supports -fno-strict-overflow before reporting it that was a bug
+	 * in UBSAN, and it has been fixed in GCC-8.
+	 */
+	return atomic_add_return(segs + delta, p_id) - segs;
 }
 EXPORT_SYMBOL(ip_idents_reserve);
@@ -915,7 +913,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	/* Check for load limit; set rate_last to the latest sent
 	 * redirect.
 	 */
-	if (peer->rate_tokens == 0 ||
+	if (peer->n_redirects == 0 ||
 	    time_after(jiffies,
 		       (peer->rate_last +
 			(ip_rt_redirect_load << peer->n_redirects)))) {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6d87de434377..dd401757eea1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -476,9 +476,17 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
 static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
 					  int target, struct sock *sk)
 {
-	return (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq) >= target) ||
-		(sk->sk_prot->stream_memory_read ?
-		sk->sk_prot->stream_memory_read(sk) : false);
+	int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
+
+	if (avail > 0) {
+		if (avail >= target)
+			return true;
+		if (tcp_rmem_pressure(sk))
+			return true;
+	}
+	if (sk->sk_prot->stream_memory_read)
+		return sk->sk_prot->stream_memory_read(sk);
+	return false;
 }

 /*
@@ -1756,10 +1764,11 @@ static int tcp_zerocopy_receive(struct sock *sk,

 	down_read(&current->mm->mmap_sem);

-	ret = -EINVAL;
 	vma = find_vma(current->mm, address);
-	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
-		goto out;
+	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) {
+		up_read(&current->mm->mmap_sem);
+		return -EINVAL;
+	}
 	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);

 	tp = tcp_sk(sk);
@@ -2154,13 +2163,15 @@ skip_copy:
 			tp->urg_data = 0;
 			tcp_fast_path_check(sk);
 		}
-		if (used + offset < skb->len)
-			continue;

 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
 			tcp_update_recv_tstamps(skb, &tss);
 			cmsg_flags |= 2;
 		}
+
+		if (used + offset < skb->len)
+			continue;
+
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			goto found_fin_ok;
 		if (!(flags & MSG_PEEK))
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 5a05327f97c1..629aaa9a1eb9 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -125,7 +125,6 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,

 	if (!ret) {
 		msg->sg.start = i;
-		msg->sg.size -= apply_bytes;
 		sk_psock_queue_msg(psock, tmp);
 		sk_psock_data_ready(sk, psock);
 	} else {
@@ -262,14 +261,17 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	struct sk_psock *psock;
 	int copied, ret;

+	if (unlikely(flags & MSG_ERRQUEUE))
+		return inet_recv_error(sk, msg, len, addr_len);
+
 	psock = sk_psock_get(sk);
 	if (unlikely(!psock))
 		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-	if (unlikely(flags & MSG_ERRQUEUE))
-		return inet_recv_error(sk, msg, len, addr_len);
 	if (!skb_queue_empty(&sk->sk_receive_queue) &&
-	    sk_psock_queue_empty(psock))
+	    sk_psock_queue_empty(psock)) {
+		sk_psock_put(sk, psock);
 		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+	}
 	lock_sock(sk);
 msg_bytes_ready:
 	copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf4ced9273e8..29c6fc8c7716 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3926,10 +3926,6 @@ void tcp_parse_options(const struct net *net,
 				 */
 				break;
 #endif
-			case TCPOPT_MPTCP:
-				mptcp_parse_option(skb, ptr, opsize, opt_rx);
-				break;
-
 			case TCPOPT_FASTOPEN:
 				tcp_parse_fastopen_option(
 					opsize - TCPOLEN_FASTOPEN_BASE,
@@ -4761,7 +4757,8 @@ void tcp_data_ready(struct sock *sk)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int avail = tp->rcv_nxt - tp->copied_seq;

-	if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE))
+	if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) &&
+	    !sock_flag(sk, SOCK_DONE))
 		return;

 	sk->sk_data_ready(sk);
@@ -5990,9 +5987,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);

-		if (sk_is_mptcp(sk))
-			mptcp_rcv_synsent(sk);
-
 		/* Remember, tcp_poll() does not lock socket!
 		 * Change state from SYN-SENT only after copied_seq
 		 * is initialized. */
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 89ba7c87de5d..30ddb9dc9398 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -58,9 +58,7 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
 {
 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

-#ifdef CONFIG_NETFILTER
 	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
-#endif

 	return xfrm_output(sk, skb);
 }
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 24e319dfb510..f131cedf5ba6 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3222,11 +3222,11 @@ static int ipv6_generate_stable_address(struct in6_addr *address,
 					const struct inet6_dev *idev)
 {
 	static DEFINE_SPINLOCK(lock);
-	static __u32 digest[SHA_DIGEST_WORDS];
-	static __u32 workspace[SHA_WORKSPACE_WORDS];
+	static __u32 digest[SHA1_DIGEST_WORDS];
+	static __u32 workspace[SHA1_WORKSPACE_WORDS];

 	static union {
-		char __data[SHA_MESSAGE_BYTES];
+		char __data[SHA1_BLOCK_SIZE];
 		struct {
 			struct in6_addr secret;
 			__be32 prefix[2];
@@ -3251,7 +3251,7 @@ static int ipv6_generate_stable_address(struct in6_addr *address,
 retry:
 	spin_lock_bh(&lock);

-	sha_init(digest);
+	sha1_init(digest);
 	memset(&data, 0, sizeof(data));
 	memset(workspace, 0, sizeof(workspace));
 	memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len);
@@ -3260,7 +3260,7 @@ retry:
 	data.secret = secret;
 	data.dad_count = dad_count;

-	sha_transform(digest, data.__data, workspace);
+	sha1_transform(digest, data.__data, workspace);

 	temp = *address;
 	temp.s6_addr32[2] = (__force __be32)digest[0];
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 221c81f85cbf..8d3f66c310db 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -1047,7 +1047,8 @@ static int calipso_opt_getattr(const unsigned char *calipso,
 			goto getattr_return;
 		}

-		secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+		if (secattr->attr.mls.cat)
+			secattr->flags |= NETLBL_SECATTR_MLS_CAT;
 	}

 	secattr->type = NETLBL_NLTYPE_CALIPSO;
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 8eab2c869d61..ab0eea336c70 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -85,10 +85,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head,
 		sp->olen++;

 		xo = xfrm_offload(skb);
-		if (!xo) {
-			xfrm_state_put(x);
+		if (!xo)
 			goto out_reset;
-		}
 	}

 	xo->flags |= XFRM_GRO;
@@ -123,9 +121,16 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
 	struct ip_esp_hdr *esph;
 	struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct xfrm_offload *xo = xfrm_offload(skb);
-	int proto = iph->nexthdr;
+	u8 proto = iph->nexthdr;

 	skb_push(skb, -skb_network_offset(skb));
+
+	if (x->outer_mode.encap == XFRM_MODE_TRANSPORT) {
+		__be16 frag;
+
+		ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, &frag);
+	}
+
 	esph = ip_esp_hdr(skb);
 	*skb_mac_header(skb) = IPPROTO_ESP;
@@ -166,23 +171,31 @@ static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x,
 	struct xfrm_offload *xo = xfrm_offload(skb);
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
-	int proto = xo->proto;
+	u8 proto = xo->proto;

 	skb->transport_header += x->props.header_len;

-	if (proto == IPPROTO_BEETPH) {
-		struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data;
-
-		skb->transport_header += ph->hdrlen * 8;
-		proto = ph->nexthdr;
-	}
-
 	if (x->sel.family != AF_INET6) {
 		skb->transport_header -=
 			(sizeof(struct ipv6hdr) - sizeof(struct iphdr));

+		if (proto == IPPROTO_BEETPH) {
+			struct ip_beet_phdr *ph =
+				(struct ip_beet_phdr *)skb->data;
+
+			skb->transport_header += ph->hdrlen * 8;
+			proto = ph->nexthdr;
+		} else {
+			skb->transport_header -= IPV4_BEET_PHMAXLEN;
+		}
+
 		if (proto == IPPROTO_TCP)
 			skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+	} else {
+		__be16 frag;
+
+		skb->transport_header +=
+			ipv6_skip_exthdr(skb, 0, &proto, &frag);
 	}

 	__skb_pull(skb, skb_transport_offset(skb));
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 2688f3e82165..fc5000370030 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -229,6 +229,25 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	return res;
 }

+static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type,
+				  struct flowi6 *fl6)
+{
+	struct net *net = sock_net(sk);
+	struct dst_entry *dst;
+	bool res = false;
+
+	dst = ip6_route_output(net, sk, fl6);
+	if (!dst->error) {
+		struct rt6_info *rt = (struct rt6_info *)dst;
+		struct in6_addr prefsrc;
+
+		rt6_get_prefsrc(rt, &prefsrc);
+		res = !ipv6_addr_any(&prefsrc);
+	}
+	dst_release(dst);
+	return res;
+}
+
 /*
  *	an inline helper for the "simple" if statement below
  *	checks if parameter problem report is caused by an
@@ -527,7 +546,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 		saddr = force_saddr;
 	if (saddr) {
 		fl6.saddr = *saddr;
-	} else {
+	} else if (!icmpv6_rt_has_prefsrc(sk, type, &fl6)) {
 		/* select a more meaningful saddr from input if */
 		struct net_device *in_netdev;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 46ed56719476..20314895509c 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -664,7 +664,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	if (arg.filter.table_id) {
 		tb = fib6_get_table(net, arg.filter.table_id);
 		if (!tb) {
-			if (arg.filter.dump_all_families)
+			if (rtnl_msg_family(cb->nlh) != PF_INET6)
 				goto out;

 			NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 65a54d74acc1..1f4d20e97c07 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -98,7 +98,8 @@ static void ipmr_expire_process(struct timer_list *t);
 #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
 #define ip6mr_for_each_table(mrt, net) \
 	list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \
-				lockdep_rtnl_is_held())
+				lockdep_rtnl_is_held() || \
+				list_empty(&net->ipv6.mr6_tables))

 static struct mr_table *ip6mr_mr_table_iter(struct net *net,
 					    struct mr_table *mrt)
@@ -2502,7 +2503,7 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 		mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id);
 		if (!mrt) {
-			if (filter.dump_all_families)
+			if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR)
 				return skb->len;

 			NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index debdaeba5d8c..18d05403d3b5 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -183,15 +183,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 					retv = -EBUSY;
 					break;
 				}
-			} else if (sk->sk_protocol == IPPROTO_TCP) {
-				if (sk->sk_prot != &tcpv6_prot) {
-					retv = -EBUSY;
-					break;
-				}
-				break;
-			} else {
+			}
+			if (sk->sk_protocol == IPPROTO_TCP &&
+			    sk->sk_prot != &tcpv6_prot) {
+				retv = -EBUSY;
 				break;
 			}
+			if (sk->sk_protocol != IPPROTO_TCP)
+				break;
 			if (sk->sk_state != TCP_ESTABLISHED) {
 				retv = -ENOTCONN;
 				break;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 310cbddaa533..ff847a324220 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1385,9 +1385,18 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
 	}
 	ip6_rt_copy_init(pcpu_rt, res);
 	pcpu_rt->rt6i_flags |= RTF_PCPU;
+
+	if (f6i->nh)
+		pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));
+
 	return pcpu_rt;
 }

+static bool rt6_is_valid(const struct rt6_info *rt6)
+{
+	return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
+}
+
 /* It should be called with rcu_read_lock() acquired */
 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
 {
@@ -1395,6 +1404,19 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)

 	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);

+	if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
+		struct rt6_info *prev, **p;
+
+		p = this_cpu_ptr(res->nh->rt6i_pcpu);
+		prev = xchg(p, NULL);
+		if (prev) {
+			dst_dev_put(&prev->dst);
+			dst_release(&prev->dst);
+		}
+
+		pcpu_rt = NULL;
+	}
+
 	return pcpu_rt;
 }
@@ -2593,6 +2615,9 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)

 	rt = container_of(dst, struct rt6_info, dst);

+	if (rt->sernum)
+		return rt6_is_valid(rt) ? dst : NULL;
+
 	rcu_read_lock();

 	/* All IPV6 dsts are created with ->obsolete set to the value
@@ -2697,8 +2722,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 	const struct in6_addr *daddr, *saddr;
 	struct rt6_info *rt6 = (struct rt6_info *)dst;

-	if (dst_metric_locked(dst, RTAX_MTU))
-		return;
+	/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
+	 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
+	 * [see also comment in rt6_mtu_change_route()]
+	 */

 	if (iph) {
 		daddr = &iph->daddr;
diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
index d38b476fc7f2..307f336b5353 100644
--- a/net/ipv6/rpl.c
+++ b/net/ipv6/rpl.c
@@ -8,6 +8,7 @@
 #include <net/rpl.h>

 #define IPV6_PFXTAIL_LEN(x) (sizeof(struct in6_addr) - (x))
+#define IPV6_RPL_BEST_ADDR_COMPRESSION 15

 static void ipv6_rpl_addr_decompress(struct in6_addr *dst,
 				     const struct in6_addr *daddr,
@@ -73,7 +74,7 @@ static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr
 		}
 	}

-	return plen;
+	return IPV6_RPL_BEST_ADDR_COMPRESSION;
 }

 static unsigned char ipv6_rpl_srh_calc_cmpre(const struct in6_addr *daddr,
@@ -83,10 +84,10 @@ static unsigned char ipv6_rpl_srh_calc_cmpre(const struct in6_addr *daddr,
 	for (plen = 0; plen < sizeof(*daddr); plen++) {
 		if (daddr->s6_addr[plen] != last_segment->s6_addr[plen])
-			break;
+			return plen;
 	}

-	return plen;
+	return IPV6_RPL_BEST_ADDR_COMPRESSION;
 }

 void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr,
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 75421a472d25..37b434293bda 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -27,8 +27,9 @@
 bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
 {
-	int trailing;
 	unsigned int tlv_offset;
+	int max_last_entry;
+	int trailing;

 	if (srh->type != IPV6_SRCRT_TYPE_4)
 		return false;
@@ -36,7 +37,12 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
 	if (((srh->hdrlen + 1) << 3) != len)
 		return false;

-	if (srh->segments_left > srh->first_segment)
+	max_last_entry = (srh->hdrlen / 2) - 1;
+
+	if (srh->first_segment > max_last_entry)
+		return false;
+
+	if (srh->segments_left > srh->first_segment + 1)
 		return false;

 	tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);
@@ -434,7 +440,7 @@ static struct genl_family seg6_genl_family __ro_after_init = {

 int __init seg6_init(void)
 {
-	int err = -ENOMEM;
+	int err;

 	err = genl_register_family(&seg6_genl_family);
 	if (err)
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index ffcfcd2b128f..85dddfe3a2c6 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -34,7 +34,6 @@
 #include <net/addrconf.h>
 #include <net/xfrm.h>

-#include <linux/cryptohash.h>
 #include <crypto/hash.h>
 #include <crypto/sha.h>
 #include <net/seg6.h>
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index fbe51d40bd7e..e34167f790e6 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -111,9 +111,7 @@ int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb)
 {
 	memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));

-#ifdef CONFIG_NETFILTER
 	IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
-#endif

 	return xfrm_output(sk, skb);
 }
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index fcb53ed1c4fb..6d7ef78c88af 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1458,6 +1458,9 @@ static int l2tp_validate_socket(const struct sock *sk, const struct net *net,
 	if (sk->sk_type != SOCK_DGRAM)
 		return -EPROTONOSUPPORT;

+	if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+		return -EPROTONOSUPPORT;
+
 	if ((encap == L2TP_ENCAPTYPE_UDP && sk->sk_protocol != IPPROTO_UDP) ||
 	    (encap == L2TP_ENCAPTYPE_IP && sk->sk_protocol != IPPROTO_L2TP))
 		return -EPROTONOSUPPORT;
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 0d7c887a2b75..955662a6dee7 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -20,7 +20,6 @@
 #include <net/icmp.h>
 #include <net/udp.h>
 #include
<net/inet_common.h> -#include <net/inet_hashtables.h>  #include <net/tcp_states.h>  #include <net/protocol.h>  #include <net/xfrm.h> @@ -209,15 +208,31 @@ discard:  	return 0;  } -static int l2tp_ip_open(struct sock *sk) +static int l2tp_ip_hash(struct sock *sk)  { -	/* Prevent autobind. We don't have ports. */ -	inet_sk(sk)->inet_num = IPPROTO_L2TP; +	if (sk_unhashed(sk)) { +		write_lock_bh(&l2tp_ip_lock); +		sk_add_node(sk, &l2tp_ip_table); +		write_unlock_bh(&l2tp_ip_lock); +	} +	return 0; +} +static void l2tp_ip_unhash(struct sock *sk) +{ +	if (sk_unhashed(sk)) +		return;  	write_lock_bh(&l2tp_ip_lock); -	sk_add_node(sk, &l2tp_ip_table); +	sk_del_node_init(sk);  	write_unlock_bh(&l2tp_ip_lock); +} + +static int l2tp_ip_open(struct sock *sk) +{ +	/* Prevent autobind. We don't have ports. */ +	inet_sk(sk)->inet_num = IPPROTO_L2TP; +	l2tp_ip_hash(sk);  	return 0;  } @@ -594,8 +609,8 @@ static struct proto l2tp_ip_prot = {  	.sendmsg	   = l2tp_ip_sendmsg,  	.recvmsg	   = l2tp_ip_recvmsg,  	.backlog_rcv	   = l2tp_ip_backlog_recv, -	.hash		   = inet_hash, -	.unhash		   = inet_unhash, +	.hash		   = l2tp_ip_hash, +	.unhash		   = l2tp_ip_unhash,  	.obj_size	   = sizeof(struct l2tp_ip_sock),  #ifdef CONFIG_COMPAT  	.compat_setsockopt = compat_ip_setsockopt, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index d148766f40d1..0fa694bd3f6a 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -20,8 +20,6 @@  #include <net/icmp.h>  #include <net/udp.h>  #include <net/inet_common.h> -#include <net/inet_hashtables.h> -#include <net/inet6_hashtables.h>  #include <net/tcp_states.h>  #include <net/protocol.h>  #include <net/xfrm.h> @@ -222,15 +220,31 @@ discard:  	return 0;  } -static int l2tp_ip6_open(struct sock *sk) +static int l2tp_ip6_hash(struct sock *sk)  { -	/* Prevent autobind. We don't have ports. */ -	inet_sk(sk)->inet_num = IPPROTO_L2TP; +	if (sk_unhashed(sk)) { +		write_lock_bh(&l2tp_ip6_lock); +		sk_add_node(sk, &l2tp_ip6_table); +		write_unlock_bh(&l2tp_ip6_lock); +	} +	return 0; +} +static void l2tp_ip6_unhash(struct sock *sk) +{ +	if (sk_unhashed(sk)) +		return;  	write_lock_bh(&l2tp_ip6_lock); -	sk_add_node(sk, &l2tp_ip6_table); +	sk_del_node_init(sk);  	write_unlock_bh(&l2tp_ip6_lock); +} + +static int l2tp_ip6_open(struct sock *sk) +{ +	/* Prevent autobind. We don't have ports. 
*/ +	inet_sk(sk)->inet_num = IPPROTO_L2TP; +	l2tp_ip6_hash(sk);  	return 0;  } @@ -728,8 +742,8 @@ static struct proto l2tp_ip6_prot = {  	.sendmsg	   = l2tp_ip6_sendmsg,  	.recvmsg	   = l2tp_ip6_recvmsg,  	.backlog_rcv	   = l2tp_ip6_backlog_recv, -	.hash		   = inet6_hash, -	.unhash		   = inet_unhash, +	.hash		   = l2tp_ip6_hash, +	.unhash		   = l2tp_ip6_unhash,  	.obj_size	   = sizeof(struct l2tp_ip6_sock),  #ifdef CONFIG_COMPAT  	.compat_setsockopt = compat_ipv6_setsockopt, diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index f5a9bdc4980c..ebb381c3f1b9 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -920,51 +920,51 @@ static const struct genl_ops l2tp_nl_ops[] = {  		.cmd = L2TP_CMD_TUNNEL_CREATE,  		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,  		.doit = l2tp_nl_cmd_tunnel_create, -		.flags = GENL_ADMIN_PERM, +		.flags = GENL_UNS_ADMIN_PERM,  	},  	{  		.cmd = L2TP_CMD_TUNNEL_DELETE,  		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,  		.doit = l2tp_nl_cmd_tunnel_delete, -		.flags = GENL_ADMIN_PERM, +		.flags = GENL_UNS_ADMIN_PERM,  	},  	{  		.cmd = L2TP_CMD_TUNNEL_MODIFY,  		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,  		.doit = l2tp_nl_cmd_tunnel_modify, -		.flags = GENL_ADMIN_PERM, +		.flags = GENL_UNS_ADMIN_PERM,  	},  	{  		.cmd = L2TP_CMD_TUNNEL_GET,  		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,  		.doit = l2tp_nl_cmd_tunnel_get,  		.dumpit = l2tp_nl_cmd_tunnel_dump, -		.flags = GENL_ADMIN_PERM, +		.flags = GENL_UNS_ADMIN_PERM,  	},  	{  		.cmd = L2TP_CMD_SESSION_CREATE,  		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,  		.doit = l2tp_nl_cmd_session_create, -		.flags = GENL_ADMIN_PERM, +		.flags = GENL_UNS_ADMIN_PERM,  	},  	{  		.cmd = L2TP_CMD_SESSION_DELETE,  		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,  		.doit = l2tp_nl_cmd_session_delete, -		.flags = GENL_ADMIN_PERM, +		.flags = GENL_UNS_ADMIN_PERM,  	},  	{  		.cmd = L2TP_CMD_SESSION_MODIFY,  		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,  		.doit = l2tp_nl_cmd_session_modify, -		.flags = GENL_ADMIN_PERM, +		.flags = GENL_UNS_ADMIN_PERM,  	},  	{  		.cmd = L2TP_CMD_SESSION_GET,  		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,  		.doit = l2tp_nl_cmd_session_get,  		.dumpit = l2tp_nl_cmd_session_dump, -		.flags = GENL_ADMIN_PERM, +		.flags = GENL_UNS_ADMIN_PERM,  	},  }; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 8345926193de..6423173bb87e 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1069,7 +1069,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)  		local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC;  		if (hw->max_signal <= 0) {  			result = -EINVAL; -			goto fail_wiphy_register; +			goto fail_workqueue;  		}  	} @@ -1135,7 +1135,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)  	result = ieee80211_init_cipher_suites(local);  	if (result < 0) -		goto fail_wiphy_register; +		goto fail_workqueue;  	if (!local->ops->remain_on_channel)  		local->hw.wiphy->max_remain_on_channel_duration = 5000; @@ -1161,10 +1161,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)  	local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CSA_COUNTERS_NUM; -	result = wiphy_register(local->hw.wiphy); -	if (result < 0) -		goto fail_wiphy_register; -  	/*  	 * We use the number of queues for feature tests (QoS, HT) internally  	 * so restrict them appropriately. 
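The ieee80211_register_hw() changes in the surrounding hunks move wiphy_register() after rate-control setup and reshuffle the fail_* labels to match. They rely on the kernel's goto-unwind convention: each label undoes only the steps that completed before the failure, in reverse order. Below is a compilable toy version of that convention, a sketch only — the setup_*/undo_* names are invented stand-ins for the workqueue, rate-control, and wiphy-registration steps:

#include <stdio.h>

/* Invented stand-ins for the real setup steps; each returns 0 on success. */
static int setup_workqueue(void) { return 0; }
static int setup_rate_control(void) { return 0; }
static int setup_wiphy(void) { return -1; /* force a failure for the demo */ }

static void undo_rate_control(void) { puts("undo rate control"); }
static void undo_workqueue(void) { puts("undo workqueue"); }

/* Each failure label unwinds exactly the steps that completed before it,
 * in reverse order -- the invariant the reordered labels preserve. */
static int register_example(void)
{
	int err;

	err = setup_workqueue();
	if (err)
		return err;

	err = setup_rate_control();
	if (err)
		goto fail_workqueue;

	err = setup_wiphy();	/* now done last, so its failure ... */
	if (err)
		goto fail_rate;	/* ... unwinds everything before it */

	return 0;

fail_rate:
	undo_rate_control();
fail_workqueue:
	undo_workqueue();
	return err;
}

int main(void)
{
	return register_example() ? 1 : 0;
}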
@@ -1187,8 +1183,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)  	local->tx_headroom = max_t(unsigned int , local->hw.extra_tx_headroom,  				   IEEE80211_TX_STATUS_HEADROOM); -	debugfs_hw_add(local); -  	/*  	 * if the driver doesn't specify a max listen interval we  	 * use 5 which should be a safe default @@ -1217,9 +1211,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)  		goto fail_flows;  	rtnl_lock(); -  	result = ieee80211_init_rate_ctrl_alg(local,  					      hw->rate_control_algorithm); +	rtnl_unlock();  	if (result < 0) {  		wiphy_debug(local->hw.wiphy,  			    "Failed to initialize rate control algorithm\n"); @@ -1273,6 +1267,15 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)  		local->sband_allocated |= BIT(band);  	} +	result = wiphy_register(local->hw.wiphy); +	if (result < 0) +		goto fail_wiphy_register; + +	debugfs_hw_add(local); +	rate_control_add_debugfs(local); + +	rtnl_lock(); +  	/* add one default STA interface if supported */  	if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) &&  	    !ieee80211_hw_check(hw, NO_AUTO_VIF)) { @@ -1312,17 +1315,17 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)  #if defined(CONFIG_INET) || defined(CONFIG_IPV6)   fail_ifa:  #endif +	wiphy_unregister(local->hw.wiphy); + fail_wiphy_register:  	rtnl_lock();  	rate_control_deinitialize(local);  	ieee80211_remove_interfaces(local); - fail_rate:  	rtnl_unlock(); + fail_rate:   fail_flows:  	ieee80211_led_exit(local);  	destroy_workqueue(local->workqueue);   fail_workqueue: -	wiphy_unregister(local->hw.wiphy); - fail_wiphy_register:  	if (local->wiphy_ciphers_allocated)  		kfree(local->hw.wiphy->cipher_suites);  	kfree(local->int_scan_req); @@ -1372,8 +1375,8 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)  	skb_queue_purge(&local->skb_queue_unreliable);  	skb_queue_purge(&local->skb_queue_tdls_chsw); -	destroy_workqueue(local->workqueue);  	wiphy_unregister(local->hw.wiphy); +	destroy_workqueue(local->workqueue);  	ieee80211_led_exit(local);  	kfree(local->int_scan_req);  } diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index d09b3c789314..36978a0e5000 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1257,15 +1257,15 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,  		    sdata->u.mesh.mshcfg.rssi_threshold < rx_status->signal)  			mesh_neighbour_update(sdata, mgmt->sa, &elems,  					      rx_status); + +		if (ifmsh->csa_role != IEEE80211_MESH_CSA_ROLE_INIT && +		    !sdata->vif.csa_active) +			ieee80211_mesh_process_chnswitch(sdata, &elems, true);  	}  	if (ifmsh->sync_ops)  		ifmsh->sync_ops->rx_bcn_presp(sdata,  			stype, mgmt, &elems, rx_status); - -	if (ifmsh->csa_role != IEEE80211_MESH_CSA_ROLE_INIT && -	    !sdata->vif.csa_active) -		ieee80211_mesh_process_chnswitch(sdata, &elems, true);  }  int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata) @@ -1373,6 +1373,9 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata,  	ieee802_11_parse_elems(pos, len - baselen, true, &elems,  			       mgmt->bssid, NULL); +	if (!mesh_matches_local(sdata, &elems)) +		return; +  	ifmsh->chsw_ttl = elems.mesh_chansw_params_ie->mesh_ttl;  	if (!--ifmsh->chsw_ttl)  		fwd_csa = false; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 38a0383dfbcf..aa5150929996 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1103,7 +1103,14 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)  	mesh_path_sel_frame_tx(MPATH_PREQ, 
0, sdata->vif.addr, ifmsh->sn,  			       target_flags, mpath->dst, mpath->sn, da, 0,  			       ttl, lifetime, 0, ifmsh->preq_id++, sdata); + +	spin_lock_bh(&mpath->state_lock); +	if (mpath->flags & MESH_PATH_DELETED) { +		spin_unlock_bh(&mpath->state_lock); +		goto enddiscovery; +	}  	mod_timer(&mpath->timer, jiffies + mpath->discovery_timeout); +	spin_unlock_bh(&mpath->state_lock);  enddiscovery:  	rcu_read_unlock(); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index a1e9fc7878aa..b051f125d3af 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -214,17 +214,16 @@ static ssize_t rcname_read(struct file *file, char __user *userbuf,  				       ref->ops->name, len);  } -static const struct file_operations rcname_ops = { +const struct file_operations rcname_ops = {  	.read = rcname_read,  	.open = simple_open,  	.llseek = default_llseek,  };  #endif -static struct rate_control_ref *rate_control_alloc(const char *name, -					    struct ieee80211_local *local) +static struct rate_control_ref * +rate_control_alloc(const char *name, struct ieee80211_local *local)  { -	struct dentry *debugfsdir = NULL;  	struct rate_control_ref *ref;  	ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL); @@ -234,13 +233,7 @@ static struct rate_control_ref *rate_control_alloc(const char *name,  	if (!ref->ops)  		goto free; -#ifdef CONFIG_MAC80211_DEBUGFS -	debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir); -	local->debugfs.rcdir = debugfsdir; -	debugfs_create_file("name", 0400, debugfsdir, ref, &rcname_ops); -#endif - -	ref->priv = ref->ops->alloc(&local->hw, debugfsdir); +	ref->priv = ref->ops->alloc(&local->hw);  	if (!ref->priv)  		goto free;  	return ref; diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 5397c6dad056..79b44d3db171 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -60,6 +60,29 @@ static inline void rate_control_add_sta_debugfs(struct sta_info *sta)  #endif  } +extern const struct file_operations rcname_ops; + +static inline void rate_control_add_debugfs(struct ieee80211_local *local) +{ +#ifdef CONFIG_MAC80211_DEBUGFS +	struct dentry *debugfsdir; + +	if (!local->rate_ctrl) +		return; + +	if (!local->rate_ctrl->ops->add_debugfs) +		return; + +	debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir); +	local->debugfs.rcdir = debugfsdir; +	debugfs_create_file("name", 0400, debugfsdir, +			    local->rate_ctrl, &rcname_ops); + +	local->rate_ctrl->ops->add_debugfs(&local->hw, local->rate_ctrl->priv, +					   debugfsdir); +#endif +} +  void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata);  /* Get a reference to the rate control algorithm. 
If `name' is NULL, get the diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 694a31978a04..5dc3e5bc4e64 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1635,7 +1635,7 @@ minstrel_ht_init_cck_rates(struct minstrel_priv *mp)  }  static void * -minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) +minstrel_ht_alloc(struct ieee80211_hw *hw)  {  	struct minstrel_priv *mp; @@ -1673,7 +1673,17 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)  	mp->update_interval = HZ / 10;  	mp->new_avg = true; +	minstrel_ht_init_cck_rates(mp); + +	return mp; +} +  #ifdef CONFIG_MAC80211_DEBUGFS +static void minstrel_ht_add_debugfs(struct ieee80211_hw *hw, void *priv, +				    struct dentry *debugfsdir) +{ +	struct minstrel_priv *mp = priv; +  	mp->fixed_rate_idx = (u32) -1;  	debugfs_create_u32("fixed_rate_idx", S_IRUGO | S_IWUGO, debugfsdir,  			   &mp->fixed_rate_idx); @@ -1681,12 +1691,8 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)  			   &mp->sample_switch);  	debugfs_create_bool("new_avg", S_IRUGO | S_IWUSR, debugfsdir,  			   &mp->new_avg); -#endif - -	minstrel_ht_init_cck_rates(mp); - -	return mp;  } +#endif  static void  minstrel_ht_free(void *priv) @@ -1725,6 +1731,7 @@ static const struct rate_control_ops mac80211_minstrel_ht = {  	.alloc = minstrel_ht_alloc,  	.free = minstrel_ht_free,  #ifdef CONFIG_MAC80211_DEBUGFS +	.add_debugfs = minstrel_ht_add_debugfs,  	.add_sta_debugfs = minstrel_ht_add_sta_debugfs,  #endif  	.get_expected_throughput = minstrel_ht_get_expected_throughput, diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index f8d5c2515829..cd8487bc6fc2 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -231,7 +231,8 @@ struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,  	struct sta_info *sta;  	int i = 0; -	list_for_each_entry_rcu(sta, &local->sta_list, list) { +	list_for_each_entry_rcu(sta, &local->sta_list, list, +				lockdep_is_held(&local->sta_mtx)) {  		if (sdata != sta->sdata)  			continue;  		if (i < idx) { diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index c151628bd416..3d980713a9e2 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -47,8 +47,6 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)  void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)  {  	u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; -	__be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; -	__be32 *hash_out = (__force __be32 *)hmac;  	struct sha256_state state;  	u8 key1be[8];  	u8 key2be[8]; @@ -61,7 +59,7 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)  	put_unaligned_be64(key2, key2be);  	/* Generate key xored with ipad */ -	memset(input, 0x36, SHA_MESSAGE_BYTES); +	memset(input, 0x36, SHA256_BLOCK_SIZE);  	for (i = 0; i < 8; i++)  		input[i] ^= key1be[i];  	for (i = 0; i < 8; i++) @@ -78,7 +76,7 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)  	sha256_final(&state, &input[SHA256_BLOCK_SIZE]);  	/* Prepare second part of hmac */ -	memset(input, 0x5C, SHA_MESSAGE_BYTES); +	memset(input, 0x5C, SHA256_BLOCK_SIZE);  	for (i = 0; i < 8; i++)  		input[i] ^= key1be[i];  	for (i = 0; i < 8; i++) @@ -86,11 +84,7 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)  	sha256_init(&state);  	sha256_update(&state, input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE); -	sha256_final(&state, (u8 
*)mptcp_hashed_key); - -	/* takes only first 160 bits */ -	for (i = 0; i < 5; i++) -		hash_out[i] = mptcp_hashed_key[i]; +	sha256_final(&state, (u8 *)hmac);  }  #ifdef CONFIG_MPTCP_HMAC_TEST @@ -101,29 +95,29 @@ struct test_cast {  };  /* we can't reuse RFC 4231 test vectors, as we have constraint on the - * input and key size, and we truncate the output. + * input and key size.   */  static struct test_cast tests[] = {  	{  		.key = "0b0b0b0b0b0b0b0b",  		.msg = "48692054", -		.result = "8385e24fb4235ac37556b6b886db106284a1da67", +		.result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa",  	},  	{  		.key = "aaaaaaaaaaaaaaaa",  		.msg = "dddddddd", -		.result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492", +		.result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9",  	},  	{  		.key = "0102030405060708",  		.msg = "cdcdcdcd", -		.result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6", +		.result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d",  	},  };  static int __init test_mptcp_crypto(void)  { -	char hmac[20], hmac_hex[41]; +	char hmac[32], hmac_hex[65];  	u32 nonce1, nonce2;  	u64 key1, key2;  	u8 msg[8]; @@ -140,11 +134,11 @@ static int __init test_mptcp_crypto(void)  		put_unaligned_be32(nonce2, &msg[4]);  		mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); -		for (j = 0; j < 20; ++j) +		for (j = 0; j < 32; ++j)  			sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); -		hmac_hex[40] = 0; +		hmac_hex[64] = 0; -		if (memcmp(hmac_hex, tests[i].result, 40)) +		if (memcmp(hmac_hex, tests[i].result, 64))  			pr_err("test %d failed, got %s expected %s", i,  			       hmac_hex, tests[i].result);  		else diff --git a/net/mptcp/options.c b/net/mptcp/options.c index faf57585b892..7793b6011fa7 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -7,6 +7,7 @@  #define pr_fmt(fmt) "MPTCP: " fmt  #include <linux/kernel.h> +#include <crypto/sha.h>  #include <net/tcp.h>  #include <net/mptcp.h>  #include "protocol.h" @@ -16,10 +17,10 @@ static bool mptcp_cap_flag_sha256(u8 flags)  	return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;  } -void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, -			int opsize, struct tcp_options_received *opt_rx) +static void mptcp_parse_option(const struct sk_buff *skb, +			       const unsigned char *ptr, int opsize, +			       struct mptcp_options_received *mp_opt)  { -	struct mptcp_options_received *mp_opt = &opt_rx->mptcp;  	u8 subtype = *ptr >> 4;  	int expected_opsize;  	u8 version; @@ -283,12 +284,20 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,  }  void mptcp_get_options(const struct sk_buff *skb, -		       struct tcp_options_received *opt_rx) +		       struct mptcp_options_received *mp_opt)  { -	const unsigned char *ptr;  	const struct tcphdr *th = tcp_hdr(skb); -	int length = (th->doff * 4) - sizeof(struct tcphdr); +	const unsigned char *ptr; +	int length; + +	/* initialize option status */ +	mp_opt->mp_capable = 0; +	mp_opt->mp_join = 0; +	mp_opt->add_addr = 0; +	mp_opt->rm_addr = 0; +	mp_opt->dss = 0; +	length = (th->doff * 4) - sizeof(struct tcphdr);  	ptr = (const unsigned char *)(th + 1);  	while (length > 0) { @@ -308,7 +317,7 @@ void mptcp_get_options(const struct sk_buff *skb,  			if (opsize > length)  				return;	/* don't parse partial options */  			if (opcode == TCPOPT_MPTCP) -				mptcp_parse_option(skb, ptr, opsize, opt_rx); +				mptcp_parse_option(skb, ptr, opsize, mp_opt);  			ptr += opsize - 2;  			length -= 
opsize;  		} @@ -344,28 +353,6 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,  	return false;  } -void mptcp_rcv_synsent(struct sock *sk) -{ -	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); -	struct tcp_sock *tp = tcp_sk(sk); - -	if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { -		subflow->mp_capable = 1; -		subflow->can_ack = 1; -		subflow->remote_key = tp->rx_opt.mptcp.sndr_key; -		pr_debug("subflow=%p, remote_key=%llu", subflow, -			 subflow->remote_key); -	} else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) { -		subflow->mp_join = 1; -		subflow->thmac = tp->rx_opt.mptcp.thmac; -		subflow->remote_nonce = tp->rx_opt.mptcp.nonce; -		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, -			 subflow->thmac, subflow->remote_nonce); -	} else if (subflow->request_mptcp) { -		tcp_sk(sk)->is_mptcp = 0; -	} -} -  /* MP_JOIN client subflow must wait for 4th ack before sending any data:   * TCP can't schedule delack timer before the subflow is fully established.   * MPTCP uses the delack timer to do 3rd ack retransmissions @@ -549,7 +536,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,  static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,  				  struct in_addr *addr)  { -	u8 hmac[MPTCP_ADDR_HMAC_LEN]; +	u8 hmac[SHA256_DIGEST_SIZE];  	u8 msg[7];  	msg[0] = addr_id; @@ -559,14 +546,14 @@ static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,  	mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); -	return get_unaligned_be64(hmac); +	return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);  }  #if IS_ENABLED(CONFIG_MPTCP_IPV6)  static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,  				   struct in6_addr *addr)  { -	u8 hmac[MPTCP_ADDR_HMAC_LEN]; +	u8 hmac[SHA256_DIGEST_SIZE];  	u8 msg[19];  	msg[0] = addr_id; @@ -576,7 +563,7 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,  	mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); -	return get_unaligned_be64(hmac); +	return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);  }  #endif @@ -709,7 +696,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,  	if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)  		return subflow->mp_capable; -	if (mp_opt->use_ack) { +	if (mp_opt->dss && mp_opt->use_ack) {  		/* subflows are fully established as soon as we get any  		 * additional ack.  		 
*/ @@ -717,8 +704,6 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,  		goto fully_established;  	} -	WARN_ON_ONCE(subflow->can_ack); -  	/* If the first established packet does not contain MP_CAPABLE + data  	 * then fallback to TCP  	 */ @@ -728,6 +713,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,  		return false;  	} +	if (unlikely(!READ_ONCE(msk->pm.server_side))) +		pr_warn_once("bogus mpc option on established client sk");  	subflow->fully_established = 1;  	subflow->remote_key = mp_opt->sndr_key;  	subflow->can_ack = 1; @@ -819,41 +806,41 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,  {  	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);  	struct mptcp_sock *msk = mptcp_sk(subflow->conn); -	struct mptcp_options_received *mp_opt; +	struct mptcp_options_received mp_opt;  	struct mptcp_ext *mpext; -	mp_opt = &opt_rx->mptcp; -	if (!check_fully_established(msk, sk, subflow, skb, mp_opt)) +	mptcp_get_options(skb, &mp_opt); +	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))  		return; -	if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) { +	if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {  		struct mptcp_addr_info addr; -		addr.port = htons(mp_opt->port); -		addr.id = mp_opt->addr_id; -		if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { +		addr.port = htons(mp_opt.port); +		addr.id = mp_opt.addr_id; +		if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) {  			addr.family = AF_INET; -			addr.addr = mp_opt->addr; +			addr.addr = mp_opt.addr;  		}  #if IS_ENABLED(CONFIG_MPTCP_IPV6) -		else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) { +		else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) {  			addr.family = AF_INET6; -			addr.addr6 = mp_opt->addr6; +			addr.addr6 = mp_opt.addr6;  		}  #endif -		if (!mp_opt->echo) +		if (!mp_opt.echo)  			mptcp_pm_add_addr_received(msk, &addr); -		mp_opt->add_addr = 0; +		mp_opt.add_addr = 0;  	} -	if (!mp_opt->dss) +	if (!mp_opt.dss)  		return;  	/* we can't wait for recvmsg() to update the ack_seq, otherwise  	 * monodirectional flows will get stuck  	 */ -	if (mp_opt->use_ack) -		update_una(msk, mp_opt); +	if (mp_opt.use_ack) +		update_una(msk, &mp_opt);  	mpext = skb_ext_add(skb, SKB_EXT_MPTCP);  	if (!mpext) @@ -861,8 +848,8 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,  	memset(mpext, 0, sizeof(*mpext)); -	if (mp_opt->use_map) { -		if (mp_opt->mpc_map) { +	if (mp_opt.use_map) { +		if (mp_opt.mpc_map) {  			/* this is an MP_CAPABLE carrying MPTCP data  			 * we know this maps the first chunk of data  			 */ @@ -872,16 +859,16 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,  			mpext->subflow_seq = 1;  			mpext->dsn64 = 1;  			mpext->mpc_map = 1; +			mpext->data_fin = 0;  		} else { -			mpext->data_seq = mp_opt->data_seq; -			mpext->subflow_seq = mp_opt->subflow_seq; -			mpext->dsn64 = mp_opt->dsn64; +			mpext->data_seq = mp_opt.data_seq; +			mpext->subflow_seq = mp_opt.subflow_seq; +			mpext->dsn64 = mp_opt.dsn64; +			mpext->data_fin = mp_opt.data_fin;  		} -		mpext->data_len = mp_opt->data_len; +		mpext->data_len = mp_opt.data_len;  		mpext->use_map = 1;  	} - -	mpext->data_fin = mp_opt->data_fin;  }  void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
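With the switch to a full SHA-256 HMAC, the ADD_ADDR helpers shown earlier truncate the 32-byte digest to its trailing 64 bits via get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]). Below is a compilable userspace sketch of just that truncation step; toy_hmac() and be64_from_bytes() are invented stand-ins for the real HMAC computation and unaligned big-endian load, so treat this as an illustration rather than the kernel API.

#include <stdint.h>
#include <stdio.h>

#define SHA256_DIGEST_SIZE 32

/* Invented stand-in for mptcp_crypto_hmac_sha(): fills the buffer with
 * a fixed pattern instead of computing a real HMAC-SHA256. */
static void toy_hmac(uint8_t out[SHA256_DIGEST_SIZE])
{
	for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
		out[i] = (uint8_t)i;
}

/* Big-endian load, equivalent in spirit to get_unaligned_be64(). */
static uint64_t be64_from_bytes(const uint8_t *p)
{
	uint64_t v = 0;

	for (int i = 0; i < 8; i++)
		v = (v << 8) | p[i];
	return v;
}

int main(void)
{
	uint8_t hmac[SHA256_DIGEST_SIZE];

	toy_hmac(hmac);
	/* ADD_ADDR carries only a 64-bit truncated HMAC: the last eight
	 * bytes of the 32-byte digest. */
	printf("truncated: %016llx\n", (unsigned long long)
	       be64_from_bytes(&hmac[SHA256_DIGEST_SIZE - sizeof(uint64_t)]));
	return 0;
}

diff --git a/net/mptcp/pm_netlink.c index 86d61ab34c7c..b78edf237ba0 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -599,12 +599,14 @@ static int mptcp_nl_fill_addr(struct sk_buff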
*skb,  	    nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))  		goto nla_put_failure; -	if (addr->family == AF_INET) -		nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4, -				addr->addr.s_addr); +	if (addr->family == AF_INET && +	    nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4, +			    addr->addr.s_addr)) +		goto nla_put_failure;  #if IS_ENABLED(CONFIG_MPTCP_IPV6) -	else if (addr->family == AF_INET6) -		nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6); +	else if (addr->family == AF_INET6 && +		 nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6)) +		goto nla_put_failure;  #endif  	nla_nest_end(skb, attr);  	return 0; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 939a5045181a..34dd0e278a82 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -97,12 +97,7 @@ static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)  	if (likely(!__mptcp_needs_tcp_fallback(msk)))  		return NULL; -	if (msk->subflow) { -		release_sock((struct sock *)msk); -		return msk->subflow; -	} - -	return NULL; +	return msk->subflow;  }  static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) @@ -734,9 +729,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)  			goto out;  	} +fallback:  	ssock = __mptcp_tcp_fallback(msk);  	if (unlikely(ssock)) { -fallback: +		release_sock(sk);  		pr_debug("fallback passthrough");  		ret = sock_sendmsg(ssock, msg);  		return ret >= 0 ? ret + copied : (copied ? copied : ret); @@ -769,8 +765,14 @@ fallback:  		if (ret < 0)  			break;  		if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { +			/* Can happen for passive sockets: +			 * 3WHS negotiated MPTCP, but first packet after is +			 * plain TCP (e.g. due to middlebox filtering unknown +			 * options). +			 * +			 * Fall back to TCP. +			 */  			release_sock(ssk); -			ssock = __mptcp_tcp_fallback(msk);  			goto fallback;  		} @@ -883,6 +885,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,  	ssock = __mptcp_tcp_fallback(msk);  	if (unlikely(ssock)) {  fallback: +		release_sock(sk);  		pr_debug("fallback-read subflow=%p",  			 mptcp_subflow_ctx(ssock->sk));  		copied = sock_recvmsg(ssock, msg, flags); @@ -951,7 +954,8 @@ fallback:  		pr_debug("block timeout %ld", timeo);  		mptcp_wait_data(sk, &timeo); -		if (unlikely(__mptcp_tcp_fallback(msk))) +		ssock = __mptcp_tcp_fallback(msk); +		if (unlikely(ssock))  			goto fallback;  	} @@ -1259,11 +1263,14 @@ static void mptcp_close(struct sock *sk, long timeout)  	lock_sock(sk); -	mptcp_token_destroy(msk->token);  	inet_sk_state_store(sk, TCP_CLOSE); -	__mptcp_flush_join_list(msk); - +	/* be sure to always acquire the join list lock, to sync vs +	 * mptcp_finish_join(). +	 */ +	spin_lock_bh(&msk->join_list_lock); +	list_splice_tail_init(&msk->join_list, &msk->conn_list); +	spin_unlock_bh(&msk->join_list_lock);  	list_splice_init(&msk->conn_list, &conn_list);  	data_fin_tx_seq = msk->write_seq; @@ -1313,11 +1320,12 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)  static int mptcp_disconnect(struct sock *sk, int flags)  { -	lock_sock(sk); -	__mptcp_clear_xmit(sk); -	release_sock(sk); -	mptcp_cancel_work(sk); -	return tcp_disconnect(sk, flags); +	/* Should never be called. +	 * inet_stream_connect() calls ->disconnect, but that +	 * refers to the subflow socket, not the mptcp one. 
+	 */ +	WARN_ON_ONCE(1); +	return 0;  }  #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -1329,7 +1337,9 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)  }  #endif -struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req) +struct sock *mptcp_sk_clone(const struct sock *sk, +			    const struct mptcp_options_received *mp_opt, +			    struct request_sock *req)  {  	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);  	struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); @@ -1352,26 +1362,30 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)  	msk->subflow = NULL;  	if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) { +		nsk->sk_state = TCP_CLOSE;  		bh_unlock_sock(nsk);  		/* we can't call into mptcp_close() here - possible BH context -		 * free the sock directly +		 * free the sock directly. +		 * sk_clone_lock() sets nsk refcnt to two, hence call sk_free() +		 * too.  		 */ -		nsk->sk_prot->destroy(nsk); +		sk_common_release(nsk);  		sk_free(nsk);  		return NULL;  	}  	msk->write_seq = subflow_req->idsn + 1;  	atomic64_set(&msk->snd_una, msk->write_seq); -	if (subflow_req->remote_key_valid) { +	if (mp_opt->mp_capable) {  		msk->can_ack = true; -		msk->remote_key = subflow_req->remote_key; +		msk->remote_key = mp_opt->sndr_key;  		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);  		ack_seq++;  		msk->ack_seq = ack_seq;  	} +	sock_reset_flag(nsk, SOCK_RCU_FREE);  	/* will be fully established after successful MPC subflow creation */  	inet_sk_state_store(nsk, TCP_SYN_RECV);  	bh_unlock_sock(nsk); @@ -1428,6 +1442,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,  		newsk = new_mptcp_sock;  		mptcp_copy_inaddrs(newsk, ssk);  		list_add(&subflow->node, &msk->conn_list); +		inet_sk_state_store(newsk, TCP_ESTABLISHED);  		bh_unlock_sock(new_mptcp_sock); @@ -1445,6 +1460,7 @@ static void mptcp_destroy(struct sock *sk)  {  	struct mptcp_sock *msk = mptcp_sk(sk); +	mptcp_token_destroy(msk->token);  	if (msk->cached_ext)  		__skb_ext_put(msk->cached_ext); @@ -1467,12 +1483,11 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname,  	 */  	lock_sock(sk);  	ssock = __mptcp_tcp_fallback(msk); +	release_sock(sk);  	if (ssock)  		return tcp_setsockopt(ssock->sk, level, optname, optval,  				      optlen); -	release_sock(sk); -  	return -EOPNOTSUPP;  } @@ -1492,12 +1507,11 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,  	 */  	lock_sock(sk);  	ssock = __mptcp_tcp_fallback(msk); +	release_sock(sk);  	if (ssock)  		return tcp_getsockopt(ssock->sk, level, optname, optval,  				      option); -	release_sock(sk); -  	return -EOPNOTSUPP;  } @@ -1613,20 +1627,30 @@ bool mptcp_finish_join(struct sock *sk)  	if (!msk->pm.server_side)  		return true; -	/* passive connection, attach to msk socket */ +	if (!mptcp_pm_allow_new_subflow(msk)) +		return false; + +	/* active connections are already on conn_list, and we can't acquire +	 * msk lock here. 
+	 * use the join list lock as synchronization point and double-check +	 * msk status to avoid racing with mptcp_close() +	 */ +	spin_lock_bh(&msk->join_list_lock); +	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; +	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) +		list_add_tail(&subflow->node, &msk->join_list); +	spin_unlock_bh(&msk->join_list_lock); +	if (!ret) +		return false; + +	/* attach to msk socket only after we are sure it will deal with us +	 * at close time +	 */  	parent_sock = READ_ONCE(parent->sk_socket);  	if (parent_sock && !sk->sk_socket)  		mptcp_sock_graft(sk, parent_sock); - -	ret = mptcp_pm_allow_new_subflow(msk); -	if (ret) { -		/* active connections are already on conn_list */ -		spin_lock_bh(&msk->join_list_lock); -		if (!WARN_ON_ONCE(!list_empty(&subflow->node))) -			list_add_tail(&subflow->node, &msk->join_list); -		spin_unlock_bh(&msk->join_list_lock); -	} -	return ret; +	subflow->map_seq = msk->ack_seq; +	return true;  }  bool mptcp_sk_is_subflow(const struct sock *sk) @@ -1700,6 +1724,14 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,  	int err;  	lock_sock(sock->sk); +	if (sock->state != SS_UNCONNECTED && msk->subflow) { +		/* pending connection or invalid state, let existing subflow +		 * cope with that +		 */ +		ssock = msk->subflow; +		goto do_connect; +	} +  	ssock = __mptcp_socket_create(msk, TCP_SYN_SENT);  	if (IS_ERR(ssock)) {  		err = PTR_ERR(ssock); @@ -1714,9 +1746,17 @@  		mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0;  #endif +do_connect:  	err = ssock->ops->connect(ssock, uaddr, addr_len, flags); -	inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); -	mptcp_copy_inaddrs(sock->sk, ssock->sk); +	sock->state = ssock->state; + +	/* on successful connect, the msk state will be moved to established by +	 * subflow_finish_connect() +	 */ +	if (!err || err == EINPROGRESS) +		mptcp_copy_inaddrs(sock->sk, ssock->sk); +	else +		inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));  unlock:  	release_sock(sock->sk); @@ -1774,6 +1814,8 @@ static int mptcp_listen(struct socket *sock, int backlog)  		goto unlock;  	} +	sock_set_flag(sock->sk, SOCK_RCU_FREE); +  	err = ssock->ops->listen(ssock, backlog);  	inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));  	if (!err) diff --git a/net/mptcp/protocol.h index 67448002a2d7..d0803dfb8108 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -81,7 +81,6 @@  /* MPTCP ADD_ADDR flags */  #define MPTCP_ADDR_ECHO		BIT(0) -#define MPTCP_ADDR_HMAC_LEN	20  #define MPTCP_ADDR_IPVERSION_4	4  #define MPTCP_ADDR_IPVERSION_6	6 @@ -91,6 +90,45 @@  #define MPTCP_WORK_RTX		2  #define MPTCP_WORK_EOF		3 +struct mptcp_options_received { +	u64	sndr_key; +	u64	rcvr_key; +	u64	data_ack; +	u64	data_seq; +	u32	subflow_seq; +	u16	data_len; +	u16	mp_capable : 1, +		mp_join : 1, +		dss : 1, +		add_addr : 1, +		rm_addr : 1, +		family : 4, +		echo : 1, +		backup : 1; +	u32	token; +	u32	nonce; +	u64	thmac; +	u8	hmac[20]; +	u8	join_id; +	u8	use_map:1, +		dsn64:1, +		data_fin:1, +		use_ack:1, +		ack64:1, +		mpc_map:1, +		__unused:2; +	u8	addr_id; +	u8	rm_id; +	union { +		struct in_addr	addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +		struct in6_addr	addr6; +#endif +	}; +	u64	ahmac; +	u16	port; +}; +  static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)  {  	return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) |
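mptcp_finish_join() above uses a classic double-check-under-lock: take the join list lock, re-validate the parent state under it, and only then link the subflow, so a concurrent mptcp_close() splicing the join list cannot miss a late join. A compilable toy model of that pattern follows; struct parent, finish_join() and close_parent() are invented names, and a pthread mutex stands in for the bh spinlock.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct parent {
	pthread_mutex_t join_lock;
	bool established;	/* stands in for TCP_ESTABLISHED */
	int join_count;		/* stands in for the join_list */
};

static bool finish_join(struct parent *p)
{
	bool ok;

	pthread_mutex_lock(&p->join_lock);
	ok = p->established;		/* double-check under the lock */
	if (ok)
		p->join_count++;	/* list_add_tail() equivalent */
	pthread_mutex_unlock(&p->join_lock);
	return ok;
}

static void close_parent(struct parent *p)
{
	pthread_mutex_lock(&p->join_lock);
	p->established = false;		/* joins after this point fail */
	/* splicing/cleanup of the joined entries would happen here */
	pthread_mutex_unlock(&p->join_lock);
}

int main(void)
{
	struct parent p = { PTHREAD_MUTEX_INITIALIZER, true, 0 };

	printf("join ok: %d\n", finish_join(&p));	/* 1 */
	close_parent(&p);
	printf("join ok: %d\n", finish_join(&p));	/* 0 */
	return 0;
}

@@ -206,12 +244,10 @@ struct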
mptcp_subflow_request_sock {  	struct	tcp_request_sock sk;  	u16	mp_capable : 1,  		mp_join : 1, -		backup : 1, -		remote_key_valid : 1; +		backup : 1;  	u8	local_id;  	u8	remote_id;  	u64	local_key; -	u64	remote_key;  	u64	idsn;  	u32	token;  	u32	ssn_offset; @@ -332,9 +368,11 @@ void mptcp_proto_init(void);  int mptcp_proto_v6_init(void);  #endif -struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req); +struct sock *mptcp_sk_clone(const struct sock *sk, +			    const struct mptcp_options_received *mp_opt, +			    struct request_sock *req);  void mptcp_get_options(const struct sk_buff *skb, -		       struct tcp_options_received *opt_rx); +		       struct mptcp_options_received *mp_opt);  void mptcp_finish_connect(struct sock *sk);  void mptcp_data_ready(struct sock *sk, struct sock *ssk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 50a8bea987c6..8968b2c065e7 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -10,6 +10,7 @@  #include <linux/module.h>  #include <linux/netdevice.h>  #include <crypto/algapi.h> +#include <crypto/sha.h>  #include <net/sock.h>  #include <net/inet_common.h>  #include <net/inet_hashtables.h> @@ -89,7 +90,7 @@ static bool subflow_token_join_request(struct request_sock *req,  				       const struct sk_buff *skb)  {  	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); -	u8 hmac[MPTCPOPT_HMAC_LEN]; +	u8 hmac[SHA256_DIGEST_SIZE];  	struct mptcp_sock *msk;  	int local_id; @@ -124,16 +125,14 @@ static void subflow_init_req(struct request_sock *req,  {  	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);  	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); -	struct tcp_options_received rx_opt; +	struct mptcp_options_received mp_opt;  	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); -	memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp)); -	mptcp_get_options(skb, &rx_opt); +	mptcp_get_options(skb, &mp_opt);  	subflow_req->mp_capable = 0;  	subflow_req->mp_join = 0; -	subflow_req->remote_key_valid = 0;  #ifdef CONFIG_TCP_MD5SIG  	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -143,16 +142,16 @@ static void subflow_init_req(struct request_sock *req,  		return;  #endif -	if (rx_opt.mptcp.mp_capable) { +	if (mp_opt.mp_capable) {  		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); -		if (rx_opt.mptcp.mp_join) +		if (mp_opt.mp_join)  			return; -	} else if (rx_opt.mptcp.mp_join) { +	} else if (mp_opt.mp_join) {  		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);  	} -	if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { +	if (mp_opt.mp_capable && listener->request_mptcp) {  		int err;  		err = mptcp_token_new_request(req); @@ -160,13 +159,13 @@ static void subflow_init_req(struct request_sock *req,  			subflow_req->mp_capable = 1;  		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; -	} else if (rx_opt.mptcp.mp_join && listener->request_mptcp) { +	} else if (mp_opt.mp_join && listener->request_mptcp) {  		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;  		subflow_req->mp_join = 1; -		subflow_req->backup = rx_opt.mptcp.backup; -		subflow_req->remote_id = rx_opt.mptcp.join_id; -		subflow_req->token = rx_opt.mptcp.token; -		subflow_req->remote_nonce = rx_opt.mptcp.nonce; +		subflow_req->backup = mp_opt.backup; +		subflow_req->remote_id = mp_opt.join_id; +		subflow_req->token = mp_opt.token; +		subflow_req->remote_nonce = mp_opt.nonce;  		pr_debug("token=%u, remote_nonce=%u", subflow_req->token,  			 
subflow_req->remote_nonce);  		if (!subflow_token_join_request(req, skb)) { @@ -203,7 +202,7 @@ static void subflow_v6_init_req(struct request_sock *req,  /* validate received truncated hmac and create hmac for third ACK */  static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)  { -	u8 hmac[MPTCPOPT_HMAC_LEN]; +	u8 hmac[SHA256_DIGEST_SIZE];  	u64 thmac;  	subflow_generate_hmac(subflow->remote_key, subflow->local_key, @@ -222,29 +221,55 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)  static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)  {  	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); +	struct mptcp_options_received mp_opt;  	struct sock *parent = subflow->conn; +	struct tcp_sock *tp = tcp_sk(sk);  	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); -	if (inet_sk_state_load(parent) != TCP_ESTABLISHED) { +	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {  		inet_sk_state_store(parent, TCP_ESTABLISHED);  		parent->sk_state_change(parent);  	} -	if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp) +	/* be sure no special action on any packet other than syn-ack */ +	if (subflow->conn_finished) +		return; + +	subflow->conn_finished = 1; + +	mptcp_get_options(skb, &mp_opt); +	if (subflow->request_mptcp && mp_opt.mp_capable) { +		subflow->mp_capable = 1; +		subflow->can_ack = 1; +		subflow->remote_key = mp_opt.sndr_key; +		pr_debug("subflow=%p, remote_key=%llu", subflow, +			 subflow->remote_key); +	} else if (subflow->request_join && mp_opt.mp_join) { +		subflow->mp_join = 1; +		subflow->thmac = mp_opt.thmac; +		subflow->remote_nonce = mp_opt.nonce; +		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, +			 subflow->thmac, subflow->remote_nonce); +	} else if (subflow->request_mptcp) { +		tp->is_mptcp = 0; +	} + +	if (!tp->is_mptcp)  		return;  	if (subflow->mp_capable) {  		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),  			 subflow->remote_key);  		mptcp_finish_connect(sk); -		subflow->conn_finished = 1;  		if (skb) {  			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);  			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;  		}  	} else if (subflow->mp_join) { +		u8 hmac[SHA256_DIGEST_SIZE]; +  		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",  			 subflow, subflow->thmac,  			 subflow->remote_nonce); @@ -257,7 +282,9 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)  		subflow_generate_hmac(subflow->local_key, subflow->remote_key,  				      subflow->local_nonce,  				      subflow->remote_nonce, -				      subflow->hmac); +				      hmac); + +		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);  		if (skb)  			subflow->ssn_offset = TCP_SKB_CB(skb)->seq; @@ -265,7 +292,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)  		if (!mptcp_finish_join(sk))  			goto do_reset; -		subflow->conn_finished = 1;  		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);  	} else {  do_reset: @@ -323,10 +349,10 @@ drop:  /* validate hmac received in third ACK */  static bool subflow_hmac_valid(const struct request_sock *req, -			       const struct tcp_options_received *rx_opt) +			       const struct mptcp_options_received *mp_opt)  {  	const struct mptcp_subflow_request_sock *subflow_req; -	u8 hmac[MPTCPOPT_HMAC_LEN]; +	u8 hmac[SHA256_DIGEST_SIZE];  	struct mptcp_sock *msk;  	bool ret; @@ -340,13 +366,53 @@ static bool subflow_hmac_valid(const struct request_sock *req,  			      subflow_req->local_nonce, hmac);  	ret = true; -	if 
(crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac))) +	if (crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN))  		ret = false;  	sock_put((struct sock *)msk);  	return ret;  } +static void mptcp_sock_destruct(struct sock *sk) +{ +	/* if new mptcp socket isn't accepted, it is free'd +	 * from the tcp listener sockets request queue, linked +	 * from req->sk.  The tcp socket is released. +	 * This calls the ULP release function which will +	 * also remove the mptcp socket, via +	 * sock_put(ctx->conn). +	 * +	 * Problem is that the mptcp socket will not be in +	 * SYN_RECV state and doesn't have SOCK_DEAD flag. +	 * Both result in warnings from inet_sock_destruct. +	 */ + +	if (sk->sk_state == TCP_SYN_RECV) { +		sk->sk_state = TCP_CLOSE; +		WARN_ON_ONCE(sk->sk_socket); +		sock_orphan(sk); +	} + +	inet_sock_destruct(sk); +} + +static void mptcp_force_close(struct sock *sk) +{ +	inet_sk_state_store(sk, TCP_CLOSE); +	sk_common_release(sk); +} + +static void subflow_ulp_fallback(struct sock *sk, +				 struct mptcp_subflow_context *old_ctx) +{ +	struct inet_connection_sock *icsk = inet_csk(sk); + +	mptcp_subflow_tcp_fallback(sk, old_ctx); +	icsk->icsk_ulp_ops = NULL; +	rcu_assign_pointer(icsk->icsk_ulp_data, NULL); +	tcp_sk(sk)->is_mptcp = 0; +} +  static struct sock *subflow_syn_recv_sock(const struct sock *sk,  					  struct sk_buff *skb,  					  struct request_sock *req, @@ -356,13 +422,18 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,  {  	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);  	struct mptcp_subflow_request_sock *subflow_req; -	struct tcp_options_received opt_rx; +	struct mptcp_options_received mp_opt;  	bool fallback_is_fatal = false;  	struct sock *new_msk = NULL; +	bool fallback = false;  	struct sock *child;  	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); +	/* we need later a valid 'mp_capable' value even when options are not +	 * parsed +	 */ +	mp_opt.mp_capable = 0;  	if (tcp_rsk(req)->is_mptcp == 0)  		goto create_child; @@ -377,26 +448,21 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,  			goto create_msk;  		} -		opt_rx.mptcp.mp_capable = 0; -		mptcp_get_options(skb, &opt_rx); -		if (opt_rx.mptcp.mp_capable) { -			subflow_req->remote_key = opt_rx.mptcp.sndr_key; -			subflow_req->remote_key_valid = 1; -		} else { -			subflow_req->mp_capable = 0; +		mptcp_get_options(skb, &mp_opt); +		if (!mp_opt.mp_capable) { +			fallback = true;  			goto create_child;  		}  create_msk: -		new_msk = mptcp_sk_clone(listener->conn, req); +		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);  		if (!new_msk) -			subflow_req->mp_capable = 0; +			fallback = true;  	} else if (subflow_req->mp_join) {  		fallback_is_fatal = true; -		opt_rx.mptcp.mp_join = 0; -		mptcp_get_options(skb, &opt_rx); -		if (!opt_rx.mptcp.mp_join || -		    !subflow_hmac_valid(req, &opt_rx)) { +		mptcp_get_options(skb, &mp_opt); +		if (!mp_opt.mp_join || +		    !subflow_hmac_valid(req, &mp_opt)) {  			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);  			return NULL;  		} @@ -409,12 +475,18 @@ create_child:  	if (child && *own_req) {  		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); -		/* we have null ctx on TCP fallback, which is fatal on -		 * MPJ handshake +		/* we need to fallback on ctx allocation failure and on pre-reqs +		 * checking above. In the latter scenario we additionally need +		 * to reset the context to non MPTCP status.  		 
*/ -		if (!ctx) { +		if (!ctx || fallback) {  			if (fallback_is_fatal)  				goto close_child; + +			if (ctx) { +				subflow_ulp_fallback(child, ctx); +				kfree_rcu(ctx, rcu); +			}  			goto out;  		} @@ -422,10 +494,17 @@ create_child:  			/* new mpc subflow takes ownership of the newly  			 * created mptcp socket  			 */ -			inet_sk_state_store(new_msk, TCP_ESTABLISHED); +			new_msk->sk_destruct = mptcp_sock_destruct;  			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);  			ctx->conn = new_msk;  			new_msk = NULL; + +			/* with OoO packets we can reach here without ingress +			 * mpc option +			 */ +			ctx->remote_key = mp_opt.sndr_key; +			ctx->fully_established = mp_opt.mp_capable; +			ctx->can_ack = mp_opt.mp_capable;  		} else if (ctx->mp_join) {  			struct mptcp_sock *owner; @@ -444,7 +523,14 @@ create_child:  out:  	/* dispose of the left over mptcp master, if any */  	if (unlikely(new_msk)) -		sock_put(new_msk); +		mptcp_force_close(new_msk); + +	/* check for expected invariant - should never trigger, just help +	 * catching earlier subtle bugs +	 */ +	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && +		     (!mptcp_subflow_ctx(child) || +		      !mptcp_subflow_ctx(child)->conn));  	return child;  close_child: @@ -931,6 +1017,16 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)  	if (err)  		return err; +	/* the newly created socket really belongs to the owning MPTCP master +	 * socket, even if for additional subflows the allocation is performed +	 * by a kernel workqueue. Adjust inode references, so that the +	 * procfs/diag interfaces really show this one belonging to the correct +	 * user. +	 */ +	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; +	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid; +	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; +  	subflow = mptcp_subflow_ctx(sf->sk);  	pr_debug("subflow=%p", subflow); @@ -1047,17 +1143,6 @@ static void subflow_ulp_release(struct sock *sk)  	kfree_rcu(ctx, rcu);  } -static void subflow_ulp_fallback(struct sock *sk, -				 struct mptcp_subflow_context *old_ctx) -{ -	struct inet_connection_sock *icsk = inet_csk(sk); - -	mptcp_subflow_tcp_fallback(sk, old_ctx); -	icsk->icsk_ulp_ops = NULL; -	rcu_assign_pointer(icsk->icsk_ulp_data, NULL); -	tcp_sk(sk)->is_mptcp = 0; -} -  static void subflow_ulp_clone(const struct request_sock *req,  			      struct sock *newsk,  			      const gfp_t priority) @@ -1091,9 +1176,6 @@ static void subflow_ulp_clone(const struct request_sock *req,  		 * is fully established only after we receive the remote key  		 */  		new_ctx->mp_capable = 1; -		new_ctx->fully_established = subflow_req->remote_key_valid; -		new_ctx->can_ack = subflow_req->remote_key_valid; -		new_ctx->remote_key = subflow_req->remote_key;  		new_ctx->local_key = subflow_req->local_key;  		new_ctx->token = subflow_req->token;  		new_ctx->ssn_offset = subflow_req->ssn_offset;
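The comment in mptcp_subflow_create_socket() above explains why i_ino/i_uid/i_gid are copied from the owning socket's inode: subflows created from a kernel workqueue should still be attributed to the owning user in procfs/diag. A compilable userspace sketch of that ownership adoption follows; struct toy_inode and adopt_ownership() are invented stand-ins, not the kernel's SOCK_INODE() API.

#include <stdio.h>
#include <sys/types.h>

/* Simplified stand-ins for the inode fields the kernel copies. */
struct toy_inode {
	unsigned long i_ino;
	uid_t i_uid;
	gid_t i_gid;
};

/* Make the subflow's inode report the owner's identity, mirroring the
 * three assignments in the hunk above. */
static void adopt_ownership(struct toy_inode *sf, const struct toy_inode *owner)
{
	sf->i_ino = owner->i_ino;
	sf->i_uid = owner->i_uid;
	sf->i_gid = owner->i_gid;
}

int main(void)
{
	struct toy_inode owner = { 4242, 1000, 1000 };
	struct toy_inode subflow = { 9999, 0, 0 };

	adopt_ownership(&subflow, &owner);
	printf("ino=%lu uid=%u gid=%u\n", subflow.i_ino,
	       (unsigned)subflow.i_uid, (unsigned)subflow.i_gid);
	return 0;
}

diff --git a/net/netfilter/ipset/ip_set_core.c index 8dd17589217d..340cb955af25 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -86,7 +86,8 @@ find_set_type(const char *name, u8 family, u8 revision)  {  	struct ip_set_type *type; -	list_for_each_entry_rcu(type, &ip_set_type_list, list) +	list_for_each_entry_rcu(type, &ip_set_type_list, list, +				lockdep_is_held(&ip_set_type_mutex))  		if (STRNCMP(type->name, name) &&  		    (type->family == family ||  		     type->family == NFPROTO_UNSPEC) && diff --git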
a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index cd747c0962fd..5a67f7966574 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -59,7 +59,7 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb,  	/* Don't lookup sub-counters at all */  	opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS;  	if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) -		opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; +		opt->cmdflags |= IPSET_FLAG_SKIP_COUNTER_UPDATE;  	list_for_each_entry_rcu(e, &map->members, list) {  		ret = ip_set_test(e->id, skb, par, opt);  		if (ret <= 0) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c4582eb71766..bb72ca5f3999 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1519,9 +1519,9 @@ __nf_conntrack_alloc(struct net *net,  	ct->status = 0;  	ct->timeout = 0;  	write_pnet(&ct->ct_net, net); -	memset(&ct->__nfct_init_offset[0], 0, +	memset(&ct->__nfct_init_offset, 0,  	       offsetof(struct nf_conn, proto) - -	       offsetof(struct nf_conn, __nfct_init_offset[0])); +	       offsetof(struct nf_conn, __nfct_init_offset));  	nf_ct_zone_add(ct, zone); @@ -2016,22 +2016,18 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)  	nf_conntrack_get(skb_nfct(nskb));  } -static int nf_conntrack_update(struct net *net, struct sk_buff *skb) +static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, +				 struct nf_conn *ct, +				 enum ip_conntrack_info ctinfo)  {  	struct nf_conntrack_tuple_hash *h;  	struct nf_conntrack_tuple tuple; -	enum ip_conntrack_info ctinfo;  	struct nf_nat_hook *nat_hook;  	unsigned int status; -	struct nf_conn *ct;  	int dataoff;  	u16 l3num;  	u8 l4num; -	ct = nf_ct_get(skb, &ctinfo); -	if (!ct || nf_ct_is_confirmed(ct)) -		return 0; -  	l3num = nf_ct_l3num(ct);  	dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); @@ -2088,6 +2084,76 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)  	return 0;  } +/* This packet is coming from userspace via nf_queue, complete the packet + * processing after the helper invocation in nf_confirm(). + */ +static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, +			       enum ip_conntrack_info ctinfo) +{ +	const struct nf_conntrack_helper *helper; +	const struct nf_conn_help *help; +	int protoff; + +	help = nfct_help(ct); +	if (!help) +		return 0; + +	helper = rcu_dereference(help->helper); +	if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) +		return 0; + +	switch (nf_ct_l3num(ct)) { +	case NFPROTO_IPV4: +		protoff = skb_network_offset(skb) + ip_hdrlen(skb); +		break; +#if IS_ENABLED(CONFIG_IPV6) +	case NFPROTO_IPV6: { +		__be16 frag_off; +		u8 pnum; + +		pnum = ipv6_hdr(skb)->nexthdr; +		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, +					   &frag_off); +		if (protoff < 0 || (frag_off & htons(~0x7)) != 0) +			return 0; +		break; +	} +#endif +	default: +		return 0; +	} + +	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && +	    !nf_is_loopback_packet(skb)) { +		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { +			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); +			return -1; +		} +	} + +	/* We've seen it coming out the other side: confirm it */ +	return nf_conntrack_confirm(skb) == NF_DROP ? 
-1 : 0; +} + +static int nf_conntrack_update(struct net *net, struct sk_buff *skb) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct; +	int err; + +	ct = nf_ct_get(skb, &ctinfo); +	if (!ct) +		return 0; + +	if (!nf_ct_is_confirmed(ct)) { +		err = __nf_conntrack_update(net, skb, ct, ctinfo); +		if (err < 0) +			return err; +	} + +	return nf_confirm_cthelper(skb, ct, ctinfo); +} +  static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,  				       const struct sk_buff *skb)  { @@ -2139,8 +2205,19 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),  		nf_conntrack_lock(lockp);  		if (*bucket < nf_conntrack_htable_size) {  			hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { -				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) +				if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)  					continue; +				/* All nf_conn objects are added to the hash table twice: +				 * once for the original direction tuple, once for the reply tuple. +				 * +				 * Exception: In the IPS_NAT_CLASH case, only the reply +				 * tuple is added (the original tuple already existed for +				 * a different object). +				 * +				 * We only need to call the iterator once for each +				 * conntrack, so we just use the 'reply' direction +				 * tuple while iterating. +				 */  				ct = nf_ct_tuplehash_to_ctrack(h);  				if (iter(ct, data))  					goto found; diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index a971183f11af..1f44d523b512 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -72,24 +72,32 @@ EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);  #if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)  /* PptpControlMessageType names */ -const char *const pptp_msg_name[] = { -	"UNKNOWN_MESSAGE", -	"START_SESSION_REQUEST", -	"START_SESSION_REPLY", -	"STOP_SESSION_REQUEST", -	"STOP_SESSION_REPLY", -	"ECHO_REQUEST", -	"ECHO_REPLY", -	"OUT_CALL_REQUEST", -	"OUT_CALL_REPLY", -	"IN_CALL_REQUEST", -	"IN_CALL_REPLY", -	"IN_CALL_CONNECT", -	"CALL_CLEAR_REQUEST", -	"CALL_DISCONNECT_NOTIFY", -	"WAN_ERROR_NOTIFY", -	"SET_LINK_INFO" +static const char *const pptp_msg_name_array[PPTP_MSG_MAX + 1] = { +	[0]				= "UNKNOWN_MESSAGE", +	[PPTP_START_SESSION_REQUEST]	= "START_SESSION_REQUEST", +	[PPTP_START_SESSION_REPLY]	= "START_SESSION_REPLY", +	[PPTP_STOP_SESSION_REQUEST]	= "STOP_SESSION_REQUEST", +	[PPTP_STOP_SESSION_REPLY]	= "STOP_SESSION_REPLY", +	[PPTP_ECHO_REQUEST]		= "ECHO_REQUEST", +	[PPTP_ECHO_REPLY]		= "ECHO_REPLY", +	[PPTP_OUT_CALL_REQUEST]		= "OUT_CALL_REQUEST", +	[PPTP_OUT_CALL_REPLY]		= "OUT_CALL_REPLY", +	[PPTP_IN_CALL_REQUEST]		= "IN_CALL_REQUEST", +	[PPTP_IN_CALL_REPLY]		= "IN_CALL_REPLY", +	[PPTP_IN_CALL_CONNECT]		= "IN_CALL_CONNECT", +	[PPTP_CALL_CLEAR_REQUEST]	= "CALL_CLEAR_REQUEST", +	[PPTP_CALL_DISCONNECT_NOTIFY]	= "CALL_DISCONNECT_NOTIFY", +	[PPTP_WAN_ERROR_NOTIFY]		= "WAN_ERROR_NOTIFY", +	[PPTP_SET_LINK_INFO]		= "SET_LINK_INFO"  }; + +const char *pptp_msg_name(u_int16_t msg) +{ +	if (msg > PPTP_MSG_MAX) +		return pptp_msg_name_array[0]; + +	return pptp_msg_name_array[msg]; +}  EXPORT_SYMBOL(pptp_msg_name);  #endif
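Aside: the pptp_msg_name() accessor above replaces every open-coded bounds check of the form "msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0]" with a single clamped table lookup. A standalone userspace sketch of the same guarded-lookup pattern (toy message ids, not the PPTP ones):

#include <stdio.h>

#define MSG_MAX 3

static const char *const msg_name[MSG_MAX + 1] = {
	[0] = "UNKNOWN_MESSAGE",	/* slot 0 doubles as the fallback */
	[1] = "START_REQUEST",
	[2] = "START_REPLY",
	[3] = "ECHO_REQUEST",
};

static const char *name_of(unsigned int msg)
{
	/* an out-of-range id maps to the sentinel instead of reading
	 * past the array, which is what the per-call-site checks used
	 * to guard against
	 */
	return msg > MSG_MAX ? msg_name[0] : msg_name[msg];
}

int main(void)
{
	printf("%s / %s\n", name_of(2), name_of(42));	/* START_REPLY / UNKNOWN_MESSAGE */
	return 0;
}

@@ -276,7 +284,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,  	typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;  	msg = ntohs(ctlh->messageType); -	pr_debug("inbound control message %s\n", pptp_msg_name[msg]); +	pr_debug("inbound control message %s\n", pptp_msg_name(msg));  	switch (msg) {  	case PPTP_START_SESSION_REPLY: @@ -311,7 +319,7 @@ pptp_inbound_pkt(struct 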
sk_buff *skb, unsigned int protoff,  		pcid = pptpReq->ocack.peersCallID;  		if (info->pns_call_id != pcid)  			goto invalid; -		pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], +		pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name(msg),  			 ntohs(cid), ntohs(pcid));  		if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) { @@ -328,7 +336,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,  			goto invalid;  		cid = pptpReq->icreq.callID; -		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); +		pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid));  		info->cstate = PPTP_CALL_IN_REQ;  		info->pac_call_id = cid;  		break; @@ -347,7 +355,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,  		if (info->pns_call_id != pcid)  			goto invalid; -		pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid)); +		pr_debug("%s, PCID=%X\n", pptp_msg_name(msg), ntohs(pcid));  		info->cstate = PPTP_CALL_IN_CONF;  		/* we expect a GRE connection from PAC to PNS */ @@ -357,7 +365,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,  	case PPTP_CALL_DISCONNECT_NOTIFY:  		/* server confirms disconnect */  		cid = pptpReq->disc.callID; -		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); +		pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid));  		info->cstate = PPTP_CALL_NONE;  		/* untrack this call id, unexpect GRE packets */ @@ -384,7 +392,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,  invalid:  	pr_debug("invalid %s: type=%d cid=%u pcid=%u "  		 "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", -		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], +		 pptp_msg_name(msg),  		 msg, ntohs(cid), ntohs(pcid),  info->cstate, info->sstate,  		 ntohs(info->pns_call_id), ntohs(info->pac_call_id));  	return NF_ACCEPT; @@ -404,7 +412,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,  	typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;  	msg = ntohs(ctlh->messageType); -	pr_debug("outbound control message %s\n", pptp_msg_name[msg]); +	pr_debug("outbound control message %s\n", pptp_msg_name(msg));  	switch (msg) {  	case PPTP_START_SESSION_REQUEST: @@ -426,7 +434,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,  		info->cstate = PPTP_CALL_OUT_REQ;  		/* track PNS call id */  		cid = pptpReq->ocreq.callID; -		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); +		pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid));  		info->pns_call_id = cid;  		break; @@ -440,7 +448,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,  		pcid = pptpReq->icack.peersCallID;  		if (info->pac_call_id != pcid)  			goto invalid; -		pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg], +		pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name(msg),  			 ntohs(cid), ntohs(pcid));  		if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) { @@ -480,7 +488,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,  invalid:  	pr_debug("invalid %s: type=%d cid=%u pcid=%u "  		 "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", -		 msg <= PPTP_MSG_MAX ? 
pptp_msg_name[msg] : pptp_msg_name[0], +		 pptp_msg_name(msg),  		 msg, ntohs(cid), ntohs(pcid),  info->cstate, info->sstate,  		 ntohs(info->pns_call_id), ntohs(info->pac_call_id));  	return NF_ACCEPT; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index c0cb79495c35..42da6e337276 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -284,7 +284,7 @@ static void flow_offload_del(struct nf_flowtable *flow_table,  	if (nf_flow_has_expired(flow))  		flow_offload_fixup_ct(flow->ct); -	else if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) +	else  		flow_offload_fixup_ct_timeout(flow->ct);  	flow_offload_free(flow); @@ -361,8 +361,10 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)  {  	struct nf_flowtable *flow_table = data; -	if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || -	    test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { +	if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct)) +		set_bit(NF_FLOW_TEARDOWN, &flow->flags); + +	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {  		if (test_bit(NF_FLOW_HW, &flow->flags)) {  			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))  				nf_flow_offload_del(flow_table, flow); @@ -421,10 +423,12 @@ void nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table,  	down_write(&flow_table->flow_block_lock);  	block_cb = flow_block_cb_lookup(block, cb, cb_priv); -	if (block_cb) +	if (block_cb) {  		list_del(&block_cb->list); -	else +		flow_block_cb_free(block_cb); +	} else {  		WARN_ON(true); +	}  	up_write(&flow_table->flow_block_lock);  }  EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e3b099c14eff..2276a73ccba2 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -817,6 +817,7 @@ static void flow_offload_work_handler(struct work_struct *work)  			WARN_ON_ONCE(1);  	} +	clear_bit(NF_FLOW_HW_PENDING, &offload->flow->flags);  	kfree(offload);  } @@ -831,9 +832,14 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable,  {  	struct flow_offload_work *offload; +	if (test_and_set_bit(NF_FLOW_HW_PENDING, &flow->flags)) +		return NULL; +  	offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); -	if (!offload) +	if (!offload) { +		clear_bit(NF_FLOW_HW_PENDING, &flow->flags);  		return NULL; +	}  	offload->cmd = cmd;  	offload->flow = flow; @@ -1056,7 +1062,7 @@ static struct flow_indr_block_entry block_ing_entry = {  int nf_flow_table_offload_init(void)  {  	nf_flow_offload_wq  = alloc_workqueue("nf_flow_table_offload", -					      WQ_UNBOUND | WQ_MEM_RECLAIM, 0); +					      WQ_UNBOUND, 0);  	if (!nf_flow_offload_wq)  		return -ENOMEM; diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 64eedc17037a..59151dc07fdc 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -68,15 +68,13 @@ static bool udp_manip_pkt(struct sk_buff *skb,  			  enum nf_nat_manip_type maniptype)  {  	struct udphdr *hdr; -	bool do_csum;  	if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))  		return false;  	hdr = (struct udphdr *)(skb->data + hdroff); -	do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL; +	__udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check); -	__udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, do_csum);  	return true;  } @@ -1035,8 +1033,8 @@ int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops)  	ret = 
nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops,  				 ARRAY_SIZE(nf_nat_ipv4_ops));  	if (ret) -		nf_nat_ipv6_unregister_fn(net, ops); - +		nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, +					ARRAY_SIZE(nf_nat_ipv6_ops));  	return ret;  }  EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4471393da6d8..9780bd93b7e4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3542,6 +3542,7 @@ cont:  			continue;  		if (!strcmp(set->name, i->name)) {  			kfree(set->name); +			set->name = NULL;  			return -ENFILE;  		}  	} @@ -3961,8 +3962,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,  		if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |  			      NFT_SET_INTERVAL | NFT_SET_TIMEOUT |  			      NFT_SET_MAP | NFT_SET_EVAL | -			      NFT_SET_OBJECT)) -			return -EINVAL; +			      NFT_SET_OBJECT | NFT_SET_CONCAT)) +			return -EOPNOTSUPP;  		/* Only one of these operations is supported */  		if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) ==  			     (NFT_SET_MAP | NFT_SET_OBJECT)) @@ -4000,7 +4001,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,  		objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));  		if (objtype == NFT_OBJECT_UNSPEC ||  		    objtype > NFT_OBJECT_MAX) -			return -EINVAL; +			return -EOPNOTSUPP;  	} else if (flags & NFT_SET_OBJECT)  		return -EINVAL;  	else diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index a5f294aa8e4c..5b0d0a77379c 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -103,7 +103,7 @@ nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)  	if (help->helper->data_len == 0)  		return -EINVAL; -	nla_memcpy(help->data, nla_data(attr), sizeof(help->data)); +	nla_memcpy(help->data, attr, sizeof(help->data));  	return 0;  } @@ -240,6 +240,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[],  		ret = -ENOMEM;  		goto err2;  	} +	helper->data_len = size;  	helper->flags |= NF_CT_HELPER_F_USERSPACE;  	memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple)); diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 9f5dea0064ea..916a3c7f9eaf 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -165,12 +165,12 @@ static bool nf_osf_match_one(const struct sk_buff *skb,  static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,  						const struct sk_buff *skb,  						const struct iphdr *ip, -						unsigned char *opts) +						unsigned char *opts, +						struct tcphdr *_tcph)  {  	const struct tcphdr *tcp; -	struct tcphdr _tcph; -	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); +	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph);  	if (!tcp)  		return NULL; @@ -205,10 +205,11 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,  	int fmatch = FMATCH_WRONG;  	struct nf_osf_hdr_ctx ctx;  	const struct tcphdr *tcp; +	struct tcphdr _tcph;  	memset(&ctx, 0, sizeof(ctx)); -	tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); +	tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph);  	if (!tcp)  		return false; @@ -265,10 +266,11 @@ bool nf_osf_find(const struct sk_buff *skb,  	const struct nf_osf_finger *kf;  	struct nf_osf_hdr_ctx ctx;  	const struct tcphdr *tcp; +	struct tcphdr _tcph;  	memset(&ctx, 0, sizeof(ctx)); -	tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); +	tcp = 
nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph);  	if (!tcp)  		return false; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 1e70359d633c..f1363b8aabba 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -29,7 +29,7 @@ void nft_lookup_eval(const struct nft_expr *expr,  {  	const struct nft_lookup *priv = nft_expr_priv(expr);  	const struct nft_set *set = priv->set; -	const struct nft_set_ext *ext; +	const struct nft_set_ext *ext = NULL;  	bool found;  	found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg], @@ -39,11 +39,13 @@ void nft_lookup_eval(const struct nft_expr *expr,  		return;  	} -	if (set->flags & NFT_SET_MAP) -		nft_data_copy(&regs->data[priv->dreg], -			      nft_set_ext_data(ext), set->dlen); +	if (ext) { +		if (set->flags & NFT_SET_MAP) +			nft_data_copy(&regs->data[priv->dreg], +				      nft_set_ext_data(ext), set->dlen); -	nft_set_elem_update_expr(ext, regs, pkt); +		nft_set_elem_update_expr(ext, regs, pkt); +	}  }  static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 32f0fc8be3a4..2a81ea421819 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -81,7 +81,6 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,  	u32 idx, off;  	nft_bitmap_location(set, key, &idx, &off); -	*ext = NULL;  	return nft_bitmap_active(priv->bitmap, idx, off, genmask);  } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 3a5552e14f75..62f416bc0579 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -79,6 +79,10 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set  				parent = rcu_dereference_raw(parent->rb_left);  				continue;  			} + +			if (nft_set_elem_expired(&rbe->ext)) +				return false; +  			if (nft_rbtree_interval_end(rbe)) {  				if (nft_set_is_anonymous(set))  					return false; @@ -94,6 +98,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set  	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&  	    nft_set_elem_active(&interval->ext, genmask) && +	    !nft_set_elem_expired(&interval->ext) &&  	    nft_rbtree_interval_start(interval)) {  		*ext = &interval->ext;  		return true; @@ -154,6 +159,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,  				continue;  			} +			if (nft_set_elem_expired(&rbe->ext)) +				return false; +  			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||  			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==  			    (flags & NFT_SET_ELEM_INTERVAL_END)) { @@ -170,6 +178,7 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,  	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&  	    nft_set_elem_active(&interval->ext, genmask) && +	    !nft_set_elem_expired(&interval->ext) &&  	    ((!nft_rbtree_interval_end(interval) &&  	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||  	     (nft_rbtree_interval_end(interval) && @@ -218,27 +227,26 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,  	/* Detect overlaps as we descend the tree. Set the flag in these cases:  	 * -	 * a1. |__ _ _?  >|__ _ _  (insert start after existing start) -	 * a2. _ _ __>|  ?_ _ __|  (insert end before existing end) -	 * a3. _ _ ___|  ?_ _ _>|  (insert end after existing end) -	 * a4. 
>|__ _ _   _ _ __|  (insert start before existing end) +	 * a1. _ _ __>|  ?_ _ __|  (insert end before existing end) +	 * a2. _ _ ___|  ?_ _ _>|  (insert end after existing end) +	 * a3. _ _ ___? >|_ _ __|  (insert start before existing end)  	 *  	 * and clear it later on, as we eventually reach the points indicated by  	 * '?' above, in the cases described below. We'll always meet these  	 * later, locally, due to tree ordering, and overlaps for the intervals  	 * that are the closest together are always evaluated last.  	 * -	 * b1. |__ _ _!  >|__ _ _  (insert start after existing end) -	 * b2. _ _ __>|  !_ _ __|  (insert end before existing start) -	 * b3. !_____>|            (insert end after existing start) +	 * b1. _ _ __>|  !_ _ __|  (insert end before existing start) +	 * b2. _ _ ___|  !_ _ _>|  (insert end after existing start) +	 * b3. _ _ ___! >|_ _ __|  (insert start after existing end)  	 * -	 * Case a4. resolves to b1.: +	 * Case a3. resolves to b3.:  	 * - if the inserted start element is the leftmost, because the '0'  	 *   element in the tree serves as end element  	 * - otherwise, if an existing end is found. Note that end elements are  	 *   always inserted after corresponding start elements.  	 * -	 * For a new, rightmost pair of elements, we'll hit cases b1. and b3., +	 * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,  	 * in that order.  	 *  	 * The flag is also cleared in two special cases: @@ -262,9 +270,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,  			p = &parent->rb_left;  			if (nft_rbtree_interval_start(new)) { -				overlap = nft_rbtree_interval_start(rbe) && -					  nft_set_elem_active(&rbe->ext, -							      genmask); +				if (nft_rbtree_interval_end(rbe) && +				    nft_set_elem_active(&rbe->ext, genmask)) +					overlap = false;  			} else {  				overlap = nft_rbtree_interval_end(rbe) &&  					  nft_set_elem_active(&rbe->ext, @@ -419,6 +427,8 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,  		if (iter->count < iter->skip)  			goto cont; +		if (nft_set_elem_expired(&rbe->ext)) +			goto cont;  		if (!nft_set_elem_active(&rbe->ext, iter->genmask))  			goto cont; diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 75bd0e5dd312..7b2f359bfce4 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -346,6 +346,9 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par)  	pr_debug("checkentry targinfo%s\n", info->label); +	if (info->send_nl_msg) +		return -EOPNOTSUPP; +  	ret = idletimer_tg_helper((struct idletimer_tg_info *)info);  	if(ret < 0)  	{ diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig index 64280a1d3906..07b03c306f28 100644 --- a/net/netlabel/Kconfig +++ b/net/netlabel/Kconfig @@ -14,6 +14,6 @@ config NETLABEL  	  Documentation/netlabel as well as the NetLabel SourceForge project  	  for configuration tools and additional documentation. -	   * http://netlabel.sf.net +	   * https://github.com/netlabel/netlabel_tools  	  If you are unsure, say N. 
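Aside: the nft_set_rbtree hunks above make lookup, get and walk all treat an expired element as absent, even though it stays linked into the tree until garbage collection reaps it. A standalone sketch of that convention, with a flat array standing in for the rbtree and invented toy types:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

struct elem {
	const char *key;
	time_t expires;			/* 0 means "no timeout" */
};

static bool elem_expired(const struct elem *e, time_t now)
{
	return e->expires != 0 && e->expires <= now;
}

static const struct elem *set_lookup(const struct elem *tab, size_t n,
				     const char *key, time_t now)
{
	for (size_t i = 0; i < n; i++) {
		if (strcmp(tab[i].key, key) != 0)
			continue;
		/* present in the structure but past its timeout:
		 * report it as missing, gc will remove it later
		 */
		return elem_expired(&tab[i], now) ? NULL : &tab[i];
	}
	return NULL;
}

int main(void)
{
	struct elem tab[] = { { "a", 0 }, { "b", 1 } };

	printf("%d %d\n", set_lookup(tab, 2, "a", time(NULL)) != NULL,
	       set_lookup(tab, 2, "b", time(NULL)) != NULL);	/* prints: 1 0 */
	return 0;
}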
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 409a3ae47ce2..5e1239cef000 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -734,6 +734,12 @@ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap,  	if ((off & (BITS_PER_LONG - 1)) != 0)  		return -EINVAL; +	/* a null catmap is equivalent to an empty one */ +	if (!catmap) { +		*offset = (u32)-1; +		return 0; +	} +  	if (off < catmap->startbit) {  		off = catmap->startbit;  		*offset = off; diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 79f12d8c7b86..0891ee02ca4f 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -208,6 +208,7 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic,  		/* refcount initialized at 1 */  		spin_unlock_bh(&nr_node_list_lock); +		nr_neigh_put(nr_neigh);  		return 0;  	}  	nr_node_lock(nr_node); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e726159cfcfa..4340f25fe390 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1895,7 +1895,8 @@ static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)  		struct hlist_head *head = &info->limits[i];  		struct ovs_ct_limit *ct_limit; -		hlist_for_each_entry_rcu(ct_limit, head, hlist_node) +		hlist_for_each_entry_rcu(ct_limit, head, hlist_node, +					 lockdep_ovsl_is_held())  			kfree_rcu(ct_limit, rcu);  	}  	kfree(ovs_net->ct_limit_info->limits); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d8ae541d22a8..94b024534987 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2466,8 +2466,10 @@ static void __net_exit ovs_exit_net(struct net *dnet)  	struct net *net;  	LIST_HEAD(head); -	ovs_ct_exit(dnet);  	ovs_lock(); + +	ovs_ct_exit(dnet); +  	list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)  		__dp_destroy(dp); diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index e7d0fe3f4330..c5b3202a14ca 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -712,6 +712,10 @@ void qrtr_ns_init(void)  		goto err_sock;  	} +	qrtr_ns.workqueue = alloc_workqueue("qrtr_ns_handler", WQ_UNBOUND, 1); +	if (!qrtr_ns.workqueue) +		goto err_sock; +  	qrtr_ns.sock->sk->sk_data_ready = qrtr_ns_data_ready;  	sq.sq_port = QRTR_PORT_CTRL; @@ -720,17 +724,13 @@ void qrtr_ns_init(void)  	ret = kernel_bind(qrtr_ns.sock, (struct sockaddr *)&sq, sizeof(sq));  	if (ret < 0) {  		pr_err("failed to bind to socket\n"); -		goto err_sock; +		goto err_wq;  	}  	qrtr_ns.bcast_sq.sq_family = AF_QIPCRTR;  	qrtr_ns.bcast_sq.sq_node = QRTR_NODE_BCAST;  	qrtr_ns.bcast_sq.sq_port = QRTR_PORT_CTRL; -	qrtr_ns.workqueue = alloc_workqueue("qrtr_ns_handler", WQ_UNBOUND, 1); -	if (!qrtr_ns.workqueue) -		goto err_sock; -  	ret = say_hello(&qrtr_ns.bcast_sq);  	if (ret < 0)  		goto err_wq; diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index e22092e4a783..2d8d6131bc5f 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -854,7 +854,7 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb,  	}  	mutex_unlock(&qrtr_node_lock); -	qrtr_local_enqueue(node, skb, type, from, to); +	qrtr_local_enqueue(NULL, skb, type, from, to);  	return 0;  } @@ -906,20 +906,21 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)  	node = NULL;  	if (addr->sq_node == QRTR_NODE_BCAST) { -		enqueue_fn = qrtr_bcast_enqueue; -		if (addr->sq_port != QRTR_PORT_CTRL) { +		if (addr->sq_port != QRTR_PORT_CTRL && +		    qrtr_local_nid != QRTR_NODE_BCAST) {  			
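			/* Aside on the qrtr_ns_init() hunk above: the workqueue is
			 * now allocated before the socket is bound, so the error
			 * path can release resources in strict reverse order of
			 * acquisition. A hypothetical sketch of that convention
			 * (names invented, not code from this patch):
			 *
			 *	wq = alloc_workqueue("ns_handler", WQ_UNBOUND, 1);
			 *	if (!wq)
			 *		goto err_sock;	// only the socket exists so far
			 *	if (kernel_bind(sock, ...) < 0)
			 *		goto err_wq;	// the workqueue exists by now
			 *	...
			 * err_wq:
			 *	destroy_workqueue(wq);
			 * err_sock:
			 *	sock_release(sock);
			 */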
release_sock(sk);  			return -ENOTCONN;  		} +		enqueue_fn = qrtr_bcast_enqueue;  	} else if (addr->sq_node == ipc->us.sq_node) {  		enqueue_fn = qrtr_local_enqueue;  	} else { -		enqueue_fn = qrtr_node_enqueue;  		node = qrtr_node_lookup(addr->sq_node);  		if (!node) {  			release_sock(sk);  			return -ECONNRESET;  		} +		enqueue_fn = qrtr_node_enqueue;  	}  	plen = (len + 3) & ~3; diff --git a/net/rds/message.c b/net/rds/message.c index 50f13f1d4ae0..071a261fdaab 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -1,5 +1,5 @@  /* - * Copyright (c) 2006 Oracle.  All rights reserved. + * Copyright (c) 2006, 2020 Oracle and/or its affiliates.   *   * This software is available to you under a choice of one of two   * licenses.  You may choose to be licensed under the terms of the GNU @@ -162,12 +162,12 @@ static void rds_message_purge(struct rds_message *rm)  	if (rm->rdma.op_active)  		rds_rdma_free_op(&rm->rdma);  	if (rm->rdma.op_rdma_mr) -		rds_mr_put(rm->rdma.op_rdma_mr); +		kref_put(&rm->rdma.op_rdma_mr->r_kref, __rds_put_mr_final);  	if (rm->atomic.op_active)  		rds_atomic_free_op(&rm->atomic);  	if (rm->atomic.op_rdma_mr) -		rds_mr_put(rm->atomic.op_rdma_mr); +		kref_put(&rm->atomic.op_rdma_mr->r_kref, __rds_put_mr_final);  }  void rds_message_put(struct rds_message *rm) @@ -308,26 +308,20 @@ out:  /*   * RDS ops use this to grab SG entries from the rm's sg pool.   */ -struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents, -					  int *ret) +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)  {  	struct scatterlist *sg_first = (struct scatterlist *) &rm[1];  	struct scatterlist *sg_ret; -	if (WARN_ON(!ret)) -		return NULL; -  	if (nents <= 0) {  		pr_warn("rds: alloc sgs failed! nents <= 0\n"); -		*ret = -EINVAL; -		return NULL; +		return ERR_PTR(-EINVAL);  	}  	if (rm->m_used_sgs + nents > rm->m_total_sgs) {  		pr_warn("rds: alloc sgs failed! total %d used %d nents %d\n",  			rm->m_total_sgs, rm->m_used_sgs, nents); -		*ret = -ENOMEM; -		return NULL; +		return ERR_PTR(-ENOMEM);  	}  	sg_ret = &sg_first[rm->m_used_sgs]; @@ -343,7 +337,6 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in  	unsigned int i;  	int num_sgs = DIV_ROUND_UP(total_len, PAGE_SIZE);  	int extra_bytes = num_sgs * sizeof(struct scatterlist); -	int ret;  	rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);  	if (!rm) @@ -352,10 +345,10 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in  	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);  	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);  	rm->data.op_nents = DIV_ROUND_UP(total_len, PAGE_SIZE); -	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs, &ret); -	if (!rm->data.op_sg) { +	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); +	if (IS_ERR(rm->data.op_sg)) {  		rds_message_put(rm); -		return ERR_PTR(ret); +		return ERR_CAST(rm->data.op_sg);  	}  	for (i = 0; i < rm->data.op_nents; ++i) { diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 585e6b3b69ce..a7ae11846cd7 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -1,5 +1,5 @@  /* - * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2020 Oracle and/or its affiliates.   *   * This software is available to you under a choice of one of two   * licenses.  
You may choose to be licensed under the terms of the GNU @@ -84,7 +84,7 @@ static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,  	if (insert) {  		rb_link_node(&insert->r_rb_node, parent, p);  		rb_insert_color(&insert->r_rb_node, root); -		refcount_inc(&insert->r_refcount); +		kref_get(&insert->r_kref);  	}  	return NULL;  } @@ -99,10 +99,7 @@ static void rds_destroy_mr(struct rds_mr *mr)  	unsigned long flags;  	rdsdebug("RDS: destroy mr key is %x refcnt %u\n", -			mr->r_key, refcount_read(&mr->r_refcount)); - -	if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state)) -		return; +		 mr->r_key, kref_read(&mr->r_kref));  	spin_lock_irqsave(&rs->rs_rdma_lock, flags);  	if (!RB_EMPTY_NODE(&mr->r_rb_node)) @@ -115,8 +112,10 @@ static void rds_destroy_mr(struct rds_mr *mr)  		mr->r_trans->free_mr(trans_private, mr->r_invalidate);  } -void __rds_put_mr_final(struct rds_mr *mr) +void __rds_put_mr_final(struct kref *kref)  { +	struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref); +  	rds_destroy_mr(mr);  	kfree(mr);  } @@ -140,8 +139,7 @@ void rds_rdma_drop_keys(struct rds_sock *rs)  		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);  		RB_CLEAR_NODE(&mr->r_rb_node);  		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); -		rds_destroy_mr(mr); -		rds_mr_put(mr); +		kref_put(&mr->r_kref, __rds_put_mr_final);  		spin_lock_irqsave(&rs->rs_rdma_lock, flags);  	}  	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); @@ -242,7 +240,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,  		goto out;  	} -	refcount_set(&mr->r_refcount, 1); +	kref_init(&mr->r_kref);  	RB_CLEAR_NODE(&mr->r_rb_node);  	mr->r_trans = rs->rs_transport;  	mr->r_sock = rs; @@ -343,7 +341,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,  	rdsdebug("RDS: get_mr key is %x\n", mr->r_key);  	if (mr_ret) { -		refcount_inc(&mr->r_refcount); +		kref_get(&mr->r_kref);  		*mr_ret = mr;  	} @@ -351,7 +349,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,  out:  	kfree(pages);  	if (mr) -		rds_mr_put(mr); +		kref_put(&mr->r_kref, __rds_put_mr_final);  	return ret;  } @@ -434,13 +432,7 @@ int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)  	if (!mr)  		return -EINVAL; -	/* -	 * call rds_destroy_mr() ourselves so that we're sure it's done by the time -	 * we return.  If we let rds_mr_put() do it it might not happen until -	 * someone else drops their ref. -	 */ -	rds_destroy_mr(mr); -	rds_mr_put(mr); +	kref_put(&mr->r_kref, __rds_put_mr_final);  	return 0;  } @@ -464,6 +456,14 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)  		return;  	} +	/* Get a reference so that the MR won't go away before calling +	 * sync_mr() below. +	 */ +	kref_get(&mr->r_kref); + +	/* If it is going to be freed, remove it from the tree now so +	 * that no other thread can find it and free it. +	 */  	if (mr->r_use_once || force) {  		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);  		RB_CLEAR_NODE(&mr->r_rb_node); @@ -477,12 +477,13 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)  	if (mr->r_trans->sync_mr)  		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); +	/* Release the reference held above. */ +	kref_put(&mr->r_kref, __rds_put_mr_final); +  	/* If the MR was marked as invalidate, this will  	 * trigger an async flush. 
*/ -	if (zot_me) { -		rds_destroy_mr(mr); -		rds_mr_put(mr); -	} +	if (zot_me) +		kref_put(&mr->r_kref, __rds_put_mr_final);  }  void rds_rdma_free_op(struct rm_rdma_op *ro) @@ -490,7 +491,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro)  	unsigned int i;  	if (ro->op_odp_mr) { -		rds_mr_put(ro->op_odp_mr); +		kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final);  	} else {  		for (i = 0; i < ro->op_nents; i++) {  			struct page *page = sg_page(&ro->op_sg[i]); @@ -664,9 +665,11 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,  	op->op_odp_mr = NULL;  	WARN_ON(!nr_pages); -	op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret); -	if (!op->op_sg) +	op->op_sg = rds_message_alloc_sgs(rm, nr_pages); +	if (IS_ERR(op->op_sg)) { +		ret = PTR_ERR(op->op_sg);  		goto out_pages; +	}  	if (op->op_notify || op->op_recverr) {  		/* We allocate an uninitialized notifier here, because @@ -730,7 +733,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,  				goto out_pages;  			}  			RB_CLEAR_NODE(&local_odp_mr->r_rb_node); -			refcount_set(&local_odp_mr->r_refcount, 1); +			kref_init(&local_odp_mr->r_kref);  			local_odp_mr->r_trans = rs->rs_transport;  			local_odp_mr->r_sock = rs;  			local_odp_mr->r_trans_private = @@ -827,7 +830,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,  	if (!mr)  		err = -EINVAL;	/* invalid r_key */  	else -		refcount_inc(&mr->r_refcount); +		kref_get(&mr->r_kref);  	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);  	if (mr) { @@ -905,9 +908,11 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,  	rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);  	rm->atomic.op_active = 1;  	rm->atomic.op_recverr = rs->rs_recverr; -	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1, &ret); -	if (!rm->atomic.op_sg) +	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); +	if (IS_ERR(rm->atomic.op_sg)) { +		ret = PTR_ERR(rm->atomic.op_sg);  		goto err; +	}  	/* verify 8 byte-aligned */  	if (args->local_addr & 0x7) { diff --git a/net/rds/rds.h b/net/rds/rds.h index e4a603523083..6019b0c004a9 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -291,7 +291,7 @@ struct rds_incoming {  struct rds_mr {  	struct rb_node		r_rb_node; -	refcount_t		r_refcount; +	struct kref		r_kref;  	u32			r_key;  	/* A copy of the creation flags */ @@ -299,19 +299,11 @@ struct rds_mr {  	unsigned int		r_invalidate:1;  	unsigned int		r_write:1; -	/* This is for RDS_MR_DEAD. -	 * It would be nice & consistent to make this part of the above -	 * bit field here, but we need to use test_and_set_bit. 
-	 */ -	unsigned long		r_state;  	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */  	struct rds_transport	*r_trans;  	void			*r_trans_private;  }; -/* Flags for mr->r_state */ -#define RDS_MR_DEAD		0 -  static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)  {  	return r_key | (((u64) offset) << 32); @@ -852,8 +844,7 @@ rds_conn_connecting(struct rds_connection *conn)  /* message.c */  struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); -struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents, -					  int *ret); +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);  int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,  			       bool zcopy);  struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); @@ -946,12 +937,7 @@ void rds_atomic_send_complete(struct rds_message *rm, int wc_status);  int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,  		    struct cmsghdr *cmsg); -void __rds_put_mr_final(struct rds_mr *mr); -static inline void rds_mr_put(struct rds_mr *mr) -{ -	if (refcount_dec_and_test(&mr->r_refcount)) -		__rds_put_mr_final(mr); -} +void __rds_put_mr_final(struct kref *kref);  static inline bool rds_destroy_pending(struct rds_connection *conn)  { diff --git a/net/rds/send.c b/net/rds/send.c index 82dcd8b84fe7..68e2bdb08fd0 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1274,9 +1274,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)  	/* Attach data to the rm */  	if (payload_len) { -		rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs, &ret); -		if (!rm->data.op_sg) +		rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); +		if (IS_ERR(rm->data.op_sg)) { +			ret = PTR_ERR(rm->data.op_sg);  			goto out; +		}  		ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);  		if (ret)  			goto out; diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 6ffb7e9887ce..ddd0f95713a9 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -25,6 +25,7 @@ rxrpc-y := \  	peer_event.o \  	peer_object.o \  	recvmsg.o \ +	rtt.o \  	security.o \  	sendmsg.o \  	skbuff.o \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 3eb1ab40ca5c..9fe264bec70c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -7,6 +7,7 @@  #include <linux/atomic.h>  #include <linux/seqlock.h> +#include <linux/win_minmax.h>  #include <net/net_namespace.h>  #include <net/netns/generic.h>  #include <net/sock.h> @@ -311,11 +312,14 @@ struct rxrpc_peer {  #define RXRPC_RTT_CACHE_SIZE 32  	spinlock_t		rtt_input_lock;	/* RTT lock for input routine */  	ktime_t			rtt_last_req;	/* Time of last RTT request */ -	u64			rtt;		/* Current RTT estimate (in nS) */ -	u64			rtt_sum;	/* Sum of cache contents */ -	u64			rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */ -	u8			rtt_cursor;	/* next entry at which to insert */ -	u8			rtt_usage;	/* amount of cache actually used */ +	unsigned int		rtt_count;	/* Number of samples we've got */ + +	u32			srtt_us;	/* smoothed round trip time << 3 in usecs */ +	u32			mdev_us;	/* medium deviation			*/ +	u32			mdev_max_us;	/* maximal mdev for the last rtt period	*/ +	u32			rttvar_us;	/* smoothed mdev_max			*/ +	u32			rto_j;		/* Retransmission timeout in jiffies */ +	u8			backoff;	/* Backoff timeout */  	u8			cong_cwnd;	/* Congestion window size */  }; @@ -1041,7 +1045,6 @@ extern unsigned long rxrpc_idle_ack_delay;  extern unsigned int 
rxrpc_rx_window_size;  extern unsigned int rxrpc_rx_mtu;  extern unsigned int rxrpc_rx_jumbo_max; -extern unsigned long rxrpc_resend_timeout;  extern const s8 rxrpc_ack_priority[]; @@ -1069,8 +1072,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *);   * peer_event.c   */  void rxrpc_error_report(struct sock *); -void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, -			rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);  void rxrpc_peer_keepalive_worker(struct work_struct *);  /* @@ -1103,6 +1104,14 @@ void rxrpc_notify_socket(struct rxrpc_call *);  int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);  /* + * rtt.c + */ +void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, +			rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); +unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *, bool); +void rxrpc_peer_init_rtt(struct rxrpc_peer *); + +/*   * rxkad.c   */  #ifdef CONFIG_RXKAD diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 70e44abf106c..b7611cc159e5 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -248,7 +248,7 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb)  	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);  	ktime_t now = skb->tstamp; -	if (call->peer->rtt_usage < 3 || +	if (call->peer->rtt_count < 3 ||  	    ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now))  		rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial,  				  true, true, diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index cedbbb3a7c2e..2a65ac41055f 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -111,8 +111,8 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,  	} else {  		unsigned long now = jiffies, ack_at; -		if (call->peer->rtt_usage > 0) -			ack_at = nsecs_to_jiffies(call->peer->rtt); +		if (call->peer->srtt_us != 0) +			ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3);  		else  			ack_at = expiry; @@ -157,24 +157,18 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call)  static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)  {  	struct sk_buff *skb; -	unsigned long resend_at; +	unsigned long resend_at, rto_j;  	rxrpc_seq_t cursor, seq, top; -	ktime_t now, max_age, oldest, ack_ts, timeout, min_timeo; +	ktime_t now, max_age, oldest, ack_ts;  	int ix;  	u8 annotation, anno_type, retrans = 0, unacked = 0;  	_enter("{%d,%d}", call->tx_hard_ack, call->tx_top); -	if (call->peer->rtt_usage > 1) -		timeout = ns_to_ktime(call->peer->rtt * 3 / 2); -	else -		timeout = ms_to_ktime(rxrpc_resend_timeout); -	min_timeo = ns_to_ktime((1000000000 / HZ) * 4); -	if (ktime_before(timeout, min_timeo)) -		timeout = min_timeo; +	rto_j = call->peer->rto_j;  	now = ktime_get_real(); -	max_age = ktime_sub(now, timeout); +	max_age = ktime_sub(now, jiffies_to_usecs(rto_j));  	spin_lock_bh(&call->lock); @@ -219,7 +213,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)  	}  	resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); -	resend_at += jiffies + rxrpc_resend_timeout; +	resend_at += jiffies + rto_j;  	WRITE_ONCE(call->resend_at, resend_at);  	if (unacked) @@ -234,7 +228,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)  					rxrpc_timer_set_for_resend);  		spin_unlock_bh(&call->lock);  		ack_ts = ktime_sub(now, call->acks_latest_ts); -		if (ktime_to_ns(ack_ts) < call->peer->rtt) +		if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3))  			goto out;  		
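		/* Aside: peer->srtt_us follows the TCP convention of keeping
		 * the smoothed RTT left-shifted by 3 (units of 1/8 usec),
		 * which is why every reader above uses srtt_us >> 3. Worked
		 * example, assuming a measured RTT of 50ms:
		 *
		 *	peer->srtt_us = 50000 << 3;	// stored as 400000
		 *	rtt_us = peer->srtt_us >> 3;	// read back: 50000 usec
		 *	ack_at = usecs_to_jiffies(rtt_us);
		 *
		 * The three fractional bits let rxrpc_rtt_estimator(), added
		 * later in this series in net/rxrpc/rtt.c, do its
		 * 7/8 * srtt + 1/8 * sample smoothing in integer arithmetic.
		 */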
rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false,  				  rxrpc_propose_ack_ping_for_lost_ack); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 69e09d69c896..3be4177baf70 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -91,11 +91,11 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,  		/* We analyse the number of packets that get ACK'd per RTT  		 * period and increase the window if we managed to fill it.  		 */ -		if (call->peer->rtt_usage == 0) +		if (call->peer->rtt_count == 0)  			goto out;  		if (ktime_before(skb->tstamp, -				 ktime_add_ns(call->cong_tstamp, -					      call->peer->rtt))) +				 ktime_add_us(call->cong_tstamp, +					      call->peer->srtt_us >> 3)))  			goto out_no_clear_ca;  		change = rxrpc_cong_rtt_window_end;  		call->cong_tstamp = skb->tstamp; @@ -803,6 +803,30 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks,  }  /* + * Return true if the ACK is valid - ie. it doesn't appear to have regressed + * with respect to the ack state conveyed by preceding ACKs. + */ +static bool rxrpc_is_ack_valid(struct rxrpc_call *call, +			       rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt) +{ +	rxrpc_seq_t base = READ_ONCE(call->ackr_first_seq); + +	if (after(first_pkt, base)) +		return true; /* The window advanced */ + +	if (before(first_pkt, base)) +		return false; /* firstPacket regressed */ + +	if (after_eq(prev_pkt, call->ackr_prev_seq)) +		return true; /* previousPacket hasn't regressed. */ + +	/* Some rx implementations put a serial number in previousPacket. */ +	if (after_eq(prev_pkt, base + call->tx_winsize)) +		return false; +	return true; +} + +/*   * Process an ACK packet.   *   * ack.firstPacket is the sequence number of the first soft-ACK'd/NAK'd packet @@ -865,9 +889,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)  	}  	/* Discard any out-of-order or duplicate ACKs (outside lock). */ -	if (before(first_soft_ack, call->ackr_first_seq) || -	    before(prev_pkt, call->ackr_prev_seq)) +	if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { +		trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, +					   first_soft_ack, call->ackr_first_seq, +					   prev_pkt, call->ackr_prev_seq);  		return; +	}  	buf.info.rxMTU = 0;  	ioffset = offset + nr_acks + 3; @@ -878,9 +905,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)  	spin_lock(&call->input_lock);  	/* Discard any out-of-order or duplicate ACKs (inside lock). 
*/ -	if (before(first_soft_ack, call->ackr_first_seq) || -	    before(prev_pkt, call->ackr_prev_seq)) +	if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { +		trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial, +					   first_soft_ack, call->ackr_first_seq, +					   prev_pkt, call->ackr_prev_seq);  		goto out; +	}  	call->acks_latest_ts = skb->tstamp;  	call->ackr_first_seq = first_soft_ack; diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index a6c1349e965d..01135e54d95d 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -165,15 +165,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)  			goto error;  		} -		/* we want to set the don't fragment bit */ -		opt = IPV6_PMTUDISC_DO; -		ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER, -					(char *) &opt, sizeof(opt)); -		if (ret < 0) { -			_debug("setsockopt failed"); -			goto error; -		} -  		/* Fall through and set IPv4 options too otherwise we don't get  		 * errors from IPv4 packets sent through the IPv6 socket.  		 */ diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 214405f75346..d4144fd86f84 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -63,11 +63,6 @@ unsigned int rxrpc_rx_mtu = 5692;   */  unsigned int rxrpc_rx_jumbo_max = 4; -/* - * Time till packet resend (in milliseconds). - */ -unsigned long rxrpc_resend_timeout = 4 * HZ; -  const s8 rxrpc_ack_priority[] = {  	[0]				= 0,  	[RXRPC_ACK_DELAY]		= 1, diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index bad3d2420344..f8b632a5c619 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -369,7 +369,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,  	    (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) ||  	     retrans ||  	     call->cong_mode == RXRPC_CALL_SLOW_START || -	     (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || +	     (call->peer->rtt_count < 3 && sp->hdr.seq & 1) ||  	     ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),  			  ktime_get_real())))  		whdr.flags |= RXRPC_REQUEST_ACK; @@ -423,13 +423,10 @@ done:  		if (whdr.flags & RXRPC_REQUEST_ACK) {  			call->peer->rtt_last_req = skb->tstamp;  			trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); -			if (call->peer->rtt_usage > 1) { +			if (call->peer->rtt_count > 1) {  				unsigned long nowj = jiffies, ack_lost_at; -				ack_lost_at = nsecs_to_jiffies(2 * call->peer->rtt); -				if (ack_lost_at < 1) -					ack_lost_at = 1; - +				ack_lost_at = rxrpc_get_rto_backoff(call->peer, retrans);  				ack_lost_at += nowj;  				WRITE_ONCE(call->ack_lost_at, ack_lost_at);  				rxrpc_reduce_call_timer(call, ack_lost_at, nowj, @@ -474,41 +471,21 @@ send_fragmentable:  	skb->tstamp = ktime_get_real();  	switch (conn->params.local->srx.transport.family) { +	case AF_INET6:  	case AF_INET:  		opt = IP_PMTUDISC_DONT; -		ret = kernel_setsockopt(conn->params.local->socket, -					SOL_IP, IP_MTU_DISCOVER, -					(char *)&opt, sizeof(opt)); -		if (ret == 0) { -			ret = kernel_sendmsg(conn->params.local->socket, &msg, -					     iov, 2, len); -			conn->params.peer->last_tx_at = ktime_get_seconds(); - -			opt = IP_PMTUDISC_DO; -			kernel_setsockopt(conn->params.local->socket, SOL_IP, -					  IP_MTU_DISCOVER, -					  (char *)&opt, sizeof(opt)); -		} -		break; - -#ifdef CONFIG_AF_RXRPC_IPV6 -	case AF_INET6: -		opt = IPV6_PMTUDISC_DONT; -		ret = kernel_setsockopt(conn->params.local->socket, -					SOL_IPV6, IPV6_MTU_DISCOVER, -					(char *)&opt, sizeof(opt)); -		if (ret == 0) { -			
ret = kernel_sendmsg(conn->params.local->socket, &msg, -					     iov, 2, len); -			conn->params.peer->last_tx_at = ktime_get_seconds(); - -			opt = IPV6_PMTUDISC_DO; -			kernel_setsockopt(conn->params.local->socket, -					  SOL_IPV6, IPV6_MTU_DISCOVER, -					  (char *)&opt, sizeof(opt)); -		} +		kernel_setsockopt(conn->params.local->socket, +				  SOL_IP, IP_MTU_DISCOVER, +				  (char *)&opt, sizeof(opt)); +		ret = kernel_sendmsg(conn->params.local->socket, &msg, +				     iov, 2, len); +		conn->params.peer->last_tx_at = ktime_get_seconds(); + +		opt = IP_PMTUDISC_DO; +		kernel_setsockopt(conn->params.local->socket, +				  SOL_IP, IP_MTU_DISCOVER, +				  (char *)&opt, sizeof(opt));  		break; -#endif  	default:  		BUG(); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 923b263c401b..b1449d971883 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -296,52 +296,6 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error,  }  /* - * Add RTT information to cache.  This is called in softirq mode and has - * exclusive access to the peer RTT data. - */ -void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, -			rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, -			ktime_t send_time, ktime_t resp_time) -{ -	struct rxrpc_peer *peer = call->peer; -	s64 rtt; -	u64 sum = peer->rtt_sum, avg; -	u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage; - -	rtt = ktime_to_ns(ktime_sub(resp_time, send_time)); -	if (rtt < 0) -		return; - -	spin_lock(&peer->rtt_input_lock); - -	/* Replace the oldest datum in the RTT buffer */ -	sum -= peer->rtt_cache[cursor]; -	sum += rtt; -	peer->rtt_cache[cursor] = rtt; -	peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1); -	peer->rtt_sum = sum; -	if (usage < RXRPC_RTT_CACHE_SIZE) { -		usage++; -		peer->rtt_usage = usage; -	} - -	spin_unlock(&peer->rtt_input_lock); - -	/* Now recalculate the average */ -	if (usage == RXRPC_RTT_CACHE_SIZE) { -		avg = sum / RXRPC_RTT_CACHE_SIZE; -	} else { -		avg = sum; -		do_div(avg, usage); -	} - -	/* Don't need to update this under lock */ -	peer->rtt = avg; -	trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, -			   usage, avg); -} - -/*   * Perform keep-alive pings.   */  static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 452163eadb98..ca29976bb193 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -225,6 +225,8 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)  		spin_lock_init(&peer->rtt_input_lock);  		peer->debug_id = atomic_inc_return(&rxrpc_debug_id); +		rxrpc_peer_init_rtt(peer); +  		if (RXRPC_TX_SMSS > 2190)  			peer->cong_cwnd = 2;  		else if (RXRPC_TX_SMSS > 1095) @@ -497,14 +499,14 @@ void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call,  EXPORT_SYMBOL(rxrpc_kernel_get_peer);  /** - * rxrpc_kernel_get_rtt - Get a call's peer RTT + * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT   * @sock: The socket on which the call is in progress.   * @call: The call to query   * - * Get the call's peer RTT. + * Get the call's peer smoothed RTT.   
*/ -u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call) +u32 rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call)  { -	return call->peer->rtt; +	return call->peer->srtt_us >> 3;  } -EXPORT_SYMBOL(rxrpc_kernel_get_rtt); +EXPORT_SYMBOL(rxrpc_kernel_get_srtt); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index b9d053e42821..8b179e3c802a 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -222,7 +222,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)  		seq_puts(seq,  			 "Proto Local                                          "  			 " Remote                                         " -			 " Use CW  MTU   LastUse          RTT Rc\n" +			 " Use  CW   MTU LastUse      RTT      RTO\n"  			 );  		return 0;  	} @@ -236,15 +236,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)  	now = ktime_get_seconds();  	seq_printf(seq,  		   "UDP   %-47.47s %-47.47s %3u" -		   " %3u %5u %6llus %12llu %2u\n", +		   " %3u %5u %6llus %8u %8u\n",  		   lbuff,  		   rbuff,  		   atomic_read(&peer->usage),  		   peer->cong_cwnd,  		   peer->mtu,  		   now - peer->last_tx_at, -		   peer->rtt, -		   peer->rtt_cursor); +		   peer->srtt_us >> 3, +		   jiffies_to_usecs(peer->rto_j));  	return 0;  } diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c new file mode 100644 index 000000000000..928d8b34a3ee --- /dev/null +++ b/net/rxrpc/rtt.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* RTT/RTO calculation. + * + * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com) + * + * https://tools.ietf.org/html/rfc6298 + * https://tools.ietf.org/html/rfc1122#section-4.2.3.1 + * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf + */ + +#include <linux/net.h> +#include "ar-internal.h" + +#define RXRPC_RTO_MAX	((unsigned)(120 * HZ)) +#define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC6298 2.1 initial RTO value	*/ +#define rxrpc_jiffies32 ((u32)jiffies)		/* As rxrpc_jiffies32 */ +#define rxrpc_min_rtt_wlen 300			/* As sysctl_tcp_min_rtt_wlen */ + +static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) +{ +	return 200; +} + +static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) +{ +	return _usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); +} + +static u32 rxrpc_bound_rto(u32 rto) +{ +	return min(rto, RXRPC_RTO_MAX); +} + +/* + * Called to compute a smoothed rtt estimate. The data fed to this + * routine either comes from timestamps, or from segments that were + * known _not_ to have been retransmitted [see Karn/Partridge + * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 + * piece by Van Jacobson. + * NOTE: the next three routines used to be one big routine. + * To save cycles in the RFC 1323 implementation it was better to break + * it up into three procedures. -- erics + */ +static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) +{ +	long m = sample_rtt_us; /* RTT */ +	u32 srtt = peer->srtt_us; + +	/*	The following amusing code comes from Jacobson's +	 *	article in SIGCOMM '88.  Note that rtt and mdev +	 *	are scaled versions of rtt and mean deviation. +	 *	This is designed to be as fast as possible +	 *	m stands for "measurement". +	 * +	 *	On a 1990 paper the rto value is changed to: +	 *	RTO = rtt + 4 * mdev +	 * +	 * Funny. This algorithm seems to be very broken. +	 * These formulae increase RTO, when it should be decreased, increase +	 * too slowly, when it should be increased quickly, decrease too quickly +	 * etc. 
I guess in BSD RTO takes ONE value, so that it is absolutely +	 * does not matter how to _calculate_ it. Seems, it was trap +	 * that VJ failed to avoid. 8) +	 */ +	if (srtt != 0) { +		m -= (srtt >> 3);	/* m is now error in rtt est */ +		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */ +		if (m < 0) { +			m = -m;		/* m is now abs(error) */ +			m -= (peer->mdev_us >> 2);   /* similar update on mdev */ +			/* This is similar to one of Eifel findings. +			 * Eifel blocks mdev updates when rtt decreases. +			 * This solution is a bit different: we use finer gain +			 * for mdev in this case (alpha*beta). +			 * Like Eifel it also prevents growth of rto, +			 * but also it limits too fast rto decreases, +			 * happening in pure Eifel. +			 */ +			if (m > 0) +				m >>= 3; +		} else { +			m -= (peer->mdev_us >> 2);   /* similar update on mdev */ +		} + +		peer->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */ +		if (peer->mdev_us > peer->mdev_max_us) { +			peer->mdev_max_us = peer->mdev_us; +			if (peer->mdev_max_us > peer->rttvar_us) +				peer->rttvar_us = peer->mdev_max_us; +		} +	} else { +		/* no previous measure. */ +		srtt = m << 3;		/* take the measured time to be rtt */ +		peer->mdev_us = m << 1;	/* make sure rto = 3*rtt */ +		peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); +		peer->mdev_max_us = peer->rttvar_us; +	} + +	peer->srtt_us = max(1U, srtt); +} + +/* + * Calculate rto without backoff.  This is the second half of Van Jacobson's + * routine referred to above. + */ +static void rxrpc_set_rto(struct rxrpc_peer *peer) +{ +	u32 rto; + +	/* 1. If rtt variance happened to be less 50msec, it is hallucination. +	 *    It cannot be less due to utterly erratic ACK generation made +	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_ +	 *    to do with delayed acks, because at cwnd>2 true delack timeout +	 *    is invisible. Actually, Linux-2.4 also generates erratic +	 *    ACKs in some circumstances. +	 */ +	rto = __rxrpc_set_rto(peer); + +	/* 2. Fixups made earlier cannot be right. +	 *    If we do not estimate RTO correctly without them, +	 *    all the algo is pure shit and should be replaced +	 *    with correct one. It is exactly, which we pretend to do. +	 */ + +	/* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo +	 * guarantees that rto is higher. +	 */ +	peer->rto_j = rxrpc_bound_rto(rto); +} + +static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) +{ +	if (rtt_us < 0) +		return; + +	//rxrpc_update_rtt_min(peer, rtt_us); +	rxrpc_rtt_estimator(peer, rtt_us); +	rxrpc_set_rto(peer); + +	/* RFC6298: only reset backoff on valid RTT measurement. */ +	peer->backoff = 0; +} + +/* + * Add RTT information to cache.  This is called in softirq mode and has + * exclusive access to the peer RTT data. + */ +void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, +			rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, +			ktime_t send_time, ktime_t resp_time) +{ +	struct rxrpc_peer *peer = call->peer; +	s64 rtt_us; + +	rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); +	if (rtt_us < 0) +		return; + +	spin_lock(&peer->rtt_input_lock); +	rxrpc_ack_update_rtt(peer, rtt_us); +	if (peer->rtt_count < 3) +		peer->rtt_count++; +	spin_unlock(&peer->rtt_input_lock); + +	trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, +			   peer->srtt_us >> 3, peer->rto_j); +} + +/* + * Get the retransmission timeout to set in jiffies, backing it off each time + * we retransmit. 
+ */ +unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) +{ +	u64 timo_j; +	u8 backoff = READ_ONCE(peer->backoff); + +	timo_j = peer->rto_j; +	timo_j <<= backoff; +	if (retrans && timo_j * 2 <= RXRPC_RTO_MAX) +		WRITE_ONCE(peer->backoff, backoff + 1); + +	if (timo_j < 1) +		timo_j = 1; + +	return timo_j; +} + +void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) +{ +	peer->rto_j	= RXRPC_TIMEOUT_INIT; +	peer->mdev_us	= jiffies_to_usecs(RXRPC_TIMEOUT_INIT); +	peer->backoff	= 0; +	//minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); +} diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 098f1f9ec53b..52a24d4ef5d8 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -1148,7 +1148,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,  	ret = rxkad_decrypt_ticket(conn, skb, ticket, ticket_len, &session_key,  				   &expiry, _abort_code);  	if (ret < 0) -		goto temporary_error_free_resp; +		goto temporary_error_free_ticket;  	/* use the session key from inside the ticket to decrypt the  	 * response */ @@ -1230,7 +1230,6 @@ protocol_error:  temporary_error_free_ticket:  	kfree(ticket); -temporary_error_free_resp:  	kfree(response);  temporary_error:  	/* Ignore the response packet if we got a temporary error such as diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 0fcf157aa09f..5e9c43d4a314 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -66,15 +66,14 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,  					    struct rxrpc_call *call)  {  	rxrpc_seq_t tx_start, tx_win; -	signed long rtt2, timeout; -	u64 rtt; +	signed long rtt, timeout; -	rtt = READ_ONCE(call->peer->rtt); -	rtt2 = nsecs_to_jiffies64(rtt) * 2; -	if (rtt2 < 2) -		rtt2 = 2; +	rtt = READ_ONCE(call->peer->srtt_us) >> 3; +	rtt = usecs_to_jiffies(rtt) * 2; +	if (rtt < 2) +		rtt = 2; -	timeout = rtt2; +	timeout = rtt;  	tx_start = READ_ONCE(call->tx_hard_ack);  	for (;;) { @@ -92,7 +91,7 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,  			return -EINTR;  		if (tx_win != tx_start) { -			timeout = rtt2; +			timeout = rtt;  			tx_start = tx_win;  		} @@ -271,16 +270,9 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,  		_debug("need instant resend %d", ret);  		rxrpc_instant_resend(call, ix);  	} else { -		unsigned long now = jiffies, resend_at; +		unsigned long now = jiffies; +		unsigned long resend_at = now + call->peer->rto_j; -		if (call->peer->rtt_usage > 1) -			resend_at = nsecs_to_jiffies(call->peer->rtt * 3 / 2); -		else -			resend_at = rxrpc_resend_timeout; -		if (resend_at < 1) -			resend_at = 1; - -		resend_at += now;  		WRITE_ONCE(call->resend_at, resend_at);  		rxrpc_reduce_call_timer(call, resend_at, now,  					rxrpc_timer_set_for_send); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 2bbb38161851..18dade4e6f9a 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -71,15 +71,6 @@ static struct ctl_table rxrpc_sysctl_table[] = {  		.extra1		= (void *)&one_jiffy,  		.extra2		= (void *)&max_jiffies,  	}, -	{ -		.procname	= "resend_timeout", -		.data		= &rxrpc_resend_timeout, -		.maxlen		= sizeof(unsigned long), -		.mode		= 0644, -		.proc_handler	= proc_doulongvec_ms_jiffies_minmax, -		.extra1		= (void *)&one_jiffy, -		.extra2		= (void *)&max_jiffies, -	},  	/* Non-time values */  	{ diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 1a766393be62..20577355235a 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -199,6 +199,9 @@ static int 
tcf_ct_flow_table_add_action_nat(struct net *net,  	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;  	struct nf_conntrack_tuple target; +	if (!(ct->status & IPS_NAT_MASK)) +		return 0; +  	nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);  	switch (tuple->src.l3num) { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f6a3b969ead0..0a7ecc292bd3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1667,6 +1667,7 @@ int tcf_classify_ingress(struct sk_buff *skb,  		skb_ext_del(skb, TC_SKB_EXT);  		tp = rcu_dereference_bh(fchain->filter_chain); +		last_executed_chain = fchain->index;  	}  	ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, @@ -2069,6 +2070,7 @@ replay:  		err = PTR_ERR(block);  		goto errout;  	} +	block->classid = parent;  	chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;  	if (chain_index > TC_ACT_EXT_VAL_MASK) { @@ -2611,12 +2613,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)  			return skb->len;  		parent = tcm->tcm_parent; -		if (!parent) { +		if (!parent)  			q = dev->qdisc; -			parent = q->handle; -		} else { +		else  			q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); -		}  		if (!q)  			goto out;  		cops = q->ops->cl_ops; @@ -2632,6 +2632,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)  		block = cops->tcf_block(q, cl, NULL);  		if (!block)  			goto out; +		parent = block->classid;  		if (tcf_block_shared(block))  			q = NULL;  	} @@ -3522,6 +3523,16 @@ static void tcf_sample_get_group(struct flow_action_entry *entry,  #endif  } +static enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) +{ +	if (WARN_ON_ONCE(hw_stats > TCA_ACT_HW_STATS_ANY)) +		return FLOW_ACTION_HW_STATS_DONT_CARE; +	else if (!hw_stats) +		return FLOW_ACTION_HW_STATS_DISABLED; + +	return hw_stats; +} +  int tc_setup_flow_action(struct flow_action *flow_action,  			 const struct tcf_exts *exts)  { @@ -3545,7 +3556,7 @@ int tc_setup_flow_action(struct flow_action *flow_action,  		if (err)  			goto err_out_locked; -		entry->hw_stats = act->hw_stats; +		entry->hw_stats = tc_act_hw_stats(act->hw_stats);  		if (is_tcf_gact_ok(act)) {  			entry->id = FLOW_ACTION_ACCEPT; @@ -3613,7 +3624,7 @@ int tc_setup_flow_action(struct flow_action *flow_action,  				entry->mangle.mask = tcf_pedit_mask(act, k);  				entry->mangle.val = tcf_pedit_val(act, k);  				entry->mangle.offset = tcf_pedit_offset(act, k); -				entry->hw_stats = act->hw_stats; +				entry->hw_stats = tc_act_hw_stats(act->hw_stats);  				entry = &flow_action->entries[++j];  			}  		} else if (is_tcf_csum(act)) { diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index a36974e9c601..1bcf8fbfd40e 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -323,7 +323,8 @@ static void choke_reset(struct Qdisc *sch)  	sch->q.qlen = 0;  	sch->qstats.backlog = 0; -	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); +	if (q->tab) +		memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));  	q->head = q->tail = 0;  	red_restart(&q->vars);  } diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index b1da5589a0c6..c48f91075b5c 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -82,7 +82,7 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)  	if (q->skip_sock_check)  		goto skip; -	if (!sk) +	if (!sk || !sk_fullsock(sk))  		return false;  	if (!sock_flag(sk, SOCK_TXTIME)) @@ -137,8 +137,9 @@ static void report_sock_error(struct sk_buff *skb, u32 err, u8 
code)  	struct sock_exterr_skb *serr;  	struct sk_buff *clone;  	ktime_t txtime = skb->tstamp; +	struct sock *sk = skb->sk; -	if (!skb->sk || !(skb->sk->sk_txtime_report_errors)) +	if (!sk || !sk_fullsock(sk) || !(sk->sk_txtime_report_errors))  		return;  	clone = skb_clone(skb, GFP_ATOMIC); @@ -154,7 +155,7 @@ static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)  	serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */  	serr->ee.ee_info = txtime; /* low part of tstamp */ -	if (sock_queue_err_skb(skb->sk, clone)) +	if (sock_queue_err_skb(sk, clone))  		kfree_skb(clone);  } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 968519ff36e9..436160be9c18 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -416,7 +416,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt,  		q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));  	if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) -		q->drop_batch_size = min(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); +		q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]));  	if (tb[TCA_FQ_CODEL_MEMORY_LIMIT])  		q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT])); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index a9da8776bf5b..fb760cee824e 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -297,9 +297,9 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,  			goto flow_error;  		}  		q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]); -		if (!q->flows_cnt || q->flows_cnt > 65536) { +		if (!q->flows_cnt || q->flows_cnt >= 65536) {  			NL_SET_ERR_MSG_MOD(extack, -					   "Number of flows must be < 65536"); +					   "Number of flows must range in [1..65535]");  			goto flow_error;  		}  	} diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index c787d4d46017..5a6def5e4e6d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -637,6 +637,15 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)  	if (ctl->divisor &&  	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))  		return -EINVAL; + +	/* slot->allot is a short, make sure quantum is not too big. */ +	if (ctl->quantum) { +		unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum); + +		if (scaled <= 0 || scaled > SHRT_MAX) +			return -EINVAL; +	} +  	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,  					ctl_v1->Wlog))  		return -EINVAL; diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index 0fb10abf7579..7a5e4c454715 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -169,6 +169,9 @@ static int skbprio_change(struct Qdisc *sch, struct nlattr *opt,  {  	struct tc_skbprio_qopt *ctl = nla_data(opt); +	if (opt->nla_len != nla_attr_size(sizeof(*ctl))) +		return -EINVAL; +  	sch->limit = ctl->limit;  	return 0;  } diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 6e2eb1dd64ed..68934438ee19 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -31,7 +31,7 @@ menuconfig IP_SCTP  	  homing at either or both ends of an association."  	  To compile this protocol support as a module, choose M here: the -	  module will be called sctp. Debug messages are handeled by the +	  module will be called sctp. Debug messages are handled by the  	  kernel's dynamic debugging framework.  	  If in doubt, say N. 
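Editor's note: the sch_fq_codel.c hunk above is a one-word fix that is easy to read past. min(1U, val) clamps the configured drop batch size to at most 1, silently discarding the administrator's value; the intent is a floor of 1, which max(1U, val) provides. A standalone sketch of the difference (not kernel code; the macros stand in for the kernel's min()/max()):

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	/* Hypothetical TCA_FQ_CODEL_DROP_BATCH_SIZE value from userspace. */
	unsigned int requested = 64;

	/* Buggy clamp: yields 1 for any requested value >= 1. */
	printf("min(1U, requested) = %u\n", min(1U, requested));

	/* Intended floor: keeps the request, never drops below 1. */
	printf("max(1U, requested) = %u\n", max(1U, requested));
	return 0;
}

The same pattern of off-by-inversion appears benign in review precisely because min() and max() read so similarly; the sfq and fq_pie hunks nearby add explicit range checks for the same reason.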
diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 4278764d82b8..83e97e8892e0 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -741,14 +741,8 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,  	if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len))  		goto free; -	{ -		SHASH_DESC_ON_STACK(desc, tfm); - -		desc->tfm = tfm; -		crypto_shash_digest(desc, (u8 *)auth, -				    end - (unsigned char *)auth, digest); -		shash_desc_zero(desc); -	} +	crypto_shash_tfm_digest(tfm, (u8 *)auth, end - (unsigned char *)auth, +				digest);  free:  	if (free_key) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 09050c1d5517..47910470e532 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -858,7 +858,11 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,  	struct sctp_chunk *retval;  	__u32 ctsn; -	ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); +	if (chunk && chunk->asoc) +		ctsn = sctp_tsnmap_get_ctsn(&chunk->asoc->peer.tsn_map); +	else +		ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); +  	shut.cum_tsn_ack = htonl(ctsn);  	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, @@ -1666,17 +1670,14 @@ static struct sctp_cookie_param *sctp_pack_cookie(  	       ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);  	if (sctp_sk(ep->base.sk)->hmac) { -		SHASH_DESC_ON_STACK(desc, sctp_sk(ep->base.sk)->hmac); +		struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac;  		int err;  		/* Sign the message.  */ -		desc->tfm = sctp_sk(ep->base.sk)->hmac; - -		err = crypto_shash_setkey(desc->tfm, ep->secret_key, +		err = crypto_shash_setkey(tfm, ep->secret_key,  					  sizeof(ep->secret_key)) ?: -		      crypto_shash_digest(desc, (u8 *)&cookie->c, bodysize, -					  cookie->signature); -		shash_desc_zero(desc); +		      crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize, +					      cookie->signature);  		if (err)  			goto free_cookie;  	} @@ -1737,17 +1738,13 @@ struct sctp_association *sctp_unpack_cookie(  	/* Check the signature.  */  	{ -		SHASH_DESC_ON_STACK(desc, sctp_sk(ep->base.sk)->hmac); +		struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac;  		int err; -		desc->tfm = sctp_sk(ep->base.sk)->hmac; - -		err = crypto_shash_setkey(desc->tfm, ep->secret_key, +		err = crypto_shash_setkey(tfm, ep->secret_key,  					  sizeof(ep->secret_key)) ?: -		      crypto_shash_digest(desc, (u8 *)bear_cookie, bodysize, -					  digest); -		shash_desc_zero(desc); - +		      crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize, +					      digest);  		if (err) {  			*error = -SCTP_IERROR_NOMEM;  			goto fail; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 2bc29463e1dc..9f36fe911d08 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1523,9 +1523,17 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type,  			timeout = asoc->timeouts[cmd->obj.to];  			BUG_ON(!timeout); -			timer->expires = jiffies + timeout; -			sctp_association_hold(asoc); -			add_timer(timer); +			/* +			 * SCTP has a hard time with timer starts.  Because we process +			 * timer starts as side effects, it can be hard to tell if we +			 * have already started a timer or not, which leads to BUG +			 * halts when we call add_timer. 
So here, instead of just starting
+			 * a timer, we check whether one is already pending and,
+			 * if so, just mod it to the shorter of the two
+			 * expiration times.
+			 */
+			if (!timer_pending(timer))
+				sctp_association_hold(asoc);
+			timer_reduce(timer, jiffies + timeout);
 			break;
 
 		case SCTP_CMD_TIMER_RESTART:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 6a16af4b1ef6..e86620fbd90f 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1856,16 +1856,17 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
 	/* Update the content of current association. */
 	sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
 	sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
-	if (sctp_state(asoc, SHUTDOWN_PENDING) &&
+	if ((sctp_state(asoc, SHUTDOWN_PENDING) ||
+	     sctp_state(asoc, SHUTDOWN_SENT)) &&
 	    (sctp_sstate(asoc->base.sk, CLOSING) ||
 	     sock_flag(asoc->base.sk, SOCK_DEAD))) {
-		/* if were currently in SHUTDOWN_PENDING, but the socket
-		 * has been closed by user, don't transition to ESTABLISHED.
-		 * Instead trigger SHUTDOWN bundled with COOKIE_ACK.
+		/* If the socket has been closed by user, don't
+		 * transition to ESTABLISHED. Instead trigger SHUTDOWN
+		 * bundled with COOKIE_ACK.
 		 */
 		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
 		return sctp_sf_do_9_2_start_shutdown(net, ep, asoc,
-						     SCTP_ST_CHUNK(0), NULL,
+						     SCTP_ST_CHUNK(0), repl,
 						     commands);
 	} else {
 		sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
@@ -5470,7 +5471,7 @@ enum sctp_disposition sctp_sf_do_9_2_start_shutdown(
 	 * in the Cumulative TSN Ack field the last sequential TSN it
 	 * has received from the peer.
 	 */
-	reply = sctp_make_shutdown(asoc, NULL);
+	reply = sctp_make_shutdown(asoc, arg);
 	if (!reply)
 		goto nomem;
@@ -6068,7 +6069,7 @@ enum sctp_disposition sctp_sf_autoclose_timer_expire(
 	disposition = SCTP_DISPOSITION_CONSUME;
 	if (sctp_outq_is_empty(&asoc->outqueue)) {
 		disposition = sctp_sf_do_9_2_start_shutdown(net, ep, asoc, type,
-							    arg, commands);
+							    NULL, commands);
 	}
 
 	return disposition;
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index c82dbdcf13f2..77d5c36a8991 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -343,6 +343,9 @@ void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport,
 	struct sockaddr_storage addr;
 	struct sctp_ulpevent *event;
 
+	if (asoc->state < SCTP_STATE_ESTABLISHED)
+		return;
+
 	memset(&addr, 0, sizeof(struct sockaddr_storage));
 	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 25fbd8d9de74..ac5cac0dd24b 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -2032,7 +2032,6 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
 	struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
 	struct kvec *head = rqstp->rq_rcv_buf.head;
 	struct rpc_auth *auth = cred->cr_auth;
-	unsigned int savedlen = rcv_buf->len;
 	u32 offset, opaque_len, maj_stat;
 	__be32 *p;
 
@@ -2043,9 +2042,9 @@
 	offset = (u8 *)(p) - (u8 *)head->iov_base;
 	if (offset + opaque_len > rcv_buf->len)
 		goto unwrap_failed;
-	rcv_buf->len = offset + opaque_len;
-	maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf);
+	maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset,
+			      offset + opaque_len,
rcv_buf);  	if (maj_stat == GSS_S_CONTEXT_EXPIRED)  		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);  	if (maj_stat != GSS_S_COMPLETE) @@ -2059,10 +2058,9 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,  	 */  	xdr_init_decode(xdr, rcv_buf, p, rqstp); -	auth->au_rslack = auth->au_verfsize + 2 + -			  XDR_QUADLEN(savedlen - rcv_buf->len); -	auth->au_ralign = auth->au_verfsize + 2 + -			  XDR_QUADLEN(savedlen - rcv_buf->len); +	auth->au_rslack = auth->au_verfsize + 2 + ctx->gc_gss_ctx->slack; +	auth->au_ralign = auth->au_verfsize + 2 + ctx->gc_gss_ctx->align; +  	return 0;  unwrap_failed:  	trace_rpcgss_unwrap_failed(task); diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 6f2d30d7b766..e7180da1fc6a 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -851,8 +851,8 @@ out_err:  }  u32 -gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, -		     u32 *headskip, u32 *tailskip) +gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len, +		     struct xdr_buf *buf, u32 *headskip, u32 *tailskip)  {  	struct xdr_buf subbuf;  	u32 ret = 0; @@ -881,7 +881,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,  	/* create a segment skipping the header and leaving out the checksum */  	xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN, -				    (buf->len - offset - GSS_KRB5_TOK_HDR_LEN - +				    (len - offset - GSS_KRB5_TOK_HDR_LEN -  				     kctx->gk5e->cksumlength));  	nblocks = (subbuf.len + blocksize - 1) / blocksize; @@ -926,7 +926,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,  		goto out_err;  	/* Get the packet's hmac value */ -	ret = read_bytes_from_xdr_buf(buf, buf->len - kctx->gk5e->cksumlength, +	ret = read_bytes_from_xdr_buf(buf, len - kctx->gk5e->cksumlength,  				      pkt_hmac, kctx->gk5e->cksumlength);  	if (ret)  		goto out_err; diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 6c1920eed771..cf0fd170ac18 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -261,7 +261,9 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,  }  static u32 -gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) +gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, int len, +		       struct xdr_buf *buf, unsigned int *slack, +		       unsigned int *align)  {  	int			signalg;  	int			sealalg; @@ -279,12 +281,13 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)  	u32			conflen = kctx->gk5e->conflen;  	int			crypt_offset;  	u8			*cksumkey; +	unsigned int		saved_len = buf->len;  	dprintk("RPC:       gss_unwrap_kerberos\n");  	ptr = (u8 *)buf->head[0].iov_base + offset;  	if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, -					buf->len - offset)) +					len - offset))  		return GSS_S_DEFECTIVE_TOKEN;  	if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) || @@ -324,6 +327,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)  	    (!kctx->initiate && direction != 0))  		return GSS_S_BAD_SIG; +	buf->len = len;  	if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {  		struct crypto_sync_skcipher *cipher;  		int err; @@ -376,11 +380,15 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)  	data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start;  	memmove(orig_start, 
data_start, data_len);  	buf->head[0].iov_len -= (data_start - orig_start); -	buf->len -= (data_start - orig_start); +	buf->len = len - (data_start - orig_start);  	if (gss_krb5_remove_padding(buf, blocksize))  		return GSS_S_DEFECTIVE_TOKEN; +	/* slack must include room for krb5 padding */ +	*slack = XDR_QUADLEN(saved_len - buf->len); +	/* The GSS blob always precedes the RPC message payload */ +	*align = *slack;  	return GSS_S_COMPLETE;  } @@ -486,7 +494,9 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,  }  static u32 -gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) +gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, +		       struct xdr_buf *buf, unsigned int *slack, +		       unsigned int *align)  {  	time64_t	now;  	u8		*ptr; @@ -532,7 +542,7 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)  	if (rrc != 0)  		rotate_left(offset + 16, buf, rrc); -	err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf, +	err = (*kctx->gk5e->decrypt_v2)(kctx, offset, len, buf,  					&headskip, &tailskip);  	if (err)  		return GSS_S_FAILURE; @@ -542,7 +552,7 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)  	 * it against the original  	 */  	err = read_bytes_from_xdr_buf(buf, -				buf->len - GSS_KRB5_TOK_HDR_LEN - tailskip, +				len - GSS_KRB5_TOK_HDR_LEN - tailskip,  				decrypted_hdr, GSS_KRB5_TOK_HDR_LEN);  	if (err) {  		dprintk("%s: error %u getting decrypted_hdr\n", __func__, err); @@ -568,18 +578,19 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)  	 * Note that buf->head[0].iov_len may indicate the available  	 * head buffer space rather than that actually occupied.  	 */ -	movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len); +	movelen = min_t(unsigned int, buf->head[0].iov_len, len);  	movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip; -	if (offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen > -	    buf->head[0].iov_len) -		return GSS_S_FAILURE; +	BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen > +							buf->head[0].iov_len);  	memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen);  	buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip; -	buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip; +	buf->len = len - GSS_KRB5_TOK_HDR_LEN + headskip;  	/* Trim off the trailing "extra count" and checksum blob */ -	buf->len -= ec + GSS_KRB5_TOK_HDR_LEN + tailskip; +	xdr_buf_trim(buf, ec + GSS_KRB5_TOK_HDR_LEN + tailskip); +	*align = XDR_QUADLEN(GSS_KRB5_TOK_HDR_LEN + headskip); +	*slack = *align + XDR_QUADLEN(ec + GSS_KRB5_TOK_HDR_LEN + tailskip);  	return GSS_S_COMPLETE;  } @@ -603,7 +614,8 @@ gss_wrap_kerberos(struct gss_ctx *gctx, int offset,  }  u32 -gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf) +gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, +		    int len, struct xdr_buf *buf)  {  	struct krb5_ctx	*kctx = gctx->internal_ctx_id; @@ -613,9 +625,11 @@ gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf)  	case ENCTYPE_DES_CBC_RAW:  	case ENCTYPE_DES3_CBC_RAW:  	case ENCTYPE_ARCFOUR_HMAC: -		return gss_unwrap_kerberos_v1(kctx, offset, buf); +		return gss_unwrap_kerberos_v1(kctx, offset, len, buf, +					      &gctx->slack, &gctx->align);  	case ENCTYPE_AES128_CTS_HMAC_SHA1_96:  	case ENCTYPE_AES256_CTS_HMAC_SHA1_96: -		return gss_unwrap_kerberos_v2(kctx, offset, buf); +		return gss_unwrap_kerberos_v2(kctx, offset, len, buf, +					      &gctx->slack, &gctx->align);  	}  } 
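Editor's note: the kerberos unwrap changes above all serve a single refactor: gss_unwrap() and the v1/v2 decrypt paths now receive the wrapped length as an explicit argument instead of callers temporarily overwriting buf->len around the call (the save/restore pattern removed from auth_gss.c above and from svcauth_gss.c below). A minimal sketch of the two calling conventions, with a hypothetical unwrap() standing in for the real gss_unwrap():

#include <stdio.h>

/* Stand-in for struct xdr_buf; only the length matters for this sketch. */
struct buf { unsigned int len; };

/* Hypothetical unwrap routine: processes 'len' bytes of wrapped data. */
static unsigned int unwrap(struct buf *buf, unsigned int offset,
			   unsigned int len)
{
	(void)buf; (void)offset; (void)len;
	return 0;			/* GSS_S_COMPLETE */
}

/* Old pattern: lie to the unwrap routine by shrinking buf->len. */
static unsigned int unwrap_old(struct buf *buf, unsigned int priv_len)
{
	unsigned int saved_len = buf->len;
	unsigned int stat;

	buf->len = priv_len;
	stat = unwrap(buf, 0, buf->len);
	buf->len = saved_len;		/* every caller must remember this */
	return stat;
}

/* New pattern: pass the wrapped length explicitly; buf->len stays true. */
static unsigned int unwrap_new(struct buf *buf, unsigned int priv_len)
{
	return unwrap(buf, 0, priv_len);
}

int main(void)
{
	struct buf b = { .len = 4096 };

	printf("%u %u\n", unwrap_old(&b, 1024), unwrap_new(&b, 1024));
	return 0;
}

Passing the length explicitly also lets the unwrap paths report slack/align back through the context instead of having callers infer them from how much buf->len shrank.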
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index db550bfc2642..69316ab1b9fa 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -411,10 +411,11 @@ gss_wrap(struct gss_ctx	*ctx_id,  u32  gss_unwrap(struct gss_ctx	*ctx_id,  	   int			offset, +	   int			len,  	   struct xdr_buf	*buf)  {  	return ctx_id->mech_type->gm_ops -		->gss_unwrap(ctx_id, offset, buf); +		->gss_unwrap(ctx_id, offset, len, buf);  } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 54ae5be62f6a..50d93c49ef1a 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -906,7 +906,7 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g  	if (svc_getnl(&buf->head[0]) != seq)  		goto out;  	/* trim off the mic and padding at the end before returning */ -	buf->len -= 4 + round_up_to_quad(mic.len); +	xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4);  	stat = 0;  out:  	kfree(mic.data); @@ -934,7 +934,7 @@ static int  unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)  {  	u32 priv_len, maj_stat; -	int pad, saved_len, remaining_len, offset; +	int pad, remaining_len, offset;  	clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); @@ -954,12 +954,8 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs  	buf->len -= pad;  	fix_priv_head(buf, pad); -	/* Maybe it would be better to give gss_unwrap a length parameter: */ -	saved_len = buf->len; -	buf->len = priv_len; -	maj_stat = gss_unwrap(ctx, 0, buf); +	maj_stat = gss_unwrap(ctx, 0, priv_len, buf);  	pad = priv_len - buf->len; -	buf->len = saved_len;  	buf->len -= pad;  	/* The upper layers assume the buffer is aligned on 4-byte boundaries.  	 * In the krb5p case, at least, the data ends up offset, so we need to diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index af0ddd28b081..baef5ee43dbb 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -529,7 +529,6 @@ void cache_purge(struct cache_detail *detail)  {  	struct cache_head *ch = NULL;  	struct hlist_head *head = NULL; -	struct hlist_node *tmp = NULL;  	int i = 0;  	spin_lock(&detail->hash_lock); @@ -541,7 +540,9 @@ void cache_purge(struct cache_detail *detail)  	dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);  	for (i = 0; i < detail->hash_size; i++) {  		head = &detail->hash_table[i]; -		hlist_for_each_entry_safe(ch, tmp, head, cache_list) { +		while (!hlist_empty(head)) { +			ch = hlist_entry(head->first, struct cache_head, +					 cache_list);  			sunrpc_begin_cache_remove_entry(ch, detail);  			spin_unlock(&detail->hash_lock);  			sunrpc_end_cache_remove_entry(ch, detail); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 325a0858700f..61b21dafd7c0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -880,6 +880,22 @@ EXPORT_SYMBOL_GPL(rpc_shutdown_client);  /*   * Free an RPC client   */ +static void rpc_free_client_work(struct work_struct *work) +{ +	struct rpc_clnt *clnt = container_of(work, struct rpc_clnt, cl_work); + +	/* These might block on processes that might allocate memory, +	 * so they cannot be called in rpciod, so they are handled separately +	 * here. 
+	 */ +	rpc_clnt_debugfs_unregister(clnt); +	rpc_free_clid(clnt); +	rpc_clnt_remove_pipedir(clnt); +	xprt_put(rcu_dereference_raw(clnt->cl_xprt)); + +	kfree(clnt); +	rpciod_down(); +}  static struct rpc_clnt *  rpc_free_client(struct rpc_clnt *clnt)  { @@ -890,17 +906,14 @@ rpc_free_client(struct rpc_clnt *clnt)  			rcu_dereference(clnt->cl_xprt)->servername);  	if (clnt->cl_parent != clnt)  		parent = clnt->cl_parent; -	rpc_clnt_debugfs_unregister(clnt); -	rpc_clnt_remove_pipedir(clnt);  	rpc_unregister_client(clnt);  	rpc_free_iostats(clnt->cl_metrics);  	clnt->cl_metrics = NULL; -	xprt_put(rcu_dereference_raw(clnt->cl_xprt));  	xprt_iter_destroy(&clnt->cl_xpi); -	rpciod_down();  	put_cred(clnt->cl_cred); -	rpc_free_clid(clnt); -	kfree(clnt); + +	INIT_WORK(&clnt->cl_work, rpc_free_client_work); +	schedule_work(&clnt->cl_work);  	return parent;  } @@ -2420,6 +2433,11 @@ rpc_check_timeout(struct rpc_task *task)  {  	struct rpc_clnt	*clnt = task->tk_client; +	if (RPC_SIGNALLED(task)) { +		rpc_call_rpcerror(task, -ERESTARTSYS); +		return; +	} +  	if (xprt_adjust_timeout(task->tk_rqstp) == 0)  		return; @@ -2808,8 +2826,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,  	task = rpc_call_null_helper(clnt, xprt, NULL,  			RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS,  			&rpc_cb_add_xprt_call_ops, data); -	if (IS_ERR(task)) -		return PTR_ERR(task); +  	rpc_put_task(task);  success:  	return 1; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e27e3532ec75..2284ff038dad 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -908,9 +908,6 @@ int svc_send(struct svc_rqst *rqstp)  	if (!xprt)  		goto out; -	/* release the receive skb before sending the reply */ -	xprt->xpt_ops->xpo_release_rqst(rqstp); -  	/* calculate over-all length */  	xb = &rqstp->rq_res;  	xb->len = xb->head[0].iov_len + @@ -1040,6 +1037,8 @@ static void svc_delete_xprt(struct svc_xprt *xprt)  	dprintk("svc: svc_delete_xprt(%p)\n", xprt);  	xprt->xpt_ops->xpo_detach(xprt); +	if (xprt->xpt_bc_xprt) +		xprt->xpt_bc_xprt->ops->close(xprt->xpt_bc_xprt);  	spin_lock_bh(&serv->sv_lock);  	list_del_init(&xprt->xpt_list); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 519cf9c4f8fd..023514e392b3 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -527,6 +527,8 @@ static int svc_udp_sendto(struct svc_rqst *rqstp)  	unsigned int uninitialized_var(sent);  	int err; +	svc_release_udp_skb(rqstp); +  	svc_set_cmsg_data(rqstp, cmh);  	err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); @@ -1076,6 +1078,8 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)  	unsigned int uninitialized_var(sent);  	int err; +	svc_release_skb(rqstp); +  	err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);  	xdr_free_bvec(xdr);  	if (err < 0 || sent != (xdr->len + sizeof(marker))) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 15b58c5144f9..6f7d82fb1eb0 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1150,6 +1150,47 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,  }  EXPORT_SYMBOL_GPL(xdr_buf_subsegment); +/** + * xdr_buf_trim - lop at most "len" bytes off the end of "buf" + * @buf: buf to be trimmed + * @len: number of bytes to reduce "buf" by + * + * Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note + * that it's possible that we'll trim less than that amount if the xdr_buf is + * too small, or if (for instance) it's all in the head and the parser has + * already read too far into it. 
+ */ +void xdr_buf_trim(struct xdr_buf *buf, unsigned int len) +{ +	size_t cur; +	unsigned int trim = len; + +	if (buf->tail[0].iov_len) { +		cur = min_t(size_t, buf->tail[0].iov_len, trim); +		buf->tail[0].iov_len -= cur; +		trim -= cur; +		if (!trim) +			goto fix_len; +	} + +	if (buf->page_len) { +		cur = min_t(unsigned int, buf->page_len, trim); +		buf->page_len -= cur; +		trim -= cur; +		if (!trim) +			goto fix_len; +	} + +	if (buf->head[0].iov_len) { +		cur = min_t(size_t, buf->head[0].iov_len, trim); +		buf->head[0].iov_len -= cur; +		trim -= cur; +	} +fix_len: +	buf->len -= (len - trim); +} +EXPORT_SYMBOL_GPL(xdr_buf_trim); +  static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)  {  	unsigned int this_len; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 4a81e6995d3e..3c627dc685cc 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -388,7 +388,9 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,  	} while (nsegs);  done: -	return xdr_stream_encode_item_absent(xdr); +	if (xdr_stream_encode_item_absent(xdr) < 0) +		return -EMSGSIZE; +	return 0;  }  /* Register and XDR encode the Write list. Supports encoding a list @@ -454,7 +456,9 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,  	*segcount = cpu_to_be32(nchunks);  done: -	return xdr_stream_encode_item_absent(xdr); +	if (xdr_stream_encode_item_absent(xdr) < 0) +		return -EMSGSIZE; +	return 0;  }  /* Register and XDR encode the Reply chunk. Supports encoding an array @@ -480,8 +484,11 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,  	int nsegs, nchunks;  	__be32 *segcount; -	if (wtype != rpcrdma_replych) -		return xdr_stream_encode_item_absent(xdr); +	if (wtype != rpcrdma_replych) { +		if (xdr_stream_encode_item_absent(xdr) < 0) +			return -EMSGSIZE; +		return 0; +	}  	seg = req->rl_segments;  	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index d510a3a15d4b..af7eb8d202ae 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -244,6 +244,8 @@ static void  xprt_rdma_bc_close(struct rpc_xprt *xprt)  {  	dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + +	xprt_disconnect_done(xprt);  	xprt->cwnd = RPC_CWNDSHIFT;  } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 54469b72b25f..efa5fcb5793f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -223,6 +223,26 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,  		svc_rdma_recv_ctxt_destroy(rdma, ctxt);  } +/** + * svc_rdma_release_rqst - Release transport-specific per-rqst resources + * @rqstp: svc_rqst being released + * + * Ensure that the recv_ctxt is released whether or not a Reply + * was sent. For example, the client could close the connection, + * or svc_process could drop an RPC, before the Reply is sent. 
+ */ +void svc_rdma_release_rqst(struct svc_rqst *rqstp) +{ +	struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt; +	struct svc_xprt *xprt = rqstp->rq_xprt; +	struct svcxprt_rdma *rdma = +		container_of(xprt, struct svcxprt_rdma, sc_xprt); + +	rqstp->rq_xprt_ctxt = NULL; +	if (ctxt) +		svc_rdma_recv_ctxt_put(rdma, ctxt); +} +  static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma,  				struct svc_rdma_recv_ctxt *ctxt)  { @@ -820,6 +840,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)  	__be32 *p;  	int ret; +	rqstp->rq_xprt_ctxt = NULL; +  	spin_lock(&rdma_xprt->sc_rq_dto_lock);  	ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);  	if (ctxt) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index bd7c195d872e..23c2d3ce0dc9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -323,8 +323,6 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)  		if (atomic_sub_return(cc->cc_sqecount,  				      &rdma->sc_sq_avail) > 0) {  			ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); -			trace_svcrdma_post_rw(&cc->cc_cqe, -					      cc->cc_sqecount, ret);  			if (ret)  				break;  			return 0; @@ -337,6 +335,7 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)  		trace_svcrdma_sq_retry(rdma);  	} while (1); +	trace_svcrdma_sq_post_err(rdma, ret);  	set_bit(XPT_CLOSE, &xprt->xpt_flags);  	/* If even one was posted, there will be a completion. */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 90cba3058f04..b6c8643867f2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -322,15 +322,17 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)  		}  		svc_xprt_get(&rdma->sc_xprt); +		trace_svcrdma_post_send(wr);  		ret = ib_post_send(rdma->sc_qp, wr, NULL); -		trace_svcrdma_post_send(wr, ret); -		if (ret) { -			set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); -			svc_xprt_put(&rdma->sc_xprt); -			wake_up(&rdma->sc_send_wait); -		} -		break; +		if (ret) +			break; +		return 0;  	} + +	trace_svcrdma_sq_post_err(rdma, ret); +	set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); +	svc_xprt_put(&rdma->sc_xprt); +	wake_up(&rdma->sc_send_wait);  	return ret;  } @@ -924,12 +926,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)  	ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);  	if (ret < 0)  		goto err1; -	ret = 0; - -out: -	rqstp->rq_xprt_ctxt = NULL; -	svc_rdma_recv_ctxt_put(rdma, rctxt); -	return ret; +	return 0;   err2:  	if (ret != -E2BIG && ret != -EINVAL) @@ -938,16 +935,14 @@ out:  	ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp);  	if (ret < 0)  		goto err1; -	ret = 0; -	goto out; +	return 0;   err1:  	svc_rdma_send_ctxt_put(rdma, sctxt);   err0:  	trace_svcrdma_send_failed(rqstp, ret);  	set_bit(XPT_CLOSE, &xprt->xpt_flags); -	ret = -ENOTCONN; -	goto out; +	return -ENOTCONN;  }  /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 8bb99980ae85..ea54785db4f8 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -71,7 +71,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,  					struct sockaddr *sa, int salen,  					int flags);  static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); -static void svc_rdma_release_rqst(struct svc_rqst *);  static void svc_rdma_detach(struct svc_xprt *xprt);  static void svc_rdma_free(struct svc_xprt 
*xprt);  static int svc_rdma_has_wspace(struct svc_xprt *xprt); @@ -552,10 +551,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)  	return NULL;  } -static void svc_rdma_release_rqst(struct svc_rqst *rqstp) -{ -} -  /*   * When connected, an svc_xprt has at least two references:   * diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index cdd84c09df10..05c4d3a9cda2 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -289,6 +289,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)  	case RDMA_CM_EVENT_DISCONNECTED:  		ep->re_connect_status = -ECONNABORTED;  disconnected: +		xprt_force_disconnect(xprt);  		return rpcrdma_ep_destroy(ep);  	default:  		break; @@ -1355,8 +1356,8 @@ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)  		--ep->re_send_count;  	} +	trace_xprtrdma_post_send(req);  	rc = frwr_send(r_xprt, req); -	trace_xprtrdma_post_send(req, rc);  	if (rc)  		return -ENOTCONN;  	return 0; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0bda8a73e8a8..845d0be805ec 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2584,6 +2584,7 @@ static int bc_send_request(struct rpc_rqst *req)  static void bc_close(struct rpc_xprt *xprt)  { +	xprt_disconnect_done(xprt);  }  /* diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index c8c47fc72653..8c47ded2edb6 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1712,6 +1712,7 @@ exit:  	case -EBUSY:  		this_cpu_inc(stats->stat[STAT_ASYNC]);  		*skb = NULL; +		tipc_aead_put(aead);  		return rc;  	default:  		this_cpu_inc(stats->stat[STAT_NOK]); diff --git a/net/tipc/link.c b/net/tipc/link.c index 467c53a1fb5c..d4675e922a8f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1065,7 +1065,7 @@ static void tipc_link_update_cwin(struct tipc_link *l, int released,  	/* Enter fast recovery */  	if (unlikely(retransmitted)) {  		l->ssthresh = max_t(u16, l->window / 2, 300); -		l->window = l->ssthresh; +		l->window = min_t(u16, l->ssthresh, l->window);  		return;  	}  	/* Enter slow start */ diff --git a/net/tipc/node.c b/net/tipc/node.c index 10292c942384..803a3a6d0f50 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2038,6 +2038,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)  		n = tipc_node_find_by_id(net, ehdr->id);  	}  	tipc_crypto_rcv(net, (n) ? 
n->crypto_rx : NULL, &skb, b); +	tipc_node_put(n);  	if (!skb)  		return; @@ -2090,7 +2091,7 @@ rcv:  	/* Check/update node state before receiving */  	if (unlikely(skb)) {  		if (unlikely(skb_linearize(skb))) -			goto discard; +			goto out_node_put;  		tipc_node_write_lock(n);  		if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) {  			if (le->link) { @@ -2119,6 +2120,7 @@ rcv:  	if (!skb_queue_empty(&xmitq))  		tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); +out_node_put:  	tipc_node_put(n);  discard:  	kfree_skb(skb); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 87466607097f..e370ad0edd76 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1739,22 +1739,21 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb,  	return 0;  } -static void tipc_sk_send_ack(struct tipc_sock *tsk) +static struct sk_buff *tipc_sk_build_ack(struct tipc_sock *tsk)  {  	struct sock *sk = &tsk->sk; -	struct net *net = sock_net(sk);  	struct sk_buff *skb = NULL;  	struct tipc_msg *msg;  	u32 peer_port = tsk_peer_port(tsk);  	u32 dnode = tsk_peer_node(tsk);  	if (!tipc_sk_connected(sk)) -		return; +		return NULL;  	skb = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0,  			      dnode, tsk_own_node(tsk), peer_port,  			      tsk->portid, TIPC_OK);  	if (!skb) -		return; +		return NULL;  	msg = buf_msg(skb);  	msg_set_conn_ack(msg, tsk->rcv_unacked);  	tsk->rcv_unacked = 0; @@ -1764,7 +1763,19 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk)  		tsk->rcv_win = tsk_adv_blocks(tsk->sk.sk_rcvbuf);  		msg_set_adv_win(msg, tsk->rcv_win);  	} -	tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg)); +	return skb; +} + +static void tipc_sk_send_ack(struct tipc_sock *tsk) +{ +	struct sk_buff *skb; + +	skb = tipc_sk_build_ack(tsk); +	if (!skb) +		return; + +	tipc_node_xmit_skb(sock_net(&tsk->sk), skb, tsk_peer_node(tsk), +			   msg_link_selector(buf_msg(skb)));  }  static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) @@ -1938,7 +1949,6 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m,  	bool peek = flags & MSG_PEEK;  	int offset, required, copy, copied = 0;  	int hlen, dlen, err, rc; -	bool ack = false;  	long timeout;  	/* Catch invalid receive attempts */ @@ -1983,7 +1993,6 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m,  		/* Copy data if msg ok, otherwise return error/partial data */  		if (likely(!err)) { -			ack = msg_ack_required(hdr);  			offset = skb_cb->bytes_read;  			copy = min_t(int, dlen - offset, buflen - copied);  			rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); @@ -2011,7 +2020,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m,  		/* Send connection flow control advertisement when applicable */  		tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); -		if (ack || tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) +		if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)  			tipc_sk_send_ack(tsk);  		/* Exit if all requested data or FIN/error received */ @@ -2105,9 +2114,11 @@ static void tipc_sk_proto_rcv(struct sock *sk,   * tipc_sk_filter_connect - check incoming message for a connection-based socket   * @tsk: TIPC socket   * @skb: pointer to message buffer. 
+ * @xmitq: for Nagle ACK if any   * Returns true if message should be added to receive queue, false otherwise   */ -static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) +static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, +				   struct sk_buff_head *xmitq)  {  	struct sock *sk = &tsk->sk;  	struct net *net = sock_net(sk); @@ -2171,8 +2182,17 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)  		if (!skb_queue_empty(&sk->sk_write_queue))  			tipc_sk_push_backlog(tsk);  		/* Accept only connection-based messages sent by peer */ -		if (likely(con_msg && !err && pport == oport && pnode == onode)) +		if (likely(con_msg && !err && pport == oport && +			   pnode == onode)) { +			if (msg_ack_required(hdr)) { +				struct sk_buff *skb; + +				skb = tipc_sk_build_ack(tsk); +				if (skb) +					__skb_queue_tail(xmitq, skb); +			}  			return true; +		}  		if (!tsk_peer_msg(tsk, hdr))  			return false;  		if (!err) @@ -2267,7 +2287,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb,  	while ((skb = __skb_dequeue(&inputq))) {  		hdr = buf_msg(skb);  		limit = rcvbuf_limit(sk, skb); -		if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) || +		if ((sk_conn && !tipc_sk_filter_connect(tsk, skb, xmitq)) ||  		    (!sk_conn && msg_connected(hdr)) ||  		    (!grp && msg_in_group(hdr)))  			err = TIPC_ERR_NO_PORT; diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index aa015c233898..6ebbec1bedd1 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -96,6 +96,16 @@ void tipc_sub_get(struct tipc_subscription *subscription);  		(swap_ ? swab32(val__) : val__);			\  	}) +/* tipc_sub_write - write val_ to field_ of struct sub_ in user endian format + */ +#define tipc_sub_write(sub_, field_, val_)				\ +	({								\ +		struct tipc_subscr *sub__ = sub_;			\ +		u32 val__ = val_;					\ +		int swap_ = !((sub__)->filter & TIPC_FILTER_MASK);	\ +		(sub__)->field_ = swap_ ? 
swab32(val__) : val__;	\ +	}) +  /* tipc_evt_write - write val_ to field_ of struct evt_ in user endian format   */  #define tipc_evt_write(evt_, field_, val_)				\ diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 3a12fc18239b..446af7bbd13e 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -237,8 +237,8 @@ static void tipc_conn_delete_sub(struct tipc_conn *con, struct tipc_subscr *s)  		if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) {  			tipc_sub_unsubscribe(sub);  			atomic_dec(&tn->subscription_count); -		} else if (s) { -			break; +			if (s) +				break;  		}  	}  	spin_unlock_bh(&con->sub_lock); @@ -362,9 +362,10 @@ static int tipc_conn_rcv_sub(struct tipc_topsrv *srv,  {  	struct tipc_net *tn = tipc_net(srv->net);  	struct tipc_subscription *sub; +	u32 s_filter = tipc_sub_read(s, filter); -	if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) { -		s->filter &= __constant_ntohl(~TIPC_SUB_CANCEL); +	if (s_filter & TIPC_SUB_CANCEL) { +		tipc_sub_write(s, filter, s_filter & ~TIPC_SUB_CANCEL);  		tipc_conn_delete_sub(con, s);  		return 0;  	} @@ -400,12 +401,15 @@ static int tipc_conn_rcv_from_sock(struct tipc_conn *con)  		return -EWOULDBLOCK;  	if (ret == sizeof(s)) {  		read_lock_bh(&sk->sk_callback_lock); -		ret = tipc_conn_rcv_sub(srv, con, &s); +		/* RACE: the connection can be closed in the meantime */ +		if (likely(connected(con))) +			ret = tipc_conn_rcv_sub(srv, con, &s);  		read_unlock_bh(&sk->sk_callback_lock); +		if (!ret) +			return 0;  	} -	if (ret < 0) -		tipc_conn_close(con); +	tipc_conn_close(con);  	return ret;  } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index d6620ad53546..28a283f26a8d 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -161,9 +161,11 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,  			 struct udp_bearer *ub, struct udp_media_addr *src,  			 struct udp_media_addr *dst, struct dst_cache *cache)  { -	struct dst_entry *ndst = dst_cache_get(cache); +	struct dst_entry *ndst;  	int ttl, err = 0; +	local_bh_disable(); +	ndst = dst_cache_get(cache);  	if (dst->proto == htons(ETH_P_IP)) {  		struct rtable *rt = (struct rtable *)ndst; @@ -210,9 +212,11 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,  					   src->port, dst->port, false);  #endif  	} +	local_bh_enable();  	return err;  tx_error: +	local_bh_enable();  	kfree_skb(skb);  	return err;  } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 156efce50dbd..0e989005bdc2 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -56,9 +56,9 @@ enum {  	TLS_NUM_PROTS,  }; -static struct proto *saved_tcpv6_prot; +static const struct proto *saved_tcpv6_prot;  static DEFINE_MUTEX(tcpv6_prot_mutex); -static struct proto *saved_tcpv4_prot; +static const struct proto *saved_tcpv4_prot;  static DEFINE_MUTEX(tcpv4_prot_mutex);  static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];  static struct proto_ops tls_sw_proto_ops; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index c98e602a1a2d..8c2763eb6aae 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -206,10 +206,12 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err)  	kfree(aead_req); +	spin_lock_bh(&ctx->decrypt_compl_lock);  	pending = atomic_dec_return(&ctx->decrypt_pending); -	if (!pending && READ_ONCE(ctx->async_notify)) +	if (!pending && ctx->async_notify)  		complete(&ctx->async_wait.completion); +	spin_unlock_bh(&ctx->decrypt_compl_lock);  }  static int tls_do_decryption(struct sock *sk, @@ -467,10 +469,12 @@ static void 
tls_encrypt_done(struct crypto_async_request *req, int err)  			ready = true;  	} +	spin_lock_bh(&ctx->encrypt_compl_lock);  	pending = atomic_dec_return(&ctx->encrypt_pending); -	if (!pending && READ_ONCE(ctx->async_notify)) +	if (!pending && ctx->async_notify)  		complete(&ctx->async_wait.completion); +	spin_unlock_bh(&ctx->encrypt_compl_lock);  	if (!ready)  		return; @@ -780,7 +784,7 @@ static int tls_push_record(struct sock *sk, int flags,  static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,  			       bool full_record, u8 record_type, -			       size_t *copied, int flags) +			       ssize_t *copied, int flags)  {  	struct tls_context *tls_ctx = tls_get_ctx(sk);  	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); @@ -796,10 +800,13 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,  	psock = sk_psock_get(sk);  	if (!psock || !policy) {  		err = tls_push_record(sk, flags, record_type); -		if (err && err != -EINPROGRESS) { +		if (err && sk->sk_err == EBADMSG) {  			*copied -= sk_msg_free(sk, msg);  			tls_free_open_rec(sk); +			err = -sk->sk_err;  		} +		if (psock) +			sk_psock_put(sk, psock);  		return err;  	}  more_data: @@ -822,9 +829,10 @@ more_data:  	switch (psock->eval) {  	case __SK_PASS:  		err = tls_push_record(sk, flags, record_type); -		if (err && err != -EINPROGRESS) { +		if (err && sk->sk_err == EBADMSG) {  			*copied -= sk_msg_free(sk, msg);  			tls_free_open_rec(sk); +			err = -sk->sk_err;  			goto out_err;  		}  		break; @@ -914,7 +922,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)  	unsigned char record_type = TLS_RECORD_TYPE_DATA;  	bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);  	bool eor = !(msg->msg_flags & MSG_MORE); -	size_t try_to_copy, copied = 0; +	size_t try_to_copy; +	ssize_t copied = 0;  	struct sk_msg *msg_pl, *msg_en;  	struct tls_rec *rec;  	int required_size; @@ -924,6 +933,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)  	int num_zc = 0;  	int orig_size;  	int ret = 0; +	int pending;  	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))  		return -EOPNOTSUPP; @@ -1090,13 +1100,19 @@ trim_sgl:  		goto send_end;  	} else if (num_zc) {  		/* Wait for pending encryptions to get completed */ -		smp_store_mb(ctx->async_notify, true); +		spin_lock_bh(&ctx->encrypt_compl_lock); +		ctx->async_notify = true; -		if (atomic_read(&ctx->encrypt_pending)) +		pending = atomic_read(&ctx->encrypt_pending); +		spin_unlock_bh(&ctx->encrypt_compl_lock); +		if (pending)  			crypto_wait_req(-EINPROGRESS, &ctx->async_wait);  		else  			reinit_completion(&ctx->async_wait.completion); +		/* There can be no concurrent accesses, since we have no +		 * pending encrypt operations +		 */  		WRITE_ONCE(ctx->async_notify, false);  		if (ctx->async_wait.err) { @@ -1116,7 +1132,7 @@ send_end:  	release_sock(sk);  	mutex_unlock(&tls_ctx->tx_lock); -	return copied ? copied : ret; +	return copied > 0 ? copied : ret;  }  static int tls_sw_do_sendpage(struct sock *sk, struct page *page, @@ -1130,7 +1146,7 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page,  	struct sk_msg *msg_pl;  	struct tls_rec *rec;  	int num_async = 0; -	size_t copied = 0; +	ssize_t copied = 0;  	bool full_record;  	int record_room;  	int ret = 0; @@ -1232,7 +1248,7 @@ wait_for_memory:  	}  sendpage_end:  	ret = sk_stream_error(sk, flags, ret); -	return copied ? copied : ret; +	return copied > 0 ? 
copied : ret;  }  int tls_sw_sendpage_locked(struct sock *sk, struct page *page, @@ -1727,6 +1743,7 @@ int tls_sw_recvmsg(struct sock *sk,  	bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);  	bool is_peek = flags & MSG_PEEK;  	int num_async = 0; +	int pending;  	flags |= nonblock; @@ -1889,8 +1906,11 @@ pick_next_record:  recv_end:  	if (num_async) {  		/* Wait for all previously submitted records to be decrypted */ -		smp_store_mb(ctx->async_notify, true); -		if (atomic_read(&ctx->decrypt_pending)) { +		spin_lock_bh(&ctx->decrypt_compl_lock); +		ctx->async_notify = true; +		pending = atomic_read(&ctx->decrypt_pending); +		spin_unlock_bh(&ctx->decrypt_compl_lock); +		if (pending) {  			err = crypto_wait_req(-EINPROGRESS, &ctx->async_wait);  			if (err) {  				/* one of async decrypt failed */ @@ -1902,6 +1922,10 @@ recv_end:  		} else {  			reinit_completion(&ctx->async_wait.completion);  		} + +		/* There can be no concurrent accesses, since we have no +		 * pending decrypt operations +		 */  		WRITE_ONCE(ctx->async_notify, false);  		/* Drain records from the rx_list & copy if required */ @@ -2081,8 +2105,9 @@ static void tls_data_ready(struct sock *sk)  	strp_data_ready(&ctx->strp);  	psock = sk_psock_get(sk); -	if (psock && !list_empty(&psock->ingress_msg)) { -		ctx->saved_data_ready(sk); +	if (psock) { +		if (!list_empty(&psock->ingress_msg)) +			ctx->saved_data_ready(sk);  		sk_psock_put(sk, psock);  	}  } @@ -2287,6 +2312,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)  	if (tx) {  		crypto_init_wait(&sw_ctx_tx->async_wait); +		spin_lock_init(&sw_ctx_tx->encrypt_compl_lock);  		crypto_info = &ctx->crypto_send.info;  		cctx = &ctx->tx;  		aead = &sw_ctx_tx->aead_send; @@ -2295,6 +2321,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)  		sw_ctx_tx->tx_work.sk = sk;  	} else {  		crypto_init_wait(&sw_ctx_rx->async_wait); +		spin_lock_init(&sw_ctx_rx->decrypt_compl_lock);  		crypto_info = &ctx->crypto_recv.info;  		cctx = &ctx->rx;  		skb_queue_head_init(&sw_ctx_rx->rx_list); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index a5f28708e0e7..626bf9044418 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1408,7 +1408,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,  	/* Wait for children sockets to appear; these are the new sockets  	 * created upon connection establishment.  	 
*/ -	timeout = sock_sndtimeo(listener, flags & O_NONBLOCK); +	timeout = sock_rcvtimeo(listener, flags & O_NONBLOCK);  	prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);  	while ((connected = vsock_dequeue_accept(listener)) == NULL && diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 709038a4783e..0edda1edf988 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -157,7 +157,11 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)  void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt)  { +	if (pkt->tap_delivered) +		return; +  	vsock_deliver_tap(virtio_transport_build_skb, pkt); +	pkt->tap_delivered = true;  }  EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); @@ -1128,6 +1132,14 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,  	lock_sock(sk); +	/* Check if sk has been released before lock_sock */ +	if (sk->sk_shutdown == SHUTDOWN_MASK) { +		(void)virtio_transport_reset_no_sock(t, pkt); +		release_sock(sk); +		sock_put(sk); +		goto free_pkt; +	} +  	/* Update CID in case it has changed after a transport reset event */  	vsk->local_addr.svm_cid = dst.svm_cid; diff --git a/net/wireless/core.c b/net/wireless/core.c index 341402b4f178..ce024440fa51 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -142,7 +142,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,  	if (result)  		return result; -	if (rdev->wiphy.debugfsdir) +	if (!IS_ERR_OR_NULL(rdev->wiphy.debugfsdir))  		debugfs_rename(rdev->wiphy.debugfsdir->d_parent,  			       rdev->wiphy.debugfsdir,  			       rdev->wiphy.debugfsdir->d_parent, newname); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5fa402144cda..692bcd35f809 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -644,10 +644,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {  	[NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY,  					 .len = NL80211_HE_MAX_CAPABILITY_LEN }, -	[NL80211_ATTR_FTM_RESPONDER] = { -		.type = NLA_NESTED, -		.validation_data = nl80211_ftm_responder_policy, -	}, +	[NL80211_ATTR_FTM_RESPONDER] = +		NLA_POLICY_NESTED(nl80211_ftm_responder_policy),  	[NL80211_ATTR_TIMEOUT] = NLA_POLICY_MIN(NLA_U32, 1),  	[NL80211_ATTR_PEER_MEASUREMENTS] =  		NLA_POLICY_NESTED(nl80211_pmsr_attr_policy), diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 00e782335cb0..25bf72ee6cad 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -115,8 +115,10 @@ int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev,  		goto drop;  	} -	if (!pskb_may_pull(skb, 1)) +	if (!pskb_may_pull(skb, 1)) { +		x25_neigh_put(nb);  		return 0; +	}  	switch (skb->data[0]) { diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 8aa415a38814..0285aaa1e93c 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -357,6 +357,12 @@ void x25_disconnect(struct sock *sk, int reason, unsigned char cause,  		sk->sk_state_change(sk);  		sock_set_flag(sk, SOCK_DEAD);  	} +	if (x25->neighbour) { +		read_lock_bh(&x25_list_lock); +		x25_neigh_put(x25->neighbour); +		x25->neighbour = NULL; +		read_unlock_bh(&x25_list_lock); +	}  }  /* diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index fa7bb5e060d0..3889bd9aec46 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -341,9 +341,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)  {  	bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;  	u32 
chunk_size = mr->chunk_size, headroom = mr->headroom; +	u64 npgs, addr = mr->addr, size = mr->len;  	unsigned int chunks, chunks_per_page; -	u64 addr = mr->addr, size = mr->len; -	int size_chk, err; +	int err;  	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {  		/* Strictly speaking we could support this, if: @@ -372,6 +372,10 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)  	if ((addr + size) < addr)  		return -EINVAL; +	npgs = div_u64(size, PAGE_SIZE); +	if (npgs > U32_MAX) +		return -EINVAL; +  	chunks = (unsigned int)div_u64(size, chunk_size);  	if (chunks == 0)  		return -EINVAL; @@ -382,8 +386,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)  			return -EINVAL;  	} -	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM; -	if (size_chk < 0) +	if (headroom >= chunk_size - XDP_PACKET_HEADROOM)  		return -EINVAL;  	umem->address = (unsigned long)addr; @@ -392,7 +395,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)  	umem->size = size;  	umem->headroom = headroom;  	umem->chunk_size_nohr = chunk_size - headroom; -	umem->npgs = size / PAGE_SIZE; +	umem->npgs = (u32)npgs;  	umem->pgs = NULL;  	umem->user = NULL;  	umem->flags = mr->flags; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 356f90e4522b..c350108aa38d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -131,8 +131,9 @@ static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,  		u64 page_start = addr & ~(PAGE_SIZE - 1);  		u64 first_len = PAGE_SIZE - (addr - page_start); -		memcpy(to_buf, from_buf, first_len + metalen); -		memcpy(next_pg_addr, from_buf + first_len, len - first_len); +		memcpy(to_buf, from_buf, first_len); +		memcpy(next_pg_addr, from_buf + first_len, +		       len + metalen - first_len);  		return;  	} diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 037ea156d2f9..5a0ff665b71a 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -379,6 +379,7 @@ static void espintcp_destruct(struct sock *sk)  {  	struct espintcp_ctx *ctx = espintcp_getctx(sk); +	ctx->saved_destruct(sk);  	kfree(ctx);  } @@ -419,6 +420,7 @@ static int espintcp_init_sk(struct sock *sk)  	sk->sk_socket->ops = &espintcp_ops;  	ctx->saved_data_ready = sk->sk_data_ready;  	ctx->saved_write_space = sk->sk_write_space; +	ctx->saved_destruct = sk->sk_destruct;  	sk->sk_data_ready = espintcp_data_ready;  	sk->sk_write_space = espintcp_write_space;  	sk->sk_destruct = espintcp_destruct; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 6cc7f7f1dd68..f50d1f97cf8e 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -25,12 +25,10 @@ static void __xfrm_transport_prep(struct xfrm_state *x, struct sk_buff *skb,  	struct xfrm_offload *xo = xfrm_offload(skb);  	skb_reset_mac_len(skb); -	pskb_pull(skb, skb->mac_len + hsize + x->props.header_len); - -	if (xo->flags & XFRM_GSO_SEGMENT) { -		skb_reset_transport_header(skb); +	if (xo->flags & XFRM_GSO_SEGMENT)  		skb->transport_header -= x->props.header_len; -	} + +	pskb_pull(skb, skb_transport_offset(skb) + x->props.header_len);  }  static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index aa35f23c4912..8a202c44f89a 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -644,7 +644,7 @@ resume:  		dev_put(skb->dev);  		spin_lock(&x->lock); -		if (nexthdr <= 0) { +		if (nexthdr < 0) {  			if (nexthdr == -EBADMSG) {  				xfrm_audit_state_icvfail(x, 
skb,  							 x->type->proto); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 3361e3ac5714..1e115cbf21d3 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -750,7 +750,28 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {  	.get_link_net	= xfrmi_get_link_net,  }; +static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) +{ +	struct net *net; +	LIST_HEAD(list); + +	rtnl_lock(); +	list_for_each_entry(net, net_exit_list, exit_list) { +		struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); +		struct xfrm_if __rcu **xip; +		struct xfrm_if *xi; + +		for (xip = &xfrmn->xfrmi[0]; +		     (xi = rtnl_dereference(*xip)) != NULL; +		     xip = &xi->next) +			unregister_netdevice_queue(xi->dev, &list); +	} +	unregister_netdevice_many(&list); +	rtnl_unlock(); +} +  static struct pernet_operations xfrmi_net_ops = { +	.exit_batch = xfrmi_exit_batch_net,  	.id   = &xfrmi_net_id,  	.size = sizeof(struct xfrmi_net),  }; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 2fd3d990d992..69c4900db817 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -583,18 +583,20 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)  		xfrm_state_hold(x);  		if (skb_is_gso(skb)) { -			skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; +			if (skb->inner_protocol) +				return xfrm_output_gso(net, sk, skb); -			return xfrm_output2(net, sk, skb); +			skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; +			goto out;  		}  		if (x->xso.dev && x->xso.dev->features & NETIF_F_HW_ESP_TX_CSUM)  			goto out; +	} else { +		if (skb_is_gso(skb)) +			return xfrm_output_gso(net, sk, skb);  	} -	if (skb_is_gso(skb)) -		return xfrm_output_gso(net, sk, skb); -  	if (skb->ip_summed == CHECKSUM_PARTIAL) {  		err = skb_checksum_help(skb);  		if (err) { @@ -640,7 +642,8 @@ void xfrm_local_error(struct sk_buff *skb, int mtu)  	if (skb->protocol == htons(ETH_P_IP))  		proto = AF_INET; -	else if (skb->protocol == htons(ETH_P_IPV6)) +	else if (skb->protocol == htons(ETH_P_IPV6) && +		 skb->sk->sk_family == AF_INET6)  		proto = AF_INET6;  	else  		return; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 297b2fdb3c29..564aa6492e7c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1436,12 +1436,7 @@ static void xfrm_policy_requeue(struct xfrm_policy *old,  static bool xfrm_policy_mark_match(struct xfrm_policy *policy,  				   struct xfrm_policy *pol)  { -	u32 mark = policy->mark.v & policy->mark.m; - -	if (policy->mark.v == pol->mark.v && policy->mark.m == pol->mark.m) -		return true; - -	if ((mark & pol->mark.m) == pol->mark.v && +	if (policy->mark.v == pol->mark.v &&  	    policy->priority == pol->priority)  		return true;  | 
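Editor's note on the final xfrm_policy.c hunk: the removed fallback compared masked mark values, so two policies with different (value, mask) pairs could still be reported as matching and be requeued against each other; the replacement demands exact value/mask (and priority) equality. A toy demonstration of the over-match (priority check omitted, values invented):

#include <stdio.h>
#include <stdint.h>

struct mark { uint32_t v, m; };

/* Old, looser test: falls back to comparing masked values. */
static int old_match(struct mark a, struct mark b)
{
	if (a.v == b.v && a.m == b.m)
		return 1;
	return ((a.v & a.m) & b.m) == b.v;
}

/* New, exact test. */
static int new_match(struct mark a, struct mark b)
{
	return a.v == b.v && a.m == b.m;
}

int main(void)
{
	struct mark a = { .v = 0x1, .m = 0xf };	/* low nibble must be 1 */
	struct mark b = { .v = 0x1, .m = 0x1 };	/* only bit 0 must be 1 */

	/* old reports a match (1) despite different masks; new does not. */
	printf("old=%d new=%d\n", old_match(a, b), new_match(a, b));
	return 0;
}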
