From e8d67fa5696e2fcaf956dae36d11e6eff5246101 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Thu, 30 May 2019 00:51:26 +0300
Subject: net: dsa: sja1105: Don't store frame type in skb->cb

Due to a confusion I thought that eth_type_trans() was called by the
network stack whereas it can actually be called by network drivers to
figure out the skb protocol and next packet_type handlers.

In light of the above, it is not safe to store the frame type from the
DSA tagger's .filter callback (first entry point on RX path), since GRO
is yet to be invoked on the received traffic.  Hence it is very likely
that the skb->cb will actually get overwritten between eth_type_trans()
and the actual DSA packet_type handler.

Of course, what this patch fixes is the actual overwriting of the
SJA1105_SKB_CB(skb)->type field from the GRO layer, which made all
frames be seen as SJA1105_FRAME_TYPE_NORMAL (0).

Fixes: 227d07a07ef1 ("net: dsa: sja1105: Add support for traffic through standalone ports")
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/tag_sja1105.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 969402c7dbf1..d43737e6c3fb 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -28,14 +28,10 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb)
  */
 static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev)
 {
-	if (sja1105_is_link_local(skb)) {
-		SJA1105_SKB_CB(skb)->type = SJA1105_FRAME_TYPE_LINK_LOCAL;
+	if (sja1105_is_link_local(skb))
 		return true;
-	}
-	if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) {
-		SJA1105_SKB_CB(skb)->type = SJA1105_FRAME_TYPE_NORMAL;
+	if (!dsa_port_is_vlan_filtering(dev->dsa_ptr))
 		return true;
-	}
 	return false;
 }
 
@@ -84,7 +80,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 
 	skb->offload_fwd_mark = 1;
 
-	if (SJA1105_SKB_CB(skb)->type == SJA1105_FRAME_TYPE_LINK_LOCAL) {
+	if (sja1105_is_link_local(skb)) {
 		/* Management traffic path. Switch embeds the switch ID and
 		 * port ID into bytes of the destination MAC, courtesy of
 		 * the incl_srcpt options.
-- 
cgit v1.2.3-59-g8ed1b


From afa0925c6fcc6a8f610e996ca09bc3215048033c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 31 May 2019 12:37:23 -0400
Subject: packet: unconditionally free po->rollover

Rollover used to use a complex RCU mechanism for assignment, which had
a race condition. The below patch fixed the bug and greatly simplified
the logic.

The feature depends on fanout, but the state is private to the socket.
Fanout_release returns f only when the last member leaves and the
fanout struct is to be freed.

Destroy rollover unconditionally, regardless of fanout state.

Fixes: 57f015f5eccf2 ("packet: fix crash in fanout_demux_rollover()")
Reported-by: syzbot <syzkaller@googlegroups.com>
Diagnosed-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index fbc775fbf712..d4889bf7248e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3014,8 +3014,8 @@ static int packet_release(struct socket *sock)
 
 	synchronize_net();
 
+	kfree(po->rollover);
 	if (f) {
-		kfree(po->rollover);
 		fanout_release_data(f);
 		kfree(f);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 27393f8c6efc03b8e0b64134721b0d337fca0a80 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 4 Jun 2019 12:00:11 -0700
Subject: Revert "net/tls: avoid NULL-deref on resync during device removal"

This reverts commit 38030d7cb77963ba84cdbe034806e2b81245339f.
Unfortunately the RX resync may get called from soft IRQ,
so we can't take the rwsem to protect from the device
disappearing.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_device.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index b95c408fd771..49b3a2ff8ef3 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -553,8 +553,8 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
 void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct net_device *netdev = tls_ctx->netdev;
 	struct tls_offload_context_rx *rx_ctx;
-	struct net_device *netdev;
 	u32 is_req_pending;
 	s64 resync_req;
 	u32 req_seq;
@@ -568,15 +568,10 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 	is_req_pending = resync_req;
 
 	if (unlikely(is_req_pending) && req_seq == seq &&
-	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) {
-		seq += TLS_HEADER_SIZE - 1;
-		down_read(&device_offload_lock);
-		netdev = tls_ctx->netdev;
-		if (netdev)
-			netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq,
-							      rcd_sn);
-		up_read(&device_offload_lock);
-	}
+	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
+		netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk,
+						      seq + TLS_HEADER_SIZE - 1,
+						      rcd_sn);
 }
 
 static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
-- 
cgit v1.2.3-59-g8ed1b


From e52972c11d6b1262964db96d65934196db621685 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 4 Jun 2019 12:00:12 -0700
Subject: net/tls: replace the sleeping lock around RX resync with a bit lock

Commit 38030d7cb779 ("net/tls: avoid NULL-deref on resync during device removal")
tried to fix a potential NULL-dereference by taking the
context rwsem.  Unfortunately the RX resync may get called
from soft IRQ, so we can't use the rwsem to protect from
the device disappearing.  Because we are guaranteed there
can be only one resync at a time (it's called from strparser)
use a bit to indicate resync is busy and make device
removal wait for the bit to get cleared.

Note that there is a leftover "flags" field in struct
tls_context already.

Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h    |  4 ++++
 net/tls/tls_device.c | 27 +++++++++++++++++++++------
 2 files changed, 25 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/tls.h b/include/net/tls.h
index 39ea62f0c1f6..4a55ce6a303f 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -209,6 +209,10 @@ struct tls_offload_context_tx {
 	(ALIGN(sizeof(struct tls_offload_context_tx), sizeof(void *)) +        \
 	 TLS_DRIVER_STATE_SIZE)
 
+enum tls_context_flags {
+	TLS_RX_SYNC_RUNNING = 0,
+};
+
 struct cipher_context {
 	char *iv;
 	char *rec_seq;
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 49b3a2ff8ef3..1f9cf57d9754 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -550,10 +550,22 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
 	}
 }
 
+static void tls_device_resync_rx(struct tls_context *tls_ctx,
+				 struct sock *sk, u32 seq, u64 rcd_sn)
+{
+	struct net_device *netdev;
+
+	if (WARN_ON(test_and_set_bit(TLS_RX_SYNC_RUNNING, &tls_ctx->flags)))
+		return;
+	netdev = READ_ONCE(tls_ctx->netdev);
+	if (netdev)
+		netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, rcd_sn);
+	clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags);
+}
+
 void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct net_device *netdev = tls_ctx->netdev;
 	struct tls_offload_context_rx *rx_ctx;
 	u32 is_req_pending;
 	s64 resync_req;
@@ -568,10 +580,10 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 	is_req_pending = resync_req;
 
 	if (unlikely(is_req_pending) && req_seq == seq &&
-	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
-		netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk,
-						      seq + TLS_HEADER_SIZE - 1,
-						      rcd_sn);
+	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) {
+		seq += TLS_HEADER_SIZE - 1;
+		tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn);
+	}
 }
 
 static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
@@ -972,7 +984,10 @@ static int tls_device_down(struct net_device *netdev)
 		if (ctx->rx_conf == TLS_HW)
 			netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
 							TLS_OFFLOAD_CTX_DIR_RX);
-		ctx->netdev = NULL;
+		WRITE_ONCE(ctx->netdev, NULL);
+		smp_mb__before_atomic(); /* pairs with test_and_set_bit() */
+		while (test_bit(TLS_RX_SYNC_RUNNING, &ctx->flags))
+			usleep_range(10, 200);
 		dev_put(netdev);
 		list_del_init(&ctx->list);
 
-- 
cgit v1.2.3-59-g8ed1b


From 82ba25c6de200d7a9e9c970c998cdd6dfa8637ae Mon Sep 17 00:00:00 2001
From: Tim Beale <timbeale@catalyst.net.nz>
Date: Tue, 4 Jun 2019 13:56:23 +1200
Subject: udp: only choose unbound UDP socket for multicast when not in a VRF

By default, packets received in another VRF should not be passed to an
unbound socket in the default VRF. This patch updates the IPv4 UDP
multicast logic to match the unicast VRF logic (in compute_score()),
as well as the IPv6 mcast logic (in __udp_v6_is_mcast_sock()).

The particular case I noticed was DHCP discover packets going
to the 255.255.255.255 address, which are handled by
__udp4_lib_mcast_deliver(). The previous code meant that running
multiple different DHCP server or relay agent instances across VRFs
did not work correctly - any server/relay agent in the default VRF
received DHCP discover packets for all other VRFs.

Fixes: 6da5b0f027a8 ("net: ensure unbound datagram socket to be chosen when not in a VRF")
Signed-off-by: Tim Beale <timbeale@catalyst.net.nz>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8fb250ed53d4..efe9283a8a95 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -538,8 +538,7 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 	    (inet->inet_dport != rmt_port && inet->inet_dport) ||
 	    (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
 	    ipv6_only_sock(sk) ||
-	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
-	     sk->sk_bound_dev_if != sdif))
+	    !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
 		return false;
 	if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
 		return false;
-- 
cgit v1.2.3-59-g8ed1b


From fdf71426e7c548d6e4f5e51e0238732d60e05f5f Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 4 Jun 2019 11:44:06 +0200
Subject: net: fix indirect calls helpers for ptype list hooks.

As Eric noted, the current wrapper for ptype func hook inside
__netif_receive_skb_list_ptype() has no chance of avoiding the indirect
call: we enter such code path only for protocols other than ipv4 and
ipv6.

Instead we can wrap the list_func invocation.

v1 -> v2:
 - use the correct fix tag

Fixes: f5737cbadb7d ("net: use indirect calls helpers for ptype hook")
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Edward Cree <ecree@solarflare.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 66f7508825bd..1c4593ec4409 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5025,12 +5025,12 @@ static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 	if (list_empty(head))
 		return;
 	if (pt_prev->list_func != NULL)
-		pt_prev->list_func(head, pt_prev, orig_dev);
+		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
+				   ip_list_rcv, head, pt_prev, orig_dev);
 	else
 		list_for_each_entry_safe(skb, next, head, list) {
 			skb_list_del_init(skb);
-			INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
-					   skb->dev, pt_prev, orig_dev);
+			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 		}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 0a90478b93a46bdcd56ba33c37566a993e455d54 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 2 Jun 2019 19:10:24 +0800
Subject: ipv4: not do cache for local delivery if bc_forwarding is enabled

With the topo:

    h1 ---| rp1            |
          |     route  rp3 |--- h3 (192.168.200.1)
    h2 ---| rp2            |

If rp1 bc_forwarding is set while rp2 bc_forwarding is not, after
doing "ping 192.168.200.255" on h1, then ping 192.168.200.255 on
h2, and the packets can still be forwared.

This issue was caused by the input route cache. It should only do
the cache for either bc forwarding or local delivery. Otherwise,
local delivery can use the route cache for bc forwarding of other
interfaces.

This patch is to fix it by not doing cache for local delivery if
all.bc_forwarding is enabled.

Note that we don't fix it by checking route cache local flag after
rt_cache_valid() in "local_input:" and "ip_mkroute_input", as the
common route code shouldn't be touched for bc_forwarding.

Fixes: 5cbf777cfdf6 ("route: add support for directed broadcast forwarding")
Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11ddc276776e..91bf75b8ac1c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1985,7 +1985,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	u32		itag = 0;
 	struct rtable	*rth;
 	struct flowi4	fl4;
-	bool do_cache;
+	bool do_cache = true;
 
 	/* IP on this device is disabled. */
 
@@ -2062,6 +2062,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res->type == RTN_BROADCAST) {
 		if (IN_DEV_BFORWARD(in_dev))
 			goto make_route;
+		/* not do cache if bc_forwarding is enabled */
+		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
+			do_cache = false;
 		goto brd_input;
 	}
 
@@ -2099,18 +2102,15 @@ brd_input:
 	RT_CACHE_STAT_INC(in_brd);
 
 local_input:
-	do_cache = false;
-	if (res->fi) {
-		if (!itag) {
-			struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+	do_cache &= res->fi && !itag;
+	if (do_cache) {
+		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
 
-			rth = rcu_dereference(nhc->nhc_rth_input);
-			if (rt_cache_valid(rth)) {
-				skb_dst_set_noref(skb, &rth->dst);
-				err = 0;
-				goto out;
-			}
-			do_cache = true;
+		rth = rcu_dereference(nhc->nhc_rth_input);
+		if (rt_cache_valid(rth)) {
+			skb_dst_set_noref(skb, &rth->dst);
+			err = 0;
+			goto out;
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From b50e058746ba29f517e27299447831ab3d93f896 Mon Sep 17 00:00:00 2001
From: Zhu Yanjun <yanjun.zhu@oracle.com>
Date: Mon, 3 Jun 2019 08:48:19 -0400
Subject: net: rds: fix memory leak when unload rds_rdma
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When KASAN is enabled, after several rds connections are
created, then "rmmod rds_rdma" is run. The following will
appear.

"
BUG rds_ib_incoming (Not tainted): Objects remaining
in rds_ib_incoming on __kmem_cache_shutdown()

Call Trace:
 dump_stack+0x71/0xab
 slab_err+0xad/0xd0
 __kmem_cache_shutdown+0x17d/0x370
 shutdown_cache+0x17/0x130
 kmem_cache_destroy+0x1df/0x210
 rds_ib_recv_exit+0x11/0x20 [rds_rdma]
 rds_ib_exit+0x7a/0x90 [rds_rdma]
 __x64_sys_delete_module+0x224/0x2c0
 ? __ia32_sys_delete_module+0x2c0/0x2c0
 do_syscall_64+0x73/0x190
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
"
This is rds connection memory leak. The root cause is:
When "rmmod rds_rdma" is run, rds_ib_remove_one will call
rds_ib_dev_shutdown to drop the rds connections.
rds_ib_dev_shutdown will call rds_conn_drop to drop rds
connections as below.
"
rds_conn_path_drop(&conn->c_path[0], false);
"
In the above, destroy is set to false.
void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
{
        atomic_set(&cp->cp_state, RDS_CONN_ERROR);

        rcu_read_lock();
        if (!destroy && rds_destroy_pending(cp->cp_conn)) {
                rcu_read_unlock();
                return;
        }
        queue_work(rds_wq, &cp->cp_down_w);
        rcu_read_unlock();
}
In the above function, destroy is set to false. rds_destroy_pending
is called. This does not move rds connections to ib_nodev_conns.
So destroy is set to true to move rds connections to ib_nodev_conns.
In rds_ib_unregister_client, flush_workqueue is called to make rds_wq
finsh shutdown rds connections. The function rds_ib_destroy_nodev_conns
is called to shutdown rds connections finally.
Then rds_ib_recv_exit is called to destroy slab.

void rds_ib_recv_exit(void)
{
        kmem_cache_destroy(rds_ib_incoming_slab);
        kmem_cache_destroy(rds_ib_frag_slab);
}
The above slab memory leak will not occur again.

>From tests,
256 rds connections
[root@ca-dev14 ~]# time rmmod rds_rdma

real    0m16.522s
user    0m0.000s
sys     0m8.152s
512 rds connections
[root@ca-dev14 ~]# time rmmod rds_rdma

real    0m32.054s
user    0m0.000s
sys     0m15.568s

To rmmod rds_rdma with 256 rds connections, about 16 seconds are needed.
And with 512 rds connections, about 32 seconds are needed.
>From ftrace, when one rds connection is destroyed,

"
 19)               |  rds_conn_destroy [rds]() {
 19)   7.782 us    |    rds_conn_path_drop [rds]();
 15)               |  rds_shutdown_worker [rds]() {
 15)               |    rds_conn_shutdown [rds]() {
 15)   1.651 us    |      rds_send_path_reset [rds]();
 15)   7.195 us    |    }
 15) + 11.434 us   |  }
 19)   2.285 us    |    rds_cong_remove_conn [rds]();
 19) * 24062.76 us |  }
"
So if many rds connections will be destroyed, this function
rds_ib_destroy_nodev_conns uses most of time.

Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib.c      | 2 +-
 net/rds/ib_recv.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 2da9b75bad16..b8d581b779b2 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -87,7 +87,7 @@ static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
 
 	spin_lock_irqsave(&rds_ibdev->spinlock, flags);
 	list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
-		rds_conn_drop(ic->conn);
+		rds_conn_path_drop(&ic->conn->c_path[0], true);
 	spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
 }
 
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 8946c89d7392..3cae88cbdaa0 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -168,6 +168,7 @@ void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
 		list_del(&inc->ii_cache_entry);
 		WARN_ON(!list_empty(&inc->ii_frags));
 		kmem_cache_free(rds_ib_incoming_slab, inc);
+		atomic_dec(&rds_ib_allocation);
 	}
 
 	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
@@ -1057,6 +1058,8 @@ out:
 
 void rds_ib_recv_exit(void)
 {
+	WARN_ON(atomic_read(&rds_ib_allocation));
+
 	kmem_cache_destroy(rds_ib_incoming_slab);
 	kmem_cache_destroy(rds_ib_frag_slab);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 0a8dd9f67cd0da7dc284f48b032ce00db1a68791 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 3 Jun 2019 16:32:59 -0400
Subject: Fix memory leak in sctp_process_init

syzbot found the following leak in sctp_process_init
BUG: memory leak
unreferenced object 0xffff88810ef68400 (size 1024):
  comm "syz-executor273", pid 7046, jiffies 4294945598 (age 28.770s)
  hex dump (first 32 bytes):
    1d de 28 8d de 0b 1b e3 b5 c2 f9 68 fd 1a 97 25  ..(........h...%
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<00000000a02cebbd>] kmemleak_alloc_recursive include/linux/kmemleak.h:55
[inline]
    [<00000000a02cebbd>] slab_post_alloc_hook mm/slab.h:439 [inline]
    [<00000000a02cebbd>] slab_alloc mm/slab.c:3326 [inline]
    [<00000000a02cebbd>] __do_kmalloc mm/slab.c:3658 [inline]
    [<00000000a02cebbd>] __kmalloc_track_caller+0x15d/0x2c0 mm/slab.c:3675
    [<000000009e6245e6>] kmemdup+0x27/0x60 mm/util.c:119
    [<00000000dfdc5d2d>] kmemdup include/linux/string.h:432 [inline]
    [<00000000dfdc5d2d>] sctp_process_init+0xa7e/0xc20
net/sctp/sm_make_chunk.c:2437
    [<00000000b58b62f8>] sctp_cmd_process_init net/sctp/sm_sideeffect.c:682
[inline]
    [<00000000b58b62f8>] sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1384
[inline]
    [<00000000b58b62f8>] sctp_side_effects net/sctp/sm_sideeffect.c:1194
[inline]
    [<00000000b58b62f8>] sctp_do_sm+0xbdc/0x1d60 net/sctp/sm_sideeffect.c:1165
    [<0000000044e11f96>] sctp_assoc_bh_rcv+0x13c/0x200
net/sctp/associola.c:1074
    [<00000000ec43804d>] sctp_inq_push+0x7f/0xb0 net/sctp/inqueue.c:95
    [<00000000726aa954>] sctp_backlog_rcv+0x5e/0x2a0 net/sctp/input.c:354
    [<00000000d9e249a8>] sk_backlog_rcv include/net/sock.h:950 [inline]
    [<00000000d9e249a8>] __release_sock+0xab/0x110 net/core/sock.c:2418
    [<00000000acae44fa>] release_sock+0x37/0xd0 net/core/sock.c:2934
    [<00000000963cc9ae>] sctp_sendmsg+0x2c0/0x990 net/sctp/socket.c:2122
    [<00000000a7fc7565>] inet_sendmsg+0x64/0x120 net/ipv4/af_inet.c:802
    [<00000000b732cbd3>] sock_sendmsg_nosec net/socket.c:652 [inline]
    [<00000000b732cbd3>] sock_sendmsg+0x54/0x70 net/socket.c:671
    [<00000000274c57ab>] ___sys_sendmsg+0x393/0x3c0 net/socket.c:2292
    [<000000008252aedb>] __sys_sendmsg+0x80/0xf0 net/socket.c:2330
    [<00000000f7bf23d1>] __do_sys_sendmsg net/socket.c:2339 [inline]
    [<00000000f7bf23d1>] __se_sys_sendmsg net/socket.c:2337 [inline]
    [<00000000f7bf23d1>] __x64_sys_sendmsg+0x23/0x30 net/socket.c:2337
    [<00000000a8b4131f>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:3

The problem was that the peer.cookie value points to an skb allocated
area on the first pass through this function, at which point it is
overwritten with a heap allocated value, but in certain cases, where a
COOKIE_ECHO chunk is included in the packet, a second pass through
sctp_process_init is made, where the cookie value is re-allocated,
leaking the first allocation.

Fix is to always allocate the cookie value, and free it when we are done
using it.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: syzbot+f7e9153b037eac9b1df8@syzkaller.appspotmail.com
CC: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: netdev@vger.kernel.org
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 13 +++----------
 net/sctp/sm_sideeffect.c |  5 +++++
 2 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 92331e1195c1..f17908f5c4f3 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2312,7 +2312,6 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 	union sctp_addr addr;
 	struct sctp_af *af;
 	int src_match = 0;
-	char *cookie;
 
 	/* We must include the address that the INIT packet came from.
 	 * This is the only address that matters for an INIT packet.
@@ -2416,14 +2415,6 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 	/* Peer Rwnd   : Current calculated value of the peer's rwnd.  */
 	asoc->peer.rwnd = asoc->peer.i.a_rwnd;
 
-	/* Copy cookie in case we need to resend COOKIE-ECHO. */
-	cookie = asoc->peer.cookie;
-	if (cookie) {
-		asoc->peer.cookie = kmemdup(cookie, asoc->peer.cookie_len, gfp);
-		if (!asoc->peer.cookie)
-			goto clean_up;
-	}
-
 	/* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily
 	 * high (for example, implementations MAY use the size of the receiver
 	 * advertised window).
@@ -2592,7 +2583,9 @@ do_addr_param:
 	case SCTP_PARAM_STATE_COOKIE:
 		asoc->peer.cookie_len =
 			ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
-		asoc->peer.cookie = param.cookie->body;
+		asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp);
+		if (!asoc->peer.cookie)
+			retval = 0;
 		break;
 
 	case SCTP_PARAM_HEARTBEAT_INFO:
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 9b50da548db2..a554d6d15d1b 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -883,6 +883,11 @@ static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds,
 						asoc->rto_initial;
 	}
 
+	if (sctp_state(asoc, ESTABLISHED)) {
+		kfree(asoc->peer.cookie);
+		asoc->peer.cookie = NULL;
+	}
+
 	if (sctp_state(asoc, ESTABLISHED) ||
 	    sctp_state(asoc, CLOSED) ||
 	    sctp_state(asoc, SHUTDOWN_RECEIVED)) {
-- 
cgit v1.2.3-59-g8ed1b


From 0ee4e76937d69128a6a66861ba393ebdc2ffc8a2 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Mon, 3 Jun 2019 16:57:13 -0400
Subject: ethtool: fix potential userspace buffer overflow

ethtool_get_regs() allocates a buffer of size ops->get_regs_len(),
and pass it to the kernel driver via ops->get_regs() for filling.

There is no restriction about what the kernel drivers can or cannot do
with the open ethtool_regs structure. They usually set regs->version
and ignore regs->len or set it to the same size as ops->get_regs_len().

But if userspace allocates a smaller buffer for the registers dump,
we would cause a userspace buffer overflow in the final copy_to_user()
call, which uses the regs.len value potentially reset by the driver.

To fix this, make this case obvious and store regs.len before calling
ops->get_regs(), to only copy as much data as requested by userspace,
up to the value returned by ops->get_regs_len().

While at it, remove the redundant check for non-null regbuf.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 43e9add58340..1a0196fbb49c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1359,13 +1359,16 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 	if (!regbuf)
 		return -ENOMEM;
 
+	if (regs.len < reglen)
+		reglen = regs.len;
+
 	ops->get_regs(dev, &regs, regbuf);
 
 	ret = -EFAULT;
 	if (copy_to_user(useraddr, &regs, sizeof(regs)))
 		goto out;
 	useraddr += offsetof(struct ethtool_regs, data);
-	if (regbuf && copy_to_user(useraddr, regbuf, regs.len))
+	if (copy_to_user(useraddr, regbuf, reglen))
 		goto out;
 	ret = 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From 4970b42d5c362bf873982db7d93245c5281e58f4 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 5 Jun 2019 12:27:14 +0800
Subject: Revert "fib_rules: return 0 directly if an exactly same rule exists
 when NLM_F_EXCL not supplied"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit e9919a24d3022f72bcadc407e73a6ef17093a849.

Nathan reported the new behaviour breaks Android, as Android just add
new rules and delete old ones.

If we return 0 without adding dup rules, Android will remove the new
added rules and causing system to soft-reboot.

Fixes: e9919a24d302 ("fib_rules: return 0 directly if an exactly same rule exists when NLM_F_EXCL not supplied")
Reported-by: Nathan Chancellor <natechancellor@gmail.com>
Reported-by: Yaro Slav <yaro330@gmail.com>
Reported-by: Maciej Żenczykowski <zenczykowski@gmail.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Nathan Chancellor <natechancellor@gmail.com>
Tested-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 43f0115cce9c..18f8dd8329ed 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -757,9 +757,9 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		goto errout;
 
-	if (rule_exists(ops, frh, tb, rule)) {
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
-			err = -EEXIST;
+	if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
+	    rule_exists(ops, frh, tb, rule)) {
+		err = -EEXIST;
 		goto errout_free;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 59e3e4b52663a9d97efbce7307f62e4bc5c9ce91 Mon Sep 17 00:00:00 2001
From: Olivier Matz <olivier.matz@6wind.com>
Date: Thu, 6 Jun 2019 09:15:18 +0200
Subject: ipv6: use READ_ONCE() for inet->hdrincl as in ipv4

As it was done in commit 8f659a03a0ba ("net: ipv4: fix for a race
condition in raw_sendmsg") and commit 20b50d79974e ("net: ipv4: emulate
READ_ONCE() on ->hdrincl bit-field in raw_sendmsg()") for ipv4, copy the
value of inet->hdrincl in a local variable, to avoid introducing a race
condition in the next commit.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/raw.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 96a3559f2a09..af2f9a833c4e 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -783,6 +783,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	struct flowi6 fl6;
 	struct ipcm6_cookie ipc6;
 	int addr_len = msg->msg_namelen;
+	int hdrincl;
 	u16 proto;
 	int err;
 
@@ -796,6 +797,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	if (msg->msg_flags & MSG_OOB)
 		return -EOPNOTSUPP;
 
+	/* hdrincl should be READ_ONCE(inet->hdrincl)
+	 * but READ_ONCE() doesn't work with bit fields.
+	 * Doing this indirectly yields the same result.
+	 */
+	hdrincl = inet->hdrincl;
+	hdrincl = READ_ONCE(hdrincl);
+
 	/*
 	 *	Get and verify the address.
 	 */
@@ -908,7 +916,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		fl6.flowi6_oif = np->ucast_oif;
 	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
-	if (inet->hdrincl)
+	if (hdrincl)
 		fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH;
 
 	if (ipc6.tclass < 0)
@@ -931,7 +939,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		goto do_confirm;
 
 back_from_confirm:
-	if (inet->hdrincl)
+	if (hdrincl)
 		err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst,
 					msg->msg_flags, &ipc6.sockc);
 	else {
-- 
cgit v1.2.3-59-g8ed1b


From b9aa52c4cb457e7416cc0c95f475e72ef4a61336 Mon Sep 17 00:00:00 2001
From: Olivier Matz <olivier.matz@6wind.com>
Date: Thu, 6 Jun 2019 09:15:19 +0200
Subject: ipv6: fix EFAULT on sendto with icmpv6 and hdrincl

The following code returns EFAULT (Bad address):

  s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
  setsockopt(s, SOL_IPV6, IPV6_HDRINCL, 1);
  sendto(ipv6_icmp6_packet, addr);   /* returns -1, errno = EFAULT */

The IPv4 equivalent code works. A workaround is to use IPPROTO_RAW
instead of IPPROTO_ICMPV6.

The failure happens because 2 bytes are eaten from the msghdr by
rawv6_probe_proto_opt() starting from commit 19e3c66b52ca ("ipv6
equivalent of "ipv4: Avoid reading user iov twice after
raw_probe_proto_opt""), but at that time it was not a problem because
IPV6_HDRINCL was not yet introduced.

Only eat these 2 bytes if hdrincl == 0.

Fixes: 715f504b1189 ("ipv6: add IPV6_HDRINCL option for raw sockets")
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/raw.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index af2f9a833c4e..1bb88b4b677b 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -895,11 +895,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	opt = ipv6_fixup_options(&opt_space, opt);
 
 	fl6.flowi6_proto = proto;
-	rfv.msg = msg;
-	rfv.hlen = 0;
-	err = rawv6_probe_proto_opt(&rfv, &fl6);
-	if (err)
-		goto out;
+
+	if (!hdrincl) {
+		rfv.msg = msg;
+		rfv.hlen = 0;
+		err = rawv6_probe_proto_opt(&rfv, &fl6);
+		if (err)
+			goto out;
+	}
 
 	if (!ipv6_addr_any(daddr))
 		fl6.daddr = *daddr;
-- 
cgit v1.2.3-59-g8ed1b


From 85cb928787eab6a2f4ca9d2a798b6f3bed53ced1 Mon Sep 17 00:00:00 2001
From: Zhu Yanjun <yanjun.zhu@oracle.com>
Date: Thu, 6 Jun 2019 04:00:03 -0400
Subject: net: rds: fix memory leak in rds_ib_flush_mr_pool

When the following tests last for several hours, the problem will occur.

Server:
    rds-stress -r 1.1.1.16 -D 1M
Client:
    rds-stress -r 1.1.1.14 -s 1.1.1.16 -D 1M -T 30

The following will occur.

"
Starting up....
tsks   tx/s   rx/s  tx+rx K/s    mbi K/s    mbo K/s tx us/c   rtt us cpu
%
  1      0      0       0.00       0.00       0.00    0.00 0.00 -1.00
  1      0      0       0.00       0.00       0.00    0.00 0.00 -1.00
  1      0      0       0.00       0.00       0.00    0.00 0.00 -1.00
  1      0      0       0.00       0.00       0.00    0.00 0.00 -1.00
"
>From vmcore, we can find that clean_list is NULL.

>From the source code, rds_mr_flushd calls rds_ib_mr_pool_flush_worker.
Then rds_ib_mr_pool_flush_worker calls
"
 rds_ib_flush_mr_pool(pool, 0, NULL);
"
Then in function
"
int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
                         int free_all, struct rds_ib_mr **ibmr_ret)
"
ibmr_ret is NULL.

In the source code,
"
...
list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
if (ibmr_ret)
        *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);

/* more than one entry in llist nodes */
if (clean_nodes->next)
        llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
...
"
When ibmr_ret is NULL, llist_entry is not executed. clean_nodes->next
instead of clean_nodes is added in clean_list.
So clean_nodes is discarded. It can not be used again.
The workqueue is executed periodically. So more and more clean_nodes are
discarded. Finally the clean_list is NULL.
Then this problem will occur.

Fixes: 1bc144b62524 ("net, rds, Replace xlist in net/rds/xlist.h with llist")
Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib_rdma.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index d664e9ade74d..0b347f46b2f4 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -428,12 +428,14 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 		wait_clean_list_grace();
 
 		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
-		if (ibmr_ret)
+		if (ibmr_ret) {
 			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
-
+			clean_nodes = clean_nodes->next;
+		}
 		/* more than one entry in llist nodes */
-		if (clean_nodes->next)
-			llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
+		if (clean_nodes)
+			llist_add_batch(clean_nodes, clean_tail,
+					&pool->clean_list);
 
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 720f1de4021f09898b8c8443f3b3e995991b6e3a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 6 Jun 2019 15:45:03 +0200
Subject: pktgen: do not sleep with the thread lock held.

Currently, the process issuing a "start" command on the pktgen procfs
interface, acquires the pktgen thread lock and never release it, until
all pktgen threads are completed. The above can blocks indefinitely any
other pktgen command and any (even unrelated) netdevice removal - as
the pktgen netdev notifier acquires the same lock.

The issue is demonstrated by the following script, reported by Matteo:

ip -b - <<'EOF'
	link add type dummy
	link add type veth
	link set dummy0 up
EOF
modprobe pktgen
echo reset >/proc/net/pktgen/pgctrl
{
	echo rem_device_all
	echo add_device dummy0
} >/proc/net/pktgen/kpktgend_0
echo count 0 >/proc/net/pktgen/dummy0
echo start >/proc/net/pktgen/pgctrl &
sleep 1
rmmod veth

Fix the above releasing the thread lock around the sleep call.

Additionally we must prevent racing with forcefull rmmod - as the
thread lock no more protects from them. Instead, acquire a self-reference
before waiting for any thread. As a side effect, running

rmmod pktgen

while some thread is running now fails with "module in use" error,
before this patch such command hanged indefinitely.

Note: the issue predates the commit reported in the fixes tag, but
this fix can't be applied before the mentioned commit.

v1 -> v2:
 - no need to check for thread existence after flipping the lock,
   pktgen threads are freed only at net exit time
 -

Fixes: 6146e6a43b35 ("[PKTGEN]: Removes thread_{un,}lock() macros.")
Reported-and-tested-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 319ad5490fb3..a33d03c05ed6 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3066,7 +3066,13 @@ static int pktgen_wait_thread_run(struct pktgen_thread *t)
 {
 	while (thread_is_running(t)) {
 
+		/* note: 't' will still be around even after the unlock/lock
+		 * cycle because pktgen_thread threads are only cleared at
+		 * net exit
+		 */
+		mutex_unlock(&pktgen_thread_lock);
 		msleep_interruptible(100);
+		mutex_lock(&pktgen_thread_lock);
 
 		if (signal_pending(current))
 			goto signal;
@@ -3081,6 +3087,10 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
 	struct pktgen_thread *t;
 	int sig = 1;
 
+	/* prevent from racing with rmmod */
+	if (!try_module_get(THIS_MODULE))
+		return sig;
+
 	mutex_lock(&pktgen_thread_lock);
 
 	list_for_each_entry(t, &pn->pktgen_threads, th_list) {
@@ -3094,6 +3104,7 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
 			t->control |= (T_STOP);
 
 	mutex_unlock(&pktgen_thread_lock);
+	module_put(THIS_MODULE);
 	return sig;
 }
 
-- 
cgit v1.2.3-59-g8ed1b