From 086f95682114fd2d1790bd3226e76cbae9a2d192 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:25 -0700 Subject: bpf/flow_dissector: pass input flags to BPF flow dissector program C flow dissector supports input flags that tell it to customize parsing by either stopping early or trying to parse as deep as possible. Pass those flags to the BPF flow dissector so it can make the same decisions. In the next commits I'll add support for those flags to our reference bpf_flow.c v3: * Export copy of flow dissector flags instead of moving (Alexei Starovoitov) Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fa1c753dcdbc..88b9d743036f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3507,6 +3507,10 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) +#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) +#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) + struct bpf_flow_keys { __u16 nhoff; __u16 thoff; @@ -3528,6 +3532,7 @@ struct bpf_flow_keys { __u32 ipv6_dst[4]; /* in6_addr; network order */ }; }; + __u32 flags; }; struct bpf_func_info { -- cgit v1.2.3-59-g8ed1b From 71c99e32b926159ea628352751f66383d7d04d17 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:30 -0700 Subject: bpf/flow_dissector: support ipv6 flow_label and BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL Add support for exporting ipv6 flow label via bpf_flow_keys. Export flow label from bpf_flow.c and also return early when BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL is passed. 
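As a rough illustration of how a flow dissector BPF program can act on these flags (this mirrors the bpf_flow.c change below; apart from the uapi flags and the struct bpf_flow_keys fields, the program name and the elided parsing steps are illustrative only, not part of the patches):

    /* Sketch only; assumes a clang/BPF build with the selftests-style
     * "bpf_helpers.h" providing SEC().
     */
    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    SEC("flow_dissector")
    int flow_dissect_sketch(struct __sk_buff *skb)
    {
            struct bpf_flow_keys *keys = skb->flow_keys;

            /* ... parse the network header and fill keys->nhoff,
             * keys->thoff, keys->ip_proto, keys->flow_label ...
             */

            if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)
                    return BPF_OK;  /* caller only needs the flow label */

            /* ... otherwise continue into transport-layer parsing ... */
            return BPF_OK;
    }
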
Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + net/core/flow_dissector.c | 9 +++++ tools/include/uapi/linux/bpf.h | 1 + .../selftests/bpf/prog_tests/flow_dissector.c | 46 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_flow.c | 10 +++++ 5 files changed, 67 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 88b9d743036f..e985f07a98ed 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3533,6 +3533,7 @@ struct bpf_flow_keys { }; }; __u32 flags; + __be32 flow_label; }; struct bpf_func_info { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 50ed1a688709..9741b593ea53 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -737,6 +737,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, @@ -781,6 +782,14 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_keys->flow_label); + } } bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2e4b0848d795..3d7fc67ec1b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3530,6 +3530,7 @@ struct bpf_flow_keys { }; }; __u32 flags; + __be32 flow_label; }; struct bpf_func_info { diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 8e8c18aced9b..ef83f145a6f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -20,6 +20,7 @@ "is_encap=%u/%u " \ "ip_proto=0x%x/0x%x " \ "n_proto=0x%x/0x%x " \ + "flow_label=0x%x/0x%x " \ "sport=%u/%u " \ "dport=%u/%u\n", \ got.nhoff, expected.nhoff, \ @@ -30,6 +31,7 @@ got.is_encap, expected.is_encap, \ got.ip_proto, expected.ip_proto, \ got.n_proto, expected.n_proto, \ + got.flow_label, expected.flow_label, \ got.sport, expected.sport, \ got.dport, expected.dport) @@ -257,6 +259,50 @@ struct test tests[] = { .is_first_frag = true, }, }, + { + .name = "ipv6-flow-label", + .pkt.ipv6 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_TCP, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .iph.flow_lbl = { 0xb, 0xee, 0xef }, + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct ipv6hdr), + .addr_proto = ETH_P_IPV6, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IPV6), + .sport = 80, + .dport = 8080, + .flow_label = __bpf_constant_htonl(0xbeeef), + }, + }, + { + .name = "ipv6-no-flow-label", + .pkt.ipv6 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_TCP, + 
.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .iph.flow_lbl = { 0xb, 0xee, 0xef }, + .tcp.doff = 5, + .tcp.source = 80, + .tcp.dest = 8080, + }, + .keys = { + .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL, + .nhoff = ETH_HLEN, + .thoff = ETH_HLEN + sizeof(struct ipv6hdr), + .addr_proto = ETH_P_IPV6, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IPV6), + .flow_label = __bpf_constant_htonl(0xbeeef), + }, + .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL, + }, }; static int create_tap(const char *ifname) diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index f931cd3db9d4..7fbfa22f33df 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -83,6 +83,12 @@ static __always_inline int export_flow_keys(struct bpf_flow_keys *keys, return ret; } +#define IPV6_FLOWLABEL_MASK __bpf_constant_htonl(0x000FFFFF) +static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr) +{ + return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK; +} + static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb, __u16 hdr_size, void *buffer) @@ -308,6 +314,10 @@ PROG(IPV6)(struct __sk_buff *skb) keys->thoff += sizeof(struct ipv6hdr); keys->ip_proto = ip6h->nexthdr; + keys->flow_label = ip6_flowlabel(ip6h); + + if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) + return export_flow_keys(keys, BPF_OK); return parse_ipv6_proto(skb, ip6h->nexthdr); } -- cgit v1.2.3-59-g8ed1b From 7a113110fc8cdda14023c0bffc7bd8b5f3da1edf Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 22 Jul 2019 06:33:10 -0500 Subject: nl80211: document uapi for CMD_FRAME_WAIT_CANCEL Commit 1c38c7f22068 ("nl80211: send event when CMD_FRAME duration expires") added the possibility of NL80211_CMD_FRAME_WAIT_CANCEL being sent whenever the off-channel wait time associated with a CMD_FRAME completes. Document this in the uapi/linux/nl80211.h file. Signed-off-by: Denis Kenzior Link: https://lore.kernel.org/r/20190722113312.14031-1-denkenz@gmail.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index beb9a9d0c00a..c45587c2cf44 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -657,7 +657,9 @@ * is used during CSA period. * @NL80211_CMD_FRAME_WAIT_CANCEL: When an off-channel TX was requested, this * command may be used with the corresponding cookie to cancel the wait - * time if it is known that it is no longer necessary. + * time if it is known that it is no longer necessary. This command is + * also sent as an event whenever the driver has completed the off-channel + * wait time. * @NL80211_CMD_ACTION: Alias for @NL80211_CMD_FRAME for backward compatibility. * @NL80211_CMD_FRAME_TX_STATUS: Report TX status of a management frame * transmitted with %NL80211_CMD_FRAME. %NL80211_ATTR_COOKIE identifies -- cgit v1.2.3-59-g8ed1b From 6f9d451ab1a33728adb72d7ff66a7b374d665176 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 26 Jul 2019 18:06:55 +0200 Subject: xdp: Add devmap_hash map type for looking up devices by hashed index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common pattern when using xdp_redirect_map() is to create a device map where the lookup key is simply ifindex. 
Because device maps are arrays, this leaves holes in the map, and the map has to be sized to fit the largest ifindex, regardless of how many devices actually are actually needed in the map. This patch adds a second type of device map where the key is looked up using a hashmap, instead of being used as an array index. This allows maps to be densely packed, so they can be smaller. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++ include/linux/bpf_types.h | 1 + include/trace/events/xdp.h | 3 +- include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 200 +++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 2 + net/core/filter.c | 9 +- 7 files changed, 220 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bfdb54dd2ad1..f9a506147c8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -713,6 +713,7 @@ struct xdp_buff; struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -799,6 +800,12 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, return NULL; } +static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} + static inline void __dev_map_flush(struct bpf_map *map) { } diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index eec5aeeeaf92..36a9c2325176 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #if defined(CONFIG_BPF_STREAM_PARSER) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 68899fdc985b..8c8420230a10 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev { #endif /* __DEVMAP_OBJ_TYPE */ #define devmap_ifindex(fwd, map) \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((map->map_type == BPF_MAP_TYPE_DEVMAP || \ + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \ ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e985f07a98ed..6bbef0c7f585 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -134,6 +134,7 @@ enum bpf_map_type { BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, }; /* Note that tracing related programs such as diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a0501266bdb8..9af048a932b5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -37,6 +37,12 @@ * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. 
+ * + * The devmap_hash type is a map type which interprets keys as ifindexes and + * indexes these using a hashmap. This allows maps that use ifindex as key to be + * densely packed instead of having holes in the lookup array for unused + * ifindexes. The setup and packet enqueue/send code is shared between the two + * types of devmap; only the lookup and insertion is different. */ #include #include @@ -59,6 +65,7 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ + struct hlist_node index_hlist; struct bpf_dtab *dtab; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; @@ -70,11 +77,30 @@ struct bpf_dtab { struct bpf_dtab_netdev **netdev_map; struct list_head __percpu *flush_list; struct list_head list; + + /* these are only used for DEVMAP_HASH type maps */ + struct hlist_head *dev_index_head; + spinlock_t index_lock; + unsigned int items; + u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); +static struct hlist_head *dev_map_create_hash(unsigned int entries) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < entries; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -97,6 +123,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); cost += sizeof(struct list_head) * num_possible_cpus(); + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + + if (!dtab->n_buckets) /* Overflow check */ + return -EINVAL; + cost += sizeof(struct hlist_head) * dtab->n_buckets; + } + /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) @@ -115,8 +149,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + if (!dtab->dev_index_head) + goto free_map_area; + + spin_lock_init(&dtab->index_lock); + } + return 0; +free_map_area: + bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -198,6 +242,7 @@ static void dev_map_free(struct bpf_map *map) free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); + kfree(dtab->dev_index_head); kfree(dtab); } @@ -218,6 +263,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct hlist_head *head = dev_map_index_hash(dtab, key); + struct bpf_dtab_netdev *dev; + + hlist_for_each_entry_rcu(dev, head, index_hlist) + if (dev->idx == key) + return dev; + + return NULL; +} + +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 idx, *next = next_key; + struct bpf_dtab_netdev *dev, *next_dev; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first; + + idx = *(u32 
*)key; + + dev = __dev_map_hash_lookup_elem(map, idx); + if (!dev) + goto find_first; + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), + struct bpf_dtab_netdev, index_hlist); + + if (next_dev) { + *next = next_dev->idx; + return 0; + } + + i = idx & (dtab->n_buckets - 1); + i++; + + find_first: + for (; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_dtab_netdev, + index_hlist); + if (next_dev) { + *next = next_dev->idx; + return 0; + } + } + + return -ENOENT; +} + static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { @@ -373,6 +482,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, + *(u32 *)key); + struct net_device *dev = obj ? obj->dev : NULL; + + return dev ? &dev->ifindex : NULL; +} + static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { @@ -422,6 +540,28 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + unsigned long flags; + int ret = -ENOENT; + + spin_lock_irqsave(&dtab->index_lock, flags); + + old_dev = __dev_map_hash_lookup_elem(map, k); + if (old_dev) { + dtab->items--; + hlist_del_init_rcu(&old_dev->index_hlist); + call_rcu(&old_dev->rcu, __dev_map_entry_free); + ret = 0; + } + spin_unlock_irqrestore(&dtab->index_lock, flags); + + return ret; +} + static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, u32 ifindex, @@ -502,6 +642,56 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev, *old_dev; + u32 ifindex = *(u32 *)value; + u32 idx = *(u32 *)key; + unsigned long flags; + + if (unlikely(map_flags > BPF_EXIST || !ifindex)) + return -EINVAL; + + old_dev = __dev_map_hash_lookup_elem(map, idx); + if (old_dev && (map_flags & BPF_NOEXIST)) + return -EEXIST; + + dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + spin_lock_irqsave(&dtab->index_lock, flags); + + if (old_dev) { + hlist_del_rcu(&old_dev->index_hlist); + } else { + if (dtab->items >= dtab->map.max_entries) { + spin_unlock_irqrestore(&dtab->index_lock, flags); + call_rcu(&dev->rcu, __dev_map_entry_free); + return -E2BIG; + } + dtab->items++; + } + + hlist_add_head_rcu(&dev->index_hlist, + dev_map_index_hash(dtab, idx)); + spin_unlock_irqrestore(&dtab->index_lock, flags); + + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; +} + +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_hash_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -512,6 +702,16 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = 
map_check_no_btf, }; +const struct bpf_map_ops dev_map_hash_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_hash_get_next_key, + .map_lookup_elem = dev_map_hash_lookup_elem, + .map_update_elem = dev_map_hash_update_elem, + .map_delete_elem = dev_map_hash_delete_elem, + .map_check_btf = map_check_no_btf, +}; + static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5900cbb966b1..cef851cd5c36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3457,6 +3457,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: if (func_id != BPF_FUNC_redirect_map && func_id != BPF_FUNC_map_lookup_elem) goto error; @@ -3539,6 +3540,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && map->map_type != BPF_MAP_TYPE_CPUMAP && map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; diff --git a/net/core/filter.c b/net/core/filter.c index 3961437ccc50..1eee70f80fba 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3517,7 +3517,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, int err; switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: { + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: { struct bpf_dtab_netdev *dst = fwd; err = dev_map_enqueue(dst, xdp, dev_rx); @@ -3554,6 +3555,7 @@ void xdp_do_flush_map(void) if (map) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: __dev_map_flush(map); break; case BPF_MAP_TYPE_CPUMAP: @@ -3574,6 +3576,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: return __dev_map_lookup_elem(map, index); + case BPF_MAP_TYPE_DEVMAP_HASH: + return __dev_map_hash_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); case BPF_MAP_TYPE_XSKMAP: @@ -3655,7 +3659,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { struct bpf_dtab_netdev *dst = fwd; err = dev_map_generic_redirect(dst, skb, xdp_prog); -- cgit v1.2.3-59-g8ed1b From 70d66244317e958092e9c971b08dd5b7fd29d9cb Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Mon, 29 Jul 2019 09:59:15 -0700 Subject: bpf: add bpf_tcp_gen_syncookie helper This helper function allows BPF programs to try to generate SYN cookies, given a reference to a listener socket. The function works from XDP and with an skb context since bpf_skc_lookup_tcp can lookup a socket in both cases. Signed-off-by: Petar Penkov Suggested-by: Eric Dumazet Reviewed-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 30 +++++++++++++++++++- net/core/filter.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6bbef0c7f585..4393bd4b2419 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2714,6 +2714,33 @@ union bpf_attr { * **-EPERM** if no permission to send the *sig*. 
* * **-EAGAIN** if bpf program can try again. + * + * s64 bpf_tcp_gen_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ip6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header. + * + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2825,7 +2852,8 @@ union bpf_attr { FN(strtoul), \ FN(sk_storage_get), \ FN(sk_storage_delete), \ - FN(send_signal), + FN(send_signal), \ + FN(tcp_gen_syncookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 1eee70f80fba..5a2707918629 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5855,6 +5855,75 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + return -ENOENT; + + if (!th->syn || th->ack || th->fin || th->rst) + return -EINVAL; + + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). 
+ */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && sk->sk_ipv6only) + return -EINVAL; + + mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case 6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + if (sk->sk_family != AF_INET6) + return -EINVAL; + + mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + if (mss <= 0) + return -ENOENT; + + return cookie | ((u64)mss << 32); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYN_COOKIES */ +} + +static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { + .func = bpf_tcp_gen_syncookie, + .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -6144,6 +6213,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); @@ -6183,6 +6254,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); -- cgit v1.2.3-59-g8ed1b From 796e90f42b7e52cf1c88e978e1d5ee69c102d85d Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 30 Jul 2019 18:37:00 +0200 Subject: cfg80211: add support for parsing OBBS_PD attributes Add the data structure, policy and parsing code allowing userland to send the OBSS PD information into the kernel. Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20190730163701.18836-2-john@phrozen.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 15 +++++++++++++++ include/uapi/linux/nl80211.h | 27 ++++++++++++++++++++++++++ net/wireless/nl80211.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 45850a8391d9..35ec1f0a2bf9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -246,6 +246,19 @@ struct ieee80211_rate { u16 hw_value, hw_value_short; }; +/** + * struct ieee80211_he_obss_pd - AP settings for spatial reuse + * + * @enable: is the feature enabled. 
+ * @min_offset: minimal tx power offset an associated station shall use + * @max_offset: maximum tx power offset an associated station shall use + */ +struct ieee80211_he_obss_pd { + bool enable; + u8 min_offset; + u8 max_offset; +}; + /** * struct ieee80211_sta_ht_cap - STA's HT capabilities * @@ -896,6 +909,7 @@ enum cfg80211_ap_settings_flags { * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time * @flags: flags, as defined in enum cfg80211_ap_settings_flags + * @he_obss_pd: OBSS Packet Detection settings */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -923,6 +937,7 @@ struct cfg80211_ap_settings { bool ht_required, vht_required; bool twt_responder; u32 flags; + struct ieee80211_he_obss_pd he_obss_pd; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c45587c2cf44..822851d369ab 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2358,6 +2358,9 @@ enum nl80211_commands { * * @NL80211_ATTR_TWT_RESPONDER: Enable target wait time responder support. * + * @NL80211_ATTR_HE_OBSS_PD: nested attribute for OBSS Packet Detection + * functionality. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2815,6 +2818,8 @@ enum nl80211_attrs { NL80211_ATTR_TWT_RESPONDER, + NL80211_ATTR_HE_OBSS_PD, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -6490,4 +6495,26 @@ enum nl80211_peer_measurement_ftm_resp { NL80211_PMSR_FTM_RESP_ATTR_MAX = NUM_NL80211_PMSR_FTM_RESP_ATTR - 1 }; +/** + * enum nl80211_obss_pd_attributes - OBSS packet detection attributes + * @__NL80211_HE_OBSS_PD_ATTR_INVALID: Invalid + * + * @NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET: the OBSS PD minimum tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET: the OBSS PD maximum tx power offset. + * + * @__NL80211_HE_OBSS_PD_ATTR_LAST: Internal + * @NL80211_HE_OBSS_PD_ATTR_MAX: highest OBSS PD attribute. 
+ */ +enum nl80211_obss_pd_attributes { + __NL80211_HE_OBSS_PD_ATTR_INVALID, + + NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET, + NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET, + + /* keep last */ + __NL80211_HE_OBSS_PD_ATTR_LAST, + NL80211_HE_OBSS_PD_ATTR_MAX = __NL80211_HE_OBSS_PD_ATTR_LAST - 1, +}; + + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1e78ed45a759..3006cfce7158 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -281,6 +281,14 @@ nl80211_pmsr_attr_policy[NL80211_PMSR_ATTR_MAX + 1] = { NLA_POLICY_NESTED_ARRAY(nl80211_psmr_peer_attr_policy), }; +static const struct nla_policy +he_obss_pd_policy[NL80211_HE_OBSS_PD_ATTR_MAX + 1] = { + [NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET] = + NLA_POLICY_RANGE(NLA_U8, 1, 20), + [NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET] = + NLA_POLICY_RANGE(NLA_U8, 1, 20), +}; + const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, @@ -574,6 +582,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY, .len = SAE_PASSWORD_MAX_LEN }, [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, + [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy), }; /* policy for the key attributes */ @@ -4405,6 +4414,34 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, return 0; } +static int nl80211_parse_he_obss_pd(struct nlattr *attrs, + struct ieee80211_he_obss_pd *he_obss_pd) +{ + struct nlattr *tb[NL80211_HE_OBSS_PD_ATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NL80211_HE_OBSS_PD_ATTR_MAX, attrs, + he_obss_pd_policy, NULL); + if (err) + return err; + + if (!tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET] || + !tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]) + return -EINVAL; + + he_obss_pd->min_offset = + nla_get_u32(tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET]); + he_obss_pd->max_offset = + nla_get_u32(tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]); + + if (he_obss_pd->min_offset >= he_obss_pd->max_offset) + return -EINVAL; + + he_obss_pd->enable = true; + + return 0; +} + static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, const u8 *rates) { @@ -4689,6 +4726,14 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) params.twt_responder = nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]); + if (info->attrs[NL80211_ATTR_HE_OBSS_PD]) { + err = nl80211_parse_he_obss_pd( + info->attrs[NL80211_ATTR_HE_OBSS_PD], + ¶ms.he_obss_pd); + if (err) + return err; + } + nl80211_calculate_ap_params(¶ms); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) -- cgit v1.2.3-59-g8ed1b From 3247b272048ffefc12c7dcfa3169bd03047a49bc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 30 Jul 2019 15:20:41 +0300 Subject: net: bridge: mcast: add delete due to fast-leave mdb flag In user-space there's no way to distinguish why an mdb entry was deleted and that is a problem for daemons which would like to keep the mdb in sync with remote ends (e.g. mlag) but would also like to converge faster. In almost all cases we'd like to age-out the remote entry for performance and convergence reasons except when fast-leave is enabled. In that case we want explicit immediate remote delete, thus add mdb flag which is set only when the entry is being deleted due to fast-leave. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 2 ++ net/bridge/br_multicast.c | 2 +- net/bridge/br_private.h | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 773e476a8e54..1b3c2b643a02 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -237,6 +237,7 @@ struct br_mdb_entry { #define MDB_PERMANENT 1 __u8 state; #define MDB_FLAGS_OFFLOAD (1 << 0) +#define MDB_FLAGS_FAST_LEAVE (1 << 1) __u8 flags; __u16 vid; struct { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index bf6acd34234d..428af1abf8cc 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -60,6 +60,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags = 0; if (flags & MDB_PG_FLAGS_OFFLOAD) e->flags |= MDB_FLAGS_OFFLOAD; + if (flags & MDB_PG_FLAGS_FAST_LEAVE) + e->flags |= MDB_FLAGS_FAST_LEAVE; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 3d8deac2353d..3d4b2817687f 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1393,7 +1393,7 @@ br_multicast_leave_group(struct net_bridge *br, del_timer(&p->timer); kfree_rcu(p, rcu); br_mdb_notify(br->dev, port, group, RTM_DELMDB, - p->flags); + p->flags | MDB_PG_FLAGS_FAST_LEAVE); if (!mp->ports && !mp->host_joined && netif_running(br->dev)) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e8cf03b43b7d..c4fd307fbfdc 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -199,6 +199,7 @@ struct net_bridge_fdb_entry { #define MDB_PG_FLAGS_PERMANENT BIT(0) #define MDB_PG_FLAGS_OFFLOAD BIT(1) +#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) struct net_bridge_port_group { struct net_bridge_port *port; -- cgit v1.2.3-59-g8ed1b From 28315f7999870bb56da236f6b4ffce63efcc7897 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Aug 2019 10:35:50 +0300 Subject: drop_monitor: Add alert mode operations The next patch is going to add another alert mode in which the dropped packet is notified to user space, instead of only a summary of recent drops. Abstract the differences between the modes by adding alert mode operations. The operations are selected based on the currently configured mode and associated with the probes and the work item just before tracing starts. Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/net_dropmon.h | 9 +++++++++ net/core/drop_monitor.c | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 5edbd0a675fd..0fecdedeb6ca 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -62,4 +62,13 @@ enum { * Our group identifiers */ #define NET_DM_GRP_ALERT 1 + +/** + * enum net_dm_alert_mode - Alert mode. + * @NET_DM_ALERT_MODE_SUMMARY: A summary of recent drops is sent to user space. 
+ */ +enum net_dm_alert_mode { + NET_DM_ALERT_MODE_SUMMARY, +}; + #endif diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index cd2f3069f34e..9cd2f662cb9e 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -75,6 +75,16 @@ static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; static LIST_HEAD(hw_stats_list); +static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; + +struct net_dm_alert_ops { + void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, + void *location); + void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, + int work, int budget); + void (*work_item_func)(struct work_struct *work); +}; + static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; @@ -241,10 +251,23 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, rcu_read_unlock(); } +static const struct net_dm_alert_ops net_dm_alert_summary_ops = { + .kfree_skb_probe = trace_kfree_skb_hit, + .napi_poll_probe = trace_napi_poll_hit, + .work_item_func = send_dm_alert, +}; + +static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { + [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops, +}; + static int net_dm_trace_on_set(struct netlink_ext_ack *extack) { + const struct net_dm_alert_ops *ops; int cpu, rc; + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + if (!try_module_get(THIS_MODULE)) { NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); return -ENODEV; @@ -254,7 +277,7 @@ static int net_dm_trace_on_set(struct netlink_ext_ack *extack) struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; - INIT_WORK(&data->dm_alert_work, send_dm_alert); + INIT_WORK(&data->dm_alert_work, ops->work_item_func); timer_setup(&data->send_timer, sched_send_work, 0); /* Allocate a new per-CPU skb for the summary alert message and * free the old one which might contain stale data from @@ -264,13 +287,13 @@ static int net_dm_trace_on_set(struct netlink_ext_ack *extack) consume_skb(skb); } - rc = register_trace_kfree_skb(trace_kfree_skb_hit, NULL); + rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint"); goto err_module_put; } - rc = register_trace_napi_poll(trace_napi_poll_hit, NULL); + rc = register_trace_napi_poll(ops->napi_poll_probe, NULL); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint"); goto err_unregister_trace; @@ -279,7 +302,7 @@ static int net_dm_trace_on_set(struct netlink_ext_ack *extack) return 0; err_unregister_trace: - unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL); + unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); err_module_put: module_put(THIS_MODULE); return rc; @@ -288,10 +311,13 @@ err_module_put: static void net_dm_trace_off_set(void) { struct dm_hw_stat_delta *new_stat, *temp; + const struct net_dm_alert_ops *ops; int cpu; - unregister_trace_napi_poll(trace_napi_poll_hit, NULL); - unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL); + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + unregister_trace_napi_poll(ops->napi_poll_probe, NULL); + unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); tracepoint_synchronize_unregister(); -- cgit v1.2.3-59-g8ed1b From ca30707dee2bc8bc81cfd8b4277fe90f7ca6df1f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Aug 2019 10:35:51 +0300 Subject: drop_monitor: Add packet alert mode So far drop monitor supported only one alert mode in which a 
summary of locations in which packets were recently dropped was sent to user space. This alert mode is sufficient in order to understand that packets were dropped, but lacks information to perform a more detailed analysis. Add a new alert mode in which the dropped packet itself is passed to user space along with metadata: The drop location (as program counter and resolved symbol), ingress netdevice and drop timestamp. More metadata can be added in the future. To avoid performing expensive operations in the context in which kfree_skb() is invoked (can be hard IRQ), the dropped skb is cloned and queued on per-CPU skb drop list. Then, in process context the netlink message is allocated, prepared and finally sent to user space. The per-CPU skb drop list is limited to 1000 skbs to prevent exhausting the system's memory. Subsequent patches will make this limit configurable and also add a counter that indicates how many skbs were tail dropped. Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/net_dropmon.h | 27 ++++ net/core/drop_monitor.c | 280 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 305 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 0fecdedeb6ca..cfaaf75371b8 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -53,6 +53,7 @@ enum { NET_DM_CMD_CONFIG, NET_DM_CMD_START, NET_DM_CMD_STOP, + NET_DM_CMD_PACKET_ALERT, _NET_DM_CMD_MAX, }; @@ -63,12 +64,38 @@ enum { */ #define NET_DM_GRP_ALERT 1 +enum net_dm_attr { + NET_DM_ATTR_UNSPEC, + + NET_DM_ATTR_ALERT_MODE, /* u8 */ + NET_DM_ATTR_PC, /* u64 */ + NET_DM_ATTR_SYMBOL, /* string */ + NET_DM_ATTR_IN_PORT, /* nested */ + NET_DM_ATTR_TIMESTAMP, /* struct timespec */ + NET_DM_ATTR_PROTO, /* u16 */ + NET_DM_ATTR_PAYLOAD, /* binary */ + NET_DM_ATTR_PAD, + + __NET_DM_ATTR_MAX, + NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1 +}; + /** * enum net_dm_alert_mode - Alert mode. * @NET_DM_ALERT_MODE_SUMMARY: A summary of recent drops is sent to user space. + * @NET_DM_ALERT_MODE_PACKET: Each dropped packet is sent to user space along + * with metadata. 
*/ enum net_dm_alert_mode { NET_DM_ALERT_MODE_SUMMARY, + NET_DM_ALERT_MODE_PACKET, +}; + +enum { + NET_DM_ATTR_PORT_NETDEV_IFINDEX, /* u32 */ + + __NET_DM_ATTR_PORT_MAX, + NET_DM_ATTR_PORT_MAX = __NET_DM_ATTR_PORT_MAX - 1 }; #endif diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 9cd2f662cb9e..ba765832413b 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -54,6 +54,7 @@ static DEFINE_MUTEX(net_dm_mutex); struct per_cpu_dm_data { spinlock_t lock; /* Protects 'skb' and 'send_timer' */ struct sk_buff *skb; + struct sk_buff_head drop_queue; struct work_struct dm_alert_work; struct timer_list send_timer; }; @@ -85,6 +86,14 @@ struct net_dm_alert_ops { void (*work_item_func)(struct work_struct *work); }; +struct net_dm_skb_cb { + void *pc; +}; + +#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) + +#define NET_DM_QUEUE_LEN 1000 + static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; @@ -257,8 +266,214 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = { .work_item_func = send_dm_alert, }; +static void net_dm_packet_trace_kfree_skb_hit(void *ignore, + struct sk_buff *skb, + void *location) +{ + ktime_t tstamp = ktime_get_real(); + struct per_cpu_dm_data *data; + struct sk_buff *nskb; + unsigned long flags; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + NET_DM_SKB_CB(nskb)->pc = location; + /* Override the timestamp because we care about the time when the + * packet was dropped. + */ + nskb->tstamp = tstamp; + + data = this_cpu_ptr(&dm_cpu_data); + + spin_lock_irqsave(&data->drop_queue.lock, flags); + if (skb_queue_len(&data->drop_queue) < NET_DM_QUEUE_LEN) + __skb_queue_tail(&data->drop_queue, nskb); + else + goto unlock_free; + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + + schedule_work(&data->dm_alert_work); + + return; + +unlock_free: + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + consume_skb(nskb); +} + +static void net_dm_packet_trace_napi_poll_hit(void *ignore, + struct napi_struct *napi, + int work, int budget) +{ +} + +static size_t net_dm_in_port_size(void) +{ + /* NET_DM_ATTR_IN_PORT nest */ + return nla_total_size(0) + + /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */ + nla_total_size(sizeof(u32)); +} + +#define NET_DM_MAX_SYMBOL_LEN 40 + +static size_t net_dm_packet_report_size(size_t payload_len) +{ + size_t size; + + size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); + + return NLMSG_ALIGN(size) + + /* NET_DM_ATTR_PC */ + nla_total_size(sizeof(u64)) + + /* NET_DM_ATTR_SYMBOL */ + nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) + + /* NET_DM_ATTR_IN_PORT */ + net_dm_in_port_size() + + /* NET_DM_ATTR_TIMESTAMP */ + nla_total_size(sizeof(struct timespec)) + + /* NET_DM_ATTR_PROTO */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_PAYLOAD */ + nla_total_size(payload_len); +} + +static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT); + if (!attr) + return -EMSGSIZE; + + if (ifindex && + nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, + size_t payload_len) +{ + u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc; + char buf[NET_DM_MAX_SYMBOL_LEN]; + struct nlattr *attr; + struct timespec ts; + void *hdr; + 
int rc; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_PACKET_ALERT); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD)) + goto nla_put_failure; + + snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc); + if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) + goto nla_put_failure; + + rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif); + if (rc) + goto nla_put_failure; + + if (ktime_to_timespec_cond(skb->tstamp, &ts) && + nla_put(msg, NET_DM_ATTR_TIMESTAMP, sizeof(ts), &ts)) + goto nla_put_failure; + + if (!payload_len) + goto out; + + if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) + goto nla_put_failure; + + attr = skb_put(msg, nla_total_size(payload_len)); + attr->nla_type = NET_DM_ATTR_PAYLOAD; + attr->nla_len = nla_attr_size(payload_len); + if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +#define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO) + +static void net_dm_packet_report(struct sk_buff *skb) +{ + struct sk_buff *msg; + size_t payload_len; + int rc; + + /* Make sure we start copying the packet from the MAC header */ + if (skb->data > skb_mac_header(skb)) + skb_push(skb, skb->data - skb_mac_header(skb)); + else + skb_pull(skb, skb_mac_header(skb) - skb->data); + + /* Ensure packet fits inside a single netlink attribute */ + payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); + + msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_packet_report_fill(msg, skb, payload_len); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + consume_skb(skb); +} + +static void net_dm_packet_work(struct work_struct *work) +{ + struct per_cpu_dm_data *data; + struct sk_buff_head list; + struct sk_buff *skb; + unsigned long flags; + + data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + __skb_queue_head_init(&list); + + spin_lock_irqsave(&data->drop_queue.lock, flags); + skb_queue_splice_tail_init(&data->drop_queue, &list); + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + + while ((skb = __skb_dequeue(&list))) + net_dm_packet_report(skb); +} + +static const struct net_dm_alert_ops net_dm_alert_packet_ops = { + .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit, + .napi_poll_probe = net_dm_packet_trace_napi_poll_hit, + .work_item_func = net_dm_packet_work, +}; + static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops, + [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops, }; static int net_dm_trace_on_set(struct netlink_ext_ack *extack) @@ -326,9 +541,12 @@ static void net_dm_trace_off_set(void) */ for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct sk_buff *skb; del_timer_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); + while ((skb = __skb_dequeue(&data->drop_queue))) + consume_skb(skb); } list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { @@ -370,12 +588,61 @@ static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack) return rc; } +static int net_dm_alert_mode_get_from_info(struct genl_info *info, + enum net_dm_alert_mode *p_alert_mode) +{ + u8 val; + + val = 
nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]); + + switch (val) { + case NET_DM_ALERT_MODE_SUMMARY: /* fall-through */ + case NET_DM_ALERT_MODE_PACKET: + *p_alert_mode = val; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int net_dm_alert_mode_set(struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + enum net_dm_alert_mode alert_mode; + int rc; + + if (!info->attrs[NET_DM_ATTR_ALERT_MODE]) + return 0; + + rc = net_dm_alert_mode_get_from_info(info, &alert_mode); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode"); + return -EINVAL; + } + + net_dm_alert_mode = alert_mode; + + return 0; +} + static int net_dm_cmd_config(struct sk_buff *skb, struct genl_info *info) { - NL_SET_ERR_MSG_MOD(info->extack, "Command not supported"); + struct netlink_ext_ack *extack = info->extack; + int rc; - return -EOPNOTSUPP; + if (trace_state == TRACE_ON) { + NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor while tracing is on"); + return -EBUSY; + } + + rc = net_dm_alert_mode_set(info); + if (rc) + return rc; + + return 0; } static int net_dm_cmd_trace(struct sk_buff *skb, @@ -430,6 +697,11 @@ out: return NOTIFY_DONE; } +static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { + [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 }, + [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, +}; + static const struct genl_ops dropmon_ops[] = { { .cmd = NET_DM_CMD_CONFIG, @@ -467,6 +739,8 @@ static struct genl_family net_drop_monitor_family __ro_after_init = { .hdrsize = 0, .name = "NET_DM", .version = 2, + .maxattr = NET_DM_ATTR_MAX, + .policy = net_dm_nl_policy, .pre_doit = net_dm_nl_pre_doit, .post_doit = net_dm_nl_post_doit, .module = THIS_MODULE, @@ -510,6 +784,7 @@ static int __init init_net_drop_monitor(void) for_each_possible_cpu(cpu) { data = &per_cpu(dm_cpu_data, cpu); spin_lock_init(&data->lock); + skb_queue_head_init(&data->drop_queue); } goto out; @@ -539,6 +814,7 @@ static void exit_net_drop_monitor(void) * to this struct and can free the skb inside it */ kfree_skb(data->skb); + WARN_ON(!skb_queue_empty(&data->drop_queue)); } BUG_ON(genl_unregister_family(&net_drop_monitor_family)); -- cgit v1.2.3-59-g8ed1b From 57986617a736aec2980c1c78a9dd8dcdf477ee6e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Aug 2019 10:35:52 +0300 Subject: drop_monitor: Allow truncation of dropped packets When sending dropped packets to user space it is not always necessary to copy the entire packet as usually only the headers are of interest. Allow user to specify the truncation length and add the original length of the packet as additional metadata to the netlink message. By default no truncation is performed. Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/uapi/linux/net_dropmon.h | 2 ++ net/core/drop_monitor.c | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index cfaaf75371b8..5cd7eb1f66ba 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -75,6 +75,8 @@ enum net_dm_attr { NET_DM_ATTR_PROTO, /* u16 */ NET_DM_ATTR_PAYLOAD, /* binary */ NET_DM_ATTR_PAD, + NET_DM_ATTR_TRUNC_LEN, /* u32 */ + NET_DM_ATTR_ORIG_LEN, /* u32 */ __NET_DM_ATTR_MAX, NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1 diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index ba765832413b..9f884adaa85f 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -77,6 +77,7 @@ static unsigned long dm_hw_check_delta = 2*HZ; static LIST_HEAD(hw_stats_list); static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; +static u32 net_dm_trunc_len; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, @@ -334,6 +335,8 @@ static size_t net_dm_packet_report_size(size_t payload_len) net_dm_in_port_size() + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(struct timespec)) + + /* NET_DM_ATTR_ORIG_LEN */ + nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PAYLOAD */ @@ -391,6 +394,9 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, nla_put(msg, NET_DM_ATTR_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; + if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) + goto nla_put_failure; + if (!payload_len) goto out; @@ -429,6 +435,8 @@ static void net_dm_packet_report(struct sk_buff *skb) /* Ensure packet fits inside a single netlink attribute */ payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); + if (net_dm_trunc_len) + payload_len = min_t(size_t, net_dm_trunc_len, payload_len); msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); if (!msg) @@ -627,6 +635,14 @@ static int net_dm_alert_mode_set(struct genl_info *info) return 0; } +static void net_dm_trunc_len_set(struct genl_info *info) +{ + if (!info->attrs[NET_DM_ATTR_TRUNC_LEN]) + return; + + net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]); +} + static int net_dm_cmd_config(struct sk_buff *skb, struct genl_info *info) { @@ -642,6 +658,8 @@ static int net_dm_cmd_config(struct sk_buff *skb, if (rc) return rc; + net_dm_trunc_len_set(info); + return 0; } @@ -700,6 +718,7 @@ out: static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 }, [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, + [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 }, }; static const struct genl_ops dropmon_ops[] = { -- cgit v1.2.3-59-g8ed1b From 444be061d012f1a8ebf95292a648a4e0e2afa83f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Aug 2019 10:35:53 +0300 Subject: drop_monitor: Add a command to query current configuration Users should be able to query the current configuration of drop monitor before they start using it. Add a command to query the existing configuration which currently consists of alert mode and packet truncation length. Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/uapi/linux/net_dropmon.h | 2 ++ net/core/drop_monitor.c | 48 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 5cd7eb1f66ba..3b765a8428b5 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -54,6 +54,8 @@ enum { NET_DM_CMD_START, NET_DM_CMD_STOP, NET_DM_CMD_PACKET_ALERT, + NET_DM_CMD_CONFIG_GET, + NET_DM_CMD_CONFIG_NEW, _NET_DM_CMD_MAX, }; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 9f884adaa85f..135638474ab8 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -676,6 +676,50 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -EOPNOTSUPP; } +static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + int rc; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rc = net_dm_config_fill(msg, info); + if (rc) + goto free_msg; + + return genlmsg_reply(msg, info); + +free_msg: + nlmsg_free(msg); + return rc; +} + static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { @@ -738,6 +782,10 @@ static const struct genl_ops dropmon_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, }, + { + .cmd = NET_DM_CMD_CONFIG_GET, + .doit = net_dm_cmd_config_get, + }, }; static int net_dm_nl_pre_doit(const struct genl_ops *ops, -- cgit v1.2.3-59-g8ed1b From 30328d46af593dcf24582f2a431d84ea0cf4bdef Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Aug 2019 10:35:54 +0300 Subject: drop_monitor: Make drop queue length configurable In packet alert mode, each CPU holds a list of dropped skbs that need to be processed in process context and sent to user space. To avoid exhausting the system's memory the maximum length of this queue is currently set to 1000. Allow users to tune the length of this queue according to their needs. The configured length is reported to user space when drop monitor configuration is queried. Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/uapi/linux/net_dropmon.h | 1 + net/core/drop_monitor.c | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 3b765a8428b5..1d0bdb1ba954 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -79,6 +79,7 @@ enum net_dm_attr { NET_DM_ATTR_PAD, NET_DM_ATTR_TRUNC_LEN, /* u32 */ NET_DM_ATTR_ORIG_LEN, /* u32 */ + NET_DM_ATTR_QUEUE_LEN, /* u32 */ __NET_DM_ATTR_MAX, NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1 diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 135638474ab8..eb3c34d69ea9 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -78,6 +78,7 @@ static LIST_HEAD(hw_stats_list); static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; static u32 net_dm_trunc_len; +static u32 net_dm_queue_len = 1000; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, @@ -93,8 +94,6 @@ struct net_dm_skb_cb { #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) -#define NET_DM_QUEUE_LEN 1000 - static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; @@ -289,7 +288,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, data = this_cpu_ptr(&dm_cpu_data); spin_lock_irqsave(&data->drop_queue.lock, flags); - if (skb_queue_len(&data->drop_queue) < NET_DM_QUEUE_LEN) + if (skb_queue_len(&data->drop_queue) < net_dm_queue_len) __skb_queue_tail(&data->drop_queue, nskb); else goto unlock_free; @@ -643,6 +642,14 @@ static void net_dm_trunc_len_set(struct genl_info *info) net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]); } +static void net_dm_queue_len_set(struct genl_info *info) +{ + if (!info->attrs[NET_DM_ATTR_QUEUE_LEN]) + return; + + net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]); +} + static int net_dm_cmd_config(struct sk_buff *skb, struct genl_info *info) { @@ -660,6 +667,8 @@ static int net_dm_cmd_config(struct sk_buff *skb, net_dm_trunc_len_set(info); + net_dm_queue_len_set(info); + return 0; } @@ -691,6 +700,9 @@ static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info) if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len)) goto nla_put_failure; + if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -763,6 +775,7 @@ static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 }, [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 }, + [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 }, }; static const struct genl_ops dropmon_ops[] = { -- cgit v1.2.3-59-g8ed1b From e9feb58020f952f7d9de785ede9a7d54ab1eda5c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Aug 2019 10:35:55 +0300 Subject: drop_monitor: Expose tail drop counter Previous patch made the length of the per-CPU skb drop list configurable. Expose a counter that shows how many packets could not be enqueued to this list. This allows users determine the desired queue length. Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/uapi/linux/net_dropmon.h | 10 ++++ net/core/drop_monitor.c | 101 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 1d0bdb1ba954..405b31cbf723 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -56,6 +56,8 @@ enum { NET_DM_CMD_PACKET_ALERT, NET_DM_CMD_CONFIG_GET, NET_DM_CMD_CONFIG_NEW, + NET_DM_CMD_STATS_GET, + NET_DM_CMD_STATS_NEW, _NET_DM_CMD_MAX, }; @@ -80,6 +82,7 @@ enum net_dm_attr { NET_DM_ATTR_TRUNC_LEN, /* u32 */ NET_DM_ATTR_ORIG_LEN, /* u32 */ NET_DM_ATTR_QUEUE_LEN, /* u32 */ + NET_DM_ATTR_STATS, /* nested */ __NET_DM_ATTR_MAX, NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1 @@ -103,4 +106,11 @@ enum { NET_DM_ATTR_PORT_MAX = __NET_DM_ATTR_PORT_MAX - 1 }; +enum { + NET_DM_ATTR_STATS_DROPPED, /* u64 */ + + __NET_DM_ATTR_STATS_MAX, + NET_DM_ATTR_STATS_MAX = __NET_DM_ATTR_STATS_MAX - 1 +}; + #endif diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index eb3c34d69ea9..39e094907391 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -51,12 +51,18 @@ static int trace_state = TRACE_OFF; */ static DEFINE_MUTEX(net_dm_mutex); +struct net_dm_stats { + u64 dropped; + struct u64_stats_sync syncp; +}; + struct per_cpu_dm_data { spinlock_t lock; /* Protects 'skb' and 'send_timer' */ struct sk_buff *skb; struct sk_buff_head drop_queue; struct work_struct dm_alert_work; struct timer_list send_timer; + struct net_dm_stats stats; }; struct dm_hw_stat_delta { @@ -300,6 +306,9 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, unlock_free: spin_unlock_irqrestore(&data->drop_queue.lock, flags); + u64_stats_update_begin(&data->stats.syncp); + data->stats.dropped++; + u64_stats_update_end(&data->stats.syncp); consume_skb(nskb); } @@ -732,6 +741,93 @@ free_msg: return rc; } +static void net_dm_stats_read(struct net_dm_stats *stats) +{ + int cpu; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct net_dm_stats *cpu_stats = &data->stats; + unsigned int start; + u64 dropped; + + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + dropped = cpu_stats->dropped; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + stats->dropped += dropped; + } +} + +static int net_dm_stats_put(struct sk_buff *msg) +{ + struct net_dm_stats stats; + struct nlattr *attr; + + net_dm_stats_read(&stats); + + attr = nla_nest_start(msg, NET_DM_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, + stats.dropped, NET_DM_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info) +{ + void *hdr; + int rc; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW); + if (!hdr) + return -EMSGSIZE; + + rc = net_dm_stats_put(msg); + if (rc) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + int rc; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rc = net_dm_stats_fill(msg, info); + if (rc) + goto free_msg; + 
+ return genlmsg_reply(msg, info); + +free_msg: + nlmsg_free(msg); + return rc; +} + static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { @@ -799,6 +895,10 @@ static const struct genl_ops dropmon_ops[] = { .cmd = NET_DM_CMD_CONFIG_GET, .doit = net_dm_cmd_config_get, }, + { + .cmd = NET_DM_CMD_STATS_GET, + .doit = net_dm_cmd_stats_get, + }, }; static int net_dm_nl_pre_doit(const struct genl_ops *ops, @@ -865,6 +965,7 @@ static int __init init_net_drop_monitor(void) data = &per_cpu(dm_cpu_data, cpu); spin_lock_init(&data->lock); skb_queue_head_init(&data->drop_queue); + u64_stats_init(&data->stats.syncp); } goto out; -- cgit v1.2.3-59-g8ed1b From a1b2f04ea527397fcacacd09e0d690927feef429 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 7 Aug 2019 15:16:59 +0100 Subject: netfilter: add missing includes to a number of header-files. A number of netfilter header-files used declarations and definitions from other headers without including them. Added include directives to make those declarations and definitions available. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set_getport.h | 4 ++++ include/linux/netfilter/nf_conntrack_amanda.h | 4 ++++ include/linux/netfilter/nf_conntrack_ftp.h | 8 +++++--- include/linux/netfilter/nf_conntrack_h323.h | 7 +++++-- include/linux/netfilter/nf_conntrack_h323_asn1.h | 2 ++ include/linux/netfilter/nf_conntrack_irc.h | 4 ++++ include/linux/netfilter/nf_conntrack_pptp.h | 9 +++++---- include/linux/netfilter/nf_conntrack_sip.h | 4 ++-- include/linux/netfilter/nf_conntrack_snmp.h | 3 +++ include/linux/netfilter/nf_conntrack_tftp.h | 5 +++++ include/net/netfilter/br_netfilter.h | 2 ++ include/net/netfilter/ipv4/nf_dup_ipv4.h | 3 +++ include/net/netfilter/ipv6/nf_defrag_ipv6.h | 4 +++- include/net/netfilter/ipv6/nf_dup_ipv6.h | 2 ++ include/net/netfilter/nf_conntrack_bridge.h | 4 ++++ include/net/netfilter/nf_conntrack_count.h | 3 +++ include/net/netfilter/nf_dup_netdev.h | 2 ++ include/net/netfilter/nf_flow_table.h | 1 + include/net/netfilter/nf_nat_helper.h | 4 ++-- include/net/netfilter/nf_nat_redirect.h | 3 +++ include/net/netfilter/nf_queue.h | 2 ++ include/net/netfilter/nf_reject.h | 3 +++ include/net/netfilter/nf_tables_ipv6.h | 1 + include/net/netfilter/nft_fib.h | 2 ++ include/net/netfilter/nft_meta.h | 2 ++ include/net/netfilter/nft_reject.h | 5 +++++ include/uapi/linux/netfilter/xt_policy.h | 1 + 27 files changed, 80 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netfilter/ipset/ip_set_getport.h b/include/linux/netfilter/ipset/ip_set_getport.h index ac6a11d38a19..a906df06948b 100644 --- a/include/linux/netfilter/ipset/ip_set_getport.h +++ b/include/linux/netfilter/ipset/ip_set_getport.h @@ -2,6 +2,10 @@ #ifndef _IP_SET_GETPORT_H #define _IP_SET_GETPORT_H +#include +#include +#include + extern bool ip_set_get_ip4_port(const struct sk_buff *skb, bool src, __be16 *port, u8 *proto); diff --git a/include/linux/netfilter/nf_conntrack_amanda.h b/include/linux/netfilter/nf_conntrack_amanda.h index 34345e543ba2..6f0ac896fcc9 100644 --- a/include/linux/netfilter/nf_conntrack_amanda.h +++ b/include/linux/netfilter/nf_conntrack_amanda.h @@ -3,6 +3,10 @@ #define _NF_CONNTRACK_AMANDA_H /* AMANDA tracking. 
*/ +#include +#include +#include + extern unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, unsigned int protoff, diff --git a/include/linux/netfilter/nf_conntrack_ftp.h b/include/linux/netfilter/nf_conntrack_ftp.h index 73a296dfd019..0e38302820b9 100644 --- a/include/linux/netfilter/nf_conntrack_ftp.h +++ b/include/linux/netfilter/nf_conntrack_ftp.h @@ -2,8 +2,12 @@ #ifndef _NF_CONNTRACK_FTP_H #define _NF_CONNTRACK_FTP_H +#include +#include +#include +#include #include - +#include #define FTP_PORT 21 @@ -20,8 +24,6 @@ struct nf_ct_ftp_master { u_int16_t flags[IP_CT_DIR_MAX]; }; -struct nf_conntrack_expect; - /* For NAT to hook in when we find a packet which describes what other * connection we should expect. */ extern unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb, diff --git a/include/linux/netfilter/nf_conntrack_h323.h b/include/linux/netfilter/nf_conntrack_h323.h index f76ed373a2a5..96dfa886f8c0 100644 --- a/include/linux/netfilter/nf_conntrack_h323.h +++ b/include/linux/netfilter/nf_conntrack_h323.h @@ -4,7 +4,12 @@ #ifdef __KERNEL__ +#include +#include +#include #include +#include +#include #define RAS_PORT 1719 #define Q931_PORT 1720 @@ -28,8 +33,6 @@ struct nf_ct_h323_master { }; }; -struct nf_conn; - int get_h225_addr(struct nf_conn *ct, unsigned char *data, TransportAddress *taddr, union nf_inet_addr *addr, __be16 *port); diff --git a/include/linux/netfilter/nf_conntrack_h323_asn1.h b/include/linux/netfilter/nf_conntrack_h323_asn1.h index 19df78341fb3..bd6797f823b2 100644 --- a/include/linux/netfilter/nf_conntrack_h323_asn1.h +++ b/include/linux/netfilter/nf_conntrack_h323_asn1.h @@ -37,6 +37,8 @@ /***************************************************************************** * H.323 Types ****************************************************************************/ + +#include #include typedef struct { diff --git a/include/linux/netfilter/nf_conntrack_irc.h b/include/linux/netfilter/nf_conntrack_irc.h index 00c2b74206e1..f75e005db969 100644 --- a/include/linux/netfilter/nf_conntrack_irc.h +++ b/include/linux/netfilter/nf_conntrack_irc.h @@ -4,6 +4,10 @@ #ifdef __KERNEL__ +#include +#include +#include + #define IRC_PORT 6667 extern unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, diff --git a/include/linux/netfilter/nf_conntrack_pptp.h b/include/linux/netfilter/nf_conntrack_pptp.h index 833a5b2255ea..3f10e806f0dc 100644 --- a/include/linux/netfilter/nf_conntrack_pptp.h +++ b/include/linux/netfilter/nf_conntrack_pptp.h @@ -3,7 +3,12 @@ #ifndef _NF_CONNTRACK_PPTP_H #define _NF_CONNTRACK_PPTP_H +#include +#include +#include #include +#include +#include extern const char *const pptp_msg_name[]; @@ -297,10 +302,6 @@ union pptp_ctrl_union { struct PptpSetLinkInfo setlink; }; -/* crap needed for nf_conntrack_compat.h */ -struct nf_conn; -struct nf_conntrack_expect; - extern int (*nf_nat_pptp_hook_outbound)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, diff --git a/include/linux/netfilter/nf_conntrack_sip.h b/include/linux/netfilter/nf_conntrack_sip.h index c7fc38807a33..f6437f7841af 100644 --- a/include/linux/netfilter/nf_conntrack_sip.h +++ b/include/linux/netfilter/nf_conntrack_sip.h @@ -3,9 +3,9 @@ #define __NF_CONNTRACK_SIP_H__ #ifdef __KERNEL__ -#include - +#include #include +#include #define SIP_PORT 5060 #define SIP_TIMEOUT 3600 diff --git a/include/linux/netfilter/nf_conntrack_snmp.h b/include/linux/netfilter/nf_conntrack_snmp.h index 818088c47475..87e4f33eb55f 100644 --- 
a/include/linux/netfilter/nf_conntrack_snmp.h +++ b/include/linux/netfilter/nf_conntrack_snmp.h @@ -2,6 +2,9 @@ #ifndef _NF_CONNTRACK_SNMP_H #define _NF_CONNTRACK_SNMP_H +#include +#include + extern int (*nf_nat_snmp_hook)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, diff --git a/include/linux/netfilter/nf_conntrack_tftp.h b/include/linux/netfilter/nf_conntrack_tftp.h index 5769e12dd0a2..dc4c1b9beac0 100644 --- a/include/linux/netfilter/nf_conntrack_tftp.h +++ b/include/linux/netfilter/nf_conntrack_tftp.h @@ -4,6 +4,11 @@ #define TFTP_PORT 69 +#include +#include +#include +#include + struct tftphdr { __be16 opcode; }; diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h index 302fcd3aade2..ca121ed906df 100644 --- a/include/net/netfilter/br_netfilter.h +++ b/include/net/netfilter/br_netfilter.h @@ -2,6 +2,8 @@ #ifndef _BR_NETFILTER_H_ #define _BR_NETFILTER_H_ +#include + #include "../../../net/bridge/br_private.h" static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) diff --git a/include/net/netfilter/ipv4/nf_dup_ipv4.h b/include/net/netfilter/ipv4/nf_dup_ipv4.h index c962e0be3549..a2bc16cdbcd3 100644 --- a/include/net/netfilter/ipv4/nf_dup_ipv4.h +++ b/include/net/netfilter/ipv4/nf_dup_ipv4.h @@ -2,6 +2,9 @@ #ifndef _NF_DUP_IPV4_H_ #define _NF_DUP_IPV4_H_ +#include +#include + void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in_addr *gw, int oif); diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index 9d7e28736da9..6d31cd041143 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -2,7 +2,9 @@ #ifndef _NF_DEFRAG_IPV6_H #define _NF_DEFRAG_IPV6_H -struct net; +#include +#include + int nf_defrag_ipv6_enable(struct net *); int nf_ct_frag6_init(void); diff --git a/include/net/netfilter/ipv6/nf_dup_ipv6.h b/include/net/netfilter/ipv6/nf_dup_ipv6.h index caf0c2dd8ee7..f6312bb04a13 100644 --- a/include/net/netfilter/ipv6/nf_dup_ipv6.h +++ b/include/net/netfilter/ipv6/nf_dup_ipv6.h @@ -2,6 +2,8 @@ #ifndef _NF_DUP_IPV6_H_ #define _NF_DUP_IPV6_H_ +#include + void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif); diff --git a/include/net/netfilter/nf_conntrack_bridge.h b/include/net/netfilter/nf_conntrack_bridge.h index 9a5514d5bc51..8f2e5b2ab523 100644 --- a/include/net/netfilter/nf_conntrack_bridge.h +++ b/include/net/netfilter/nf_conntrack_bridge.h @@ -1,6 +1,10 @@ #ifndef NF_CONNTRACK_BRIDGE_ #define NF_CONNTRACK_BRIDGE_ +#include +#include +#include + struct nf_ct_bridge_info { struct nf_hook_ops *ops; unsigned int ops_size; diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index f32fc8289473..9645b47fa7e4 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -2,6 +2,9 @@ #define _NF_CONNTRACK_COUNT_H #include +#include +#include +#include struct nf_conncount_data; diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h index 2a6f6dcad3d9..181672672160 100644 --- a/include/net/netfilter/nf_dup_netdev.h +++ b/include/net/netfilter/nf_dup_netdev.h @@ -2,6 +2,8 @@ #ifndef _NF_DUP_NETDEV_H_ #define _NF_DUP_NETDEV_H_ +#include + void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif); void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif); diff --git 
a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index d8c187936bec..7249e331bd0b 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h index 97d7033e93a4..efae84646353 100644 --- a/include/net/netfilter/nf_nat_helper.h +++ b/include/net/netfilter/nf_nat_helper.h @@ -3,9 +3,9 @@ #define _NF_NAT_HELPER_H /* NAT protocol helper routines. */ +#include #include - -struct sk_buff; +#include /* These return true or false. */ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb, struct nf_conn *ct, diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h index c129aacc8ae8..2418653a66db 100644 --- a/include/net/netfilter/nf_nat_redirect.h +++ b/include/net/netfilter/nf_nat_redirect.h @@ -2,6 +2,9 @@ #ifndef _NF_NAT_REDIRECT_H_ #define _NF_NAT_REDIRECT_H_ +#include +#include + unsigned int nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_ipv4_multi_range_compat *mr, diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 3cb6dcf53a4e..359b80b43169 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include /* Each queued (to userspace) skbuff has one of these. */ struct nf_queue_entry { diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h index 221f877f29d1..9051c3a0c8e7 100644 --- a/include/net/netfilter/nf_reject.h +++ b/include/net/netfilter/nf_reject.h @@ -2,6 +2,9 @@ #ifndef _NF_REJECT_H #define _NF_REJECT_H +#include +#include + static inline bool nf_reject_verify_csum(__u8 proto) { /* Skip protocols that don't use 16-bit one's complement checksum diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index dabe6fdb553a..d0f1c537b017 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -4,6 +4,7 @@ #include #include +#include static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, struct sk_buff *skb) diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index e4c4d8eaca8c..628b6fa579cd 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -2,6 +2,8 @@ #ifndef _NFT_FIB_H_ #define _NFT_FIB_H_ +#include + struct nft_fib { enum nft_registers dreg:8; u8 result; diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 5c69e9b09388..07e2fd507963 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -2,6 +2,8 @@ #ifndef _NFT_META_H_ #define _NFT_META_H_ +#include + struct nft_meta { enum nft_meta_keys key:8; union { diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index de80c50761f0..56b123a42220 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -2,6 +2,11 @@ #ifndef _NFT_REJECT_H_ #define _NFT_REJECT_H_ +#include +#include +#include +#include + struct nft_reject { enum nft_reject_types type:8; u8 icmp_code; diff --git a/include/uapi/linux/netfilter/xt_policy.h b/include/uapi/linux/netfilter/xt_policy.h index 323bfa3074c5..4cf2ce2a8a44 100644 --- a/include/uapi/linux/netfilter/xt_policy.h +++ b/include/uapi/linux/netfilter/xt_policy.h @@ -2,6 +2,7 @@ #ifndef _XT_POLICY_H #define 
_XT_POLICY_H +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From e9dc7c60507c822992e793bd3845f0556ae0ff98 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Sat, 10 Aug 2019 21:18:09 +0200 Subject: can: gw: use struct canfd_frame as internal data structure To prepare the CAN FD support this patch implements the first adaptions in data structures for CAN FD without changing the current functionality. Additionally some code at the end of this patch is moved or indented to simplify the review of the next implementation step. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/gw.h | 5 +- net/can/gw.c | 222 +++++++++++++++++++++++--------------------- 2 files changed, 120 insertions(+), 107 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h index 7bee7a0b9800..ed811bc463b5 100644 --- a/include/uapi/linux/can/gw.h +++ b/include/uapi/linux/can/gw.h @@ -93,10 +93,11 @@ enum { /* CAN frame elements that are affected by curr. 3 CAN frame modifications */ #define CGW_MOD_ID 0x01 -#define CGW_MOD_DLC 0x02 +#define CGW_MOD_DLC 0x02 /* contains the data length in bytes */ +#define CGW_MOD_LEN CGW_MOD_DLC /* CAN FD length representation */ #define CGW_MOD_DATA 0x04 -#define CGW_FRAME_MODS 3 /* ID DLC DATA */ +#define CGW_FRAME_MODS 3 /* ID DLC/LEN DATA */ #define MAX_MODFUNCTIONS (CGW_MOD_FUNCS * CGW_FRAME_MODS) diff --git a/net/can/gw.c b/net/can/gw.c index 4c6ad0054a4c..3a1ad206fbef 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -85,10 +85,10 @@ static struct kmem_cache *cgw_cache __read_mostly; /* structure that contains the (on-the-fly) CAN frame modifications */ struct cf_mod { struct { - struct can_frame and; - struct can_frame or; - struct can_frame xor; - struct can_frame set; + struct canfd_frame and; + struct canfd_frame or; + struct canfd_frame xor; + struct canfd_frame set; } modframe; struct { u8 and; @@ -96,7 +96,7 @@ struct cf_mod { u8 xor; u8 set; } modtype; - void (*modfunc[MAX_MODFUNCTIONS])(struct can_frame *cf, + void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf, struct cf_mod *mod); /* CAN frame checksum calculation after CAN frame modifications */ @@ -105,8 +105,10 @@ struct cf_mod { struct cgw_csum_crc8 crc8; } csum; struct { - void (*xor)(struct can_frame *cf, struct cgw_csum_xor *xor); - void (*crc8)(struct can_frame *cf, struct cgw_csum_crc8 *crc8); + void (*xor)(struct canfd_frame *cf, + struct cgw_csum_xor *xor); + void (*crc8)(struct canfd_frame *cf, + struct cgw_csum_crc8 *crc8); } csumfunc; u32 uid; }; @@ -149,23 +151,23 @@ struct cgw_job { /* modification functions that are invoked in the hot path in can_can_gw_rcv */ -#define MODFUNC(func, op) static void func(struct can_frame *cf, \ +#define MODFUNC(func, op) static void func(struct canfd_frame *cf, \ struct cf_mod *mod) { op ; } MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id) -MODFUNC(mod_and_dlc, cf->can_dlc &= mod->modframe.and.can_dlc) +MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len) MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data) MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id) -MODFUNC(mod_or_dlc, cf->can_dlc |= mod->modframe.or.can_dlc) +MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len) MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data) MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id) -MODFUNC(mod_xor_dlc, cf->can_dlc ^= mod->modframe.xor.can_dlc) +MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len) 
MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data) MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id) -MODFUNC(mod_set_dlc, cf->can_dlc = mod->modframe.set.can_dlc) +MODFUNC(mod_set_len, cf->len = mod->modframe.set.len) MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data) -static inline void canframecpy(struct can_frame *dst, struct can_frame *src) +static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 3 bytes hole of the struct. This is needed @@ -173,12 +175,14 @@ static inline void canframecpy(struct can_frame *dst, struct can_frame *src) */ dst->can_id = src->can_id; - dst->can_dlc = src->can_dlc; + dst->len = src->can_dlc; *(u64 *)dst->data = *(u64 *)src->data; } static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re) { + s8 dlen = CAN_MAX_DLEN; + /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] * relative to received dlc -1 .. -8 : * e.g. for received dlc = 8 @@ -187,27 +191,27 @@ static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re) * -8 => index = 0 (data[0]) */ - if (fr > -9 && fr < 8 && - to > -9 && to < 8 && - re > -9 && re < 8) + if (fr >= -dlen && fr < dlen && + to >= -dlen && to < dlen && + re >= -dlen && re < dlen) return 0; else return -EINVAL; } -static inline int calc_idx(int idx, int rx_dlc) +static inline int calc_idx(int idx, int rx_len) { if (idx < 0) - return rx_dlc + idx; + return rx_len + idx; else return idx; } -static void cgw_csum_xor_rel(struct can_frame *cf, struct cgw_csum_xor *xor) +static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor) { - int from = calc_idx(xor->from_idx, cf->can_dlc); - int to = calc_idx(xor->to_idx, cf->can_dlc); - int res = calc_idx(xor->result_idx, cf->can_dlc); + int from = calc_idx(xor->from_idx, cf->len); + int to = calc_idx(xor->to_idx, cf->len); + int res = calc_idx(xor->result_idx, cf->len); u8 val = xor->init_xor_val; int i; @@ -225,7 +229,7 @@ static void cgw_csum_xor_rel(struct can_frame *cf, struct cgw_csum_xor *xor) cf->data[res] = val; } -static void cgw_csum_xor_pos(struct can_frame *cf, struct cgw_csum_xor *xor) +static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; @@ -236,7 +240,7 @@ static void cgw_csum_xor_pos(struct can_frame *cf, struct cgw_csum_xor *xor) cf->data[xor->result_idx] = val; } -static void cgw_csum_xor_neg(struct can_frame *cf, struct cgw_csum_xor *xor) +static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; @@ -247,11 +251,12 @@ static void cgw_csum_xor_neg(struct can_frame *cf, struct cgw_csum_xor *xor) cf->data[xor->result_idx] = val; } -static void cgw_csum_crc8_rel(struct can_frame *cf, struct cgw_csum_crc8 *crc8) +static void cgw_csum_crc8_rel(struct canfd_frame *cf, + struct cgw_csum_crc8 *crc8) { - int from = calc_idx(crc8->from_idx, cf->can_dlc); - int to = calc_idx(crc8->to_idx, cf->can_dlc); - int res = calc_idx(crc8->result_idx, cf->can_dlc); + int from = calc_idx(crc8->from_idx, cf->len); + int to = calc_idx(crc8->to_idx, cf->len); + int res = calc_idx(crc8->result_idx, cf->len); u8 crc = crc8->init_crc_val; int i; @@ -284,7 +289,8 @@ static void cgw_csum_crc8_rel(struct can_frame *cf, struct cgw_csum_crc8 *crc8) cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } -static void cgw_csum_crc8_pos(struct can_frame *cf, struct cgw_csum_crc8 *crc8) +static void 
cgw_csum_crc8_pos(struct canfd_frame *cf, + struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; @@ -310,7 +316,8 @@ static void cgw_csum_crc8_pos(struct can_frame *cf, struct cgw_csum_crc8 *crc8) cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } -static void cgw_csum_crc8_neg(struct can_frame *cf, struct cgw_csum_crc8 *crc8) +static void cgw_csum_crc8_neg(struct canfd_frame *cf, + struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; @@ -340,7 +347,7 @@ static void cgw_csum_crc8_neg(struct can_frame *cf, struct cgw_csum_crc8 *crc8) static void can_can_gw_rcv(struct sk_buff *skb, void *data) { struct cgw_job *gwj = (struct cgw_job *)data; - struct can_frame *cf; + struct canfd_frame *cf; struct sk_buff *nskb; int modidx = 0; @@ -400,7 +407,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ - cf = (struct can_frame *)nskb->data; + cf = (struct canfd_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) @@ -409,22 +416,22 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) /* Has the CAN frame been modified? */ if (modidx) { /* get available space for the processed CAN frame type */ - int max_len = nskb->len - offsetof(struct can_frame, data); + int max_len = nskb->len - offsetof(struct canfd_frame, data); /* dlc may have changed, make sure it fits to the CAN frame */ - if (cf->can_dlc > max_len) + if (cf->len > max_len) goto out_delete; /* check for checksum updates in classic CAN length only */ if (gwj->mod.csumfunc.crc8) { - if (cf->can_dlc > 8) + if (cf->len > 8) goto out_delete; (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); } if (gwj->mod.csumfunc.xor) { - if (cf->can_dlc > 8) + if (cf->len > 8) goto out_delete; (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); @@ -492,7 +499,6 @@ static int cgw_notifier(struct notifier_block *nb, static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, u32 pid, u32 seq, int flags) { - struct cgw_frame_mod mb; struct rtcanmsg *rtcan; struct nlmsghdr *nlh; @@ -529,32 +535,36 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, goto cancel; } - if (gwj->mod.modtype.and) { - memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.and; - if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) - goto cancel; - } + if (1) { + struct cgw_frame_mod mb; - if (gwj->mod.modtype.or) { - memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.or; - if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) - goto cancel; - } + if (gwj->mod.modtype.and) { + memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.and; + if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) + goto cancel; + } - if (gwj->mod.modtype.xor) { - memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.xor; - if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) - goto cancel; - } + if (gwj->mod.modtype.or) { + memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.or; + if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) + goto cancel; + } - if (gwj->mod.modtype.set) { - memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.set; - if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) - goto cancel; + if (gwj->mod.modtype.xor) { + memcpy(&mb.cf, 
&gwj->mod.modframe.xor, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.xor; + if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) + goto cancel; + } + + if (gwj->mod.modtype.set) { + memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.set; + if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) + goto cancel; + } } if (gwj->mod.uid) { @@ -642,7 +652,6 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX + 1]; - struct cgw_frame_mod mb; int modidx = 0; int err = 0; @@ -662,69 +671,72 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, } /* check for AND/OR/XOR/SET modifications */ + if (1) { + struct cgw_frame_mod mb; - if (tb[CGW_MOD_AND]) { - nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); + if (tb[CGW_MOD_AND]) { + nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); - canframecpy(&mod->modframe.and, &mb.cf); - mod->modtype.and = mb.modtype; + canframecpy(&mod->modframe.and, &mb.cf); + mod->modtype.and = mb.modtype; - if (mb.modtype & CGW_MOD_ID) - mod->modfunc[modidx++] = mod_and_id; + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_and_id; - if (mb.modtype & CGW_MOD_DLC) - mod->modfunc[modidx++] = mod_and_dlc; + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_and_len; - if (mb.modtype & CGW_MOD_DATA) - mod->modfunc[modidx++] = mod_and_data; - } + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_and_data; + } - if (tb[CGW_MOD_OR]) { - nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); + if (tb[CGW_MOD_OR]) { + nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); - canframecpy(&mod->modframe.or, &mb.cf); - mod->modtype.or = mb.modtype; + canframecpy(&mod->modframe.or, &mb.cf); + mod->modtype.or = mb.modtype; - if (mb.modtype & CGW_MOD_ID) - mod->modfunc[modidx++] = mod_or_id; + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_or_id; - if (mb.modtype & CGW_MOD_DLC) - mod->modfunc[modidx++] = mod_or_dlc; + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_or_len; - if (mb.modtype & CGW_MOD_DATA) - mod->modfunc[modidx++] = mod_or_data; - } + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_or_data; + } - if (tb[CGW_MOD_XOR]) { - nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); + if (tb[CGW_MOD_XOR]) { + nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); - canframecpy(&mod->modframe.xor, &mb.cf); - mod->modtype.xor = mb.modtype; + canframecpy(&mod->modframe.xor, &mb.cf); + mod->modtype.xor = mb.modtype; - if (mb.modtype & CGW_MOD_ID) - mod->modfunc[modidx++] = mod_xor_id; + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_xor_id; - if (mb.modtype & CGW_MOD_DLC) - mod->modfunc[modidx++] = mod_xor_dlc; + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_xor_len; - if (mb.modtype & CGW_MOD_DATA) - mod->modfunc[modidx++] = mod_xor_data; - } + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_xor_data; + } - if (tb[CGW_MOD_SET]) { - nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); + if (tb[CGW_MOD_SET]) { + nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); - canframecpy(&mod->modframe.set, &mb.cf); - mod->modtype.set = mb.modtype; + canframecpy(&mod->modframe.set, &mb.cf); + mod->modtype.set = mb.modtype; - if (mb.modtype & CGW_MOD_ID) - mod->modfunc[modidx++] = mod_set_id; + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_set_id; - if (mb.modtype & CGW_MOD_DLC) - mod->modfunc[modidx++] = mod_set_dlc; + if (mb.modtype & CGW_MOD_LEN) + 
mod->modfunc[modidx++] = mod_set_len; - if (mb.modtype & CGW_MOD_DATA) - mod->modfunc[modidx++] = mod_set_data; + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_set_data; + } } /* check for checksum operations after CAN frame modifications */ -- cgit v1.2.3-59-g8ed1b From 456a8a646b2563438c16a9b27decf9aa717f1ebb Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Sat, 10 Aug 2019 21:18:10 +0200 Subject: can: gw: add support for CAN FD frames Introduce CAN FD support which needs an extension of the netlink API to pass CAN FD type content to the kernel which has a different size to Classic CAN. Additionally the struct canfd_frame has a new 'flags' element that can now be modified with can-gw. The new CGW_FLAGS_CAN_FD option flag defines whether the routing job handles Classic CAN or CAN FD frames. This setting is very strict at reception time and enables the new possibilities, e.g. CGW_FDMOD_* and modifying the flags element of struct canfd_frame, only when CGW_FLAGS_CAN_FD is set. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/gw.h | 14 ++- net/can/gw.c | 214 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 200 insertions(+), 28 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h index ed811bc463b5..3aea5388c8e4 100644 --- a/include/uapi/linux/can/gw.h +++ b/include/uapi/linux/can/gw.h @@ -80,6 +80,10 @@ enum { CGW_DELETED, /* number of deleted CAN frames (see max_hops param) */ CGW_LIM_HOPS, /* limit the number of hops of this specific rule */ CGW_MOD_UID, /* user defined identifier for modification updates */ + CGW_FDMOD_AND, /* CAN FD frame modification binary AND */ + CGW_FDMOD_OR, /* CAN FD frame modification binary OR */ + CGW_FDMOD_XOR, /* CAN FD frame modification binary XOR */ + CGW_FDMOD_SET, /* CAN FD frame modification set alternate values */ __CGW_MAX }; @@ -88,6 +92,7 @@ enum { #define CGW_FLAGS_CAN_ECHO 0x01 #define CGW_FLAGS_CAN_SRC_TSTAMP 0x02 #define CGW_FLAGS_CAN_IIF_TX_OK 0x04 +#define CGW_FLAGS_CAN_FD 0x08 #define CGW_MOD_FUNCS 4 /* AND OR XOR SET */ @@ -96,8 +101,9 @@ enum { #define CGW_MOD_DLC 0x02 /* contains the data length in bytes */ #define CGW_MOD_LEN CGW_MOD_DLC /* CAN FD length representation */ #define CGW_MOD_DATA 0x04 +#define CGW_MOD_FLAGS 0x08 /* CAN FD flags */ -#define CGW_FRAME_MODS 3 /* ID DLC/LEN DATA */ +#define CGW_FRAME_MODS 4 /* ID DLC/LEN DATA FLAGS */ #define MAX_MODFUNCTIONS (CGW_MOD_FUNCS * CGW_FRAME_MODS) @@ -106,7 +112,13 @@ struct cgw_frame_mod { __u8 modtype; } __attribute__((packed)); +struct cgw_fdframe_mod { + struct canfd_frame cf; + __u8 modtype; +} __attribute__((packed)); + #define CGW_MODATTR_LEN sizeof(struct cgw_frame_mod) +#define CGW_FDMODATTR_LEN sizeof(struct cgw_fdframe_mod) struct cgw_csum_xor { __s8 from_idx; diff --git a/net/can/gw.c b/net/can/gw.c index 3a1ad206fbef..65d60c93af29 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* gw.c - CAN frame Gateway/Router/Bridge with netlink interface * - * Copyright (c) 2017 Volkswagen Group Electronic Research + * Copyright (c) 2019 Volkswagen Group Electronic Research * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -59,7 +59,7 @@ #include #include -#define CAN_GW_VERSION "20170425" +#define CAN_GW_VERSION "20190810" #define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); @@ -156,17 +156,50 @@ struct cgw_job { MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id) MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len) +MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags) MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data) MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id) MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len) +MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags) MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data) MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id) MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len) +MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags) MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data) MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id) MODFUNC(mod_set_len, cf->len = mod->modframe.set.len) +MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags) MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data) +static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod) +{ + int i; + + for (i = 0; i < CANFD_MAX_DLEN; i += 8) + *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i); +} + +static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod) +{ + int i; + + for (i = 0; i < CANFD_MAX_DLEN; i += 8) + *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i); +} + +static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod) +{ + int i; + + for (i = 0; i < CANFD_MAX_DLEN; i += 8) + *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i); +} + +static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod) +{ + memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN); +} + static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { /* Copy the struct members separately to ensure that no uninitialized @@ -179,10 +212,26 @@ static void canframecpy(struct canfd_frame *dst, struct can_frame *src) *(u64 *)dst->data = *(u64 *)src->data; } -static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re) +static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src) +{ + /* Copy the struct members separately to ensure that no uninitialized + * data are copied in the 2 bytes hole of the struct. This is needed + * to make easy compares of the data in the struct cf_mod. + */ + + dst->can_id = src->can_id; + dst->flags = src->flags; + dst->len = src->len; + memcpy(dst->data, src->data, CANFD_MAX_DLEN); +} + +static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r) { s8 dlen = CAN_MAX_DLEN; + if (r->flags & CGW_FLAGS_CAN_FD) + dlen = CANFD_MAX_DLEN; + /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] * relative to received dlc -1 .. -8 : * e.g. for received dlc = 8 @@ -351,6 +400,15 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) struct sk_buff *nskb; int modidx = 0; + /* process strictly Classic CAN or CAN FD frames */ + if (gwj->flags & CGW_FLAGS_CAN_FD) { + if (skb->len != CANFD_MTU) + return; + } else { + if (skb->len != CAN_MTU) + return; + } + /* Do not handle CAN frames routed more than 'max_hops' times. * In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. 
circular CAN routes). @@ -419,23 +477,19 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) int max_len = nskb->len - offsetof(struct canfd_frame, data); /* dlc may have changed, make sure it fits to the CAN frame */ - if (cf->len > max_len) - goto out_delete; - - /* check for checksum updates in classic CAN length only */ - if (gwj->mod.csumfunc.crc8) { - if (cf->len > 8) - goto out_delete; - - (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); + if (cf->len > max_len) { + /* delete frame due to misconfiguration */ + gwj->deleted_frames++; + kfree_skb(nskb); + return; } - if (gwj->mod.csumfunc.xor) { - if (cf->len > 8) - goto out_delete; + /* check for checksum updates */ + if (gwj->mod.csumfunc.crc8) + (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); + if (gwj->mod.csumfunc.xor) (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); - } } /* clear the skb timestamp if not configured the other way */ @@ -447,13 +501,6 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) gwj->dropped_frames++; else gwj->handled_frames++; - - return; - - out_delete: - /* delete frame due to misconfiguration */ - gwj->deleted_frames++; - kfree_skb(nskb); } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) @@ -535,7 +582,37 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, goto cancel; } - if (1) { + if (gwj->flags & CGW_FLAGS_CAN_FD) { + struct cgw_fdframe_mod mb; + + if (gwj->mod.modtype.and) { + memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.and; + if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0) + goto cancel; + } + + if (gwj->mod.modtype.or) { + memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.or; + if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0) + goto cancel; + } + + if (gwj->mod.modtype.xor) { + memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.xor; + if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0) + goto cancel; + } + + if (gwj->mod.modtype.set) { + memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); + mb.modtype = gwj->mod.modtype.set; + if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0) + goto cancel; + } + } else { struct cgw_frame_mod mb; if (gwj->mod.modtype.and) { @@ -645,6 +722,10 @@ static const struct nla_policy cgw_policy[CGW_MAX + 1] = { [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, [CGW_MOD_UID] = { .type = NLA_U32 }, + [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) }, + [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) }, + [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) }, + [CGW_FDMOD_SET] = { .len = sizeof(struct cgw_fdframe_mod) }, }; /* check for common and gwtype specific attributes */ @@ -652,6 +733,7 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX + 1]; + struct rtcanmsg *r = nlmsg_data(nlh); int modidx = 0; int err = 0; @@ -671,7 +753,85 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, } /* check for AND/OR/XOR/SET modifications */ - if (1) { + if (r->flags & CGW_FLAGS_CAN_FD) { + struct cgw_fdframe_mod mb; + + if (tb[CGW_FDMOD_AND]) { + nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN); + + canfdframecpy(&mod->modframe.and, &mb.cf); + mod->modtype.and = mb.modtype; + + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_and_id; + + if (mb.modtype & CGW_MOD_LEN) + 
mod->modfunc[modidx++] = mod_and_len; + + if (mb.modtype & CGW_MOD_FLAGS) + mod->modfunc[modidx++] = mod_and_flags; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_and_fddata; + } + + if (tb[CGW_FDMOD_OR]) { + nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN); + + canfdframecpy(&mod->modframe.or, &mb.cf); + mod->modtype.or = mb.modtype; + + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_or_id; + + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_or_len; + + if (mb.modtype & CGW_MOD_FLAGS) + mod->modfunc[modidx++] = mod_or_flags; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_or_fddata; + } + + if (tb[CGW_FDMOD_XOR]) { + nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN); + + canfdframecpy(&mod->modframe.xor, &mb.cf); + mod->modtype.xor = mb.modtype; + + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_xor_id; + + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_xor_len; + + if (mb.modtype & CGW_MOD_FLAGS) + mod->modfunc[modidx++] = mod_xor_flags; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_xor_fddata; + } + + if (tb[CGW_FDMOD_SET]) { + nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN); + + canfdframecpy(&mod->modframe.set, &mb.cf); + mod->modtype.set = mb.modtype; + + if (mb.modtype & CGW_MOD_ID) + mod->modfunc[modidx++] = mod_set_id; + + if (mb.modtype & CGW_MOD_LEN) + mod->modfunc[modidx++] = mod_set_len; + + if (mb.modtype & CGW_MOD_FLAGS) + mod->modfunc[modidx++] = mod_set_flags; + + if (mb.modtype & CGW_MOD_DATA) + mod->modfunc[modidx++] = mod_set_fddata; + } + } else { struct cgw_frame_mod mb; if (tb[CGW_MOD_AND]) { @@ -745,7 +905,7 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, - c->result_idx); + c->result_idx, r); if (err) return err; @@ -768,7 +928,7 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, - c->result_idx); + c->result_idx, r); if (err) return err; -- cgit v1.2.3-59-g8ed1b From 3ca3c4aad2efa2931b663acc4ece7a38b31071d1 Mon Sep 17 00:00:00 2001 From: Andre Hartmann Date: Sat, 23 Mar 2019 16:04:19 +0100 Subject: can: netlink: fix documentation typos This patch fixes some documentation typos in struct can_bittiming_const. 
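For reference, the comments being corrected describe how the bit-timing segments fit together: one bit is sync_seg (always one time quantum) plus prop_seg, phase_seg1 and phase_seg2, and each time quantum lasts brp clock periods. A minimal sketch of that relationship, assuming actual prop_seg/phase_seg1/phase_seg2/brp values chosen within the min/max bounds of struct can_bittiming_const and a controller clock fclock in Hz (illustrative only, not part of the patch):

	/* nominal bit rate = fclock / (brp * time quanta per bit) */
	static unsigned int nominal_bitrate(unsigned int fclock, unsigned int brp,
					    unsigned int prop_seg,
					    unsigned int phase_seg1,
					    unsigned int phase_seg2)
	{
		unsigned int tq_per_bit = 1 + prop_seg + phase_seg1 + phase_seg2;

		return fclock / (brp * tq_per_bit);
	}
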
Signed-off-by: Andre Hartmann Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/netlink.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 9f56fad4785b..1bc70d3a4d39 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -40,15 +40,15 @@ struct can_bittiming { }; /* - * CAN harware-dependent bit-timing constant + * CAN hardware-dependent bit-timing constant * * Used for calculating and checking bit-timing parameters */ struct can_bittiming_const { char name[16]; /* Name of the CAN controller hardware */ - __u32 tseg1_min; /* Time segement 1 = prop_seg + phase_seg1 */ + __u32 tseg1_min; /* Time segment 1 = prop_seg + phase_seg1 */ __u32 tseg1_max; - __u32 tseg2_min; /* Time segement 2 = phase_seg2 */ + __u32 tseg2_min; /* Time segment 2 = phase_seg2 */ __u32 tseg2_max; __u32 sjw_max; /* Synchronisation jump width */ __u32 brp_min; /* Bit-rate prescaler */ -- cgit v1.2.3-59-g8ed1b From 707816c8b0508ba7afcee9a13b02bb4261e4bd8c Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 14 Aug 2019 09:01:28 +0100 Subject: netfilter: remove deprecation warnings from uapi headers. There are two netfilter userspace headers which contain deprecation warnings. While these headers are not used within the kernel, they are compiled stand-alone for header-testing. Pablo informs me that userspace iptables still refer to these headers, and the intention was to use xt_LOG.h instead and remove these, but userspace was never updated. Remove the warnings. Fixes: 2a475c409fe8 ("kbuild: remove all netfilter headers from header-test blacklist.") Reported-by: kbuild test robot Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_ipv4/ipt_LOG.h | 2 -- include/uapi/linux/netfilter_ipv6/ip6t_LOG.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter_ipv4/ipt_LOG.h b/include/uapi/linux/netfilter_ipv4/ipt_LOG.h index 6dec14ba851b..b7cf2c669f40 100644 --- a/include/uapi/linux/netfilter_ipv4/ipt_LOG.h +++ b/include/uapi/linux/netfilter_ipv4/ipt_LOG.h @@ -2,8 +2,6 @@ #ifndef _IPT_LOG_H #define _IPT_LOG_H -#warning "Please update iptables, this file will be removed soon!" - /* make sure not to change this without changing netfilter.h:NF_LOG_* (!) */ #define IPT_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ #define IPT_LOG_TCPOPT 0x02 /* Log TCP options */ diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h b/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h index 7553a434e4da..23e91a9c2583 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h +++ b/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h @@ -2,8 +2,6 @@ #ifndef _IP6T_LOG_H #define _IP6T_LOG_H -#warning "Please update iptables, this file will be removed soon!" - /* make sure not to change this without changing netfilter.h:NF_LOG_* (!) */ #define IP6T_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ #define IP6T_LOG_TCPOPT 0x02 /* Log TCP options */ -- cgit v1.2.3-59-g8ed1b From 5e58109b1ea454b93e455e0e8fc0bc4c226b8c0a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 17 Aug 2019 16:28:14 +0300 Subject: drop_monitor: Add support for packet alert mode for hardware drops In a similar fashion to software drops, extend drop monitor to send netlink events when packets are dropped by the underlying hardware. 
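A minimal user space sketch of how a listener could tell the two kinds of packet alerts apart, assuming libnl-style attribute parsing and using only the NET_DM_ATTR_ORIGIN, NET_DM_ATTR_HW_TRAP_NAME and NET_DM_ATTR_PC attributes introduced by this series (illustrative only, not part of the patch):

	/* attrs[] filled in by genlmsg_parse() on a NET_DM_CMD_PACKET_ALERT
	 * notification; hardware drops carry a trap name, software drops a PC.
	 */
	if (attrs[NET_DM_ATTR_ORIGIN] &&
	    nla_get_u16(attrs[NET_DM_ATTR_ORIGIN]) == NET_DM_ORIGIN_HW) {
		if (attrs[NET_DM_ATTR_HW_TRAP_NAME])
			printf("hw drop, trap: %s\n",
			       nla_get_string(attrs[NET_DM_ATTR_HW_TRAP_NAME]));
	} else if (attrs[NET_DM_ATTR_PC]) {
		printf("sw drop, pc: 0x%llx\n",
		       (unsigned long long)nla_get_u64(attrs[NET_DM_ATTR_PC]));
	}
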
The main difference is that instead of encoding the program counter (PC) from which kfree_skb() was called in the netlink message, we encode the hardware trap name. The two are mostly equivalent since they should both help the user understand why the packet was dropped. Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/net_dropmon.h | 10 ++ net/core/drop_monitor.c | 304 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 310 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 405b31cbf723..9f8fb1bb4aa4 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -83,6 +83,10 @@ enum net_dm_attr { NET_DM_ATTR_ORIG_LEN, /* u32 */ NET_DM_ATTR_QUEUE_LEN, /* u32 */ NET_DM_ATTR_STATS, /* nested */ + NET_DM_ATTR_HW_STATS, /* nested */ + NET_DM_ATTR_ORIGIN, /* u16 */ + NET_DM_ATTR_HW_TRAP_GROUP_NAME, /* string */ + NET_DM_ATTR_HW_TRAP_NAME, /* string */ __NET_DM_ATTR_MAX, NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1 @@ -101,6 +105,7 @@ enum net_dm_alert_mode { enum { NET_DM_ATTR_PORT_NETDEV_IFINDEX, /* u32 */ + NET_DM_ATTR_PORT_NETDEV_NAME, /* string */ __NET_DM_ATTR_PORT_MAX, NET_DM_ATTR_PORT_MAX = __NET_DM_ATTR_PORT_MAX - 1 @@ -113,4 +118,9 @@ enum { NET_DM_ATTR_STATS_MAX = __NET_DM_ATTR_STATS_MAX - 1 }; +enum net_dm_origin { + NET_DM_ORIGIN_SW, + NET_DM_ORIGIN_HW, +}; + #endif diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index a2c7f9162c9d..5a950b5af8fd 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -95,12 +95,16 @@ struct net_dm_alert_ops { void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, int work, int budget); void (*work_item_func)(struct work_struct *work); + void (*hw_work_item_func)(struct work_struct *work); void (*hw_probe)(struct sk_buff *skb, const struct net_dm_hw_metadata *hw_metadata); }; struct net_dm_skb_cb { - void *pc; + union { + struct net_dm_hw_metadata *hw_metadata; + void *pc; + }; }; #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) @@ -335,7 +339,9 @@ static size_t net_dm_in_port_size(void) /* NET_DM_ATTR_IN_PORT nest */ return nla_total_size(0) + /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */ - nla_total_size(sizeof(u32)); + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PORT_NETDEV_NAME */ + nla_total_size(IFNAMSIZ + 1); } #define NET_DM_MAX_SYMBOL_LEN 40 @@ -347,6 +353,8 @@ static size_t net_dm_packet_report_size(size_t payload_len) size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); return NLMSG_ALIGN(size) + + /* NET_DM_ATTR_ORIGIN */ + nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PC */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_SYMBOL */ @@ -363,7 +371,8 @@ static size_t net_dm_packet_report_size(size_t payload_len) nla_total_size(payload_len); } -static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex) +static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex, + const char *name) { struct nlattr *attr; @@ -375,6 +384,9 @@ static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex) nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex)) goto nla_put_failure; + if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name)) + goto nla_put_failure; + nla_nest_end(msg, attr); return 0; @@ -399,6 +411,9 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, if (!hdr) return -EMSGSIZE; + if (nla_put_u16(msg, 
NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD)) goto nla_put_failure; @@ -406,7 +421,7 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) goto nla_put_failure; - rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif); + rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL); if (rc) goto nla_put_failure; @@ -493,16 +508,249 @@ static void net_dm_packet_work(struct work_struct *work) net_dm_packet_report(skb); } +static size_t +net_dm_hw_packet_report_size(size_t payload_len, + const struct net_dm_hw_metadata *hw_metadata) +{ + size_t size; + + size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); + + return NLMSG_ALIGN(size) + + /* NET_DM_ATTR_ORIGIN */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */ + nla_total_size(strlen(hw_metadata->trap_group_name) + 1) + + /* NET_DM_ATTR_HW_TRAP_NAME */ + nla_total_size(strlen(hw_metadata->trap_name) + 1) + + /* NET_DM_ATTR_IN_PORT */ + net_dm_in_port_size() + + /* NET_DM_ATTR_TIMESTAMP */ + nla_total_size(sizeof(struct timespec)) + + /* NET_DM_ATTR_ORIG_LEN */ + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PROTO */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_PAYLOAD */ + nla_total_size(payload_len); +} + +static int net_dm_hw_packet_report_fill(struct sk_buff *msg, + struct sk_buff *skb, size_t payload_len) +{ + struct net_dm_hw_metadata *hw_metadata; + struct nlattr *attr; + struct timespec ts; + void *hdr; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_PACKET_ALERT); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW)) + goto nla_put_failure; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME, + hw_metadata->trap_group_name)) + goto nla_put_failure; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, + hw_metadata->trap_name)) + goto nla_put_failure; + + if (hw_metadata->input_dev) { + struct net_device *dev = hw_metadata->input_dev; + int rc; + + rc = net_dm_packet_report_in_port_put(msg, dev->ifindex, + dev->name); + if (rc) + goto nla_put_failure; + } + + if (ktime_to_timespec_cond(skb->tstamp, &ts) && + nla_put(msg, NET_DM_ATTR_TIMESTAMP, sizeof(ts), &ts)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) + goto nla_put_failure; + + if (!payload_len) + goto out; + + if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) + goto nla_put_failure; + + attr = skb_put(msg, nla_total_size(payload_len)); + attr->nla_type = NET_DM_ATTR_PAYLOAD; + attr->nla_len = nla_attr_size(payload_len); + if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static struct net_dm_hw_metadata * +net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata) +{ + struct net_dm_hw_metadata *n_hw_metadata; + const char *trap_group_name; + const char *trap_name; + + n_hw_metadata = kmalloc(sizeof(*hw_metadata), GFP_ATOMIC); + if (!n_hw_metadata) + return NULL; + + trap_group_name = kmemdup(hw_metadata->trap_group_name, + strlen(hw_metadata->trap_group_name) + 1, + GFP_ATOMIC | __GFP_ZERO); + if (!trap_group_name) + goto free_hw_metadata; + n_hw_metadata->trap_group_name = trap_group_name; + + trap_name = 
kmemdup(hw_metadata->trap_name, + strlen(hw_metadata->trap_name) + 1, + GFP_ATOMIC | __GFP_ZERO); + if (!trap_name) + goto free_trap_group; + n_hw_metadata->trap_name = trap_name; + + n_hw_metadata->input_dev = hw_metadata->input_dev; + if (n_hw_metadata->input_dev) + dev_hold(n_hw_metadata->input_dev); + + return n_hw_metadata; + +free_trap_group: + kfree(trap_group_name); +free_hw_metadata: + kfree(n_hw_metadata); + return NULL; +} + +static void +net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata) +{ + if (hw_metadata->input_dev) + dev_put(hw_metadata->input_dev); + kfree(hw_metadata->trap_name); + kfree(hw_metadata->trap_group_name); + kfree(hw_metadata); +} + +static void net_dm_hw_packet_report(struct sk_buff *skb) +{ + struct net_dm_hw_metadata *hw_metadata; + struct sk_buff *msg; + size_t payload_len; + int rc; + + if (skb->data > skb_mac_header(skb)) + skb_push(skb, skb->data - skb_mac_header(skb)); + else + skb_pull(skb, skb_mac_header(skb) - skb->data); + + payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); + if (net_dm_trunc_len) + payload_len = min_t(size_t, net_dm_trunc_len, payload_len); + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata), + GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_hw_packet_report_fill(msg, skb, payload_len); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata); + consume_skb(skb); +} + +static void net_dm_hw_packet_work(struct work_struct *work) +{ + struct per_cpu_dm_data *hw_data; + struct sk_buff_head list; + struct sk_buff *skb; + unsigned long flags; + + hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + __skb_queue_head_init(&list); + + spin_lock_irqsave(&hw_data->drop_queue.lock, flags); + skb_queue_splice_tail_init(&hw_data->drop_queue, &list); + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + + while ((skb = __skb_dequeue(&list))) + net_dm_hw_packet_report(skb); +} + static void net_dm_hw_packet_probe(struct sk_buff *skb, const struct net_dm_hw_metadata *hw_metadata) { + struct net_dm_hw_metadata *n_hw_metadata; + ktime_t tstamp = ktime_get_real(); + struct per_cpu_dm_data *hw_data; + struct sk_buff *nskb; + unsigned long flags; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + n_hw_metadata = net_dm_hw_metadata_clone(hw_metadata); + if (!n_hw_metadata) + goto free; + + NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata; + nskb->tstamp = tstamp; + + hw_data = this_cpu_ptr(&dm_hw_cpu_data); + + spin_lock_irqsave(&hw_data->drop_queue.lock, flags); + if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len) + __skb_queue_tail(&hw_data->drop_queue, nskb); + else + goto unlock_free; + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + + schedule_work(&hw_data->dm_alert_work); + + return; + +unlock_free: + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + u64_stats_update_begin(&hw_data->stats.syncp); + hw_data->stats.dropped++; + u64_stats_update_end(&hw_data->stats.syncp); + net_dm_hw_metadata_free(n_hw_metadata); +free: + consume_skb(nskb); } static const struct net_dm_alert_ops net_dm_alert_packet_ops = { .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit, .napi_poll_probe = net_dm_packet_trace_napi_poll_hit, .work_item_func = net_dm_packet_work, + .hw_work_item_func = net_dm_hw_packet_work, .hw_probe = net_dm_hw_packet_probe, }; @@ 
-819,6 +1067,50 @@ nla_put_failure: return -EMSGSIZE; } +static void net_dm_hw_stats_read(struct net_dm_stats *stats) +{ + int cpu; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct net_dm_stats *cpu_stats = &hw_data->stats; + unsigned int start; + u64 dropped; + + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + dropped = cpu_stats->dropped; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + stats->dropped += dropped; + } +} + +static int net_dm_hw_stats_put(struct sk_buff *msg) +{ + struct net_dm_stats stats; + struct nlattr *attr; + + net_dm_hw_stats_read(&stats); + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, + stats.dropped, NET_DM_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info) { void *hdr; @@ -833,6 +1125,10 @@ static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info) if (rc) goto nla_put_failure; + rc = net_dm_hw_stats_put(msg); + if (rc) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; -- cgit v1.2.3-59-g8ed1b From d40e1deb930f4bd51a5214983e50aafc26db686e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 17 Aug 2019 16:28:15 +0300 Subject: drop_monitor: Add support for summary alert mode for hardware drops In summary alert mode a notification is sent with a list of recent drop reasons and a count of how many packets were dropped due to each reason. To avoid expensive operations in the context in which packets are dropped, each CPU holds an array whose number of entries is the maximum number of drop reasons that can be encoded in the netlink notification. Each entry stores the drop reason and a count. When a packet is dropped the array is traversed and a new entry is created or the count of an existing entry is incremented. Later, in process context, the array is replaced with a newly allocated, empty array and the old array is encoded in a netlink notification. To avoid breaking user space, the notification includes the ancillary header, which is 'struct net_dm_alert_msg' with number of entries set to '0'. Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/uapi/linux/net_dropmon.h | 3 + net/core/drop_monitor.c | 195 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 196 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 9f8fb1bb4aa4..3bddc9ec978c 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -87,6 +87,9 @@ enum net_dm_attr { NET_DM_ATTR_ORIGIN, /* u16 */ NET_DM_ATTR_HW_TRAP_GROUP_NAME, /* string */ NET_DM_ATTR_HW_TRAP_NAME, /* string */ + NET_DM_ATTR_HW_ENTRIES, /* nested */ + NET_DM_ATTR_HW_ENTRY, /* nested */ + NET_DM_ATTR_HW_TRAP_COUNT, /* u32 */ __NET_DM_ATTR_MAX, NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1 diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 5a950b5af8fd..807c79d606aa 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -58,9 +58,26 @@ struct net_dm_stats { struct u64_stats_sync syncp; }; +#define NET_DM_MAX_HW_TRAP_NAME_LEN 40 + +struct net_dm_hw_entry { + char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN]; + u32 count; +}; + +struct net_dm_hw_entries { + u32 num_entries; + struct net_dm_hw_entry entries[0]; +}; + struct per_cpu_dm_data { - spinlock_t lock; /* Protects 'skb' and 'send_timer' */ - struct sk_buff *skb; + spinlock_t lock; /* Protects 'skb', 'hw_entries' and + * 'send_timer' + */ + union { + struct sk_buff *skb; + struct net_dm_hw_entries *hw_entries; + }; struct sk_buff_head drop_queue; struct work_struct dm_alert_work; struct timer_list send_timer; @@ -275,16 +292,189 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, rcu_read_unlock(); } +static struct net_dm_hw_entries * +net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) +{ + struct net_dm_hw_entries *hw_entries; + unsigned long flags; + + hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit), + GFP_KERNEL); + if (!hw_entries) { + /* If the memory allocation failed, we try to perform another + * allocation in 1/10 second. Otherwise, the probe function + * will constantly bail out. 
+ */ + mod_timer(&hw_data->send_timer, jiffies + HZ / 10); + } + + spin_lock_irqsave(&hw_data->lock, flags); + swap(hw_data->hw_entries, hw_entries); + spin_unlock_irqrestore(&hw_data->lock, flags); + + return hw_entries; +} + +static int net_dm_hw_entry_put(struct sk_buff *msg, + const struct net_dm_hw_entry *hw_entry) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY); + if (!attr) + return -EMSGSIZE; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_hw_entries_put(struct sk_buff *msg, + const struct net_dm_hw_entries *hw_entries) +{ + struct nlattr *attr; + int i; + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES); + if (!attr) + return -EMSGSIZE; + + for (i = 0; i < hw_entries->num_entries; i++) { + int rc; + + rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]); + if (rc) + goto nla_put_failure; + } + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int +net_dm_hw_summary_report_fill(struct sk_buff *msg, + const struct net_dm_hw_entries *hw_entries) +{ + struct net_dm_alert_msg anc_hdr = { 0 }; + void *hdr; + int rc; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_ALERT); + if (!hdr) + return -EMSGSIZE; + + /* We need to put the ancillary header in order not to break user + * space. + */ + if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr)) + goto nla_put_failure; + + rc = net_dm_hw_entries_put(msg, hw_entries); + if (rc) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void net_dm_hw_summary_work(struct work_struct *work) +{ + struct net_dm_hw_entries *hw_entries; + struct per_cpu_dm_data *hw_data; + struct sk_buff *msg; + int rc; + + hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); + if (!hw_entries) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_hw_summary_report_fill(msg, hw_entries); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + kfree(hw_entries); +} + static void net_dm_hw_summary_probe(struct sk_buff *skb, const struct net_dm_hw_metadata *hw_metadata) { + struct net_dm_hw_entries *hw_entries; + struct net_dm_hw_entry *hw_entry; + struct per_cpu_dm_data *hw_data; + unsigned long flags; + int i; + + hw_data = this_cpu_ptr(&dm_hw_cpu_data); + spin_lock_irqsave(&hw_data->lock, flags); + hw_entries = hw_data->hw_entries; + + if (!hw_entries) + goto out; + + for (i = 0; i < hw_entries->num_entries; i++) { + hw_entry = &hw_entries->entries[i]; + if (!strncmp(hw_entry->trap_name, hw_metadata->trap_name, + NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) { + hw_entry->count++; + goto out; + } + } + if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit)) + goto out; + + hw_entry = &hw_entries->entries[hw_entries->num_entries]; + strlcpy(hw_entry->trap_name, hw_metadata->trap_name, + NET_DM_MAX_HW_TRAP_NAME_LEN - 1); + hw_entry->count = 1; + hw_entries->num_entries++; + + if (!timer_pending(&hw_data->send_timer)) { + hw_data->send_timer.expires = jiffies + dm_delay * HZ; + 
add_timer(&hw_data->send_timer); + } + +out: + spin_unlock_irqrestore(&hw_data->lock, flags); } static const struct net_dm_alert_ops net_dm_alert_summary_ops = { .kfree_skb_probe = trace_kfree_skb_hit, .napi_poll_probe = trace_napi_poll_hit, .work_item_func = send_dm_alert, + .hw_work_item_func = net_dm_hw_summary_work, .hw_probe = net_dm_hw_summary_probe, }; @@ -1309,6 +1499,7 @@ static void net_dm_hw_cpu_data_fini(int cpu) struct per_cpu_dm_data *hw_data; hw_data = &per_cpu(dm_hw_cpu_data, cpu); + kfree(hw_data->hw_entries); __net_dm_cpu_data_fini(hw_data); } -- cgit v1.2.3-59-g8ed1b From 8e94c3bc922e70225bd35891c8e6002bddd984eb Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 17 Aug 2019 16:28:16 +0300 Subject: drop_monitor: Allow user to start monitoring hardware drops Drop monitor has start and stop commands, but so far these were only used to start and stop monitoring of software drops. Now that drop monitor can also monitor hardware drops, we should allow the user to control these as well. Do that by adding SW and HW flags to these commands. If no flag is specified, then only start / stop monitoring software drops. This is done in order to maintain backward-compatibility with existing user space applications. Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/net_dropmon.h | 2 + net/core/drop_monitor.c | 124 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 3bddc9ec978c..75a35dccb675 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -90,6 +90,8 @@ enum net_dm_attr { NET_DM_ATTR_HW_ENTRIES, /* nested */ NET_DM_ATTR_HW_ENTRY, /* nested */ NET_DM_ATTR_HW_TRAP_COUNT, /* u32 */ + NET_DM_ATTR_SW_DROPS, /* flag */ + NET_DM_ATTR_HW_DROPS, /* flag */ __NET_DM_ATTR_MAX, NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1 diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 807c79d606aa..bfc024024aa3 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -952,13 +952,82 @@ static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { void net_dm_hw_report(struct sk_buff *skb, const struct net_dm_hw_metadata *hw_metadata) { + rcu_read_lock(); + if (!monitor_hw) - return; + goto out; net_dm_alert_ops_arr[net_dm_alert_mode]->hw_probe(skb, hw_metadata); + +out: + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(net_dm_hw_report); +static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) +{ + const struct net_dm_alert_ops *ops; + int cpu; + + if (monitor_hw) { + NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled"); + return -EAGAIN; + } + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + if (!try_module_get(THIS_MODULE)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); + return -ENODEV; + } + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct net_dm_hw_entries *hw_entries; + + INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func); + timer_setup(&hw_data->send_timer, sched_send_work, 0); + hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); + kfree(hw_entries); + } + + monitor_hw = true; + + return 0; +} + +static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) +{ + int cpu; + + if (!monitor_hw) + NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled"); + + monitor_hw = false; + + /* After this call returns we 
are guaranteed that no CPU is processing + * any hardware drops. + */ + synchronize_rcu(); + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct sk_buff *skb; + + del_timer_sync(&hw_data->send_timer); + cancel_work_sync(&hw_data->dm_alert_work); + while ((skb = __skb_dequeue(&hw_data->drop_queue))) { + struct net_dm_hw_metadata *hw_metadata; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + net_dm_hw_metadata_free(hw_metadata); + consume_skb(skb); + } + } + + module_put(THIS_MODULE); +} + static int net_dm_trace_on_set(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; @@ -1153,14 +1222,61 @@ static int net_dm_cmd_config(struct sk_buff *skb, return 0; } +static int net_dm_monitor_start(bool set_sw, bool set_hw, + struct netlink_ext_ack *extack) +{ + bool sw_set = false; + int rc; + + if (set_sw) { + rc = set_all_monitor_traces(TRACE_ON, extack); + if (rc) + return rc; + sw_set = true; + } + + if (set_hw) { + rc = net_dm_hw_monitor_start(extack); + if (rc) + goto err_monitor_hw; + } + + return 0; + +err_monitor_hw: + if (sw_set) + set_all_monitor_traces(TRACE_OFF, extack); + return rc; +} + +static void net_dm_monitor_stop(bool set_sw, bool set_hw, + struct netlink_ext_ack *extack) +{ + if (set_hw) + net_dm_hw_monitor_stop(extack); + if (set_sw) + set_all_monitor_traces(TRACE_OFF, extack); +} + static int net_dm_cmd_trace(struct sk_buff *skb, struct genl_info *info) { + bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS]; + bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS]; + struct netlink_ext_ack *extack = info->extack; + + /* To maintain backward compatibility, we start / stop monitoring of + * software drops if no flag is specified. + */ + if (!set_sw && !set_hw) + set_sw = true; + switch (info->genlhdr->cmd) { case NET_DM_CMD_START: - return set_all_monitor_traces(TRACE_ON, info->extack); + return net_dm_monitor_start(set_sw, set_hw, extack); case NET_DM_CMD_STOP: - return set_all_monitor_traces(TRACE_OFF, info->extack); + net_dm_monitor_stop(set_sw, set_hw, extack); + return 0; } return -EOPNOTSUPP; @@ -1392,6 +1508,8 @@ static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 }, [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 }, + [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG }, + [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG }, }; static const struct genl_ops dropmon_ops[] = { -- cgit v1.2.3-59-g8ed1b From 0f420b6c52e9799f664429e739a421fb4c5527a3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 17 Aug 2019 16:28:17 +0300 Subject: devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. 
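To make the expected driver interaction concrete, here is a minimal registration sketch; it is only a sketch, and every identifier prefixed with my_/MY_ is made up for illustration rather than taken from this patch. It also previews devlink_trap_report(), described next. Note that driver-specific trap and group IDs must lie above the generic ranges, and the driver must implement at least the trap_init() and trap_action_set() operations for registration to succeed:

#include <linux/skbuff.h>
#include <net/devlink.h>

/* Hypothetical driver-specific IDs, placed above the generic ranges. */
enum {
	MY_TRAP_ID_SAMPLE_DROP = DEVLINK_TRAP_GENERIC_ID_MAX + 1,
};

enum {
	MY_TRAP_GROUP_ID_DROPS = DEVLINK_TRAP_GROUP_GENERIC_ID_MAX + 1,
};

#define MY_TRAP_GROUP_DROPS						\
	{								\
		.name = "my_drops",					\
		.id = MY_TRAP_GROUP_ID_DROPS,				\
		.generic = false,					\
	}

static const struct devlink_trap my_traps[] = {
	DEVLINK_TRAP_DRIVER(DROP, DROP, MY_TRAP_ID_SAMPLE_DROP,
			    "sample_drop", MY_TRAP_GROUP_DROPS,
			    DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT),
};

/* At probe time; 'priv' is returned later by devlink_trap_ctx_priv(). */
static int my_traps_init(struct devlink *devlink, void *priv)
{
	return devlink_traps_register(devlink, my_traps,
				      ARRAY_SIZE(my_traps), priv);
}

/* Rx path: 'trap_ctx' is the cookie devlink passed to the driver's
 * trap_init() callback when the trap was initialized.
 */
static void my_trap_rx(struct devlink *devlink, struct sk_buff *skb,
		       void *trap_ctx, struct devlink_port *in_port)
{
	devlink_trap_report(devlink, skb, trap_ctx, in_port);
}

Registration is unwound automatically on failure, and devlink_traps_unregister() removes the traps (dropping any group that is no longer referenced) at teardown time.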
Besides registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via the drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver, which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 129 +++++ include/uapi/linux/devlink.h | 62 +++ net/Kconfig | 1 + net/core/devlink.c | 1068 +++++++++++++++++++++++++++++++++++++++++- 4 files changed, 1255 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/net/devlink.h b/include/net/devlink.h index 451268f64880..03b32e33e93e 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,8 @@ struct devlink { struct list_head reporter_list; struct mutex reporters_lock; /* protects reporter_list */ struct devlink_dpipe_headers *dpipe_headers; + struct list_head trap_list; + struct list_head trap_group_list; const struct devlink_ops *ops; struct device *dev; possible_net_t _net; @@ -497,6 +500,89 @@ struct devlink_health_reporter_ops { struct devlink_fmsg *fmsg); }; +/** + * struct devlink_trap_group - Immutable packet trap group attributes. + * @name: Trap group name. + * @id: Trap group identifier. + * @generic: Whether the trap group is generic or not. + * + * Describes immutable attributes of packet trap groups that drivers register + * with devlink. + */ +struct devlink_trap_group { + const char *name; + u16 id; + bool generic; +}; + +#define DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT BIT(0) + +/** + * struct devlink_trap - Immutable packet trap attributes. + * @type: Trap type. + * @init_action: Initial trap action. + * @generic: Whether the trap is generic or not. + * @id: Trap identifier. + * @name: Trap name. + * @group: Immutable packet trap group attributes. + * @metadata_cap: Metadata types that can be provided by the trap. + * + * Describes immutable attributes of packet traps that drivers register with + * devlink.
+ */ +struct devlink_trap { + enum devlink_trap_type type; + enum devlink_trap_action init_action; + bool generic; + u16 id; + const char *name; + struct devlink_trap_group group; + u32 metadata_cap; +}; + +enum devlink_trap_generic_id { + /* Add new generic trap IDs above */ + __DEVLINK_TRAP_GENERIC_ID_MAX, + DEVLINK_TRAP_GENERIC_ID_MAX = __DEVLINK_TRAP_GENERIC_ID_MAX - 1, +}; + +enum devlink_trap_group_generic_id { + /* Add new generic trap group IDs above */ + __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, + DEVLINK_TRAP_GROUP_GENERIC_ID_MAX = + __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX - 1, +}; + +#define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group, _metadata_cap) \ + { \ + .type = DEVLINK_TRAP_TYPE_##_type, \ + .init_action = DEVLINK_TRAP_ACTION_##_init_action, \ + .generic = true, \ + .id = DEVLINK_TRAP_GENERIC_ID_##_id, \ + .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \ + .group = _group, \ + .metadata_cap = _metadata_cap, \ + } + +#define DEVLINK_TRAP_DRIVER(_type, _init_action, _id, _name, _group, \ + _metadata_cap) \ + { \ + .type = DEVLINK_TRAP_TYPE_##_type, \ + .init_action = DEVLINK_TRAP_ACTION_##_init_action, \ + .generic = false, \ + .id = _id, \ + .name = _name, \ + .group = _group, \ + .metadata_cap = _metadata_cap, \ + } + +#define DEVLINK_TRAP_GROUP_GENERIC(_id) \ + { \ + .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \ + .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \ + .generic = true, \ + } + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -558,6 +644,38 @@ struct devlink_ops { int (*flash_update)(struct devlink *devlink, const char *file_name, const char *component, struct netlink_ext_ack *extack); + /** + * @trap_init: Trap initialization function. + * + * Should be used by device drivers to initialize the trap in the + * underlying device. Drivers should also store the provided trap + * context, so that they could efficiently pass it to + * devlink_trap_report() when the trap is triggered. + */ + int (*trap_init)(struct devlink *devlink, + const struct devlink_trap *trap, void *trap_ctx); + /** + * @trap_fini: Trap de-initialization function. + * + * Should be used by device drivers to de-initialize the trap in the + * underlying device. + */ + void (*trap_fini)(struct devlink *devlink, + const struct devlink_trap *trap, void *trap_ctx); + /** + * @trap_action_set: Trap action set function. + */ + int (*trap_action_set)(struct devlink *devlink, + const struct devlink_trap *trap, + enum devlink_trap_action action); + /** + * @trap_group_init: Trap group initialization function. + * + * Should be used by device drivers to initialize the trap group in the + * underlying device. 
+ */ + int (*trap_group_init)(struct devlink *devlink, + const struct devlink_trap_group *group); }; static inline void *devlink_priv(struct devlink *devlink) @@ -774,6 +892,17 @@ void devlink_flash_update_status_notify(struct devlink *devlink, unsigned long done, unsigned long total); +int devlink_traps_register(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count, void *priv); +void devlink_traps_unregister(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count); +void devlink_trap_report(struct devlink *devlink, + struct sk_buff *skb, void *trap_ctx, + struct devlink_port *in_devlink_port); +void *devlink_trap_ctx_priv(void *trap_ctx); + #if IS_ENABLED(CONFIG_NET_DEVLINK) void devlink_compat_running_version(struct net_device *dev, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ffc993256527..546e75dd74ac 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -107,6 +107,16 @@ enum devlink_command { DEVLINK_CMD_FLASH_UPDATE_END, /* notification only */ DEVLINK_CMD_FLASH_UPDATE_STATUS, /* notification only */ + DEVLINK_CMD_TRAP_GET, /* can dump */ + DEVLINK_CMD_TRAP_SET, + DEVLINK_CMD_TRAP_NEW, + DEVLINK_CMD_TRAP_DEL, + + DEVLINK_CMD_TRAP_GROUP_GET, /* can dump */ + DEVLINK_CMD_TRAP_GROUP_SET, + DEVLINK_CMD_TRAP_GROUP_NEW, + DEVLINK_CMD_TRAP_GROUP_DEL, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -194,6 +204,47 @@ enum devlink_param_fw_load_policy_value { DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH, }; +enum { + DEVLINK_ATTR_STATS_RX_PACKETS, /* u64 */ + DEVLINK_ATTR_STATS_RX_BYTES, /* u64 */ + + __DEVLINK_ATTR_STATS_MAX, + DEVLINK_ATTR_STATS_MAX = __DEVLINK_ATTR_STATS_MAX - 1 +}; + +/** + * enum devlink_trap_action - Packet trap action. + * @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not + * sent to the CPU. + * @DEVLINK_TRAP_ACTION_TRAP: The sole copy of the packet is sent to the CPU. + */ +enum devlink_trap_action { + DEVLINK_TRAP_ACTION_DROP, + DEVLINK_TRAP_ACTION_TRAP, +}; + +/** + * enum devlink_trap_type - Packet trap type. + * @DEVLINK_TRAP_TYPE_DROP: Trap reason is a drop. Trapped packets are only + * processed by devlink and not injected to the + * kernel's Rx path. + * @DEVLINK_TRAP_TYPE_EXCEPTION: Trap reason is an exception. Packet was not + * forwarded as intended due to an exception + * (e.g., missing neighbour entry) and trapped to + * control plane for resolution. Trapped packets + * are processed by devlink and injected to + * the kernel's Rx path. + */ +enum devlink_trap_type { + DEVLINK_TRAP_TYPE_DROP, + DEVLINK_TRAP_TYPE_EXCEPTION, +}; + +enum { + /* Trap can report input port as metadata */ + DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT, +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! 
*/ DEVLINK_ATTR_UNSPEC, @@ -348,6 +399,17 @@ enum devlink_attr { DEVLINK_ATTR_PORT_PCI_PF_NUMBER, /* u16 */ DEVLINK_ATTR_PORT_PCI_VF_NUMBER, /* u16 */ + DEVLINK_ATTR_STATS, /* nested */ + + DEVLINK_ATTR_TRAP_NAME, /* string */ + /* enum devlink_trap_action */ + DEVLINK_ATTR_TRAP_ACTION, /* u8 */ + /* enum devlink_trap_type */ + DEVLINK_ATTR_TRAP_TYPE, /* u8 */ + DEVLINK_ATTR_TRAP_GENERIC, /* flag */ + DEVLINK_ATTR_TRAP_METADATA, /* nested */ + DEVLINK_ATTR_TRAP_GROUP_NAME, /* string */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/Kconfig b/net/Kconfig index 57f51a279ad6..3101bfcbdd7a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -430,6 +430,7 @@ config NET_SOCK_MSG config NET_DEVLINK bool default n + imply NET_DROP_MONITOR config PAGE_POOL bool diff --git a/net/core/devlink.c b/net/core/devlink.c index d3dbb904bf3b..960ceaec98d6 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -25,6 +27,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -551,7 +554,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; - spin_lock(&devlink_port->type_lock); + spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && @@ -576,7 +579,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, ibdev->name)) goto nla_put_failure_type_locked; } - spin_unlock(&devlink_port->type_lock); + spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; @@ -584,7 +587,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, return 0; nla_put_failure_type_locked: - spin_unlock(&devlink_port->type_lock); + spin_unlock_bh(&devlink_port->type_lock); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -5154,6 +5157,571 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, return 0; } +struct devlink_stats { + u64 rx_bytes; + u64 rx_packets; + struct u64_stats_sync syncp; +}; + +/** + * struct devlink_trap_group_item - Packet trap group attributes. + * @group: Immutable packet trap group attributes. + * @refcount: Number of trap items using the group. + * @list: trap_group_list member. + * @stats: Trap group statistics. + * + * Describes packet trap group attributes. Created by devlink during trap + * registration. + */ +struct devlink_trap_group_item { + const struct devlink_trap_group *group; + refcount_t refcount; + struct list_head list; + struct devlink_stats __percpu *stats; +}; + +/** + * struct devlink_trap_item - Packet trap attributes. + * @trap: Immutable packet trap attributes. + * @group_item: Associated group item. + * @list: trap_list member. + * @action: Trap action. + * @stats: Trap statistics. + * @priv: Driver private information. + * + * Describes both mutable and immutable packet trap attributes. Created by + * devlink during trap registration and used for all trap related operations. 
+ */ +struct devlink_trap_item { + const struct devlink_trap *trap; + struct devlink_trap_group_item *group_item; + struct list_head list; + enum devlink_trap_action action; + struct devlink_stats __percpu *stats; + void *priv; +}; + +static struct devlink_trap_item * +devlink_trap_item_lookup(struct devlink *devlink, const char *name) +{ + struct devlink_trap_item *trap_item; + + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (!strcmp(trap_item->trap->name, name)) + return trap_item; + } + + return NULL; +} + +static struct devlink_trap_item * +devlink_trap_item_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + struct nlattr *attr; + + if (!info->attrs[DEVLINK_ATTR_TRAP_NAME]) + return NULL; + attr = info->attrs[DEVLINK_ATTR_TRAP_NAME]; + + return devlink_trap_item_lookup(devlink, nla_data(attr)); +} + +static int +devlink_trap_action_get_from_info(struct genl_info *info, + enum devlink_trap_action *p_trap_action) +{ + u8 val; + + val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); + switch (val) { + case DEVLINK_TRAP_ACTION_DROP: /* fall-through */ + case DEVLINK_TRAP_ACTION_TRAP: + *p_trap_action = val; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int devlink_trap_metadata_put(struct sk_buff *msg, + const struct devlink_trap *trap) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA); + if (!attr) + return -EMSGSIZE; + + if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, + struct devlink_stats *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(i) { + struct devlink_stats *cpu_stats; + u64 rx_packets, rx_bytes; + unsigned int start; + + cpu_stats = per_cpu_ptr(trap_stats, i); + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + rx_packets = cpu_stats->rx_packets; + rx_bytes = cpu_stats->rx_bytes; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + stats->rx_packets += rx_packets; + stats->rx_bytes += rx_bytes; + } +} + +static int devlink_trap_stats_put(struct sk_buff *msg, + struct devlink_stats __percpu *trap_stats) +{ + struct devlink_stats stats; + struct nlattr *attr; + + devlink_trap_stats_read(trap_stats, &stats); + + attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, + stats.rx_packets, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, + stats.rx_bytes, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_item *trap_item, + enum devlink_command cmd, u32 portid, u32 seq, + int flags) +{ + struct devlink_trap_group_item *group_item = trap_item->group_item; + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, + group_item->group->name)) + goto nla_put_failure; + + if 
(nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name)) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type)) + goto nla_put_failure; + + if (trap_item->trap->generic && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action)) + goto nla_put_failure; + + err = devlink_trap_metadata_put(msg, trap_item->trap); + if (err) + goto nla_put_failure; + + err = devlink_trap_stats_put(msg, trap_item->stats); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_trap_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_item *trap_item; + struct sk_buff *msg; + int err; + + if (list_empty(&devlink->trap_list)) + return -EOPNOTSUPP; + + trap_item = devlink_trap_item_get_from_info(devlink, info); + if (!trap_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap"); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_trap_fill(msg, devlink, trap_item, + DEVLINK_CMD_TRAP_NEW, info->snd_portid, + info->snd_seq, 0); + if (err) + goto err_trap_fill; + + return genlmsg_reply(msg, info); + +err_trap_fill: + nlmsg_free(msg); + return err; +} + +static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_trap_item *trap_item; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_trap_fill(msg, devlink, trap_item, + DEVLINK_CMD_TRAP_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int __devlink_trap_action_set(struct devlink *devlink, + struct devlink_trap_item *trap_item, + enum devlink_trap_action trap_action, + struct netlink_ext_ack *extack) +{ + int err; + + if (trap_item->action != trap_action && + trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) { + NL_SET_ERR_MSG_MOD(extack, "Cannot change action of non-drop traps. 
Skipping"); + return 0; + } + + err = devlink->ops->trap_action_set(devlink, trap_item->trap, + trap_action); + if (err) + return err; + + trap_item->action = trap_action; + + return 0; +} + +static int devlink_trap_action_set(struct devlink *devlink, + struct devlink_trap_item *trap_item, + struct genl_info *info) +{ + enum devlink_trap_action trap_action; + int err; + + if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) + return 0; + + err = devlink_trap_action_get_from_info(info, &trap_action); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action"); + return -EINVAL; + } + + return __devlink_trap_action_set(devlink, trap_item, trap_action, + info->extack); +} + +static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_item *trap_item; + int err; + + if (list_empty(&devlink->trap_list)) + return -EOPNOTSUPP; + + trap_item = devlink_trap_item_get_from_info(devlink, info); + if (!trap_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap"); + return -ENOENT; + } + + err = devlink_trap_action_set(devlink, trap_item, info); + if (err) + return err; + + return 0; +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_lookup(struct devlink *devlink, const char *name) +{ + struct devlink_trap_group_item *group_item; + + list_for_each_entry(group_item, &devlink->trap_group_list, list) { + if (!strcmp(group_item->group->name, name)) + return group_item; + } + + return NULL; +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + char *name; + + if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]) + return NULL; + name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]); + + return devlink_trap_group_item_lookup(devlink, name); +} + +static int +devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_group_item *group_item, + enum devlink_command cmd, u32 portid, u32 seq, + int flags) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, + group_item->group->name)) + goto nla_put_failure; + + if (group_item->group->generic && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) + goto nla_put_failure; + + err = devlink_trap_stats_put(msg, group_item->stats); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_trap_group_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_group_item *group_item; + struct sk_buff *msg; + int err; + + if (list_empty(&devlink->trap_group_list)) + return -EOPNOTSUPP; + + group_item = devlink_trap_group_item_get_from_info(devlink, info); + if (!group_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group"); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_trap_group_fill(msg, devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) + goto err_trap_group_fill; + + return 
genlmsg_reply(msg, info); + +err_trap_group_fill: + nlmsg_free(msg); + return err; +} + +static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + enum devlink_command cmd = DEVLINK_CMD_TRAP_GROUP_NEW; + struct devlink_trap_group_item *group_item; + u32 portid = NETLINK_CB(cb->skb).portid; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(group_item, &devlink->trap_group_list, + list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_trap_group_fill(msg, devlink, + group_item, cmd, + portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int +__devlink_trap_group_action_set(struct devlink *devlink, + struct devlink_trap_group_item *group_item, + enum devlink_trap_action trap_action, + struct netlink_ext_ack *extack) +{ + const char *group_name = group_item->group->name; + struct devlink_trap_item *trap_item; + int err; + + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (strcmp(trap_item->trap->group.name, group_name)) + continue; + err = __devlink_trap_action_set(devlink, trap_item, + trap_action, extack); + if (err) + return err; + } + + return 0; +} + +static int +devlink_trap_group_action_set(struct devlink *devlink, + struct devlink_trap_group_item *group_item, + struct genl_info *info) +{ + enum devlink_trap_action trap_action; + int err; + + if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) + return 0; + + err = devlink_trap_action_get_from_info(info, &trap_action); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action"); + return -EINVAL; + } + + err = __devlink_trap_group_action_set(devlink, group_item, trap_action, + info->extack); + if (err) + return err; + + return 0; +} + +static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_group_item *group_item; + int err; + + if (list_empty(&devlink->trap_group_list)) + return -EOPNOTSUPP; + + group_item = devlink_trap_group_item_get_from_info(devlink, info); + if (!group_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group"); + return -ENOENT; + } + + err = devlink_trap_group_action_set(devlink, group_item, info); + if (err) + return err; + + return 0; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -5184,6 +5752,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, + [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -5483,6 +6054,32 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, 
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_TRAP_GET, + .doit = devlink_nl_cmd_trap_get_doit, + .dumpit = devlink_nl_cmd_trap_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_TRAP_SET, + .doit = devlink_nl_cmd_trap_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_TRAP_GROUP_GET, + .doit = devlink_nl_cmd_trap_group_get_doit, + .dumpit = devlink_nl_cmd_trap_group_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_TRAP_GROUP_SET, + .doit = devlink_nl_cmd_trap_group_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -5528,6 +6125,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->param_list); INIT_LIST_HEAD(&devlink->region_list); INIT_LIST_HEAD(&devlink->reporter_list); + INIT_LIST_HEAD(&devlink->trap_list); + INIT_LIST_HEAD(&devlink->trap_group_list); mutex_init(&devlink->lock); mutex_init(&devlink->reporters_lock); return devlink; @@ -5574,6 +6173,8 @@ void devlink_free(struct devlink *devlink) { mutex_destroy(&devlink->reporters_lock); mutex_destroy(&devlink->lock); + WARN_ON(!list_empty(&devlink->trap_group_list)); + WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->reporter_list)); WARN_ON(!list_empty(&devlink->region_list)); WARN_ON(!list_empty(&devlink->param_list)); @@ -5678,10 +6279,10 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, if (WARN_ON(!devlink_port->registered)) return; devlink_port_type_warn_cancel(devlink_port); - spin_lock(&devlink_port->type_lock); + spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; devlink_port->type_dev = type_dev; - spin_unlock(&devlink_port->type_lock); + spin_unlock_bh(&devlink_port->type_lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } @@ -6834,6 +7435,463 @@ unlock: } EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); +#define DEVLINK_TRAP(_id, _type) \ + { \ + .type = DEVLINK_TRAP_TYPE_##_type, \ + .id = DEVLINK_TRAP_GENERIC_ID_##_id, \ + .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \ + } + +static const struct devlink_trap devlink_trap_generic[] = { +}; + +#define DEVLINK_TRAP_GROUP(_id) \ + { \ + .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \ + .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \ + } + +static const struct devlink_trap_group devlink_trap_group_generic[] = { +}; + +static int devlink_trap_generic_verify(const struct devlink_trap *trap) +{ + if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX) + return -EINVAL; + + if (strcmp(trap->name, devlink_trap_generic[trap->id].name)) + return -EINVAL; + + if (trap->type != devlink_trap_generic[trap->id].type) + return -EINVAL; + + return 0; +} + +static int devlink_trap_driver_verify(const struct devlink_trap *trap) +{ + int i; + + if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) { + if (!strcmp(trap->name, devlink_trap_generic[i].name)) + return -EEXIST; + } + + return 0; +} + +static int devlink_trap_verify(const struct devlink_trap *trap) +{ + if (!trap || !trap->name || !trap->group.name) + return -EINVAL; + + if (trap->generic) + return devlink_trap_generic_verify(trap); + else + return 
devlink_trap_driver_verify(trap); +} + +static int +devlink_trap_group_generic_verify(const struct devlink_trap_group *group) +{ + if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) + return -EINVAL; + + if (strcmp(group->name, devlink_trap_group_generic[group->id].name)) + return -EINVAL; + + return 0; +} + +static int +devlink_trap_group_driver_verify(const struct devlink_trap_group *group) +{ + int i; + + if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) { + if (!strcmp(group->name, devlink_trap_group_generic[i].name)) + return -EEXIST; + } + + return 0; +} + +static int devlink_trap_group_verify(const struct devlink_trap_group *group) +{ + if (group->generic) + return devlink_trap_group_generic_verify(group); + else + return devlink_trap_group_driver_verify(group); +} + +static void +devlink_trap_group_notify(struct devlink *devlink, + const struct devlink_trap_group_item *group_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && + cmd != DEVLINK_CMD_TRAP_GROUP_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0, + 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_create(struct devlink *devlink, + const struct devlink_trap_group *group) +{ + struct devlink_trap_group_item *group_item; + int err; + + err = devlink_trap_group_verify(group); + if (err) + return ERR_PTR(err); + + group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); + if (!group_item) + return ERR_PTR(-ENOMEM); + + group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); + if (!group_item->stats) { + err = -ENOMEM; + goto err_stats_alloc; + } + + group_item->group = group; + refcount_set(&group_item->refcount, 1); + + if (devlink->ops->trap_group_init) { + err = devlink->ops->trap_group_init(devlink, group); + if (err) + goto err_group_init; + } + + list_add_tail(&group_item->list, &devlink->trap_group_list); + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW); + + return group_item; + +err_group_init: + free_percpu(group_item->stats); +err_stats_alloc: + kfree(group_item); + return ERR_PTR(err); +} + +static void +devlink_trap_group_item_destroy(struct devlink *devlink, + struct devlink_trap_group_item *group_item) +{ + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_DEL); + list_del(&group_item->list); + free_percpu(group_item->stats); + kfree(group_item); +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_get(struct devlink *devlink, + const struct devlink_trap_group *group) +{ + struct devlink_trap_group_item *group_item; + + group_item = devlink_trap_group_item_lookup(devlink, group->name); + if (group_item) { + refcount_inc(&group_item->refcount); + return group_item; + } + + return devlink_trap_group_item_create(devlink, group); +} + +static void +devlink_trap_group_item_put(struct devlink *devlink, + struct devlink_trap_group_item *group_item) +{ + if (!refcount_dec_and_test(&group_item->refcount)) + return; + + devlink_trap_group_item_destroy(devlink, group_item); +} + +static int +devlink_trap_item_group_link(struct devlink *devlink, + struct devlink_trap_item *trap_item) +{ + struct 
devlink_trap_group_item *group_item; + + group_item = devlink_trap_group_item_get(devlink, + &trap_item->trap->group); + if (IS_ERR(group_item)) + return PTR_ERR(group_item); + + trap_item->group_item = group_item; + + return 0; +} + +static void +devlink_trap_item_group_unlink(struct devlink *devlink, + struct devlink_trap_item *trap_item) +{ + devlink_trap_group_item_put(devlink, trap_item->group_item); +} + +static void devlink_trap_notify(struct devlink *devlink, + const struct devlink_trap_item *trap_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && + cmd != DEVLINK_CMD_TRAP_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int +devlink_trap_register(struct devlink *devlink, + const struct devlink_trap *trap, void *priv) +{ + struct devlink_trap_item *trap_item; + int err; + + if (devlink_trap_item_lookup(devlink, trap->name)) + return -EEXIST; + + trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL); + if (!trap_item) + return -ENOMEM; + + trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); + if (!trap_item->stats) { + err = -ENOMEM; + goto err_stats_alloc; + } + + trap_item->trap = trap; + trap_item->action = trap->init_action; + trap_item->priv = priv; + + err = devlink_trap_item_group_link(devlink, trap_item); + if (err) + goto err_group_link; + + err = devlink->ops->trap_init(devlink, trap, trap_item); + if (err) + goto err_trap_init; + + list_add_tail(&trap_item->list, &devlink->trap_list); + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); + + return 0; + +err_trap_init: + devlink_trap_item_group_unlink(devlink, trap_item); +err_group_link: + free_percpu(trap_item->stats); +err_stats_alloc: + kfree(trap_item); + return err; +} + +static void devlink_trap_unregister(struct devlink *devlink, + const struct devlink_trap *trap) +{ + struct devlink_trap_item *trap_item; + + trap_item = devlink_trap_item_lookup(devlink, trap->name); + if (WARN_ON_ONCE(!trap_item)) + return; + + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); + list_del(&trap_item->list); + if (devlink->ops->trap_fini) + devlink->ops->trap_fini(devlink, trap, trap_item); + devlink_trap_item_group_unlink(devlink, trap_item); + free_percpu(trap_item->stats); + kfree(trap_item); +} + +static void devlink_trap_disable(struct devlink *devlink, + const struct devlink_trap *trap) +{ + struct devlink_trap_item *trap_item; + + trap_item = devlink_trap_item_lookup(devlink, trap->name); + if (WARN_ON_ONCE(!trap_item)) + return; + + devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP); + trap_item->action = DEVLINK_TRAP_ACTION_DROP; +} + +/** + * devlink_traps_register - Register packet traps with devlink. + * @devlink: devlink. + * @traps: Packet traps. + * @traps_count: Count of provided packet traps. + * @priv: Driver private information. + * + * Return: Non-zero value on failure. 
+ */ +int devlink_traps_register(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count, void *priv) +{ + int i, err; + + if (!devlink->ops->trap_init || !devlink->ops->trap_action_set) + return -EINVAL; + + mutex_lock(&devlink->lock); + for (i = 0; i < traps_count; i++) { + const struct devlink_trap *trap = &traps[i]; + + err = devlink_trap_verify(trap); + if (err) + goto err_trap_verify; + + err = devlink_trap_register(devlink, trap, priv); + if (err) + goto err_trap_register; + } + mutex_unlock(&devlink->lock); + + return 0; + +err_trap_register: +err_trap_verify: + for (i--; i >= 0; i--) + devlink_trap_unregister(devlink, &traps[i]); + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_traps_register); + +/** + * devlink_traps_unregister - Unregister packet traps from devlink. + * @devlink: devlink. + * @traps: Packet traps. + * @traps_count: Count of provided packet traps. + */ +void devlink_traps_unregister(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count) +{ + int i; + + mutex_lock(&devlink->lock); + /* Make sure we do not have any packets in-flight while unregistering + * traps by disabling all of them and waiting for a grace period. + */ + for (i = traps_count - 1; i >= 0; i--) + devlink_trap_disable(devlink, &traps[i]); + synchronize_rcu(); + for (i = traps_count - 1; i >= 0; i--) + devlink_trap_unregister(devlink, &traps[i]); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_traps_unregister); + +static void +devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, + size_t skb_len) +{ + struct devlink_stats *stats; + + stats = this_cpu_ptr(trap_stats); + u64_stats_update_begin(&stats->syncp); + stats->rx_bytes += skb_len; + stats->rx_packets++; + u64_stats_update_end(&stats->syncp); +} + +static void +devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata, + const struct devlink_trap_item *trap_item, + struct devlink_port *in_devlink_port) +{ + struct devlink_trap_group_item *group_item = trap_item->group_item; + + hw_metadata->trap_group_name = group_item->group->name; + hw_metadata->trap_name = trap_item->trap->name; + + spin_lock(&in_devlink_port->type_lock); + if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) + hw_metadata->input_dev = in_devlink_port->type_dev; + spin_unlock(&in_devlink_port->type_lock); +} + +/** + * devlink_trap_report - Report trapped packet to drop monitor. + * @devlink: devlink. + * @skb: Trapped packet. + * @trap_ctx: Trap context. + * @in_devlink_port: Input devlink port. + */ +void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, + void *trap_ctx, struct devlink_port *in_devlink_port) +{ + struct devlink_trap_item *trap_item = trap_ctx; + struct net_dm_hw_metadata hw_metadata = {}; + + devlink_trap_stats_update(trap_item->stats, skb->len); + devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + + devlink_trap_report_metadata_fill(&hw_metadata, trap_item, + in_devlink_port); + net_dm_hw_report(skb, &hw_metadata); +} +EXPORT_SYMBOL_GPL(devlink_trap_report); + +/** + * devlink_trap_ctx_priv - Trap context to driver private information. + * @trap_ctx: Trap context. + * + * Return: Driver private information passed during registration. 
+ */ +void *devlink_trap_ctx_priv(void *trap_ctx) +{ + struct devlink_trap_item *trap_item = trap_ctx; + + return trap_item->priv; +} +EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv); + static void __devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len) { -- cgit v1.2.3-59-g8ed1b From 77cd0d7b3f257fd0e3096b4fdcff1a7d38e99e10 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 14 Aug 2019 09:27:17 +0200 Subject: xsk: add support for need_wakeup flag in AF_XDP rings This commit adds support for a new flag called need_wakeup in the AF_XDP Tx and fill rings. When this flag is set, it means that the application has to explicitly wake up the kernel Rx (for the bit in the fill ring) or kernel Tx (for the bit in the Tx ring) processing by issuing a syscall. Poll() can wake up both depending on the flags submitted and sendto() will wake up Tx processing only. The main reason for introducing this new flag is to be able to efficiently support the case when the application and the driver are executing on the same core. Previously, the driver was just busy-spinning on the fill ring if it ran out of buffers in the HW and there were none on the fill ring. This approach works when the application is running on another core as it can replenish the fill ring while the driver is busy-spinning. However, this is a lousy approach if both of them are running on the same core as the probability of the fill ring getting more entries when the driver is busy-spinning is zero. With this new feature the driver now sets the need_wakeup flag and returns to the application. The application can then replenish the fill ring and then explicitly wake up the Rx processing in the kernel using the syscall poll(). For Tx, the flag is only set to one if the driver has no outstanding Tx completion interrupts. If it has some, the flag is zero as it will be woken up by a completion interrupt anyway. As a nice side effect, this new flag also improves the performance of the case where the application and the driver are running on two different cores as it reduces the number of syscalls to the kernel. The kernel tells user space if it needs to be woken up by a syscall, and this eliminates many of the syscalls. This flag needs some simple driver support. If the driver does not support this, the Rx flag is always zero and the Tx flag is always one. This makes any application relying on this feature default to the old behaviour of not requiring any syscalls in the Rx path and always having to call sendto() in the Tx path. For backwards compatibility reasons, this feature has to be explicitly turned on using a new bind flag (XDP_USE_NEED_WAKEUP). I recommend that you always turn it on as it has so far always had a positive performance impact. The name and inspiration of the flag have been taken from io_uring by Jens Axboe. Details about this feature in io_uring can be found in http://kernel.dk/io_uring.pdf, section 8.3.
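To make the intended usage concrete, here is a rough userspace sketch (not part of this patch; the flag pointers and the refill helper are hypothetical stand-ins for the application's own ring wrappers) of an Rx/Tx loop for a socket bound with XDP_USE_NEED_WAKEUP:

	#include <poll.h>
	#include <sys/socket.h>
	#include <linux/if_xdp.h>

	/* 'flags' points at a ring's flags word, mmap()ed via the new
	 * flags offset returned by the XDP_MMAP_OFFSETS getsockopt.
	 */
	static int need_wakeup(const volatile __u32 *flags)
	{
		return *flags & XDP_RING_NEED_WAKEUP;
	}

	/* Rx path: refill the fill ring first, then enter the kernel
	 * only when the driver asked to be woken up.
	 */
	refill_fill_ring();			/* hypothetical helper */
	if (need_wakeup(fq_flags)) {
		struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN };
		poll(&pfd, 1, 0);		/* wakes up Rx processing */
	}

	/* Tx path: sendto() is only required while the flag is set */
	if (need_wakeup(tx_flags))
		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);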
Signed-off-by: Magnus Karlsson Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 33 +++++++++- include/uapi/linux/if_xdp.h | 13 ++++ net/xdp/xdp_umem.c | 9 +++ net/xdp/xsk.c | 146 ++++++++++++++++++++++++++++++++++++++------ net/xdp/xsk.h | 13 ++++ net/xdp/xsk_queue.h | 1 + 6 files changed, 195 insertions(+), 20 deletions(-) (limited to 'include/uapi') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 69796d264f06..6aebea2b18cb 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -27,6 +27,9 @@ struct xdp_umem_fq_reuse { u64 handles[]; }; +/* Flags for the umem flags field. */ +#define XDP_UMEM_USES_NEED_WAKEUP (1 << 0) + struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; @@ -41,10 +44,12 @@ struct xdp_umem { struct work_struct work; struct page **pgs; u32 npgs; + u16 queue_id; + u8 need_wakeup; + u8 flags; int id; struct net_device *dev; struct xdp_umem_fq_reuse *fq_reuse; - u16 queue_id; bool zc; spinlock_t xsk_list_lock; struct list_head xsk_list; @@ -95,6 +100,11 @@ struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, struct xdp_umem_fq_reuse *newq); void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); +void xsk_set_rx_need_wakeup(struct xdp_umem *umem); +void xsk_set_tx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); +bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { @@ -241,6 +251,27 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) { } +static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +{ + return false; +} + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index faaa5ca2a117..62b80d57b72a 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -16,6 +16,15 @@ #define XDP_SHARED_UMEM (1 << 0) #define XDP_COPY (1 << 1) /* Force copy-mode */ #define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ +/* If this option is set, the driver might go sleep and in that case + * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be + * set. If it is set, the application need to explicitly wake up the + * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are + * running the driver and the application on the same core, you should + * use this option so that the kernel will yield to the user space + * application. 
+ */ +#define XDP_USE_NEED_WAKEUP (1 << 3) struct sockaddr_xdp { __u16 sxdp_family; @@ -25,10 +34,14 @@ struct sockaddr_xdp { __u32 sxdp_shared_umem_fd; }; +/* XDP_RING flags */ +#define XDP_RING_NEED_WAKEUP (1 << 0) + struct xdp_ring_offset { __u64 producer; __u64 consumer; __u64 desc; + __u64 flags; }; struct xdp_mmap_offsets { diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 6e2d4da4cdcd..cda6feb1edb0 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -106,6 +106,15 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, umem->dev = dev; umem->queue_id = queue_id; + if (flags & XDP_USE_NEED_WAKEUP) { + umem->flags |= XDP_UMEM_USES_NEED_WAKEUP; + /* Tx needs to be explicitly woken up the first time. + * Also for supporting drivers that do not implement this + * feature. They will always have to call sendto(). + */ + xsk_set_tx_need_wakeup(umem); + } + dev_hold(dev); if (force_copy) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1fe40a936456..9f900b56b15b 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -55,6 +55,66 @@ void xsk_umem_discard_addr(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_discard_addr); +void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +{ + if (umem->need_wakeup & XDP_WAKEUP_RX) + return; + + umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP; + umem->need_wakeup |= XDP_WAKEUP_RX; +} +EXPORT_SYMBOL(xsk_set_rx_need_wakeup); + +void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +{ + struct xdp_sock *xs; + + if (umem->need_wakeup & XDP_WAKEUP_TX) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; + } + rcu_read_unlock(); + + umem->need_wakeup |= XDP_WAKEUP_TX; +} +EXPORT_SYMBOL(xsk_set_tx_need_wakeup); + +void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +{ + if (!(umem->need_wakeup & XDP_WAKEUP_RX)) + return; + + umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; + umem->need_wakeup &= ~XDP_WAKEUP_RX; +} +EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); + +void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +{ + struct xdp_sock *xs; + + if (!(umem->need_wakeup & XDP_WAKEUP_TX)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; + } + rcu_read_unlock(); + + umem->need_wakeup &= ~XDP_WAKEUP_TX; +} +EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); + +bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +{ + return umem->flags & XDP_UMEM_USES_NEED_WAKEUP; +} +EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); + static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { void *to_buf, *from_buf; @@ -320,6 +380,12 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock, unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); + struct net_device *dev = xs->dev; + struct xdp_umem *umem = xs->umem; + + if (umem->need_wakeup) + dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, + umem->need_wakeup); if (xs->rx && !xskq_empty_desc(xs->rx)) mask |= POLLIN | POLLRDNORM; @@ -428,7 +494,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; flags = sxdp->sxdp_flags; - if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY)) + if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | + XDP_USE_NEED_WAKEUP)) return -EINVAL; rtnl_lock(); @@ -455,7 +522,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct xdp_sock *umem_xs; struct socket *sock; 
- if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) { + if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || + (flags & XDP_USE_NEED_WAKEUP)) { /* Cannot specify flags for shared sockets. */ err = -EINVAL; goto out_unlock; @@ -550,6 +618,9 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, } q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); + if (!err && optname == XDP_TX_RING) + /* Tx needs to be explicitly woken up the first time */ + xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; mutex_unlock(&xs->mutex); return err; } @@ -611,6 +682,20 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; } +static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) +{ + ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + ring->desc = offsetof(struct xdp_rxtx_ring, desc); +} + +static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) +{ + ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); + ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + ring->desc = offsetof(struct xdp_umem_ring, desc); +} + static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ -650,26 +735,49 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, case XDP_MMAP_OFFSETS: { struct xdp_mmap_offsets off; + struct xdp_mmap_offsets_v1 off_v1; + bool flags_supported = true; + void *to_copy; - if (len < sizeof(off)) + if (len < sizeof(off_v1)) return -EINVAL; + else if (len < sizeof(off)) + flags_supported = false; + + if (flags_supported) { + /* xdp_ring_offset is identical to xdp_ring_offset_v1 + * except for the flags field added to the end. 
+ */ + xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) + &off.rx); + xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) + &off.tx); + xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) + &off.fr); + xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) + &off.cr); + off.rx.flags = offsetof(struct xdp_rxtx_ring, + ptrs.flags); + off.tx.flags = offsetof(struct xdp_rxtx_ring, + ptrs.flags); + off.fr.flags = offsetof(struct xdp_umem_ring, + ptrs.flags); + off.cr.flags = offsetof(struct xdp_umem_ring, + ptrs.flags); + + len = sizeof(off); + to_copy = &off; + } else { + xsk_enter_rxtx_offsets(&off_v1.rx); + xsk_enter_rxtx_offsets(&off_v1.tx); + xsk_enter_umem_offsets(&off_v1.fr); + xsk_enter_umem_offsets(&off_v1.cr); + + len = sizeof(off_v1); + to_copy = &off_v1; + } - off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); - off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); - off.rx.desc = offsetof(struct xdp_rxtx_ring, desc); - off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); - off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); - off.tx.desc = offsetof(struct xdp_rxtx_ring, desc); - - off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); - off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); - off.fr.desc = offsetof(struct xdp_umem_ring, desc); - off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); - off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); - off.cr.desc = offsetof(struct xdp_umem_ring, desc); - - len = sizeof(off); - if (copy_to_user(optval, &off, len)) + if (copy_to_user(optval, to_copy, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index ba8120610426..4cfd106bdb53 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -4,6 +4,19 @@ #ifndef XSK_H_ #define XSK_H_ +struct xdp_ring_offset_v1 { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +struct xdp_mmap_offsets_v1 { + struct xdp_ring_offset_v1 rx; + struct xdp_ring_offset_v1 tx; + struct xdp_ring_offset_v1 fr; + struct xdp_ring_offset_v1 cr; +}; + static inline struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 909c5168ed0f..dd9e985c2461 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -16,6 +16,7 @@ struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; + u32 flags; }; /* Used for the RX and TX queues for packets */ -- cgit v1.2.3-59-g8ed1b From 8f51dfc73bf181f2304e1498f55d5f452e060cbe Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 14 Aug 2019 10:37:49 -0700 Subject: bpf: support cloning sk storage on accept() Add new helper bpf_sk_storage_clone which optionally clones sk storage and call it from sk_clone_lock. 
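For illustration, a hedged sketch of how a program might opt into cloning (not part of this patch; it uses libbpf-style BTF-defined map syntax, and struct conn_info is a made-up value type):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	/* hypothetical per-socket value */
	struct conn_info {
		__u64 created_ns;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_SK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
		__type(key, int);
		__type(value, struct conn_info);
	} sk_info SEC(".maps");

	/* called with the listener socket 'sk': storage attached here is
	 * copied to the child socket in sk_clone_lock() because the map
	 * was created with BPF_F_CLONE
	 */
	struct conn_info *info;

	info = bpf_sk_storage_get(&sk_info, sk, NULL,
				  BPF_SK_STORAGE_GET_F_CREATE);
	if (info)
		info->created_ns = bpf_ktime_get_ns();

Without BPF_F_CLONE the behaviour is unchanged: the accepted socket starts out with no storage.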
Cc: Martin KaFai Lau Cc: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/net/bpf_sk_storage.h | 10 +++++ include/uapi/linux/bpf.h | 3 ++ net/core/bpf_sk_storage.c | 104 +++++++++++++++++++++++++++++++++++++++++-- net/core/sock.c | 9 ++-- 4 files changed, 120 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index b9dcb02e756b..8e4f831d2e52 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +#ifdef CONFIG_BPF_SYSCALL +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); +#else +static inline int bpf_sk_storage_clone(const struct sock *sk, + struct sock *newsk) +{ + return 0; +} +#endif + #endif /* _BPF_SK_STORAGE_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4393bd4b2419..0ef594ac3899 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -337,6 +337,9 @@ enum bpf_attach_type { #define BPF_F_RDONLY_PROG (1U << 7) #define BPF_F_WRONLY_PROG (1U << 8) +/* Clone map from listener for newly accepted socket */ +#define BPF_F_CLONE (1U << 9) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 94c7f77ecb6b..da5639a5bd3b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -12,6 +12,9 @@ static atomic_t cache_idx; +#define SK_STORAGE_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_CLONE) + struct bucket { struct hlist_head list; raw_spinlock_t lock; @@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) kfree_rcu(sk_storage, rcu); } -/* sk_storage->lock must be held and sk_storage->list cannot be empty */ static void __selem_link_sk(struct bpf_sk_storage *sk_storage, struct bpf_sk_storage_elem *selem) { @@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map) return 0; } -/* Called by __sk_destruct() */ +/* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_sk_storage_elem *selem; @@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_sk_storage_map *)map; + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. 
+ */ synchronize_rcu(); /* bpf prog and the userspace can no longer access this map @@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) { - if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || + if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || + !(attr->map_flags & BPF_F_NO_PREALLOC) || + attr->max_entries || attr->key_size != sizeof(int) || !attr->value_size || /* Enforce BTF for userspace sk dumping */ !attr->btf_key_type_id || !attr->btf_value_type_id) @@ -739,6 +748,95 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) return err; } +static struct bpf_sk_storage_elem * +bpf_sk_storage_clone_elem(struct sock *newsk, + struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *selem) +{ + struct bpf_sk_storage_elem *copy_selem; + + copy_selem = selem_alloc(smap, newsk, NULL, true); + if (!copy_selem) + return NULL; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data, true); + else + copy_map_value(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data); + + return copy_selem; +} + +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) +{ + struct bpf_sk_storage *new_sk_storage = NULL; + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + int ret = 0; + + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + + if (!sk_storage || hlist_empty(&sk_storage->list)) + goto out; + + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + struct bpf_sk_storage_elem *copy_selem; + struct bpf_sk_storage_map *smap; + struct bpf_map *map; + + smap = rcu_dereference(SDATA(selem)->smap); + if (!(smap->map.map_flags & BPF_F_CLONE)) + continue; + + /* Note that for lockless listeners adding new element + * here can race with cleanup in bpf_sk_storage_map_free. + * Try to grab map refcnt to make sure that it's still + * alive and prevent concurrent removal. + */ + map = bpf_map_inc_not_zero(&smap->map, false); + if (IS_ERR(map)) + continue; + + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); + if (!copy_selem) { + ret = -ENOMEM; + bpf_map_put(map); + goto out; + } + + if (new_sk_storage) { + selem_link_map(smap, copy_selem); + __selem_link_sk(new_sk_storage, copy_selem); + } else { + ret = sk_storage_alloc(newsk, smap, copy_selem); + if (ret) { + kfree(copy_selem); + atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } + + new_sk_storage = rcu_dereference(copy_selem->sk_storage); + } + bpf_map_put(map); + } + +out: + rcu_read_unlock(); + + /* In case of an error, don't free anything explicitly here, the + * caller is responsible to call bpf_sk_storage_free. 
+ */ + + return ret; +} + BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { diff --git a/net/core/sock.c b/net/core/sock.c index d57b0cc995a0..f5e801a9cea4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) goto out; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); -#ifdef CONFIG_BPF_SYSCALL - RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); -#endif + + if (bpf_sk_storage_clone(sk, newsk)) { + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } newsk->sk_err = 0; newsk->sk_err_soft = 0; -- cgit v1.2.3-59-g8ed1b From 99b60d56a35b18af267f275559a530db372bfad7 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 16 Aug 2019 21:56:27 +0200 Subject: net: phy: add EEE-related constants Add EEE-related constants. This includes the new MMD EEE registers for NBase-T / 802.3bz. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/uapi/linux/mdio.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 0a552061ff1c..4bcb41c71b8c 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -45,11 +45,14 @@ #define MDIO_AN_ADVERTISE 16 /* AN advertising (base page) */ #define MDIO_AN_LPA 19 /* AN LP abilities (base page) */ #define MDIO_PCS_EEE_ABLE 20 /* EEE Capability register */ +#define MDIO_PCS_EEE_ABLE2 21 /* EEE Capability register 2 */ #define MDIO_PMA_NG_EXTABLE 21 /* 2.5G/5G PMA/PMD extended ability */ #define MDIO_PCS_EEE_WK_ERR 22 /* EEE wake error counter */ #define MDIO_PHYXS_LNSTAT 24 /* PHY XGXS lane state */ #define MDIO_AN_EEE_ADV 60 /* EEE advertisement */ #define MDIO_AN_EEE_LPABLE 61 /* EEE link partner ability */ +#define MDIO_AN_EEE_ADV2 62 /* EEE advertisement 2 */ +#define MDIO_AN_EEE_LPABLE2 63 /* EEE link partner ability 2 */ /* Media-dependent registers. */ #define MDIO_PMA_10GBT_SWAPPOL 130 /* 10GBASE-T pair swap & polarity */ @@ -276,6 +279,13 @@ #define MDIO_EEE_1000KX 0x0010 /* 1000KX EEE cap */ #define MDIO_EEE_10GKX4 0x0020 /* 10G KX4 EEE cap */ #define MDIO_EEE_10GKR 0x0040 /* 10G KR EEE cap */ +#define MDIO_EEE_40GR_FW 0x0100 /* 40G R fast wake */ +#define MDIO_EEE_40GR_DS 0x0200 /* 40G R deep sleep */ +#define MDIO_EEE_100GR_FW 0x1000 /* 100G R fast wake */ +#define MDIO_EEE_100GR_DS 0x2000 /* 100G R deep sleep */ + +#define MDIO_EEE_2_5GT 0x0001 /* 2.5GT EEE cap */ +#define MDIO_EEE_5GT 0x0002 /* 5GT EEE cap */ /* 2.5G/5G Extended abilities register. */ #define MDIO_PMA_NG_EXTABLE_2_5GBT 0x0001 /* 2.5GBASET ability */ -- cgit v1.2.3-59-g8ed1b From df2c71ffdfae58961981d7cbcccea93688fc4e96 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 19 Aug 2019 22:02:46 +0800 Subject: sctp: add SCTP_ASCONF_SUPPORTED sockopt SCTP_ASCONF_SUPPORTED sockopt is used to set the endpoint's asconf flag. With this feature, each endpoint will have its own flag for its future asoc's asconf_capable, instead of the netns asconf flag. Note that when both ep's asconf_enable and auth_enable are enabled, SCTP_CID_ASCONF and SCTP_CID_ASCONF_ACK should be added into auth_chunk_list. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index b8f2c4d56532..9b9b82debc0d 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -134,6 +134,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_INTERLEAVING_SUPPORTED 125 #define SCTP_SENDMSG_CONNECT 126 #define SCTP_EVENT 127 +#define SCTP_ASCONF_SUPPORTED 128 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 559793f7f72a..b21a70708405 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4496,6 +4496,42 @@ static int sctp_setsockopt_event(struct sock *sk, char __user *optval, return retval; } +static int sctp_setsockopt_asconf_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + struct sctp_endpoint *ep; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(&params, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + goto out; + + ep = sctp_sk(sk)->ep; + ep->asconf_enable = !!params.assoc_value; + + if (ep->asconf_enable && ep->auth_enable) { + sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); + sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK); + } + + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4696,6 +4732,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_EVENT: retval = sctp_setsockopt_event(sk, optval, optlen); break; + case SCTP_ASCONF_SUPPORTED: + retval = sctp_setsockopt_asconf_supported(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7675,6 +7714,45 @@ static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval, return 0; } +static int sctp_getsockopt_asconf_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(&params, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = asoc ?
asoc->peer.asconf_capable + : sctp_sk(sk)->ep->asconf_enable; + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, &params, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7876,6 +7954,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_EVENT: retval = sctp_getsockopt_event(sk, len, optval, optlen); break; + case SCTP_ASCONF_SUPPORTED: + retval = sctp_getsockopt_asconf_supported(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3-59-g8ed1b From 56dd525abd56f7acd7b44a52935726e3ada4916c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 19 Aug 2019 22:02:49 +0800 Subject: sctp: add SCTP_AUTH_SUPPORTED sockopt SCTP_AUTH_SUPPORTED sockopt is used to set the endpoint's auth flag. With this feature, each endpoint will have its own flag for its future asoc's auth_capable, instead of the netns auth flag. Note that when ep's auth_enable is enabled, endpoint auth related data should be initialized. If asconf_enable is also set, SCTP_CID_ASCONF/SCTP_CID_ASCONF_ACK should be added into auth_chunk_list. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 9b9b82debc0d..62527aca8477 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -135,6 +135,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_SENDMSG_CONNECT 126 #define SCTP_EVENT 127 #define SCTP_ASCONF_SUPPORTED 128 +#define SCTP_AUTH_SUPPORTED 129 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 diff --git a/net/sctp/socket.c b/net/sctp/socket.c index dcde8d92c568..82bc25223cfe 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4520,6 +4520,46 @@ out: return retval; } +static int sctp_setsockopt_auth_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + struct sctp_endpoint *ep; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(&params, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + goto out; + + ep = sctp_sk(sk)->ep; + if (params.assoc_value) { + retval = sctp_auth_init(ep, GFP_KERNEL); + if (retval) + goto out; + if (ep->asconf_enable) { + sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); + sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK); + } + } + + ep->auth_enable = !!params.assoc_value; + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4723,6 +4763,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_ASCONF_SUPPORTED: retval = sctp_setsockopt_asconf_supported(sk, optval, optlen); break; + case SCTP_AUTH_SUPPORTED: + retval = sctp_setsockopt_auth_supported(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7746,6 +7789,45 @@ out: return retval; } +static int sctp_getsockopt_auth_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT;
+ + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(&params, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = asoc ? asoc->peer.auth_capable + : sctp_sk(sk)->ep->auth_enable; + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, &params, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7951,6 +8033,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_asconf_supported(sk, len, optval, optlen); break; + case SCTP_AUTH_SUPPORTED: + retval = sctp_getsockopt_auth_supported(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3-59-g8ed1b From 1b9ed84ecf268904d89edf2908426a8eb3b5a4ba Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 20 Aug 2019 10:31:50 +0100 Subject: bpf: add new BPF_BTF_GET_NEXT_ID syscall command Add a new command for the bpf() system call: BPF_BTF_GET_NEXT_ID is used to cycle through all BTF objects loaded on the system. The motivation is to be able to inspect (list) all BTF objects present on the system. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 4 ++-- kernel/bpf/syscall.c | 4 ++++ 4 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 15ae49862b82..5b9d22338606 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -24,6 +24,9 @@ struct seq_file; struct btf; struct btf_type; +extern struct idr btf_idr; +extern spinlock_t btf_idr_lock; + /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0ef594ac3899..8aa6126f0b6e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -106,6 +106,7 @@ enum bpf_cmd { BPF_TASK_FD_QUERY, BPF_MAP_LOOKUP_AND_DELETE_ELEM, BPF_MAP_FREEZE, + BPF_BTF_GET_NEXT_ID, }; enum bpf_map_type { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b403dc18486..adb3adcebe3c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -195,8 +195,8 @@ i < btf_type_vlen(struct_type); \ i++, member++) -static DEFINE_IDR(btf_idr); -static DEFINE_SPINLOCK(btf_idr_lock); +DEFINE_IDR(btf_idr); +DEFINE_SPINLOCK(btf_idr_lock); struct btf { void *data; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf8052b016e7..c0f62fd67c6b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2884,6 +2884,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &map_idr, &map_idr_lock); break; + case BPF_BTF_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &btf_idr, &btf_idr_lock); + break; case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; -- cgit v1.2.3-59-g8ed1b From c5b9a7f826735228a38fab4a7b2707f032468c88 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Fri, 2 Aug 2019 13:30:58 +0200 Subject: nl80211: add 6GHz band definition to enum nl80211_band In the 802.11ax specification a new band is introduced,
which is also proposed by the FCC for unlicensed use. This band is referred to as 6GHz, spanning the frequency range from 5925 to 7125 MHz. Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Leon Zegers Signed-off-by: Arend van Spriel Link: https://lore.kernel.org/r/1564745465-21234-2-git-send-email-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/mac80211/tx.c | 1 + 2 files changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 822851d369ab..4d5988f47118 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4543,6 +4543,7 @@ enum nl80211_txrate_gi { * @NL80211_BAND_2GHZ: 2.4 GHz ISM band * @NL80211_BAND_5GHZ: around 5 GHz band (4.9 - 5.7 GHz) * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 69.12 GHz) + * @NL80211_BAND_6GHZ: around 6 GHz band (5.9 - 7.2 GHz) * @NUM_NL80211_BANDS: number of bands, avoid using this in userspace * since newer kernel versions may support more bands */ @@ -4550,6 +4551,7 @@ enum nl80211_band { NL80211_BAND_2GHZ, NL80211_BAND_5GHZ, NL80211_BAND_60GHZ, + NL80211_BAND_6GHZ, NUM_NL80211_BANDS, }; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 235c6377a203..1fa422782905 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -162,6 +162,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, break; } case NL80211_BAND_5GHZ: + case NL80211_BAND_6GHZ: if (r->flags & IEEE80211_RATE_MANDATORY_A) mrate = r->bitrate; break; -- cgit v1.2.3-59-g8ed1b From 6c7a00339e2a64b068c986301f37bd31eb83d7e9 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Fri, 9 Aug 2019 11:00:00 -0700 Subject: cfg80211: Support assoc-at timer in sta-info Report the timestamp of when the station became associated. This uses the boottime clock; units are nanoseconds.
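Because the value is on the CLOCK_BOOTTIME timebase, user space can compute a suspend-proof "time since association". A minimal sketch, assuming libnl parsing has filled 'tb' with the NL80211_STA_INFO_* attributes and the attribute is present:

	#include <time.h>
	#include <stdint.h>

	struct timespec now;
	uint64_t assoc_at, assoc_age_ns;

	assoc_at = nla_get_u64(tb[NL80211_STA_INFO_ASSOC_AT_BOOTTIME]);
	clock_gettime(CLOCK_BOOTTIME, &now);
	/* same timebase, so a plain subtraction gives the age */
	assoc_age_ns = (uint64_t)now.tv_sec * 1000000000ULL +
		       now.tv_nsec - assoc_at;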
Signed-off-by: Ben Greear Link: https://lore.kernel.org/r/20190809180001.26393-1-greearb@candelatech.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 1 + 3 files changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8140c4837122..1e32b11e1730 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1330,6 +1330,7 @@ struct cfg80211_tid_stats { * indicate the relevant values in this struct for them * @connected_time: time(in secs) since a station is last connected * @inactive_time: time since last station activity (tx/rx) in milliseconds + * @assoc_at: bootime (ns) of the last association * @rx_bytes: bytes (size of MPDUs) received from this station * @tx_bytes: bytes (size of MPDUs) transmitted to this station * @llid: mesh local link id @@ -1390,6 +1391,7 @@ struct station_info { u64 filled; u32 connected_time; u32 inactive_time; + u64 assoc_at; u64 rx_bytes; u64 tx_bytes; u16 llid; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4d5988f47118..04b58295c8d9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3201,6 +3201,8 @@ enum nl80211_sta_bss_param { * sent to the station (u64, usec) * @NL80211_STA_INFO_AIRTIME_WEIGHT: current airtime weight for station (u16) * @NL80211_STA_INFO_AIRTIME_LINK_METRIC: airtime link metric for mesh station + * @NL80211_STA_INFO_ASSOC_AT_BOOTTIME: Timestamp (CLOCK_BOOTTIME, nanoseconds) + * of STA's association * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -3247,6 +3249,7 @@ enum nl80211_sta_info { NL80211_STA_INFO_TX_DURATION, NL80211_STA_INFO_AIRTIME_WEIGHT, NL80211_STA_INFO_AIRTIME_LINK_METRIC, + NL80211_STA_INFO_ASSOC_AT_BOOTTIME, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9a642219a8c7..cacd96704647 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5032,6 +5032,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(CONNECTED_TIME, connected_time, u32); PUT_SINFO(INACTIVE_TIME, inactive_time, u32); + PUT_SINFO_U64(ASSOC_AT_BOOTTIME, assoc_at); if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) | BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) && -- cgit v1.2.3-59-g8ed1b From 2a38075cd0beefa4da326380cf54c7b365ddc035 Mon Sep 17 00:00:00 2001 From: Alexei Avshalom Lazar Date: Sun, 18 Aug 2019 17:35:17 +0300 Subject: nl80211: Add support for EDMG channels The 802.11ay specification defines Enhanced Directional Multi-Gigabit (EDMG) STA and AP, which allow channel bonding of two or more channels. Introduce new NL attributes that are needed for enabling and configuring EDMG support. Two new attributes are used by the kernel to publish the driver's EDMG capabilities to userspace: NL80211_BAND_ATTR_EDMG_CHANNELS - bitmap field that indicates the 2.16 GHz channel(s) that are supported by the driver. When this attribute is not set, the driver does not support EDMG. NL80211_BAND_ATTR_EDMG_BW_CONFIG - represents the channel bandwidth configurations supported by the driver.
Two additional attributes are used by userspace for the connect command and for AP configuration: NL80211_ATTR_WIPHY_EDMG_CHANNELS NL80211_ATTR_WIPHY_EDMG_BW_CONFIG A new rate info flag, RATE_INFO_FLAGS_EDMG, can be reported by the driver and used for bitrate calculation that takes EDMG into account according to the 802.11ay specification. Signed-off-by: Alexei Avshalom Lazar Link: https://lore.kernel.org/r/1566138918-3823-2-git-send-email-ailizaro@codeaurora.org Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/wil6210/cfg80211.c | 2 +- include/net/cfg80211.h | 86 ++++++++++++++- include/uapi/linux/nl80211.h | 24 +++++ net/mac80211/mlme.c | 2 +- net/mac80211/status.c | 6 +- net/wireless/chan.c | 159 ++++++++++++++++++++++ net/wireless/nl80211.c | 37 +++++++ net/wireless/util.c | 42 +++++++- 8 files changed, 349 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 2fb4258941a5..2414f574bf69 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -351,7 +351,7 @@ int wil_cid_fill_sinfo(struct wil6210_vif *vif, int cid, BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC) | BIT_ULL(NL80211_STA_INFO_TX_FAILED); - sinfo->txrate.flags = RATE_INFO_FLAGS_60G; + sinfo->txrate.flags = RATE_INFO_FLAGS_DMG; sinfo->txrate.mcs = le16_to_cpu(reply.evt.bf_mcs); sinfo->rxrate.mcs = stats->last_mcs_rx; sinfo->rx_bytes = stats->rx_bytes; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1e32b11e1730..5253e7f667bd 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -330,6 +330,60 @@ struct ieee80211_sband_iftype_data { struct ieee80211_sta_he_cap he_cap; }; +/** + * enum ieee80211_edmg_bw_config - allowed channel bandwidth configurations + * + * @IEEE80211_EDMG_BW_CONFIG_4: 2.16GHz + * @IEEE80211_EDMG_BW_CONFIG_5: 2.16GHz and 4.32GHz + * @IEEE80211_EDMG_BW_CONFIG_6: 2.16GHz, 4.32GHz and 6.48GHz + * @IEEE80211_EDMG_BW_CONFIG_7: 2.16GHz, 4.32GHz, 6.48GHz and 8.64GHz + * @IEEE80211_EDMG_BW_CONFIG_8: 2.16GHz and 2.16GHz + 2.16GHz + * @IEEE80211_EDMG_BW_CONFIG_9: 2.16GHz, 4.32GHz and 2.16GHz + 2.16GHz + * @IEEE80211_EDMG_BW_CONFIG_10: 2.16GHz, 4.32GHz, 6.48GHz and 2.16GHz+2.16GHz + * @IEEE80211_EDMG_BW_CONFIG_11: 2.16GHz, 4.32GHz, 6.48GHz, 8.64GHz and + * 2.16GHz+2.16GHz + * @IEEE80211_EDMG_BW_CONFIG_12: 2.16GHz, 2.16GHz + 2.16GHz and + * 4.32GHz + 4.32GHz + * @IEEE80211_EDMG_BW_CONFIG_13: 2.16GHz, 4.32GHz, 2.16GHz + 2.16GHz and + * 4.32GHz + 4.32GHz + * @IEEE80211_EDMG_BW_CONFIG_14: 2.16GHz, 4.32GHz, 6.48GHz, 2.16GHz + 2.16GHz + * and 4.32GHz + 4.32GHz + * @IEEE80211_EDMG_BW_CONFIG_15: 2.16GHz, 4.32GHz, 6.48GHz, 8.64GHz, + * 2.16GHz + 2.16GHz and 4.32GHz + 4.32GHz + */ +enum ieee80211_edmg_bw_config { + IEEE80211_EDMG_BW_CONFIG_4 = 4, + IEEE80211_EDMG_BW_CONFIG_5 = 5, + IEEE80211_EDMG_BW_CONFIG_6 = 6, + IEEE80211_EDMG_BW_CONFIG_7 = 7, + IEEE80211_EDMG_BW_CONFIG_8 = 8, + IEEE80211_EDMG_BW_CONFIG_9 = 9, + IEEE80211_EDMG_BW_CONFIG_10 = 10, + IEEE80211_EDMG_BW_CONFIG_11 = 11, + IEEE80211_EDMG_BW_CONFIG_12 = 12, + IEEE80211_EDMG_BW_CONFIG_13 = 13, + IEEE80211_EDMG_BW_CONFIG_14 = 14, + IEEE80211_EDMG_BW_CONFIG_15 = 15, +}; + +/** + * struct ieee80211_edmg - EDMG configuration + * + * This structure describes most essential parameters needed + * to describe 802.11ay EDMG configuration + * + * @channels: bitmap that indicates the 2.16 GHz channel(s) + * that are allowed to be used for transmissions.
+ * Bit 0 indicates channel 1, bit 1 indicates channel 2, etc. + * Set to 0 indicate EDMG not supported. + * @bw_config: Channel BW Configuration subfield encodes + * the allowed channel bandwidth configurations + */ +struct ieee80211_edmg { + u8 channels; + enum ieee80211_edmg_bw_config bw_config; +}; + /** * struct ieee80211_supported_band - frequency band definition * @@ -346,6 +400,7 @@ struct ieee80211_sband_iftype_data { * @n_bitrates: Number of bitrates in @bitrates * @ht_cap: HT capabilities in this band * @vht_cap: VHT capabilities in this band + * @edmg_cap: EDMG capabilities in this band * @n_iftype_data: number of iftype data entries * @iftype_data: interface type data entries. Note that the bits in * @types_mask inside this structure cannot overlap (i.e. only @@ -360,6 +415,7 @@ struct ieee80211_supported_band { int n_bitrates; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; + struct ieee80211_edmg edmg_cap; u16 n_iftype_data; const struct ieee80211_sband_iftype_data *iftype_data; }; @@ -527,12 +583,17 @@ struct key_params { * @center_freq1: center frequency of first segment * @center_freq2: center frequency of second segment * (only with 80+80 MHz) + * @edmg: define the EDMG channels configuration. + * If edmg is requested (i.e. the .channels member is non-zero), + * chan will define the primary channel and all other + * parameters are ignored. */ struct cfg80211_chan_def { struct ieee80211_channel *chan; enum nl80211_chan_width width; u32 center_freq1; u32 center_freq2; + struct ieee80211_edmg edmg; }; /** @@ -590,6 +651,19 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1, chandef1->center_freq2 == chandef2->center_freq2); } +/** + * cfg80211_chandef_is_edmg - check if chandef represents an EDMG channel + * + * @chandef: the channel definition + * + * Return: %true if EDMG defined, %false otherwise. + */ +static inline bool +cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef) +{ + return chandef->edmg.channels || chandef->edmg.bw_config; +} + /** * cfg80211_chandef_compatible - check if two channel definitions are compatible * @chandef1: first channel definition @@ -1177,15 +1251,17 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_MCS: mcs field filled with HT MCS * @RATE_INFO_FLAGS_VHT_MCS: mcs field filled with VHT MCS * @RATE_INFO_FLAGS_SHORT_GI: 400ns guard interval - * @RATE_INFO_FLAGS_60G: 60GHz MCS + * @RATE_INFO_FLAGS_DMG: 60GHz MCS * @RATE_INFO_FLAGS_HE_MCS: HE MCS information + * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), RATE_INFO_FLAGS_VHT_MCS = BIT(1), RATE_INFO_FLAGS_SHORT_GI = BIT(2), - RATE_INFO_FLAGS_60G = BIT(3), + RATE_INFO_FLAGS_DMG = BIT(3), RATE_INFO_FLAGS_HE_MCS = BIT(4), + RATE_INFO_FLAGS_EDMG = BIT(5), }; /** @@ -1225,6 +1301,7 @@ enum rate_info_bw { * @he_dcm: HE DCM value * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc, * only valid if bw is %RATE_INFO_BW_HE_RU) + * @n_bonded_ch: In case of EDMG the number of bonded channels (1-4) */ struct rate_info { u8 flags; @@ -1235,6 +1312,7 @@ struct rate_info { u8 he_gi; u8 he_dcm; u8 he_ru_alloc; + u8 n_bonded_ch; }; /** @@ -2438,6 +2516,9 @@ struct cfg80211_bss_selection { * @fils_erp_rrk_len: Length of @fils_erp_rrk in octets. * @want_1x: indicates user-space supports and wants to use 802.1X driver * offload of 4-way handshake. + * @edmg: define the EDMG channels. 
+ * This may specify multiple channels and bonding options for the driver + * to choose from, based on BSS configuration. */ struct cfg80211_connect_params { struct ieee80211_channel *channel; @@ -2471,6 +2552,7 @@ struct cfg80211_connect_params { const u8 *fils_erp_rrk; size_t fils_erp_rrk_len; bool want_1x; + struct ieee80211_edmg edmg; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 04b58295c8d9..bf7c4222f512 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -52,6 +52,11 @@ #define NL80211_MULTICAST_GROUP_NAN "nan" #define NL80211_MULTICAST_GROUP_TESTMODE "testmode" +#define NL80211_EDMG_BW_CONFIG_MIN 4 +#define NL80211_EDMG_BW_CONFIG_MAX 15 +#define NL80211_EDMG_CHANNELS_MIN 1 +#define NL80211_EDMG_CHANNELS_MAX 0x3c /* 0b00111100 */ + /** * DOC: Station handling * @@ -2361,6 +2366,13 @@ enum nl80211_commands { * @NL80211_ATTR_HE_OBSS_PD: nested attribute for OBSS Packet Detection * functionality. * + * @NL80211_ATTR_WIPHY_EDMG_CHANNELS: bitmap that indicates the 2.16 GHz + * channel(s) that are allowed to be used for EDMG transmissions. + * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251. (u8 attribute) + * @NL80211_ATTR_WIPHY_EDMG_BW_CONFIG: Channel BW Configuration subfield encodes + * the allowed channel bandwidth configurations. (u8 attribute) + * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2820,6 +2832,9 @@ enum nl80211_attrs { NL80211_ATTR_HE_OBSS_PD, + NL80211_ATTR_WIPHY_EDMG_CHANNELS, + NL80211_ATTR_WIPHY_EDMG_BW_CONFIG, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3431,6 +3446,12 @@ enum nl80211_band_iftype_attr { * @NL80211_BAND_ATTR_VHT_CAPA: VHT capabilities, as in the HT information IE * @NL80211_BAND_ATTR_IFTYPE_DATA: nested array attribute, with each entry using * attributes from &enum nl80211_band_iftype_attr + * @NL80211_BAND_ATTR_EDMG_CHANNELS: bitmap that indicates the 2.16 GHz + * channel(s) that are allowed to be used for EDMG transmissions. + * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251. + * @NL80211_BAND_ATTR_EDMG_BW_CONFIG: Channel BW Configuration subfield encodes + * the allowed channel bandwidth configurations. + * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13. 
* @NL80211_BAND_ATTR_MAX: highest band attribute currently defined * @__NL80211_BAND_ATTR_AFTER_LAST: internal use */ @@ -3448,6 +3469,9 @@ enum nl80211_band_attr { NL80211_BAND_ATTR_VHT_CAPA, NL80211_BAND_ATTR_IFTYPE_DATA, + NL80211_BAND_ATTR_EDMG_CHANNELS, + NL80211_BAND_ATTR_EDMG_BW_CONFIG, + /* keep last */ __NL80211_BAND_ATTR_AFTER_LAST, NL80211_BAND_ATTR_MAX = __NL80211_BAND_ATTR_AFTER_LAST - 1 diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 641876982ab9..6471f552a942 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -158,10 +158,10 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + memset(chandef, 0, sizeof(struct cfg80211_chan_def)); chandef->chan = channel; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = channel->center_freq; - chandef->center_freq2 = 0; if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index f88f94d1f177..ab8ba5835ca0 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -262,7 +262,8 @@ static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info, /* IEEE80211_RADIOTAP_RATE rate */ if (status && status->rate && !(status->rate->flags & (RATE_INFO_FLAGS_MCS | - RATE_INFO_FLAGS_60G | + RATE_INFO_FLAGS_DMG | + RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) len += 2; @@ -329,7 +330,8 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, if (status && status->rate) { if (!(status->rate->flags & (RATE_INFO_FLAGS_MCS | - RATE_INFO_FLAGS_60G | + RATE_INFO_FLAGS_DMG | + RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) legacy_rate = status->rate->legacy; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 7c9d204838d4..e851cafd8e2f 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -14,6 +14,11 @@ #include "core.h" #include "rdev-ops.h" +static bool cfg80211_valid_60g_freq(u32 freq) +{ + return freq >= 58320 && freq <= 70200; +} + void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, struct ieee80211_channel *chan, enum nl80211_channel_type chan_type) @@ -23,6 +28,8 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, chandef->chan = chan; chandef->center_freq2 = 0; + chandef->edmg.bw_config = 0; + chandef->edmg.channels = 0; switch (chan_type) { case NL80211_CHAN_NO_HT: @@ -47,6 +54,91 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, } EXPORT_SYMBOL(cfg80211_chandef_create); +static bool cfg80211_edmg_chandef_valid(const struct cfg80211_chan_def *chandef) +{ + int max_contiguous = 0; + int num_of_enabled = 0; + int contiguous = 0; + int i; + + if (!chandef->edmg.channels || !chandef->edmg.bw_config) + return false; + + if (!cfg80211_valid_60g_freq(chandef->chan->center_freq)) + return false; + + for (i = 0; i < 6; i++) { + if (chandef->edmg.channels & BIT(i)) { + contiguous++; + num_of_enabled++; + } else { + contiguous = 0; + } + + max_contiguous = max(contiguous, max_contiguous); + } + /* basic verification of edmg configuration according to + * IEEE P802.11ay/D4.0 section 9.4.2.251 + */ + /* check bw_config against contiguous edmg channels */ + switch (chandef->edmg.bw_config) { + case IEEE80211_EDMG_BW_CONFIG_4: + case IEEE80211_EDMG_BW_CONFIG_8: + case IEEE80211_EDMG_BW_CONFIG_12: + if (max_contiguous < 1) + return false; + break; + case 
IEEE80211_EDMG_BW_CONFIG_5: + case IEEE80211_EDMG_BW_CONFIG_9: + case IEEE80211_EDMG_BW_CONFIG_13: + if (max_contiguous < 2) + return false; + break; + case IEEE80211_EDMG_BW_CONFIG_6: + case IEEE80211_EDMG_BW_CONFIG_10: + case IEEE80211_EDMG_BW_CONFIG_14: + if (max_contiguous < 3) + return false; + break; + case IEEE80211_EDMG_BW_CONFIG_7: + case IEEE80211_EDMG_BW_CONFIG_11: + case IEEE80211_EDMG_BW_CONFIG_15: + if (max_contiguous < 4) + return false; + break; + + default: + return false; + } + + /* check bw_config against aggregated (non contiguous) edmg channels */ + switch (chandef->edmg.bw_config) { + case IEEE80211_EDMG_BW_CONFIG_4: + case IEEE80211_EDMG_BW_CONFIG_5: + case IEEE80211_EDMG_BW_CONFIG_6: + case IEEE80211_EDMG_BW_CONFIG_7: + break; + case IEEE80211_EDMG_BW_CONFIG_8: + case IEEE80211_EDMG_BW_CONFIG_9: + case IEEE80211_EDMG_BW_CONFIG_10: + case IEEE80211_EDMG_BW_CONFIG_11: + if (num_of_enabled < 2) + return false; + break; + case IEEE80211_EDMG_BW_CONFIG_12: + case IEEE80211_EDMG_BW_CONFIG_13: + case IEEE80211_EDMG_BW_CONFIG_14: + case IEEE80211_EDMG_BW_CONFIG_15: + if (num_of_enabled < 4 || max_contiguous < 2) + return false; + break; + default: + return false; + } + + return true; +} + bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) { u32 control_freq; @@ -112,6 +204,10 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) return false; } + if (cfg80211_chandef_is_edmg(chandef) && + !cfg80211_edmg_chandef_valid(chandef)) + return false; + return true; } EXPORT_SYMBOL(cfg80211_chandef_valid); @@ -721,12 +817,66 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, return true; } +/* check if the operating channels are valid and supported */ +static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels, + enum ieee80211_edmg_bw_config edmg_bw_config, + int primary_channel, + struct ieee80211_edmg *edmg_cap) +{ + struct ieee80211_channel *chan; + int i, freq; + int channels_counter = 0; + + if (!edmg_channels && !edmg_bw_config) + return true; + + if ((!edmg_channels && edmg_bw_config) || + (edmg_channels && !edmg_bw_config)) + return false; + + if (!(edmg_channels & BIT(primary_channel - 1))) + return false; + + /* 60GHz channels 1..6 */ + for (i = 0; i < 6; i++) { + if (!(edmg_channels & BIT(i))) + continue; + + if (!(edmg_cap->channels & BIT(i))) + return false; + + channels_counter++; + + freq = ieee80211_channel_to_frequency(i + 1, + NL80211_BAND_60GHZ); + chan = ieee80211_get_channel(wiphy, freq); + if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) + return false; + } + + /* IEEE802.11 allows max 4 channels */ + if (channels_counter > 4) + return false; + + /* check bw_config is a subset of what driver supports + * (see IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13) + */ + if ((edmg_bw_config % 4) > (edmg_cap->bw_config % 4)) + return false; + + if (edmg_bw_config > edmg_cap->bw_config) + return false; + + return true; +} + bool cfg80211_chandef_usable(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, u32 prohibited_flags) { struct ieee80211_sta_ht_cap *ht_cap; struct ieee80211_sta_vht_cap *vht_cap; + struct ieee80211_edmg *edmg_cap; u32 width, control_freq, cap; if (WARN_ON(!cfg80211_chandef_valid(chandef))) @@ -734,6 +884,15 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, ht_cap = &wiphy->bands[chandef->chan->band]->ht_cap; vht_cap = &wiphy->bands[chandef->chan->band]->vht_cap; + edmg_cap = &wiphy->bands[chandef->chan->band]->edmg_cap; + + if (edmg_cap->channels && + 
!cfg80211_edmg_usable(wiphy, + chandef->edmg.channels, + chandef->edmg.bw_config, + chandef->chan->hw_value, + edmg_cap)) + return false; control_freq = chandef->chan->center_freq; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cacd96704647..4565d7385884 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -298,6 +298,13 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY_FREQ] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_CHANNEL_TYPE] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_EDMG_CHANNELS] = NLA_POLICY_RANGE(NLA_U8, + NL80211_EDMG_CHANNELS_MIN, + NL80211_EDMG_CHANNELS_MAX), + [NL80211_ATTR_WIPHY_EDMG_BW_CONFIG] = NLA_POLICY_RANGE(NLA_U8, + NL80211_EDMG_BW_CONFIG_MIN, + NL80211_EDMG_BW_CONFIG_MAX), + [NL80211_ATTR_CHANNEL_WIDTH] = { .type = NLA_U32 }, [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 }, [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 }, @@ -1574,6 +1581,15 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, nla_nest_end(msg, nl_iftype_data); } + /* add EDMG info */ + if (sband->edmg_cap.channels && + (nla_put_u8(msg, NL80211_BAND_ATTR_EDMG_CHANNELS, + sband->edmg_cap.channels) || + nla_put_u8(msg, NL80211_BAND_ATTR_EDMG_BW_CONFIG, + sband->edmg_cap.bw_config))) + + return -ENOBUFS; + /* add bitrates */ nl_rates = nla_nest_start_noflag(msg, NL80211_BAND_ATTR_RATES); if (!nl_rates) @@ -2677,6 +2693,18 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); } + if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { + chandef->edmg.channels = + nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]); + + if (info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]) + chandef->edmg.bw_config = + nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]); + } else { + chandef->edmg.bw_config = 0; + chandef->edmg.channels = 0; + } + if (!cfg80211_chandef_valid(chandef)) { NL_SET_ERR_MSG(extack, "invalid channel definition"); return -EINVAL; @@ -9894,6 +9922,15 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { + connect.edmg.channels = + nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]); + + if (info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]) + connect.edmg.bw_config = + nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]); + } + if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) { connkeys = nl80211_parse_connkeys(rdev, info, NULL); if (IS_ERR(connkeys)) diff --git a/net/wireless/util.c b/net/wireless/util.c index 59ed0808c13e..c99939067bb0 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1043,7 +1043,7 @@ static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate) return (bitrate + 50000) / 100000; } -static u32 cfg80211_calculate_bitrate_60g(struct rate_info *rate) +static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ @@ -1090,6 +1090,40 @@ static u32 cfg80211_calculate_bitrate_60g(struct rate_info *rate) return __mcs2bitrate[rate->mcs]; } +static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) +{ + static const u32 __mcs2bitrate[] = { + /* control PHY */ + [0] = 275, + /* SC PHY */ + [1] = 3850, + [2] = 7700, + [3] = 9625, + [4] = 11550, + [5] = 12512, /* 1251.25 mbps */ + [6] = 13475, + [7] = 15400, + [8] = 19250, + [9] = 23100, + [10] = 25025, + [11] = 26950, + [12] = 30800, + [13] = 38500, + [14] = 46200, + [15] = 50050, + [16] = 
53900, + [17] = 57750, + [18] = 69300, + [19] = 75075, + [20] = 80850, + }; + + if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) + return 0; + + return __mcs2bitrate[rate->mcs] * rate->n_bonded_ch; +} + static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) { static const u32 base[4][10] = { @@ -1262,8 +1296,10 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) return cfg80211_calculate_bitrate_ht(rate); - if (rate->flags & RATE_INFO_FLAGS_60G) - return cfg80211_calculate_bitrate_60g(rate); + if (rate->flags & RATE_INFO_FLAGS_DMG) + return cfg80211_calculate_bitrate_dmg(rate); + if (rate->flags & RATE_INFO_FLAGS_EDMG) + return cfg80211_calculate_bitrate_edmg(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); if (rate->flags & RATE_INFO_FLAGS_HE_MCS) -- cgit v1.2.3-59-g8ed1b From 8050a395112db5bedb7caa6cb673e711a3c04cd9 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 21 Aug 2019 00:08:58 +0100 Subject: bpf: fix 'struct pt_reg' typo in documentation There is no 'struct pt_reg'. Signed-off-by: Peter Wu Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8aa6126f0b6e..267544e140be 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1018,7 +1018,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1080,7 +1080,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags) + * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1729,7 +1729,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_override_return(struct pt_reg *regs, u64 rc) + * int bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. -- cgit v1.2.3-59-g8ed1b From 55c33dfbeb831eb3ab7cc1a3e295b0d4d57f23a3 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 21 Aug 2019 00:08:59 +0100 Subject: bpf: clarify when bpf_trace_printk discards lines I opened /sys/kernel/tracing/trace once and kept reading from it. bpf_trace_printk somehow did not seem to work, no entries were appended to that trace file. It turns out that tracing is disabled when that file is open. Save the next person some time and document this. The trace file is described in Documentation/trace/ftrace.rst, however the implication "tracing is disabled" did not immediate translate to "bpf_trace_printk silently discards entries". 
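
For illustration (not part of the patch): a user-space reader that consumes trace_pipe instead of keeping the trace file open sees bpf_trace_printk() lines as a stream and never leaves tracing disabled. A minimal sketch in C:

        /* Sketch only: read bpf_trace_printk() output from trace_pipe.
         * Unlike holding "trace" open, trace_pipe consumes entries as a
         * stream, so nothing is silently discarded.
         */
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[4096];
                ssize_t n;
                int fd = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY);

                if (fd < 0) {
                        perror("open trace_pipe");
                        return 1;
                }
                /* Blocks until new lines arrive, then echoes them. */
                while ((n = read(fd, buf, sizeof(buf))) > 0)
                        fwrite(buf, 1, n, stdout);
                close(fd);
                return 0;
        }
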
Signed-off-by: Peter Wu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 267544e140be..b5889257cc33 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -580,6 +580,8 @@ union bpf_attr { * limited to five). * * Each time the helper is called, it appends a line to the trace. + * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is + * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. * The format of the trace is customizable, and the exact output * one will get depends on the options set in * *\/sys/kernel/debug/tracing/trace_options* (see also the -- cgit v1.2.3-59-g8ed1b From bd1200b79510a68554890af2f48d92be6eb1daf8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 23 Aug 2019 18:47:21 +0300 Subject: drop_monitor: Make timestamps y2038 safe Timestamps are currently communicated to user space as 'struct timespec', which is not considered y2038 safe since it uses a 32-bit signed value for seconds. Fix this while the API is still not part of any official kernel release by using 64-bit nanoseconds timestamps instead. Fixes: ca30707dee2b ("drop_monitor: Add packet alert mode") Fixes: 5e58109b1ea4 ("drop_monitor: Add support for packet alert mode for hardware drops") Signed-off-by: Ido Schimmel Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/net_dropmon.h | 2 +- net/core/drop_monitor.c | 14 ++++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 75a35dccb675..8bf79a9eb234 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -75,7 +75,7 @@ enum net_dm_attr { NET_DM_ATTR_PC, /* u64 */ NET_DM_ATTR_SYMBOL, /* string */ NET_DM_ATTR_IN_PORT, /* nested */ - NET_DM_ATTR_TIMESTAMP, /* struct timespec */ + NET_DM_ATTR_TIMESTAMP, /* u64 */ NET_DM_ATTR_PROTO, /* u16 */ NET_DM_ATTR_PAYLOAD, /* binary */ NET_DM_ATTR_PAD, diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index bfc024024aa3..cc60cc22e2db 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -552,7 +552,7 @@ static size_t net_dm_packet_report_size(size_t payload_len) /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + /* NET_DM_ATTR_TIMESTAMP */ - nla_total_size(sizeof(struct timespec)) + + nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ @@ -592,7 +592,6 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc; char buf[NET_DM_MAX_SYMBOL_LEN]; struct nlattr *attr; - struct timespec ts; void *hdr; int rc; @@ -615,8 +614,8 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, if (rc) goto nla_put_failure; - if (ktime_to_timespec_cond(skb->tstamp, &ts) && - nla_put(msg, NET_DM_ATTR_TIMESTAMP, sizeof(ts), &ts)) + if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, + ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) @@ -716,7 +715,7 @@ net_dm_hw_packet_report_size(size_t payload_len, /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + /* NET_DM_ATTR_TIMESTAMP */ - nla_total_size(sizeof(struct timespec)) + + nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ 
@@ -730,7 +729,6 @@ static int net_dm_hw_packet_report_fill(struct sk_buff *msg, { struct net_dm_hw_metadata *hw_metadata; struct nlattr *attr; - struct timespec ts; void *hdr; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; @@ -761,8 +759,8 @@ static int net_dm_hw_packet_report_fill(struct sk_buff *msg, goto nla_put_failure; } - if (ktime_to_timespec_cond(skb->tstamp, &ts) && - nla_put(msg, NET_DM_ATTR_TIMESTAMP, sizeof(ts), &ts)) + if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, + ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) -- cgit v1.2.3-59-g8ed1b From 63d10e12b00dfc8d8387bea9eaab376881335731 Mon Sep 17 00:00:00 2001 From: Ander Juaristi Date: Sat, 17 Aug 2019 13:17:53 +0200 Subject: netfilter: nft_meta: support for time matching This patch introduces meta matches in the kernel for time (a UNIX timestamp), day (a day of week, represented as an integer between 0-6), and hour (an hour in the current day, or: number of seconds since midnight). All values are taken as unsigned 64-bit integers. The 'time' keyword is internally converted to nanoseconds by nft in userspace, and hence the timestamp is taken in nanoseconds as well. Signed-off-by: Ander Juaristi Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 6 +++++ net/netfilter/nft_meta.c | 46 ++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 82abaa183fc3..b83b62eb4b01 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -799,6 +799,9 @@ enum nft_exthdr_attributes { * @NFT_META_OIFKIND: packet output interface kind name (dev->rtnl_link_ops->kind) * @NFT_META_BRI_IIFPVID: packet input bridge port pvid * @NFT_META_BRI_IIFVPROTO: packet input bridge vlan proto + * @NFT_META_TIME_NS: time since epoch (in nanoseconds) + * @NFT_META_TIME_DAY: day of week (from 0 = Sunday to 6 = Saturday) + * @NFT_META_TIME_HOUR: hour of day (in seconds) */ enum nft_meta_keys { NFT_META_LEN, @@ -831,6 +834,9 @@ enum nft_meta_keys { NFT_META_OIFKIND, NFT_META_BRI_IIFPVID, NFT_META_BRI_IIFVPROTO, + NFT_META_TIME_NS, + NFT_META_TIME_DAY, + NFT_META_TIME_HOUR, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index f69afb9ff3cb..317e3a9e8c5b 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -26,8 +26,36 @@ #include /* NF_BR_PRE_ROUTING */ +#define NFT_META_SECS_PER_MINUTE 60 +#define NFT_META_SECS_PER_HOUR 3600 +#define NFT_META_SECS_PER_DAY 86400 +#define NFT_META_DAYS_PER_WEEK 7 + static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); +static u8 nft_meta_weekday(unsigned long secs) +{ + unsigned int dse; + u8 wday; + + secs -= NFT_META_SECS_PER_MINUTE * sys_tz.tz_minuteswest; + dse = secs / NFT_META_SECS_PER_DAY; + wday = (4 + dse) % NFT_META_DAYS_PER_WEEK; + + return wday; +} + +static u32 nft_meta_hour(unsigned long secs) +{ + struct tm tm; + + time64_to_tm(secs, 0, &tm); + + return tm.tm_hour * NFT_META_SECS_PER_HOUR + + tm.tm_min * NFT_META_SECS_PER_MINUTE + + tm.tm_sec; +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -218,6 +246,15 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); break; + case NFT_META_TIME_NS: + nft_reg_store64(dest, ktime_get_real_ns()); + break; + case 
NFT_META_TIME_DAY: + nft_reg_store8(dest, nft_meta_weekday(get_seconds())); + break; + case NFT_META_TIME_HOUR: + *dest = nft_meta_hour(get_seconds()); + break; default: WARN_ON(1); goto err; @@ -330,6 +367,15 @@ int nft_meta_get_init(const struct nft_ctx *ctx, len = sizeof(u8); break; #endif + case NFT_META_TIME_NS: + len = sizeof(u64); + break; + case NFT_META_TIME_DAY: + len = sizeof(u8); + break; + case NFT_META_TIME_HOUR: + len = sizeof(u32); + break; default: return -EOPNOTSUPP; } -- cgit v1.2.3-59-g8ed1b From 65af4a10743b766e319fb53812c5926c6d98b100 Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Tue, 20 Aug 2019 15:11:46 +0200 Subject: netfilter: nfnetlink_log: add support for VLAN information Currently, there is no vlan information (e.g. when used with a vlan aware bridge) passed to userspache, HWHEADER will contain an 08 00 (ip) suffix even for tagged ip packets. Therefore, add an extra netlink attribute that passes the vlan information to userspace similarly to 15824ab29f for nfqueue. Signed-off-by: Michael Braun Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_log.h | 11 ++++++ net/netfilter/nfnetlink_log.c | 57 ++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nfnetlink_log.h b/include/uapi/linux/netfilter/nfnetlink_log.h index 20983cb195a0..45c8d3b027e0 100644 --- a/include/uapi/linux/netfilter/nfnetlink_log.h +++ b/include/uapi/linux/netfilter/nfnetlink_log.h @@ -33,6 +33,15 @@ struct nfulnl_msg_packet_timestamp { __aligned_be64 usec; }; +enum nfulnl_vlan_attr { + NFULA_VLAN_UNSPEC, + NFULA_VLAN_PROTO, /* __be16 skb vlan_proto */ + NFULA_VLAN_TCI, /* __be16 skb htons(vlan_tci) */ + __NFULA_VLAN_MAX, +}; + +#define NFULA_VLAN_MAX (__NFULA_VLAN_MAX + 1) + enum nfulnl_attr_type { NFULA_UNSPEC, NFULA_PACKET_HDR, @@ -54,6 +63,8 @@ enum nfulnl_attr_type { NFULA_HWLEN, /* hardware header length */ NFULA_CT, /* nf_conntrack_netlink.h */ NFULA_CT_INFO, /* enum ip_conntrack_info */ + NFULA_VLAN, /* nested attribute: packet vlan info */ + NFULA_L2HDR, /* full L2 header */ __NFULA_MAX }; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index d69e1863e536..0ba020ca38e6 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -385,6 +385,57 @@ nfulnl_timer(struct timer_list *t) instance_put(inst); } +static u32 nfulnl_get_bridge_size(const struct sk_buff *skb) +{ + u32 size = 0; + + if (!skb_mac_header_was_set(skb)) + return 0; + + if (skb_vlan_tag_present(skb)) { + size += nla_total_size(0); /* nested */ + size += nla_total_size(sizeof(u16)); /* id */ + size += nla_total_size(sizeof(u16)); /* tag */ + } + + if (skb->network_header > skb->mac_header) + size += nla_total_size(skb->network_header - skb->mac_header); + + return size; +} + +static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb) +{ + if (!skb_mac_header_was_set(skb)) + return 0; + + if (skb_vlan_tag_present(skb)) { + struct nlattr *nest; + + nest = nla_nest_start(inst->skb, NFULA_VLAN); + if (!nest) + goto nla_put_failure; + + if (nla_put_be16(inst->skb, NFULA_VLAN_TCI, htons(skb->vlan_tci)) || + nla_put_be16(inst->skb, NFULA_VLAN_PROTO, skb->vlan_proto)) + goto nla_put_failure; + + nla_nest_end(inst->skb, nest); + } + + if (skb->mac_header < skb->network_header) { + int len = (int)(skb->network_header - skb->mac_header); + + if (nla_put(inst->skb, NFULA_L2HDR, len, skb_mac_header(skb))) + goto 
nla_put_failure; + } + + return 0; + +nla_put_failure: + return -1; +} + /* This is an inline function, we don't really care about a long * list of arguments */ static inline int @@ -580,6 +631,10 @@ __build_packet_message(struct nfnl_log_net *log, NFULA_CT, NFULA_CT_INFO) < 0) goto nla_put_failure; + if ((pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) && + nfulnl_put_bridge(inst, skb) < 0) + goto nla_put_failure; + if (data_len) { struct nlattr *nla; int size = nla_attr_size(data_len); @@ -687,6 +742,8 @@ nfulnl_log_packet(struct net *net, size += nfnl_ct->build_size(ct); } } + if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) + size += nfulnl_get_bridge_size(skb); qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ -- cgit v1.2.3-59-g8ed1b From d0a8d877da976c244092ce859683b2fa116217db Mon Sep 17 00:00:00 2001 From: Ander Juaristi Date: Sat, 17 Aug 2019 13:26:52 +0200 Subject: netfilter: nft_dynset: support for element deletion This patch implements the delete operation from the ruleset. It implements a new delete() function in nft_set_rhash. It is simpler to use than the already existing remove(), because it only takes the set and the key as arguments, whereas remove() expects a full nft_set_elem structure. Signed-off-by: Ander Juaristi Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 10 +++++++++- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nft_dynset.c | 6 ++++++ net/netfilter/nft_set_hash.c | 19 +++++++++++++++++++ 4 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 64765140657b..498665158ee0 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -302,17 +302,23 @@ struct nft_expr; * struct nft_set_ops - nf_tables set operations * * @lookup: look up an element within the set + * @update: update an element if exists, add it if doesn't exist + * @delete: delete an element * @insert: insert new element into set * @activate: activate new element in the next generation * @deactivate: lookup for element and deactivate it in the next generation * @flush: deactivate element in the next generation * @remove: remove element from set - * @walk: iterate over all set elemeennts + * @walk: iterate over all set elements * @get: get set elements * @privsize: function to return size of set private data * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @elemsize: element private size + * + * Operations lookup, update and delete have simpler interfaces, are faster + * and currently only used in the packet path. All the rest are slower, + * control plane functions. 
*/ struct nft_set_ops { bool (*lookup)(const struct net *net, @@ -327,6 +333,8 @@ struct nft_set_ops { const struct nft_expr *expr, struct nft_regs *regs, const struct nft_set_ext **ext); + bool (*delete)(const struct nft_set *set, + const u32 *key); int (*insert)(const struct net *net, const struct nft_set *set, diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b83b62eb4b01..0ff932dadc8e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -636,6 +636,7 @@ enum nft_lookup_attributes { enum nft_dynset_ops { NFT_DYNSET_OP_ADD, NFT_DYNSET_OP_UPDATE, + NFT_DYNSET_OP_DELETE, }; enum nft_dynset_flags { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 33833a0cb989..8887295414dc 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -84,6 +84,11 @@ void nft_dynset_eval(const struct nft_expr *expr, const struct nft_expr *sexpr; u64 timeout; + if (priv->op == NFT_DYNSET_OP_DELETE) { + set->ops->delete(set, ®s->data[priv->sreg_key]); + return; + } + if (set->ops->update(set, ®s->data[priv->sreg_key], nft_dynset_new, expr, regs, &ext)) { sexpr = NULL; @@ -161,6 +166,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); switch (priv->op) { case NFT_DYNSET_OP_ADD: + case NFT_DYNSET_OP_DELETE: break; case NFT_DYNSET_OP_UPDATE: if (!(set->flags & NFT_SET_TIMEOUT)) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index c490451fcebf..b331a3c9a3a8 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -234,6 +234,24 @@ static void nft_rhash_remove(const struct net *net, rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); } +static bool nft_rhash_delete(const struct nft_set *set, + const u32 *key) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_cmp_arg arg = { + .genmask = NFT_GENMASK_ANY, + .set = set, + .key = key, + }; + struct nft_rhash_elem *he; + + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); + if (he == NULL) + return false; + + return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; +} + static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { @@ -662,6 +680,7 @@ struct nft_set_type nft_set_rhash_type __read_mostly = { .remove = nft_rhash_remove, .lookup = nft_rhash_lookup, .update = nft_rhash_update, + .delete = nft_rhash_delete, .walk = nft_rhash_walk, .get = nft_rhash_get, }, -- cgit v1.2.3-59-g8ed1b From 10d274e880eb208ec6a76261a9f8f8155020f771 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 22 Aug 2019 22:52:12 -0700 Subject: bpf: introduce verifier internal test flag Introduce BPF_F_TEST_STATE_FREQ flag to stress test parentage chain and state pruning. 
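
For illustration (not part of the patch), a minimal sketch of how a privileged loader would pass the new flag; the trivial two-instruction program and the raw bpf(2) call are only there to keep the example self-contained:

        #include <linux/bpf.h>
        #include <stdint.h>
        #include <string.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /* Sketch only: load "r0 = 0; exit" with BPF_F_TEST_STATE_FREQ set in
         * prog_flags. The verifier only honours the flag for privileged loads.
         */
        static int load_with_state_freq(void)
        {
                struct bpf_insn insns[] = {
                        { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
                        { .code = BPF_JMP | BPF_EXIT },
                };
                union bpf_attr attr;

                memset(&attr, 0, sizeof(attr));
                attr.prog_type  = BPF_PROG_TYPE_SOCKET_FILTER;
                attr.insns      = (uint64_t)(unsigned long)insns;
                attr.insn_cnt   = sizeof(insns) / sizeof(insns[0]);
                attr.license    = (uint64_t)(unsigned long)"GPL";
                /* force state checkpointing at every instruction */
                attr.prog_flags = BPF_F_TEST_STATE_FREQ;

                return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
        }
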
Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + include/uapi/linux/bpf.h | 3 +++ kernel/bpf/syscall.c | 1 + kernel/bpf/verifier.c | 5 ++++- 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5fe99f322b1c..26a6d58ca78c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -355,6 +355,7 @@ struct bpf_verifier_env { struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ + bool test_state_freq; /* test verifier with different pruning frequency */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b5889257cc33..5d2fb183ee2d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -285,6 +285,9 @@ enum bpf_attach_type { */ #define BPF_F_TEST_RND_HI32 (1U << 2) +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_STATE_FREQ (1U << 3) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c0f62fd67c6b..ca60eafa6922 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1629,6 +1629,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_STATE_FREQ | BPF_F_TEST_RND_HI32)) return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 10c0ff93f52b..f340cfe53c6e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7222,7 +7222,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - bool add_new_state = false; + bool add_new_state = env->test_state_freq ? true : false; cur->last_insn_idx = env->prev_insn_idx; if (!env->insn_aux_data[insn_idx].prune_point) @@ -9262,6 +9262,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->allow_ptr_leaks = is_priv; + if (is_priv) + env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3-59-g8ed1b From d5886b919a720ff859aebf569cb0f353b1d977a6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 26 Aug 2019 16:30:04 +0800 Subject: sctp: allow users to set ep ecn flag by sockopt SCTP_ECN_SUPPORTED sockopt will be added to allow users to change ep ecn flag, and it's similar with other feature flags. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 62527aca8477..6d5b164af55c 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -136,6 +136,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_EVENT 127 #define SCTP_ASCONF_SUPPORTED 128 #define SCTP_AUTH_SUPPORTED 129 +#define SCTP_ECN_SUPPORTED 130 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 82bc25223cfe..3e50a9712fb1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4560,6 +4560,34 @@ out: return retval; } +static int sctp_setsockopt_ecn_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + goto out; + + sctp_sk(sk)->ep->ecn_enable = !!params.assoc_value; + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4766,6 +4794,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_AUTH_SUPPORTED: retval = sctp_setsockopt_auth_supported(sk, optval, optlen); break; + case SCTP_ECN_SUPPORTED: + retval = sctp_setsockopt_ecn_supported(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7828,6 +7859,45 @@ out: return retval; } +static int sctp_getsockopt_ecn_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = asoc ? asoc->peer.ecn_capable + : sctp_sk(sk)->ep->ecn_enable; + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -8037,6 +8107,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_auth_supported(sk, len, optval, optlen); break; + case SCTP_ECN_SUPPORTED: + retval = sctp_getsockopt_ecn_supported(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3-59-g8ed1b From c8cd6e7f159e6f8d79a23df4aeaa7a540415951b Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 28 Aug 2019 12:20:42 +0200 Subject: cfg80211: add local BSS receive time to survey information This is useful for checking how much airtime is being used up by other transmissions on the channel, e.g. 
by calculating (time_rx - time_bss_rx) or (time_busy - time_bss_rx - time_tx) Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20190828102042.58016-1-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 4 ++++ 3 files changed, 11 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5253e7f667bd..ff45c3e1abff 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -768,6 +768,7 @@ ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef) * @SURVEY_INFO_TIME_RX: receive time was filled in * @SURVEY_INFO_TIME_TX: transmit time was filled in * @SURVEY_INFO_TIME_SCAN: scan time was filled in + * @SURVEY_INFO_TIME_BSS_RX: local BSS receive time was filled in * * Used by the driver to indicate which info in &struct survey_info * it has filled in during the get_survey(). @@ -781,6 +782,7 @@ enum survey_info_flags { SURVEY_INFO_TIME_RX = BIT(5), SURVEY_INFO_TIME_TX = BIT(6), SURVEY_INFO_TIME_SCAN = BIT(7), + SURVEY_INFO_TIME_BSS_RX = BIT(8), }; /** @@ -797,6 +799,7 @@ enum survey_info_flags { * @time_rx: amount of time the radio spent receiving data * @time_tx: amount of time the radio spent transmitting data * @time_scan: amount of time the radio spent for scanning + * @time_bss_rx: amount of time the radio spent receiving data on a local BSS * * Used by dump_survey() to report back per-channel survey information. * @@ -811,6 +814,7 @@ struct survey_info { u64 time_rx; u64 time_tx; u64 time_scan; + u64 time_bss_rx; u32 filled; s8 noise; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index bf7c4222f512..beee59c831a7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3870,6 +3870,8 @@ enum nl80211_user_reg_hint_type { * @NL80211_SURVEY_INFO_TIME_SCAN: time the radio spent for scan * (on this channel or globally) * @NL80211_SURVEY_INFO_PAD: attribute used for padding for 64-bit alignment + * @NL80211_SURVEY_INFO_TIME_BSS_RX: amount of time the radio spent + * receiving frames destined to the local BSS * @NL80211_SURVEY_INFO_MAX: highest survey info attribute number * currently defined * @__NL80211_SURVEY_INFO_AFTER_LAST: internal use @@ -3886,6 +3888,7 @@ enum nl80211_survey_info { NL80211_SURVEY_INFO_TIME_TX, NL80211_SURVEY_INFO_TIME_SCAN, NL80211_SURVEY_INFO_PAD, + NL80211_SURVEY_INFO_TIME_BSS_RX, /* keep last */ __NL80211_SURVEY_INFO_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8a6cef949210..3e30e18d1d89 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8806,6 +8806,10 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq, nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_SCAN, survey->time_scan, NL80211_SURVEY_INFO_PAD)) goto nla_put_failure; + if ((survey->filled & SURVEY_INFO_TIME_BSS_RX) && + nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_BSS_RX, + survey->time_bss_rx, NL80211_SURVEY_INFO_PAD)) + goto nla_put_failure; nla_nest_end(msg, infoattr); -- cgit v1.2.3-59-g8ed1b From c05cd3645814724bdeb32a2b4d953b12bdea5f8c Mon Sep 17 00:00:00 2001 From: Kevin Laatz Date: Tue, 27 Aug 2019 02:25:22 +0000 Subject: xsk: add support to allow unaligned chunk placement Currently, addresses are chunk size aligned. This means, we are very restricted in terms of where we can place chunk within the umem. For example, if we have a chunk size of 2k, then our chunks can only be placed at 0,2k,4k,6k,8k... 
and so on (ie. every 2k starting from 0). This patch introduces the ability to use unaligned chunks. With these changes, we are no longer bound to having to place chunks at a 2k (or whatever your chunk size is) interval. Since we are no longer dealing with aligned chunks, they can now cross page boundaries. Checks for page contiguity have been added in order to keep track of which pages are followed by a physically contiguous page. Signed-off-by: Kevin Laatz Signed-off-by: Ciara Loftus Signed-off-by: Bruce Richardson Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 75 ++++++++++++++++++++++++++++++++++-- include/uapi/linux/if_xdp.h | 9 +++++ net/xdp/xdp_umem.c | 19 ++++++--- net/xdp/xsk.c | 94 ++++++++++++++++++++++++++++++++++++--------- net/xdp/xsk_diag.c | 2 +- net/xdp/xsk_queue.h | 70 +++++++++++++++++++++++++++++---- 6 files changed, 233 insertions(+), 36 deletions(-) (limited to 'include/uapi') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index f023b9940d64..c9398ce7960f 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -16,6 +16,13 @@ struct net_device; struct xsk_queue; +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT) + struct xdp_umem_page { void *addr; dma_addr_t dma; @@ -27,8 +34,12 @@ struct xdp_umem_fq_reuse { u64 handles[]; }; -/* Flags for the umem flags field. */ -#define XDP_UMEM_USES_NEED_WAKEUP (1 << 0) +/* Flags for the umem flags field. + * + * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public + * flags. See inlude/uapi/include/linux/if_xdp.h. + */ +#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1) struct xdp_umem { struct xsk_queue *fq; @@ -124,14 +135,36 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, int xsk_map_inc(struct xsk_map *map); void xsk_map_put(struct xsk_map *map); +static inline u64 xsk_umem_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static inline u64 xsk_umem_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static inline u64 xsk_umem_add_offset_to_addr(u64 addr) +{ + return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); +} + static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { - return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); + unsigned long page_addr; + + addr = xsk_umem_add_offset_to_addr(addr); + page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; + + return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); } static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) { - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); + addr = xsk_umem_add_offset_to_addr(addr); + + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); } /* Reuse-queue aware version of FILL queue helpers */ @@ -172,6 +205,19 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) rq->handles[rq->length++] = addr; } + +/* Handle the offset appropriately depending on aligned or unaligned mode. + * For unaligned mode, we store the offset in the upper 16-bits of the address. + * For aligned mode, we simply add the offset to the address. 
+ */ +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, + u64 offset) +{ + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) + return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + else + return address + offset; +} #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -241,6 +287,21 @@ static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, return NULL; } +static inline u64 xsk_umem_extract_addr(u64 addr) +{ + return 0; +} + +static inline u64 xsk_umem_extract_offset(u64 addr) +{ + return 0; +} + +static inline u64 xsk_umem_add_offset_to_addr(u64 addr) +{ + return 0; +} + static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { return NULL; @@ -290,6 +351,12 @@ static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) return false; } +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, + u64 offset) +{ + return 0; +} + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 62b80d57b72a..be328c59389d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -26,6 +26,9 @@ */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* Flags for xsk_umem_config flags */ +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) + struct sockaddr_xdp { __u16 sxdp_family; __u16 sxdp_flags; @@ -66,6 +69,7 @@ struct xdp_umem_reg { __u64 len; /* Length of packet data area */ __u32 chunk_size; __u32 headroom; + __u32 flags; }; struct xdp_statistics { @@ -87,6 +91,11 @@ struct xdp_options { #define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL +/* Masks for unaligned chunks mode */ +#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48 +#define XSK_UNALIGNED_BUF_ADDR_MASK \ + ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) + /* Rx/Tx descriptor */ struct xdp_desc { __u64 addr; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 2d65779282a1..e997b263a0dd 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -340,6 +340,7 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { + bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; u32 chunk_size = mr->chunk_size, headroom = mr->headroom; unsigned int chunks, chunks_per_page; u64 addr = mr->addr, size = mr->len; @@ -355,7 +356,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - if (!is_power_of_2(chunk_size)) + if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG | + XDP_UMEM_USES_NEED_WAKEUP)) + return -EINVAL; + + if (!unaligned_chunks && !is_power_of_2(chunk_size)) return -EINVAL; if (!PAGE_ALIGNED(addr)) { @@ -372,9 +377,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (chunks == 0) return -EINVAL; - chunks_per_page = PAGE_SIZE / chunk_size; - if (chunks < chunks_per_page || chunks % chunks_per_page) - return -EINVAL; + if (!unaligned_chunks) { + chunks_per_page = PAGE_SIZE / chunk_size; + if (chunks < chunks_per_page || chunks % chunks_per_page) + return -EINVAL; + } headroom = ALIGN(headroom, 64); @@ -383,13 +390,15 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; umem->address = (unsigned long)addr; - umem->chunk_mask = ~((u64)chunk_size - 1); + umem->chunk_mask = unaligned_chunks ? 
XSK_UNALIGNED_BUF_ADDR_MASK + : ~((u64)chunk_size - 1); umem->size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; + umem->flags = mr->flags; INIT_LIST_HEAD(&umem->xsk_list); spin_lock_init(&umem->xsk_list_lock); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ee4428a892fa..187fd157fcff 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL(xsk_umem_has_addrs); u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { - return xskq_peek_addr(umem->fq, addr); + return xskq_peek_addr(umem->fq, addr, umem); } EXPORT_SYMBOL(xsk_umem_peek_addr); @@ -115,21 +115,43 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); +/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for + * each page. This is only required in copy mode. + */ +static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, + u32 len, u32 metalen) +{ + void *to_buf = xdp_umem_get_data(umem, addr); + + addr = xsk_umem_add_offset_to_addr(addr); + if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) { + void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; + u64 page_start = addr & ~(PAGE_SIZE - 1); + u64 first_len = PAGE_SIZE - (addr - page_start); + + memcpy(to_buf, from_buf, first_len + metalen); + memcpy(next_pg_addr, from_buf + first_len, len - first_len); + + return; + } + + memcpy(to_buf, from_buf, len + metalen); +} + static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *to_buf, *from_buf; + u64 offset = xs->umem->headroom; + u64 addr, memcpy_addr; + void *from_buf; u32 metalen; - u64 addr; int err; - if (!xskq_peek_addr(xs->umem->fq, &addr) || + if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; } - addr += xs->umem->headroom; - if (unlikely(xdp_data_meta_unsupported(xdp))) { from_buf = xdp->data; metalen = 0; @@ -138,9 +160,11 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) metalen = xdp->data - xdp->data_meta; } - to_buf = xdp_umem_get_data(xs->umem, addr); - memcpy(to_buf, from_buf, len + metalen); - addr += metalen; + memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); + __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); + + offset += metalen; + addr = xsk_umem_adjust_offset(xs->umem, addr, offset); err = xskq_produce_batch_desc(xs->rx, addr, len); if (!err) { xskq_discard_addr(xs->umem->fq); @@ -185,6 +209,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 metalen = xdp->data - xdp->data_meta; u32 len = xdp->data_end - xdp->data; + u64 offset = xs->umem->headroom; void *buffer; u64 addr; int err; @@ -196,17 +221,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) goto out_unlock; } - if (!xskq_peek_addr(xs->umem->fq, &addr) || + if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { err = -ENOSPC; goto out_drop; } - addr += xs->umem->headroom; - + addr = xsk_umem_adjust_offset(xs->umem, addr, offset); buffer = xdp_umem_get_data(xs->umem, addr); memcpy(buffer, xdp->data_meta, len + metalen); - addr += metalen; + + addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); err = xskq_produce_batch_desc(xs->rx, addr, len); if (err) goto out_drop; @@ -250,7 +275,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) 
rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_list, list) { - if (!xskq_peek_desc(xs->tx, desc)) + if (!xskq_peek_desc(xs->tx, desc, umem)) continue; if (xskq_produce_addr_lazy(umem->cq, desc->addr)) @@ -304,7 +329,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - while (xskq_peek_desc(xs->tx, &desc)) { + while (xskq_peek_desc(xs->tx, &desc, xs->umem)) { char *buffer; u64 addr; u32 len; @@ -333,7 +358,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, skb->dev = xs->dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)addr; + skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); @@ -526,6 +551,24 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } +/* Check if umem pages are contiguous. + * If zero-copy mode, use the DMA address to do the page contiguity check + * For all other modes we use addr (kernel virtual address) + * Store the result in the low bits of addr. + */ +static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) +{ + struct xdp_umem_page *pgs = umem->pages; + int i, is_contig; + + for (i = 0; i < umem->npgs - 1; i++) { + is_contig = (flags & XDP_ZEROCOPY) ? + (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : + (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); + pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; + } +} + static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -616,6 +659,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) goto out_unlock; + + xsk_check_page_contiguity(xs->umem, flags); } xs->dev = dev; @@ -636,6 +681,13 @@ out_release: return err; } +struct xdp_umem_reg_v1 { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; +}; + static int xsk_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -673,10 +725,16 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, } case XDP_UMEM_REG: { - struct xdp_umem_reg mr; + size_t mr_size = sizeof(struct xdp_umem_reg); + struct xdp_umem_reg mr = {}; struct xdp_umem *umem; - if (copy_from_user(&mr, optval, sizeof(mr))) + if (optlen < sizeof(struct xdp_umem_reg_v1)) + return -EINVAL; + else if (optlen < sizeof(mr)) + mr_size = sizeof(struct xdp_umem_reg_v1); + + if (copy_from_user(&mr, optval, mr_size)) return -EFAULT; mutex_lock(&xs->mutex); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index d5e06c8e0cbf..9986a759fe06 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; - du.chunk_size = (__u32)(~umem->chunk_mask + 1); + du.chunk_size = umem->chunk_size_nohr + umem->headroom; du.headroom = umem->headroom; du.ifindex = umem->dev ? 
umem->dev->ifindex : 0; du.queue_id = umem->queue_id; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index dd9e985c2461..eddae4688862 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -134,6 +134,17 @@ static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) /* UMEM queue */ +static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr, + u64 length) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; + bool next_pg_contig = + (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr & + XSK_NEXT_PG_CONTIG_MASK; + + return cross_pg && !next_pg_contig; +} + static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) { if (addr >= q->size) { @@ -144,23 +155,51 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) return true; } -static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr) +static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, + u64 length, + struct xdp_umem *umem) +{ + u64 base_addr = xsk_umem_extract_addr(addr); + + addr = xsk_umem_add_offset_to_addr(addr); + if (base_addr >= q->size || addr >= q->size || + xskq_crosses_non_contig_pg(umem, addr, length)) { + q->invalid_descs++; + return false; + } + + return true; +} + +static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { while (q->cons_tail != q->cons_head) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; + + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { + if (xskq_is_valid_addr_unaligned(q, *addr, + umem->chunk_size_nohr, + umem)) + return addr; + goto out; + } + if (xskq_is_valid_addr(q, *addr)) return addr; +out: q->cons_tail++; } return NULL; } -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) +static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ @@ -171,7 +210,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) smp_rmb(); } - return xskq_validate_addr(q, addr); + return xskq_validate_addr(q, addr, umem); } static inline void xskq_discard_addr(struct xsk_queue *q) @@ -230,8 +269,21 @@ static inline int xskq_reserve_addr(struct xsk_queue *q) /* Rx/Tx queue */ -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) +static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, + struct xdp_umem *umem) { + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { + if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem)) + return false; + + if (d->len > umem->chunk_size_nohr || d->options) { + q->invalid_descs++; + return false; + } + + return true; + } + if (!xskq_is_valid_addr(q, d->addr)) return false; @@ -245,14 +297,15 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) } static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, - struct xdp_desc *desc) + struct xdp_desc *desc, + struct xdp_umem *umem) { while (q->cons_tail != q->cons_head) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; *desc = READ_ONCE(ring->desc[idx]); - if (xskq_is_valid_desc(q, desc)) + if (xskq_is_valid_desc(q, desc, umem)) return desc; q->cons_tail++; @@ -262,7 +315,8 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, } static inline struct xdp_desc 
*xskq_peek_desc(struct xsk_queue *q, - struct xdp_desc *desc) + struct xdp_desc *desc, + struct xdp_umem *umem) { if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ @@ -273,7 +327,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, smp_rmb(); /* C, matches B */ } - return xskq_validate_desc(q, desc); + return xskq_validate_desc(q, desc, umem); } static inline void xskq_discard_desc(struct xsk_queue *q) -- cgit v1.2.3-59-g8ed1b From 61723b393292f1e4ea27f8d123384d50b176c29d Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 30 Aug 2019 12:25:48 +0200 Subject: tcp: ulp: add functions to dump ulp-specific information currently, only getsockopt(TCP_ULP) can be invoked to know if a ULP is on top of a TCP socket. Extend idiag_get_aux() and idiag_get_aux_size(), introduced by commit b37e88407c1d ("inet_diag: allow protocols to provide additional data"), to report the ULP name and other information that can be made available by the ULP through optional functions. Users having CAP_NET_ADMIN privileges will then be able to retrieve this information through inet_diag_handler, if they specify INET_DIAG_INFO in the request. Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +++ include/uapi/linux/inet_diag.h | 8 +++++++ net/ipv4/tcp_diag.c | 52 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 62 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/tcp.h b/include/net/tcp.h index 77fe87f7a992..c9a3f9688223 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2122,6 +2122,9 @@ struct tcp_ulp_ops { void (*update)(struct sock *sk, struct proto *p); /* cleanup ulp */ void (*release)(struct sock *sk); + /* diagnostic */ + int (*get_info)(const struct sock *sk, struct sk_buff *skb); + size_t (*get_info_size)(const struct sock *sk); char name[TCP_ULP_NAME_MAX]; struct module *owner; diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index e8baca85bac6..e2c6273274f3 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -153,11 +153,19 @@ enum { INET_DIAG_BBRINFO, /* request as INET_DIAG_VEGASINFO */ INET_DIAG_CLASS_ID, /* request as INET_DIAG_TCLASS */ INET_DIAG_MD5SIG, + INET_DIAG_ULP_INFO, __INET_DIAG_MAX, }; #define INET_DIAG_MAX (__INET_DIAG_MAX - 1) +enum { + INET_ULP_INFO_UNSPEC, + INET_ULP_INFO_NAME, + __INET_ULP_INFO_MAX, +}; +#define INET_ULP_INFO_MAX (__INET_ULP_INFO_MAX - 1) + /* INET_DIAG_MEM */ struct inet_diag_meminfo { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index a3a386236d93..babc156deabb 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -81,13 +81,42 @@ static int tcp_diag_put_md5sig(struct sk_buff *skb, } #endif +static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk, + const struct tcp_ulp_ops *ulp_ops) +{ + struct nlattr *nest; + int err; + + nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO); + if (!nest) + return -EMSGSIZE; + + err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name); + if (err) + goto nla_failure; + + if (ulp_ops->get_info) + err = ulp_ops->get_info(sk, skb); + if (err) + goto nla_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_failure: + nla_nest_cancel(skb, nest); + return err; +} + static int tcp_diag_get_aux(struct sock *sk, bool net_admin, struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); + int err = 0; + #ifdef CONFIG_TCP_MD5SIG if (net_admin) { struct tcp_md5sig_info *md5sig; - int err = 0; rcu_read_lock(); md5sig 
= rcu_dereference(tcp_sk(sk)->md5sig_info); @@ -99,11 +128,21 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin, } #endif + if (net_admin) { + const struct tcp_ulp_ops *ulp_ops; + + ulp_ops = icsk->icsk_ulp_ops; + if (ulp_ops) + err = tcp_diag_put_ulp(skb, sk, ulp_ops); + if (err) + return err; + } return 0; } static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) { + struct inet_connection_sock *icsk = inet_csk(sk); size_t size = 0; #ifdef CONFIG_TCP_MD5SIG @@ -124,6 +163,17 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) } #endif + if (net_admin) { + const struct tcp_ulp_ops *ulp_ops; + + ulp_ops = icsk->icsk_ulp_ops; + if (ulp_ops) { + size += nla_total_size(0) + + nla_total_size(TCP_ULP_NAME_MAX); + if (ulp_ops->get_info_size) + size += ulp_ops->get_info_size(sk); + } + } return size; } -- cgit v1.2.3-59-g8ed1b From 26811cc9f55acf835f7fdadc5ff2bbd6f06bc3ac Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 30 Aug 2019 12:25:49 +0200 Subject: net: tls: export protocol version, cipher, tx_conf/rx_conf to socket diag When an application configures kernel TLS on top of a TCP socket, it's now possible for inet_diag_handler() to collect information regarding the protocol version, the cipher type and TX / RX configuration, in case INET_DIAG_INFO is requested. Signed-off-by: Davide Caratti Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/tls.h | 17 +++++++++++ include/uapi/linux/inet_diag.h | 1 + include/uapi/linux/tls.h | 15 ++++++++++ net/tls/tls_main.c | 64 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+) (limited to 'include/uapi') diff --git a/include/net/tls.h b/include/net/tls.h index 4997742475cd..ec3c3ed2c6c3 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -431,6 +431,23 @@ static inline bool is_tx_ready(struct tls_sw_context_tx *ctx) return READ_ONCE(rec->tx_ready); } +static inline u16 tls_user_config(struct tls_context *ctx, bool tx) +{ + u16 config = tx ? 
ctx->tx_conf : ctx->rx_conf; + + switch (config) { + case TLS_BASE: + return TLS_CONF_BASE; + case TLS_SW: + return TLS_CONF_SW; + case TLS_HW: + return TLS_CONF_HW; + case TLS_HW_RECORD: + return TLS_CONF_HW_RECORD; + } + return 0; +} + struct sk_buff * tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index e2c6273274f3..a1ff345b3f33 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -162,6 +162,7 @@ enum { enum { INET_ULP_INFO_UNSPEC, INET_ULP_INFO_NAME, + INET_ULP_INFO_TLS, __INET_ULP_INFO_MAX, }; #define INET_ULP_INFO_MAX (__INET_ULP_INFO_MAX - 1) diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 5b9c26753e46..bcd2869ed472 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -109,4 +109,19 @@ struct tls12_crypto_info_aes_ccm_128 { unsigned char rec_seq[TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE]; }; +enum { + TLS_INFO_UNSPEC, + TLS_INFO_VERSION, + TLS_INFO_CIPHER, + TLS_INFO_TXCONF, + TLS_INFO_RXCONF, + __TLS_INFO_MAX, +}; +#define TLS_INFO_MAX (__TLS_INFO_MAX - 1) + +#define TLS_CONF_BASE 1 +#define TLS_CONF_SW 2 +#define TLS_CONF_HW 3 +#define TLS_CONF_HW_RECORD 4 + #endif /* _UAPI_LINUX_TLS_H */ diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index f8f2d2c3d627..277f7c209fed 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -835,6 +836,67 @@ static void tls_update(struct sock *sk, struct proto *p) } } +static int tls_get_info(const struct sock *sk, struct sk_buff *skb) +{ + u16 version, cipher_type; + struct tls_context *ctx; + struct nlattr *start; + int err; + + start = nla_nest_start_noflag(skb, INET_ULP_INFO_TLS); + if (!start) + return -EMSGSIZE; + + rcu_read_lock(); + ctx = rcu_dereference(inet_csk(sk)->icsk_ulp_data); + if (!ctx) { + err = 0; + goto nla_failure; + } + version = ctx->prot_info.version; + if (version) { + err = nla_put_u16(skb, TLS_INFO_VERSION, version); + if (err) + goto nla_failure; + } + cipher_type = ctx->prot_info.cipher_type; + if (cipher_type) { + err = nla_put_u16(skb, TLS_INFO_CIPHER, cipher_type); + if (err) + goto nla_failure; + } + err = nla_put_u16(skb, TLS_INFO_TXCONF, tls_user_config(ctx, true)); + if (err) + goto nla_failure; + + err = nla_put_u16(skb, TLS_INFO_RXCONF, tls_user_config(ctx, false)); + if (err) + goto nla_failure; + + rcu_read_unlock(); + nla_nest_end(skb, start); + return 0; + +nla_failure: + rcu_read_unlock(); + nla_nest_cancel(skb, start); + return err; +} + +static size_t tls_get_info_size(const struct sock *sk) +{ + size_t size = 0; + + size += nla_total_size(0) + /* INET_ULP_INFO_TLS */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_VERSION */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_CIPHER */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ + 0; + + return size; +} + void tls_register_device(struct tls_device *device) { spin_lock_bh(&device_spinlock); @@ -856,6 +918,8 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .owner = THIS_MODULE, .init = tls_init, .update = tls_update, + .get_info = tls_get_info, + .get_info_size = tls_get_info_size, }; static int __init tls_register(void) -- cgit v1.2.3-59-g8ed1b From 2a0c9aaa6247c817e45bfc1aaa5eaeafe7a331d6 Mon Sep 17 00:00:00 2001 From: Kurt Van Dijck Date: Mon, 8 Oct 2018 11:48:34 +0200 Subject: can: add socket type for CAN_J1939 This patch is a 
preparation for SAE J1939 and adds CAN_J1939 socket type. Signed-off-by: Kurt Van Dijck Signed-off-by: Oleksij Rempel Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h index 0afb7d8e867f..06d92d6be6e6 100644 --- a/include/uapi/linux/can.h +++ b/include/uapi/linux/can.h @@ -157,7 +157,8 @@ struct canfd_frame { #define CAN_TP20 4 /* VAG Transport Protocol v2.0 */ #define CAN_MCNET 5 /* Bosch MCNet */ #define CAN_ISOTP 6 /* ISO 15765-2 Transport Protocol */ -#define CAN_NPROTO 7 +#define CAN_J1939 7 /* SAE J1939 */ +#define CAN_NPROTO 8 #define SOL_CAN_BASE 100 -- cgit v1.2.3-59-g8ed1b From f5223e9eee651e005c0f6d6d078909087601b7e9 Mon Sep 17 00:00:00 2001 From: Kurt Van Dijck Date: Mon, 8 Oct 2018 11:48:35 +0200 Subject: can: extend sockaddr_can to include j1939 members This patch prepares struct sockaddr_can for SAE J1939. Signed-off-by: Kurt Van Dijck Signed-off-by: Oleksij Rempel Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h index 06d92d6be6e6..1e988fdeba34 100644 --- a/include/uapi/linux/can.h +++ b/include/uapi/linux/can.h @@ -175,6 +175,23 @@ struct sockaddr_can { /* transport protocol class address information (e.g. ISOTP) */ struct { canid_t rx_id, tx_id; } tp; + /* J1939 address information */ + struct { + /* 8 byte name when using dynamic addressing */ + __u64 name; + + /* pgn: + * 8 bit: PS in PDU2 case, else 0 + * 8 bit: PF + * 1 bit: DP + * 1 bit: reserved + */ + __u32 pgn; + + /* 1 byte address */ + __u8 addr; + } j1939; + /* reserved for future CAN protocols address information */ } can_addr; }; -- cgit v1.2.3-59-g8ed1b From 9d71dd0c70099914fcd063135da3c580865e924c Mon Sep 17 00:00:00 2001 From: The j1939 authors Date: Mon, 8 Oct 2018 11:48:36 +0200 Subject: can: add support of SAE J1939 protocol SAE J1939 is the vehicle bus recommended practice used for communication and diagnostics among vehicle components. Originating in the car and heavy-duty truck industry in the United States, it is now widely used in other parts of the world. J1939, ISO 11783 and NMEA 2000 all share the same high level protocol. SAE J1939 can be considered the replacement for the older SAE J1708 and SAE J1587 specifications. 
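
For illustration (not part of the patch set), a minimal user-space sketch built only from the uapi pieces shown above (the CAN_J1939 protocol number and the j1939 members of struct sockaddr_can); the numeric PGN default is an assumption standing in for the J1939_NO_PGN constant added elsewhere in the series:

        #include <linux/can.h>
        #include <net/if.h>
        #include <string.h>
        #include <sys/socket.h>

        /* Sketch only: open a J1939 socket and bind it to a CAN interface
         * with a static source address (no dynamic address claiming).
         */
        static int j1939_open(const char *ifname, __u8 src_addr)
        {
                struct sockaddr_can addr;
                int sock;

                sock = socket(PF_CAN, SOCK_DGRAM, CAN_J1939);
                if (sock < 0)
                        return -1;

                memset(&addr, 0, sizeof(addr));
                addr.can_family = AF_CAN;
                addr.can_ifindex = if_nametoindex(ifname);
                addr.can_addr.j1939.name = 0;        /* static addressing, no NAME */
                addr.can_addr.j1939.pgn  = 0x40000;  /* assumed "no PGN" wildcard */
                addr.can_addr.j1939.addr = src_addr; /* e.g. 0x20 */

                if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                        return -1;

                return sock;
        }
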
Acked-by: Oliver Hartkopp Signed-off-by: Bastian Stender Signed-off-by: Elenita Hinds Signed-off-by: kbuild test robot Signed-off-by: Kurt Van Dijck Signed-off-by: Maxime Jayat Signed-off-by: Robin van der Gracht Signed-off-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- Documentation/networking/index.rst | 1 + Documentation/networking/j1939.rst | 422 ++++++++ MAINTAINERS | 10 + include/linux/can/can-ml.h | 3 + include/uapi/linux/can/j1939.h | 99 ++ net/can/Kconfig | 2 + net/can/Makefile | 2 + net/can/j1939/Kconfig | 15 + net/can/j1939/Makefile | 10 + net/can/j1939/address-claim.c | 230 ++++ net/can/j1939/bus.c | 333 ++++++ net/can/j1939/j1939-priv.h | 338 ++++++ net/can/j1939/main.c | 403 +++++++ net/can/j1939/socket.c | 1160 +++++++++++++++++++++ net/can/j1939/transport.c | 2027 ++++++++++++++++++++++++++++++++++++ 15 files changed, 5055 insertions(+) create mode 100644 Documentation/networking/j1939.rst create mode 100644 include/uapi/linux/can/j1939.h create mode 100644 net/can/j1939/Kconfig create mode 100644 net/can/j1939/Makefile create mode 100644 net/can/j1939/address-claim.c create mode 100644 net/can/j1939/bus.c create mode 100644 net/can/j1939/j1939-priv.h create mode 100644 net/can/j1939/main.c create mode 100644 net/can/j1939/socket.c create mode 100644 net/can/j1939/transport.c (limited to 'include/uapi') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 37eabc17894c..0481d0ffebed 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -17,6 +17,7 @@ Contents: devlink-trap devlink-trap-netdevsim ieee802154 + j1939 kapi z8530book msg_zerocopy diff --git a/Documentation/networking/j1939.rst b/Documentation/networking/j1939.rst new file mode 100644 index 000000000000..ce7e7a044e08 --- /dev/null +++ b/Documentation/networking/j1939.rst @@ -0,0 +1,422 @@ +.. SPDX-License-Identifier: (GPL-2.0 OR MIT) + +=================== +J1939 Documentation +=================== + +Overview / What Is J1939 +======================== + +SAE J1939 defines a higher layer protocol on CAN. It implements a more +sophisticated addressing scheme and extends the maximum packet size above 8 +bytes. Several derived specifications exist, which differ from the original +J1939 on the application level, like MilCAN A, NMEA2000 and especially +ISO-11783 (ISOBUS). This last one specifies the so-called ETP (Extended +Transport Protocol), which has been included in this implementation. This +results in a maximum packet size of ((2 ^ 24) - 1) * 7 bytes == 111 MiB. + +Specifications used +------------------- + +* SAE J1939-21 : data link layer +* SAE J1939-81 : network management +* ISO 11783-6 : Virtual Terminal (Extended Transport Protocol) + +.. _j1939-motivation: + +Motivation +========== + +Given that SocketCAN already offers an API similar to BSD sockets, there are +several reasons to justify a kernel implementation for the addressing and +transport methods used by J1939. + +* **Addressing:** when a process on an ECU communicates via J1939, it should +  not necessarily have to know its source address, although at least one +  process per ECU should know it. Other processes should be able to reuse +  that address. This way, address parameters for different processes +  cooperating on the same ECU are not duplicated. This way of working is +  closely related to the UNIX concept where programs do just one thing, and do +  it well. + +* **Dynamic addressing:** Address Claiming in J1939 is time critical.
+  Furthermore, data transport should be handled properly during the address
+  negotiation. Putting this functionality in the kernel eliminates it as a
+  requirement for _every_ user space process that communicates via J1939. This
+  results in a consistent J1939 bus with proper addressing.
+
+* **Transport:** both TP & ETP reuse some PGNs to relay big packets over them.
+  Different processes may thus use the same TP & ETP PGNs without actually
+  knowing it. The individual TP & ETP sessions _must_ be serialized
+  (synchronized) between different processes. The kernel solves this problem
+  properly and eliminates the serialization (synchronization) as a requirement
+  for _every_ user space process that communicates via J1939.
+
+J1939 defines some other features (relaying, gateway, fast packet transport,
+...). In-kernel code for these would not contribute to protocol stability.
+Therefore, these parts are left to user space.
+
+The J1939 sockets operate on CAN network devices (see SocketCAN). Any J1939
+user space library operating on CAN raw sockets will still operate properly.
+Since such a library does not communicate with the in-kernel implementation,
+care must be taken that these two do not interfere. In practice, this means
+they cannot share ECU addresses. A single ECU (or virtual ECU) address is used
+by the library exclusively, or by the in-kernel system exclusively.
+
+J1939 concepts
+==============
+
+PGN
+---
+
+The PGN (Parameter Group Number) is a number to identify a packet. The PGN
+is composed as follows:
+1 bit  : Reserved Bit
+1 bit  : Data Page
+8 bits : PF (PDU Format)
+8 bits : PS (PDU Specific)
+
+In J1939-21 a distinction is made between PDU1 format (where PF < 240) and
+PDU2 format (where PF >= 240). When using PDU2 format, the PS-field contains
+a so-called Group Extension, which is part of the PGN.
+
+On the other hand, when using PDU1 format, the PS-field contains a so-called
+Destination Address, which is _not_ part of the PGN. When communicating a PGN
+from user space to kernel (or vice versa) and PDU1 format is used, the PS-field
+of the PGN shall be set to zero. The Destination Address shall be set
+elsewhere.
+
+When mapping a PGN to the 29-bit CAN identifier, the kernel gets/sets the
+Destination Address from/to the appropriate bits of the identifier.
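+
+As an illustration of this mapping, the following helper decodes a 29-bit CAN
+identifier into PGN and Destination Address. This is a sketch only: the kernel
+performs this mapping internally, and the helper name and open-coded masks are
+illustrative, not part of any kernel API:
+
+.. code-block:: C
+
+    /* illustration only: decode PGN and DA from a 29-bit CAN identifier */
+    void example_decode_pgn(uint32_t can_id, uint32_t *pgn, uint8_t *da)
+    {
+        uint32_t raw = (can_id >> 8) & 0x3ffff; /* R, DP, PF, PS */
+        uint8_t pf = (raw >> 8) & 0xff;
+
+        if (pf < 240) {
+            /* PDU1: PS carries the Destination Address,
+             * which is not part of the PGN
+             */
+            *da = raw & 0xff;
+            *pgn = raw & 0x3ff00;
+        } else {
+            /* PDU2: PS is the Group Extension, part of the PGN;
+             * such packets are always broadcast
+             */
+            *da = 0xff; /* broadcast, i.e. J1939_NO_ADDR */
+            *pgn = raw;
+        }
+    }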
+
+Addressing
+----------
+
+Both static and dynamic addressing methods can be used.
+
+For static addresses, no extra checks are made by the kernel, and the provided
+addresses are considered correct. This is the responsibility of the OEM or
+system integrator.
+
+For dynamic addressing, so-called Address Claiming, extra support is foreseen
+in the kernel. In J1939 any ECU is known by its 64-bit NAME. At the moment of
+a successful address claim, the kernel keeps track of both NAME and source
+address being claimed. This serves as a base for filter schemes. By default,
+packets with a destination address that is not local will be rejected.
+
+Mixed mode packets (from a static to a dynamic address or vice versa) are
+allowed. The BSD sockets define separate API calls for getting/setting the
+local & remote address, and these are applicable to J1939 sockets.
+
+Filtering
+---------
+
+J1939 defines white list filters per socket that a user can set in order to
+receive a subset of the J1939 traffic. Filtering can be based on:
+
+* SA
+* SOURCE_NAME
+* PGN
+
+When multiple filters are in place for a single socket, and a packet comes in
+that matches several of those filters, the packet is only received once for
+that socket.
+
+How to Use J1939
+================
+
+API Calls
+---------
+
+On CAN, you first need to open a socket for communicating over a CAN network.
+To use J1939, #include <linux/can/j1939.h>. From there, <linux/can.h> will be
+included too. To open a socket, use:
+
+.. code-block:: C
+
+    s = socket(PF_CAN, SOCK_DGRAM, CAN_J1939);
+
+J1939 uses SOCK_DGRAM sockets. In the J1939 specification, connections are
+mentioned in the context of transport protocol sessions. These still deliver
+packets to the other end (using several CAN packets). SOCK_STREAM is not
+supported.
+
+After the successful creation of the socket, you would normally use the bind(2)
+and/or connect(2) system call to bind the socket to a CAN interface. After
+binding and/or connecting the socket, you can read(2) and write(2) from/to the
+socket or use send(2), sendto(2), sendmsg(2) and the recv*() counterpart
+operations on the socket as usual. There are also J1939 specific socket options
+described below.
+
+In order to send data, a bind(2) must have been successful. bind(2) assigns a
+local address to a socket.
+
+Different from CAN, the payload is just the data that gets sent, without its
+header info. The header info is derived from the sockaddr supplied
+to bind(2), connect(2), sendto(2) and recvfrom(2). A write(2) with size 4 will
+result in a packet with 4 bytes.
+
+The sockaddr structure has extensions for use with J1939 as specified below:
+
+.. code-block:: C
+
+    struct sockaddr_can {
+        sa_family_t can_family;
+        int can_ifindex;
+        union {
+            struct {
+                __u64 name;
+                /* pgn:
+                 * 8 bit: PS in PDU2 case, else 0
+                 * 8 bit: PF
+                 * 1 bit: DP
+                 * 1 bit: reserved
+                 */
+                __u32 pgn;
+                __u8 addr;
+            } j1939;
+        } can_addr;
+    };
+
+can_family & can_ifindex serve the same purpose as for other SocketCAN sockets.
+
+can_addr.j1939.pgn specifies the PGN (max 0x3ffff). Individual bits are
+specified above.
+
+can_addr.j1939.name contains the 64-bit J1939 NAME.
+
+can_addr.j1939.addr contains the address.
+
+The bind(2) system call assigns the local address, i.e. the source address when
+sending packets. If a PGN is set during bind(2), it is used as an RX filter,
+i.e. only packets with a matching PGN are received. If an ADDR or NAME is set
+it is used as a receive filter, too. It will match the destination NAME or ADDR
+of the incoming packet. The NAME filter will work only if appropriate Address
+Claiming for this name was done on the CAN bus and registered/cached by the
+kernel.
+
+On the other hand, connect(2) assigns the remote address, i.e. the destination
+address. The PGN from connect(2) is used as the default PGN when sending
+packets. If ADDR or NAME is set it will be used as the default destination ADDR
+or NAME. Furthermore, an ADDR or NAME set during connect(2) is used as a
+receive filter. It will match the source NAME or ADDR of the incoming packet.
+
+Both write(2) and send(2) will send a packet with the local address from
+bind(2) and the remote address from connect(2). Use sendto(2) to overwrite the
+destination address.
+
+If can_addr.j1939.name is set (!= 0) the NAME is looked up by the kernel and
+the corresponding ADDR is used. If can_addr.j1939.name is not set (== 0),
+can_addr.j1939.addr is used.
+
+When creating a socket, reasonable defaults are set. Some options can be
+modified with setsockopt(2) & getsockopt(2).
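+
+As a concrete illustration of the calls described above, a minimal receive
+sketch (illustrative only; it assumes interface "can0" and source address
+0x40, and omits all error handling):
+
+.. code-block:: C
+
+    int sock = socket(PF_CAN, SOCK_DGRAM, CAN_J1939);
+
+    struct sockaddr_can baddr = {
+        .can_family = AF_CAN,
+        .can_addr.j1939 = {
+            .name = J1939_NO_NAME,
+            .addr = 0x40,           /* assumed SA for this sketch */
+            .pgn = J1939_NO_PGN,    /* no PGN RX filter */
+        },
+        .can_ifindex = if_nametoindex("can0"),
+    };
+
+    bind(sock, (struct sockaddr *)&baddr, sizeof(baddr));
+
+    uint8_t dat[128];
+    struct sockaddr_can src;
+    socklen_t srclen = sizeof(src);
+
+    /* the J1939 header info arrives in the sockaddr, not in the payload */
+    recvfrom(sock, dat, sizeof(dat), 0, (struct sockaddr *)&src, &srclen);
+    /* src.can_addr.j1939.addr, .name and .pgn now describe the sender */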
+
+RX path related options:
+
+- SO_J1939_FILTER - configure array of filters
+- SO_J1939_PROMISC - disable filters set by bind(2) and connect(2)
+
+By default no broadcast packets can be sent or received. To enable sending or
+receiving broadcast packets, use the socket option SO_BROADCAST:
+
+.. code-block:: C
+
+    int value = 1;
+    setsockopt(sock, SOL_SOCKET, SO_BROADCAST, &value, sizeof(value));
+
+The following diagram illustrates the RX path:
+
+.. code::
+
+                    +--------------------+
+                    |  incoming packet   |
+                    +--------------------+
+                             |
+                             V
+                    +--------------------+
+                    | SO_J1939_PROMISC?  |
+                    +--------------------+
+                            |    |
+                       no   |    | yes
+                            |    |
+                 .----------'    '----------.
+                 |                          |
+   +---------------------------+            |
+   |  bind() + connect() +     |            |
+   |  SOCK_BROADCAST filter    |            |
+   +---------------------------+            |
+                 |                          |
+                 |<-------------------------'
+                 V
+   +---------------------------+
+   |      SO_J1939_FILTER      |
+   +---------------------------+
+                 |
+                 V
+   +---------------------------+
+   |       socket recv()       |
+   +---------------------------+
+
+TX path related options:
+
+- SO_J1939_SEND_PRIO - change default send priority for the socket
+
+Message Flags during send() and Related System Calls
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+send(2), sendto(2) and sendmsg(2) take a 'flags' argument. Currently
+supported flags are:
+
+* MSG_DONTWAIT, i.e. non-blocking operation.
+
+recvmsg(2)
+^^^^^^^^^^
+
+In most cases recvmsg(2) is needed if you want to extract more information than
+recvfrom(2) can provide, for example the packet priority and timestamp. The
+Destination Address, name and packet priority (if applicable) are attached to
+the msghdr in the recvmsg(2) call. They can be extracted using cmsg(3) macros,
+with cmsg_level == SOL_CAN_J1939 && cmsg_type == SCM_J1939_DEST_ADDR,
+SCM_J1939_DEST_NAME or SCM_J1939_PRIO. The returned data is a uint8_t for
+priority and dst_addr, and uint64_t for dst_name.
+
+.. code-block:: C
+
+    uint8_t priority, dst_addr;
+    uint64_t dst_name;
+
+    for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+        switch (cmsg->cmsg_level) {
+        case SOL_CAN_J1939:
+            if (cmsg->cmsg_type == SCM_J1939_DEST_ADDR)
+                dst_addr = *CMSG_DATA(cmsg);
+            else if (cmsg->cmsg_type == SCM_J1939_DEST_NAME)
+                memcpy(&dst_name, CMSG_DATA(cmsg), cmsg->cmsg_len - CMSG_LEN(0));
+            else if (cmsg->cmsg_type == SCM_J1939_PRIO)
+                priority = *CMSG_DATA(cmsg);
+            break;
+        }
+    }
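+
+The TX counterpart of the priority information above is the SO_J1939_SEND_PRIO
+option from the TX path related options. A minimal sketch; the value type and
+range (an int in 0..7, lower value meaning higher priority on the bus, per the
+J1939 priority field) are an assumption, not taken from the examples above:
+
+.. code-block:: C
+
+    int prio = 3;   /* assumed range 0..7, 0 = highest priority */
+
+    setsockopt(sock, SOL_CAN_J1939, SO_J1939_SEND_PRIO, &prio, sizeof(prio));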
+
+Dynamic Addressing
+------------------
+
+A distinction has to be made between using the claimed address and doing an
+address claim. To use an already claimed address, one has to fill in the
+j1939.name member and provide it to bind(2). If the name had claimed an address
+earlier, all further messages being sent will use that address, and the
+j1939.addr member will be ignored.
+
+An exception to this is PGN 0x0ee00. This is the "Address Claim/Cannot Claim
+Address" message and the kernel will use the j1939.addr member for that PGN if
+necessary.
+
+To claim an address, the following code example can be used:
+
+.. code-block:: C
+
+    struct sockaddr_can baddr = {
+        .can_family = AF_CAN,
+        .can_addr.j1939 = {
+            .name = name,
+            .addr = J1939_IDLE_ADDR,
+            .pgn = J1939_NO_PGN,    /* to disable bind() rx filter for PGN */
+        },
+        .can_ifindex = if_nametoindex("can0"),
+    };
+
+    bind(sock, (struct sockaddr *)&baddr, sizeof(baddr));
+
+    /* for Address Claiming broadcast must be allowed */
+    int value = 1;
+    setsockopt(sock, SOL_SOCKET, SO_BROADCAST, &value, sizeof(value));
+
+    /* configure an advanced RX filter with the PGNs needed for Address Claiming */
+    const struct j1939_filter filt[] = {
+        {
+            .pgn = J1939_PGN_ADDRESS_CLAIMED,
+            .pgn_mask = J1939_PGN_PDU1_MAX,
+        }, {
+            .pgn = J1939_PGN_REQUEST,
+            .pgn_mask = J1939_PGN_PDU1_MAX,
+        }, {
+            .pgn = J1939_PGN_ADDRESS_COMMANDED,
+            .pgn_mask = J1939_PGN_MAX,
+        },
+    };
+
+    setsockopt(sock, SOL_CAN_J1939, SO_J1939_FILTER, &filt, sizeof(filt));
+
+    uint64_t dat = htole64(name);
+    const struct sockaddr_can saddr = {
+        .can_family = AF_CAN,
+        .can_addr.j1939 = {
+            .pgn = J1939_PGN_ADDRESS_CLAIMED,
+            .addr = J1939_NO_ADDR,
+        },
+    };
+
+    /* Afterwards do a sendto(2) with data set to the NAME (Little Endian). If
+     * the NAME provided does not match the j1939.name provided to bind(2),
+     * EPROTO will be returned.
+     */
+    sendto(sock, &dat, sizeof(dat), 0, (const struct sockaddr *)&saddr, sizeof(saddr));
+
+If no one else contests the address claim within 250ms after transmission, the
+kernel marks the NAME-SA assignment as valid. The valid assignment will be kept
+among other valid NAME-SA assignments. From that point, any socket bound to the
+NAME can send packets.
+
+If another ECU claims the address, the kernel will mark the NAME-SA expired.
+No socket bound to the NAME can send packets (other than address claims). To
+claim another address, a socket bound to the NAME must bind(2) again, with
+only j1939.addr changed to the new SA, and must then send a valid address claim
+packet. This restarts the state machine in the kernel (and any other
+participant on the bus) for this NAME.
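+
+A sketch of this re-bind, reusing baddr, saddr and name from the claim example
+above; NEW_SA is a placeholder for the new source address, not a defined
+constant:
+
+.. code-block:: C
+
+    /* re-bind with only j1939.addr changed to the new SA */
+    baddr.can_addr.j1939.addr = NEW_SA;     /* NEW_SA: placeholder */
+    bind(sock, (struct sockaddr *)&baddr, sizeof(baddr));
+
+    /* then send a valid address claim packet, as before */
+    uint64_t dat = htole64(name);
+    sendto(sock, &dat, sizeof(dat), 0, (const struct sockaddr *)&saddr, sizeof(saddr));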
+
+can-utils also includes the jacd tool, which can be used as a code example or
+as a default Address Claiming daemon.
+
+Send Examples
+-------------
+
+Static Addressing
+^^^^^^^^^^^^^^^^^
+
+This example will send a PGN (0x12300) from SA 0x20 to DA 0x30.
+
+Bind:
+
+.. code-block:: C
+
+    struct sockaddr_can baddr = {
+        .can_family = AF_CAN,
+        .can_addr.j1939 = {
+            .name = J1939_NO_NAME,
+            .addr = 0x20,
+            .pgn = J1939_NO_PGN,
+        },
+        .can_ifindex = if_nametoindex("can0"),
+    };
+
+    bind(sock, (struct sockaddr *)&baddr, sizeof(baddr));
+
+Now, the socket 'sock' is bound to the SA 0x20. Since no connect(2) was called,
+at this point we can use only sendto(2) or sendmsg(2).
+
+Send:
+
+.. code-block:: C
+
+    const struct sockaddr_can saddr = {
+        .can_family = AF_CAN,
+        .can_addr.j1939 = {
+            .name = J1939_NO_NAME,
+            .pgn = 0x12300,
+            .addr = 0x30,
+        },
+    };
+
+    sendto(sock, dat, sizeof(dat), 0, (const struct sockaddr *)&saddr, sizeof(saddr));
diff --git a/MAINTAINERS b/MAINTAINERS
index a081c477d1d1..844f416437c4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3669,6 +3669,16 @@ F: include/uapi/linux/can/bcm.h
 F: include/uapi/linux/can/raw.h
 F: include/uapi/linux/can/gw.h
 
+CAN-J1939 NETWORK LAYER
+M: Robin van der Gracht
+M: Oleksij Rempel
+R: Pengutronix Kernel Team
+L: linux-can@vger.kernel.org
+S: Maintained
+F: Documentation/networking/j1939.rst
+F: net/can/j1939/
+F: include/uapi/linux/can/j1939.h
+
 CAPABILITIES
 M: Serge Hallyn
 L: linux-security-module@vger.kernel.org
diff --git a/include/linux/can/can-ml.h b/include/linux/can/can-ml.h
index 79ccf6bfa232..2f5d731ae251 100644
--- a/include/linux/can/can-ml.h
+++ b/include/linux/can/can-ml.h
@@ -60,6 +60,9 @@ struct can_dev_rcv_lists {
 
 struct can_ml_priv {
 	struct can_dev_rcv_lists dev_rcv_lists;
+#ifdef CONFIG_CAN_J1939
+	struct j1939_priv *j1939_priv;
+#endif
 };
 
 #endif /* CAN_ML_H */
diff --git a/include/uapi/linux/can/j1939.h b/include/uapi/linux/can/j1939.h
new file mode 100644
index 000000000000..c32325342d30
--- /dev/null
+++ b/include/uapi/linux/can/j1939.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * j1939.h
+ *
+ * Copyright (c) 2010-2011 EIA Electronics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _UAPI_CAN_J1939_H_
+#define _UAPI_CAN_J1939_H_
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/can.h>
+
+#define J1939_MAX_UNICAST_ADDR 0xfd
+#define J1939_IDLE_ADDR 0xfe
+#define J1939_NO_ADDR 0xff		/* == broadcast or no addr */
+#define J1939_NO_NAME 0
+#define J1939_PGN_REQUEST 0x0ea00	/* Request PG */
+#define J1939_PGN_ADDRESS_CLAIMED 0x0ee00	/* Address Claimed */
+#define J1939_PGN_ADDRESS_COMMANDED 0x0fed8	/* Commanded Address */
+#define J1939_PGN_PDU1_MAX 0x3ff00
+#define J1939_PGN_MAX 0x3ffff
+#define J1939_NO_PGN 0x40000
+
+/* J1939 Parameter Group Number
+ *
+ * bit 0-7	: PDU Specific (PS)
+ * bit 8-15	: PDU Format (PF)
+ * bit 16	: Data Page (DP)
+ * bit 17	: Reserved (R)
+ * bit 18-31	: set to zero
+ */
+typedef __u32 pgn_t;
+
+/* J1939 Priority
+ *
+ * bit 0-2	: Priority (P)
+ * bit 3-7	: set to zero
+ */
+typedef __u8 priority_t;
+
+/* J1939 NAME
+ *
+ * bit 0-20	: Identity Number
+ * bit 21-31	: Manufacturer Code
+ * bit 32-34	: ECU Instance
+ * bit 35-39	: Function Instance
+ * bit 40-47	: Function
+ * bit 48	: Reserved
+ * bit 49-55	: Vehicle System
+ * bit 56-59	: Vehicle System Instance
+ * bit 60-62	: Industry Group
+ * bit 63	: Arbitrary Address Capable
+ */
+typedef __u64 name_t;
+
+/* J1939 socket options */
+#define SOL_CAN_J1939 (SOL_CAN_BASE + CAN_J1939)
+enum {
+	SO_J1939_FILTER = 1,	/* set filters */
+	SO_J1939_PROMISC = 2,	/* set/clr promiscuous mode */
+	SO_J1939_SEND_PRIO = 3,
+	SO_J1939_ERRQUEUE = 4,
+};
+
+enum {
+	SCM_J1939_DEST_ADDR = 1,
+	SCM_J1939_DEST_NAME = 2,
+	SCM_J1939_PRIO = 3,
+	SCM_J1939_ERRQUEUE = 4,
+};
+
+enum {
+	J1939_NLA_PAD,
+	J1939_NLA_BYTES_ACKED,
+};
+
+enum {
+	J1939_EE_INFO_NONE,
+	J1939_EE_INFO_TX_ABORT,
+};
+
+struct j1939_filter {
+	name_t name;
+	name_t name_mask;
+	pgn_t pgn;
+	pgn_t pgn_mask;
+	__u8 addr;
+	__u8 addr_mask;
+};
+
+#define J1939_FILTER_MAX 512	/* maximum number of
j1939_filter set via setsockopt() */ + +#endif /* !_UAPI_CAN_J1939_H_ */ diff --git a/net/can/Kconfig b/net/can/Kconfig index d4319aa3e1b1..d77042752457 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -53,6 +53,8 @@ config CAN_GW They can be modified with AND/OR/XOR/SET operations as configured by the netlink configuration interface known e.g. from iptables. +source "net/can/j1939/Kconfig" + source "drivers/net/can/Kconfig" endif diff --git a/net/can/Makefile b/net/can/Makefile index 1242bbbfe57f..08bd217fc051 100644 --- a/net/can/Makefile +++ b/net/can/Makefile @@ -15,3 +15,5 @@ can-bcm-y := bcm.o obj-$(CONFIG_CAN_GW) += can-gw.o can-gw-y := gw.o + +obj-$(CONFIG_CAN_J1939) += j1939/ diff --git a/net/can/j1939/Kconfig b/net/can/j1939/Kconfig new file mode 100644 index 000000000000..2998298b71ec --- /dev/null +++ b/net/can/j1939/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# SAE J1939 network layer core configuration +# + +config CAN_J1939 + tristate "SAE J1939" + depends on CAN + help + SAE J1939 + Say Y to have in-kernel support for j1939 socket type. This + allows communication according to SAE j1939. + The relevant parts in kernel are + SAE j1939-21 (datalink & transport protocol) + & SAE j1939-81 (network management). diff --git a/net/can/j1939/Makefile b/net/can/j1939/Makefile new file mode 100644 index 000000000000..19181bdae173 --- /dev/null +++ b/net/can/j1939/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_CAN_J1939) += can-j1939.o + +can-j1939-objs := \ + address-claim.o \ + bus.o \ + main.o \ + socket.o \ + transport.o diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c new file mode 100644 index 000000000000..f33c47327927 --- /dev/null +++ b/net/can/j1939/address-claim.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck +// Copyright (c) 2010-2011 EIA Electronics, +// Pieter Beyens +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel + +/* J1939 Address Claiming. + * Address Claiming in the kernel + * - keeps track of the AC states of ECU's, + * - resolves NAME<=>SA taking into account the AC states of ECU's. + * + * All Address Claim msgs (including host-originated msg) are processed + * at the receive path (a sent msg is always received again via CAN echo). + * As such, the processing of AC msgs is done in the order on which msgs + * are sent on the bus. + * + * This module doesn't send msgs itself (e.g. replies on Address Claims), + * this is the responsibility of a user space application or daemon. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +#include "j1939-priv.h" + +static inline name_t j1939_skb_to_name(const struct sk_buff *skb) +{ + return le64_to_cpup((__le64 *)skb->data); +} + +static inline bool j1939_ac_msg_is_request(struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + int req_pgn; + + if (skb->len < 3 || skcb->addr.pgn != J1939_PGN_REQUEST) + return false; + + req_pgn = skb->data[0] | (skb->data[1] << 8) | (skb->data[2] << 16); + + return req_pgn == J1939_PGN_ADDRESS_CLAIMED; +} + +static int j1939_ac_verify_outgoing(struct j1939_priv *priv, + struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + + if (skb->len != 8) { + netdev_notice(priv->ndev, "tx address claim with dlc %i\n", + skb->len); + return -EPROTO; + } + + if (skcb->addr.src_name != j1939_skb_to_name(skb)) { + netdev_notice(priv->ndev, "tx address claim with different name\n"); + return -EPROTO; + } + + if (skcb->addr.sa == J1939_NO_ADDR) { + netdev_notice(priv->ndev, "tx address claim with broadcast sa\n"); + return -EPROTO; + } + + /* ac must always be a broadcast */ + if (skcb->addr.dst_name || skcb->addr.da != J1939_NO_ADDR) { + netdev_notice(priv->ndev, "tx address claim with dest, not broadcast\n"); + return -EPROTO; + } + return 0; +} + +int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + int ret; + u8 addr; + + /* network mgmt: address claiming msgs */ + if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) { + struct j1939_ecu *ecu; + + ret = j1939_ac_verify_outgoing(priv, skb); + /* return both when failure & when successful */ + if (ret < 0) + return ret; + ecu = j1939_ecu_get_by_name(priv, skcb->addr.src_name); + if (!ecu) + return -ENODEV; + + if (ecu->addr != skcb->addr.sa) + /* hold further traffic for ecu, remove from parent */ + j1939_ecu_unmap(ecu); + j1939_ecu_put(ecu); + } else if (skcb->addr.src_name) { + /* assign source address */ + addr = j1939_name_to_addr(priv, skcb->addr.src_name); + if (!j1939_address_is_unicast(addr) && + !j1939_ac_msg_is_request(skb)) { + netdev_notice(priv->ndev, "tx drop: invalid sa for name 0x%016llx\n", + skcb->addr.src_name); + return -EADDRNOTAVAIL; + } + skcb->addr.sa = addr; + } + + /* assign destination address */ + if (skcb->addr.dst_name) { + addr = j1939_name_to_addr(priv, skcb->addr.dst_name); + if (!j1939_address_is_unicast(addr)) { + netdev_notice(priv->ndev, "tx drop: invalid da for name 0x%016llx\n", + skcb->addr.dst_name); + return -EADDRNOTAVAIL; + } + skcb->addr.da = addr; + } + return 0; +} + +static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_ecu *ecu, *prev; + name_t name; + + if (skb->len != 8) { + netdev_notice(priv->ndev, "rx address claim with wrong dlc %i\n", + skb->len); + return; + } + + name = j1939_skb_to_name(skb); + skcb->addr.src_name = name; + if (!name) { + netdev_notice(priv->ndev, "rx address claim without name\n"); + return; + } + + if (!j1939_address_is_valid(skcb->addr.sa)) { + netdev_notice(priv->ndev, "rx address claim with broadcast sa\n"); + return; + } + + write_lock_bh(&priv->lock); + + /* Few words on the ECU ref counting: + * + * First we get an ECU handle, either with + * j1939_ecu_get_by_name_locked() (increments the ref counter) + * or j1939_ecu_create_locked() (initializes an ECU object + * with a ref counter of 1). 
+ * + * j1939_ecu_unmap_locked() will decrement the ref counter, + * but only if the ECU was mapped before. So "ecu" still + * belongs to us. + * + * j1939_ecu_timer_start() will increment the ref counter + * before it starts the timer, so we can put the ecu when + * leaving this function. + */ + ecu = j1939_ecu_get_by_name_locked(priv, name); + if (!ecu && j1939_address_is_unicast(skcb->addr.sa)) + ecu = j1939_ecu_create_locked(priv, name); + + if (IS_ERR_OR_NULL(ecu)) + goto out_unlock_bh; + + /* cancel pending (previous) address claim */ + j1939_ecu_timer_cancel(ecu); + + if (j1939_address_is_idle(skcb->addr.sa)) { + j1939_ecu_unmap_locked(ecu); + goto out_ecu_put; + } + + /* save new addr */ + if (ecu->addr != skcb->addr.sa) + j1939_ecu_unmap_locked(ecu); + ecu->addr = skcb->addr.sa; + + prev = j1939_ecu_get_by_addr_locked(priv, skcb->addr.sa); + if (prev) { + if (ecu->name > prev->name) { + j1939_ecu_unmap_locked(ecu); + j1939_ecu_put(prev); + goto out_ecu_put; + } else { + /* kick prev if less or equal */ + j1939_ecu_unmap_locked(prev); + j1939_ecu_put(prev); + } + } + + j1939_ecu_timer_start(ecu); + out_ecu_put: + j1939_ecu_put(ecu); + out_unlock_bh: + write_unlock_bh(&priv->lock); +} + +void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_ecu *ecu; + + /* network mgmt */ + if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) { + j1939_ac_process(priv, skb); + } else if (j1939_address_is_unicast(skcb->addr.sa)) { + /* assign source name */ + ecu = j1939_ecu_get_by_addr(priv, skcb->addr.sa); + if (ecu) { + skcb->addr.src_name = ecu->name; + j1939_ecu_put(ecu); + } + } + + /* assign destination name */ + ecu = j1939_ecu_get_by_addr(priv, skcb->addr.da); + if (ecu) { + skcb->addr.dst_name = ecu->name; + j1939_ecu_put(ecu); + } +} diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c new file mode 100644 index 000000000000..486687901602 --- /dev/null +++ b/net/can/j1939/bus.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel + +/* bus for j1939 remote devices + * Since rtnetlink, no real bus is used. + */ + +#include + +#include "j1939-priv.h" + +static void __j1939_ecu_release(struct kref *kref) +{ + struct j1939_ecu *ecu = container_of(kref, struct j1939_ecu, kref); + struct j1939_priv *priv = ecu->priv; + + list_del(&ecu->list); + kfree(ecu); + j1939_priv_put(priv); +} + +void j1939_ecu_put(struct j1939_ecu *ecu) +{ + kref_put(&ecu->kref, __j1939_ecu_release); +} + +static void j1939_ecu_get(struct j1939_ecu *ecu) +{ + kref_get(&ecu->kref); +} + +static bool j1939_ecu_is_mapped_locked(struct j1939_ecu *ecu) +{ + struct j1939_priv *priv = ecu->priv; + + lockdep_assert_held(&priv->lock); + + return j1939_ecu_find_by_addr_locked(priv, ecu->addr) == ecu; +} + +/* ECU device interface */ +/* map ECU to a bus address space */ +static void j1939_ecu_map_locked(struct j1939_ecu *ecu) +{ + struct j1939_priv *priv = ecu->priv; + struct j1939_addr_ent *ent; + + lockdep_assert_held(&priv->lock); + + if (!j1939_address_is_unicast(ecu->addr)) + return; + + ent = &priv->ents[ecu->addr]; + + if (ent->ecu) { + netdev_warn(priv->ndev, "Trying to map already mapped ECU, addr: 0x%02x, name: 0x%016llx. 
Skip it.\n", + ecu->addr, ecu->name); + return; + } + + j1939_ecu_get(ecu); + ent->ecu = ecu; + ent->nusers += ecu->nusers; +} + +/* unmap ECU from a bus address space */ +void j1939_ecu_unmap_locked(struct j1939_ecu *ecu) +{ + struct j1939_priv *priv = ecu->priv; + struct j1939_addr_ent *ent; + + lockdep_assert_held(&priv->lock); + + if (!j1939_address_is_unicast(ecu->addr)) + return; + + if (!j1939_ecu_is_mapped_locked(ecu)) + return; + + ent = &priv->ents[ecu->addr]; + ent->ecu = NULL; + ent->nusers -= ecu->nusers; + j1939_ecu_put(ecu); +} + +void j1939_ecu_unmap(struct j1939_ecu *ecu) +{ + write_lock_bh(&ecu->priv->lock); + j1939_ecu_unmap_locked(ecu); + write_unlock_bh(&ecu->priv->lock); +} + +void j1939_ecu_unmap_all(struct j1939_priv *priv) +{ + int i; + + write_lock_bh(&priv->lock); + for (i = 0; i < ARRAY_SIZE(priv->ents); i++) + if (priv->ents[i].ecu) + j1939_ecu_unmap_locked(priv->ents[i].ecu); + write_unlock_bh(&priv->lock); +} + +void j1939_ecu_timer_start(struct j1939_ecu *ecu) +{ + /* The ECU is held here and released in the + * j1939_ecu_timer_handler() or j1939_ecu_timer_cancel(). + */ + j1939_ecu_get(ecu); + + /* Schedule timer in 250 msec to commit address change. */ + hrtimer_start(&ecu->ac_timer, ms_to_ktime(250), + HRTIMER_MODE_REL_SOFT); +} + +void j1939_ecu_timer_cancel(struct j1939_ecu *ecu) +{ + if (hrtimer_cancel(&ecu->ac_timer)) + j1939_ecu_put(ecu); +} + +static enum hrtimer_restart j1939_ecu_timer_handler(struct hrtimer *hrtimer) +{ + struct j1939_ecu *ecu = + container_of(hrtimer, struct j1939_ecu, ac_timer); + struct j1939_priv *priv = ecu->priv; + + write_lock_bh(&priv->lock); + /* TODO: can we test if ecu->addr is unicast before starting + * the timer? + */ + j1939_ecu_map_locked(ecu); + + /* The corresponding j1939_ecu_get() is in + * j1939_ecu_timer_start(). 
+ */ + j1939_ecu_put(ecu); + write_unlock_bh(&priv->lock); + + return HRTIMER_NORESTART; +} + +struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name) +{ + struct j1939_ecu *ecu; + + lockdep_assert_held(&priv->lock); + + ecu = kzalloc(sizeof(*ecu), gfp_any()); + if (!ecu) + return ERR_PTR(-ENOMEM); + kref_init(&ecu->kref); + ecu->addr = J1939_IDLE_ADDR; + ecu->name = name; + + hrtimer_init(&ecu->ac_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + ecu->ac_timer.function = j1939_ecu_timer_handler; + INIT_LIST_HEAD(&ecu->list); + + j1939_priv_get(priv); + ecu->priv = priv; + list_add_tail(&ecu->list, &priv->ecus); + + return ecu; +} + +struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv, + u8 addr) +{ + lockdep_assert_held(&priv->lock); + + return priv->ents[addr].ecu; +} + +struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv, u8 addr) +{ + struct j1939_ecu *ecu; + + lockdep_assert_held(&priv->lock); + + if (!j1939_address_is_unicast(addr)) + return NULL; + + ecu = j1939_ecu_find_by_addr_locked(priv, addr); + if (ecu) + j1939_ecu_get(ecu); + + return ecu; +} + +struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr) +{ + struct j1939_ecu *ecu; + + read_lock_bh(&priv->lock); + ecu = j1939_ecu_get_by_addr_locked(priv, addr); + read_unlock_bh(&priv->lock); + + return ecu; +} + +/* get pointer to ecu without increasing ref counter */ +static struct j1939_ecu *j1939_ecu_find_by_name_locked(struct j1939_priv *priv, + name_t name) +{ + struct j1939_ecu *ecu; + + lockdep_assert_held(&priv->lock); + + list_for_each_entry(ecu, &priv->ecus, list) { + if (ecu->name == name) + return ecu; + } + + return NULL; +} + +struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv, + name_t name) +{ + struct j1939_ecu *ecu; + + lockdep_assert_held(&priv->lock); + + if (!name) + return NULL; + + ecu = j1939_ecu_find_by_name_locked(priv, name); + if (ecu) + j1939_ecu_get(ecu); + + return ecu; +} + +struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name) +{ + struct j1939_ecu *ecu; + + read_lock_bh(&priv->lock); + ecu = j1939_ecu_get_by_name_locked(priv, name); + read_unlock_bh(&priv->lock); + + return ecu; +} + +u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name) +{ + struct j1939_ecu *ecu; + int addr = J1939_IDLE_ADDR; + + if (!name) + return J1939_NO_ADDR; + + read_lock_bh(&priv->lock); + ecu = j1939_ecu_find_by_name_locked(priv, name); + if (ecu && j1939_ecu_is_mapped_locked(ecu)) + /* ecu's SA is registered */ + addr = ecu->addr; + + read_unlock_bh(&priv->lock); + + return addr; +} + +/* TX addr/name accounting + * Transport protocol needs to know if a SA is local or not + * These functions originate from userspace manipulating sockets, + * so locking is straigforward + */ + +int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa) +{ + struct j1939_ecu *ecu; + int err = 0; + + write_lock_bh(&priv->lock); + + if (j1939_address_is_unicast(sa)) + priv->ents[sa].nusers++; + + if (!name) + goto done; + + ecu = j1939_ecu_get_by_name_locked(priv, name); + if (!ecu) + ecu = j1939_ecu_create_locked(priv, name); + err = PTR_ERR_OR_ZERO(ecu); + if (err) + goto done; + + ecu->nusers++; + /* TODO: do we care if ecu->addr != sa? 
*/ + if (j1939_ecu_is_mapped_locked(ecu)) + /* ecu's sa is active already */ + priv->ents[ecu->addr].nusers++; + + done: + write_unlock_bh(&priv->lock); + + return err; +} + +void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa) +{ + struct j1939_ecu *ecu; + + write_lock_bh(&priv->lock); + + if (j1939_address_is_unicast(sa)) + priv->ents[sa].nusers--; + + if (!name) + goto done; + + ecu = j1939_ecu_find_by_name_locked(priv, name); + if (WARN_ON_ONCE(!ecu)) + goto done; + + ecu->nusers--; + /* TODO: do we care if ecu->addr != sa? */ + if (j1939_ecu_is_mapped_locked(ecu)) + /* ecu's sa is active already */ + priv->ents[ecu->addr].nusers--; + j1939_ecu_put(ecu); + + done: + write_unlock_bh(&priv->lock); +} diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h new file mode 100644 index 000000000000..12369b604ce9 --- /dev/null +++ b/net/can/j1939/j1939-priv.h @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel + +#ifndef _J1939_PRIV_H_ +#define _J1939_PRIV_H_ + +#include +#include + +/* Timeout to receive the abort signal over loop back. In case CAN + * bus is open, the timeout should be triggered. + */ +#define J1939_XTP_ABORT_TIMEOUT_MS 500 +#define J1939_SIMPLE_ECHO_TIMEOUT_MS (10 * 1000) + +struct j1939_session; +enum j1939_sk_errqueue_type { + J1939_ERRQUEUE_ACK, + J1939_ERRQUEUE_SCHED, + J1939_ERRQUEUE_ABORT, +}; + +/* j1939 devices */ +struct j1939_ecu { + struct list_head list; + name_t name; + u8 addr; + + /* indicates that this ecu successfully claimed @sa as its address */ + struct hrtimer ac_timer; + struct kref kref; + struct j1939_priv *priv; + + /* count users, to help transport protocol decide for interaction */ + int nusers; +}; + +struct j1939_priv { + struct list_head ecus; + /* local list entry in priv + * These allow irq (& softirq) context lookups on j1939 devices + * This approach (separate lists) is done as the other 2 alternatives + * are not easier or even wrong + * 1) using the pure kobject methods involves mutexes, which are not + * allowed in irq context. + * 2) duplicating data structures would require a lot of synchronization + * code + * usage: + */ + + /* segments need a lock to protect the above list */ + rwlock_t lock; + + struct net_device *ndev; + + /* list of 256 ecu ptrs, that cache the claimed addresses. + * also protected by the above lock + */ + struct j1939_addr_ent { + struct j1939_ecu *ecu; + /* count users, to help transport protocol */ + int nusers; + } ents[256]; + + struct kref kref; + + /* List of active sessions to prevent start of conflicting + * one. + * + * Do not start two sessions of same type, addresses and + * direction. 
+ */ + struct list_head active_session_list; + + /* protects active_session_list */ + spinlock_t active_session_list_lock; + + unsigned int tp_max_packet_size; + + /* lock for j1939_socks list */ + spinlock_t j1939_socks_lock; + struct list_head j1939_socks; + + struct kref rx_kref; +}; + +void j1939_ecu_put(struct j1939_ecu *ecu); + +/* keep the cache of what is local */ +int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa); +void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa); + +static inline bool j1939_address_is_unicast(u8 addr) +{ + return addr <= J1939_MAX_UNICAST_ADDR; +} + +static inline bool j1939_address_is_idle(u8 addr) +{ + return addr == J1939_IDLE_ADDR; +} + +static inline bool j1939_address_is_valid(u8 addr) +{ + return addr != J1939_NO_ADDR; +} + +static inline bool j1939_pgn_is_pdu1(pgn_t pgn) +{ + /* ignore dp & res bits for this */ + return (pgn & 0xff00) < 0xf000; +} + +/* utility to correctly unmap an ECU */ +void j1939_ecu_unmap_locked(struct j1939_ecu *ecu); +void j1939_ecu_unmap(struct j1939_ecu *ecu); + +u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name); +struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv, + u8 addr); +struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr); +struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv, + u8 addr); +struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name); +struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv, + name_t name); + +enum j1939_transfer_type { + J1939_TP, + J1939_ETP, + J1939_SIMPLE, +}; + +struct j1939_addr { + name_t src_name; + name_t dst_name; + pgn_t pgn; + + u8 sa; + u8 da; + + u8 type; +}; + +/* control buffer of the sk_buff */ +struct j1939_sk_buff_cb { + /* Offset in bytes within one ETP session */ + u32 offset; + + /* for tx, MSG_SYN will be used to sync on sockets */ + u32 msg_flags; + u32 tskey; + + struct j1939_addr addr; + + /* Flags for quick lookups during skb processing. + * These are set in the receive path only. 
+ */
+#define J1939_ECU_LOCAL_SRC BIT(0)
+#define J1939_ECU_LOCAL_DST BIT(1)
+	u8 flags;
+
+	priority_t priority;
+};
+
+static inline
+struct j1939_sk_buff_cb *j1939_skb_to_cb(const struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(struct j1939_sk_buff_cb) > sizeof(skb->cb));
+
+	return (struct j1939_sk_buff_cb *)skb->cb;
+}
+
+int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb);
+bool j1939_sk_recv_match(struct j1939_priv *priv,
+			 struct j1939_sk_buff_cb *skcb);
+void j1939_sk_send_loop_abort(struct sock *sk, int err);
+void j1939_sk_errqueue(struct j1939_session *session,
+		       enum j1939_sk_errqueue_type type);
+void j1939_sk_queue_activate_next(struct j1939_session *session);
+
+/* stack entries */
+struct j1939_session *j1939_tp_send(struct j1939_priv *priv,
+				    struct sk_buff *skb, size_t size);
+int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb);
+int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb);
+
+/* network management */
+struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name);
+
+void j1939_ecu_timer_start(struct j1939_ecu *ecu);
+void j1939_ecu_timer_cancel(struct j1939_ecu *ecu);
+void j1939_ecu_unmap_all(struct j1939_priv *priv);
+
+struct j1939_priv *j1939_netdev_start(struct net_device *ndev);
+void j1939_netdev_stop(struct j1939_priv *priv);
+
+void j1939_priv_put(struct j1939_priv *priv);
+void j1939_priv_get(struct j1939_priv *priv);
+
+/* notify/alert all j1939 sockets bound to ifindex */
+void j1939_sk_netdev_event_netdown(struct j1939_priv *priv);
+int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk);
+void j1939_tp_init(struct j1939_priv *priv);
+
+/* decrement pending skb for a j1939 socket */
+void j1939_sock_pending_del(struct sock *sk);
+
+enum j1939_session_state {
+	J1939_SESSION_NEW,
+	J1939_SESSION_ACTIVE,
+	/* waiting for abort signal on the bus */
+	J1939_SESSION_WAITING_ABORT,
+	J1939_SESSION_ACTIVE_MAX,
+	J1939_SESSION_DONE,
+};
+
+struct j1939_session {
+	struct j1939_priv *priv;
+	struct list_head active_session_list_entry;
+	struct list_head sk_session_queue_entry;
+	struct kref kref;
+	struct sock *sk;
+
+	/* ifindex, src, dst, pgn define the session block
+	 * they are _never_ modified after insertion in the list
+	 * this decreases locking problems a _lot_
+	 */
+	struct j1939_sk_buff_cb skcb;
+	struct sk_buff_head skb_queue;
+
+	/* all tx related stuff (last_txcmd, pkt.tx)
+	 * is protected (modified only) with the txtimer hrtimer
+	 * 'total' & 'block' are never changed,
+	 * last_cmd, last & block are protected by ->lock
+	 * this means that the tx may run after cts is received that should
+	 * have stopped tx, but this time discrepancy is never avoided anyhow
+	 */
+	u8 last_cmd, last_txcmd;
+	bool transmission;
+	bool extd;
+	/* Total message size, number of bytes */
+	unsigned int total_message_size;
+	/* Total number of bytes queued from the socket to the session */
+	unsigned int total_queued_size;
+	unsigned int tx_retry;
+
+	int err;
+	u32 tskey;
+	enum j1939_session_state state;
+
+	/* Packet counters for a (extended) transfer session. A packet
+	 * carries at most 7 bytes of payload.
+	 */
+	struct {
+		/* total - total number of packets for this session */
+		unsigned int total;
+		/* last - last packet of a transfer block after which
+		 * responder should send ETP.CM_CTS and originator
+		 * ETP.CM_DPO
+		 */
+		unsigned int last;
+		/* tx - number of packets sent by the originator node.
+		 * this counter can be set back if the responder node
+		 * didn't receive all packets sent by the originator.
+		 */
+		unsigned int tx;
+		unsigned int tx_acked;
+		/* rx - number of packets received */
+		unsigned int rx;
+		/* block - number of packets expected in one block */
+		unsigned int block;
+		/* dpo - ETP.CM_DPO, Data Packet Offset */
+		unsigned int dpo;
+	} pkt;
+	struct hrtimer txtimer, rxtimer;
+};
+
+struct j1939_sock {
+	struct sock sk; /* must be first to skip with memset */
+	struct j1939_priv *priv;
+	struct list_head list;
+
+#define J1939_SOCK_BOUND BIT(0)
+#define J1939_SOCK_CONNECTED BIT(1)
+#define J1939_SOCK_PROMISC BIT(2)
+#define J1939_SOCK_ERRQUEUE BIT(3)
+	int state;
+
+	int ifindex;
+	struct j1939_addr addr;
+	struct j1939_filter *filters;
+	int nfilters;
+	pgn_t pgn_rx_filter;
+
+	/* j1939 may emit equal PGN (!= equal CAN-id's) out of order
+	 * when the transport protocol comes in.
+	 * To allow emitting in order, keep a 'pending' nr. of packets
+	 */
+	atomic_t skb_pending;
+	wait_queue_head_t waitq;
+
+	/* lock for the sk_session_queue list */
+	spinlock_t sk_session_queue_lock;
+	struct list_head sk_session_queue;
+};
+
+static inline struct j1939_sock *j1939_sk(const struct sock *sk)
+{
+	return container_of(sk, struct j1939_sock, sk);
+}
+
+void j1939_session_get(struct j1939_session *session);
+void j1939_session_put(struct j1939_session *session);
+void j1939_session_skb_queue(struct j1939_session *session,
+			     struct sk_buff *skb);
+int j1939_session_activate(struct j1939_session *session);
+void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec);
+void j1939_session_timers_cancel(struct j1939_session *session);
+
+#define J1939_MAX_TP_PACKET_SIZE (7 * 0xff)
+#define J1939_MAX_ETP_PACKET_SIZE (7 * 0x00ffffff)
+
+#define J1939_REGULAR 0
+#define J1939_EXTENDED 1
+
+/* CAN protocol */
+extern const struct can_proto j1939_can_proto;
+
+#endif /* _J1939_PRIV_H_ */
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
new file mode 100644
index 000000000000..def2f813ffce
--- /dev/null
+++ b/net/can/j1939/main.c
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Pieter Beyens
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck
+// Copyright (c) 2018 Protonic,
+// Robin van der Gracht
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel
+
+/* Core of can-j1939 that links j1939 to CAN.
*/ + +#include +#include +#include +#include +#include + +#include "j1939-priv.h" + +MODULE_DESCRIPTION("PF_CAN SAE J1939"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("EIA Electronics (Kurt Van Dijck & Pieter Beyens)"); +MODULE_ALIAS("can-proto-" __stringify(CAN_J1939)); + +/* LOWLEVEL CAN interface */ + +/* CAN_HDR: #bytes before can_frame data part */ +#define J1939_CAN_HDR (offsetof(struct can_frame, data)) + +/* CAN_FTR: #bytes beyond data part */ +#define J1939_CAN_FTR (sizeof(struct can_frame) - J1939_CAN_HDR - \ + sizeof(((struct can_frame *)0)->data)) + +/* lowest layer */ +static void j1939_can_recv(struct sk_buff *iskb, void *data) +{ + struct j1939_priv *priv = data; + struct sk_buff *skb; + struct j1939_sk_buff_cb *skcb, *iskcb; + struct can_frame *cf; + + /* create a copy of the skb + * j1939 only delivers the real data bytes, + * the header goes into sockaddr. + * j1939 may not touch the incoming skb in such way + */ + skb = skb_clone(iskb, GFP_ATOMIC); + if (!skb) + return; + + can_skb_set_owner(skb, iskb->sk); + + /* get a pointer to the header of the skb + * the skb payload (pointer) is moved, so that the next skb_data + * returns the actual payload + */ + cf = (void *)skb->data; + skb_pull(skb, J1939_CAN_HDR); + + /* fix length, set to dlc, with 8 maximum */ + skb_trim(skb, min_t(uint8_t, cf->can_dlc, 8)); + + /* set addr */ + skcb = j1939_skb_to_cb(skb); + memset(skcb, 0, sizeof(*skcb)); + + iskcb = j1939_skb_to_cb(iskb); + skcb->tskey = iskcb->tskey; + skcb->priority = (cf->can_id >> 26) & 0x7; + skcb->addr.sa = cf->can_id; + skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; + /* set default message type */ + skcb->addr.type = J1939_TP; + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { + /* Type 1: with destination address */ + skcb->addr.da = skcb->addr.pgn; + /* normalize pgn: strip dst address */ + skcb->addr.pgn &= 0x3ff00; + } else { + /* set broadcast address */ + skcb->addr.da = J1939_NO_ADDR; + } + + /* update localflags */ + read_lock_bh(&priv->lock); + if (j1939_address_is_unicast(skcb->addr.sa) && + priv->ents[skcb->addr.sa].nusers) + skcb->flags |= J1939_ECU_LOCAL_SRC; + if (j1939_address_is_unicast(skcb->addr.da) && + priv->ents[skcb->addr.da].nusers) + skcb->flags |= J1939_ECU_LOCAL_DST; + read_unlock_bh(&priv->lock); + + /* deliver into the j1939 stack ... 
*/ + j1939_ac_recv(priv, skb); + + if (j1939_tp_recv(priv, skb)) + /* this means the transport layer processed the message */ + goto done; + + j1939_simple_recv(priv, skb); + j1939_sk_recv(priv, skb); + done: + kfree_skb(skb); +} + +/* NETDEV MANAGEMENT */ + +/* values for can_rx_(un)register */ +#define J1939_CAN_ID CAN_EFF_FLAG +#define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG) + +static DEFINE_SPINLOCK(j1939_netdev_lock); + +static struct j1939_priv *j1939_priv_create(struct net_device *ndev) +{ + struct j1939_priv *priv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return NULL; + + rwlock_init(&priv->lock); + INIT_LIST_HEAD(&priv->ecus); + priv->ndev = ndev; + kref_init(&priv->kref); + kref_init(&priv->rx_kref); + dev_hold(ndev); + + netdev_dbg(priv->ndev, "%s : 0x%p\n", __func__, priv); + + return priv; +} + +static inline void j1939_priv_set(struct net_device *ndev, + struct j1939_priv *priv) +{ + struct can_ml_priv *can_ml_priv = ndev->ml_priv; + + can_ml_priv->j1939_priv = priv; +} + +static void __j1939_priv_release(struct kref *kref) +{ + struct j1939_priv *priv = container_of(kref, struct j1939_priv, kref); + struct net_device *ndev = priv->ndev; + + netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv); + + dev_put(ndev); + kfree(priv); +} + +void j1939_priv_put(struct j1939_priv *priv) +{ + kref_put(&priv->kref, __j1939_priv_release); +} + +void j1939_priv_get(struct j1939_priv *priv) +{ + kref_get(&priv->kref); +} + +static int j1939_can_rx_register(struct j1939_priv *priv) +{ + struct net_device *ndev = priv->ndev; + int ret; + + j1939_priv_get(priv); + ret = can_rx_register(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, + j1939_can_recv, priv, "j1939", NULL); + if (ret < 0) { + j1939_priv_put(priv); + return ret; + } + + return 0; +} + +static void j1939_can_rx_unregister(struct j1939_priv *priv) +{ + struct net_device *ndev = priv->ndev; + + can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, + j1939_can_recv, priv); + + j1939_priv_put(priv); +} + +static void __j1939_rx_release(struct kref *kref) + __releases(&j1939_netdev_lock) +{ + struct j1939_priv *priv = container_of(kref, struct j1939_priv, + rx_kref); + + j1939_can_rx_unregister(priv); + j1939_ecu_unmap_all(priv); + j1939_priv_set(priv->ndev, NULL); + spin_unlock(&j1939_netdev_lock); +} + +/* get pointer to priv without increasing ref counter */ +static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev) +{ + struct can_ml_priv *can_ml_priv = ndev->ml_priv; + + return can_ml_priv->j1939_priv; +} + +static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev) +{ + struct j1939_priv *priv; + + lockdep_assert_held(&j1939_netdev_lock); + + if (ndev->type != ARPHRD_CAN) + return NULL; + + priv = j1939_ndev_to_priv(ndev); + if (priv) + j1939_priv_get(priv); + + return priv; +} + +static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev) +{ + struct j1939_priv *priv; + + spin_lock(&j1939_netdev_lock); + priv = j1939_priv_get_by_ndev_locked(ndev); + spin_unlock(&j1939_netdev_lock); + + return priv; +} + +struct j1939_priv *j1939_netdev_start(struct net_device *ndev) +{ + struct j1939_priv *priv, *priv_new; + int ret; + + priv = j1939_priv_get_by_ndev(ndev); + if (priv) { + kref_get(&priv->rx_kref); + return priv; + } + + priv = j1939_priv_create(ndev); + if (!priv) + return ERR_PTR(-ENOMEM); + + j1939_tp_init(priv); + spin_lock_init(&priv->j1939_socks_lock); + INIT_LIST_HEAD(&priv->j1939_socks); + + spin_lock(&j1939_netdev_lock); 
+ priv_new = j1939_priv_get_by_ndev_locked(ndev); + if (priv_new) { + /* Someone was faster than us, use their priv and roll + * back our's. + */ + spin_unlock(&j1939_netdev_lock); + dev_put(ndev); + kfree(priv); + kref_get(&priv_new->rx_kref); + return priv_new; + } + j1939_priv_set(ndev, priv); + spin_unlock(&j1939_netdev_lock); + + ret = j1939_can_rx_register(priv); + if (ret < 0) + goto out_priv_put; + + return priv; + + out_priv_put: + j1939_priv_set(ndev, NULL); + dev_put(ndev); + kfree(priv); + + return ERR_PTR(ret); +} + +void j1939_netdev_stop(struct j1939_priv *priv) +{ + kref_put_lock(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock); + j1939_priv_put(priv); +} + +int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) +{ + int ret, dlc; + canid_t canid; + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct can_frame *cf; + + /* apply sanity checks */ + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) + skcb->addr.pgn &= J1939_PGN_PDU1_MAX; + else + skcb->addr.pgn &= J1939_PGN_MAX; + + if (skcb->priority > 7) + skcb->priority = 6; + + ret = j1939_ac_fixup(priv, skb); + if (unlikely(ret)) + goto failed; + dlc = skb->len; + + /* re-claim the CAN_HDR from the SKB */ + cf = skb_push(skb, J1939_CAN_HDR); + + /* make it a full can frame again */ + skb_put(skb, J1939_CAN_FTR + (8 - dlc)); + + canid = CAN_EFF_FLAG | + (skcb->priority << 26) | + (skcb->addr.pgn << 8) | + skcb->addr.sa; + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) + canid |= skcb->addr.da << 8; + + cf->can_id = canid; + cf->can_dlc = dlc; + + return can_send(skb, 1); + + failed: + kfree_skb(skb); + return ret; +} + +static int j1939_netdev_notify(struct notifier_block *nb, + unsigned long msg, void *data) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(data); + struct j1939_priv *priv; + + priv = j1939_priv_get_by_ndev(ndev); + if (!priv) + goto notify_done; + + if (ndev->type != ARPHRD_CAN) + goto notify_put; + + switch (msg) { + case NETDEV_DOWN: + j1939_cancel_active_session(priv, NULL); + j1939_sk_netdev_event_netdown(priv); + j1939_ecu_unmap_all(priv); + break; + } + +notify_put: + j1939_priv_put(priv); + +notify_done: + return NOTIFY_DONE; +} + +static struct notifier_block j1939_netdev_notifier = { + .notifier_call = j1939_netdev_notify, +}; + +/* MODULE interface */ +static __init int j1939_module_init(void) +{ + int ret; + + pr_info("can: SAE J1939\n"); + + ret = register_netdevice_notifier(&j1939_netdev_notifier); + if (ret) + goto fail_notifier; + + ret = can_proto_register(&j1939_can_proto); + if (ret < 0) { + pr_err("can: registration of j1939 protocol failed\n"); + goto fail_sk; + } + + return 0; + + fail_sk: + unregister_netdevice_notifier(&j1939_netdev_notifier); + fail_notifier: + return ret; +} + +static __exit void j1939_module_exit(void) +{ + can_proto_unregister(&j1939_can_proto); + + unregister_netdevice_notifier(&j1939_netdev_notifier); +} + +module_init(j1939_module_init); +module_exit(j1939_module_exit); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c new file mode 100644 index 000000000000..37c1040bcb9c --- /dev/null +++ b/net/can/j1939/socket.c @@ -0,0 +1,1160 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2010-2011 EIA Electronics, +// Pieter Beyens +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck +// Copyright (c) 2018 Protonic, +// Robin van der Gracht +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include 
+#include +#include +#include + +#include "j1939-priv.h" + +#define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939) + +/* conversion function between struct sock::sk_priority from linux and + * j1939 priority field + */ +static inline priority_t j1939_prio(u32 sk_priority) +{ + sk_priority = min(sk_priority, 7U); + + return 7 - sk_priority; +} + +static inline u32 j1939_to_sk_priority(priority_t prio) +{ + return 7 - prio; +} + +/* function to see if pgn is to be evaluated */ +static inline bool j1939_pgn_is_valid(pgn_t pgn) +{ + return pgn <= J1939_PGN_MAX; +} + +/* test function to avoid non-zero DA placeholder for pdu1 pgn's */ +static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn) +{ + if (j1939_pgn_is_pdu1(pgn)) + return !(pgn & 0xff); + else + return true; +} + +static inline void j1939_sock_pending_add(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + atomic_inc(&jsk->skb_pending); +} + +static int j1939_sock_pending_get(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + return atomic_read(&jsk->skb_pending); +} + +void j1939_sock_pending_del(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + /* atomic_dec_return returns the new value */ + if (!atomic_dec_return(&jsk->skb_pending)) + wake_up(&jsk->waitq); /* no pending SKB's */ +} + +static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) +{ + jsk->state |= J1939_SOCK_BOUND; + j1939_priv_get(priv); + jsk->priv = priv; + + spin_lock_bh(&priv->j1939_socks_lock); + list_add_tail(&jsk->list, &priv->j1939_socks); + spin_unlock_bh(&priv->j1939_socks_lock); +} + +static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) +{ + spin_lock_bh(&priv->j1939_socks_lock); + list_del_init(&jsk->list); + spin_unlock_bh(&priv->j1939_socks_lock); + + jsk->priv = NULL; + j1939_priv_put(priv); + jsk->state &= ~J1939_SOCK_BOUND; +} + +static bool j1939_sk_queue_session(struct j1939_session *session) +{ + struct j1939_sock *jsk = j1939_sk(session->sk); + bool empty; + + spin_lock_bh(&jsk->sk_session_queue_lock); + empty = list_empty(&jsk->sk_session_queue); + j1939_session_get(session); + list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue); + spin_unlock_bh(&jsk->sk_session_queue_lock); + j1939_sock_pending_add(&jsk->sk); + + return empty; +} + +static struct +j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk) +{ + struct j1939_session *session = NULL; + + spin_lock_bh(&jsk->sk_session_queue_lock); + if (!list_empty(&jsk->sk_session_queue)) { + session = list_last_entry(&jsk->sk_session_queue, + struct j1939_session, + sk_session_queue_entry); + if (session->total_queued_size == session->total_message_size) + session = NULL; + else + j1939_session_get(session); + } + spin_unlock_bh(&jsk->sk_session_queue_lock); + + return session; +} + +static void j1939_sk_queue_drop_all(struct j1939_priv *priv, + struct j1939_sock *jsk, int err) +{ + struct j1939_session *session, *tmp; + + netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err); + spin_lock_bh(&jsk->sk_session_queue_lock); + list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue, + sk_session_queue_entry) { + list_del_init(&session->sk_session_queue_entry); + session->err = err; + j1939_session_put(session); + } + spin_unlock_bh(&jsk->sk_session_queue_lock); +} + +static void j1939_sk_queue_activate_next_locked(struct j1939_session *session) +{ + struct j1939_sock *jsk; + struct j1939_session *first; + int err; + + /* RX-Session don't have a socket (yet) */ 
+ if (!session->sk) + return; + + jsk = j1939_sk(session->sk); + lockdep_assert_held(&jsk->sk_session_queue_lock); + + err = session->err; + + first = list_first_entry_or_null(&jsk->sk_session_queue, + struct j1939_session, + sk_session_queue_entry); + + /* Someone else has already activated the next session */ + if (first != session) + return; + +activate_next: + list_del_init(&first->sk_session_queue_entry); + j1939_session_put(first); + first = list_first_entry_or_null(&jsk->sk_session_queue, + struct j1939_session, + sk_session_queue_entry); + if (!first) + return; + + if (WARN_ON_ONCE(j1939_session_activate(first))) { + first->err = -EBUSY; + goto activate_next; + } else { + /* Give receiver some time (arbitrarily chosen) to recover */ + int time_ms = 0; + + if (err) + time_ms = 10 + prandom_u32_max(16); + + j1939_tp_schedule_txtimer(first, time_ms); + } +} + +void j1939_sk_queue_activate_next(struct j1939_session *session) +{ + struct j1939_sock *jsk; + + if (!session->sk) + return; + + jsk = j1939_sk(session->sk); + + spin_lock_bh(&jsk->sk_session_queue_lock); + j1939_sk_queue_activate_next_locked(session); + spin_unlock_bh(&jsk->sk_session_queue_lock); +} + +static bool j1939_sk_match_dst(struct j1939_sock *jsk, + const struct j1939_sk_buff_cb *skcb) +{ + if ((jsk->state & J1939_SOCK_PROMISC)) + return true; + + /* Destination address filter */ + if (jsk->addr.src_name && skcb->addr.dst_name) { + if (jsk->addr.src_name != skcb->addr.dst_name) + return false; + } else { + /* receive (all sockets) if + * - all packets that match our bind() address + * - all broadcasts on a socket if SO_BROADCAST + * is set + */ + if (j1939_address_is_unicast(skcb->addr.da)) { + if (jsk->addr.sa != skcb->addr.da) + return false; + } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) { + /* receiving broadcast without SO_BROADCAST + * flag is not allowed + */ + return false; + } + } + + /* Source address filter */ + if (jsk->state & J1939_SOCK_CONNECTED) { + /* receive (all sockets) if + * - all packets that match our connect() name or address + */ + if (jsk->addr.dst_name && skcb->addr.src_name) { + if (jsk->addr.dst_name != skcb->addr.src_name) + return false; + } else { + if (jsk->addr.da != skcb->addr.sa) + return false; + } + } + + /* PGN filter */ + if (j1939_pgn_is_valid(jsk->pgn_rx_filter) && + jsk->pgn_rx_filter != skcb->addr.pgn) + return false; + + return true; +} + +/* matches skb control buffer (addr) with a j1939 filter */ +static bool j1939_sk_match_filter(struct j1939_sock *jsk, + const struct j1939_sk_buff_cb *skcb) +{ + const struct j1939_filter *f = jsk->filters; + int nfilter = jsk->nfilters; + + if (!nfilter) + /* receive all when no filters are assigned */ + return true; + + for (; nfilter; ++f, --nfilter) { + if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) + continue; + if ((skcb->addr.sa & f->addr_mask) != f->addr) + continue; + if ((skcb->addr.src_name & f->name_mask) != f->name) + continue; + return true; + } + return false; +} + +static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, + const struct j1939_sk_buff_cb *skcb) +{ + if (!(jsk->state & J1939_SOCK_BOUND)) + return false; + + if (!j1939_sk_match_dst(jsk, skcb)) + return false; + + if (!j1939_sk_match_filter(jsk, skcb)) + return false; + + return true; +} + +static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) +{ + const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); + struct j1939_sk_buff_cb *skcb; + struct sk_buff *skb; + + if (oskb->sk == &jsk->sk) + return; + + if 
(!j1939_sk_recv_match_one(jsk, oskcb)) + return; + + skb = skb_clone(oskb, GFP_ATOMIC); + if (!skb) { + pr_warn("skb clone failed\n"); + return; + } + can_skb_set_owner(skb, oskb->sk); + + skcb = j1939_skb_to_cb(skb); + skcb->msg_flags &= ~(MSG_DONTROUTE); + if (skb->sk) + skcb->msg_flags |= MSG_DONTROUTE; + + if (sock_queue_rcv_skb(&jsk->sk, skb) < 0) + kfree_skb(skb); +} + +bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) +{ + struct j1939_sock *jsk; + bool match = false; + + spin_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + match = j1939_sk_recv_match_one(jsk, skcb); + if (match) + break; + } + spin_unlock_bh(&priv->j1939_socks_lock); + + return match; +} + +void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sock *jsk; + + spin_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + j1939_sk_recv_one(jsk, skb); + } + spin_unlock_bh(&priv->j1939_socks_lock); +} + +static int j1939_sk_init(struct sock *sk) +{ + struct j1939_sock *jsk = j1939_sk(sk); + + /* Ensure that "sk" is first member in "struct j1939_sock", so that we + * can skip it during memset(). + */ + BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0); + memset((void *)jsk + sizeof(jsk->sk), 0x0, + sizeof(*jsk) - sizeof(jsk->sk)); + + INIT_LIST_HEAD(&jsk->list); + init_waitqueue_head(&jsk->waitq); + jsk->sk.sk_priority = j1939_to_sk_priority(6); + jsk->sk.sk_reuse = 1; /* per default */ + jsk->addr.sa = J1939_NO_ADDR; + jsk->addr.da = J1939_NO_ADDR; + jsk->addr.pgn = J1939_NO_PGN; + jsk->pgn_rx_filter = J1939_NO_PGN; + atomic_set(&jsk->skb_pending, 0); + spin_lock_init(&jsk->sk_session_queue_lock); + INIT_LIST_HEAD(&jsk->sk_session_queue); + + return 0; +} + +static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) +{ + if (!addr) + return -EDESTADDRREQ; + if (len < J1939_MIN_NAMELEN) + return -EINVAL; + if (addr->can_family != AF_CAN) + return -EINVAL; + if (!addr->can_ifindex) + return -ENODEV; + if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && + !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) + return -EINVAL; + + return 0; +} + +static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct j1939_sock *jsk = j1939_sk(sock->sk); + struct j1939_priv *priv = jsk->priv; + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + int ret = 0; + + ret = j1939_sk_sanity_check(addr, len); + if (ret) + return ret; + + lock_sock(sock->sk); + + /* Already bound to an interface? */ + if (jsk->state & J1939_SOCK_BOUND) { + /* A re-bind() to a different interface is not + * supported. 
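+ * A re-bind() to the same interface, however, is allowed and simply + * refreshes the ECU name/address references below.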
+ */ + if (jsk->ifindex != addr->can_ifindex) { + ret = -EINVAL; + goto out_release_sock; + } + + /* drop old references */ + j1939_jsk_del(priv, jsk); + j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); + } else { + struct net_device *ndev; + + ndev = dev_get_by_index(net, addr->can_ifindex); + if (!ndev) { + ret = -ENODEV; + goto out_release_sock; + } + + if (ndev->type != ARPHRD_CAN) { + dev_put(ndev); + ret = -ENODEV; + goto out_release_sock; + } + + priv = j1939_netdev_start(ndev); + dev_put(ndev); + if (IS_ERR(priv)) { + ret = PTR_ERR(priv); + goto out_release_sock; + } + + jsk->ifindex = addr->can_ifindex; + } + + /* set default transmit pgn */ + if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) + jsk->pgn_rx_filter = addr->can_addr.j1939.pgn; + jsk->addr.src_name = addr->can_addr.j1939.name; + jsk->addr.sa = addr->can_addr.j1939.addr; + + /* get new references */ + ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); + if (ret) { + j1939_netdev_stop(priv); + goto out_release_sock; + } + + j1939_jsk_add(priv, jsk); + + out_release_sock: /* fall through */ + release_sock(sock->sk); + + return ret; +} + +static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, + int len, int flags) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct j1939_sock *jsk = j1939_sk(sock->sk); + int ret = 0; + + ret = j1939_sk_sanity_check(addr, len); + if (ret) + return ret; + + lock_sock(sock->sk); + + /* bind() before connect() is mandatory */ + if (!(jsk->state & J1939_SOCK_BOUND)) { + ret = -EINVAL; + goto out_release_sock; + } + + /* A connect() to a different interface is not supported. */ + if (jsk->ifindex != addr->can_ifindex) { + ret = -EINVAL; + goto out_release_sock; + } + + if (!addr->can_addr.j1939.name && + addr->can_addr.j1939.addr == J1939_NO_ADDR && + !sock_flag(&jsk->sk, SOCK_BROADCAST)) { + /* broadcast, but SO_BROADCAST not set */ + ret = -EACCES; + goto out_release_sock; + } + + jsk->addr.dst_name = addr->can_addr.j1939.name; + jsk->addr.da = addr->can_addr.j1939.addr; + + if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) + jsk->addr.pgn = addr->can_addr.j1939.pgn; + + jsk->state |= J1939_SOCK_CONNECTED; + + out_release_sock: /* fall through */ + release_sock(sock->sk); + + return ret; +} + +static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, + const struct j1939_sock *jsk, int peer) +{ + addr->can_family = AF_CAN; + addr->can_ifindex = jsk->ifindex; + addr->can_addr.j1939.pgn = jsk->addr.pgn; + if (peer) { + addr->can_addr.j1939.name = jsk->addr.dst_name; + addr->can_addr.j1939.addr = jsk->addr.da; + } else { + addr->can_addr.j1939.name = jsk->addr.src_name; + addr->can_addr.j1939.addr = jsk->addr.sa; + } +} + +static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct j1939_sock *jsk = j1939_sk(sk); + int ret = 0; + + lock_sock(sk); + + if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) { + ret = -EADDRNOTAVAIL; + goto failure; + } + + j1939_sk_sock2sockaddr_can(addr, jsk, peer); + ret = J1939_MIN_NAMELEN; + + failure: + release_sock(sk); + + return ret; +} + +static int j1939_sk_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct j1939_sock *jsk; + + if (!sk) + return 0; + + jsk = j1939_sk(sk); + lock_sock(sk); + + if (jsk->state & J1939_SOCK_BOUND) { + struct j1939_priv *priv = jsk->priv; + + if (wait_event_interruptible(jsk->waitq, + 
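+ /* wait for all queued skbs to be transmitted; if a signal + * arrives first, wait_event_interruptible() returns nonzero + * and the still-pending sessions are force-aborted below */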
!j1939_sock_pending_get(&jsk->sk))) { + j1939_cancel_active_session(priv, sk); + j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN); + } + + j1939_jsk_del(priv, jsk); + + j1939_local_ecu_put(priv, jsk->addr.src_name, + jsk->addr.sa); + + j1939_netdev_stop(priv); + } + + sock_orphan(sk); + sock->sk = NULL; + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, char __user *optval, + unsigned int optlen, int flag) +{ + int tmp; + + if (optlen != sizeof(tmp)) + return -EINVAL; + if (copy_from_user(&tmp, optval, optlen)) + return -EFAULT; + lock_sock(&jsk->sk); + if (tmp) + jsk->state |= flag; + else + jsk->state &= ~flag; + release_sock(&jsk->sk); + return tmp; +} + +static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct j1939_sock *jsk = j1939_sk(sk); + int tmp, count = 0, ret = 0; + struct j1939_filter *filters = NULL, *ofilters; + + if (level != SOL_CAN_J1939) + return -EINVAL; + + switch (optname) { + case SO_J1939_FILTER: + if (optval) { + struct j1939_filter *f; + int c; + + if (optlen % sizeof(*filters) != 0) + return -EINVAL; + + if (optlen > J1939_FILTER_MAX * + sizeof(struct j1939_filter)) + return -EINVAL; + + count = optlen / sizeof(*filters); + filters = memdup_user(optval, optlen); + if (IS_ERR(filters)) + return PTR_ERR(filters); + + for (f = filters, c = count; c; f++, c--) { + f->name &= f->name_mask; + f->pgn &= f->pgn_mask; + f->addr &= f->addr_mask; + } + } + + lock_sock(&jsk->sk); + ofilters = jsk->filters; + jsk->filters = filters; + jsk->nfilters = count; + release_sock(&jsk->sk); + kfree(ofilters); + return 0; + case SO_J1939_PROMISC: + return j1939_sk_setsockopt_flag(jsk, optval, optlen, + J1939_SOCK_PROMISC); + case SO_J1939_ERRQUEUE: + ret = j1939_sk_setsockopt_flag(jsk, optval, optlen, + J1939_SOCK_ERRQUEUE); + if (ret < 0) + return ret; + + if (!(jsk->state & J1939_SOCK_ERRQUEUE)) + skb_queue_purge(&sk->sk_error_queue); + return ret; + case SO_J1939_SEND_PRIO: + if (optlen != sizeof(tmp)) + return -EINVAL; + if (copy_from_user(&tmp, optval, optlen)) + return -EFAULT; + if (tmp < 0 || tmp > 7) + return -EDOM; + if (tmp < 2 && !capable(CAP_NET_ADMIN)) + return -EPERM; + lock_sock(&jsk->sk); + jsk->sk.sk_priority = j1939_to_sk_priority(tmp); + release_sock(&jsk->sk); + return 0; + default: + return -ENOPROTOOPT; + } +} + +static int j1939_sk_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct j1939_sock *jsk = j1939_sk(sk); + int ret, ulen; + /* set defaults for using 'int' properties */ + int tmp = 0; + int len = sizeof(tmp); + void *val = &tmp; + + if (level != SOL_CAN_J1939) + return -EINVAL; + if (get_user(ulen, optlen)) + return -EFAULT; + if (ulen < 0) + return -EINVAL; + + lock_sock(&jsk->sk); + switch (optname) { + case SO_J1939_PROMISC: + tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0; + break; + case SO_J1939_ERRQUEUE: + tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 
1 : 0; + break; + case SO_J1939_SEND_PRIO: + tmp = j1939_prio(jsk->sk.sk_priority); + break; + default: + ret = -ENOPROTOOPT; + goto no_copy; + } + + /* copy to user, based on 'len' & 'val' + * but most sockopt's are 'int' properties, and have 'len' & 'val' + * left unchanged, but instead modified 'tmp' + */ + if (len > ulen) + ret = -EFAULT; + else if (put_user(len, optlen)) + ret = -EFAULT; + else if (copy_to_user(optval, val, len)) + ret = -EFAULT; + else + ret = 0; + no_copy: + release_sock(&jsk->sk); + return ret; +} + +static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + struct j1939_sk_buff_cb *skcb; + int ret = 0; + + if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE)) + return -EINVAL; + + if (flags & MSG_ERRQUEUE) + return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, + SCM_J1939_ERRQUEUE); + + skb = skb_recv_datagram(sk, flags, 0, &ret); + if (!skb) + return ret; + + if (size < skb->len) + msg->msg_flags |= MSG_TRUNC; + else + size = skb->len; + + ret = memcpy_to_msg(msg, skb->data, size); + if (ret < 0) { + skb_free_datagram(sk, skb); + return ret; + } + + skcb = j1939_skb_to_cb(skb); + if (j1939_address_is_valid(skcb->addr.da)) + put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR, + sizeof(skcb->addr.da), &skcb->addr.da); + + if (skcb->addr.dst_name) + put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME, + sizeof(skcb->addr.dst_name), &skcb->addr.dst_name); + + put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO, + sizeof(skcb->priority), &skcb->priority); + + if (msg->msg_name) { + struct sockaddr_can *paddr = msg->msg_name; + + msg->msg_namelen = J1939_MIN_NAMELEN; + memset(msg->msg_name, 0, msg->msg_namelen); + paddr->can_family = AF_CAN; + paddr->can_ifindex = skb->skb_iif; + paddr->can_addr.j1939.name = skcb->addr.src_name; + paddr->can_addr.j1939.addr = skcb->addr.sa; + paddr->can_addr.j1939.pgn = skcb->addr.pgn; + } + + sock_recv_ts_and_drops(msg, sk, skb); + msg->msg_flags |= skcb->msg_flags; + skb_free_datagram(sk, skb); + + return size; +} + +static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, + struct sock *sk, + struct msghdr *msg, size_t size, + int *errcode) +{ + struct j1939_sock *jsk = j1939_sk(sk); + struct j1939_sk_buff_cb *skcb; + struct sk_buff *skb; + int ret; + + skb = sock_alloc_send_skb(sk, + size + + sizeof(struct can_frame) - + sizeof(((struct can_frame *)NULL)->data) + + sizeof(struct can_skb_priv), + msg->msg_flags & MSG_DONTWAIT, &ret); + if (!skb) + goto failure; + + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = ndev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + skb_reserve(skb, offsetof(struct can_frame, data)); + + ret = memcpy_from_msg(skb_put(skb, size), msg, size); + if (ret < 0) + goto free_skb; + + skb->dev = ndev; + + skcb = j1939_skb_to_cb(skb); + memset(skcb, 0, sizeof(*skcb)); + skcb->addr = jsk->addr; + skcb->priority = j1939_prio(sk->sk_priority); + + if (msg->msg_name) { + struct sockaddr_can *addr = msg->msg_name; + + if (addr->can_addr.j1939.name || + addr->can_addr.j1939.addr != J1939_NO_ADDR) { + skcb->addr.dst_name = addr->can_addr.j1939.name; + skcb->addr.da = addr->can_addr.j1939.addr; + } + if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) + skcb->addr.pgn = addr->can_addr.j1939.pgn; + } + + *errcode = ret; + return skb; + +free_skb: + kfree_skb(skb); +failure: + *errcode = ret; + return NULL; +} + +static size_t j1939_sk_opt_stats_get_size(void) +{ + return + nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ + 
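+ /* the trailing 0 terminates the sum so each nla_total_size() + * line above can end in '+'; append new stats before it */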
0; +} + +static struct sk_buff * +j1939_sk_get_timestamping_opt_stats(struct j1939_session *session) +{ + struct sk_buff *stats; + u32 size; + + stats = alloc_skb(j1939_sk_opt_stats_get_size(), GFP_ATOMIC); + if (!stats) + return NULL; + + if (session->skcb.addr.type == J1939_SIMPLE) + size = session->total_message_size; + else + size = min(session->pkt.tx_acked * 7, + session->total_message_size); + + nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); + + return stats; +} + +void j1939_sk_errqueue(struct j1939_session *session, + enum j1939_sk_errqueue_type type) +{ + struct j1939_priv *priv = session->priv; + struct sock *sk = session->sk; + struct j1939_sock *jsk; + struct sock_exterr_skb *serr; + struct sk_buff *skb; + char *state = "UNK"; + int err; + + /* currently we have no sk for the RX session */ + if (!sk) + return; + + jsk = j1939_sk(sk); + + if (!(jsk->state & J1939_SOCK_ERRQUEUE)) + return; + + skb = j1939_sk_get_timestamping_opt_stats(session); + if (!skb) + return; + + skb->tstamp = ktime_get_real(); + + BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + switch (type) { + case J1939_ERRQUEUE_ACK: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) + return; + + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = SCM_TSTAMP_ACK; + state = "ACK"; + break; + case J1939_ERRQUEUE_SCHED: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) + return; + + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = SCM_TSTAMP_SCHED; + state = "SCH"; + break; + case J1939_ERRQUEUE_ABORT: + serr->ee.ee_errno = session->err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; + state = "ABT"; + break; + default: + netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); + } + + serr->opt_stats = true; + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + serr->ee.ee_data = session->tskey; + + netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", + __func__, session, session->tskey, state); + err = sock_queue_err_skb(sk, skb); + + if (err) + kfree_skb(skb); +} + +void j1939_sk_send_loop_abort(struct sock *sk, int err) +{ + sk->sk_err = err; + + sk->sk_error_report(sk); +} + +static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, + struct msghdr *msg, size_t size) +{ + struct j1939_sock *jsk = j1939_sk(sk); + struct j1939_session *session = j1939_sk_get_incomplete_session(jsk); + struct sk_buff *skb; + size_t segment_size, todo_size; + int ret = 0; + + if (session && + session->total_message_size != session->total_queued_size + size) { + j1939_session_put(session); + return -EIO; + } + + todo_size = size; + + while (todo_size) { + struct j1939_sk_buff_cb *skcb; + + segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, + todo_size); + + /* Allocate skb for one segment */ + skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size, + &ret); + if (ret) + break; + + skcb = j1939_skb_to_cb(skb); + + if (!session) { + /* at this point the size should be full size + * of the session + */ + skcb->offset = 0; + session = j1939_tp_send(priv, skb, size); + if (IS_ERR(session)) { + ret = PTR_ERR(session); + goto kfree_skb; + } + if (j1939_sk_queue_session(session)) { + /* try to activate the session if we are + * first in the queue + */ + if (!j1939_session_activate(session)) { + j1939_tp_schedule_txtimer(session, 0); + } else { + ret = -EBUSY; + session->err = ret; +
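+ /* we were first in the queue, yet activation failed; flush + * the socket's whole session queue so nothing is left behind */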
j1939_sk_queue_drop_all(priv, jsk, + EBUSY); + break; + } + } + } else { + skcb->offset = session->total_queued_size; + j1939_session_skb_queue(session, skb); + } + + todo_size -= segment_size; + session->total_queued_size += segment_size; + } + + switch (ret) { + case 0: /* OK */ + if (todo_size) + netdev_warn(priv->ndev, + "no error found and not completely queued?! %zu\n", + todo_size); + ret = size; + break; + case -ERESTARTSYS: + ret = -EINTR; + /* fall through */ + case -EAGAIN: /* OK */ + if (todo_size != size) + ret = size - todo_size; + break; + default: /* ERROR */ + break; + } + + if (session) + j1939_session_put(session); + + return ret; + + kfree_skb: + kfree_skb(skb); + return ret; +} + +static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, + size_t size) +{ + struct sock *sk = sock->sk; + struct j1939_sock *jsk = j1939_sk(sk); + struct j1939_priv *priv = jsk->priv; + int ifindex; + int ret; + + /* various socket state tests */ + if (!(jsk->state & J1939_SOCK_BOUND)) + return -EBADFD; + + ifindex = jsk->ifindex; + + if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) + /* no source address assigned yet */ + return -EBADFD; + + /* deal with provided destination address info */ + if (msg->msg_name) { + struct sockaddr_can *addr = msg->msg_name; + + if (msg->msg_namelen < J1939_MIN_NAMELEN) + return -EINVAL; + + if (addr->can_family != AF_CAN) + return -EINVAL; + + if (addr->can_ifindex && addr->can_ifindex != ifindex) + return -EBADFD; + + if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && + !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) + return -EINVAL; + + if (!addr->can_addr.j1939.name && + addr->can_addr.j1939.addr == J1939_NO_ADDR && + !sock_flag(sk, SOCK_BROADCAST)) + /* broadcast, but SO_BROADCAST not set */ + return -EACCES; + } else { + if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && + !sock_flag(sk, SOCK_BROADCAST)) + /* broadcast, but SO_BROADCAST not set */ + return -EACCES; + } + + ret = j1939_sk_send_loop(priv, sk, msg, size); + + return ret; +} + +void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) +{ + struct j1939_sock *jsk; + int error_code = ENETDOWN; + + spin_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + jsk->sk.sk_err = error_code; + if (!sock_flag(&jsk->sk, SOCK_DEAD)) + jsk->sk.sk_error_report(&jsk->sk); + + j1939_sk_queue_drop_all(priv, jsk, error_code); + } + spin_unlock_bh(&priv->j1939_socks_lock); +} + +static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + /* no ioctls for socket layer -> hand it down to NIC layer */ + return -ENOIOCTLCMD; +} + +static const struct proto_ops j1939_ops = { + .family = PF_CAN, + .release = j1939_sk_release, + .bind = j1939_sk_bind, + .connect = j1939_sk_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = j1939_sk_getname, + .poll = datagram_poll, + .ioctl = j1939_sk_no_ioctlcmd, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = j1939_sk_setsockopt, + .getsockopt = j1939_sk_getsockopt, + .sendmsg = j1939_sk_sendmsg, + .recvmsg = j1939_sk_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto j1939_proto __read_mostly = { + .name = "CAN_J1939", + .owner = THIS_MODULE, + .obj_size = sizeof(struct j1939_sock), + .init = j1939_sk_init, +}; + +const struct can_proto j1939_can_proto = { + .type = SOCK_DGRAM, + .protocol = CAN_J1939, + .ops = &j1939_ops, + .prot = &j1939_proto, +}; diff --git 
a/net/can/j1939/transport.c b/net/can/j1939/transport.c new file mode 100644 index 000000000000..fe000ea757ea --- /dev/null +++ b/net/can/j1939/transport.c @@ -0,0 +1,2027 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2010-2011 EIA Electronics, +// Kurt Van Dijck +// Copyright (c) 2018 Protonic, +// Robin van der Gracht +// Copyright (c) 2017-2019 Pengutronix, +// Marc Kleine-Budde +// Copyright (c) 2017-2019 Pengutronix, +// Oleksij Rempel + +#include + +#include "j1939-priv.h" + +#define J1939_XTP_TX_RETRY_LIMIT 100 + +#define J1939_ETP_PGN_CTL 0xc800 +#define J1939_ETP_PGN_DAT 0xc700 +#define J1939_TP_PGN_CTL 0xec00 +#define J1939_TP_PGN_DAT 0xeb00 + +#define J1939_TP_CMD_RTS 0x10 +#define J1939_TP_CMD_CTS 0x11 +#define J1939_TP_CMD_EOMA 0x13 +#define J1939_TP_CMD_BAM 0x20 +#define J1939_TP_CMD_ABORT 0xff + +#define J1939_ETP_CMD_RTS 0x14 +#define J1939_ETP_CMD_CTS 0x15 +#define J1939_ETP_CMD_DPO 0x16 +#define J1939_ETP_CMD_EOMA 0x17 +#define J1939_ETP_CMD_ABORT 0xff + +enum j1939_xtp_abort { + J1939_XTP_NO_ABORT = 0, + J1939_XTP_ABORT_BUSY = 1, + /* Already in one or more connection managed sessions and + * cannot support another. + * + * EALREADY: + * Operation already in progress + */ + + J1939_XTP_ABORT_RESOURCE = 2, + /* System resources were needed for another task so this + * connection managed session was terminated. + * + * EMSGSIZE: + * The socket type requires that message be sent atomically, + * and the size of the message to be sent made this + * impossible. + */ + + J1939_XTP_ABORT_TIMEOUT = 3, + /* A timeout occurred and this is the connection abort to + * close the session. + * + * EHOSTUNREACH: + * The destination host cannot be reached (probably because + * the host is down or a remote router cannot reach it). + */ + + J1939_XTP_ABORT_GENERIC = 4, + /* CTS messages received when data transfer is in progress + * + * EBADMSG: + * Not a data message + */ + + J1939_XTP_ABORT_FAULT = 5, + /* Maximal retransmit request limit reached + * + * ENOTRECOVERABLE: + * State not recoverable + */ + + J1939_XTP_ABORT_UNEXPECTED_DATA = 6, + /* Unexpected data transfer packet + * + * ENOTCONN: + * Transport endpoint is not connected + */ + + J1939_XTP_ABORT_BAD_SEQ = 7, + /* Bad sequence number (and software is not able to recover) + * + * EILSEQ: + * Illegal byte sequence + */ + + J1939_XTP_ABORT_DUP_SEQ = 8, + /* Duplicate sequence number (and software is not able to + * recover) + */ + + J1939_XTP_ABORT_EDPO_UNEXPECTED = 9, + /* Unexpected EDPO packet (ETP) or Message size > 1785 bytes + * (TP) + */ + + J1939_XTP_ABORT_BAD_EDPO_PGN = 10, + /* Unexpected EDPO PGN (PGN in EDPO is bad) */ + + J1939_XTP_ABORT_EDPO_OUTOF_CTS = 11, + /* EDPO number of packets is greater than CTS */ + + J1939_XTP_ABORT_BAD_EDPO_OFFSET = 12, + /* Bad EDPO offset */ + + J1939_XTP_ABORT_OTHER_DEPRECATED = 13, + /* Deprecated. 
Use 250 instead (Any other reason) */ + + J1939_XTP_ABORT_ECTS_UNXPECTED_PGN = 14, + /* Unexpected ECTS PGN (PGN in ECTS is bad) */ + + J1939_XTP_ABORT_ECTS_TOO_BIG = 15, + /* ECTS requested packets exceeds message size */ + + J1939_XTP_ABORT_OTHER = 250, + /* Any other reason (if a Connection Abort reason is + * identified that is not listed in the table use code 250) + */ +}; + +static unsigned int j1939_tp_block = 255; +static unsigned int j1939_tp_packet_delay; +static unsigned int j1939_tp_padding = 1; + +/* helpers */ +static const char *j1939_xtp_abort_to_str(enum j1939_xtp_abort abort) +{ + switch (abort) { + case J1939_XTP_ABORT_BUSY: + return "Already in one or more connection managed sessions and cannot support another."; + case J1939_XTP_ABORT_RESOURCE: + return "System resources were needed for another task so this connection managed session was terminated."; + case J1939_XTP_ABORT_TIMEOUT: + return "A timeout occurred and this is the connection abort to close the session."; + case J1939_XTP_ABORT_GENERIC: + return "CTS messages received when data transfer is in progress"; + case J1939_XTP_ABORT_FAULT: + return "Maximal retransmit request limit reached"; + case J1939_XTP_ABORT_UNEXPECTED_DATA: + return "Unexpected data transfer packet"; + case J1939_XTP_ABORT_BAD_SEQ: + return "Bad sequence number (and software is not able to recover)"; + case J1939_XTP_ABORT_DUP_SEQ: + return "Duplicate sequence number (and software is not able to recover)"; + case J1939_XTP_ABORT_EDPO_UNEXPECTED: + return "Unexpected EDPO packet (ETP) or Message size > 1785 bytes (TP)"; + case J1939_XTP_ABORT_BAD_EDPO_PGN: + return "Unexpected EDPO PGN (PGN in EDPO is bad)"; + case J1939_XTP_ABORT_EDPO_OUTOF_CTS: + return "EDPO number of packets is greater than CTS"; + case J1939_XTP_ABORT_BAD_EDPO_OFFSET: + return "Bad EDPO offset"; + case J1939_XTP_ABORT_OTHER_DEPRECATED: + return "Deprecated. 
Use 250 instead (Any other reason)"; + case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN: + return "Unexpected ECTS PGN (PGN in ECTS is bad)"; + case J1939_XTP_ABORT_ECTS_TOO_BIG: + return "ECTS requested packets exceeds message size"; + case J1939_XTP_ABORT_OTHER: + return "Any other reason (if a Connection Abort reason is identified that is not listed in the table use code 250)"; + default: + return ""; + } +} + +static int j1939_xtp_abort_to_errno(struct j1939_priv *priv, + enum j1939_xtp_abort abort) +{ + int err; + + switch (abort) { + case J1939_XTP_NO_ABORT: + WARN_ON_ONCE(abort == J1939_XTP_NO_ABORT); + err = 0; + break; + case J1939_XTP_ABORT_BUSY: + err = EALREADY; + break; + case J1939_XTP_ABORT_RESOURCE: + err = EMSGSIZE; + break; + case J1939_XTP_ABORT_TIMEOUT: + err = EHOSTUNREACH; + break; + case J1939_XTP_ABORT_GENERIC: + err = EBADMSG; + break; + case J1939_XTP_ABORT_FAULT: + err = ENOTRECOVERABLE; + break; + case J1939_XTP_ABORT_UNEXPECTED_DATA: + err = ENOTCONN; + break; + case J1939_XTP_ABORT_BAD_SEQ: + err = EILSEQ; + break; + case J1939_XTP_ABORT_DUP_SEQ: + err = EPROTO; + break; + case J1939_XTP_ABORT_EDPO_UNEXPECTED: + err = EPROTO; + break; + case J1939_XTP_ABORT_BAD_EDPO_PGN: + err = EPROTO; + break; + case J1939_XTP_ABORT_EDPO_OUTOF_CTS: + err = EPROTO; + break; + case J1939_XTP_ABORT_BAD_EDPO_OFFSET: + err = EPROTO; + break; + case J1939_XTP_ABORT_OTHER_DEPRECATED: + err = EPROTO; + break; + case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN: + err = EPROTO; + break; + case J1939_XTP_ABORT_ECTS_TOO_BIG: + err = EPROTO; + break; + case J1939_XTP_ABORT_OTHER: + err = EPROTO; + break; + default: + netdev_warn(priv->ndev, "Unknown abort code %i", abort); + err = EPROTO; + } + + return err; +} + +static inline void j1939_session_list_lock(struct j1939_priv *priv) +{ + spin_lock_bh(&priv->active_session_list_lock); +} + +static inline void j1939_session_list_unlock(struct j1939_priv *priv) +{ + spin_unlock_bh(&priv->active_session_list_lock); +} + +void j1939_session_get(struct j1939_session *session) +{ + kref_get(&session->kref); +} + +/* session completion functions */ +static void __j1939_session_drop(struct j1939_session *session) +{ + if (!session->transmission) + return; + + j1939_sock_pending_del(session->sk); +} + +static void j1939_session_destroy(struct j1939_session *session) +{ + if (session->err) + j1939_sk_errqueue(session, J1939_ERRQUEUE_ABORT); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_ACK); + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + skb_queue_purge(&session->skb_queue); + __j1939_session_drop(session); + j1939_priv_put(session->priv); + kfree(session); +} + +static void __j1939_session_release(struct kref *kref) +{ + struct j1939_session *session = container_of(kref, struct j1939_session, + kref); + + j1939_session_destroy(session); +} + +void j1939_session_put(struct j1939_session *session) +{ + kref_put(&session->kref, __j1939_session_release); +} + +static void j1939_session_txtimer_cancel(struct j1939_session *session) +{ + if (hrtimer_cancel(&session->txtimer)) + j1939_session_put(session); +} + +static void j1939_session_rxtimer_cancel(struct j1939_session *session) +{ + if (hrtimer_cancel(&session->rxtimer)) + j1939_session_put(session); +} + +void j1939_session_timers_cancel(struct j1939_session *session) +{ + j1939_session_txtimer_cancel(session); + j1939_session_rxtimer_cancel(session); +} + +static inline bool j1939_cb_is_broadcast(const struct j1939_sk_buff_cb *skcb) +{ + return (!skcb->addr.dst_name && (skcb->addr.da 
== 0xff)); +} + +static void j1939_session_skb_drop_old(struct j1939_session *session) +{ + struct sk_buff *do_skb; + struct j1939_sk_buff_cb *do_skcb; + unsigned int offset_start; + unsigned long flags; + + if (skb_queue_len(&session->skb_queue) < 2) + return; + + offset_start = session->pkt.tx_acked * 7; + + spin_lock_irqsave(&session->skb_queue.lock, flags); + do_skb = skb_peek(&session->skb_queue); + do_skcb = j1939_skb_to_cb(do_skb); + + if ((do_skcb->offset + do_skb->len) < offset_start) { + __skb_unlink(do_skb, &session->skb_queue); + kfree_skb(do_skb); + } + spin_unlock_irqrestore(&session->skb_queue.lock, flags); +} + +void j1939_session_skb_queue(struct j1939_session *session, + struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_priv *priv = session->priv; + + j1939_ac_fixup(priv, skb); + + if (j1939_address_is_unicast(skcb->addr.da) && + priv->ents[skcb->addr.da].nusers) + skcb->flags |= J1939_ECU_LOCAL_DST; + + skcb->flags |= J1939_ECU_LOCAL_SRC; + + skb_queue_tail(&session->skb_queue, skb); +} + +static struct sk_buff *j1939_session_skb_find(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + struct sk_buff *skb = NULL; + struct sk_buff *do_skb; + struct j1939_sk_buff_cb *do_skcb; + unsigned int offset_start; + unsigned long flags; + + offset_start = session->pkt.dpo * 7; + + spin_lock_irqsave(&session->skb_queue.lock, flags); + skb_queue_walk(&session->skb_queue, do_skb) { + do_skcb = j1939_skb_to_cb(do_skb); + + if (offset_start >= do_skcb->offset && + offset_start < (do_skcb->offset + do_skb->len)) { + skb = do_skb; + } + } + spin_unlock_irqrestore(&session->skb_queue.lock, flags); + + if (!skb) + netdev_dbg(priv->ndev, "%s: 0x%p: no skb found for start: %i, queue size: %i\n", + __func__, session, offset_start, + skb_queue_len(&session->skb_queue)); + + return skb; +} + +/* see if we are receiver + * returns 0 for broadcasts, although we will receive them + */ +static inline int j1939_tp_im_receiver(const struct j1939_sk_buff_cb *skcb) +{ + return skcb->flags & J1939_ECU_LOCAL_DST; +} + +/* see if we are sender */ +static inline int j1939_tp_im_transmitter(const struct j1939_sk_buff_cb *skcb) +{ + return skcb->flags & J1939_ECU_LOCAL_SRC; +} + +/* see if we are involved as either receiver or transmitter */ +static int j1939_tp_im_involved(const struct j1939_sk_buff_cb *skcb, bool swap) +{ + if (swap) + return j1939_tp_im_receiver(skcb); + else + return j1939_tp_im_transmitter(skcb); +} + +static int j1939_tp_im_involved_anydir(struct j1939_sk_buff_cb *skcb) +{ + return skcb->flags & (J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST); +} + +/* extract pgn from flow-ctl message */ +static inline pgn_t j1939_xtp_ctl_to_pgn(const u8 *dat) +{ + pgn_t pgn; + + pgn = (dat[7] << 16) | (dat[6] << 8) | (dat[5] << 0); + if (j1939_pgn_is_pdu1(pgn)) + pgn &= 0xffff00; + return pgn; +} + +static inline unsigned int j1939_tp_ctl_to_size(const u8 *dat) +{ + return (dat[2] << 8) + (dat[1] << 0); +} + +static inline unsigned int j1939_etp_ctl_to_packet(const u8 *dat) +{ + return (dat[4] << 16) | (dat[3] << 8) | (dat[2] << 0); +} + +static inline unsigned int j1939_etp_ctl_to_size(const u8 *dat) +{ + return (dat[4] << 24) | (dat[3] << 16) | + (dat[2] << 8) | (dat[1] << 0); +} + +/* find existing session: + * reverse: swap cb's src & dst + * there is no problem with matching broadcasts, since + * broadcasts (no dst, no da) would never call this + * with reverse == true + */ +static bool j1939_session_match(struct j1939_addr 
*se_addr, + struct j1939_addr *sk_addr, bool reverse) +{ + if (se_addr->type != sk_addr->type) + return false; + + if (reverse) { + if (se_addr->src_name) { + if (se_addr->src_name != sk_addr->dst_name) + return false; + } else if (se_addr->sa != sk_addr->da) { + return false; + } + + if (se_addr->dst_name) { + if (se_addr->dst_name != sk_addr->src_name) + return false; + } else if (se_addr->da != sk_addr->sa) { + return false; + } + } else { + if (se_addr->src_name) { + if (se_addr->src_name != sk_addr->src_name) + return false; + } else if (se_addr->sa != sk_addr->sa) { + return false; + } + + if (se_addr->dst_name) { + if (se_addr->dst_name != sk_addr->dst_name) + return false; + } else if (se_addr->da != sk_addr->da) { + return false; + } + } + + return true; +} + +static struct +j1939_session *j1939_session_get_by_addr_locked(struct j1939_priv *priv, + struct list_head *root, + struct j1939_addr *addr, + bool reverse, bool transmitter) +{ + struct j1939_session *session; + + lockdep_assert_held(&priv->active_session_list_lock); + + list_for_each_entry(session, root, active_session_list_entry) { + j1939_session_get(session); + if (j1939_session_match(&session->skcb.addr, addr, reverse) && + session->transmission == transmitter) + return session; + j1939_session_put(session); + } + + return NULL; +} + +static struct +j1939_session *j1939_session_get_simple(struct j1939_priv *priv, + struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + + lockdep_assert_held(&priv->active_session_list_lock); + + list_for_each_entry(session, &priv->active_session_list, + active_session_list_entry) { + j1939_session_get(session); + if (session->skcb.addr.type == J1939_SIMPLE && + session->tskey == skcb->tskey && session->sk == skb->sk) + return session; + j1939_session_put(session); + } + + return NULL; +} + +static struct +j1939_session *j1939_session_get_by_addr(struct j1939_priv *priv, + struct j1939_addr *addr, + bool reverse, bool transmitter) +{ + struct j1939_session *session; + + j1939_session_list_lock(priv); + session = j1939_session_get_by_addr_locked(priv, + &priv->active_session_list, + addr, reverse, transmitter); + j1939_session_list_unlock(priv); + + return session; +} + +static void j1939_skbcb_swap(struct j1939_sk_buff_cb *skcb) +{ + u8 tmp = 0; + + swap(skcb->addr.dst_name, skcb->addr.src_name); + swap(skcb->addr.da, skcb->addr.sa); + + /* swap SRC and DST flags, leave other untouched */ + if (skcb->flags & J1939_ECU_LOCAL_SRC) + tmp |= J1939_ECU_LOCAL_DST; + if (skcb->flags & J1939_ECU_LOCAL_DST) + tmp |= J1939_ECU_LOCAL_SRC; + skcb->flags &= ~(J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST); + skcb->flags |= tmp; +} + +static struct +sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv, + const struct j1939_sk_buff_cb *re_skcb, + bool ctl, + bool swap_src_dst) +{ + struct sk_buff *skb; + struct j1939_sk_buff_cb *skcb; + + skb = alloc_skb(sizeof(struct can_frame) + sizeof(struct can_skb_priv), + GFP_ATOMIC); + if (unlikely(!skb)) + return ERR_PTR(-ENOMEM); + + skb->dev = priv->ndev; + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = priv->ndev->ifindex; + /* reserve CAN header */ + skb_reserve(skb, offsetof(struct can_frame, data)); + + memcpy(skb->cb, re_skcb, sizeof(skb->cb)); + skcb = j1939_skb_to_cb(skb); + if (swap_src_dst) + j1939_skbcb_swap(skcb); + + if (ctl) { + if (skcb->addr.type == J1939_ETP) + skcb->addr.pgn = J1939_ETP_PGN_CTL; + else + skcb->addr.pgn = J1939_TP_PGN_CTL; + } else { + if (skcb->addr.type == 
J1939_ETP) + skcb->addr.pgn = J1939_ETP_PGN_DAT; + else + skcb->addr.pgn = J1939_TP_PGN_DAT; + } + + return skb; +} + +/* TP transmit packet functions */ +static int j1939_tp_tx_dat(struct j1939_session *session, + const u8 *dat, int len) +{ + struct j1939_priv *priv = session->priv; + struct sk_buff *skb; + + skb = j1939_tp_tx_dat_new(priv, &session->skcb, + false, false); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + skb_put_data(skb, dat, len); + if (j1939_tp_padding && len < 8) + memset(skb_put(skb, 8 - len), 0xff, 8 - len); + + return j1939_send_one(priv, skb); +} + +static int j1939_xtp_do_tx_ctl(struct j1939_priv *priv, + const struct j1939_sk_buff_cb *re_skcb, + bool swap_src_dst, pgn_t pgn, const u8 *dat) +{ + struct sk_buff *skb; + u8 *skdat; + + if (!j1939_tp_im_involved(re_skcb, swap_src_dst)) + return 0; + + skb = j1939_tp_tx_dat_new(priv, re_skcb, true, swap_src_dst); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + skdat = skb_put(skb, 8); + memcpy(skdat, dat, 5); + skdat[5] = (pgn >> 0); + skdat[6] = (pgn >> 8); + skdat[7] = (pgn >> 16); + + return j1939_send_one(priv, skb); +} + +static inline int j1939_tp_tx_ctl(struct j1939_session *session, + bool swap_src_dst, const u8 *dat) +{ + struct j1939_priv *priv = session->priv; + + return j1939_xtp_do_tx_ctl(priv, &session->skcb, + swap_src_dst, + session->skcb.addr.pgn, dat); +} + +static int j1939_xtp_tx_abort(struct j1939_priv *priv, + const struct j1939_sk_buff_cb *re_skcb, + bool swap_src_dst, + enum j1939_xtp_abort err, + pgn_t pgn) +{ + u8 dat[5]; + + if (!j1939_tp_im_involved(re_skcb, swap_src_dst)) + return 0; + + memset(dat, 0xff, sizeof(dat)); + dat[0] = J1939_TP_CMD_ABORT; + dat[1] = err; + return j1939_xtp_do_tx_ctl(priv, re_skcb, swap_src_dst, pgn, dat); +} + +void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec) +{ + j1939_session_get(session); + hrtimer_start(&session->txtimer, ms_to_ktime(msec), + HRTIMER_MODE_REL_SOFT); +} + +static inline void j1939_tp_set_rxtimeout(struct j1939_session *session, + int msec) +{ + j1939_session_rxtimer_cancel(session); + j1939_session_get(session); + hrtimer_start(&session->rxtimer, ms_to_ktime(msec), + HRTIMER_MODE_REL_SOFT); +} + +static int j1939_session_tx_rts(struct j1939_session *session) +{ + u8 dat[8]; + int ret; + + memset(dat, 0xff, sizeof(dat)); + + dat[1] = (session->total_message_size >> 0); + dat[2] = (session->total_message_size >> 8); + dat[3] = session->pkt.total; + + if (session->skcb.addr.type == J1939_ETP) { + dat[0] = J1939_ETP_CMD_RTS; + dat[1] = (session->total_message_size >> 0); + dat[2] = (session->total_message_size >> 8); + dat[3] = (session->total_message_size >> 16); + dat[4] = (session->total_message_size >> 24); + } else if (j1939_cb_is_broadcast(&session->skcb)) { + dat[0] = J1939_TP_CMD_BAM; + /* fake cts for broadcast */ + session->pkt.tx = 0; + } else { + dat[0] = J1939_TP_CMD_RTS; + dat[4] = dat[3]; + } + + if (dat[0] == session->last_txcmd) + /* done already */ + return 0; + + ret = j1939_tp_tx_ctl(session, false, dat); + if (ret < 0) + return ret; + + session->last_txcmd = dat[0]; + if (dat[0] == J1939_TP_CMD_BAM) + j1939_tp_schedule_txtimer(session, 50); + + j1939_tp_set_rxtimeout(session, 1250); + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + return 0; +} + +static int j1939_session_tx_dpo(struct j1939_session *session) +{ + unsigned int pkt; + u8 dat[8]; + int ret; + + memset(dat, 0xff, sizeof(dat)); + + dat[0] = J1939_ETP_CMD_DPO; + session->pkt.dpo = session->pkt.tx_acked; + pkt = 
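+ /* ETP DPO: announce the packet offset of the window just CTSed, + * so the receiver can map the 1-based EDPO sequence numbers to + * absolute packet indexes */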
session->pkt.dpo; + dat[1] = session->pkt.last - session->pkt.tx_acked; + dat[2] = (pkt >> 0); + dat[3] = (pkt >> 8); + dat[4] = (pkt >> 16); + + ret = j1939_tp_tx_ctl(session, false, dat); + if (ret < 0) + return ret; + + session->last_txcmd = dat[0]; + j1939_tp_set_rxtimeout(session, 1250); + session->pkt.tx = session->pkt.tx_acked; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + return 0; +} + +static int j1939_session_tx_dat(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + struct j1939_sk_buff_cb *skcb; + int offset, pkt_done, pkt_end; + unsigned int len, pdelay; + struct sk_buff *se_skb; + const u8 *tpdat; + int ret = 0; + u8 dat[8]; + + se_skb = j1939_session_skb_find(session); + if (!se_skb) + return -ENOBUFS; + + skcb = j1939_skb_to_cb(se_skb); + tpdat = se_skb->data; + ret = 0; + pkt_done = 0; + if (session->skcb.addr.type != J1939_ETP && + j1939_cb_is_broadcast(&session->skcb)) + pkt_end = session->pkt.total; + else + pkt_end = session->pkt.last; + + while (session->pkt.tx < pkt_end) { + dat[0] = session->pkt.tx - session->pkt.dpo + 1; + offset = (session->pkt.tx * 7) - skcb->offset; + len = se_skb->len - offset; + if (len > 7) + len = 7; + + memcpy(&dat[1], &tpdat[offset], len); + ret = j1939_tp_tx_dat(session, dat, len + 1); + if (ret < 0) { + /* ENOBUFS == CAN interface TX queue is full */ + if (ret != -ENOBUFS) + netdev_alert(priv->ndev, + "%s: 0x%p: queue data error: %i\n", + __func__, session, ret); + break; + } + + session->last_txcmd = 0xff; + pkt_done++; + session->pkt.tx++; + pdelay = j1939_cb_is_broadcast(&session->skcb) ? 50 : + j1939_tp_packet_delay; + + if (session->pkt.tx < session->pkt.total && pdelay) { + j1939_tp_schedule_txtimer(session, pdelay); + break; + } + } + + if (pkt_done) + j1939_tp_set_rxtimeout(session, 250); + + return ret; +} + +static int j1939_xtp_txnext_transmitter(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + int ret = 0; + + if (!j1939_tp_im_transmitter(&session->skcb)) { + netdev_alert(priv->ndev, "%s: 0x%p: called although we are not the transmitter!\n", + __func__, session); + return -EINVAL; + } + + switch (session->last_cmd) { + case 0: + ret = j1939_session_tx_rts(session); + break; + + case J1939_ETP_CMD_CTS: + if (session->last_txcmd != J1939_ETP_CMD_DPO) { + ret = j1939_session_tx_dpo(session); + if (ret) + return ret; + } + + /* fall through */ + case J1939_TP_CMD_CTS: + case 0xff: /* did some data */ + case J1939_ETP_CMD_DPO: + case J1939_TP_CMD_BAM: + ret = j1939_session_tx_dat(session); + + break; + default: + netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n", + __func__, session, session->last_cmd); + } + + return ret; +} + +static int j1939_session_tx_cts(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + unsigned int pkt, len; + int ret; + u8 dat[8]; + + if (!j1939_sk_recv_match(priv, &session->skcb)) + return -ENOENT; + + len = session->pkt.total - session->pkt.rx; + len = min3(len, session->pkt.block, j1939_tp_block ?: 255); + memset(dat, 0xff, sizeof(dat)); + + if (session->skcb.addr.type == J1939_ETP) { + pkt = session->pkt.rx + 1; + dat[0] = J1939_ETP_CMD_CTS; + dat[1] = len; + dat[2] = (pkt >> 0); + dat[3] = (pkt >> 8); + dat[4] = (pkt >> 16); + } else { + dat[0] = J1939_TP_CMD_CTS; + dat[1] = len; + dat[2] = session->pkt.rx + 1; + } + + if (dat[0] == session->last_txcmd) + /* done already */ + return 0; + + ret = j1939_tp_tx_ctl(session, true, dat); + if (ret < 0) + return ret; + + if (len) + /* only mark cts done when len is set */ + session->last_txcmd = dat[0]; + j1939_tp_set_rxtimeout(session, 1250); + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + return 0; +} + +static int j1939_session_tx_eoma(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + u8 dat[8]; + int ret; + + if (!j1939_sk_recv_match(priv, &session->skcb)) + return -ENOENT; + + memset(dat, 0xff, sizeof(dat)); + + if (session->skcb.addr.type == J1939_ETP) { + dat[0] = J1939_ETP_CMD_EOMA; + dat[1] = session->total_message_size >> 0; + dat[2] = session->total_message_size >> 8; + dat[3] = session->total_message_size >> 16; + dat[4] = session->total_message_size >> 24; + } else { + dat[0] = J1939_TP_CMD_EOMA; + dat[1] = session->total_message_size; + dat[2] = session->total_message_size >> 8; + dat[3] = session->pkt.total; + } + + if (dat[0] == session->last_txcmd) + /* done already */ + return 0; + + ret = j1939_tp_tx_ctl(session, true, dat); + if (ret < 0) + return ret; + + session->last_txcmd = dat[0]; + + /* wait for the EOMA packet to come in */ + j1939_tp_set_rxtimeout(session, 1250); + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + return 0; +} + +static int j1939_xtp_txnext_receiver(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + int ret = 0; + + if (!j1939_tp_im_receiver(&session->skcb)) { + netdev_alert(priv->ndev, "%s: 0x%p: called although we are not the receiver!\n", + __func__, session); + return -EINVAL; + } + + switch (session->last_cmd) { + case J1939_TP_CMD_RTS: + case J1939_ETP_CMD_RTS: + ret = j1939_session_tx_cts(session); + break; + + case J1939_ETP_CMD_CTS: + case J1939_TP_CMD_CTS: + case 0xff: /* did some data */ + case J1939_ETP_CMD_DPO: + if ((session->skcb.addr.type == J1939_TP && + j1939_cb_is_broadcast(&session->skcb))) + break; + + if (session->pkt.rx >= session->pkt.total) { + ret = j1939_session_tx_eoma(session); + } else if (session->pkt.rx >= session->pkt.last) { + session->last_txcmd = 0; + ret = j1939_session_tx_cts(session); + } + break; + default: + netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n", + __func__, session, session->last_cmd); + } + + return ret; +} + +static int j1939_simple_txnext(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + struct sk_buff *se_skb = j1939_session_skb_find(session); + struct sk_buff *skb; + int ret; + + if (!se_skb) + return 0; + + skb = skb_clone(se_skb, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + can_skb_set_owner(skb, se_skb->sk); + + j1939_tp_set_rxtimeout(session, J1939_SIMPLE_ECHO_TIMEOUT_MS); + + ret = j1939_send_one(priv, skb); + if (ret) + return ret; + + j1939_sk_errqueue(session, J1939_ERRQUEUE_SCHED); + j1939_sk_queue_activate_next(session); + + return 0; +} + +static bool j1939_session_deactivate_locked(struct j1939_session *session) +{ + bool active = false; + + lockdep_assert_held(&session->priv->active_session_list_lock); + + if (session->state >= J1939_SESSION_ACTIVE && + session->state < J1939_SESSION_ACTIVE_MAX) { + active = true; + + list_del_init(&session->active_session_list_entry); + session->state = J1939_SESSION_DONE; + j1939_session_put(session); + } + + return active; +} + +static bool j1939_session_deactivate(struct j1939_session *session) +{ + bool active; + + j1939_session_list_lock(session->priv); + active = j1939_session_deactivate_locked(session); + j1939_session_list_unlock(session->priv); + + return active; +} + +static void +j1939_session_deactivate_activate_next(struct j1939_session *session) +{ + if (j1939_session_deactivate(session)) + j1939_sk_queue_activate_next(session); +} + +static void j1939_session_cancel(struct j1939_session *session, + enum j1939_xtp_abort err) +{ + struct j1939_priv *priv = session->priv; + + WARN_ON_ONCE(!err); + + session->err = j1939_xtp_abort_to_errno(priv, err); + /* do not send aborts on incoming broadcasts */ + if (!j1939_cb_is_broadcast(&session->skcb)) { + session->state = J1939_SESSION_WAITING_ABORT; + j1939_xtp_tx_abort(priv, &session->skcb, + !session->transmission, + err, session->skcb.addr.pgn); + } + + if (session->sk) + j1939_sk_send_loop_abort(session->sk, session->err); +} + +static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) +{ + struct j1939_session *session = + container_of(hrtimer, struct j1939_session, txtimer); + struct j1939_priv *priv = session->priv; + int ret = 0; + + if (session->skcb.addr.type == J1939_SIMPLE) { + ret = j1939_simple_txnext(session); + } else { + if (session->transmission) + ret = j1939_xtp_txnext_transmitter(session); + else + ret = j1939_xtp_txnext_receiver(session); + } + + switch (ret) { + case -ENOBUFS: + /* Retry limit is currently arbitrarily chosen */ + if (session->tx_retry < J1939_XTP_TX_RETRY_LIMIT) { + session->tx_retry++; + j1939_tp_schedule_txtimer(session, + 10 + prandom_u32_max(16)); + } else { + netdev_alert(priv->ndev, "%s: 0x%p: tx retry count reached\n", + __func__, session); + session->err = -ENETUNREACH; + j1939_session_rxtimer_cancel(session); + j1939_session_deactivate_activate_next(session); + } + break; + case -ENETDOWN: + /* In this case we should get a netdev_event(), all active + * sessions will be cleared by + * j1939_cancel_all_active_sessions(). So handle this as an + * error, but let j1939_cancel_all_active_sessions() do the + * cleanup including propagation of the error to user space. + */ + break; + case 0: + session->tx_retry = 0; + break; + default: + netdev_alert(priv->ndev, "%s: 0x%p: tx aborted with unknown reason: %i\n", + __func__, session, ret); + if (session->skcb.addr.type != J1939_SIMPLE) { + j1939_tp_set_rxtimeout(session, + J1939_XTP_ABORT_TIMEOUT_MS); + j1939_session_cancel(session, J1939_XTP_ABORT_OTHER); + } else { + session->err = ret; + j1939_session_rxtimer_cancel(session); + j1939_session_deactivate_activate_next(session); + } + } + + j1939_session_put(session); + + return HRTIMER_NORESTART; +} + +static void j1939_session_completed(struct j1939_session *session) +{ + struct sk_buff *skb; + + if (!session->transmission) { + skb = j1939_session_skb_find(session); + /* distribute among j1939 receivers */ + j1939_sk_recv(session->priv, skb); + } + + j1939_session_deactivate_activate_next(session); +} + +static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer) +{ + struct j1939_session *session = container_of(hrtimer, + struct j1939_session, + rxtimer); + struct j1939_priv *priv = session->priv; + + if (session->state == J1939_SESSION_WAITING_ABORT) { + netdev_alert(priv->ndev, "%s: 0x%p: abort rx timeout. Force session deactivation\n", + __func__, session); + + j1939_session_deactivate_activate_next(session); + + } else if (session->skcb.addr.type == J1939_SIMPLE) { + netdev_alert(priv->ndev, "%s: 0x%p: Timeout. Failed to send simple message.\n", + __func__, session); + + /* The message is probably stuck in the CAN controller and can + * be sent as soon as the CAN bus is in a working state again.
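+ * The session is deactivated with err set to -ETIME below, so + * user space can tell this timeout apart from a protocol abort.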
+ */ + session->err = -ETIME; + j1939_session_deactivate(session); + } else { + netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", + __func__, session); + + j1939_session_list_lock(session->priv); + if (session->state >= J1939_SESSION_ACTIVE && + session->state < J1939_SESSION_ACTIVE_MAX) { + j1939_session_get(session); + hrtimer_start(&session->rxtimer, + ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS), + HRTIMER_MODE_REL_SOFT); + j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT); + } + j1939_session_list_unlock(session->priv); + } + + j1939_session_put(session); + + return HRTIMER_NORESTART; +} + +static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session, + const struct sk_buff *skb) +{ + const struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + pgn_t pgn = j1939_xtp_ctl_to_pgn(skb->data); + struct j1939_priv *priv = session->priv; + enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT; + u8 cmd = skb->data[0]; + + if (session->skcb.addr.pgn == pgn) + return false; + + switch (cmd) { + case J1939_TP_CMD_BAM: + abort = J1939_XTP_NO_ABORT; + break; + + case J1939_ETP_CMD_RTS: + case J1939_TP_CMD_RTS: /* fall through */ + abort = J1939_XTP_ABORT_BUSY; + break; + + case J1939_ETP_CMD_CTS: + case J1939_TP_CMD_CTS: /* fall through */ + abort = J1939_XTP_ABORT_ECTS_UNXPECTED_PGN; + break; + + case J1939_ETP_CMD_DPO: + abort = J1939_XTP_ABORT_BAD_EDPO_PGN; + break; + + case J1939_ETP_CMD_EOMA: + case J1939_TP_CMD_EOMA: /* fall through */ + abort = J1939_XTP_ABORT_OTHER; + break; + + case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ + abort = J1939_XTP_NO_ABORT; + break; + + default: + WARN_ON_ONCE(1); + break; + } + + netdev_warn(priv->ndev, "%s: 0x%p: CMD 0x%02x with PGN 0x%05x for running session with different PGN 0x%05x.\n", + __func__, session, cmd, pgn, session->skcb.addr.pgn); + if (abort != J1939_XTP_NO_ABORT) + j1939_xtp_tx_abort(priv, skcb, true, abort, pgn); + + return true; +} + +static void j1939_xtp_rx_abort_one(struct j1939_priv *priv, struct sk_buff *skb, + bool reverse, bool transmitter) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + u8 abort = skb->data[1]; + + session = j1939_session_get_by_addr(priv, &skcb->addr, reverse, + transmitter); + if (!session) + return; + + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) + goto abort_put; + + netdev_info(priv->ndev, "%s: 0x%p: 0x%05x: (%u) %s\n", __func__, + session, j1939_xtp_ctl_to_pgn(skb->data), abort, + j1939_xtp_abort_to_str(abort)); + + j1939_session_timers_cancel(session); + session->err = j1939_xtp_abort_to_errno(priv, abort); + if (session->sk) + j1939_sk_send_loop_abort(session->sk, session->err); + j1939_session_deactivate_activate_next(session); + +abort_put: + j1939_session_put(session); +} + +/* abort packets may come in 2 directions */ +static void +j1939_xtp_rx_abort(struct j1939_priv *priv, struct sk_buff *skb, + bool transmitter) +{ + j1939_xtp_rx_abort_one(priv, skb, false, transmitter); + j1939_xtp_rx_abort_one(priv, skb, true, transmitter); +} + +static void +j1939_xtp_rx_eoma_one(struct j1939_session *session, struct sk_buff *skb) +{ + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) + return; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + session->pkt.tx_acked = session->pkt.total; + j1939_session_timers_cancel(session); + /* transmitted without problems */ + j1939_session_completed(session); +} + +static void +j1939_xtp_rx_eoma(struct j1939_priv *priv, struct sk_buff *skb, + bool transmitter) +{ + struct j1939_sk_buff_cb 
*skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + + session = j1939_session_get_by_addr(priv, &skcb->addr, true, + transmitter); + if (!session) + return; + + j1939_xtp_rx_eoma_one(session, skb); + j1939_session_put(session); +} + +static void +j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb) +{ + enum j1939_xtp_abort err = J1939_XTP_ABORT_FAULT; + unsigned int pkt; + const u8 *dat; + + dat = skb->data; + + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) + return; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + if (session->last_cmd == dat[0]) { + err = J1939_XTP_ABORT_DUP_SEQ; + goto out_session_cancel; + } + + if (session->skcb.addr.type == J1939_ETP) + pkt = j1939_etp_ctl_to_packet(dat); + else + pkt = dat[2]; + + if (!pkt) + goto out_session_cancel; + else if (dat[1] > session->pkt.block /* 0xff for etp */) + goto out_session_cancel; + + /* set packet counters only when not CTS(0) */ + session->pkt.tx_acked = pkt - 1; + j1939_session_skb_drop_old(session); + session->pkt.last = session->pkt.tx_acked + dat[1]; + if (session->pkt.last > session->pkt.total) + /* safety measure */ + session->pkt.last = session->pkt.total; + /* TODO: do not set tx here, do it in txtimer */ + session->pkt.tx = session->pkt.tx_acked; + + session->last_cmd = dat[0]; + if (dat[1]) { + j1939_tp_set_rxtimeout(session, 1250); + if (session->transmission) { + if (session->pkt.tx_acked) + j1939_sk_errqueue(session, + J1939_ERRQUEUE_SCHED); + j1939_session_txtimer_cancel(session); + j1939_tp_schedule_txtimer(session, 0); + } + } else { + /* CTS(0) */ + j1939_tp_set_rxtimeout(session, 550); + } + return; + + out_session_cancel: + j1939_session_timers_cancel(session); + j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); + j1939_session_cancel(session, err); +} + +static void +j1939_xtp_rx_cts(struct j1939_priv *priv, struct sk_buff *skb, bool transmitter) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + + session = j1939_session_get_by_addr(priv, &skcb->addr, true, + transmitter); + if (!session) + return; + j1939_xtp_rx_cts_one(session, skb); + j1939_session_put(session); +} + +static struct j1939_session *j1939_session_new(struct j1939_priv *priv, + struct sk_buff *skb, size_t size) +{ + struct j1939_session *session; + struct j1939_sk_buff_cb *skcb; + + session = kzalloc(sizeof(*session), gfp_any()); + if (!session) + return NULL; + + INIT_LIST_HEAD(&session->active_session_list_entry); + INIT_LIST_HEAD(&session->sk_session_queue_entry); + kref_init(&session->kref); + + j1939_priv_get(priv); + session->priv = priv; + session->total_message_size = size; + session->state = J1939_SESSION_NEW; + + skb_queue_head_init(&session->skb_queue); + skb_queue_tail(&session->skb_queue, skb); + + skcb = j1939_skb_to_cb(skb); + memcpy(&session->skcb, skcb, sizeof(session->skcb)); + + hrtimer_init(&session->txtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); + session->txtimer.function = j1939_tp_txtimer; + hrtimer_init(&session->rxtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); + session->rxtimer.function = j1939_tp_rxtimer; + + netdev_dbg(priv->ndev, "%s: 0x%p: sa: %02x, da: %02x\n", + __func__, session, skcb->addr.sa, skcb->addr.da); + + return session; +} + +static struct +j1939_session *j1939_session_fresh_new(struct j1939_priv *priv, + int size, + const struct j1939_sk_buff_cb *rel_skcb) +{ + struct sk_buff *skb; + struct j1939_sk_buff_cb *skcb; + struct j1939_session *session; + + skb = alloc_skb(size + 
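+ /* payload plus room for the per-skb CAN metadata */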
sizeof(struct can_skb_priv), GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + skb->dev = priv->ndev; + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = priv->ndev->ifindex; + skcb = j1939_skb_to_cb(skb); + memcpy(skcb, rel_skcb, sizeof(*skcb)); + + session = j1939_session_new(priv, skb, skb->len); + if (!session) { + kfree_skb(skb); + return NULL; + } + + /* alloc data area */ + skb_put(skb, size); + /* skb is recounted in j1939_session_new() */ + return session; +} + +int j1939_session_activate(struct j1939_session *session) +{ + struct j1939_priv *priv = session->priv; + struct j1939_session *active = NULL; + int ret = 0; + + j1939_session_list_lock(priv); + if (session->skcb.addr.type != J1939_SIMPLE) + active = j1939_session_get_by_addr_locked(priv, + &priv->active_session_list, + &session->skcb.addr, false, + session->transmission); + if (active) { + j1939_session_put(active); + ret = -EAGAIN; + } else { + WARN_ON_ONCE(session->state != J1939_SESSION_NEW); + list_add_tail(&session->active_session_list_entry, + &priv->active_session_list); + j1939_session_get(session); + session->state = J1939_SESSION_ACTIVE; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", + __func__, session); + } + j1939_session_list_unlock(priv); + + return ret; +} + +static struct +j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, + struct sk_buff *skb) +{ + enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT; + struct j1939_sk_buff_cb skcb = *j1939_skb_to_cb(skb); + struct j1939_session *session; + const u8 *dat; + pgn_t pgn; + int len; + + netdev_dbg(priv->ndev, "%s\n", __func__); + + dat = skb->data; + pgn = j1939_xtp_ctl_to_pgn(dat); + skcb.addr.pgn = pgn; + + if (!j1939_sk_recv_match(priv, &skcb)) + return NULL; + + if (skcb.addr.type == J1939_ETP) { + len = j1939_etp_ctl_to_size(dat); + if (len > J1939_MAX_ETP_PACKET_SIZE) + abort = J1939_XTP_ABORT_FAULT; + else if (len > priv->tp_max_packet_size) + abort = J1939_XTP_ABORT_RESOURCE; + else if (len <= J1939_MAX_TP_PACKET_SIZE) + abort = J1939_XTP_ABORT_FAULT; + } else { + len = j1939_tp_ctl_to_size(dat); + if (len > J1939_MAX_TP_PACKET_SIZE) + abort = J1939_XTP_ABORT_FAULT; + else if (len > priv->tp_max_packet_size) + abort = J1939_XTP_ABORT_RESOURCE; + } + + if (abort != J1939_XTP_NO_ABORT) { + j1939_xtp_tx_abort(priv, &skcb, true, abort, pgn); + return NULL; + } + + session = j1939_session_fresh_new(priv, len, &skcb); + if (!session) { + j1939_xtp_tx_abort(priv, &skcb, true, + J1939_XTP_ABORT_RESOURCE, pgn); + return NULL; + } + + /* initialize the control buffer: plain copy */ + session->pkt.total = (len + 6) / 7; + session->pkt.block = 0xff; + if (skcb.addr.type != J1939_ETP) { + if (dat[3] != session->pkt.total) + netdev_alert(priv->ndev, "%s: 0x%p: strange total, %u != %u\n", + __func__, session, session->pkt.total, + dat[3]); + session->pkt.total = dat[3]; + session->pkt.block = min(dat[3], dat[4]); + } + + session->pkt.rx = 0; + session->pkt.tx = 0; + + WARN_ON_ONCE(j1939_session_activate(session)); + + return session; +} + +static int j1939_xtp_rx_rts_session_active(struct j1939_session *session, + struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_priv *priv = session->priv; + + if (!session->transmission) { + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) + return -EBUSY; + + /* RTS on active session */ + j1939_session_timers_cancel(session); + j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); + j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); + } + + if 
(session->last_cmd != 0) { + /* we received a second rts on the same connection */ + netdev_alert(priv->ndev, "%s: 0x%p: connection exists (%02x %02x). last cmd: %x\n", + __func__, session, skcb->addr.sa, skcb->addr.da, + session->last_cmd); + + j1939_session_timers_cancel(session); + j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); + j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); + + return -EBUSY; + } + + if (session->skcb.addr.sa != skcb->addr.sa || + session->skcb.addr.da != skcb->addr.da) + netdev_warn(priv->ndev, "%s: 0x%p: session->skcb.addr.sa=0x%02x skcb->addr.sa=0x%02x session->skcb.addr.da=0x%02x skcb->addr.da=0x%02x\n", + __func__, session, + session->skcb.addr.sa, skcb->addr.sa, + session->skcb.addr.da, skcb->addr.da); + /* make sure 'sa' & 'da' are correct ! + * They may be 'not filled in yet' for sending + * skb's, since they did not pass the Address Claim ever. + */ + session->skcb.addr.sa = skcb->addr.sa; + session->skcb.addr.da = skcb->addr.da; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + return 0; +} + +static void j1939_xtp_rx_rts(struct j1939_priv *priv, struct sk_buff *skb, + bool transmitter) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + u8 cmd = skb->data[0]; + + session = j1939_session_get_by_addr(priv, &skcb->addr, false, + transmitter); + + if (!session) { + if (transmitter) { + /* If we're the transmitter and this function is called, + * we received our own RTS. A session has already been + * created. + * + * For some reasons however it might have been destroyed + * already. So don't create a new one here (using + * "j1939_xtp_rx_rts_session_new()") as this will be a + * receiver session. + * + * The reasons the session is already destroyed might + * be: + * - user space closed socket was and the session was + * aborted + * - session was aborted due to external abort message + */ + return; + } + session = j1939_xtp_rx_rts_session_new(priv, skb); + if (!session) + return; + } else { + if (j1939_xtp_rx_rts_session_active(session, skb)) { + j1939_session_put(session); + return; + } + } + session->last_cmd = cmd; + + j1939_tp_set_rxtimeout(session, 1250); + + if (cmd != J1939_TP_CMD_BAM && !session->transmission) { + j1939_session_txtimer_cancel(session); + j1939_tp_schedule_txtimer(session, 0); + } + + j1939_session_put(session); +} + +static void j1939_xtp_rx_dpo_one(struct j1939_session *session, + struct sk_buff *skb) +{ + const u8 *dat = skb->data; + + if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) + return; + + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); + + /* transmitted without problems */ + session->pkt.dpo = j1939_etp_ctl_to_packet(skb->data); + session->last_cmd = dat[0]; + j1939_tp_set_rxtimeout(session, 750); +} + +static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb, + bool transmitter) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + + session = j1939_session_get_by_addr(priv, &skcb->addr, false, + transmitter); + if (!session) { + netdev_info(priv->ndev, + "%s: no connection found\n", __func__); + return; + } + + j1939_xtp_rx_dpo_one(session, skb); + j1939_session_put(session); +} + +static void j1939_xtp_rx_dat_one(struct j1939_session *session, + struct sk_buff *skb) +{ + struct j1939_priv *priv = session->priv; + struct j1939_sk_buff_cb *skcb; + struct sk_buff *se_skb; + const u8 *dat; + u8 *tpdat; + int offset; + int nbytes; + bool final = false; + bool do_cts_eoma = 
false; + int packet; + + skcb = j1939_skb_to_cb(skb); + dat = skb->data; + if (skb->len <= 1) + /* makes no sense */ + goto out_session_cancel; + + switch (session->last_cmd) { + case 0xff: + break; + case J1939_ETP_CMD_DPO: + if (skcb->addr.type == J1939_ETP) + break; + /* fall through */ + case J1939_TP_CMD_BAM: /* fall through */ + case J1939_TP_CMD_CTS: /* fall through */ + if (skcb->addr.type != J1939_ETP) + break; + /* fall through */ + default: + netdev_info(priv->ndev, "%s: 0x%p: last %02x\n", __func__, + session, session->last_cmd); + goto out_session_cancel; + } + + packet = (dat[0] - 1 + session->pkt.dpo); + if (packet > session->pkt.total || + (session->pkt.rx + 1) > session->pkt.total) { + netdev_info(priv->ndev, "%s: 0x%p: should have been completed\n", + __func__, session); + goto out_session_cancel; + } + se_skb = j1939_session_skb_find(session); + if (!se_skb) { + netdev_warn(priv->ndev, "%s: 0x%p: no skb found\n", __func__, + session); + goto out_session_cancel; + } + + skcb = j1939_skb_to_cb(se_skb); + offset = packet * 7 - skcb->offset; + nbytes = se_skb->len - offset; + if (nbytes > 7) + nbytes = 7; + if (nbytes <= 0 || (nbytes + 1) > skb->len) { + netdev_info(priv->ndev, "%s: 0x%p: nbytes %i, len %i\n", + __func__, session, nbytes, skb->len); + goto out_session_cancel; + } + + tpdat = se_skb->data; + memcpy(&tpdat[offset], &dat[1], nbytes); + if (packet == session->pkt.rx) + session->pkt.rx++; + + if (skcb->addr.type != J1939_ETP && + j1939_cb_is_broadcast(&session->skcb)) { + if (session->pkt.rx >= session->pkt.total) + final = true; + } else { + /* never final, an EOMA must follow */ + if (session->pkt.rx >= session->pkt.last) + do_cts_eoma = true; + } + + if (final) { + j1939_session_completed(session); + } else if (do_cts_eoma) { + j1939_tp_set_rxtimeout(session, 1250); + if (!session->transmission) + j1939_tp_schedule_txtimer(session, 0); + } else { + j1939_tp_set_rxtimeout(session, 250); + } + session->last_cmd = 0xff; + j1939_session_put(session); + + return; + + out_session_cancel: + j1939_session_timers_cancel(session); + j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); + j1939_session_cancel(session, J1939_XTP_ABORT_FAULT); + j1939_session_put(session); +} + +static void j1939_xtp_rx_dat(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb; + struct j1939_session *session; + + skcb = j1939_skb_to_cb(skb); + + if (j1939_tp_im_transmitter(skcb)) { + session = j1939_session_get_by_addr(priv, &skcb->addr, false, + true); + if (!session) + netdev_info(priv->ndev, "%s: no tx connection found\n", + __func__); + else + j1939_xtp_rx_dat_one(session, skb); + } + + if (j1939_tp_im_receiver(skcb)) { + session = j1939_session_get_by_addr(priv, &skcb->addr, false, + false); + if (!session) + netdev_info(priv->ndev, "%s: no rx connection found\n", + __func__); + else + j1939_xtp_rx_dat_one(session, skb); + } +} + +/* j1939 main intf */ +struct j1939_session *j1939_tp_send(struct j1939_priv *priv, + struct sk_buff *skb, size_t size) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + struct j1939_session *session; + int ret; + + if (skcb->addr.pgn == J1939_TP_PGN_DAT || + skcb->addr.pgn == J1939_TP_PGN_CTL || + skcb->addr.pgn == J1939_ETP_PGN_DAT || + skcb->addr.pgn == J1939_ETP_PGN_CTL) + /* avoid conflict */ + return ERR_PTR(-EDOM); + + if (size > priv->tp_max_packet_size) + return ERR_PTR(-EMSGSIZE); + + if (size <= 8) + skcb->addr.type = J1939_SIMPLE; + else if (size > J1939_MAX_TP_PACKET_SIZE) + skcb->addr.type = 
J1939_ETP; + else + skcb->addr.type = J1939_TP; + + if (skcb->addr.type == J1939_ETP && + j1939_cb_is_broadcast(skcb)) + return ERR_PTR(-EDESTADDRREQ); + + /* fill in addresses from names */ + ret = j1939_ac_fixup(priv, skb); + if (unlikely(ret)) + return ERR_PTR(ret); + + /* fix DST flags, it may be used there soon */ + if (j1939_address_is_unicast(skcb->addr.da) && + priv->ents[skcb->addr.da].nusers) + skcb->flags |= J1939_ECU_LOCAL_DST; + + /* src is always local, I'm sending ... */ + skcb->flags |= J1939_ECU_LOCAL_SRC; + + /* prepare new session */ + session = j1939_session_new(priv, skb, size); + if (!session) + return ERR_PTR(-ENOMEM); + + /* skb is recounted in j1939_session_new() */ + session->sk = skb->sk; + session->transmission = true; + session->pkt.total = (size + 6) / 7; + session->pkt.block = skcb->addr.type == J1939_ETP ? 255 : + min(j1939_tp_block ?: 255, session->pkt.total); + + if (j1939_cb_is_broadcast(&session->skcb)) + /* set the end-packet for broadcast */ + session->pkt.last = session->pkt.total; + + skcb->tskey = session->sk->sk_tskey++; + session->tskey = skcb->tskey; + + return session; +} + +static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + int extd = J1939_TP; + u8 cmd = skb->data[0]; + + switch (cmd) { + case J1939_ETP_CMD_RTS: + extd = J1939_ETP; + /* fall through */ + case J1939_TP_CMD_BAM: /* fall through */ + case J1939_TP_CMD_RTS: /* fall through */ + if (skcb->addr.type != extd) + return; + + if (cmd == J1939_TP_CMD_RTS && j1939_cb_is_broadcast(skcb)) { + netdev_alert(priv->ndev, "%s: rts without destination (%02x)\n", + __func__, skcb->addr.sa); + return; + } + + if (j1939_tp_im_transmitter(skcb)) + j1939_xtp_rx_rts(priv, skb, true); + + if (j1939_tp_im_receiver(skcb)) + j1939_xtp_rx_rts(priv, skb, false); + + break; + + case J1939_ETP_CMD_CTS: + extd = J1939_ETP; + /* fall through */ + case J1939_TP_CMD_CTS: + if (skcb->addr.type != extd) + return; + + if (j1939_tp_im_transmitter(skcb)) + j1939_xtp_rx_cts(priv, skb, false); + + if (j1939_tp_im_receiver(skcb)) + j1939_xtp_rx_cts(priv, skb, true); + + break; + + case J1939_ETP_CMD_DPO: + if (skcb->addr.type != J1939_ETP) + return; + + if (j1939_tp_im_transmitter(skcb)) + j1939_xtp_rx_dpo(priv, skb, true); + + if (j1939_tp_im_receiver(skcb)) + j1939_xtp_rx_dpo(priv, skb, false); + + break; + + case J1939_ETP_CMD_EOMA: + extd = J1939_ETP; + /* fall through */ + case J1939_TP_CMD_EOMA: + if (skcb->addr.type != extd) + return; + + if (j1939_tp_im_transmitter(skcb)) + j1939_xtp_rx_eoma(priv, skb, false); + + if (j1939_tp_im_receiver(skcb)) + j1939_xtp_rx_eoma(priv, skb, true); + + break; + + case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ + if (j1939_tp_im_transmitter(skcb)) + j1939_xtp_rx_abort(priv, skb, true); + + if (j1939_tp_im_receiver(skcb)) + j1939_xtp_rx_abort(priv, skb, false); + + break; + default: + return; + } +} + +int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); + + if (!j1939_tp_im_involved_anydir(skcb)) + return 0; + + switch (skcb->addr.pgn) { + case J1939_ETP_PGN_DAT: + skcb->addr.type = J1939_ETP; + /* fall through */ + case J1939_TP_PGN_DAT: + j1939_xtp_rx_dat(priv, skb); + break; + + case J1939_ETP_PGN_CTL: + skcb->addr.type = J1939_ETP; + /* fall through */ + case J1939_TP_PGN_CTL: + if (skb->len < 8) + return 0; /* Don't care. 
Nothing to extract here */ + + j1939_tp_cmd_recv(priv, skb); + break; + default: + return 0; /* no problem */ + } + return 1; /* "I processed the message" */ +} + +void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb) +{ + struct j1939_session *session; + + if (!skb->sk) + return; + + j1939_session_list_lock(priv); + session = j1939_session_get_simple(priv, skb); + j1939_session_list_unlock(priv); + if (!session) { + netdev_warn(priv->ndev, + "%s: Received already invalidated message\n", + __func__); + return; + } + + j1939_session_timers_cancel(session); + j1939_session_deactivate(session); + j1939_session_put(session); +} + +int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk) +{ + struct j1939_session *session, *saved; + + netdev_dbg(priv->ndev, "%s, sk: %p\n", __func__, sk); + j1939_session_list_lock(priv); + list_for_each_entry_safe(session, saved, + &priv->active_session_list, + active_session_list_entry) { + if (!sk || sk == session->sk) { + j1939_session_timers_cancel(session); + session->err = ESHUTDOWN; + j1939_session_deactivate_locked(session); + } + } + j1939_session_list_unlock(priv); + return NOTIFY_DONE; +} + +void j1939_tp_init(struct j1939_priv *priv) +{ + spin_lock_init(&priv->active_session_list_lock); + INIT_LIST_HEAD(&priv->active_session_list); + priv->tp_max_packet_size = J1939_MAX_ETP_PACKET_SIZE; +} -- cgit v1.2.3-59-g8ed1b From 95a7233c452a58a4c2310c456c73997853b2ec46 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 4 Sep 2019 16:56:37 +0300 Subject: net: openvswitch: Set OvS recirc_id from tc chain index Offloaded OvS datapath rules are translated one to one to tc rules, for example the following simplified OvS rule: recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2) Will be translated to the following tc rule: $ tc filter add dev dev1 ingress \ prio 1 chain 0 proto ip \ flower tcp ct_state -trk \ action ct pipe \ action goto chain 2 Received packets will first travel through tc, and if they aren't stolen by it, like in the above rule, they will continue to OvS datapath. Since we already did some actions (action ct in this case) which might modify the packets, and updated action stats, we would like to continue the processing with the correct recirc_id in OvS (here recirc_id(2)) where we left off. To support this, introduce a new skb extension for tc, which will be used for translating tc chain to ovs recirc_id to handle these miss cases. Last tc chain index will be set by tc goto chain action and read by OvS datapath. Signed-off-by: Paul Blakey Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/linux/skbuff.h | 13 +++++++++++++ include/uapi/linux/openvswitch.h | 3 +++ net/core/skbuff.c | 6 ++++++ net/openvswitch/datapath.c | 38 +++++++++++++++++++++++++++++++++----- net/openvswitch/datapath.h | 2 ++ net/openvswitch/flow.c | 13 +++++++++++++ net/sched/Kconfig | 13 +++++++++++++ net/sched/cls_api.c | 12 ++++++++++++ 8 files changed, 95 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 77c6dc88e95d..028e684fa974 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -279,6 +279,16 @@ struct nf_bridge_info { }; #endif +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) +/* Chain in tc_skb_ext will be used to share the tc chain with + * ovs recirc_id. It will be set to the current chain by tc + * and read by ovs to recirc_id.
+ */ +struct tc_skb_ext { + __u32 chain; +}; +#endif + struct sk_buff_head { /* These two members must be first. */ struct sk_buff *next; @@ -4057,6 +4067,9 @@ enum skb_ext_id { #endif #ifdef CONFIG_XFRM SKB_EXT_SEC_PATH, +#endif +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + TC_SKB_EXT, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index f271f1ec50ae..1887a451c388 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -123,6 +123,9 @@ struct ovs_vport_stats { /* Allow datapath to associate multiple Netlink PIDs to each vport */ #define OVS_DP_F_VPORT_PIDS (1 << 1) +/* Allow tc offload recirc sharing */ +#define OVS_DP_F_TC_RECIRC_SHARING (1 << 2) + /* Fixed logical ports. */ #define OVSP_LOCAL ((__u32)0) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ea8e8d332d85..2b40b5a9425b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4087,6 +4087,9 @@ static const u8 skb_ext_type_len[] = { #ifdef CONFIG_XFRM [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), #endif +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -4097,6 +4100,9 @@ static __always_inline unsigned int skb_ext_total_length(void) #endif #ifdef CONFIG_XFRM skb_ext_type_len[SKB_EXT_SEC_PATH] + +#endif +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext_type_len[TC_SKB_EXT] + #endif 0; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 65122bbccd27..dde9d762edee 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1545,10 +1545,34 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *in dp->user_features = 0; } -static void ovs_dp_change(struct datapath *dp, struct nlattr *a[]) +DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support); + +static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) { - if (a[OVS_DP_ATTR_USER_FEATURES]) - dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); + u32 user_features = 0; + + if (a[OVS_DP_ATTR_USER_FEATURES]) { + user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); + + if (user_features & ~(OVS_DP_F_VPORT_PIDS | + OVS_DP_F_UNALIGNED | + OVS_DP_F_TC_RECIRC_SHARING)) + return -EOPNOTSUPP; + +#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (user_features & OVS_DP_F_TC_RECIRC_SHARING) + return -EOPNOTSUPP; +#endif + } + + dp->user_features = user_features; + + if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) + static_branch_enable(&tc_recirc_sharing_support); + else + static_branch_disable(&tc_recirc_sharing_support); + + return 0; } static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) @@ -1610,7 +1634,9 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; - ovs_dp_change(dp, a); + err = ovs_dp_change(dp, a); + if (err) + goto err_destroy_meters; /* So far only local changes have been made, now need the lock. 
*/ ovs_lock(); @@ -1736,7 +1762,9 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(dp)) goto err_unlock_free; - ovs_dp_change(dp, info->attrs); + err = ovs_dp_change(dp, info->attrs); + if (err) + goto err_unlock_free; err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_SET); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 751d34accdf9..81e85dde8217 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -218,6 +218,8 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; +DECLARE_STATIC_KEY_FALSE(tc_recirc_sharing_support); + void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 9d81d2c7bf82..38147e6a20f5 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -842,6 +842,9 @@ static int key_extract_mac_proto(struct sk_buff *skb) int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *tc_ext; +#endif int res, err; /* Extract metadata from packet. */ @@ -874,7 +877,17 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, if (res < 0) return res; key->mac_proto = res; + +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (static_branch_unlikely(&tc_recirc_sharing_support)) { + tc_ext = skb_ext_find(skb, TC_SKB_EXT); + key->recirc_id = tc_ext ? tc_ext->chain : 0; + } else { + key->recirc_id = 0; + } +#else key->recirc_id = 0; +#endif err = key_extract(skb, key); if (!err) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index afd2ba157a13..b3faafeafab9 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -963,6 +963,19 @@ config NET_IFE_SKBTCINDEX tristate "Support to encoding decoding skb tcindex on IFE action" depends on NET_ACT_IFE +config NET_TC_SKB_EXT + bool "TC recirculation support" + depends on NET_CLS_ACT + default y if NET_CLS_ACT + select SKB_EXTENSIONS + + help + Say Y here to allow tc chain misses to continue in OvS datapath in + the correct recirc_id, and hardware chain misses to continue in + the correct chain in tc software datapath. + + Say N here if you won't be using tc<->ovs offload or tc chains offload. + endif # NET_SCHED config NET_SCH_FIFO diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 671ca905dbb5..05c4fe1c3ca2 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1514,6 +1514,18 @@ reclassify: goto reset; } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) { first_tp = res->goto_tp; + +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + { + struct tc_skb_ext *ext; + + ext = skb_ext_add(skb, TC_SKB_EXT); + if (WARN_ON_ONCE(!ext)) + return TC_ACT_SHOT; + + ext->chain = err & TC_ACT_EXT_VAL_MASK; + } +#endif goto reset; } #endif -- cgit v1.2.3-59-g8ed1b From d1967e495a8d1dd6b8ff2df9d3e045c5d60a37a6 Mon Sep 17 00:00:00 2001 From: David Dai Date: Wed, 4 Sep 2019 10:03:43 -0500 Subject: net_sched: act_police: add 2 new attributes to support police 64bit rate and peakrate For high speed adapters like the Mellanox CX-5 card, bandwidth can reach up to 100 Gbits per second. Currently htb already supports 64bit rate in tc utility. However, police action rate and peakrate are still limited to 32bit values (up to 32 Gbits per second). Add 2 new attributes, TCA_POLICE_RATE64 and TCA_POLICE_PEAKRATE64, in the kernel for 64bit support so that the tc utility can use them for 64bit rate and peakrate values to break the 32bit limit, while keeping backward binary compatibility. Tested-by: David Dai Signed-off-by: David Dai Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 ++ net/sched/act_police.c | 27 +++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index b057aeeb6338..a6aa466fac9e 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -160,6 +160,8 @@ enum { TCA_POLICE_RESULT, TCA_POLICE_TM, TCA_POLICE_PAD, + TCA_POLICE_RATE64, + TCA_POLICE_PEAKRATE64, __TCA_POLICE_MAX #define TCA_POLICE_RESULT TCA_POLICE_RESULT }; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 6315e0f8d26e..89c04c52af3d 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -40,6 +40,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, [TCA_POLICE_AVRATE] = { .type = NLA_U32 }, [TCA_POLICE_RESULT] = { .type = NLA_U32 }, + [TCA_POLICE_RATE64] = { .type = NLA_U64 }, + [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 }, }; static int tcf_police_init(struct net *net, struct nlattr *nla, @@ -58,6 +60,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, struct tcf_police_params *new; bool exists = false; u32 index; + u64 rate64, prate64; if (nla == NULL) return -EINVAL; @@ -155,14 +158,18 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } if (R_tab) { new->rate_present = true; - psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0); + rate64 = tb[TCA_POLICE_RATE64] ? + nla_get_u64(tb[TCA_POLICE_RATE64]) : 0; + psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64); qdisc_put_rtab(R_tab); } else { new->rate_present = false; } if (P_tab) { new->peak_present = true; - psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0); + prate64 = tb[TCA_POLICE_PEAKRATE64] ? + nla_get_u64(tb[TCA_POLICE_PEAKRATE64]) : 0; + psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64); qdisc_put_rtab(P_tab); } else { new->peak_present = false; @@ -313,10 +320,22 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, lockdep_is_held(&police->tcf_lock)); opt.mtu = p->tcfp_mtu; opt.burst = PSCHED_NS2TICKS(p->tcfp_burst); - if (p->rate_present) + if (p->rate_present) { psched_ratecfg_getrate(&opt.rate, &p->rate); - if (p->peak_present) + if ((police->params->rate.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64_64bit(skb, TCA_POLICE_RATE64, + police->params->rate.rate_bytes_ps, + TCA_POLICE_PAD)) + goto nla_put_failure; + } + if (p->peak_present) { psched_ratecfg_getrate(&opt.peakrate, &p->peak); + if ((police->params->peak.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64_64bit(skb, TCA_POLICE_PEAKRATE64, + police->params->peak.rate_bytes_ps, + TCA_POLICE_PAD)) + goto nla_put_failure; + } if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; if (p->tcfp_result && -- cgit v1.2.3-59-g8ed1b From e019a3b17f0d79894917e6d83b17e87e8f809deb Mon Sep 17 00:00:00 2001 From: Dirk van der Merwe Date: Mon, 9 Sep 2019 00:54:17 +0100 Subject: devlink: extend 'fw_load_policy' values Add the 'disk' value to the generic 'fw_load_policy' devlink parameter.
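As a usage sketch for the act_police change above (assuming an iproute2 tc new enough to emit the new attributes), a policing rate past the former 32bit byte-rate ceiling can be requested in the same style as the tc example quoted earlier in this log:

$ tc filter add dev eth0 ingress \
        prio 1 protocol ip \
        flower \
        action police rate 40gbit burst 2m conform-exceed drop

40gbit is 5 Gbytes per second, which no longer fits in a u32, so a tc built against the updated pkt_cls.h would carry it in TCA_POLICE_RATE64 (and a large peakrate in TCA_POLICE_PEAKRATE64), while older binaries keep using the 32bit TBF parameters unchanged.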
This value indicates that firmware should always be loaded from disk only. Signed-off-by: Dirk van der Merwe Signed-off-by: Jakub Kicinski Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- Documentation/networking/devlink-params.txt | 2 ++ include/uapi/linux/devlink.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/networking/devlink-params.txt b/Documentation/networking/devlink-params.txt index 2d26434ddcf8..fadb5436188d 100644 --- a/Documentation/networking/devlink-params.txt +++ b/Documentation/networking/devlink-params.txt @@ -48,4 +48,6 @@ fw_load_policy [DEVICE, GENERIC] Load firmware version preferred by the driver. * DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH (1) Load firmware currently stored in flash. + * DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DISK (2) + Load firmware currently available on host's disk. Type: u8 diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 546e75dd74ac..c25cc29a6647 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -202,6 +202,7 @@ enum devlink_param_cmode { enum devlink_param_fw_load_policy_value { DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER, DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH, + DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DISK, }; enum { -- cgit v1.2.3-59-g8ed1b From 5bbd21df5a075a59ab53030d25f9848ccca93d73 Mon Sep 17 00:00:00 2001 From: Dirk van der Merwe Date: Mon, 9 Sep 2019 00:54:18 +0100 Subject: devlink: add 'reset_dev_on_drv_probe' param Add the 'reset_dev_on_drv_probe' devlink parameter, controlling the device reset policy on driver probe. This parameter is useful in conjunction with the existing 'fw_load_policy' parameter. Signed-off-by: Dirk van der Merwe Signed-off-by: Jakub Kicinski Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- Documentation/networking/devlink-params.txt | 14 ++++++++++++++ include/net/devlink.h | 5 +++++ include/uapi/linux/devlink.h | 7 +++++++ net/core/devlink.c | 5 +++++ 4 files changed, 31 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/networking/devlink-params.txt b/Documentation/networking/devlink-params.txt index fadb5436188d..ddba3e9b55b1 100644 --- a/Documentation/networking/devlink-params.txt +++ b/Documentation/networking/devlink-params.txt @@ -51,3 +51,17 @@ fw_load_policy [DEVICE, GENERIC] * DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DISK (2) Load firmware currently available on host's disk. Type: u8 + +reset_dev_on_drv_probe [DEVICE, GENERIC] + Controls the device's reset policy on driver probe. + Valid values: + * DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_UNKNOWN (0) + Unknown or invalid value. + * DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_ALWAYS (1) + Always reset device on driver probe. + * DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_NEVER (2) + Never reset device on driver probe. + * DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_DISK (3) + Reset only if device firmware can be found in the + filesystem. 
+ Type: u8 diff --git a/include/net/devlink.h b/include/net/devlink.h index 460bc629d1a4..03e4d9244ff3 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -398,6 +398,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, + DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -428,6 +429,10 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME "fw_load_policy" #define DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE DEVLINK_PARAM_TYPE_U8 +#define DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME \ + "reset_dev_on_drv_probe" +#define DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE DEVLINK_PARAM_TYPE_U8 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index c25cc29a6647..1da3e83f1fd4 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -205,6 +205,13 @@ enum devlink_param_fw_load_policy_value { DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DISK, }; +enum devlink_param_reset_dev_on_drv_probe_value { + DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_UNKNOWN, + DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_ALWAYS, + DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_NEVER, + DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_DISK, +}; + enum { DEVLINK_ATTR_STATS_RX_PACKETS, /* u64 */ DEVLINK_ATTR_STATS_RX_BYTES, /* u64 */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 6e52d639dac6..4a2fb94c44cf 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2852,6 +2852,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, + .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME, + .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3-59-g8ed1b From ee394f96ad7517fbc0de9106dcc7ce9efb14f264 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Sat, 7 Sep 2019 18:04:26 +0200 Subject: netfilter: nft_synproxy: add synproxy stateful object support Register a new synproxy stateful object type into the stateful object infrastructure. 
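Returning to the 'reset_dev_on_drv_probe' patch above, here is a minimal driver-side sketch of how the new generic parameter might be exposed; the mydrv_* names and helpers are hypothetical, not taken from any in-tree driver:

#include <net/devlink.h>

/* Hypothetical helpers that read/write the policy in the device's
 * persistent configuration, using the
 * DEVLINK_PARAM_RESET_DEV_ON_DRV_PROBE_VALUE_* constants (with
 * _UNKNOWN reported for an unrecognized stored value).
 */
u8 mydrv_read_reset_policy(void *priv);
int mydrv_write_reset_policy(void *priv, u8 policy);

static int mydrv_reset_policy_get(struct devlink *devlink, u32 id,
                                  struct devlink_param_gset_ctx *ctx)
{
        ctx->val.vu8 = mydrv_read_reset_policy(devlink_priv(devlink));
        return 0;
}

static int mydrv_reset_policy_set(struct devlink *devlink, u32 id,
                                  struct devlink_param_gset_ctx *ctx)
{
        return mydrv_write_reset_policy(devlink_priv(devlink), ctx->val.vu8);
}

static const struct devlink_param mydrv_devlink_params[] = {
        DEVLINK_PARAM_GENERIC(RESET_DEV_ON_DRV_PROBE,
                              BIT(DEVLINK_PARAM_CMODE_PERMANENT),
                              mydrv_reset_policy_get,
                              mydrv_reset_policy_set,
                              NULL),
};

static int mydrv_register_params(struct devlink *devlink)
{
        return devlink_params_register(devlink, mydrv_devlink_params,
                                       ARRAY_SIZE(mydrv_devlink_params));
}

DEVLINK_PARAM_GENERIC() fills in the generic name and type, so the driver only supplies the supported configuration modes and the accessors; the NULL validate callback could instead reject values outside the enum.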
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 +- net/netfilter/nft_synproxy.c | 143 ++++++++++++++++++++++++++----- 2 files changed, 124 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 0ff932dadc8e..ed8881ad18ed 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1481,7 +1481,8 @@ enum nft_ct_expectation_attributes { #define NFT_OBJECT_CT_TIMEOUT 7 #define NFT_OBJECT_SECMARK 8 #define NFT_OBJECT_CT_EXPECT 9 -#define __NFT_OBJECT_MAX 10 +#define NFT_OBJECT_SYNPROXY 10 +#define __NFT_OBJECT_MAX 11 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index db4c23f5dfcb..e2c1fc608841 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -24,7 +24,7 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts, const struct tcphdr *tcp, struct synproxy_net *snet, struct nf_synproxy_info *info, - struct nft_synproxy *priv) + const struct nft_synproxy *priv) { this_cpu_inc(snet->stats->syn_received); if (tcp->ece && tcp->cwr) @@ -41,14 +41,13 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts, NF_SYNPROXY_OPT_ECN); } -static void nft_synproxy_eval_v4(const struct nft_expr *expr, +static void nft_synproxy_eval_v4(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { - struct nft_synproxy *priv = nft_expr_priv(expr); struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); @@ -73,14 +72,13 @@ static void nft_synproxy_eval_v4(const struct nft_expr *expr, } #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) -static void nft_synproxy_eval_v6(const struct nft_expr *expr, +static void nft_synproxy_eval_v6(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { - struct nft_synproxy *priv = nft_expr_priv(expr); struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); @@ -105,9 +103,9 @@ static void nft_synproxy_eval_v6(const struct nft_expr *expr, } #endif /* CONFIG_NF_TABLES_IPV6*/ -static void nft_synproxy_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_synproxy_do_eval(const struct nft_synproxy *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; @@ -140,23 +138,22 @@ static void nft_synproxy_eval(const struct nft_expr *expr, switch (skb->protocol) { case htons(ETH_P_IP): - nft_synproxy_eval_v4(expr, regs, pkt, tcp, &_tcph, &opts); + nft_synproxy_eval_v4(priv, regs, pkt, tcp, &_tcph, &opts); return; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case htons(ETH_P_IPV6): - nft_synproxy_eval_v6(expr, regs, pkt, tcp, &_tcph, &opts); + nft_synproxy_eval_v6(priv, regs, pkt, tcp, &_tcph, &opts); return; #endif } regs->verdict.code = NFT_BREAK; } -static int nft_synproxy_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_synproxy_do_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct 
nft_synproxy *priv) { struct synproxy_net *snet = synproxy_pernet(ctx->net); - struct nft_synproxy *priv = nft_expr_priv(expr); u32 flags; int err; @@ -206,8 +203,7 @@ nf_ct_failure: return err; } -static void nft_synproxy_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) { struct synproxy_net *snet = synproxy_pernet(ctx->net); @@ -229,10 +225,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_synproxy_do_dump(struct sk_buff *skb, struct nft_synproxy *priv) { - const struct nft_synproxy *priv = nft_expr_priv(expr); - if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) || nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) || nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options))) @@ -244,6 +238,15 @@ nla_put_failure: return -1; } +static void nft_synproxy_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_synproxy *priv = nft_expr_priv(expr); + + nft_synproxy_do_eval(priv, regs, pkt); +} + static int nft_synproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) @@ -252,6 +255,28 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx, (1 << NF_INET_FORWARD)); } +static int nft_synproxy_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + + return nft_synproxy_do_init(ctx, tb, priv); +} + +static void nft_synproxy_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nft_synproxy_do_destroy(ctx); +} + +static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + + return nft_synproxy_do_dump(skb, priv); +} + static struct nft_expr_type nft_synproxy_type; static const struct nft_expr_ops nft_synproxy_ops = { .eval = nft_synproxy_eval, @@ -271,14 +296,89 @@ static struct nft_expr_type nft_synproxy_type __read_mostly = { .maxattr = NFTA_SYNPROXY_MAX, }; +static int nft_synproxy_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_synproxy *priv = nft_obj_data(obj); + + return nft_synproxy_do_init(ctx, tb, priv); +} + +static void nft_synproxy_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + nft_synproxy_do_destroy(ctx); +} + +static int nft_synproxy_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + struct nft_synproxy *priv = nft_obj_data(obj); + + return nft_synproxy_do_dump(skb, priv); +} + +static void nft_synproxy_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_synproxy *priv = nft_obj_data(obj); + + nft_synproxy_do_eval(priv, regs, pkt); +} + +static void nft_synproxy_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_synproxy *newpriv = nft_obj_data(newobj); + struct nft_synproxy *priv = nft_obj_data(obj); + + priv->info = newpriv->info; +} + +static struct nft_object_type nft_synproxy_obj_type; +static const struct nft_object_ops nft_synproxy_obj_ops = { + .type = &nft_synproxy_obj_type, + .size = sizeof(struct nft_synproxy), + .init = nft_synproxy_obj_init, + .destroy = nft_synproxy_obj_destroy, + .dump = nft_synproxy_obj_dump, + 
.eval = nft_synproxy_obj_eval, + .update = nft_synproxy_obj_update, +}; + +static struct nft_object_type nft_synproxy_obj_type __read_mostly = { + .type = NFT_OBJECT_SYNPROXY, + .ops = &nft_synproxy_obj_ops, + .maxattr = NFTA_SYNPROXY_MAX, + .policy = nft_synproxy_policy, + .owner = THIS_MODULE, +}; + static int __init nft_synproxy_module_init(void) { - return nft_register_expr(&nft_synproxy_type); + int err; + + err = nft_register_obj(&nft_synproxy_obj_type); + if (err < 0) + return err; + + err = nft_register_expr(&nft_synproxy_type); + if (err < 0) + goto err; + + return 0; + +err: + nft_unregister_obj(&nft_synproxy_obj_type); + return err; } static void __exit nft_synproxy_module_exit(void) { - return nft_unregister_expr(&nft_synproxy_type); + nft_unregister_expr(&nft_synproxy_type); + nft_unregister_obj(&nft_synproxy_obj_type); } module_init(nft_synproxy_module_init); @@ -287,3 +387,4 @@ module_exit(nft_synproxy_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez "); MODULE_ALIAS_NFT_EXPR("synproxy"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY); -- cgit v1.2.3-59-g8ed1b From 64f658ded48eccd23d429d636d49a03b3f53d741 Mon Sep 17 00:00:00 2001 From: Dirk van der Merwe Date: Wed, 11 Sep 2019 12:08:32 +0100 Subject: devlink: add unknown 'fw_load_policy' value Similar to the 'reset_dev_on_drv_probe' devlink parameter, it is useful to have an unknown value which can be used by drivers to report that the hardware value isn't recognized or is otherwise invalid instead of failing the operation. This is especially useful for u8/enum parameters. Suggested-by: Jakub Kicinski Signed-off-by: Dirk van der Merwe Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1da3e83f1fd4..8da5365850cd 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -203,6 +203,7 @@ enum devlink_param_fw_load_policy_value { DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER, DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH, DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DISK, + DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_UNKNOWN, }; enum devlink_param_reset_dev_on_drv_probe_value { -- cgit v1.2.3-59-g8ed1b From 415606588c61230b7b4f0118fc2d64a0c1c4d102 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Wed, 11 Sep 2019 09:16:21 +0300 Subject: PTP: introduce new versions of IOCTLs The current versions of the IOCTLs have a small problem which prevents us from extending the API by making use of reserved fields. In these new IOCTLs, we are now making sure that flags and rsv fields are zero which will allow us to extend the API in the future. Reviewed-by: Richard Cochran Signed-off-by: Felipe Balbi Signed-off-by: David S.
Miller --- drivers/ptp/ptp_chardev.c | 63 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/ptp_clock.h | 24 +++++++++++++++- 2 files changed, 86 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 18ffe449efdf..9c18476d8d10 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -126,7 +126,9 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) switch (cmd) { case PTP_CLOCK_GETCAPS: + case PTP_CLOCK_GETCAPS2: memset(&caps, 0, sizeof(caps)); + caps.max_adj = ptp->info->max_adj; caps.n_alarm = ptp->info->n_alarm; caps.n_ext_ts = ptp->info->n_ext_ts; @@ -139,11 +141,24 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) break; case PTP_EXTTS_REQUEST: + case PTP_EXTTS_REQUEST2: + memset(&req, 0, sizeof(req)); + if (copy_from_user(&req.extts, (void __user *)arg, sizeof(req.extts))) { err = -EFAULT; break; } + if (((req.extts.flags & ~PTP_EXTTS_VALID_FLAGS) || + req.extts.rsv[0] || req.extts.rsv[1]) && + cmd == PTP_EXTTS_REQUEST2) { + err = -EINVAL; + break; + } else if (cmd == PTP_EXTTS_REQUEST) { + req.extts.flags &= ~PTP_EXTTS_VALID_FLAGS; + req.extts.rsv[0] = 0; + req.extts.rsv[1] = 0; + } if (req.extts.index >= ops->n_ext_ts) { err = -EINVAL; break; @@ -154,11 +169,27 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) break; case PTP_PEROUT_REQUEST: + case PTP_PEROUT_REQUEST2: + memset(&req, 0, sizeof(req)); + if (copy_from_user(&req.perout, (void __user *)arg, sizeof(req.perout))) { err = -EFAULT; break; } + if (((req.perout.flags & ~PTP_PEROUT_VALID_FLAGS) || + req.perout.rsv[0] || req.perout.rsv[1] || + req.perout.rsv[2] || req.perout.rsv[3]) && + cmd == PTP_PEROUT_REQUEST2) { + err = -EINVAL; + break; + } else if (cmd == PTP_PEROUT_REQUEST) { + req.perout.flags &= ~PTP_PEROUT_VALID_FLAGS; + req.perout.rsv[0] = 0; + req.perout.rsv[1] = 0; + req.perout.rsv[2] = 0; + req.perout.rsv[3] = 0; + } if (req.perout.index >= ops->n_per_out) { err = -EINVAL; break; @@ -169,6 +200,9 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) break; case PTP_ENABLE_PPS: + case PTP_ENABLE_PPS2: + memset(&req, 0, sizeof(req)); + if (!capable(CAP_SYS_TIME)) return -EPERM; req.type = PTP_CLK_REQ_PPS; @@ -177,6 +211,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) break; case PTP_SYS_OFFSET_PRECISE: + case PTP_SYS_OFFSET_PRECISE2: if (!ptp->info->getcrosststamp) { err = -EOPNOTSUPP; break; @@ -201,6 +236,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) break; case PTP_SYS_OFFSET_EXTENDED: + case PTP_SYS_OFFSET_EXTENDED2: if (!ptp->info->gettimex64) { err = -EOPNOTSUPP; break; @@ -232,6 +268,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) break; case PTP_SYS_OFFSET: + case PTP_SYS_OFFSET2: sysoff = memdup_user((void __user *)arg, sizeof(*sysoff)); if (IS_ERR(sysoff)) { err = PTR_ERR(sysoff); @@ -266,10 +303,23 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) break; case PTP_PIN_GETFUNC: + case PTP_PIN_GETFUNC2: if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) { err = -EFAULT; break; } + if ((pd.rsv[0] || pd.rsv[1] || pd.rsv[2] + || pd.rsv[3] || pd.rsv[4]) + && cmd == PTP_PIN_GETFUNC2) { + err = -EINVAL; + break; + } else if (cmd == PTP_PIN_GETFUNC) { + pd.rsv[0] = 0; + pd.rsv[1] = 0; + pd.rsv[2] = 0; + pd.rsv[3] = 0; + pd.rsv[4] = 0; + } pin_index = pd.index; if 
(pin_index >= ops->n_pins) { err = -EINVAL; @@ -285,10 +335,23 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) break; case PTP_PIN_SETFUNC: + case PTP_PIN_SETFUNC2: if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) { err = -EFAULT; break; } + if ((pd.rsv[0] || pd.rsv[1] || pd.rsv[2] + || pd.rsv[3] || pd.rsv[4]) + && cmd == PTP_PIN_SETFUNC2) { + err = -EINVAL; + break; + } else if (cmd == PTP_PIN_SETFUNC) { + pd.rsv[0] = 0; + pd.rsv[1] = 0; + pd.rsv[2] = 0; + pd.rsv[3] = 0; + pd.rsv[4] = 0; + } pin_index = pd.index; if (pin_index >= ops->n_pins) { err = -EINVAL; diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 1bc794ad957a..9a0af3511b68 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -25,10 +25,20 @@ #include #include -/* PTP_xxx bits, for the flags field within the request structures. */ +/* + * Bits of the ptp_extts_request.flags field: + */ #define PTP_ENABLE_FEATURE (1<<0) #define PTP_RISING_EDGE (1<<1) #define PTP_FALLING_EDGE (1<<2) +#define PTP_EXTTS_VALID_FLAGS (PTP_ENABLE_FEATURE | \ + PTP_RISING_EDGE | \ + PTP_FALLING_EDGE) + +/* + * Bits of the ptp_perout_request.flags field: + */ +#define PTP_PEROUT_VALID_FLAGS (0) /* * struct ptp_clock_time - represents a time value @@ -149,6 +159,18 @@ struct ptp_pin_desc { #define PTP_SYS_OFFSET_EXTENDED \ _IOWR(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended) +#define PTP_CLOCK_GETCAPS2 _IOR(PTP_CLK_MAGIC, 10, struct ptp_clock_caps) +#define PTP_EXTTS_REQUEST2 _IOW(PTP_CLK_MAGIC, 11, struct ptp_extts_request) +#define PTP_PEROUT_REQUEST2 _IOW(PTP_CLK_MAGIC, 12, struct ptp_perout_request) +#define PTP_ENABLE_PPS2 _IOW(PTP_CLK_MAGIC, 13, int) +#define PTP_SYS_OFFSET2 _IOW(PTP_CLK_MAGIC, 14, struct ptp_sys_offset) +#define PTP_PIN_GETFUNC2 _IOWR(PTP_CLK_MAGIC, 15, struct ptp_pin_desc) +#define PTP_PIN_SETFUNC2 _IOW(PTP_CLK_MAGIC, 16, struct ptp_pin_desc) +#define PTP_SYS_OFFSET_PRECISE2 \ + _IOWR(PTP_CLK_MAGIC, 17, struct ptp_sys_offset_precise) +#define PTP_SYS_OFFSET_EXTENDED2 \ + _IOWR(PTP_CLK_MAGIC, 18, struct ptp_sys_offset_extended) + struct ptp_extts_event { struct ptp_clock_time t; /* Time event occured. */ unsigned int index; /* Which channel produced the event. */ -- cgit v1.2.3-59-g8ed1b From 823eb2a3c4c7f1b3e749f0dddb70bf8b09a76a10 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Wed, 11 Sep 2019 09:16:22 +0300 Subject: PTP: add support for one-shot output Some controllers allow for a one-shot output pulse, in contrast to periodic output. Now that we have extensible versions of our IOCTLs, we can finally make use of the 'flags' field to pass a bit telling driver that if we want one-shot pulse output. Signed-off-by: Felipe Balbi Reviewed-by: Richard Cochran Signed-off-by: David S. Miller --- include/uapi/linux/ptp_clock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 9a0af3511b68..f16301015949 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -38,8 +38,8 @@ /* * Bits of the ptp_perout_request.flags field: */ -#define PTP_PEROUT_VALID_FLAGS (0) - +#define PTP_PEROUT_ONE_SHOT (1<<0) +#define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT) /* * struct ptp_clock_time - represents a time value * @@ -77,7 +77,7 @@ struct ptp_perout_request { struct ptp_clock_time start; /* Absolute start time. */ struct ptp_clock_time period; /* Desired period, zero means disable. 
*/ unsigned int index; /* Which channel to configure. */ - unsigned int flags; /* Reserved for future use. */ + unsigned int flags; unsigned int rsv[4]; /* Reserved for future use. */ }; -- cgit v1.2.3-59-g8ed1b From 2670ac2625f98557fd7e083f8aa22c297e49039e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 12 Sep 2019 10:49:46 +0200 Subject: net: devlink: move reload fail indication to devlink core and expose to user Currently the fact that devlink reload failed is stored in drivers. Move this flag into devlink core. Also, expose it to the user. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 15 +++++---------- include/net/devlink.h | 3 +++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 21 ++++++++++++++++++++- 4 files changed, 30 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index c71a1d9ea17b..3fa96076e8a5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -80,7 +80,6 @@ struct mlxsw_core { struct mlxsw_thermal *thermal; struct mlxsw_core_port *ports; unsigned int max_ports; - bool reload_fail; bool fw_flash_in_progress; unsigned long driver_priv[0]; /* driver_priv has to be always the last item */ @@ -1002,15 +1001,11 @@ mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, struct netlink_ext_ack *extack) { struct mlxsw_core *mlxsw_core = devlink_priv(devlink); - int err; - - err = mlxsw_core_bus_device_register(mlxsw_core->bus_info, - mlxsw_core->bus, - mlxsw_core->bus_priv, true, - devlink); - mlxsw_core->reload_fail = !!err; - return err; + return mlxsw_core_bus_device_register(mlxsw_core->bus_info, + mlxsw_core->bus, + mlxsw_core->bus_priv, true, + devlink); } static int mlxsw_devlink_flash_update(struct devlink *devlink, @@ -1254,7 +1249,7 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, { struct devlink *devlink = priv_to_devlink(mlxsw_core); - if (mlxsw_core->reload_fail) { + if (devlink_is_reload_failed(devlink)) { if (!reload) /* Only the parts that were not de-initialized in the * failed reload attempt need to be de-initialized. 
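As a userspace sketch of the two PTP patches above (the device path and channel index are assumptions for illustration), requesting a single output pulse through the new ioctl could look like this:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/ptp_clock.h>    /* headers from a kernel with these patches */

static int request_one_shot_pulse(void)
{
        struct ptp_perout_request req;
        int fd, err;

        fd = open("/dev/ptp0", O_RDWR); /* assumed PHC device */
        if (fd < 0)
                return -1;

        /* The new *2 ioctls reject unknown flags and nonzero
         * reserved fields, so clear the whole request first.
         */
        memset(&req, 0, sizeof(req));
        req.index = 0;                  /* first periodic-output channel */
        req.start.sec = 1000;           /* absolute start time on the PHC;
                                         * a real program would read the
                                         * clock and add an offset
                                         */
        req.start.nsec = 0;
        req.period.sec = 1;             /* how a one-shot capable driver
                                         * interprets the period is device
                                         * specific
                                         */
        req.flags = PTP_PEROUT_ONE_SHOT;

        err = ioctl(fd, PTP_PEROUT_REQUEST2, &req);
        close(fd);
        return err;
}

The same request sent through the legacy PTP_PEROUT_REQUEST would have its flags silently cleared, which is exactly the ambiguity the new ioctl versions remove.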
diff --git a/include/net/devlink.h b/include/net/devlink.h index ab8d56d12ffd..23e4b65ec9df 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -38,6 +38,7 @@ struct devlink { struct device *dev; possible_net_t _net; struct mutex lock; + bool reload_failed; char priv[0] __aligned(NETDEV_ALIGN); }; @@ -945,6 +946,8 @@ void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state); +bool devlink_is_reload_failed(const struct devlink *devlink); + void devlink_flash_update_begin_notify(struct devlink *devlink); void devlink_flash_update_end_notify(struct devlink *devlink); void devlink_flash_update_status_notify(struct devlink *devlink, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 8da5365850cd..580b7a2e40e1 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -419,6 +419,8 @@ enum devlink_attr { DEVLINK_ATTR_TRAP_METADATA, /* nested */ DEVLINK_ATTR_TRAP_GROUP_NAME, /* string */ + DEVLINK_ATTR_RELOAD_FAILED, /* u8 0 or 1 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 9e522639693d..e48680efe54a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -471,6 +471,8 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; @@ -2677,6 +2679,21 @@ static bool devlink_reload_supported(struct devlink *devlink) return devlink->ops->reload_down && devlink->ops->reload_up; } +static void devlink_reload_failed_set(struct devlink *devlink, + bool reload_failed) +{ + if (devlink->reload_failed == reload_failed) + return; + devlink->reload_failed = reload_failed; + devlink_notify(devlink, DEVLINK_CMD_NEW); +} + +bool devlink_is_reload_failed(const struct devlink *devlink) +{ + return devlink->reload_failed; +} +EXPORT_SYMBOL_GPL(devlink_is_reload_failed); + static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; @@ -2693,7 +2710,9 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) err = devlink->ops->reload_down(devlink, info->extack); if (err) return err; - return devlink->ops->reload_up(devlink, info->extack); + err = devlink->ops->reload_up(devlink, info->extack); + devlink_reload_failed_set(devlink, !!err); + return err; } static int devlink_nl_flash_update_fill(struct sk_buff *msg, -- cgit v1.2.3-59-g8ed1b From f9af2dbbfe01def62765a58af7fbc488351893c3 Mon Sep 17 00:00:00 2001 From: Thomas Higdon Date: Fri, 13 Sep 2019 23:23:34 +0000 Subject: tcp: Add TCP_INFO counter for packets received out-of-order For receive-heavy cases on the server-side, we want to track the connection quality for individual client IPs. This counter, similar to the existing system-wide TCPOFOQueue counter in /proc/net/netstat, tracks out-of-order packet reception. By providing this counter in TCP_INFO, it will allow understanding to what degree receive-heavy sockets are experiencing out-of-order delivery and packet drops indicating congestion. Please note that this is similar to the counter in NetBSD TCP_INFO, and has the same name. Also note that we avoid increasing the size of the tcp_sock struct by taking advantage of a hole. 
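A minimal sketch of reading the new counter from a connected socket, assuming kernel headers that already carry the field:

#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>  /* struct tcp_info with the new fields */

static void print_ooo_packets(int sock)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        memset(&info, 0, sizeof(info));
        if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
                return;

        /* Older kernels fill in a shorter struct; only trust the field
         * if the kernel reported enough bytes to cover it.
         */
        if (len >= offsetof(struct tcp_info, tcpi_rcv_ooopack) +
                   sizeof(info.tcpi_rcv_ooopack))
                printf("out-of-order packets received: %u\n",
                       info.tcpi_rcv_ooopack);
}

A rising tcpi_rcv_ooopack on a receive-heavy socket points at reordering or loss on the path from that particular client, which is the per-connection visibility the commit message is after.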
Signed-off-by: Thomas Higdon Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 ++ include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 1 + 4 files changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f3a85a7fb4b1..99617e528ea2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -354,6 +354,8 @@ struct tcp_sock { #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif + u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ + /* Receiver side RTT estimation */ u32 rcv_rtt_last_tsecr; struct { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index b3564f85a762..20237987ccc8 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -270,6 +270,8 @@ struct tcp_info { __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ __u32 tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ __u32 tcpi_reord_seen; /* reordering events seen */ + + __u32 tcpi_rcv_ooopack; /* Out-of-order packets received */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 94df48bcecc2..4cf58208270e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2653,6 +2653,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; /* Clean up fastopen related fields */ @@ -3295,6 +3296,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; + info->tcpi_rcv_ooopack = tp->rcv_ooopack; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7e94223fdb2b..3578357abe30 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4555,6 +4555,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->pred_flags = 0; inet_csk_schedule_ack(sk); + tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); seq = TCP_SKB_CB(skb)->seq; end_seq = TCP_SKB_CB(skb)->end_seq; -- cgit v1.2.3-59-g8ed1b From 8f7baad7f03543451af27f5380fc816b008aa1f2 Mon Sep 17 00:00:00 2001 From: Thomas Higdon Date: Fri, 13 Sep 2019 23:23:35 +0000 Subject: tcp: Add snd_wnd to TCP_INFO Neal Cardwell mentioned that snd_wnd would be useful for diagnosing TCP performance problems -- > (1) Usually when we're diagnosing TCP performance problems, we do so > from the sender, since the sender makes most of the > performance-critical decisions (cwnd, pacing, TSO size, TSQ, etc). > From the sender-side the thing that would be most useful is to see > tp->snd_wnd, the receive window that the receiver has advertised to > the sender. This serves the purpose of adding an additional __u32 to avoid the would-be hole caused by the addition of the tcpi_rcv_ooopack field. Signed-off-by: Thomas Higdon Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S.
--- include/uapi/linux/tcp.h | 4 ++++ net/ipv4/tcp.c | 1 + 2 files changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 20237987ccc8..81e697978e8b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -272,6 +272,10 @@ struct tcp_info { __u32 tcpi_reord_seen; /* reordering events seen */ __u32 tcpi_rcv_ooopack; /* Out-of-order packets received */ + + __u32 tcpi_snd_wnd; /* peer's advertised receive window after * scaling (bytes) */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4cf58208270e..79c325a07ba5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3297,6 +3297,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; + info->tcpi_snd_wnd = tp->snd_wnd; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); -- cgit v1.2.3-59-g8ed1b From 9c66d15646760eb8982242b4531c4d4fd36118fd Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Sun, 15 Sep 2019 04:59:58 +0300 Subject: taprio: Add support for hardware offloading This allows taprio to offload the schedule enforcement to capable network cards, resulting in more precise windows and less CPU usage. The gate mask acts on traffic classes (groups of queues of the same priority), as specified in IEEE 802.1Q-2018, and following the existing taprio and mqprio semantics. It is up to the driver to perform conversion between tc and individual netdev queues if for some reason it needs to make that distinction. Full offload is requested from the network interface by specifying "flags 2" in the tc qdisc creation command, which in turn corresponds to the TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD bit. The important detail here is the clockid, which is implicitly /dev/ptpN for full offload, and hence not configurable. A reference counting API is added to support the use case where Ethernet drivers need to keep the taprio offload structure locally (i.e. they are a multi-port switch driver, and configuring a port depends on the settings of other ports as well). The refcount_t variable is kept in a private structure (__tc_taprio_qopt_offload) and not exposed to drivers. In the future, the private structure might also be expanded with a backpointer to taprio_sched *q, to implement the notification system described in the patch (of when admin became oper, or an error occurred, etc., so the offload can be monitored with 'tc qdisc show'). Signed-off-by: Vinicius Costa Gomes Signed-off-by: Voon Weifeng Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller
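To make the driver-side contract concrete, a minimal sketch of a hypothetical NIC driver wiring this up; foo_priv, foo_stop_schedule() and foo_apply_schedule() are invented helpers, while TC_SETUP_QDISC_TAPRIO, struct tc_taprio_qopt_offload and taprio_offload_get()/taprio_offload_free() are the API added below:

/* Hypothetical driver glue; only the taprio case is shown. */
static int foo_setup_taprio(struct net_device *dev,
			    struct tc_taprio_qopt_offload *offload)
{
	struct foo_priv *priv = netdev_priv(dev);
	size_t i;

	if (!offload->enable) {
		foo_stop_schedule(priv);
		if (priv->taprio)
			taprio_offload_free(priv->taprio);
		priv->taprio = NULL;
		return 0;
	}

	for (i = 0; i < offload->num_entries; i++) {
		/* gate_mask refers to traffic classes, not netdev queues;
		 * translate here if the hardware thinks in queues. */
		if (offload->entries[i].gate_mask >= BIT(dev->num_tc))
			return -EINVAL;
	}

	/* Hold a reference if the schedule must outlive this callback,
	 * e.g. to reconcile it later with other ports of a switch. */
	priv->taprio = taprio_offload_get(offload);

	return foo_apply_schedule(priv, offload->base_time,
				  offload->cycle_time);
}

static int foo_setup_tc(struct net_device *dev, enum tc_setup_type type,
			void *type_data)
{
	switch (type) {
	case TC_SETUP_QDISC_TAPRIO:
		return foo_setup_taprio(dev, type_data);
	default:
		return -EOPNOTSUPP;
	}
}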
--- include/linux/netdevice.h | 1 + include/net/pkt_sched.h | 23 +++ include/uapi/linux/pkt_sched.h | 3 +- net/sched/sch_taprio.c | 409 ++++++++++++++++++++++++++++++++++++----- 4 files changed, 392 insertions(+), 44 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d7d5626002e9..9eda1c31d1f7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -847,6 +847,7 @@ enum tc_setup_type { TC_SETUP_QDISC_ETF, TC_SETUP_ROOT_QDISC, TC_SETUP_QDISC_GRED, + TC_SETUP_QDISC_TAPRIO, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index a16fbe9a2a67..d1632979622e 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -161,4 +161,27 @@ struct tc_etf_qopt_offload { s32 queue; }; +struct tc_taprio_sched_entry { + u8 command; /* TC_TAPRIO_CMD_* */ + + /* The gate_mask in the offloading side refers to traffic classes */ + u32 gate_mask; + u32 interval; +}; + +struct tc_taprio_qopt_offload { + u8 enable; + ktime_t base_time; + u64 cycle_time; + u64 cycle_time_extension; + + size_t num_entries; + struct tc_taprio_sched_entry entries[0]; +}; + +/* Reference counting */ +struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload + *offload); +void taprio_offload_free(struct tc_taprio_qopt_offload *offload); + #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 18f185299f47..5011259b8f67 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1160,7 +1160,8 @@ enum { * [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL] */ -#define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST 0x1 +#define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST BIT(0) +#define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD BIT(1) enum { TCA_TAPRIO_ATTR_UNSPEC, diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 84b863e2bdbd..2f7b34205c82 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -29,8 +29,8 @@ static DEFINE_SPINLOCK(taprio_list_lock); #define TAPRIO_ALL_GATES_OPEN -1 -#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)) #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) +#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) struct sched_entry { struct list_head list; @@ -75,9 +75,16 @@ struct taprio_sched { struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; + struct sk_buff *(*dequeue)(struct Qdisc *sch); + struct sk_buff *(*peek)(struct Qdisc *sch); u32 txtime_delay; }; +struct __tc_taprio_qopt_offload { + refcount_t users; + struct tc_taprio_qopt_offload offload; +}; + static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) @@ -268,6 +275,19 @@ static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) return entry; } +static bool taprio_flags_valid(u32 flags) +{ + /* Make sure no other flag bits are set. */ + if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | + TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) + return false; + /* txtime-assist and full offload are mutually exclusive */ + if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && + (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) + return false; + return true; +} + /* This returns the tstamp value set by TCP in terms of the set clock.
*/ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { @@ -417,7 +437,7 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, return qdisc_enqueue(skb, child, to_free); } -static struct sk_buff *taprio_peek(struct Qdisc *sch) +static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); @@ -461,6 +481,36 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) return NULL; } +static struct sk_buff *taprio_peek_offload(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sk_buff *skb; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct Qdisc *child = q->qdiscs[i]; + + if (unlikely(!child)) + continue; + + skb = child->ops->peek(child); + if (!skb) + continue; + + return skb; + } + + return NULL; +} + +static struct sk_buff *taprio_peek(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + + return q->peek(sch); +} + static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) { atomic_set(&entry->budget, @@ -468,7 +518,7 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) atomic64_read(&q->picos_per_byte))); } -static struct sk_buff *taprio_dequeue(struct Qdisc *sch) +static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); @@ -550,6 +600,40 @@ done: return skb; } +static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sk_buff *skb; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct Qdisc *child = q->qdiscs[i]; + + if (unlikely(!child)) + continue; + + skb = child->ops->dequeue(child); + if (unlikely(!skb)) + continue; + + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + + return skb; + } + + return NULL; +} + +static struct sk_buff *taprio_dequeue(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + + return q->dequeue(sch); +} + static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { @@ -932,6 +1016,9 @@ static void taprio_start_sched(struct Qdisc *sch, struct taprio_sched *q = qdisc_priv(sch); ktime_t expires; + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) + return; + expires = hrtimer_get_expires(&q->advance_timer); if (expires == 0) expires = KTIME_MAX; @@ -1011,6 +1098,254 @@ static void setup_txtime(struct taprio_sched *q, } } +static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) +{ + size_t size = sizeof(struct tc_taprio_sched_entry) * num_entries + + sizeof(struct __tc_taprio_qopt_offload); + struct __tc_taprio_qopt_offload *__offload; + + __offload = kzalloc(size, GFP_KERNEL); + if (!__offload) + return NULL; + + refcount_set(&__offload->users, 1); + + return &__offload->offload; +} + +struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload + *offload) +{ + struct __tc_taprio_qopt_offload *__offload; + + __offload = container_of(offload, struct __tc_taprio_qopt_offload, + offload); + + refcount_inc(&__offload->users); + + return offload; +} +EXPORT_SYMBOL_GPL(taprio_offload_get); + +void taprio_offload_free(struct tc_taprio_qopt_offload *offload) +{ + struct __tc_taprio_qopt_offload *__offload; + + __offload = container_of(offload, struct __tc_taprio_qopt_offload, + 
offload); + + if (!refcount_dec_and_test(&__offload->users)) + return; + + kfree(__offload); +} +EXPORT_SYMBOL_GPL(taprio_offload_free); + +/* The function will only serve to keep the pointers to the "oper" and "admin" + * schedules valid in relation to their base times, so when calling dump() the + * user looks at the right schedules. + * When using full offload, the admin configuration is promoted to oper at the + * base_time in the PHC time domain. But because the system time is not + * necessarily in sync with that, we can't just trigger a hrtimer to call + * switch_schedules at the right hardware time. + * At the moment we call this by hand right away from taprio, but in the future + * it will be useful to create a mechanism for drivers to notify taprio of the + * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). + * This is left as TODO. + */ +void taprio_offload_config_changed(struct taprio_sched *q) +{ + struct sched_gate_list *oper, *admin; + + spin_lock(&q->current_entry_lock); + + oper = rcu_dereference_protected(q->oper_sched, + lockdep_is_held(&q->current_entry_lock)); + admin = rcu_dereference_protected(q->admin_sched, + lockdep_is_held(&q->current_entry_lock)); + + switch_schedules(q, &admin, &oper); + + spin_unlock(&q->current_entry_lock); +} + +static void taprio_sched_to_offload(struct taprio_sched *q, + struct sched_gate_list *sched, + const struct tc_mqprio_qopt *mqprio, + struct tc_taprio_qopt_offload *offload) +{ + struct sched_entry *entry; + int i = 0; + + offload->base_time = sched->base_time; + offload->cycle_time = sched->cycle_time; + offload->cycle_time_extension = sched->cycle_time_extension; + + list_for_each_entry(entry, &sched->entries, list) { + struct tc_taprio_sched_entry *e = &offload->entries[i]; + + e->command = entry->command; + e->interval = entry->interval; + e->gate_mask = entry->gate_mask; + i++; + } + + offload->num_entries = i; +} + +static int taprio_enable_offload(struct net_device *dev, + struct tc_mqprio_qopt *mqprio, + struct taprio_sched *q, + struct sched_gate_list *sched, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_taprio_qopt_offload *offload; + int err = 0; + + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, + "Device does not support taprio offload"); + return -EOPNOTSUPP; + } + + offload = taprio_offload_alloc(sched->num_entries); + if (!offload) { + NL_SET_ERR_MSG(extack, + "Not enough memory for enabling offload mode"); + return -ENOMEM; + } + offload->enable = 1; + taprio_sched_to_offload(q, sched, mqprio, offload); + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Device failed to setup taprio offload"); + goto done; + } + + taprio_offload_config_changed(q); + +done: + taprio_offload_free(offload); + + return err; +} + +static int taprio_disable_offload(struct net_device *dev, + struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_taprio_qopt_offload *offload; + int err; + + if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) + return 0; + + if (!ops->ndo_setup_tc) + return -EOPNOTSUPP; + + offload = taprio_offload_alloc(0); + if (!offload) { + NL_SET_ERR_MSG(extack, + "Not enough memory to disable offload mode"); + return -ENOMEM; + } + offload->enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Device failed to disable offload"); + goto out; + } + 
+out: + taprio_offload_free(offload); + + return err; +} + +/* If full offload is enabled, the only possible clockid is the net device's + * PHC. For that reason, specifying a clockid through netlink is incorrect. + * For txtime-assist, it is implicitly assumed that the device's PHC is kept + * in sync with the specified clockid via a user space daemon such as phc2sys. + * For both software taprio and txtime-assist, the clockid is used for the + * hrtimer that advances the schedule and hence mandatory. + */ +static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int err = -EINVAL; + + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_ts_info info = { + .cmd = ETHTOOL_GET_TS_INFO, + .phc_index = -1, + }; + + if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + NL_SET_ERR_MSG(extack, + "The 'clockid' cannot be specified for full offload"); + goto out; + } + + if (ops && ops->get_ts_info) + err = ops->get_ts_info(dev, &info); + + if (err || info.phc_index < 0) { + NL_SET_ERR_MSG(extack, + "Device does not have a PTP clock"); + err = -ENOTSUPP; + goto out; + } + } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + + /* We only support static clockids and we don't allow + * for it to be modified after the first init. + */ + if (clockid < 0 || + (q->clockid != -1 && q->clockid != clockid)) { + NL_SET_ERR_MSG(extack, + "Changing the 'clockid' of a running schedule is not supported"); + err = -ENOTSUPP; + goto out; + } + + switch (clockid) { + case CLOCK_REALTIME: + q->tk_offset = TK_OFFS_REAL; + break; + case CLOCK_MONOTONIC: + q->tk_offset = TK_OFFS_MAX; + break; + case CLOCK_BOOTTIME: + q->tk_offset = TK_OFFS_BOOT; + break; + case CLOCK_TAI: + q->tk_offset = TK_OFFS_TAI; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + err = -EINVAL; + goto out; + } + + q->clockid = clockid; + } else { + NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); + goto out; + } +out: + return err; +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -1020,9 +1355,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; u32 taprio_flags = 0; - int i, err, clockid; unsigned long flags; ktime_t start; + int i, err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, taprio_policy, extack); @@ -1038,7 +1373,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (q->flags != 0 && q->flags != taprio_flags) { NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); return -EOPNOTSUPP; - } else if (!FLAGS_VALID(taprio_flags)) { + } else if (!taprio_flags_valid(taprio_flags)) { NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); return -EINVAL; } @@ -1078,30 +1413,19 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto free_sched; } - if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { - clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); - - /* We only support static clockids and we don't allow - * for it to be modified after the first init. 
- */ - if (clockid < 0 || - (q->clockid != -1 && q->clockid != clockid)) { - NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); - err = -ENOTSUPP; - goto free_sched; - } - - q->clockid = clockid; - } - - if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { - NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); - err = -EINVAL; + err = taprio_parse_clockid(sch, tb, extack); + if (err < 0) goto free_sched; - } taprio_set_picos_per_byte(dev, q); + if (FULL_OFFLOAD_IS_ENABLED(taprio_flags)) + err = taprio_enable_offload(dev, mqprio, q, new_admin, extack); + else + err = taprio_disable_offload(dev, q, extack); + if (err) + goto free_sched; + /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); @@ -1116,6 +1440,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, } if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) && + !FULL_OFFLOAD_IS_ENABLED(taprio_flags) && !hrtimer_active(&q->advance_timer)) { hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; @@ -1134,23 +1459,15 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, mqprio->prio_tc_map[i]); } - switch (q->clockid) { - case CLOCK_REALTIME: - q->tk_offset = TK_OFFS_REAL; - break; - case CLOCK_MONOTONIC: - q->tk_offset = TK_OFFS_MAX; - break; - case CLOCK_BOOTTIME: - q->tk_offset = TK_OFFS_BOOT; - break; - case CLOCK_TAI: - q->tk_offset = TK_OFFS_TAI; - break; - default: - NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); - err = -EINVAL; - goto unlock; + if (FULL_OFFLOAD_IS_ENABLED(taprio_flags)) { + q->dequeue = taprio_dequeue_offload; + q->peek = taprio_peek_offload; + } else { + /* Be sure to always keep the function pointers + * in a consistent state. + */ + q->dequeue = taprio_dequeue_soft; + q->peek = taprio_peek_soft; } err = taprio_get_start_time(sch, new_admin, &start); @@ -1212,6 +1529,8 @@ static void taprio_destroy(struct Qdisc *sch) hrtimer_cancel(&q->advance_timer); + taprio_disable_offload(dev, q, NULL); + if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) qdisc_put(q->qdiscs[i]); @@ -1241,6 +1560,9 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; + q->dequeue = taprio_dequeue_soft; + q->peek = taprio_peek_soft; + q->root = sch; /* We only support static clockids. Use an invalid value as default @@ -1423,7 +1745,8 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; - if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) + if (!FULL_OFFLOAD_IS_ENABLED(q->flags) && + nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) -- cgit v1.2.3-59-g8ed1b From 9f2f13f4ffb13ee914105dfc9f860d68cae98108 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 16 Sep 2019 10:35:25 +0300 Subject: ethtool: implement Energy Detect Powerdown support via phy-tunable The `phy_tunable_id` has been named `ETHTOOL_PHY_EDPD` since it looks like this feature is common across other PHYs (like EEE), and defining `ETHTOOL_PHY_ENERGY_DETECT_POWER_DOWN` seems too long. The way EDPD works is that the RX block is put into a lower power mode, except for link-pulse detection circuits.
The TX block is also put into low-power mode, but the PHY wakes up periodically to send link pulses, to avoid lock-ups in case the other side is also in EDPD mode. Currently, there are two PHY drivers that look like they could use this new PHY tunable feature: the `adin` and `micrel` PHYs. The ADIN PHY's datasheet mentions that TX pulses are sent at intervals of 1 second by default, and that they can be disabled. For the Micrel KSZ9031 PHY, the datasheet does not mention whether they can be disabled, but mentions that they can be modified. The way this change is structured is similar to the PHY tunable downshift control: * an `ETHTOOL_PHY_EDPD_DFLT_TX_MSECS` value is exposed to cover a default TX interval; some PHYs could specify a certain value that makes sense * `ETHTOOL_PHY_EDPD_NO_TX` would disable TX when EDPD is enabled * `ETHTOOL_PHY_EDPD_DISABLE` will disable EDPD As the name `ETHTOOL_PHY_EDPD_DFLT_TX_MSECS` suggests, the interval unit is 1 millisecond, which should cover a reasonable range of intervals: - from 1 millisecond, which does not sound like much of a power-saver - to ~65 seconds, which is quite a lot to wait for a link to come up when plugging a cable Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Alexandru Ardelean Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 22 ++++++++++++++++++++++ net/core/ethtool.c | 6 ++++++ 2 files changed, 28 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index dd06302aa93e..8938b76c4ee3 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -259,10 +259,32 @@ struct ethtool_tunable { #define ETHTOOL_PHY_FAST_LINK_DOWN_ON 0 #define ETHTOOL_PHY_FAST_LINK_DOWN_OFF 0xff +/* Energy Detect Power Down (EDPD) is a feature supported by some PHYs, where + * the PHY's RX & TX blocks are put into a low-power mode when there is no + * link detected (typically cable is un-plugged). For RX, only a minimal + * link-detection is available, and for TX the PHY wakes up to send link pulses + * to avoid any lock-ups in case the peer PHY may also be running in EDPD mode. + * + * Some PHYs may support configuration of the wake-up interval for TX pulses, + * and some PHYs may support only disabling TX pulses entirely. For the latter + * a special value is required (ETHTOOL_PHY_EDPD_NO_TX) so that this can be + * configured from userspace (should the user want it).
+ * + * The interval units for TX wake-up are in milliseconds, since this should + * cover a reasonable range of intervals: + * - from 1 millisecond, which does not sound like much of a power-saver + * - to ~65 seconds which is quite a lot to wait for a link to come up when + * plugging a cable + */ +#define ETHTOOL_PHY_EDPD_DFLT_TX_MSECS 0xffff +#define ETHTOOL_PHY_EDPD_NO_TX 0xfffe +#define ETHTOOL_PHY_EDPD_DISABLE 0 + enum phy_tunable_id { ETHTOOL_PHY_ID_UNSPEC, ETHTOOL_PHY_DOWNSHIFT, ETHTOOL_PHY_FAST_LINK_DOWN, + ETHTOOL_PHY_EDPD, /* * Add your fresh new phy tunable attribute above and remember to update * phy_tunable_strings[] in net/core/ethtool.c diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6288e69e94fc..c763106c73fc 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -133,6 +133,7 @@ phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_ID_UNSPEC] = "Unspec", [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", + [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down", }; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) @@ -2451,6 +2452,11 @@ static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) tuna->type_id != ETHTOOL_TUNABLE_U8) return -EINVAL; break; + case ETHTOOL_PHY_EDPD: + if (tuna->len != sizeof(u16) || + tuna->type_id != ETHTOOL_TUNABLE_U16) + return -EINVAL; + break; default: return -EINVAL; } -- cgit v1.2.3-59-g8ed1b
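To round this off, a userspace sketch of setting the new tunable through the existing SIOCETHTOOL/ETHTOOL_PHY_STUNABLE path; the interface name "eth0" and the choice of ETHTOOL_PHY_EDPD_NO_TX (EDPD on, wake-up TX pulses off) are assumptions for illustration, and the PHY driver must implement the tunable for the call to succeed:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct {
		struct ethtool_tunable tuna;	/* header, payload follows */
		__u16 val;
	} req = {
		.tuna = {
			.cmd	 = ETHTOOL_PHY_STUNABLE,
			.id	 = ETHTOOL_PHY_EDPD,
			.type_id = ETHTOOL_TUNABLE_U16,
			.len	 = sizeof(__u16),
		},
		.val = ETHTOOL_PHY_EDPD_NO_TX,	/* or an interval in msecs */
	};
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	ifr.ifr_data = (char *)&req;
	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
		perror("ETHTOOL_PHY_STUNABLE(phy-energy-detect-power-down)");
		return 1;
	}
	return 0;
}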