From 40e44a1e669d078946f46853808a60d29e6f0885 Mon Sep 17 00:00:00 2001 From: Scott Branden Date: Thu, 30 Nov 2017 11:35:59 -0800 Subject: net: ethtool: add support for reset of AP inside NIC interface. Add ETH_RESET_AP to reset the application processor(s) inside the NIC interface. Current ETH_RESET_MGMT supports a management processor inside this NIC. This is typically used for remote NIC management purposes. Application processors exist inside some SmartNICs to run various applications inside the NIC processor - be it a simple algorithm without an OS to as complex as hosting multiple VMs. Signed-off-by: Scott Branden Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index ac71559314e7..44a0b675a6bc 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1686,6 +1686,7 @@ enum ethtool_reset_flags { ETH_RESET_PHY = 1 << 6, /* Transceiver/PHY */ ETH_RESET_RAM = 1 << 7, /* RAM shared between * multiple components */ + ETH_RESET_AP = 1 << 8, /* Application processor */ ETH_RESET_DEDICATED = 0x0000ffff, /* All components dedicated to * this interface */ -- cgit v1.2.3-59-g8ed1b From f19397a5c65665d66e3866b42056f1f58b7a366b Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 1 Dec 2017 10:15:04 -0800 Subject: bpf: Add access to snd_cwnd and others in sock_ops Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these fields are only valid if the socket associated with the sock_ops program call is a full socket, the field is_fullsock is also added to the bpf_sock_ops struct. If the socket is not a full socket, reading these fields returns 0. Note that in most cases it will not be necessary to check is_fullsock to know if there is a full socket. The context of the call, as specified by the 'op' field, can sometimes determine whether there is a full socket. The struct bpf_sock_ops has the following fields added: __u32 is_fullsock; /* Some TCP fields are only valid if * there is a full socket. If not, the * fields read as zero. */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add read access to more 32 bit tcp_sock fields. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/net/tcp.h | 6 ++++-- include/uapi/linux/bpf.h | 6 ++++++ net/core/filter.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 80b5b482cb46..0062302e1285 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -985,6 +985,7 @@ struct bpf_sock_ops_kern { u32 reply; u32 replylong[4]; }; + u32 is_fullsock; }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 4e09398009c1..89a656077884 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2012,10 +2012,12 @@ static inline int tcp_call_bpf(struct sock *sk, int op) struct bpf_sock_ops_kern sock_ops; int ret; - if (sk_fullsock(sk)) + memset(&sock_ops, 0, sizeof(sock_ops)); + if (sk_fullsock(sk)) { + sock_ops.is_fullsock = 1; sock_owned_by_me(sk); + } - memset(&sock_ops, 0, sizeof(sock_ops)); sock_ops.sk = sk; sock_ops.op = op; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4c223ab30293..80d62e88590c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -941,6 +941,12 @@ struct bpf_sock_ops { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + __u32 is_fullsock; /* Some TCP fields are only valid if + * there is a full socket. If not, the + * fields read as zero. + */ + __u32 snd_cwnd; + __u32 srtt_us; /* Averaged RTT << 3 in usecs */ }; /* List of known BPF sock_ops operators. diff --git a/net/core/filter.c b/net/core/filter.c index 4d644ad17457..754abe1041b7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4437,6 +4437,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct bpf_sock_ops, is_fullsock): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + break; + +/* Helper macro for adding read access to tcp_sock fields. */ +#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ + offsetof(struct tcp_sock, FIELD_NAME)); \ + } while (0) + + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_TCP32(snd_cwnd); + break; + + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_TCP32(srtt_us); + break; } return insn - insn_buf; } -- cgit v1.2.3-59-g8ed1b From 96f84061620c6325a2ca9a9a05b410e6461d03c3 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Dec 2017 17:31:23 +0800 Subject: tun: add eBPF based queue selection method This patch introduces an eBPF based queue selection method. With this, the policy could be offloaded to userspace completely through a new ioctl TUNSETSTEERINGEBPF. Signed-off-by: Jason Wang Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/tun.c | 145 +++++++++++++++++++++++++++++++++++++------- include/uapi/linux/if_tun.h | 1 + 2 files changed, 123 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 80568f81a7c8..787cc35ef89b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -195,6 +195,11 @@ struct tun_flow_entry { #define TUN_NUM_FLOW_ENTRIES 1024 +struct tun_steering_prog { + struct rcu_head rcu; + struct bpf_prog *prog; +}; + /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. @@ -232,6 +237,7 @@ struct tun_struct { u32 rx_batched; struct tun_pcpu_stats __percpu *pcpu_stats; struct bpf_prog __rcu *xdp_prog; + struct tun_steering_prog __rcu *steering_prog; }; static int tun_napi_receive(struct napi_struct *napi, int budget) @@ -537,15 +543,12 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) * different rxq no. here. If we could not get rxhash, then we would * hope the rxq no. may help here. */ -static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) +static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { - struct tun_struct *tun = netdev_priv(dev); struct tun_flow_entry *e; u32 txq = 0; u32 numqueues = 0; - rcu_read_lock(); numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); @@ -563,10 +566,37 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, txq -= numqueues; } - rcu_read_unlock(); return txq; } +static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) +{ + struct tun_steering_prog *prog; + u16 ret = 0; + + prog = rcu_dereference(tun->steering_prog); + if (prog) + ret = bpf_prog_run_clear_cb(prog->prog, skb); + + return ret % tun->numqueues; +} + +static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + struct tun_struct *tun = netdev_priv(dev); + u16 ret; + + rcu_read_lock(); + if (rcu_dereference(tun->steering_prog)) + ret = tun_ebpf_select_queue(tun, skb); + else + ret = tun_automq_select_queue(tun, skb); + rcu_read_unlock(); + + return ret; +} + static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); @@ -933,23 +963,10 @@ static int tun_net_close(struct net_device *dev) } /* Net device start xmit */ -static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) +static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { - struct tun_struct *tun = netdev_priv(dev); - int txq = skb->queue_mapping; - struct tun_file *tfile; - u32 numqueues = 0; - - rcu_read_lock(); - tfile = rcu_dereference(tun->tfiles[txq]); - numqueues = READ_ONCE(tun->numqueues); - - /* Drop packet if interface is not attached */ - if (txq >= numqueues) - goto drop; - #ifdef CONFIG_RPS - if (numqueues == 1 && static_key_false(&rps_needed)) { + if (tun->numqueues == 1 && static_key_false(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ @@ -965,6 +982,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) } } #endif +} + +/* Net device start xmit */ +static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + int txq = skb->queue_mapping; + struct tun_file *tfile; + u32 numqueues = 0; + + rcu_read_lock(); + tfile = rcu_dereference(tun->tfiles[txq]); + numqueues = READ_ONCE(tun->numqueues); + + /* Drop packet if interface is not attached */ + if (txq >= numqueues) + goto drop; + + if (!rcu_dereference(tun->steering_prog)) + tun_automq_xmit(tun, skb); tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len); @@ -1547,7 +1584,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, int copylen; bool zerocopy = false; int err; - u32 rxhash; + u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tun); @@ -1735,7 +1772,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, rcu_read_unlock(); } - rxhash = __skb_get_hash_symmetric(skb); + rcu_read_lock(); + if (!rcu_dereference(tun->steering_prog)) + rxhash = __skb_get_hash_symmetric(skb); + rcu_read_unlock(); if (frags) { /* Exercise flow dissector code path. */ @@ -1779,7 +1819,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, u64_stats_update_end(&stats->syncp); put_cpu_ptr(stats); - tun_flow_update(tun, rxhash, tfile); + if (rxhash) + tun_flow_update(tun, rxhash, tfile); + return total_len; } @@ -1987,6 +2029,36 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } +static void tun_steering_prog_free(struct rcu_head *rcu) +{ + struct tun_steering_prog *prog = container_of(rcu, + struct tun_steering_prog, rcu); + + bpf_prog_destroy(prog->prog); + kfree(prog); +} + +static int __tun_set_steering_ebpf(struct tun_struct *tun, + struct bpf_prog *prog) +{ + struct tun_steering_prog *old, *new = NULL; + + if (prog) { + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + new->prog = prog; + } + + old = rtnl_dereference(tun->steering_prog); + rcu_assign_pointer(tun->steering_prog, new); + + if (old) + call_rcu(&old->rcu, tun_steering_prog_free); + + return 0; +} + static void tun_free_netdev(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@ -1995,6 +2067,9 @@ static void tun_free_netdev(struct net_device *dev) free_percpu(tun->pcpu_stats); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); + rtnl_lock(); + __tun_set_steering_ebpf(tun, NULL); + rtnl_unlock(); } static void tun_setup(struct net_device *dev) @@ -2283,6 +2358,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->rx_batched = 0; + RCU_INIT_POINTER(tun->steering_prog, NULL); tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); if (!tun->pcpu_stats) { @@ -2475,6 +2551,25 @@ unlock: return ret; } +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data) +{ + struct bpf_prog *prog; + int fd; + + if (copy_from_user(&fd, data, sizeof(fd))) + return -EFAULT; + + if (fd == -1) { + prog = NULL; + } else { + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + return __tun_set_steering_ebpf(tun, prog); +} + static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { @@ -2751,6 +2846,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = 0; break; + case TUNSETSTEERINGEBPF: + ret = tun_set_steering_ebpf(tun, argp); + break; + default: ret = -EINVAL; break; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 030d3e6d6029..fb38c1797131 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -57,6 +57,7 @@ */ #define TUNSETVNETBE _IOW('T', 222, int) #define TUNGETVNETBE _IOR('T', 223, int) +#define TUNSETSTEERINGEBPF _IOR('T', 224, int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3-59-g8ed1b From 772a58693fc3116d05b7969223a80a6376e639eb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:03:58 +0800 Subject: sctp: add stream interleave enable members and sockopt This patch adds intl_enable in asoc and netns, and strm_interleave in sctp_sock to indicate if stream interleave is enabled and supported. netns intl_enable would be set via procfs, but that is not added yet until all stream interleave codes are completely implemented; asoc intl_enable will be set when doing 4-shakehands. sp strm_interleave can be set by sockopt SCTP_INTERLEAVING_SUPPORTED which is also added in this patch. This socket option is defined in section 4.3.1 of RFC8260. Note that strm_interleave can only be set by sockopt when both netns intl_enable and sp frag_interleave are set. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/netns/sctp.h | 5 ++- include/net/sctp/structs.h | 2 ++ include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 88 +++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 94 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index ebc813277662..0db7fb3e4e15 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -122,9 +122,12 @@ struct netns_sctp { /* Flag to indicate if PR-CONFIG is enabled. */ int reconf_enable; - /* Flag to idicate if SCTP-AUTH is enabled */ + /* Flag to indicate if SCTP-AUTH is enabled */ int auth_enable; + /* Flag to indicate if stream interleave is enabled */ + int intl_enable; + /* * Policy to control SCTP IPv4 address scoping * 0 - Disable IPv4 address scoping diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 2f8f93da5dc2..7030cbe11f45 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -217,6 +217,7 @@ struct sctp_sock { disable_fragments:1, v4mapped:1, frag_interleave:1, + strm_interleave:1, recvrcvinfo:1, recvnxtinfo:1, data_ready_signalled:1; @@ -1940,6 +1941,7 @@ struct sctp_association { __u8 need_ecne:1, /* Need to send an ECNE Chunk? */ temp:1, /* Is it a temporary association? */ force_delay:1, + intl_enable:1, prsctp_enable:1, reconf_enable:1; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index d9adab32dbee..6ed934c65a5f 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -125,6 +125,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_SOCKOPT_PEELOFF_FLAGS 122 #define SCTP_STREAM_SCHEDULER 123 #define SCTP_STREAM_SCHEDULER_VALUE 124 +#define SCTP_INTERLEAVING_SUPPORTED 125 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 3e55daa37e66..306c737bde87 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3350,7 +3350,10 @@ static int sctp_setsockopt_fragment_interleave(struct sock *sk, if (get_user(val, (int __user *)optval)) return -EFAULT; - sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1; + sctp_sk(sk)->frag_interleave = !!val; + + if (!sctp_sk(sk)->frag_interleave) + sctp_sk(sk)->strm_interleave = 0; return 0; } @@ -4019,6 +4022,40 @@ out: return retval; } +static int sctp_setsockopt_interleaving_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct net *net = sock_net(sk); + struct sctp_assoc_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_id) + goto out; + + if (!net->sctp.intl_enable || !sp->frag_interleave) { + retval = -EPERM; + goto out; + } + + sp->strm_interleave = !!params.assoc_value; + + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4206,6 +4243,10 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_STREAM_SCHEDULER_VALUE: retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); break; + case SCTP_INTERLEAVING_SUPPORTED: + retval = sctp_setsockopt_interleaving_supported(sk, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6969,6 +7010,47 @@ out: return retval; } +static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->intl_enable; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->strm_interleave; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7159,6 +7241,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_scheduler_value(sk, len, optval, optlen); break; + case SCTP_INTERLEAVING_SUPPORTED: + retval = sctp_getsockopt_interleaving_supported(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3-59-g8ed1b From 65f5e357839e40817aead853d7a7f61ff828b52b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:08 +0800 Subject: sctp: implement abort_pd for sctp_stream_interleave abort_pd is added as a member of sctp_stream_interleave, used to abort partial delivery for data or idata, called in sctp_cmd_assoc_failed. Since stream interleave allows to do partial delivery for each stream at the same time, sctp_intl_abort_pd for idata would be very different from the old function sctp_ulpq_abort_pd for data. Note that sctp_ulpevent_make_pdapi will support per stream in this patch by adding pdapi_stream and pdapi_seq in sctp_pdapi_event, as described in section 6.1.7 of RFC6458. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/stream_interleave.h | 1 + include/net/sctp/ulpevent.h | 3 +- include/uapi/linux/sctp.h | 2 + net/sctp/sm_sideeffect.c | 2 +- net/sctp/stream_interleave.c | 99 ++++++++++++++++++++++++++++++++++++ net/sctp/ulpevent.c | 9 ++-- net/sctp/ulpqueue.c | 2 +- 7 files changed, 112 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index 317d9b3a5299..501b2be049a3 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -46,6 +46,7 @@ struct sctp_stream_interleave { void (*renege_events)(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp); void (*start_pd)(struct sctp_ulpq *ulpq, gfp_t gfp); + void (*abort_pd)(struct sctp_ulpq *ulpq, gfp_t gfp); }; void sctp_stream_interleave_init(struct sctp_stream *stream); diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index ce4f2aa35d56..51b4e0626c34 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -122,7 +122,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( struct sctp_ulpevent *sctp_ulpevent_make_pdapi( const struct sctp_association *asoc, - __u32 indication, gfp_t gfp); + __u32 indication, __u32 sid, __u32 seq, + __u32 flags, gfp_t gfp); struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication( const struct sctp_association *asoc, gfp_t gfp); diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6ed934c65a5f..4c4db14786bd 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -460,6 +460,8 @@ struct sctp_pdapi_event { __u32 pdapi_length; __u32 pdapi_indication; sctp_assoc_t pdapi_assoc_id; + __u32 pdapi_stream; + __u32 pdapi_seq; }; enum { SCTP_PARTIAL_DELIVERY_ABORTED=0, }; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 36710549a4ca..8adde71fdb31 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -632,7 +632,7 @@ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, struct sctp_chunk *abort; /* Cancel any partial delivery in progress. */ - sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC); if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT) event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 4dce8d33c5ab..d15645ea338b 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -652,6 +652,103 @@ static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, sk_mem_reclaim(asoc->base.sk); } +static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, + __u32 mid, __u16 flags, gfp_t gfp) +{ + struct sock *sk = ulpq->asoc->base.sk; + struct sctp_ulpevent *ev = NULL; + + if (!sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, + &sctp_sk(sk)->subscribe)) + return; + + ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, + sid, mid, flags, gfp); + if (ev) { + __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); + + if (!sctp_sk(sk)->data_ready_signalled) { + sctp_sk(sk)->data_ready_signalled = 1; + sk->sk_data_ready(sk); + } + } +} + +static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) +{ + struct sctp_stream *stream = &ulpq->asoc->stream; + struct sctp_ulpevent *cevent, *event = NULL; + struct sk_buff_head *lobby = &ulpq->lobby; + struct sk_buff *pos, *tmp; + struct sk_buff_head temp; + __u16 csid; + __u32 cmid; + + skb_queue_head_init(&temp); + sctp_skb_for_each(pos, lobby, tmp) { + cevent = (struct sctp_ulpevent *)pos->cb; + csid = cevent->stream; + cmid = cevent->mid; + + if (csid > sid) + break; + + if (csid < sid) + continue; + + if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid))) + break; + + __skb_unlink(pos, lobby); + if (!event) + event = sctp_skb2event(pos); + + __skb_queue_tail(&temp, pos); + } + + if (!event && pos != (struct sk_buff *)lobby) { + cevent = (struct sctp_ulpevent *)pos->cb; + csid = cevent->stream; + cmid = cevent->mid; + + if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) { + sctp_mid_next(stream, in, csid); + __skb_unlink(pos, lobby); + __skb_queue_tail(&temp, pos); + event = sctp_skb2event(pos); + } + } + + if (event) { + sctp_intl_retrieve_ordered(ulpq, event); + sctp_enqueue_event(ulpq, event); + } +} + +static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) +{ + struct sctp_stream *stream = &ulpq->asoc->stream; + __u16 sid; + + for (sid = 0; sid < stream->incnt; sid++) { + struct sctp_stream_in *sin = &stream->in[sid]; + __u32 mid; + + if (sin->pd_mode) { + sin->pd_mode = 0; + + mid = sin->mid; + sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp); + sctp_mid_skip(stream, in, sid, mid); + + sctp_intl_reap_ordered(ulpq, sid); + } + } + + /* intl abort pd happens only when all data needs to be cleaned */ + sctp_ulpq_flush(ulpq); +} + static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), /* DATA process functions */ @@ -662,6 +759,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = { .enqueue_event = sctp_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, + .abort_pd = sctp_ulpq_abort_pd, }; static struct sctp_stream_interleave sctp_stream_interleave_1 = { @@ -674,6 +772,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = { .enqueue_event = sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, + .abort_pd = sctp_intl_abort_pd, }; void sctp_stream_interleave_init(struct sctp_stream *stream) diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index d3218f3e9cf7..84207ad33e8e 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -730,8 +730,9 @@ fail: * various events. */ struct sctp_ulpevent *sctp_ulpevent_make_pdapi( - const struct sctp_association *asoc, __u32 indication, - gfp_t gfp) + const struct sctp_association *asoc, + __u32 indication, __u32 sid, __u32 seq, + __u32 flags, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_pdapi_event *pd; @@ -752,7 +753,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_pdapi( * Currently unused. */ pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT; - pd->pdapi_flags = 0; + pd->pdapi_flags = flags; + pd->pdapi_stream = sid; + pd->pdapi_seq = seq; /* pdapi_length: 32 bits (unsigned integer) * diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 76ec5149a093..dd53daab4a25 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -1144,7 +1144,7 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) &sctp_sk(sk)->subscribe)) ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, - gfp); + 0, 0, 0, gfp); if (ev) __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); -- cgit v1.2.3-59-g8ed1b From f371b304f12e31fe30207c41ca7754564e0ea4dc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 11 Dec 2017 11:39:02 -0800 Subject: bpf/tracing: allow user space to query prog array on the same tp Commit e87c6bc3852b ("bpf: permit multiple bpf attachments for a single perf event") added support to attach multiple bpf programs to a single perf event. Although this provides flexibility, users may want to know what other bpf programs attached to the same tp interface. Besides getting visibility for the underlying bpf system, such information may also help consolidate multiple bpf programs, understand potential performance issues due to a large array, and debug (e.g., one bpf program which overwrites return code may impact subsequent program results). Commit 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") utilized the existing perf ioctl interface and added the command PERF_EVENT_IOC_SET_BPF to attach a bpf program to a tracepoint. This patch adds a new ioctl command, given a perf event fd, to query the bpf program array attached to the same perf tracepoint event. The new uapi ioctl command: PERF_EVENT_IOC_QUERY_BPF The new uapi/linux/perf_event.h structure: struct perf_event_query_bpf { __u32 ids_len; __u32 prog_cnt; __u32 ids[0]; }; User space provides buffer "ids" for kernel to copy to. When returning from the kernel, the number of available programs in the array is set in "prog_cnt". The usage: struct perf_event_query_bpf *query = malloc(sizeof(*query) + sizeof(u32) * ids_len); query.ids_len = ids_len; err = ioctl(pmu_efd, PERF_EVENT_IOC_QUERY_BPF, query); if (err == 0) { /* query.prog_cnt is the number of available progs, * number of progs in ids: (ids_len == 0) ? 0 : query.prog_cnt */ } else if (errno == ENOSPC) { /* query.ids_len number of progs copied, * query.prog_cnt is the number of available progs */ } else { /* other errors */ } Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ include/uapi/linux/perf_event.h | 22 ++++++++++++++++++++++ kernel/bpf/core.c | 21 +++++++++++++++++++++ kernel/events/core.c | 3 +++ kernel/trace/bpf_trace.c | 23 +++++++++++++++++++++++ 5 files changed, 73 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e55e4255a210..f812ac508e9f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -254,6 +254,7 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); +int bpf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); @@ -285,6 +286,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, struct bpf_prog *old_prog); +int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, + __u32 __user *prog_ids, u32 request_cnt, + __u32 __user *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b9a4953018ed..769533696483 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -418,6 +418,27 @@ struct perf_event_attr { __u16 __reserved_2; /* align to __u64 */ }; +/* + * Structure used by below PERF_EVENT_IOC_QUERY_BPF command + * to query bpf programs attached to the same perf tracepoint + * as the given perf event. + */ +struct perf_event_query_bpf { + /* + * The below ids array length + */ + __u32 ids_len; + /* + * Set by the kernel to indicate the number of + * available programs + */ + __u32 prog_cnt; + /* + * User provided buffer to store program ids + */ + __u32 ids[0]; +}; + #define perf_flags(attr) (*(&(attr)->read_format + 1)) /* @@ -433,6 +454,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa26ee8..b16c6f8f42b6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1462,6 +1462,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) { + if (*prog == &dummy_bpf_prog.prog) + continue; id = (*prog)->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) { rcu_read_unlock(); @@ -1545,6 +1547,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } +int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, + __u32 __user *prog_ids, u32 request_cnt, + __u32 __user *prog_cnt) +{ + u32 cnt = 0; + + if (array) + cnt = bpf_prog_array_length(array); + + if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + + /* return early if user requested only program count or nothing to copy */ + if (!request_cnt || !cnt) + return 0; + + return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/events/core.c b/kernel/events/core.c index 16beab4767e1..f10609e539d4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4723,6 +4723,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon rcu_read_unlock(); return 0; } + + case PERF_EVENT_IOC_QUERY_BPF: + return bpf_event_query_prog_array(event, (void __user *)arg); default: return -ENOTTY; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ce99c379c30..b143f2a05aff 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -820,3 +820,26 @@ void perf_event_detach_bpf_prog(struct perf_event *event) unlock: mutex_unlock(&bpf_event_mutex); } + +int bpf_event_query_prog_array(struct perf_event *event, void __user *info) +{ + struct perf_event_query_bpf __user *uquery = info; + struct perf_event_query_bpf query = {}; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + if (copy_from_user(&query, uquery, sizeof(query))) + return -EFAULT; + + mutex_lock(&bpf_event_mutex); + ret = bpf_prog_array_copy_info(event->tp_event->prog_array, + uquery->ids, + query.ids_len, + &uquery->prog_cnt); + mutex_unlock(&bpf_event_mutex); + + return ret; +} -- cgit v1.2.3-59-g8ed1b From 9802d86585db91655c7d1929a4f6bbe0952ea88e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 11 Dec 2017 11:36:48 -0500 Subject: bpf: add a bpf_override_function helper Error injection is sloppy and very ad-hoc. BPF could fill this niche perfectly with it's kprobe functionality. We could make sure errors are only triggered in specific call chains that we care about with very specific situations. Accomplish this with the bpf_override_funciton helper. This will modify the probe'd callers return value to the specified value and set the PC to an override function that simply returns, bypassing the originally probed function. This gives us a nice clean way to implement systematic error injection for all of our code paths. Acked-by: Alexei Starovoitov Acked-by: Ingo Molnar Signed-off-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/include/asm/kprobes.h | 4 +++ arch/x86/include/asm/ptrace.h | 5 ++++ arch/x86/kernel/kprobes/ftrace.c | 14 ++++++++++ include/linux/filter.h | 3 ++- include/linux/trace_events.h | 1 + include/uapi/linux/bpf.h | 7 ++++- kernel/bpf/core.c | 3 +++ kernel/bpf/verifier.c | 2 ++ kernel/events/core.c | 7 +++++ kernel/trace/Kconfig | 11 ++++++++ kernel/trace/bpf_trace.c | 35 +++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 55 +++++++++++++++++++++++++++++++++++----- kernel/trace/trace_probe.h | 12 +++++++++ 15 files changed, 154 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 400b9e1b2f27..d3f4aaf9cb7a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,6 +196,9 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_KPROBE_OVERRIDE + bool + config HAVE_NMI bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8eed3f94bfc7..04d66e6fa447 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -154,6 +154,7 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE + select HAVE_KPROBE_OVERRIDE select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 9f2e3102e0bb..36abb23a7a35 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); +#ifdef CONFIG_KPROBES_ON_FTRACE +extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); +#endif + /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 14131dd06b29..6de1fd3d0097 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->ax; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->ax = rc; +} + /* * user_mode(regs) determines whether a register set came from user * mode. On x86_32, this is true if V8086 mode was enabled OR if the diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 8dc0161cec8f..1ea748d682fd 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } + +asmlinkage void override_func(void); +asm( + ".type override_func, @function\n" + "override_func:\n" + " ret\n" + ".size override_func, .-override_func\n" +); + +void arch_ftrace_kprobe_override_function(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&override_func; +} +NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/include/linux/filter.h b/include/linux/filter.h index 0062302e1285..5feb441d3dd9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -458,7 +458,8 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + kprobe_override:1; /* Do we override a kprobe? */ enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index af44e7c2d577..5fea451f6e28 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -528,6 +528,7 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); +DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 80d62e88590c..595bda120cfb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -677,6 +677,10 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code + * + * int bpf_override_return(pt_regs, rc) + * @pt_regs: pointer to struct pt_regs + * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -736,7 +740,8 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), + FN(getsockopt), \ + FN(override_return), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b16c6f8f42b6..d32bebf4f2de 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1320,6 +1320,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7afa92e9b409..e807bda7fe29 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4413,6 +4413,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index f10609e539d4..5857c500721b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8080,6 +8080,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad126c13..3e6fd580fe7f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -529,6 +529,17 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on KPROBES_ON_FTRACE + depends on HAVE_KPROBE_OVERRIDE + depends on DYNAMIC_FTRACE_WITH_REGS + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b143f2a05aff..e009b7ecf473 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,24 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + __this_cpu_write(bpf_kprobe_override, 1); + regs_set_return_value(regs, rc); + arch_ftrace_kprobe_override_function(regs); + return 0; +} + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; +#endif + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -551,6 +573,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + case BPF_FUNC_override_return: + return &bpf_override_return_proto; +#endif default: return tracing_func_proto(func_id); } @@ -768,6 +794,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* + * Kprobe override only works for ftrace based kprobes, and only if they + * are on the opt-in list. + */ + if (prog->kprobe_override && + (!trace_kprobe_ftrace(event->tp_event) || + !trace_kprobe_error_injectable(event->tp_event))) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 492700c5fb4d..5db849809a56 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,6 +42,7 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) +DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -87,6 +88,27 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +int trace_kprobe_ftrace(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + return kprobe_ftrace(&tk->rp.kp); +} + +int trace_kprobe_error_injectable(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + unsigned long addr; + + if (tk->symbol) { + addr = (unsigned long) + kallsyms_lookup_name(trace_kprobe_symbol(tk)); + addr += tk->rp.kp.offset; + } else { + addr = (unsigned long)tk->rp.kp.addr; + } + return within_kprobe_error_injection_list(addr); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1170,7 +1192,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1179,12 +1201,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the instruction skipping. Also reset our state so + * we are clean the next pass through. + */ + if (__this_cpu_read(bpf_kprobe_override)) { + __this_cpu_write(bpf_kprobe_override, 0); + reset_current_kprobe(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1193,13 +1232,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1275,6 +1315,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1282,9 +1323,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index fb66e3eaa192..5e54d748c84c 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -252,6 +252,8 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +int trace_kprobe_ftrace(struct trace_event_call *call); +int trace_kprobe_error_injectable(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline int trace_kprobe_ftrace(struct trace_event_call *call) +{ + return 0; +} + +static inline int trace_kprobe_error_injectable(struct trace_event_call *call) +{ + return 0; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { -- cgit v1.2.3-59-g8ed1b From 7db7d9f369a47e1a46f93c320b45cb89e81723e7 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 19 Nov 2017 15:05:11 +0100 Subject: batman-adv: Add SPDX license identifier above copyright header The "Linux kernel licensing rules" require that each file has a SPDX license identifier as first line (and sometimes as second line). The FSFE REUSE practices [1] would also require the same tags but have no restrictions on the placement in the source file. Using the "Linux kernel licensing rules" is therefore also fulfilling the FSFE REUSE practices requirements at the same time. [1] https://reuse.software/practices/ Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 1 + net/batman-adv/Makefile | 2 +- net/batman-adv/bat_algo.c | 1 + net/batman-adv/bat_algo.h | 1 + net/batman-adv/bat_iv_ogm.c | 1 + net/batman-adv/bat_iv_ogm.h | 1 + net/batman-adv/bat_v.c | 1 + net/batman-adv/bat_v.h | 1 + net/batman-adv/bat_v_elp.c | 1 + net/batman-adv/bat_v_elp.h | 1 + net/batman-adv/bat_v_ogm.c | 1 + net/batman-adv/bat_v_ogm.h | 1 + net/batman-adv/bitarray.c | 1 + net/batman-adv/bitarray.h | 1 + net/batman-adv/bridge_loop_avoidance.c | 1 + net/batman-adv/bridge_loop_avoidance.h | 1 + net/batman-adv/debugfs.c | 1 + net/batman-adv/debugfs.h | 1 + net/batman-adv/distributed-arp-table.c | 1 + net/batman-adv/distributed-arp-table.h | 1 + net/batman-adv/fragmentation.c | 1 + net/batman-adv/fragmentation.h | 1 + net/batman-adv/gateway_client.c | 1 + net/batman-adv/gateway_client.h | 1 + net/batman-adv/gateway_common.c | 1 + net/batman-adv/gateway_common.h | 1 + net/batman-adv/hard-interface.c | 1 + net/batman-adv/hard-interface.h | 1 + net/batman-adv/hash.c | 1 + net/batman-adv/hash.h | 1 + net/batman-adv/icmp_socket.c | 1 + net/batman-adv/icmp_socket.h | 1 + net/batman-adv/log.c | 1 + net/batman-adv/log.h | 1 + net/batman-adv/main.c | 1 + net/batman-adv/main.h | 1 + net/batman-adv/multicast.c | 1 + net/batman-adv/multicast.h | 1 + net/batman-adv/netlink.c | 1 + net/batman-adv/netlink.h | 1 + net/batman-adv/network-coding.c | 1 + net/batman-adv/network-coding.h | 1 + net/batman-adv/originator.c | 1 + net/batman-adv/originator.h | 1 + net/batman-adv/packet.h | 1 + net/batman-adv/routing.c | 1 + net/batman-adv/routing.h | 1 + net/batman-adv/send.c | 1 + net/batman-adv/send.h | 1 + net/batman-adv/soft-interface.c | 1 + net/batman-adv/soft-interface.h | 1 + net/batman-adv/sysfs.c | 1 + net/batman-adv/sysfs.h | 1 + net/batman-adv/tp_meter.c | 1 + net/batman-adv/tp_meter.h | 1 + net/batman-adv/translation-table.c | 1 + net/batman-adv/translation-table.h | 1 + net/batman-adv/tvlv.c | 1 + net/batman-adv/tvlv.h | 1 + net/batman-adv/types.h | 1 + 60 files changed, 60 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index efd641c8a5d6..fb4533826163 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: ISC */ /* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: * * Matthias Schiffer diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 915987bc6d29..022f6e77307b 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,4 +1,4 @@ -# +# SPDX-License-Identifier: GPL-2.0 # Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 44fd073b7546..fa306b25a78b 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 29f6312f9bf1..029221615ba3 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 1b659ab652fb..bff5ec66a2e1 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index ae2ab526bdb1..9dc0dd5c83df 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 341ceab8338d..16709552c21e 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index dd7c4b647e6b..a17ab68bbce8 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1de992c58b35..8375fd679db3 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 376ead280ab9..5e39d0588a48 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index c251445a42a0..22d2bafa814a 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 2068770b542d..6a4c14ccc3c6 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 2b070c7e31da..125817c389e5 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index cc262c9d97e0..8cb2c874f5d3 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index cdd8e8e4df0b..007147f3ed9e 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 234775748b8e..b568cec819c5 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index e32ad47c6efd..d94585dc2dbe 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 9c5d4a65b98c..90a08d35c501 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 760c0de72582..3c2faf773335 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index ec364a3c1c66..d81a05a6e6f9 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index a98cf1104a30..22ce4c0c86c3 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 1a2d6c308745..30ffa992fcfc 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 10d521f0b17f..e8db19940ab8 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 3baa3d466e5e..981f58421a32 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 2c26039c23fc..a7039503d88e 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 0a6a97d201f2..7c298b05c1dc 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 4e3d5340ad96..2f067a507fd5 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 9f9890ff7a22..ac7311a91f9d 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index b5f7e13918ac..a6dbaf2e9fc9 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 0c905e91c5e2..81cf54eb2fad 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index bded31121d12..cc76f1365300 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index f3fec40aae86..84cddd01eeab 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 4ef4bde2cc2d..148e64e846d2 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 65ce97efa6b5..6744a64143c0 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 4daed7ad46f2..5ce2007ea11b 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index bb8003cf2296..4bdb39ab3b20 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index e553a8770a89..01546a42b7ad 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 2a78cddab0e9..51f273b5b77d 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index ab13b4d58733..ce424fe2f24e 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: * * Matthias Schiffer diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index f1cd8c5da966..0e7e57b69b54 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: * * Matthias Schiffer diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 3604d7899e2c..5cfac6e56610 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index c66efb81d2f4..adaeafa4f71e 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 0a565d0422bb..0716daf5b9a7 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 40c7f039d5d7..b5d2164532c9 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 8e8a5db197cb..4eaf4b426726 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 40d9bf3e5bfe..86b0ea1e5c1c 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 5ede16c32f15..a1289bc5f115 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 7895323fd2a7..a6c53684ba70 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index a16b34f473ef..eb36820e41bc 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 9f673cdfecf8..e543024f98ef 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 639c3abb214a..075c5b5b2ce1 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index aa187fd42475..ab0b95f15b36 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index e487412e256b..0384cb6c406b 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 15cd2139381e..601feb2c4ecf 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index a8ada5c123bd..c8b8f2cb2c2b 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 8a3ce79b1307..281bd4cf7f90 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 411d586191da..8d9e3abec2c8 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 1d9e267caec9..67b2ba4b824b 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 4d01400ada30..a74df33f446d 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index a62795868794..1df798b32077 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich -- cgit v1.2.3-59-g8ed1b From a010579273bdfbee6ee79422dbebba3dcf18ebf7 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 19 Nov 2017 15:05:16 +0100 Subject: batman-adv: Change batman_adv.h license to MIT The ISC license is considered as not recommended in "Linux kernel licensing rules". It should only be used for existing code or for importing code from a different project with that license. But the kernel still has the similar sounding MIT/Expat license under the preferred licenses. Switching to this license for this relatively new file should therefore allow batman-adv to better follow the new licensing rules. Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Acked-by: Matthias Schiffer Acked-by: Andrew Lunn Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index fb4533826163..ae00c99cbed0 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -1,19 +1,25 @@ -/* SPDX-License-Identifier: ISC */ +/* SPDX-License-Identifier: MIT */ /* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: * * Matthias Schiffer * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ #ifndef _UAPI_LINUX_BATMAN_ADV_H_ -- cgit v1.2.3-59-g8ed1b From f551c91de262ba36b20c3ac19538afb4f4507441 Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 13 Dec 2017 16:38:56 -0800 Subject: net: erspan: introduce erspan v2 for ip_gre The patch adds support for erspan version 2. Not all features are supported in this patch. The SGT (security group tag), GRA (timestamp granularity), FT (frame type) are set to fixed value. Only hardware ID and direction are configurable. Optional subheader is also not supported. Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 120 ++++++++++++++++++++++++++++++++++++++++- include/net/ip_tunnels.h | 5 +- include/uapi/linux/if_ether.h | 1 + include/uapi/linux/if_tunnel.h | 3 ++ net/ipv4/ip_gre.c | 105 ++++++++++++++++++++++++++++++------ 5 files changed, 216 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/erspan.h b/include/net/erspan.h index 70c40c7c75b2..acdf6843095d 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -24,11 +24,29 @@ * | Reserved | Index | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * + * + * ERSPAN Version 2 (Type III) header (12 octets [42:49]) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Ver | VLAN | COS |BSO|T| Session ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Timestamp | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SGT |P| FT | Hw ID |D|Gra|O| + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Platform Specific SubHeader (8 octets, optional) + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Platf ID | Platform Specific Info | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Platform Specific Info | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB */ #define ERSPAN_VERSION 0x1 /* ERSPAN type II */ - #define VER_MASK 0xf000 #define VLAN_MASK 0x0fff #define COS_MASK 0xe000 @@ -37,6 +55,28 @@ #define ID_MASK 0x03ff #define INDEX_MASK 0xfffff +#define ERSPAN_VERSION2 0x2 /* ERSPAN type III*/ +#define BSO_MASK EN_MASK +#define SGT_MASK 0xffff0000 +#define P_MASK 0x8000 +#define FT_MASK 0x7c00 +#define HWID_MASK 0x03f0 +#define DIR_MASK 0x0008 +#define GRA_MASK 0x0006 +#define O_MASK 0x0001 + +/* ERSPAN version 2 metadata header */ +struct erspan_md2 { + __be32 timestamp; + __be16 sgt; /* security group tag */ + __be16 flags; +#define P_OFFSET 15 +#define FT_OFFSET 10 +#define HWID_OFFSET 4 +#define DIR_OFFSET 3 +#define GRA_OFFSET 1 +}; + enum erspan_encap_type { ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ @@ -48,8 +88,10 @@ enum erspan_encap_type { #define ERSPAN_V2_MDSIZE 8 struct erspan_metadata { union { - __be32 index; /* Version 1 (type II)*/ + __be32 index; /* Version 1 (type II)*/ + struct erspan_md2 md2; /* Version 2 (type III) */ } u; + int version; }; struct erspan_base_hdr { @@ -58,6 +100,7 @@ struct erspan_base_hdr { __be16 session_id; #define COS_OFFSET 13 #define EN_OFFSET 11 +#define BSO_OFFSET EN_OFFSET #define T_OFFSET 10 }; @@ -123,4 +166,77 @@ static inline void erspan_build_header(struct sk_buff *skb, ersmd->u.index = htonl(index & INDEX_MASK); } +/* ERSPAN GRA: timestamp granularity + * 00b --> granularity = 100 microseconds + * 01b --> granularity = 100 nanoseconds + * 10b --> granularity = IEEE 1588 + * Here we only support 100 microseconds. + */ +static inline __be32 erspan_get_timestamp(void) +{ + u64 h_usecs; + ktime_t kt; + + kt = ktime_get_real(); + h_usecs = ktime_divns(kt, 100 * NSEC_PER_USEC); + + /* ERSPAN base header only has 32-bit, + * so it wraps around 4 days. + */ + return htonl((u32)h_usecs); +} + +static inline void erspan_build_header_v2(struct sk_buff *skb, + __be32 id, u8 direction, u16 hwid, + bool truncate, bool is_ipv4) +{ + struct ethhdr *eth = eth_hdr(skb); + struct erspan_base_hdr *ershdr; + struct erspan_metadata *md; + struct qtag_prefix { + __be16 eth_type; + __be16 tci; + } *qp; + u16 vlan_tci = 0; + u16 session_id; + u8 gra = 0; /* 100 usec */ + u8 bso = 0; /* Bad/Short/Oversized */ + u8 sgt = 0; + u8 tos; + + tos = is_ipv4 ? ip_hdr(skb)->tos : + (ipv6_hdr(skb)->priority << 4) + + (ipv6_hdr(skb)->flow_lbl[0] >> 4); + + /* Unlike v1, v2 does not have En field, + * so only extract vlan tci field. + */ + if (eth->h_proto == htons(ETH_P_8021Q)) { + qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); + vlan_tci = ntohs(qp->tci); + } + + skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); + ershdr = (struct erspan_base_hdr *)skb->data; + memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); + + /* Build base header */ + ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | + (ERSPAN_VERSION2 << VER_OFFSET)); + session_id = (u16)(ntohl(id) & ID_MASK) | + ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) | + (bso << BSO_OFFSET & BSO_MASK) | + ((truncate << T_OFFSET) & T_MASK); + ershdr->session_id = htons(session_id); + + /* Build metadata */ + md = (struct erspan_metadata *)(ershdr + 1); + md->u.md2.timestamp = erspan_get_timestamp(); + md->u.md2.sgt = htons(sgt); + md->u.md2.flags = htons(((1 << P_OFFSET) & P_MASK) | + ((hwid << HWID_OFFSET) & HWID_MASK) | + ((direction << DIR_OFFSET) & DIR_MASK) | + ((gra << GRA_OFFSET) & GRA_MASK)); +} + #endif diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 24628f6b09bf..1f16773cfd76 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -116,8 +116,11 @@ struct ip_tunnel { u32 o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ - /* This field used only by ERSPAN */ + /* These four fields used only by ERSPAN */ u32 index; /* ERSPAN type II index */ + u8 erspan_ver; /* ERSPAN version */ + u8 dir; /* ERSPAN direction */ + u16 hwid; /* ERSPAN hardware ID */ struct dst_cache dst_cache; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 3ee3bf7c8526..87b7529fcdfe 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -47,6 +47,7 @@ #define ETH_P_PUP 0x0200 /* Xerox PUP packet */ #define ETH_P_PUPAT 0x0201 /* Xerox PUP Addr Trans packet */ #define ETH_P_TSN 0x22F0 /* TSN (IEEE 1722) packet */ +#define ETH_P_ERSPAN2 0x22EB /* ERSPAN version 2 (type III) */ #define ETH_P_IP 0x0800 /* Internet Protocol packet */ #define ETH_P_X25 0x0805 /* CCITT X.25 */ #define ETH_P_ARP 0x0806 /* Address Resolution packet */ diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index e68dadbd6d45..1b3d148c4560 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -137,6 +137,9 @@ enum { IFLA_GRE_IGNORE_DF, IFLA_GRE_FWMARK, IFLA_GRE_ERSPAN_INDEX, + IFLA_GRE_ERSPAN_VER, + IFLA_GRE_ERSPAN_DIR, + IFLA_GRE_ERSPAN_HWID, __IFLA_GRE_MAX, }; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 3e37402147f3..004800b923c6 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -315,11 +315,26 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, return PACKET_REJECT; memcpy(md, pkt_md, sizeof(*md)); + md->version = ver; + info = &tun_dst->u.tun_info; info->key.tun_flags |= TUNNEL_ERSPAN_OPT; info->options_len = sizeof(*md); } else { - tunnel->index = ntohl(pkt_md->u.index); + tunnel->erspan_ver = ver; + if (ver == 1) { + tunnel->index = ntohl(pkt_md->u.index); + } else { + u16 md2_flags; + u16 dir, hwid; + + md2_flags = ntohs(pkt_md->u.md2.flags); + dir = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + tunnel->dir = dir; + tunnel->hwid = hwid; + } + } skb_reset_mac_header(skb); @@ -413,7 +428,8 @@ static int gre_rcv(struct sk_buff *skb) if (hdr_len < 0) goto drop; - if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) { + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; } @@ -568,6 +584,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, bool truncate = false; struct flowi4 fl; int tunnel_hlen; + int version; __be16 df; tun_info = skb_tunnel_info(skb); @@ -576,9 +593,13 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto err_free_rt; /* ERSPAN has fixed 8 byte GRE header */ - tunnel_hlen = 8 + sizeof(struct erspan_base_hdr) + ERSPAN_V1_MDSIZE; + version = md->version; + tunnel_hlen = 8 + erspan_hdr_len(version); rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); if (!rt) @@ -592,12 +613,23 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, truncate = true; } - md = ip_tunnel_info_opts(tun_info); - if (!md) - goto err_free_rt; + if (version == 1) { + erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), + ntohl(md->u.index), truncate, true); + } else if (version == 2) { + u16 md2_flags; + u8 direction; + u16 hwid; - erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), - ntohl(md->u.index), truncate, true); + md2_flags = ntohs(md->u.md2.flags); + direction = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + + erspan_build_header_v2(skb, tunnel_id_to_key32(key->tun_id), + direction, hwid, truncate, true); + } else { + goto err_free_rt; + } gre_build_header(skb, 8, TUNNEL_SEQ, htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); @@ -699,8 +731,14 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, - truncate, true); + if (tunnel->erspan_ver == 1) + erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, + truncate, true); + else + erspan_build_header_v2(skb, tunnel->parms.o_key, + tunnel->dir, tunnel->hwid, + truncate, true); + tunnel->parms.o_flags &= ~TUNNEL_KEY; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); return NETDEV_TX_OK; @@ -1172,13 +1210,32 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); - if (data[IFLA_GRE_ERSPAN_INDEX]) { - t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (data[IFLA_GRE_ERSPAN_VER]) { + t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - if (t->index & ~INDEX_MASK) + if (t->erspan_ver != 1 && t->erspan_ver != 2) return -EINVAL; } + if (t->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (t->index & ~INDEX_MASK) + return -EINVAL; + } + } else if (t->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (t->dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + if (data[IFLA_GRE_ERSPAN_HWID]) { + t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + if (t->hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + return 0; } @@ -1245,7 +1302,7 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + - sizeof(struct erspan_base_hdr) + ERSPAN_V1_MDSIZE; + erspan_hdr_len(tunnel->erspan_ver); t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; @@ -1375,6 +1432,12 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_VER */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_DIR */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_HWID */ + nla_total_size(2) + 0; } @@ -1417,9 +1480,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } - if (t->index) + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; + } else if (t->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } return 0; @@ -1455,6 +1527,9 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { -- cgit v1.2.3-59-g8ed1b From cc8b0b92a1699bc32f7fec71daa2bfc90de43a4d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:05 -0800 Subject: bpf: introduce function calls (function boundaries) Allow arbitrary function calls from bpf function to another bpf function. Since the beginning of bpf all bpf programs were represented as a single function and program authors were forced to use always_inline for all functions in their C code. That was causing llvm to unnecessary inflate the code size and forcing developers to move code to header files with little code reuse. With a bit of additional complexity teach verifier to recognize arbitrary function calls from one bpf function to another as long as all of functions are presented to the verifier as a single bpf program. New program layout: r6 = r1 // some code .. r1 = .. // arg1 r2 = .. // arg2 call pc+1 // function call pc-relative exit .. = r1 // access arg1 .. = r2 // access arg2 .. call pc+20 // second level of function call ... It allows for better optimized code and finally allows to introduce the core bpf libraries that can be reused in different projects, since programs are no longer limited by single elf file. With function calls bpf can be compiled into multiple .o files. This patch is the first step. It detects programs that contain multiple functions and checks that calls between them are valid. It splits the sequence of bpf instructions (one program) into a set of bpf functions that call each other. Calls to only known functions are allowed. In the future the verifier may allow calls to unresolved functions and will do dynamic linking. This logic supports statically linked bpf functions only. Such function boundary detection could have been done as part of control flow graph building in check_cfg(), but it's cleaner to separate function boundary detection vs control flow checks within a subprogram (function) into logically indepedent steps. Follow up patches may split check_cfg() further, but not check_subprogs(). Only allow bpf-to-bpf calls for root only and for non-hw-offloaded programs. These restrictions can be relaxed in the future. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 5 +- include/uapi/linux/bpf.h | 6 ++ kernel/bpf/disasm.c | 8 ++- kernel/bpf/verifier.c | 141 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 155 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c561b986bab0..91a583bb3fa7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -141,6 +141,8 @@ struct bpf_ext_analyzer_ops { int insn_idx, int prev_insn_idx); }; +#define BPF_MAX_SUBPROGS 256 + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -159,8 +161,9 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ - struct bpf_verifer_log log; + u32 subprog_starts[BPF_MAX_SUBPROGS]; + u32 subprog_cnt; }; static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 595bda120cfb..d01f1cb3cfc0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -197,8 +197,14 @@ enum bpf_attach_type { */ #define BPF_F_STRICT_ALIGNMENT (1U << 0) +/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ #define BPF_PSEUDO_MAP_FD 1 +/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative + * offset to another bpf function + */ +#define BPF_PSEUDO_CALL 1 + /* flags for BPF_MAP_UPDATE_ELEM command */ #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index e682850c9715..883f88fa5bfc 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -189,8 +189,12 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); + if (insn->src_reg == BPF_PSEUDO_CALL) + verbose(env, "(%02x) call pc%+d\n", insn->code, + insn->imm); + else + verbose(env, "(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e807bda7fe29..1d0f7ff0b9a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "disasm.h" @@ -636,6 +638,113 @@ enum reg_arg_type { DST_OP_NO_MARK /* same as above, check only, don't mark */ }; +static int cmp_subprogs(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +static int find_subprog(struct bpf_verifier_env *env, int off) +{ + u32 *p; + + p = bsearch(&off, env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs); + if (!p) + return -ENOENT; + return p - env->subprog_starts; + +} + +static int add_subprog(struct bpf_verifier_env *env, int off) +{ + int insn_cnt = env->prog->len; + int ret; + + if (off >= insn_cnt || off < 0) { + verbose(env, "call to invalid destination\n"); + return -EINVAL; + } + ret = find_subprog(env, off); + if (ret >= 0) + return 0; + if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { + verbose(env, "too many subprograms\n"); + return -E2BIG; + } + env->subprog_starts[env->subprog_cnt++] = off; + sort(env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + return 0; +} + +static int check_subprogs(struct bpf_verifier_env *env) +{ + int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + + /* determine subprog starts. The end is one before the next starts */ + for (i = 0; i < insn_cnt; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + if (!env->allow_ptr_leaks) { + verbose(env, "function calls to other bpf functions are allowed for root only\n"); + return -EPERM; + } + if (bpf_prog_is_dev_bound(env->prog->aux)) { + verbose(env, "funcation calls in offloaded programs are not supported yet\n"); + return -EINVAL; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + } + + if (env->log.level > 1) + for (i = 0; i < env->subprog_cnt; i++) + verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + + /* now check that all jumps are within the same subprog */ + subprog_start = 0; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + for (i = 0; i < insn_cnt; i++) { + u8 code = insn[i].code; + + if (BPF_CLASS(code) != BPF_JMP) + goto next; + if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + goto next; + off = i + insn[i].off + 1; + if (off < subprog_start || off >= subprog_end) { + verbose(env, "jump out of range from insn %d to %d\n", i, off); + return -EINVAL; + } +next: + if (i == subprog_end - 1) { + /* to avoid fall-through from one subprog into another + * the last insn of the subprog should be either exit + * or unconditional jump back + */ + if (code != (BPF_JMP | BPF_EXIT) && + code != (BPF_JMP | BPF_JA)) { + verbose(env, "last insn is not an exit or jmp\n"); + return -EINVAL; + } + subprog_start = subprog_end; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + } + } + return 0; +} + static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) { struct bpf_verifier_state *parent = state->parent; @@ -3284,6 +3393,10 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; + ret = check_subprogs(env); + if (ret < 0) + return ret; + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -3316,6 +3429,14 @@ peek_stack: goto err_free; if (t + 1 < insn_cnt) env->explored_states[t + 1] = STATE_LIST_MARK; + if (insns[t].src_reg == BPF_PSEUDO_CALL) { + env->explored_states[t] = STATE_LIST_MARK; + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -4245,6 +4366,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, return 0; } +static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + for (i = 0; i < env->subprog_cnt; i++) { + if (env->subprog_starts[i] < off) + continue; + env->subprog_starts[i] += len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -4255,6 +4389,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return NULL; if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; + adjust_subprog_starts(env, off, len); return new_prog; } @@ -4408,6 +4543,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL)) continue; + if (insn->src_reg == BPF_PSEUDO_CALL) + continue; if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; @@ -4589,12 +4726,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->explored_states) goto skip_full_check; + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = check_cfg(env); if (ret < 0) goto skip_full_check; - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); -- cgit v1.2.3-59-g8ed1b From 06ef0ccb5a36e1feba9b413ff59a04ecc4407c1c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Dec 2017 10:13:44 -0800 Subject: bpf/cgroup: fix a verification error for a CGROUP_DEVICE type prog The tools/testing/selftests/bpf test program test_dev_cgroup fails with the following error when compiled with llvm 6.0. (I did not try with earlier versions.) libbpf: load bpf program failed: Permission denied libbpf: -- BEGIN DUMP LOG --- libbpf: 0: (61) r2 = *(u32 *)(r1 +4) 1: (b7) r0 = 0 2: (55) if r2 != 0x1 goto pc+8 R0=inv0 R1=ctx(id=0,off=0,imm=0) R2=inv1 R10=fp0 3: (69) r2 = *(u16 *)(r1 +0) invalid bpf_context access off=0 size=2 ... The culprit is the following statement in dev_cgroup.c: short type = ctx->access_type & 0xFFFF; This code is typical as the ctx->access_type is assigned as below in kernel/bpf/cgroup.c: struct bpf_cgroup_dev_ctx ctx = { .access_type = (access << 16) | dev_type, .major = major, .minor = minor, }; The compiler converts it to u16 access while the verifier cgroup_dev_is_valid_access rejects any non u32 access. This patch permits the field access_type to be accessible with type u16 and u8 as well. Signed-off-by: Yonghong Song Tested-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 ++- kernel/bpf/cgroup.c | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d01f1cb3cfc0..69eabfcb9bdb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1012,7 +1012,8 @@ struct bpf_perf_event_value { #define BPF_DEVCG_DEV_CHAR (1ULL << 1) struct bpf_cgroup_dev_ctx { - __u32 access_type; /* (access << 16) | type */ + /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ + __u32 access_type; __u32 major; __u32 minor; }; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b789ab78d28f..c1c0b60d3f2f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -568,6 +568,8 @@ static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (type == BPF_WRITE) return false; @@ -576,8 +578,17 @@ static bool cgroup_dev_is_valid_access(int off, int size, /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + break; + default: + if (size != size_default) + return false; + } return true; } -- cgit v1.2.3-59-g8ed1b From 983dafaab799511e092ffd006f3a064b37ccbccf Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Wed, 13 Dec 2017 19:51:36 +0200 Subject: cfg80211: Scan results to also report the per chain signal strength This commit enhances the scan results to report the per chain signal strength based on the latest BSS update. This provides similar information to what is already available through STA information. Signed-off-by: Sunil Dutt Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 5 +++++ net/wireless/scan.c | 5 +++++ 4 files changed, 22 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d7f8e7b96bcb..3a4a1a903a4d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1773,6 +1773,8 @@ enum cfg80211_signal_type { * by %parent_bssid. * @parent_bssid: the BSS according to which %parent_tsf is set. This is set to * the BSS that requested the scan in which the beacon/probe was received. + * @chains: bitmask for filled values in @chain_signal. + * @chain_signal: per-chain signal strength of last received BSS in dBm. */ struct cfg80211_inform_bss { struct ieee80211_channel *chan; @@ -1781,6 +1783,8 @@ struct cfg80211_inform_bss { u64 boottime_ns; u64 parent_tsf; u8 parent_bssid[ETH_ALEN] __aligned(2); + u8 chains; + s8 chain_signal[IEEE80211_MAX_CHAINS]; }; /** @@ -1824,6 +1828,8 @@ struct cfg80211_bss_ies { * that holds the beacon data. @beacon_ies is still valid, of course, and * points to the same data as hidden_beacon_bss->beacon_ies in that case. * @signal: signal strength value (type depends on the wiphy's signal_type) + * @chains: bitmask for filled values in @chain_signal. + * @chain_signal: per-chain signal strength of last received BSS in dBm. * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes */ struct cfg80211_bss { @@ -1842,6 +1848,8 @@ struct cfg80211_bss { u16 capability; u8 bssid[ETH_ALEN]; + u8 chains; + s8 chain_signal[IEEE80211_MAX_CHAINS]; u8 priv[0] __aligned(sizeof(void *)); }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f882fe1f9709..c587a61c32bf 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3862,6 +3862,9 @@ enum nl80211_bss_scan_width { * @NL80211_BSS_PARENT_BSSID. (u64). * @NL80211_BSS_PARENT_BSSID: the BSS according to which @NL80211_BSS_PARENT_TSF * is set. + * @NL80211_BSS_CHAIN_SIGNAL: per-chain signal strength of last BSS update. + * Contains a nested array of signal strength attributes (u8, dBm), + * using the nesting index as the antenna number. * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -3885,6 +3888,7 @@ enum nl80211_bss { NL80211_BSS_PAD, NL80211_BSS_PARENT_TSF, NL80211_BSS_PARENT_BSSID, + NL80211_BSS_CHAIN_SIGNAL, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e4dddfb64ced..b3f8970c3a47 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7839,6 +7839,11 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, intbss->ts_boottime, NL80211_BSS_PAD)) goto nla_put_failure; + if (!nl80211_put_signal(msg, intbss->pub.chains, + intbss->pub.chain_signal, + NL80211_BSS_CHAIN_SIGNAL)) + goto nla_put_failure; + switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: if (nla_put_u32(msg, NL80211_BSS_SIGNAL_MBM, res->signal)) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index f6c5fe482506..d36c3eb7b931 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -981,6 +981,9 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, found->ts = tmp->ts; found->ts_boottime = tmp->ts_boottime; found->parent_tsf = tmp->parent_tsf; + found->pub.chains = tmp->pub.chains; + memcpy(found->pub.chain_signal, tmp->pub.chain_signal, + IEEE80211_MAX_CHAINS); ether_addr_copy(found->parent_bssid, tmp->parent_bssid); } else { struct cfg80211_internal_bss *new; @@ -1233,6 +1236,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); tmp.ts_boottime = data->boottime_ns; tmp.parent_tsf = data->parent_tsf; + tmp.pub.chains = data->chains; + memcpy(tmp.pub.chain_signal, data->chain_signal, IEEE80211_MAX_CHAINS); ether_addr_copy(tmp.parent_bssid, data->parent_bssid); signal_valid = abs(data->chan->center_freq - channel->center_freq) <= -- cgit v1.2.3-59-g8ed1b From fec149f5d3234c037ec761d1db4cc8c0550e9964 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 21 Dec 2017 10:17:41 +0100 Subject: batman-adv: Convert packet.h to uapi header The header file is used by different userspace programs to inject packets or to decode sniffed packets. It should therefore be available to them as userspace header. Also other components in the kernel (like the flow dissector) require access to the packet definitions to be able to decode ETH_P_BATMAN ethernet packets. Signed-off-by: Sven Eckelmann Signed-off-by: David S. Miller --- MAINTAINERS | 1 + include/uapi/linux/batadv_packet.h | 644 +++++++++++++++++++++++++++++++++ net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/bat_v.c | 2 +- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_ogm.c | 2 +- net/batman-adv/bridge_loop_avoidance.c | 2 +- net/batman-adv/distributed-arp-table.h | 2 +- net/batman-adv/fragmentation.c | 2 +- net/batman-adv/gateway_client.c | 2 +- net/batman-adv/gateway_common.c | 2 +- net/batman-adv/hard-interface.c | 2 +- net/batman-adv/icmp_socket.c | 2 +- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 2 +- net/batman-adv/multicast.c | 2 +- net/batman-adv/netlink.c | 2 +- net/batman-adv/network-coding.c | 2 +- net/batman-adv/packet.h | 644 --------------------------------- net/batman-adv/routing.c | 2 +- net/batman-adv/send.h | 3 +- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/sysfs.c | 2 +- net/batman-adv/tp_meter.c | 2 +- net/batman-adv/translation-table.c | 2 +- net/batman-adv/tvlv.c | 2 +- net/batman-adv/types.h | 3 +- 27 files changed, 669 insertions(+), 670 deletions(-) create mode 100644 include/uapi/linux/batadv_packet.h delete mode 100644 net/batman-adv/packet.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 129c591e0f34..753799d24cd9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2564,6 +2564,7 @@ S: Maintained F: Documentation/ABI/testing/sysfs-class-net-batman-adv F: Documentation/ABI/testing/sysfs-class-net-mesh F: Documentation/networking/batman-adv.rst +F: include/uapi/linux/batadv_packet.h F: include/uapi/linux/batman_adv.h F: net/batman-adv/ diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h new file mode 100644 index 000000000000..5cb360be2a11 --- /dev/null +++ b/include/uapi/linux/batadv_packet.h @@ -0,0 +1,644 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _UAPI_LINUX_BATADV_PACKET_H_ +#define _UAPI_LINUX_BATADV_PACKET_H_ + +#include +#include +#include + +/** + * batadv_tp_is_error() - Check throughput meter return code for error + * @n: throughput meter return code + * + * Return: 0 when not error was detected, != 0 otherwise + */ +#define batadv_tp_is_error(n) ((__u8)(n) > 127 ? 1 : 0) + +/** + * enum batadv_packettype - types for batman-adv encapsulated packets + * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV + * @BATADV_BCAST: broadcast packets carrying broadcast payload + * @BATADV_CODED: network coded packets + * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V + * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V + * + * @BATADV_UNICAST: unicast packets carrying unicast payload traffic + * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original + * payload packet + * @BATADV_UNICAST_4ADDR: unicast packet including the originator address of + * the sender + * @BATADV_ICMP: unicast packet like IP ICMP used for ping or traceroute + * @BATADV_UNICAST_TVLV: unicast packet carrying TVLV containers + */ +enum batadv_packettype { + /* 0x00 - 0x3f: local packets or special rules for handling */ + BATADV_IV_OGM = 0x00, + BATADV_BCAST = 0x01, + BATADV_CODED = 0x02, + BATADV_ELP = 0x03, + BATADV_OGM2 = 0x04, + /* 0x40 - 0x7f: unicast */ +#define BATADV_UNICAST_MIN 0x40 + BATADV_UNICAST = 0x40, + BATADV_UNICAST_FRAG = 0x41, + BATADV_UNICAST_4ADDR = 0x42, + BATADV_ICMP = 0x43, + BATADV_UNICAST_TVLV = 0x44, +#define BATADV_UNICAST_MAX 0x7f + /* 0x80 - 0xff: reserved */ +}; + +/** + * enum batadv_subtype - packet subtype for unicast4addr + * @BATADV_P_DATA: user payload + * @BATADV_P_DAT_DHT_GET: DHT request message + * @BATADV_P_DAT_DHT_PUT: DHT store message + * @BATADV_P_DAT_CACHE_REPLY: ARP reply generated by DAT + */ +enum batadv_subtype { + BATADV_P_DATA = 0x01, + BATADV_P_DAT_DHT_GET = 0x02, + BATADV_P_DAT_DHT_PUT = 0x03, + BATADV_P_DAT_CACHE_REPLY = 0x04, +}; + +/* this file is included by batctl which needs these defines */ +#define BATADV_COMPAT_VERSION 15 + +/** + * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets + * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was + * previously received from someone else than the best neighbor. + * @BATADV_PRIMARIES_FIRST_HOP: flag unused. + * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a + * one hop neighbor on the interface where it was originally received. + */ +enum batadv_iv_flags { + BATADV_NOT_BEST_NEXT_HOP = 1UL << 0, + BATADV_PRIMARIES_FIRST_HOP = 1UL << 1, + BATADV_DIRECTLINK = 1UL << 2, +}; + +/** + * enum batadv_icmp_packettype - ICMP message types + * @BATADV_ECHO_REPLY: success reply to BATADV_ECHO_REQUEST + * @BATADV_DESTINATION_UNREACHABLE: failure when route to destination not found + * @BATADV_ECHO_REQUEST: request BATADV_ECHO_REPLY from destination + * @BATADV_TTL_EXCEEDED: error after BATADV_ECHO_REQUEST traversed too many hops + * @BATADV_PARAMETER_PROBLEM: return code for malformed messages + * @BATADV_TP: throughput meter packet + */ +enum batadv_icmp_packettype { + BATADV_ECHO_REPLY = 0, + BATADV_DESTINATION_UNREACHABLE = 3, + BATADV_ECHO_REQUEST = 8, + BATADV_TTL_EXCEEDED = 11, + BATADV_PARAMETER_PROBLEM = 12, + BATADV_TP = 15, +}; + +/** + * enum batadv_mcast_flags - flags for multicast capabilities and settings + * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for + * 224.0.0.0/24 or ff02::1 + * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets + * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets + */ +enum batadv_mcast_flags { + BATADV_MCAST_WANT_ALL_UNSNOOPABLES = 1UL << 0, + BATADV_MCAST_WANT_ALL_IPV4 = 1UL << 1, + BATADV_MCAST_WANT_ALL_IPV6 = 1UL << 2, +}; + +/* tt data subtypes */ +#define BATADV_TT_DATA_TYPE_MASK 0x0F + +/** + * enum batadv_tt_data_flags - flags for tt data tvlv + * @BATADV_TT_OGM_DIFF: TT diff propagated through OGM + * @BATADV_TT_REQUEST: TT request message + * @BATADV_TT_RESPONSE: TT response message + * @BATADV_TT_FULL_TABLE: contains full table to replace existing table + */ +enum batadv_tt_data_flags { + BATADV_TT_OGM_DIFF = 1UL << 0, + BATADV_TT_REQUEST = 1UL << 1, + BATADV_TT_RESPONSE = 1UL << 2, + BATADV_TT_FULL_TABLE = 1UL << 4, +}; + +/** + * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field + * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not + */ +enum batadv_vlan_flags { + BATADV_VLAN_HAS_TAG = 1UL << 15, +}; + +/** + * enum batadv_bla_claimframe - claim frame types for the bridge loop avoidance + * @BATADV_CLAIM_TYPE_CLAIM: claim of a client mac address + * @BATADV_CLAIM_TYPE_UNCLAIM: unclaim of a client mac address + * @BATADV_CLAIM_TYPE_ANNOUNCE: announcement of backbone with current crc + * @BATADV_CLAIM_TYPE_REQUEST: request of full claim table + * @BATADV_CLAIM_TYPE_LOOPDETECT: mesh-traversing loop detect packet + */ +enum batadv_bla_claimframe { + BATADV_CLAIM_TYPE_CLAIM = 0x00, + BATADV_CLAIM_TYPE_UNCLAIM = 0x01, + BATADV_CLAIM_TYPE_ANNOUNCE = 0x02, + BATADV_CLAIM_TYPE_REQUEST = 0x03, + BATADV_CLAIM_TYPE_LOOPDETECT = 0x04, +}; + +/** + * enum batadv_tvlv_type - tvlv type definitions + * @BATADV_TVLV_GW: gateway tvlv + * @BATADV_TVLV_DAT: distributed arp table tvlv + * @BATADV_TVLV_NC: network coding tvlv + * @BATADV_TVLV_TT: translation table tvlv + * @BATADV_TVLV_ROAM: roaming advertisement tvlv + * @BATADV_TVLV_MCAST: multicast capability tvlv + */ +enum batadv_tvlv_type { + BATADV_TVLV_GW = 0x01, + BATADV_TVLV_DAT = 0x02, + BATADV_TVLV_NC = 0x03, + BATADV_TVLV_TT = 0x04, + BATADV_TVLV_ROAM = 0x05, + BATADV_TVLV_MCAST = 0x06, +}; + +#pragma pack(2) +/* the destination hardware field in the ARP frame is used to + * transport the claim type and the group id + */ +struct batadv_bla_claim_dst { + __u8 magic[3]; /* FF:43:05 */ + __u8 type; /* bla_claimframe */ + __be16 group; /* group id */ +}; + +#pragma pack() + +/** + * struct batadv_ogm_packet - ogm (routing protocol) packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @flags: contains routing relevant flags - see enum batadv_iv_flags + * @seqno: sequence identification + * @orig: address of the source node + * @prev_sender: address of the previous sender + * @reserved: reserved byte for alignment + * @tq: transmission quality + * @tvlv_len: length of tvlv data following the ogm header + */ +struct batadv_ogm_packet { + __u8 packet_type; + __u8 version; + __u8 ttl; + __u8 flags; + __be32 seqno; + __u8 orig[ETH_ALEN]; + __u8 prev_sender[ETH_ALEN]; + __u8 reserved; + __u8 tq; + __be16 tvlv_len; + /* __packed is not needed as the struct size is divisible by 4, + * and the largest data type in this struct has a size of 4. + */ +}; + +#define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet) + +/** + * struct batadv_ogm2_packet - ogm2 (routing protocol) packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header + * @flags: reseved for routing relevant flags - currently always 0 + * @seqno: sequence number + * @orig: originator mac address + * @tvlv_len: length of the appended tvlv buffer (in bytes) + * @throughput: the currently flooded path throughput + */ +struct batadv_ogm2_packet { + __u8 packet_type; + __u8 version; + __u8 ttl; + __u8 flags; + __be32 seqno; + __u8 orig[ETH_ALEN]; + __be16 tvlv_len; + __be32 throughput; + /* __packed is not needed as the struct size is divisible by 4, + * and the largest data type in this struct has a size of 4. + */ +}; + +#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet) + +/** + * struct batadv_elp_packet - elp (neighbor discovery) packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @orig: originator mac address + * @seqno: sequence number + * @elp_interval: currently used ELP sending interval in ms + */ +struct batadv_elp_packet { + __u8 packet_type; + __u8 version; + __u8 orig[ETH_ALEN]; + __be32 seqno; + __be32 elp_interval; +}; + +#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet) + +/** + * struct batadv_icmp_header - common members among all the ICMP packets + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @msg_type: ICMP packet type + * @dst: address of the destination node + * @orig: address of the source node + * @uid: local ICMP socket identifier + * @align: not used - useful for alignment purposes only + * + * This structure is used for ICMP packets parsing only and it is never sent + * over the wire. The alignment field at the end is there to ensure that + * members are padded the same way as they are in real packets. + */ +struct batadv_icmp_header { + __u8 packet_type; + __u8 version; + __u8 ttl; + __u8 msg_type; /* see ICMP message types above */ + __u8 dst[ETH_ALEN]; + __u8 orig[ETH_ALEN]; + __u8 uid; + __u8 align[3]; +}; + +/** + * struct batadv_icmp_packet - ICMP packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @msg_type: ICMP packet type + * @dst: address of the destination node + * @orig: address of the source node + * @uid: local ICMP socket identifier + * @reserved: not used - useful for alignment + * @seqno: ICMP sequence number + */ +struct batadv_icmp_packet { + __u8 packet_type; + __u8 version; + __u8 ttl; + __u8 msg_type; /* see ICMP message types above */ + __u8 dst[ETH_ALEN]; + __u8 orig[ETH_ALEN]; + __u8 uid; + __u8 reserved; + __be16 seqno; +}; + +/** + * struct batadv_icmp_tp_packet - ICMP TP Meter packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @msg_type: ICMP packet type + * @dst: address of the destination node + * @orig: address of the source node + * @uid: local ICMP socket identifier + * @subtype: TP packet subtype (see batadv_icmp_tp_subtype) + * @session: TP session identifier + * @seqno: the TP sequence number + * @timestamp: time when the packet has been sent. This value is filled in a + * TP_MSG and echoed back in the next TP_ACK so that the sender can compute the + * RTT. Since it is read only by the host which wrote it, there is no need to + * store it using network order + */ +struct batadv_icmp_tp_packet { + __u8 packet_type; + __u8 version; + __u8 ttl; + __u8 msg_type; /* see ICMP message types above */ + __u8 dst[ETH_ALEN]; + __u8 orig[ETH_ALEN]; + __u8 uid; + __u8 subtype; + __u8 session[2]; + __be32 seqno; + __be32 timestamp; +}; + +/** + * enum batadv_icmp_tp_subtype - ICMP TP Meter packet subtypes + * @BATADV_TP_MSG: Msg from sender to receiver + * @BATADV_TP_ACK: acknowledgment from receiver to sender + */ +enum batadv_icmp_tp_subtype { + BATADV_TP_MSG = 0, + BATADV_TP_ACK, +}; + +#define BATADV_RR_LEN 16 + +/** + * struct batadv_icmp_packet_rr - ICMP RouteRecord packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @msg_type: ICMP packet type + * @dst: address of the destination node + * @orig: address of the source node + * @uid: local ICMP socket identifier + * @rr_cur: number of entries the rr array + * @seqno: ICMP sequence number + * @rr: route record array + */ +struct batadv_icmp_packet_rr { + __u8 packet_type; + __u8 version; + __u8 ttl; + __u8 msg_type; /* see ICMP message types above */ + __u8 dst[ETH_ALEN]; + __u8 orig[ETH_ALEN]; + __u8 uid; + __u8 rr_cur; + __be16 seqno; + __u8 rr[BATADV_RR_LEN][ETH_ALEN]; +}; + +#define BATADV_ICMP_MAX_PACKET_SIZE sizeof(struct batadv_icmp_packet_rr) + +/* All packet headers in front of an ethernet header have to be completely + * divisible by 2 but not by 4 to make the payload after the ethernet + * header again 4 bytes boundary aligned. + * + * A packing of 2 is necessary to avoid extra padding at the end of the struct + * caused by a structure member which is larger than two bytes. Otherwise + * the structure would not fulfill the previously mentioned rule to avoid the + * misalignment of the payload after the ethernet header. It may also lead to + * leakage of information when the padding it not initialized before sending. + */ +#pragma pack(2) + +/** + * struct batadv_unicast_packet - unicast packet for network payload + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @ttvn: translation table version number + * @dest: originator destination of the unicast packet + */ +struct batadv_unicast_packet { + __u8 packet_type; + __u8 version; + __u8 ttl; + __u8 ttvn; /* destination translation table version number */ + __u8 dest[ETH_ALEN]; + /* "4 bytes boundary + 2 bytes" long to make the payload after the + * following ethernet header again 4 bytes boundary aligned + */ +}; + +/** + * struct batadv_unicast_4addr_packet - extended unicast packet + * @u: common unicast packet header + * @src: address of the source + * @subtype: packet subtype + * @reserved: reserved byte for alignment + */ +struct batadv_unicast_4addr_packet { + struct batadv_unicast_packet u; + __u8 src[ETH_ALEN]; + __u8 subtype; + __u8 reserved; + /* "4 bytes boundary + 2 bytes" long to make the payload after the + * following ethernet header again 4 bytes boundary aligned + */ +}; + +/** + * struct batadv_frag_packet - fragmented packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @dest: final destination used when routing fragments + * @orig: originator of the fragment used when merging the packet + * @no: fragment number within this sequence + * @priority: priority of frame, from ToS IP precedence or 802.1p + * @reserved: reserved byte for alignment + * @seqno: sequence identification + * @total_size: size of the merged packet + */ +struct batadv_frag_packet { + __u8 packet_type; + __u8 version; /* batman version field */ + __u8 ttl; +#if defined(__BIG_ENDIAN_BITFIELD) + __u8 no:4; + __u8 priority:3; + __u8 reserved:1; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + __u8 reserved:1; + __u8 priority:3; + __u8 no:4; +#else +#error "unknown bitfield endianness" +#endif + __u8 dest[ETH_ALEN]; + __u8 orig[ETH_ALEN]; + __be16 seqno; + __be16 total_size; +}; + +/** + * struct batadv_bcast_packet - broadcast packet for network payload + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @reserved: reserved byte for alignment + * @seqno: sequence identification + * @orig: originator of the broadcast packet + */ +struct batadv_bcast_packet { + __u8 packet_type; + __u8 version; /* batman version field */ + __u8 ttl; + __u8 reserved; + __be32 seqno; + __u8 orig[ETH_ALEN]; + /* "4 bytes boundary + 2 bytes" long to make the payload after the + * following ethernet header again 4 bytes boundary aligned + */ +}; + +/** + * struct batadv_coded_packet - network coded packet + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @first_source: original source of first included packet + * @first_orig_dest: original destinal of first included packet + * @first_crc: checksum of first included packet + * @first_ttvn: tt-version number of first included packet + * @second_ttl: ttl of second packet + * @second_dest: second receiver of this coded packet + * @second_source: original source of second included packet + * @second_orig_dest: original destination of second included packet + * @second_crc: checksum of second included packet + * @second_ttvn: tt version number of second included packet + * @coded_len: length of network coded part of the payload + */ +struct batadv_coded_packet { + __u8 packet_type; + __u8 version; /* batman version field */ + __u8 ttl; + __u8 first_ttvn; + /* __u8 first_dest[ETH_ALEN]; - saved in mac header destination */ + __u8 first_source[ETH_ALEN]; + __u8 first_orig_dest[ETH_ALEN]; + __be32 first_crc; + __u8 second_ttl; + __u8 second_ttvn; + __u8 second_dest[ETH_ALEN]; + __u8 second_source[ETH_ALEN]; + __u8 second_orig_dest[ETH_ALEN]; + __be32 second_crc; + __be16 coded_len; +}; + +#pragma pack() + +/** + * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the genereal header + * @ttl: time to live for this packet, part of the genereal header + * @reserved: reserved field (for packet alignment) + * @src: address of the source + * @dst: address of the destination + * @tvlv_len: length of tvlv data following the unicast tvlv header + * @align: 2 bytes to align the header to a 4 byte boundary + */ +struct batadv_unicast_tvlv_packet { + __u8 packet_type; + __u8 version; /* batman version field */ + __u8 ttl; + __u8 reserved; + __u8 dst[ETH_ALEN]; + __u8 src[ETH_ALEN]; + __be16 tvlv_len; + __u16 align; +}; + +/** + * struct batadv_tvlv_hdr - base tvlv header struct + * @type: tvlv container type (see batadv_tvlv_type) + * @version: tvlv container version + * @len: tvlv container length + */ +struct batadv_tvlv_hdr { + __u8 type; + __u8 version; + __be16 len; +}; + +/** + * struct batadv_tvlv_gateway_data - gateway data propagated through gw tvlv + * container + * @bandwidth_down: advertised uplink download bandwidth + * @bandwidth_up: advertised uplink upload bandwidth + */ +struct batadv_tvlv_gateway_data { + __be32 bandwidth_down; + __be32 bandwidth_up; +}; + +/** + * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container + * @flags: translation table flags (see batadv_tt_data_flags) + * @ttvn: translation table version number + * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by + * one batadv_tvlv_tt_vlan_data object per announced vlan + */ +struct batadv_tvlv_tt_data { + __u8 flags; + __u8 ttvn; + __be16 num_vlan; +}; + +/** + * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through + * the tt tvlv container + * @crc: crc32 checksum of the entries belonging to this vlan + * @vid: vlan identifier + * @reserved: unused, useful for alignment purposes + */ +struct batadv_tvlv_tt_vlan_data { + __be32 crc; + __be16 vid; + __u16 reserved; +}; + +/** + * struct batadv_tvlv_tt_change - translation table diff data + * @flags: status indicators concerning the non-mesh client (see + * batadv_tt_client_flags) + * @reserved: reserved field - useful for alignment purposes only + * @addr: mac address of non-mesh client that triggered this tt change + * @vid: VLAN identifier + */ +struct batadv_tvlv_tt_change { + __u8 flags; + __u8 reserved[3]; + __u8 addr[ETH_ALEN]; + __be16 vid; +}; + +/** + * struct batadv_tvlv_roam_adv - roaming advertisement + * @client: mac address of roaming client + * @vid: VLAN identifier + */ +struct batadv_tvlv_roam_adv { + __u8 client[ETH_ALEN]; + __be16 vid; +}; + +/** + * struct batadv_tvlv_mcast_data - payload of a multicast tvlv + * @flags: multicast flags announced by the orig node + * @reserved: reserved field + */ +struct batadv_tvlv_mcast_data { + __u8 flags; + __u8 reserved[3]; +}; + +#endif /* _UAPI_LINUX_BATADV_PACKET_H_ */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 84c36430c25a..79e326383726 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "bat_algo.h" @@ -63,7 +64,6 @@ #include "netlink.h" #include "network-coding.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" #include "translation-table.h" diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 78ddf3afa83a..27e165ac9302 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "bat_algo.h" @@ -49,7 +50,6 @@ #include "log.h" #include "netlink.h" #include "originator.h" -#include "packet.h" struct sk_buff; diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 59ae96cef596..a83478c46597 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -42,13 +42,13 @@ #include #include #include +#include #include "bat_algo.h" #include "bat_v_ogm.h" #include "hard-interface.h" #include "log.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index e415974c540d..ba59b77c605d 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -39,13 +39,13 @@ #include #include #include +#include #include "bat_algo.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" #include "translation-table.h" diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index e647450e5d0f..fad47853ad3c 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include "hard-interface.h" @@ -57,7 +58,6 @@ #include "log.h" #include "netlink.h" #include "originator.h" -#include "packet.h" #include "soft-interface.h" #include "sysfs.h" #include "translation-table.h" diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 3d47bedaf661..12897eb46268 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -24,9 +24,9 @@ #include #include #include +#include #include "originator.h" -#include "packet.h" struct seq_file; struct sk_buff; diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 4979350af9a7..22dde42fd80e 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -33,10 +33,10 @@ #include #include #include +#include #include "hard-interface.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" #include "soft-interface.h" diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 2488e25d0eef..37fe9a644f22 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include "gateway_common.h" @@ -50,7 +51,6 @@ #include "log.h" #include "netlink.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "soft-interface.h" #include "sysfs.h" diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 83bfeecf661c..b3e156af2256 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -27,10 +27,10 @@ #include #include #include +#include #include "gateway_client.h" #include "log.h" -#include "packet.h" #include "tvlv.h" /** diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 13d04dba0b3a..5f186bff284a 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "bat_v.h" #include "bridge_loop_avoidance.h" @@ -46,7 +47,6 @@ #include "gateway_client.h" #include "log.h" #include "originator.h" -#include "packet.h" #include "send.h" #include "soft-interface.h" #include "sysfs.h" diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index f2ef75b7fa73..8041cf106c37 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -44,11 +44,11 @@ #include #include #include +#include #include "hard-interface.h" #include "log.h" #include "originator.h" -#include "packet.h" #include "send.h" static struct batadv_socket_client *batadv_socket_client_hash[256]; diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 8bee4279d579..d31c8266e244 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include "bat_algo.h" @@ -63,7 +64,6 @@ #include "netlink.h" #include "network-coding.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" #include "soft-interface.h" diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index d5484ac381d3..f7ba3f96d8f3 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -223,8 +223,8 @@ enum batadv_uev_type { #include #include #include +#include -#include "packet.h" #include "types.h" struct net_device; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 8a503c526b90..cbdeb47ec3f6 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -55,11 +55,11 @@ #include #include #include +#include #include "hard-interface.h" #include "hash.h" #include "log.h" -#include "packet.h" #include "translation-table.h" #include "tvlv.h" diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 103d4bdcdbdb..a823d3899bad 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include "bat_algo.h" @@ -47,7 +48,6 @@ #include "gateway_client.h" #include "hard-interface.h" #include "originator.h" -#include "packet.h" #include "soft-interface.h" #include "tp_meter.h" #include "translation-table.h" diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 3758be7fd881..b48116bb24ef 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -49,12 +49,12 @@ #include #include #include +#include #include "hard-interface.h" #include "hash.h" #include "log.h" #include "originator.h" -#include "packet.h" #include "routing.h" #include "send.h" #include "tvlv.h" diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h deleted file mode 100644 index 3b2d2db993aa..000000000000 --- a/net/batman-adv/packet.h +++ /dev/null @@ -1,644 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: - * - * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#ifndef _NET_BATMAN_ADV_PACKET_H_ -#define _NET_BATMAN_ADV_PACKET_H_ - -#include -#include -#include - -/** - * batadv_tp_is_error() - Check throughput meter return code for error - * @n: throughput meter return code - * - * Return: 0 when not error was detected, != 0 otherwise - */ -#define batadv_tp_is_error(n) ((__u8)(n) > 127 ? 1 : 0) - -/** - * enum batadv_packettype - types for batman-adv encapsulated packets - * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV - * @BATADV_BCAST: broadcast packets carrying broadcast payload - * @BATADV_CODED: network coded packets - * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V - * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V - * - * @BATADV_UNICAST: unicast packets carrying unicast payload traffic - * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original - * payload packet - * @BATADV_UNICAST_4ADDR: unicast packet including the originator address of - * the sender - * @BATADV_ICMP: unicast packet like IP ICMP used for ping or traceroute - * @BATADV_UNICAST_TVLV: unicast packet carrying TVLV containers - */ -enum batadv_packettype { - /* 0x00 - 0x3f: local packets or special rules for handling */ - BATADV_IV_OGM = 0x00, - BATADV_BCAST = 0x01, - BATADV_CODED = 0x02, - BATADV_ELP = 0x03, - BATADV_OGM2 = 0x04, - /* 0x40 - 0x7f: unicast */ -#define BATADV_UNICAST_MIN 0x40 - BATADV_UNICAST = 0x40, - BATADV_UNICAST_FRAG = 0x41, - BATADV_UNICAST_4ADDR = 0x42, - BATADV_ICMP = 0x43, - BATADV_UNICAST_TVLV = 0x44, -#define BATADV_UNICAST_MAX 0x7f - /* 0x80 - 0xff: reserved */ -}; - -/** - * enum batadv_subtype - packet subtype for unicast4addr - * @BATADV_P_DATA: user payload - * @BATADV_P_DAT_DHT_GET: DHT request message - * @BATADV_P_DAT_DHT_PUT: DHT store message - * @BATADV_P_DAT_CACHE_REPLY: ARP reply generated by DAT - */ -enum batadv_subtype { - BATADV_P_DATA = 0x01, - BATADV_P_DAT_DHT_GET = 0x02, - BATADV_P_DAT_DHT_PUT = 0x03, - BATADV_P_DAT_CACHE_REPLY = 0x04, -}; - -/* this file is included by batctl which needs these defines */ -#define BATADV_COMPAT_VERSION 15 - -/** - * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets - * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was - * previously received from someone else than the best neighbor. - * @BATADV_PRIMARIES_FIRST_HOP: flag unused. - * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a - * one hop neighbor on the interface where it was originally received. - */ -enum batadv_iv_flags { - BATADV_NOT_BEST_NEXT_HOP = 1UL << 0, - BATADV_PRIMARIES_FIRST_HOP = 1UL << 1, - BATADV_DIRECTLINK = 1UL << 2, -}; - -/** - * enum batadv_icmp_packettype - ICMP message types - * @BATADV_ECHO_REPLY: success reply to BATADV_ECHO_REQUEST - * @BATADV_DESTINATION_UNREACHABLE: failure when route to destination not found - * @BATADV_ECHO_REQUEST: request BATADV_ECHO_REPLY from destination - * @BATADV_TTL_EXCEEDED: error after BATADV_ECHO_REQUEST traversed too many hops - * @BATADV_PARAMETER_PROBLEM: return code for malformed messages - * @BATADV_TP: throughput meter packet - */ -enum batadv_icmp_packettype { - BATADV_ECHO_REPLY = 0, - BATADV_DESTINATION_UNREACHABLE = 3, - BATADV_ECHO_REQUEST = 8, - BATADV_TTL_EXCEEDED = 11, - BATADV_PARAMETER_PROBLEM = 12, - BATADV_TP = 15, -}; - -/** - * enum batadv_mcast_flags - flags for multicast capabilities and settings - * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for - * 224.0.0.0/24 or ff02::1 - * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets - * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets - */ -enum batadv_mcast_flags { - BATADV_MCAST_WANT_ALL_UNSNOOPABLES = 1UL << 0, - BATADV_MCAST_WANT_ALL_IPV4 = 1UL << 1, - BATADV_MCAST_WANT_ALL_IPV6 = 1UL << 2, -}; - -/* tt data subtypes */ -#define BATADV_TT_DATA_TYPE_MASK 0x0F - -/** - * enum batadv_tt_data_flags - flags for tt data tvlv - * @BATADV_TT_OGM_DIFF: TT diff propagated through OGM - * @BATADV_TT_REQUEST: TT request message - * @BATADV_TT_RESPONSE: TT response message - * @BATADV_TT_FULL_TABLE: contains full table to replace existing table - */ -enum batadv_tt_data_flags { - BATADV_TT_OGM_DIFF = 1UL << 0, - BATADV_TT_REQUEST = 1UL << 1, - BATADV_TT_RESPONSE = 1UL << 2, - BATADV_TT_FULL_TABLE = 1UL << 4, -}; - -/** - * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field - * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not - */ -enum batadv_vlan_flags { - BATADV_VLAN_HAS_TAG = 1UL << 15, -}; - -/** - * enum batadv_bla_claimframe - claim frame types for the bridge loop avoidance - * @BATADV_CLAIM_TYPE_CLAIM: claim of a client mac address - * @BATADV_CLAIM_TYPE_UNCLAIM: unclaim of a client mac address - * @BATADV_CLAIM_TYPE_ANNOUNCE: announcement of backbone with current crc - * @BATADV_CLAIM_TYPE_REQUEST: request of full claim table - * @BATADV_CLAIM_TYPE_LOOPDETECT: mesh-traversing loop detect packet - */ -enum batadv_bla_claimframe { - BATADV_CLAIM_TYPE_CLAIM = 0x00, - BATADV_CLAIM_TYPE_UNCLAIM = 0x01, - BATADV_CLAIM_TYPE_ANNOUNCE = 0x02, - BATADV_CLAIM_TYPE_REQUEST = 0x03, - BATADV_CLAIM_TYPE_LOOPDETECT = 0x04, -}; - -/** - * enum batadv_tvlv_type - tvlv type definitions - * @BATADV_TVLV_GW: gateway tvlv - * @BATADV_TVLV_DAT: distributed arp table tvlv - * @BATADV_TVLV_NC: network coding tvlv - * @BATADV_TVLV_TT: translation table tvlv - * @BATADV_TVLV_ROAM: roaming advertisement tvlv - * @BATADV_TVLV_MCAST: multicast capability tvlv - */ -enum batadv_tvlv_type { - BATADV_TVLV_GW = 0x01, - BATADV_TVLV_DAT = 0x02, - BATADV_TVLV_NC = 0x03, - BATADV_TVLV_TT = 0x04, - BATADV_TVLV_ROAM = 0x05, - BATADV_TVLV_MCAST = 0x06, -}; - -#pragma pack(2) -/* the destination hardware field in the ARP frame is used to - * transport the claim type and the group id - */ -struct batadv_bla_claim_dst { - __u8 magic[3]; /* FF:43:05 */ - __u8 type; /* bla_claimframe */ - __be16 group; /* group id */ -}; - -#pragma pack() - -/** - * struct batadv_ogm_packet - ogm (routing protocol) packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @flags: contains routing relevant flags - see enum batadv_iv_flags - * @seqno: sequence identification - * @orig: address of the source node - * @prev_sender: address of the previous sender - * @reserved: reserved byte for alignment - * @tq: transmission quality - * @tvlv_len: length of tvlv data following the ogm header - */ -struct batadv_ogm_packet { - __u8 packet_type; - __u8 version; - __u8 ttl; - __u8 flags; - __be32 seqno; - __u8 orig[ETH_ALEN]; - __u8 prev_sender[ETH_ALEN]; - __u8 reserved; - __u8 tq; - __be16 tvlv_len; - /* __packed is not needed as the struct size is divisible by 4, - * and the largest data type in this struct has a size of 4. - */ -}; - -#define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet) - -/** - * struct batadv_ogm2_packet - ogm2 (routing protocol) packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the general header - * @ttl: time to live for this packet, part of the general header - * @flags: reseved for routing relevant flags - currently always 0 - * @seqno: sequence number - * @orig: originator mac address - * @tvlv_len: length of the appended tvlv buffer (in bytes) - * @throughput: the currently flooded path throughput - */ -struct batadv_ogm2_packet { - __u8 packet_type; - __u8 version; - __u8 ttl; - __u8 flags; - __be32 seqno; - __u8 orig[ETH_ALEN]; - __be16 tvlv_len; - __be32 throughput; - /* __packed is not needed as the struct size is divisible by 4, - * and the largest data type in this struct has a size of 4. - */ -}; - -#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet) - -/** - * struct batadv_elp_packet - elp (neighbor discovery) packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @orig: originator mac address - * @seqno: sequence number - * @elp_interval: currently used ELP sending interval in ms - */ -struct batadv_elp_packet { - __u8 packet_type; - __u8 version; - __u8 orig[ETH_ALEN]; - __be32 seqno; - __be32 elp_interval; -}; - -#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet) - -/** - * struct batadv_icmp_header - common members among all the ICMP packets - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @msg_type: ICMP packet type - * @dst: address of the destination node - * @orig: address of the source node - * @uid: local ICMP socket identifier - * @align: not used - useful for alignment purposes only - * - * This structure is used for ICMP packets parsing only and it is never sent - * over the wire. The alignment field at the end is there to ensure that - * members are padded the same way as they are in real packets. - */ -struct batadv_icmp_header { - __u8 packet_type; - __u8 version; - __u8 ttl; - __u8 msg_type; /* see ICMP message types above */ - __u8 dst[ETH_ALEN]; - __u8 orig[ETH_ALEN]; - __u8 uid; - __u8 align[3]; -}; - -/** - * struct batadv_icmp_packet - ICMP packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @msg_type: ICMP packet type - * @dst: address of the destination node - * @orig: address of the source node - * @uid: local ICMP socket identifier - * @reserved: not used - useful for alignment - * @seqno: ICMP sequence number - */ -struct batadv_icmp_packet { - __u8 packet_type; - __u8 version; - __u8 ttl; - __u8 msg_type; /* see ICMP message types above */ - __u8 dst[ETH_ALEN]; - __u8 orig[ETH_ALEN]; - __u8 uid; - __u8 reserved; - __be16 seqno; -}; - -/** - * struct batadv_icmp_tp_packet - ICMP TP Meter packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @msg_type: ICMP packet type - * @dst: address of the destination node - * @orig: address of the source node - * @uid: local ICMP socket identifier - * @subtype: TP packet subtype (see batadv_icmp_tp_subtype) - * @session: TP session identifier - * @seqno: the TP sequence number - * @timestamp: time when the packet has been sent. This value is filled in a - * TP_MSG and echoed back in the next TP_ACK so that the sender can compute the - * RTT. Since it is read only by the host which wrote it, there is no need to - * store it using network order - */ -struct batadv_icmp_tp_packet { - __u8 packet_type; - __u8 version; - __u8 ttl; - __u8 msg_type; /* see ICMP message types above */ - __u8 dst[ETH_ALEN]; - __u8 orig[ETH_ALEN]; - __u8 uid; - __u8 subtype; - __u8 session[2]; - __be32 seqno; - __be32 timestamp; -}; - -/** - * enum batadv_icmp_tp_subtype - ICMP TP Meter packet subtypes - * @BATADV_TP_MSG: Msg from sender to receiver - * @BATADV_TP_ACK: acknowledgment from receiver to sender - */ -enum batadv_icmp_tp_subtype { - BATADV_TP_MSG = 0, - BATADV_TP_ACK, -}; - -#define BATADV_RR_LEN 16 - -/** - * struct batadv_icmp_packet_rr - ICMP RouteRecord packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @msg_type: ICMP packet type - * @dst: address of the destination node - * @orig: address of the source node - * @uid: local ICMP socket identifier - * @rr_cur: number of entries the rr array - * @seqno: ICMP sequence number - * @rr: route record array - */ -struct batadv_icmp_packet_rr { - __u8 packet_type; - __u8 version; - __u8 ttl; - __u8 msg_type; /* see ICMP message types above */ - __u8 dst[ETH_ALEN]; - __u8 orig[ETH_ALEN]; - __u8 uid; - __u8 rr_cur; - __be16 seqno; - __u8 rr[BATADV_RR_LEN][ETH_ALEN]; -}; - -#define BATADV_ICMP_MAX_PACKET_SIZE sizeof(struct batadv_icmp_packet_rr) - -/* All packet headers in front of an ethernet header have to be completely - * divisible by 2 but not by 4 to make the payload after the ethernet - * header again 4 bytes boundary aligned. - * - * A packing of 2 is necessary to avoid extra padding at the end of the struct - * caused by a structure member which is larger than two bytes. Otherwise - * the structure would not fulfill the previously mentioned rule to avoid the - * misalignment of the payload after the ethernet header. It may also lead to - * leakage of information when the padding it not initialized before sending. - */ -#pragma pack(2) - -/** - * struct batadv_unicast_packet - unicast packet for network payload - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @ttvn: translation table version number - * @dest: originator destination of the unicast packet - */ -struct batadv_unicast_packet { - __u8 packet_type; - __u8 version; - __u8 ttl; - __u8 ttvn; /* destination translation table version number */ - __u8 dest[ETH_ALEN]; - /* "4 bytes boundary + 2 bytes" long to make the payload after the - * following ethernet header again 4 bytes boundary aligned - */ -}; - -/** - * struct batadv_unicast_4addr_packet - extended unicast packet - * @u: common unicast packet header - * @src: address of the source - * @subtype: packet subtype - * @reserved: reserved byte for alignment - */ -struct batadv_unicast_4addr_packet { - struct batadv_unicast_packet u; - __u8 src[ETH_ALEN]; - __u8 subtype; - __u8 reserved; - /* "4 bytes boundary + 2 bytes" long to make the payload after the - * following ethernet header again 4 bytes boundary aligned - */ -}; - -/** - * struct batadv_frag_packet - fragmented packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @dest: final destination used when routing fragments - * @orig: originator of the fragment used when merging the packet - * @no: fragment number within this sequence - * @priority: priority of frame, from ToS IP precedence or 802.1p - * @reserved: reserved byte for alignment - * @seqno: sequence identification - * @total_size: size of the merged packet - */ -struct batadv_frag_packet { - __u8 packet_type; - __u8 version; /* batman version field */ - __u8 ttl; -#if defined(__BIG_ENDIAN_BITFIELD) - __u8 no:4; - __u8 priority:3; - __u8 reserved:1; -#elif defined(__LITTLE_ENDIAN_BITFIELD) - __u8 reserved:1; - __u8 priority:3; - __u8 no:4; -#else -#error "unknown bitfield endianness" -#endif - __u8 dest[ETH_ALEN]; - __u8 orig[ETH_ALEN]; - __be16 seqno; - __be16 total_size; -}; - -/** - * struct batadv_bcast_packet - broadcast packet for network payload - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @reserved: reserved byte for alignment - * @seqno: sequence identification - * @orig: originator of the broadcast packet - */ -struct batadv_bcast_packet { - __u8 packet_type; - __u8 version; /* batman version field */ - __u8 ttl; - __u8 reserved; - __be32 seqno; - __u8 orig[ETH_ALEN]; - /* "4 bytes boundary + 2 bytes" long to make the payload after the - * following ethernet header again 4 bytes boundary aligned - */ -}; - -/** - * struct batadv_coded_packet - network coded packet - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @first_source: original source of first included packet - * @first_orig_dest: original destinal of first included packet - * @first_crc: checksum of first included packet - * @first_ttvn: tt-version number of first included packet - * @second_ttl: ttl of second packet - * @second_dest: second receiver of this coded packet - * @second_source: original source of second included packet - * @second_orig_dest: original destination of second included packet - * @second_crc: checksum of second included packet - * @second_ttvn: tt version number of second included packet - * @coded_len: length of network coded part of the payload - */ -struct batadv_coded_packet { - __u8 packet_type; - __u8 version; /* batman version field */ - __u8 ttl; - __u8 first_ttvn; - /* __u8 first_dest[ETH_ALEN]; - saved in mac header destination */ - __u8 first_source[ETH_ALEN]; - __u8 first_orig_dest[ETH_ALEN]; - __be32 first_crc; - __u8 second_ttl; - __u8 second_ttvn; - __u8 second_dest[ETH_ALEN]; - __u8 second_source[ETH_ALEN]; - __u8 second_orig_dest[ETH_ALEN]; - __be32 second_crc; - __be16 coded_len; -}; - -#pragma pack() - -/** - * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload - * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header - * @reserved: reserved field (for packet alignment) - * @src: address of the source - * @dst: address of the destination - * @tvlv_len: length of tvlv data following the unicast tvlv header - * @align: 2 bytes to align the header to a 4 byte boundary - */ -struct batadv_unicast_tvlv_packet { - __u8 packet_type; - __u8 version; /* batman version field */ - __u8 ttl; - __u8 reserved; - __u8 dst[ETH_ALEN]; - __u8 src[ETH_ALEN]; - __be16 tvlv_len; - __u16 align; -}; - -/** - * struct batadv_tvlv_hdr - base tvlv header struct - * @type: tvlv container type (see batadv_tvlv_type) - * @version: tvlv container version - * @len: tvlv container length - */ -struct batadv_tvlv_hdr { - __u8 type; - __u8 version; - __be16 len; -}; - -/** - * struct batadv_tvlv_gateway_data - gateway data propagated through gw tvlv - * container - * @bandwidth_down: advertised uplink download bandwidth - * @bandwidth_up: advertised uplink upload bandwidth - */ -struct batadv_tvlv_gateway_data { - __be32 bandwidth_down; - __be32 bandwidth_up; -}; - -/** - * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container - * @flags: translation table flags (see batadv_tt_data_flags) - * @ttvn: translation table version number - * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by - * one batadv_tvlv_tt_vlan_data object per announced vlan - */ -struct batadv_tvlv_tt_data { - __u8 flags; - __u8 ttvn; - __be16 num_vlan; -}; - -/** - * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through - * the tt tvlv container - * @crc: crc32 checksum of the entries belonging to this vlan - * @vid: vlan identifier - * @reserved: unused, useful for alignment purposes - */ -struct batadv_tvlv_tt_vlan_data { - __be32 crc; - __be16 vid; - __u16 reserved; -}; - -/** - * struct batadv_tvlv_tt_change - translation table diff data - * @flags: status indicators concerning the non-mesh client (see - * batadv_tt_client_flags) - * @reserved: reserved field - useful for alignment purposes only - * @addr: mac address of non-mesh client that triggered this tt change - * @vid: VLAN identifier - */ -struct batadv_tvlv_tt_change { - __u8 flags; - __u8 reserved[3]; - __u8 addr[ETH_ALEN]; - __be16 vid; -}; - -/** - * struct batadv_tvlv_roam_adv - roaming advertisement - * @client: mac address of roaming client - * @vid: VLAN identifier - */ -struct batadv_tvlv_roam_adv { - __u8 client[ETH_ALEN]; - __be16 vid; -}; - -/** - * struct batadv_tvlv_mcast_data - payload of a multicast tvlv - * @flags: multicast flags announced by the orig node - * @reserved: reserved field - */ -struct batadv_tvlv_mcast_data { - __u8 flags; - __u8 reserved[3]; -}; - -#endif /* _NET_BATMAN_ADV_PACKET_H_ */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index eb835bde502a..b6891e8b741c 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "bitarray.h" #include "bridge_loop_avoidance.h" @@ -44,7 +45,6 @@ #include "log.h" #include "network-coding.h" #include "originator.h" -#include "packet.h" #include "send.h" #include "soft-interface.h" #include "tp_meter.h" diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 8c7399dd06ca..1e8c79093623 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -24,8 +24,7 @@ #include #include #include - -#include "packet.h" +#include struct sk_buff; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 1eb5555c5fe4..900c5ce21cd4 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "bat_algo.h" #include "bridge_loop_avoidance.h" @@ -60,7 +61,6 @@ #include "multicast.h" #include "network-coding.h" #include "originator.h" -#include "packet.h" #include "send.h" #include "sysfs.h" #include "translation-table.h" diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 56fb42551453..c1578fa0b952 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" @@ -47,7 +48,6 @@ #include "hard-interface.h" #include "log.h" #include "network-coding.h" -#include "packet.h" #include "soft-interface.h" static struct net_device *batadv_kobj_to_netdev(struct kobject *obj) diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 7dcf2aa4deb5..8b576712d0c1 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -49,13 +49,13 @@ #include #include #include +#include #include #include "hard-interface.h" #include "log.h" #include "netlink.h" #include "originator.h" -#include "packet.h" #include "send.h" /** diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 0e53be3f8df0..7550a9ccd695 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "bridge_loop_avoidance.h" @@ -60,7 +61,6 @@ #include "log.h" #include "netlink.h" #include "originator.h" -#include "packet.h" #include "soft-interface.h" #include "tvlv.h" diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index e189f026974c..5ffcb45ac6ff 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -36,9 +36,9 @@ #include #include #include +#include #include "originator.h" -#include "packet.h" #include "send.h" #include "tvlv.h" diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 77b145eba193..bb1578410e0c 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -35,10 +35,9 @@ #include #include #include +#include #include -#include "packet.h" - struct seq_file; #ifdef CONFIG_BATMAN_ADV_DAT -- cgit v1.2.3-59-g8ed1b From f15bc54eeecd86dfba3885aab839cd1f45172a38 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 22 Dec 2017 15:10:18 +0100 Subject: l2tp: add peer_offset parameter Introduce peer_offset parameter in order to add the capability to specify two different values for payload offset on tx/rx side. If just offset is provided by userspace use it for rx side as well in order to maintain compatibility with older l2tp versions Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 1 + net/l2tp/l2tp_core.c | 3 ++- net/l2tp/l2tp_core.h | 13 ++++++++++--- net/l2tp/l2tp_debugfs.c | 8 +++++--- net/l2tp/l2tp_netlink.c | 21 ++++++++++++++++++++- 5 files changed, 38 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index d84ce5c1c9aa..d6fee55dbded 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -127,6 +127,7 @@ enum { L2TP_ATTR_UDP_ZERO_CSUM6_TX, /* flag */ L2TP_ATTR_UDP_ZERO_CSUM6_RX, /* flag */ L2TP_ATTR_PAD, + L2TP_ATTR_PEER_OFFSET, /* u16 */ __L2TP_ATTR_MAX, }; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 115918ad8eca..6ff64717da1e 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -792,7 +792,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, ptr += 2 + offset; } } else - ptr += session->offset; + ptr += session->peer_offset; offset = ptr - optr; if (!pskb_may_pull(skb, offset)) @@ -1785,6 +1785,7 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn session->lns_mode = cfg->lns_mode; session->reorder_timeout = cfg->reorder_timeout; session->offset = cfg->offset; + session->peer_offset = cfg->peer_offset; session->l2specific_type = cfg->l2specific_type; session->l2specific_len = cfg->l2specific_len; session->cookie_len = cfg->cookie_len; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 9534e16965cc..c6fe7cc42a05 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -59,7 +59,8 @@ struct l2tp_session_cfg { int debug; /* bitmask of debug message * categories */ u16 vlan_id; /* VLAN pseudowire only */ - u16 offset; /* offset to payload */ + u16 offset; /* offset to tx payload */ + u16 peer_offset; /* offset to rx payload */ u16 l2specific_len; /* Layer 2 specific length */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ @@ -86,8 +87,14 @@ struct l2tp_session { int cookie_len; u8 peer_cookie[8]; int peer_cookie_len; - u16 offset; /* offset from end of L2TP header - to beginning of data */ + u16 offset; /* offset from end of L2TP + * header to beginning of + * tx data + */ + u16 peer_offset; /* offset from end of L2TP + * header to beginning of + * rx data + */ u16 l2specific_len; u16 l2specific_type; u16 hdr_len; diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index eb69411bcb47..4cc30b38aba4 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -180,8 +180,9 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) session->lns_mode ? "LNS" : "LAC", session->debug, jiffies_to_msecs(session->reorder_timeout)); - seq_printf(m, " offset %hu l2specific %hu/%hu\n", - session->offset, session->l2specific_type, session->l2specific_len); + seq_printf(m, " offset %hu peer_offset %hu l2specific %hu/%hu\n", + session->offset, session->peer_offset, + session->l2specific_type, session->l2specific_len); if (session->cookie_len) { seq_printf(m, " cookie %02x%02x%02x%02x", session->cookie[0], session->cookie[1], @@ -228,7 +229,8 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v) seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); seq_puts(m, " SESSION ID, peer ID, PWTYPE\n"); seq_puts(m, " refcnt cnt\n"); - seq_puts(m, " offset OFFSET l2specific TYPE/LEN\n"); + seq_puts(m, " offset OFFSET peer_offset OFFSET"); + seq_puts(m, " l2specific TYPE/LEN\n"); seq_puts(m, " [ cookie ]\n"); seq_puts(m, " [ peer cookie ]\n"); seq_puts(m, " config mtu/mru/rcvseq/sendseq/dataseq/lns debug reorderto\n"); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 7e9c50125556..d7d4d7a7a54d 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -547,9 +547,25 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf } if (tunnel->version > 2) { - if (info->attrs[L2TP_ATTR_OFFSET]) + if (info->attrs[L2TP_ATTR_PEER_OFFSET]) { + struct nlattr *peer_offset; + + peer_offset = info->attrs[L2TP_ATTR_PEER_OFFSET]; + cfg.peer_offset = nla_get_u16(peer_offset); + } + + if (info->attrs[L2TP_ATTR_OFFSET]) { cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]); + /* in order to maintain compatibility with older + * versions where offset was used for both tx and + * rx side, update rx side with offset if peer_offset + * is not provided by userspace + */ + if (!info->attrs[L2TP_ATTR_PEER_OFFSET]) + cfg.peer_offset = cfg.offset; + } + if (info->attrs[L2TP_ATTR_DATA_SEQ]) cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]); @@ -763,6 +779,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) || (session->offset && nla_put_u16(skb, L2TP_ATTR_OFFSET, session->offset)) || + (session->peer_offset && + nla_put_u16(skb, L2TP_ATTR_PEER_OFFSET, session->peer_offset)) || (session->cookie_len && nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len, &session->cookie[0])) || @@ -903,6 +921,7 @@ static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = { [L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_OFFSET] = { .type = NLA_U16, }, + [L2TP_ATTR_PEER_OFFSET] = { .type = NLA_U16, }, [L2TP_ATTR_DATA_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_L2SPEC_TYPE] = { .type = NLA_U8, }, [L2TP_ATTR_L2SPEC_LEN] = { .type = NLA_U8, }, -- cgit v1.2.3-59-g8ed1b From 675fc275a3a2d905535207237402c6d8dcb5fa4b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:09 -0800 Subject: bpf: offload: report device information for offloaded programs Report to the user ifindex and namespace information of offloaded programs. If device has disappeared return -ENODEV. Specify the namespace using dev/inode combination. CC: Eric W. Biederman Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 3 +++ kernel/bpf/offload.c | 59 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 +++++ tools/include/uapi/linux/bpf.h | 3 +++ 5 files changed, 73 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9a916ab34299..7810ae57b357 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -531,6 +531,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69eabfcb9bdb..f2f8b36e2ad4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -921,6 +921,9 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index e4f1668a021c..040d4e0edf3f 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -16,9 +16,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -176,6 +178,63 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +struct ns_get_path_bpf_prog_args { + struct bpf_prog *prog; + struct bpf_prog_info *info; +}; + +static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_prog_args *args = private_data; + struct bpf_prog_aux *aux = args->prog->aux; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (aux->offload) { + args->info->ifindex = aux->offload->netdev->ifindex; + net = dev_net(aux->offload->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog) +{ + struct ns_get_path_bpf_prog_args args = { + .prog = prog, + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e02dafa6f402..ebf0fb23e237 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1707,6 +1707,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + err = bpf_prog_offload_info_fill(&info, prog); + if (err) + return err; + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index db1b0923a308..4e8c60acfa32 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -921,6 +921,9 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3-59-g8ed1b From bbb6189df4077cde8592cd2f804bb1122067dd32 Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Wed, 27 Dec 2017 18:27:58 +0100 Subject: inet_diag: Add equal-operator for ports inet_diag currently provides less/greater than or equal operators for comparing ports when filtering sockets. An equal comparison can be performed by combining the two existing operators, or a user can for example request a port range and then do the final filtering in userspace. However, these approaches both have drawbacks. Implementing equal using LE/GE causes the size and complexity of a filter to grow quickly as the number of ports increase, while it on busy machines would be great if the kernel only returns information about relevant sockets. This patch introduces source and destination port equal operators. INET_DIAG_BC_S_EQ is used to match a source port, INET_DIAG_BC_D_EQ a destination port, and usage is the same as for the existing port operators. I.e., the port to match is stored in the no-member of the next inet_diag_bc_op-struct in the filter. Signed-off-by: Kristian Evensen Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 2 ++ net/ipv4/inet_diag.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 817d807e9481..14565d703291 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -92,6 +92,8 @@ enum { INET_DIAG_BC_D_COND, INET_DIAG_BC_DEV_COND, /* u32 ifindex */ INET_DIAG_BC_MARK_COND, + INET_DIAG_BC_S_EQ, + INET_DIAG_BC_D_EQ, }; struct inet_diag_hostcond { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c9c35b61a027..a383f299ce24 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -564,12 +564,18 @@ static int inet_diag_bc_run(const struct nlattr *_bc, case INET_DIAG_BC_JMP: yes = 0; break; + case INET_DIAG_BC_S_EQ: + yes = entry->sport == op[1].no; + break; case INET_DIAG_BC_S_GE: yes = entry->sport >= op[1].no; break; case INET_DIAG_BC_S_LE: yes = entry->sport <= op[1].no; break; + case INET_DIAG_BC_D_EQ: + yes = entry->dport == op[1].no; + break; case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; break; @@ -802,8 +808,10 @@ static int inet_diag_bc_audit(const struct nlattr *attr, if (!valid_devcond(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_S_EQ: case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: + case INET_DIAG_BC_D_EQ: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: if (!valid_port_comparison(bc, len, &min_len)) -- cgit v1.2.3-59-g8ed1b From 863def15b9755d9016df4d93addf3127f1dc67f4 Mon Sep 17 00:00:00 2001 From: James Chapman Date: Wed, 3 Jan 2018 22:48:04 +0000 Subject: l2tp: revert "l2tp: add peer_offset parameter" Revert commit f15bc54eeecd ("l2tp: add peer_offset parameter"). This is removed because it is adding another configurable offset and configurable offsets are being removed. Signed-off-by: James Chapman Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 1 - net/l2tp/l2tp_core.c | 3 +-- net/l2tp/l2tp_core.h | 13 +++---------- net/l2tp/l2tp_debugfs.c | 8 +++----- net/l2tp/l2tp_netlink.c | 21 +-------------------- 5 files changed, 8 insertions(+), 38 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index d6fee55dbded..d84ce5c1c9aa 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -127,7 +127,6 @@ enum { L2TP_ATTR_UDP_ZERO_CSUM6_TX, /* flag */ L2TP_ATTR_UDP_ZERO_CSUM6_RX, /* flag */ L2TP_ATTR_PAD, - L2TP_ATTR_PEER_OFFSET, /* u16 */ __L2TP_ATTR_MAX, }; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 6ff64717da1e..115918ad8eca 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -792,7 +792,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, ptr += 2 + offset; } } else - ptr += session->peer_offset; + ptr += session->offset; offset = ptr - optr; if (!pskb_may_pull(skb, offset)) @@ -1785,7 +1785,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn session->lns_mode = cfg->lns_mode; session->reorder_timeout = cfg->reorder_timeout; session->offset = cfg->offset; - session->peer_offset = cfg->peer_offset; session->l2specific_type = cfg->l2specific_type; session->l2specific_len = cfg->l2specific_len; session->cookie_len = cfg->cookie_len; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index c6fe7cc42a05..9534e16965cc 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -59,8 +59,7 @@ struct l2tp_session_cfg { int debug; /* bitmask of debug message * categories */ u16 vlan_id; /* VLAN pseudowire only */ - u16 offset; /* offset to tx payload */ - u16 peer_offset; /* offset to rx payload */ + u16 offset; /* offset to payload */ u16 l2specific_len; /* Layer 2 specific length */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ @@ -87,14 +86,8 @@ struct l2tp_session { int cookie_len; u8 peer_cookie[8]; int peer_cookie_len; - u16 offset; /* offset from end of L2TP - * header to beginning of - * tx data - */ - u16 peer_offset; /* offset from end of L2TP - * header to beginning of - * rx data - */ + u16 offset; /* offset from end of L2TP header + to beginning of data */ u16 l2specific_len; u16 l2specific_type; u16 hdr_len; diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 4cc30b38aba4..eb69411bcb47 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -180,9 +180,8 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) session->lns_mode ? "LNS" : "LAC", session->debug, jiffies_to_msecs(session->reorder_timeout)); - seq_printf(m, " offset %hu peer_offset %hu l2specific %hu/%hu\n", - session->offset, session->peer_offset, - session->l2specific_type, session->l2specific_len); + seq_printf(m, " offset %hu l2specific %hu/%hu\n", + session->offset, session->l2specific_type, session->l2specific_len); if (session->cookie_len) { seq_printf(m, " cookie %02x%02x%02x%02x", session->cookie[0], session->cookie[1], @@ -229,8 +228,7 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v) seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); seq_puts(m, " SESSION ID, peer ID, PWTYPE\n"); seq_puts(m, " refcnt cnt\n"); - seq_puts(m, " offset OFFSET peer_offset OFFSET"); - seq_puts(m, " l2specific TYPE/LEN\n"); + seq_puts(m, " offset OFFSET l2specific TYPE/LEN\n"); seq_puts(m, " [ cookie ]\n"); seq_puts(m, " [ peer cookie ]\n"); seq_puts(m, " config mtu/mru/rcvseq/sendseq/dataseq/lns debug reorderto\n"); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index d7d4d7a7a54d..7e9c50125556 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -547,25 +547,9 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf } if (tunnel->version > 2) { - if (info->attrs[L2TP_ATTR_PEER_OFFSET]) { - struct nlattr *peer_offset; - - peer_offset = info->attrs[L2TP_ATTR_PEER_OFFSET]; - cfg.peer_offset = nla_get_u16(peer_offset); - } - - if (info->attrs[L2TP_ATTR_OFFSET]) { + if (info->attrs[L2TP_ATTR_OFFSET]) cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]); - /* in order to maintain compatibility with older - * versions where offset was used for both tx and - * rx side, update rx side with offset if peer_offset - * is not provided by userspace - */ - if (!info->attrs[L2TP_ATTR_PEER_OFFSET]) - cfg.peer_offset = cfg.offset; - } - if (info->attrs[L2TP_ATTR_DATA_SEQ]) cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]); @@ -779,8 +763,6 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) || (session->offset && nla_put_u16(skb, L2TP_ATTR_OFFSET, session->offset)) || - (session->peer_offset && - nla_put_u16(skb, L2TP_ATTR_PEER_OFFSET, session->peer_offset)) || (session->cookie_len && nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len, &session->cookie[0])) || @@ -921,7 +903,6 @@ static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = { [L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_OFFSET] = { .type = NLA_U16, }, - [L2TP_ATTR_PEER_OFFSET] = { .type = NLA_U16, }, [L2TP_ATTR_DATA_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_L2SPEC_TYPE] = { .type = NLA_U8, }, [L2TP_ATTR_L2SPEC_LEN] = { .type = NLA_U8, }, -- cgit v1.2.3-59-g8ed1b From 4887d8933a8dfdfa6602e0faaa0e31cd343ccefe Mon Sep 17 00:00:00 2001 From: James Chapman Date: Wed, 3 Jan 2018 22:48:07 +0000 Subject: l2tp: add comment in API header that L2TP_ATTR_OFFSET is not used Signed-off-by: James Chapman Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index d84ce5c1c9aa..f78eef4cc56a 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -94,7 +94,7 @@ enum { L2TP_ATTR_NONE, /* no data */ L2TP_ATTR_PW_TYPE, /* u16, enum l2tp_pwtype */ L2TP_ATTR_ENCAP_TYPE, /* u16, enum l2tp_encap_type */ - L2TP_ATTR_OFFSET, /* u16 */ + L2TP_ATTR_OFFSET, /* u16 (not used) */ L2TP_ATTR_DATA_SEQ, /* u16 */ L2TP_ATTR_L2SPEC_TYPE, /* u8, enum l2tp_l2spec_type */ L2TP_ATTR_L2SPEC_LEN, /* u8, enum l2tp_l2spec_type */ -- cgit v1.2.3-59-g8ed1b From 02dd3291b2f095bbc88e1d2628fd5bf2e92de69b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:26:14 +0100 Subject: bpf: finally expose xdp_rxq_info to XDP bpf-programs Now all XDP driver have been updated to setup xdp_rxq_info and assign this to xdp_buff->rxq. Thus, it is now safe to enable access to some of the xdp_rxq_info struct members. This patch extend xdp_md and expose UAPI to userspace for ingress_ifindex and rx_queue_index. Access happens via bpf instruction rewrite, that load data directly from struct xdp_rxq_info. * ingress_ifindex map to xdp_rxq_info->dev->ifindex * rx_queue_index map to xdp_rxq_info->queue_index Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 +++ net/core/filter.c | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f2f8b36e2ad4..405317f9c064 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -899,6 +899,9 @@ struct xdp_md { __u32 data; __u32 data_end; __u32 data_meta; + /* Below access go though struct xdp_rxq_info */ + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 rx_queue_index; /* rxq->queue_index */ }; enum sk_action { diff --git a/net/core/filter.c b/net/core/filter.c index 130b842c3a15..acdb94c0e97f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4304,6 +4304,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; + case offsetof(struct xdp_md, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct net_device, + ifindex, 4, target_size)); + break; + case offsetof(struct xdp_md, rx_queue_index): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct xdp_rxq_info, + queue_index, 4, target_size)); + break; } return insn - insn_buf; -- cgit v1.2.3-59-g8ed1b From e58f33cc84bc089c430ac955f3cad6380ae98591 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 Dec 2017 16:28:23 +0100 Subject: netfilter: add defines for arp/decnet max hooks The kernel already has defines for this, but they are in uapi exposed headers. Including these from netns.h causes build errors and also adds unneeded dependencies on heads that we don't need. So move these defines to netfilter_defs.h and place the uapi ones in ifndef __KERNEL__ to keep them for userspace. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_defs.h | 6 ++++++ include/uapi/linux/netfilter_arp.h | 3 +++ include/uapi/linux/netfilter_decnet.h | 4 +++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter_defs.h b/include/linux/netfilter_defs.h index dc6111adea06..fdcdf2bf34df 100644 --- a/include/linux/netfilter_defs.h +++ b/include/linux/netfilter_defs.h @@ -7,4 +7,10 @@ /* Largest hook number + 1, see uapi/linux/netfilter_decnet.h */ #define NF_MAX_HOOKS 8 +/* in/out/forward only */ +#define NF_ARP_NUMHOOKS 3 + +/* max hook is NF_DN_ROUTE (6), also see uapi/linux/netfilter_decnet.h */ +#define NF_DN_NUMHOOKS 7 + #endif diff --git a/include/uapi/linux/netfilter_arp.h b/include/uapi/linux/netfilter_arp.h index 81b6a4cbcb72..791dfc5ae907 100644 --- a/include/uapi/linux/netfilter_arp.h +++ b/include/uapi/linux/netfilter_arp.h @@ -15,6 +15,9 @@ #define NF_ARP_IN 0 #define NF_ARP_OUT 1 #define NF_ARP_FORWARD 2 + +#ifndef __KERNEL__ #define NF_ARP_NUMHOOKS 3 +#endif #endif /* __LINUX_ARP_NETFILTER_H */ diff --git a/include/uapi/linux/netfilter_decnet.h b/include/uapi/linux/netfilter_decnet.h index 9089c38f6abe..61f1c7dfd033 100644 --- a/include/uapi/linux/netfilter_decnet.h +++ b/include/uapi/linux/netfilter_decnet.h @@ -24,6 +24,9 @@ #define NFC_DN_IF_IN 0x0004 /* Output device. */ #define NFC_DN_IF_OUT 0x0008 + +/* kernel define is in netfilter_defs.h */ +#define NF_DN_NUMHOOKS 7 #endif /* ! __KERNEL__ */ /* DECnet Hooks */ @@ -41,7 +44,6 @@ #define NF_DN_HELLO 5 /* Input Routing Packets */ #define NF_DN_ROUTE 6 -#define NF_DN_NUMHOOKS 7 enum nf_dn_hook_priorities { NF_DN_PRI_FIRST = INT_MIN, -- cgit v1.2.3-59-g8ed1b From 625c556118f3c2fd28bb8ef6da18c53bd4037be4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 9 Dec 2017 21:01:08 +0100 Subject: netfilter: connlimit: split xt_connlimit into front and backend This allows to reuse xt_connlimit infrastructure from nf_tables. The upcoming nf_tables frontend can just pass in an nftables register as input key, this allows limiting by any nft-supported key, including concatenations. For xt_connlimit, pass in the zone and the ip/ipv6 address. With help from Yi-Hung Wei. Signed-off-by: Florian Westphal Acked-by: Yi-Hung Wei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 17 ++ include/uapi/linux/netfilter/xt_connlimit.h | 2 +- net/netfilter/Kconfig | 3 + net/netfilter/Makefile | 2 + net/netfilter/nf_conncount.c | 373 ++++++++++++++++++++++++++++ net/netfilter/xt_connlimit.c | 369 ++------------------------- 6 files changed, 420 insertions(+), 346 deletions(-) create mode 100644 include/net/netfilter/nf_conntrack_count.h create mode 100644 net/netfilter/nf_conncount.c (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h new file mode 100644 index 000000000000..adf8db44cf86 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_count.h @@ -0,0 +1,17 @@ +#ifndef _NF_CONNTRACK_COUNT_H +#define _NF_CONNTRACK_COUNT_H + +struct nf_conncount_data; + +struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, + unsigned int keylen); +void nf_conncount_destroy(struct net *net, unsigned int family, + struct nf_conncount_data *data); + +unsigned int nf_conncount_count(struct net *net, + struct nf_conncount_data *data, + const u32 *key, + unsigned int family, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); +#endif diff --git a/include/uapi/linux/netfilter/xt_connlimit.h b/include/uapi/linux/netfilter/xt_connlimit.h index 07e5e9d47882..d4d1943dcd11 100644 --- a/include/uapi/linux/netfilter/xt_connlimit.h +++ b/include/uapi/linux/netfilter/xt_connlimit.h @@ -27,7 +27,7 @@ struct xt_connlimit_info { __u32 flags; /* Used internally by the kernel */ - struct xt_connlimit_data *data __attribute__((aligned(8))); + struct nf_conncount_data *data __attribute__((aligned(8))); }; #endif /* _XT_CONNLIMIT_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 263609a7e010..af3d9f721b3f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -68,6 +68,8 @@ config NF_LOG_NETDEV select NF_LOG_COMMON if NF_CONNTRACK +config NETFILTER_CONNCOUNT + tristate config NF_CONNTRACK_MARK bool 'Connection mark tracking support' @@ -1126,6 +1128,7 @@ config NETFILTER_XT_MATCH_CONNLIMIT tristate '"connlimit" match support' depends on NF_CONNTRACK depends on NETFILTER_ADVANCED + select NETFILTER_CONNCOUNT ---help--- This match allows you to match against the number of parallel connections to a server per client IP address (or address block). diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index f78ed2470831..490a55e7166d 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -67,6 +67,8 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o # SYNPROXY obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o +obj-$(CONFIG_NETFILTER_CONNCOUNT) += nf_conncount.o + # generic packet duplication from netdev family obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c new file mode 100644 index 000000000000..a95518261168 --- /dev/null +++ b/net/netfilter/nf_conncount.c @@ -0,0 +1,373 @@ +/* + * count the number of connections matching an arbitrary key. + * + * (C) 2017 Red Hat GmbH + * Author: Florian Westphal + * + * split from xt_connlimit.c: + * (c) 2000 Gerd Knorr + * Nov 2002: Martin Bene : + * only ignore TIME_WAIT or gone connections + * (C) CC Computer Consultants GmbH, 2007 + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CONNCOUNT_SLOTS 256U + +#ifdef CONFIG_LOCKDEP +#define CONNCOUNT_LOCK_SLOTS 8U +#else +#define CONNCOUNT_LOCK_SLOTS 256U +#endif + +#define CONNCOUNT_GC_MAX_NODES 8 +#define MAX_KEYLEN 5 + +/* we will save the tuples of all connections we care about */ +struct nf_conncount_tuple { + struct hlist_node node; + struct nf_conntrack_tuple tuple; +}; + +struct nf_conncount_rb { + struct rb_node node; + struct hlist_head hhead; /* connections/hosts in same subnet */ + u32 key[MAX_KEYLEN]; +}; + +static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; + +struct nf_conncount_data { + unsigned int keylen; + struct rb_root root[CONNCOUNT_SLOTS]; +}; + +static u_int32_t conncount_rnd __read_mostly; +static struct kmem_cache *conncount_rb_cachep __read_mostly; +static struct kmem_cache *conncount_conn_cachep __read_mostly; + +static inline bool already_closed(const struct nf_conn *conn) +{ + if (nf_ct_protonum(conn) == IPPROTO_TCP) + return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || + conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; + else + return 0; +} + +static int key_diff(const u32 *a, const u32 *b, unsigned int klen) +{ + return memcmp(a, b, klen * sizeof(u32)); +} + +static bool add_hlist(struct hlist_head *head, + const struct nf_conntrack_tuple *tuple) +{ + struct nf_conncount_tuple *conn; + + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) + return false; + conn->tuple = *tuple; + hlist_add_head(&conn->node, head); + return true; +} + +static unsigned int check_hlist(struct net *net, + struct hlist_head *head, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit) +{ + const struct nf_conntrack_tuple_hash *found; + struct nf_conncount_tuple *conn; + struct hlist_node *n; + struct nf_conn *found_ct; + unsigned int length = 0; + + *addit = true; + + /* check the saved connections */ + hlist_for_each_entry_safe(conn, n, head, node) { + found = nf_conntrack_find_get(net, zone, &conn->tuple); + if (found == NULL) { + hlist_del(&conn->node); + kmem_cache_free(conncount_conn_cachep, conn); + continue; + } + + found_ct = nf_ct_tuplehash_to_ctrack(found); + + if (nf_ct_tuple_equal(&conn->tuple, tuple)) { + /* + * Just to be sure we have it only once in the list. + * We should not see tuples twice unless someone hooks + * this into a table without "-p tcp --syn". + */ + *addit = false; + } else if (already_closed(found_ct)) { + /* + * we do not care about connections which are + * closed already -> ditch it + */ + nf_ct_put(found_ct); + hlist_del(&conn->node); + kmem_cache_free(conncount_conn_cachep, conn); + continue; + } + + nf_ct_put(found_ct); + length++; + } + + return length; +} + +static void tree_nodes_free(struct rb_root *root, + struct nf_conncount_rb *gc_nodes[], + unsigned int gc_count) +{ + struct nf_conncount_rb *rbconn; + + while (gc_count) { + rbconn = gc_nodes[--gc_count]; + rb_erase(&rbconn->node, root); + kmem_cache_free(conncount_rb_cachep, rbconn); + } +} + +static unsigned int +count_tree(struct net *net, struct rb_root *root, + const u32 *key, u8 keylen, + u8 family, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) +{ + struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + struct rb_node **rbnode, *parent; + struct nf_conncount_rb *rbconn; + struct nf_conncount_tuple *conn; + unsigned int gc_count; + bool no_gc = false; + + restart: + gc_count = 0; + parent = NULL; + rbnode = &(root->rb_node); + while (*rbnode) { + int diff; + bool addit; + + rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); + + parent = *rbnode; + diff = key_diff(key, rbconn->key, keylen); + if (diff < 0) { + rbnode = &((*rbnode)->rb_left); + } else if (diff > 0) { + rbnode = &((*rbnode)->rb_right); + } else { + /* same source network -> be counted! */ + unsigned int count; + count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit); + + tree_nodes_free(root, gc_nodes, gc_count); + if (!addit) + return count; + + if (!add_hlist(&rbconn->hhead, tuple)) + return 0; /* hotdrop */ + + return count + 1; + } + + if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) + continue; + + /* only used for GC on hhead, retval and 'addit' ignored */ + check_hlist(net, &rbconn->hhead, tuple, zone, &addit); + if (hlist_empty(&rbconn->hhead)) + gc_nodes[gc_count++] = rbconn; + } + + if (gc_count) { + no_gc = true; + tree_nodes_free(root, gc_nodes, gc_count); + /* tree_node_free before new allocation permits + * allocator to re-use newly free'd object. + * + * This is a rare event; in most cases we will find + * existing node to re-use. (or gc_count is 0). + */ + goto restart; + } + + /* no match, need to insert new node */ + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + return 0; + + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(conncount_rb_cachep, rbconn); + return 0; + } + + conn->tuple = *tuple; + memcpy(rbconn->key, key, sizeof(u32) * keylen); + + INIT_HLIST_HEAD(&rbconn->hhead); + hlist_add_head(&conn->node, &rbconn->hhead); + + rb_link_node(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + return 1; +} + +unsigned int nf_conncount_count(struct net *net, + struct nf_conncount_data *data, + const u32 *key, + unsigned int family, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) +{ + struct rb_root *root; + int count; + u32 hash; + + hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; + root = &data->root[hash]; + + spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + + count = count_tree(net, root, key, data->keylen, family, tuple, zone); + + spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + + return count; +} +EXPORT_SYMBOL_GPL(nf_conncount_count); + +struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, + unsigned int keylen) +{ + struct nf_conncount_data *data; + int ret, i; + + if (keylen % sizeof(u32) || + keylen / sizeof(u32) > MAX_KEYLEN || + keylen == 0) + return ERR_PTR(-EINVAL); + + net_get_random_once(&conncount_rnd, sizeof(conncount_rnd)); + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + ret = nf_ct_netns_get(net, family); + if (ret < 0) { + kfree(data); + return ERR_PTR(ret); + } + + for (i = 0; i < ARRAY_SIZE(data->root); ++i) + data->root[i] = RB_ROOT; + + data->keylen = keylen / sizeof(u32); + + return data; +} +EXPORT_SYMBOL_GPL(nf_conncount_init); + +static void destroy_tree(struct rb_root *r) +{ + struct nf_conncount_tuple *conn; + struct nf_conncount_rb *rbconn; + struct hlist_node *n; + struct rb_node *node; + + while ((node = rb_first(r)) != NULL) { + rbconn = rb_entry(node, struct nf_conncount_rb, node); + + rb_erase(node, r); + + hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) + kmem_cache_free(conncount_conn_cachep, conn); + + kmem_cache_free(conncount_rb_cachep, rbconn); + } +} + +void nf_conncount_destroy(struct net *net, unsigned int family, + struct nf_conncount_data *data) +{ + unsigned int i; + + nf_ct_netns_put(net, family); + + for (i = 0; i < ARRAY_SIZE(data->root); ++i) + destroy_tree(&data->root[i]); + + kfree(data); +} +EXPORT_SYMBOL_GPL(nf_conncount_destroy); + +static int __init nf_conncount_modinit(void) +{ + int i; + + BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS); + BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0); + + for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i) + spin_lock_init(&nf_conncount_locks[i]); + + conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", + sizeof(struct nf_conncount_tuple), + 0, 0, NULL); + if (!conncount_conn_cachep) + return -ENOMEM; + + conncount_rb_cachep = kmem_cache_create("nf_conncount_rb", + sizeof(struct nf_conncount_rb), + 0, 0, NULL); + if (!conncount_rb_cachep) { + kmem_cache_destroy(conncount_conn_cachep); + return -ENOMEM; + } + + return 0; +} + +static void __exit nf_conncount_modexit(void) +{ + kmem_cache_destroy(conncount_conn_cachep); + kmem_cache_destroy(conncount_rb_cachep); +} + +module_init(nf_conncount_modinit); +module_exit(nf_conncount_modexit); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_AUTHOR("Florian Westphal "); +MODULE_DESCRIPTION("netfilter: count number of connections matching a key"); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index a6214f235333..b1b17b9353e1 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -12,292 +12,30 @@ * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au). */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include -#include + #include -#include #include -#include -#include #include #include + #include #include #include #include - -#define CONNLIMIT_SLOTS 256U - -#ifdef CONFIG_LOCKDEP -#define CONNLIMIT_LOCK_SLOTS 8U -#else -#define CONNLIMIT_LOCK_SLOTS 256U -#endif - -#define CONNLIMIT_GC_MAX_NODES 8 - -/* we will save the tuples of all connections we care about */ -struct xt_connlimit_conn { - struct hlist_node node; - struct nf_conntrack_tuple tuple; -}; - -struct xt_connlimit_rb { - struct rb_node node; - struct hlist_head hhead; /* connections/hosts in same subnet */ - union nf_inet_addr addr; /* search key */ -}; - -static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; - -struct xt_connlimit_data { - struct rb_root climit_root[CONNLIMIT_SLOTS]; -}; - -static u_int32_t connlimit_rnd __read_mostly; -static struct kmem_cache *connlimit_rb_cachep __read_mostly; -static struct kmem_cache *connlimit_conn_cachep __read_mostly; - -static inline unsigned int connlimit_iphash(__be32 addr) -{ - return jhash_1word((__force __u32)addr, - connlimit_rnd) % CONNLIMIT_SLOTS; -} - -static inline unsigned int -connlimit_iphash6(const union nf_inet_addr *addr) -{ - return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), - connlimit_rnd) % CONNLIMIT_SLOTS; -} - -static inline bool already_closed(const struct nf_conn *conn) -{ - if (nf_ct_protonum(conn) == IPPROTO_TCP) - return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || - conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; - else - return 0; -} - -static int -same_source(const union nf_inet_addr *addr, - const union nf_inet_addr *u3, u_int8_t family) -{ - if (family == NFPROTO_IPV4) - return ntohl(addr->ip) - ntohl(u3->ip); - - return memcmp(addr->ip6, u3->ip6, sizeof(addr->ip6)); -} - -static bool add_hlist(struct hlist_head *head, - const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr) -{ - struct xt_connlimit_conn *conn; - - conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); - if (conn == NULL) - return false; - conn->tuple = *tuple; - hlist_add_head(&conn->node, head); - return true; -} - -static unsigned int check_hlist(struct net *net, - struct hlist_head *head, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit) -{ - const struct nf_conntrack_tuple_hash *found; - struct xt_connlimit_conn *conn; - struct hlist_node *n; - struct nf_conn *found_ct; - unsigned int length = 0; - - *addit = true; - - /* check the saved connections */ - hlist_for_each_entry_safe(conn, n, head, node) { - found = nf_conntrack_find_get(net, zone, &conn->tuple); - if (found == NULL) { - hlist_del(&conn->node); - kmem_cache_free(connlimit_conn_cachep, conn); - continue; - } - - found_ct = nf_ct_tuplehash_to_ctrack(found); - - if (nf_ct_tuple_equal(&conn->tuple, tuple)) { - /* - * Just to be sure we have it only once in the list. - * We should not see tuples twice unless someone hooks - * this into a table without "-p tcp --syn". - */ - *addit = false; - } else if (already_closed(found_ct)) { - /* - * we do not care about connections which are - * closed already -> ditch it - */ - nf_ct_put(found_ct); - hlist_del(&conn->node); - kmem_cache_free(connlimit_conn_cachep, conn); - continue; - } - - nf_ct_put(found_ct); - length++; - } - - return length; -} - -static void tree_nodes_free(struct rb_root *root, - struct xt_connlimit_rb *gc_nodes[], - unsigned int gc_count) -{ - struct xt_connlimit_rb *rbconn; - - while (gc_count) { - rbconn = gc_nodes[--gc_count]; - rb_erase(&rbconn->node, root); - kmem_cache_free(connlimit_rb_cachep, rbconn); - } -} - -static unsigned int -count_tree(struct net *net, struct rb_root *root, - const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr, - u8 family, const struct nf_conntrack_zone *zone) -{ - struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; - struct rb_node **rbnode, *parent; - struct xt_connlimit_rb *rbconn; - struct xt_connlimit_conn *conn; - unsigned int gc_count; - bool no_gc = false; - - restart: - gc_count = 0; - parent = NULL; - rbnode = &(root->rb_node); - while (*rbnode) { - int diff; - bool addit; - - rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node); - - parent = *rbnode; - diff = same_source(addr, &rbconn->addr, family); - if (diff < 0) { - rbnode = &((*rbnode)->rb_left); - } else if (diff > 0) { - rbnode = &((*rbnode)->rb_right); - } else { - /* same source network -> be counted! */ - unsigned int count; - count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit); - - tree_nodes_free(root, gc_nodes, gc_count); - if (!addit) - return count; - - if (!add_hlist(&rbconn->hhead, tuple, addr)) - return 0; /* hotdrop */ - - return count + 1; - } - - if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) - continue; - - /* only used for GC on hhead, retval and 'addit' ignored */ - check_hlist(net, &rbconn->hhead, tuple, zone, &addit); - if (hlist_empty(&rbconn->hhead)) - gc_nodes[gc_count++] = rbconn; - } - - if (gc_count) { - no_gc = true; - tree_nodes_free(root, gc_nodes, gc_count); - /* tree_node_free before new allocation permits - * allocator to re-use newly free'd object. - * - * This is a rare event; in most cases we will find - * existing node to re-use. (or gc_count is 0). - */ - goto restart; - } - - /* no match, need to insert new node */ - rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC); - if (rbconn == NULL) - return 0; - - conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC); - if (conn == NULL) { - kmem_cache_free(connlimit_rb_cachep, rbconn); - return 0; - } - - conn->tuple = *tuple; - rbconn->addr = *addr; - - INIT_HLIST_HEAD(&rbconn->hhead); - hlist_add_head(&conn->node, &rbconn->hhead); - - rb_link_node(&rbconn->node, parent, rbnode); - rb_insert_color(&rbconn->node, root); - return 1; -} - -static int count_them(struct net *net, - struct xt_connlimit_data *data, - const struct nf_conntrack_tuple *tuple, - const union nf_inet_addr *addr, - u_int8_t family, - const struct nf_conntrack_zone *zone) -{ - struct rb_root *root; - int count; - u32 hash; - - if (family == NFPROTO_IPV6) - hash = connlimit_iphash6(addr); - else - hash = connlimit_iphash(addr->ip); - root = &data->climit_root[hash]; - - spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); - - count = count_tree(net, root, tuple, addr, family, zone); - - spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); - - return count; -} +#include static bool connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_connlimit_info *info = par->matchinfo; - union nf_inet_addr addr; struct nf_conntrack_tuple tuple; const struct nf_conntrack_tuple *tuple_ptr = &tuple; const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int connections; + u32 key[5]; ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) { @@ -310,6 +48,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) if (xt_family(par) == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); + union nf_inet_addr addr; unsigned int i; memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? @@ -317,22 +56,24 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) for (i = 0; i < ARRAY_SIZE(addr.ip6); ++i) addr.ip6[i] &= info->mask.ip6[i]; + memcpy(key, &addr, sizeof(addr.ip6)); + key[4] = zone->id; } else { const struct iphdr *iph = ip_hdr(skb); - addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ? + key[0] = (info->flags & XT_CONNLIMIT_DADDR) ? iph->daddr : iph->saddr; - addr.ip &= info->mask.ip; + key[0] &= info->mask.ip; + key[1] = zone->id; } - connections = count_them(net, info->data, tuple_ptr, &addr, - xt_family(par), zone); + connections = nf_conncount_count(net, info->data, key, + xt_family(par), tuple_ptr, zone); if (connections == 0) /* kmalloc failed, drop it entirely */ goto hotdrop; - return (connections > info->limit) ^ - !!(info->flags & XT_CONNLIMIT_INVERT); + return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT); hotdrop: par->hotdrop = true; @@ -342,61 +83,27 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) static int connlimit_mt_check(const struct xt_mtchk_param *par) { struct xt_connlimit_info *info = par->matchinfo; - unsigned int i; - int ret; + unsigned int keylen; - net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd)); - - ret = nf_ct_netns_get(par->net, par->family); - if (ret < 0) { - pr_info("cannot load conntrack support for " - "address family %u\n", par->family); - return ret; - } + keylen = sizeof(u32); + if (par->family == NFPROTO_IPV6) + keylen += sizeof(struct in6_addr); + else + keylen += sizeof(struct in_addr); /* init private data */ - info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL); - if (info->data == NULL) { - nf_ct_netns_put(par->net, par->family); - return -ENOMEM; - } - - for (i = 0; i < ARRAY_SIZE(info->data->climit_root); ++i) - info->data->climit_root[i] = RB_ROOT; + info->data = nf_conncount_init(par->net, par->family, keylen); + if (IS_ERR(info->data)) + return PTR_ERR(info->data); return 0; } -static void destroy_tree(struct rb_root *r) -{ - struct xt_connlimit_conn *conn; - struct xt_connlimit_rb *rbconn; - struct hlist_node *n; - struct rb_node *node; - - while ((node = rb_first(r)) != NULL) { - rbconn = rb_entry(node, struct xt_connlimit_rb, node); - - rb_erase(node, r); - - hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) - kmem_cache_free(connlimit_conn_cachep, conn); - - kmem_cache_free(connlimit_rb_cachep, rbconn); - } -} - static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_connlimit_info *info = par->matchinfo; - unsigned int i; - - nf_ct_netns_put(par->net, par->family); - - for (i = 0; i < ARRAY_SIZE(info->data->climit_root); ++i) - destroy_tree(&info->data->climit_root[i]); - kfree(info->data); + nf_conncount_destroy(par->net, par->family, info->data); } static struct xt_match connlimit_mt_reg __read_mostly = { @@ -413,40 +120,12 @@ static struct xt_match connlimit_mt_reg __read_mostly = { static int __init connlimit_mt_init(void) { - int ret, i; - - BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); - BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); - - for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) - spin_lock_init(&xt_connlimit_locks[i]); - - connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", - sizeof(struct xt_connlimit_conn), - 0, 0, NULL); - if (!connlimit_conn_cachep) - return -ENOMEM; - - connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb", - sizeof(struct xt_connlimit_rb), - 0, 0, NULL); - if (!connlimit_rb_cachep) { - kmem_cache_destroy(connlimit_conn_cachep); - return -ENOMEM; - } - ret = xt_register_match(&connlimit_mt_reg); - if (ret != 0) { - kmem_cache_destroy(connlimit_conn_cachep); - kmem_cache_destroy(connlimit_rb_cachep); - } - return ret; + return xt_register_match(&connlimit_mt_reg); } static void __exit connlimit_mt_exit(void) { xt_unregister_match(&connlimit_mt_reg); - kmem_cache_destroy(connlimit_conn_cachep); - kmem_cache_destroy(connlimit_rb_cachep); } module_init(connlimit_mt_init); -- cgit v1.2.3-59-g8ed1b From f6931f5f5b713705c3cc91e4f9c222f2b181e2ef Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Dec 2017 16:18:16 +0100 Subject: netfilter: meta: secpath support replacement for iptables "-m policy --dir in --policy {ipsec,none}". Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_meta.c | 43 ++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index a3ee277b17a1..2efbf9744c2a 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -777,6 +777,7 @@ enum nft_exthdr_attributes { * @NFT_META_OIFGROUP: packet output interface group * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid) * @NFT_META_PRANDOM: a 32bit pseudo-random number + * @NFT_META_SECPATH: boolean, secpath_exists (!!skb->sp) */ enum nft_meta_keys { NFT_META_LEN, @@ -804,6 +805,7 @@ enum nft_meta_keys { NFT_META_OIFGROUP, NFT_META_CGROUP, NFT_META_PRANDOM, + NFT_META_SECPATH, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 5a60eb23a7ed..1a91e676f13e 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -210,6 +210,11 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = prandom_u32_state(state); break; } +#ifdef CONFIG_XFRM + case NFT_META_SECPATH: + nft_reg_store8(dest, !!skb->sp); + break; +#endif default: WARN_ON(1); goto err; @@ -308,6 +313,11 @@ int nft_meta_get_init(const struct nft_ctx *ctx, prandom_init_once(&nft_prandom_state); len = sizeof(u32); break; +#ifdef CONFIG_XFRM + case NFT_META_SECPATH: + len = sizeof(u8); + break; +#endif default: return -EOPNOTSUPP; } @@ -318,6 +328,38 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); +static int nft_meta_get_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ +#ifdef CONFIG_XFRM + const struct nft_meta *priv = nft_expr_priv(expr); + unsigned int hooks; + + if (priv->key != NFT_META_SECPATH) + return 0; + + switch (ctx->afi->family) { + case NFPROTO_NETDEV: + hooks = 1 << NF_NETDEV_INGRESS; + break; + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +#else + return 0; +#endif +} + int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) @@ -434,6 +476,7 @@ static const struct nft_expr_ops nft_meta_get_ops = { .eval = nft_meta_get_eval, .init = nft_meta_get_init, .dump = nft_meta_get_dump, + .validate = nft_meta_get_validate, }; static const struct nft_expr_ops nft_meta_set_ops = { -- cgit v1.2.3-59-g8ed1b From 90964016e5d34758033e75884e41d68ccb93212e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 7 Jan 2018 01:03:56 +0100 Subject: netfilter: nf_conntrack: add IPS_OFFLOAD status bit This new bit tells us that the conntrack entry is owned by the flow table offload infrastructure. # cat /proc/net/nf_conntrack ipv4 2 tcp 6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2 Note the [OFFLOAD] tag in the listing. The timer of such conntrack entries look like stopped from userspace. In practise, to make sure the conntrack entry does not go away, the conntrack timer is periodically set to an arbitrary large value that gets refreshed on every iteration from the garbage collector, so it never expires- and they display no internal state in the case of TCP flows. This allows us to save a bitcheck from the packet path via nf_ct_is_expired(). Conntrack entries that have been offloaded to the flow table infrastructure cannot be deleted/flushed via ctnetlink. The flow table infrastructure is also responsible for releasing this conntrack entry. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 6 +++++- net/netfilter/nf_conntrack_core.c | 20 ++++++++++++++++++++ net/netfilter/nf_conntrack_netlink.c | 15 ++++++++++++++- net/netfilter/nf_conntrack_proto_tcp.c | 3 +++ net/netfilter/nf_conntrack_standalone.c | 12 ++++++++---- 5 files changed, 50 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 3fea7709a441..fc8c15a24a43 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -101,12 +101,16 @@ enum ip_conntrack_status { IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), + /* Conntrack has been offloaded to flow table. */ + IPS_OFFLOAD_BIT = 14, + IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT), + /* Be careful here, modifying these bits can make things messy, * so don't let users modify them directly. */ IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | - IPS_SEQ_ADJUST | IPS_TEMPLATE), + IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD), __IPS_MAX_BIT = 14, }; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 85f643c1e227..6a64d528d076 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -901,6 +901,9 @@ static unsigned int early_drop_list(struct net *net, hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) + continue; + if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); continue; @@ -975,6 +978,18 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) return false; } +#define DAY (86400 * HZ) + +/* Set an arbitrary timeout large enough not to ever expire, this save + * us a check for the IPS_OFFLOAD_BIT from the packet path via + * nf_ct_is_expired(). + */ +static void nf_ct_offload_timeout(struct nf_conn *ct) +{ + if (nf_ct_expires(ct) < DAY / 2) + ct->timeout = nfct_time_stamp + DAY; +} + static void gc_worker(struct work_struct *work) { unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); @@ -1011,6 +1026,11 @@ static void gc_worker(struct work_struct *work) tmp = nf_ct_tuplehash_to_ctrack(h); scanned++; + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { + nf_ct_offload_timeout(tmp); + continue; + } + if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); expired_count++; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 316bbdc4a158..7c7921a53b13 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1110,6 +1110,14 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { .len = NF_CT_LABELS_MAX_SIZE }, }; +static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) +{ + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + return 0; + + return ctnetlink_filter_match(ct, data); +} + static int ctnetlink_flush_conntrack(struct net *net, const struct nlattr * const cda[], u32 portid, int report) @@ -1122,7 +1130,7 @@ static int ctnetlink_flush_conntrack(struct net *net, return PTR_ERR(filter); } - nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter, + nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter, portid, report); kfree(filter); @@ -1168,6 +1176,11 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, ct = nf_ct_tuplehash_to_ctrack(h); + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) { + nf_ct_put(ct); + return -EBUSY; + } + if (cda[CTA_ID]) { u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); if (id != (u32)(unsigned long)ct) { diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 684cc29010a0..e97cdc1cf98c 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -305,6 +305,9 @@ static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + return; + seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } #endif diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5a101caa3e12..46d32baad095 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -309,10 +309,12 @@ static int ct_seq_show(struct seq_file *s, void *v) WARN_ON(!l4proto); ret = -ENOSPC; - seq_printf(s, "%-8s %u %-8s %u %ld ", + seq_printf(s, "%-8s %u %-8s %u ", l3proto_name(l3proto->l3proto), nf_ct_l3num(ct), - l4proto_name(l4proto->l4proto), nf_ct_protonum(ct), - nf_ct_expires(ct) / HZ); + l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); + + if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) + seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ); if (l4proto->print_conntrack) l4proto->print_conntrack(s, ct); @@ -339,7 +341,9 @@ static int ct_seq_show(struct seq_file *s, void *v) if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; - if (test_bit(IPS_ASSURED_BIT, &ct->status)) + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + seq_puts(s, "[OFFLOAD] "); + else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); if (seq_has_overflowed(s)) -- cgit v1.2.3-59-g8ed1b From 3b49e2e94e6ebb8b23d0955d9e898254455734f8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 7 Jan 2018 01:04:07 +0100 Subject: netfilter: nf_tables: add flow table netlink frontend This patch introduces a netlink control plane to create, delete and dump flow tables. Flow tables are identified by name, this name is used from rules to refer to an specific flow table. Flow tables use the rhashtable class and a generic garbage collector to remove expired entries. This also adds the infrastructure to add different flow table types, so we can add one for each layer 3 protocol family. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 23 + include/net/netfilter/nf_tables.h | 48 ++ include/uapi/linux/netfilter/nf_tables.h | 53 +++ net/netfilter/nf_tables_api.c | 747 ++++++++++++++++++++++++++++++- 4 files changed, 870 insertions(+), 1 deletion(-) create mode 100644 include/net/netfilter/nf_flow_table.h (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h new file mode 100644 index 000000000000..3a0779589281 --- /dev/null +++ b/include/net/netfilter/nf_flow_table.h @@ -0,0 +1,23 @@ +#ifndef _NF_FLOW_TABLE_H +#define _NF_FLOW_TABLE_H + +#include + +struct nf_flowtable; + +struct nf_flowtable_type { + struct list_head list; + int family; + void (*gc)(struct work_struct *work); + const struct rhashtable_params *params; + nf_hookfn *hook; + struct module *owner; +}; + +struct nf_flowtable { + struct rhashtable rhashtable; + const struct nf_flowtable_type *type; + struct delayed_work gc_work; +}; + +#endif /* _FLOW_OFFLOAD_H */ diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e3ec02fd0f67..dd238950df81 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #define NFT_JUMP_STACK_SIZE 16 @@ -943,6 +944,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * @chains: chains in the table * @sets: sets in the table * @objects: stateful objects in the table + * @flowtables: flow tables in the table * @hgenerator: handle generator state * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) @@ -954,6 +956,7 @@ struct nft_table { struct list_head chains; struct list_head sets; struct list_head objects; + struct list_head flowtables; u64 hgenerator; u32 use; u16 flags:14, @@ -1084,6 +1087,44 @@ struct nft_object_ops { int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); +/** + * struct nft_flowtable - nf_tables flow table + * + * @list: flow table list node in table list + * @table: the table the flow table is contained in + * @name: name of this flow table + * @hooknum: hook number + * @priority: hook priority + * @ops_len: number of hooks in array + * @genmask: generation mask + * @use: number of references to this flow table + * @data: rhashtable and garbage collector + * @ops: array of hooks + */ +struct nft_flowtable { + struct list_head list; + struct nft_table *table; + char *name; + int hooknum; + int priority; + int ops_len; + u32 genmask:2, + use:30; + /* runtime data below here */ + struct nf_hook_ops *ops ____cacheline_aligned; + struct nf_flowtable data; +}; + +struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask); +void nft_flow_table_iterate(struct net *net, + void (*iter)(struct nf_flowtable *flowtable, void *data), + void *data); + +void nft_register_flowtable_type(struct nf_flowtable_type *type); +void nft_unregister_flowtable_type(struct nf_flowtable_type *type); + /** * struct nft_traceinfo - nft tracing information and state * @@ -1317,4 +1358,11 @@ struct nft_trans_obj { #define nft_trans_obj(trans) \ (((struct nft_trans_obj *)trans->data)->obj) +struct nft_trans_flowtable { + struct nft_flowtable *flowtable; +}; + +#define nft_trans_flowtable(trans) \ + (((struct nft_trans_flowtable *)trans->data)->flowtable) + #endif /* _NET_NF_TABLES_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2efbf9744c2a..591b53bce070 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -92,6 +92,9 @@ enum nft_verdicts { * @NFT_MSG_GETOBJ: get a stateful object (enum nft_obj_attributes) * @NFT_MSG_DELOBJ: delete a stateful object (enum nft_obj_attributes) * @NFT_MSG_GETOBJ_RESET: get and reset a stateful object (enum nft_obj_attributes) + * @NFT_MSG_NEWFLOWTABLE: add new flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETFLOWTABLE: get flow table (enum nft_flowtable_attributes) + * @NFT_MSG_DELFLOWTABLE: delete flow table (enum nft_flowtable_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -116,6 +119,9 @@ enum nf_tables_msg_types { NFT_MSG_GETOBJ, NFT_MSG_DELOBJ, NFT_MSG_GETOBJ_RESET, + NFT_MSG_NEWFLOWTABLE, + NFT_MSG_GETFLOWTABLE, + NFT_MSG_DELFLOWTABLE, NFT_MSG_MAX, }; @@ -1309,6 +1315,53 @@ enum nft_object_attributes { }; #define NFTA_OBJ_MAX (__NFTA_OBJ_MAX - 1) +/** + * enum nft_flowtable_attributes - nf_tables flow table netlink attributes + * + * @NFTA_FLOWTABLE_TABLE: name of the table containing the expression (NLA_STRING) + * @NFTA_FLOWTABLE_NAME: name of this flow table (NLA_STRING) + * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32) + * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32) + */ +enum nft_flowtable_attributes { + NFTA_FLOWTABLE_UNSPEC, + NFTA_FLOWTABLE_TABLE, + NFTA_FLOWTABLE_NAME, + NFTA_FLOWTABLE_HOOK, + NFTA_FLOWTABLE_USE, + __NFTA_FLOWTABLE_MAX +}; +#define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1) + +/** + * enum nft_flowtable_hook_attributes - nf_tables flow table hook netlink attributes + * + * @NFTA_FLOWTABLE_HOOK_NUM: netfilter hook number (NLA_U32) + * @NFTA_FLOWTABLE_HOOK_PRIORITY: netfilter hook priority (NLA_U32) + * @NFTA_FLOWTABLE_HOOK_DEVS: input devices this flow table is bound to (NLA_NESTED) + */ +enum nft_flowtable_hook_attributes { + NFTA_FLOWTABLE_HOOK_UNSPEC, + NFTA_FLOWTABLE_HOOK_NUM, + NFTA_FLOWTABLE_HOOK_PRIORITY, + NFTA_FLOWTABLE_HOOK_DEVS, + __NFTA_FLOWTABLE_HOOK_MAX +}; +#define NFTA_FLOWTABLE_HOOK_MAX (__NFTA_FLOWTABLE_HOOK_MAX - 1) + +/** + * enum nft_device_attributes - nf_tables device netlink attributes + * + * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) + */ +enum nft_devices_attributes { + NFTA_DEVICE_UNSPEC, + NFTA_DEVICE_NAME, + __NFTA_DEVICE_MAX +}; +#define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) + + /** * enum nft_trace_attributes - nf_tables trace netlink attributes * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fa564dac66a2..db0933256ec9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,7 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); +static LIST_HEAD(nf_tables_flowtables); /** * nft_register_afinfo - register nf_tables address family info @@ -345,6 +347,40 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) return err; } +static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, + struct nft_flowtable *flowtable) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, + sizeof(struct nft_trans_flowtable)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWFLOWTABLE) + nft_activate_next(ctx->net, flowtable); + + nft_trans_flowtable(trans) = flowtable; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; +} + +static int nft_delflowtable(struct nft_ctx *ctx, + struct nft_flowtable *flowtable) +{ + int err; + + err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); + if (err < 0) + return err; + + nft_deactivate_next(ctx->net, flowtable); + ctx->table->use--; + + return err; +} + /* * Tables */ @@ -728,6 +764,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); + INIT_LIST_HEAD(&table->flowtables); table->flags = flags; nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); @@ -749,10 +786,11 @@ err1: static int nft_flush_table(struct nft_ctx *ctx) { - int err; + struct nft_flowtable *flowtable, *nft; struct nft_chain *chain, *nc; struct nft_object *obj, *ne; struct nft_set *set, *ns; + int err; list_for_each_entry(chain, &ctx->table->chains, list) { if (!nft_is_active_next(ctx->net, chain)) @@ -778,6 +816,12 @@ static int nft_flush_table(struct nft_ctx *ctx) goto out; } + list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) { + err = nft_delflowtable(ctx, flowtable); + if (err < 0) + goto out; + } + list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) { err = nft_delobj(ctx, obj); if (err < 0) @@ -4839,6 +4883,605 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx, ctx->afi->family, ctx->report, GFP_KERNEL); } +/* + * Flow tables + */ +void nft_register_flowtable_type(struct nf_flowtable_type *type) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_add_tail_rcu(&type->list, &nf_tables_flowtables); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_register_flowtable_type); + +void nft_unregister_flowtable_type(struct nf_flowtable_type *type) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_del_rcu(&type->list); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_flowtable_type); + +static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { + [NFTA_FLOWTABLE_TABLE] = { .type = NLA_STRING, + .len = NFT_NAME_MAXLEN - 1 }, + [NFTA_FLOWTABLE_NAME] = { .type = NLA_STRING, + .len = NFT_NAME_MAXLEN - 1 }, + [NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED }, +}; + +struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask) +{ + struct nft_flowtable *flowtable; + + list_for_each_entry(flowtable, &table->flowtables, list) { + if (!nla_strcmp(nla, flowtable->name) && + nft_active_genmask(flowtable, genmask)) + return flowtable; + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); + +#define NFT_FLOWTABLE_DEVICE_MAX 8 + +static int nf_tables_parse_devices(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct net_device *dev_array[], int *len) +{ + const struct nlattr *tmp; + struct net_device *dev; + char ifname[IFNAMSIZ]; + int rem, n = 0, err; + + nla_for_each_nested(tmp, attr, rem) { + if (nla_type(tmp) != NFTA_DEVICE_NAME) { + err = -EINVAL; + goto err1; + } + + nla_strlcpy(ifname, tmp, IFNAMSIZ); + dev = dev_get_by_name(ctx->net, ifname); + if (!dev) { + err = -ENOENT; + goto err1; + } + + dev_array[n++] = dev; + if (n == NFT_FLOWTABLE_DEVICE_MAX) { + err = -EFBIG; + goto err1; + } + } + if (!len) + return -EINVAL; + + err = 0; +err1: + *len = n; + return err; +} + +static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { + [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, + [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, + [NFTA_FLOWTABLE_HOOK_DEVS] = { .type = NLA_NESTED }, +}; + +static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct nft_flowtable *flowtable) +{ + struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX]; + struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; + struct nf_hook_ops *ops; + int hooknum, priority; + int err, n = 0, i; + + err = nla_parse_nested(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, + nft_flowtable_hook_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || + !tb[NFTA_FLOWTABLE_HOOK_PRIORITY] || + !tb[NFTA_FLOWTABLE_HOOK_DEVS]) + return -EINVAL; + + hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); + if (hooknum >= ctx->afi->nhooks) + return -EINVAL; + + priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + + err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS], + dev_array, &n); + if (err < 0) + goto err1; + + ops = kzalloc(sizeof(struct nf_hook_ops) * n, GFP_KERNEL); + if (!ops) { + err = -ENOMEM; + goto err1; + } + + flowtable->ops = ops; + flowtable->ops_len = n; + + for (i = 0; i < n; i++) { + flowtable->ops[i].pf = NFPROTO_NETDEV; + flowtable->ops[i].hooknum = hooknum; + flowtable->ops[i].priority = priority; + flowtable->ops[i].priv = &flowtable->data.rhashtable; + flowtable->ops[i].hook = flowtable->data.type->hook; + flowtable->ops[i].dev = dev_array[i]; + } + + err = 0; +err1: + for (i = 0; i < n; i++) + dev_put(dev_array[i]); + + return err; +} + +static const struct nf_flowtable_type * +__nft_flowtable_type_get(const struct nft_af_info *afi) +{ + const struct nf_flowtable_type *type; + + list_for_each_entry(type, &nf_tables_flowtables, list) { + if (afi->family == type->family) + return type; + } + return NULL; +} + +static const struct nf_flowtable_type * +nft_flowtable_type_get(const struct nft_af_info *afi) +{ + const struct nf_flowtable_type *type; + + type = __nft_flowtable_type_get(afi); + if (type != NULL && try_module_get(type->owner)) + return type; + +#ifdef CONFIG_MODULES + if (type == NULL) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nf-flowtable-%u", afi->family); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_flowtable_type_get(afi)) + return ERR_PTR(-EAGAIN); + } +#endif + return ERR_PTR(-ENOENT); +} + +void nft_flow_table_iterate(struct net *net, + void (*iter)(struct nf_flowtable *flowtable, void *data), + void *data) +{ + struct nft_flowtable *flowtable; + const struct nft_af_info *afi; + const struct nft_table *table; + + rcu_read_lock(); + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + iter(&flowtable->data, data); + } + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nft_flow_table_iterate); + +static void nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable) +{ + int i; + + for (i = 0; i < flowtable->ops_len; i++) { + if (!flowtable->ops[i].dev) + continue; + + nf_unregister_net_hook(net, &flowtable->ops[i]); + } +} + +static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nf_flowtable_type *type; + u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + struct nft_af_info *afi; + struct nft_table *table; + struct nft_ctx ctx; + int err, i, k; + + if (!nla[NFTA_FLOWTABLE_TABLE] || + !nla[NFTA_FLOWTABLE_NAME] || + !nla[NFTA_FLOWTABLE_HOOK]) + return -EINVAL; + + afi = nf_tables_afinfo_lookup(net, family, true); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); + if (IS_ERR(flowtable)) { + err = PTR_ERR(flowtable); + if (err != -ENOENT) + return err; + } else { + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + return 0; + } + + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + + flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL); + if (!flowtable) + return -ENOMEM; + + flowtable->table = table; + flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL); + if (!flowtable->name) { + err = -ENOMEM; + goto err1; + } + + type = nft_flowtable_type_get(afi); + if (IS_ERR(type)) { + err = PTR_ERR(type); + goto err2; + } + + flowtable->data.type = type; + err = rhashtable_init(&flowtable->data.rhashtable, type->params); + if (err < 0) + goto err3; + + err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], + flowtable); + if (err < 0) + goto err3; + + for (i = 0; i < flowtable->ops_len; i++) { + err = nf_register_net_hook(net, &flowtable->ops[i]); + if (err < 0) + goto err4; + } + + err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); + if (err < 0) + goto err5; + + INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc); + queue_delayed_work(system_power_efficient_wq, + &flowtable->data.gc_work, HZ); + + list_add_tail_rcu(&flowtable->list, &table->flowtables); + table->use++; + + return 0; +err5: + i = flowtable->ops_len; +err4: + for (k = i - 1; k >= 0; k--) + nf_unregister_net_hook(net, &flowtable->ops[i]); + + kfree(flowtable->ops); +err3: + module_put(type->owner); +err2: + kfree(flowtable->name); +err1: + kfree(flowtable); + return err; +} + +static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + struct nft_af_info *afi; + struct nft_table *table; + struct nft_ctx ctx; + + afi = nf_tables_afinfo_lookup(net, family, true); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); + if (IS_ERR(flowtable)) + return PTR_ERR(flowtable); + if (flowtable->use > 0) + return -EBUSY; + + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + + return nft_delflowtable(&ctx, flowtable); +} + +static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, + u32 flags, int family, + struct nft_flowtable *flowtable) +{ + struct nlattr *nest, *nest_devs; + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + int i; + + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); + + if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) || + nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || + nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use))) + goto nla_put_failure; + + nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK); + if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || + nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) + goto nla_put_failure; + + nest_devs = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK_DEVS); + if (!nest_devs) + goto nla_put_failure; + + for (i = 0; i < flowtable->ops_len; i++) { + if (flowtable->ops[i].dev && + nla_put_string(skb, NFTA_DEVICE_NAME, + flowtable->ops[i].dev->name)) + goto nla_put_failure; + } + nla_nest_end(skb, nest_devs); + nla_nest_end(skb, nest); + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +struct nft_flowtable_filter { + char *table; +}; + +static int nf_tables_dump_flowtable(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nft_flowtable_filter *filter = cb->data; + unsigned int idx = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + const struct nft_af_info *afi; + const struct nft_table *table; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (family != NFPROTO_UNSPEC && family != afi->family) + continue; + + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + if (!nft_is_active(net, flowtable)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (filter && filter->table[0] && + strcmp(filter->table, table->name)) + goto cont; + + if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWFLOWTABLE, + NLM_F_MULTI | NLM_F_APPEND, + afi->family, flowtable) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } + } +done: + rcu_read_unlock(); + + cb->args[0] = idx; + return skb->len; +} + +static int nf_tables_dump_flowtable_done(struct netlink_callback *cb) +{ + struct nft_flowtable_filter *filter = cb->data; + + if (!filter) + return 0; + + kfree(filter->table); + kfree(filter); + + return 0; +} + +static struct nft_flowtable_filter * +nft_flowtable_filter_alloc(const struct nlattr * const nla[]) +{ + struct nft_flowtable_filter *filter; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); + + if (nla[NFTA_FLOWTABLE_TABLE]) { + filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], + GFP_KERNEL); + if (!filter->table) { + kfree(filter); + return ERR_PTR(-ENOMEM); + } + } + return filter; +} + +static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); + int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + const struct nft_af_info *afi; + const struct nft_table *table; + struct sk_buff *skb2; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_flowtable, + .done = nf_tables_dump_flowtable_done, + }; + + if (nla[NFTA_FLOWTABLE_TABLE]) { + struct nft_flowtable_filter *filter; + + filter = nft_flowtable_filter_alloc(nla); + if (IS_ERR(filter)) + return -ENOMEM; + + c.data = filter; + } + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + if (!nla[NFTA_FLOWTABLE_NAME]) + return -EINVAL; + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_FLOWTABLE_TABLE], genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); + if (IS_ERR(table)) + return PTR_ERR(flowtable); + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, + NFT_MSG_NEWFLOWTABLE, 0, family, + flowtable); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); +err: + kfree_skb(skb2); + return err; +} + +static void nf_tables_flowtable_notify(struct nft_ctx *ctx, + struct nft_flowtable *flowtable, + int event) +{ + struct sk_buff *skb; + int err; + + if (ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return; + + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, + ctx->seq, event, 0, + ctx->afi->family, flowtable); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); + return; +err: + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); +} + +static void nft_flowtable_destroy(void *ptr, void *arg) +{ + kfree(ptr); +} + +static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) +{ + cancel_delayed_work_sync(&flowtable->data.gc_work); + kfree(flowtable->name); + rhashtable_free_and_destroy(&flowtable->data.rhashtable, + nft_flowtable_destroy, NULL); + module_put(flowtable->data.type->owner); +} + static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { @@ -4869,6 +5512,49 @@ nla_put_failure: return -EMSGSIZE; } +static void nft_flowtable_event(unsigned long event, struct net_device *dev, + struct nft_flowtable *flowtable) +{ + int i; + + for (i = 0; i < flowtable->ops_len; i++) { + if (flowtable->ops[i].dev != dev) + continue; + + nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); + flowtable->ops[i].dev = NULL; + break; + } +} + +static int nf_tables_flowtable_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nft_flowtable *flowtable; + struct nft_table *table; + struct nft_af_info *afi; + + if (event != NETDEV_UNREGISTER) + return 0; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) { + list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry(flowtable, &table->flowtables, list) { + nft_flowtable_event(event, dev, flowtable); + } + } + } + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + + return NOTIFY_DONE; +} + +static struct notifier_block nf_tables_flowtable_notifier = { + .notifier_call = nf_tables_flowtable_event, +}; + static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) { @@ -5021,6 +5707,21 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, + [NFT_MSG_NEWFLOWTABLE] = { + .call_batch = nf_tables_newflowtable, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, + }, + [NFT_MSG_GETFLOWTABLE] = { + .call = nf_tables_getflowtable, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, + }, + [NFT_MSG_DELFLOWTABLE] = { + .call_batch = nf_tables_delflowtable, + .attr_count = NFTA_FLOWTABLE_MAX, + .policy = nft_flowtable_policy, + }, }; static void nft_chain_commit_update(struct nft_trans *trans) @@ -5066,6 +5767,9 @@ static void nf_tables_commit_release(struct nft_trans *trans) case NFT_MSG_DELOBJ: nft_obj_destroy(nft_trans_obj(trans)); break; + case NFT_MSG_DELFLOWTABLE: + nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + break; } kfree(trans); } @@ -5183,6 +5887,21 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), NFT_MSG_DELOBJ); break; + case NFT_MSG_NEWFLOWTABLE: + nft_clear(net, nft_trans_flowtable(trans)); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + NFT_MSG_NEWFLOWTABLE); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELFLOWTABLE: + list_del_rcu(&nft_trans_flowtable(trans)->list); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + NFT_MSG_DELFLOWTABLE); + nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans)); + break; } } @@ -5220,6 +5939,9 @@ static void nf_tables_abort_release(struct nft_trans *trans) case NFT_MSG_NEWOBJ: nft_obj_destroy(nft_trans_obj(trans)); break; + case NFT_MSG_NEWFLOWTABLE: + nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + break; } kfree(trans); } @@ -5309,6 +6031,17 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) nft_clear(trans->ctx.net, nft_trans_obj(trans)); nft_trans_destroy(trans); break; + case NFT_MSG_NEWFLOWTABLE: + trans->ctx.table->use--; + list_del_rcu(&nft_trans_flowtable(trans)->list); + nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans)); + break; + case NFT_MSG_DELFLOWTABLE: + trans->ctx.table->use++; + nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + nft_trans_destroy(trans); + break; } } @@ -5865,6 +6598,7 @@ EXPORT_SYMBOL_GPL(__nft_release_basechain); /* Called by nft_unregister_afinfo() from __net_exit path, nfnl_lock is held. */ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) { + struct nft_flowtable *flowtable, *nf; struct nft_table *table, *nt; struct nft_chain *chain, *nc; struct nft_object *obj, *ne; @@ -5878,6 +6612,9 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) list_for_each_entry_safe(table, nt, &afi->tables, list) { list_for_each_entry(chain, &table->chains, list) nf_tables_unregister_hook(net, table, chain); + list_for_each_entry(flowtable, &table->flowtables, list) + nf_unregister_net_hooks(net, flowtable->ops, + flowtable->ops_len); /* No packets are walking on these chains anymore. */ ctx.table = table; list_for_each_entry(chain, &table->chains, list) { @@ -5888,6 +6625,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) nf_tables_rule_destroy(&ctx, rule); } } + list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { + list_del(&flowtable->list); + table->use--; + nf_tables_flowtable_destroy(flowtable); + } list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); table->use--; @@ -5932,6 +6674,8 @@ static int __init nf_tables_module_init(void) if (err < 0) goto err3; + register_netdevice_notifier(&nf_tables_flowtable_notifier); + pr_info("nf_tables: (c) 2007-2009 Patrick McHardy \n"); return register_pernet_subsys(&nf_tables_net_ops); err3: @@ -5946,6 +6690,7 @@ static void __exit nf_tables_module_exit(void) { unregister_pernet_subsys(&nf_tables_net_ops); nfnetlink_subsys_unregister(&nf_tables_subsys); + unregister_netdevice_notifier(&nf_tables_flowtable_notifier); rcu_barrier(); nf_tables_core_module_exit(); kfree(info); -- cgit v1.2.3-59-g8ed1b From a3c90f7a2323b331ae816d5b0633e68148e25d04 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 7 Jan 2018 01:04:26 +0100 Subject: netfilter: nf_tables: flow offload expression Add new instruction for the nf_tables VM that allows us to specify what flows are offloaded into a given flow table via name. This new instruction creates the flow entry and adds it to the flow table. Only established flows, ie. we have seen traffic in both directions, are added to the flow table. You can still decide to offload entries at a later stage via packet counting or checking the ct status in case you want to offload assured conntracks. This new extension depends on the conntrack subsystem. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 11 ++ net/netfilter/Kconfig | 7 + net/netfilter/Makefile | 1 + net/netfilter/nft_flow_offload.c | 264 +++++++++++++++++++++++++++++++ 4 files changed, 283 insertions(+) create mode 100644 net/netfilter/nft_flow_offload.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 591b53bce070..53e8dd2a3a03 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -957,6 +957,17 @@ enum nft_ct_attributes { }; #define NFTA_CT_MAX (__NFTA_CT_MAX - 1) +/** + * enum nft_flow_attributes - ct offload expression attributes + * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING) + */ +enum nft_offload_attributes { + NFTA_FLOW_UNSPEC, + NFTA_FLOW_TABLE_NAME, + __NFTA_FLOW_MAX, +}; +#define NFTA_FLOW_MAX (__NFTA_FLOW_MAX - 1) + enum nft_limit_type { NFT_LIMIT_PKTS, NFT_LIMIT_PKT_BYTES diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 272803079bf2..0ee0fcf3abbf 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -505,6 +505,13 @@ config NFT_CT This option adds the "ct" expression that you can use to match connection tracking information such as the flow state. +config NFT_FLOW_OFFLOAD + depends on NF_CONNTRACK + tristate "Netfilter nf_tables hardware flow offload module" + help + This option adds the "flow_offload" expression that you can use to + choose what flows are placed into the hardware. + config NFT_SET_RBTREE tristate "Netfilter nf_tables rbtree set module" help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 061365875cde..5d9b8b959e58 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -86,6 +86,7 @@ obj-$(CONFIG_NFT_META) += nft_meta.o obj-$(CONFIG_NFT_RT) += nft_rt.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o obj-$(CONFIG_NFT_CT) += nft_ct.o +obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o obj-$(CONFIG_NFT_OBJREF) += nft_objref.o diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c new file mode 100644 index 000000000000..dd38785dfed9 --- /dev/null +++ b/net/netfilter/nft_flow_offload.c @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ipv4 options. */ +#include +#include +#include +#include +#include + +struct nft_flow_offload { + struct nft_flowtable *flowtable; +}; + +static int nft_flow_route(const struct nft_pktinfo *pkt, + const struct nf_conn *ct, + struct nf_flow_route *route, + enum ip_conntrack_dir dir) +{ + struct dst_entry *this_dst = skb_dst(pkt->skb); + struct dst_entry *other_dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = ct->tuplehash[!dir].tuple.dst.u3.ip; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ct->tuplehash[!dir].tuple.dst.u3.in6; + break; + } + + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); + if (!other_dst) + return -ENOENT; + + route->tuple[dir].dst = this_dst; + route->tuple[dir].ifindex = nft_in(pkt)->ifindex; + route->tuple[!dir].dst = other_dst; + route->tuple[!dir].ifindex = nft_out(pkt)->ifindex; + + return 0; +} + +static bool nft_flow_offload_skip(struct sk_buff *skb) +{ + struct ip_options *opt = &(IPCB(skb)->opt); + + if (unlikely(opt->optlen)) + return true; + if (skb_sec_path(skb)) + return true; + + return false; +} + +static void nft_flow_offload_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + struct nf_flowtable *flowtable = &priv->flowtable->data; + enum ip_conntrack_info ctinfo; + struct nf_flow_route route; + struct flow_offload *flow; + enum ip_conntrack_dir dir; + struct nf_conn *ct; + int ret; + + if (nft_flow_offload_skip(pkt->skb)) + goto out; + + ct = nf_ct_get(pkt->skb, &ctinfo); + if (!ct) + goto out; + + switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { + case IPPROTO_TCP: + case IPPROTO_UDP: + break; + default: + goto out; + } + + if (test_bit(IPS_HELPER_BIT, &ct->status)) + goto out; + + if (ctinfo == IP_CT_NEW || + ctinfo == IP_CT_RELATED) + goto out; + + if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) + goto out; + + dir = CTINFO2DIR(ctinfo); + if (nft_flow_route(pkt, ct, &route, dir) < 0) + goto err_flow_route; + + flow = flow_offload_alloc(ct, &route); + if (!flow) + goto err_flow_alloc; + + ret = flow_offload_add(flowtable, flow); + if (ret < 0) + goto err_flow_add; + + return; + +err_flow_add: + flow_offload_free(flow); +err_flow_alloc: + dst_release(route.tuple[!dir].dst); +err_flow_route: + clear_bit(IPS_OFFLOAD_BIT, &ct->status); +out: + regs->verdict.code = NFT_BREAK; +} + +static int nft_flow_offload_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + unsigned int hook_mask = (1 << NF_INET_FORWARD); + + return nft_chain_validate_hooks(ctx->chain, hook_mask); +} + +static int nft_flow_offload_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + struct nft_flowtable *flowtable; + + if (!tb[NFTA_FLOW_TABLE_NAME]) + return -EINVAL; + + flowtable = nf_tables_flowtable_lookup(ctx->table, + tb[NFTA_FLOW_TABLE_NAME], + genmask); + if (IS_ERR(flowtable)) + return PTR_ERR(flowtable); + + priv->flowtable = flowtable; + flowtable->use++; + + return nf_ct_netns_get(ctx->net, ctx->afi->family); +} + +static void nft_flow_offload_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + priv->flowtable->use--; + nf_ct_netns_put(ctx->net, ctx->afi->family); +} + +static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_flow_offload_type; +static const struct nft_expr_ops nft_flow_offload_ops = { + .type = &nft_flow_offload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), + .eval = nft_flow_offload_eval, + .init = nft_flow_offload_init, + .destroy = nft_flow_offload_destroy, + .validate = nft_flow_offload_validate, + .dump = nft_flow_offload_dump, +}; + +static struct nft_expr_type nft_flow_offload_type __read_mostly = { + .name = "flow_offload", + .ops = &nft_flow_offload_ops, + .maxattr = NFTA_FLOW_MAX, + .owner = THIS_MODULE, +}; + +static void flow_offload_iterate_cleanup(struct flow_offload *flow, void *data) +{ + struct net_device *dev = data; + + if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex) + return; + + flow_offload_dead(flow); +} + +static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable, + void *data) +{ + nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data); +} + +static int flow_offload_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; + + nft_flow_table_iterate(dev_net(dev), nft_flow_offload_iterate_cleanup, dev); + + return NOTIFY_DONE; +} + +static struct notifier_block flow_offload_netdev_notifier = { + .notifier_call = flow_offload_netdev_event, +}; + +static int __init nft_flow_offload_module_init(void) +{ + int err; + + register_netdevice_notifier(&flow_offload_netdev_notifier); + + err = nft_register_expr(&nft_flow_offload_type); + if (err < 0) + goto register_expr; + + return 0; + +register_expr: + unregister_netdevice_notifier(&flow_offload_netdev_notifier); + return err; +} + +static void __exit nft_flow_offload_module_exit(void) +{ + struct net *net; + + nft_unregister_expr(&nft_flow_offload_type); + unregister_netdevice_notifier(&flow_offload_netdev_notifier); + rtnl_lock(); + for_each_net(net) + nft_flow_table_iterate(net, nft_flow_offload_iterate_cleanup, NULL); + rtnl_unlock(); +} + +module_init(nft_flow_offload_module_init); +module_exit(nft_flow_offload_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_EXPR("flow_offload"); -- cgit v1.2.3-59-g8ed1b From 23fe846f9a48d5375722b3bd060e0a02ad1ca7f1 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 5 Jan 2018 19:47:14 +0100 Subject: l2tp: adjust comments about L2TPv3 offsets The "offset" option has been removed by commit 900631ee6a26 ("l2tp: remove configurable payload offset"). Signed-off-by: Guillaume Nault Acked-by: James Chapman Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 2 +- net/l2tp/l2tp_core.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index f78eef4cc56a..71e62795104d 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -65,7 +65,7 @@ struct sockaddr_l2tpip6 { * TUNNEL_MODIFY - CONN_ID, udpcsum * TUNNEL_GETSTATS - CONN_ID, (stats) * TUNNEL_GET - CONN_ID, (...) - * SESSION_CREATE - SESSION_ID, PW_TYPE, offset, data_seq, cookie, peer_cookie, offset, l2spec + * SESSION_CREATE - SESSION_ID, PW_TYPE, data_seq, cookie, peer_cookie, l2spec * SESSION_DELETE - SESSION_ID * SESSION_MODIFY - SESSION_ID, data_seq * SESSION_GET - SESSION_ID, (...) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 786cd7f6a5e8..62285fc6eb59 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -662,10 +662,9 @@ discard: * |x|S|x|x|x|x|x|x| Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * - * Cookie value, sublayer format and offset (pad) are negotiated with - * the peer when the session is set up. Unlike L2TPv2, we do not need - * to parse the packet header to determine if optional fields are - * present. + * Cookie value and sublayer format are negotiated with the peer when + * the session is set up. Unlike L2TPv2, we do not need to parse the + * packet header to determine if optional fields are present. * * Caller must already have parsed the frame and determined that it is * a data (not control) frame before coming here. Fields up to the -- cgit v1.2.3-59-g8ed1b From c5a9f6f0ab4054082dd5ce9bbdaa8e8ff05cf365 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Mon, 17 Jul 2017 13:47:07 +0300 Subject: net/core: Add drop counters to VF statistics Modern hardware can decide to drop packets going to/from a VF. Add receive and transmit drop counters to be displayed at hypervisor layer in iproute2 per VF statistics. Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- include/linux/if_link.h | 2 ++ include/uapi/linux/if_link.h | 2 ++ net/core/rtnetlink.c | 10 +++++++++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 4c54611e03e9..622658dfbf0a 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -13,6 +13,8 @@ struct ifla_vf_stats { __u64 tx_bytes; __u64 broadcast; __u64 multicast; + __u64 rx_dropped; + __u64 tx_dropped; }; struct ifla_vf_info { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 19fc02660e0c..f8f04fed6186 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -732,6 +732,8 @@ enum { IFLA_VF_STATS_BROADCAST, IFLA_VF_STATS_MULTICAST, IFLA_VF_STATS_PAD, + IFLA_VF_STATS_RX_DROPPED, + IFLA_VF_STATS_TX_DROPPED, __IFLA_VF_STATS_MAX, }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c688dc564b11..5421a3fd3ba1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -904,6 +904,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); return size; } else @@ -1258,7 +1262,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) { + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } -- cgit v1.2.3-59-g8ed1b From ccfdec9089229503d3a305e02accac01817d293e Mon Sep 17 00:00:00 2001 From: Felix Walter Date: Fri, 5 Jan 2018 14:33:31 +0100 Subject: macsec: Add support for GCM-AES-256 cipher suite This adds support for the GCM-AES-256 cipher suite as specified in IEEE 802.1AEbn-2011. The prepared cipher suite selection mechanism is used, with GCM-AES-128 being the default cipher suite as defined in the standard. Signed-off-by: Felix Walter Cc: Sabrina Dubroca Signed-off-by: David S. Miller --- drivers/net/macsec.c | 72 ++++++++++++++++++++++++++++++++++-------- include/uapi/linux/if_macsec.h | 11 +++++-- 2 files changed, 67 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 1d025ab9568f..f522715c6595 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -393,7 +393,12 @@ static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) #define MACSEC_PORT_SCB (0x0000) #define MACSEC_UNDEF_SCI ((__force sci_t)0xffffffffffffffffULL) -#define DEFAULT_SAK_LEN 16 +#define MACSEC_GCM_AES_128_SAK_LEN 16 +#define MACSEC_GCM_AES_256_SAK_LEN 32 + +#define MAX_SAK_LEN MACSEC_GCM_AES_256_SAK_LEN + +#define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN #define DEFAULT_SEND_SCI true #define DEFAULT_ENCRYPT false #define DEFAULT_ENCODING_SA 0 @@ -1600,7 +1605,7 @@ static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = { [MACSEC_SA_ATTR_KEYID] = { .type = NLA_BINARY, .len = MACSEC_KEYID_LEN, }, [MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY, - .len = MACSEC_MAX_KEY_LEN, }, + .len = MAX_SAK_LEN, }, }; static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa) @@ -2362,15 +2367,26 @@ static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb) { struct macsec_tx_sc *tx_sc = &secy->tx_sc; struct nlattr *secy_nest = nla_nest_start(skb, MACSEC_ATTR_SECY); + u64 csid; if (!secy_nest) return 1; + switch (secy->key_len) { + case MACSEC_GCM_AES_128_SAK_LEN: + csid = MACSEC_CIPHER_ID_GCM_AES_128; + break; + case MACSEC_GCM_AES_256_SAK_LEN: + csid = MACSEC_CIPHER_ID_GCM_AES_256; + break; + default: + goto cancel; + } + if (nla_put_sci(skb, MACSEC_SECY_ATTR_SCI, secy->sci, MACSEC_SECY_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_ATTR_CIPHER_SUITE, - MACSEC_DEFAULT_CIPHER_ID, - MACSEC_SECY_ATTR_PAD) || + csid, MACSEC_SECY_ATTR_PAD) || nla_put_u8(skb, MACSEC_SECY_ATTR_ICV_LEN, secy->icv_len) || nla_put_u8(skb, MACSEC_SECY_ATTR_OPER, secy->operational) || nla_put_u8(skb, MACSEC_SECY_ATTR_PROTECT, secy->protect_frames) || @@ -3015,8 +3031,8 @@ static void macsec_setup(struct net_device *dev) eth_zero_addr(dev->broadcast); } -static void macsec_changelink_common(struct net_device *dev, - struct nlattr *data[]) +static int macsec_changelink_common(struct net_device *dev, + struct nlattr *data[]) { struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; @@ -3056,6 +3072,22 @@ static void macsec_changelink_common(struct net_device *dev, if (data[IFLA_MACSEC_VALIDATION]) secy->validate_frames = nla_get_u8(data[IFLA_MACSEC_VALIDATION]); + + if (data[IFLA_MACSEC_CIPHER_SUITE]) { + switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) { + case MACSEC_CIPHER_ID_GCM_AES_128: + case MACSEC_DEFAULT_CIPHER_ALT: + secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; + break; + case MACSEC_CIPHER_ID_GCM_AES_256: + secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; + break; + default: + return -EINVAL; + } + } + + return 0; } static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], @@ -3071,9 +3103,7 @@ static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], data[IFLA_MACSEC_PORT]) return -EINVAL; - macsec_changelink_common(dev, data); - - return 0; + return macsec_changelink_common(dev, data); } static void macsec_del_dev(struct macsec_dev *macsec) @@ -3270,8 +3300,11 @@ static int macsec_newlink(struct net *net, struct net_device *dev, if (err) goto unlink; - if (data) - macsec_changelink_common(dev, data); + if (data) { + err = macsec_changelink_common(dev, data); + if (err) + goto del_dev; + } err = register_macsec_dev(real_dev, dev); if (err < 0) @@ -3320,7 +3353,8 @@ static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[], } switch (csid) { - case MACSEC_DEFAULT_CIPHER_ID: + case MACSEC_CIPHER_ID_GCM_AES_128: + case MACSEC_CIPHER_ID_GCM_AES_256: case MACSEC_DEFAULT_CIPHER_ALT: if (icv_len < MACSEC_MIN_ICV_LEN || icv_len > MACSEC_STD_ICV_LEN) @@ -3390,12 +3424,24 @@ static int macsec_fill_info(struct sk_buff *skb, { struct macsec_secy *secy = &macsec_priv(dev)->secy; struct macsec_tx_sc *tx_sc = &secy->tx_sc; + u64 csid; + + switch (secy->key_len) { + case MACSEC_GCM_AES_128_SAK_LEN: + csid = MACSEC_CIPHER_ID_GCM_AES_128; + break; + case MACSEC_GCM_AES_256_SAK_LEN: + csid = MACSEC_CIPHER_ID_GCM_AES_256; + break; + default: + goto nla_put_failure; + } if (nla_put_sci(skb, IFLA_MACSEC_SCI, secy->sci, IFLA_MACSEC_PAD) || nla_put_u8(skb, IFLA_MACSEC_ICV_LEN, secy->icv_len) || nla_put_u64_64bit(skb, IFLA_MACSEC_CIPHER_SUITE, - MACSEC_DEFAULT_CIPHER_ID, IFLA_MACSEC_PAD) || + csid, IFLA_MACSEC_PAD) || nla_put_u8(skb, IFLA_MACSEC_ENCODING_SA, tx_sc->encoding_sa) || nla_put_u8(skb, IFLA_MACSEC_ENCRYPT, tx_sc->encrypt) || nla_put_u8(skb, IFLA_MACSEC_PROTECT, secy->protect_frames) || diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h index 719d243471f4..2e522835a4af 100644 --- a/include/uapi/linux/if_macsec.h +++ b/include/uapi/linux/if_macsec.h @@ -18,12 +18,17 @@ #define MACSEC_GENL_NAME "macsec" #define MACSEC_GENL_VERSION 1 -#define MACSEC_MAX_KEY_LEN 128 +#define MACSEC_MAX_KEY_LEN 256 #define MACSEC_KEYID_LEN 16 -#define MACSEC_DEFAULT_CIPHER_ID 0x0080020001000001ULL -#define MACSEC_DEFAULT_CIPHER_ALT 0x0080C20001000001ULL +/* cipher IDs as per IEEE802.1AEbn-2011 */ +#define MACSEC_CIPHER_ID_GCM_AES_128 0x0080C20001000001ULL +#define MACSEC_CIPHER_ID_GCM_AES_256 0x0080C20001000002ULL + +#define MACSEC_DEFAULT_CIPHER_ID MACSEC_CIPHER_ID_GCM_AES_128 +/* deprecated cipher ID for GCM-AES-128 */ +#define MACSEC_DEFAULT_CIPHER_ALT 0x0080020001000001ULL #define MACSEC_MIN_ICV_LEN 8 #define MACSEC_MAX_ICV_LEN 32 -- cgit v1.2.3-59-g8ed1b From faa9b39f0e9ddfa8c83725b3f230784976dd3c7f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 5 Jan 2018 17:44:54 -0500 Subject: virtio_net: propagate linkspeed/duplex settings from the hypervisor The ability to set speed and duplex for virtio_net is useful in various scenarios as described here: 16032be virtio_net: add ethtool support for set and get of settings However, it would be nice to be able to set this from the hypervisor, such that virtio_net doesn't require custom guest ethtool commands. Introduce a new feature flag, VIRTIO_NET_F_SPEED_DUPLEX, which allows the hypervisor to export a linkspeed and duplex setting. The user can subsequently overwrite it later if desired via: 'ethtool -s'. Note that VIRTIO_NET_F_SPEED_DUPLEX is defined as bit 63, the intention is that device feature bits are to grow down from bit 63, since the transports are starting from bit 24 and growing up. Signed-off-by: Jason Baron Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: virtio-dev@lists.oasis-open.org Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 23 ++++++++++++++++++++++- include/uapi/linux/virtio_net.h | 13 +++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ed8299343728..12dfc5fee58e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1906,6 +1906,24 @@ static void virtnet_init_settings(struct net_device *dev) vi->duplex = DUPLEX_UNKNOWN; } +static void virtnet_update_settings(struct virtnet_info *vi) +{ + u32 speed; + u8 duplex; + + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX)) + return; + + speed = virtio_cread32(vi->vdev, offsetof(struct virtio_net_config, + speed)); + if (ethtool_validate_speed(speed)) + vi->speed = speed; + duplex = virtio_cread8(vi->vdev, offsetof(struct virtio_net_config, + duplex)); + if (ethtool_validate_duplex(duplex)) + vi->duplex = duplex; +} + static const struct ethtool_ops virtnet_ethtool_ops = { .get_drvinfo = virtnet_get_drvinfo, .get_link = ethtool_op_get_link, @@ -2159,6 +2177,7 @@ static void virtnet_config_changed_work(struct work_struct *work) vi->status = v; if (vi->status & VIRTIO_NET_S_LINK_UP) { + virtnet_update_settings(vi); netif_carrier_on(vi->dev); netif_tx_wake_all_queues(vi->dev); } else { @@ -2707,6 +2726,7 @@ static int virtnet_probe(struct virtio_device *vdev) schedule_work(&vi->config_work); } else { vi->status = VIRTIO_NET_S_LINK_UP; + virtnet_update_settings(vi); netif_carrier_on(dev); } @@ -2808,7 +2828,8 @@ static struct virtio_device_id id_table[] = { VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \ VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \ VIRTIO_NET_F_CTRL_MAC_ADDR, \ - VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS + VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \ + VIRTIO_NET_F_SPEED_DUPLEX static unsigned int features[] = { VIRTNET_FEATURES, diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index fc353b518288..5de6ed37695b 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,8 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ +#define VIRTIO_NET_F_SPEED_DUPLEX 63 /* Device set linkspeed and duplex */ + #ifndef VIRTIO_NET_NO_LEGACY #define VIRTIO_NET_F_GSO 6 /* Host handles pkts w/ any GSO type */ #endif /* VIRTIO_NET_NO_LEGACY */ @@ -76,6 +78,17 @@ struct virtio_net_config { __u16 max_virtqueue_pairs; /* Default maximum transmit unit advice */ __u16 mtu; + /* + * speed, in units of 1Mb. All values 0 to INT_MAX are legal. + * Any other value stands for unknown. + */ + __u32 speed; + /* + * 0x00 - half duplex + * 0x01 - full duplex + * Any other value stands for unknown. + */ + __u8 duplex; } __attribute__((packed)); /* -- cgit v1.2.3-59-g8ed1b From 232d07b74a33b9f5d48516dc1d8ce41723ada593 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 8 Jan 2018 21:03:30 +0100 Subject: tipc: improve groupcast scope handling When a member joins a group, it also indicates a binding scope. This makes it possible to create both node local groups, invisible to other nodes, as well as cluster global groups, visible everywhere. In order to avoid that different members end up having permanently differing views of group size and memberhip, we must inhibit locally and globally bound members from joining the same group. We do this by using the binding scope as an additional separator between groups. I.e., a member must ignore all membership events from sockets using a different scope than itself, and all lookups for message destinations must require an exact match between the message's lookup scope and the potential target's binding scope. Apart from making it possible to create local groups using the same identity on different nodes, a side effect of this is that it now also becomes possible to create a cluster global group with the same identity across the same nodes, without interfering with the local groups. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 7 ++-- net/tipc/group.c | 13 ++++--- net/tipc/name_table.c | 40 ++++++++++----------- net/tipc/name_table.h | 4 +-- net/tipc/server.c | 4 +-- net/tipc/server.h | 6 ++-- net/tipc/socket.c | 88 ++++++++++++++++++++++++++++------------------- net/tipc/subscr.c | 10 ++++-- net/tipc/subscr.h | 2 +- 9 files changed, 99 insertions(+), 75 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 35f79d1f8c3a..14bacc7e6cef 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -117,10 +117,9 @@ static inline unsigned int tipc_node(__u32 addr) /* * Publication scopes when binding port names and port name sequences */ - -#define TIPC_ZONE_SCOPE 1 -#define TIPC_CLUSTER_SCOPE 2 -#define TIPC_NODE_SCOPE 3 +#define TIPC_ZONE_SCOPE 1 +#define TIPC_CLUSTER_SCOPE 2 +#define TIPC_NODE_SCOPE 3 /* * Limiting values for messages diff --git a/net/tipc/group.c b/net/tipc/group.c index cf996bd6ec98..1908773c9fca 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -87,7 +87,6 @@ struct tipc_group { int subid; u32 type; u32 instance; - u32 domain; u32 scope; u32 portid; u16 member_cnt; @@ -158,6 +157,8 @@ int tipc_group_size(struct tipc_group *grp) struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq) { + u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS; + bool global = mreq->scope != TIPC_NODE_SCOPE; struct tipc_group *grp; u32 type = mreq->type; @@ -171,15 +172,14 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, grp->members = RB_ROOT; grp->net = net; grp->portid = portid; - grp->domain = addr_domain(net, mreq->scope); grp->type = type; grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; - if (tipc_topsrv_kern_subscr(net, portid, type, - TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS, - 0, ~0, &grp->subid)) + filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE; + if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, + filter, &grp->subid)) return grp; kfree(grp); return NULL; @@ -732,6 +732,9 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, if (!grp) return; + if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net)) + return; + m = tipc_group_find_member(grp, node, port); switch (msg_type(hdr)) { diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 60af9885f160..64cdd3c302b0 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -328,7 +328,8 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { tipc_subscrp_report_overlap(s, publ->lower, publ->upper, TIPC_PUBLISHED, publ->ref, - publ->node, created_subseq); + publ->node, publ->scope, + created_subseq); } return publ; } @@ -398,7 +399,8 @@ found: list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { tipc_subscrp_report_overlap(s, publ->lower, publ->upper, TIPC_WITHDRAWN, publ->ref, - publ->node, removed_subseq); + publ->node, publ->scope, + removed_subseq); } return publ; @@ -435,6 +437,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, sseq->upper, TIPC_PUBLISHED, crs->ref, crs->node, + crs->scope, must_report); must_report = 0; } @@ -598,7 +601,7 @@ not_found: return ref; } -bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, +bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, struct list_head *dsts, int *dstcnt, u32 exclude, bool all) { @@ -608,9 +611,6 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, struct name_seq *seq; struct sub_seq *sseq; - if (!tipc_in_scope(domain, self)) - return false; - *dstcnt = 0; rcu_read_lock(); seq = nametbl_find_seq(net, type); @@ -621,7 +621,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, if (likely(sseq)) { info = sseq->info; list_for_each_entry(publ, &info->zone_list, zone_list) { - if (!tipc_in_scope(domain, publ->node)) + if (publ->scope != scope) continue; if (publ->ref == exclude && publ->node == self) continue; @@ -639,13 +639,14 @@ exit: return !list_empty(dsts); } -int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, - u32 limit, struct list_head *dports) +int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, + u32 scope, bool exact, struct list_head *dports) { - struct name_seq *seq; - struct sub_seq *sseq; struct sub_seq *sseq_stop; struct name_info *info; + struct publication *p; + struct name_seq *seq; + struct sub_seq *sseq; int res = 0; rcu_read_lock(); @@ -657,15 +658,12 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); sseq_stop = seq->sseqs + seq->first_free; for (; sseq != sseq_stop; sseq++) { - struct publication *publ; - if (sseq->lower > upper) break; - info = sseq->info; - list_for_each_entry(publ, &info->node_list, node_list) { - if (publ->scope <= limit) - tipc_dest_push(dports, 0, publ->ref); + list_for_each_entry(p, &info->node_list, node_list) { + if (p->scope == scope || (!exact && p->scope < scope)) + tipc_dest_push(dports, 0, p->ref); } if (info->cluster_list_size != info->node_list_size) @@ -682,7 +680,7 @@ exit: * - Determines if any node local ports overlap */ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, - u32 upper, u32 domain, + u32 upper, u32 scope, struct tipc_nlist *nodes) { struct sub_seq *sseq, *stop; @@ -701,7 +699,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, for (; sseq != stop && sseq->lower <= upper; sseq++) { info = sseq->info; list_for_each_entry(publ, &info->zone_list, zone_list) { - if (tipc_in_scope(domain, publ->node)) + if (publ->scope == scope) tipc_nlist_add(nodes, publ->node); } } @@ -713,7 +711,7 @@ exit: /* tipc_nametbl_build_group - build list of communication group members */ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, - u32 type, u32 domain) + u32 type, u32 scope) { struct sub_seq *sseq, *stop; struct name_info *info; @@ -731,7 +729,7 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, for (; sseq != stop; sseq++) { info = sseq->info; list_for_each_entry(p, &info->zone_list, zone_list) { - if (!tipc_in_scope(domain, p->node)) + if (p->scope != scope) continue; tipc_group_add_member(grp, p->node, p->ref, p->lower); } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 73a148c85c15..b595d8aa00f0 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -100,8 +100,8 @@ struct name_table { int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); -int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, - u32 limit, struct list_head *dports); +int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, + u32 scope, bool exact, struct list_head *dports); void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, u32 type, u32 domain); void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, diff --git a/net/tipc/server.c b/net/tipc/server.c index 950c54cbcf3a..8ee5e86b7870 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -489,8 +489,8 @@ void tipc_conn_terminate(struct tipc_server *s, int conid) } } -bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, - u32 filter, u32 lower, u32 upper, int *conid) +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, + u32 upper, u32 filter, int *conid) { struct tipc_subscriber *scbr; struct tipc_subscr sub; diff --git a/net/tipc/server.h b/net/tipc/server.h index ea1effbff23e..17f49ee44cfd 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -41,6 +41,8 @@ #include #define TIPC_SERVER_NAME_LEN 32 +#define TIPC_SUB_CLUSTER_SCOPE 0x20 +#define TIPC_SUB_NODE_SCOPE 0x40 #define TIPC_SUB_NO_STATUS 0x80 /** @@ -84,8 +86,8 @@ struct tipc_server { int tipc_conn_sendmsg(struct tipc_server *s, int conid, struct sockaddr_tipc *addr, void *data, size_t len); -bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, - u32 filter, u32 lower, u32 upper, int *conid); +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, + u32 upper, u32 filter, int *conid); void tipc_topsrv_kern_unsubscr(struct net *net, int conid); /** diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e3a02f1fcab5..b24dab3996c9 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -928,21 +928,22 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, struct list_head *cong_links = &tsk->cong_links; int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; struct tipc_member *first = NULL; struct tipc_member *mbr = NULL; struct net *net = sock_net(sk); u32 node, port, exclude; - u32 type, inst, domain; struct list_head dsts; + u32 type, inst, scope; int lookups = 0; int dstcnt, rc; bool cong; INIT_LIST_HEAD(&dsts); - type = dest->addr.name.name.type; + type = msg_nametype(hdr); inst = dest->addr.name.name.instance; - domain = addr_domain(net, dest->scope); + scope = msg_lookup_scope(hdr); exclude = tipc_group_exclude(grp); while (++lookups < 4) { @@ -950,7 +951,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, /* Look for a non-congested destination member, if any */ while (1) { - if (!tipc_nametbl_lookup(net, type, inst, domain, &dsts, + if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts, &dstcnt, exclude, false)) return -EHOSTUNREACH; tipc_dest_pop(&dsts, &node, &port); @@ -1079,22 +1080,23 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); - struct tipc_name_seq *seq = &dest->addr.nameseq; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - u32 domain, exclude, dstcnt; + u32 type, inst, scope, exclude; struct list_head dsts; + u32 dstcnt; INIT_LIST_HEAD(&dsts); - if (seq->lower != seq->upper) - return -ENOTSUPP; - - domain = addr_domain(net, dest->scope); + type = msg_nametype(hdr); + inst = dest->addr.name.name.instance; + scope = msg_lookup_scope(hdr); exclude = tipc_group_exclude(grp); - if (!tipc_nametbl_lookup(net, seq->type, seq->lower, domain, - &dsts, &dstcnt, exclude, true)) + + if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts, + &dstcnt, exclude, true)) return -EHOSTUNREACH; if (dstcnt == 1) { @@ -1116,24 +1118,29 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { - u32 scope = TIPC_CLUSTER_SCOPE; u32 self = tipc_own_addr(net); + u32 type, lower, upper, scope; struct sk_buff *skb, *_skb; - u32 lower = 0, upper = ~0; - struct sk_buff_head tmpq; u32 portid, oport, onode; + struct sk_buff_head tmpq; struct list_head dports; - struct tipc_msg *msg; - int user, mtyp, hsz; + struct tipc_msg *hdr; + int user, mtyp, hlen; + bool exact; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { - msg = buf_msg(skb); - user = msg_user(msg); - mtyp = msg_type(msg); + hdr = buf_msg(skb); + user = msg_user(hdr); + mtyp = msg_type(hdr); + hlen = skb_headroom(skb) + msg_hdr_sz(hdr); + oport = msg_origport(hdr); + onode = msg_orignode(hdr); + type = msg_nametype(hdr); + if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { spin_lock_bh(&inputq->lock); if (skb_peek(arrvq) == skb) { @@ -1144,21 +1151,31 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, spin_unlock_bh(&inputq->lock); continue; } - hsz = skb_headroom(skb) + msg_hdr_sz(msg); - oport = msg_origport(msg); - onode = msg_orignode(msg); - if (onode == self) - scope = TIPC_NODE_SCOPE; - - /* Create destination port list and message clones: */ - if (!msg_in_group(msg)) { - lower = msg_namelower(msg); - upper = msg_nameupper(msg); + + /* Group messages require exact scope match */ + if (msg_in_group(hdr)) { + lower = 0; + upper = ~0; + scope = msg_lookup_scope(hdr); + exact = true; + } else { + /* TIPC_NODE_SCOPE means "any scope" in this context */ + if (onode == self) + scope = TIPC_NODE_SCOPE; + else + scope = TIPC_CLUSTER_SCOPE; + exact = false; + lower = msg_namelower(hdr); + upper = msg_nameupper(hdr); } - tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper, - scope, &dports); + + /* Create destination port list: */ + tipc_nametbl_mc_lookup(net, type, lower, upper, + scope, exact, &dports); + + /* Clone message per destination */ while (tipc_dest_pop(&dports, NULL, &portid)) { - _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); + _skb = __pskb_copy(skb, hlen, GFP_ATOMIC); if (_skb) { msg_set_destport(buf_msg(_skb), portid); __skb_queue_tail(&tmpq, _skb); @@ -2731,7 +2748,6 @@ void tipc_sk_rht_destroy(struct net *net) static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) { struct net *net = sock_net(&tsk->sk); - u32 domain = addr_domain(net, mreq->scope); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq seq; @@ -2739,6 +2755,8 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) if (mreq->type < TIPC_RESERVED_TYPES) return -EACCES; + if (mreq->scope > TIPC_NODE_SCOPE) + return -EINVAL; if (grp) return -EACCES; grp = tipc_group_create(net, tsk->portid, mreq); @@ -2751,7 +2769,7 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) seq.type = mreq->type; seq.lower = mreq->instance; seq.upper = seq.lower; - tipc_nametbl_build_group(net, grp, mreq->type, domain); + tipc_nametbl_build_group(net, grp, mreq->type, mreq->scope); rc = tipc_sk_publish(tsk, mreq->scope, &seq); if (rc) { tipc_group_delete(net, grp); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 1052341a0ea9..44df528ed6ab 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -118,15 +118,19 @@ void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port_ref, - u32 node, int must) + u32 node, u32 scope, int must) { + u32 filter = htohl(sub->evt.s.filter, sub->swap); struct tipc_name_seq seq; tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; - if (!must && - !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS)) + if (!must && !(filter & TIPC_SUB_PORTS)) + return; + if (filter & TIPC_SUB_CLUSTER_SCOPE && scope == TIPC_NODE_SCOPE) + return; + if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE) return; tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index ee52957dc952..f3edca775d9f 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -71,7 +71,7 @@ int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper); void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, - u32 port_ref, u32 node, int must); + u32 port_ref, u32 node, u32 scope, int must); void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, struct tipc_name_seq *out); u32 tipc_subscrp_convert_seq_type(u32 type, int swap); -- cgit v1.2.3-59-g8ed1b From 202a8ff545ccdaa5ac2000d9201df3453c8816be Mon Sep 17 00:00:00 2001 From: Ahmed Abdelsalam Date: Sun, 7 Jan 2018 19:22:02 +0100 Subject: netfilter: add IPv6 segment routing header 'srh' match It allows matching packets based on Segment Routing Header (SRH) information. The implementation considers revision 7 of the SRH draft. https://tools.ietf.org/html/draft-ietf-6man-segment-routing-header-07 Currently supported match options include: (1) Next Header (2) Hdr Ext Len (3) Segments Left (4) Last Entry (5) Tag value of SRH Signed-off-by: Ahmed Abdelsalam Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_ipv6/ip6t_srh.h | 57 ++++++++++ net/ipv6/netfilter/Kconfig | 9 ++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/ip6t_srh.c | 161 +++++++++++++++++++++++++++ 4 files changed, 228 insertions(+) create mode 100644 include/uapi/linux/netfilter_ipv6/ip6t_srh.h create mode 100644 net/ipv6/netfilter/ip6t_srh.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h new file mode 100644 index 000000000000..f3cc0ef514a7 --- /dev/null +++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _IP6T_SRH_H +#define _IP6T_SRH_H + +#include +#include + +/* Values for "mt_flags" field in struct ip6t_srh */ +#define IP6T_SRH_NEXTHDR 0x0001 +#define IP6T_SRH_LEN_EQ 0x0002 +#define IP6T_SRH_LEN_GT 0x0004 +#define IP6T_SRH_LEN_LT 0x0008 +#define IP6T_SRH_SEGS_EQ 0x0010 +#define IP6T_SRH_SEGS_GT 0x0020 +#define IP6T_SRH_SEGS_LT 0x0040 +#define IP6T_SRH_LAST_EQ 0x0080 +#define IP6T_SRH_LAST_GT 0x0100 +#define IP6T_SRH_LAST_LT 0x0200 +#define IP6T_SRH_TAG 0x0400 +#define IP6T_SRH_MASK 0x07FF + +/* Values for "mt_invflags" field in struct ip6t_srh */ +#define IP6T_SRH_INV_NEXTHDR 0x0001 +#define IP6T_SRH_INV_LEN_EQ 0x0002 +#define IP6T_SRH_INV_LEN_GT 0x0004 +#define IP6T_SRH_INV_LEN_LT 0x0008 +#define IP6T_SRH_INV_SEGS_EQ 0x0010 +#define IP6T_SRH_INV_SEGS_GT 0x0020 +#define IP6T_SRH_INV_SEGS_LT 0x0040 +#define IP6T_SRH_INV_LAST_EQ 0x0080 +#define IP6T_SRH_INV_LAST_GT 0x0100 +#define IP6T_SRH_INV_LAST_LT 0x0200 +#define IP6T_SRH_INV_TAG 0x0400 +#define IP6T_SRH_INV_MASK 0x07FF + +/** + * struct ip6t_srh - SRH match options + * @ next_hdr: Next header field of SRH + * @ hdr_len: Extension header length field of SRH + * @ segs_left: Segments left field of SRH + * @ last_entry: Last entry field of SRH + * @ tag: Tag field of SRH + * @ mt_flags: match options + * @ mt_invflags: Invert the sense of match options + */ + +struct ip6t_srh { + __u8 next_hdr; + __u8 hdr_len; + __u8 segs_left; + __u8 last_entry; + __u16 tag; + __u16 mt_flags; + __u16 mt_invflags; +}; + +#endif /*_IP6T_SRH_H*/ diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 806e95375ec8..b6f5edf926d2 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -240,6 +240,15 @@ config IP6_NF_MATCH_RT To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_MATCH_SRH + tristate '"srh" Segment Routing header match support' + depends on NETFILTER_ADVANCED + help + srh matching allows you to match packets based on the segment + routing header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + # The targets config IP6_NF_TARGET_HL tristate '"HL" hoplimit target support' diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 95611c4b39b0..d984057b8395 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o +obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o # targets obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c new file mode 100644 index 000000000000..9642164107ce --- /dev/null +++ b/net/ipv6/netfilter/ip6t_srh.c @@ -0,0 +1,161 @@ +/* Kernel module to match Segment Routing Header (SRH) parameters. */ + +/* Author: + * Ahmed Abdelsalam + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Test a struct->mt_invflags and a boolean for inequality */ +#define NF_SRH_INVF(ptr, flag, boolean) \ + ((boolean) ^ !!((ptr)->mt_invflags & (flag))) + +static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip6t_srh *srhinfo = par->matchinfo; + struct ipv6_sr_hdr *srh; + struct ipv6_sr_hdr _srh; + int hdrlen, srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return false; + srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh); + if (!srh) + return false; + + hdrlen = ipv6_optlen(srh); + if (skb->len - srhoff < hdrlen) + return false; + + if (srh->type != IPV6_SRCRT_TYPE_4) + return false; + + if (srh->segments_left > srh->first_segment) + return false; + + /* Next Header matching */ + if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR, + !(srh->nexthdr == srhinfo->next_hdr))) + return false; + + /* Header Extension Length matching */ + if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ, + !(srh->hdrlen == srhinfo->hdr_len))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LEN_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT, + !(srh->hdrlen > srhinfo->hdr_len))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LEN_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT, + !(srh->hdrlen < srhinfo->hdr_len))) + return false; + + /* Segments Left matching */ + if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ, + !(srh->segments_left == srhinfo->segs_left))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT, + !(srh->segments_left > srhinfo->segs_left))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT, + !(srh->segments_left < srhinfo->segs_left))) + return false; + + /** + * Last Entry matching + * Last_Entry field was introduced in revision 6 of the SRH draft. + * It was called First_Segment in the previous revision + */ + if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ, + !(srh->first_segment == srhinfo->last_entry))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LAST_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT, + !(srh->first_segment > srhinfo->last_entry))) + return false; + + if (srhinfo->mt_flags & IP6T_SRH_LAST_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT, + !(srh->first_segment < srhinfo->last_entry))) + return false; + + /** + * Tag matchig + * Tag field was introduced in revision 6 of the SRH draft. + */ + if (srhinfo->mt_flags & IP6T_SRH_TAG) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG, + !(srh->tag == srhinfo->tag))) + return false; + return true; +} + +static int srh_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_srh *srhinfo = par->matchinfo; + + if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { + pr_err("unknown srh match flags %X\n", srhinfo->mt_flags); + return -EINVAL; + } + + if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { + pr_err("unknown srh invflags %X\n", srhinfo->mt_invflags); + return -EINVAL; + } + + return 0; +} + +static struct xt_match srh_mt6_reg __read_mostly = { + .name = "srh", + .family = NFPROTO_IPV6, + .match = srh_mt6, + .matchsize = sizeof(struct ip6t_srh), + .checkentry = srh_mt6_check, + .me = THIS_MODULE, +}; + +static int __init srh_mt6_init(void) +{ + return xt_register_match(&srh_mt6_reg); +} + +static void __exit srh_mt6_exit(void) +{ + xt_unregister_match(&srh_mt6_reg); +} + +module_init(srh_mt6_init); +module_exit(srh_mt6_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 Segment Routing Header match"); +MODULE_AUTHOR("Ahmed Abdelsalam "); -- cgit v1.2.3-59-g8ed1b From 902d6a4c2a4f411582689e53fb101895ffe99028 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 10 Jan 2018 20:51:57 -0700 Subject: netfilter: nf_defrag: Skip defrag if NOTRACK is set conntrack defrag is needed only if some module like CONNTRACK or NAT explicitly requests it. For plain forwarding scenarios, defrag is not needed and can be skipped if NOTRACK is set in a rule. Since conntrack defrag is currently higher priority than raw table, setting NOTRACK is not sufficient. We need to move raw to a higher priority for iptables only. This is achieved by introducing a module parameter "raw_before_defrag" which allows to change the priority of raw table to place it before defrag. By default, the parameter is disabled and the priority of raw table is NF_IP_PRI_RAW to support legacy behavior. If the module parameter is enabled, then the priority of the raw table is set to NF_IP_PRI_RAW_BEFORE_DEFRAG. Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_ipv4.h | 1 + include/uapi/linux/netfilter_ipv6.h | 1 + net/ipv4/netfilter/iptable_raw.c | 13 ++++++++++++- net/ipv4/netfilter/nf_defrag_ipv4.c | 2 +- net/ipv6/netfilter/ip6table_raw.c | 13 ++++++++++++- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 3 +++ 6 files changed, 30 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h index e6b1a84f5dd3..c3b060775e13 100644 --- a/include/uapi/linux/netfilter_ipv4.h +++ b/include/uapi/linux/netfilter_ipv4.h @@ -57,6 +57,7 @@ enum nf_ip_hook_priorities { NF_IP_PRI_FIRST = INT_MIN, + NF_IP_PRI_RAW_BEFORE_DEFRAG = -450, NF_IP_PRI_CONNTRACK_DEFRAG = -400, NF_IP_PRI_RAW = -300, NF_IP_PRI_SELINUX_FIRST = -225, diff --git a/include/uapi/linux/netfilter_ipv6.h b/include/uapi/linux/netfilter_ipv6.h index 2f9724611cc2..dc624fd24d25 100644 --- a/include/uapi/linux/netfilter_ipv6.h +++ b/include/uapi/linux/netfilter_ipv6.h @@ -62,6 +62,7 @@ enum nf_ip6_hook_priorities { NF_IP6_PRI_FIRST = INT_MIN, + NF_IP6_PRI_RAW_BEFORE_DEFRAG = -450, NF_IP6_PRI_CONNTRACK_DEFRAG = -400, NF_IP6_PRI_RAW = -300, NF_IP6_PRI_SELINUX_FIRST = -225, diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index a869d1fea7d9..29b64d3024e0 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -3,6 +3,7 @@ * * Copyright (C) 2003 Jozsef Kadlecsik */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include #include @@ -12,7 +13,11 @@ static int __net_init iptable_raw_table_init(struct net *net); -static const struct xt_table packet_raw = { +static bool raw_before_defrag __read_mostly; +MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); +module_param(raw_before_defrag, bool, 0000); + +static struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, @@ -64,6 +69,12 @@ static int __init iptable_raw_init(void) { int ret; + if (raw_before_defrag) { + packet_raw.priority = NF_IP_PRI_RAW_BEFORE_DEFRAG; + + pr_info("Enabling raw table before defrag\n"); + } + rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook); if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 37fe1616ca0b..cbd987f6b1f8 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -80,7 +80,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv, #endif #endif /* Gather fragments. */ - if (ip_is_fragment(ip_hdr(skb))) { + if (skb->_nfct != IP_CT_UNTRACKED && ip_is_fragment(ip_hdr(skb))) { enum ip_defrag_users user = nf_ct_defrag_user(state->hook, skb); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index d4bc56443dc1..3df7383f96d0 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -3,6 +3,7 @@ * * Copyright (C) 2003 Jozsef Kadlecsik */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include #include @@ -11,7 +12,11 @@ static int __net_init ip6table_raw_table_init(struct net *net); -static const struct xt_table packet_raw = { +static bool raw_before_defrag __read_mostly; +MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); +module_param(raw_before_defrag, bool, 0000); + +static struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, @@ -63,6 +68,12 @@ static int __init ip6table_raw_init(void) { int ret; + if (raw_before_defrag) { + packet_raw.priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG; + + pr_info("Enabling raw table before defrag\n"); + } + /* Register hooks */ rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook); if (IS_ERR(rawtable_ops)) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index b326da59257f..87b503a8f5ef 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -65,6 +65,9 @@ static unsigned int ipv6_defrag(void *priv, return NF_ACCEPT; #endif + if (skb->_nfct == IP_CT_UNTRACKED) + return NF_ACCEPT; + err = nf_ct_frag6_gather(state->net, skb, nf_ct6_defrag_user(state->hook, skb)); /* queued */ -- cgit v1.2.3-59-g8ed1b From daaf24c634ab951cad3dcef28492001ef9c931d0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 11 Jan 2018 17:39:09 +0100 Subject: bpf: simplify xdp_convert_ctx_access for xdp_rxq_info As pointed out by Daniel Borkmann, using bpf_target_off() is not necessary for xdp_rxq_info when extracting queue_index and ifindex, as these members are u32 like BPF_W. Also fix trivial spelling mistake introduced in same commit. Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs") Reported-by: Daniel Borkmann Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 405317f9c064..395d261948de 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -899,7 +899,7 @@ struct xdp_md { __u32 data; __u32 data_end; __u32 data_meta; - /* Below access go though struct xdp_rxq_info */ + /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ }; diff --git a/net/core/filter.c b/net/core/filter.c index d4b190e63b79..db2ee8c7e1bd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4310,16 +4310,15 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - bpf_target_off(struct net_device, - ifindex, 4, target_size)); + offsetof(struct net_device, ifindex)); break; case offsetof(struct xdp_md, rx_queue_index): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - bpf_target_off(struct xdp_rxq_info, - queue_index, 4, target_size)); + offsetof(struct xdp_rxq_info, + queue_index)); break; } -- cgit v1.2.3-59-g8ed1b From a38845729ea3985db5d2544ec3ef3dc8f6313a27 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:09 -0800 Subject: bpf: offload: add map offload infrastructure BPF map offload follow similar path to program offload. At creation time users may specify ifindex of the device on which they want to create the map. Map will be validated by the kernel's .map_alloc_check callback and device driver will be called for the actual allocation. Map will have an empty set of operations associated with it (save for alloc and free callbacks). The real device callbacks are kept in map->offload->dev_ops because they have slightly different signatures. Map operations are called in process context so the driver may communicate with HW freely, msleep(), wait() etc. Map alloc and free callbacks are muxed via existing .ndo_bpf, and are always called with rtnl lock held. Maps and programs are guaranteed to be destroyed before .ndo_uninit (i.e. before unregister_netdev() returns). Map callbacks are invoked with bpf_devs_lock *read* locked, drivers must take care of exclusive locking if necessary. All offload-specific branches are marked with unlikely() (through bpf_map_is_dev_bound()), given that branch penalty will be negligible compared to IO anyway, and we don't want to penalize SW path unnecessarily. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 59 +++++++++++++ include/linux/netdevice.h | 6 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/offload.c | 188 +++++++++++++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 44 ++++++++-- kernel/bpf/verifier.c | 7 ++ tools/include/uapi/linux/bpf.h | 1 + 7 files changed, 293 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9fff1ace1d8e..5c2c104dc2c5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -74,6 +74,33 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; }; +struct bpf_offloaded_map; + +struct bpf_map_dev_ops { + int (*map_get_next_key)(struct bpf_offloaded_map *map, + void *key, void *next_key); + int (*map_lookup_elem)(struct bpf_offloaded_map *map, + void *key, void *value); + int (*map_update_elem)(struct bpf_offloaded_map *map, + void *key, void *value, u64 flags); + int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); +}; + +struct bpf_offloaded_map { + struct bpf_map map; + struct net_device *netdev; + const struct bpf_map_dev_ops *dev_ops; + void *dev_priv; + struct list_head offloads; +}; + +static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) +{ + return container_of(map, struct bpf_offloaded_map, map); +} + +extern const struct bpf_map_ops bpf_map_offload_ops; + /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -369,6 +396,7 @@ int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); @@ -556,6 +584,15 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); +int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags); +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); +int bpf_map_offload_get_next_key(struct bpf_map *map, + void *key, void *next_key); + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); + #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); @@ -563,6 +600,14 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { return aux->offload_requested; } + +static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +{ + return unlikely(map->ops == &bpf_map_offload_ops); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); +void bpf_map_offload_map_free(struct bpf_map *map); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -574,6 +619,20 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { return false; } + +static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +{ + return false; +} + +static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void bpf_map_offload_map_free(struct bpf_map *map) +{ +} #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ef7b348e8498..0b3ab42d50fe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -804,6 +804,8 @@ enum bpf_netdev_command { BPF_OFFLOAD_VERIFIER_PREP, BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY, + BPF_OFFLOAD_MAP_ALLOC, + BPF_OFFLOAD_MAP_FREE, }; struct bpf_prog_offload_ops; @@ -834,6 +836,10 @@ struct netdev_bpf { struct { struct bpf_prog *prog; } offload; + /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ + struct { + struct bpf_offloaded_map *offmap; + }; }; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 395d261948de..7c2259e8bc54 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -245,6 +245,7 @@ union bpf_attr { * BPF_F_NUMA_NODE is set). */ char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index cdd1e19a668b..453785fa1881 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -24,11 +24,13 @@ #include #include -/* Protects bpf_prog_offload_devs and offload members of all progs. +/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members + * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static LIST_HEAD(bpf_map_offload_devs); static int bpf_dev_offload_check(struct net_device *netdev) { @@ -250,11 +252,186 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, const struct bpf_prog_ops bpf_offload_prog_ops = { }; +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, + enum bpf_netdev_command cmd) +{ + struct netdev_bpf data = {}; + struct net_device *netdev; + + ASSERT_RTNL(); + + data.command = cmd; + data.offmap = offmap; + /* Caller must make sure netdev is valid */ + netdev = offmap->netdev; + + return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_offloaded_map *offmap; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->map_type != BPF_MAP_TYPE_HASH) + return ERR_PTR(-EINVAL); + + offmap = kzalloc(sizeof(*offmap), GFP_USER); + if (!offmap) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&offmap->map, attr); + + rtnl_lock(); + down_write(&bpf_devs_lock); + offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); + err = bpf_dev_offload_check(offmap->netdev); + if (err) + goto err_unlock; + + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); + if (err) + goto err_unlock; + + list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + return &offmap->map; + +err_unlock: + up_write(&bpf_devs_lock); + rtnl_unlock(); + kfree(offmap); + return ERR_PTR(err); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ + WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); + /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ + bpf_map_free_id(&offmap->map, true); + list_del_init(&offmap->offloads); + offmap->netdev = NULL; +} + +void bpf_map_offload_map_free(struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + + rtnl_lock(); + down_write(&bpf_devs_lock); + if (offmap->netdev) + __bpf_map_offload_destroy(offmap); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + kfree(offmap); +} + +int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_update_elem(offmap, key, value, + flags); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_delete_elem(offmap, key); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); + up_read(&bpf_devs_lock); + + return ret; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + struct bpf_prog_offload *offload; + bool ret; + + if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map)) + return false; + if (!bpf_prog_is_dev_bound(prog->aux)) + return true; + + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + offmap = map_to_offmap(map); + + ret = offload && offload->netdev == offmap->netdev; + up_read(&bpf_devs_lock); + + return ret; +} + +static void bpf_offload_orphan_all_progs(struct net_device *netdev) +{ + struct bpf_prog_offload *offload, *tmp; + + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); +} + +static void bpf_offload_orphan_all_maps(struct net_device *netdev) +{ + struct bpf_offloaded_map *offmap, *tmp; + + list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) + if (offmap->netdev == netdev) + __bpf_map_offload_destroy(offmap); +} + static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_prog_offload *offload, *tmp; ASSERT_RTNL(); @@ -265,11 +442,8 @@ static int bpf_offload_notification(struct notifier_block *notifier, break; down_write(&bpf_devs_lock); - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, - offloads) { - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); - } + bpf_offload_orphan_all_progs(netdev); + bpf_offload_orphan_all_maps(netdev); up_write(&bpf_devs_lock); break; default: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3f726bb42ea..c691b9e972e3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -94,6 +94,11 @@ static int check_uarg_tail_zero(void __user *uaddr, return 0; } +const struct bpf_map_ops bpf_map_offload_ops = { + .map_alloc = bpf_map_offload_map_alloc, + .map_free = bpf_map_offload_map_free, +}; + static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { const struct bpf_map_ops *ops; @@ -111,6 +116,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) if (err) return ERR_PTR(err); } + if (attr->map_ifindex) + ops = &bpf_map_offload_ops; map = ops->map_alloc(attr); if (IS_ERR(map)) return map; @@ -208,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { unsigned long flags; + /* Offloaded maps are removed from the IDR store when their device + * disappears - even if someone holds an fd to them they are unusable, + * the memory is gone, all ops will fail; they are simply waiting for + * refcnt to drop to be freed. + */ + if (!map->id) + return; + if (do_idr_lock) spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); + map->id = 0; if (do_idr_lock) spin_unlock_irqrestore(&map_idr_lock, flags); @@ -397,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_name +#define BPF_MAP_CREATE_LAST_FIELD map_ifindex /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -585,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -673,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr) goto free_value; /* Need to create a kthread, thus must support schedule */ - if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_update_elem(map, key, value, attr->flags); + goto out; + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } @@ -750,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + goto out; + } + preempt_disable(); __this_cpu_inc(bpf_prog_active); rcu_read_lock(); @@ -757,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr) rcu_read_unlock(); __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_delete_elem(map, ufd, key); kfree(key); @@ -807,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_get_next_key(map, key, next_key); + goto out; + } + rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); +out: if (err) goto free_next_key; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48b61caa94cb..ceabb394d2dc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4816,6 +4816,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } } + + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && + !bpf_offload_dev_match(prog, map)) { + verbose(env, "offload device mismatch between prog and map\n"); + return -EINVAL; + } + return 0; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4e8c60acfa32..69f96af4a569 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -245,6 +245,7 @@ union bpf_attr { * BPF_F_NUMA_NODE is set). */ char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ -- cgit v1.2.3-59-g8ed1b From 2290aefa2e90a43af8555ad6431d49de43259aa3 Mon Sep 17 00:00:00 2001 From: Franklin S Cooper Jr Date: Wed, 10 Jan 2018 16:25:18 +0530 Subject: can: dev: Add support for limiting configured bitrate Various CAN or CAN-FD IP may be able to run at a faster rate than what the transceiver the CAN node is connected to. This can lead to unexpected errors. However, CAN transceivers typically have fixed limitations and provide no means to discover these limitations at runtime. Therefore, add support for a can-transceiver node that can be reused by other CAN peripheral drivers to determine for both CAN and CAN-FD what the max bitrate that can be used. If the user tries to configure CAN to pass these maximum bitrates it will throw an error. Also add support for reading bitrate_max via the netlink interface. Reviewed-by: Suman Anna Signed-off-by: Franklin S Cooper Jr [nsekhar@ti.com: fix build error with !CONFIG_OF] Signed-off-by: Sekhar Nori Signed-off-by: Faiz Abbas Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev.c | 45 +++++++++++++++++++++++++++++++++++++++- include/linux/can/dev.h | 7 +++++++ include/uapi/linux/can/netlink.h | 1 + 3 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 365a8cc62405..cc94604b23e0 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #define MOD_DESC "CAN device driver interface" @@ -814,6 +815,29 @@ int open_candev(struct net_device *dev) } EXPORT_SYMBOL_GPL(open_candev); +#ifdef CONFIG_OF +/* Common function that can be used to understand the limitation of + * a transceiver when it provides no means to determine these limitations + * at runtime. + */ +void of_can_transceiver(struct net_device *dev) +{ + struct device_node *dn; + struct can_priv *priv = netdev_priv(dev); + struct device_node *np = dev->dev.parent->of_node; + int ret; + + dn = of_get_child_by_name(np, "can-transceiver"); + if (!dn) + return; + + ret = of_property_read_u32(dn, "max-bitrate", &priv->bitrate_max); + if ((ret && ret != -EINVAL) || (!ret && !priv->bitrate_max)) + netdev_warn(dev, "Invalid value for transceiver max bitrate. Ignoring bitrate limit.\n"); +} +EXPORT_SYMBOL_GPL(of_can_transceiver); +#endif + /* * Common close function for cleanup before the device gets closed. * @@ -913,6 +937,13 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], priv->bitrate_const_cnt); if (err) return err; + + if (priv->bitrate_max && bt.bitrate > priv->bitrate_max) { + netdev_err(dev, "arbitration bitrate surpasses transceiver capabilities of %d bps\n", + priv->bitrate_max); + return -EINVAL; + } + memcpy(&priv->bittiming, &bt, sizeof(bt)); if (priv->do_set_bittiming) { @@ -997,6 +1028,13 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], priv->data_bitrate_const_cnt); if (err) return err; + + if (priv->bitrate_max && dbt.bitrate > priv->bitrate_max) { + netdev_err(dev, "canfd data bitrate surpasses transceiver capabilities of %d bps\n", + priv->bitrate_max); + return -EINVAL; + } + memcpy(&priv->data_bittiming, &dbt, sizeof(dbt)); if (priv->do_set_data_bittiming) { @@ -1064,6 +1102,7 @@ static size_t can_get_size(const struct net_device *dev) if (priv->data_bitrate_const) /* IFLA_CAN_DATA_BITRATE_CONST */ size += nla_total_size(sizeof(*priv->data_bitrate_const) * priv->data_bitrate_const_cnt); + size += sizeof(priv->bitrate_max); /* IFLA_CAN_BITRATE_MAX */ return size; } @@ -1121,7 +1160,11 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put(skb, IFLA_CAN_DATA_BITRATE_CONST, sizeof(*priv->data_bitrate_const) * priv->data_bitrate_const_cnt, - priv->data_bitrate_const)) + priv->data_bitrate_const)) || + + (nla_put(skb, IFLA_CAN_BITRATE_MAX, + sizeof(priv->bitrate_max), + &priv->bitrate_max)) ) return -EMSGSIZE; diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 61f1cf2d9f44..055aaf5ed9af 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -46,6 +46,7 @@ struct can_priv { unsigned int bitrate_const_cnt; const u32 *data_bitrate_const; unsigned int data_bitrate_const_cnt; + u32 bitrate_max; struct can_clock clock; enum can_state state; @@ -166,6 +167,12 @@ void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx); void can_free_echo_skb(struct net_device *dev, unsigned int idx); +#ifdef CONFIG_OF +void of_can_transceiver(struct net_device *dev); +#else +static inline void of_can_transceiver(struct net_device *dev) { } +#endif + struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd); diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 96710e76d5ce..9f56fad4785b 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -132,6 +132,7 @@ enum { IFLA_CAN_TERMINATION_CONST, IFLA_CAN_BITRATE_CONST, IFLA_CAN_DATA_BITRATE_CONST, + IFLA_CAN_BITRATE_MAX, __IFLA_CAN_MAX }; -- cgit v1.2.3-59-g8ed1b From d9f9b9a4d05fab693fd23a9ecaa330e03ebe2c31 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:03 +0100 Subject: devlink: Add support for resource abstraction Add support for hardware resource abstraction over devlink. Each resource is identified via id, furthermore it contains information regarding its size and its related sub resources. Each resource can also provide its current occupancy. In some cases the sizes of some resources can be changed, yet for those changes to take place a hot driver reload may be needed. The reload capability will be introduced in the next patch. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 96 +++++++++++ include/uapi/linux/devlink.h | 18 +++ net/core/devlink.c | 374 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 488 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4d2c6fc94837..ceb1895d119b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -26,6 +26,7 @@ struct devlink { struct list_head port_list; struct list_head sb_list; struct list_head dpipe_table_list; + struct list_head resource_list; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -224,6 +225,61 @@ struct devlink_dpipe_headers { unsigned int headers_count; }; +/** + * struct devlink_resource_ops - resource ops + * @occ_get: get the occupied size + * @size_validate: validate the size of the resource before update, reload + * is needed for changes to take place + */ +struct devlink_resource_ops { + u64 (*occ_get)(struct devlink *devlink); + int (*size_validate)(struct devlink *devlink, u64 size, + struct netlink_ext_ack *extack); +}; + +/** + * struct devlink_resource_size_params - resource's size parameters + * @size_min: minimum size which can be set + * @size_max: maximum size which can be set + * @size_granularity: size granularity + * @size_unit: resource's basic unit + */ +struct devlink_resource_size_params { + u64 size_min; + u64 size_max; + u64 size_granularity; + enum devlink_resource_unit unit; +}; + +/** + * struct devlink_resource - devlink resource + * @name: name of the resource + * @id: id, per devlink instance + * @size: size of the resource + * @size_new: updated size of the resource, reload is needed + * @size_valid: valid in case the total size of the resource is valid + * including its children + * @parent: parent resource + * @size_params: size parameters + * @list: parent list + * @resource_list: list of child resources + * @resource_ops: resource ops + */ +struct devlink_resource { + const char *name; + u64 id; + u64 size; + u64 size_new; + bool size_valid; + struct devlink_resource *parent; + struct devlink_resource_size_params *size_params; + struct list_head list; + struct list_head resource_list; + const struct devlink_resource_ops *resource_ops; +}; + +#define DEVLINK_RESOURCE_ID_PARENT_TOP 0 + struct devlink_ops { int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); @@ -333,6 +389,20 @@ extern struct devlink_dpipe_header devlink_dpipe_header_ethernet; extern struct devlink_dpipe_header devlink_dpipe_header_ipv4; extern struct devlink_dpipe_header devlink_dpipe_header_ipv6; +int devlink_resource_register(struct devlink *devlink, + const char *resource_name, + bool top_hierarchy, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + struct devlink_resource_size_params *size_params, + const struct devlink_resource_ops *resource_ops); +void devlink_resources_unregister(struct devlink *devlink, + struct devlink_resource *resource); +int devlink_resource_size_get(struct devlink *devlink, + u64 resource_id, + u64 *p_resource_size); + #else static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, @@ -469,6 +539,32 @@ devlink_dpipe_match_put(struct sk_buff *skb, return 0; } +static inline int +devlink_resource_register(struct devlink *devlink, + const char *resource_name, + bool top_hierarchy, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + struct devlink_resource_size_params *size_params, + const struct devlink_resource_ops *resource_ops) +{ + return 0; +} + +static inline void +devlink_resources_unregister(struct devlink *devlink, + struct devlink_resource *resource) +{ +} + +static inline int +devlink_resource_size_get(struct devlink *devlink, u64 resource_id, + u64 *p_resource_size) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6665df69e26a..f89950443e17 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -70,6 +70,8 @@ enum devlink_command { DEVLINK_CMD_DPIPE_ENTRIES_GET, DEVLINK_CMD_DPIPE_HEADERS_GET, DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, + DEVLINK_CMD_RESOURCE_SET, + DEVLINK_CMD_RESOURCE_DUMP, /* add new commands above here */ __DEVLINK_CMD_MAX, @@ -202,6 +204,18 @@ enum devlink_attr { DEVLINK_ATTR_PAD, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, /* u8 */ + DEVLINK_ATTR_RESOURCE_LIST, /* nested */ + DEVLINK_ATTR_RESOURCE, /* nested */ + DEVLINK_ATTR_RESOURCE_NAME, /* string */ + DEVLINK_ATTR_RESOURCE_ID, /* u64 */ + DEVLINK_ATTR_RESOURCE_SIZE, /* u64 */ + DEVLINK_ATTR_RESOURCE_SIZE_NEW, /* u64 */ + DEVLINK_ATTR_RESOURCE_SIZE_VALID, /* u8 */ + DEVLINK_ATTR_RESOURCE_SIZE_MIN, /* u64 */ + DEVLINK_ATTR_RESOURCE_SIZE_MAX, /* u64 */ + DEVLINK_ATTR_RESOURCE_SIZE_GRAN, /* u64 */ + DEVLINK_ATTR_RESOURCE_UNIT, /* u8 */ + DEVLINK_ATTR_RESOURCE_OCC, /* u64 */ /* add new attributes above here, update the policy in devlink.c */ @@ -245,4 +259,8 @@ enum devlink_dpipe_header_id { DEVLINK_DPIPE_HEADER_IPV6, }; +enum devlink_resource_unit { + DEVLINK_RESOURCE_UNIT_ENTRY, +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 2f71734c4ff6..89b3704fa450 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2288,6 +2288,233 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, counters_enable); } +struct devlink_resource * +devlink_resource_find(struct devlink *devlink, + struct devlink_resource *resource, u64 resource_id) +{ + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + struct devlink_resource *child_resource; + + if (resource->id == resource_id) + return resource; + + child_resource = devlink_resource_find(devlink, resource, + resource_id); + if (child_resource) + return child_resource; + } + return NULL; +} + +void devlink_resource_validate_children(struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + bool size_valid = true; + u64 parts_size = 0; + + if (list_empty(&resource->resource_list)) + goto out; + + list_for_each_entry(child_resource, &resource->resource_list, list) + parts_size += child_resource->size_new; + + if (parts_size > resource->size) + size_valid = false; +out: + resource->size_valid = size_valid; +} + +static int devlink_nl_cmd_resource_set(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + u64 resource_id; + u64 size; + int err; + + if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] || + !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]) + return -EINVAL; + resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) + return -EINVAL; + + if (!resource->resource_ops->size_validate) + return -EINVAL; + + size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); + err = resource->resource_ops->size_validate(devlink, size, + info->extack); + if (err) + return err; + + resource->size_new = size; + devlink_resource_validate_children(resource); + if (resource->parent) + devlink_resource_validate_children(resource->parent); + return 0; +} + +static void +devlink_resource_size_params_put(struct devlink_resource *resource, + struct sk_buff *skb) +{ + struct devlink_resource_size_params *size_params; + + size_params = resource->size_params; + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, + size_params->size_granularity, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, + size_params->size_max, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, + size_params->size_min, DEVLINK_ATTR_PAD); + nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit); +} + +static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, + struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + struct nlattr *child_resource_attr; + struct nlattr *resource_attr; + + resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE); + if (!resource_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size, + DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (resource->size != resource->size_new) + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, + resource->size_new, DEVLINK_ATTR_PAD); + if (resource->resource_ops && resource->resource_ops->occ_get) + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, + resource->resource_ops->occ_get(devlink), + DEVLINK_ATTR_PAD); + devlink_resource_size_params_put(resource, skb); + if (list_empty(&resource->resource_list)) + goto out; + + if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID, + resource->size_valid)) + goto nla_put_failure; + + child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + if (!child_resource_attr) + goto nla_put_failure; + + list_for_each_entry(child_resource, &resource->resource_list, list) { + if (devlink_resource_put(devlink, skb, child_resource)) + goto resource_put_failure; + } + + nla_nest_end(skb, child_resource_attr); +out: + nla_nest_end(skb, resource_attr); + return 0; + +resource_put_failure: + nla_nest_cancel(skb, child_resource_attr); +nla_put_failure: + nla_nest_cancel(skb, resource_attr); + return -EMSGSIZE; +} + +static int devlink_resource_fill(struct genl_info *info, + enum devlink_command cmd, int flags) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + struct nlattr *resources_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + bool incomplete; + void *hdr; + int i; + int err; + + resource = list_first_entry(&devlink->resource_list, + struct devlink_resource, list); +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + + resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + if (!resources_attr) + goto nla_put_failure; + + incomplete = false; + i = 0; + list_for_each_entry_from(resource, &devlink->resource_list, list) { + err = devlink_resource_put(devlink, skb, resource); + if (err) { + if (!i) + goto err_resource_put; + incomplete = true; + break; + } + i++; + } + nla_nest_end(skb, resources_attr); + genlmsg_end(skb, hdr); + if (incomplete) + goto start_again; +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_resource_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + if (list_empty(&devlink->resource_list)) + return -EOPNOTSUPP; + + return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2306,6 +2533,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, + [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, + [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, }; static const struct genl_ops devlink_nl_ops[] = { @@ -2466,6 +2695,20 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_RESOURCE_SET, + .doit = devlink_nl_cmd_resource_set, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_RESOURCE_DUMP, + .doit = devlink_nl_cmd_resource_dump, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -2503,6 +2746,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); + INIT_LIST_HEAD(&devlink->resource_list); mutex_init(&devlink->lock); return devlink; } @@ -2833,6 +3077,136 @@ unlock: } EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); +/** + * devlink_resource_register - devlink resource register + * + * @devlink: devlink + * @resource_name: resource's name + * @top_hierarchy: top hierarchy + * @reload_required: reload is required for new configuration to + * apply + * @resource_size: resource's size + * @resource_id: resource's id + * @parent_reosurce_id: resource's parent id + * @size params: size parameters + * @resource_ops: resource ops + */ +int devlink_resource_register(struct devlink *devlink, + const char *resource_name, + bool top_hierarchy, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + struct devlink_resource_size_params *size_params, + const struct devlink_resource_ops *resource_ops) +{ + struct devlink_resource *resource; + struct list_head *resource_list; + int err = 0; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (resource) { + err = -EINVAL; + goto out; + } + + resource = kzalloc(sizeof(*resource), GFP_KERNEL); + if (!resource) { + err = -ENOMEM; + goto out; + } + + if (top_hierarchy) { + resource_list = &devlink->resource_list; + } else { + struct devlink_resource *parent_resource; + + parent_resource = devlink_resource_find(devlink, NULL, + parent_resource_id); + if (parent_resource) { + resource_list = &parent_resource->resource_list; + resource->parent = parent_resource; + } else { + err = -EINVAL; + goto out; + } + } + + resource->name = resource_name; + resource->size = resource_size; + resource->size_new = resource_size; + resource->id = resource_id; + resource->resource_ops = resource_ops; + resource->size_valid = true; + resource->size_params = size_params; + INIT_LIST_HEAD(&resource->resource_list); + list_add_tail(&resource->list, resource_list); +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_register); + +/** + * devlink_resources_unregister - free all resources + * + * @devlink: devlink + * @resource: resource + */ +void devlink_resources_unregister(struct devlink *devlink, + struct devlink_resource *resource) +{ + struct devlink_resource *tmp, *child_resource; + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + if (!resource) + mutex_lock(&devlink->lock); + + list_for_each_entry_safe(child_resource, tmp, resource_list, list) { + devlink_resources_unregister(devlink, child_resource); + list_del(&child_resource->list); + kfree(child_resource); + } + + if (!resource) + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_resources_unregister); + +/** + * devlink_resource_size_get - get and update size + * + * @devlink: devlink + * @resource_id: the requested resource id + * @p_resource_size: ptr to update + */ +int devlink_resource_size_get(struct devlink *devlink, + u64 resource_id, + u64 *p_resource_size) +{ + struct devlink_resource *resource; + int err = 0; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) { + err = -EINVAL; + goto out; + } + *p_resource_size = resource->size_new; + resource->size = resource->size_new; +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_size_get); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3-59-g8ed1b From 2d8dc5bbf4e7603747875eb5cadcd67c1fa8b1bb Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:04 +0100 Subject: devlink: Add support for reload Add support for performing driver hot reload. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 1 + include/uapi/linux/devlink.h | 5 +++++ net/core/devlink.c | 47 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index ceb1895d119b..c698883fb0bb 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -281,6 +281,7 @@ struct devlink_resource { #define DEVLINK_RESOURCE_ID_PARENT_TOP 0 struct devlink_ops { + int (*reload)(struct devlink *devlink); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index f89950443e17..555ddcaf0be2 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -73,6 +73,11 @@ enum devlink_command { DEVLINK_CMD_RESOURCE_SET, DEVLINK_CMD_RESOURCE_DUMP, + /* Hot driver reload, makes configuration changes take place. The + * devlink instance is not released during the process. + */ + DEVLINK_CMD_RELOAD, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 89b3704fa450..4c3d85560436 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2515,6 +2515,45 @@ static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); } +static int +devlink_resources_validate(struct devlink *devlink, + struct devlink_resource *resource, + struct genl_info *info) +{ + struct list_head *resource_list; + int err = 0; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + if (!resource->size_valid) + return -EINVAL; + err = devlink_resources_validate(devlink, resource, info); + if (err) + return err; + } + return err; +} + +static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + int err; + + if (!devlink->ops->reload) + return -EOPNOTSUPP; + + err = devlink_resources_validate(devlink, NULL, info); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); + return err; + } + return devlink->ops->reload(devlink); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2709,6 +2748,14 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_RELOAD, + .doit = devlink_nl_cmd_reload, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3-59-g8ed1b From 56dc7cd0a87a1ff4f49ee1e67bd88e768385d51a Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:05 +0100 Subject: devlink: Add relation between dpipe and resource The hardware processes which are modeled via dpipe commonly use some internal hardware resources. Such relation can improve the understanding of hardware limitations. The number of resource's unit consumed per table's entry are also provided for each table. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 17 +++++++++++++++++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index c698883fb0bb..6545b03e97f7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -183,6 +183,9 @@ struct devlink_dpipe_table_ops; * @counters_enabled: indicates if counters are active * @counter_control_extern: indicates if counter control is in dpipe or * external tool + * @resource_valid: Indicate that the resource id is valid + * @resource_id: relative resource this table is related to + * @resource_units: number of resource's unit consumed per table's entry * @table_ops: table operations * @rcu: rcu */ @@ -192,6 +195,9 @@ struct devlink_dpipe_table { const char *name; bool counters_enabled; bool counter_control_extern; + bool resource_valid; + u64 resource_id; + u64 resource_units; struct devlink_dpipe_table_ops *table_ops; struct rcu_head rcu; }; @@ -403,6 +409,9 @@ void devlink_resources_unregister(struct devlink *devlink, int devlink_resource_size_get(struct devlink *devlink, u64 resource_id, u64 *p_resource_size); +int devlink_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units); #else @@ -566,6 +575,14 @@ devlink_resource_size_get(struct devlink *devlink, u64 resource_id, return -EOPNOTSUPP; } +static inline int +devlink_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 555ddcaf0be2..1df65a4c2044 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -221,6 +221,8 @@ enum devlink_attr { DEVLINK_ATTR_RESOURCE_SIZE_GRAN, /* u64 */ DEVLINK_ATTR_RESOURCE_UNIT, /* u8 */ DEVLINK_ATTR_RESOURCE_OCC, /* u64 */ + DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, /* u64 */ + DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 4c3d85560436..dd7d6dd07bfb 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1694,6 +1694,12 @@ static int devlink_dpipe_table_put(struct sk_buff *skb, table->counters_enabled)) goto nla_put_failure; + if (table->resource_valid) { + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, + table->resource_id, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, + table->resource_units, DEVLINK_ATTR_PAD); + } if (devlink_dpipe_matches_put(table, skb)) goto nla_put_failure; @@ -3254,6 +3260,37 @@ out: } EXPORT_SYMBOL_GPL(devlink_resource_size_get); +/** + * devlink_dpipe_table_resource_set - set the resource id + * + * @devlink: devlink + * @table_name: table name + * @resource_id: resource id + * @resource_units: number of resource's units consumed per table's entry + */ +int devlink_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units) +{ + struct devlink_dpipe_table *table; + int err = 0; + + mutex_lock(&devlink->lock); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) { + err = -EINVAL; + goto out; + } + table->resource_id = resource_id; + table->resource_units = resource_units; + table->resource_valid = true; +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3-59-g8ed1b From 7960d1daf278cbe23bb48974fe6ae6a1e44c3c15 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 Jan 2018 11:46:51 +0100 Subject: net: sched: use block index as a handle instead of qdisc when block is shared As the tcm_ifindex with value TCM_IFINDEX_MAGIC_BLOCK is invalid ifindex, use it to indicate that we work with block, instead of qdisc. So if tcm_ifindex is set to TCM_IFINDEX_MAGIC_BLOCK, tcm_parent is used to carry block_index. If the block is set to be shared between at least 2 qdiscs, it is forbidden to use the qdisc handle to add/delete filters. In that case, userspace has to pass block_index. Also, for dump of the filters, in case the block is shared in between at least 2 qdiscs, the each filter is dumped with tcm_ifindex value TCM_IFINDEX_MAGIC_BLOCK and tcm_parent set to block_index. That gives the user clear indication, that the filter belongs to a shared block and not only to one qdisc under which it is dumped. Suggested-by: David Ahern Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Acked-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 10 ++ net/sched/cls_api.c | 202 ++++++++++++++++++++++++----------------- 2 files changed, 128 insertions(+), 84 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 843e29aa3cac..da878f2e7c39 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -541,9 +541,19 @@ struct tcmsg { int tcm_ifindex; __u32 tcm_handle; __u32 tcm_parent; +/* tcm_block_index is used instead of tcm_parent + * in case tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK + */ +#define tcm_block_index tcm_parent __u32 tcm_info; }; +/* For manipulation of filters in shared block, tcm_ifindex is set to + * TCM_IFINDEX_MAGIC_BLOCK, and tcm_parent is aliased to tcm_block_index + * which is the block index. + */ +#define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU) + enum { TCA_UNSPEC, TCA_KIND, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 03e2fa092d9e..e500d11da9cd 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -865,8 +865,9 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, } static int tcf_fill_node(struct net *net, struct sk_buff *skb, - struct tcf_proto *tp, struct Qdisc *q, u32 parent, - void *fh, u32 portid, u32 seq, u16 flags, int event) + struct tcf_proto *tp, struct tcf_block *block, + struct Qdisc *q, u32 parent, void *fh, + u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -879,8 +880,13 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; - tcm->tcm_ifindex = qdisc_dev(q)->ifindex; - tcm->tcm_parent = parent; + if (q) { + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; + tcm->tcm_parent = parent; + } else { + tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK; + tcm->tcm_block_index = block->index; + } tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; @@ -903,8 +909,8 @@ nla_put_failure: static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, - struct Qdisc *q, u32 parent, - void *fh, int event, bool unicast) + struct tcf_block *block, struct Qdisc *q, + u32 parent, void *fh, int event, bool unicast) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -913,8 +919,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq, - n->nlmsg_flags, event) <= 0) { + if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, + n->nlmsg_seq, n->nlmsg_flags, event) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -928,8 +934,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, - struct Qdisc *q, u32 parent, - void *fh, bool unicast, bool *last) + struct tcf_block *block, struct Qdisc *q, + u32 parent, void *fh, bool unicast, bool *last) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -939,8 +945,8 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq, - n->nlmsg_flags, RTM_DELTFILTER) <= 0) { + if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, + n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -959,15 +965,16 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, } static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, - struct Qdisc *q, u32 parent, - struct nlmsghdr *n, + struct tcf_block *block, struct Qdisc *q, + u32 parent, struct nlmsghdr *n, struct tcf_chain *chain, int event) { struct tcf_proto *tp; for (tp = rtnl_dereference(chain->filter_chain); tp; tp = rtnl_dereference(tp->next)) - tfilter_notify(net, oskb, n, tp, q, parent, 0, event, false); + tfilter_notify(net, oskb, n, tp, block, + q, parent, 0, event, false); } /* Add/change/delete/get a filter node */ @@ -983,13 +990,11 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, bool prio_allocate; u32 parent; u32 chain_index; - struct net_device *dev; - struct Qdisc *q; + struct Qdisc *q = NULL; struct tcf_chain_info chain_info; struct tcf_chain *chain = NULL; struct tcf_block *block; struct tcf_proto *tp; - const struct Qdisc_class_ops *cops; unsigned long cl; void *fh; int err; @@ -1036,41 +1041,58 @@ replay: /* Find head of filter chain. */ - /* Find link */ - dev = __dev_get_by_index(net, t->tcm_ifindex); - if (dev == NULL) - return -ENODEV; - - /* Find qdisc */ - if (!parent) { - q = dev->qdisc; - parent = q->handle; + if (t->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { + block = tcf_block_lookup(net, t->tcm_block_index); + if (!block) { + NL_SET_ERR_MSG(extack, "Block of given index was not found"); + err = -EINVAL; + goto errout; + } } else { - q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent)); - if (q == NULL) - return -EINVAL; - } + const struct Qdisc_class_ops *cops; + struct net_device *dev; - /* Is it classful? */ - cops = q->ops->cl_ops; - if (!cops) - return -EINVAL; + /* Find link */ + dev = __dev_get_by_index(net, t->tcm_ifindex); + if (!dev) + return -ENODEV; - if (!cops->tcf_block) - return -EOPNOTSUPP; + /* Find qdisc */ + if (!parent) { + q = dev->qdisc; + parent = q->handle; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent)); + if (!q) + return -EINVAL; + } - /* Do we search for filter, attached to class? */ - if (TC_H_MIN(parent)) { - cl = cops->find(q, parent); - if (cl == 0) - return -ENOENT; - } + /* Is it classful? */ + cops = q->ops->cl_ops; + if (!cops) + return -EINVAL; - /* And the last stroke */ - block = cops->tcf_block(q, cl, extack); - if (!block) { - err = -EINVAL; - goto errout; + if (!cops->tcf_block) + return -EOPNOTSUPP; + + /* Do we search for filter, attached to class? */ + if (TC_H_MIN(parent)) { + cl = cops->find(q, parent); + if (cl == 0) + return -ENOENT; + } + + /* And the last stroke */ + block = cops->tcf_block(q, cl, extack); + if (!block) { + err = -EINVAL; + goto errout; + } + if (tcf_block_shared(block)) { + NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); + err = -EOPNOTSUPP; + goto errout; + } } chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; @@ -1086,7 +1108,7 @@ replay: } if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { - tfilter_notify_chain(net, skb, q, parent, n, + tfilter_notify_chain(net, skb, block, q, parent, n, chain, RTM_DELTFILTER); tcf_chain_flush(chain); err = 0; @@ -1134,7 +1156,7 @@ replay: if (!fh) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { tcf_chain_tp_remove(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, q, parent, fh, + tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_DELTFILTER, false); tcf_proto_destroy(tp); err = 0; @@ -1159,8 +1181,8 @@ replay: } break; case RTM_DELTFILTER: - err = tfilter_del_notify(net, skb, n, tp, q, parent, - fh, false, &last); + err = tfilter_del_notify(net, skb, n, tp, block, + q, parent, fh, false, &last); if (err) goto errout; if (last) { @@ -1169,8 +1191,8 @@ replay: } goto errout; case RTM_GETTFILTER: - err = tfilter_notify(net, skb, n, tp, q, parent, fh, - RTM_NEWTFILTER, true); + err = tfilter_notify(net, skb, n, tp, block, q, parent, + fh, RTM_NEWTFILTER, true); goto errout; default: err = -EINVAL; @@ -1183,7 +1205,7 @@ replay: if (err == 0) { if (tp_created) tcf_chain_tp_insert(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, q, parent, fh, + tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false); } else { if (tp_created) @@ -1203,6 +1225,7 @@ struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; struct netlink_callback *cb; + struct tcf_block *block; struct Qdisc *q; u32 parent; }; @@ -1212,7 +1235,7 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct tcf_dump_args *a = (void *)arg; struct net *net = sock_net(a->skb->sk); - return tcf_fill_node(net, a->skb, tp, a->q, a->parent, + return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); @@ -1223,6 +1246,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, long index_start, long *p_index) { struct net *net = sock_net(skb->sk); + struct tcf_block *block = chain->block; struct tcmsg *tcm = nlmsg_data(cb->nlh); struct tcf_dump_args arg; struct tcf_proto *tp; @@ -1241,7 +1265,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, q, parent, 0, + if (tcf_fill_node(net, skb, tp, block, q, parent, 0, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) @@ -1254,6 +1278,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.fn = tcf_node_dump; arg.skb = skb; arg.cb = cb; + arg.block = block; arg.q = q; arg.parent = parent; arg.w.stop = 0; @@ -1272,13 +1297,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; - struct net_device *dev; - struct Qdisc *q; + struct Qdisc *q = NULL; struct tcf_block *block; struct tcf_chain *chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); - unsigned long cl = 0; - const struct Qdisc_class_ops *cops; long index_start; long index; u32 parent; @@ -1291,32 +1313,44 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return skb->len; - - parent = tcm->tcm_parent; - if (!parent) { - q = dev->qdisc; - parent = q->handle; + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { + block = tcf_block_lookup(net, tcm->tcm_block_index); + if (!block) + goto out; } else { - q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); - } - if (!q) - goto out; - cops = q->ops->cl_ops; - if (!cops) - goto out; - if (!cops->tcf_block) - goto out; - if (TC_H_MIN(tcm->tcm_parent)) { - cl = cops->find(q, tcm->tcm_parent); - if (cl == 0) + const struct Qdisc_class_ops *cops; + struct net_device *dev; + unsigned long cl = 0; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return skb->len; + + parent = tcm->tcm_parent; + if (!parent) { + q = dev->qdisc; + parent = q->handle; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + } + if (!q) goto out; + cops = q->ops->cl_ops; + if (!cops) + goto out; + if (!cops->tcf_block) + goto out; + if (TC_H_MIN(tcm->tcm_parent)) { + cl = cops->find(q, tcm->tcm_parent); + if (cl == 0) + goto out; + } + block = cops->tcf_block(q, cl, NULL); + if (!block) + goto out; + if (tcf_block_shared(block)) + q = NULL; } - block = cops->tcf_block(q, cl, NULL); - if (!block) - goto out; index_start = cb->args[0]; index = 0; -- cgit v1.2.3-59-g8ed1b From d47a6b0e7c492a4ba4524d557db388e34fd0a47a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 Jan 2018 11:46:52 +0100 Subject: net: sched: introduce ingress/egress block index attributes for qdisc Introduce two new attributes to be used for qdisc creation and dumping. One for ingress block, one for egress block. Introduce a set of ops that qdisc which supports block sharing would implement. Passing block indexes in qdisc change is not supported yet and it is checked and forbidded. In future, these attributes are to be reused for specifying block indexes for classes as well. As of this moment however, it is not supported so a check is in place to forbid it. Suggested-by: Roopa Prabhu Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/sch_generic.h | 7 +++++ include/uapi/linux/rtnetlink.h | 2 ++ net/sched/sch_api.c | 60 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index bf5cc0a1d0f6..cfc19d0ba2ad 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -204,6 +204,13 @@ struct Qdisc_ops { int (*dump)(struct Qdisc *, struct sk_buff *); int (*dump_stats)(struct Qdisc *, struct gnet_dump *); + void (*ingress_block_set)(struct Qdisc *sch, + u32 block_index); + void (*egress_block_set)(struct Qdisc *sch, + u32 block_index); + u32 (*ingress_block_get)(struct Qdisc *sch); + u32 (*egress_block_get)(struct Qdisc *sch); + struct module *owner; }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index da878f2e7c39..9b15005955fa 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -568,6 +568,8 @@ enum { TCA_DUMP_INVISIBLE, TCA_CHAIN, TCA_HW_OFFLOAD, + TCA_INGRESS_BLOCK, + TCA_EGRESS_BLOCK, __TCA_MAX }; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 7dffa9dce28b..d512f49ee83c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -791,6 +791,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; struct qdisc_size_table *stab; + u32 block_index; __u32 qlen; cond_resched(); @@ -807,6 +808,18 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, tcm->tcm_info = refcount_read(&q->refcnt); if (nla_put_string(skb, TCA_KIND, q->ops->id)) goto nla_put_failure; + if (q->ops->ingress_block_get) { + block_index = q->ops->ingress_block_get(q); + if (block_index && + nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index)) + goto nla_put_failure; + } + if (q->ops->egress_block_get) { + block_index = q->ops->egress_block_get(q); + if (block_index && + nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index)) + goto nla_put_failure; + } if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) @@ -994,6 +1007,40 @@ skip: return err; } +static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, + struct netlink_ext_ack *extack) +{ + u32 block_index; + + if (tca[TCA_INGRESS_BLOCK]) { + block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]); + + if (!block_index) { + NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0"); + return -EINVAL; + } + if (!sch->ops->ingress_block_set) { + NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported"); + return -EOPNOTSUPP; + } + sch->ops->ingress_block_set(sch, block_index); + } + if (tca[TCA_EGRESS_BLOCK]) { + block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]); + + if (!block_index) { + NL_SET_ERR_MSG(extack, "Egress block index cannot be 0"); + return -EINVAL; + } + if (!sch->ops->egress_block_set) { + NL_SET_ERR_MSG(extack, "Egress block sharing is not supported"); + return -EOPNOTSUPP; + } + sch->ops->egress_block_set(sch, block_index); + } + return 0; +} + /* lockdep annotation is needed for ingress; egress gets it only for name */ static struct lock_class_key qdisc_tx_lock; static struct lock_class_key qdisc_rx_lock; @@ -1088,6 +1135,10 @@ static struct Qdisc *qdisc_create(struct net_device *dev, netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); } + err = qdisc_block_indexes_set(sch, tca, extack); + if (err) + goto err_out3; + if (ops->init) { err = ops->init(sch, tca[TCA_OPTIONS], extack); if (err != 0) @@ -1169,6 +1220,10 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc"); return -EINVAL; } + if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { + NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); + return -EOPNOTSUPP; + } err = sch->ops->change(sch, tca[TCA_OPTIONS], extack); if (err) return err; @@ -1894,6 +1949,11 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, } } + if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { + NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes"); + return -EOPNOTSUPP; + } + new_cl = cl; err = -EOPNOTSUPP; if (cops->change) -- cgit v1.2.3-59-g8ed1b From aff3d70a07fffc0abb53663e4a4acb059d2f36af Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 16 Jan 2018 16:31:02 +0800 Subject: tun: allow to attach ebpf socket filter This patch allows userspace to attach eBPF filter to tun. This will allow to implement VM dataplane filtering in a more efficient way compared to cBPF filter by allowing either qemu or libvirt to attach eBPF filter to tun. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 38 ++++++++++++++++++++++++++++++++++---- include/uapi/linux/if_tun.h | 1 + 2 files changed, 35 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 76197ede22a9..170a3e89b5af 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -239,6 +239,12 @@ struct tun_struct { struct tun_pcpu_stats __percpu *pcpu_stats; struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; + struct tun_prog __rcu *filter_prog; +}; + +struct veth { + __be16 h_vlan_proto; + __be16 h_vlan_TCI; }; bool tun_is_xdp_buff(void *ptr) @@ -1036,12 +1042,25 @@ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) #endif } +static unsigned int run_ebpf_filter(struct tun_struct *tun, + struct sk_buff *skb, + int len) +{ + struct tun_prog *prog = rcu_dereference(tun->filter_prog); + + if (prog) + len = bpf_prog_run_clear_cb(prog->prog, skb); + + return len; +} + /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; struct tun_file *tfile; + int len = skb->len; rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); @@ -1067,6 +1086,15 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) sk_filter(tfile->socket.sk, skb)) goto drop; + len = run_ebpf_filter(tun, skb, len); + + /* Trim extra bytes since we may insert vlan proto & TCI + * in tun_put_user(). + */ + len -= skb_vlan_tag_present(skb) ? sizeof(struct veth) : 0; + if (len <= 0 || pskb_trim(skb, len)) + goto drop; + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; @@ -2054,10 +2082,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (vlan_hlen) { int ret; - struct { - __be16 h_vlan_proto; - __be16 h_vlan_TCI; - } veth; + struct veth veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); @@ -2225,6 +2250,7 @@ static void tun_free_netdev(struct net_device *dev) tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); + __tun_set_ebpf(tun, &tun->filter_prog, NULL); } static void tun_setup(struct net_device *dev) @@ -3019,6 +3045,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = tun_set_ebpf(tun, &tun->steering_prog, argp); break; + case TUNSETFILTEREBPF: + ret = tun_set_ebpf(tun, &tun->filter_prog, argp); + break; + default: ret = -EINVAL; break; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index fb38c1797131..ee432cd3018c 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -58,6 +58,7 @@ #define TUNSETVNETBE _IOW('T', 222, int) #define TUNGETVNETBE _IOR('T', 223, int) #define TUNSETSTEERINGEBPF _IOR('T', 224, int) +#define TUNSETFILTEREBPF _IOR('T', 225, int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3-59-g8ed1b From cb5f7334d479414adc6afe60105283277e297489 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 17 Jan 2018 12:05:36 +0100 Subject: bpf: add comments to BPF ld/ldx sizes Doc BPF ld/ldx size defines as comments in code, as it makes in faster to lookup in a programming/review setting, than looking up the sizes in Documentation/networking/filter.txt. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 +- include/uapi/linux/bpf_common.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7c2259e8bc54..74dc4dc98681 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -17,7 +17,7 @@ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ +#define BPF_DW 0x18 /* double word (64-bit) */ #define BPF_XADD 0xc0 /* exclusive add */ /* alu/jmp fields */ diff --git a/include/uapi/linux/bpf_common.h b/include/uapi/linux/bpf_common.h index 18be90725ab0..ee97668bdadb 100644 --- a/include/uapi/linux/bpf_common.h +++ b/include/uapi/linux/bpf_common.h @@ -15,9 +15,10 @@ /* ld/ldx fields */ #define BPF_SIZE(code) ((code) & 0x18) -#define BPF_W 0x00 -#define BPF_H 0x08 -#define BPF_B 0x10 +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 -- cgit v1.2.3-59-g8ed1b From 52775b33bb5072fbc07b02c0cf4fe8da1f7ee7cd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:28 -0800 Subject: bpf: offload: report device information about offloaded maps Tell user space about device on which the map was created. Unfortunate reality of user ABI makes sharing this code with program offload difficult but the information is the same. Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 3 +++ kernel/bpf/offload.c | 55 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 +++++ tools/include/uapi/linux/bpf.h | 3 +++ 5 files changed, 69 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 025b1c2f8053..66df387106de 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -586,6 +586,8 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map); + int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 74dc4dc98681..406c19d6016b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -938,6 +938,9 @@ struct bpf_map_info { __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2657976aec2a..c9401075b58c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -413,6 +413,61 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) return ret; } +struct ns_get_path_bpf_map_args { + struct bpf_offloaded_map *offmap; + struct bpf_map_info *info; +}; + +static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_map_args *args = private_data; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (args->offmap->netdev) { + args->info->ifindex = args->offmap->netdev->ifindex; + net = dev_net(args->offmap->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) +{ + struct ns_get_path_bpf_map_args args = { + .offmap = map_to_offmap(map), + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 97a825ffc763..5bdb0cc84ad2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1801,6 +1801,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_info_fill(&info, map); + if (err) + return err; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7c2259e8bc54..af1f49ad8b88 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -938,6 +938,9 @@ struct bpf_map_info { __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit v1.2.3-59-g8ed1b From 3ecbfd65f50e5ff9c538c1bfa3356ef52cc66586 Mon Sep 17 00:00:00 2001 From: Harsha Sharma Date: Wed, 27 Dec 2017 00:59:00 +0530 Subject: netfilter: nf_tables: allocate handle and delete objects via handle This patch allows deletion of objects via unique handle which can be listed via '-a' option. Signed-off-by: Harsha Sharma Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 11 ++- include/uapi/linux/netfilter/nf_tables.h | 10 +++ net/netfilter/nf_tables_api.c | 146 ++++++++++++++++++++++++++++--- 3 files changed, 153 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4aca413367ee..663b015dace5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -374,6 +374,7 @@ void nft_unregister_set(struct nft_set_type *type); * @list: table set list node * @bindings: list of set bindings * @name: name of the set + * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) @@ -396,6 +397,7 @@ struct nft_set { struct list_head list; struct list_head bindings; char *name; + u64 handle; u32 ktype; u32 dtype; u32 objtype; @@ -946,6 +948,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * @objects: stateful objects in the table * @flowtables: flow tables in the table * @hgenerator: handle generator state + * @handle: table handle * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask @@ -959,6 +962,7 @@ struct nft_table { struct list_head objects; struct list_head flowtables; u64 hgenerator; + u64 handle; u32 use; u16 family:6, flags:8, @@ -983,9 +987,9 @@ int nft_verdict_dump(struct sk_buff *skb, int type, * @name: name of this stateful object * @genmask: generation mask * @use: number of references to this stateful object - * @data: object data, layout depends on type + * @handle: unique object handle * @ops: object operations - * @data: pointer to object data + * @data: object data, layout depends on type */ struct nft_object { struct list_head list; @@ -993,6 +997,7 @@ struct nft_object { struct nft_table *table; u32 genmask:2, use:30; + u64 handle; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] @@ -1074,6 +1079,7 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table + * @handle: unique object handle * @data: rhashtable and garbage collector * @ops: array of hooks */ @@ -1086,6 +1092,7 @@ struct nft_flowtable { int ops_len; u32 genmask:2, use:30; + u64 handle; /* runtime data below here */ struct nf_hook_ops *ops ____cacheline_aligned; struct nf_flowtable data; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 53e8dd2a3a03..66dceee0ae30 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -174,6 +174,8 @@ enum nft_table_attributes { NFTA_TABLE_NAME, NFTA_TABLE_FLAGS, NFTA_TABLE_USE, + NFTA_TABLE_HANDLE, + NFTA_TABLE_PAD, __NFTA_TABLE_MAX }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) @@ -317,6 +319,7 @@ enum nft_set_desc_attributes { * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32) * @NFTA_SET_USERDATA: user data (NLA_BINARY) * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*) + * @NFTA_SET_HANDLE: set handle (NLA_U64) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -335,6 +338,7 @@ enum nft_set_attributes { NFTA_SET_USERDATA, NFTA_SET_PAD, NFTA_SET_OBJ_TYPE, + NFTA_SET_HANDLE, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) @@ -1314,6 +1318,7 @@ enum nft_ct_helper_attributes { * @NFTA_OBJ_TYPE: stateful object type (NLA_U32) * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED) * @NFTA_OBJ_USE: number of references to this expression (NLA_U32) + * @NFTA_OBJ_HANDLE: object handle (NLA_U64) */ enum nft_object_attributes { NFTA_OBJ_UNSPEC, @@ -1322,6 +1327,8 @@ enum nft_object_attributes { NFTA_OBJ_TYPE, NFTA_OBJ_DATA, NFTA_OBJ_USE, + NFTA_OBJ_HANDLE, + NFTA_OBJ_PAD, __NFTA_OBJ_MAX }; #define NFTA_OBJ_MAX (__NFTA_OBJ_MAX - 1) @@ -1333,6 +1340,7 @@ enum nft_object_attributes { * @NFTA_FLOWTABLE_NAME: name of this flow table (NLA_STRING) * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32) * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32) + * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64) */ enum nft_flowtable_attributes { NFTA_FLOWTABLE_UNSPEC, @@ -1340,6 +1348,8 @@ enum nft_flowtable_attributes { NFTA_FLOWTABLE_NAME, NFTA_FLOWTABLE_HOOK, NFTA_FLOWTABLE_USE, + NFTA_FLOWTABLE_HANDLE, + NFTA_FLOWTABLE_PAD, __NFTA_FLOWTABLE_MAX }; #define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b541e5094dce..1addc401ff7d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -26,6 +26,7 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); +static u64 table_handle; static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, @@ -332,6 +333,20 @@ static struct nft_table *nft_table_lookup(const struct net *net, return NULL; } +static struct nft_table *nft_table_lookup_byhandle(const struct net *net, + const struct nlattr *nla, + u8 genmask) +{ + struct nft_table *table; + + list_for_each_entry(table, &net->nft.tables, list) { + if (be64_to_cpu(nla_get_be64(nla)) == table->handle && + nft_active_genmask(table, genmask)) + return table; + } + return NULL; +} + static struct nft_table *nf_tables_table_lookup(const struct net *net, const struct nlattr *nla, u8 family, u8 genmask) @@ -348,6 +363,22 @@ static struct nft_table *nf_tables_table_lookup(const struct net *net, return ERR_PTR(-ENOENT); } +static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net, + const struct nlattr *nla, + u8 genmask) +{ + struct nft_table *table; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + table = nft_table_lookup_byhandle(net, nla, genmask); + if (table != NULL) + return table; + + return ERR_PTR(-ENOENT); +} + static inline u64 nf_tables_alloc_handle(struct nft_table *table) { return ++table->hgenerator; @@ -394,6 +425,7 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, + [NFTA_TABLE_HANDLE] = { .type = NLA_U64 }, }; static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, @@ -415,7 +447,9 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || - nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use))) + nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) || + nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle), + NFTA_TABLE_PAD)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -674,6 +708,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, INIT_LIST_HEAD(&table->flowtables); table->family = family; table->flags = flags; + table->handle = ++table_handle; nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); @@ -791,11 +826,18 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, struct nft_ctx ctx; nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla); - if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL) + if (family == AF_UNSPEC || + (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) return nft_flush(&ctx, family); - table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family, - genmask); + if (nla[NFTA_TABLE_HANDLE]) + table = nf_tables_table_lookup_byhandle(net, + nla[NFTA_TABLE_HANDLE], + genmask); + else + table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], + family, genmask); + if (IS_ERR(table)) return PTR_ERR(table); @@ -1539,6 +1581,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, struct nft_rule *rule; int family = nfmsg->nfgen_family; struct nft_ctx ctx; + u64 handle; u32 use; int err; @@ -1547,7 +1590,12 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + if (nla[NFTA_CHAIN_HANDLE]) { + handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); + chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); + } else { + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + } if (IS_ERR(chain)) return PTR_ERR(chain); @@ -2503,6 +2551,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, + [NFTA_SET_HANDLE] = { .type = NLA_U64 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -2546,6 +2595,22 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-ENOENT); } +static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) +{ + struct nft_set *set; + + if (nla == NULL) + return ERR_PTR(-EINVAL); + + list_for_each_entry(set, &table->sets, list) { + if (be64_to_cpu(nla_get_be64(nla)) == set->handle && + nft_active_genmask(set, genmask)) + return set; + } + return ERR_PTR(-ENOENT); +} + static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, const struct nlattr *nla, u8 genmask) @@ -2661,6 +2726,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) goto nla_put_failure; + if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle), + NFTA_SET_PAD)) + goto nla_put_failure; if (set->flags != 0) if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) goto nla_put_failure; @@ -3069,6 +3137,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->udata = udata; set->timeout = timeout; set->gc_int = gc_int; + set->handle = nf_tables_alloc_handle(table); err = ops->init(set, &desc, nla); if (err < 0) @@ -3126,7 +3195,10 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + if (nla[NFTA_SET_HANDLE]) + set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask); + else + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -4256,6 +4328,21 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); +struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) +{ + struct nft_object *obj; + + list_for_each_entry(obj, &table->objects, list) { + if (be64_to_cpu(nla_get_be64(nla)) == obj->handle && + objtype == obj->ops->type->type && + nft_active_genmask(obj, genmask)) + return obj; + } + return ERR_PTR(-ENOENT); +} + static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { [NFTA_OBJ_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, @@ -4263,6 +4350,7 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, + [NFTA_OBJ_HANDLE] = { .type = NLA_U64}, }; static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, @@ -4410,6 +4498,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, goto err1; } obj->table = table; + obj->handle = nf_tables_alloc_handle(table); + obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); if (!obj->name) { err = -ENOMEM; @@ -4456,7 +4546,9 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nla_put_string(skb, NFTA_OBJ_NAME, obj->name) || nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || - nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset)) + nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) || + nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle), + NFTA_OBJ_PAD)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -4654,7 +4746,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, u32 objtype; if (!nla[NFTA_OBJ_TYPE] || - !nla[NFTA_OBJ_NAME]) + (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, @@ -4663,7 +4755,12 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, return PTR_ERR(table); objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (nla[NFTA_OBJ_HANDLE]) + obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE], + objtype, genmask); + else + obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], + objtype, genmask); if (IS_ERR(obj)) return PTR_ERR(obj); if (obj->use > 0) @@ -4735,6 +4832,7 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, [NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED }, + [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, }; struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, @@ -4752,6 +4850,20 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); +struct nft_flowtable * +nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) +{ + struct nft_flowtable *flowtable; + + list_for_each_entry(flowtable, &table->flowtables, list) { + if (be64_to_cpu(nla_get_be64(nla)) == flowtable->handle && + nft_active_genmask(flowtable, genmask)) + return flowtable; + } + return ERR_PTR(-ENOENT); +} + #define NFT_FLOWTABLE_DEVICE_MAX 8 static int nf_tables_parse_devices(const struct nft_ctx *ctx, @@ -4960,6 +5072,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return -ENOMEM; flowtable->table = table; + flowtable->handle = nf_tables_alloc_handle(table); + flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL); if (!flowtable->name) { err = -ENOMEM; @@ -5034,8 +5148,14 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, if (IS_ERR(table)) return PTR_ERR(table); - flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], - genmask); + if (nla[NFTA_FLOWTABLE_HANDLE]) + flowtable = nf_tables_flowtable_lookup_byhandle(table, + nla[NFTA_FLOWTABLE_HANDLE], + genmask); + else + flowtable = nf_tables_flowtable_lookup(table, + nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (flowtable->use > 0) @@ -5068,7 +5188,9 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) || nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || - nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use))) + nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || + nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle), + NFTA_FLOWTABLE_PAD)) goto nla_put_failure; nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK); -- cgit v1.2.3-59-g8ed1b From 4db5a802e565f0a60e08bd39a055f0095689802b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 16 Jan 2018 23:01:57 +0100 Subject: l2tp: mark L2TP_ATTR_L2SPEC_LEN as not used Reviewed-by: Guillaume Nault Tested-by: Guillaume Nault Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 71e62795104d..7d570c7bd117 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -97,7 +97,7 @@ enum { L2TP_ATTR_OFFSET, /* u16 (not used) */ L2TP_ATTR_DATA_SEQ, /* u16 */ L2TP_ATTR_L2SPEC_TYPE, /* u8, enum l2tp_l2spec_type */ - L2TP_ATTR_L2SPEC_LEN, /* u8, enum l2tp_l2spec_type */ + L2TP_ATTR_L2SPEC_LEN, /* u8 (not used) */ L2TP_ATTR_PROTO_VERSION, /* u8 */ L2TP_ATTR_IFNAME, /* string */ L2TP_ATTR_CONN_ID, /* u32 */ -- cgit v1.2.3-59-g8ed1b From e8660ded7f5a9889395d33ce3d5e8c729a462bf5 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 18 Jan 2018 17:48:18 +0100 Subject: macsec: restore uAPI after addition of GCM-AES-256 Commit ccfdec908922 ("macsec: Add support for GCM-AES-256 cipher suite") changed a few values in the uapi headers for MACsec. Because of existing userspace implementations, we need to preserve the value of MACSEC_DEFAULT_CIPHER_ID. Not doing that resulted in wpa_supplicant segfaults when a secure channel was created using the default cipher. Thus, swap MACSEC_DEFAULT_CIPHER_{ID,ALT} back to their original values. Changing the maximum length of the MACSEC_SA_ATTR_KEY attribute is unnecessary, as the previous value (MACSEC_MAX_KEY_LEN, which was 128B) is large enough to carry 32-bytes keys. This patch reverts MACSEC_MAX_KEY_LEN to 128B and restores the old length check on MACSEC_SA_ATTR_KEY. Fixes: ccfdec908922 ("macsec: Add support for GCM-AES-256 cipher suite") Signed-off-by: Davide Caratti Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- drivers/net/macsec.c | 12 +++++------- include/uapi/linux/if_macsec.h | 6 +++--- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index f522715c6595..7de88b33d5b9 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -396,8 +396,6 @@ static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) #define MACSEC_GCM_AES_128_SAK_LEN 16 #define MACSEC_GCM_AES_256_SAK_LEN 32 -#define MAX_SAK_LEN MACSEC_GCM_AES_256_SAK_LEN - #define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN #define DEFAULT_SEND_SCI true #define DEFAULT_ENCRYPT false @@ -1605,7 +1603,7 @@ static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = { [MACSEC_SA_ATTR_KEYID] = { .type = NLA_BINARY, .len = MACSEC_KEYID_LEN, }, [MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY, - .len = MAX_SAK_LEN, }, + .len = MACSEC_MAX_KEY_LEN, }, }; static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa) @@ -2374,7 +2372,7 @@ static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb) switch (secy->key_len) { case MACSEC_GCM_AES_128_SAK_LEN: - csid = MACSEC_CIPHER_ID_GCM_AES_128; + csid = MACSEC_DEFAULT_CIPHER_ID; break; case MACSEC_GCM_AES_256_SAK_LEN: csid = MACSEC_CIPHER_ID_GCM_AES_256; @@ -3076,7 +3074,7 @@ static int macsec_changelink_common(struct net_device *dev, if (data[IFLA_MACSEC_CIPHER_SUITE]) { switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) { case MACSEC_CIPHER_ID_GCM_AES_128: - case MACSEC_DEFAULT_CIPHER_ALT: + case MACSEC_DEFAULT_CIPHER_ID: secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; break; case MACSEC_CIPHER_ID_GCM_AES_256: @@ -3355,7 +3353,7 @@ static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[], switch (csid) { case MACSEC_CIPHER_ID_GCM_AES_128: case MACSEC_CIPHER_ID_GCM_AES_256: - case MACSEC_DEFAULT_CIPHER_ALT: + case MACSEC_DEFAULT_CIPHER_ID: if (icv_len < MACSEC_MIN_ICV_LEN || icv_len > MACSEC_STD_ICV_LEN) return -EINVAL; @@ -3428,7 +3426,7 @@ static int macsec_fill_info(struct sk_buff *skb, switch (secy->key_len) { case MACSEC_GCM_AES_128_SAK_LEN: - csid = MACSEC_CIPHER_ID_GCM_AES_128; + csid = MACSEC_DEFAULT_CIPHER_ID; break; case MACSEC_GCM_AES_256_SAK_LEN: csid = MACSEC_CIPHER_ID_GCM_AES_256; diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h index 2e522835a4af..98e4d5d7c45c 100644 --- a/include/uapi/linux/if_macsec.h +++ b/include/uapi/linux/if_macsec.h @@ -18,7 +18,7 @@ #define MACSEC_GENL_NAME "macsec" #define MACSEC_GENL_VERSION 1 -#define MACSEC_MAX_KEY_LEN 256 +#define MACSEC_MAX_KEY_LEN 128 #define MACSEC_KEYID_LEN 16 @@ -26,9 +26,9 @@ #define MACSEC_CIPHER_ID_GCM_AES_128 0x0080C20001000001ULL #define MACSEC_CIPHER_ID_GCM_AES_256 0x0080C20001000002ULL -#define MACSEC_DEFAULT_CIPHER_ID MACSEC_CIPHER_ID_GCM_AES_128 /* deprecated cipher ID for GCM-AES-128 */ -#define MACSEC_DEFAULT_CIPHER_ALT 0x0080020001000001ULL +#define MACSEC_DEFAULT_CIPHER_ID 0x0080020001000001ULL +#define MACSEC_DEFAULT_CIPHER_ALT MACSEC_CIPHER_ID_GCM_AES_128 #define MACSEC_MIN_ICV_LEN 8 #define MACSEC_MAX_ICV_LEN 32 -- cgit v1.2.3-59-g8ed1b From b2d3bcfa26a7a8de41f358a6cae8b848673b3c6e Mon Sep 17 00:00:00 2001 From: David Decotigny Date: Thu, 18 Jan 2018 09:59:13 -0800 Subject: net: core: Expose number of link up/down transitions Expose the number of times the link has been going UP or DOWN, and update the "carrier_changes" counter to be the sum of these two events. While at it, also update the sysfs-class-net documentation to cover: carrier_changes (3.15), carrier_up_count (4.16) and carrier_down_count (4.16) Signed-off-by: David Decotigny [Florian: * rebase * add documentation * merge carrier_changes with up/down counters] Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net | 24 ++++++++++++++++++++++++ include/linux/netdevice.h | 6 ++++-- include/uapi/linux/if_link.h | 2 ++ net/core/net-sysfs.c | 25 ++++++++++++++++++++++++- net/core/rtnetlink.c | 13 +++++++++++-- net/sched/sch_generic.c | 4 ++-- 6 files changed, 67 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 6856da99b6f7..2f1788111cd9 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -259,3 +259,27 @@ Contact: netdev@vger.kernel.org Description: Symbolic link to the PHY device this network device is attached to. + +What: /sys/class/net//carrier_changes +Date: Mar 2014 +KernelVersion: 3.15 +Contact: netdev@vger.kernel.org +Description: + 32-bit unsigned integer counting the number of times the link has + seen a change from UP to DOWN and vice versa + +What: /sys/class/net//carrier_up_count +Date: Jan 2018 +KernelVersion: 4.16 +Contact: netdev@vger.kernel.org +Description: + 32-bit unsigned integer counting the number of times the link has + been up + +What: /sys/class/net//carrier_down_count +Date: Jan 2018 +KernelVersion: 4.16 +Contact: netdev@vger.kernel.org +Description: + 32-bit unsigned integer counting the number of times the link has + been down diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ed0799a12bf2..837e9cb7e358 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1680,8 +1680,6 @@ struct net_device { unsigned long base_addr; int irq; - atomic_t carrier_changes; - /* * Some hardware also needs these fields (state,dev_list, * napi_list,unreg_list,close_list) but they are not @@ -1719,6 +1717,10 @@ struct net_device { atomic_long_t tx_dropped; atomic_long_t rx_nohandler; + /* Stats to monitor link on/off, flapping */ + atomic_t carrier_up_count; + atomic_t carrier_down_count; + #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def *wireless_handlers; struct iw_public_data *wireless_data; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index f8f04fed6186..8616131e2c61 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -161,6 +161,8 @@ enum { IFLA_EVENT, IFLA_NEW_NETNSID, IFLA_IF_NETNSID, + IFLA_CARRIER_UP_COUNT, + IFLA_CARRIER_DOWN_COUNT, __IFLA_MAX }; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7bf8b85ade16..c4a28f4667b6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -295,10 +295,31 @@ static ssize_t carrier_changes_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); return sprintf(buf, fmt_dec, - atomic_read(&netdev->carrier_changes)); + atomic_read(&netdev->carrier_up_count) + + atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_changes); +static ssize_t carrier_up_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); +} +static DEVICE_ATTR_RO(carrier_up_count); + +static ssize_t carrier_down_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); +} +static DEVICE_ATTR_RO(carrier_down_count); + /* read-write attributes */ static int change_mtu(struct net_device *dev, unsigned long new_mtu) @@ -547,6 +568,8 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, &dev_attr_proto_down.attr, + &dev_attr_carrier_up_count.attr, + &dev_attr_carrier_down_count.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 16d644a4f974..97874daa1336 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -990,6 +990,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(1) /* IFLA_PROTO_DOWN */ + nla_total_size(4) /* IFLA_IF_NETNSID */ + + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + 0; } @@ -1551,8 +1553,13 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, - atomic_read(&dev->carrier_changes)) || - nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + atomic_read(&dev->carrier_up_count) + + atomic_read(&dev->carrier_down_count)) || + nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) || + nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, + atomic_read(&dev->carrier_up_count)) || + nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, + atomic_read(&dev->carrier_down_count))) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { @@ -1656,6 +1663,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, [IFLA_IF_NETNSID] = { .type = NLA_S32 }, + [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, + [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ef8b4ecde2ac..1816bde47256 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -510,7 +510,7 @@ void netif_carrier_on(struct net_device *dev) if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; - atomic_inc(&dev->carrier_changes); + atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); if (netif_running(dev)) __netdev_watchdog_up(dev); @@ -529,7 +529,7 @@ void netif_carrier_off(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; - atomic_inc(&dev->carrier_changes); + atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } } -- cgit v1.2.3-59-g8ed1b From de525be2ca2734865d29c4b67ddd29913b214906 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:09 -0800 Subject: bpf: Support passing args to sock_ops bpf function Adds support for passing up to 4 arguments to sock_ops bpf functions. It reusues the reply union, so the bpf_sock_ops structures are not increased in size. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 1 + include/net/tcp.h | 40 +++++++++++++++++++++++++++++++++++----- include/uapi/linux/bpf.h | 5 +++-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_nv.c | 2 +- net/ipv4/tcp_output.c | 2 +- 6 files changed, 42 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index daa5a676335f..20384c4bed25 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1003,6 +1003,7 @@ struct bpf_sock_ops_kern { struct sock *sk; u32 op; union { + u32 args[4]; u32 reply; u32 replylong[4]; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6092eaff61cf..093e967a2960 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2006,7 +2006,7 @@ void tcp_cleanup_ulp(struct sock *sk); * program loaded). */ #ifdef CONFIG_BPF -static inline int tcp_call_bpf(struct sock *sk, int op) +static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { struct bpf_sock_ops_kern sock_ops; int ret; @@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op) sock_ops.sk = sk; sock_ops.op = op; + if (nargs > 0) + memcpy(sock_ops.args, args, nargs * sizeof(*args)); ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); if (ret == 0) @@ -2027,18 +2029,46 @@ static inline int tcp_call_bpf(struct sock *sk, int op) ret = -1; return ret; } + +static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) +{ + u32 args[2] = {arg1, arg2}; + + return tcp_call_bpf(sk, op, 2, args); +} + +static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, + u32 arg3) +{ + u32 args[3] = {arg1, arg2, arg3}; + + return tcp_call_bpf(sk, op, 3, args); +} + #else -static inline int tcp_call_bpf(struct sock *sk, int op) +static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { return -EPERM; } + +static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) +{ + return -EPERM; +} + +static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, + u32 arg3) +{ + return -EPERM; +} + #endif static inline u32 tcp_timeout_init(struct sock *sk) { int timeout; - timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT); + timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; @@ -2049,7 +2079,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk) { int rwnd; - rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT); + rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); if (rwnd < 0) rwnd = 0; @@ -2058,7 +2088,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk) static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { - return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); + return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } #if IS_ENABLED(CONFIG_SMC) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 406c19d6016b..8d5874c2c4ff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -952,8 +952,9 @@ struct bpf_map_info { struct bpf_sock_ops { __u32 op; union { - __u32 reply; - __u32 replylong[4]; + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ }; __u32 family; __u32 remote_ip4; /* Stored in network byte order */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d7cf861bf699..88b62441e7e9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -463,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); - tcp_call_bpf(sk, bpf_op); + tcp_call_bpf(sk, bpf_op, 0, NULL); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 0b5a05bd82e3..ddbce73edae8 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk) * within a datacenter, where we have reasonable estimates of * RTTs */ - base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL); if (base_rtt > 0) { ca->nv_base_rtt = base_rtt; ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 95461f02ac9a..d12f7f71c1c4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3469,7 +3469,7 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; - tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ -- cgit v1.2.3-59-g8ed1b From b13d880721729384757f235166068c315326f4a1 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:10 -0800 Subject: bpf: Adds field bpf_sock_ops_cb_flags to tcp_sock Adds field bpf_sock_ops_cb_flags to tcp_sock and bpf_sock_ops. Its primary use is to determine if there should be calls to sock_ops bpf program at various points in the TCP code. The field is initialized to zero, disabling the calls. A sock_ops BPF program can set it, per connection and as necessary, when the connection is established. It also adds support for reading and writting the field within a sock_ops BPF program. Reading is done by accessing the field directly. However, writing is done through the helper function bpf_sock_ops_cb_flags_set, in order to return an error if a BPF program is trying to set a callback that is not supported in the current kernel (i.e. running an older kernel). The helper function returns 0 if it was able to set all of the bits set in the argument, a positive number containing the bits that could not be set, or -EINVAL if the socket is not a full TCP socket. Examples of where one could call the bpf program: 1) When RTO fires 2) When a packet is retransmitted 3) When the connection terminates 4) When a packet is sent 5) When a packet is received Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/tcp.h | 11 +++++++++++ include/uapi/linux/bpf.h | 17 ++++++++++++++++- net/core/filter.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4f93f0953c41..8f4c54986f97 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -335,6 +335,17 @@ struct tcp_sock { int linger2; + +/* Sock_ops bpf program related variables */ +#ifdef CONFIG_BPF + u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs + * values defined in uapi/linux/tcp.h + */ +#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) +#else +#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 +#endif + /* Receiver side RTT estimation */ struct { u32 rtt_us; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8d5874c2c4ff..aa128407c44d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -642,6 +642,14 @@ union bpf_attr { * @optlen: length of optval in bytes * Return: 0 or negative error * + * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) + * Set callback flags for sock_ops + * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct + * @flags: flags value + * Return: 0 for no error + * -EINVAL if there is no full tcp socket + * bits in flags that are not supported by current kernel + * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) * Grow or shrink room in sk_buff. * @skb: pointer to skb @@ -748,7 +756,8 @@ union bpf_attr { FN(perf_event_read_value), \ FN(perf_prog_read_value), \ FN(getsockopt), \ - FN(override_return), + FN(override_return), \ + FN(sock_ops_cb_flags_set), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -969,8 +978,14 @@ struct bpf_sock_ops { */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ }; +/* Definitions for bpf_sock_ops_cb_flags */ +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0 /* Mask of all currently + * supported cb flags + */ + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ diff --git a/net/core/filter.c b/net/core/filter.c index c356ec02b1a5..6936d19ac736 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3328,6 +3328,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!sk_fullsock(sk)) + return -EINVAL; + +#ifdef CONFIG_INET + if (val) + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +#else + return -EINVAL; +#endif +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3510,6 +3537,8 @@ static const struct bpf_func_proto * return &bpf_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: @@ -4546,6 +4575,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; } return insn - insn_buf; } -- cgit v1.2.3-59-g8ed1b From f89013f66d0f1a0dad44c513318efb706399a36b Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:11 -0800 Subject: bpf: Add sock_ops RTO callback Adds an optional call to sock_ops BPF program based on whether the BPF_SOCK_OPS_RTO_CB_FLAG is set in bpf_sock_ops_flags. The BPF program is passed 2 arguments: icsk_retransmits and whether the RTO has expired. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 +++++++- net/ipv4/tcp_timer.c | 7 +++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa128407c44d..c8cecf9cf5bd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -982,7 +982,8 @@ struct bpf_sock_ops { }; /* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0 /* Mask of all currently +#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x1 /* Mask of all currently * supported cb flags */ @@ -1019,6 +1020,11 @@ enum { * a congestion threshold. RTTs above * this indicate congestion */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6db3124cdbda..257abdde23b0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -213,11 +213,18 @@ static int tcp_write_timeout(struct sock *sk) icsk->icsk_user_timeout); } tcp_fastopen_active_detect_blackhole(sk, expired); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, + icsk->icsk_retransmits, + icsk->icsk_rto, (int)expired); + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; } + return 0; } -- cgit v1.2.3-59-g8ed1b From 44f0e43037d3a17b043843ba67610ac7c7e37db6 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:12 -0800 Subject: bpf: Add support for reading sk_state and more Add support for reading many more tcp_sock fields state, same as sk->sk_state rtt_min same as sk->rtt_min.s[0].v (current rtt_min) snd_ssthresh rcv_nxt snd_nxt snd_una mss_cache ecn_flags rate_delivered rate_interval_us packets_out retrans_out total_retrans segs_in data_segs_in segs_out data_segs_out lost_out sacked_out sk_txhash bytes_received (__u64) bytes_acked (__u64) Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 22 ++++++++ net/core/filter.c | 143 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 154 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8cecf9cf5bd..46520eae37fa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -979,6 +979,28 @@ struct bpf_sock_ops { __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; }; /* Definitions for bpf_sock_ops_cb_flags */ diff --git a/net/core/filter.c b/net/core/filter.c index 6936d19ac736..a858ebc4ece4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3855,33 +3855,43 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool __is_valid_sock_ops_access(int off, int size) +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; - - return true; -} -static bool sock_ops_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): + if (size != size_default) + return false; break; default: return false; } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + default: + if (size != size_default) + return false; + break; + } } - return __is_valid_sock_ops_access(off, size); + return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -4498,6 +4508,32 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, is_fullsock)); break; + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + FIELD_SIZEOF(struct minmax_sample, t)); + break; + /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ @@ -4580,6 +4616,91 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_FIELD(total_retrans, total_retrans, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_FIELD(bytes_received, bytes_received, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); + break; } return insn - insn_buf; } -- cgit v1.2.3-59-g8ed1b From a31ad29e6a30cb0b9084a9425b819cdcd97273ce Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:14 -0800 Subject: bpf: Add BPF_SOCK_OPS_RETRANS_CB Adds support for calling sock_ops BPF program when there is a retransmission. Three arguments are used; one for the sequence number, another for the number of segments retransmitted, and the last one for the return value of tcp_transmit_skb (0 => success). Does not include syn-ack retransmissions. New op: BPF_SOCK_OPS_RETRANS_CB. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 ++++++++- net/ipv4/tcp_output.c | 4 ++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 46520eae37fa..31c93a0bdbc2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1005,7 +1005,8 @@ struct bpf_sock_ops { /* Definitions for bpf_sock_ops_cb_flags */ #define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x1 /* Mask of all currently +#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x3 /* Mask of all currently * supported cb flags */ @@ -1047,6 +1048,12 @@ enum { * Arg2: value of icsk_rto * Arg3: whether RTO has expired */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. + * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d12f7f71c1c4..e9f985e42405 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2905,6 +2905,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, + TCP_SKB_CB(skb)->seq, segs, err); + if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); -- cgit v1.2.3-59-g8ed1b From d44874910a26f3a8f81edf873a2473363f07f660 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:15 -0800 Subject: bpf: Add BPF_SOCK_OPS_STATE_CB Adds support for calling sock_ops BPF program when there is a TCP state change. Two arguments are used; one for the old state and another for the new state. There is a new enum in include/uapi/linux/bpf.h that exports the TCP states that prepends BPF_ to the current TCP state names. If it is ever necessary to change the internal TCP state values (other than adding more to the end), then it will become necessary to convert from the internal TCP state value to the BPF value before calling the BPF sock_ops function. There are a set of compile checks added in tcp.c to detect if the internal and BPF values differ so we can make the necessary fixes. New op: BPF_SOCK_OPS_STATE_CB. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 29 ++++++++++++++++++++++++++++- net/ipv4/tcp.c | 24 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 31c93a0bdbc2..db6bdc375126 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1006,7 +1006,8 @@ struct bpf_sock_ops { /* Definitions for bpf_sock_ops_cb_flags */ #define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) #define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x3 /* Mask of all currently +#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently * supported cb flags */ @@ -1054,6 +1055,32 @@ enum { * Arg3: return value of * tcp_transmit_skb (0 => success) */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. + * Arg1: old_state + * Arg2: new_state + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 88b62441e7e9..f013ddc191e0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2042,6 +2042,30 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + /* We defined a new enum for TCP states that are exported in BPF + * so as not force the internal TCP states to be frozen. The + * following checks will detect if an internal state value ever + * differs from the BPF value. If this ever happens, then we will + * need to remap the internal value to the BPF value before calling + * tcp_call_bpf_2arg. + */ + BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); + BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); + BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); + BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); + BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); + BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); + BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); + BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); + BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); + BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) -- cgit v1.2.3-59-g8ed1b From d350a823020e71e20a10d1dfa44f1d1d653b0334 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 25 Jan 2018 13:20:10 -0800 Subject: net: erspan: create erspan metadata uapi header The patch adds a new uapi header file, erspan.h, and moves the 'struct erspan_metadata' from internal erspan.h to it. Signed-off-by: William Tu Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/erspan.h | 32 ++-------------------------- include/uapi/linux/erspan.h | 52 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 30 deletions(-) create mode 100644 include/uapi/linux/erspan.h (limited to 'include/uapi/linux') diff --git a/include/net/erspan.h b/include/net/erspan.h index 6d30fe898286..5daa4866412b 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -46,6 +46,8 @@ * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB */ +#include + #define ERSPAN_VERSION 0x1 /* ERSPAN type II */ #define VER_MASK 0xf000 #define VLAN_MASK 0x0fff @@ -68,29 +70,6 @@ #define HWID_OFFSET 4 #define DIR_OFFSET 3 -/* ERSPAN version 2 metadata header */ -struct erspan_md2 { - __be32 timestamp; - __be16 sgt; /* security group tag */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 hwid_upper:2, - ft:5, - p:1; - __u8 o:1, - gra:2, - dir:1, - hwid:4; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u8 p:1, - ft:5, - hwid_upper:2; - __u8 hwid:4, - dir:1, - gra:2, - o:1; -#endif -}; - enum erspan_encap_type { ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ @@ -100,13 +79,6 @@ enum erspan_encap_type { #define ERSPAN_V1_MDSIZE 4 #define ERSPAN_V2_MDSIZE 8 -struct erspan_metadata { - union { - __be32 index; /* Version 1 (type II)*/ - struct erspan_md2 md2; /* Version 2 (type III) */ - } u; - int version; -}; struct erspan_base_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) diff --git a/include/uapi/linux/erspan.h b/include/uapi/linux/erspan.h new file mode 100644 index 000000000000..841573019ae1 --- /dev/null +++ b/include/uapi/linux/erspan.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * ERSPAN Tunnel Metadata + * + * Copyright (c) 2018 VMware + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * Userspace API for metadata mode ERSPAN tunnel + */ +#ifndef _UAPI_ERSPAN_H +#define _UAPI_ERSPAN_H + +#include /* For __beXX in userspace */ +#include + +/* ERSPAN version 2 metadata header */ +struct erspan_md2 { + __be32 timestamp; + __be16 sgt; /* security group tag */ +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 hwid_upper:2, + ft:5, + p:1; + __u8 o:1, + gra:2, + dir:1, + hwid:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 p:1, + ft:5, + hwid_upper:2; + __u8 hwid:4, + dir:1, + gra:2, + o:1; +#else +#error "Please fix " +#endif +}; + +struct erspan_metadata { + int version; + union { + __be32 index; /* Version 1 (type II)*/ + struct erspan_md2 md2; /* Version 2 (type III) */ + } u; +}; + +#endif /* _UAPI_ERSPAN_H */ -- cgit v1.2.3-59-g8ed1b From fc1372f89ffe1f58b589643b75f679e452350703 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 25 Jan 2018 13:20:11 -0800 Subject: openvswitch: add erspan version I and II support The patch adds support for openvswitch to configure erspan v1 and v2. The OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS attr is added to uapi as a binary blob to support all ERSPAN v1 and v2's fields. Note that Previous commit "openvswitch: Add erspan tunnel support." was reverted since it does not design properly. Signed-off-by: William Tu Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/flow_netlink.c | 52 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index dcfab5e3b55c..713e56ce681f 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -363,6 +363,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ OVS_TUNNEL_KEY_ATTR_PAD, + OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* struct erspan_metadata */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index eb55f1b3d047..7322aa1e382e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "flow_netlink.h" @@ -329,7 +330,8 @@ size_t ovs_tun_key_attr_size(void) + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ - /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with + /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS and + * OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS is mutually exclusive with * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. */ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ @@ -400,6 +402,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] .next = ovs_vxlan_ext_key_lens }, [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = OVS_ATTR_VARIABLE }, }; static const struct ovs_len_tbl @@ -631,6 +634,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } +static int erspan_tun_opt_from_nlattr(const struct nlattr *a, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + + BUILD_BUG_ON(sizeof(struct erspan_metadata) > + sizeof(match->key->tun_opts)); + + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR(log, "ERSPAN option length err (len %d, max %zu).", + nla_len(a), sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (!is_mask) + SW_FLOW_KEY_PUT(match, tun_opts_len, + sizeof(struct erspan_metadata), false); + else + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + + opt_key_offset = TUN_METADATA_OFFSET(nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), + nla_len(a), is_mask); + return 0; +} + static int ip_tun_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) @@ -738,6 +768,20 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_PAD: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = erspan_tun_opt_from_nlattr(a, match, is_mask, + log); + if (err) + return err; + + tun_flags |= TUNNEL_ERSPAN_OPT; + opts_type = type; + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -862,6 +906,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, else if (output->tun_flags & TUNNEL_VXLAN_OPT && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; + else if (output->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; } return 0; @@ -2486,6 +2534,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + break; } } -- cgit v1.2.3-59-g8ed1b From 38e01b30563a5b5ade7b54e5d739d16a2b02fe82 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 25 Jan 2018 15:01:39 +0100 Subject: dev: advertise the new ifindex when the netns iface changes The goal is to let the user follow an interface that moves to another netns. CC: Jiri Benc CC: Christian Brauner Signed-off-by: Nicolas Dichtel Reviewed-by: Jiri Benc Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 5 +++-- include/uapi/linux/if_link.h | 1 + net/core/dev.c | 19 ++++++++++++------- net/core/rtnetlink.c | 31 ++++++++++++++++++++----------- 4 files changed, 36 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 62d508b31f56..0514cc36ac34 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -19,10 +19,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, - gfp_t flags, int *new_nsid); + gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, - gfp_t flags, int *new_nsid); + gfp_t flags, int *new_nsid, + int new_ifindex); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8616131e2c61..6d9447700e18 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -163,6 +163,7 @@ enum { IFLA_IF_NETNSID, IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, + IFLA_NEW_IFINDEX, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 59987eb6511a..858501b12869 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7360,7 +7360,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL); + GFP_KERNEL, NULL, 0); /* * Flush the unicast and multicast chains @@ -8473,7 +8473,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int err, new_nsid; + int err, new_nsid, new_ifindex; ASSERT_RTNL(); @@ -8529,8 +8529,16 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); + new_nsid = peernet2id_alloc(dev_net(dev), net); - rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); + /* If there is an ifindex conflict assign a new one */ + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); + else + new_ifindex = dev->ifindex; + + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, + new_ifindex); /* * Flush the unicast and multicast chains @@ -8544,10 +8552,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Actually switch the network namespace */ dev_net_set(dev, net); - - /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) - dev->ifindex = dev_new_index(net); + dev->ifindex = new_ifindex; /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f111557958bb..e04af7b7f448 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -988,6 +988,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + nla_total_size(1) /* IFLA_PROTO_DOWN */ + nla_total_size(4) /* IFLA_IF_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ @@ -1511,7 +1512,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event, int *new_nsid, int tgt_netnsid) + u32 event, int *new_nsid, int new_ifindex, + int tgt_netnsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1608,6 +1610,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; + if (new_ifindex && + nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) + goto nla_put_failure; + rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) @@ -1853,7 +1859,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0, NULL, + ext_filter_mask, 0, NULL, 0, netnsid); if (err < 0) { @@ -3088,7 +3094,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, - 0, NULL, netnsid); + 0, NULL, 0, netnsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -3184,7 +3190,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, - u32 event, gfp_t flags, int *new_nsid) + u32 event, gfp_t flags, int *new_nsid, + int new_ifindex) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -3197,7 +3204,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, 0, 0, change, 0, 0, event, - new_nsid, -1); + new_nsid, new_ifindex, -1); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -3220,14 +3227,15 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags, int *new_nsid) + gfp_t flags, int *new_nsid, int new_ifindex) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, + new_ifindex); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } @@ -3235,14 +3243,15 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { - rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + NULL, 0); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, - gfp_t flags, int *new_nsid) + gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, - new_nsid); + new_nsid, new_ifindex); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4642,7 +4651,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL, NULL); + GFP_KERNEL, NULL, 0); break; default: break; -- cgit v1.2.3-59-g8ed1b