From 31fa97c5defca3895dc6c823096d7ba59df76125 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Wed, 11 Jun 2014 17:18:21 +0300 Subject: cfg80211: pass TDLS initiator in tdls_mgmt operations The TDLS initiator is set once during link setup. If determines the address ordering in the link identifier IE. Fix dependent drivers - mwifiex and mac80211. Signed-off-by: Arik Nemtsov Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index be9519b52bb1..f1db15b9c041 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1591,6 +1591,9 @@ enum nl80211_commands { * creation then the new interface will be owned by the netlink socket * that created it and will be destroyed when the socket is closed * + * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is + * the TDLS link initiator. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1931,6 +1934,8 @@ enum nl80211_attrs { NL80211_ATTR_CSA_C_OFFSETS_TX, NL80211_ATTR_MAX_CSA_COUNTERS, + NL80211_ATTR_TDLS_INITIATOR, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, -- cgit v1.2.3-59-g8ed1b From 7200135bc1e61f1437dc326ae2ef2f310c50b4eb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 16 Jun 2014 13:01:52 +0200 Subject: netfilter: kill ulog targets This has been marked as deprecated for quite some time and the NFLOG target replacement has been also available since 2006. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/Kbuild | 1 - include/uapi/linux/netfilter_bridge/ebt_ulog.h | 38 -- include/uapi/linux/netfilter_ipv4/Kbuild | 1 - include/uapi/linux/netfilter_ipv4/ipt_ULOG.h | 49 --- net/bridge/netfilter/Kconfig | 16 - net/bridge/netfilter/ebt_ulog.c | 393 ------------------- net/ipv4/netfilter/Kconfig | 19 - net/ipv4/netfilter/ipt_ULOG.c | 498 ------------------------- 8 files changed, 1015 deletions(-) delete mode 100644 include/uapi/linux/netfilter_bridge/ebt_ulog.h delete mode 100644 include/uapi/linux/netfilter_ipv4/ipt_ULOG.h delete mode 100644 net/bridge/netfilter/ebt_ulog.c delete mode 100644 net/ipv4/netfilter/ipt_ULOG.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter_bridge/Kbuild b/include/uapi/linux/netfilter_bridge/Kbuild index 348717c3a22f..0fbad8ef96de 100644 --- a/include/uapi/linux/netfilter_bridge/Kbuild +++ b/include/uapi/linux/netfilter_bridge/Kbuild @@ -14,6 +14,5 @@ header-y += ebt_nflog.h header-y += ebt_pkttype.h header-y += ebt_redirect.h header-y += ebt_stp.h -header-y += ebt_ulog.h header-y += ebt_vlan.h header-y += ebtables.h diff --git a/include/uapi/linux/netfilter_bridge/ebt_ulog.h b/include/uapi/linux/netfilter_bridge/ebt_ulog.h deleted file mode 100644 index 89a6becb5269..000000000000 --- a/include/uapi/linux/netfilter_bridge/ebt_ulog.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _EBT_ULOG_H -#define _EBT_ULOG_H - -#include - -#define EBT_ULOG_DEFAULT_NLGROUP 0 -#define EBT_ULOG_DEFAULT_QTHRESHOLD 1 -#define EBT_ULOG_MAXNLGROUPS 32 /* hardcoded netlink max */ -#define EBT_ULOG_PREFIX_LEN 32 -#define EBT_ULOG_MAX_QLEN 50 -#define EBT_ULOG_WATCHER "ulog" -#define EBT_ULOG_VERSION 1 - -struct ebt_ulog_info { - __u32 nlgroup; - unsigned int cprange; - unsigned int qthreshold; - char prefix[EBT_ULOG_PREFIX_LEN]; -}; - -typedef struct ebt_ulog_packet_msg { - int version; - char indev[IFNAMSIZ]; - char outdev[IFNAMSIZ]; - char physindev[IFNAMSIZ]; - char physoutdev[IFNAMSIZ]; - char prefix[EBT_ULOG_PREFIX_LEN]; - struct timeval stamp; - unsigned long mark; - unsigned int hook; - size_t data_len; - /* The complete packet, including Ethernet header and perhaps - * the VLAN header is appended */ - unsigned char data[0] __attribute__ - ((aligned (__alignof__(struct ebt_ulog_info)))); -} ebt_ulog_packet_msg_t; - -#endif /* _EBT_ULOG_H */ diff --git a/include/uapi/linux/netfilter_ipv4/Kbuild b/include/uapi/linux/netfilter_ipv4/Kbuild index fb008437dde1..ecb291df390e 100644 --- a/include/uapi/linux/netfilter_ipv4/Kbuild +++ b/include/uapi/linux/netfilter_ipv4/Kbuild @@ -5,7 +5,6 @@ header-y += ipt_ECN.h header-y += ipt_LOG.h header-y += ipt_REJECT.h header-y += ipt_TTL.h -header-y += ipt_ULOG.h header-y += ipt_ah.h header-y += ipt_ecn.h header-y += ipt_ttl.h diff --git a/include/uapi/linux/netfilter_ipv4/ipt_ULOG.h b/include/uapi/linux/netfilter_ipv4/ipt_ULOG.h deleted file mode 100644 index 417aad280bcc..000000000000 --- a/include/uapi/linux/netfilter_ipv4/ipt_ULOG.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Header file for IP tables userspace logging, Version 1.8 - * - * (C) 2000-2002 by Harald Welte - * - * Distributed under the terms of GNU GPL */ - -#ifndef _IPT_ULOG_H -#define _IPT_ULOG_H - -#ifndef NETLINK_NFLOG -#define NETLINK_NFLOG 5 -#endif - -#define ULOG_DEFAULT_NLGROUP 1 -#define ULOG_DEFAULT_QTHRESHOLD 1 - -#define ULOG_MAC_LEN 80 -#define ULOG_PREFIX_LEN 32 - -#define ULOG_MAX_QLEN 50 -/* Why 50? Well... there is a limit imposed by the slab cache 131000 - * bytes. So the multipart netlink-message has to be < 131000 bytes. - * Assuming a standard ethernet-mtu of 1500, we could define this up - * to 80... but even 50 seems to be big enough. */ - -/* private data structure for each rule with a ULOG target */ -struct ipt_ulog_info { - unsigned int nl_group; - size_t copy_range; - size_t qthreshold; - char prefix[ULOG_PREFIX_LEN]; -}; - -/* Format of the ULOG packets passed through netlink */ -typedef struct ulog_packet_msg { - unsigned long mark; - long timestamp_sec; - long timestamp_usec; - unsigned int hook; - char indev_name[IFNAMSIZ]; - char outdev_name[IFNAMSIZ]; - size_t data_len; - char prefix[ULOG_PREFIX_LEN]; - unsigned char mac_len; - unsigned char mac[ULOG_MAC_LEN]; - unsigned char payload[0]; -} ulog_packet_msg_t; - -#endif /*_IPT_ULOG_H*/ diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 629dc77874a9..3a76ac7b7141 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -202,22 +202,6 @@ config BRIDGE_EBT_LOG To compile it as a module, choose M here. If unsure, say N. -config BRIDGE_EBT_ULOG - tristate "ebt: ulog support (OBSOLETE)" - help - This option enables the old bridge-specific "ebt_ulog" implementation - which has been obsoleted by the new "nfnetlink_log" code (see - CONFIG_NETFILTER_NETLINK_LOG). - - This option adds the ulog watcher, that you can use in any rule - in any ebtables table. The packet is passed to a userspace - logging daemon using netlink multicast sockets. This differs - from the log watcher in the sense that the complete packet is - sent to userspace instead of a descriptive text and that - netlink multicast sockets are used instead of the syslog. - - To compile it as a module, choose M here. If unsure, say N. - config BRIDGE_EBT_NFLOG tristate "ebt: nflog support" help diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c deleted file mode 100644 index 7c470c371e14..000000000000 --- a/net/bridge/netfilter/ebt_ulog.c +++ /dev/null @@ -1,393 +0,0 @@ -/* - * netfilter module for userspace bridged Ethernet frames logging daemons - * - * Authors: - * Bart De Schuymer - * Harald Welte - * - * November, 2004 - * - * Based on ipt_ULOG.c, which is - * (C) 2000-2002 by Harald Welte - * - * This module accepts two parameters: - * - * nlbufsiz: - * The parameter specifies how big the buffer for each netlink multicast - * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will - * get accumulated in the kernel until they are sent to userspace. It is - * NOT possible to allocate more than 128kB, and it is strongly discouraged, - * because atomically allocating 128kB inside the network rx softirq is not - * reliable. Please also keep in mind that this buffer size is allocated for - * each nlgroup you are using, so the total kernel memory usage increases - * by that factor. - * - * flushtimeout: - * Specify, after how many hundredths of a second the queue should be - * flushed even if it is not full yet. - * - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../br_private.h" - -static unsigned int nlbufsiz = NLMSG_GOODSIZE; -module_param(nlbufsiz, uint, 0600); -MODULE_PARM_DESC(nlbufsiz, "netlink buffer size (number of bytes) " - "(defaults to 4096)"); - -static unsigned int flushtimeout = 10; -module_param(flushtimeout, uint, 0600); -MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths ofa second) " - "(defaults to 10)"); - -typedef struct { - unsigned int qlen; /* number of nlmsgs' in the skb */ - struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ - struct sk_buff *skb; /* the pre-allocated skb */ - struct timer_list timer; /* the timer function */ - spinlock_t lock; /* the per-queue lock */ -} ebt_ulog_buff_t; - -static int ebt_ulog_net_id __read_mostly; -struct ebt_ulog_net { - unsigned int nlgroup[EBT_ULOG_MAXNLGROUPS]; - ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS]; - struct sock *ebtulognl; -}; - -static struct ebt_ulog_net *ebt_ulog_pernet(struct net *net) -{ - return net_generic(net, ebt_ulog_net_id); -} - -/* send one ulog_buff_t to userspace */ -static void ulog_send(struct ebt_ulog_net *ebt, unsigned int nlgroup) -{ - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[nlgroup]; - - del_timer(&ub->timer); - - if (!ub->skb) - return; - - /* last nlmsg needs NLMSG_DONE */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_type = NLMSG_DONE; - - NETLINK_CB(ub->skb).dst_group = nlgroup + 1; - netlink_broadcast(ebt->ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC); - - ub->qlen = 0; - ub->skb = NULL; -} - -/* timer function to flush queue in flushtimeout time */ -static void ulog_timer(unsigned long data) -{ - struct ebt_ulog_net *ebt = container_of((void *)data, - struct ebt_ulog_net, - nlgroup[*(unsigned int *)data]); - - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[*(unsigned int *)data]; - spin_lock_bh(&ub->lock); - if (ub->skb) - ulog_send(ebt, *(unsigned int *)data); - spin_unlock_bh(&ub->lock); -} - -static struct sk_buff *ulog_alloc_skb(unsigned int size) -{ - struct sk_buff *skb; - unsigned int n; - - n = max(size, nlbufsiz); - skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); - if (!skb) { - if (n > size) { - /* try to allocate only as much as we need for - * current packet */ - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - pr_debug("cannot even allocate buffer of size %ub\n", - size); - } - } - - return skb; -} - -static void ebt_ulog_packet(struct net *net, unsigned int hooknr, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct ebt_ulog_info *uloginfo, - const char *prefix) -{ - ebt_ulog_packet_msg_t *pm; - size_t size, copy_len; - struct nlmsghdr *nlh; - struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); - unsigned int group = uloginfo->nlgroup; - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group]; - spinlock_t *lock = &ub->lock; - ktime_t kt; - - if ((uloginfo->cprange == 0) || - (uloginfo->cprange > skb->len + ETH_HLEN)) - copy_len = skb->len + ETH_HLEN; - else - copy_len = uloginfo->cprange; - - size = nlmsg_total_size(sizeof(*pm) + copy_len); - if (size > nlbufsiz) { - pr_debug("Size %Zd needed, but nlbufsiz=%d\n", size, nlbufsiz); - return; - } - - spin_lock_bh(lock); - - if (!ub->skb) { - if (!(ub->skb = ulog_alloc_skb(size))) - goto unlock; - } else if (size > skb_tailroom(ub->skb)) { - ulog_send(ebt, group); - - if (!(ub->skb = ulog_alloc_skb(size))) - goto unlock; - } - - nlh = nlmsg_put(ub->skb, 0, ub->qlen, 0, - size - NLMSG_ALIGN(sizeof(*nlh)), 0); - if (!nlh) { - kfree_skb(ub->skb); - ub->skb = NULL; - goto unlock; - } - ub->qlen++; - - pm = nlmsg_data(nlh); - memset(pm, 0, sizeof(*pm)); - - /* Fill in the ulog data */ - pm->version = EBT_ULOG_VERSION; - kt = ktime_get_real(); - pm->stamp = ktime_to_timeval(kt); - if (ub->qlen == 1) - ub->skb->tstamp = kt; - pm->data_len = copy_len; - pm->mark = skb->mark; - pm->hook = hooknr; - if (uloginfo->prefix != NULL) - strcpy(pm->prefix, uloginfo->prefix); - - if (in) { - strcpy(pm->physindev, in->name); - /* If in isn't a bridge, then physindev==indev */ - if (br_port_exists(in)) - /* rcu_read_lock()ed by nf_hook_slow */ - strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name); - else - strcpy(pm->indev, in->name); - } - - if (out) { - /* If out exists, then out is a bridge port */ - strcpy(pm->physoutdev, out->name); - /* rcu_read_lock()ed by nf_hook_slow */ - strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name); - } - - if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0) - BUG(); - - if (ub->qlen > 1) - ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; - - ub->lastnlh = nlh; - - if (ub->qlen >= uloginfo->qthreshold) - ulog_send(ebt, group); - else if (!timer_pending(&ub->timer)) { - ub->timer.expires = jiffies + flushtimeout * HZ / 100; - add_timer(&ub->timer); - } - -unlock: - spin_unlock_bh(lock); -} - -/* this function is registered with the netfilter core */ -static void ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, - const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct nf_loginfo *li, - const char *prefix) -{ - struct ebt_ulog_info loginfo; - - if (!li || li->type != NF_LOG_TYPE_ULOG) { - loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP; - loginfo.cprange = 0; - loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD; - loginfo.prefix[0] = '\0'; - } else { - loginfo.nlgroup = li->u.ulog.group; - loginfo.cprange = li->u.ulog.copy_len; - loginfo.qthreshold = li->u.ulog.qthreshold; - strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); - } - - ebt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); -} - -static unsigned int -ebt_ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - - ebt_ulog_packet(net, par->hooknum, skb, par->in, par->out, - par->targinfo, NULL); - return EBT_CONTINUE; -} - -static int ebt_ulog_tg_check(const struct xt_tgchk_param *par) -{ - struct ebt_ulog_info *uloginfo = par->targinfo; - - if (!par->net->xt.ebt_ulog_warn_deprecated) { - pr_info("ebt_ulog is deprecated and it will be removed soon, " - "use ebt_nflog instead\n"); - par->net->xt.ebt_ulog_warn_deprecated = true; - } - - if (uloginfo->nlgroup > 31) - return -EINVAL; - - uloginfo->prefix[EBT_ULOG_PREFIX_LEN - 1] = '\0'; - - if (uloginfo->qthreshold > EBT_ULOG_MAX_QLEN) - uloginfo->qthreshold = EBT_ULOG_MAX_QLEN; - - return 0; -} - -static struct xt_target ebt_ulog_tg_reg __read_mostly = { - .name = "ulog", - .revision = 0, - .family = NFPROTO_BRIDGE, - .target = ebt_ulog_tg, - .checkentry = ebt_ulog_tg_check, - .targetsize = sizeof(struct ebt_ulog_info), - .me = THIS_MODULE, -}; - -static struct nf_logger ebt_ulog_logger __read_mostly = { - .name = "ebt_ulog", - .logfn = &ebt_log_packet, - .me = THIS_MODULE, -}; - -static int __net_init ebt_ulog_net_init(struct net *net) -{ - int i; - struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); - - struct netlink_kernel_cfg cfg = { - .groups = EBT_ULOG_MAXNLGROUPS, - }; - - /* initialize ulog_buffers */ - for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { - ebt->nlgroup[i] = i; - setup_timer(&ebt->ulog_buffers[i].timer, ulog_timer, - (unsigned long)&ebt->nlgroup[i]); - spin_lock_init(&ebt->ulog_buffers[i].lock); - } - - ebt->ebtulognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); - if (!ebt->ebtulognl) - return -ENOMEM; - - nf_log_set(net, NFPROTO_BRIDGE, &ebt_ulog_logger); - return 0; -} - -static void __net_exit ebt_ulog_net_fini(struct net *net) -{ - int i; - struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); - - nf_log_unset(net, &ebt_ulog_logger); - for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[i]; - del_timer(&ub->timer); - - if (ub->skb) { - kfree_skb(ub->skb); - ub->skb = NULL; - } - } - netlink_kernel_release(ebt->ebtulognl); -} - -static struct pernet_operations ebt_ulog_net_ops = { - .init = ebt_ulog_net_init, - .exit = ebt_ulog_net_fini, - .id = &ebt_ulog_net_id, - .size = sizeof(struct ebt_ulog_net), -}; - -static int __init ebt_ulog_init(void) -{ - int ret; - - if (nlbufsiz >= 128*1024) { - pr_warn("Netlink buffer has to be <= 128kB," - "please try a smaller nlbufsiz parameter.\n"); - return -EINVAL; - } - - ret = register_pernet_subsys(&ebt_ulog_net_ops); - if (ret) - goto out_pernet; - - ret = xt_register_target(&ebt_ulog_tg_reg); - if (ret) - goto out_target; - - nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger); - - return 0; - -out_target: - unregister_pernet_subsys(&ebt_ulog_net_ops); -out_pernet: - return ret; -} - -static void __exit ebt_ulog_fini(void) -{ - nf_log_unregister(&ebt_ulog_logger); - xt_unregister_target(&ebt_ulog_tg_reg); - unregister_pernet_subsys(&ebt_ulog_net_ops); -} - -module_init(ebt_ulog_init); -module_exit(ebt_ulog_fini); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Bart De Schuymer "); -MODULE_DESCRIPTION("Ebtables: Packet logging to netlink using ULOG"); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a26ce035e3fa..730faacbc5f8 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -159,25 +159,6 @@ config IP_NF_TARGET_SYNPROXY To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_ULOG - tristate "ULOG target support (obsolete)" - default m if NETFILTER_ADVANCED=n - ---help--- - - This option enables the old IPv4-only "ipt_ULOG" implementation - which has been obsoleted by the new "nfnetlink_log" code (see - CONFIG_NETFILTER_NETLINK_LOG). - - This option adds a `ULOG' target, which allows you to create rules in - any iptables table. The packet is passed to a userspace logging - daemon using netlink multicast sockets; unlike the LOG target - which can only be viewed through syslog. - - The appropriate userspace logging daemon (ulogd) may be obtained from - - - To compile it as a module, choose M here. If unsure, say N. - # NAT + specific targets: nf_conntrack config NF_NAT_IPV4 tristate "IPv4 NAT" diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c deleted file mode 100644 index 9cb993cd224b..000000000000 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ /dev/null @@ -1,498 +0,0 @@ -/* - * netfilter module for userspace packet logging daemons - * - * (C) 2000-2004 by Harald Welte - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - * (C) 2005-2007 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This module accepts two parameters: - * - * nlbufsiz: - * The parameter specifies how big the buffer for each netlink multicast - * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will - * get accumulated in the kernel until they are sent to userspace. It is - * NOT possible to allocate more than 128kB, and it is strongly discouraged, - * because atomically allocating 128kB inside the network rx softirq is not - * reliable. Please also keep in mind that this buffer size is allocated for - * each nlgroup you are using, so the total kernel memory usage increases - * by that factor. - * - * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since - * nlbufsiz is used with alloc_skb, which adds another - * sizeof(struct skb_shared_info). Use NLMSG_GOODSIZE instead. - * - * flushtimeout: - * Specify, after how many hundredths of a second the queue should be - * flushed even if it is not full yet. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); - -#define ULOG_NL_EVENT 111 /* Harald's favorite number */ -#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ - -static unsigned int nlbufsiz = NLMSG_GOODSIZE; -module_param(nlbufsiz, uint, 0400); -MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); - -static unsigned int flushtimeout = 10; -module_param(flushtimeout, uint, 0600); -MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); - -static bool nflog = true; -module_param(nflog, bool, 0400); -MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); - -/* global data structures */ - -typedef struct { - unsigned int qlen; /* number of nlmsgs' in the skb */ - struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ - struct sk_buff *skb; /* the pre-allocated skb */ - struct timer_list timer; /* the timer function */ -} ulog_buff_t; - -static int ulog_net_id __read_mostly; -struct ulog_net { - unsigned int nlgroup[ULOG_MAXNLGROUPS]; - ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; - struct sock *nflognl; - spinlock_t lock; -}; - -static struct ulog_net *ulog_pernet(struct net *net) -{ - return net_generic(net, ulog_net_id); -} - -/* send one ulog_buff_t to userspace */ -static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum) -{ - ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum]; - - pr_debug("ulog_send: timer is deleting\n"); - del_timer(&ub->timer); - - if (!ub->skb) { - pr_debug("ulog_send: nothing to send\n"); - return; - } - - /* last nlmsg needs NLMSG_DONE */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_type = NLMSG_DONE; - - NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; - pr_debug("throwing %d packets to netlink group %u\n", - ub->qlen, nlgroupnum + 1); - netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1, - GFP_ATOMIC); - - ub->qlen = 0; - ub->skb = NULL; - ub->lastnlh = NULL; -} - - -/* timer function to flush queue in flushtimeout time */ -static void ulog_timer(unsigned long data) -{ - unsigned int groupnum = *((unsigned int *)data); - struct ulog_net *ulog = container_of((void *)data, - struct ulog_net, - nlgroup[groupnum]); - pr_debug("timer function called, calling ulog_send\n"); - - /* lock to protect against somebody modifying our structure - * from ipt_ulog_target at the same time */ - spin_lock_bh(&ulog->lock); - ulog_send(ulog, groupnum); - spin_unlock_bh(&ulog->lock); -} - -static struct sk_buff *ulog_alloc_skb(unsigned int size) -{ - struct sk_buff *skb; - unsigned int n; - - /* alloc skb which should be big enough for a whole - * multipart message. WARNING: has to be <= 131000 - * due to slab allocator restrictions */ - - n = max(size, nlbufsiz); - skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); - if (!skb) { - if (n > size) { - /* try to allocate only as much as we need for - * current packet */ - - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - pr_debug("cannot even allocate %ub\n", size); - } - } - - return skb; -} - -static void ipt_ulog_packet(struct net *net, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct ipt_ulog_info *loginfo, - const char *prefix) -{ - ulog_buff_t *ub; - ulog_packet_msg_t *pm; - size_t size, copy_len; - struct nlmsghdr *nlh; - struct timeval tv; - struct ulog_net *ulog = ulog_pernet(net); - - /* ffs == find first bit set, necessary because userspace - * is already shifting groupnumber, but we need unshifted. - * ffs() returns [1..32], we need [0..31] */ - unsigned int groupnum = ffs(loginfo->nl_group) - 1; - - /* calculate the size of the skb needed */ - if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len) - copy_len = skb->len; - else - copy_len = loginfo->copy_range; - - size = nlmsg_total_size(sizeof(*pm) + copy_len); - - ub = &ulog->ulog_buffers[groupnum]; - - spin_lock_bh(&ulog->lock); - - if (!ub->skb) { - if (!(ub->skb = ulog_alloc_skb(size))) - goto alloc_failure; - } else if (ub->qlen >= loginfo->qthreshold || - size > skb_tailroom(ub->skb)) { - /* either the queue len is too high or we don't have - * enough room in nlskb left. send it to userspace. */ - - ulog_send(ulog, groupnum); - - if (!(ub->skb = ulog_alloc_skb(size))) - goto alloc_failure; - } - - pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); - - nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, - sizeof(*pm)+copy_len, 0); - if (!nlh) { - pr_debug("error during nlmsg_put\n"); - goto out_unlock; - } - ub->qlen++; - - pm = nlmsg_data(nlh); - memset(pm, 0, sizeof(*pm)); - - /* We might not have a timestamp, get one */ - if (skb->tstamp.tv64 == 0) - __net_timestamp((struct sk_buff *)skb); - - /* copy hook, prefix, timestamp, payload, etc. */ - pm->data_len = copy_len; - tv = ktime_to_timeval(skb->tstamp); - put_unaligned(tv.tv_sec, &pm->timestamp_sec); - put_unaligned(tv.tv_usec, &pm->timestamp_usec); - put_unaligned(skb->mark, &pm->mark); - pm->hook = hooknum; - if (prefix != NULL) { - strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1); - pm->prefix[sizeof(pm->prefix) - 1] = '\0'; - } - else if (loginfo->prefix[0] != '\0') - strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); - - if (in && in->hard_header_len > 0 && - skb->mac_header != skb->network_header && - in->hard_header_len <= ULOG_MAC_LEN) { - memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); - pm->mac_len = in->hard_header_len; - } else - pm->mac_len = 0; - - if (in) - strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); - - if (out) - strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); - - /* copy_len <= skb->len, so can't fail. */ - if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) - BUG(); - - /* check if we are building multi-part messages */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; - - ub->lastnlh = nlh; - - /* if timer isn't already running, start it */ - if (!timer_pending(&ub->timer)) { - ub->timer.expires = jiffies + flushtimeout * HZ / 100; - add_timer(&ub->timer); - } - - /* if threshold is reached, send message to userspace */ - if (ub->qlen >= loginfo->qthreshold) { - if (loginfo->qthreshold > 1) - nlh->nlmsg_type = NLMSG_DONE; - ulog_send(ulog, groupnum); - } -out_unlock: - spin_unlock_bh(&ulog->lock); - - return; - -alloc_failure: - pr_debug("Error building netlink message\n"); - spin_unlock_bh(&ulog->lock); -} - -static unsigned int -ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - - ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out, - par->targinfo, NULL); - return XT_CONTINUE; -} - -static void ipt_logfn(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *li, - const char *prefix) -{ - struct ipt_ulog_info loginfo; - - if (!li || li->type != NF_LOG_TYPE_ULOG) { - loginfo.nl_group = ULOG_DEFAULT_NLGROUP; - loginfo.copy_range = 0; - loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; - loginfo.prefix[0] = '\0'; - } else { - loginfo.nl_group = li->u.ulog.group; - loginfo.copy_range = li->u.ulog.copy_len; - loginfo.qthreshold = li->u.ulog.qthreshold; - strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); - } - - ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); -} - -static int ulog_tg_check(const struct xt_tgchk_param *par) -{ - const struct ipt_ulog_info *loginfo = par->targinfo; - - if (!par->net->xt.ulog_warn_deprecated) { - pr_info("ULOG is deprecated and it will be removed soon, " - "use NFLOG instead\n"); - par->net->xt.ulog_warn_deprecated = true; - } - - if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { - pr_debug("prefix not null-terminated\n"); - return -EINVAL; - } - if (loginfo->qthreshold > ULOG_MAX_QLEN) { - pr_debug("queue threshold %Zu > MAX_QLEN\n", - loginfo->qthreshold); - return -EINVAL; - } - return 0; -} - -#ifdef CONFIG_COMPAT -struct compat_ipt_ulog_info { - compat_uint_t nl_group; - compat_size_t copy_range; - compat_size_t qthreshold; - char prefix[ULOG_PREFIX_LEN]; -}; - -static void ulog_tg_compat_from_user(void *dst, const void *src) -{ - const struct compat_ipt_ulog_info *cl = src; - struct ipt_ulog_info l = { - .nl_group = cl->nl_group, - .copy_range = cl->copy_range, - .qthreshold = cl->qthreshold, - }; - - memcpy(l.prefix, cl->prefix, sizeof(l.prefix)); - memcpy(dst, &l, sizeof(l)); -} - -static int ulog_tg_compat_to_user(void __user *dst, const void *src) -{ - const struct ipt_ulog_info *l = src; - struct compat_ipt_ulog_info cl = { - .nl_group = l->nl_group, - .copy_range = l->copy_range, - .qthreshold = l->qthreshold, - }; - - memcpy(cl.prefix, l->prefix, sizeof(cl.prefix)); - return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0; -} -#endif /* CONFIG_COMPAT */ - -static struct xt_target ulog_tg_reg __read_mostly = { - .name = "ULOG", - .family = NFPROTO_IPV4, - .target = ulog_tg, - .targetsize = sizeof(struct ipt_ulog_info), - .checkentry = ulog_tg_check, -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_ipt_ulog_info), - .compat_from_user = ulog_tg_compat_from_user, - .compat_to_user = ulog_tg_compat_to_user, -#endif - .me = THIS_MODULE, -}; - -static struct nf_logger ipt_ulog_logger __read_mostly = { - .name = "ipt_ULOG", - .logfn = ipt_logfn, - .me = THIS_MODULE, -}; - -static int __net_init ulog_tg_net_init(struct net *net) -{ - int i; - struct ulog_net *ulog = ulog_pernet(net); - struct netlink_kernel_cfg cfg = { - .groups = ULOG_MAXNLGROUPS, - }; - - spin_lock_init(&ulog->lock); - /* initialize ulog_buffers */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - ulog->nlgroup[i] = i; - setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer, - (unsigned long)&ulog->nlgroup[i]); - } - - ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); - if (!ulog->nflognl) - return -ENOMEM; - - if (nflog) - nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger); - - return 0; -} - -static void __net_exit ulog_tg_net_exit(struct net *net) -{ - ulog_buff_t *ub; - int i; - struct ulog_net *ulog = ulog_pernet(net); - - if (nflog) - nf_log_unset(net, &ipt_ulog_logger); - - netlink_kernel_release(ulog->nflognl); - - /* remove pending timers and free allocated skb's */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - ub = &ulog->ulog_buffers[i]; - pr_debug("timer is deleting\n"); - del_timer(&ub->timer); - - if (ub->skb) { - kfree_skb(ub->skb); - ub->skb = NULL; - } - } -} - -static struct pernet_operations ulog_tg_net_ops = { - .init = ulog_tg_net_init, - .exit = ulog_tg_net_exit, - .id = &ulog_net_id, - .size = sizeof(struct ulog_net), -}; - -static int __init ulog_tg_init(void) -{ - int ret; - pr_debug("init module\n"); - - if (nlbufsiz > 128*1024) { - pr_warn("Netlink buffer has to be <= 128kB\n"); - return -EINVAL; - } - - ret = register_pernet_subsys(&ulog_tg_net_ops); - if (ret) - goto out_pernet; - - ret = xt_register_target(&ulog_tg_reg); - if (ret < 0) - goto out_target; - - if (nflog) - nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); - - return 0; - -out_target: - unregister_pernet_subsys(&ulog_tg_net_ops); -out_pernet: - return ret; -} - -static void __exit ulog_tg_exit(void) -{ - pr_debug("cleanup_module\n"); - if (nflog) - nf_log_unregister(&ipt_ulog_logger); - xt_unregister_target(&ulog_tg_reg); - unregister_pernet_subsys(&ulog_tg_net_ops); -} - -module_init(ulog_tg_init); -module_exit(ulog_tg_exit); -- cgit v1.2.3-59-g8ed1b From a6eacef7fba7834da4d22762ea0d8524df3993a8 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Wed, 25 Jun 2014 10:07:05 +0200 Subject: tipc: bump max configurable window size The maximum window size is limited by the sequence gap field, which was expanded with bd7845337b105e090dd18912d511139945fa7586 ("tipc: Expand link sequence gap field to 13 bits") We remove the artificial limit that prevents the link window to be set larger than 150. Signed-off-by: Erik Hugne Reviewed-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 41a76acbb305..876d0a14863c 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -182,7 +182,7 @@ #define TIPC_MIN_LINK_WIN 16 #define TIPC_DEF_LINK_WIN 50 -#define TIPC_MAX_LINK_WIN 150 +#define TIPC_MAX_LINK_WIN 8191 struct tipc_node_info { -- cgit v1.2.3-59-g8ed1b From 09d27b88f15f08fcfbaf57d9b0b4489816264815 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Jun 2014 13:37:13 +0200 Subject: netfilter: nft_log: complete logging support Use the unified nf_log_packet() interface that allows us explicit logger selection through the nf_loginfo structure. If you specify the group attribute, this means you want to receive logging messages through nfnetlink_log. In that case, the snaplen and qthreshold attributes allows you to tune internal aspects of the netlink logging infrastructure. On the other hand, if the level is specified, then the plain text format through the kernel logging ring is used instead, which is also used by default if neither group nor level are indicated. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++ net/netfilter/nft_log.c | 76 +++++++++++++++++++++++++------- 2 files changed, 63 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2a88f645a5d8..801bdd1e56e3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -697,6 +697,8 @@ enum nft_counter_attributes { * @NFTA_LOG_PREFIX: prefix to prepend to log messages (NLA_STRING) * @NFTA_LOG_SNAPLEN: length of payload to include in netlink message (NLA_U32) * @NFTA_LOG_QTHRESHOLD: queue threshold (NLA_U32) + * @NFTA_LOG_LEVEL: log level (NLA_U32) + * @NFTA_LOG_FLAGS: logging flags (NLA_U32) */ enum nft_log_attributes { NFTA_LOG_UNSPEC, @@ -704,6 +706,8 @@ enum nft_log_attributes { NFTA_LOG_PREFIX, NFTA_LOG_SNAPLEN, NFTA_LOG_QTHRESHOLD, + NFTA_LOG_LEVEL, + NFTA_LOG_FLAGS, __NFTA_LOG_MAX }; #define NFTA_LOG_MAX (__NFTA_LOG_MAX - 1) diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 55d4297d8417..5b1a4f5c3dcc 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2008-2009 Patrick McHardy + * Copyright (c) 2012-2014 Pablo Neira Ayuso * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -41,6 +42,8 @@ static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { [NFTA_LOG_PREFIX] = { .type = NLA_STRING }, [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 }, [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 }, + [NFTA_LOG_LEVEL] = { .type = NLA_U32 }, + [NFTA_LOG_FLAGS] = { .type = NLA_U32 }, }; static int nft_log_init(const struct nft_ctx *ctx, @@ -58,18 +61,41 @@ static int nft_log_init(const struct nft_ctx *ctx, if (priv->prefix == NULL) return -ENOMEM; nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); - } else + } else { priv->prefix = (char *)nft_log_null_prefix; + } - li->type = NF_LOG_TYPE_ULOG; + li->type = NF_LOG_TYPE_LOG; + if (tb[NFTA_LOG_LEVEL] != NULL && + tb[NFTA_LOG_GROUP] != NULL) + return -EINVAL; if (tb[NFTA_LOG_GROUP] != NULL) + li->type = NF_LOG_TYPE_ULOG; + + switch (li->type) { + case NF_LOG_TYPE_LOG: + if (tb[NFTA_LOG_LEVEL] != NULL) { + li->u.log.level = + ntohl(nla_get_be32(tb[NFTA_LOG_LEVEL]));; + } else { + li->u.log.level = 4; + } + if (tb[NFTA_LOG_FLAGS] != NULL) { + li->u.log.logflags = + ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS])); + } + break; + case NF_LOG_TYPE_ULOG: li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP])); - - if (tb[NFTA_LOG_SNAPLEN] != NULL) - li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); - if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { - li->u.ulog.qthreshold = - ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); + if (tb[NFTA_LOG_SNAPLEN] != NULL) { + li->u.ulog.copy_len = + ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); + } + if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { + li->u.ulog.qthreshold = + ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); + } + break; } if (ctx->afi->family == NFPROTO_INET) { @@ -113,17 +139,33 @@ static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) if (priv->prefix != nft_log_null_prefix) if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix)) goto nla_put_failure; - if (li->u.ulog.group) - if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) - goto nla_put_failure; - if (li->u.ulog.copy_len) - if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, - htonl(li->u.ulog.copy_len))) + switch (li->type) { + case NF_LOG_TYPE_LOG: + if (nla_put_be32(skb, NFTA_LOG_LEVEL, htonl(li->u.log.level))) goto nla_put_failure; - if (li->u.ulog.qthreshold) - if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, - htons(li->u.ulog.qthreshold))) + + if (li->u.log.logflags) { + if (nla_put_be32(skb, NFTA_LOG_FLAGS, + htonl(li->u.log.logflags))) + goto nla_put_failure; + } + break; + case NF_LOG_TYPE_ULOG: + if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) goto nla_put_failure; + + if (li->u.ulog.copy_len) { + if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, + htonl(li->u.ulog.copy_len))) + goto nla_put_failure; + } + if (li->u.ulog.qthreshold) { + if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, + htons(li->u.ulog.qthreshold))) + goto nla_put_failure; + } + break; + } return 0; nla_put_failure: -- cgit v1.2.3-59-g8ed1b From d93331965729850303f6111381c1a4a9e9b8ae5a Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Wed, 25 Jun 2014 14:44:53 -0700 Subject: ipv6: Allow accepting RA from local IP addresses. This can be used in virtual networking applications, and may have other uses as well. The option is disabled by default. A specific use case is setting up virtual routers, bridges, and hosts on a single OS without the use of network namespaces or virtual machines. With proper use of ip rules, routing tables, veth interface pairs and/or other virtual interfaces, and applications that can bind to interfaces and/or IP addresses, it is possibly to create one or more virtual routers with multiple hosts attached. The host interfaces can act as IPv6 systems, with radvd running on the ports in the virtual routers. With the option provided in this patch enabled, those hosts can now properly obtain IPv6 addresses from the radvd. Signed-off-by: Ben Greear Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 12 ++++++++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + include/uapi/linux/sysctl.h | 1 + kernel/sysctl_binary.c | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 21 +++++++++++++-------- 7 files changed, 39 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ab42c95f9985..10e216c6e05e 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1210,6 +1210,18 @@ accept_ra_defrtr - BOOLEAN Functional default: enabled if accept_ra is enabled. disabled if accept_ra is disabled. +accept_ra_from_local - BOOLEAN + Accept RA with source-address that is found on local machine + if the RA is otherwise proper and able to be accepted. + Default is to NOT accept these as it may be an un-intended + network loop. + + Functional default: + enabled if accept_ra_from_local is enabled + on a specific interface. + disabled if accept_ra_from_local is disabled + on a specific interface. + accept_ra_pinfo - BOOLEAN Learn Prefix Information in Router Advertisement. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index c811300b0b0c..b0f2452f1d58 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -39,6 +39,7 @@ struct ipv6_devconf { #endif __s32 proxy_ndp; __s32 accept_source_route; + __s32 accept_ra_from_local; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD __s32 optimistic_dad; #endif diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 593b0e32d956..efa2666f4b8a 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -163,6 +163,7 @@ enum { DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL, DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL, DEVCONF_SUPPRESS_FRAG_NDISC, + DEVCONF_ACCEPT_RA_FROM_LOCAL, DEVCONF_MAX }; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 6d6721341f49..43aaba1cc037 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -568,6 +568,7 @@ enum { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN=22, NET_IPV6_PROXY_NDP=23, NET_IPV6_ACCEPT_SOURCE_ROUTE=25, + NET_IPV6_ACCEPT_RA_FROM_LOCAL=26, __NET_IPV6_MAX }; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd9e7ad..e4ba9a5a5ccb 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = { { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, {} }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5667b3003af9..358edd2272ac 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -186,6 +186,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .accept_ra_from_local = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -222,6 +223,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .accept_ra_from_local = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -4321,6 +4323,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; + array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; } static inline size_t inet6_ifla6_size(void) @@ -5167,6 +5170,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "accept_ra_from_local", + .data = &ipv6_devconf.accept_ra_from_local, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 736c11c6d266..a845e3d2057e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1148,11 +1148,15 @@ static void ndisc_router_discovery(struct sk_buff *skb) goto skip_defrtr; } - if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0)) { + /* Do not accept RA with source-addr found on local machine unless + * accept_ra_from_local is set to true. + */ + if (!(in6_dev->cnf.accept_ra_from_local || + ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0))) { ND_PRINTK(2, info, - "RA: %s, chk_addr failed for dev: %s\n", - __func__, skb->dev->name); + "RA from local address detected on dev: %s: default router ignored\n", + skb->dev->name); goto skip_defrtr; } @@ -1290,11 +1294,12 @@ skip_linkparms: } #ifdef CONFIG_IPV6_ROUTE_INFO - if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0)) { + if (!(in6_dev->cnf.accept_ra_from_local || + ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0))) { ND_PRINTK(2, info, - "RA: %s, chk-addr (route info) is false for dev: %s\n", - __func__, skb->dev->name); + "RA from local address detected on dev: %s: router info ignored.\n", + skb->dev->name); goto skip_routeinfo; } -- cgit v1.2.3-59-g8ed1b From cb1ce2ef387b01686469487edd45994872d52d73 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:33:10 -0700 Subject: ipv6: Implement automatic flow label generation on transmit Automatically generate flow labels for IPv6 packets on transmit. The flow label is computed based on skb_get_hash. The flow label will only automatically be set when it is zero otherwise (i.e. flow label manager hasn't set one). This supports the transmit side functionality of RFC 6438. Added an IPv6 sysctl auto_flowlabels to enable/disable this behavior system wide, and added IPV6_AUTOFLOWLABEL socket option to enable this functionality per socket. By default, auto flowlabels are disabled to avoid possible conflicts with flow label manager, however if this feature proves useful we may want to enable it by default. It should also be noted that FreeBSD has already implemented automatic flow labels (including the sysctl and socket option). In FreeBSD, automatic flow labels default to enabled. Performance impact: Running super_netperf with 200 flows for TCP_RR and UDP_RR for IPv6. Note that in UDP case, __skb_get_hash will be called for every packet with explains slight regression. In the TCP case the hash is saved in the socket so there is no regression. Automatic flow labels disabled: TCP_RR: 86.53% CPU utilization 127/195/322 90/95/99% latencies 1.40498e+06 tps UDP_RR: 90.70% CPU utilization 118/168/243 90/95/99% latencies 1.50309e+06 tps Automatic flow labels enabled: TCP_RR: 85.90% CPU utilization 128/199/337 90/95/99% latencies 1.40051e+06 UDP_RR 92.61% CPU utilization 115/164/236 90/95/99% latencies 1.4687e+06 Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/linux/ipv6.h | 3 ++- include/net/ipv6.h | 20 ++++++++++++++++++++ include/net/netns/ipv6.h | 1 + include/uapi/linux/in6.h | 1 + net/ipv6/af_inet6.c | 1 + net/ipv6/ip6_gre.c | 7 +++++-- net/ipv6/ip6_output.c | 7 +++++-- net/ipv6/ip6_tunnel.c | 3 ++- net/ipv6/ipv6_sockglue.c | 8 ++++++++ net/ipv6/sysctl_net_ipv6.c | 8 ++++++++ 11 files changed, 62 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 10e216c6e05e..f35bfe43bf7a 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1132,6 +1132,15 @@ flowlabel_consistency - BOOLEAN FALSE: disabled Default: TRUE +auto_flowlabels - BOOLEAN + Automatically generate flow labels based based on a flow hash + of the packet. This allows intermediate devices, such as routers, + to idenfify packet flows for mechanisms like Equal Cost Multipath + Routing (see RFC 6438). + TRUE: enabled + FALSE: disabled + Default: false + anycast_src_echo_reply - BOOLEAN Controls the use of anycast addresses as source addresses for ICMPv6 echo reply diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 5dc68c3ebcbd..ff560537dd61 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -199,7 +199,8 @@ struct ipv6_pinfo { * 010: prefer public address * 100: prefer care-of address */ - dontfrag:1; + dontfrag:1, + autoflowlabel:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 2aa86e1135a1..4308f2ada8b3 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -699,6 +699,26 @@ static inline void ip6_set_txhash(struct sock *sk) sk->sk_txhash = flow_hash_from_keys(&keys); } +static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, + __be32 flowlabel, bool autolabel) +{ + if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) { + __be32 hash; + + hash = skb_get_hash(skb); + + /* Since this is being sent on the wire obfuscate hash a bit + * to minimize possbility that any useful information to an + * attacker is leaked. Only lower 20 bits are relevant. + */ + hash ^= hash >> 12; + + flowlabel = hash & IPV6_FLOWLABEL_MASK; + } + + return flowlabel; +} + /* * Header manipulation */ diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 19d3446e59d2..eade27adecf3 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 { int ip6_rt_mtu_expires; int ip6_rt_min_advmss; int flowlabel_consistency; + int auto_flowlabels; int icmpv6_time; int anycast_src_echo_reply; int fwmark_reflect; diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 0d8e0f0342dc..22b7a69619d8 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -233,6 +233,7 @@ struct in6_flowlabel_req { #if 0 /* not yet */ #define IPV6_USE_MIN_MTU 63 #endif +#define IPV6_AUTOFLOWLABEL 64 /* * Netfilter (1) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a426cd7099bb..2daa3a133e49 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -765,6 +765,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.flowlabel_consistency = 1; + net->ipv6.sysctl.auto_flowlabels = 0; atomic_set(&net->ipv6.rt_genid, 0); err = ipv6_init_mibs(net); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3873181ed856..365b2b6f3942 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -723,7 +723,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, * Push down and install the IP header. */ ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); ipv6h->hop_limit = tunnel->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; @@ -1174,7 +1175,9 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen); __be16 *p = (__be16 *)(ipv6h+1); - ip6_flow_hdr(ipv6h, 0, t->fl.u.ip6.flowlabel); + ip6_flow_hdr(ipv6h, 0, + ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, false)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index cb9df0eb4023..fa83bdd4c3dd 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -205,7 +205,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); - ip6_flow_hdr(hdr, tclass, fl6->flowlabel); + ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, + np->autoflowlabel)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -1569,7 +1570,9 @@ int ip6_push_pending_frames(struct sock *sk) skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel); + ip6_flow_hdr(hdr, np->cork.tclass, + ip6_make_flowlabel(net, skb, fl6->flowlabel, + np->autoflowlabel)); hdr->hop_limit = np->cork.hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index afa082458360..51a1eb185ea7 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1046,7 +1046,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index cc34f65179e4..b50b9e54cf53 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -834,6 +834,10 @@ pref_skip_coa: np->dontfrag = valbool; retv = 0; break; + case IPV6_AUTOFLOWLABEL: + np->autoflowlabel = valbool; + retv = 0; + break; } release_sock(sk); @@ -1273,6 +1277,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->dontfrag; break; + case IPV6_AUTOFLOWLABEL: + val = np->autoflowlabel; + break; + default: return -ENOPROTOOPT; } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 058f3eca2e53..5bf7b61f8ae8 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -38,6 +38,13 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "auto_flowlabels", + .data = &init_net.ipv6.sysctl.auto_flowlabels, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, @@ -74,6 +81,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[0].data = &net->ipv6.sysctl.bindv6only; ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply; ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency; + ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.3-59-g8ed1b From bc91b0f07ada5535427373a4e2050877bcc12218 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Jul 2014 21:10:18 +0200 Subject: ipv6: addrconf: implement address generation modes This patch introduces a possibility for userspace to set various (so far two) modes of generating addresses. This is useful for example for NetworkManager because it can set the mode to NONE and take care of link local addresses itself. That allow it to have the interface up, monitoring carrier but still don't have any addresses on it. One more use-case by Dan Williams: WWAN devices often have their LL address provided by the firmware of the device, which sometimes refuses to respond to incorrect LL addresses when doing DHCPv6 or IPv6 ND. The kernel cannot generate the correct LL address for two reasons: 1) WWAN pseudo-ethernet interfaces often construct a fake MAC address, or read a meaningless MAC address from the firmware. Thus the EUI64 and the IPv6LL address the kernel assigns will be wrong. The real LL address is often retrieved from the firmware with AT or proprietary commands. 2) WWAN PPP interfaces receive their LL address from IPV6CP, not from kernel assignments. Only after IPV6CP has completed do we know the LL address of the PPP interface and its peer. But the kernel has already assigned an incorrect LL address to the interface. So being able to suppress the kernel LL address generation and assign the one retrieved from the firmware is less complicated and more robust. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/if_inet6.h | 1 + include/uapi/linux/if_link.h | 6 +++++ net/ipv6/addrconf.c | 56 ++++++++++++++++++++++++++++++-------------- 3 files changed, 45 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index b4956a5fcc3f..d07b1a64b4e7 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -205,6 +205,7 @@ struct inet6_dev { struct timer_list rs_timer; __u8 rs_probes; + __u8 addr_gen_mode; unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; }; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b38534895db5..ff957604a721 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -204,11 +204,17 @@ enum { IFLA_INET6_CACHEINFO, /* time values and max reasm size */ IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ IFLA_INET6_TOKEN, /* device token */ + IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ __IFLA_INET6_MAX }; #define IFLA_INET6_MAX (__IFLA_INET6_MAX - 1) +enum in6_addr_gen_mode { + IN6_ADDR_GEN_MODE_EUI64, + IN6_ADDR_GEN_MODE_NONE, +}; + enum { BRIDGE_MODE_UNSPEC, BRIDGE_MODE_HAIRPIN, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 358edd2272ac..4c03c2843094 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2730,9 +2730,25 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr } } +static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) +{ + if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + /* addrconf_add_linklocal also adds a prefix_route and we + * only need to care about prefix routes if ipv6_generate_eui64 + * couldn't generate one. + */ + if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) + addrconf_add_linklocal(idev, &addr); + else if (prefix_route) + addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + } +} + static void addrconf_dev_config(struct net_device *dev) { - struct in6_addr addr; struct inet6_dev *idev; ASSERT_RTNL(); @@ -2753,11 +2769,7 @@ static void addrconf_dev_config(struct net_device *dev) if (IS_ERR(idev)) return; - memset(&addr, 0, sizeof(struct in6_addr)); - addr.s6_addr32[0] = htonl(0xFE800000); - - if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0) - addrconf_add_linklocal(idev, &addr); + addrconf_addr_gen(idev, false); } #if IS_ENABLED(CONFIG_IPV6_SIT) @@ -2779,11 +2791,7 @@ static void addrconf_sit_config(struct net_device *dev) } if (dev->priv_flags & IFF_ISATAP) { - struct in6_addr addr; - - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) - addrconf_add_linklocal(idev, &addr); + addrconf_addr_gen(idev, false); return; } @@ -2798,7 +2806,6 @@ static void addrconf_sit_config(struct net_device *dev) static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; - struct in6_addr addr; ASSERT_RTNL(); @@ -2807,11 +2814,7 @@ static void addrconf_gre_config(struct net_device *dev) return; } - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) - addrconf_add_linklocal(idev, &addr); - else - addrconf_prefix_route(&addr, 64, dev, 0, 0); + addrconf_addr_gen(idev, true); } #endif @@ -4423,6 +4426,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (nla == NULL) goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode)) + goto nla_put_failure; + read_lock_bh(&idev->lock); memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); read_unlock_bh(&idev->lock); @@ -4527,8 +4534,21 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL) < 0) BUG(); - if (tb[IFLA_INET6_TOKEN]) + if (tb[IFLA_INET6_TOKEN]) { err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN])); + if (err) + return err; + } + + if (tb[IFLA_INET6_ADDR_GEN_MODE]) { + u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); + + if (mode != IN6_ADDR_GEN_MODE_EUI64 && + mode != IN6_ADDR_GEN_MODE_NONE) + return -EINVAL; + idev->addr_gen_mode = mode; + err = 0; + } return err; } -- cgit v1.2.3-59-g8ed1b From f736d9985e69e72d9a2ebfd131a72ee8f870c6f3 Mon Sep 17 00:00:00 2001 From: Nikita Edward Baruzdin Date: Fri, 11 Jul 2014 16:13:21 +0400 Subject: can: netlink: Remove space before tab Fixes the corresponing checkpatch.pl warning. Signed-off-by: Nikita Edward Baruzdin Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 813d11f54977..3bbf5c7e1500 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -92,7 +92,7 @@ struct can_ctrlmode { }; #define CAN_CTRLMODE_LOOPBACK 0x01 /* Loopback mode */ -#define CAN_CTRLMODE_LISTENONLY 0x02 /* Listen-only mode */ +#define CAN_CTRLMODE_LISTENONLY 0x02 /* Listen-only mode */ #define CAN_CTRLMODE_3_SAMPLES 0x04 /* Triple sampling mode */ #define CAN_CTRLMODE_ONE_SHOT 0x08 /* One-Shot mode */ #define CAN_CTRLMODE_BERR_REPORTING 0x10 /* Bus-error reporting */ -- cgit v1.2.3-59-g8ed1b From 4b9e1bab12c9b6de965268c2fbe6ebbb35dddd89 Mon Sep 17 00:00:00 2001 From: Nikita Edward Baruzdin Date: Fri, 11 Jul 2014 16:13:22 +0400 Subject: can: netlink: Add CAN_CTRLMODE_PRESUME_ACK flag Most CAN controllers have a support for ignoring ACK absence. Some of them refer to this feature as a self test mode (e. g. SJA1000) and some include it as a part of a loopback mode (e. g. MCP2510). Setting the introduced flag via netlink should make CAN controller perform a successful transmission, even if there is no acknowledgement (dominant ACK bit) received. Signed-off-by: Nikita Edward Baruzdin Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 3bbf5c7e1500..3e4323a3918d 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -97,6 +97,7 @@ struct can_ctrlmode { #define CAN_CTRLMODE_ONE_SHOT 0x08 /* One-Shot mode */ #define CAN_CTRLMODE_BERR_REPORTING 0x10 /* Bus-error reporting */ #define CAN_CTRLMODE_FD 0x20 /* CAN FD mode */ +#define CAN_CTRLMODE_PRESUME_ACK 0x40 /* Ignore missing CAN ACKs */ /* * CAN device statistics -- cgit v1.2.3-59-g8ed1b From 685343fc3ba61a1f6eef361b786601123db16c28 Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:22 +0200 Subject: net: add name_assign_type netdev attribute Based on a patch by David Herrmann. The name_assign_type attribute gives hints where the interface name of a given net-device comes from. These values are currently defined: NET_NAME_ENUM: The ifname is provided by the kernel with an enumerated suffix, typically based on order of discovery. Names may be reused and unpredictable. NET_NAME_PREDICTABLE: The ifname has been assigned by the kernel in a predictable way that is guaranteed to avoid reuse and always be the same for a given device. Examples include statically created devices like the loopback device and names deduced from hardware properties (including being given explicitly by the firmware). Names depending on the order of discovery, or in any other way on the existence of other devices, must not be marked as PREDICTABLE. NET_NAME_USER: The ifname was provided by user-space during net-device setup. NET_NAME_RENAMED: The net-device has been renamed from userspace. Once this type is set, it cannot change again. NET_NAME_UNKNOWN: This is an internal placeholder to indicate that we yet haven't yet categorized the name. It will not be exposed to userspace, rather -EINVAL is returned. The aim of these patches is to improve user-space renaming of interfaces. As a general rule, userspace must rename interfaces to guarantee that names stay the same every time a given piece of hardware appears (at boot, or when attaching it). However, there are several situations where userspace should not perform the renaming, and that depends on both the policy of the local admin, but crucially also on the nature of the current interface name. If an interface was created in repsonse to a userspace request, and userspace already provided a name, we most probably want to leave that name alone. The main instance of this is wifi-P2P devices created over nl80211, which currently have a long-standing bug where they are getting renamed by udev. We label such names NET_NAME_USER. If an interface, unbeknown to us, has already been renamed from userspace, we most probably want to leave also that alone. This will typically happen when third-party plugins (for instance to udev, but the interface is generic so could be from anywhere) renames the interface without informing udev about it. A typical situation is when you switch root from an installer or an initrd to the real system and the new instance of udev does not know what happened before the switch. These types of problems have caused repeated issues in the past. To solve this, once an interface has been renamed, its name is labelled NET_NAME_RENAMED. In many cases, the kernel is actually able to name interfaces in such a way that there is no need for userspace to rename them. This is the case when the enumeration order of devices, or in fact any other (non-parent) device on the system, can not influence the name of the interface. Examples include statically created devices, or any naming schemes based on hardware properties of the interface. In this case the admin may prefer to use the kernel-provided names, and to make that possible we label such names NET_NAME_PREDICTABLE. We want the kernel to have tho possibilty of performing predictable interface naming itself (and exposing to userspace that it has), as the information necessary for a proper naming scheme for a certain class of devices may not be exposed to userspace. The case where renaming is almost certainly desired, is when the kernel has given the interface a name using global device enumeration based on order of discovery (ethX, wlanY, etc). These naming schemes are labelled NET_NAME_ENUM. Lastly, a fallback is left as NET_NAME_UNKNOWN, to indicate that a driver has not yet been ported. This is mostly useful as a transitionary measure, allowing us to label the various naming schemes bit by bit. v8: minor documentation fixes v9: move comment to the right commit Signed-off-by: Tom Gundersen Reviewed-by: David Herrmann Reviewed-by: Kay Sievers Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net | 11 +++++++++++ include/linux/netdevice.h | 2 ++ include/uapi/linux/netdevice.h | 6 ++++++ net/core/net-sysfs.c | 20 ++++++++++++++++++++ 4 files changed, 39 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 416c5d59f52e..d322b0581194 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -1,3 +1,14 @@ +What: /sys/class/net//name_assign_type +Date: July 2014 +KernelVersion: 3.17 +Contact: netdev@vger.kernel.org +Description: + Indicates the name assignment type. Possible values are: + 1: enumerated by the kernel, possibly in an unpredictable way + 2: predictably named by the kernel + 3: named by userspace + 4: renamed + What: /sys/class/net//addr_assign_type Date: July 2010 KernelVersion: 3.2 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a320db96180..9be34732142f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1381,6 +1381,8 @@ struct net_device { struct kset *queues_kset; #endif + unsigned char name_assign_type; + bool uc_promisc; unsigned int promiscuity; unsigned int allmulti; diff --git a/include/uapi/linux/netdevice.h b/include/uapi/linux/netdevice.h index fdfbd1c17065..55818543342d 100644 --- a/include/uapi/linux/netdevice.h +++ b/include/uapi/linux/netdevice.h @@ -37,6 +37,12 @@ #define INIT_NETDEV_GROUP 0 +/* interface name assignment types (sysfs name_assign_type attribute) */ +#define NET_NAME_UNKNOWN 0 /* unknown origin (not exposed to userspace) */ +#define NET_NAME_ENUM 1 /* enumerated by kernel */ +#define NET_NAME_PREDICTABLE 2 /* predictably named by the kernel */ +#define NET_NAME_USER 3 /* provided by user-space */ +#define NET_NAME_RENAMED 4 /* renamed by user-space */ /* Media selection options. */ enum { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1cac29ebb05b..7752f2ad49a5 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -112,6 +112,25 @@ NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); +static ssize_t format_name_assign_type(const struct net_device *net, char *buf) +{ + return sprintf(buf, fmt_dec, net->name_assign_type); +} + +static ssize_t name_assign_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *net = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (net->name_assign_type != NET_NAME_UNKNOWN) + ret = netdev_show(dev, attr, buf, format_name_assign_type); + + return ret; +} +static DEVICE_ATTR_RO(name_assign_type); + /* use same locking rules as GIFHWADDR ioctl's */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -387,6 +406,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_dev_port.attr, &dev_attr_iflink.attr, &dev_attr_ifindex.attr, + &dev_attr_name_assign_type.attr, &dev_attr_addr_assign_type.attr, &dev_attr_addr_len.attr, &dev_attr_link_mode.attr, -- cgit v1.2.3-59-g8ed1b From 63b949382c5f263746b1c177f6ff84de2201ae9d Mon Sep 17 00:00:00 2001 From: Geir Ola Vaagland Date: Sat, 12 Jul 2014 20:30:36 +0200 Subject: net: sctp: implement rfc6458, 5.3.4. SCTP_SNDINFO cmsg support This patch implements section 5.3.4. of RFC6458, that is, support for 'SCTP Send Information Structure' (SCTP_SNDINFO) which can be placed into ancillary data cmsghdr structure for sendmsg() calls. The sctp_sndinfo structure is defined as per RFC as below ... struct sctp_sndinfo { uint16_t snd_sid; uint16_t snd_flags; uint32_t snd_ppid; uint32_t snd_context; sctp_assoc_t snd_assoc_id; }; ... and supplied under cmsg_level IPPROTO_SCTP, cmsg_type SCTP_SNDINFO, while cmsg_data[] contains struct sctp_sndinfo. An sctp_sndinfo item always corresponds to the data in msg_iov. Joint work with Daniel Borkmann. Signed-off-by: Geir Ola Vaagland Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 3 +- include/uapi/linux/sctp.h | 22 +++++++++++-- net/sctp/socket.c | 77 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 79 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index f38588bf3462..7af9a0f5d8ce 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1919,7 +1919,8 @@ struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc); /* A convenience structure to parse out SCTP specific CMSGs. */ typedef struct sctp_cmsgs { struct sctp_initmsg *init; - struct sctp_sndrcvinfo *info; + struct sctp_sndrcvinfo *srinfo; + struct sctp_sndinfo *sinfo; } sctp_cmsgs_t; /* Structure for tracking memory objects */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 266022a2be4a..a387761f7e02 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -154,6 +154,22 @@ struct sctp_sndrcvinfo { sctp_assoc_t sinfo_assoc_id; }; +/* 5.3.4 SCTP Send Information Structure (SCTP_SNDINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ------------------- + * IPPROTO_SCTP SCTP_SNDINFO struct sctp_sndinfo + */ +struct sctp_sndinfo { + __u16 snd_sid; + __u16 snd_flags; + __u32 snd_ppid; + __u32 snd_context; + sctp_assoc_t snd_assoc_id; +}; + /* * sinfo_flags: 16 bits (unsigned integer) * @@ -177,10 +193,12 @@ typedef union { /* These are cmsg_types. */ typedef enum sctp_cmsg_type { - SCTP_INIT, /* 5.2.1 SCTP Initiation Structure */ + SCTP_INIT, /* 5.2.1 SCTP Initiation Structure */ #define SCTP_INIT SCTP_INIT - SCTP_SNDRCV, /* 5.2.2 SCTP Header Information Structure */ + SCTP_SNDRCV, /* 5.2.2 SCTP Header Information Structure */ #define SCTP_SNDRCV SCTP_SNDRCV + SCTP_SNDINFO, /* 5.3.4 SCTP Send Information Structure */ +#define SCTP_SNDINFO SCTP_SNDINFO } sctp_cmsg_t; /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 429899689408..d61729e99856 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1602,12 +1602,13 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, struct sctp_initmsg *sinit; sctp_assoc_t associd = 0; sctp_cmsgs_t cmsgs = { NULL }; - int err; sctp_scope_t scope; - long timeo; - __u16 sinfo_flags = 0; + bool fill_sinfo_ttl = false; struct sctp_datamsg *datamsg; int msg_flags = msg->msg_flags; + __u16 sinfo_flags = 0; + long timeo; + int err; err = 0; sp = sctp_sk(sk); @@ -1648,10 +1649,21 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, msg_name = msg->msg_name; } - sinfo = cmsgs.info; sinit = cmsgs.init; + if (cmsgs.sinfo != NULL) { + memset(&default_sinfo, 0, sizeof(default_sinfo)); + default_sinfo.sinfo_stream = cmsgs.sinfo->snd_sid; + default_sinfo.sinfo_flags = cmsgs.sinfo->snd_flags; + default_sinfo.sinfo_ppid = cmsgs.sinfo->snd_ppid; + default_sinfo.sinfo_context = cmsgs.sinfo->snd_context; + default_sinfo.sinfo_assoc_id = cmsgs.sinfo->snd_assoc_id; - /* Did the user specify SNDRCVINFO? */ + sinfo = &default_sinfo; + fill_sinfo_ttl = true; + } else { + sinfo = cmsgs.srinfo; + } + /* Did the user specify SNDINFO/SNDRCVINFO? */ if (sinfo) { sinfo_flags = sinfo->sinfo_flags; associd = sinfo->sinfo_assoc_id; @@ -1858,8 +1870,8 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, pr_debug("%s: we have a valid association\n", __func__); if (!sinfo) { - /* If the user didn't specify SNDRCVINFO, make up one with - * some defaults. + /* If the user didn't specify SNDINFO/SNDRCVINFO, make up + * one with some defaults. */ memset(&default_sinfo, 0, sizeof(default_sinfo)); default_sinfo.sinfo_stream = asoc->default_stream; @@ -1868,7 +1880,13 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, default_sinfo.sinfo_context = asoc->default_context; default_sinfo.sinfo_timetolive = asoc->default_timetolive; default_sinfo.sinfo_assoc_id = sctp_assoc2id(asoc); + sinfo = &default_sinfo; + } else if (fill_sinfo_ttl) { + /* In case SNDINFO was specified, we still need to fill + * it with a default ttl from the assoc here. + */ + sinfo->sinfo_timetolive = asoc->default_timetolive; } /* API 7.1.7, the sndbuf size per association bounds the @@ -6390,8 +6408,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) struct cmsghdr *cmsg; struct msghdr *my_msg = (struct msghdr *)msg; - for (cmsg = CMSG_FIRSTHDR(msg); - cmsg != NULL; + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; cmsg = CMSG_NXTHDR(my_msg, cmsg)) { if (!CMSG_OK(my_msg, cmsg)) return -EINVAL; @@ -6404,7 +6421,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) switch (cmsg->cmsg_type) { case SCTP_INIT: /* SCTP Socket API Extension - * 5.2.1 SCTP Initiation Structure (SCTP_INIT) + * 5.3.1 SCTP Initiation Structure (SCTP_INIT) * * This cmsghdr structure provides information for * initializing new SCTP associations with sendmsg(). @@ -6416,15 +6433,15 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_INIT struct sctp_initmsg */ - if (cmsg->cmsg_len != - CMSG_LEN(sizeof(struct sctp_initmsg))) + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_initmsg))) return -EINVAL; - cmsgs->init = (struct sctp_initmsg *)CMSG_DATA(cmsg); + + cmsgs->init = CMSG_DATA(cmsg); break; case SCTP_SNDRCV: /* SCTP Socket API Extension - * 5.2.2 SCTP Header Information Structure(SCTP_SNDRCV) + * 5.3.2 SCTP Header Information Structure(SCTP_SNDRCV) * * This cmsghdr structure specifies SCTP options for * sendmsg() and describes SCTP header information @@ -6434,24 +6451,44 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_SNDRCV struct sctp_sndrcvinfo */ - if (cmsg->cmsg_len != - CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) return -EINVAL; - cmsgs->info = - (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + cmsgs->srinfo = CMSG_DATA(cmsg); - /* Minimally, validate the sinfo_flags. */ - if (cmsgs->info->sinfo_flags & + if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; + case SCTP_SNDINFO: + /* SCTP Socket API Extension + * 5.3.4 SCTP Send Information Structure (SCTP_SNDINFO) + * + * This cmsghdr structure specifies SCTP options for + * sendmsg(). This structure and SCTP_RCVINFO replaces + * SCTP_SNDRCV which has been deprecated. + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_SNDINFO struct sctp_sndinfo + */ + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndinfo))) + return -EINVAL; + + cmsgs->sinfo = CMSG_DATA(cmsg); + + if (cmsgs->sinfo->snd_flags & + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) + return -EINVAL; + break; default: return -EINVAL; } } + return 0; } -- cgit v1.2.3-59-g8ed1b From 0d3a421d284812d07970b4ccee74d4fa38737e4d Mon Sep 17 00:00:00 2001 From: Geir Ola Vaagland Date: Sat, 12 Jul 2014 20:30:37 +0200 Subject: net: sctp: implement rfc6458, 5.3.5. SCTP_RCVINFO cmsg support This patch implements section 5.3.5. of RFC6458, that is, support for 'SCTP Receive Information Structure' (SCTP_RCVINFO) which is placed into ancillary data cmsghdr structure for each recvmsg() call. This option can be enabled/disabled via setsockopt(2) on SOL_SCTP level by setting an int value with 1/0 for SCTP_RECVRCVINFO in user space applications as per RFC6458, section 8.1.29. The sctp_rcvinfo structure is defined as per RFC as below ... struct sctp_rcvinfo { uint16_t rcv_sid; uint16_t rcv_ssn; uint16_t rcv_flags; <-- 2 bytes hole --> uint32_t rcv_ppid; uint32_t rcv_tsn; uint32_t rcv_cumtsn; uint32_t rcv_context; sctp_assoc_t rcv_assoc_id; }; ... and provided under cmsg_level IPPROTO_SCTP, cmsg_type SCTP_RCVINFO, while cmsg_data[] contains struct sctp_rcvinfo. An sctp_rcvinfo item always corresponds to the data in msg_iov. Joint work with Daniel Borkmann. Signed-off-by: Geir Ola Vaagland Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 3 +++ include/net/sctp/ulpevent.h | 5 ++++- include/uapi/linux/sctp.h | 32 ++++++++++++++++++++++------- net/sctp/socket.c | 49 ++++++++++++++++++++++++++++++++++++++++++++- net/sctp/ulpevent.c | 25 +++++++++++++++++++++++ 5 files changed, 105 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 7af9a0f5d8ce..11d5df015370 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -207,7 +207,9 @@ struct sctp_sock { struct sctp_paddrparams paddrparam; struct sctp_event_subscribe subscribe; struct sctp_assocparams assocparams; + int user_frag; + __u32 autoclose; __u8 nodelay; __u8 disable_fragments; @@ -215,6 +217,7 @@ struct sctp_sock { __u8 frag_interleave; __u32 adaptation_ind; __u32 pd_point; + __u8 recvrcvinfo; atomic_t pd_mode; /* Receive to here while partial delivery is in effect. */ diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index daacb32b55b5..e8095f973e94 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -129,7 +129,10 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( const struct sctp_association *asoc, gfp_t gfp); void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, - struct msghdr *); + struct msghdr *); +void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, + struct msghdr *); + __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event); /* Is this event type enabled? */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index a387761f7e02..29b81bbfc53d 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -95,6 +95,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_GET_ASSOC_ID_LIST 29 /* Read only */ #define SCTP_AUTO_ASCONF 30 #define SCTP_PEER_ADDR_THLDS 31 +#define SCTP_RECVRCVINFO 32 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. @@ -110,8 +111,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_SOCKOPT_CONNECTX3 111 /* CONNECTX requests (updated) */ #define SCTP_GET_ASSOC_STATS 112 /* Read only */ -/* - * 5.2.1 SCTP Initiation Structure (SCTP_INIT) +/* 5.3.1 SCTP Initiation Structure (SCTP_INIT) * * This cmsghdr structure provides information for initializing new * SCTP associations with sendmsg(). The SCTP_INITMSG socket option @@ -121,7 +121,6 @@ typedef __s32 sctp_assoc_t; * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_INIT struct sctp_initmsg - * */ struct sctp_initmsg { __u16 sinit_num_ostreams; @@ -130,8 +129,7 @@ struct sctp_initmsg { __u16 sinit_max_init_timeo; }; -/* - * 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV) +/* 5.3.2 SCTP Header Information Structure (SCTP_SNDRCV) * * This cmsghdr structure specifies SCTP options for sendmsg() and * describes SCTP header information about a received message through @@ -140,7 +138,6 @@ struct sctp_initmsg { * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_SNDRCV struct sctp_sndrcvinfo - * */ struct sctp_sndrcvinfo { __u16 sinfo_stream; @@ -170,13 +167,32 @@ struct sctp_sndinfo { sctp_assoc_t snd_assoc_id; }; +/* 5.3.5 SCTP Receive Information Structure (SCTP_RCVINFO) + * + * This cmsghdr structure describes SCTP receive information + * about a received message through recvmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ------------------- + * IPPROTO_SCTP SCTP_RCVINFO struct sctp_rcvinfo + */ +struct sctp_rcvinfo { + __u16 rcv_sid; + __u16 rcv_ssn; + __u16 rcv_flags; + __u32 rcv_ppid; + __u32 rcv_tsn; + __u32 rcv_cumtsn; + __u32 rcv_context; + sctp_assoc_t rcv_assoc_id; +}; + /* * sinfo_flags: 16 bits (unsigned integer) * * This field may contain any of the following flags and is composed of * a bitwise OR of these values. */ - enum sctp_sinfo_flags { SCTP_UNORDERED = 1, /* Send/receive message unordered. */ SCTP_ADDR_OVER = 2, /* Override the primary destination. */ @@ -199,6 +215,8 @@ typedef enum sctp_cmsg_type { #define SCTP_SNDRCV SCTP_SNDRCV SCTP_SNDINFO, /* 5.3.4 SCTP Send Information Structure */ #define SCTP_SNDINFO SCTP_SNDINFO + SCTP_RCVINFO, /* 5.3.5 SCTP Receive Information Structure */ +#define SCTP_RCVINFO SCTP_RCVINFO } sctp_cmsg_t; /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d61729e99856..9c193887c5cd 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2112,9 +2112,13 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, sp->pf->skb_msgname(skb, msg->msg_name, addr_len); } + /* Check if we allow SCTP_RCVINFO. */ + if (sp->recvrcvinfo) + sctp_ulpevent_read_rcvinfo(event, msg); /* Check if we allow SCTP_SNDRCVINFO. */ if (sp->subscribe.sctp_data_io_event) sctp_ulpevent_read_sndrcvinfo(event, msg); + #if 0 /* FIXME: we should be calling IP/IPv6 layers. */ if (sk->sk_protinfo.af_inet.cmsg_flags) @@ -3541,7 +3545,6 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, return 0; } - /* * SCTP_PEER_ADDR_THLDS * @@ -3592,6 +3595,22 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, return 0; } +static int sctp_setsockopt_recvrcvinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *) optval)) + return -EFAULT; + + sctp_sk(sk)->recvrcvinfo = (val == 0) ? 0 : 1; + + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3743,6 +3762,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_PEER_ADDR_THLDS: retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen); break; + case SCTP_RECVRCVINFO: + retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -3989,6 +4011,8 @@ static int sctp_init_sock(struct sock *sk) /* Enable Nagle algorithm by default. */ sp->nodelay = 0; + sp->recvrcvinfo = 0; + /* Enable by default. */ sp->v4mapped = 1; @@ -5770,6 +5794,26 @@ static int sctp_getsockopt_assoc_stats(struct sock *sk, int len, return 0; } +static int sctp_getsockopt_recvrcvinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val = 0; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + if (sctp_sk(sk)->recvrcvinfo) + val = 1; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -5913,6 +5957,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_GET_ASSOC_STATS: retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen); break; + case SCTP_RECVRCVINFO: + retval = sctp_getsockopt_recvrcvinfo(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index b6842fdb53d4..b31f365f18ab 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -886,6 +886,31 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, sizeof(sinfo), &sinfo); } +/* RFC6458, Section 5.3.5 SCTP Receive Information Structure + * (SCTP_SNDRCV) + */ +void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr) +{ + struct sctp_rcvinfo rinfo; + + if (sctp_ulpevent_is_notification(event)) + return; + + memset(&rinfo, 0, sizeof(struct sctp_rcvinfo)); + rinfo.rcv_sid = event->stream; + rinfo.rcv_ssn = event->ssn; + rinfo.rcv_ppid = event->ppid; + rinfo.rcv_flags = event->flags; + rinfo.rcv_tsn = event->tsn; + rinfo.rcv_cumtsn = event->cumtsn; + rinfo.rcv_assoc_id = sctp_assoc2id(event->asoc); + rinfo.rcv_context = event->asoc->default_rcv_context; + + put_cmsg(msghdr, IPPROTO_SCTP, SCTP_RCVINFO, + sizeof(rinfo), &rinfo); +} + /* Do accounting for bytes received and hold a reference to the association * for each skb. */ -- cgit v1.2.3-59-g8ed1b From 2347c80ff127b94ddaa675e2b78ab4cef46dc905 Mon Sep 17 00:00:00 2001 From: Geir Ola Vaagland Date: Sat, 12 Jul 2014 20:30:38 +0200 Subject: net: sctp: implement rfc6458, 5.3.6. SCTP_NXTINFO cmsg support This patch implements section 5.3.6. of RFC6458, that is, support for 'SCTP Next Receive Information Structure' (SCTP_NXTINFO) which is placed into ancillary data cmsghdr structure for each recvmsg() call, if this information is already available when delivering the current message. This option can be enabled/disabled via setsockopt(2) on SOL_SCTP level by setting an int value with 1/0 for SCTP_RECVNXTINFO in user space applications as per RFC6458, section 8.1.30. The sctp_nxtinfo structure is defined as per RFC as below ... struct sctp_nxtinfo { uint16_t nxt_sid; uint16_t nxt_flags; uint32_t nxt_ppid; uint32_t nxt_length; sctp_assoc_t nxt_assoc_id; }; ... and provided under cmsg_level IPPROTO_SCTP, cmsg_type SCTP_NXTINFO, while cmsg_data[] contains struct sctp_nxtinfo. Joint work with Daniel Borkmann. Signed-off-by: Geir Ola Vaagland Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 1 + include/net/sctp/structs.h | 1 + include/net/sctp/ulpevent.h | 9 ++------ include/uapi/linux/sctp.h | 47 +++++++++++++++++++++++++++++----------- net/sctp/socket.c | 52 +++++++++++++++++++++++++++++++++++++++++---- net/sctp/ulpevent.c | 38 +++++++++++++++++++++++++++++++++ 6 files changed, 125 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index c2035c96a2ee..90c1cccd164d 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -109,6 +109,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc); extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); +struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); /* * sctp/primitive.c diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 11d5df015370..7741d1b66967 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -218,6 +218,7 @@ struct sctp_sock { __u32 adaptation_ind; __u32 pd_point; __u8 recvrcvinfo; + __u8 recvnxtinfo; atomic_t pd_mode; /* Receive to here while partial delivery is in effect. */ diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index e8095f973e94..cccdcfd14973 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -132,6 +132,8 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *); void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, struct msghdr *); +void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, + struct msghdr *, struct sock *sk); __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event); @@ -158,10 +160,3 @@ static inline int sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event, } #endif /* __sctp_ulpevent_h__ */ - - - - - - - diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 29b81bbfc53d..222f82ffeca4 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -96,6 +96,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_AUTO_ASCONF 30 #define SCTP_PEER_ADDR_THLDS 31 #define SCTP_RECVRCVINFO 32 +#define SCTP_RECVNXTINFO 33 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. @@ -111,6 +112,13 @@ typedef __s32 sctp_assoc_t; #define SCTP_SOCKOPT_CONNECTX3 111 /* CONNECTX requests (updated) */ #define SCTP_GET_ASSOC_STATS 112 /* Read only */ +/* These are bit fields for msghdr->msg_flags. See section 5.1. */ +/* On user space Linux, these live in as an enum. */ +enum sctp_msg_flags { + MSG_NOTIFICATION = 0x8000, +#define MSG_NOTIFICATION MSG_NOTIFICATION +}; + /* 5.3.1 SCTP Initiation Structure (SCTP_INIT) * * This cmsghdr structure provides information for initializing new @@ -187,6 +195,25 @@ struct sctp_rcvinfo { sctp_assoc_t rcv_assoc_id; }; +/* 5.3.6 SCTP Next Receive Information Structure (SCTP_NXTINFO) + * + * This cmsghdr structure describes SCTP receive information + * of the next message that will be delivered through recvmsg() + * if this information is already available when delivering + * the current message. + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ------------------- + * IPPROTO_SCTP SCTP_NXTINFO struct sctp_nxtinfo + */ +struct sctp_nxtinfo { + __u16 nxt_sid; + __u16 nxt_flags; + __u32 nxt_ppid; + __u32 nxt_length; + sctp_assoc_t nxt_assoc_id; +}; + /* * sinfo_flags: 16 bits (unsigned integer) * @@ -194,11 +221,12 @@ struct sctp_rcvinfo { * a bitwise OR of these values. */ enum sctp_sinfo_flags { - SCTP_UNORDERED = 1, /* Send/receive message unordered. */ - SCTP_ADDR_OVER = 2, /* Override the primary destination. */ - SCTP_ABORT=4, /* Send an ABORT message to the peer. */ - SCTP_SACK_IMMEDIATELY = 8, /* SACK should be sent without delay */ - SCTP_EOF=MSG_FIN, /* Initiate graceful shutdown process. */ + SCTP_UNORDERED = (1 << 0), /* Send/receive message unordered. */ + SCTP_ADDR_OVER = (1 << 1), /* Override the primary destination. */ + SCTP_ABORT = (1 << 2), /* Send an ABORT message to the peer. */ + SCTP_SACK_IMMEDIATELY = (1 << 3), /* SACK should be sent without delay. */ + SCTP_NOTIFICATION = MSG_NOTIFICATION, /* Next message is not user msg but notification. */ + SCTP_EOF = MSG_FIN, /* Initiate graceful shutdown process. */ }; typedef union { @@ -217,6 +245,8 @@ typedef enum sctp_cmsg_type { #define SCTP_SNDINFO SCTP_SNDINFO SCTP_RCVINFO, /* 5.3.5 SCTP Receive Information Structure */ #define SCTP_RCVINFO SCTP_RCVINFO + SCTP_NXTINFO, /* 5.3.6 SCTP Next Receive Information Structure */ +#define SCTP_NXTINFO SCTP_NXTINFO } sctp_cmsg_t; /* @@ -844,13 +874,6 @@ struct sctp_assoc_stats { __u64 sas_ictrlchunks; /* Control chunks received */ }; -/* These are bit fields for msghdr->msg_flags. See section 5.1. */ -/* On user space Linux, these live in as an enum. */ -enum sctp_msg_flags { - MSG_NOTIFICATION = 0x8000, -#define MSG_NOTIFICATION MSG_NOTIFICATION -}; - /* * 8.1 sctp_bindx() * diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9c193887c5cd..9bca87ee5152 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2060,8 +2060,6 @@ static int sctp_skb_pull(struct sk_buff *skb, int len) * flags - flags sent or received with the user message, see Section * 5 for complete description of the flags. */ -static struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); - static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) @@ -2112,6 +2110,9 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, sp->pf->skb_msgname(skb, msg->msg_name, addr_len); } + /* Check if we allow SCTP_NXTINFO. */ + if (sp->recvnxtinfo) + sctp_ulpevent_read_nxtinfo(event, msg, sk); /* Check if we allow SCTP_RCVINFO. */ if (sp->recvrcvinfo) sctp_ulpevent_read_rcvinfo(event, msg); @@ -3611,6 +3612,22 @@ static int sctp_setsockopt_recvrcvinfo(struct sock *sk, return 0; } +static int sctp_setsockopt_recvnxtinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *) optval)) + return -EFAULT; + + sctp_sk(sk)->recvnxtinfo = (val == 0) ? 0 : 1; + + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3765,6 +3782,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_RECVRCVINFO: retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen); break; + case SCTP_RECVNXTINFO: + retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4012,6 +4032,7 @@ static int sctp_init_sock(struct sock *sk) sp->nodelay = 0; sp->recvrcvinfo = 0; + sp->recvnxtinfo = 0; /* Enable by default. */ sp->v4mapped = 1; @@ -5814,6 +5835,26 @@ static int sctp_getsockopt_recvrcvinfo(struct sock *sk, int len, return 0; } +static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val = 0; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + if (sctp_sk(sk)->recvnxtinfo) + val = 1; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -5960,6 +6001,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_RECVRCVINFO: retval = sctp_getsockopt_recvrcvinfo(sk, len, optval, optlen); break; + case SCTP_RECVNXTINFO: + retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6602,8 +6646,8 @@ out: * Note: This is pretty much the same routine as in core/datagram.c * with a few changes to make lksctp work. */ -static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, - int noblock, int *err) +struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, + int noblock, int *err) { int error; struct sk_buff *skb; diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index b31f365f18ab..e049298ecfa0 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -911,6 +911,44 @@ void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, sizeof(rinfo), &rinfo); } +/* RFC6458, Section 5.3.6. SCTP Next Receive Information Structure + * (SCTP_NXTINFO) + */ +static void __sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr, + const struct sk_buff *skb) +{ + struct sctp_nxtinfo nxtinfo; + + memset(&nxtinfo, 0, sizeof(nxtinfo)); + nxtinfo.nxt_sid = event->stream; + nxtinfo.nxt_ppid = event->ppid; + nxtinfo.nxt_flags = event->flags; + if (sctp_ulpevent_is_notification(event)) + nxtinfo.nxt_flags |= SCTP_NOTIFICATION; + nxtinfo.nxt_length = skb->len; + nxtinfo.nxt_assoc_id = sctp_assoc2id(event->asoc); + + put_cmsg(msghdr, IPPROTO_SCTP, SCTP_NXTINFO, + sizeof(nxtinfo), &nxtinfo); +} + +void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr, + struct sock *sk) +{ + struct sk_buff *skb; + int err; + + skb = sctp_skb_recv_datagram(sk, MSG_PEEK, 1, &err); + if (skb != NULL) { + __sctp_ulpevent_read_nxtinfo(sctp_skb2event(skb), + msghdr, skb); + /* Just release refcount here. */ + kfree_skb(skb); + } +} + /* Do accounting for bytes received and hold a reference to the association * for each skb. */ -- cgit v1.2.3-59-g8ed1b From 6b3fd5f3a2bbc8464a8e0bf134a183b8fa026439 Mon Sep 17 00:00:00 2001 From: Geir Ola Vaagland Date: Sat, 12 Jul 2014 20:30:39 +0200 Subject: net: sctp: implement rfc6458, 8.1.31. SCTP_DEFAULT_SNDINFO support This patch implements section 8.1.31. of RFC6458, which adds support for setting/retrieving SCTP_DEFAULT_SNDINFO: Applications that wish to use the sendto() system call may wish to specify a default set of parameters that would normally be supplied through the inclusion of ancillary data. This socket option allows such an application to set the default sctp_sndinfo structure. The application that wishes to use this socket option simply passes the sctp_sndinfo structure (defined in Section 5.3.4) to this call. The input parameters accepted by this call include snd_sid, snd_flags, snd_ppid, and snd_context. The snd_flags parameter is composed of a bitwise OR of SCTP_UNORDERED, SCTP_EOF, and SCTP_SENDALL. The snd_assoc_id field specifies the association to which to apply the parameters. For a one-to-many style socket, any of the predefined constants are also allowed in this field. The field is ignored for one-to-one style sockets. Joint work with Daniel Borkmann. Signed-off-by: Geir Ola Vaagland Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 107 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 99 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 222f82ffeca4..ce70fe6b45df 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -97,6 +97,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_PEER_ADDR_THLDS 31 #define SCTP_RECVRCVINFO 32 #define SCTP_RECVNXTINFO 33 +#define SCTP_DEFAULT_SNDINFO 34 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9bca87ee5152..d95a50c013c9 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2770,19 +2770,22 @@ static int sctp_setsockopt_default_send_param(struct sock *sk, char __user *optval, unsigned int optlen) { - struct sctp_sndrcvinfo info; - struct sctp_association *asoc; struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndrcvinfo info; - if (optlen != sizeof(struct sctp_sndrcvinfo)) + if (optlen != sizeof(info)) return -EINVAL; if (copy_from_user(&info, optval, optlen)) return -EFAULT; + if (info.sinfo_flags & + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) + return -EINVAL; asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - if (asoc) { asoc->default_stream = info.sinfo_stream; asoc->default_flags = info.sinfo_flags; @@ -2800,6 +2803,44 @@ static int sctp_setsockopt_default_send_param(struct sock *sk, return 0; } +/* RFC6458, Section 8.1.31. Set/get Default Send Parameters + * (SCTP_DEFAULT_SNDINFO) + */ +static int sctp_setsockopt_default_sndinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndinfo info; + + if (optlen != sizeof(info)) + return -EINVAL; + if (copy_from_user(&info, optval, optlen)) + return -EFAULT; + if (info.snd_flags & + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) + return -EINVAL; + + asoc = sctp_id2assoc(sk, info.snd_assoc_id); + if (!asoc && info.snd_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + if (asoc) { + asoc->default_stream = info.snd_sid; + asoc->default_flags = info.snd_flags; + asoc->default_ppid = info.snd_ppid; + asoc->default_context = info.snd_context; + } else { + sp->default_stream = info.snd_sid; + sp->default_flags = info.snd_flags; + sp->default_ppid = info.snd_ppid; + sp->default_context = info.snd_context; + } + + return 0; +} + /* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) * * Requests that the local SCTP stack use the enclosed peer address as @@ -3725,6 +3766,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_default_send_param(sk, optval, optlen); break; + case SCTP_DEFAULT_SNDINFO: + retval = sctp_setsockopt_default_sndinfo(sk, optval, optlen); + break; case SCTP_PRIMARY_ADDR: retval = sctp_setsockopt_primary_addr(sk, optval, optlen); break; @@ -5027,14 +5071,14 @@ static int sctp_getsockopt_default_send_param(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct sctp_sndrcvinfo info; - struct sctp_association *asoc; struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndrcvinfo info; - if (len < sizeof(struct sctp_sndrcvinfo)) + if (len < sizeof(info)) return -EINVAL; - len = sizeof(struct sctp_sndrcvinfo); + len = sizeof(info); if (copy_from_user(&info, optval, len)) return -EFAULT; @@ -5042,7 +5086,6 @@ static int sctp_getsockopt_default_send_param(struct sock *sk, asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - if (asoc) { info.sinfo_stream = asoc->default_stream; info.sinfo_flags = asoc->default_flags; @@ -5065,6 +5108,48 @@ static int sctp_getsockopt_default_send_param(struct sock *sk, return 0; } +/* RFC6458, Section 8.1.31. Set/get Default Send Parameters + * (SCTP_DEFAULT_SNDINFO) + */ +static int sctp_getsockopt_default_sndinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndinfo info; + + if (len < sizeof(info)) + return -EINVAL; + + len = sizeof(info); + + if (copy_from_user(&info, optval, len)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, info.snd_assoc_id); + if (!asoc && info.snd_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + if (asoc) { + info.snd_sid = asoc->default_stream; + info.snd_flags = asoc->default_flags; + info.snd_ppid = asoc->default_ppid; + info.snd_context = asoc->default_context; + } else { + info.snd_sid = sp->default_stream; + info.snd_flags = sp->default_flags; + info.snd_ppid = sp->default_ppid; + info.snd_context = sp->default_context; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + + return 0; +} + /* * * 7.1.5 SCTP_NODELAY @@ -5924,6 +6009,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_default_send_param(sk, len, optval, optlen); break; + case SCTP_DEFAULT_SNDINFO: + retval = sctp_getsockopt_default_sndinfo(sk, len, + optval, optlen); + break; case SCTP_PRIMARY_ADDR: retval = sctp_getsockopt_primary_addr(sk, len, optval, optlen); break; -- cgit v1.2.3-59-g8ed1b From 5cd667b0a4567048bb555927d6ee564f4e5620a9 Mon Sep 17 00:00:00 2001 From: Alex Wang Date: Thu, 17 Jul 2014 15:14:13 -0700 Subject: openvswitch: Allow each vport to have an array of 'port_id's. In order to allow handlers directly read upcalls from datapath, we need to support per-handler netlink socket for each vport in datapath. This commit makes this happen. Also, it is guaranteed to be backward compatible with previous branch. Signed-off-by: Alex Wang Acked-by: Thomas Graf Signed-off-by: Pravin B Shelar --- include/uapi/linux/openvswitch.h | 13 +++-- net/openvswitch/datapath.c | 23 ++++++--- net/openvswitch/vport.c | 101 ++++++++++++++++++++++++++++++++++++++- net/openvswitch/vport.h | 27 +++++++++-- 4 files changed, 148 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 0b979ee4bfc0..a794d1dd7b40 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -118,6 +118,9 @@ struct ovs_vport_stats { /* Allow last Netlink attribute to be unaligned */ #define OVS_DP_F_UNALIGNED (1 << 0) +/* Allow datapath to associate multiple Netlink PIDs to each vport */ +#define OVS_DP_F_VPORT_PIDS (1 << 1) + /* Fixed logical ports. */ #define OVSP_LOCAL ((__u32)0) @@ -203,9 +206,10 @@ enum ovs_vport_type { * this is the name of the network device. Maximum length %IFNAMSIZ-1 bytes * plus a null terminator. * @OVS_VPORT_ATTR_OPTIONS: Vport-specific configuration information. - * @OVS_VPORT_ATTR_UPCALL_PID: The Netlink socket in userspace that - * OVS_PACKET_CMD_MISS upcalls will be directed to for packets received on - * this port. A value of zero indicates that upcalls should not be sent. + * @OVS_VPORT_ATTR_UPCALL_PID: The array of Netlink socket pids in userspace + * among which OVS_PACKET_CMD_MISS upcalls will be distributed for packets + * received on this port. If this is a single-element array of value 0, + * upcalls should not be sent. * @OVS_VPORT_ATTR_STATS: A &struct ovs_vport_stats giving statistics for * packets sent or received through the vport. * @@ -228,7 +232,8 @@ enum ovs_vport_attr { OVS_VPORT_ATTR_TYPE, /* u32 OVS_VPORT_TYPE_* constant. */ OVS_VPORT_ATTR_NAME, /* string name, up to IFNAMSIZ bytes long */ OVS_VPORT_ATTR_OPTIONS, /* nested attributes, varies by vport type */ - OVS_VPORT_ATTR_UPCALL_PID, /* u32 Netlink PID to receive upcalls */ + OVS_VPORT_ATTR_UPCALL_PID, /* array of u32 Netlink socket PIDs for */ + /* receiving upcalls */ OVS_VPORT_ATTR_STATS, /* struct ovs_vport_stats */ __OVS_VPORT_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 20f59b62721a..65a8e5c089e4 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -266,7 +266,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) upcall.cmd = OVS_PACKET_CMD_MISS; upcall.key = &key; upcall.userdata = NULL; - upcall.portid = p->upcall_portid; + upcall.portid = ovs_vport_find_upcall_portid(p, skb); ovs_dp_upcall(dp, skb, &upcall); consume_skb(skb); stats_counter = &stats->n_missed; @@ -1373,7 +1373,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.options = NULL; parms.dp = dp; parms.port_no = OVSP_LOCAL; - parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]); + parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; ovs_dp_change(dp, a); @@ -1632,8 +1632,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || - nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)) || - nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_portid)) + nla_put_string(skb, OVS_VPORT_ATTR_NAME, + vport->ops->get_name(vport))) goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); @@ -1641,6 +1641,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, &vport_stats)) goto nla_put_failure; + if (ovs_vport_get_upcall_portids(vport, skb)) + goto nla_put_failure; + err = ovs_vport_get_options(vport, skb); if (err == -EMSGSIZE) goto error; @@ -1762,7 +1765,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; - parms.upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; vport = new_vport(&parms); err = PTR_ERR(vport); @@ -1812,8 +1815,14 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; } - if (a[OVS_VPORT_ATTR_UPCALL_PID]) - vport->upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + + if (a[OVS_VPORT_ATTR_UPCALL_PID]) { + struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID]; + + err = ovs_vport_set_upcall_portids(vport, ids); + if (err) + goto exit_unlock_free; + } err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 42c0f4a0b78c..702fb21bfe15 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -134,10 +134,12 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, vport->dp = parms->dp; vport->port_no = parms->port_no; - vport->upcall_portid = parms->upcall_portid; vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); + if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) + return ERR_PTR(-EINVAL); + vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vport->percpu_stats) { kfree(vport); @@ -161,6 +163,10 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, */ void ovs_vport_free(struct vport *vport) { + /* vport is freed from RCU callback or error path, Therefore + * it is safe to use raw dereference. + */ + kfree(rcu_dereference_raw(vport->upcall_portids)); free_percpu(vport->percpu_stats); kfree(vport); } @@ -326,6 +332,99 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) return 0; } +/** + * ovs_vport_set_upcall_portids - set upcall portids of @vport. + * + * @vport: vport to modify. + * @ids: new configuration, an array of port ids. + * + * Sets the vport's upcall_portids to @ids. + * + * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed + * as an array of U32. + * + * Must be called with ovs_mutex. + */ +int ovs_vport_set_upcall_portids(struct vport *vport, struct nlattr *ids) +{ + struct vport_portids *old, *vport_portids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(vport->upcall_portids); + + vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), + GFP_KERNEL); + if (!vport_portids) + return -ENOMEM; + + vport_portids->n_ids = nla_len(ids) / sizeof(u32); + vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); + nla_memcpy(vport_portids->ids, ids, nla_len(ids)); + + rcu_assign_pointer(vport->upcall_portids, vport_portids); + + if (old) + kfree_rcu(old, rcu); + return 0; +} + +/** + * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. + * + * @vport: vport from which to retrieve the portids. + * @skb: sk_buff where portids should be appended. + * + * Retrieves the configuration of the given vport, appending the + * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall + * portids to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. + * If an error occurs, @skb is left unmodified. Must be called with + * ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_upcall_portids(const struct vport *vport, + struct sk_buff *skb) +{ + struct vport_portids *ids; + + ids = rcu_dereference_ovsl(vport->upcall_portids); + + if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) + return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, + ids->n_ids * sizeof(u32), (void *)ids->ids); + else + return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); +} + +/** + * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. + * + * @vport: vport from which the missed packet is received. + * @skb: skb that the missed packet was received. + * + * Uses the skb_get_hash() to select the upcall portid to send the + * upcall. + * + * Returns the portid of the target socket. Must be called with rcu_read_lock. + */ +u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb) +{ + struct vport_portids *ids; + u32 ids_index; + u32 hash; + + ids = rcu_dereference(p->upcall_portids); + + if (ids->n_ids == 1 && ids->ids[0] == 0) + return 0; + + hash = skb_get_hash(skb); + ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); + return ids->ids[ids_index]; +} + /** * ovs_vport_receive - pass up received packet to the datapath for processing * diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 8d721e62f388..35f89d84b45e 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,10 @@ void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); int ovs_vport_set_options(struct vport *, struct nlattr *options); int ovs_vport_get_options(const struct vport *, struct sk_buff *); +int ovs_vport_set_upcall_portids(struct vport *, struct nlattr *pids); +int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); +u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); + int ovs_vport_send(struct vport *, struct sk_buff *); /* The following definitions are for implementers of vport devices: */ @@ -62,13 +67,27 @@ struct vport_err_stats { u64 tx_dropped; u64 tx_errors; }; +/** + * struct vport_portids - array of netlink portids of a vport. + * must be protected by rcu. + * @rn_ids: The reciprocal value of @n_ids. + * @rcu: RCU callback head for deferred destruction. + * @n_ids: Size of @ids array. + * @ids: Array storing the Netlink socket pids to be used for packets received + * on this port that miss the flow table. + */ +struct vport_portids { + struct reciprocal_value rn_ids; + struct rcu_head rcu; + u32 n_ids; + u32 ids[]; +}; /** * struct vport - one port within a datapath * @rcu: RCU callback head for deferred destruction. * @dp: Datapath to which this port belongs. - * @upcall_portid: The Netlink port to use for packets received on this port that - * miss the flow table. + * @upcall_portids: RCU protected 'struct vport_portids'. * @port_no: Index into @dp's @ports array. * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. @@ -80,7 +99,7 @@ struct vport_err_stats { struct vport { struct rcu_head rcu; struct datapath *dp; - u32 upcall_portid; + struct vport_portids __rcu *upcall_portids; u16 port_no; struct hlist_node hash_node; @@ -111,7 +130,7 @@ struct vport_parms { /* For ovs_vport_alloc(). */ struct datapath *dp; u16 port_no; - u32 upcall_portid; + struct nlattr *upcall_portids; }; /** -- cgit v1.2.3-59-g8ed1b From 68a360e82e55c9b35097e7be7f7991d8f401032f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 25 Jul 2014 18:01:31 -0400 Subject: packet: remove deprecated syststamp timestamp No device driver will ever return an skb_shared_info structure with syststamp non-zero, so remove the branch that tests for this and optionally marks the packet timestamp as TP_STATUS_TS_SYS_HARDWARE. Do not remove the definition TP_STATUS_TS_SYS_HARDWARE, as processes may refer to it. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/packet_mmap.txt | 18 ++++++------------ include/uapi/linux/if_packet.h | 2 +- net/packet/af_packet.c | 12 ++++-------- 3 files changed, 11 insertions(+), 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 38112d512f47..a6d7cb91069e 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -1008,14 +1008,9 @@ hardware timestamps to be used. Note: you may need to enable the generation of hardware timestamps with SIOCSHWTSTAMP (see related information from Documentation/networking/timestamping.txt). -PACKET_TIMESTAMP accepts the same integer bit field as -SO_TIMESTAMPING. However, only the SOF_TIMESTAMPING_SYS_HARDWARE -and SOF_TIMESTAMPING_RAW_HARDWARE values are recognized by -PACKET_TIMESTAMP. SOF_TIMESTAMPING_SYS_HARDWARE takes precedence over -SOF_TIMESTAMPING_RAW_HARDWARE if both bits are set. - - int req = 0; - req |= SOF_TIMESTAMPING_SYS_HARDWARE; +PACKET_TIMESTAMP accepts the same integer bit field as SO_TIMESTAMPING: + + int req = SOF_TIMESTAMPING_RAW_HARDWARE; setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req)) For the mmap(2)ed ring buffers, such timestamps are stored in the @@ -1023,14 +1018,13 @@ tpacket{,2,3}_hdr structure's tp_sec and tp_{n,u}sec members. To determine what kind of timestamp has been reported, the tp_status field is binary |'ed with the following possible bits ... - TP_STATUS_TS_SYS_HARDWARE TP_STATUS_TS_RAW_HARDWARE TP_STATUS_TS_SOFTWARE ... that are equivalent to its SOF_TIMESTAMPING_* counterparts. For the -RX_RING, if none of those 3 are set (i.e. PACKET_TIMESTAMP is not set), -then this means that a software fallback was invoked *within* PF_PACKET's -processing code (less precise). +RX_RING, if neither is set (i.e. PACKET_TIMESTAMP is not set), then a +software fallback was invoked *within* PF_PACKET's processing code (less +precise). Getting timestamps for the TX_RING works as follows: i) fill the ring frames, ii) call sendto() e.g. in blocking mode, iii) wait for status of relevant diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index bac27fa05f5b..da2d668b8cf1 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -108,7 +108,7 @@ struct tpacket_auxdata { /* Rx and Tx ring - header status */ #define TP_STATUS_TS_SOFTWARE (1 << 29) -#define TP_STATUS_TS_SYS_HARDWARE (1 << 30) +#define TP_STATUS_TS_SYS_HARDWARE (1 << 30) /* deprecated, never set */ #define TP_STATUS_TS_RAW_HARDWARE (1 << 31) /* Rx ring - feature request bits */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 614ca91f785a..8d9f8042705a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -441,14 +441,10 @@ static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); - if (shhwtstamps) { - if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->syststamp, ts)) - return TP_STATUS_TS_SYS_HARDWARE; - if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) - return TP_STATUS_TS_RAW_HARDWARE; - } + if (shhwtstamps && + (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && + ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) + return TP_STATUS_TS_RAW_HARDWARE; if (ktime_to_timespec_cond(skb->tstamp, ts)) return TP_STATUS_TS_SOFTWARE; -- cgit v1.2.3-59-g8ed1b From 16eecd9be4b05e3216b8b12707aa6f51fb197903 Mon Sep 17 00:00:00 2001 From: Anish Bhatt Date: Mon, 28 Jul 2014 20:57:07 -0700 Subject: dcbnl : Fix misleading dcb_app->priority explanation Current explanation of dcb_app->priority is wrong. It says priority is expected to be a 3-bit unsigned integer which is only true when working with DCBx-IEEE. Use of dcb_app->priority by DCBx-CEE expects it to be 802.1p user priority bitmap. Updated accordingly This affects the cxgb4 driver, but I will post those changes as part of a larger changeset shortly. Fixes: 3e29027af4372 ("dcbnl: add support for ieee8021Qaz attributes") Signed-off-by: Anish Bhatt Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/uapi/linux/dcbnl.h | 3 ++- net/dcb/dcbnl.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index 6bb43382f3f3..e711f20dc522 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -148,7 +148,8 @@ struct cee_pfc { * * @selector: protocol identifier type * @protocol: protocol of type indicated - * @priority: 3-bit unsigned integer indicating priority + * @priority: 3-bit unsigned integer indicating priority for IEEE + * 8-bit 802.1p user priority bitmap for CEE * * ---- * Selector field values diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index c34af7a1d2d4..ca11d283bbeb 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1776,7 +1776,7 @@ EXPORT_SYMBOL(dcb_getapp); * * Priority 0 is an invalid priority in CEE spec. This routine * removes applications from the app list if the priority is - * set to zero. + * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap */ int dcb_setapp(struct net_device *dev, struct dcb_app *new) { @@ -1837,7 +1837,8 @@ EXPORT_SYMBOL(dcb_ieee_getapp_mask); * * This adds Application data to the list. Multiple application * entries may exists for the same selector and protocol as long - * as the priorities are different. + * as the priorities are different. Priority is expected to be a + * 3-bit unsigned integer */ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) { -- cgit v1.2.3-59-g8ed1b From e10038a8ec06ac819b7552bb67aaa6d2d6f850c1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Tue, 29 Jul 2014 18:12:15 +0200 Subject: netfilter: xt_bpf: add mising opaque struct sk_filter definition This structure is not exposed to userspace, so fix this by defining struct sk_filter; so we skip the casting in kernelspace. This is safe since userspace has no way to lurk with that internal pointer. Fixes: e6f30c7 ("netfilter: x_tables: add xt_bpf match") Signed-off-by: Pablo Neira Ayuso Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/netfilter/xt_bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h index 5dda450eb55b..2ec9fbcd06f9 100644 --- a/include/uapi/linux/netfilter/xt_bpf.h +++ b/include/uapi/linux/netfilter/xt_bpf.h @@ -6,6 +6,8 @@ #define XT_BPF_MAX_NUM_INSTR 64 +struct sk_filter; + struct xt_bpf_info { __u16 bpf_program_num_elem; struct sock_filter bpf_program[XT_BPF_MAX_NUM_INSTR]; -- cgit v1.2.3-59-g8ed1b From 7ae457c1e5b45a1b826fad9d62b32191d2bdcfdb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:16 -0700 Subject: net: filter: split 'struct sk_filter' into socket and bpf parts clean up names related to socket filtering and bpf in the following way: - everything that deals with sockets keeps 'sk_*' prefix - everything that is pure BPF is changed to 'bpf_*' prefix split 'struct sk_filter' into struct sk_filter { atomic_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; and struct bpf_prog { u32 jited:1, len:31; struct sock_fprog_kern *orig_prog; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; struct work_struct work; }; }; so that 'struct bpf_prog' can be used independent of sockets and cleans up 'unattached' bpf use cases split SK_RUN_FILTER macro into: SK_RUN_FILTER to be used with 'struct sk_filter *' and BPF_PROG_RUN to be used with 'struct bpf_prog *' __sk_filter_release(struct sk_filter *) gains __bpf_prog_release(struct bpf_prog *) helper function also perform related renames for the functions that work with 'struct bpf_prog *', since they're on the same lines: sk_filter_size -> bpf_prog_size sk_filter_select_runtime -> bpf_prog_select_runtime sk_filter_free -> bpf_prog_free sk_unattached_filter_create -> bpf_prog_create sk_unattached_filter_destroy -> bpf_prog_destroy sk_store_orig_filter -> bpf_prog_store_orig_filter sk_release_orig_filter -> bpf_release_orig_filter __sk_migrate_filter -> bpf_migrate_filter __sk_prepare_filter -> bpf_prepare_filter API for attaching classic BPF to a socket stays the same: sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *) and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program which is used by sockets, tun, af_packet API for 'unattached' BPF programs becomes: bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *) and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 10 ++-- arch/arm/net/bpf_jit_32.c | 8 +-- arch/mips/net/bpf_jit.c | 8 +-- arch/powerpc/net/bpf_jit_comp.c | 8 +-- arch/s390/net/bpf_jit_comp.c | 4 +- arch/sparc/net/bpf_jit_comp.c | 4 +- arch/x86/net/bpf_jit_comp.c | 12 ++--- drivers/isdn/i4l/isdn_ppp.c | 26 +++++---- drivers/net/ppp/ppp_generic.c | 28 +++++----- drivers/net/team/team_mode_loadbalance.c | 14 ++--- include/linux/filter.h | 40 ++++++++------ include/linux/isdn_ppp.h | 4 +- include/uapi/linux/netfilter/xt_bpf.h | 4 +- kernel/bpf/core.c | 30 +++++------ kernel/seccomp.c | 10 ++-- lib/test_bpf.c | 24 ++++----- net/core/filter.c | 92 ++++++++++++++++++-------------- net/core/ptp_classifier.c | 6 +-- net/core/sock_diag.c | 2 +- net/netfilter/xt_bpf.c | 6 +-- net/sched/cls_bpf.c | 12 ++--- 21 files changed, 183 insertions(+), 169 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 712068be8171..c48a9704bda8 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -586,11 +586,11 @@ team driver's classifier for its load-balancing mode, netfilter's xt_bpf extension, PTP dissector/classifier, and much more. They are all internally converted by the kernel into the new instruction set representation and run in the eBPF interpreter. For in-kernel handlers, this all works transparently -by using sk_unattached_filter_create() for setting up the filter, resp. -sk_unattached_filter_destroy() for destroying it. The macro -SK_RUN_FILTER(filter, ctx) transparently invokes eBPF interpreter or JITed -code to run the filter. 'filter' is a pointer to struct sk_filter that we -got from sk_unattached_filter_create(), and 'ctx' the given context (e.g. +by using bpf_prog_create() for setting up the filter, resp. +bpf_prog_destroy() for destroying it. The macro +BPF_PROG_RUN(filter, ctx) transparently invokes eBPF interpreter or JITed +code to run the filter. 'filter' is a pointer to struct bpf_prog that we +got from bpf_prog_create(), and 'ctx' the given context (e.g. skb pointer). All constraints and restrictions from bpf_check_classic() apply before a conversion to the new layout is being done behind the scenes! diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index fb5503ce016f..a37b989a2f91 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -56,7 +56,7 @@ #define FLAG_NEED_X_RESET (1 << 0) struct jit_ctx { - const struct sk_filter *skf; + const struct bpf_prog *skf; unsigned idx; unsigned prologue_bytes; int ret0_fp_idx; @@ -465,7 +465,7 @@ static inline void update_on_xread(struct jit_ctx *ctx) static int build_body(struct jit_ctx *ctx) { void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w}; - const struct sk_filter *prog = ctx->skf; + const struct bpf_prog *prog = ctx->skf; const struct sock_filter *inst; unsigned i, load_order, off, condt; int imm12; @@ -857,7 +857,7 @@ b_epilogue: } -void bpf_jit_compile(struct sk_filter *fp) +void bpf_jit_compile(struct bpf_prog *fp) { struct jit_ctx ctx; unsigned tmp_idx; @@ -926,7 +926,7 @@ out: return; } -void bpf_jit_free(struct sk_filter *fp) +void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index b87390a56a2f..05a56619ece2 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -131,7 +131,7 @@ * @target: Memory location for the compiled filter */ struct jit_ctx { - const struct sk_filter *skf; + const struct bpf_prog *skf; unsigned int prologue_bytes; u32 idx; u32 flags; @@ -789,7 +789,7 @@ static int pkt_type_offset(void) static int build_body(struct jit_ctx *ctx) { void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w}; - const struct sk_filter *prog = ctx->skf; + const struct bpf_prog *prog = ctx->skf; const struct sock_filter *inst; unsigned int i, off, load_order, condt; u32 k, b_off __maybe_unused; @@ -1369,7 +1369,7 @@ jmp_cmp: int bpf_jit_enable __read_mostly; -void bpf_jit_compile(struct sk_filter *fp) +void bpf_jit_compile(struct bpf_prog *fp) { struct jit_ctx ctx; unsigned int alloc_size, tmp_idx; @@ -1423,7 +1423,7 @@ out: kfree(ctx.offsets); } -void bpf_jit_free(struct sk_filter *fp) +void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 82e82cadcde5..3afa6f4c1957 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -25,7 +25,7 @@ static inline void bpf_flush_icache(void *start, void *end) flush_icache_range((unsigned long)start, (unsigned long)end); } -static void bpf_jit_build_prologue(struct sk_filter *fp, u32 *image, +static void bpf_jit_build_prologue(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx) { int i; @@ -121,7 +121,7 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) /* Assemble the body code between the prologue & epilogue. */ -static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, +static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, unsigned int *addrs) { @@ -569,7 +569,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, return 0; } -void bpf_jit_compile(struct sk_filter *fp) +void bpf_jit_compile(struct bpf_prog *fp) { unsigned int proglen; unsigned int alloclen; @@ -693,7 +693,7 @@ out: return; } -void bpf_jit_free(struct sk_filter *fp) +void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index a2cbd875543a..61e45b7c04d7 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -812,7 +812,7 @@ static struct bpf_binary_header *bpf_alloc_binary(unsigned int bpfsize, return header; } -void bpf_jit_compile(struct sk_filter *fp) +void bpf_jit_compile(struct bpf_prog *fp) { struct bpf_binary_header *header = NULL; unsigned long size, prg_len, lit_len; @@ -875,7 +875,7 @@ out: kfree(addrs); } -void bpf_jit_free(struct sk_filter *fp) +void bpf_jit_free(struct bpf_prog *fp) { unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; struct bpf_binary_header *header = (void *)addr; diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index 892a102671ad..1f76c22a6a75 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -354,7 +354,7 @@ do { *prog++ = BR_OPC | WDISP22(OFF); \ * emit_jump() calls with adjusted offsets. */ -void bpf_jit_compile(struct sk_filter *fp) +void bpf_jit_compile(struct bpf_prog *fp) { unsigned int cleanup_addr, proglen, oldproglen = 0; u32 temp[8], *prog, *func, seen = 0, pass; @@ -808,7 +808,7 @@ out: return; } -void bpf_jit_free(struct sk_filter *fp) +void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e2ecc1380b3d..5c8cb8043c5a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -211,7 +211,7 @@ struct jit_context { bool seen_ld_abs; }; -static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, +static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { struct bpf_insn *insn = bpf_prog->insnsi; @@ -841,7 +841,7 @@ common_load: ctx->seen_ld_abs = true; /* By design x64 JIT should support all BPF instructions * This error will be seen if new instruction was added * to interpreter, but not to JIT - * or if there is junk in sk_filter + * or if there is junk in bpf_prog */ pr_err("bpf_jit: unknown opcode %02x\n", insn->code); return -EINVAL; @@ -862,11 +862,11 @@ common_load: ctx->seen_ld_abs = true; return proglen; } -void bpf_jit_compile(struct sk_filter *prog) +void bpf_jit_compile(struct bpf_prog *prog) { } -void bpf_int_jit_compile(struct sk_filter *prog) +void bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; int proglen, oldproglen = 0; @@ -932,7 +932,7 @@ out: static void bpf_jit_free_deferred(struct work_struct *work) { - struct sk_filter *fp = container_of(work, struct sk_filter, work); + struct bpf_prog *fp = container_of(work, struct bpf_prog, work); unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; struct bpf_binary_header *header = (void *)addr; @@ -941,7 +941,7 @@ static void bpf_jit_free_deferred(struct work_struct *work) kfree(fp); } -void bpf_jit_free(struct sk_filter *fp) +void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) { INIT_WORK(&fp->work, bpf_jit_free_deferred); diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c index 62f0688d45a5..c4198fa490bf 100644 --- a/drivers/isdn/i4l/isdn_ppp.c +++ b/drivers/isdn/i4l/isdn_ppp.c @@ -379,12 +379,12 @@ isdn_ppp_release(int min, struct file *file) #endif #ifdef CONFIG_IPPP_FILTER if (is->pass_filter) { - sk_unattached_filter_destroy(is->pass_filter); + bpf_prog_destroy(is->pass_filter); is->pass_filter = NULL; } if (is->active_filter) { - sk_unattached_filter_destroy(is->active_filter); + bpf_prog_destroy(is->active_filter); is->active_filter = NULL; } #endif @@ -639,12 +639,11 @@ isdn_ppp_ioctl(int min, struct file *file, unsigned int cmd, unsigned long arg) fprog.filter = code; if (is->pass_filter) { - sk_unattached_filter_destroy(is->pass_filter); + bpf_prog_destroy(is->pass_filter); is->pass_filter = NULL; } if (fprog.filter != NULL) - err = sk_unattached_filter_create(&is->pass_filter, - &fprog); + err = bpf_prog_create(&is->pass_filter, &fprog); else err = 0; kfree(code); @@ -664,12 +663,11 @@ isdn_ppp_ioctl(int min, struct file *file, unsigned int cmd, unsigned long arg) fprog.filter = code; if (is->active_filter) { - sk_unattached_filter_destroy(is->active_filter); + bpf_prog_destroy(is->active_filter); is->active_filter = NULL; } if (fprog.filter != NULL) - err = sk_unattached_filter_create(&is->active_filter, - &fprog); + err = bpf_prog_create(&is->active_filter, &fprog); else err = 0; kfree(code); @@ -1174,14 +1172,14 @@ isdn_ppp_push_higher(isdn_net_dev *net_dev, isdn_net_local *lp, struct sk_buff * } if (is->pass_filter - && SK_RUN_FILTER(is->pass_filter, skb) == 0) { + && BPF_PROG_RUN(is->pass_filter, skb) == 0) { if (is->debug & 0x2) printk(KERN_DEBUG "IPPP: inbound frame filtered.\n"); kfree_skb(skb); return; } if (!(is->active_filter - && SK_RUN_FILTER(is->active_filter, skb) == 0)) { + && BPF_PROG_RUN(is->active_filter, skb) == 0)) { if (is->debug & 0x2) printk(KERN_DEBUG "IPPP: link-active filter: resetting huptimer.\n"); lp->huptimer = 0; @@ -1320,14 +1318,14 @@ isdn_ppp_xmit(struct sk_buff *skb, struct net_device *netdev) } if (ipt->pass_filter - && SK_RUN_FILTER(ipt->pass_filter, skb) == 0) { + && BPF_PROG_RUN(ipt->pass_filter, skb) == 0) { if (ipt->debug & 0x4) printk(KERN_DEBUG "IPPP: outbound frame filtered.\n"); kfree_skb(skb); goto unlock; } if (!(ipt->active_filter - && SK_RUN_FILTER(ipt->active_filter, skb) == 0)) { + && BPF_PROG_RUN(ipt->active_filter, skb) == 0)) { if (ipt->debug & 0x4) printk(KERN_DEBUG "IPPP: link-active filter: resetting huptimer.\n"); lp->huptimer = 0; @@ -1517,9 +1515,9 @@ int isdn_ppp_autodial_filter(struct sk_buff *skb, isdn_net_local *lp) } drop |= is->pass_filter - && SK_RUN_FILTER(is->pass_filter, skb) == 0; + && BPF_PROG_RUN(is->pass_filter, skb) == 0; drop |= is->active_filter - && SK_RUN_FILTER(is->active_filter, skb) == 0; + && BPF_PROG_RUN(is->active_filter, skb) == 0; skb_push(skb, IPPP_MAX_HEADER - 4); return drop; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 765248b42a0a..fa0d71727894 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -143,8 +143,8 @@ struct ppp { struct sk_buff_head mrq; /* MP: receive reconstruction queue */ #endif /* CONFIG_PPP_MULTILINK */ #ifdef CONFIG_PPP_FILTER - struct sk_filter *pass_filter; /* filter for packets to pass */ - struct sk_filter *active_filter;/* filter for pkts to reset idle */ + struct bpf_prog *pass_filter; /* filter for packets to pass */ + struct bpf_prog *active_filter; /* filter for pkts to reset idle */ #endif /* CONFIG_PPP_FILTER */ struct net *ppp_net; /* the net we belong to */ struct ppp_link_stats stats64; /* 64 bit network stats */ @@ -762,12 +762,12 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ppp_lock(ppp); if (ppp->pass_filter) { - sk_unattached_filter_destroy(ppp->pass_filter); + bpf_prog_destroy(ppp->pass_filter); ppp->pass_filter = NULL; } if (fprog.filter != NULL) - err = sk_unattached_filter_create(&ppp->pass_filter, - &fprog); + err = bpf_prog_create(&ppp->pass_filter, + &fprog); else err = 0; kfree(code); @@ -788,12 +788,12 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ppp_lock(ppp); if (ppp->active_filter) { - sk_unattached_filter_destroy(ppp->active_filter); + bpf_prog_destroy(ppp->active_filter); ppp->active_filter = NULL; } if (fprog.filter != NULL) - err = sk_unattached_filter_create(&ppp->active_filter, - &fprog); + err = bpf_prog_create(&ppp->active_filter, + &fprog); else err = 0; kfree(code); @@ -1205,7 +1205,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) a four-byte PPP header on each packet */ *skb_push(skb, 2) = 1; if (ppp->pass_filter && - SK_RUN_FILTER(ppp->pass_filter, skb) == 0) { + BPF_PROG_RUN(ppp->pass_filter, skb) == 0) { if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "PPP: outbound frame " @@ -1215,7 +1215,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) } /* if this packet passes the active filter, record the time */ if (!(ppp->active_filter && - SK_RUN_FILTER(ppp->active_filter, skb) == 0)) + BPF_PROG_RUN(ppp->active_filter, skb) == 0)) ppp->last_xmit = jiffies; skb_pull(skb, 2); #else @@ -1839,7 +1839,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) *skb_push(skb, 2) = 0; if (ppp->pass_filter && - SK_RUN_FILTER(ppp->pass_filter, skb) == 0) { + BPF_PROG_RUN(ppp->pass_filter, skb) == 0) { if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "PPP: inbound frame " @@ -1848,7 +1848,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) return; } if (!(ppp->active_filter && - SK_RUN_FILTER(ppp->active_filter, skb) == 0)) + BPF_PROG_RUN(ppp->active_filter, skb) == 0)) ppp->last_recv = jiffies; __skb_pull(skb, 2); } else @@ -2829,12 +2829,12 @@ static void ppp_destroy_interface(struct ppp *ppp) #endif /* CONFIG_PPP_MULTILINK */ #ifdef CONFIG_PPP_FILTER if (ppp->pass_filter) { - sk_unattached_filter_destroy(ppp->pass_filter); + bpf_prog_destroy(ppp->pass_filter); ppp->pass_filter = NULL; } if (ppp->active_filter) { - sk_unattached_filter_destroy(ppp->active_filter); + bpf_prog_destroy(ppp->active_filter); ppp->active_filter = NULL; } #endif /* CONFIG_PPP_FILTER */ diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c index d7be9b36bce6..a1536d0d83a9 100644 --- a/drivers/net/team/team_mode_loadbalance.c +++ b/drivers/net/team/team_mode_loadbalance.c @@ -58,7 +58,7 @@ struct lb_priv_ex { }; struct lb_priv { - struct sk_filter __rcu *fp; + struct bpf_prog __rcu *fp; lb_select_tx_port_func_t __rcu *select_tx_port_func; struct lb_pcpu_stats __percpu *pcpu_stats; struct lb_priv_ex *ex; /* priv extension */ @@ -174,14 +174,14 @@ static lb_select_tx_port_func_t *lb_select_tx_port_get_func(const char *name) static unsigned int lb_get_skb_hash(struct lb_priv *lb_priv, struct sk_buff *skb) { - struct sk_filter *fp; + struct bpf_prog *fp; uint32_t lhash; unsigned char *c; fp = rcu_dereference_bh(lb_priv->fp); if (unlikely(!fp)) return 0; - lhash = SK_RUN_FILTER(fp, skb); + lhash = BPF_PROG_RUN(fp, skb); c = (char *) &lhash; return c[0] ^ c[1] ^ c[2] ^ c[3]; } @@ -271,8 +271,8 @@ static void __fprog_destroy(struct sock_fprog_kern *fprog) static int lb_bpf_func_set(struct team *team, struct team_gsetter_ctx *ctx) { struct lb_priv *lb_priv = get_lb_priv(team); - struct sk_filter *fp = NULL; - struct sk_filter *orig_fp = NULL; + struct bpf_prog *fp = NULL; + struct bpf_prog *orig_fp = NULL; struct sock_fprog_kern *fprog = NULL; int err; @@ -281,7 +281,7 @@ static int lb_bpf_func_set(struct team *team, struct team_gsetter_ctx *ctx) ctx->data.bin_val.ptr); if (err) return err; - err = sk_unattached_filter_create(&fp, fprog); + err = bpf_prog_create(&fp, fprog); if (err) { __fprog_destroy(fprog); return err; @@ -300,7 +300,7 @@ static int lb_bpf_func_set(struct team *team, struct team_gsetter_ctx *ctx) if (orig_fp) { synchronize_rcu(); - sk_unattached_filter_destroy(orig_fp); + bpf_prog_destroy(orig_fp); } return 0; } diff --git a/include/linux/filter.h b/include/linux/filter.h index 7cb9b40e9a2f..a5227ab8ccb1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -296,7 +296,8 @@ enum { }) /* Macro to invoke filter function. */ -#define SK_RUN_FILTER(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) +#define SK_RUN_FILTER(filter, ctx) \ + (*filter->prog->bpf_func)(ctx, filter->prog->insnsi) struct bpf_insn { __u8 code; /* opcode */ @@ -323,12 +324,10 @@ struct sk_buff; struct sock; struct seccomp_data; -struct sk_filter { - atomic_t refcnt; +struct bpf_prog { u32 jited:1, /* Is our filter JIT'ed? */ len:31; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ - struct rcu_head rcu; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); union { @@ -338,25 +337,32 @@ struct sk_filter { }; }; -static inline unsigned int sk_filter_size(unsigned int proglen) +struct sk_filter { + atomic_t refcnt; + struct rcu_head rcu; + struct bpf_prog *prog; +}; + +#define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) + +static inline unsigned int bpf_prog_size(unsigned int proglen) { - return max(sizeof(struct sk_filter), - offsetof(struct sk_filter, insns[proglen])); + return max(sizeof(struct bpf_prog), + offsetof(struct bpf_prog, insns[proglen])); } #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) int sk_filter(struct sock *sk, struct sk_buff *skb); -void sk_filter_select_runtime(struct sk_filter *fp); -void sk_filter_free(struct sk_filter *fp); +void bpf_prog_select_runtime(struct bpf_prog *fp); +void bpf_prog_free(struct bpf_prog *fp); int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_insn *new_prog, int *new_len); -int sk_unattached_filter_create(struct sk_filter **pfp, - struct sock_fprog_kern *fprog); -void sk_unattached_filter_destroy(struct sk_filter *fp); +int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); +void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_detach_filter(struct sock *sk); @@ -369,7 +375,7 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -void bpf_int_jit_compile(struct sk_filter *fp); +void bpf_int_jit_compile(struct bpf_prog *fp); #define BPF_ANC BIT(15) @@ -423,8 +429,8 @@ static inline void *bpf_load_pointer(const struct sk_buff *skb, int k, #include #include -void bpf_jit_compile(struct sk_filter *fp); -void bpf_jit_free(struct sk_filter *fp); +void bpf_jit_compile(struct bpf_prog *fp); +void bpf_jit_free(struct bpf_prog *fp); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) @@ -438,11 +444,11 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, #else #include -static inline void bpf_jit_compile(struct sk_filter *fp) +static inline void bpf_jit_compile(struct bpf_prog *fp) { } -static inline void bpf_jit_free(struct sk_filter *fp) +static inline void bpf_jit_free(struct bpf_prog *fp) { kfree(fp); } diff --git a/include/linux/isdn_ppp.h b/include/linux/isdn_ppp.h index 8e10f57f109f..a0070c6dfaf8 100644 --- a/include/linux/isdn_ppp.h +++ b/include/linux/isdn_ppp.h @@ -180,8 +180,8 @@ struct ippp_struct { struct slcompress *slcomp; #endif #ifdef CONFIG_IPPP_FILTER - struct sk_filter *pass_filter; /* filter for packets to pass */ - struct sk_filter *active_filter; /* filter for pkts to reset idle */ + struct bpf_prog *pass_filter; /* filter for packets to pass */ + struct bpf_prog *active_filter; /* filter for pkts to reset idle */ #endif unsigned long debug; struct isdn_ppp_compressor *compressor,*decompressor; diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h index 2ec9fbcd06f9..1fad2c27ac32 100644 --- a/include/uapi/linux/netfilter/xt_bpf.h +++ b/include/uapi/linux/netfilter/xt_bpf.h @@ -6,14 +6,14 @@ #define XT_BPF_MAX_NUM_INSTR 64 -struct sk_filter; +struct bpf_prog; struct xt_bpf_info { __u16 bpf_program_num_elem; struct sock_filter bpf_program[XT_BPF_MAX_NUM_INSTR]; /* only used in the kernel */ - struct sk_filter *filter __attribute__((aligned(8))); + struct bpf_prog *filter __attribute__((aligned(8))); }; #endif /*_XT_BPF_H */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 188ac5ba3900..7f0dbcbb34af 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -73,15 +73,13 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } /** - * __sk_run_filter - run a filter on a given context - * @ctx: buffer to run the filter on - * @insn: filter to apply + * __bpf_prog_run - run eBPF program on a given context + * @ctx: is the data we are operating on + * @insn: is the array of eBPF instructions * - * Decode and apply filter instructions to the skb->data. Return length to - * keep, 0 for none. @ctx is the data we are operating on, @insn is the - * array of filter instructions. + * Decode and execute eBPF instructions. */ -static unsigned int __sk_run_filter(void *ctx, const struct bpf_insn *insn) +static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) { u64 stack[MAX_BPF_STACK / sizeof(u64)]; u64 regs[MAX_BPF_REG], tmp; @@ -508,29 +506,29 @@ load_byte: return 0; } -void __weak bpf_int_jit_compile(struct sk_filter *prog) +void __weak bpf_int_jit_compile(struct bpf_prog *prog) { } /** - * sk_filter_select_runtime - select execution runtime for BPF program - * @fp: sk_filter populated with internal BPF program + * bpf_prog_select_runtime - select execution runtime for BPF program + * @fp: bpf_prog populated with internal BPF program * * try to JIT internal BPF program, if JIT is not available select interpreter - * BPF program will be executed via SK_RUN_FILTER() macro + * BPF program will be executed via BPF_PROG_RUN() macro */ -void sk_filter_select_runtime(struct sk_filter *fp) +void bpf_prog_select_runtime(struct bpf_prog *fp) { - fp->bpf_func = (void *) __sk_run_filter; + fp->bpf_func = (void *) __bpf_prog_run; /* Probe if internal BPF can be JITed */ bpf_int_jit_compile(fp); } -EXPORT_SYMBOL_GPL(sk_filter_select_runtime); +EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); /* free internal BPF program */ -void sk_filter_free(struct sk_filter *fp) +void bpf_prog_free(struct bpf_prog *fp) { bpf_jit_free(fp); } -EXPORT_SYMBOL_GPL(sk_filter_free); +EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 33a3a97e2b58..2f3fa2cc2eac 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -54,7 +54,7 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; - struct sk_filter *prog; + struct bpf_prog *prog; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -187,7 +187,7 @@ static u32 seccomp_run_filters(int syscall) * value always takes priority (ignoring the DATA). */ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); + u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; @@ -260,7 +260,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(sk_filter_size(new_len), + filter->prog = kzalloc(bpf_prog_size(new_len), GFP_KERNEL|__GFP_NOWARN); if (!filter->prog) goto free_filter; @@ -273,7 +273,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) atomic_set(&filter->usage, 1); filter->prog->len = new_len; - sk_filter_select_runtime(filter->prog); + bpf_prog_select_runtime(filter->prog); /* * If there is an existing filter, make it the prev and don't drop its @@ -337,7 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - sk_filter_free(freeme->prog); + bpf_prog_free(freeme->prog); kfree(freeme); } } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 5f48623ee1a7..89e0345733bd 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1761,9 +1761,9 @@ static int probe_filter_length(struct sock_filter *fp) return len + 1; } -static struct sk_filter *generate_filter(int which, int *err) +static struct bpf_prog *generate_filter(int which, int *err) { - struct sk_filter *fp; + struct bpf_prog *fp; struct sock_fprog_kern fprog; unsigned int flen = probe_filter_length(tests[which].u.insns); __u8 test_type = tests[which].aux & TEST_TYPE_MASK; @@ -1773,7 +1773,7 @@ static struct sk_filter *generate_filter(int which, int *err) fprog.filter = tests[which].u.insns; fprog.len = flen; - *err = sk_unattached_filter_create(&fp, &fprog); + *err = bpf_prog_create(&fp, &fprog); if (tests[which].aux & FLAG_EXPECTED_FAIL) { if (*err == -EINVAL) { pr_cont("PASS\n"); @@ -1798,7 +1798,7 @@ static struct sk_filter *generate_filter(int which, int *err) break; case INTERNAL: - fp = kzalloc(sk_filter_size(flen), GFP_KERNEL); + fp = kzalloc(bpf_prog_size(flen), GFP_KERNEL); if (fp == NULL) { pr_cont("UNEXPECTED_FAIL no memory left\n"); *err = -ENOMEM; @@ -1809,7 +1809,7 @@ static struct sk_filter *generate_filter(int which, int *err) memcpy(fp->insnsi, tests[which].u.insns_int, fp->len * sizeof(struct bpf_insn)); - sk_filter_select_runtime(fp); + bpf_prog_select_runtime(fp); break; } @@ -1817,21 +1817,21 @@ static struct sk_filter *generate_filter(int which, int *err) return fp; } -static void release_filter(struct sk_filter *fp, int which) +static void release_filter(struct bpf_prog *fp, int which) { __u8 test_type = tests[which].aux & TEST_TYPE_MASK; switch (test_type) { case CLASSIC: - sk_unattached_filter_destroy(fp); + bpf_prog_destroy(fp); break; case INTERNAL: - sk_filter_free(fp); + bpf_prog_free(fp); break; } } -static int __run_one(const struct sk_filter *fp, const void *data, +static int __run_one(const struct bpf_prog *fp, const void *data, int runs, u64 *duration) { u64 start, finish; @@ -1840,7 +1840,7 @@ static int __run_one(const struct sk_filter *fp, const void *data, start = ktime_to_us(ktime_get()); for (i = 0; i < runs; i++) - ret = SK_RUN_FILTER(fp, data); + ret = BPF_PROG_RUN(fp, data); finish = ktime_to_us(ktime_get()); @@ -1850,7 +1850,7 @@ static int __run_one(const struct sk_filter *fp, const void *data, return ret; } -static int run_one(const struct sk_filter *fp, struct bpf_test *test) +static int run_one(const struct bpf_prog *fp, struct bpf_test *test) { int err_cnt = 0, i, runs = MAX_TESTRUNS; @@ -1884,7 +1884,7 @@ static __init int test_bpf(void) int i, err_cnt = 0, pass_cnt = 0; for (i = 0; i < ARRAY_SIZE(tests); i++) { - struct sk_filter *fp; + struct bpf_prog *fp; int err; pr_info("#%d %s ", i, tests[i].descr); diff --git a/net/core/filter.c b/net/core/filter.c index 6ac901613bee..d814b8a89d0f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -810,8 +810,8 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) } EXPORT_SYMBOL(bpf_check_classic); -static int sk_store_orig_filter(struct sk_filter *fp, - const struct sock_fprog *fprog) +static int bpf_prog_store_orig_filter(struct bpf_prog *fp, + const struct sock_fprog *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; @@ -831,7 +831,7 @@ static int sk_store_orig_filter(struct sk_filter *fp, return 0; } -static void sk_release_orig_filter(struct sk_filter *fp) +static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; @@ -841,10 +841,16 @@ static void sk_release_orig_filter(struct sk_filter *fp) } } +static void __bpf_prog_release(struct bpf_prog *prog) +{ + bpf_release_orig_filter(prog); + bpf_prog_free(prog); +} + static void __sk_filter_release(struct sk_filter *fp) { - sk_release_orig_filter(fp); - sk_filter_free(fp); + __bpf_prog_release(fp->prog); + kfree(fp); } /** @@ -872,7 +878,7 @@ static void sk_filter_release(struct sk_filter *fp) void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { - u32 filter_size = sk_filter_size(fp->len); + u32 filter_size = bpf_prog_size(fp->prog->len); atomic_sub(filter_size, &sk->sk_omem_alloc); sk_filter_release(fp); @@ -883,7 +889,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) */ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - u32 filter_size = sk_filter_size(fp->len); + u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= sysctl_optmem_max && @@ -895,10 +901,10 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) return false; } -static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) +static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; - struct sk_filter *old_fp; + struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; /* We are free to overwrite insns et al right here as it @@ -927,7 +933,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = krealloc(old_fp, sk_filter_size(new_len), GFP_KERNEL); + fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. @@ -949,7 +955,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) */ goto out_err_free; - sk_filter_select_runtime(fp); + bpf_prog_select_runtime(fp); kfree(old_prog); return fp; @@ -957,11 +963,11 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) out_err_free: kfree(old_prog); out_err: - __sk_filter_release(fp); + __bpf_prog_release(fp); return ERR_PTR(err); } -static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) { int err; @@ -970,7 +976,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) err = bpf_check_classic(fp->insns, fp->len); if (err) { - __sk_filter_release(fp); + __bpf_prog_release(fp); return ERR_PTR(err); } @@ -983,13 +989,13 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) * internal BPF translation for the optimized interpreter. */ if (!fp->jited) - fp = __sk_migrate_filter(fp); + fp = bpf_migrate_filter(fp); return fp; } /** - * sk_unattached_filter_create - create an unattached filter + * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * @@ -998,23 +1004,21 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. */ -int sk_unattached_filter_create(struct sk_filter **pfp, - struct sock_fprog_kern *fprog) +int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); - struct sk_filter *fp; + struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL); + fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); - atomic_set(&fp->refcnt, 1); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold @@ -1022,23 +1026,23 @@ int sk_unattached_filter_create(struct sk_filter **pfp, */ fp->orig_prog = NULL; - /* __sk_prepare_filter() already takes care of freeing + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = __sk_prepare_filter(fp); + fp = bpf_prepare_filter(fp); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } -EXPORT_SYMBOL_GPL(sk_unattached_filter_create); +EXPORT_SYMBOL_GPL(bpf_prog_create); -void sk_unattached_filter_destroy(struct sk_filter *fp) +void bpf_prog_destroy(struct bpf_prog *fp) { - __sk_filter_release(fp); + __bpf_prog_release(fp); } -EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); +EXPORT_SYMBOL_GPL(bpf_prog_destroy); /** * sk_attach_filter - attach a socket filter @@ -1054,7 +1058,8 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct sk_filter *fp, *old_fp; unsigned int fsize = bpf_classic_proglen(fprog); - unsigned int sk_fsize = sk_filter_size(fprog->len); + unsigned int bpf_fsize = bpf_prog_size(fprog->len); + struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) @@ -1064,29 +1069,36 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(sk_fsize, GFP_KERNEL); - if (!fp) + prog = kmalloc(bpf_fsize, GFP_KERNEL); + if (!prog) return -ENOMEM; - if (copy_from_user(fp->insns, fprog->filter, fsize)) { - kfree(fp); + if (copy_from_user(prog->insns, fprog->filter, fsize)) { + kfree(prog); return -EFAULT; } - fp->len = fprog->len; + prog->len = fprog->len; - err = sk_store_orig_filter(fp, fprog); + err = bpf_prog_store_orig_filter(prog, fprog); if (err) { - kfree(fp); + kfree(prog); return -ENOMEM; } - /* __sk_prepare_filter() already takes care of freeing + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = __sk_prepare_filter(fp); - if (IS_ERR(fp)) - return PTR_ERR(fp); + prog = bpf_prepare_filter(prog); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) { + __bpf_prog_release(prog); + return -ENOMEM; + } + fp->prog = prog; atomic_set(&fp->refcnt, 0); @@ -1142,7 +1154,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, /* We're copying the filter that has been originally attached, * so no conversion/decode needed anymore. */ - fprog = filter->orig_prog; + fprog = filter->prog->orig_prog; ret = fprog->len; if (!len) diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index 12ab7b4be609..4eab4a94a59d 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -107,11 +107,11 @@ #include #include -static struct sk_filter *ptp_insns __read_mostly; +static struct bpf_prog *ptp_insns __read_mostly; unsigned int ptp_classify_raw(const struct sk_buff *skb) { - return SK_RUN_FILTER(ptp_insns, skb); + return BPF_PROG_RUN(ptp_insns, skb); } EXPORT_SYMBOL_GPL(ptp_classify_raw); @@ -189,5 +189,5 @@ void __init ptp_classifier_init(void) .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter, }; - BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog)); + BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog)); } diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 57d922320c59..ad704c757bb4 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -68,7 +68,7 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, if (!filter) goto out; - fprog = filter->orig_prog; + fprog = filter->prog->orig_prog; flen = bpf_classic_proglen(fprog); attr = nla_reserve(skb, attrtype, flen); diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index bbffdbdaf603..dffee9d47ec4 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -28,7 +28,7 @@ static int bpf_mt_check(const struct xt_mtchk_param *par) program.len = info->bpf_program_num_elem; program.filter = info->bpf_program; - if (sk_unattached_filter_create(&info->filter, &program)) { + if (bpf_prog_create(&info->filter, &program)) { pr_info("bpf: check failed: parse error\n"); return -EINVAL; } @@ -40,13 +40,13 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; - return SK_RUN_FILTER(info->filter, skb); + return BPF_PROG_RUN(info->filter, skb); } static void bpf_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_bpf_info *info = par->matchinfo; - sk_unattached_filter_destroy(info->filter); + bpf_prog_destroy(info->filter); } static struct xt_match bpf_mt_reg __read_mostly = { diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 13f64df2c710..0e30d58149da 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -30,7 +30,7 @@ struct cls_bpf_head { }; struct cls_bpf_prog { - struct sk_filter *filter; + struct bpf_prog *filter; struct sock_filter *bpf_ops; struct tcf_exts exts; struct tcf_result res; @@ -54,7 +54,7 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, int ret; list_for_each_entry(prog, &head->plist, link) { - int filter_res = SK_RUN_FILTER(prog->filter, skb); + int filter_res = BPF_PROG_RUN(prog->filter, skb); if (filter_res == 0) continue; @@ -92,7 +92,7 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) tcf_unbind_filter(tp, &prog->res); tcf_exts_destroy(tp, &prog->exts); - sk_unattached_filter_destroy(prog->filter); + bpf_prog_destroy(prog->filter); kfree(prog->bpf_ops); kfree(prog); @@ -161,7 +161,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, struct sock_filter *bpf_ops, *bpf_old; struct tcf_exts exts; struct sock_fprog_kern tmp; - struct sk_filter *fp, *fp_old; + struct bpf_prog *fp, *fp_old; u16 bpf_size, bpf_len; u32 classid; int ret; @@ -193,7 +193,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, tmp.len = bpf_len; tmp.filter = bpf_ops; - ret = sk_unattached_filter_create(&fp, &tmp); + ret = bpf_prog_create(&fp, &tmp); if (ret) goto errout_free; @@ -211,7 +211,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, tcf_exts_change(tp, &prog->exts, &exts); if (fp_old) - sk_unattached_filter_destroy(fp_old); + bpf_prog_destroy(fp_old); if (bpf_old) kfree(bpf_old); -- cgit v1.2.3-59-g8ed1b From f24b9be5957b38bb420b838115040dc2031b7d0c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:45 -0400 Subject: net-timestamp: extend SCM_TIMESTAMPING ancillary data struct Applications that request kernel tx timestamps with SO_TIMESTAMPING read timestamps as recvmsg() ancillary data. The response is defined implicitly as timespec[3]. 1) define struct scm_timestamping explicitly and 2) add support for new tstamp types. On tx, scm_timestamping always accompanies a sock_extended_err. Define previously unused field ee_info to signal the type of ts[0]. Introduce SCM_TSTAMP_SND to define the existing behavior. The reception path is not modified. On rx, no struct similar to sock_extended_err is passed along with SCM_TIMESTAMPING. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ include/net/sock.h | 4 +++- include/uapi/linux/errqueue.h | 18 ++++++++++++++++++ net/core/skbuff.c | 1 + net/socket.c | 20 +++++++++++--------- 5 files changed, 36 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 281deced7469..477f0f60db45 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -249,6 +249,9 @@ enum { SKBTX_SHARED_FRAG = 1 << 5, }; +#define SKBTX_ANY_SW_TSTAMP SKBTX_SW_TSTAMP +#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) + /* * The callback notifies userspace to release buffers when skb DMA is done in * lower device, the skb last reference should be 0 when calling this. diff --git a/include/net/sock.h b/include/net/sock.h index b91c8868ab8d..02f5b35e65f1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2169,7 +2169,9 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) */ if (sock_flag(sk, SOCK_RCVTSTAMP) || sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE) || - (kt.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) || + (kt.tv64 && + (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE) || + skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP)) || (hwtstamps->hwtstamp.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index aacd4fb7102a..accee72cae7c 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -22,5 +22,23 @@ struct sock_extended_err { #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) +/** + * struct scm_timestamping - timestamps exposed through cmsg + * + * The timestamping interfaces SO_TIMESTAMPING, MSG_TSTAMP_* + * communicate network timestamps by passing this struct in a cmsg with + * recvmsg(). See Documentation/networking/timestamping.txt for details. + */ +struct scm_timestamping { + struct timespec ts[3]; +}; + +/* The type of scm_timestamping, passed in sock_extended_err ee_info. + * This defines the type of ts[0]. For SCM_TSTAMP_SND only, if ts[0] + * is zero, then this is a hardware timestamp and recorded in ts[2]. + */ +enum { + SCM_TSTAMP_SND, /* driver passed skb to NIC, or HW */ +}; #endif /* _UAPI_LINUX_ERRQUEUE_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c1a33033cbe2..c9f68802e992 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3521,6 +3521,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = SCM_TSTAMP_SND; err = sock_queue_err_skb(sk, skb); diff --git a/net/socket.c b/net/socket.c index d8222c025061..dc0cc5d95ee5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -106,6 +106,7 @@ #include #include #include +#include #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; @@ -697,7 +698,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); - struct timespec ts[3]; + struct scm_timestamping tss; int empty = 1; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); @@ -714,24 +715,25 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv); } else { - skb_get_timestampns(skb, &ts[0]); + struct timespec ts; + skb_get_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, - sizeof(ts[0]), &ts[0]); + sizeof(ts), &ts); } } - - memset(ts, 0, sizeof(ts)); - if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE) && - ktime_to_timespec_cond(skb->tstamp, ts + 0)) + memset(&tss, 0, sizeof(tss)); + if ((sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE) || + skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP) && + ktime_to_timespec_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts + 2)) + ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) empty = 0; if (!empty) put_cmsg(msg, SOL_SOCKET, - SCM_TIMESTAMPING, sizeof(ts), &ts); + SCM_TIMESTAMPING, sizeof(tss), &tss); } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); -- cgit v1.2.3-59-g8ed1b From 09c2d251b70723650ba47e83571ff49281320f7c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:47 -0400 Subject: net-timestamp: add key to disambiguate concurrent datagrams Datagrams timestamped on transmission can coexist in the kernel stack and be reordered in packet scheduling. When reading looped datagrams from the socket error queue it is not always possible to unique correlate looped data with original send() call (for application level retransmits). Even if possible, it may be expensive and complex, requiring packet inspection. Introduce a data-independent ID mechanism to associate timestamps with send calls. Pass an ID alongside the timestamp in field ee_data of sock_extended_err. The ID is a simple 32 bit unsigned int that is associated with the socket and incremented on each send() call for which software tx timestamp generation is enabled. The feature is enabled only if SOF_TIMESTAMPING_OPT_ID is set, to avoid changing ee_data for existing applications that expect it 0. The counter is reset each time the flag is reenabled. Reenabling does not change the ID of already submitted data. It is possible to receive out of order IDs if the timestamp stream is not quiesced first. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/net/sock.h | 2 ++ include/uapi/linux/net_tstamp.h | 8 +++++--- net/core/skbuff.c | 2 ++ net/core/sock.c | 3 +++ net/ipv4/ip_output.c | 6 ++++++ net/ipv6/ip6_output.c | 9 ++++++++- 7 files changed, 27 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 477f0f60db45..0e35b3af7317 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -278,6 +278,7 @@ struct skb_shared_info { unsigned short gso_type; struct sk_buff *frag_list; struct skb_shared_hwtstamps hwtstamps; + u32 tskey; __be32 ip6_frag_id; /* diff --git a/include/net/sock.h b/include/net/sock.h index a21129716aae..52fe0bc5598a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -280,6 +280,7 @@ struct cg_proto; * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received * @sk_tsflags: SO_TIMESTAMPING socket options + * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data * @sk_frag: cached page frag @@ -414,6 +415,7 @@ struct sock { struct timer_list sk_timer; ktime_t sk_stamp; u16 sk_tsflags; + u32 sk_tskey; struct socket *sk_socket; void *sk_user_data; struct page_frag sk_frag; diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index f53879c0f590..1e861d2e1a31 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -20,9 +20,11 @@ enum { SOF_TIMESTAMPING_SOFTWARE = (1<<4), SOF_TIMESTAMPING_SYS_HARDWARE = (1<<5), SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6), - SOF_TIMESTAMPING_MASK = - (SOF_TIMESTAMPING_RAW_HARDWARE - 1) | - SOF_TIMESTAMPING_RAW_HARDWARE + SOF_TIMESTAMPING_OPT_ID = (1<<7), + + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_ID, + SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | + SOF_TIMESTAMPING_LAST }; /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c9f68802e992..0df4f1d14c5a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3522,6 +3522,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SND; + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + serr->ee.ee_data = skb_shinfo(skb)->tskey; err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 47c9377e14b9..1e0f1c63ad6b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -848,6 +848,9 @@ set_rcvbuf: ret = -EINVAL; break; } + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) + sk->sk_tskey = 0; sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b16556836d66..215af2b155cb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -855,11 +855,15 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; + u32 tskey = 0; skb = skb_peek_tail(queue); exthdrlen = !skb ? rt->dst.header_len : 0; mtu = cork->fragsize; + if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -976,6 +980,8 @@ alloc_new_skb: /* only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; /* * Find where to start putting bytes. diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f5dafe609f8b..315a55d66079 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1157,6 +1157,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int err; int offset = 0; __u8 tx_flags = 0; + u32 tskey = 0; if (flags&MSG_PROBE) return 0; @@ -1272,8 +1273,12 @@ emsgsize: } } - if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { sock_tx_timestamp(sk, &tx_flags); + if (tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; + } /* * Let's try using as much space as possible. @@ -1397,6 +1402,8 @@ alloc_new_skb: /* Only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = tx_flags; tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; /* * Find where to start putting bytes -- cgit v1.2.3-59-g8ed1b From e7fd2885385157d46c85f282fc6d7d297db43e1f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:48 -0400 Subject: net-timestamp: SCHED timestamp on entering packet scheduler Kernel transmit latency is often incurred in the packet scheduler. Introduce a new timestamp on transmission just before entering the scheduler. When data travels through multiple devices (bonding, tunneling, ...) each device will export an individual timestamp. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 11 +++++++++-- include/uapi/linux/errqueue.h | 1 + include/uapi/linux/net_tstamp.h | 3 ++- net/core/dev.c | 4 ++++ net/core/skbuff.c | 16 ++++++++++++---- net/socket.c | 3 +++ 6 files changed, 31 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0e35b3af7317..50e1e9b3a5a5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -229,7 +229,7 @@ enum { /* generate hardware time stamp */ SKBTX_HW_TSTAMP = 1 << 0, - /* generate software time stamp */ + /* generate software time stamp when queueing packet to NIC */ SKBTX_SW_TSTAMP = 1 << 1, /* device driver is going to provide hardware time stamp */ @@ -247,9 +247,12 @@ enum { * all frags to avoid possible bad checksum */ SKBTX_SHARED_FRAG = 1 << 5, + + /* generate software time stamp when entering packet scheduling */ + SKBTX_SCHED_TSTAMP = 1 << 6, }; -#define SKBTX_ANY_SW_TSTAMP SKBTX_SW_TSTAMP +#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) /* @@ -2695,6 +2698,10 @@ static inline bool skb_defer_rx_timestamp(struct sk_buff *skb) void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps); +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype); + /** * skb_tstamp_tx - queue clone of skb with send time stamps * @orig_skb: the original outgoing packet diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index accee72cae7c..17437cf297b7 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -39,6 +39,7 @@ struct scm_timestamping { */ enum { SCM_TSTAMP_SND, /* driver passed skb to NIC, or HW */ + SCM_TSTAMP_SCHED, /* data entered the packet scheduler */ }; #endif /* _UAPI_LINUX_ERRQUEUE_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 1e861d2e1a31..60733845fcdd 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -21,8 +21,9 @@ enum { SOF_TIMESTAMPING_SYS_HARDWARE = (1<<5), SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6), SOF_TIMESTAMPING_OPT_ID = (1<<7), + SOF_TIMESTAMPING_TX_SCHED = (1<<8), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_ID, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_SCHED, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/core/dev.c b/net/core/dev.c index b370230fe1d3..1c15b189c52b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2876,6 +2877,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_reset_mac_header(skb); + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0df4f1d14c5a..9705c0732aab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3490,10 +3490,10 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); -void skb_tstamp_tx(struct sk_buff *orig_skb, - struct skb_shared_hwtstamps *hwtstamps) +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) { - struct sock *sk = orig_skb->sk; struct sock_exterr_skb *serr; struct sk_buff *skb; int err; @@ -3521,7 +3521,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; - serr->ee.ee_info = SCM_TSTAMP_SND; + serr->ee.ee_info = tstype; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = skb_shinfo(skb)->tskey; @@ -3530,6 +3530,14 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, if (err) kfree_skb(skb); } +EXPORT_SYMBOL_GPL(__skb_tstamp_tx); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, + SCM_TSTAMP_SND); +} EXPORT_SYMBOL_GPL(skb_tstamp_tx); void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) diff --git a/net/socket.c b/net/socket.c index 255d9b802723..3a2778d71631 100644 --- a/net/socket.c +++ b/net/socket.c @@ -617,6 +617,9 @@ void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) *tx_flags |= SKBTX_HW_TSTAMP; if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) *tx_flags |= SKBTX_SW_TSTAMP; + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED) + *tx_flags |= SKBTX_SCHED_TSTAMP; + if (sock_flag(sk, SOCK_WIFI_STATUS)) *tx_flags |= SKBTX_WIFI_STATUS; } -- cgit v1.2.3-59-g8ed1b From e1c8a607b28190cd09a271508aa3025d3c2f312e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:50 -0400 Subject: net-timestamp: ACK timestamp for bytestreams Add SOF_TIMESTAMPING_TX_ACK, a request for a tstamp when the last byte in the send() call is acknowledged. It implements the feature for TCP. The timestamp is generated when the TCP socket cumulative ACK is moved beyond the tracked seqno for the first time. The feature ignores SACK and FACK, because those acknowledge the specific byte, but not necessarily the entire contents of the buffer up to that byte. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ++++++- include/uapi/linux/errqueue.h | 1 + include/uapi/linux/net_tstamp.h | 3 ++- net/ipv4/tcp_input.c | 6 ++++++ net/socket.c | 2 ++ 5 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 50e1e9b3a5a5..11c270551d25 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -250,9 +250,14 @@ enum { /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, + + /* generate software timestamp on peer data acknowledgment */ + SKBTX_ACK_TSTAMP = 1 << 7, }; -#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | SKBTX_SCHED_TSTAMP) +#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ + SKBTX_SCHED_TSTAMP | \ + SKBTX_ACK_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) /* diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 17437cf297b7..07bdce1f444a 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -40,6 +40,7 @@ struct scm_timestamping { enum { SCM_TSTAMP_SND, /* driver passed skb to NIC, or HW */ SCM_TSTAMP_SCHED, /* data entered the packet scheduler */ + SCM_TSTAMP_ACK, /* data acknowledged by peer */ }; #endif /* _UAPI_LINUX_ERRQUEUE_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 60733845fcdd..ff354021bb69 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -22,8 +22,9 @@ enum { SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6), SOF_TIMESTAMPING_OPT_ID = (1<<7), SOF_TIMESTAMPING_TX_SCHED = (1<<8), + SOF_TIMESTAMPING_TX_ACK = (1<<9), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_SCHED, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_ACK, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6a2984507755..a3d47af01906 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -74,6 +74,7 @@ #include #include #include +#include int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -3106,6 +3107,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->retrans_stamp = 0; } + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) && + between(skb_shinfo(skb)->tskey, prior_snd_una, + tp->snd_una + 1)) + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + if (!fully_acked) break; diff --git a/net/socket.c b/net/socket.c index 3a2778d71631..ae89569a2db5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -619,6 +619,8 @@ void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) *tx_flags |= SKBTX_SW_TSTAMP; if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED) *tx_flags |= SKBTX_SCHED_TSTAMP; + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK) + *tx_flags |= SKBTX_ACK_TSTAMP; if (sock_flag(sk, SOCK_WIFI_STATUS)) *tx_flags |= SKBTX_WIFI_STATUS; -- cgit v1.2.3-59-g8ed1b