From 93491ced3c87c94b12220dbac0527e1356702179 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 10 May 2017 18:18:29 +0200 Subject: USB: hub: fix SS max number of ports Add define for the maximum number of ports on a SuperSpeed hub as per USB 3.1 spec Table 10-5, and use it when verifying the retrieved hub descriptor. This specifically avoids benign attempts to update the DeviceRemovable mask for non-existing ports (should we get that far). Fixes: dbe79bbe9dcb ("USB 3.0 Hub Changes") Acked-by: Alan Stern Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/usb/ch11.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/usb/ch11.h b/include/uapi/linux/usb/ch11.h index 361297e96f58..576c704e3fb8 100644 --- a/include/uapi/linux/usb/ch11.h +++ b/include/uapi/linux/usb/ch11.h @@ -22,6 +22,9 @@ */ #define USB_MAXCHILDREN 31 +/* See USB 3.1 spec Table 10-5 */ +#define USB_SS_MAXPORTS 15 + /* * Hub request types */ -- cgit v1.3-6-gb490 From 5bc1701881e395cec51811d07ec6961f3d1b2612 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:08:01 +0200 Subject: net: sched: introduce multichain support for filters Instead of having only one filter per block, introduce a list of chains for every block. Create chain 0 by default. UAPI is extended so the user can specify which chain he wants to change. If the new attribute is not specified, chain 0 is used. That allows to maintain backward compatibility. If chain does not exist and user wants to manipulate with it, new chain is created with specified index. Also, when last filter is removed from the chain, the chain is destroyed. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 + include/net/sch_generic.h | 9 +++- include/uapi/linux/rtnetlink.h | 1 + net/sched/cls_api.c | 104 ++++++++++++++++++++++++++++++++++------- 4 files changed, 98 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e56e7157c280..2c213a69c196 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -18,6 +18,8 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); #ifdef CONFIG_NET_CLS +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index); +void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain); void tcf_block_put(struct tcf_block *block); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 52bceede534b..569b5654c30c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -236,7 +237,7 @@ struct tcf_proto { struct Qdisc *q; void *data; const struct tcf_proto_ops *ops; - struct tcf_block *block; + struct tcf_chain *chain; struct rcu_head rcu; }; @@ -251,10 +252,14 @@ struct qdisc_skb_cb { struct tcf_chain { struct tcf_proto __rcu *filter_chain; struct tcf_proto __rcu **p_filter_chain; + struct list_head list; + struct tcf_block *block; + u32 index; /* chain index */ + unsigned int refcnt; }; struct tcf_block { - struct tcf_chain *chain; + struct list_head chain_list; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index cce061382e40..6487b21b2c1e 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -549,6 +549,7 @@ enum { TCA_STAB, TCA_PAD, TCA_DUMP_INVISIBLE, + TCA_CHAIN, __TCA_MAX }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 63aa2ea5f00c..adacaf299c4a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -129,7 +129,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, u32 prio, u32 parent, struct Qdisc *q, - struct tcf_block *block) + struct tcf_chain *chain) { struct tcf_proto *tp; int err; @@ -165,7 +165,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->prio = prio; tp->classid = parent; tp->q = q; - tp->block = block; + tp->chain = chain; err = tp->ops->init(tp); if (err) { @@ -186,15 +186,26 @@ static void tcf_proto_destroy(struct tcf_proto *tp) kfree_rcu(tp, rcu); } -static struct tcf_chain *tcf_chain_create(void) +static struct tcf_chain *tcf_chain_create(struct tcf_block *block, + u32 chain_index) { - return kzalloc(sizeof(struct tcf_chain), GFP_KERNEL); + struct tcf_chain *chain; + + chain = kzalloc(sizeof(*chain), GFP_KERNEL); + if (!chain) + return NULL; + list_add_tail(&chain->list, &block->chain_list); + chain->block = block; + chain->index = chain_index; + chain->refcnt = 1; + return chain; } static void tcf_chain_destroy(struct tcf_chain *chain) { struct tcf_proto *tp; + list_del(&chain->list); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_proto_destroy(tp); @@ -202,6 +213,30 @@ static void tcf_chain_destroy(struct tcf_chain *chain) kfree(chain); } +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index) +{ + struct tcf_chain *chain; + + list_for_each_entry(chain, &block->chain_list, list) { + if (chain->index == chain_index) { + chain->refcnt++; + return chain; + } + } + return tcf_chain_create(block, chain_index); +} +EXPORT_SYMBOL(tcf_chain_get); + +void tcf_chain_put(struct tcf_chain *chain) +{ + /* Destroy unused chain, with exception of chain 0, which is the + * default one and has to be always present. + */ + if (--chain->refcnt == 0 && !chain->filter_chain && chain->index != 0) + tcf_chain_destroy(chain); +} +EXPORT_SYMBOL(tcf_chain_put); + static void tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, struct tcf_proto __rcu **p_filter_chain) @@ -213,16 +248,19 @@ int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); + struct tcf_chain *chain; int err; if (!block) return -ENOMEM; - block->chain = tcf_chain_create(); - if (!block->chain) { + INIT_LIST_HEAD(&block->chain_list); + /* Create chain 0 by default, it has to be always present. */ + chain = tcf_chain_create(block, 0); + if (!chain) { err = -ENOMEM; goto err_chain_create; } - tcf_chain_filter_chain_ptr_set(block->chain, p_filter_chain); + tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); *p_block = block; return 0; @@ -234,9 +272,13 @@ EXPORT_SYMBOL(tcf_block_get); void tcf_block_put(struct tcf_block *block) { + struct tcf_chain *chain, *tmp; + if (!block) return; - tcf_chain_destroy(block->chain); + + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_destroy(chain); kfree(block); } EXPORT_SYMBOL(tcf_block_put); @@ -360,10 +402,11 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, u32 prio; bool prio_allocate; u32 parent; + u32 chain_index; struct net_device *dev; struct Qdisc *q; struct tcf_chain_info chain_info; - struct tcf_chain *chain; + struct tcf_chain *chain = NULL; struct tcf_block *block; struct tcf_proto *tp; const struct Qdisc_class_ops *cops; @@ -449,7 +492,17 @@ replay: err = -EINVAL; goto errout; } - chain = block->chain; + + chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + if (chain_index > TC_ACT_EXT_VAL_MASK) { + err = -EINVAL; + goto errout; + } + chain = tcf_chain_get(block, chain_index); + if (!chain) { + err = -ENOMEM; + goto errout; + } if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); @@ -483,7 +536,7 @@ replay: prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info)); tp = tcf_proto_create(nla_data(tca[TCA_KIND]), - protocol, prio, parent, q, block); + protocol, prio, parent, q, chain); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout; @@ -556,6 +609,8 @@ replay: } errout: + if (chain) + tcf_chain_put(chain); if (cl) cops->put(q, cl); if (err == -EAGAIN) @@ -584,6 +639,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index)) + goto nla_put_failure; tcm->tcm_handle = fh; if (RTM_DELTFILTER != event) { tcm->tcm_handle = 0; @@ -640,7 +697,7 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, RTM_NEWTFILTER); } -static void tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, +static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, struct netlink_callback *cb, long index_start, long *p_index) { @@ -667,7 +724,7 @@ static void tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) - break; + return false; cb->args[1] = 1; } @@ -682,14 +739,16 @@ static void tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, tp->ops->walk(tp, &arg.w); cb->args[1] = arg.w.count + 1; if (arg.w.stop) - break; + return false; } + return true; } /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; struct Qdisc *q; struct tcf_block *block; @@ -699,9 +758,15 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) const struct Qdisc_class_ops *cops; long index_start; long index; + int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; + + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + if (err) + return err; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return skb->len; @@ -725,11 +790,18 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) block = cops->tcf_block(q, cl); if (!block) goto errout; - chain = block->chain; index_start = cb->args[0]; index = 0; - tcf_chain_dump(chain, skb, cb, index_start, &index); + + list_for_each_entry(chain, &block->chain_list, list) { + if (tca[TCA_CHAIN] && + nla_get_u32(tca[TCA_CHAIN]) != chain->index) + continue; + if (!tcf_chain_dump(chain, skb, cb, index_start, &index)) + break; + } + cb->args[0] = index; errout: -- cgit v1.3-6-gb490 From db50514f9a9c7ef1f17e9921b1cc0902746872f3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 May 2017 11:08:03 +0200 Subject: net: sched: add termination action to allow goto chain Introduce new type of termination action called "goto_chain". This allows user to specify a chain to be processed. This action type is then processed as a return value in tcf_classify loop in similar way as "reclassify" is, only it does not reset to the first filter in chain but rather reset to the first filter of the desired chain. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + include/net/sch_generic.h | 9 +++++++-- include/uapi/linux/pkt_cls.h | 1 + net/sched/act_api.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 6 +++++- 5 files changed, 54 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/act_api.h b/include/net/act_api.h index b22c6f3d6710..26ffd8333f50 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -42,6 +42,7 @@ struct tc_action { struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie *act_cookie; + struct tcf_chain *goto_chain; }; #define tcf_head common.tcfa_head #define tcf_index common.tcfa_index diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 569b5654c30c..368850194c94 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -193,8 +193,13 @@ struct Qdisc_ops { struct tcf_result { - unsigned long class; - u32 classid; + union { + struct { + unsigned long class; + u32 classid; + }; + const struct tcf_proto *goto_tp; + }; }; struct tcf_proto_ops { diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index d613be3b3239..1b9aa9e6b4fd 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -51,6 +51,7 @@ enum { (((combined) & (~TC_ACT_EXT_VAL_MASK)) == opcode) #define TC_ACT_JUMP __TC_ACT_EXT(1) +#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) /* Action type identifiers*/ enum { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index e389eb45b484..0ecf2a858767 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -28,6 +28,31 @@ #include #include +static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) +{ + u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK; + + if (!tp) + return -EINVAL; + a->goto_chain = tcf_chain_get(tp->chain->block, chain_index); + if (!a->goto_chain) + return -ENOMEM; + return 0; +} + +static void tcf_action_goto_chain_fini(struct tc_action *a) +{ + tcf_chain_put(a->goto_chain); +} + +static void tcf_action_goto_chain_exec(const struct tc_action *a, + struct tcf_result *res) +{ + const struct tcf_chain *chain = a->goto_chain; + + res->goto_tp = rcu_dereference_bh(chain->filter_chain); +} + static void free_tcf(struct rcu_head *head) { struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu); @@ -39,6 +64,8 @@ static void free_tcf(struct rcu_head *head) kfree(p->act_cookie->data); kfree(p->act_cookie); } + if (p->goto_chain) + tcf_action_goto_chain_fini(p); kfree(p); } @@ -465,6 +492,8 @@ repeat: else /* faulty graph, stop pipeline */ return TC_ACT_OK; } + } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { + tcf_action_goto_chain_exec(a, res); } if (ret != TC_ACT_PIPE) @@ -657,6 +686,17 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err != ACT_P_CREATED) module_put(a_o->owner); + if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { + err = tcf_action_goto_chain_init(a, tp); + if (err) { + LIST_HEAD(actions); + + list_add_tail(&a->list, &actions); + tcf_action_destroy(&actions, bind); + return ERR_PTR(err); + } + } + return a; err_mod: diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9e0c4bb82528..4020b8d932a1 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -307,8 +307,12 @@ reclassify: err = tp->classify(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT - if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) + if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { goto reset; + } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) { + old_tp = res->goto_tp; + goto reset; + } #endif if (err >= 0) return err; -- cgit v1.3-6-gb490 From b8210a9e4bea6354eccc5d8a50ecc21ea7486dc9 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:35 +0200 Subject: net: define receive timestamp filter for NTP Add HWTSTAMP_FILTER_NTP_ALL to the hwtstamp_rx_filters enum for timestamping of NTP packets. There is currently only one driver (phyter) that could support it directly. CC: Richard Cochran CC: Willem de Bruijn Signed-off-by: Miroslav Lichvar Signed-off-by: David S. Miller --- include/uapi/linux/net_tstamp.h | 3 +++ net/core/dev_ioctl.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 464dcca5ed68..0749fb13e517 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -125,6 +125,9 @@ enum hwtstamp_rx_filters { HWTSTAMP_FILTER_PTP_V2_SYNC, /* PTP v2/802.AS1, any layer, Delay_req packet */ HWTSTAMP_FILTER_PTP_V2_DELAY_REQ, + + /* NTP, UDP, all versions and packet modes */ + HWTSTAMP_FILTER_NTP_ALL, }; #endif /* _NET_TIMESTAMPING_H */ diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index b94b1d293506..8f036a76b92e 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -227,6 +227,8 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: rx_filter_valid = 1; break; + case HWTSTAMP_FILTER_NTP_ALL: + break; } if (!tx_type_valid || !rx_filter_valid) -- cgit v1.3-6-gb490 From aad9c8c470f2a8321a99eb053630ce0e199558d6 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:38 +0200 Subject: net: add new control message for incoming HW-timestamped packets Add SOF_TIMESTAMPING_OPT_PKTINFO option to request a new control message for incoming packets with hardware timestamps. It contains the index of the real interface which received the packet and the length of the packet at layer 2. The index is useful with bonding, bridges and other interfaces, where IP_PKTINFO doesn't allow applications to determine which PHC made the timestamp. With the L2 length (and link speed) it is possible to transpose preamble timestamps to trailer timestamps, which are used in the NTP protocol. While this information could be provided by two new socket options independently from timestamping, it doesn't look like they would be very useful. With this option any performance impact is limited to hardware timestamping. Use dev_get_by_napi_id() to get the device and its index. On kernels with disabled CONFIG_NET_RX_BUSY_POLL or drivers not using NAPI, a zero index will be returned in the control message. CC: Richard Cochran Acked-by: Willem de Bruijn Signed-off-by: Miroslav Lichvar Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 10 ++++++++++ include/uapi/asm-generic/socket.h | 2 ++ include/uapi/linux/net_tstamp.h | 11 ++++++++++- net/socket.c | 27 ++++++++++++++++++++++++++- 4 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 96f50694a748..ce11e3a08c0d 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -193,6 +193,16 @@ SOF_TIMESTAMPING_OPT_STATS: the transmit timestamps, such as how long a certain block of data was limited by peer's receiver window. +SOF_TIMESTAMPING_OPT_PKTINFO: + + Enable the SCM_TIMESTAMPING_PKTINFO control message for incoming + packets with hardware timestamps. The message contains struct + scm_ts_pktinfo, which supplies the index of the real interface which + received the packet and its length at layer 2. A valid (non-zero) + interface index will be returned only if CONFIG_NET_RX_BUSY_POLL is + enabled and the driver is using NAPI. The struct contains also two + other fields, but they are reserved and undefined. + New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate regardless of the setting of sysctl net.core.tstamp_allow_data. diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 2b488565599d..a5f6e819fafd 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -100,4 +100,6 @@ #define SO_COOKIE 57 +#define SCM_TIMESTAMPING_PKTINFO 58 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 0749fb13e517..dee74d39da94 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -9,6 +9,7 @@ #ifndef _NET_TIMESTAMPING_H #define _NET_TIMESTAMPING_H +#include #include /* for SO_TIMESTAMPING */ /* SO_TIMESTAMPING gets an integer bit field comprised of these values */ @@ -26,8 +27,9 @@ enum { SOF_TIMESTAMPING_OPT_CMSG = (1<<10), SOF_TIMESTAMPING_OPT_TSONLY = (1<<11), SOF_TIMESTAMPING_OPT_STATS = (1<<12), + SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_PKTINFO, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; @@ -130,4 +132,11 @@ enum hwtstamp_rx_filters { HWTSTAMP_FILTER_NTP_ALL, }; +/* SCM_TIMESTAMPING_PKTINFO control message */ +struct scm_ts_pktinfo { + __u32 if_index; + __u32 pkt_length; + __u32 reserved[2]; +}; + #endif /* _NET_TIMESTAMPING_H */ diff --git a/net/socket.c b/net/socket.c index c2564eb25c6b..67db7d8a3b81 100644 --- a/net/socket.c +++ b/net/socket.c @@ -662,6 +662,27 @@ static bool skb_is_err_queue(const struct sk_buff *skb) return skb->pkt_type == PACKET_OUTGOING; } +static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct scm_ts_pktinfo ts_pktinfo; + struct net_device *orig_dev; + + if (!skb_mac_header_was_set(skb)) + return; + + memset(&ts_pktinfo, 0, sizeof(ts_pktinfo)); + + rcu_read_lock(); + orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); + if (orig_dev) + ts_pktinfo.if_index = orig_dev->ifindex; + rcu_read_unlock(); + + ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO, + sizeof(ts_pktinfo), &ts_pktinfo); +} + /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ @@ -699,8 +720,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, empty = 0; if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) + ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) { empty = 0; + if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && + !skb_is_err_queue(skb)) + put_ts_pktinfo(msg, skb); + } if (!empty) { put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, sizeof(tss), &tss); -- cgit v1.3-6-gb490 From b50a5c70ffa4fd6b6da324ab54c84adf48fb17d9 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Fri, 19 May 2017 17:52:40 +0200 Subject: net: allow simultaneous SW and HW transmit timestamping Add SOF_TIMESTAMPING_OPT_TX_SWHW option to allow an outgoing packet to be looped to the socket's error queue with a software timestamp even when a hardware transmit timestamp is expected to be provided by the driver. Applications using this option will receive two separate messages from the error queue, one with a software timestamp and the other with a hardware timestamp. As the hardware timestamp is saved to the shared skb info, which may happen before the first message with software timestamp is received by the application, the hardware timestamp is copied to the SCM_TIMESTAMPING control message only when the skb has no software timestamp or it is an incoming packet. While changing sw_tx_timestamp(), inline it in skb_tx_timestamp() as there are no other users. CC: Richard Cochran CC: Willem de Bruijn Signed-off-by: Miroslav Lichvar Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 8 ++++++++ include/linux/skbuff.h | 10 ++-------- include/uapi/linux/net_tstamp.h | 3 ++- net/core/skbuff.c | 4 ++++ net/socket.c | 20 ++++++++++++++++++-- 5 files changed, 34 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 50eb0e554778..196ba17cc344 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -203,6 +203,14 @@ SOF_TIMESTAMPING_OPT_PKTINFO: enabled and the driver is using NAPI. The struct contains also two other fields, but they are reserved and undefined. +SOF_TIMESTAMPING_OPT_TX_SWHW: + + Request both hardware and software timestamps for outgoing packets + when SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE + are enabled at the same time. If both timestamps are generated, + two separate messages will be looped to the socket's error queue, + each containing just one timestamp. + New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate regardless of the setting of sysctl net.core.tstamp_allow_data. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8acce7143f6a..45a59c1e0cc7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3259,13 +3259,6 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps); -static inline void sw_tx_timestamp(struct sk_buff *skb) -{ - if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP && - !(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) - skb_tstamp_tx(skb, NULL); -} - /** * skb_tx_timestamp() - Driver hook for transmit timestamping * @@ -3281,7 +3274,8 @@ static inline void sw_tx_timestamp(struct sk_buff *skb) static inline void skb_tx_timestamp(struct sk_buff *skb) { skb_clone_tx_timestamp(skb); - sw_tx_timestamp(skb); + if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) + skb_tstamp_tx(skb, NULL); } /** diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index dee74d39da94..3d421d912193 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -28,8 +28,9 @@ enum { SOF_TIMESTAMPING_OPT_TSONLY = (1<<11), SOF_TIMESTAMPING_OPT_STATS = (1<<12), SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), + SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_PKTINFO, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TX_SWHW, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d5c98117cbce..780b7c1563d0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3901,6 +3901,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; + if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) + return; + tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; diff --git a/net/socket.c b/net/socket.c index 67db7d8a3b81..cb355a7ef135 100644 --- a/net/socket.c +++ b/net/socket.c @@ -662,6 +662,19 @@ static bool skb_is_err_queue(const struct sk_buff *skb) return skb->pkt_type == PACKET_OUTGOING; } +/* On transmit, software and hardware timestamps are returned independently. + * As the two skb clones share the hardware timestamp, which may be updated + * before the software timestamp is received, a hardware TX timestamp may be + * returned only if there is no software TX timestamp. Ignore false software + * timestamps, which may be made in the __sock_recv_timestamp() call when the + * option SO_TIMESTAMP(NS) is enabled on the socket, even when the skb has a + * hardware timestamp. + */ +static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) +{ + return skb->tstamp && !false_tstamp && skb_is_err_queue(skb); +} + static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct scm_ts_pktinfo ts_pktinfo; @@ -691,14 +704,16 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); struct scm_timestamping tss; - int empty = 1; + int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ - if (need_software_tstamp && skb->tstamp == 0) + if (need_software_tstamp && skb->tstamp == 0) { __net_timestamp(skb); + false_tstamp = 1; + } if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { @@ -720,6 +735,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, empty = 0; if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + !skb_is_swtx_tstamp(skb, false_tstamp) && ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) { empty = 0; if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && -- cgit v1.3-6-gb490 From fdfc7dd6ca39b117c709dceee8d32ac4447294d6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 23 May 2017 18:40:45 +0200 Subject: net/sched: flower: add support for matching on tcp flags Benefit from the support of tcp flags dissection and allow user to insert rules matching on tcp flags. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 3 +++ net/sched/cls_flower.c | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 1b9aa9e6b4fd..c6e8cf5e9c40 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -451,6 +451,9 @@ enum { TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ + TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index ca526c0881bd..fb74a47830f4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -49,6 +49,7 @@ struct fl_flow_key { }; struct flow_dissector_key_ports enc_tp; struct flow_dissector_key_mpls mpls; + struct flow_dissector_key_tcp tcp; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -424,6 +425,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -596,6 +599,9 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)); + fl_set_key_val(tb, &key->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS, + &mask->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS_MASK, + sizeof(key->tcp.flags)); } else if (key->basic.ip_proto == IPPROTO_UDP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, @@ -766,6 +772,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_TCP, tcp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ICMP, icmp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -1215,7 +1223,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, - sizeof(key->tp.dst)))) + sizeof(key->tp.dst)) || + fl_dump_key_val(skb, &key->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS, + &mask->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS_MASK, + sizeof(key->tcp.flags)))) goto nla_put_failure; else if (key->basic.ip_proto == IPPROTO_UDP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, -- cgit v1.3-6-gb490 From 0be1b305d9b808e5b28e74f4ef807851c14c39f2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 25 May 2017 10:42:38 -0700 Subject: net: ipv4: add new RTM_F_FIB_MATCH flag for use with RTM_GETROUTE This flag when specified will return matched fib result in response to a RTM_GETROUTE query. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 6487b21b2c1e..564790e854f7 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -278,6 +278,7 @@ enum rt_scope_t { #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ #define RTM_F_PREFIX 0x800 /* Prefix addresses */ #define RTM_F_LOOKUP_TABLE 0x1000 /* set rtm_table to FIB lookup result */ +#define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */ /* Reserved table identifiers */ -- cgit v1.3-6-gb490 From 3d3ea5af5c0b382bc9d9aed378fd814fb5d4a011 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 27 May 2017 10:14:34 -0400 Subject: rtnl: Add support for netdev event to link messages When netdev events happen, a rtnetlink_event() handler will send messages for every event in it's white list. These messages contain current information about a particular device, but they do not include the iformation about which event just happened. So, it is impossible to tell what just happend for these events. This patch adds a new extension to RTM_NEWLINK message called IFLA_EVENT that would have an encoding of event that triggered this message. This would allow the the message consumer to easily determine if it needs to perform certain actions. Signed-off-by: Vladislav Yasevich Acked-by: David Ahern Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 3 +- include/uapi/linux/if_link.h | 11 ++++++++ net/core/dev.c | 2 +- net/core/rtnetlink.c | 65 ++++++++++++++++++++++++++++++++++++++------ 4 files changed, 70 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 57e54847b0b9..dea59c8eec54 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -18,7 +18,8 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned change, gfp_t flags); + unsigned change, u32 event, + gfp_t flags); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 15ac20382aba..8ed679fe603f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -157,6 +157,7 @@ enum { IFLA_GSO_MAX_SIZE, IFLA_PAD, IFLA_XDP, + IFLA_EVENT, __IFLA_MAX }; @@ -911,4 +912,14 @@ enum { #define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) +enum { + IFLA_EVENT_NONE, + IFLA_EVENT_REBOOT, /* internal reset / reboot */ + IFLA_EVENT_FEATURES, /* change in offload features */ + IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ + IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. arp/ndisc */ + IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ + IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ +}; + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 3d98fbf4cbb0..06e0a7492df8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7084,7 +7084,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL); /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 64953af4a3b1..9da53e43750c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -941,6 +941,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1282,9 +1283,40 @@ err_cancel: return err; } +static u32 rtnl_get_event(unsigned long event) +{ + u32 rtnl_event_type = IFLA_EVENT_NONE; + + switch (event) { + case NETDEV_REBOOT: + rtnl_event_type = IFLA_EVENT_REBOOT; + break; + case NETDEV_FEAT_CHANGE: + rtnl_event_type = IFLA_EVENT_FEATURES; + break; + case NETDEV_BONDING_FAILOVER: + rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; + break; + case NETDEV_NOTIFY_PEERS: + rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; + break; + case NETDEV_RESEND_IGMP: + rtnl_event_type = IFLA_EVENT_IGMP_RESEND; + break; + case NETDEV_CHANGEINFODATA: + rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; + break; + default: + break; + } + + return rtnl_event_type; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, u32 ext_filter_mask) + unsigned int flags, u32 ext_filter_mask, + u32 event) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1333,6 +1365,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; + if (event != IFLA_EVENT_NONE) { + if (nla_put_u32(skb, IFLA_EVENT, event)) + goto nla_put_failure; + } + if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; @@ -1467,6 +1504,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, + [IFLA_EVENT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1626,7 +1664,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask); + ext_filter_mask, 0); if (err < 0) { if (likely(skb->len)) @@ -2736,7 +2774,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2808,7 +2846,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned int change, gfp_t flags) + unsigned int change, + u32 event, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2819,7 +2858,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2840,18 +2879,25 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +static void rtmsg_ifinfo_event(int type, struct net_device *dev, + unsigned int change, u32 event, + gfp_t flags) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + rtmsg_ifinfo_event(type, dev, change, IFLA_EVENT_NONE, flags); +} EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4168,7 +4214,8 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_NOTIFY_PEERS: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); + rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), + GFP_KERNEL); break; default: break; -- cgit v1.3-6-gb490 From 4d80cc0aaaab9efac14c9d3d702b69961800de20 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 1 Jun 2017 21:37:38 +0300 Subject: net/sched: cls_flower: add support for matching on ip tos and ttl Benefit from the support of ip header fields dissection and allow users to set rules matching on ipv4 tos and ttl or ipv6 traffic-class and hoplimit. Signed-off-by: Or Gerlitz Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 5 +++++ net/sched/cls_flower.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c6e8cf5e9c40..edf43ddf47b0 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -454,6 +454,11 @@ enum { TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + TCA_FLOWER_KEY_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index fb74a47830f4..33feaee197cf 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -50,6 +50,7 @@ struct fl_flow_key { struct flow_dissector_key_ports enc_tp; struct flow_dissector_key_mpls mpls; struct flow_dissector_key_tcp tcp; + struct flow_dissector_key_ip ip; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -427,6 +428,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -528,6 +533,19 @@ static int fl_set_key_flags(struct nlattr **tb, return 0; } +static void fl_set_key_ip(struct nlattr **tb, + struct flow_dissector_key_ip *key, + struct flow_dissector_key_ip *mask) +{ + fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS, + &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK, + sizeof(key->tos)); + + fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, + &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK, + sizeof(key->ttl)); +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { @@ -570,6 +588,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); + fl_set_key_ip(tb, &key->ip, &mask->ip); } if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { @@ -772,6 +791,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_IP, ip); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_TCP, tcp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -1082,6 +1103,19 @@ static int fl_dump_key_mpls(struct sk_buff *skb, return 0; } +static int fl_dump_key_ip(struct sk_buff *skb, + struct flow_dissector_key_ip *key, + struct flow_dissector_key_ip *mask) +{ + if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos, + TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) || + fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl, + TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl))) + return -1; + + return 0; +} + static int fl_dump_key_vlan(struct sk_buff *skb, struct flow_dissector_key_vlan *vlan_key, struct flow_dissector_key_vlan *vlan_mask) @@ -1195,9 +1229,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && - fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, - sizeof(key->basic.ip_proto))) + sizeof(key->basic.ip_proto)) || + fl_dump_key_ip(skb, &key->ip, &mask->ip))) goto nla_put_failure; if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && -- cgit v1.3-6-gb490 From b7d3ed5be9bd7e0689eee0f0f36702937cd8f7c8 Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Fri, 2 Jun 2017 21:03:54 -0700 Subject: bpf: update perf event helper functions documentation This commit updates documentation of the bpf_perf_event_output and bpf_perf_event_read helpers to match their implementation. Signed-off-by: Teng Qin Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 11 +++++++---- tools/include/uapi/linux/bpf.h | 11 +++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 94dfa9def355..e78aece03628 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -313,8 +313,11 @@ union bpf_attr { * @flags: room for future extensions * Return: 0 on success or negative error * - * u64 bpf_perf_event_read(&map, index) - * Return: Number events read or error code + * u64 bpf_perf_event_read(map, flags) + * read perf event counter value + * @map: pointer to perf_event_array map + * @flags: index of event in the map or bitmask flags + * Return: value of perf event counter read or error code * * int bpf_redirect(ifindex, flags) * redirect to another netdev @@ -328,11 +331,11 @@ union bpf_attr { * @skb: pointer to skb * Return: realm if != 0 * - * int bpf_perf_event_output(ctx, map, index, data, size) + * int bpf_perf_event_output(ctx, map, flags, data, size) * output perf raw sample * @ctx: struct pt_regs* * @map: pointer to perf_event_array map - * @index: index of event in the map + * @flags: index of event in the map or bitmask flags * @data: data on stack to be output as raw data * @size: size of data * Return: 0 on success or negative error diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 94dfa9def355..e78aece03628 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -313,8 +313,11 @@ union bpf_attr { * @flags: room for future extensions * Return: 0 on success or negative error * - * u64 bpf_perf_event_read(&map, index) - * Return: Number events read or error code + * u64 bpf_perf_event_read(map, flags) + * read perf event counter value + * @map: pointer to perf_event_array map + * @flags: index of event in the map or bitmask flags + * Return: value of perf event counter read or error code * * int bpf_redirect(ifindex, flags) * redirect to another netdev @@ -328,11 +331,11 @@ union bpf_attr { * @skb: pointer to skb * Return: realm if != 0 * - * int bpf_perf_event_output(ctx, map, index, data, size) + * int bpf_perf_event_output(ctx, map, flags, data, size) * output perf raw sample * @ctx: struct pt_regs* * @map: pointer to perf_event_array map - * @index: index of event in the map + * @flags: index of event in the map or bitmask flags * @data: data on stack to be output as raw data * @size: size of data * Return: 0 on success or negative error -- cgit v1.3-6-gb490 From e25ea21ffa66a029acfa89d2611c0e7ef23e7d8c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 6 Jun 2017 14:12:02 +0200 Subject: net: sched: introduce a TRAP control action There is need to instruct the HW offloaded path to push certain matched packets to cpu/kernel for further analysis. So this patch introduces a new TRAP control action to TC. For kernel datapath, this action does not make much sense. So with the same logic as in HW, new TRAP behaves similar to STOLEN. The skb is just dropped in the datapath (and virtually ejected to an upper level, which does not exist in case of kernel). Signed-off-by: Jiri Pirko Reviewed-by: Yotam Gigi Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 7 +++++++ net/core/dev.c | 2 ++ net/sched/cls_bpf.c | 1 + net/sched/sch_atm.c | 1 + net/sched/sch_cbq.c | 1 + net/sched/sch_drr.c | 1 + net/sched/sch_dsmark.c | 1 + net/sched/sch_fq_codel.c | 1 + net/sched/sch_hfsc.c | 1 + net/sched/sch_htb.c | 1 + net/sched/sch_multiq.c | 1 + net/sched/sch_prio.c | 1 + net/sched/sch_qfq.c | 1 + net/sched/sch_sfb.c | 1 + net/sched/sch_sfq.c | 1 + 15 files changed, 22 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index edf43ddf47b0..2055783e6ee9 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -37,6 +37,13 @@ enum { #define TC_ACT_QUEUED 5 #define TC_ACT_REPEAT 6 #define TC_ACT_REDIRECT 7 +#define TC_ACT_TRAP 8 /* For hw path, this means "trap to cpu" + * and don't further process the frame + * in hardware. For sw path, this is + * equivalent of TC_ACT_STOLEN - drop + * the skb and act like everything + * is alright. + */ /* There is a special kind of actions called "extended actions", * which need a value parameter. These have a local opcode located in diff --git a/net/core/dev.c b/net/core/dev.c index 06e0a7492df8..8f72f4a9c6ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3269,6 +3269,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *ret = NET_XMIT_SUCCESS; consume_skb(skb); return NULL; @@ -4038,6 +4039,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: consume_skb(skb); return NULL; case TC_ACT_REDIRECT: diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5ebeae996e63..a9c56ad4533a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -70,6 +70,7 @@ static int cls_bpf_exec_opcode(int code) case TC_ACT_OK: case TC_ACT_SHOT: case TC_ACT_STOLEN: + case TC_ACT_TRAP: case TC_ACT_REDIRECT: case TC_ACT_UNSPEC: return code; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index f435546c3864..de162592eee0 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -406,6 +406,7 @@ done: switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 8dd6d0aca678..481036f6b54e 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -254,6 +254,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 5db2a2843c66..a413dc1c2098 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -339,6 +339,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 7ccdd825d34e..6d94fcc3592a 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -243,6 +243,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, #ifdef CONFIG_NET_CLS_ACT case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index f201e73947fb..337f2d6d81e4 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -103,6 +103,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return 0; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index a324f84b1ccd..b52f74610dc7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1155,6 +1155,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 195bbca9eb0b..203286ab4427 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -238,6 +238,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 604767482ad0..f143b7bbaa0d 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -52,6 +52,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index a2404688dd01..e3e364cc9a70 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -48,6 +48,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 076ad032befb..0e16dfda0bd7 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -726,6 +726,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 9756b1ccd345..11fb6ec878d6 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -266,6 +266,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return false; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 66dfd15b7946..f80ea2cc5f1f 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -187,6 +187,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return 0; -- cgit v1.3-6-gb490 From 34ad5580f8f9c86cb273ebea25c149613cd1667e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:48 -0700 Subject: bpf: Add BPF_(PROG|MAP)_GET_NEXT_ID command This patch adds BPF_PROG_GET_NEXT_ID and BPF_MAP_GET_NEXT_ID to allow userspace to iterate all bpf_prog IDs and bpf_map IDs. The API is trying to be consistent with the existing BPF_MAP_GET_NEXT_KEY. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/syscall.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e78aece03628..629747a3f273 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -82,6 +82,8 @@ enum bpf_cmd { BPF_PROG_ATTACH, BPF_PROG_DETACH, BPF_PROG_TEST_RUN, + BPF_PROG_GET_NEXT_ID, + BPF_MAP_GET_NEXT_ID, }; enum bpf_map_type { @@ -209,6 +211,11 @@ union bpf_attr { __u32 repeat; __u32 duration; } test; + + struct { /* anonymous struct used by BPF_*_GET_NEXT_ID */ + __u32 start_id; + __u32 next_id; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4c3075b5d840..2405feedb8c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -166,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map) void bpf_map_put(struct bpf_map *map) { if (atomic_dec_and_test(&map->refcnt)) { + /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); @@ -726,6 +727,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); + /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); @@ -1069,6 +1071,34 @@ static int bpf_prog_test_run(const union bpf_attr *attr, return ret; } +#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id + +static int bpf_obj_get_next_id(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct idr *idr, + spinlock_t *lock) +{ + u32 next_id = attr->start_id; + int err = 0; + + if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + next_id++; + spin_lock_bh(lock); + if (!idr_get_next(idr, &next_id)) + err = -ENOENT; + spin_unlock_bh(lock); + + if (!err) + err = put_user(next_id, &uattr->next_id); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1146,6 +1176,14 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; + case BPF_PROG_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &prog_idr, &prog_idr_lock); + break; + case BPF_MAP_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &map_idr, &map_idr_lock); + break; default: err = -EINVAL; break; -- cgit v1.3-6-gb490 From b16d9aa4c2b90af8d2c3201e245150f8c430c3bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:49 -0700 Subject: bpf: Add BPF_PROG_GET_FD_BY_ID Add BPF_PROG_GET_FD_BY_ID command to allow user to get a fd from a bpf_prog's ID. bpf_prog_inc_not_zero() is added and is called with prog_idr_lock held. __bpf_prog_put() is also added which has the 'bool do_idr_lock' param to decide if the prog_idr_lock should be acquired when freeing the prog->id. In the error path of bpf_prog_inc_not_zero(), it may have to call __bpf_prog_put(map, false) which does not need to take the prog_idr_lock when freeing the prog->id. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 +++-- kernel/bpf/syscall.c | 91 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 87 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 629747a3f273..d70cfed19d5e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,6 +84,7 @@ enum bpf_cmd { BPF_PROG_TEST_RUN, BPF_PROG_GET_NEXT_ID, BPF_MAP_GET_NEXT_ID, + BPF_PROG_GET_FD_BY_ID, }; enum bpf_map_type { @@ -212,8 +213,11 @@ union bpf_attr { __u32 duration; } test; - struct { /* anonymous struct used by BPF_*_GET_NEXT_ID */ - __u32 start_id; + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __u32 start_id; + __u32 prog_id; + }; __u32 next_id; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2405feedb8c1..dc6253bb8ebb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -703,15 +703,23 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog) +static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { /* cBPF to eBPF migrations are currently not in the idr store. */ if (!prog->aux->id) return; - spin_lock_bh(&prog_idr_lock); + if (do_idr_lock) + spin_lock_bh(&prog_idr_lock); + else + __acquire(&prog_idr_lock); + idr_remove(&prog_idr, prog->aux->id); - spin_unlock_bh(&prog_idr_lock); + + if (do_idr_lock) + spin_unlock_bh(&prog_idr_lock); + else + __release(&prog_idr_lock); } static void __bpf_prog_put_rcu(struct rcu_head *rcu) @@ -723,16 +731,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -void bpf_prog_put(struct bpf_prog *prog) +static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ - bpf_prog_free_id(prog); + bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } + +void bpf_prog_put(struct bpf_prog *prog) +{ + __bpf_prog_put(prog, true); +} EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) @@ -814,6 +827,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc); +/* prog_idr_lock should have been held */ +static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) +{ + int refold; + + refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_prog_put(prog, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + return prog; +} + static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); @@ -928,16 +959,21 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_used_maps; err = bpf_prog_new_fd(prog); - if (err < 0) - /* failed to allocate fd */ - goto free_id; + if (err < 0) { + /* failed to allocate fd. + * bpf_prog_put() is needed because the above + * bpf_prog_alloc_id() has published the prog + * to the userspace and the userspace may + * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. + */ + bpf_prog_put(prog); + return err; + } bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); return err; -free_id: - bpf_prog_free_id(prog); free_used_maps: free_used_maps(prog->aux); free_prog: @@ -1099,6 +1135,38 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&prog_idr_lock); + prog = idr_find(&prog_idr, id); + if (prog) + prog = bpf_prog_inc_not_zero(prog); + else + prog = ERR_PTR(-ENOENT); + spin_unlock_bh(&prog_idr_lock); + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fd = bpf_prog_new_fd(prog); + if (fd < 0) + bpf_prog_put(prog); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1184,6 +1252,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &map_idr, &map_idr_lock); break; + case BPF_PROG_GET_FD_BY_ID: + err = bpf_prog_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; -- cgit v1.3-6-gb490 From bd5f5f4ecb78e2698dad655645b6d6a2f7012a8c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:50 -0700 Subject: bpf: Add BPF_MAP_GET_FD_BY_ID Add BPF_MAP_GET_FD_BY_ID command to allow user to get a fd from a bpf_map's ID. bpf_map_inc_not_zero() is added and is called with map_idr_lock held. __bpf_map_put() is also added which has the 'bool do_idr_lock' param to decide if the map_idr_lock should be acquired when freeing the map->id. In the error path of bpf_map_inc_not_zero(), it may have to call __bpf_map_put(map, false) which does not need to take the map_idr_lock when freeing the map->id. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 + kernel/bpf/syscall.c | 95 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 87 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d70cfed19d5e..dd23f47ff00c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -85,6 +85,7 @@ enum bpf_cmd { BPF_PROG_GET_NEXT_ID, BPF_MAP_GET_NEXT_ID, BPF_PROG_GET_FD_BY_ID, + BPF_MAP_GET_FD_BY_ID, }; enum bpf_map_type { @@ -217,6 +218,7 @@ union bpf_attr { union { __u32 start_id; __u32 prog_id; + __u32 map_id; }; __u32 next_id; }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dc6253bb8ebb..1802bb9c47d9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -135,11 +135,19 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -static void bpf_map_free_id(struct bpf_map *map) +static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { - spin_lock_bh(&map_idr_lock); + if (do_idr_lock) + spin_lock_bh(&map_idr_lock); + else + __acquire(&map_idr_lock); + idr_remove(&map_idr, map->id); - spin_unlock_bh(&map_idr_lock); + + if (do_idr_lock) + spin_unlock_bh(&map_idr_lock); + else + __release(&map_idr_lock); } /* called from workqueue */ @@ -163,16 +171,21 @@ static void bpf_map_put_uref(struct bpf_map *map) /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ -void bpf_map_put(struct bpf_map *map) +static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) { if (atomic_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ - bpf_map_free_id(map); + bpf_map_free_id(map, do_idr_lock); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } } +void bpf_map_put(struct bpf_map *map) +{ + __bpf_map_put(map, true); +} + void bpf_map_put_with_uref(struct bpf_map *map) { bpf_map_put_uref(map); @@ -271,15 +284,20 @@ static int map_create(union bpf_attr *attr) goto free_map; err = bpf_map_new_fd(map); - if (err < 0) - /* failed to allocate fd */ - goto free_id; + if (err < 0) { + /* failed to allocate fd. + * bpf_map_put() is needed because the above + * bpf_map_alloc_id() has published the map + * to the userspace and the userspace may + * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. + */ + bpf_map_put(map); + return err; + } trace_bpf_map_create(map, err); return err; -free_id: - bpf_map_free_id(map); free_map: bpf_map_uncharge_memlock(map); free_map_nouncharge: @@ -331,6 +349,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } +/* map_idr_lock should have been held */ +static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, + bool uref) +{ + int refold; + + refold = __atomic_add_unless(&map->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_map_put(map, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + if (uref) + atomic_inc(&map->usercnt); + + return map; +} + int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -1167,6 +1207,38 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) return fd; } +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id + +static int bpf_map_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_map *map; + u32 id = attr->map_id; + int fd; + + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&map_idr_lock); + map = idr_find(&map_idr, id); + if (map) + map = bpf_map_inc_not_zero(map, true); + else + map = ERR_PTR(-ENOENT); + spin_unlock_bh(&map_idr_lock); + + if (IS_ERR(map)) + return PTR_ERR(map); + + fd = bpf_map_new_fd(map); + if (fd < 0) + bpf_map_put(map); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1255,6 +1327,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; + case BPF_MAP_GET_FD_BY_ID: + err = bpf_map_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; -- cgit v1.3-6-gb490 From 1e270976908686ec25fb91b8a34145be54137976 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:52 -0700 Subject: bpf: Add BPF_OBJ_GET_INFO_BY_FD A single BPF_OBJ_GET_INFO_BY_FD cmd is used to obtain the info for both bpf_prog and bpf_map. The kernel can figure out the fd is associated with a bpf_prog or bpf_map. The suggested struct bpf_prog_info and struct bpf_map_info are not meant to be a complete list and it is not the goal of this patch. New fields can be added in the future patch. The focus of this patch is to create the interface, BPF_OBJ_GET_INFO_BY_FD cmd for exposing the bpf_prog's and bpf_map's info. The obj's info, which will be extended (and get bigger) over time, is separated from the bpf_attr to avoid bloating the bpf_attr. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 2 - include/uapi/linux/bpf.h | 28 ++++++++ kernel/bpf/syscall.c | 163 ++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 174 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e2dddf21f3b..1fa26dc562ce 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -69,8 +69,6 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 -#define BPF_TAG_SIZE 8 - /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd23f47ff00c..9b2c10b45733 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -86,6 +86,7 @@ enum bpf_cmd { BPF_MAP_GET_NEXT_ID, BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, + BPF_OBJ_GET_INFO_BY_FD, }; enum bpf_map_type { @@ -222,6 +223,12 @@ union bpf_attr { }; __u32 next_id; }; + + struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ + __u32 bpf_fd; + __u32 info_len; + __aligned_u64 info; + } info; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -686,4 +693,25 @@ struct xdp_md { __u32 data_end; }; +#define BPF_TAG_SIZE 8 + +struct bpf_prog_info { + __u32 type; + __u32 id; + __u8 tag[BPF_TAG_SIZE]; + __u32 jited_prog_len; + __u32 xlated_prog_len; + __aligned_u64 jited_prog_insns; + __aligned_u64 xlated_prog_insns; +} __attribute__((aligned(8))); + +struct bpf_map_info { + __u32 type; + __u32 id; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1802bb9c47d9..8942c820d620 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1239,6 +1239,145 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } +static int check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) +{ + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + int err; + + if (actual_size <= expected_size) + return 0; + + addr = uaddr + expected_size; + end = uaddr + actual_size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + + return 0; +} + +static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_prog_info info = {}; + u32 info_len = attr->info.info_len; + char __user *uinsns; + u32 ulen; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + if (copy_from_user(&info, uinfo, info_len)) + return err; + + info.type = prog->type; + info.id = prog->aux->id; + + memcpy(info.tag, prog->tag, sizeof(prog->tag)); + + if (!capable(CAP_SYS_ADMIN)) { + info.jited_prog_len = 0; + info.xlated_prog_len = 0; + goto done; + } + + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } + + ulen = info.xlated_prog_len; + info.xlated_prog_len = bpf_prog_size(prog->len); + if (info.xlated_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.xlated_prog_insns); + ulen = min_t(u32, info.xlated_prog_len, ulen); + if (copy_to_user(uinsns, prog->insnsi, ulen)) + return -EFAULT; + } + +done: + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +static int bpf_map_get_info_by_fd(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_map_info info = {}; + u32 info_len = attr->info.info_len; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + info.type = map->map_type; + info.id = map->id; + info.key_size = map->key_size; + info.value_size = map->value_size; + info.max_entries = map->max_entries; + info.map_flags = map->map_flags; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info + +static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ufd = attr->info.bpf_fd; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) + return -EINVAL; + + f = fdget(ufd); + if (!f.file) + return -EBADFD; + + if (f.file->f_op == &bpf_prog_fops) + err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + uattr); + else if (f.file->f_op == &bpf_map_fops) + err = bpf_map_get_info_by_fd(f.file->private_data, attr, + uattr); + else + err = -EINVAL; + + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1258,23 +1397,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz * user-space does not rely on any kernel feature * extensions we dont know about yet. */ - if (size > sizeof(attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - err = get_user(val, addr); - if (err) - return err; - if (val) - return -E2BIG; - } - size = sizeof(attr); - } + err = check_uarg_tail_zero(uattr, sizeof(attr), size); + if (err) + return err; + size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ if (copy_from_user(&attr, uattr, size) != 0) @@ -1330,6 +1456,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_FD_BY_ID: err = bpf_map_get_fd_by_id(&attr); break; + case BPF_OBJ_GET_INFO_BY_FD: + err = bpf_obj_get_info_by_fd(&attr, uattr); + break; default: err = -EINVAL; break; -- cgit v1.3-6-gb490