Diffstat (limited to 'include/net')
337 files changed, 28864 insertions, 9022 deletions
diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index 03614de86942..60cad0d200a4 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * include/net/9p/9p.h
- *
  * 9P protocol definitions.
  *
  * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
@@ -32,18 +30,20 @@
  */
 
 enum p9_debug_flags {
-	P9_DEBUG_ERROR =	(1<<0),
-	P9_DEBUG_9P =		(1<<2),
+	P9_DEBUG_ERROR =	(1<<0),
+	P9_DEBUG_9P =		(1<<2),
 	P9_DEBUG_VFS =		(1<<3),
 	P9_DEBUG_CONV =		(1<<4),
 	P9_DEBUG_MUX =		(1<<5),
 	P9_DEBUG_TRANS =	(1<<6),
-	P9_DEBUG_SLABS =	(1<<7),
+	P9_DEBUG_SLABS =	(1<<7),
 	P9_DEBUG_FCALL =	(1<<8),
 	P9_DEBUG_FID =		(1<<9),
 	P9_DEBUG_PKT =		(1<<10),
 	P9_DEBUG_FSC =		(1<<11),
 	P9_DEBUG_VPKT =		(1<<12),
+	P9_DEBUG_CACHE =	(1<<13),
+	P9_DEBUG_MMAP =		(1<<14),
 };
 
 #ifdef CONFIG_NET_9P_DEBUG
@@ -215,6 +215,10 @@ enum p9_open_mode_t {
 	P9_ORCLOSE = 0x40,
 	P9_OAPPEND = 0x80,
 	P9_OEXCL = 0x1000,
+	P9L_MODE_MASK = 0x1FFF, /* don't send anything under this to server */
+	P9L_DIRECT = 0x2000, /* cache disabled */
+	P9L_NOWRITECACHE = 0x4000, /* no write caching */
+	P9L_LOOSE = 0x8000, /* loose cache */
 };
 
 /**
@@ -317,8 +321,8 @@ enum p9_qid_t {
 };
 
 /* 9P Magic Numbers */
-#define P9_NOTAG	(u16)(~0)
-#define P9_NOFID	(u32)(~0)
+#define P9_NOTAG	((u16)(~0))
+#define P9_NOFID	((u32)(~0))
 #define P9_MAXWELEM	16
 
 /* Minimal header size: size[4] type[1] tag[2] */
@@ -333,6 +337,9 @@ enum p9_qid_t {
 /* size of header for zero copy read/write */
 #define P9_ZC_HDR_SZ 4096
 
+/* maximum length of an error string */
+#define P9_ERRMAX 128
+
 /**
  * struct p9_qid - file system entity information
  * @type: 8-bit type &p9_qid_t
@@ -530,6 +537,7 @@ struct p9_rstatfs {
  * @offset: used by marshalling routines to track current position in buffer
  * @capacity: used by marshalling routines to track total malloc'd capacity
  * @sdata: payload
+ * @zc: whether zero-copy is used
  *
  * &p9_fcall represents the structure for all 9P RPC
  * transactions.  Requests are packaged into fcalls, and reponses
@@ -548,11 +556,10 @@ struct p9_fcall {
 	struct kmem_cache *cache;
 	u8 *sdata;
+	bool zc;
 };
 
 int p9_errstr2errno(char *errstr, int len);
 
 int p9_error_init(void);
-int p9_trans_fd_init(void);
-void p9_trans_fd_exit(void);
 #endif /* NET_9P_H */
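A note on the new P9L_* open-mode bits above: everything at or below P9L_MODE_MASK is wire protocol, while P9L_DIRECT, P9L_NOWRITECACHE and P9L_LOOSE are client-local cache hints. A minimal sketch (not part of the patch; the helper names p9l_wire_mode/p9l_cache_disabled are hypothetical) of the split the mask encodes:

	/* Only bits within P9L_MODE_MASK ever reach the server in TOPEN/TLOPEN;
	 * the P9L_* bits above the mask stay inside the client. */
	static inline int p9l_wire_mode(int mode)
	{
		return mode & P9L_MODE_MASK;
	}

	static inline bool p9l_cache_disabled(int mode)
	{
		return mode & P9L_DIRECT;	/* local-only cache hint */
	}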
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index dd5b5bd781a4..4f785098c67a 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * include/net/9p/client.h
- *
  * 9P Client Definitions
  *
  * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -13,6 +11,7 @@
 
 #include <linux/utsname.h>
 #include <linux/idr.h>
+#include <linux/tracepoint-defs.h>
 
 /* Number of requests per row */
 #define P9_ROW_MAXTAG 255
@@ -23,7 +22,7 @@
  * @p9_proto_2000L: 9P2000.L extension
  */
 
-enum p9_proto_versions{
+enum p9_proto_versions {
 	p9_proto_legacy,
 	p9_proto_2000u,
 	p9_proto_2000L,
@@ -78,7 +77,7 @@ enum p9_req_status_t {
 struct p9_req_t {
 	int status;
 	int t_err;
-	struct kref refcount;
+	refcount_t refcount;
 	wait_queue_head_t wq;
 	struct p9_fcall tc;
 	struct p9_fcall rc;
@@ -140,10 +139,16 @@ struct p9_client {
  *
  * TODO: This needs lots of explanation.
  */
+enum fid_source {
+	FID_FROM_OTHER,
+	FID_FROM_INODE,
+	FID_FROM_DENTRY,
+};
 
 struct p9_fid {
 	struct p9_client *clnt;
 	u32 fid;
+	refcount_t count;
 	int mode;
 	struct p9_qid qid;
 	u32 iounit;
@@ -152,6 +157,7 @@ struct p9_fid {
 	void *rdir;
 
 	struct hlist_node dlist; /* list of all fids attached to a dentry */
+	struct hlist_node ilist;
 };
 
 /**
@@ -201,6 +207,8 @@ int p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err);
 int p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
 		int *err);
 int p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err);
+struct netfs_io_subrequest;
+void p9_client_write_subreq(struct netfs_io_subrequest *subreq);
 int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset);
 int p9dirent_read(struct p9_client *clnt, char *buf, int len,
 		  struct p9_dirent *dirent);
@@ -212,36 +220,80 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid,
 							u64 request_mask);
 
 int p9_client_mknod_dotl(struct p9_fid *oldfid, const char *name, int mode,
-			 dev_t rdev, kgid_t gid, struct p9_qid *);
+			 dev_t rdev, kgid_t gid, struct p9_qid *qid);
 int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
-			 kgid_t gid, struct p9_qid *);
+			 kgid_t gid, struct p9_qid *qid);
 int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
 int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl);
 void p9_fcall_fini(struct p9_fcall *fc);
-struct p9_req_t *p9_tag_lookup(struct p9_client *, u16);
+struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag);
 
 static inline void p9_req_get(struct p9_req_t *r)
 {
-	kref_get(&r->refcount);
+	refcount_inc(&r->refcount);
 }
 
 static inline int p9_req_try_get(struct p9_req_t *r)
 {
-	return kref_get_unless_zero(&r->refcount);
+	return refcount_inc_not_zero(&r->refcount);
 }
 
-int p9_req_put(struct p9_req_t *r);
+int p9_req_put(struct p9_client *c, struct p9_req_t *r);
+
+/* We cannot have the real tracepoints in header files,
+ * use a wrapper function */
+DECLARE_TRACEPOINT(9p_fid_ref);
+void do_trace_9p_fid_get(struct p9_fid *fid);
+void do_trace_9p_fid_put(struct p9_fid *fid);
+
+/* fid reference counting helpers:
+ *  - fids used for any length of time should always be referenced through
+ *    p9_fid_get(), and released with p9_fid_put()
+ *  - v9fs_fid_lookup() or similar will automatically call get for you
+ *    and also require a put
+ *  - the *_fid_add() helpers will stash the fid in the inode,
+ *    at which point it is the responsibility of evict_inode()
+ *    to call the put
+ *  - the last put will automatically send a clunk to the server
+ */
+static inline struct p9_fid *p9_fid_get(struct p9_fid *fid)
+{
+	if (tracepoint_enabled(9p_fid_ref))
+		do_trace_9p_fid_get(fid);
+
+	refcount_inc(&fid->count);
+
+	return fid;
+}
+
+static inline int p9_fid_put(struct p9_fid *fid)
+{
+	if (!fid || IS_ERR(fid))
+		return 0;
+
+	if (tracepoint_enabled(9p_fid_ref))
+		do_trace_9p_fid_put(fid);
+
+	if (!refcount_dec_and_test(&fid->count))
+		return 0;
+
+	return p9_client_clunk(fid);
+}
 
 void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status);
 
-int p9_parse_header(struct p9_fcall *, int32_t *, int8_t *, int16_t *, int);
-int p9stat_read(struct p9_client *, char *, int, struct p9_wstat *);
-void p9stat_free(struct p9_wstat *);
+int p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type,
+		    int16_t *tag, int rewind);
+int p9stat_read(struct p9_client *clnt, char *buf, int len,
+		struct p9_wstat *st);
+void p9stat_free(struct p9_wstat *stbuf);
 
 int p9_is_proto_dotu(struct p9_client *clnt);
 int p9_is_proto_dotl(struct p9_client *clnt);
-struct p9_fid *p9_client_xattrwalk(struct p9_fid *, const char *, u64 *);
-int p9_client_xattrcreate(struct p9_fid *, const char *, u64, int);
+struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
+				   const char *attr_name, u64 *attr_size);
+int p9_client_xattrcreate(struct p9_fid *fid, const char *name,
+			  u64 attr_size, int flags);
 int p9_client_readlink(struct p9_fid *fid, char **target);
 int p9_client_init(void);
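The get/put pairing the in-header comment describes looks roughly like the sketch below in a caller. This is illustrative only (do_something_with() is a hypothetical helper); v9fs_fid_lookup() hands back a referenced fid, and the final p9_fid_put() sends the clunk:

	static int example_use_fid(struct dentry *dentry)
	{
		struct p9_fid *fid = v9fs_fid_lookup(dentry); /* takes a ref */
		int err;

		if (IS_ERR(fid))
			return PTR_ERR(fid);

		err = do_something_with(fid);	/* hypothetical work */

		p9_fid_put(fid);	/* last put auto-clunks on the server */
		return err;
	}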
diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h
index 3eb4261b2958..766ec07c9599 100644
--- a/include/net/9p/transport.h
+++ b/include/net/9p/transport.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * include/net/9p/transport.h
- *
  * Transport Definition
  *
  * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
@@ -11,6 +9,8 @@
 #ifndef NET_9P_TRANSPORT_H
 #define NET_9P_TRANSPORT_H
 
+#include <linux/module.h>
+
 #define P9_DEF_MIN_RESVPORT	(665U)
 #define P9_DEF_MAX_RESVPORT	(1023U)
 
@@ -19,6 +19,10 @@
  * @list: used to maintain a list of currently available transports
  * @name: the human-readable name of the transport
  * @maxsize: transport provided maximum packet size
+ * @pooled_rbuffers: currently only set for RDMA transport which pulls the
+ *                   response buffers from a shared pool, and accordingly
+ *                   we're less flexible when choosing the response message
+ *                   size in this case
  * @def: set if this transport should be considered the default
  * @create: member function to create a new connection on this transport
  * @close: member function to discard a connection on this transport
@@ -38,21 +42,28 @@ struct p9_trans_module {
 	struct list_head list;
 	char *name;		/* name of transport */
 	int maxsize;		/* max message size of transport */
+	bool pooled_rbuffers;
 	int def;		/* this transport should be default */
 	struct module *owner;
-	int (*create)(struct p9_client *, const char *, char *);
-	void (*close) (struct p9_client *);
-	int (*request) (struct p9_client *, struct p9_req_t *req);
-	int (*cancel) (struct p9_client *, struct p9_req_t *req);
-	int (*cancelled)(struct p9_client *, struct p9_req_t *req);
-	int (*zc_request)(struct p9_client *, struct p9_req_t *,
-			  struct iov_iter *, struct iov_iter *, int , int, int);
-	int (*show_options)(struct seq_file *, struct p9_client *);
+	int (*create)(struct p9_client *client,
+		      const char *devname, char *args);
+	void (*close)(struct p9_client *client);
+	int (*request)(struct p9_client *client, struct p9_req_t *req);
+	int (*cancel)(struct p9_client *client, struct p9_req_t *req);
+	int (*cancelled)(struct p9_client *client, struct p9_req_t *req);
+	int (*zc_request)(struct p9_client *client, struct p9_req_t *req,
+			  struct iov_iter *uidata, struct iov_iter *uodata,
+			  int inlen, int outlen, int in_hdr_len);
+	int (*show_options)(struct seq_file *m, struct p9_client *client);
 };
 
 void v9fs_register_trans(struct p9_trans_module *m);
 void v9fs_unregister_trans(struct p9_trans_module *m);
-struct p9_trans_module *v9fs_get_trans_by_name(char *s);
+struct p9_trans_module *v9fs_get_trans_by_name(const char *s);
 struct p9_trans_module *v9fs_get_default_trans(void);
 void v9fs_put_trans(struct p9_trans_module *m);
+
+#define MODULE_ALIAS_9P(transport) \
+	MODULE_ALIAS("9p-" transport)
+
 #endif /* NET_9P_TRANSPORT_H */
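For context, a transport module now fills in the named-parameter ops table above and registers itself; the new MODULE_ALIAS_9P() lets the core autoload it from the mount option trans=<name>. A skeleton under assumed names (the "example" transport and my_* callbacks are placeholders, not code from this patch):

	static struct p9_trans_module p9_example_trans = {
		.name		= "example",
		.owner		= THIS_MODULE,
		.maxsize	= 64 * 1024,
		.pooled_rbuffers = false,	/* only RDMA pools response buffers */
		.create		= my_create,
		.close		= my_close,
		.request	= my_request,
		.cancel		= my_cancel,
	};

	static int __init p9_example_init(void)
	{
		v9fs_register_trans(&p9_example_trans);
		return 0;
	}
	module_init(p9_example_init);
	MODULE_ALIAS_9P("example");	/* enables trans=example autoloading */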
the name "eth[0123...]". */ -struct net_device *hp100_probe(int unit); struct net_device *ultra_probe(int unit); struct net_device *wd_probe(int unit); struct net_device *ne_probe(int unit); -struct net_device *fmv18x_probe(int unit); -struct net_device *i82596_probe(int unit); -struct net_device *ni65_probe(int unit); -struct net_device *sonic_probe(int unit); struct net_device *smc_init(int unit); -struct net_device *atarilance_probe(int unit); -struct net_device *sun3lance_probe(int unit); -struct net_device *sun3_82586_probe(int unit); -struct net_device *apne_probe(int unit); struct net_device *cs89x0_probe(int unit); -struct net_device *mvme147lance_probe(int unit); struct net_device *tc515_probe(int unit); struct net_device *lance_probe(int unit); -struct net_device *cops_probe(int unit); -struct net_device *ltpc_probe(void); - -/* Fibre Channel adapters */ -int iph5526_probe(struct net_device *dev); - -/* SBNI adapters */ -int sbni_probe(int unit); diff --git a/include/net/act_api.h b/include/net/act_api.h index 87214927314a..404df8557f6a 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -7,6 +7,7 @@ */ #include <linux/refcount.h> +#include <net/flow_offload.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/net_namespace.h> @@ -30,20 +31,21 @@ struct tc_action { atomic_t tcfa_bindcnt; int tcfa_action; struct tcf_t tcfa_tm; - struct gnet_stats_basic_packed tcfa_bstats; - struct gnet_stats_basic_packed tcfa_bstats_hw; + struct gnet_stats_basic_sync tcfa_bstats; + struct gnet_stats_basic_sync tcfa_bstats_hw; struct gnet_stats_queue tcfa_qstats; struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; - struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw; + struct gnet_stats_basic_sync __percpu *cpu_bstats; + struct gnet_stats_basic_sync __percpu *cpu_bstats_hw; struct gnet_stats_queue __percpu *cpu_qstats; - struct tc_cookie __rcu *act_cookie; + struct tc_cookie __rcu *user_cookie; struct tcf_chain __rcu *goto_chain; u32 tcfa_flags; u8 hw_stats; u8 used_hw_stats; bool used_hw_stats_valid; + u32 in_hw_count; }; #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt @@ -58,6 +60,15 @@ struct tc_action { #define TCA_ACT_HW_STATS_ANY (TCA_ACT_HW_STATS_IMMEDIATE | \ TCA_ACT_HW_STATS_DELAYED) +/* Reserve 16 bits for user-space. See TCA_ACT_FLAGS_NO_PERCPU_STATS. */ +#define TCA_ACT_FLAGS_USER_BITS 16 +#define TCA_ACT_FLAGS_USER_MASK 0xffff +#define TCA_ACT_FLAGS_POLICE (1U << TCA_ACT_FLAGS_USER_BITS) +#define TCA_ACT_FLAGS_BIND (1U << (TCA_ACT_FLAGS_USER_BITS + 1)) +#define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2)) +#define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3)) +#define TCA_ACT_FLAGS_AT_INGRESS (1U << (TCA_ACT_FLAGS_USER_BITS + 4)) + /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. 
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 87214927314a..404df8557f6a 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -7,6 +7,7 @@
  */
 
 #include <linux/refcount.h>
+#include <net/flow_offload.h>
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 #include <net/net_namespace.h>
@@ -30,20 +31,21 @@ struct tc_action {
 	atomic_t			tcfa_bindcnt;
 	int				tcfa_action;
 	struct tcf_t			tcfa_tm;
-	struct gnet_stats_basic_packed	tcfa_bstats;
-	struct gnet_stats_basic_packed	tcfa_bstats_hw;
+	struct gnet_stats_basic_sync	tcfa_bstats;
+	struct gnet_stats_basic_sync	tcfa_bstats_hw;
 	struct gnet_stats_queue		tcfa_qstats;
 	struct net_rate_estimator __rcu *tcfa_rate_est;
 	spinlock_t			tcfa_lock;
-	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
-	struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw;
+	struct gnet_stats_basic_sync __percpu *cpu_bstats;
+	struct gnet_stats_basic_sync __percpu *cpu_bstats_hw;
 	struct gnet_stats_queue __percpu *cpu_qstats;
-	struct tc_cookie	__rcu *act_cookie;
+	struct tc_cookie	__rcu *user_cookie;
 	struct tcf_chain	__rcu *goto_chain;
 	u32			tcfa_flags;
 	u8			hw_stats;
 	u8			used_hw_stats;
 	bool			used_hw_stats_valid;
+	u32			in_hw_count;
 };
 #define tcf_index	common.tcfa_index
 #define tcf_refcnt	common.tcfa_refcnt
@@ -58,6 +60,15 @@ struct tc_action {
 #define TCA_ACT_HW_STATS_ANY (TCA_ACT_HW_STATS_IMMEDIATE | \
 			      TCA_ACT_HW_STATS_DELAYED)
 
+/* Reserve 16 bits for user-space. See TCA_ACT_FLAGS_NO_PERCPU_STATS. */
+#define TCA_ACT_FLAGS_USER_BITS 16
+#define TCA_ACT_FLAGS_USER_MASK 0xffff
+#define TCA_ACT_FLAGS_POLICE	(1U << TCA_ACT_FLAGS_USER_BITS)
+#define TCA_ACT_FLAGS_BIND	(1U << (TCA_ACT_FLAGS_USER_BITS + 1))
+#define TCA_ACT_FLAGS_REPLACE	(1U << (TCA_ACT_FLAGS_USER_BITS + 2))
+#define TCA_ACT_FLAGS_NO_RTNL	(1U << (TCA_ACT_FLAGS_USER_BITS + 3))
+#define TCA_ACT_FLAGS_AT_INGRESS	(1U << (TCA_ACT_FLAGS_USER_BITS + 4))
+
 /* Update lastuse only if needed, to avoid dirtying a cache line.
  * We use a temp variable to avoid fetching jiffies twice.
  */
@@ -80,10 +91,15 @@ static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm)
 	dtm->expires = jiffies_to_clock_t(stm->expires);
 }
 
-#ifdef CONFIG_NET_CLS_ACT
+static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats)
+{
+	if (WARN_ON_ONCE(hw_stats > TCA_ACT_HW_STATS_ANY))
+		return FLOW_ACTION_HW_STATS_DONT_CARE;
+	else if (!hw_stats)
+		return FLOW_ACTION_HW_STATS_DISABLED;
 
-#define ACT_P_CREATED 1
-#define ACT_P_DELETED 1
+	return hw_stats;
+}
 
 typedef void (*tc_action_priv_destructor)(void *priv);
 
@@ -91,6 +107,7 @@ struct tc_action_ops {
 	struct list_head head;
 	char    kind[IFNAMSIZ];
 	enum tca_id  id; /* identifier should match kind */
+	unsigned int	net_id;
 	size_t	size;
 	struct module		*owner;
 	int     (*act)(struct sk_buff *, const struct tc_action *,
@@ -99,8 +116,8 @@ struct tc_action_ops {
 	void	(*cleanup)(struct tc_action *);
 	int     (*lookup)(struct net *net, struct tc_action **a, u32 index);
 	int     (*init)(struct net *net, struct nlattr *nla,
-			struct nlattr *est, struct tc_action **act, int ovr,
-			int bind, bool rtnl_held, struct tcf_proto *tp,
+			struct nlattr *est, struct tc_action **act,
+			struct tcf_proto *tp,
 			u32 flags, struct netlink_ext_ack *extack);
 	int     (*walk)(struct net *, struct sk_buff *,
 			struct netlink_callback *, int,
@@ -113,8 +130,17 @@ struct tc_action_ops {
 	struct psample_group *
 	(*get_psample_group)(const struct tc_action *a,
 			     tc_action_priv_destructor *destructor);
+	int     (*offload_act_setup)(struct tc_action *act, void *entry_data,
+				     u32 *index_inc, bool bind,
+				     struct netlink_ext_ack *extack);
 };
 
+#ifdef CONFIG_NET_CLS_ACT
+
+#define ACT_P_BOUND 0
+#define ACT_P_CREATED 1
+#define ACT_P_DELETED 1
+
 struct tc_action_net {
 	struct tcf_idrinfo *idrinfo;
 	const struct tc_action_ops *ops;
@@ -166,41 +192,39 @@ int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index,
 			      struct nlattr *est, struct tc_action **a,
 			      const struct tc_action_ops *ops, int bind,
 			      u32 flags);
+void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]);
 void tcf_idr_cleanup(struct tc_action_net *tn, u32 index);
 int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
 			struct tc_action **a, int bind);
-int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
-
-static inline int tcf_idr_release(struct tc_action *a, bool bind)
-{
-	return __tcf_idr_release(a, bind, false);
-}
+int tcf_idr_release(struct tc_action *a, bool bind);
 
 int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops);
 int tcf_unregister_action(struct tc_action_ops *a,
 			  struct pernet_operations *ops);
+#define NET_ACT_ALIAS_PREFIX "net-act-"
+#define MODULE_ALIAS_NET_ACT(kind) MODULE_ALIAS(NET_ACT_ALIAS_PREFIX kind)
 int tcf_action_destroy(struct tc_action *actions[], int bind);
 int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
 		    int nr_actions, struct tcf_result *res);
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
-		    struct nlattr *est, char *name, int ovr, int bind,
-		    struct tc_action *actions[], size_t *attr_size,
-		    bool rtnl_held, struct netlink_ext_ack *extack);
+		    struct nlattr *est,
+		    struct tc_action *actions[], int init_res[], size_t *attr_size,
+		    u32 flags, u32 fl_flags, struct netlink_ext_ack *extack);
+struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags,
+					 struct netlink_ext_ack *extack);
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				    struct nlattr *nla, struct nlattr *est,
-				    char *name, int ovr, int bind,
-				    bool rtnl_held,
-				    struct netlink_ext_ack *extack);
+				    struct tc_action_ops *a_o, int *init_res,
+				    u32 flags, struct netlink_ext_ack *extack);
 int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind,
 		    int ref, bool terse);
 int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
-int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 
 static inline void tcf_action_update_bstats(struct tc_action *a,
 					    struct sk_buff *skb)
 {
 	if (likely(a->cpu_bstats)) {
-		bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+		bstats_update(this_cpu_ptr(a->cpu_bstats), skb);
 		return;
 	}
 	spin_lock(&a->tcfa_lock);
@@ -234,11 +258,28 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets,
 			     u64 drops, bool hw);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
 
+int tcf_action_update_hw_stats(struct tc_action *action);
+int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb,
+			    void *cb_priv, bool add);
 int tcf_action_check_ctrlact(int action, struct tcf_proto *tp,
 			     struct tcf_chain **handle,
 			     struct netlink_ext_ack *newchain);
 struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
 					 struct tcf_chain *newchain);
+
+#ifdef CONFIG_INET
+DECLARE_STATIC_KEY_FALSE(tcf_frag_xmit_count);
+#endif
+
+int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb));
+
+#else /* !CONFIG_NET_CLS_ACT */
+
+static inline int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb,
+					  void *cb_priv, bool add) {
+	return 0;
+}
+
 #endif /* CONFIG_NET_CLS_ACT */
 
 static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
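The new TCA_ACT_FLAGS_* defines fold what used to be separate ovr/bind/rtnl_held parameters into one u32: the low 16 bits mirror the user-visible netlink flags, and kernel-internal bits start at bit 16. A tiny sketch of that split (illustrative, not code from the patch):

	static bool example_is_bind(u32 flags)
	{
		/* low 16 bits came from user-space and may be echoed back */
		u32 user_flags = flags & TCA_ACT_FLAGS_USER_MASK;

		(void)user_flags;
		/* bits from TCA_ACT_FLAGS_USER_BITS upward are kernel-only */
		return flags & TCA_ACT_FLAGS_BIND;
	}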
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 18f783dcd55f..9e5e95988b9e 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -6,8 +6,11 @@
 #define RTR_SOLICITATION_INTERVAL	(4*HZ)
 #define RTR_SOLICITATION_MAX_INTERVAL	(3600*HZ)	/* 1 hour */
 
-#define TEMP_VALID_LIFETIME		(7*86400)
-#define TEMP_PREFERRED_LIFETIME		(86400)
+#define MIN_VALID_LIFETIME		(2*3600)	/* 2 hours */
+
+#define TEMP_VALID_LIFETIME		(7*86400)	/* 1 week */
+#define TEMP_PREFERRED_LIFETIME		(86400)		/* 24 hours */
+#define REGEN_MIN_ADVANCE		(2)		/* 2 seconds */
 #define REGEN_MAX_RETRY			(3)
 #define MAX_DESYNC_FACTOR		(600)
 
@@ -29,17 +32,26 @@ struct prefix_info {
 	__u8			length;
 	__u8			prefix_len;
 
+	union __packed {
+		__u8		flags;
+		struct __packed {
 #if defined(__BIG_ENDIAN_BITFIELD)
-	__u8			onlink : 1,
-			 	autoconf : 1,
-				reserved : 6;
+	__u8			onlink : 1,
+				autoconf : 1,
+				routeraddr : 1,
+				preferpd : 1,
+				reserved : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
-	__u8			reserved : 6,
+	__u8			reserved : 4,
+				preferpd : 1,
+				routeraddr : 1,
 				autoconf : 1,
 				onlink : 1;
 #else
 #error "Please fix <asm/byteorder.h>"
 #endif
+		};
+	};
 	__be32			valid;
 	__be32			prefered;
 	__be32			reserved2;
@@ -47,6 +59,9 @@ struct prefix_info {
 	struct in6_addr		prefix;
 };
 
+/* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */
+static_assert(sizeof(struct prefix_info) == 32);
+
 #include <linux/ipv6.h>
 #include <linux/netdevice.h>
 #include <net/if_inet6.h>
@@ -62,6 +77,8 @@ struct ifa6_config {
 	const struct in6_addr	*pfx;
 	unsigned int		plen;
 
+	u8			ifa_proto;
+
 	const struct in6_addr	*peer_pfx;
 
 	u32			rt_priority;
@@ -71,6 +88,23 @@ struct ifa6_config {
 	u16			scope;
 };
 
+enum addr_type_t {
+	UNICAST_ADDR,
+	MULTICAST_ADDR,
+	ANYCAST_ADDR,
+};
+
+struct inet6_fill_args {
+	u32 portid;
+	u32 seq;
+	int event;
+	unsigned int flags;
+	int netnsid;
+	int ifindex;
+	enum addr_type_t type;
+	bool force_rt_scope_universe;
+};
+
 int addrconf_init(void);
 void addrconf_cleanup(void);
 
@@ -107,8 +141,6 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net,
 int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
 		       const struct in6_addr *daddr, unsigned int srcprefs,
 		       struct in6_addr *saddr);
-int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
-		      u32 banned_flags);
 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
 		    u32 banned_flags);
 bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
@@ -172,10 +204,12 @@ static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
 	return 0;
 }
 
+#define INFINITY_LIFE_TIME	0xFFFFFFFF
+
 static inline unsigned long addrconf_timeout_fixup(u32 timeout,
 						   unsigned int unit)
 {
-	if (timeout == 0xffffffff)
+	if (timeout == INFINITY_LIFE_TIME)
 		return ~0UL;
 
 	/*
@@ -221,7 +255,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
 void __ipv6_sock_mc_close(struct sock *sk);
 void ipv6_sock_mc_close(struct sock *sk);
-bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
+bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
 		    const struct in6_addr *src_addr);
 
 int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr);
@@ -233,7 +267,6 @@ void ipv6_mc_unmap(struct inet6_dev *idev);
 void ipv6_mc_remap(struct inet6_dev *idev);
 void ipv6_mc_init_dev(struct inet6_dev *idev);
 void ipv6_mc_destroy_dev(struct inet6_dev *idev);
-int ipv6_mc_check_icmpv6(struct sk_buff *skb);
 int ipv6_mc_check_mld(struct sk_buff *skb);
 void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp);
 
@@ -314,10 +347,15 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev)
 	return rcu_dereference_rtnl(dev->ip6_ptr);
 }
 
+static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev)
+{
+	return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr);
+}
+
 /**
  * __in6_dev_stats_get - get inet6_dev pointer for stats
  * @dev: network device
- * @skb: skb for original incoming interface if neeeded
+ * @skb: skb for original incoming interface if needed
  *
  * Caller must hold rcu_read_lock or RTNL, because this function
  * does not take a reference on the inet6_dev.
@@ -404,7 +442,10 @@ static inline bool ip6_ignore_linkdown(const struct net_device *dev)
 {
 	const struct inet6_dev *idev = __in6_dev_get(dev);
 
-	return !!idev->cnf.ignore_routes_with_linkdown;
+	if (unlikely(!idev))
+		return true;
+
+	return !!READ_ONCE(idev->cnf.ignore_routes_with_linkdown);
 }
 
 void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp);
@@ -425,6 +466,10 @@ static inline void in6_ifa_hold(struct inet6_ifaddr *ifp)
 	refcount_inc(&ifp->refcnt);
 }
 
+static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp)
+{
+	return refcount_inc_not_zero(&ifp->refcnt);
+}
 
 /*
  *	compute link-local solicited-node multicast address
 */
@@ -502,4 +547,11 @@ int if6_proc_init(void);
 void if6_proc_exit(void);
 #endif
 
+int inet6_fill_ifmcaddr(struct sk_buff *skb,
+			const struct ifmcaddr6 *ifmca,
+			struct inet6_fill_args *args);
+
+int inet6_fill_ifacaddr(struct sk_buff *skb,
+			const struct ifacaddr6 *ifaca,
+			struct inet6_fill_args *args);
 #endif
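The "compute link-local solicited-node multicast address" comment above sits over a helper whose body is elided from this hunk. Per RFC 4291, that address is ff02::1:ff00:0/104 with the low 24 bits of the unicast address appended; a sketch of the computation (hedged reconstruction, using the existing ipv6_addr_set() helper, not lines from this diff):

	static inline void addrconf_addr_solict_mult(const struct in6_addr *addr,
						     struct in6_addr *solicited)
	{
		/* ff02::1:ffXX:XXXX, keeping addr's low 24 bits */
		ipv6_addr_set(solicited,
			      htonl(0xFF020000), 0,
			      htonl(0x1),
			      htonl(0xFF000000) | addr->s6_addr32[3]);
	}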
diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index f6abcc0bbd6e..0fb4c41c9bbf 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -15,6 +15,9 @@ struct key;
 struct sock;
 struct socket;
 struct rxrpc_call;
+struct rxrpc_peer;
+struct krb5_buffer;
+enum rxrpc_abort_reason;
 
 enum rxrpc_interruptibility {
 	RXRPC_INTERRUPTIBLE,	/* Call is interruptible */
@@ -22,56 +25,91 @@ enum rxrpc_interruptibility {
 	RXRPC_UNINTERRUPTIBLE,	/* Call should not be interruptible at all */
 };
 
+enum rxrpc_oob_type {
+	RXRPC_OOB_CHALLENGE,	/* Security challenge for a connection */
+};
+
 /*
  * Debug ID counter for tracing.
  */
 extern atomic_t rxrpc_debug_id;
 
+/*
+ * Operations table for rxrpc to call out to a kernel application (e.g. kAFS).
+ */
+struct rxrpc_kernel_ops {
+	void (*notify_new_call)(struct sock *sk, struct rxrpc_call *call,
+				unsigned long user_call_ID);
+	void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID);
+	void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID);
+	void (*notify_oob)(struct sock *sk, struct sk_buff *oob);
+};
+
 typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *,
 				  unsigned long);
 typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *,
 				      unsigned long);
-typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *,
-					unsigned long);
-typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long);
-typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long);
 
-void rxrpc_kernel_new_call_notification(struct socket *,
-					rxrpc_notify_new_call_t,
-					rxrpc_discard_new_call_t);
-struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *,
-					   struct sockaddr_rxrpc *,
-					   struct key *,
-					   unsigned long,
-					   s64,
-					   gfp_t,
-					   rxrpc_notify_rx_t,
-					   bool,
-					   enum rxrpc_interruptibility,
-					   unsigned int);
+void rxrpc_kernel_set_notifications(struct socket *sock,
+				    const struct rxrpc_kernel_ops *app_ops);
+struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+					   struct rxrpc_peer *peer,
+					   struct key *key,
+					   unsigned long user_call_ID,
+					   s64 tx_total_len,
+					   u32 hard_timeout,
+					   gfp_t gfp,
+					   rxrpc_notify_rx_t notify_rx,
+					   u16 service_id,
+					   bool upgrade,
+					   enum rxrpc_interruptibility interruptibility,
+					   unsigned int debug_id);
 int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *,
 			   struct msghdr *, size_t,
 			   rxrpc_notify_end_tx_t);
 int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *,
-			   struct iov_iter *, bool, u32 *, u16 *);
+			   struct iov_iter *, size_t *, bool, u32 *, u16 *);
 bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *,
-			     u32, int, const char *);
-void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *);
-void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *,
-			   struct sockaddr_rxrpc *);
-bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *);
-int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t,
-			       rxrpc_user_attach_call_t, unsigned long, gfp_t,
-			       unsigned int);
+			     u32, int, enum rxrpc_abort_reason);
+void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call);
+void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call);
+struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock,
+					    struct sockaddr_rxrpc *srx, gfp_t gfp);
+void rxrpc_kernel_put_peer(struct rxrpc_peer *peer);
+struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer);
+struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call);
+const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer);
+const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer);
+unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data);
+unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer);
+unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *);
+int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx,
+			       unsigned long user_call_ID, gfp_t gfp,
+			       unsigned int debug_id);
 void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64);
 bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *);
-u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *);
-bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *,
-				 ktime_t *);
-bool rxrpc_kernel_call_is_complete(struct rxrpc_call *);
-void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *,
-			       unsigned long);
 int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val);
+int rxrpc_sock_set_security_keyring(struct sock *, struct key *);
+int rxrpc_sock_set_manage_response(struct sock *sk, bool set);
+
+enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob,
+					   struct rxrpc_peer **_peer,
+					   unsigned long *_peer_appdata);
+struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock,
+					 enum rxrpc_oob_type *_type);
+void rxrpc_kernel_free_oob(struct sk_buff *oob);
+void rxrpc_kernel_query_challenge(struct sk_buff *challenge,
+				  struct rxrpc_peer **_peer,
+				  unsigned long *_peer_appdata,
+				  u16 *_service_id, u8 *_security_index);
+int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code,
+				  int error, enum rxrpc_abort_reason why);
+int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge);
+u32 rxgk_kernel_query_challenge(struct sk_buff *challenge);
+int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge,
+				     struct krb5_buffer *appdata);
+u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call,
+				    u16 *_service_id, u32 *_enctype);
 
 #endif /* _NET_RXRPC_H */
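Peers are now looked up separately and passed to rxrpc_kernel_begin_call(), which also takes the service ID and a hard timeout directly. A hedged sketch of a kAFS-style caller using only the declarations above (example_begin/example_notify_rx/EXAMPLE_SERVICE_ID are placeholders, and the assumption that the call keeps its own peer reference is marked):

	static struct rxrpc_call *example_begin(struct socket *sock,
						struct sockaddr_rxrpc *srx,
						struct key *key)
	{
		struct rxrpc_peer *peer;
		struct rxrpc_call *call;

		peer = rxrpc_kernel_lookup_peer(sock, srx, GFP_KERNEL);
		if (!peer)
			return ERR_PTR(-ENOMEM);

		call = rxrpc_kernel_begin_call(sock, peer, key,
					       0,	/* user_call_ID */
					       -1,	/* tx_total_len unset */
					       0,	/* no hard timeout */
					       GFP_KERNEL,
					       example_notify_rx, /* placeholder */
					       EXAMPLE_SERVICE_ID, /* placeholder */
					       false,	/* no upgrade */
					       RXRPC_INTERRUPTIBLE,
					       atomic_inc_return(&rxrpc_debug_id));
		/* drop the lookup ref; assumed: the call holds its own */
		rxrpc_kernel_put_peer(peer);
		return call;
	}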
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index f42fdddecd41..1af1841b7601 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -2,57 +2,36 @@
 #ifndef __LINUX_NET_AFUNIX_H
 #define __LINUX_NET_AFUNIX_H
 
-#include <linux/socket.h>
-#include <linux/un.h>
+#include <linux/atomic.h>
 #include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/path.h>
 #include <linux/refcount.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
 #include <net/sock.h>
+#include <uapi/linux/un.h>
 
-void unix_inflight(struct user_struct *user, struct file *fp);
-void unix_notinflight(struct user_struct *user, struct file *fp);
-void unix_destruct_scm(struct sk_buff *skb);
-void unix_gc(void);
-void wait_for_unix_gc(void);
-struct sock *unix_get_socket(struct file *filp);
-struct sock *unix_peer_get(struct sock *sk);
-
-#define UNIX_HASH_SIZE 256
-#define UNIX_HASH_BITS 8
-
-extern unsigned int unix_tot_inflight;
-extern spinlock_t unix_table_lock;
-extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
+#if IS_ENABLED(CONFIG_UNIX)
+struct unix_sock *unix_get_socket(struct file *filp);
+#else
+static inline struct unix_sock *unix_get_socket(struct file *filp)
+{
+	return NULL;
+}
+#endif
 
 struct unix_address {
 	refcount_t	refcnt;
 	int		len;
-	unsigned int	hash;
 	struct sockaddr_un name[];
 };
 
-struct unix_skb_parms {
-	struct pid		*pid;		/* Skb credentials	*/
-	kuid_t			uid;
-	kgid_t			gid;
-	struct scm_fp_list	*fp;		/* Passed files		*/
-#ifdef CONFIG_SECURITY_NETWORK
-	u32			secid;		/* Security ID		*/
-#endif
-	u32			consumed;
-} __randomize_layout;
-
 struct scm_stat {
 	atomic_t nr_fds;
+	unsigned long nr_unix_fds;
 };
 
-#define UNIXCB(skb)	(*(struct unix_skb_parms *)&((skb)->cb))
-
-#define unix_state_lock(s)	spin_lock(&unix_sk(s)->lock)
-#define unix_state_unlock(s)	spin_unlock(&unix_sk(s)->lock)
-#define unix_state_lock_nested(s) \
-				spin_lock_nested(&unix_sk(s)->lock, \
-				SINGLE_DEPTH_NESTING)
-
 /* The AF_UNIX socket */
 struct unix_sock {
 	/* WARNING: sk has to be the first member */
@@ -61,32 +40,22 @@ struct unix_sock {
 	struct path		path;
 	struct mutex		iolock, bindlock;
 	struct sock		*peer;
-	struct list_head	link;
-	atomic_long_t		inflight;
+	struct sock		*listener;
+	struct unix_vertex	*vertex;
 	spinlock_t		lock;
-	unsigned long		gc_flags;
-#define UNIX_GC_CANDIDATE	0
-#define UNIX_GC_MAYBE_CYCLE	1
 	struct socket_wq	peer_wq;
+#define peer_wait		peer_wq.wait
 	wait_queue_entry_t	peer_wake;
 	struct scm_stat		scm_stat;
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+	struct sk_buff		*oob_skb;
+#endif
 };
 
-static inline struct unix_sock *unix_sk(const struct sock *sk)
-{
-	return (struct unix_sock *)sk;
-}
-
-#define peer_wait peer_wq.wait
+#define unix_sk(ptr) container_of_const(ptr, struct unix_sock, sk)
+#define unix_peer(sk) (unix_sk(sk)->peer)
 
-long unix_inq_len(struct sock *sk);
-long unix_outq_len(struct sock *sk);
+#define unix_state_lock(s)	spin_lock(&unix_sk(s)->lock)
+#define unix_state_unlock(s)	spin_unlock(&unix_sk(s)->lock)
 
-#ifdef CONFIG_SYSCTL
-int unix_sysctl_register(struct net *net);
-void unix_sysctl_unregister(struct net *net);
-#else
-static inline int unix_sysctl_register(struct net *net) { return 0; }
-static inline void unix_sysctl_unregister(struct net *net) {}
-#endif
 #endif
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index b1c717286993..d56e6e135158 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -10,6 +10,7 @@
 
 #include <linux/kernel.h>
 #include <linux/workqueue.h>
+#include <net/sock.h>
 #include <uapi/linux/vm_sockets.h>
 
 #include "vsock_addr.h"
@@ -74,9 +75,11 @@ struct vsock_sock {
 	void *trans;
 };
 
+s64 vsock_connectible_has_data(struct vsock_sock *vsk);
 s64 vsock_stream_has_data(struct vsock_sock *vsk);
 s64 vsock_stream_has_space(struct vsock_sock *vsk);
 struct sock *vsock_create_connected(struct sock *parent);
+void vsock_data_ready(struct sock *sk);
 
 /**** TRANSPORT ****/
 
@@ -135,6 +138,14 @@ struct vsock_transport {
 	bool (*stream_is_active)(struct vsock_sock *);
 	bool (*stream_allow)(u32 cid, u32 port);
 
+	/* SEQ_PACKET. */
+	ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
+				     int flags);
+	int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg,
+				 size_t len);
+	bool (*seqpacket_allow)(u32 remote_cid);
+	u32 (*seqpacket_has_data)(struct vsock_sock *vsk);
+
 	/* Notification. */
 	int (*notify_poll_in)(struct vsock_sock *, size_t, bool *);
 	int (*notify_poll_out)(struct vsock_sock *, size_t, bool *);
@@ -156,12 +167,22 @@ struct vsock_transport {
 			struct vsock_transport_send_notify_data *);
 	/* sk_lock held by the caller */
 	void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+	int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
+
+	/* SIOCOUTQ ioctl */
+	ssize_t (*unsent_bytes)(struct vsock_sock *vsk);
 
 	/* Shutdown. */
 	int (*shutdown)(struct vsock_sock *, int);
 
 	/* Addressing. */
 	u32 (*get_local_cid)(void);
+
+	/* Read a single skb */
+	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
+
+	/* Zero-copy. */
+	bool (*msgzerocopy_allow)(void);
 };
 
 /**** CORE ****/
 
@@ -186,7 +207,6 @@ static inline bool __vsock_in_connected_table(struct vsock_sock *vsk)
 	return !list_empty(&vsk->connected_table);
 }
 
-void vsock_release_pending(struct sock *pending);
 void vsock_add_pending(struct sock *listener, struct sock *pending);
 void vsock_remove_pending(struct sock *listener, struct sock *pending);
 void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
@@ -197,9 +217,11 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
 struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
 					 struct sockaddr_vm *dst);
 void vsock_remove_sock(struct vsock_sock *vsk);
-void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
+void vsock_for_each_connected_socket(struct vsock_transport *transport,
+				     void (*fn)(struct sock *sk));
 int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk);
 bool vsock_find_cid(unsigned int cid);
+void vsock_linger(struct sock *sk);
 
 /**** TAP ****/
 
@@ -209,9 +231,29 @@ struct vsock_tap {
 	struct list_head list;
 };
 
-int vsock_init_tap(void);
 int vsock_add_tap(struct vsock_tap *vt);
 int vsock_remove_tap(struct vsock_tap *vt);
 void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
-
+int __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+				int flags);
+int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+			      int flags);
+int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+			  size_t len, int flags);
+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+			size_t len, int flags);
+
+#ifdef CONFIG_BPF_SYSCALL
+extern struct proto vsock_proto;
+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
+void __init vsock_bpf_build_proto(void);
+#else
+static inline void __init vsock_bpf_build_proto(void)
+{}
+#endif
+
+static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
+{
+	return t->msgzerocopy_allow && t->msgzerocopy_allow();
+}
 #endif /* __AF_VSOCK_H__ */
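A transport opts into the new SOCK_SEQPACKET and zero-copy paths simply by filling the optional hooks; vsock_msgzerocopy_allow() above treats a missing hook as "not allowed". A sketch under assumed names (the example_* callbacks are placeholders):

	static bool example_msgzerocopy_allow(void)
	{
		return true;
	}

	static struct vsock_transport example_transport = {
		/* ... existing stream/dgram callbacks ... */
		.seqpacket_dequeue	= example_seqpacket_dequeue,
		.seqpacket_enqueue	= example_seqpacket_enqueue,
		.seqpacket_allow	= example_seqpacket_allow,
		.seqpacket_has_data	= example_seqpacket_has_data,
		.msgzerocopy_allow	= example_msgzerocopy_allow,
	};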
diff --git a/include/net/amt.h b/include/net/amt.h
new file mode 100644
index 000000000000..c881bc8b673b
--- /dev/null
+++ b/include/net/amt.h
@@ -0,0 +1,408 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2021 Taehee Yoo <ap420073@gmail.com>
+ */
+#ifndef _NET_AMT_H_
+#define _NET_AMT_H_
+
+#include <linux/siphash.h>
+#include <linux/jhash.h>
+#include <linux/netdevice.h>
+#include <net/gro_cells.h>
+#include <net/rtnetlink.h>
+
+enum amt_msg_type {
+	AMT_MSG_DISCOVERY = 1,
+	AMT_MSG_ADVERTISEMENT,
+	AMT_MSG_REQUEST,
+	AMT_MSG_MEMBERSHIP_QUERY,
+	AMT_MSG_MEMBERSHIP_UPDATE,
+	AMT_MSG_MULTICAST_DATA,
+	AMT_MSG_TEARDOWN,
+	__AMT_MSG_MAX,
+};
+
+#define AMT_MSG_MAX (__AMT_MSG_MAX - 1)
+
+enum amt_ops {
+	/* A*B */
+	AMT_OPS_INT,
+	/* A+B */
+	AMT_OPS_UNI,
+	/* A-B */
+	AMT_OPS_SUB,
+	/* B-A */
+	AMT_OPS_SUB_REV,
+	__AMT_OPS_MAX,
+};
+
+#define AMT_OPS_MAX (__AMT_OPS_MAX - 1)
+
+enum amt_filter {
+	AMT_FILTER_FWD,
+	AMT_FILTER_D_FWD,
+	AMT_FILTER_FWD_NEW,
+	AMT_FILTER_D_FWD_NEW,
+	AMT_FILTER_ALL,
+	AMT_FILTER_NONE_NEW,
+	AMT_FILTER_BOTH,
+	AMT_FILTER_BOTH_NEW,
+	__AMT_FILTER_MAX,
+};
+
+#define AMT_FILTER_MAX (__AMT_FILTER_MAX - 1)
+
+enum amt_act {
+	AMT_ACT_GMI,
+	AMT_ACT_GMI_ZERO,
+	AMT_ACT_GT,
+	AMT_ACT_STATUS_FWD_NEW,
+	AMT_ACT_STATUS_D_FWD_NEW,
+	AMT_ACT_STATUS_NONE_NEW,
+	__AMT_ACT_MAX,
+};
+
+#define AMT_ACT_MAX (__AMT_ACT_MAX - 1)
+
+enum amt_status {
+	AMT_STATUS_INIT,
+	AMT_STATUS_SENT_DISCOVERY,
+	AMT_STATUS_RECEIVED_DISCOVERY,
+	AMT_STATUS_SENT_ADVERTISEMENT,
+	AMT_STATUS_RECEIVED_ADVERTISEMENT,
+	AMT_STATUS_SENT_REQUEST,
+	AMT_STATUS_RECEIVED_REQUEST,
+	AMT_STATUS_SENT_QUERY,
+	AMT_STATUS_RECEIVED_QUERY,
+	AMT_STATUS_SENT_UPDATE,
+	AMT_STATUS_RECEIVED_UPDATE,
+	__AMT_STATUS_MAX,
+};
+
+#define AMT_STATUS_MAX (__AMT_STATUS_MAX - 1)
+
+/* Gateway events only */
+enum amt_event {
+	AMT_EVENT_NONE,
+	AMT_EVENT_RECEIVE,
+	AMT_EVENT_SEND_DISCOVERY,
+	AMT_EVENT_SEND_REQUEST,
+	__AMT_EVENT_MAX,
+};
+
+struct amt_header {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 type:4,
+	   version:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u8 version:4,
+	   type:4;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+} __packed;
+
+struct amt_header_discovery {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u32 type:4,
+	    version:4,
+	    reserved:24;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u32 version:4,
+	    type:4,
+	    reserved:24;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+	__be32 nonce;
+} __packed;
+
+struct amt_header_advertisement {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u32 type:4,
+	    version:4,
+	    reserved:24;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u32 version:4,
+	    type:4,
+	    reserved:24;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+	__be32 nonce;
+	__be32 ip4;
+} __packed;
+
+struct amt_header_request {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u32 type:4,
+	    version:4,
+	    reserved1:7,
+	    p:1,
+	    reserved2:16;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u32 version:4,
+	    type:4,
+	    p:1,
+	    reserved1:7,
+	    reserved2:16;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+	__be32 nonce;
+} __packed;
+
+struct amt_header_membership_query {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u64 type:4,
+	    version:4,
+	    reserved:6,
+	    l:1,
+	    g:1,
+	    response_mac:48;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u64 version:4,
+	    type:4,
+	    g:1,
+	    l:1,
+	    reserved:6,
+	    response_mac:48;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+	__be32 nonce;
+} __packed;
+
+struct amt_header_membership_update {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u64 type:4,
+	    version:4,
+	    reserved:8,
+	    response_mac:48;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u64 version:4,
+	    type:4,
+	    reserved:8,
+	    response_mac:48;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+	__be32 nonce;
+} __packed;
+
+struct amt_header_mcast_data {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u16 type:4,
+	    version:4,
+	    reserved:8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u16 version:4,
+	    type:4,
+	    reserved:8;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+} __packed;
+
+struct amt_headers {
+	union {
+		struct amt_header_discovery discovery;
+		struct amt_header_advertisement advertisement;
+		struct amt_header_request request;
+		struct amt_header_membership_query query;
+		struct amt_header_membership_update update;
+		struct amt_header_mcast_data data;
+	};
+} __packed;
+
+struct amt_gw_headers {
+	union {
+		struct amt_header_discovery discovery;
+		struct amt_header_request request;
+		struct amt_header_membership_update update;
+	};
+} __packed;
+
+struct amt_relay_headers {
+	union {
+		struct amt_header_advertisement advertisement;
+		struct amt_header_membership_query query;
+		struct amt_header_mcast_data data;
+	};
+} __packed;
+
+struct amt_skb_cb {
+	struct amt_tunnel_list *tunnel;
+};
+
+struct amt_tunnel_list {
+	struct list_head list;
+	/* Protect All resources under an amt_tunne_list */
+	spinlock_t lock;
+	struct amt_dev *amt;
+	u32 nr_groups;
+	u32 nr_sources;
+	enum amt_status status;
+	struct delayed_work gc_wq;
+	__be16 source_port;
+	__be32 ip4;
+	__be32 nonce;
+	siphash_key_t key;
+	u64 mac:48,
+	    reserved:16;
+	struct rcu_head rcu;
+	struct hlist_head groups[];
+};
+
+union amt_addr {
+	__be32 ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct in6_addr ip6;
+#endif
+};
+
+/* RFC 3810
+ *
+ * When the router is in EXCLUDE mode, the router state is represented
+ * by the notation EXCLUDE (X,Y), where X is called the "Requested List"
+ * and Y is called the "Exclude List". All sources, except those from
+ * the Exclude List, will be forwarded by the router
+ */
+enum amt_source_status {
+	AMT_SOURCE_STATUS_NONE,
+	/* Node of Requested List */
+	AMT_SOURCE_STATUS_FWD,
+	/* Node of Exclude List */
+	AMT_SOURCE_STATUS_D_FWD,
+};
+
+/* protected by gnode->lock */
+struct amt_source_node {
+	struct hlist_node	node;
+	struct amt_group_node	*gnode;
+	struct delayed_work	source_timer;
+	union amt_addr		source_addr;
+	enum amt_source_status	status;
+#define AMT_SOURCE_OLD	0
+#define AMT_SOURCE_NEW	1
+	u8			flags;
+	struct rcu_head		rcu;
+};
+
+/* Protected by amt_tunnel_list->lock */
+struct amt_group_node {
+	struct amt_dev		*amt;
+	union amt_addr		group_addr;
+	union amt_addr		host_addr;
+	bool			v6;
+	u8			filter_mode;
+	u32			nr_sources;
+	struct amt_tunnel_list	*tunnel_list;
+	struct hlist_node	node;
+	struct delayed_work	group_timer;
+	struct rcu_head		rcu;
+	struct hlist_head	sources[];
+};
+
+#define AMT_MAX_EVENTS	16
+struct amt_events {
+	enum amt_event event;
+	struct sk_buff *skb;
+};
+
+struct amt_dev {
+	struct net_device	*dev;
+	struct net_device	*stream_dev;
+	struct net		*net;
+	/* Global lock for amt device */
+	spinlock_t		lock;
+	/* Used only in relay mode */
+	struct list_head	tunnel_list;
+	struct gro_cells	gro_cells;
+
+	/* Protected by RTNL */
+	struct delayed_work	discovery_wq;
+	/* Protected by RTNL */
+	struct delayed_work	req_wq;
+	/* Protected by RTNL */
+	struct delayed_work	secret_wq;
+	struct work_struct	event_wq;
+	/* AMT status */
+	enum amt_status		status;
+	/* Generated key */
+	siphash_key_t		key;
+	struct socket	  __rcu	*sock;
+	u32			max_groups;
+	u32			max_sources;
+	u32			hash_buckets;
+	u32			hash_seed;
+	/* Default 128 */
+	u32			max_tunnels;
+	/* Default 128 */
+	u32			nr_tunnels;
+	/* Gateway or Relay mode */
+	u32			mode;
+	/* Default 2268 */
+	__be16			relay_port;
+	/* Default 2268 */
+	__be16			gw_port;
+	/* Outer local ip */
+	__be32			local_ip;
+	/* Outer remote ip */
+	__be32			remote_ip;
+	/* Outer discovery ip */
+	__be32			discovery_ip;
+	/* Only used in gateway mode */
+	__be32			nonce;
+	/* Gateway sent request and received query */
+	bool			ready4;
+	bool			ready6;
+	u8			req_cnt;
+	u8			qi;
+	u64			qrv;
+	u64			qri;
+	/* Used only in gateway mode */
+	u64			mac:48,
+				reserved:16;
+	/* AMT gateway side message handler queue */
+	struct amt_events	events[AMT_MAX_EVENTS];
+	u8			event_idx;
+	u8			nr_events;
+};
+
+#define AMT_TOS			0xc0
+#define AMT_IPHDR_OPTS		4
+#define AMT_IP6HDR_OPTS		8
+#define AMT_GC_INTERVAL		(30 * 1000)
+#define AMT_MAX_GROUP		32
+#define AMT_MAX_SOURCE		128
+#define AMT_HSIZE_SHIFT		8
+#define AMT_HSIZE		(1 << AMT_HSIZE_SHIFT)
+
+#define AMT_DISCOVERY_TIMEOUT	5000
+#define AMT_INIT_REQ_TIMEOUT	1
+#define AMT_INIT_QUERY_INTERVAL	125
+#define AMT_MAX_REQ_TIMEOUT	120
+#define AMT_MAX_REQ_COUNT	3
+#define AMT_SECRET_TIMEOUT	60000
+#define IANA_AMT_UDP_PORT	2268
+#define AMT_MAX_TUNNELS		128
+#define AMT_MAX_REQS		128
+#define AMT_GW_HLEN (sizeof(struct iphdr) + \
+		     sizeof(struct udphdr) + \
+		     sizeof(struct amt_gw_headers))
+#define AMT_RELAY_HLEN (sizeof(struct iphdr) + \
+			sizeof(struct udphdr) + \
+			sizeof(struct amt_relay_headers))
+
+static inline bool netif_is_amt(const struct net_device *dev)
+{
+	return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "amt");
+}
+
+static inline u64 amt_gmi(const struct amt_dev *amt)
+{
+	return ((amt->qrv * amt->qi) + amt->qri) * 1000;
+}
+
+#endif /* _NET_AMT_H_ */
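A quick worked example of amt_gmi(): it returns the Group Membership Interval in milliseconds following the RFC 3376/3810 formula (robustness variable × query interval) + query response interval. With the typical IGMPv3 values qrv = 2, qi = 125 s (matching AMT_INIT_QUERY_INTERVAL above) and qri = 10 s — assumed here, the header does not state defaults for qrv/qri:

	((2 * 125) + 10) * 1000 = 260000 ms = 260 s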
diff --git a/include/net/arp.h b/include/net/arp.h
index 4950191f6b2b..e8747e0713c7 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -38,11 +38,11 @@ static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key)
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv4_neigh_lookup_noref(dev, key);
 	if (n && !refcount_inc_not_zero(&n->refcnt))
 		n = NULL;
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return n;
 }
@@ -51,16 +51,10 @@ static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv4_neigh_lookup_noref(dev, key);
-	if (n) {
-		unsigned long now = jiffies;
-
-		/* avoid dirtying neighbour */
-		if (READ_ONCE(n->confirmed) != now)
-			WRITE_ONCE(n->confirmed, now);
-	}
-	rcu_read_unlock_bh();
+	neigh_confirm(n);
+	rcu_read_unlock();
 }
 
 void arp_init(void);
@@ -71,6 +65,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
 	      const unsigned char *src_hw, const unsigned char *th);
 int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir);
 void arp_ifdown(struct net_device *dev);
+int arp_invalidate(struct net_device *dev, __be32 ip, bool force);
 
 struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 			   struct net_device *dev, __be32 src_ip,
diff --git a/include/net/ax25.h b/include/net/ax25.h
index 8b7eb46ad72d..a7bba42dde15 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -139,7 +139,9 @@ enum {
 	AX25_VALUES_N2,		/* Default N2 value */
 	AX25_VALUES_PACLEN,	/* AX.25 MTU */
 	AX25_VALUES_PROTOCOL,	/* Std AX.25, DAMA Slave, DAMA Master */
+#ifdef CONFIG_AX25_DAMA_SLAVE
 	AX25_VALUES_DS_TIMEOUT,	/* DAMA Slave timeout */
+#endif
 	AX25_MAX_VALUES		/* THIS MUST REMAIN THE LAST ENTRY OF THIS LIST */
 };
 
@@ -187,18 +189,12 @@ typedef struct {
 
 typedef struct ax25_route {
 	struct ax25_route	*next;
-	refcount_t		refcount;
 	ax25_address		callsign;
 	struct net_device	*dev;
 	ax25_digi		*digipeat;
 	char			ip_mode;
 } ax25_route;
 
-static inline void ax25_hold_route(ax25_route *ax25_rt)
-{
-	refcount_inc(&ax25_rt->refcount);
-}
-
 void __ax25_put_route(ax25_route *ax25_rt);
 
 extern rwlock_t ax25_route_lock;
@@ -213,12 +209,6 @@ static inline void ax25_route_lock_unuse(void)
 	read_unlock(&ax25_route_lock);
 }
 
-static inline void ax25_put_route(ax25_route *ax25_rt)
-{
-	if (refcount_dec_and_test(&ax25_rt->refcount))
-		__ax25_put_route(ax25_rt);
-}
-
 typedef struct {
 	char			slave;			/* slave_mode?   */
 	struct timer_list	slave_timer;		/* timeout timer */
@@ -228,14 +218,20 @@ struct ctl_table;
 
 typedef struct ax25_dev {
-	struct ax25_dev		*next;
+	struct list_head	list;
+
+	struct net_device	*dev;
+	netdevice_tracker	dev_tracker;
+
+	struct net_device	*forward;
 	struct ctl_table_header *sysheader;
 	int			values[AX25_MAX_VALUES];
 #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER)
 	ax25_dama_info		dama;
 #endif
+	refcount_t		refcount;
+	bool			device_up;
+	struct rcu_head		rcu;
 } ax25_dev;
 
 typedef struct ax25_cb {
@@ -243,6 +239,7 @@
 	ax25_address		source_addr, dest_addr;
 	ax25_digi		*digipeat;
 	ax25_dev		*ax25_dev;
+	netdevice_tracker	dev_tracker;
 	unsigned char		iamdigi;
 	unsigned char		state, modulus, pidincl;
 	unsigned short		vs, vr, va;
@@ -266,10 +263,7 @@ struct ax25_sock {
 	struct ax25_cb		*cb;
 };
 
-static inline struct ax25_sock *ax25_sk(const struct sock *sk)
-{
-	return (struct ax25_sock *) sk;
-}
+#define ax25_sk(ptr) container_of_const(ptr, struct ax25_sock, sk)
 
 static inline struct ax25_cb *sk_to_ax25(const struct sock *sk)
 {
@@ -290,6 +284,16 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25)
 	}
 }
 
+static inline void ax25_dev_hold(ax25_dev *ax25_dev)
+{
+	refcount_inc(&ax25_dev->refcount);
+}
+
+static inline void ax25_dev_put(ax25_dev *ax25_dev)
+{
+	if (refcount_dec_and_test(&ax25_dev->refcount))
+		kfree_rcu(ax25_dev, rcu);
+}
 static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	skb->dev      = dev;
@@ -304,7 +308,7 @@ extern spinlock_t ax25_list_lock;
 void ax25_cb_add(ax25_cb *);
 struct sock *ax25_find_listener(ax25_address *, int, struct net_device *, int);
 struct sock *ax25_get_socket(ax25_address *, ax25_address *, int);
-ax25_cb *ax25_find_cb(ax25_address *, ax25_address *, ax25_digi *,
+ax25_cb *ax25_find_cb(const ax25_address *, ax25_address *, ax25_digi *,
 		      struct net_device *);
 void ax25_send_to_raw(ax25_address *, struct sk_buff *, int);
 void ax25_destroy_socket(ax25_cb *);
@@ -328,13 +332,12 @@ int ax25_addr_size(const ax25_digi *);
 void ax25_digi_invert(const ax25_digi *, ax25_digi *);
 
 /* ax25_dev.c */
-extern ax25_dev *ax25_dev_list;
 extern spinlock_t ax25_dev_lock;
 
 #if IS_ENABLED(CONFIG_AX25)
-static inline ax25_dev *ax25_dev_ax25dev(struct net_device *dev)
+static inline ax25_dev *ax25_dev_ax25dev(const struct net_device *dev)
 {
-	return dev->ax25_ptr;
+	return rcu_dereference_rtnl(dev->ax25_ptr);
 }
 #endif
 
@@ -384,10 +387,11 @@ struct ax25_linkfail {
 
 void ax25_linkfail_register(struct ax25_linkfail *lf);
 void ax25_linkfail_release(struct ax25_linkfail *lf);
-int __must_check ax25_listen_register(ax25_address *, struct net_device *);
-void ax25_listen_release(ax25_address *, struct net_device *);
+int __must_check ax25_listen_register(const ax25_address *,
+				      struct net_device *);
+void ax25_listen_release(const ax25_address *, struct net_device *);
 int(*ax25_protocol_function(unsigned int))(struct sk_buff *, ax25_cb *);
-int ax25_listen_mine(ax25_address *, struct net_device *);
+int ax25_listen_mine(const ax25_address *, struct net_device *);
 void ax25_link_failed(ax25_cb *, int);
 int ax25_protocol_is_registered(unsigned int);
 
@@ -401,8 +405,8 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb);
 extern const struct header_ops ax25_header_ops;
 
 /* ax25_out.c */
-ax25_cb *ax25_send_frame(struct sk_buff *, int, ax25_address *, ax25_address *,
-			 ax25_digi *, struct net_device *);
+ax25_cb *ax25_send_frame(struct sk_buff *, int, const ax25_address *,
+			 ax25_address *, ax25_digi *, struct net_device *);
 void ax25_output(ax25_cb *, int, struct sk_buff *);
 void ax25_kick(ax25_cb *);
 void ax25_transmit_buffer(ax25_cb *, struct sk_buff *, int);
@@ -414,7 +418,6 @@ void ax25_rt_device_down(struct net_device *);
 int ax25_rt_ioctl(unsigned int, void __user *);
 extern const struct seq_operations ax25_rt_seqops;
 ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev);
-int ax25_rt_autobind(ax25_cb *, ax25_address *);
 struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *,
 				   ax25_address *, ax25_digi *);
 void ax25_rt_free(void);
diff --git a/include/net/ax88796.h b/include/net/ax88796.h
index aa52b2e8ff7b..303100f08ab8 100644
--- a/include/net/ax88796.h
+++ b/include/net/ax88796.h
@@ -8,6 +8,8 @@
 #ifndef __NET_AX88796_PLAT_H
 #define __NET_AX88796_PLAT_H
 
+#include <linux/types.h>
+
 struct sk_buff;
 struct net_device;
 struct platform_device;
@@ -32,10 +34,13 @@ struct ax_plat_data {
 			     const unsigned char *buf, int star_page);
 	void (*block_input)(struct net_device *dev, int count,
 			    struct sk_buff *skb, int ring_offset);
-	/* returns nonzero if a pending interrupt request might by caused by
-	 * the ax88786. Handles all interrupts if set to NULL
+	/* returns nonzero if a pending interrupt request might be caused by
+	 * the ax88796. Handles all interrupts if set to NULL
 	 */
 	int (*check_irq)(struct platform_device *pdev);
 };
 
+/* exported from ax88796.c for xsurf100.c */
+extern void ax_NS8390_reinit(struct net_device *dev);
+
 #endif /* __NET_AX88796_PLAT_H */
diff --git a/include/net/bareudp.h b/include/net/bareudp.h
index dc65a0d71d9b..17610c8d6361 100644
--- a/include/net/bareudp.h
+++ b/include/net/bareudp.h
@@ -3,21 +3,10 @@
 #ifndef __NET_BAREUDP_H
 #define __NET_BAREUDP_H
 
+#include <linux/netdevice.h>
 #include <linux/types.h>
-#include <linux/skbuff.h>
 #include <net/rtnetlink.h>
 
-struct bareudp_conf {
-	__be16 ethertype;
-	__be16 port;
-	u16 sport_min;
-	bool multi_proto_mode;
-};
-
-struct net_device *bareudp_dev_create(struct net *net, const char *name,
-				      u8 name_assign_type,
-				      struct bareudp_conf *info);
-
 static inline bool netif_is_bareudp(const struct net_device *dev)
 {
 	return dev->rtnl_link_ops &&
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 9125effbf448..114299bd8b98 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -1,6 +1,7 @@
 /* BlueZ - Bluetooth protocol stack for Linux
 
    Copyright (C) 2000-2001 Qualcomm Incorporated
+   Copyright 2023 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -28,6 +29,7 @@
 #include <linux/poll.h>
 #include <net/sock.h>
 #include <linux/seq_file.h>
+#include <linux/ethtool.h>
 
 #define BT_SUBSYS_VERSION	2
 #define BT_SUBSYS_REVISION	22
@@ -55,6 +57,8 @@
 #define BTPROTO_CMTP	5
 #define BTPROTO_HIDP	6
 #define BTPROTO_AVDTP	7
+#define BTPROTO_ISO	8
+#define BTPROTO_LAST	BTPROTO_ISO
 
 #define SOL_HCI		0
 #define SOL_L2CAP	6
@@ -120,6 +124,7 @@ struct bt_voice {
 
 #define BT_VOICE_TRANSPARENT			0x0003
 #define BT_VOICE_CVSD_16BIT			0x0060
+#define BT_VOICE_TRANSPARENT_16BIT		0x0063
 
 #define BT_SNDMTU		12
 #define BT_RCVMTU		13
@@ -149,9 +154,95 @@ struct bt_voice {
 #define BT_MODE_LE_FLOWCTL	0x03
 #define BT_MODE_EXT_FLOWCTL	0x04
 
-#define BT_PKT_STATUS          16
+#define BT_PKT_STATUS		16
 
 #define BT_SCM_PKT_STATUS	0x03
+#define BT_SCM_ERROR		0x04
+
+#define BT_ISO_QOS		17
+
+#define BT_ISO_QOS_CIG_UNSET	0xff
+#define BT_ISO_QOS_CIS_UNSET	0xff
+
+#define BT_ISO_QOS_BIG_UNSET	0xff
+#define BT_ISO_QOS_BIS_UNSET	0xff
+
+#define BT_ISO_SYNC_TIMEOUT	0x07d0 /* 20 secs */
+
+struct bt_iso_io_qos {
+	__u32 interval;
+	__u16 latency;
+	__u16 sdu;
+	__u8  phy;
+	__u8  rtn;
+};
+
+struct bt_iso_ucast_qos {
+	__u8  cig;
+	__u8  cis;
+	__u8  sca;
+	__u8  packing;
+	__u8  framing;
+	struct bt_iso_io_qos in;
+	struct bt_iso_io_qos out;
+};
+
+struct bt_iso_bcast_qos {
+	__u8  big;
+	__u8  bis;
+	__u8  sync_factor;
+	__u8  packing;
+	__u8  framing;
+	struct bt_iso_io_qos in;
+	struct bt_iso_io_qos out;
+	__u8  encryption;
+	__u8  bcode[16];
+	__u8  options;
+	__u16 skip;
+	__u16 sync_timeout;
+	__u8  sync_cte_type;
+	__u8  mse;
+	__u16 timeout;
+};
+
+struct bt_iso_qos {
+	union {
+		struct bt_iso_ucast_qos ucast;
+		struct bt_iso_bcast_qos bcast;
+	};
+};
+
+#define BT_ISO_PHY_1M		0x01
+#define BT_ISO_PHY_2M		0x02
+#define BT_ISO_PHY_CODED	0x04
+#define BT_ISO_PHY_ANY		(BT_ISO_PHY_1M | BT_ISO_PHY_2M | \
+				 BT_ISO_PHY_CODED)
+
+#define BT_CODEC	19
+
+struct	bt_codec_caps {
+	__u8	len;
+	__u8	data[];
+} __packed;
+
+struct bt_codec {
+	__u8	id;
+	__u16	cid;
+	__u16	vid;
+	__u8	data_path;
+	__u8	num_caps;
+} __packed;
+
+struct bt_codecs {
+	__u8		num_codecs;
+	struct bt_codec	codecs[];
+} __packed;
+
+#define BT_CODEC_CVSD		0x02
+#define BT_CODEC_TRANSPARENT	0x03
+#define BT_CODEC_MSBC		0x05
+
+#define BT_ISO_BASE		20
 
 __printf(1, 2)
 void bt_info(const char *fmt, ...);
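To give the new BT_ISO_QOS option some shape: a hypothetical unicast (CIS) configuration letting the controller assign the CIG/CIS identifiers via the *_UNSET values. All numeric values here are illustrative choices, not defaults from this header:

	struct bt_iso_qos qos = {
		.ucast = {
			.cig = BT_ISO_QOS_CIG_UNSET,	/* controller picks */
			.cis = BT_ISO_QOS_CIS_UNSET,
			.packing = 0x00,		/* sequential */
			.framing = 0x00,		/* unframed */
			.out = {
				.interval = 10000,	/* SDU interval, us */
				.latency  = 10,		/* ms */
				.sdu      = 40,		/* bytes */
				.phy      = BT_ISO_PHY_2M,
				.rtn      = 2,		/* retransmissions */
			},
		},
	};
	/* applied with setsockopt(fd, SOL_BLUETOOTH, BT_ISO_QOS,
	 *                        &qos, sizeof(qos)) on a BTPROTO_ISO socket */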
\ - bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_err_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) /* Connection and socket states */ -enum { +enum bt_sock_state { BT_CONNECTED = 1, /* Equal to TCP_ESTABLISHED to make net code happy */ BT_OPEN, BT_BOUND, @@ -298,6 +391,7 @@ struct bt_sock { enum { BT_SK_DEFER_SETUP, BT_SK_SUSPEND, + BT_SK_PKT_STATUS }; struct bt_sock_list { @@ -312,6 +406,9 @@ int bt_sock_register(int proto, const struct net_proto_family *ops); void bt_sock_unregister(int proto); void bt_sock_link(struct bt_sock_list *l, struct sock *s); void bt_sock_unlink(struct bt_sock_list *l, struct sock *s); +bool bt_sock_linked(struct bt_sock_list *l, struct sock *s); +struct sock *bt_sock_alloc(struct net *net, struct socket *sock, + struct proto *prot, int proto, gfp_t prio, int kern); int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, @@ -319,7 +416,7 @@ int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, __poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo); -int bt_sock_wait_ready(struct sock *sk, unsigned long flags); +int bt_sock_wait_ready(struct sock *sk, unsigned int msg_flags); void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh); void bt_accept_unlink(struct sock *sk); @@ -342,20 +439,24 @@ struct l2cap_ctrl { struct l2cap_chan *chan; }; -struct sco_ctrl { - u8 pkt_status; -}; - struct hci_dev; typedef void (*hci_req_complete_t)(struct hci_dev *hdev, u8 status, u16 opcode); typedef void (*hci_req_complete_skb_t)(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb); +void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, + hci_req_complete_t *req_complete, + hci_req_complete_skb_t *req_complete_skb); + +int hci_ethtool_ts_info(unsigned int index, int sk_proto, + struct kernel_ethtool_ts_info *ts_info); + #define HCI_REQ_START BIT(0) #define HCI_REQ_SKB BIT(1) struct hci_ctrl { + struct sock *sk; u16 opcode; u8 req_flags; u8 req_event; @@ -365,22 +466,32 @@ struct hci_ctrl { }; }; +struct mgmt_ctrl { + struct hci_dev *hdev; + u16 opcode; +}; + struct bt_skb_cb { u8 pkt_type; u8 force_active; u16 expect; u8 incoming:1; + u8 pkt_status:2; union { struct l2cap_ctrl l2cap; - struct sco_ctrl sco; struct hci_ctrl hci; + struct mgmt_ctrl mgmt; + struct scm_creds creds; }; }; #define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb)) #define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type +#define hci_skb_pkt_status(skb) bt_cb((skb))->pkt_status #define hci_skb_expect(skb) bt_cb((skb))->expect #define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode +#define hci_skb_event(skb) bt_cb((skb))->hci.req_event +#define hci_skb_sk(skb) bt_cb((skb))->hci.sk static inline struct sk_buff *bt_skb_alloc(unsigned int len, gfp_t how) { @@ -420,7 +531,73 @@ out: return NULL; } +/* Shall not be called with lock_sock held */ +static inline struct sk_buff *bt_skb_sendmsg(struct sock *sk, + struct msghdr *msg, + size_t len, size_t mtu, + size_t headroom, size_t tailroom) +{ + struct sk_buff *skb; + size_t size = min_t(size_t, len, mtu); + int err; + + skb = bt_skb_send_alloc(sk, size + headroom + tailroom, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + return ERR_PTR(err); + + skb_reserve(skb, headroom); + skb_tailroom_reserve(skb, mtu, 
tailroom);
+
+ if (!copy_from_iter_full(skb_put(skb, size), size, &msg->msg_iter)) {
+ kfree_skb(skb);
+ return ERR_PTR(-EFAULT);
+ }
+
+ skb->priority = READ_ONCE(sk->sk_priority);
+
+ return skb;
+}
+
+/* Similar to bt_skb_sendmsg but can split the msg into multiple fragments
+ * according to the MTU.
+ */
+static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk,
+ struct msghdr *msg,
+ size_t len, size_t mtu,
+ size_t headroom, size_t tailroom)
+{
+ struct sk_buff *skb, **frag;
+
+ skb = bt_skb_sendmsg(sk, msg, len, mtu, headroom, tailroom);
+ if (IS_ERR(skb))
+ return skb;
+
+ len -= skb->len;
+ if (!len)
+ return skb;
+
+ /* Add remaining data over MTU as continuation fragments */
+ frag = &skb_shinfo(skb)->frag_list;
+ while (len) {
+ struct sk_buff *tmp;
+
+ tmp = bt_skb_sendmsg(sk, msg, len, mtu, headroom, tailroom);
+ if (IS_ERR(tmp)) {
+ return skb;
+ }
+
+ len -= tmp->len;
+
+ *frag = tmp;
+ frag = &(*frag)->next;
+ }
+
+ return skb;
+}
+
 int bt_to_errno(u16 code);
+__u8 bt_status(int err);
 
 void hci_sock_set_flag(struct sock *sk, int nr);
 void hci_sock_clear_flag(struct sock *sk, int nr);
@@ -458,8 +635,30 @@ static inline void sco_exit(void)
 }
 #endif
 
+#if IS_ENABLED(CONFIG_BT_LE)
+int iso_init(void);
+int iso_exit(void);
+bool iso_enabled(void);
+#else
+static inline int iso_init(void)
+{
+ return 0;
+}
+
+static inline int iso_exit(void)
+{
+ return 0;
+}
+
+static inline bool iso_enabled(void)
+{
+ return false;
+}
+#endif
+
 int mgmt_init(void);
 void mgmt_exit(void);
+void mgmt_cleanup(struct sock *sk);
 
 void bt_sock_reclassify_lock(struct sock *sk, int proto);
 
diff --git a/include/net/bluetooth/coredump.h b/include/net/bluetooth/coredump.h
new file mode 100644
index 000000000000..72f51b587a04
--- /dev/null
+++ b/include/net/bluetooth/coredump.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 Google Corporation
+ */
+
+#ifndef __COREDUMP_H
+#define __COREDUMP_H
+
+#define DEVCOREDUMP_TIMEOUT msecs_to_jiffies(10000) /* 10 sec */
+
+typedef void (*coredump_t)(struct hci_dev *hdev);
+typedef void (*dmp_hdr_t)(struct hci_dev *hdev, struct sk_buff *skb);
+typedef void (*notify_change_t)(struct hci_dev *hdev, int state);
+
+/* struct hci_devcoredump - Devcoredump state
+ *
+ * @supported: Indicates if FW dump collection is supported by driver
+ * @state: Current state of dump collection
+ * @timeout: Indicates a timeout for collecting the devcoredump
+ *
+ * @alloc_size: Total size of the dump
+ * @head: Start of the dump
+ * @tail: Pointer to current end of dump
+ * @end: head + alloc_size for easy comparisons
+ *
+ * @dump_q: Dump queue for state machine to process
+ * @dump_rx: Devcoredump state machine work
+ * @dump_timeout: Devcoredump timeout work
+ *
+ * @coredump: Called from the driver's .coredump() function.
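The three callback typedefs above are the whole driver-facing contract of this new header. As a rough usage sketch (the my_* names are hypothetical, and leaving the optional notify_change hook NULL is an assumption, not something this patch shows), a transport driver registers the hooks once and then drives a crash dump through the init/append/complete entry points declared further down:

static void my_coredump(struct hci_dev *hdev)
{
	/* Trigger the controller's dump; fragments arrive via the
	 * driver's normal RX path and are fed to hci_devcd_append(). */
}

static void my_dmp_hdr(struct hci_dev *hdev, struct sk_buff *skb)
{
	static const char hdr[] = "Controller Name: my-controller\n";

	/* Identify controller/fw/driver at the top of the dump. */
	skb_put_data(skb, hdr, strlen(hdr));
}

static int my_probe(struct hci_dev *hdev)
{
	return hci_devcd_register(hdev, my_coredump, my_dmp_hdr, NULL);
}

static void my_on_fw_crash(struct hci_dev *hdev, struct sk_buff *dump)
{
	/* Single-fragment dump: size it, queue it, close it. */
	if (hci_devcd_init(hdev, dump->len)) {
		kfree_skb(dump);
		return;
	}
	hci_devcd_append(hdev, dump);	/* consumes the skb */
	hci_devcd_complete(hdev);
}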
+ * @dmp_hdr: Create a dump header to identify controller/fw/driver info + * @notify_change: Notify driver when devcoredump state has changed + */ +struct hci_devcoredump { + bool supported; + + enum devcoredump_state { + HCI_DEVCOREDUMP_IDLE, + HCI_DEVCOREDUMP_ACTIVE, + HCI_DEVCOREDUMP_DONE, + HCI_DEVCOREDUMP_ABORT, + HCI_DEVCOREDUMP_TIMEOUT, + } state; + + unsigned long timeout; + + size_t alloc_size; + char *head; + char *tail; + char *end; + + struct sk_buff_head dump_q; + struct work_struct dump_rx; + struct delayed_work dump_timeout; + + coredump_t coredump; + dmp_hdr_t dmp_hdr; + notify_change_t notify_change; +}; + +#ifdef CONFIG_DEV_COREDUMP + +void hci_devcd_reset(struct hci_dev *hdev); +void hci_devcd_rx(struct work_struct *work); +void hci_devcd_timeout(struct work_struct *work); + +int hci_devcd_register(struct hci_dev *hdev, coredump_t coredump, + dmp_hdr_t dmp_hdr, notify_change_t notify_change); +int hci_devcd_init(struct hci_dev *hdev, u32 dump_size); +int hci_devcd_append(struct hci_dev *hdev, struct sk_buff *skb); +int hci_devcd_append_pattern(struct hci_dev *hdev, u8 pattern, u32 len); +int hci_devcd_complete(struct hci_dev *hdev); +int hci_devcd_abort(struct hci_dev *hdev); + +#else + +static inline void hci_devcd_reset(struct hci_dev *hdev) {} +static inline void hci_devcd_rx(struct work_struct *work) {} +static inline void hci_devcd_timeout(struct work_struct *work) {} + +static inline int hci_devcd_register(struct hci_dev *hdev, coredump_t coredump, + dmp_hdr_t dmp_hdr, + notify_change_t notify_change) +{ + return -EOPNOTSUPP; +} + +static inline int hci_devcd_init(struct hci_dev *hdev, u32 dump_size) +{ + return -EOPNOTSUPP; +} + +static inline int hci_devcd_append(struct hci_dev *hdev, struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + +static inline int hci_devcd_append_pattern(struct hci_dev *hdev, + u8 pattern, u32 len) +{ + return -EOPNOTSUPP; +} + +static inline int hci_devcd_complete(struct hci_dev *hdev) +{ + return -EOPNOTSUPP; +} + +static inline int hci_devcd_abort(struct hci_dev *hdev) +{ + return -EOPNOTSUPP; +} + +#endif /* CONFIG_DEV_COREDUMP */ + +#endif /* __COREDUMP_H */ diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index c8e67042a3b1..82cbd54443ac 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1,6 +1,7 @@ /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated + Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> @@ -28,15 +29,13 @@ #define HCI_MAX_ACL_SIZE 1024 #define HCI_MAX_SCO_SIZE 255 #define HCI_MAX_ISO_SIZE 251 +#define HCI_MAX_ISO_BIS 31 #define HCI_MAX_EVENT_SIZE 260 #define HCI_MAX_FRAME_SIZE (HCI_MAX_ACL_SIZE + 4) #define HCI_LINK_KEY_SIZE 16 -#define HCI_AMP_LINK_KEY_SIZE (2 * HCI_LINK_KEY_SIZE) -#define HCI_MAX_AMP_ASSOC_SIZE 672 - -#define HCI_MAX_CSB_DATA_SIZE 252 +#define HCI_MAX_CPB_DATA_SIZE 252 /* HCI dev events */ #define HCI_DEV_REG 1 @@ -69,26 +68,7 @@ #define HCI_I2C 8 #define HCI_SMD 9 #define HCI_VIRTIO 10 - -/* HCI controller types */ -#define HCI_PRIMARY 0x00 -#define HCI_AMP 0x01 - -/* First BR/EDR Controller shall have ID = 0 */ -#define AMP_ID_BREDR 0x00 - -/* AMP controller types */ -#define AMP_TYPE_BREDR 0x00 -#define AMP_TYPE_80211 0x01 - -/* AMP controller status */ -#define AMP_STATUS_POWERED_DOWN 0x00 -#define AMP_STATUS_BLUETOOTH_ONLY 0x01 -#define AMP_STATUS_NO_CAPACITY 0x02 -#define AMP_STATUS_LOW_CAPACITY 0x03 -#define AMP_STATUS_MEDIUM_CAPACITY 0x04 -#define AMP_STATUS_HIGH_CAPACITY 
0x05
-#define AMP_STATUS_FULL_CAPACITY 0x06
+#define HCI_IPC 11
 
 /* HCI device quirks */
 enum {
@@ -175,6 +155,15 @@ enum {
 */
 HCI_QUIRK_USE_BDADDR_PROPERTY,
 
+ /* When this quirk is set, the Bluetooth Device Address provided by
+ * the 'local-bd-address' fwnode property is incorrectly specified in
+ * big-endian order.
+ *
+ * This quirk can be set before hci_register_dev is called or
+ * during the hdev->setup vendor callback.
+ */
+ HCI_QUIRK_BDADDR_PROPERTY_BROKEN,
+
 /* When this quirk is set, the duplicate filtering during
 * scanning is based on Bluetooth devices addresses. To allow
 * RSSI based updates, restart scanning if needed.
@@ -219,14 +208,24 @@ enum {
 */
 HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED,
 
- /* When this quirk is set, the controller has validated that
- * LE states reported through the HCI_LE_READ_SUPPORTED_STATES are
- * valid. This mechanism is necessary as many controllers have
- * been seen has having trouble initiating a connectable
- * advertisement despite the state combination being reported as
- * supported.
+ /* When this quirk is set, consider Sync Flow Control as supported by
+ * the driver.
+ *
+ * This quirk must be set before hci_register_dev is called.
+ */
+ HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED,
+
+ /* When this quirk is set, the LE states reported through the
+ * HCI_LE_READ_SUPPORTED_STATES are invalid/broken.
+ *
+ * This mechanism is necessary as many controllers have been seen as
+ * having trouble initiating a connectable advertisement despite the
+ * state combination being reported as supported.
+ *
+ * This quirk can be set before hci_register_dev is called or
+ * during the hdev->setup vendor callback.
 */
- HCI_QUIRK_VALID_LE_STATES,
+ HCI_QUIRK_BROKEN_LE_STATES,
 
 /* When this quirk is set, then erroneous data reporting
 * is ignored. This is mainly due to the fact that the HCI
@@ -238,6 +237,146 @@ enum {
 * during the hdev->setup vendor callback.
 */
 HCI_QUIRK_BROKEN_ERR_DATA_REPORTING,
+
+ /*
+ * When this quirk is set, then the hci_suspend_notifier is not
+ * registered. This is intended for devices which drop completely
+ * from the bus on system-suspend and which will show up as a new
+ * HCI after resume.
+ */
+ HCI_QUIRK_NO_SUSPEND_NOTIFIER,
+
+ /*
+ * When this quirk is set, LE tx power is not queried on startup
+ * and the min/max tx power values default to HCI_TX_POWER_INVALID.
+ *
+ * This quirk can be set before hci_register_dev is called or
+ * during the hdev->setup vendor callback.
+ */
+ HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER,
+
+ /* When this quirk is set, HCI_OP_SET_EVENT_FLT requests with
+ * HCI_FLT_CLEAR_ALL are ignored and event filtering is
+ * completely avoided. A subset of the CSR controller
+ * clones struggle with this and instantly lock up.
+ *
+ * Note that devices using this must (separately) disable
+ * runtime suspend, because event filtering takes place there.
+ */
+ HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL,
+
+ /*
+ * When this quirk is set, disables the use of
+ * HCI_OP_ENHANCED_SETUP_SYNC_CONN command to setup SCO connections.
+ *
+ * This quirk can be set before hci_register_dev is called or
+ * during the hdev->setup vendor callback.
+ */
+ HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN,
+
+ /*
+ * When this quirk is set, the HCI_OP_LE_SET_EXT_SCAN_ENABLE command is
+ * disabled. This is required for some Broadcom controllers which
+ * erroneously claim to support extended scanning.
+ *
+ * This quirk can be set before hci_register_dev is called or
+ * during the hdev->setup vendor callback.
+ */
+ HCI_QUIRK_BROKEN_EXT_SCAN,
+
+ /*
+ * When this quirk is set, the HCI_OP_GET_MWS_TRANSPORT_CONFIG command is
+ * disabled. This is required for some Broadcom controllers which
+ * erroneously claim to support MWS Transport Layer Configuration.
+ *
+ * This quirk can be set before hci_register_dev is called or
+ * during the hdev->setup vendor callback.
+ */
+ HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG,
+
+ /* When this quirk is set, max_page for local extended features
+ * is set to 1, even if controller reports higher number. Some
+ * controllers (e.g. RTL8723CS) report more pages, but they
+ * don't actually support features declared there.
+ */
+ HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2,
+
+ /*
+ * When this quirk is set, the HCI_OP_LE_SET_RPA_TIMEOUT command is
+ * skipped during initialization. This is required for the Actions
+ * Semiconductor ATS2851 based controllers, which erroneously claim
+ * to support it.
+ */
+ HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT,
+
+ /*
+ * When this quirk is set, the HCI_OP_LE_EXT_CREATE_CONN command is
+ * disabled. This is required for the Actions Semiconductor ATS2851
+ * based controllers, which erroneously claim to support it.
+ */
+ HCI_QUIRK_BROKEN_EXT_CREATE_CONN,
+
+ /*
+ * When this quirk is set, the command WRITE_AUTH_PAYLOAD_TIMEOUT is
+ * skipped. This is required for the Actions Semiconductor ATS2851
+ * based controllers, due to a race condition in the pairing process.
+ */
+ HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT,
+
+ /* When this quirk is set, MSFT extension monitor tracking by
+ * address filter is supported. Since tracking quantity of each
+ * pattern is limited, this feature supports tracking multiple
+ * devices concurrently if controller supports multiple
+ * address filters.
+ *
+ * This quirk must be set before hci_register_dev is called.
+ */
+ HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER,
+
+ /*
+ * When this quirk is set, LE Coded PHY shall not be used. This is
+ * required for some Intel controllers which erroneously claim to
+ * support it, but it causes problems with extended scanning.
+ *
+ * This quirk can be set before hci_register_dev is called or
+ * during the hdev->setup vendor callback.
+ */
+ HCI_QUIRK_BROKEN_LE_CODED,
+
+ /*
+ * When this quirk is set, the HCI_OP_READ_ENC_KEY_SIZE command is
+ * skipped during an HCI_EV_ENCRYPT_CHANGE event. This is required
+ * for Actions Semiconductor ATS2851 based controllers, which erroneously
+ * claim to support it.
+ */
+ HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE,
+
+ /*
+ * When this quirk is set, the reserved bits of Primary/Secondary_PHY
+ * inside the LE Extended Advertising Report events are discarded.
+ * This is required for some Apple/Broadcom controllers which
+ * abuse these reserved bits for unrelated flags.
+ *
+ * This quirk can be set before hci_register_dev is called or
+ * during the hdev->setup vendor callback.
+ */
+ HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY,
+
+ /* When this quirk is set, the HCI_OP_READ_VOICE_SETTING command is
+ * skipped. This is required for a subset of the CSR controller clones
+ * which erroneously claim to support it.
+ *
+ * This quirk must be set before hci_register_dev is called.
+ */
+ HCI_QUIRK_BROKEN_READ_VOICE_SETTING,
+
+ /* When this quirk is set, the HCI_OP_READ_PAGE_SCAN_TYPE command is
+ * skipped. This is required for a subset of the CSR controller clones
+ * which erroneously claim to support it.
+ *
+ * This quirk must be set before hci_register_dev is called.
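A pattern worth calling out in the descriptions above: each quirk is one bit in hdev->quirks, a plain unsigned long bitmap, so set_bit() is the entire mechanism, invoked either before hci_register_dev() or from the hdev->setup vendor callback, whichever the comment allows. A minimal sketch; my_setup and my_register are hypothetical driver functions, not part of this patch:

static int my_setup(struct hci_dev *hdev)
{
	/* Known only after firmware load: this controller lies about
	 * extended scanning, so mask the feature off. */
	set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks);
	return 0;
}

static int my_register(struct hci_dev *hdev)
{
	hdev->setup = my_setup;

	/* "Must be set before hci_register_dev" quirks go here. */
	set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);

	return hci_register_dev(hdev);
}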
+ */ + HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, }; /* HCI device flags */ @@ -279,6 +418,8 @@ enum { enum { HCI_SETUP, HCI_CONFIG, + HCI_DEBUGFS_CREATED, + HCI_POWERING_DOWN, HCI_AUTO_OFF, HCI_RFKILLED, HCI_MGMT, @@ -291,6 +432,7 @@ enum { HCI_USER_CHANNEL, HCI_EXT_CONFIGURED, HCI_LE_ADV, + HCI_LE_PER_ADV, HCI_LE_SCAN, HCI_SSP_ENABLED, HCI_SC_ENABLED, @@ -299,7 +441,6 @@ enum { HCI_LIMITED_PRIVACY, HCI_RPA_EXPIRED, HCI_RPA_RESOLVING, - HCI_HS_ENABLED, HCI_LE_ENABLED, HCI_ADVERTISING, HCI_ADVERTISING_CONNECTABLE, @@ -312,15 +453,25 @@ enum { HCI_BREDR_ENABLED, HCI_LE_SCAN_INTERRUPTED, HCI_WIDEBAND_SPEECH_ENABLED, + HCI_EVENT_FILTER_CONFIGURED, + HCI_PA_SYNC, + HCI_SCO_FLOWCTL, HCI_DUT_MODE, HCI_VENDOR_DIAG, HCI_FORCE_BREDR_SMP, HCI_FORCE_STATIC_ADDR, HCI_LL_RPA_RESOLUTION, - HCI_ENABLE_LL_PRIVACY, HCI_CMD_PENDING, HCI_FORCE_NO_MITM, + HCI_QUALITY_REPORT, + HCI_OFFLOAD_CODECS_ENABLED, + HCI_LE_SIMULTANEOUS_ROLES, + HCI_CMD_DRAIN_WORKQUEUE, + + HCI_MESH_EXPERIMENTAL, + HCI_MESH, + HCI_MESH_SENDING, __HCI_NUM_FLAGS, }; @@ -330,11 +481,11 @@ enum { #define HCI_PAIRING_TIMEOUT msecs_to_jiffies(60000) /* 60 seconds */ #define HCI_INIT_TIMEOUT msecs_to_jiffies(10000) /* 10 seconds */ #define HCI_CMD_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ +#define HCI_NCMD_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ #define HCI_ACL_TX_TIMEOUT msecs_to_jiffies(45000) /* 45 seconds */ #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ -#define HCI_POWER_OFF_TIMEOUT msecs_to_jiffies(5000) /* 5 seconds */ +#define HCI_ACL_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ -#define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 @@ -343,6 +494,7 @@ enum { #define HCI_EVENT_PKT 0x04 #define HCI_ISODATA_PKT 0x05 #define HCI_DIAG_PKT 0xf0 +#define HCI_DRV_PKT 0xf1 #define HCI_VENDOR_PKT 0xff /* HCI packet types */ @@ -406,8 +558,8 @@ enum { #define ESCO_LINK 0x02 /* Low Energy links do not have defined link type. 
Use invented one */ #define LE_LINK 0x80 -#define AMP_LINK 0x81 -#define ISO_LINK 0x82 +#define CIS_LINK 0x82 +#define BIS_LINK 0x83 #define INVALID_LINK 0xff /* LMP features */ @@ -455,6 +607,7 @@ enum { #define LMP_EXT_INQ 0x01 #define LMP_SIMUL_LE_BR 0x02 #define LMP_SIMPLE_PAIR 0x08 +#define LMP_ERR_DATA_REPORTING 0x20 #define LMP_NO_FLUSH 0x40 #define LMP_LSTO 0x01 @@ -462,10 +615,10 @@ enum { #define LMP_EXTFEATURES 0x80 /* Extended LMP features */ -#define LMP_CSB_MASTER 0x01 -#define LMP_CSB_SLAVE 0x02 -#define LMP_SYNC_TRAIN 0x04 -#define LMP_SYNC_SCAN 0x08 +#define LMP_CPB_CENTRAL 0x01 +#define LMP_CPB_PERIPHERAL 0x02 +#define LMP_SYNC_TRAIN 0x04 +#define LMP_SYNC_SCAN 0x08 #define LMP_SC 0x01 #define LMP_PING 0x02 @@ -479,7 +632,7 @@ enum { /* LE features */ #define HCI_LE_ENCRYPTION 0x01 #define HCI_LE_CONN_PARAM_REQ_PROC 0x02 -#define HCI_LE_SLAVE_FEATURES 0x08 +#define HCI_LE_PERIPHERAL_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 #define HCI_LE_LL_PRIVACY 0x40 @@ -487,9 +640,12 @@ enum { #define HCI_LE_PHY_2M 0x01 #define HCI_LE_PHY_CODED 0x08 #define HCI_LE_EXT_ADV 0x10 +#define HCI_LE_PERIODIC_ADV 0x20 #define HCI_LE_CHAN_SEL_ALG2 0x40 -#define HCI_LE_CIS_MASTER 0x10 -#define HCI_LE_CIS_SLAVE 0x20 +#define HCI_LE_CIS_CENTRAL 0x10 +#define HCI_LE_CIS_PERIPHERAL 0x20 +#define HCI_LE_ISO_BROADCASTER 0x40 +#define HCI_LE_ISO_SYNC_RECEIVER 0x80 /* Connection modes */ #define HCI_CM_ACTIVE 0x0000 @@ -544,16 +700,20 @@ enum { #define HCI_ERROR_PIN_OR_KEY_MISSING 0x06 #define HCI_ERROR_MEMORY_EXCEEDED 0x07 #define HCI_ERROR_CONNECTION_TIMEOUT 0x08 +#define HCI_ERROR_COMMAND_DISALLOWED 0x0c #define HCI_ERROR_REJ_LIMITED_RESOURCES 0x0d #define HCI_ERROR_REJ_BAD_ADDR 0x0f +#define HCI_ERROR_INVALID_PARAMETERS 0x12 #define HCI_ERROR_REMOTE_USER_TERM 0x13 #define HCI_ERROR_REMOTE_LOW_RESOURCES 0x14 #define HCI_ERROR_REMOTE_POWER_OFF 0x15 #define HCI_ERROR_LOCAL_HOST_TERM 0x16 #define HCI_ERROR_PAIRING_NOT_ALLOWED 0x18 +#define HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE 0x1a #define HCI_ERROR_INVALID_LL_PARAMS 0x1e #define HCI_ERROR_UNSPECIFIED 0x1f #define HCI_ERROR_ADVERTISING_TIMEOUT 0x3c +#define HCI_ERROR_CANCELLED_BY_HOST 0x44 /* Flow control modes */ #define HCI_FLOW_CTL_MODE_PACKET_BASED 0x00 @@ -563,6 +723,9 @@ enum { #define HCI_TX_POWER_INVALID 127 #define HCI_RSSI_INVALID 127 +#define HCI_SYNC_HANDLE_INVALID 0xffff +#define HCI_SID_INVALID 0xff + #define HCI_ROLE_MASTER 0x00 #define HCI_ROLE_SLAVE 0x01 @@ -582,6 +745,7 @@ enum { #define EIR_SSP_RAND_R192 0x0F /* Simple Pairing Randomizer R-192 */ #define EIR_DEVICE_ID 0x10 /* device ID */ #define EIR_APPEARANCE 0x19 /* Device appearance */ +#define EIR_SERVICE_DATA 0x16 /* Service Data */ #define EIR_LE_BDADDR 0x1B /* LE Bluetooth device address */ #define EIR_LE_ROLE 0x1C /* LE role */ #define EIR_SSP_HASH_C256 0x1D /* Simple Pairing Hash C-256 */ @@ -717,6 +881,11 @@ struct hci_cp_remote_name_req_cancel { bdaddr_t bdaddr; } __packed; +struct hci_rp_remote_name_req_cancel { + __u8 status; + bdaddr_t bdaddr; +} __packed; + #define HCI_OP_READ_REMOTE_FEATURES 0x041b struct hci_cp_read_remote_features { __le16 handle; @@ -811,54 +980,38 @@ struct hci_cp_io_capability_neg_reply { __u8 reason; } __packed; -#define HCI_OP_CREATE_PHY_LINK 0x0435 -struct hci_cp_create_phy_link { - __u8 phy_handle; - __u8 key_len; - __u8 key_type; - __u8 key[HCI_AMP_LINK_KEY_SIZE]; -} __packed; - -#define HCI_OP_ACCEPT_PHY_LINK 0x0436 -struct hci_cp_accept_phy_link { - __u8 phy_handle; - __u8 key_len; - __u8 key_type; - __u8 
key[HCI_AMP_LINK_KEY_SIZE]; -} __packed; - -#define HCI_OP_DISCONN_PHY_LINK 0x0437 -struct hci_cp_disconn_phy_link { - __u8 phy_handle; - __u8 reason; -} __packed; - -struct ext_flow_spec { - __u8 id; - __u8 stype; - __le16 msdu; - __le32 sdu_itime; - __le32 acc_lat; - __le32 flush_to; +#define HCI_OP_ENHANCED_SETUP_SYNC_CONN 0x043d +struct hci_coding_format { + __u8 id; + __le16 cid; + __le16 vid; } __packed; -#define HCI_OP_CREATE_LOGICAL_LINK 0x0438 -#define HCI_OP_ACCEPT_LOGICAL_LINK 0x0439 -struct hci_cp_create_accept_logical_link { - __u8 phy_handle; - struct ext_flow_spec tx_flow_spec; - struct ext_flow_spec rx_flow_spec; -} __packed; - -#define HCI_OP_DISCONN_LOGICAL_LINK 0x043a -struct hci_cp_disconn_logical_link { - __le16 log_handle; -} __packed; - -#define HCI_OP_LOGICAL_LINK_CANCEL 0x043b -struct hci_cp_logical_link_cancel { - __u8 phy_handle; - __u8 flow_spec_id; +struct hci_cp_enhanced_setup_sync_conn { + __le16 handle; + __le32 tx_bandwidth; + __le32 rx_bandwidth; + struct hci_coding_format tx_coding_format; + struct hci_coding_format rx_coding_format; + __le16 tx_codec_frame_size; + __le16 rx_codec_frame_size; + __le32 in_bandwidth; + __le32 out_bandwidth; + struct hci_coding_format in_coding_format; + struct hci_coding_format out_coding_format; + __le16 in_coded_data_size; + __le16 out_coded_data_size; + __u8 in_pcm_data_format; + __u8 out_pcm_data_format; + __u8 in_pcm_sample_payload_msb_pos; + __u8 out_pcm_sample_payload_msb_pos; + __u8 in_data_path; + __u8 out_data_path; + __u8 in_transport_unit_size; + __u8 out_transport_unit_size; + __le16 max_latency; + __le16 pkt_type; + __u8 retrans_effort; } __packed; struct hci_rp_logical_link_cancel { @@ -867,17 +1020,17 @@ struct hci_rp_logical_link_cancel { __u8 flow_spec_id; } __packed; -#define HCI_OP_SET_CSB 0x0441 -struct hci_cp_set_csb { +#define HCI_OP_SET_CPB 0x0441 +struct hci_cp_set_cpb { __u8 enable; __u8 lt_addr; __u8 lpo_allowed; __le16 packet_type; __le16 interval_min; __le16 interval_max; - __le16 csb_sv_tout; + __le16 cpb_sv_tout; } __packed; -struct hci_rp_set_csb { +struct hci_rp_set_cpb { __u8 status; __u8 lt_addr; __le16 interval; @@ -1000,8 +1153,8 @@ struct hci_cp_read_stored_link_key { } __packed; struct hci_rp_read_stored_link_key { __u8 status; - __u8 max_keys; - __u8 num_keys; + __le16 max_keys; + __le16 num_keys; } __packed; #define HCI_OP_DELETE_STORED_LINK_KEY 0x0c12 @@ -1011,7 +1164,7 @@ struct hci_cp_delete_stored_link_key { } __packed; struct hci_rp_delete_stored_link_key { __u8 status; - __u8 num_keys; + __le16 num_keys; } __packed; #define HCI_MAX_NAME_LENGTH 248 @@ -1174,14 +1327,14 @@ struct hci_rp_delete_reserved_lt_addr { __u8 lt_addr; } __packed; -#define HCI_OP_SET_CSB_DATA 0x0c76 -struct hci_cp_set_csb_data { +#define HCI_OP_SET_CPB_DATA 0x0c76 +struct hci_cp_set_cpb_data { __u8 lt_addr; __u8 fragment; __u8 data_length; - __u8 data[HCI_MAX_CSB_DATA_SIZE]; + __u8 data[HCI_MAX_CPB_DATA_SIZE]; } __packed; -struct hci_rp_set_csb_data { +struct hci_rp_set_cpb_data { __u8 status; __u8 lt_addr; } __packed; @@ -1240,6 +1393,14 @@ struct hci_rp_read_local_oob_ext_data { __u8 rand256[16]; } __packed; +#define HCI_CONFIGURE_DATA_PATH 0x0c83 +struct hci_op_configure_data_path { + __u8 direction; + __u8 data_path_id; + __u8 vnd_len; + __u8 vnd_data[]; +} __packed; + #define HCI_OP_READ_LOCAL_VERSION 0x1001 struct hci_rp_read_local_version { __u8 status; @@ -1297,6 +1458,28 @@ struct hci_rp_read_data_block_size { } __packed; #define HCI_OP_READ_LOCAL_CODECS 0x100b +struct hci_std_codecs { + __u8 num; 
+ __u8 codec[]; +} __packed; + +struct hci_vnd_codec { + /* company id */ + __le16 cid; + /* vendor codec id */ + __le16 vid; +} __packed; + +struct hci_vnd_codecs { + __u8 num; + struct hci_vnd_codec codec[]; +} __packed; + +struct hci_rp_read_local_supported_codecs { + __u8 status; + struct hci_std_codecs std_codecs; + struct hci_vnd_codecs vnd_codecs; +} __packed; #define HCI_OP_READ_LOCAL_PAIRING_OPTS 0x100c struct hci_rp_read_local_pairing_opts { @@ -1305,6 +1488,53 @@ struct hci_rp_read_local_pairing_opts { __u8 max_key_size; } __packed; +#define HCI_OP_READ_LOCAL_CODECS_V2 0x100d +struct hci_std_codec_v2 { + __u8 id; + __u8 transport; +} __packed; + +struct hci_std_codecs_v2 { + __u8 num; + struct hci_std_codec_v2 codec[]; +} __packed; + +struct hci_vnd_codec_v2 { + __le16 cid; + __le16 vid; + __u8 transport; +} __packed; + +struct hci_vnd_codecs_v2 { + __u8 num; + struct hci_vnd_codec_v2 codec[]; +} __packed; + +struct hci_rp_read_local_supported_codecs_v2 { + __u8 status; + struct hci_std_codecs_v2 std_codecs; + struct hci_vnd_codecs_v2 vendor_codecs; +} __packed; + +#define HCI_OP_READ_LOCAL_CODEC_CAPS 0x100e +struct hci_op_read_local_codec_caps { + __u8 id; + __le16 cid; + __le16 vid; + __u8 transport; + __u8 direction; +} __packed; + +struct hci_codec_caps { + __u8 len; + __u8 data[]; +} __packed; + +struct hci_rp_read_local_codec_caps { + __u8 status; + __u8 num_caps; +} __packed; + #define HCI_OP_READ_PAGE_SCAN_ACTIVITY 0x0c1b struct hci_rp_read_page_scan_activity { __u8 status; @@ -1329,6 +1559,11 @@ struct hci_rp_read_tx_power { __s8 tx_power; } __packed; +#define HCI_OP_WRITE_SYNC_FLOWCTL 0x0c2f +struct hci_cp_write_sync_flowctl { + __u8 enable; +} __packed; + #define HCI_OP_READ_PAGE_SCAN_TYPE 0x0c46 struct hci_rp_read_page_scan_type { __u8 status; @@ -1371,46 +1606,6 @@ struct hci_rp_read_enc_key_size { __u8 key_size; } __packed; -#define HCI_OP_READ_LOCAL_AMP_INFO 0x1409 -struct hci_rp_read_local_amp_info { - __u8 status; - __u8 amp_status; - __le32 total_bw; - __le32 max_bw; - __le32 min_latency; - __le32 max_pdu; - __u8 amp_type; - __le16 pal_cap; - __le16 max_assoc_size; - __le32 max_flush_to; - __le32 be_flush_to; -} __packed; - -#define HCI_OP_READ_LOCAL_AMP_ASSOC 0x140a -struct hci_cp_read_local_amp_assoc { - __u8 phy_handle; - __le16 len_so_far; - __le16 max_len; -} __packed; -struct hci_rp_read_local_amp_assoc { - __u8 status; - __u8 phy_handle; - __le16 rem_len; - __u8 frag[]; -} __packed; - -#define HCI_OP_WRITE_REMOTE_AMP_ASSOC 0x140b -struct hci_cp_write_remote_amp_assoc { - __u8 phy_handle; - __le16 len_so_far; - __le16 rem_len; - __u8 frag[]; -} __packed; -struct hci_rp_write_remote_amp_assoc { - __u8 status; - __u8 phy_handle; -} __packed; - #define HCI_OP_GET_MWS_TRANSPORT_CONFIG 0x140c #define HCI_OP_ENABLE_DUT_MODE 0x1803 @@ -1422,6 +1617,15 @@ struct hci_cp_le_set_event_mask { __u8 mask[8]; } __packed; +/* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 4, Part E + * 7.8.2 LE Read Buffer Size command + * MAX_LE_MTU is 0xffff. + * 0 is also valid. It means that no dedicated LE Buffer exists. + * It should use the HCI_Read_Buffer_Size command and mtu is shared + * between BR/EDR and LE. 
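Restated: a zero le_mtu in the LE_Read_Buffer_Size reply means the controller has no dedicated LE buffers and the host keeps sharing the BR/EDR ACL pool, while a nonzero value below the spec minimum is a controller bug. A sketch of that rule, assuming (beyond what this hunk shows) that the reply struct carries le_mtu and le_max_pkt fields as in the Core Specification:

static int my_parse_le_buffer_size(struct hci_dev *hdev,
				   struct hci_rp_le_read_buffer_size *rp)
{
	u16 mtu = le16_to_cpu(rp->le_mtu);

	/* 0: no dedicated LE buffers, fall back to the ACL pool. */
	if (!mtu)
		return 0;

	/* Dedicated buffers must still meet the spec minimum. */
	if (mtu < HCI_MIN_LE_MTU)
		return -EINVAL;

	hdev->le_mtu = mtu;
	hdev->le_pkts = rp->le_max_pkt;

	return 0;
}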
+ */ +#define HCI_MIN_LE_MTU 0x001b + #define HCI_OP_LE_READ_BUFFER_SIZE 0x2002 struct hci_rp_le_read_buffer_size { __u8 status; @@ -1495,7 +1699,7 @@ struct hci_cp_le_set_scan_enable { } __packed; #define HCI_LE_USE_PEER_ADDR 0x00 -#define HCI_LE_USE_WHITELIST 0x01 +#define HCI_LE_USE_ACCEPT_LIST 0x01 #define HCI_OP_LE_CREATE_CONN 0x200d struct hci_cp_le_create_conn { @@ -1515,22 +1719,22 @@ struct hci_cp_le_create_conn { #define HCI_OP_LE_CREATE_CONN_CANCEL 0x200e -#define HCI_OP_LE_READ_WHITE_LIST_SIZE 0x200f -struct hci_rp_le_read_white_list_size { +#define HCI_OP_LE_READ_ACCEPT_LIST_SIZE 0x200f +struct hci_rp_le_read_accept_list_size { __u8 status; __u8 size; } __packed; -#define HCI_OP_LE_CLEAR_WHITE_LIST 0x2010 +#define HCI_OP_LE_CLEAR_ACCEPT_LIST 0x2010 -#define HCI_OP_LE_ADD_TO_WHITE_LIST 0x2011 -struct hci_cp_le_add_to_white_list { +#define HCI_OP_LE_ADD_TO_ACCEPT_LIST 0x2011 +struct hci_cp_le_add_to_accept_list { __u8 bdaddr_type; bdaddr_t bdaddr; } __packed; -#define HCI_OP_LE_DEL_FROM_WHITE_LIST 0x2012 -struct hci_cp_le_del_from_white_list { +#define HCI_OP_LE_DEL_FROM_ACCEPT_LIST 0x2012 +struct hci_cp_le_del_from_accept_list { __u8 bdaddr_type; bdaddr_t bdaddr; } __packed; @@ -1718,6 +1922,24 @@ struct hci_cp_le_ext_conn_param { __le16 max_ce_len; } __packed; +#define HCI_OP_LE_PA_CREATE_SYNC 0x2044 +struct hci_cp_le_pa_create_sync { + __u8 options; + __u8 sid; + __u8 addr_type; + bdaddr_t addr; + __le16 skip; + __le16 sync_timeout; + __u8 sync_cte_type; +} __packed; + +#define HCI_OP_LE_PA_CREATE_SYNC_CANCEL 0x2045 + +#define HCI_OP_LE_PA_TERM_SYNC 0x2046 +struct hci_cp_le_pa_term_sync { + __le16 handle; +} __packed; + #define HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS 0x203b struct hci_rp_le_read_num_supported_adv_sets { __u8 status; @@ -1752,26 +1974,21 @@ struct hci_rp_le_set_ext_adv_params { __u8 tx_power; } __packed; -#define HCI_OP_LE_SET_EXT_ADV_ENABLE 0x2039 -struct hci_cp_le_set_ext_adv_enable { - __u8 enable; - __u8 num_of_sets; - __u8 data[]; -} __packed; - struct hci_cp_ext_adv_set { __u8 handle; __le16 duration; __u8 max_events; } __packed; +#define HCI_MAX_EXT_AD_LENGTH 251 + #define HCI_OP_LE_SET_EXT_ADV_DATA 0x2037 struct hci_cp_le_set_ext_adv_data { __u8 handle; __u8 operation; __u8 frag_pref; __u8 length; - __u8 data[HCI_MAX_AD_LENGTH]; + __u8 data[] __counted_by(length); } __packed; #define HCI_OP_LE_SET_EXT_SCAN_RSP_DATA 0x2038 @@ -1780,7 +1997,39 @@ struct hci_cp_le_set_ext_scan_rsp_data { __u8 operation; __u8 frag_pref; __u8 length; - __u8 data[HCI_MAX_AD_LENGTH]; + __u8 data[] __counted_by(length); +} __packed; + +#define HCI_OP_LE_SET_EXT_ADV_ENABLE 0x2039 +struct hci_cp_le_set_ext_adv_enable { + __u8 enable; + __u8 num_of_sets; + __u8 data[]; +} __packed; + +#define HCI_OP_LE_SET_PER_ADV_PARAMS 0x203e +struct hci_cp_le_set_per_adv_params { + __u8 handle; + __le16 min_interval; + __le16 max_interval; + __le16 periodic_properties; +} __packed; + +#define HCI_MAX_PER_AD_LENGTH 252 +#define HCI_MAX_PER_AD_TOT_LEN 1650 + +#define HCI_OP_LE_SET_PER_ADV_DATA 0x203f +struct hci_cp_le_set_per_adv_data { + __u8 handle; + __u8 operation; + __u8 length; + __u8 data[] __counted_by(length); +} __packed; + +#define HCI_OP_LE_SET_PER_ADV_ENABLE 0x2040 +struct hci_cp_le_set_per_adv_enable { + __u8 enable; + __u8 handle; } __packed; #define LE_SET_ADV_DATA_OP_COMPLETE 0x03 @@ -1797,6 +2046,23 @@ struct hci_cp_le_set_adv_set_rand_addr { bdaddr_t bdaddr; } __packed; +#define HCI_OP_LE_READ_TRANSMIT_POWER 0x204b +struct hci_rp_le_read_transmit_power { + __u8 status; + __s8 
min_le_tx_power; + __s8 max_le_tx_power; +} __packed; + +#define HCI_NETWORK_PRIVACY 0x00 +#define HCI_DEVICE_PRIVACY 0x01 + +#define HCI_OP_LE_SET_PRIVACY_MODE 0x204e +struct hci_cp_le_set_privacy_mode { + __u8 bdaddr_type; + bdaddr_t bdaddr; + __u8 mode; +} __packed; + #define HCI_OP_LE_READ_BUFFER_SIZE_V2 0x2060 struct hci_rp_le_read_buffer_size_v2 { __u8 status; @@ -1822,25 +2088,25 @@ struct hci_rp_le_read_iso_tx_sync { #define HCI_OP_LE_SET_CIG_PARAMS 0x2062 struct hci_cis_params { __u8 cis_id; - __le16 m_sdu; - __le16 s_sdu; - __u8 m_phy; - __u8 s_phy; - __u8 m_rtn; - __u8 s_rtn; + __le16 c_sdu; + __le16 p_sdu; + __u8 c_phy; + __u8 p_phy; + __u8 c_rtn; + __u8 p_rtn; } __packed; struct hci_cp_le_set_cig_params { __u8 cig_id; - __u8 m_interval[3]; - __u8 s_interval[3]; + __u8 c_interval[3]; + __u8 p_interval[3]; __u8 sca; __u8 packing; __u8 framing; - __le16 m_latency; - __le16 s_latency; + __le16 c_latency; + __le16 p_latency; __u8 num_cis; - struct hci_cis_params cis[]; + struct hci_cis_params cis[] __counted_by(num_cis); } __packed; struct hci_rp_le_set_cig_params { @@ -1858,7 +2124,7 @@ struct hci_cis { struct hci_cp_le_create_cis { __u8 num_cis; - struct hci_cis cis[]; + struct hci_cis cis[] __counted_by(num_cis); } __packed; #define HCI_OP_LE_REMOVE_CIG 0x2065 @@ -1877,7 +2143,78 @@ struct hci_cp_le_reject_cis { __u8 reason; } __packed; +#define HCI_OP_LE_CREATE_BIG 0x2068 +struct hci_bis { + __u8 sdu_interval[3]; + __le16 sdu; + __le16 latency; + __u8 rtn; + __u8 phy; + __u8 packing; + __u8 framing; + __u8 encryption; + __u8 bcode[16]; +} __packed; + +struct hci_cp_le_create_big { + __u8 handle; + __u8 adv_handle; + __u8 num_bis; + struct hci_bis bis; +} __packed; + +#define HCI_OP_LE_TERM_BIG 0x206a +struct hci_cp_le_term_big { + __u8 handle; + __u8 reason; +} __packed; + +#define HCI_OP_LE_BIG_CREATE_SYNC 0x206b +struct hci_cp_le_big_create_sync { + __u8 handle; + __le16 sync_handle; + __u8 encryption; + __u8 bcode[16]; + __u8 mse; + __le16 timeout; + __u8 num_bis; + __u8 bis[] __counted_by(num_bis); +} __packed; + +#define HCI_OP_LE_BIG_TERM_SYNC 0x206c +struct hci_cp_le_big_term_sync { + __u8 handle; +} __packed; + +#define HCI_OP_LE_SETUP_ISO_PATH 0x206e +struct hci_cp_le_setup_iso_path { + __le16 handle; + __u8 direction; + __u8 path; + __u8 codec; + __le16 codec_cid; + __le16 codec_vid; + __u8 delay[3]; + __u8 codec_cfg_len; + __u8 codec_cfg[]; +} __packed; + +struct hci_rp_le_setup_iso_path { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_SET_HOST_FEATURE 0x2074 +struct hci_cp_le_set_host_feature { + __u8 bit_number; + __u8 bit_value; +} __packed; + /* ---- HCI Events ---- */ +struct hci_ev_status { + __u8 status; +} __packed; + #define HCI_EV_INQUIRY_COMPLETE 0x01 #define HCI_EV_INQUIRY_RESULT 0x02 @@ -1890,6 +2227,11 @@ struct inquiry_info { __le16 clock_offset; } __packed; +struct hci_ev_inquiry_result { + __u8 num; + struct inquiry_info info[]; +}; + #define HCI_EV_CONN_COMPLETE 0x03 struct hci_ev_conn_complete { __u8 status; @@ -2001,7 +2343,7 @@ struct hci_comp_pkts_info { } __packed; struct hci_ev_num_comp_pkts { - __u8 num_hndl; + __u8 num; struct hci_comp_pkts_info handles[]; } __packed; @@ -2051,7 +2393,7 @@ struct hci_ev_pscan_rep_mode { } __packed; #define HCI_EV_INQUIRY_RESULT_WITH_RSSI 0x22 -struct inquiry_info_with_rssi { +struct inquiry_info_rssi { bdaddr_t bdaddr; __u8 pscan_rep_mode; __u8 pscan_period_mode; @@ -2059,7 +2401,7 @@ struct inquiry_info_with_rssi { __le16 clock_offset; __s8 rssi; } __packed; -struct 
inquiry_info_with_rssi_and_pscan_mode { +struct inquiry_info_rssi_pscan { bdaddr_t bdaddr; __u8 pscan_rep_mode; __u8 pscan_period_mode; @@ -2068,6 +2410,10 @@ struct inquiry_info_with_rssi_and_pscan_mode { __le16 clock_offset; __s8 rssi; } __packed; +struct hci_ev_inquiry_result_rssi { + __u8 num; + __u8 data[]; +} __packed; #define HCI_EV_REMOTE_EXT_FEATURES 0x23 struct hci_ev_remote_ext_features { @@ -2122,6 +2468,11 @@ struct extended_inquiry_info { __u8 data[240]; } __packed; +struct hci_ev_ext_inquiry_result { + __u8 num; + struct extended_inquiry_info info[]; +} __packed; + #define HCI_EV_KEY_REFRESH_COMPLETE 0x30 struct hci_ev_key_refresh_complete { __u8 status; @@ -2243,7 +2594,7 @@ struct hci_ev_sync_train_complete { __u8 status; } __packed; -#define HCI_EV_SLAVE_PAGE_RESP_TIMEOUT 0x54 +#define HCI_EV_PERIPHERAL_PAGE_RESP_TIMEOUT 0x54 #define HCI_EV_LE_CONN_COMPLETE 0x01 struct hci_ev_le_conn_complete { @@ -2281,6 +2632,7 @@ struct hci_ev_le_conn_complete { #define LE_EXT_ADV_DIRECT_IND 0x0004 #define LE_EXT_ADV_SCAN_RSP 0x0008 #define LE_EXT_ADV_LEGACY_PDU 0x0010 +#define LE_EXT_ADV_EVT_TYPE_MASK 0x007f #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 @@ -2289,13 +2641,18 @@ struct hci_ev_le_conn_complete { #define HCI_EV_LE_ADVERTISING_REPORT 0x02 struct hci_ev_le_advertising_info { - __u8 evt_type; + __u8 type; __u8 bdaddr_type; bdaddr_t bdaddr; __u8 length; __u8 data[]; } __packed; +struct hci_ev_le_advertising_report { + __u8 num; + struct hci_ev_le_advertising_info info[]; +} __packed; + #define HCI_EV_LE_CONN_UPDATE_COMPLETE 0x03 struct hci_ev_le_conn_update_complete { __u8 status; @@ -2339,7 +2696,7 @@ struct hci_ev_le_data_len_change { #define HCI_EV_LE_DIRECT_ADV_REPORT 0x0B struct hci_ev_le_direct_adv_info { - __u8 evt_type; + __u8 type; __u8 bdaddr_type; bdaddr_t bdaddr; __u8 direct_addr_type; @@ -2347,6 +2704,11 @@ struct hci_ev_le_direct_adv_info { __s8 rssi; } __packed; +struct hci_ev_le_direct_adv_report { + __u8 num; + struct hci_ev_le_direct_adv_info info[]; +} __packed; + #define HCI_EV_LE_PHY_UPDATE_COMPLETE 0x0c struct hci_ev_le_phy_update_complete { __u8 status; @@ -2356,8 +2718,8 @@ struct hci_ev_le_phy_update_complete { } __packed; #define HCI_EV_LE_EXT_ADV_REPORT 0x0d -struct hci_ev_le_ext_adv_report { - __le16 evt_type; +struct hci_ev_le_ext_adv_info { + __le16 type; __u8 bdaddr_type; bdaddr_t bdaddr; __u8 primary_phy; @@ -2365,11 +2727,28 @@ struct hci_ev_le_ext_adv_report { __u8 sid; __u8 tx_power; __s8 rssi; - __le16 interval; - __u8 direct_addr_type; + __le16 interval; + __u8 direct_addr_type; bdaddr_t direct_addr; - __u8 length; - __u8 data[]; + __u8 length; + __u8 data[]; +} __packed; + +struct hci_ev_le_ext_adv_report { + __u8 num; + struct hci_ev_le_ext_adv_info info[]; +} __packed; + +#define HCI_EV_LE_PA_SYNC_ESTABLISHED 0x0e +struct hci_ev_le_pa_sync_established { + __u8 status; + __le16 handle; + __u8 sid; + __u8 bdaddr_type; + bdaddr_t bdaddr; + __u8 phy; + __le16 interval; + __u8 clock_accuracy; } __packed; #define HCI_EV_LE_ENHANCED_CONN_COMPLETE 0x0a @@ -2387,6 +2766,21 @@ struct hci_ev_le_enh_conn_complete { __u8 clk_accurancy; } __packed; +#define HCI_EV_LE_PER_ADV_REPORT 0x0f +struct hci_ev_le_per_adv_report { + __le16 sync_handle; + __u8 tx_power; + __u8 rssi; + __u8 cte_type; + __u8 data_status; + __u8 length; + __u8 data[]; +} __packed; + +#define LE_PA_DATA_COMPLETE 0x00 +#define LE_PA_DATA_MORE_TO_COME 0x01 +#define LE_PA_DATA_TRUNCATED 0x02 + #define HCI_EV_LE_EXT_ADV_SET_TERM 0x12 struct hci_evt_le_ext_adv_set_term 
{ __u8 status; @@ -2401,17 +2795,17 @@ struct hci_evt_le_cis_established { __le16 handle; __u8 cig_sync_delay[3]; __u8 cis_sync_delay[3]; - __u8 m_latency[3]; - __u8 s_latency[3]; - __u8 m_phy; - __u8 s_phy; + __u8 c_latency[3]; + __u8 p_latency[3]; + __u8 c_phy; + __u8 p_phy; __u8 nse; - __u8 m_bn; - __u8 s_bn; - __u8 m_ft; - __u8 s_ft; - __le16 m_mtu; - __le16 s_mtu; + __u8 c_bn; + __u8 p_bn; + __u8 c_ft; + __u8 p_ft; + __le16 c_mtu; + __le16 p_mtu; __le16 interval; } __packed; @@ -2423,6 +2817,55 @@ struct hci_evt_le_cis_req { __u8 cis_id; } __packed; +#define HCI_EVT_LE_CREATE_BIG_COMPLETE 0x1b +struct hci_evt_le_create_big_complete { + __u8 status; + __u8 handle; + __u8 sync_delay[3]; + __u8 transport_delay[3]; + __u8 phy; + __u8 nse; + __u8 bn; + __u8 pto; + __u8 irc; + __le16 max_pdu; + __le16 interval; + __u8 num_bis; + __le16 bis_handle[]; +} __packed; + +#define HCI_EVT_LE_BIG_SYNC_ESTABLISHED 0x1d +struct hci_evt_le_big_sync_estabilished { + __u8 status; + __u8 handle; + __u8 latency[3]; + __u8 nse; + __u8 bn; + __u8 pto; + __u8 irc; + __le16 max_pdu; + __le16 interval; + __u8 num_bis; + __le16 bis[]; +} __packed; + +#define HCI_EVT_LE_BIG_INFO_ADV_REPORT 0x22 +struct hci_evt_le_big_info_adv_report { + __le16 sync_handle; + __u8 num_bis; + __u8 nse; + __le16 iso_interval; + __u8 bn; + __u8 pto; + __u8 irc; + __le16 max_pdu; + __u8 sdu_interval[3]; + __le16 max_sdu; + __u8 phy; + __u8 framing; + __u8 encryption; +} __packed; + #define HCI_EV_VENDOR 0xff /* Internal events generated by Bluetooth stack */ @@ -2512,6 +2955,11 @@ static inline struct hci_sco_hdr *hci_sco_hdr(const struct sk_buff *skb) return (struct hci_sco_hdr *) skb->data; } +static inline struct hci_iso_hdr *hci_iso_hdr(const struct sk_buff *skb) +{ + return (struct hci_iso_hdr *)skb->data; +} + /* Command opcode pack/unpack */ #define hci_opcode_pack(ogf, ocf) ((__u16) ((ocf & 0x03ff)|(ogf << 10))) #define hci_opcode_ogf(op) (op >> 10) @@ -2532,6 +2980,9 @@ static inline struct hci_sco_hdr *hci_sco_hdr(const struct sk_buff *skb) #define hci_iso_data_len(h) ((h) & 0x3fff) #define hci_iso_data_flags(h) ((h) >> 14) +/* codec transport types */ +#define HCI_TRANSPORT_SCO_ESCO 0x01 + /* le24 support */ static inline void hci_cpu_to_le24(__u32 val, __u8 dst[3]) { diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 9873e1c8cd16..9fc8f544e20e 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1,6 +1,7 @@ /* BlueZ - Bluetooth protocol stack for Linux Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved. 
+ Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> @@ -28,13 +29,20 @@ #include <linux/idr.h> #include <linux/leds.h> #include <linux/rculist.h> +#include <linux/srcu.h> #include <net/bluetooth/hci.h> +#include <net/bluetooth/hci_drv.h> +#include <net/bluetooth/hci_sync.h> #include <net/bluetooth/hci_sock.h> +#include <net/bluetooth/coredump.h> /* HCI priority */ #define HCI_PRIO_MAX 7 +/* HCI maximum id value */ +#define HCI_MAX_ID 10000 + /* HCI Core structures */ struct inquiry_data { bdaddr_t bdaddr; @@ -77,7 +85,7 @@ struct discovery_state { u8 last_adv_addr_type; s8 last_adv_rssi; u32 last_adv_flags; - u8 last_adv_data[HCI_MAX_AD_LENGTH]; + u8 last_adv_data[HCI_MAX_EXT_AD_LENGTH]; u8 last_adv_data_len; bool report_invalid_rssi; bool result_filtering; @@ -85,8 +93,7 @@ struct discovery_state { s8 rssi; u16 uuid_count; u8 (*uuids)[16]; - unsigned long scan_start; - unsigned long scan_duration; + unsigned long name_resolve_timeout; }; #define SUSPEND_NOTIFIER_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ @@ -105,6 +112,8 @@ enum suspend_tasks { SUSPEND_POWERING_DOWN, SUSPEND_PREPARE_NOTIFIER, + + SUSPEND_SET_ADV_FILTER, __SUSPEND_NUM_TASKS }; @@ -117,10 +126,10 @@ enum suspended_state { struct hci_conn_hash { struct list_head list; unsigned int acl_num; - unsigned int amp_num; unsigned int sco_num; + unsigned int iso_num; unsigned int le_num; - unsigned int le_num_slave; + unsigned int le_num_peripheral; }; struct bdaddr_list { @@ -129,6 +138,17 @@ struct bdaddr_list { u8 bdaddr_type; }; +struct codec_list { + struct list_head list; + u8 id; + __u16 cid; + __u16 vid; + u8 transport; + u8 num_caps; + u32 len; + struct hci_codec_caps caps[]; +}; + struct bdaddr_list_with_irk { struct list_head list; bdaddr_t bdaddr; @@ -137,23 +157,21 @@ struct bdaddr_list_with_irk { u8 local_irk[16]; }; +/* Bitmask of connection flags */ +enum hci_conn_flags { + HCI_CONN_FLAG_REMOTE_WAKEUP = BIT(0), + HCI_CONN_FLAG_DEVICE_PRIVACY = BIT(1), + HCI_CONN_FLAG_ADDRESS_RESOLUTION = BIT(2), +}; +typedef u8 hci_conn_flags_t; + struct bdaddr_list_with_flags { struct list_head list; bdaddr_t bdaddr; u8 bdaddr_type; - u32 current_flags; -}; - -enum hci_conn_flags { - HCI_CONN_FLAG_REMOTE_WAKEUP, - HCI_CONN_FLAG_MAX + hci_conn_flags_t flags; }; -#define hci_conn_test_flag(nr, flags) ((flags) & (1U << nr)) - -/* Make sure number of flags doesn't exceed sizeof(current_flags) */ -static_assert(HCI_CONN_FLAG_MAX < 32); - struct bt_uuid { struct list_head list; u8 uuid[16]; @@ -219,45 +237,101 @@ struct oob_data { struct adv_info { struct list_head list; - bool pending; + bool enabled; + bool pending; + bool periodic; + __u8 mesh; __u8 instance; + __u8 handle; + __u8 sid; __u32 flags; __u16 timeout; __u16 remaining_time; __u16 duration; __u16 adv_data_len; - __u8 adv_data[HCI_MAX_AD_LENGTH]; + __u8 adv_data[HCI_MAX_EXT_AD_LENGTH]; + bool adv_data_changed; __u16 scan_rsp_len; - __u8 scan_rsp_data[HCI_MAX_AD_LENGTH]; + __u8 scan_rsp_data[HCI_MAX_EXT_AD_LENGTH]; + bool scan_rsp_changed; + __u16 per_adv_data_len; + __u8 per_adv_data[HCI_MAX_PER_AD_LENGTH]; __s8 tx_power; + __u32 min_interval; + __u32 max_interval; bdaddr_t random_addr; bool rpa_expired; struct delayed_work rpa_expired_cb; }; +struct tx_queue { + struct sk_buff_head queue; + unsigned int extra; + unsigned int tracked; +}; + #define HCI_MAX_ADV_INSTANCES 5 #define HCI_DEFAULT_ADV_DURATION 2 +#define HCI_ADV_TX_POWER_NO_PREFERENCE 0x7F + +#define DATA_CMP(_d1, _l1, _d2, _l2) \ + (_l1 == _l2 ? 
memcmp(_d1, _d2, _l1) : _l1 - _l2) + +#define ADV_DATA_CMP(_adv, _data, _len) \ + DATA_CMP((_adv)->adv_data, (_adv)->adv_data_len, _data, _len) + +#define SCAN_RSP_CMP(_adv, _data, _len) \ + DATA_CMP((_adv)->scan_rsp_data, (_adv)->scan_rsp_len, _data, _len) + +struct monitored_device { + struct list_head list; + + bdaddr_t bdaddr; + __u8 addr_type; + __u16 handle; + bool notified; +}; + struct adv_pattern { struct list_head list; __u8 ad_type; __u8 offset; __u8 length; - __u8 value[HCI_MAX_AD_LENGTH]; + __u8 value[HCI_MAX_EXT_AD_LENGTH]; +}; + +struct adv_rssi_thresholds { + __s8 low_threshold; + __s8 high_threshold; + __u16 low_threshold_timeout; + __u16 high_threshold_timeout; + __u8 sampling_period; }; struct adv_monitor { struct list_head patterns; - bool active; + struct adv_rssi_thresholds rssi; __u16 handle; + + enum { + ADV_MONITOR_STATE_NOT_REGISTERED, + ADV_MONITOR_STATE_REGISTERED, + ADV_MONITOR_STATE_OFFLOADED + } state; }; #define HCI_MIN_ADV_MONITOR_HANDLE 1 -#define HCI_MAX_ADV_MONITOR_NUM_HANDLES 32 +#define HCI_MAX_ADV_MONITOR_NUM_HANDLES 32 #define HCI_MAX_ADV_MONITOR_NUM_PATTERNS 16 +#define HCI_ADV_MONITOR_EXT_NONE 1 +#define HCI_ADV_MONITOR_EXT_MSFT 2 #define HCI_MAX_SHORT_NAME_LENGTH 10 +#define HCI_CONN_HANDLE_MAX 0x0eff +#define HCI_CONN_HANDLE_UNSET(_handle) (_handle > HCI_CONN_HANDLE_MAX) + /* Min encryption key size to match with SMP */ #define HCI_MIN_ENC_KEY_SIZE 7 @@ -270,25 +344,19 @@ struct adv_monitor { /* Default authenticated payload timeout 30s */ #define DEFAULT_AUTH_PAYLOAD_TIMEOUT 0x0bb8 -struct amp_assoc { - __u16 len; - __u16 offset; - __u16 rem_len; - __u16 len_so_far; - __u8 data[HCI_MAX_AMP_ASSOC_SIZE]; -}; - #define HCI_MAX_PAGES 3 struct hci_dev { struct list_head list; + struct srcu_struct srcu; struct mutex lock; - char name[8]; + struct ida unset_handle_ida; + + const char *name; unsigned long flags; __u16 id; __u8 bus; - __u8 dev_type; bdaddr_t bdaddr; bdaddr_t setup_addr; bdaddr_t public_addr; @@ -305,10 +373,12 @@ struct hci_dev { __u8 max_page; __u8 features[HCI_MAX_PAGES][8]; __u8 le_features[8]; - __u8 le_white_list_size; + __u8 le_accept_list_size; __u8 le_resolv_list_size; __u8 le_num_of_adv_sets; __u8 le_states[8]; + __u8 mesh_ad_types[16]; + __u8 mesh_send_ref; __u8 commands[64]; __u8 hci_ver; __u16 hci_rev; @@ -317,8 +387,8 @@ struct hci_dev { __u16 lmp_subver; __u16 voice_setting; __u8 num_iac; - __u8 stored_max_keys; - __u8 stored_num_keys; + __u16 stored_max_keys; + __u16 stored_num_keys; __u8 io_capability; __s8 inq_tx_power; __u8 err_data_reporting; @@ -361,6 +431,9 @@ struct hci_dev { __u8 ssp_debug_mode; __u8 hw_error_code; __u32 clock; + __u16 advmon_allowlist_duration; + __u16 advmon_no_filter_duration; + __u8 enable_advmon_interleave_scan; __u16 devid_source; __u16 devid_vendor; @@ -377,6 +450,8 @@ struct hci_dev { __u16 def_page_timeout; __u16 def_multi_adv_rotation_duration; __u16 def_le_autoconnect_timeout; + __s8 min_le_tx_power; + __s8 max_le_tx_power; __u16 pkt_type; __u16 esco_type; @@ -387,21 +462,6 @@ struct hci_dev { __u16 sniff_min_interval; __u16 sniff_max_interval; - __u8 amp_status; - __u32 amp_total_bw; - __u32 amp_max_bw; - __u32 amp_min_latency; - __u32 amp_max_pdu; - __u8 amp_type; - __u16 amp_pal_cap; - __u16 amp_assoc_size; - __u32 amp_max_flush_to; - __u32 amp_be_flush_to; - - struct amp_assoc loc_assoc; - - __u8 flow_ctl_mode; - unsigned int auto_accept_delay; unsigned long quirks; @@ -410,21 +470,18 @@ struct hci_dev { unsigned int acl_cnt; unsigned int sco_cnt; unsigned int le_cnt; + unsigned int 
iso_cnt; unsigned int acl_mtu; unsigned int sco_mtu; unsigned int le_mtu; + unsigned int iso_mtu; unsigned int acl_pkts; unsigned int sco_pkts; unsigned int le_pkts; - - __u16 block_len; - __u16 block_mtu; - __u16 num_blocks; - __u16 block_cnt; + unsigned int iso_pkts; unsigned long acl_last_tx; - unsigned long sco_last_tx; unsigned long le_last_tx; __u8 le_tx_def_phys; @@ -436,6 +493,12 @@ struct hci_dev { struct work_struct power_on; struct delayed_work power_off; struct work_struct error_reset; + struct work_struct cmd_sync_work; + struct list_head cmd_sync_work_list; + struct mutex cmd_sync_work_lock; + struct mutex unregister_lock; + struct work_struct cmd_sync_cancel_work; + struct work_struct reenable_adv_work; __u16 discov_timeout; struct delayed_work discov_off; @@ -443,43 +506,38 @@ struct hci_dev { struct delayed_work service_cache; struct delayed_work cmd_timer; + struct delayed_work ncmd_timer; struct work_struct rx_work; struct work_struct cmd_work; struct work_struct tx_work; - struct work_struct discov_update; - struct work_struct bg_scan_update; - struct work_struct scan_update; - struct work_struct connectable_update; - struct work_struct discoverable_update; struct delayed_work le_scan_disable; - struct delayed_work le_scan_restart; struct sk_buff_head rx_q; struct sk_buff_head raw_q; struct sk_buff_head cmd_q; struct sk_buff *sent_cmd; + struct sk_buff *recv_event; struct mutex req_lock; wait_queue_head_t req_wait_q; __u32 req_status; __u32 req_result; struct sk_buff *req_skb; + struct sk_buff *req_rsp; void *smp_data; void *smp_bredr_data; struct discovery_state discovery; - int discovery_old_state; bool discovery_paused; int advertising_old_state; bool advertising_paused; struct notifier_block suspend_notifier; - struct work_struct suspend_prepare; enum suspended_state suspend_state_next; enum suspended_state suspend_state; bool scanning_paused; @@ -488,25 +546,25 @@ struct hci_dev { bdaddr_t wake_addr; u8 wake_addr_type; - wait_queue_head_t suspend_wait_q; - DECLARE_BITMAP(suspend_tasks, __SUSPEND_NUM_TASKS); - struct hci_conn_hash conn_hash; + struct list_head mesh_pending; + struct mutex mgmt_pending_lock; struct list_head mgmt_pending; - struct list_head blacklist; - struct list_head whitelist; + struct list_head reject_list; + struct list_head accept_list; struct list_head uuids; struct list_head link_keys; struct list_head long_term_keys; struct list_head identity_resolving_keys; struct list_head remote_oob_data; - struct list_head le_white_list; + struct list_head le_accept_list; struct list_head le_resolv_list; struct list_head le_conn_params; struct list_head pend_le_conns; struct list_head pend_le_reports; struct list_head blocked_keys; + struct list_head local_codecs; struct hci_dev_stats stat; @@ -516,17 +574,22 @@ struct hci_dev { const char *fw_info; struct dentry *debugfs; + struct hci_devcoredump dump; + struct device dev; struct rfkill *rfkill; DECLARE_BITMAP(dev_flags, __HCI_NUM_FLAGS); + hci_conn_flags_t conn_flags; __s8 adv_tx_power; - __u8 adv_data[HCI_MAX_AD_LENGTH]; + __u8 adv_data[HCI_MAX_EXT_AD_LENGTH]; __u8 adv_data_len; - __u8 scan_rsp_data[HCI_MAX_AD_LENGTH]; + __u8 scan_rsp_data[HCI_MAX_EXT_AD_LENGTH]; __u8 scan_rsp_data_len; + __u8 per_adv_data[HCI_MAX_PER_AD_LENGTH]; + __u8 per_adv_data_len; struct list_head adv_instances; unsigned int adv_instance_cnt; @@ -542,6 +605,21 @@ struct hci_dev { struct delayed_work rpa_expired; bdaddr_t rpa; + struct delayed_work mesh_send_done; + + enum { + INTERLEAVE_SCAN_NONE, + INTERLEAVE_SCAN_NO_FILTER, 
+ INTERLEAVE_SCAN_ALLOWLIST + } interleave_scan_state; + + struct delayed_work interleave_scan; + + struct list_head monitored_devices; + bool advmon_pend_notify; + + struct hci_drv *hci_drv; + #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; #endif @@ -549,6 +627,12 @@ struct hci_dev { #if IS_ENABLED(CONFIG_BT_MSFTEXT) __u16 msft_opcode; void *msft_data; + bool msft_curve_validity; +#endif + +#if IS_ENABLED(CONFIG_BT_AOSPEXT) + bool aosp_capable; + bool aosp_quality_report; #endif int (*open)(struct hci_dev *hdev); @@ -562,8 +646,14 @@ struct hci_dev { int (*post_init)(struct hci_dev *hdev); int (*set_diag)(struct hci_dev *hdev, bool enable); int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr); - void (*cmd_timeout)(struct hci_dev *hdev); - bool (*prevent_wake)(struct hci_dev *hdev); + void (*reset)(struct hci_dev *hdev); + bool (*wakeup)(struct hci_dev *hdev); + int (*set_quality_report)(struct hci_dev *hdev, bool enable); + int (*get_data_path_id)(struct hci_dev *hdev, __u8 *data_path); + int (*get_codec_config_data)(struct hci_dev *hdev, __u8 type, + struct bt_codec *codec, __u8 *vnd_len, + __u8 **vnd_data); + u8 (*classify_pkt_type)(struct hci_dev *hdev, struct sk_buff *skb); }; #define HCI_PHY_HANDLE(handle) (handle & 0xff) @@ -572,6 +662,7 @@ enum conn_reasons { CONN_REASON_PAIR_DEVICE, CONN_REASON_L2CAP_CHAN, CONN_REASON_SCO_CONNECT, + CONN_REASON_ISO_CONNECT, }; struct hci_conn { @@ -587,8 +678,12 @@ struct hci_conn { __u8 init_addr_type; bdaddr_t resp_addr; __u8 resp_addr_type; + __u8 adv_instance; __u16 handle; + __u16 sync_handle; + __u8 sid; __u16 state; + __u16 mtu; __u8 mode; __u8 type; __u8 role; @@ -616,16 +711,26 @@ struct hci_conn { __u16 le_conn_interval; __u16 le_conn_latency; __u16 le_supv_timeout; - __u8 le_adv_data[HCI_MAX_AD_LENGTH]; + __u8 le_adv_data[HCI_MAX_EXT_AD_LENGTH]; __u8 le_adv_data_len; + __u8 le_per_adv_data[HCI_MAX_PER_AD_TOT_LEN]; + __u16 le_per_adv_data_len; + __u16 le_per_adv_data_offset; + __u8 le_adv_phy; + __u8 le_adv_sec_phy; __u8 le_tx_phy; __u8 le_rx_phy; __s8 rssi; __s8 tx_power; __s8 max_tx_power; + struct bt_iso_qos iso_qos; + __u8 num_bis; + __u8 bis[HCI_MAX_ISO_BIS]; + unsigned long flags; enum conn_reasons conn_reason; + __u8 abort_reason; __u32 clock; __u16 clock_accuracy; @@ -641,11 +746,12 @@ struct hci_conn { struct sk_buff_head data_q; struct list_head chan_list; + struct tx_queue tx_q; + struct delayed_work disc_work; struct delayed_work auto_accept_work; struct delayed_work idle_work; struct delayed_work le_conn_timeout; - struct work_struct le_scan_cleanup; struct device dev; struct dentry *debugfs; @@ -653,13 +759,24 @@ struct hci_conn { struct hci_dev *hdev; void *l2cap_data; void *sco_data; - struct amp_mgr *amp_mgr; + void *iso_data; + + struct list_head link_list; + struct hci_conn *parent; + struct hci_link *link; - struct hci_conn *link; + struct bt_codec codec; void (*connect_cfm_cb) (struct hci_conn *conn, u8 status); void (*security_cfm_cb) (struct hci_conn *conn, u8 status); void (*disconn_cfm_cb) (struct hci_conn *conn, u8 reason); + + void (*cleanup)(struct hci_conn *conn); +}; + +struct hci_link { + struct list_head list; + struct hci_conn *conn; }; struct hci_chan { @@ -694,7 +811,9 @@ struct hci_conn_params { struct hci_conn *conn; bool explicit_connect; - u32 current_flags; + /* Accessed without hdev->lock: */ + hci_conn_flags_t flags; + u8 privacy_mode; }; extern struct list_head hci_dev_list; @@ -716,8 +835,15 @@ extern struct mutex hci_cb_list_lock; hci_dev_clear_flag(hdev, 
HCI_LE_ADV); \ hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\ hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); \ + hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); \ } while (0) +#define hci_dev_le_state_simultaneous(hdev) \ + (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) && \ + (hdev->le_states[4] & 0x08) && /* Central */ \ + (hdev->le_states[4] & 0x40) && /* Peripheral */ \ + (hdev->le_states[3] & 0x10)) /* Simultaneous */ + /* ----- HCI interface to upper protocols ----- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr); int l2cap_disconn_ind(struct hci_conn *hcon); @@ -738,6 +864,21 @@ static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) } #endif +#if IS_ENABLED(CONFIG_BT_LE) +int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); +void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +#else +static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, + __u8 *flags) +{ + return 0; +} +static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, + u16 flags) +{ +} +#endif + /* ----- Inquiry cache ----- */ #define INQUIRY_CACHE_AGE_MAX (HZ*30) /* 30 seconds */ #define INQUIRY_ENTRY_AGE_MAX (HZ*60) /* 60 seconds */ @@ -760,8 +901,6 @@ static inline void hci_discovery_filter_clear(struct hci_dev *hdev) hdev->discovery.uuid_count = 0; kfree(hdev->discovery.uuids); hdev->discovery.uuids = NULL; - hdev->discovery.scan_start = 0; - hdev->discovery.scan_duration = 0; } bool hci_discovery_active(struct hci_dev *hdev); @@ -800,7 +939,6 @@ void hci_inquiry_cache_flush(struct hci_dev *hdev); /* ----- HCI Connections ----- */ enum { HCI_CONN_AUTH_PEND, - HCI_CONN_REAUTH_PEND, HCI_CONN_ENCRYPT_PEND, HCI_CONN_RSWITCH_PEND, HCI_CONN_MODE_CHANGE_PEND, @@ -818,10 +956,20 @@ enum { HCI_CONN_STK_ENCRYPT, HCI_CONN_AUTH_INITIATOR, HCI_CONN_DROP, + HCI_CONN_CANCEL, HCI_CONN_PARAM_REMOVAL_PEND, HCI_CONN_NEW_LINK_KEY, HCI_CONN_SCANNING, HCI_CONN_AUTH_FAILURE, + HCI_CONN_PER_ADV, + HCI_CONN_BIG_CREATED, + HCI_CONN_CREATE_CIS, + HCI_CONN_CREATE_BIG_SYNC, + HCI_CONN_BIG_SYNC, + HCI_CONN_BIG_SYNC_FAILED, + HCI_CONN_CREATE_PA_SYNC, + HCI_CONN_PA_SYNC, + HCI_CONN_PA_SYNC_FAILED, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) @@ -841,23 +989,24 @@ static inline bool hci_conn_sc_enabled(struct hci_conn *conn) static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) { struct hci_conn_hash *h = &hdev->conn_hash; - list_add_rcu(&c->list, &h->list); + list_add_tail_rcu(&c->list, &h->list); switch (c->type) { case ACL_LINK: h->acl_num++; break; - case AMP_LINK: - h->amp_num++; - break; case LE_LINK: h->le_num++; if (c->role == HCI_ROLE_SLAVE) - h->le_num_slave++; + h->le_num_peripheral++; break; case SCO_LINK: case ESCO_LINK: h->sco_num++; break; + case CIS_LINK: + case BIS_LINK: + h->iso_num++; + break; } } @@ -872,18 +1021,19 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) case ACL_LINK: h->acl_num--; break; - case AMP_LINK: - h->amp_num--; - break; case LE_LINK: h->le_num--; if (c->role == HCI_ROLE_SLAVE) - h->le_num_slave--; + h->le_num_peripheral--; break; case SCO_LINK: case ESCO_LINK: h->sco_num--; break; + case CIS_LINK: + case BIS_LINK: + h->iso_num--; + break; } } @@ -893,13 +1043,14 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) switch (type) { case ACL_LINK: return h->acl_num; - case AMP_LINK: - return h->amp_num; case LE_LINK: return h->le_num; case SCO_LINK: case ESCO_LINK: return h->sco_num; + case 
CIS_LINK: + case BIS_LINK: + return h->iso_num; default: return 0; } @@ -909,7 +1060,25 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev) { struct hci_conn_hash *c = &hdev->conn_hash; - return c->acl_num + c->amp_num + c->sco_num + c->le_num; + return c->acl_num + c->sco_num + c->le_num + c->iso_num; +} + +static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c == conn) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + + return false; } static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle) @@ -932,6 +1101,78 @@ static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle) return type; } +static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, + bdaddr_t *ba, __u8 bis) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (bacmp(&c->dst, ba) || c->type != BIS_LINK) + continue; + + if (c->iso_qos.bcast.bis == bis) { + rcu_read_unlock(); + return c; + } + } + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * +hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != BIS_LINK) + continue; + + if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags)) + continue; + + rcu_read_unlock(); + return c; + } + + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * +hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, + bdaddr_t *ba, + __u8 big, __u8 bis) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (bacmp(&c->dst, ba) || c->type != BIS_LINK || + !test_bit(HCI_CONN_PER_ADV, &c->flags)) + continue; + + if (c->iso_qos.bcast.big == big && + c->iso_qos.bcast.bis == bis) { + rcu_read_unlock(); + return c; + } + } + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_handle(struct hci_dev *hdev, __u16 handle) { @@ -995,6 +1236,187 @@ static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev, + bdaddr_t *ba, + __u8 ba_type, + __u8 cig, + __u8 id) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != CIS_LINK) + continue; + + /* Match CIG ID if set */ + if (cig != c->iso_qos.ucast.cig) + continue; + + /* Match CIS ID if set */ + if (id != c->iso_qos.ucast.cis) + continue; + + /* Match destination address if set */ + if (!ba || (ba_type == c->dst_type && !bacmp(&c->dst, ba))) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev, + __u8 handle) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != CIS_LINK) + continue; + + if (handle == c->iso_qos.ucast.cig) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn 
*hci_conn_hash_lookup_big(struct hci_dev *hdev, + __u8 handle) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != BIS_LINK) + continue; + + if (handle == c->iso_qos.bcast.big) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * +hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, + __u8 handle, __u8 num_bis) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != BIS_LINK) + continue; + + if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * +hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != BIS_LINK || bacmp(&c->dst, BDADDR_ANY) || + c->state != state) + continue; + + if (handle == c->iso_qos.bcast.big) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * +hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != BIS_LINK || + !test_bit(HCI_CONN_PA_SYNC, &c->flags)) + continue; + + if (c->iso_qos.bcast.big == big) { + rcu_read_unlock(); + return c; + } + } + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * +hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != BIS_LINK) + continue; + + /* Ignore the listen hcon, we are looking + * for the child hcon that was created as + * a result of the PA sync established event. 
+ */ + if (c->state == BT_LISTEN) + continue; + + if (c->sync_handle == sync_handle) { + rcu_read_unlock(); + return c; + } + } + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, __u8 type, __u16 state) { @@ -1015,6 +1437,47 @@ static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, return NULL; } +typedef void (*hci_conn_func_t)(struct hci_conn *conn, void *data); +static inline void hci_conn_hash_list_state(struct hci_dev *hdev, + hci_conn_func_t func, __u8 type, + __u16 state, void *data) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + if (!func) + return; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == type && c->state == state) + func(c, data); + } + + rcu_read_unlock(); +} + +static inline void hci_conn_hash_list_flag(struct hci_dev *hdev, + hci_conn_func_t func, __u8 type, + __u8 flag, void *data) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + if (!func) + return; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == type && test_bit(flag, &c->flags)) + func(c, data); + } + + rcu_read_unlock(); +} + static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev) { struct hci_conn_hash *h = &hdev->conn_hash; @@ -1035,15 +1498,40 @@ static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev) return NULL; } +/* Returns true if an le connection is in the scanning state */ +static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == LE_LINK && c->state == BT_CONNECT && + test_bit(HCI_CONN_SCANNING, &c->flags)) { + rcu_read_unlock(); + return true; + } + } + + rcu_read_unlock(); + + return false; +} + int hci_disconnect(struct hci_conn *conn, __u8 reason); bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); +bool hci_iso_setup_path(struct hci_conn *conn); +int hci_le_create_cis_pending(struct hci_dev *hdev); +int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, - u8 role); -int hci_conn_del(struct hci_conn *conn); + u8 role, u16 handle); +struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, + bdaddr_t *dst, u8 role); +void hci_conn_del(struct hci_conn *conn); void hci_conn_hash_flush(struct hci_dev *hdev); -void hci_conn_check_pending(struct hci_dev *hdev); struct hci_chan *hci_chan_create(struct hci_conn *conn); void hci_chan_del(struct hci_chan *chan); @@ -1055,13 +1543,31 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u16 conn_timeout, enum conn_reasons conn_reason); struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, - u8 dst_type, u8 sec_level, u16 conn_timeout, - u8 role, bdaddr_t *direct_rpa); + u8 dst_type, bool dst_resolved, u8 sec_level, + u16 conn_timeout, u8 role, u8 phy, u8 sec_phy); +void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status); struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type, - enum conn_reasons conn_reason); + enum conn_reasons conn_reason, u16 timeout); struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, - __u16 setting); + __u16 setting, struct bt_codec *codec, + u16 
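timeout);

For callers that need to walk every connection of one type, the hci_conn_func_t iterators declared above (hci_conn_hash_list_state() and hci_conn_hash_list_flag()) invoke the callback under rcu_read_lock(), so the callback must not sleep. A hedged usage sketch — count_conn_cb() and count_connected_le() are invented names, and LE_LINK/BT_CONNECTED come from the wider Bluetooth headers:

#include <net/bluetooth/hci_core.h>

/* Runs under rcu_read_lock(); must not sleep */
static void count_conn_cb(struct hci_conn *conn, void *data)
{
	unsigned int *count = data;

	(*count)++;
}

/* Count LE links currently in the connected state */
static unsigned int count_connected_le(struct hci_dev *hdev)
{
	unsigned int count = 0;

	hci_conn_hash_list_state(hdev, count_conn_cb, LE_LINK,
				 BT_CONNECTED, &count);
	return count;
}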
+struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, struct bt_iso_qos *qos); +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, + struct bt_iso_qos *qos, + __u8 base_len, __u8 *base); +struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, struct bt_iso_qos *qos); +struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, __u8 sid, + struct bt_iso_qos *qos, + __u8 data_len, __u8 *data); +struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); +int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, + struct bt_iso_qos *qos, __u16 sync_handle, + __u8 num_bis, __u8 bis[]); int hci_conn_check_link_mode(struct hci_conn *conn); int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level); int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, @@ -1070,7 +1576,20 @@ int hci_conn_switch_role(struct hci_conn *conn, __u8 role); void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active); -void hci_le_conn_failed(struct hci_conn *conn, u8 status); +void hci_conn_failed(struct hci_conn *conn, u8 status); +u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle); + +void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb); +void hci_conn_tx_dequeue(struct hci_conn *conn); +void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset, + const struct sockcm_cookie *sockc); + +static inline void hci_sockcm_init(struct sockcm_cookie *sockc, struct sock *sk) +{ + *sockc = (struct sockcm_cookie) { + .tsflags = READ_ONCE(sk->sk_tsflags), + }; +} /* * hci_conn_get() and hci_conn_put() are used to control the life-time of an @@ -1104,12 +1623,14 @@ static inline void hci_conn_put(struct hci_conn *conn) put_device(&conn->dev); } -static inline void hci_conn_hold(struct hci_conn *conn) +static inline struct hci_conn *hci_conn_hold(struct hci_conn *conn) { BT_DBG("hcon %p orig refcnt %d", conn, atomic_read(&conn->refcnt)); atomic_inc(&conn->refcnt); cancel_delayed_work(&conn->disc_work); + + return conn; } static inline void hci_conn_drop(struct hci_conn *conn) @@ -1132,10 +1653,6 @@ static inline void hci_conn_drop(struct hci_conn *conn) } break; - case AMP_LINK: - timeo = conn->disc_timeout; - break; - default: timeo = 0; break; @@ -1181,13 +1698,27 @@ static inline void hci_set_drvdata(struct hci_dev *hdev, void *data) dev_set_drvdata(&hdev->dev, data); } +static inline void *hci_get_priv(struct hci_dev *hdev) +{ + return (char *)hdev + sizeof(*hdev); +} + struct hci_dev *hci_dev_get(int index); struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, u8 src_type); -struct hci_dev *hci_alloc_dev(void); +struct hci_dev *hci_alloc_dev_priv(int sizeof_priv); + +static inline struct hci_dev *hci_alloc_dev(void) +{ + return hci_alloc_dev_priv(0); +} + void hci_free_dev(struct hci_dev *hdev); int hci_register_dev(struct hci_dev *hdev); void hci_unregister_dev(struct hci_dev *hdev); +void hci_release_dev(struct hci_dev *hdev); +int hci_register_suspend_notifier(struct hci_dev *hdev); +int hci_unregister_suspend_notifier(struct hci_dev *hdev); int hci_suspend_dev(struct hci_dev *hdev); int hci_resume_dev(struct hci_dev *hdev); int hci_reset_dev(struct hci_dev *hdev); @@ -1203,6 +1734,22 @@ static inline void hci_set_msft_opcode(struct hci_dev *hdev, __u16 opcode) #endif } +static inline void hci_set_aosp_capable(struct hci_dev *hdev) +{ +#if IS_ENABLED(CONFIG_BT_AOSPEXT) + hdev->aosp_capable = true; +#endif +}
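hci_alloc_dev_priv() and hci_get_priv() above let a driver co-allocate its private state directly behind struct hci_dev. A minimal sketch of the intended pattern, assuming a made-up driver struct and probe function:

#include <net/bluetooth/hci_core.h>

/* Hypothetical driver-private state, placed directly after struct hci_dev */
struct my_drv_data {
	int irq;
};

static int my_probe(void)
{
	struct hci_dev *hdev;
	struct my_drv_data *priv;
	int err;

	hdev = hci_alloc_dev_priv(sizeof(struct my_drv_data));
	if (!hdev)
		return -ENOMEM;

	priv = hci_get_priv(hdev);	/* points just past *hdev */
	priv->irq = -1;

	hci_set_drvdata(hdev, priv);

	err = hci_register_dev(hdev);
	if (err < 0)
		hci_free_dev(hdev);
	return err;
}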
+ +static inline void hci_devcd_setup(struct hci_dev *hdev) +{ +#ifdef CONFIG_DEV_COREDUMP + INIT_WORK(&hdev->dump.dump_rx, hci_devcd_rx); + INIT_DELAYED_WORK(&hdev->dump.dump_timeout, hci_devcd_timeout); + skb_queue_head_init(&hdev->dump.dump_q); +#endif +} + int hci_dev_open(__u16 dev); int hci_dev_close(__u16 dev); int hci_dev_do_close(struct hci_dev *hdev); @@ -1232,8 +1779,6 @@ int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type); int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type); -int hci_bdaddr_list_del_with_flags(struct list_head *list, bdaddr_t *bdaddr, - u8 type); void hci_bdaddr_list_clear(struct list_head *list); struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, @@ -1242,7 +1787,11 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); void hci_conn_params_clear_disabled(struct hci_dev *hdev); +void hci_conn_params_free(struct hci_conn_params *param); +void hci_pend_le_list_del_init(struct hci_conn_params *param); +void hci_pend_le_list_add(struct hci_conn_params *param, + struct list_head *list); struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type); @@ -1250,6 +1799,7 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, void hci_uuids_clear(struct hci_dev *hdev); void hci_link_keys_clear(struct hci_dev *hdev); +u8 *hci_conn_key_enc_size(struct hci_conn *conn); struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr); struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, @@ -1286,19 +1836,32 @@ int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, void hci_adv_instances_clear(struct hci_dev *hdev); struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance); +struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid); struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance); -int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, +struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, + u32 flags, u16 adv_data_len, u8 *adv_data, + u16 scan_rsp_len, u8 *scan_rsp_data, + u16 timeout, u16 duration, s8 tx_power, + u32 min_interval, u32 max_interval, + u8 mesh_handle); +struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, + u32 flags, u8 data_len, u8 *data, + u32 min_interval, u32 max_interval); +int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, u16 adv_data_len, u8 *adv_data, - u16 scan_rsp_len, u8 *scan_rsp_data, - u16 timeout, u16 duration); + u16 scan_rsp_len, u8 *scan_rsp_data); int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance); void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired); +u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance); +bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance); void hci_adv_monitors_clear(struct hci_dev *hdev); -void hci_free_adv_monitor(struct adv_monitor *monitor); +void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor); int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor); -int hci_remove_adv_monitor(struct hci_dev *hdev, 
u16 handle); +int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle); +int hci_remove_all_adv_monitor(struct hci_dev *hdev); bool hci_is_adv_monitoring(struct hci_dev *hdev); +int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev); void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb); @@ -1308,6 +1871,7 @@ void hci_conn_add_sysfs(struct hci_conn *conn); void hci_conn_del_sysfs(struct hci_conn *conn); #define SET_HCIDEV_DEV(hdev, pdev) ((hdev)->dev.parent = (pdev)) +#define GET_HCIDEV_DEV(hdev) ((hdev)->dev.parent) /* ----- LMP capabilities ----- */ #define lmp_encrypt_capable(dev) ((dev)->features[0][0] & LMP_ENCRYPT) @@ -1315,12 +1879,14 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_hold_capable(dev) ((dev)->features[0][0] & LMP_HOLD) #define lmp_sniff_capable(dev) ((dev)->features[0][0] & LMP_SNIFF) #define lmp_park_capable(dev) ((dev)->features[0][1] & LMP_PARK) +#define lmp_sco_capable(dev) ((dev)->features[0][1] & LMP_SCO) #define lmp_inq_rssi_capable(dev) ((dev)->features[0][3] & LMP_RSSI_INQ) #define lmp_esco_capable(dev) ((dev)->features[0][3] & LMP_ESCO) #define lmp_bredr_capable(dev) (!((dev)->features[0][4] & LMP_NO_BREDR)) #define lmp_le_capable(dev) ((dev)->features[0][4] & LMP_LE) #define lmp_sniffsubr_capable(dev) ((dev)->features[0][5] & LMP_SNIFF_SUBR) #define lmp_pause_enc_capable(dev) ((dev)->features[0][5] & LMP_PAUSE_ENC) +#define lmp_esco_2m_capable(dev) ((dev)->features[0][5] & LMP_EDR_ESCO_2M) #define lmp_ext_inq_capable(dev) ((dev)->features[0][6] & LMP_EXT_INQ) #define lmp_le_br_capable(dev) (!!((dev)->features[0][6] & LMP_SIMUL_LE_BR)) #define lmp_ssp_capable(dev) ((dev)->features[0][6] & LMP_SIMPLE_PAIR) @@ -1335,8 +1901,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_edr_5slot_capable(dev) ((dev)->features[0][5] & LMP_EDR_5SLOT) /* ----- Extended LMP capabilities ----- */ -#define lmp_csb_master_capable(dev) ((dev)->features[2][0] & LMP_CSB_MASTER) -#define lmp_csb_slave_capable(dev) ((dev)->features[2][0] & LMP_CSB_SLAVE) +#define lmp_cpb_central_capable(dev) ((dev)->features[2][0] & LMP_CPB_CENTRAL) +#define lmp_cpb_peripheral_capable(dev) ((dev)->features[2][0] & LMP_CPB_PERIPHERAL) #define lmp_sync_train_capable(dev) ((dev)->features[2][0] & LMP_SYNC_TRAIN) #define lmp_sync_scan_capable(dev) ((dev)->features[2][0] & LMP_SYNC_SCAN) #define lmp_sc_capable(dev) ((dev)->features[2][1] & LMP_SC) @@ -1352,28 +1918,88 @@ void hci_conn_del_sysfs(struct hci_conn *conn); !hci_dev_test_flag(dev, HCI_AUTO_OFF)) #define bredr_sc_enabled(dev) (lmp_sc_capable(dev) && \ hci_dev_test_flag(dev, HCI_SC_ENABLED)) +#define rpa_valid(dev) (bacmp(&dev->rpa, BDADDR_ANY) && \ + !hci_dev_test_flag(dev, HCI_RPA_EXPIRED)) +#define adv_rpa_valid(adv) (bacmp(&adv->random_addr, BDADDR_ANY) && \ + !adv->rpa_expired) #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) +#define le_2m_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_2M)) + #define scan_2m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_2M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) +#define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \ + !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \ + &(dev)->quirks)) + #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) -/* Use LL Privacy based address resolution if supported */ -#define use_ll_privacy(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) +#define 
ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) + +#define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ + ((dev)->commands[39] & 0x04)) + +#define read_key_size_capable(dev) \ + ((dev)->commands[20] & 0x10 && \ + !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &(dev)->quirks)) + +#define read_voice_setting_capable(dev) \ + ((dev)->commands[9] & 0x04 && \ + !test_bit(HCI_QUIRK_BROKEN_READ_VOICE_SETTING, &(dev)->quirks)) + +/* Use enhanced synchronous connection if the command is supported and its + * quirk has not been set. + */ +#define enhanced_sync_conn_capable(dev) \ + (((dev)->commands[29] & 0x08) && \ + !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks)) /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ - ((dev)->commands[37] & 0x40)) -/* Use ext create connection if command is supported */ -#define use_ext_conn(dev) ((dev)->commands[37] & 0x80) + ((dev)->commands[37] & 0x40) && \ + !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) +/* Use ext create connection if command is supported */ +#define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \ + !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &(dev)->quirks)) /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) +/* Maximum advertising length */ +#define max_adv_len(dev) \ + (ext_adv_capable(dev) ? HCI_MAX_EXT_AD_LENGTH : HCI_MAX_AD_LENGTH) + +/* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 1789: + * + * C24: Mandatory if the LE Controller supports Connection State and either + * LE Feature (LL Privacy) or LE Feature (Extended Advertising) is supported + */ +#define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \ + ext_adv_capable(dev)) && \ + !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, \ + &(dev)->quirks)) + +/* Periodic advertising support */ +#define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV)) + +/* CIS Master/Slave and BIS support */ +#define iso_capable(dev) (cis_capable(dev) || bis_capable(dev)) +#define cis_capable(dev) \ + (cis_central_capable(dev) || cis_peripheral_capable(dev)) +#define cis_central_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_CIS_CENTRAL) +#define cis_peripheral_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) +#define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) +#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) + +#define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ + (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 @@ -1388,6 +2014,10 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, case ESCO_LINK: return sco_connect_ind(hdev, bdaddr, flags); + case CIS_LINK: + case BIS_LINK: + return iso_connect_ind(hdev, bdaddr, flags); + default: BT_ERR("unknown link type %d", type); return -EINVAL; @@ -1532,43 +2162,6 @@ static inline void hci_role_switch_cfm(struct hci_conn *conn, __u8 status, mutex_unlock(&hci_cb_list_lock); } -static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type, - size_t *data_len) -{ - size_t parsed = 0; - - if (eir_len < 2) - return NULL; - - while (parsed < eir_len - 1) { - u8 field_len = eir[0]; - - if (field_len == 0) - break; - - parsed += field_len + 1; - - if (parsed > eir_len) - break; - - if (eir[1] != type) { - eir += 
field_len + 1; - continue; - } - - /* Zero length data */ - if (field_len == 1) - return NULL; - - if (data_len) - *data_len = field_len - 1; - - return &eir[2]; - } - - return NULL; -} - static inline bool hci_bdaddr_is_rpa(bdaddr_t *bdaddr, u8 addr_type) { if (addr_type != ADDR_LE_DEV_RANDOM) @@ -1606,18 +2199,46 @@ static inline int hci_check_conn_params(u16 min, u16 max, u16 latency, { u16 max_latency; - if (min > max || min < 6 || max > 3200) + if (min > max) { + BT_WARN("min %d > max %d", min, max); + return -EINVAL; + } + + if (min < 6) { + BT_WARN("min %d < 6", min); + return -EINVAL; + } + + if (max > 3200) { + BT_WARN("max %d > 3200", max); + return -EINVAL; + } + + if (to_multiplier < 10) { + BT_WARN("to_multiplier %d < 10", to_multiplier); return -EINVAL; + } - if (to_multiplier < 10 || to_multiplier > 3200) + if (to_multiplier > 3200) { + BT_WARN("to_multiplier %d > 3200", to_multiplier); return -EINVAL; + } - if (max >= to_multiplier * 8) + if (max >= to_multiplier * 8) { + BT_WARN("max %d >= to_multiplier %d * 8", max, to_multiplier); return -EINVAL; + } max_latency = (to_multiplier * 4 / max) - 1; - if (latency > 499 || latency > max_latency) + if (latency > 499) { + BT_WARN("latency %d > 499", latency); + return -EINVAL; + } + + if (latency > max_latency) { + BT_WARN("latency %d > max_latency %d", latency, max_latency); return -EINVAL; + } return 0; } @@ -1625,10 +2246,6 @@ static inline int hci_check_conn_params(u16 min, u16 max, u16 latency, int hci_register_cb(struct hci_cb *hcb); int hci_unregister_cb(struct hci_cb *hcb); -struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, - const void *param, u32 timeout); -struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, - const void *param, u8 event, u32 timeout); int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param); @@ -1636,11 +2253,10 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param); void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags); void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb); +void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb); void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode); - -struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, - const void *param, u32 timeout); +void *hci_recv_event_data(struct hci_dev *hdev, __u8 event); u32 hci_conn_get_phy(struct hci_conn *conn); @@ -1690,15 +2306,36 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c); /* These LE scan and inquiry parameters were chosen according to LE General * Discovery Procedure specification. 
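*/

Unit note, since the values below are easy to misread: LE scan interval and window values are expressed in units of 0.625 ms, which is what the INTERVAL_TO_MS() helper further down encodes as (x * 10) / 0x10. A one-line sketch of the same conversion (the helper name here is illustrative):

static inline unsigned int le_scan_units_to_ms(unsigned int units)
{
	/* 0.625 ms per unit: 0x0012 (18 units) -> 11 ms (11.25 truncated) */
	return (units * 10) / 16;
}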
-#define DISCOV_LE_SCAN_WIN 0x12 -#define DISCOV_LE_SCAN_INT 0x12 +#define DISCOV_LE_SCAN_WIN 0x0012 /* 11.25 msec */ +#define DISCOV_LE_SCAN_INT 0x0012 /* 11.25 msec */ +#define DISCOV_LE_SCAN_INT_FAST 0x0060 /* 60 msec */ +#define DISCOV_LE_SCAN_WIN_FAST 0x0030 /* 30 msec */ +#define DISCOV_LE_SCAN_INT_CONN 0x0060 /* 60 msec */ +#define DISCOV_LE_SCAN_WIN_CONN 0x0060 /* 60 msec */ +#define DISCOV_LE_SCAN_INT_SLOW1 0x0800 /* 1.28 sec */ +#define DISCOV_LE_SCAN_WIN_SLOW1 0x0012 /* 11.25 msec */ +#define DISCOV_LE_SCAN_INT_SLOW2 0x1000 /* 2.56 sec */ +#define DISCOV_LE_SCAN_WIN_SLOW2 0x0024 /* 22.5 msec */ +#define DISCOV_CODED_SCAN_INT_FAST 0x0120 /* 180 msec */ +#define DISCOV_CODED_SCAN_WIN_FAST 0x0090 /* 90 msec */ +#define DISCOV_CODED_SCAN_INT_SLOW1 0x1800 /* 3.84 sec */ +#define DISCOV_CODED_SCAN_WIN_SLOW1 0x0036 /* 33.75 msec */ +#define DISCOV_CODED_SCAN_INT_SLOW2 0x3000 /* 7.68 sec */ +#define DISCOV_CODED_SCAN_WIN_SLOW2 0x006c /* 67.5 msec */ #define DISCOV_LE_TIMEOUT 10240 /* msec */ #define DISCOV_INTERLEAVED_TIMEOUT 5120 /* msec */ #define DISCOV_INTERLEAVED_INQUIRY_LEN 0x04 #define DISCOV_BREDR_INQUIRY_LEN 0x08 #define DISCOV_LE_RESTART_DELAY msecs_to_jiffies(200) /* msec */ -#define DISCOV_LE_FAST_ADV_INT_MIN 100 /* msec */ -#define DISCOV_LE_FAST_ADV_INT_MAX 150 /* msec */ +#define DISCOV_LE_FAST_ADV_INT_MIN 0x00A0 /* 100 msec */ +#define DISCOV_LE_FAST_ADV_INT_MAX 0x00F0 /* 150 msec */ +#define DISCOV_LE_PER_ADV_INT_MIN 0x00A0 /* 200 msec */ +#define DISCOV_LE_PER_ADV_INT_MAX 0x00A0 /* 200 msec */ +#define DISCOV_LE_ADV_MESH_MIN 0x00A0 /* 100 msec */ +#define DISCOV_LE_ADV_MESH_MAX 0x00A0 /* 100 msec */ +#define INTERVAL_TO_MS(x) (((x) * 10) / 0x10) + +#define NAME_RESOLVE_DURATION msecs_to_jiffies(10240) /* 10.24 sec */ void mgmt_fill_version_info(void *ver); int mgmt_new_settings(struct hci_dev *hdev); @@ -1710,14 +2347,14 @@ void __mgmt_power_off(struct hci_dev *hdev); void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, bool persistent); void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, - u32 flags, u8 *name, u8 name_len); + u8 *name, u8 name_len); void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 reason, bool mgmt_connected); void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status); -void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, - u8 addr_type, u8 status); +void mgmt_connect_failed(struct hci_dev *hdev, struct hci_conn *conn, + u8 status); void mgmt_pin_code_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 secure); void mgmt_pin_code_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 status); @@ -1741,15 +2378,13 @@ int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 entered); void mgmt_auth_failed(struct hci_conn *conn, u8 status); void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status); -void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status); void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, u8 status); void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status); -void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status); -void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status); void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, - u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len); + u8 *eir, u16 eir_len, 
u8 *scan_rsp, u8 scan_rsp_len, + u64 instant); void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, s8 rssi, u8 *name, u8 name_len); void mgmt_discovering(struct hci_dev *hdev, u8 discovering); @@ -1766,15 +2401,16 @@ void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr, u16 max_interval, u16 latency, u16 timeout); void mgmt_smp_complete(struct hci_conn *conn, bool complete); bool mgmt_get_connectable(struct hci_dev *hdev); -void mgmt_set_connectable_complete(struct hci_dev *hdev, u8 status); -void mgmt_set_discoverable_complete(struct hci_dev *hdev, u8 status); u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev); void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); +void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, + bdaddr_t *bdaddr, u8 addr_type); +int hci_abort_conn(struct hci_conn *conn, u8 reason); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, @@ -1787,4 +2423,9 @@ void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, #define SCO_AIRMODE_CVSD 0x0000 #define SCO_AIRMODE_TRANSP 0x0003 +#define LOCAL_CODEC_ACL_MASK BIT(0) +#define LOCAL_CODEC_SCO_MASK BIT(1) + +#define TRANSPORT_TYPE_MAX 0x04 + #endif /* __HCI_CORE_H */ diff --git a/include/net/bluetooth/hci_drv.h b/include/net/bluetooth/hci_drv.h new file mode 100644 index 000000000000..2f01c44f05ec --- /dev/null +++ b/include/net/bluetooth/hci_drv.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Google Corporation + */ + +#ifndef __HCI_DRV_H +#define __HCI_DRV_H + +#include <linux/types.h> + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci.h> + +struct hci_drv_cmd_hdr { + __le16 opcode; + __le16 len; +} __packed; + +struct hci_drv_ev_hdr { + __le16 opcode; + __le16 len; +} __packed; + +#define HCI_DRV_EV_CMD_STATUS 0x0000 +struct hci_drv_ev_cmd_status { + __le16 opcode; + __u8 status; +} __packed; + +#define HCI_DRV_EV_CMD_COMPLETE 0x0001 +struct hci_drv_ev_cmd_complete { + __le16 opcode; + __u8 status; + __u8 data[]; +} __packed; + +#define HCI_DRV_STATUS_SUCCESS 0x00 +#define HCI_DRV_STATUS_UNSPECIFIED_ERROR 0x01 +#define HCI_DRV_STATUS_UNKNOWN_COMMAND 0x02 +#define HCI_DRV_STATUS_INVALID_PARAMETERS 0x03 + +#define HCI_DRV_MAX_DRIVER_NAME_LENGTH 32 + +/* Common commands that make sense on all drivers start from 0x0000 */ +#define HCI_DRV_OP_READ_INFO 0x0000 +#define HCI_DRV_READ_INFO_SIZE 0 +struct hci_drv_rp_read_info { + __u8 driver_name[HCI_DRV_MAX_DRIVER_NAME_LENGTH]; + __le16 num_supported_commands; + __le16 supported_commands[]; +} __packed; + +/* Driver specific OGF (Opcode Group Field) + * Commands in this group may have different meanings across different drivers. 
+ */ +#define HCI_DRV_OGF_DRIVER_SPECIFIC 0x01 + +int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status); +int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp, + size_t rp_len); +int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *cmd_skb); + +struct hci_drv_handler { + int (*func)(struct hci_dev *hdev, void *data, u16 data_len); + size_t data_len; +}; + +struct hci_drv { + size_t common_handler_count; + const struct hci_drv_handler *common_handlers; + + size_t specific_handler_count; + const struct hci_drv_handler *specific_handlers; +}; + +#endif /* __HCI_DRV_H */ diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 2d5fcda1bcd0..bbd752494ef9 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -51,12 +51,14 @@ struct hci_mon_hdr { #define HCI_MON_CTRL_EVENT 17 #define HCI_MON_ISO_TX_PKT 18 #define HCI_MON_ISO_RX_PKT 19 +#define HCI_MON_DRV_TX_PKT 20 +#define HCI_MON_DRV_RX_PKT 21 struct hci_mon_new_index { __u8 type; __u8 bus; bdaddr_t bdaddr; - char name[8]; + char name[8] __nonstring; } __packed; #define HCI_MON_NEW_INDEX_SIZE 16 diff --git a/include/net/bluetooth/hci_sock.h b/include/net/bluetooth/hci_sock.h index 9949870f7d78..13e8cd4414a1 100644 --- a/include/net/bluetooth/hci_sock.h +++ b/include/net/bluetooth/hci_sock.h @@ -144,7 +144,7 @@ struct hci_dev_req { struct hci_dev_list_req { __u16 dev_num; - struct hci_dev_req dev_req[]; /* hci_dev_req structures */ + struct hci_dev_req dev_req[] __counted_by(dev_num); }; struct hci_conn_list_req { diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h new file mode 100644 index 000000000000..5224f57f6af2 --- /dev/null +++ b/include/net/bluetooth/hci_sync.h @@ -0,0 +1,190 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BlueZ - Bluetooth protocol stack for Linux + * + * Copyright (C) 2021 Intel Corporation + */ + +#define UINT_PTR(_handle) ((void *)((uintptr_t)_handle)) +#define PTR_UINT(_ptr) ((uintptr_t)((void *)_ptr)) + +#define HCI_REQ_DONE 0 +#define HCI_REQ_PEND 1 +#define HCI_REQ_CANCELED 2 + +#define hci_req_sync_lock(hdev) mutex_lock(&hdev->req_lock) +#define hci_req_sync_unlock(hdev) mutex_unlock(&hdev->req_lock) + +struct hci_request { + struct hci_dev *hdev; + struct sk_buff_head cmd_q; + + /* If something goes wrong when building the HCI request, the error + * value is stored in this field. + */ + int err; +}; + +typedef int (*hci_cmd_sync_work_func_t)(struct hci_dev *hdev, void *data); +typedef void (*hci_cmd_sync_work_destroy_t)(struct hci_dev *hdev, void *data, + int err); + +struct hci_cmd_sync_work_entry { + struct list_head list; + hci_cmd_sync_work_func_t func; + void *data; + hci_cmd_sync_work_destroy_t destroy; +}; + +struct adv_info; + +struct sk_buff *hci_cmd_sync_alloc(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, struct sock *sk); + +/* Functions with the sync suffix shall not be called with hdev->lock held, as + * they wait for the command to complete, and in the meantime an event could be + * received which could attempt to acquire hdev->lock, causing a deadlock. + */
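A hedged sketch of the queueing pattern this comment implies: the blocking call is deferred through hci_cmd_sync_queue() so it runs from the cmd_sync worker rather than under hdev->lock. The function names are invented, and the constants (HCI_OP_WRITE_LOCAL_NAME, HCI_MAX_NAME_LENGTH, HCI_CMD_TIMEOUT) come from the wider HCI headers, not from this file:

#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/hci_sync.h>

/* Runs from the cmd_sync context, never under hdev->lock: safe to block */
static int set_name_sync(struct hci_dev *hdev, void *data)
{
	return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME,
				     HCI_MAX_NAME_LENGTH, data,
				     HCI_CMD_TIMEOUT);
}

/* May be called with hdev->lock held; only queues the work */
static int queue_set_name(struct hci_dev *hdev, void *name)
{
	return hci_cmd_sync_queue(hdev, set_name_sync, name, NULL);
}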
+struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout); +struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout); +struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u8 event, u32 timeout); +struct sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u8 event, u32 timeout, + struct sock *sk); +int __hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout); +int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u8 event, u32 timeout, + struct sock *sk); +int hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout); + +void hci_cmd_sync_init(struct hci_dev *hdev); +void hci_cmd_sync_clear(struct hci_dev *hdev); +void hci_cmd_sync_cancel(struct hci_dev *hdev, int err); +void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err); + +int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +int hci_cmd_sync_run_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +struct hci_cmd_sync_work_entry * +hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +void hci_cmd_sync_cancel_entry(struct hci_dev *hdev, + struct hci_cmd_sync_work_entry *entry); +bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy); +bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, + hci_cmd_sync_work_func_t func, void *data, + hci_cmd_sync_work_destroy_t destroy); + +int hci_update_eir_sync(struct hci_dev *hdev); +int hci_update_class_sync(struct hci_dev *hdev); +int hci_update_name_sync(struct hci_dev *hdev); +int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode); + +int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, + bool use_rpa, struct adv_info *adv_instance, + u8 *own_addr_type, bdaddr_t *rand_addr); + +int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy, + bool rpa, u8 *own_addr_type); + +int hci_update_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance); +int hci_update_adv_data_sync(struct hci_dev *hdev, u8 instance); +int hci_update_adv_data(struct hci_dev *hdev, u8 instance); +int hci_schedule_adv_instance_sync(struct hci_dev *hdev, u8 instance, + bool force); + +int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance); +int hci_start_ext_adv_sync(struct hci_dev *hdev, u8 instance); +int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance); +int hci_enable_advertising_sync(struct hci_dev *hdev); +int hci_enable_advertising(struct hci_dev *hdev); + +int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid, + u8 data_len, u8 *data, u32 flags, u16 
min_interval, + u16 max_interval, u16 sync_interval); + +int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance); + +int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk, + u8 instance, bool force); +int hci_disable_advertising_sync(struct hci_dev *hdev); +int hci_clear_adv_instance_sync(struct hci_dev *hdev, struct sock *sk, + u8 instance, bool force); +int hci_update_passive_scan_sync(struct hci_dev *hdev); +int hci_update_passive_scan(struct hci_dev *hdev); +int hci_read_rssi_sync(struct hci_dev *hdev, __le16 handle); +int hci_read_tx_power_sync(struct hci_dev *hdev, __le16 handle, u8 type); +int hci_write_sc_support_sync(struct hci_dev *hdev, u8 val); +int hci_read_clock_sync(struct hci_dev *hdev, struct hci_cp_read_clock *cp); + +int hci_write_fast_connectable_sync(struct hci_dev *hdev, bool enable); +int hci_update_scan_sync(struct hci_dev *hdev); +int hci_update_scan(struct hci_dev *hdev); + +int hci_write_le_host_supported_sync(struct hci_dev *hdev, u8 le, u8 simul); +int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance, + struct sock *sk); +struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, bool ext, + struct sock *sk); + +int hci_reset_sync(struct hci_dev *hdev); +int hci_dev_open_sync(struct hci_dev *hdev); +int hci_dev_close_sync(struct hci_dev *hdev); + +int hci_powered_update_sync(struct hci_dev *hdev); +int hci_set_powered_sync(struct hci_dev *hdev, u8 val); + +int hci_update_discoverable_sync(struct hci_dev *hdev); +int hci_update_discoverable(struct hci_dev *hdev); + +int hci_update_connectable_sync(struct hci_dev *hdev); + +int hci_inquiry_sync(struct hci_dev *hdev, u8 length, u8 num_rsp); + +int hci_start_discovery_sync(struct hci_dev *hdev); +int hci_stop_discovery_sync(struct hci_dev *hdev); + +int hci_suspend_sync(struct hci_dev *hdev); +int hci_resume_sync(struct hci_dev *hdev); + +struct hci_conn; +struct hci_conn_params; + +int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason); + +int hci_le_create_cis_sync(struct hci_dev *hdev); + +int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle); + +int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason); + +int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle); + +int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle); + +int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn); + +int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn); + +int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, + struct hci_conn_params *params); + +int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn); diff --git a/include/net/bluetooth/iso.h b/include/net/bluetooth/iso.h new file mode 100644 index 000000000000..3f4fe8b78e1b --- /dev/null +++ b/include/net/bluetooth/iso.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BlueZ - Bluetooth protocol stack for Linux + * + * Copyright (C) 2022 Intel Corporation + */ + +#ifndef __ISO_H +#define __ISO_H + +/* ISO defaults */ +#define ISO_DEFAULT_MTU 251 +#define ISO_MAX_NUM_BIS 0x1f + +/* ISO socket broadcast address */ +struct sockaddr_iso_bc { + bdaddr_t bc_bdaddr; + __u8 bc_bdaddr_type; + __u8 bc_sid; + __u8 bc_num_bis; + __u8 bc_bis[ISO_MAX_NUM_BIS]; +}; + +/* ISO socket address */ +struct sockaddr_iso { + sa_family_t iso_family; + bdaddr_t 
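iso_bdaddr; + __u8 iso_bdaddr_type; + struct sockaddr_iso_bc iso_bc[]; +}; + +#endif /* __ISO_H */

A hedged sketch of how a broadcast sink might fill the structures above before bind(); the helper name and the chosen values are illustrative, and the caller is assumed to have allocated room for struct sockaddr_iso plus one struct sockaddr_iso_bc:

#include <string.h>

static void fill_iso_bc_addr(struct sockaddr_iso *sa, const bdaddr_t *src,
			     const bdaddr_t *bcast)
{
	memset(sa, 0, sizeof(*sa) + sizeof(struct sockaddr_iso_bc));

	sa->iso_family = AF_BLUETOOTH;
	bacpy(&sa->iso_bdaddr, src);		/* local controller address */
	sa->iso_bdaddr_type = BDADDR_LE_PUBLIC;

	bacpy(&sa->iso_bc[0].bc_bdaddr, bcast);	/* broadcaster to follow */
	sa->iso_bc[0].bc_bdaddr_type = BDADDR_LE_PUBLIC;
	sa->iso_bc[0].bc_num_bis = 1;
	sa->iso_bc[0].bc_bis[0] = 0x01;		/* first BIS index */
}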
diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 1d1232917de7..4bb0eaedda18 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -27,7 +27,7 @@ #ifndef __L2CAP_H #define __L2CAP_H -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/atomic.h> /* L2CAP defaults */ @@ -38,8 +38,8 @@ #define L2CAP_DEFAULT_TX_WINDOW 63 #define L2CAP_DEFAULT_EXT_WINDOW 0x3FFF #define L2CAP_DEFAULT_MAX_TX 3 -#define L2CAP_DEFAULT_RETRANS_TO 2000 /* 2 seconds */ -#define L2CAP_DEFAULT_MONITOR_TO 12000 /* 12 seconds */ +#define L2CAP_DEFAULT_RETRANS_TO 2 /* seconds */ +#define L2CAP_DEFAULT_MONITOR_TO 12 /* seconds */ #define L2CAP_DEFAULT_MAX_PDU_SIZE 1492 /* Sized for AMP packet */ #define L2CAP_DEFAULT_ACK_TO 200 #define L2CAP_DEFAULT_MAX_SDU_SIZE 0xFFFF @@ -59,8 +59,6 @@ #define L2CAP_WAIT_ACK_POLL_PERIOD msecs_to_jiffies(200) #define L2CAP_WAIT_ACK_TIMEOUT msecs_to_jiffies(10000) -#define L2CAP_A2MP_DEFAULT_MTU 670 - /* L2CAP socket address */ struct sockaddr_l2 { sa_family_t l2_family; @@ -109,12 +107,6 @@ struct l2cap_conninfo { #define L2CAP_ECHO_RSP 0x09 #define L2CAP_INFO_REQ 0x0a #define L2CAP_INFO_RSP 0x0b -#define L2CAP_CREATE_CHAN_REQ 0x0c -#define L2CAP_CREATE_CHAN_RSP 0x0d -#define L2CAP_MOVE_CHAN_REQ 0x0e -#define L2CAP_MOVE_CHAN_RSP 0x0f -#define L2CAP_MOVE_CHAN_CFM 0x10 -#define L2CAP_MOVE_CHAN_CFM_RSP 0x11 #define L2CAP_CONN_PARAM_UPDATE_REQ 0x12 #define L2CAP_CONN_PARAM_UPDATE_RSP 0x13 #define L2CAP_LE_CONN_REQ 0x14 @@ -144,7 +136,6 @@ struct l2cap_conninfo { /* L2CAP fixed channels */ #define L2CAP_FC_SIG_BREDR 0x02 #define L2CAP_FC_CONNLESS 0x04 -#define L2CAP_FC_A2MP 0x08 #define L2CAP_FC_ATT 0x10 #define L2CAP_FC_SIG_LE 0x20 #define L2CAP_FC_SMP_LE 0x40 @@ -207,6 +198,7 @@ struct l2cap_hdr { __le16 len; __le16 cid; } __packed; +#define L2CAP_LEN_SIZE 2 #define L2CAP_HDR_SIZE 4 #define L2CAP_ENH_HDR_SIZE 6 #define L2CAP_EXT_HDR_SIZE 8 @@ -266,7 +258,6 @@ struct l2cap_conn_rsp { /* channel identifier */ #define L2CAP_CID_SIGNALING 0x0001 #define L2CAP_CID_CONN_LESS 0x0002 -#define L2CAP_CID_A2MP 0x0003 #define L2CAP_CID_ATT 0x0004 #define L2CAP_CID_LE_SIGNALING 0x0005 #define L2CAP_CID_SMP 0x0006 @@ -281,7 +272,6 @@ struct l2cap_conn_rsp { #define L2CAP_CR_BAD_PSM 0x0002 #define L2CAP_CR_SEC_BLOCK 0x0003 #define L2CAP_CR_NO_MEM 0x0004 -#define L2CAP_CR_BAD_AMP 0x0005 #define L2CAP_CR_INVALID_SCID 0x0006 #define L2CAP_CR_SCID_IN_USE 0x0007 @@ -403,29 +393,6 @@ struct l2cap_info_rsp { __u8 data[]; } __packed; -struct l2cap_create_chan_req { - __le16 psm; - __le16 scid; - __u8 amp_id; -} __packed; - -struct l2cap_create_chan_rsp { - __le16 dcid; - __le16 scid; - __le16 result; - __le16 status; -} __packed; - -struct l2cap_move_chan_req { - __le16 icid; - __u8 dest_amp_id; -} __packed; - -struct l2cap_move_chan_rsp { - __le16 icid; - __le16 result; -} __packed; - #define L2CAP_MR_SUCCESS 0x0000 #define L2CAP_MR_PEND 0x0001 #define L2CAP_MR_BAD_ID 0x0002 @@ -493,20 +460,27 @@ struct l2cap_le_credits { #define L2CAP_ECRED_MIN_MTU 64 #define L2CAP_ECRED_MIN_MPS 64 +#define L2CAP_ECRED_MAX_CID 5 struct l2cap_ecred_conn_req { - __le16 psm; - __le16 mtu; - __le16 mps; - __le16 credits; + /* New members must be added within the struct_group() macro below. 
*/ + __struct_group(l2cap_ecred_conn_req_hdr, hdr, __packed, + __le16 psm; + __le16 mtu; + __le16 mps; + __le16 credits; + ); __le16 scid[]; } __packed; struct l2cap_ecred_conn_rsp { - __le16 mtu; - __le16 mps; - __le16 credits; - __le16 result; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(l2cap_ecred_conn_rsp_hdr, hdr, + __le16 mtu; + __le16 mps; + __le16 credits; + __le16 result; + ); __le16 dcid[]; }; @@ -537,8 +511,6 @@ struct l2cap_seq_list { struct l2cap_chan { struct l2cap_conn *conn; - struct hci_conn *hs_hcon; - struct hci_chan *hs_hchan; struct kref kref; atomic_t nesting; @@ -582,6 +554,9 @@ struct l2cap_chan { __u16 tx_credits; __u16 rx_credits; + /* estimated available receive buffer space or -1 if unknown */ + ssize_t rx_avail; + __u8 tx_state; __u8 rx_state; @@ -589,12 +564,6 @@ struct l2cap_chan { unsigned long conn_state; unsigned long flags; - __u8 remote_amp_id; - __u8 local_amp_id; - __u8 move_id; - __u8 move_state; - __u8 move_role; - __u16 next_tx_seq; __u16 expected_ack_seq; __u16 expected_tx_seq; @@ -692,14 +661,14 @@ struct l2cap_conn { struct sk_buff_head pending_rx; struct work_struct pending_rx_work; - struct work_struct id_addr_update_work; + struct delayed_work id_addr_timer; __u8 disc_reason; struct l2cap_chan *smp; struct list_head chan_l; - struct mutex chan_lock; + struct mutex lock; struct kref ref; struct list_head users; }; @@ -722,10 +691,15 @@ struct l2cap_user { /* ----- L2CAP socket info ----- */ #define l2cap_pi(sk) ((struct l2cap_pinfo *) sk) +struct l2cap_rx_busy { + struct list_head list; + struct sk_buff *skb; +}; + struct l2cap_pinfo { struct bt_sock bt; struct l2cap_chan *chan; - struct sk_buff *rx_busy_skb; + struct list_head rx_busy; }; enum { @@ -845,6 +819,7 @@ enum { }; void l2cap_chan_hold(struct l2cap_chan *c); +struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c); void l2cap_chan_put(struct l2cap_chan *c); static inline void l2cap_chan_lock(struct l2cap_chan *chan) @@ -978,10 +953,12 @@ int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid); struct l2cap_chan *l2cap_chan_create(void); void l2cap_chan_close(struct l2cap_chan *chan, int reason); int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, - bdaddr_t *dst, u8 dst_type); + bdaddr_t *dst, u8 dst_type, u16 timeout); int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu); -int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len); +int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, + const struct sockcm_cookie *sockc); void l2cap_chan_busy(struct l2cap_chan *chan, int busy); +void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail); int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator); void l2cap_chan_set_defaults(struct l2cap_chan *chan); int l2cap_ertm_init(struct l2cap_chan *chan); @@ -992,12 +969,9 @@ void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, void *data); void l2cap_chan_del(struct l2cap_chan *chan, int err); void l2cap_send_conn_req(struct l2cap_chan *chan); -void l2cap_move_start(struct l2cap_chan *chan); -void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan, - u8 status); -void __l2cap_physical_cfm(struct l2cap_chan *chan, int result); struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); int l2cap_register_user(struct l2cap_conn *conn, struct 
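l2cap_user *user);

The *_hold_unless_zero() helpers declared above follow the usual kref_get_unless_zero() pattern: they take a reference only if the refcount has not already dropped to zero, which is what makes lookups that race with teardown safe. A minimal usage sketch with an invented caller:

#include <net/bluetooth/l2cap.h>

/* Returns false if the connection was already on its way to being freed */
static bool conn_still_alive(struct l2cap_conn *conn)
{
	conn = l2cap_conn_hold_unless_zero(conn);
	if (!conn)
		return false;	/* lost the race with teardown */

	/* ... safely use conn here ... */

	l2cap_conn_put(conn);	/* drop the reference when done */
	return true;
}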
diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 6b55155e05e9..3575cd16049a 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -91,24 +91,29 @@ struct mgmt_rp_read_index_list { #define MGMT_MAX_NAME_LENGTH (HCI_MAX_NAME_LENGTH + 1) #define MGMT_MAX_SHORT_NAME_LENGTH (HCI_MAX_SHORT_NAME_LENGTH + 1) -#define MGMT_SETTING_POWERED 0x00000001 -#define MGMT_SETTING_CONNECTABLE 0x00000002 -#define MGMT_SETTING_FAST_CONNECTABLE 0x00000004 -#define MGMT_SETTING_DISCOVERABLE 0x00000008 -#define MGMT_SETTING_BONDABLE 0x00000010 -#define MGMT_SETTING_LINK_SECURITY 0x00000020 -#define MGMT_SETTING_SSP 0x00000040 -#define MGMT_SETTING_BREDR 0x00000080 -#define MGMT_SETTING_HS 0x00000100 -#define MGMT_SETTING_LE 0x00000200 -#define MGMT_SETTING_ADVERTISING 0x00000400 -#define MGMT_SETTING_SECURE_CONN 0x00000800 -#define MGMT_SETTING_DEBUG_KEYS 0x00001000 -#define MGMT_SETTING_PRIVACY 0x00002000 -#define MGMT_SETTING_CONFIGURATION 0x00004000 -#define MGMT_SETTING_STATIC_ADDRESS 0x00008000 -#define MGMT_SETTING_PHY_CONFIGURATION 0x00010000 -#define MGMT_SETTING_WIDEBAND_SPEECH 0x00020000 +#define MGMT_SETTING_POWERED BIT(0) +#define MGMT_SETTING_CONNECTABLE BIT(1) +#define MGMT_SETTING_FAST_CONNECTABLE BIT(2) +#define MGMT_SETTING_DISCOVERABLE BIT(3) +#define MGMT_SETTING_BONDABLE BIT(4) +#define MGMT_SETTING_LINK_SECURITY BIT(5) +#define MGMT_SETTING_SSP BIT(6) +#define MGMT_SETTING_BREDR BIT(7) +#define MGMT_SETTING_HS BIT(8) +#define MGMT_SETTING_LE BIT(9) +#define MGMT_SETTING_ADVERTISING BIT(10) +#define MGMT_SETTING_SECURE_CONN BIT(11) +#define MGMT_SETTING_DEBUG_KEYS BIT(12) +#define MGMT_SETTING_PRIVACY BIT(13) +#define MGMT_SETTING_CONFIGURATION BIT(14) +#define MGMT_SETTING_STATIC_ADDRESS BIT(15) +#define MGMT_SETTING_PHY_CONFIGURATION BIT(16) +#define MGMT_SETTING_WIDEBAND_SPEECH BIT(17) +#define MGMT_SETTING_CIS_CENTRAL BIT(18) +#define MGMT_SETTING_CIS_PERIPHERAL BIT(19) +#define MGMT_SETTING_ISO_BROADCASTER BIT(20) +#define MGMT_SETTING_ISO_SYNC_RECEIVER BIT(21) +#define MGMT_SETTING_LL_PRIVACY BIT(22) #define MGMT_OP_READ_INFO 0x0004 #define MGMT_READ_INFO_SIZE 0 @@ -202,7 +207,7 @@ struct mgmt_cp_load_link_keys { struct mgmt_ltk_info { struct mgmt_addr_info addr; __u8 type; - __u8 master; + __u8 initiator; __u8 enc_size; __le16 ediv; __le64 rand; @@ -574,6 +579,11 @@ struct mgmt_rp_add_advertising { #define MGMT_ADV_FLAG_SEC_CODED BIT(9) #define MGMT_ADV_FLAG_CAN_SET_TX_POWER BIT(10) #define MGMT_ADV_FLAG_HW_OFFLOAD BIT(11) +#define MGMT_ADV_PARAM_DURATION BIT(12) +#define MGMT_ADV_PARAM_TIMEOUT BIT(13) +#define MGMT_ADV_PARAM_INTERVALS BIT(14) +#define MGMT_ADV_PARAM_TX_POWER BIT(15) +#define MGMT_ADV_PARAM_SCAN_RSP BIT(16) #define MGMT_ADV_FLAG_SEC_MASK (MGMT_ADV_FLAG_SEC_1M | MGMT_ADV_FLAG_SEC_2M | \ MGMT_ADV_FLAG_SEC_CODED) @@ -621,28 +631,28 @@ struct mgmt_cp_set_appearance { #define MGMT_SET_APPEARANCE_SIZE 2 #define MGMT_OP_GET_PHY_CONFIGURATION 0x0044 -struct mgmt_rp_get_phy_confguration { +struct mgmt_rp_get_phy_configuration { __le32 supported_phys; __le32 configurable_phys; __le32 selected_phys; } __packed; #define MGMT_GET_PHY_CONFIGURATION_SIZE 0 -#define MGMT_PHY_BR_1M_1SLOT 0x00000001 -#define MGMT_PHY_BR_1M_3SLOT 0x00000002 -#define MGMT_PHY_BR_1M_5SLOT 0x00000004 -#define MGMT_PHY_EDR_2M_1SLOT 0x00000008 -#define MGMT_PHY_EDR_2M_3SLOT 0x00000010 -#define MGMT_PHY_EDR_2M_5SLOT 0x00000020 -#define MGMT_PHY_EDR_3M_1SLOT 0x00000040 -#define MGMT_PHY_EDR_3M_3SLOT 0x00000080 -#define 
MGMT_PHY_EDR_3M_5SLOT 0x00000100 -#define MGMT_PHY_LE_1M_TX 0x00000200 -#define MGMT_PHY_LE_1M_RX 0x00000400 -#define MGMT_PHY_LE_2M_TX 0x00000800 -#define MGMT_PHY_LE_2M_RX 0x00001000 -#define MGMT_PHY_LE_CODED_TX 0x00002000 -#define MGMT_PHY_LE_CODED_RX 0x00004000 +#define MGMT_PHY_BR_1M_1SLOT BIT(0) +#define MGMT_PHY_BR_1M_3SLOT BIT(1) +#define MGMT_PHY_BR_1M_5SLOT BIT(2) +#define MGMT_PHY_EDR_2M_1SLOT BIT(3) +#define MGMT_PHY_EDR_2M_3SLOT BIT(4) +#define MGMT_PHY_EDR_2M_5SLOT BIT(5) +#define MGMT_PHY_EDR_3M_1SLOT BIT(6) +#define MGMT_PHY_EDR_3M_3SLOT BIT(7) +#define MGMT_PHY_EDR_3M_5SLOT BIT(8) +#define MGMT_PHY_LE_1M_TX BIT(9) +#define MGMT_PHY_LE_1M_RX BIT(10) +#define MGMT_PHY_LE_2M_TX BIT(11) +#define MGMT_PHY_LE_2M_RX BIT(12) +#define MGMT_PHY_LE_CODED_TX BIT(13) +#define MGMT_PHY_LE_CODED_RX BIT(14) #define MGMT_PHY_BREDR_MASK (MGMT_PHY_BR_1M_1SLOT | MGMT_PHY_BR_1M_3SLOT | \ MGMT_PHY_BR_1M_5SLOT | MGMT_PHY_EDR_2M_1SLOT | \ @@ -658,7 +668,7 @@ struct mgmt_rp_get_phy_confguration { MGMT_PHY_LE_CODED_RX) #define MGMT_OP_SET_PHY_CONFIGURATION 0x0045 -struct mgmt_cp_set_phy_confguration { +struct mgmt_cp_set_phy_configuration { __le32 selected_phys; } __packed; #define MGMT_SET_PHY_CONFIGURATION_SIZE 4 @@ -682,11 +692,16 @@ struct mgmt_cp_set_blocked_keys { #define MGMT_OP_SET_WIDEBAND_SPEECH 0x0047 -#define MGMT_OP_READ_SECURITY_INFO 0x0048 -#define MGMT_READ_SECURITY_INFO_SIZE 0 -struct mgmt_rp_read_security_info { - __le16 sec_len; - __u8 sec[]; +#define MGMT_CAP_SEC_FLAGS 0x01 +#define MGMT_CAP_MAX_ENC_KEY_SIZE 0x02 +#define MGMT_CAP_SMP_MAX_ENC_KEY_SIZE 0x03 +#define MGMT_CAP_LE_TX_PWR 0x04 + +#define MGMT_OP_READ_CONTROLLER_CAP 0x0048 +#define MGMT_READ_CONTROLLER_CAP_SIZE 0 +struct mgmt_rp_read_controller_cap { + __le16 cap_len; + __u8 cap[]; } __packed; #define MGMT_OP_READ_EXP_FEATURES_INFO 0x0049 @@ -782,6 +797,98 @@ struct mgmt_rp_remove_adv_monitor { __le16 monitor_handle; } __packed; +#define MGMT_OP_ADD_EXT_ADV_PARAMS 0x0054 +struct mgmt_cp_add_ext_adv_params { + __u8 instance; + __le32 flags; + __le16 duration; + __le16 timeout; + __le32 min_interval; + __le32 max_interval; + __s8 tx_power; +} __packed; +#define MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE 18 +struct mgmt_rp_add_ext_adv_params { + __u8 instance; + __s8 tx_power; + __u8 max_adv_data_len; + __u8 max_scan_rsp_len; +} __packed; + +#define MGMT_OP_ADD_EXT_ADV_DATA 0x0055 +struct mgmt_cp_add_ext_adv_data { + __u8 instance; + __u8 adv_data_len; + __u8 scan_rsp_len; + __u8 data[]; +} __packed; +#define MGMT_ADD_EXT_ADV_DATA_SIZE 3 +struct mgmt_rp_add_ext_adv_data { + __u8 instance; +} __packed; + +struct mgmt_adv_rssi_thresholds { + __s8 high_threshold; + __le16 high_threshold_timeout; + __s8 low_threshold; + __le16 low_threshold_timeout; + __u8 sampling_period; +} __packed; + +#define MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI 0x0056 +struct mgmt_cp_add_adv_patterns_monitor_rssi { + struct mgmt_adv_rssi_thresholds rssi; + __u8 pattern_count; + struct mgmt_adv_pattern patterns[]; +} __packed; +#define MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE 8 +#define MGMT_OP_SET_MESH_RECEIVER 0x0057 +struct mgmt_cp_set_mesh { + __u8 enable; + __le16 window; + __le16 period; + __u8 num_ad_types; + __u8 ad_types[]; +} __packed; +#define MGMT_SET_MESH_RECEIVER_SIZE 6 + +#define MGMT_OP_MESH_READ_FEATURES 0x0058 +#define MGMT_MESH_READ_FEATURES_SIZE 0 +#define MESH_HANDLES_MAX 3 +struct mgmt_rp_mesh_read_features { + __le16 index; + __u8 max_handles; + __u8 used_handles; + __u8 handles[MESH_HANDLES_MAX]; +} __packed; + +#define MGMT_OP_MESH_SEND 
0x0059 +struct mgmt_cp_mesh_send { + struct mgmt_addr_info addr; + __le64 instant; + __le16 delay; + __u8 cnt; + __u8 adv_data_len; + __u8 adv_data[]; +} __packed; +#define MGMT_MESH_SEND_SIZE 19 + +#define MGMT_OP_MESH_SEND_CANCEL 0x005A +struct mgmt_cp_mesh_send_cancel { + __u8 handle; +} __packed; +#define MGMT_MESH_SEND_CANCEL_SIZE 1 + +#define MGMT_OP_HCI_CMD_SYNC 0x005B +struct mgmt_cp_hci_cmd_sync { + __le16 opcode; + __u8 event; + __u8 timeout; + __le16 params_len; + __u8 params[]; +} __packed; +#define MGMT_HCI_CMD_SYNC_SIZE 6 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; @@ -880,9 +987,12 @@ struct mgmt_ev_auth_failed { __u8 status; } __packed; -#define MGMT_DEV_FOUND_CONFIRM_NAME 0x01 -#define MGMT_DEV_FOUND_LEGACY_PAIRING 0x02 -#define MGMT_DEV_FOUND_NOT_CONNECTABLE 0x04 +#define MGMT_DEV_FOUND_CONFIRM_NAME BIT(0) +#define MGMT_DEV_FOUND_LEGACY_PAIRING BIT(1) +#define MGMT_DEV_FOUND_NOT_CONNECTABLE BIT(2) +#define MGMT_DEV_FOUND_INITIATED_CONN BIT(3) +#define MGMT_DEV_FOUND_NAME_REQUEST_FAILED BIT(4) +#define MGMT_DEV_FOUND_SCAN_RSP BIT(5) #define MGMT_EV_DEVICE_FOUND 0x0012 struct mgmt_ev_device_found { @@ -1046,3 +1156,35 @@ struct mgmt_ev_controller_resume { #define MGMT_WAKE_REASON_NON_BT_WAKE 0x0 #define MGMT_WAKE_REASON_UNEXPECTED 0x1 #define MGMT_WAKE_REASON_REMOTE_WAKE 0x2 + +#define MGMT_EV_ADV_MONITOR_DEVICE_FOUND 0x002f +struct mgmt_ev_adv_monitor_device_found { + __le16 monitor_handle; + struct mgmt_addr_info addr; + __s8 rssi; + __le32 flags; + __le16 eir_len; + __u8 eir[]; +} __packed; + +#define MGMT_EV_ADV_MONITOR_DEVICE_LOST 0x0030 +struct mgmt_ev_adv_monitor_device_lost { + __le16 monitor_handle; + struct mgmt_addr_info addr; +} __packed; + +#define MGMT_EV_MESH_DEVICE_FOUND 0x0031 +struct mgmt_ev_mesh_device_found { + struct mgmt_addr_info addr; + __s8 rssi; + __le64 instant; + __le32 flags; + __le16 eir_len; + __u8 eir[]; +} __packed; + + +#define MGMT_EV_MESH_PACKET_CMPLT 0x0032 +struct mgmt_ev_mesh_pkt_cmplt { + __u8 handle; +} __packed; diff --git a/include/net/bluetooth/rfcomm.h b/include/net/bluetooth/rfcomm.h index 99d26879b02a..c05882476900 100644 --- a/include/net/bluetooth/rfcomm.h +++ b/include/net/bluetooth/rfcomm.h @@ -355,7 +355,7 @@ struct rfcomm_dev_info { struct rfcomm_dev_list_req { u16 dev_num; - struct rfcomm_dev_info dev_info[]; + struct rfcomm_dev_info dev_info[] __counted_by(dev_num); }; int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); diff --git a/include/net/bluetooth/sco.h b/include/net/bluetooth/sco.h index 1aa2e14b6c94..f40ddb4264fc 100644 --- a/include/net/bluetooth/sco.h +++ b/include/net/bluetooth/sco.h @@ -46,6 +46,4 @@ struct sco_conninfo { __u8 dev_class[3]; }; -#define SCO_CMSG_PKT_STATUS 0x01 - #endif /* __SCO_H */ diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index c8696a230b7d..2053cd8e788a 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -15,8 +15,6 @@ #define PKT_TYPE_LACPDU cpu_to_be16(ETH_P_SLOW) #define AD_TIMER_INTERVAL 100 /*msec*/ -#define MULTICAST_LACPDU_ADDR {0x01, 0x80, 0xC2, 0x00, 0x00, 0x02} - #define AD_LACP_SLOW 0 #define AD_LACP_FAST 1 @@ -56,6 +54,8 @@ typedef enum { AD_MUX_DETACHED, /* mux machine */ AD_MUX_WAITING, /* mux machine */ AD_MUX_ATTACHED, /* mux machine */ + AD_MUX_COLLECTING, /* mux machine */ + AD_MUX_DISTRIBUTING, /* mux machine */ AD_MUX_COLLECTING_DISTRIBUTING /* mux machine */ } mux_states_t; @@ -231,7 +231,10 @@ typedef struct port { mux_states_t sm_mux_state; /* state machine mux state */ 
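/*
 * Aside on the __counted_by(dev_num) annotation added to struct
 * rfcomm_dev_list_req above: it ties the flexible array's bounds to the
 * dev_num counter so FORTIFY_SOURCE/UBSAN_BOUNDS can check accesses at
 * run time. A minimal allocation sketch (hypothetical helper, not part
 * of this patch set); the counter must be set before dev_info[] is used:
 */
#include <linux/overflow.h>
#include <linux/slab.h>
#include <net/bluetooth/rfcomm.h>

static struct rfcomm_dev_list_req *rfcomm_alloc_dev_list(u16 n)
{
	struct rfcomm_dev_list_req *dl;

	/* struct_size() adds sizeof(*dl) and n * sizeof(dl->dev_info[0])
	 * with overflow checking. */
	dl = kzalloc(struct_size(dl, dev_info, n), GFP_KERNEL);
	if (!dl)
		return NULL;

	dl->dev_num = n;	/* establish the bound before any dev_info[] access */
	return dl;
}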
u16 sm_mux_timer_counter; /* state machine mux timer counter */ tx_states_t sm_tx_state; /* state machine tx state */ - u16 sm_tx_timer_counter; /* state machine tx timer counter(allways on - enter to transmit state 3 time per second) */ + u16 sm_tx_timer_counter; /* state machine tx timer counter + * (always on - enter to transmit + * state 3 time per second) + */ u16 sm_churn_actor_timer_counter; u16 sm_churn_partner_timer_counter; u32 churn_actor_count; @@ -262,7 +265,7 @@ struct ad_system { struct ad_bond_info { struct ad_system system; /* 802.3ad system structure */ struct bond_3ad_stats stats; - u32 agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ + atomic_t agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ u16 aggregator_identifier; }; @@ -290,7 +293,7 @@ static inline const char *bond_3ad_churn_desc(churn_state_t state) } /* ========== AD Exported functions to the main bonding code ========== */ -void bond_3ad_initialize(struct bonding *bond, u16 tick_resolution); +void bond_3ad_initialize(struct bonding *bond); void bond_3ad_bind_slave(struct slave *slave); void bond_3ad_unbind_slave(struct slave *slave); void bond_3ad_state_machine_handler(struct work_struct *); diff --git a/include/net/bond_alb.h b/include/net/bond_alb.h index f6af76c87a6c..e5945427f38d 100644 --- a/include/net/bond_alb.h +++ b/include/net/bond_alb.h @@ -53,7 +53,7 @@ struct slave; struct tlb_client_info { - struct slave *tx_slave; /* A pointer to slave used for transmiting + struct slave *tx_slave; /* A pointer to slave used for transmitting * packets to a Client that the Hash function * gave this entry index. */ @@ -126,7 +126,7 @@ struct tlb_slave_info { struct alb_bond_info { struct tlb_client_info *tx_hashtbl; /* Dynamically allocated */ u32 unbalanced_load; - int tx_rebalance_counter; + atomic_t tx_rebalance_counter; int lp_counter; /* -------- rlb parameters -------- */ int rlb_enabled; @@ -156,8 +156,8 @@ int bond_alb_init_slave(struct bonding *bond, struct slave *slave); void bond_alb_deinit_slave(struct bonding *bond, struct slave *slave); void bond_alb_handle_link_change(struct bonding *bond, struct slave *slave, char link); void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave); -int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev); -int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev); +netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev); +netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev); struct slave *bond_xmit_alb_slave_get(struct bonding *bond, struct sk_buff *skb); struct slave *bond_xmit_tlb_slave_get(struct bonding *bond, diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 9d382f2f0bc5..18687ccf0638 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -7,6 +7,14 @@ #ifndef _NET_BOND_OPTIONS_H #define _NET_BOND_OPTIONS_H +#include <linux/bits.h> +#include <linux/limits.h> +#include <linux/types.h> +#include <linux/string.h> + +struct netlink_ext_ack; +struct nlattr; + #define BOND_OPT_MAX_NAMELEN 32 #define BOND_OPT_VALID(opt) ((opt) < BOND_OPT_LAST) #define BOND_MODE_ALL_EX(x) (~(x)) @@ -64,19 +72,31 @@ enum { BOND_OPT_AD_USER_PORT_KEY, BOND_OPT_NUM_PEER_NOTIF_ALIAS, BOND_OPT_PEER_NOTIF_DELAY, + BOND_OPT_LACP_ACTIVE, + BOND_OPT_MISSED_MAX, + BOND_OPT_NS_TARGETS, + BOND_OPT_PRIO, + BOND_OPT_COUPLED_CONTROL, BOND_OPT_LAST }; /* This structure is used for storing option values 
and for passing option * values when changing an option. The logic when used as an arg is as follows: - * - if string != NULL -> parse it, if the opt is RAW type then return it, else - * return the parse result - * - if string == NULL -> parse value + * - if value != ULLONG_MAX -> parse value + * - if string != NULL -> parse string + * - if the opt is RAW data and length less than maxlen, + * copy the data to extra storage */ + +#define BOND_OPT_EXTRA_MAXLEN 16 struct bond_opt_value { char *string; u64 value; u32 flags; + union { + char extra[BOND_OPT_EXTRA_MAXLEN]; + struct net_device *slave_dev; + }; }; struct bonding; @@ -100,7 +120,8 @@ struct bond_option { }; int __bond_opt_set(struct bonding *bond, unsigned int option, - struct bond_opt_value *val); + struct bond_opt_value *val, + struct nlattr *bad_attr, struct netlink_ext_ack *extack); int __bond_opt_set_notify(struct bonding *bond, unsigned int option, struct bond_opt_value *val); int bond_opt_tryset_rtnl(struct bonding *bond, unsigned int option, char *buf); @@ -116,18 +137,31 @@ const struct bond_opt_value *bond_opt_get_val(unsigned int option, u64 val); * When value is ULLONG_MAX then string will be used. */ static inline void __bond_opt_init(struct bond_opt_value *optval, - char *string, u64 value) + char *string, u64 value, + void *extra, size_t extra_len) { memset(optval, 0, sizeof(*optval)); optval->value = ULLONG_MAX; - if (value == ULLONG_MAX) - optval->string = string; - else + if (value != ULLONG_MAX) optval->value = value; + else if (string) + optval->string = string; + + if (extra && extra_len <= BOND_OPT_EXTRA_MAXLEN) + memcpy(optval->extra, extra, extra_len); } -#define bond_opt_initval(optval, value) __bond_opt_init(optval, NULL, value) -#define bond_opt_initstr(optval, str) __bond_opt_init(optval, str, ULLONG_MAX) +#define bond_opt_initval(optval, value) __bond_opt_init(optval, NULL, value, NULL, 0) +#define bond_opt_initstr(optval, str) __bond_opt_init(optval, str, ULLONG_MAX, NULL, 0) +#define bond_opt_initextra(optval, extra, extra_len) \ + __bond_opt_init(optval, NULL, ULLONG_MAX, extra, extra_len) +#define bond_opt_slave_initval(optval, slave_dev, value) \ + __bond_opt_init(optval, NULL, value, slave_dev, sizeof(struct net_device *)) void bond_option_arp_ip_targets_clear(struct bonding *bond); +#if IS_ENABLED(CONFIG_IPV6) +void bond_option_ns_ip6_targets_clear(struct bonding *bond); +#endif +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave); +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave); #endif /* _NET_BOND_OPTIONS_H */ diff --git a/include/net/bonding.h b/include/net/bonding.h index 7d132cc1e584..95f67b308c19 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-1.0+ */ /* * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'. * @@ -7,9 +8,6 @@ * BUT, I'm the one who modified it for ethernet, so: * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov * - * This software may be used and distributed according to the terms - * of the GNU Public License, incorporated herein by reference. 
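/*
 * Usage sketch for the reworked bond_opt_value initializers above: a
 * numeric value wins whenever it is not ULLONG_MAX, a string is parsed
 * otherwise, and small RAW payloads are copied into the embedded
 * extra[] storage. A hypothetical caller setting miimon (BOND_OPT_MIIMON
 * is from the same enum, outside this hunk; assumes RTNL is held, as
 * __bond_opt_set() requires; NULL bad_attr/extack since this is not a
 * netlink path):
 */
static int example_set_miimon(struct bonding *bond, u64 interval)
{
	struct bond_opt_value newval;

	bond_opt_initval(&newval, interval);
	return __bond_opt_set(bond, BOND_OPT_MIIMON, &newval, NULL, NULL);
}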
- * */ #ifndef _NET_BONDING_H @@ -29,8 +27,11 @@ #include <net/bond_3ad.h> #include <net/bond_alb.h> #include <net/bond_options.h> +#include <net/ipv6.h> +#include <net/addrconf.h> #define BOND_MAX_ARP_TARGETS 16 +#define BOND_MAX_NS_TARGETS BOND_MAX_ARP_TARGETS #define BOND_DEFAULT_MIIMON 100 @@ -86,10 +87,8 @@ #define bond_for_each_slave_rcu(bond, pos, iter) \ netdev_for_each_lower_private_rcu((bond)->dev, pos, iter) -#ifdef CONFIG_XFRM_OFFLOAD #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) -#endif /* CONFIG_XFRM_OFFLOAD */ #ifdef CONFIG_NET_POLL_CONTROLLER extern atomic_t netpoll_block_tx; @@ -121,6 +120,7 @@ struct bond_params { int xmit_policy; int miimon; u8 num_peer_notif; + u8 missed_max; int arp_interval; int arp_validate; int arp_all_targets; @@ -129,6 +129,7 @@ struct bond_params { int updelay; int downdelay; int peer_notif_delay; + int lacp_active; int lacp_fast; unsigned int min_links; int ad_select; @@ -144,22 +145,22 @@ struct bond_params { struct reciprocal_value reciprocal_packets_per_slave; u16 ad_actor_sys_prio; u16 ad_user_port_key; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; +#endif + int coupled_control; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; }; -struct bond_parm_tbl { - char *modename; - int mode; -}; - struct slave { struct net_device *dev; /* first - useful for panic debug */ struct bonding *bond; /* our master */ int delay; - /* all three in jiffies */ + /* all 4 in jiffies */ unsigned long last_link_up; + unsigned long last_tx; unsigned long last_rx; unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; s8 link; /* one of BOND_LINK_XXXX */ @@ -167,6 +168,7 @@ struct slave { u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ inactive:1, /* indicates inactive slave */ + rx_disabled:1, /* indicates whether slave's Rx is disabled */ should_notify:1, /* indicates whether the state changed */ should_notify_link:1; /* indicates whether the link changed */ u8 duplex; @@ -175,6 +177,7 @@ struct slave { u32 speed; u16 queue_id; u8 perm_hwaddr[MAX_ADDR_LEN]; + int prio; struct ad_slave_info *ad_info; struct tlb_slave_info tlb_info; #ifdef CONFIG_NET_POLL_CONTROLLER @@ -185,6 +188,11 @@ struct slave { struct rtnl_link_stats64 slave_stats; }; +static inline struct slave *to_slave(struct kobject *kobj) +{ + return container_of(kobj, struct slave, kobj); +} + struct bond_up_slave { unsigned int count; struct rcu_head rcu; @@ -196,6 +204,11 @@ struct bond_up_slave { */ #define BOND_LINK_NOCHANGE -1 +struct bond_ipsec { + struct list_head list; + struct xfrm_state *xs; +}; + /* * Here are the locking policies for the two bonding locks: * Get rcu_read_lock when reading or RTNL when writing slave list. 
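/*
 * The locking comment above is the contract for every slave-list
 * traversal. A minimal read-side sketch (hypothetical helper) built on
 * the bond_for_each_slave_rcu() iterator defined earlier in this header;
 * BOND_LINK_UP is one of the BOND_LINK_XXXX states noted at the link
 * field:
 */
static int example_count_link_up(struct bonding *bond)
{
	struct list_head *iter;
	struct slave *slave;
	int n = 0;

	rcu_read_lock();
	bond_for_each_slave_rcu(bond, slave, iter)
		if (slave->link == BOND_LINK_UP)
			n++;
	rcu_read_unlock();

	return n;
}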
@@ -208,6 +221,7 @@ struct bonding { struct bond_up_slave __rcu *usable_slaves; struct bond_up_slave __rcu *all_slaves; bool force_primary; + bool notifier_ctx; s32 slave_cnt; /* never change this value outside the attach/detach wrappers */ int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); @@ -220,14 +234,14 @@ struct bonding { */ spinlock_t mode_lock; spinlock_t stats_lock; - u8 send_peer_notif; + u32 send_peer_notif; u8 igmp_retrans; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_entry; char proc_file_name[IFNAMSIZ]; #endif /* CONFIG_PROC_FS */ struct list_head bond_list; - u32 rr_tx_counter; + u32 __percpu *rr_tx_counter; struct ad_bond_info ad_info; struct alb_bond_info alb_info; struct bond_params params; @@ -244,8 +258,11 @@ struct bonding { #endif /* CONFIG_DEBUG_FS */ struct rtnl_link_stats64 bond_stats; #ifdef CONFIG_XFRM_OFFLOAD - struct xfrm_state *xs; + struct list_head ipsec_list; + /* protecting ipsec_list */ + struct mutex ipsec_lock; #endif /* CONFIG_XFRM_OFFLOAD */ + struct bpf_prog *xdp_prog; }; #define bond_slave_get_rcu(dev) \ @@ -262,7 +279,7 @@ struct bond_vlan_tag { unsigned short vlan_id; }; -/** +/* * Returns NULL if the net_device does not belong to any of the bond's slaves * * Caller must hold bond lock for read @@ -334,7 +351,7 @@ static inline bool bond_uses_primary(struct bonding *bond) static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond) { - struct slave *slave = rcu_dereference(bond->curr_active_slave); + struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave); return bond_uses_primary(bond) && slave ? slave->dev : NULL; } @@ -487,6 +504,15 @@ static inline int bond_is_ip_target_ok(__be32 addr) return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr); } +#if IS_ENABLED(CONFIG_IPV6) +static inline int bond_is_ip6_target_ok(struct in6_addr *addr) +{ + return !ipv6_addr_any(addr) && + !ipv6_addr_loopback(addr) && + !ipv6_addr_is_multicast(addr); +} +#endif + /* Get the oldest arp which we've received on this slave for bond's * arp_targets. 
*/ @@ -512,6 +538,16 @@ static inline unsigned long slave_last_rx(struct bonding *bond, return slave->last_rx; } +static inline void slave_update_last_tx(struct slave *slave) +{ + WRITE_ONCE(slave->last_tx, jiffies); +} + +static inline unsigned long slave_last_tx(struct slave *slave) +{ + return READ_ONCE(slave->last_tx); +} + #ifdef CONFIG_NET_POLL_CONTROLLER static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) @@ -534,6 +570,14 @@ static inline void bond_set_slave_inactive_flags(struct slave *slave, bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); if (!slave->bond->params.all_slaves_active) slave->inactive = 1; + if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) + slave->rx_disabled = 1; +} + +static inline void bond_set_slave_tx_disabled_flags(struct slave *slave, + bool notify) +{ + bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); } static inline void bond_set_slave_active_flags(struct slave *slave, @@ -541,6 +585,14 @@ static inline void bond_set_slave_active_flags(struct slave *slave, { bond_set_slave_state(slave, BOND_STATE_ACTIVE, notify); slave->inactive = 0; + if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) + slave->rx_disabled = 0; +} + +static inline void bond_set_slave_rx_enabled_flags(struct slave *slave, + bool notify) +{ + slave->rx_disabled = 0; } static inline bool bond_is_slave_inactive(struct slave *slave) @@ -548,6 +600,11 @@ static inline bool bond_is_slave_inactive(struct slave *slave) return slave->inactive; } +static inline bool bond_is_slave_rx_disabled(struct slave *slave) +{ + return slave->rx_disabled; +} + static inline void bond_propose_link_state(struct slave *slave, int state) { slave->link_new_state = state; @@ -616,7 +673,7 @@ struct bond_net { struct class_attribute class_attr_bonding_masters; }; -int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); +int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev); int bond_create(struct net *net, const char *name); int bond_create_sysfs(struct bond_net *net); @@ -624,6 +681,7 @@ void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void bond_sysfs_slave_del(struct slave *slave); +void bond_xdp_set_features(struct net_device *bond_dev); int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct netlink_ext_ack *extack); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); @@ -637,6 +695,7 @@ void bond_debug_register(struct bonding *bond); void bond_debug_unregister(struct bonding *bond); void bond_debug_reregister(struct bonding *bond); const char *bond_mode_name(int mode); +bool bond_xdp_check(struct bonding *bond, int mode); void bond_setup(struct net_device *bond_dev); unsigned int bond_get_num_tx_queues(void); int bond_netlink_init(void); @@ -687,37 +746,14 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond, } /* Caller must hold rcu_read_lock() for read */ -static inline struct slave *bond_slave_has_mac_rcu(struct bonding *bond, - const u8 *mac) +static inline bool bond_slave_has_mac_rcu(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; bond_for_each_slave_rcu(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) - return tmp; - - return NULL; -} - -/* Caller must hold 
rcu_read_lock() for read */ -static inline bool bond_slave_has_mac_rx(struct bonding *bond, const u8 *mac) -{ - struct list_head *iter; - struct slave *tmp; - struct netdev_hw_addr *ha; - - bond_for_each_slave_rcu(bond, tmp, iter) - if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return true; - - if (netdev_uc_empty(bond->dev)) - return false; - - netdev_for_each_uc_addr(ha, bond->dev) - if (ether_addr_equal_64bits(mac, ha->addr)) - return true; - return false; } @@ -737,22 +773,40 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) return -1; } +#if IS_ENABLED(CONFIG_IPV6) +static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) +{ + struct in6_addr mcaddr; + int i; + + for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { + addrconf_addr_solict_mult(&targets[i], &mcaddr); + if ((ipv6_addr_equal(&targets[i], ip)) || + (ipv6_addr_equal(&mcaddr, ip))) + return i; + else if (ipv6_addr_any(&targets[i])) + break; + } + + return -1; +} +#endif + /* exported from bond_main.c */ extern unsigned int bond_net_id; -extern const struct bond_parm_tbl bond_lacp_tbl[]; -extern const struct bond_parm_tbl xmit_hashtype_tbl[]; -extern const struct bond_parm_tbl arp_validate_tbl[]; -extern const struct bond_parm_tbl arp_all_targets_tbl[]; -extern const struct bond_parm_tbl fail_over_mac_tbl[]; -extern const struct bond_parm_tbl pri_reselect_tbl[]; -extern struct bond_parm_tbl ad_select_tbl[]; /* exported from bond_netlink.c */ extern struct rtnl_link_ops bond_link_ops; +/* exported from bond_sysfs_slave.c */ +extern const struct sysfs_ops slave_sysfs_ops; + +/* exported from bond_3ad.c */ +extern const u8 lacpdu_mcast_addr[]; + static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) { - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); return NET_XMIT_DROP; } diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 3c516dd07caf..2926f1f00d65 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -20,12 +20,13 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +extern const struct bpf_func_proto bpf_sk_storage_get_tracing_proto; +extern const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto; struct bpf_local_storage_elem; struct bpf_sk_storage_diag; struct sk_buff; struct nlattr; -struct sock; #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index b001fa91c14e..6e172d0f6ef5 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -16,6 +16,7 @@ #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <net/ip.h> +#include <net/xdp.h> /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu @@ -23,6 +24,13 @@ */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +static inline bool napi_id_valid(unsigned int napi_id) +{ + return napi_id >= MIN_NAPI_ID; +} + +#define BUSY_POLL_BUDGET 8 + #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; @@ -31,19 +39,26 @@ extern unsigned int sysctl_net_busy_poll __read_mostly; static inline bool net_busy_loop_on(void) { - return sysctl_net_busy_poll; + return READ_ONCE(sysctl_net_busy_poll); } static inline bool sk_can_busy_loop(const struct sock *sk) { - return sk->sk_ll_usec && !signal_pending(current); + return 
READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); } bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg); + void *loop_end_arg, bool prefer_busy_poll, u16 budget); + +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget); + +void napi_suspend_irqs(unsigned int napi_id); +void napi_resume_irqs(unsigned int napi_id); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -61,7 +76,7 @@ static inline bool sk_can_busy_loop(struct sock *sk) static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL - return (unsigned long)(local_clock() >> 10); + return (unsigned long)(ktime_get_ns() >> 10); #else return 0; #endif @@ -104,40 +119,69 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); - if (napi_id >= MIN_NAPI_ID) - napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); + if (napi_id_valid(napi_id)) + napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, + READ_ONCE(sk->sk_prefer_busy_poll), + READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } /* used in the NIC receive handler to mark the skb */ -static inline void skb_mark_napi_id(struct sk_buff *skb, - struct napi_struct *napi) +static inline void __skb_mark_napi_id(struct sk_buff *skb, + const struct gro_node *gro) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ - if (skb->napi_id < MIN_NAPI_ID) - skb->napi_id = napi->napi_id; + if (!napi_id_valid(skb->napi_id)) + skb->napi_id = gro->cached_napi_id; #endif } -/* used in the protocol hanlder to propagate the napi_id to the socket */ +static inline void skb_mark_napi_id(struct sk_buff *skb, + const struct napi_struct *napi) +{ + __skb_mark_napi_id(skb, &napi->gro); +} + +/* used in the protocol handler to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL + if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id)) + WRITE_ONCE(sk->sk_napi_id, skb->napi_id); +#endif + sk_rx_queue_update(sk, skb); +} + +/* Variant of sk_mark_napi_id() for passive flow setup, + * as sk->sk_napi_id and sk->sk_rx_queue_mapping content + * needs to be set. + */ +static inline void sk_mark_napi_id_set(struct sock *sk, + const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_set(sk, skb); } +static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + if (!READ_ONCE(sk->sk_napi_id)) + WRITE_ONCE(sk->sk_napi_id, napi_id); +#endif +} + /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL - if (!READ_ONCE(sk->sk_napi_id)) - WRITE_ONCE(sk->sk_napi_id, skb->napi_id); + __sk_mark_napi_id_once(sk, skb->napi_id); #endif } diff --git a/include/net/caif/caif_dev.h b/include/net/caif/caif_dev.h index 48ecca8530ff..b655d8666f55 100644 --- a/include/net/caif/caif_dev.h +++ b/include/net/caif/caif_dev.h @@ -119,7 +119,7 @@ void caif_free_client(struct cflayer *adap_layer); * The link_support layer is used to add any Link Layer specific * framing. 
*/ -void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, +int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, struct cflayer *link_support, int head_room, struct cflayer **layer, int (**rcv_func)( struct sk_buff *, struct net_device *, diff --git a/include/net/caif/caif_hsi.h b/include/net/caif/caif_hsi.h deleted file mode 100644 index 552cf68d28d2..000000000000 --- a/include/net/caif/caif_hsi.h +++ /dev/null @@ -1,200 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Daniel Martensson / daniel.martensson@stericsson.com - * Dmitry.Tarnyagin / dmitry.tarnyagin@stericsson.com - */ - -#ifndef CAIF_HSI_H_ -#define CAIF_HSI_H_ - -#include <net/caif/caif_layer.h> -#include <net/caif/caif_device.h> -#include <linux/atomic.h> - -/* - * Maximum number of CAIF frames that can reside in the same HSI frame. - */ -#define CFHSI_MAX_PKTS 15 - -/* - * Maximum number of bytes used for the frame that can be embedded in the - * HSI descriptor. - */ -#define CFHSI_MAX_EMB_FRM_SZ 96 - -/* - * Decides if HSI buffers should be prefilled with 0xFF pattern for easier - * debugging. Both TX and RX buffers will be filled before the transfer. - */ -#define CFHSI_DBG_PREFILL 0 - -/* Structure describing a HSI packet descriptor. */ -#pragma pack(1) /* Byte alignment. */ -struct cfhsi_desc { - u8 header; - u8 offset; - u16 cffrm_len[CFHSI_MAX_PKTS]; - u8 emb_frm[CFHSI_MAX_EMB_FRM_SZ]; -}; -#pragma pack() /* Default alignment. */ - -/* Size of the complete HSI packet descriptor. */ -#define CFHSI_DESC_SZ (sizeof(struct cfhsi_desc)) - -/* - * Size of the complete HSI packet descriptor excluding the optional embedded - * CAIF frame. - */ -#define CFHSI_DESC_SHORT_SZ (CFHSI_DESC_SZ - CFHSI_MAX_EMB_FRM_SZ) - -/* - * Maximum bytes transferred in one transfer. - */ -#define CFHSI_MAX_CAIF_FRAME_SZ 4096 - -#define CFHSI_MAX_PAYLOAD_SZ (CFHSI_MAX_PKTS * CFHSI_MAX_CAIF_FRAME_SZ) - -/* Size of the complete HSI TX buffer. */ -#define CFHSI_BUF_SZ_TX (CFHSI_DESC_SZ + CFHSI_MAX_PAYLOAD_SZ) - -/* Size of the complete HSI RX buffer. */ -#define CFHSI_BUF_SZ_RX ((2 * CFHSI_DESC_SZ) + CFHSI_MAX_PAYLOAD_SZ) - -/* Bitmasks for the HSI descriptor. */ -#define CFHSI_PIGGY_DESC (0x01 << 7) - -#define CFHSI_TX_STATE_IDLE 0 -#define CFHSI_TX_STATE_XFER 1 - -#define CFHSI_RX_STATE_DESC 0 -#define CFHSI_RX_STATE_PAYLOAD 1 - -/* Bitmasks for power management. */ -#define CFHSI_WAKE_UP 0 -#define CFHSI_WAKE_UP_ACK 1 -#define CFHSI_WAKE_DOWN_ACK 2 -#define CFHSI_AWAKE 3 -#define CFHSI_WAKELOCK_HELD 4 -#define CFHSI_SHUTDOWN 5 -#define CFHSI_FLUSH_FIFO 6 - -#ifndef CFHSI_INACTIVITY_TOUT -#define CFHSI_INACTIVITY_TOUT (1 * HZ) -#endif /* CFHSI_INACTIVITY_TOUT */ - -#ifndef CFHSI_WAKE_TOUT -#define CFHSI_WAKE_TOUT (3 * HZ) -#endif /* CFHSI_WAKE_TOUT */ - -#ifndef CFHSI_MAX_RX_RETRIES -#define CFHSI_MAX_RX_RETRIES (10 * HZ) -#endif - -/* Structure implemented by the CAIF HSI driver. */ -struct cfhsi_cb_ops { - void (*tx_done_cb) (struct cfhsi_cb_ops *drv); - void (*rx_done_cb) (struct cfhsi_cb_ops *drv); - void (*wake_up_cb) (struct cfhsi_cb_ops *drv); - void (*wake_down_cb) (struct cfhsi_cb_ops *drv); -}; - -/* Structure implemented by HSI device. 
*/ -struct cfhsi_ops { - int (*cfhsi_up) (struct cfhsi_ops *dev); - int (*cfhsi_down) (struct cfhsi_ops *dev); - int (*cfhsi_tx) (u8 *ptr, int len, struct cfhsi_ops *dev); - int (*cfhsi_rx) (u8 *ptr, int len, struct cfhsi_ops *dev); - int (*cfhsi_wake_up) (struct cfhsi_ops *dev); - int (*cfhsi_wake_down) (struct cfhsi_ops *dev); - int (*cfhsi_get_peer_wake) (struct cfhsi_ops *dev, bool *status); - int (*cfhsi_fifo_occupancy) (struct cfhsi_ops *dev, size_t *occupancy); - int (*cfhsi_rx_cancel)(struct cfhsi_ops *dev); - struct cfhsi_cb_ops *cb_ops; -}; - -/* Structure holds status of received CAIF frames processing */ -struct cfhsi_rx_state { - int state; - int nfrms; - int pld_len; - int retries; - bool piggy_desc; -}; - -/* Priority mapping */ -enum { - CFHSI_PRIO_CTL = 0, - CFHSI_PRIO_VI, - CFHSI_PRIO_VO, - CFHSI_PRIO_BEBK, - CFHSI_PRIO_LAST, -}; - -struct cfhsi_config { - u32 inactivity_timeout; - u32 aggregation_timeout; - u32 head_align; - u32 tail_align; - u32 q_high_mark; - u32 q_low_mark; -}; - -/* Structure implemented by CAIF HSI drivers. */ -struct cfhsi { - struct caif_dev_common cfdev; - struct net_device *ndev; - struct platform_device *pdev; - struct sk_buff_head qhead[CFHSI_PRIO_LAST]; - struct cfhsi_cb_ops cb_ops; - struct cfhsi_ops *ops; - int tx_state; - struct cfhsi_rx_state rx_state; - struct cfhsi_config cfg; - int rx_len; - u8 *rx_ptr; - u8 *tx_buf; - u8 *rx_buf; - u8 *rx_flip_buf; - spinlock_t lock; - int flow_off_sent; - struct list_head list; - struct work_struct wake_up_work; - struct work_struct wake_down_work; - struct work_struct out_of_sync_work; - struct workqueue_struct *wq; - wait_queue_head_t wake_up_wait; - wait_queue_head_t wake_down_wait; - wait_queue_head_t flush_fifo_wait; - struct timer_list inactivity_timer; - struct timer_list rx_slowpath_timer; - - /* TX aggregation */ - int aggregation_len; - struct timer_list aggregation_timer; - - unsigned long bits; -}; -extern struct platform_driver cfhsi_driver; - -/** - * enum ifla_caif_hsi - CAIF HSI NetlinkRT parameters. - * @IFLA_CAIF_HSI_INACTIVITY_TOUT: Inactivity timeout before - * taking the HSI wakeline down, in milliseconds. - * When using RT Netlink to create, destroy or configure a CAIF HSI interface, - * enum ifla_caif_hsi is used to specify the configuration attributes. - */ -enum ifla_caif_hsi { - __IFLA_CAIF_HSI_UNSPEC, - __IFLA_CAIF_HSI_INACTIVITY_TOUT, - __IFLA_CAIF_HSI_AGGREGATION_TOUT, - __IFLA_CAIF_HSI_HEAD_ALIGN, - __IFLA_CAIF_HSI_TAIL_ALIGN, - __IFLA_CAIF_HSI_QHIGH_WATERMARK, - __IFLA_CAIF_HSI_QLOW_WATERMARK, - __IFLA_CAIF_HSI_MAX -}; - -struct cfhsi_ops *cfhsi_get_ops(void); - -#endif /* CAIF_HSI_H_ */ diff --git a/include/net/caif/caif_layer.h b/include/net/caif/caif_layer.h index 51f7bb42a936..053e7c6a6a66 100644 --- a/include/net/caif/caif_layer.h +++ b/include/net/caif/caif_layer.h @@ -11,9 +11,7 @@ struct cflayer; struct cfpkt; -struct cfpktq; struct caif_payload_info; -struct caif_packet_funcs; #define CAIF_LAYER_NAME_SZ 16 @@ -22,7 +20,7 @@ struct caif_packet_funcs; * @assert: expression to evaluate. * * This function will print a error message and a do WARN_ON if the - * assertion failes. Normally this will do a stack up at the current location. + * assertion fails. Normally this will do a stack up at the current location. */ #define caif_assert(assert) \ do { \ @@ -118,7 +116,7 @@ enum caif_direction { * @dn: Pointer down to the layer below. * @node: List node used when layer participate in a list. * @receive: Packet receive function. - * @transmit: Packet transmit funciton. 
+ * @transmit: Packet transmit function. * @ctrlcmd: Used for control signalling upwards in the stack. * @modemcmd: Used for control signaling downwards in the stack. * @id: The identity of this layer diff --git a/include/net/caif/cfcnfg.h b/include/net/caif/cfcnfg.h index 2aa5e91d8457..8819ff4db35a 100644 --- a/include/net/caif/cfcnfg.h +++ b/include/net/caif/cfcnfg.h @@ -62,7 +62,7 @@ void cfcnfg_remove(struct cfcnfg *cfg); * @fcs: Specify if checksum is used in CAIF Framing Layer. * @head_room: Head space needed by link specific protocol. */ -void +int cfcnfg_add_phy_layer(struct cfcnfg *cnfg, struct net_device *dev, struct cflayer *phy_layer, enum cfcnfg_phy_preference pref, diff --git a/include/net/caif/cfpkt.h b/include/net/caif/cfpkt.h index 44d914a50369..acf664227d96 100644 --- a/include/net/caif/cfpkt.h +++ b/include/net/caif/cfpkt.h @@ -18,7 +18,7 @@ struct cfpkt *cfpkt_create(u16 len); /* * Destroy a CAIF Packet. - * pkt Packet to be destoyed. + * pkt Packet to be destroyed. */ void cfpkt_destroy(struct cfpkt *pkt); diff --git a/include/net/caif/cfserl.h b/include/net/caif/cfserl.h index 14a55e03bb3c..67cce8757175 100644 --- a/include/net/caif/cfserl.h +++ b/include/net/caif/cfserl.h @@ -9,4 +9,5 @@ #include <net/caif/caif_layer.h> struct cflayer *cfserl_create(int instance, bool use_stx); +void cfserl_release(struct cflayer *layer); #endif diff --git a/include/net/caif/cfsrvl.h b/include/net/caif/cfsrvl.h index bd5440977f7f..a000dc45f966 100644 --- a/include/net/caif/cfsrvl.h +++ b/include/net/caif/cfsrvl.h @@ -33,9 +33,6 @@ struct cflayer *cfrfml_create(u8 linkid, struct dev_info *dev_info, int mtu_size); struct cflayer *cfdbgl_create(u8 linkid, struct dev_info *dev_info); -void cfsrvl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid); - bool cfsrvl_phyid_match(struct cflayer *layer, int phyid); void cfsrvl_init(struct cfsrvl *service, @@ -43,7 +40,6 @@ void cfsrvl_init(struct cfsrvl *service, struct dev_info *dev_info, bool supports_flowctrl); bool cfsrvl_ready(struct cfsrvl *service, int *err); -u8 cfsrvl_getphyid(struct cflayer *layer); static inline void cfsrvl_get(struct cflayer *layr) { diff --git a/include/net/calipso.h b/include/net/calipso.h index f8667a3fda9e..76b9e08c10c2 100644 --- a/include/net/calipso.h +++ b/include/net/calipso.h @@ -25,7 +25,7 @@ #include <net/netlabel.h> #include <net/request_sock.h> #include <linux/refcount.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /* known doi values */ #define CALIPSO_DOI_UNKNOWN 0x00000000 diff --git a/include/net/cfg80211-wext.h b/include/net/cfg80211-wext.h index ad77caf2ffde..0ee36d97e068 100644 --- a/include/net/cfg80211-wext.h +++ b/include/net/cfg80211-wext.h @@ -19,34 +19,34 @@ */ int cfg80211_wext_giwname(struct net_device *dev, struct iw_request_info *info, - char *name, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwscan(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwscan(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwrange(struct net_device *dev, struct iw_request_info *info, - struct 
iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwretry(struct net_device *dev, struct iw_request_info *info, - struct iw_param *retry, char *extra); + union iwreq_data *wrqu, char *extra); #endif /* __NET_CFG80211_WEXT_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 661edfc8722e..d1848dc8ec99 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7,9 +7,11 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ +#include <linux/ethtool.h> +#include <uapi/linux/rfkill.h> #include <linux/netdevice.h> #include <linux/debugfs.h> #include <linux/list.h> @@ -20,6 +22,7 @@ #include <linux/if_ether.h> #include <linux/ieee80211.h> #include <linux/net.h> +#include <linux/rfkill.h> #include <net/regulatory.h> /** @@ -49,7 +52,7 @@ * such wiphy can have zero, one, or many virtual interfaces associated with * it, which need to be identified as such by pointing the network interface's * @ieee80211_ptr pointer to a &struct wireless_dev which further describes - * the wireless part of the interface, normally this struct is embedded in the + * the wireless part of the interface. Normally this struct is embedded in the * network interface's private data area. Drivers can optionally allow creating * or destroying virtual interfaces on the fly, but without at least one or the * ability to create some the wireless device isn't useful. @@ -73,6 +76,8 @@ struct wiphy; * @IEEE80211_CHAN_DISABLED: This channel is disabled. * @IEEE80211_CHAN_NO_IR: do not initiate radiation, this includes * sending probe requests or beaconing. + * @IEEE80211_CHAN_PSD: Power spectral density (in dBm) is set for this + * channel. * @IEEE80211_CHAN_RADAR: Radar detection is required on this channel. * @IEEE80211_CHAN_NO_HT40PLUS: extension channel above this channel * is not permitted. @@ -106,28 +111,53 @@ struct wiphy; * on this channel. * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted * on this channel. - * + * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band, + * this flag indicates that a 320 MHz channel cannot use this + * channel as the control or any of the secondary channels. + * This may be due to the driver or due to regulatory bandwidth + * restrictions. + * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel. 
+ * @IEEE80211_CHAN_DFS_CONCURRENT: See %NL80211_RRF_DFS_CONCURRENT + * @IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT: Client connection with VLP AP + * not permitted using this channel + * @IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT: Client connection with AFC AP + * not permitted using this channel + * @IEEE80211_CHAN_CAN_MONITOR: This channel can be used for monitor + * mode even in the presence of other (regulatory) restrictions, + * even if it is otherwise disabled. + * @IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP: Allow using this channel for AP operation + * with very low power (VLP), even if otherwise set to NO_IR. + * @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel, + * even if otherwise set to NO_IR. */ enum ieee80211_channel_flags { - IEEE80211_CHAN_DISABLED = 1<<0, - IEEE80211_CHAN_NO_IR = 1<<1, - /* hole at 1<<2 */ - IEEE80211_CHAN_RADAR = 1<<3, - IEEE80211_CHAN_NO_HT40PLUS = 1<<4, - IEEE80211_CHAN_NO_HT40MINUS = 1<<5, - IEEE80211_CHAN_NO_OFDM = 1<<6, - IEEE80211_CHAN_NO_80MHZ = 1<<7, - IEEE80211_CHAN_NO_160MHZ = 1<<8, - IEEE80211_CHAN_INDOOR_ONLY = 1<<9, - IEEE80211_CHAN_IR_CONCURRENT = 1<<10, - IEEE80211_CHAN_NO_20MHZ = 1<<11, - IEEE80211_CHAN_NO_10MHZ = 1<<12, - IEEE80211_CHAN_NO_HE = 1<<13, - IEEE80211_CHAN_1MHZ = 1<<14, - IEEE80211_CHAN_2MHZ = 1<<15, - IEEE80211_CHAN_4MHZ = 1<<16, - IEEE80211_CHAN_8MHZ = 1<<17, - IEEE80211_CHAN_16MHZ = 1<<18, + IEEE80211_CHAN_DISABLED = BIT(0), + IEEE80211_CHAN_NO_IR = BIT(1), + IEEE80211_CHAN_PSD = BIT(2), + IEEE80211_CHAN_RADAR = BIT(3), + IEEE80211_CHAN_NO_HT40PLUS = BIT(4), + IEEE80211_CHAN_NO_HT40MINUS = BIT(5), + IEEE80211_CHAN_NO_OFDM = BIT(6), + IEEE80211_CHAN_NO_80MHZ = BIT(7), + IEEE80211_CHAN_NO_160MHZ = BIT(8), + IEEE80211_CHAN_INDOOR_ONLY = BIT(9), + IEEE80211_CHAN_IR_CONCURRENT = BIT(10), + IEEE80211_CHAN_NO_20MHZ = BIT(11), + IEEE80211_CHAN_NO_10MHZ = BIT(12), + IEEE80211_CHAN_NO_HE = BIT(13), + IEEE80211_CHAN_1MHZ = BIT(14), + IEEE80211_CHAN_2MHZ = BIT(15), + IEEE80211_CHAN_4MHZ = BIT(16), + IEEE80211_CHAN_8MHZ = BIT(17), + IEEE80211_CHAN_16MHZ = BIT(18), + IEEE80211_CHAN_NO_320MHZ = BIT(19), + IEEE80211_CHAN_NO_EHT = BIT(20), + IEEE80211_CHAN_DFS_CONCURRENT = BIT(21), + IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT = BIT(22), + IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT = BIT(23), + IEEE80211_CHAN_CAN_MONITOR = BIT(24), + IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP = BIT(25), + IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY = BIT(26), }; #define IEEE80211_CHAN_NO_HT40 \ @@ -161,6 +191,7 @@ enum ieee80211_channel_flags { * on this channel. * @dfs_state_entered: timestamp (jiffies) when the dfs state was entered. * @dfs_cac_ms: DFS CAC time in milliseconds, this is valid for DFS channels. 
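/*
 * With the channel flags above expressed via BIT(), driver-side
 * capability checks reduce to plain mask tests. A minimal sketch
 * (hypothetical helper, not part of this patch set):
 */
static bool example_chan_allows_eht(const struct ieee80211_channel *chan)
{
	return !(chan->flags & (IEEE80211_CHAN_DISABLED |
				IEEE80211_CHAN_NO_EHT));
}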
+ * @psd: power spectral density (in dBm) */ struct ieee80211_channel { enum nl80211_band band; @@ -177,6 +208,7 @@ struct ieee80211_channel { enum nl80211_dfs_state dfs_state; unsigned long dfs_state_entered; unsigned int dfs_cac_ms; + s8 psd; }; /** @@ -203,13 +235,13 @@ struct ieee80211_channel { * @IEEE80211_RATE_SUPPORTS_10MHZ: Rate can be used in 10 MHz mode */ enum ieee80211_rate_flags { - IEEE80211_RATE_SHORT_PREAMBLE = 1<<0, - IEEE80211_RATE_MANDATORY_A = 1<<1, - IEEE80211_RATE_MANDATORY_B = 1<<2, - IEEE80211_RATE_MANDATORY_G = 1<<3, - IEEE80211_RATE_ERP_G = 1<<4, - IEEE80211_RATE_SUPPORTS_5MHZ = 1<<5, - IEEE80211_RATE_SUPPORTS_10MHZ = 1<<6, + IEEE80211_RATE_SHORT_PREAMBLE = BIT(0), + IEEE80211_RATE_MANDATORY_A = BIT(1), + IEEE80211_RATE_MANDATORY_B = BIT(2), + IEEE80211_RATE_MANDATORY_G = BIT(3), + IEEE80211_RATE_ERP_G = BIT(4), + IEEE80211_RATE_SUPPORTS_5MHZ = BIT(5), + IEEE80211_RATE_SUPPORTS_10MHZ = BIT(6), }; /** @@ -253,7 +285,7 @@ enum ieee80211_privacy { * are only for driver use when pointers to this structure are * passed around. * - * @flags: rate-specific flags + * @flags: rate-specific flags from &enum ieee80211_rate_flags * @bitrate: bitrate in units of 100 Kbps * @hw_value: driver/hardware value for this rate * @hw_value_short: driver/hardware value for this rate when @@ -302,19 +334,6 @@ struct cfg80211_he_bss_color { }; /** - * struct ieee80211_he_bss_color - AP settings for BSS coloring - * - * @color: the current color. - * @disabled: is the feature disabled. - * @partial: define the AID equation. - */ -struct ieee80211_he_bss_color { - u8 color; - bool disabled; - bool partial; -}; - -/** * struct ieee80211_sta_ht_cap - STA's HT capabilities * * This structure describes most essential parameters needed @@ -371,7 +390,63 @@ struct ieee80211_sta_he_cap { }; /** - * struct ieee80211_sband_iftype_data + * struct ieee80211_eht_mcs_nss_supp - EHT max supported NSS per MCS + * + * See P802.11be_D1.3 Table 9-401k - "Subfields of the Supported EHT-MCS + * and NSS Set field" + * + * @only_20mhz: MCS/NSS support for 20 MHz-only STA. + * @bw: MCS/NSS support for 80, 160 and 320 MHz + * @bw._80: MCS/NSS support for BW <= 80 MHz + * @bw._160: MCS/NSS support for BW = 160 MHz + * @bw._320: MCS/NSS support for BW = 320 MHz + */ +struct ieee80211_eht_mcs_nss_supp { + union { + struct ieee80211_eht_mcs_nss_supp_20mhz_only only_20mhz; + struct { + struct ieee80211_eht_mcs_nss_supp_bw _80; + struct ieee80211_eht_mcs_nss_supp_bw _160; + struct ieee80211_eht_mcs_nss_supp_bw _320; + } __packed bw; + } __packed; +} __packed; + +#define IEEE80211_EHT_PPE_THRES_MAX_LEN 32 + +/** + * struct ieee80211_sta_eht_cap - STA's EHT capabilities + * + * This structure describes most essential parameters needed + * to describe 802.11be EHT capabilities for a STA. + * + * @has_eht: true iff EHT data is valid. + * @eht_cap_elem: Fixed portion of the eht capabilities element. + * @eht_mcs_nss_supp: The supported NSS/MCS combinations. + * @eht_ppe_thres: Holds the PPE Thresholds data. 
+ */ +struct ieee80211_sta_eht_cap { + bool has_eht; + struct ieee80211_eht_cap_elem_fixed eht_cap_elem; + struct ieee80211_eht_mcs_nss_supp eht_mcs_nss_supp; + u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN]; +}; + +/* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */ +#ifdef __CHECKER__ +/* + * This is used to mark the sband->iftype_data pointer which is supposed + * to be an array with special access semantics (per iftype), but a lot + * of code got it wrong in the past, so with this marking sparse will be + * noisy when the pointer is used directly. + */ +# define __iftd __attribute__((noderef, address_space(__iftype_data))) +#else +# define __iftd +#endif /* __CHECKER__ */ + +/** + * struct ieee80211_sband_iftype_data - sband data per interface type * * This structure encapsulates sband data that is relevant for the * interface types defined in @types_mask. Each type in the @@ -381,11 +456,20 @@ struct ieee80211_sta_he_cap { * @he_cap: holds the HE capabilities * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a * 6 GHz band channel (and 0 may be valid value). + * @eht_cap: STA's EHT capabilities + * @vendor_elems: vendor element(s) to advertise + * @vendor_elems.data: vendor element(s) data + * @vendor_elems.len: vendor element(s) length */ struct ieee80211_sband_iftype_data { u16 types_mask; struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; + struct ieee80211_sta_eht_cap eht_cap; + struct { + const u8 *data; + unsigned int len; + } vendor_elems; }; /** @@ -448,7 +532,7 @@ struct ieee80211_edmg { * This structure describes most essential parameters needed * to describe 802.11ah S1G capabilities for a STA. * - * @s1g_supported: is STA an S1G STA + * @s1g: is STA an S1G STA * @cap: S1G capabilities information * @nss_mcs: Supported NSS MCS set */ @@ -494,10 +578,48 @@ struct ieee80211_supported_band { struct ieee80211_sta_s1g_cap s1g_cap; struct ieee80211_edmg edmg_cap; u16 n_iftype_data; - const struct ieee80211_sband_iftype_data *iftype_data; + const struct ieee80211_sband_iftype_data __iftd *iftype_data; }; /** + * _ieee80211_set_sband_iftype_data - set sband iftype data array + * @sband: the sband to initialize + * @iftd: the iftype data array pointer + * @n_iftd: the length of the iftype data array + * + * Set the sband iftype data array; use this where the length cannot + * be derived from the ARRAY_SIZE() of the argument, but prefer + * ieee80211_set_sband_iftype_data() where it can be used. 
+ */ +static inline void +_ieee80211_set_sband_iftype_data(struct ieee80211_supported_band *sband, + const struct ieee80211_sband_iftype_data *iftd, + u16 n_iftd) +{ + sband->iftype_data = (const void __iftd __force *)iftd; + sband->n_iftype_data = n_iftd; +} + +/** + * ieee80211_set_sband_iftype_data - set sband iftype data array + * @sband: the sband to initialize + * @iftd: the iftype data array + */ +#define ieee80211_set_sband_iftype_data(sband, iftd) \ + _ieee80211_set_sband_iftype_data(sband, iftd, ARRAY_SIZE(iftd)) + +/** + * for_each_sband_iftype_data - iterate sband iftype data entries + * @sband: the sband whose iftype_data array to iterate + * @i: iterator counter + * @iftd: iftype data pointer to set + */ +#define for_each_sband_iftype_data(sband, i, iftd) \ + for (i = 0, iftd = (const void __force *)&(sband)->iftype_data[i]; \ + i < (sband)->n_iftype_data; \ + i++, iftd = (const void __force *)&(sband)->iftype_data[i]) + +/** * ieee80211_get_sband_iftype_data - return sband data for a given iftype * @sband: the sband to search for the STA on * @iftype: enum nl80211_iftype @@ -508,15 +630,16 @@ static inline const struct ieee80211_sband_iftype_data * ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, u8 iftype) { + const struct ieee80211_sband_iftype_data *data; int i; if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) return NULL; + if (iftype == NL80211_IFTYPE_AP_VLAN) + iftype = NL80211_IFTYPE_AP; - for (i = 0; i < sband->n_iftype_data; i++) { - const struct ieee80211_sband_iftype_data *data = - &sband->iftype_data[i]; + for_each_sband_iftype_data(sband, i, data) { if (data->types_mask & BIT(iftype)) return data; } @@ -545,18 +668,6 @@ ieee80211_get_he_iftype_cap(const struct ieee80211_supported_band *sband, } /** - * ieee80211_get_he_sta_cap - return HE capabilities for an sband's STA - * @sband: the sband to search for the STA on - * - * Return: pointer to the struct ieee80211_sta_he_cap, or NULL is none found - */ -static inline const struct ieee80211_sta_he_cap * -ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband) -{ - return ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_STATION); -} - -/** * ieee80211_get_he_6ghz_capa - return HE 6 GHz capabilities * @sband: the sband to search for the STA on * @iftype: the iftype to search for @@ -577,6 +688,26 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband, } /** + * ieee80211_get_eht_iftype_cap - return EHT capabilities for an sband's iftype + * @sband: the sband to search for the iftype on + * @iftype: enum nl80211_iftype + * + * Return: pointer to the struct ieee80211_sta_eht_cap, or NULL if none found + */ +static inline const struct ieee80211_sta_eht_cap * +ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband, + enum nl80211_iftype iftype) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, iftype); + + if (data && data->eht_cap.has_eht) + return &data->eht_cap; + + return NULL; +} + +/** * wiphy_read_of_freq_limits - read frequency limits from device tree * * @wiphy: the wireless device to get extra limits for @@ -687,6 +818,9 @@ struct key_params { * chan will define the primary channel and all other * parameters are ignored. 
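/*
 * The accessors above funnel every iftype_data lookup through
 * ieee80211_get_sband_iftype_data(), letting sparse flag direct pointer
 * use. A minimal capability query sketch (hypothetical helper):
 */
static bool
example_sband_has_eht_ap(const struct ieee80211_supported_band *sband)
{
	return ieee80211_get_eht_iftype_cap(sband, NL80211_IFTYPE_AP) != NULL;
}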
* @freq1_offset: offset from @center_freq1, in KHz + * @punctured: mask of the punctured 20 MHz subchannels, with + * bits turned on being disabled (punctured); numbered + * from lower to higher frequency (like in the spec) */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -695,6 +829,7 @@ struct cfg80211_chan_def { u32 center_freq2; struct ieee80211_edmg edmg; u16 freq1_offset; + u16 punctured; }; /* @@ -751,7 +886,35 @@ struct cfg80211_tid_cfg { struct cfg80211_tid_config { const u8 *peer; u32 n_tid_conf; - struct cfg80211_tid_cfg tid_conf[]; + struct cfg80211_tid_cfg tid_conf[] __counted_by(n_tid_conf); +}; + +/** + * struct cfg80211_fils_aad - FILS AAD data + * @macaddr: STA MAC address + * @kek: FILS KEK + * @kek_len: FILS KEK length + * @snonce: STA Nonce + * @anonce: AP Nonce + */ +struct cfg80211_fils_aad { + const u8 *macaddr; + const u8 *kek; + u8 kek_len; + const u8 *snonce; + const u8 *anonce; +}; + +/** + * struct cfg80211_set_hw_timestamp - enable/disable HW timestamping + * @macaddr: peer MAC address. NULL to enable/disable HW timestamping for all + * addresses. + * @enable: if set, enable HW timestamping for the specified MAC address. + * Otherwise disable HW timestamping for the specified MAC address. + */ +struct cfg80211_set_hw_timestamp { + const u8 *macaddr; + bool enable; }; /** @@ -807,7 +970,8 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1, chandef1->width == chandef2->width && chandef1->center_freq1 == chandef2->center_freq1 && chandef1->freq1_offset == chandef2->freq1_offset && - chandef1->center_freq2 == chandef2->center_freq2); + chandef1->center_freq2 == chandef2->center_freq2 && + chandef1->punctured == chandef2->punctured); } /** @@ -836,6 +1000,26 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1, const struct cfg80211_chan_def *chandef2); /** + * nl80211_chan_width_to_mhz - get the channel width in MHz + * @chan_width: the channel width from &enum nl80211_chan_width + * + * Return: channel width in MHz if the chan_width from &enum nl80211_chan_width + * is valid. -1 otherwise. + */ +int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width); + +/** + * cfg80211_chandef_get_width - return chandef width in MHz + * @c: chandef to return bandwidth for + * Return: channel width in MHz for the given chandef; note that it returns + * 80 for 80+80 configurations + */ +static inline int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) +{ + return nl80211_chan_width_to_mhz(c->width); +} + +/** * cfg80211_chandef_valid - check if a channel definition is valid * @chandef: the channel definition to check * Return: %true if the channel definition is valid. %false otherwise. @@ -866,28 +1050,51 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, enum nl80211_iftype iftype); /** - * ieee80211_chandef_rate_flags - returns rate flags for a channel + * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable and we + * can/need start CAC on such channel + * @wiphy: the wiphy to validate against + * @chandef: the channel definition to check * - * In some channel types, not all rates may be used - for example CCK - * rates may not be used in 5/10 MHz channels. 
+ * Return: true if all channels available and at least + * one channel requires CAC (NL80211_DFS_USABLE) + */ +bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef); + +/** + * cfg80211_chandef_dfs_cac_time - get the DFS CAC time (in ms) for given + * channel definition + * @wiphy: the wiphy to validate against + * @chandef: the channel definition to check * - * @chandef: channel definition for the channel + * Returns: DFS CAC time (in ms) which applies for this channel definition + */ +unsigned int +cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef); + +/** + * cfg80211_chandef_primary - calculate primary 40/80/160 MHz freq + * @chandef: chandef to calculate for + * @primary_chan_width: primary channel width to calculate center for + * @punctured: punctured sub-channel bitmap, will be recalculated + * according to the new bandwidth, can be %NULL * - * Returns: rate flags which apply for this channel + * Returns: the primary 40/80/160 MHz channel center frequency, or -1 + * for errors, updating the punctured bitmap */ -static inline enum ieee80211_rate_flags -ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef) -{ - switch (chandef->width) { - case NL80211_CHAN_WIDTH_5: - return IEEE80211_RATE_SUPPORTS_5MHZ; - case NL80211_CHAN_WIDTH_10: - return IEEE80211_RATE_SUPPORTS_10MHZ; - default: - break; - } - return 0; -} +int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef, + enum nl80211_chan_width primary_chan_width, + u16 *punctured); + +/** + * nl80211_send_chandef - sends the channel definition. + * @msg: the msg to send channel definition + * @chandef: the channel definition to check + * + * Returns: 0 if sent the channel definition to msg, < 0 on error + **/ +int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef); /** * ieee80211_chandef_max_power - maximum transmission power for the chandef @@ -917,6 +1124,19 @@ ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef) } /** + * cfg80211_any_usable_channels - check for usable channels + * @wiphy: the wiphy to check for + * @band_mask: which bands to check on + * @prohibited_flags: which channels to not consider usable, + * %IEEE80211_CHAN_DISABLED is always taken into account + * + * Return: %true if usable channels found, %false otherwise + */ +bool cfg80211_any_usable_channels(struct wiphy *wiphy, + unsigned long band_mask, + u32 prohibited_flags); + +/** * enum survey_info_flags - survey information flags * * @SURVEY_INFO_NOISE_DBM: noise (in dBm) was filled in @@ -978,7 +1198,7 @@ struct survey_info { s8 noise; }; -#define CFG80211_MAX_WEP_KEYS 4 +#define CFG80211_MAX_NUM_AKM_SUITES 10 /** * struct cfg80211_crypto_settings - Crypto settings @@ -1001,13 +1221,25 @@ struct survey_info { * port frames over NL80211 instead of the network interface. 
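/*
 * Usage sketch for cfg80211_chandef_primary() declared above: it
 * returns the center frequency of the primary portion at the requested
 * width and recalculates the puncturing bitmap for that portion
 * (hypothetical helper; a real caller would check the -1 error return):
 */
static int example_primary80_center(const struct cfg80211_chan_def *def)
{
	u16 punct = def->punctured;

	/* On return, punct covers just the primary 80 MHz portion. */
	return cfg80211_chandef_primary(def, NL80211_CHAN_WIDTH_80, &punct);
}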
* @control_port_no_preauth: disables pre-auth rx over the nl80211 control * port for mac80211 - * @wep_keys: static WEP keys, if not NULL points to an array of - * CFG80211_MAX_WEP_KEYS WEP keys - * @wep_tx_key: key index (0..3) of the default TX static WEP key * @psk: PSK (for devices supporting 4-way-handshake offload) * @sae_pwd: password for SAE authentication (for devices supporting SAE * offload) * @sae_pwd_len: length of SAE password (for devices supporting SAE offload) + * @sae_pwe: The mechanisms allowed for SAE PWE derivation: + * + * NL80211_SAE_PWE_UNSPECIFIED + * Not-specified, used to indicate userspace did not specify any + * preference. The driver should follow its internal policy in + * such a scenario. + * + * NL80211_SAE_PWE_HUNT_AND_PECK + * Allow hunting-and-pecking loop only + * + * NL80211_SAE_PWE_HASH_TO_ELEMENT + * Allow hash-to-element only + * + * NL80211_SAE_PWE_BOTH + * Allow either hunting-and-pecking loop or hash-to-element */ struct cfg80211_crypto_settings { u32 wpa_versions; @@ -1015,21 +1247,70 @@ struct cfg80211_crypto_settings { int n_ciphers_pairwise; u32 ciphers_pairwise[NL80211_MAX_NR_CIPHER_SUITES]; int n_akm_suites; - u32 akm_suites[NL80211_MAX_NR_AKM_SUITES]; + u32 akm_suites[CFG80211_MAX_NUM_AKM_SUITES]; bool control_port; __be16 control_port_ethertype; bool control_port_no_encrypt; bool control_port_over_nl80211; bool control_port_no_preauth; - struct key_params *wep_keys; - int wep_tx_key; const u8 *psk; const u8 *sae_pwd; u8 sae_pwd_len; + enum nl80211_sae_pwe_mechanism sae_pwe; +}; + +/** + * struct cfg80211_mbssid_config - AP settings for multi bssid + * + * @tx_wdev: pointer to the transmitted interface in the MBSSID set + * @tx_link_id: link ID of the transmitted profile in an MLD. + * @index: index of this AP in the multi bssid group. + * @ema: set to true if the beacons should be sent out in EMA mode. + */ +struct cfg80211_mbssid_config { + struct wireless_dev *tx_wdev; + u8 tx_link_id; + u8 index; + bool ema; +}; + +/** + * struct cfg80211_mbssid_elems - Multiple BSSID elements + * + * @cnt: Number of elements in array %elems. + * + * @elem: Array of multiple BSSID element(s) to be added into Beacon frames. + * @elem.data: Data for multiple BSSID elements. + * @elem.len: Length of data. + */ +struct cfg80211_mbssid_elems { + u8 cnt; + struct { + const u8 *data; + size_t len; + } elem[] __counted_by(cnt); +}; + +/** + * struct cfg80211_rnr_elems - Reduced neighbor report (RNR) elements + * + * @cnt: Number of elements in array %elems. + * + * @elem: Array of RNR element(s) to be added into Beacon frames. + * @elem.data: Data for RNR elements. + * @elem.len: Length of data. 
+ */ +struct cfg80211_rnr_elems { + u8 cnt; + struct { + const u8 *data; + size_t len; + } elem[] __counted_by(cnt); }; /** * struct cfg80211_beacon_data - beacon data + * @link_id: the link ID for the AP MLD link sending this beacon * @head: head portion of beacon (before TIM IE) * or %NULL if not changed * @tail: tail portion of beacon (after TIM IE) @@ -1046,6 +1327,8 @@ struct cfg80211_crypto_settings { * @assocresp_ies_len: length of assocresp_ies in octets * @probe_resp_len: length of probe response template (@probe_resp) * @probe_resp: probe response template (AP mode only) + * @mbssid_ies: multiple BSSID elements + * @rnr_ies: reduced neighbor report elements * @ftm_responder: enable FTM responder functionality; -1 for no change * (which also implies no change in LCI/civic location data) * @lci: Measurement Report element content, starting with Measurement Token @@ -1054,8 +1337,13 @@ struct cfg80211_crypto_settings { * Token (measurement type 11) * @lci_len: LCI data length * @civicloc_len: Civic location data length + * @he_bss_color: BSS Color settings + * @he_bss_color_valid: indicates whether bss color + * attribute is present in beacon data or not. */ struct cfg80211_beacon_data { + unsigned int link_id; + const u8 *head, *tail; const u8 *beacon_ies; const u8 *proberesp_ies; @@ -1063,6 +1351,8 @@ struct cfg80211_beacon_data { const u8 *probe_resp; const u8 *lci; const u8 *civicloc; + struct cfg80211_mbssid_elems *mbssid_ies; + struct cfg80211_rnr_elems *rnr_ies; s8 ftm_responder; size_t head_len, tail_len; @@ -1072,6 +1362,8 @@ struct cfg80211_beacon_data { size_t probe_resp_len; size_t lci_len; size_t civicloc_len; + struct cfg80211_he_bss_color he_bss_color; + bool he_bss_color_valid; }; struct mac_address { @@ -1091,13 +1383,14 @@ struct cfg80211_acl_data { int n_acl_entries; /* Keep it last */ - struct mac_address mac_addrs[]; + struct mac_address mac_addrs[] __counted_by(n_acl_entries); }; /** * struct cfg80211_fils_discovery - FILS discovery parameters from * IEEE Std 802.11ai-2016, Annex C.3 MIB detail. * + * @update: Set to true if the feature configuration should be updated. * @min_interval: Minimum packet interval in TUs (0 - 10000) * @max_interval: Maximum packet interval in TUs (0 - 10000) * @tmpl_len: Template length @@ -1105,6 +1398,7 @@ struct cfg80211_acl_data { * frame headers. */ struct cfg80211_fils_discovery { + bool update; u32 min_interval; u32 max_interval; size_t tmpl_len; @@ -1115,6 +1409,7 @@ struct cfg80211_fils_discovery { * struct cfg80211_unsol_bcast_probe_resp - Unsolicited broadcast probe * response parameters in 6GHz. * + * @update: Set to true if the feature configuration should be updated. * @interval: Packet interval in TUs. Maximum allowed is 20 TU, as mentioned * in IEEE P802.11ax/D6.0 26.17.2.3.2 - AP behavior for fast passive * scanning @@ -1122,23 +1417,13 @@ struct cfg80211_fils_discovery { * @tmpl: Template data for probe response */ struct cfg80211_unsol_bcast_probe_resp { + bool update; u32 interval; size_t tmpl_len; const u8 *tmpl; }; /** - * enum cfg80211_ap_settings_flags - AP settings flags - * - * Used by cfg80211_ap_settings - * - * @AP_SETTINGS_EXTERNAL_AUTH_SUPPORT: AP supports external authentication - */ -enum cfg80211_ap_settings_flags { - AP_SETTINGS_EXTERNAL_AUTH_SUPPORT = BIT(0), -}; - -/** * struct cfg80211_ap_settings - AP configuration * * Used to configure an AP interface. 
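Because @elem is a flexible array annotated with __counted_by(@cnt), allocations must size the structure with struct_size() and assign @cnt before the array is indexed; a minimal sketch, with n_profiles and the element data assumed to exist in the caller:

	struct cfg80211_mbssid_elems *elems;

	elems = kzalloc(struct_size(elems, elem, n_profiles), GFP_KERNEL);
	if (!elems)
		return -ENOMEM;
	elems->cnt = n_profiles;	/* must match the allocation size */
	elems->elem[0].data = profile_data;
	elems->elem[0].len = profile_len;

The same pattern applies to struct cfg80211_rnr_elems above.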
@@ -1154,7 +1439,6 @@ enum cfg80211_ap_settings_flags { * @crypto: crypto settings * @privacy: the BSS uses privacy * @auth_type: Authentication type (algorithm) - * @smps_mode: SMPS mode * @inactivity_timeout: time in seconds to determine station's inactivity. * @p2p_ctwindow: P2P CT Window * @p2p_opp_ps: P2P opportunistic PS @@ -1166,16 +1450,19 @@ enum cfg80211_ap_settings_flags { * @ht_cap: HT capabilities (or %NULL if HT isn't enabled) * @vht_cap: VHT capabilities (or %NULL if VHT isn't enabled) * @he_cap: HE capabilities (or %NULL if HE isn't enabled) + * @eht_cap: EHT capabilities (or %NULL if EHT isn't enabled) + * @eht_oper: EHT operation IE (or %NULL if EHT isn't enabled) * @ht_required: stations must support HT * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time * @he_required: stations must support HE - * @flags: flags, as defined in enum cfg80211_ap_settings_flags + * @sae_h2e_required: stations must support direct H2E technique in SAE + * @flags: flags, as defined in &enum nl80211_ap_settings_flags * @he_obss_pd: OBSS Packet Detection settings - * @he_bss_color: BSS Color settings * @he_oper: HE operation IE (or %NULL if HE isn't enabled) * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters + * @mbssid_config: AP settings for multiple bssid */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1189,7 +1476,6 @@ struct cfg80211_ap_settings { struct cfg80211_crypto_settings crypto; bool privacy; enum nl80211_auth_type auth_type; - enum nl80211_smps_mode smps_mode; int inactivity_timeout; u8 p2p_ctwindow; bool p2p_opp_ps; @@ -1201,11 +1487,29 @@ struct cfg80211_ap_settings { const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; const struct ieee80211_he_operation *he_oper; - bool ht_required, vht_required, he_required; + const struct ieee80211_eht_cap_elem *eht_cap; + const struct ieee80211_eht_operation *eht_oper; + bool ht_required, vht_required, he_required, sae_h2e_required; bool twt_responder; u32 flags; struct ieee80211_he_obss_pd he_obss_pd; - struct cfg80211_he_bss_color he_bss_color; + struct cfg80211_fils_discovery fils_discovery; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; + struct cfg80211_mbssid_config mbssid_config; +}; + + +/** + * struct cfg80211_ap_update - AP configuration update + * + * Subset of &struct cfg80211_ap_settings, for updating a running AP. + * + * @beacon: beacon data + * @fils_discovery: FILS discovery transmission parameters + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters + */ +struct cfg80211_ap_update { + struct cfg80211_beacon_data beacon; struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; }; @@ -1225,6 +1529,8 @@ struct cfg80211_ap_settings { * @radar_required: whether radar detection is required on the new channel * @block_tx: whether transmissions should be blocked while changing * @count: number of beacons until switch + * @link_id: defines the link on which channel switch is expected during + * MLO. 0 in case of non-MLO. 
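For orientation, the new @link_id sits next to the existing switch parameters; a hedged sketch of filling a channel-switch request (values purely illustrative, new_chandef assumed already validated by the caller):

	struct cfg80211_csa_settings csa = {
		.count = 10,		/* beacons until the switch */
		.block_tx = true,
		.radar_required = false,
		.link_id = 0,		/* 0 in the non-MLO case, as above */
	};

	csa.chandef = *new_chandef;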
*/ struct cfg80211_csa_settings { struct cfg80211_chan_def chandef; @@ -1237,15 +1543,39 @@ struct cfg80211_csa_settings { bool radar_required; bool block_tx; u8 count; + u8 link_id; }; -#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 +/** + * struct cfg80211_color_change_settings - color change settings + * + * Used for bss color change + * + * @beacon_color_change: beacon data while performing the color countdown + * @counter_offset_beacon: offsets of the counters within the beacon (tail) + * @counter_offset_presp: offsets of the counters within the probe response + * @beacon_next: beacon data to be used after the color change + * @count: number of beacons until the color change + * @color: the color used after the change + * @link_id: defines the link on which color change is expected during MLO. + * 0 in case of non-MLO. + */ +struct cfg80211_color_change_settings { + struct cfg80211_beacon_data beacon_color_change; + u16 counter_offset_beacon; + u16 counter_offset_presp; + struct cfg80211_beacon_data beacon_next; + u8 count; + u8 color; + u8 link_id; +}; /** * struct iface_combination_params - input parameters for interface combinations * * Used to pass interface combination parameters * + * @radio_idx: wiphy radio index or -1 for global * @num_different_channels: the number of different channels we want * to use for verification * @radar_detect: a bitmap where each bit corresponds to a channel @@ -1259,6 +1589,7 @@ struct cfg80211_csa_settings { * the verification */ struct iface_combination_params { + int radio_idx; int num_different_channels; u8 radar_detect; int iftype_num[NUM_NL80211_IFTYPES]; @@ -1278,7 +1609,6 @@ enum station_parameters_apply_mask { STATION_PARAM_APPLY_UAPSD = BIT(0), STATION_PARAM_APPLY_CAPABILITY = BIT(1), STATION_PARAM_APPLY_PLINK_STATE = BIT(2), - STATION_PARAM_APPLY_STA_TXPOWER = BIT(3), }; /** @@ -1302,14 +1632,81 @@ struct sta_txpwr { }; /** - * struct station_parameters - station parameters + * struct link_station_parameters - link station parameters * - * Used to change and create a new station. + * Used to change and create a new link station. 
* - * @vlan: vlan interface station should belong to + * @mld_mac: MAC address of the station + * @link_id: the link id (-1 for non-MLD station) + * @link_mac: MAC address of the link * @supported_rates: supported rates in IEEE 802.11 format * (or NULL for no change) * @supported_rates_len: number of supported rates + * @ht_capa: HT capabilities of station + * @vht_capa: VHT capabilities of station + * @opmode_notif: operating mode field from Operating Mode Notification + * @opmode_notif_used: information if operating mode field is used + * @he_capa: HE capabilities of station + * @he_capa_len: the length of the HE capabilities + * @txpwr: transmit power for an associated station + * @txpwr_set: txpwr field is set + * @he_6ghz_capa: HE 6 GHz Band capabilities of station + * @eht_capa: EHT capabilities of station + * @eht_capa_len: the length of the EHT capabilities + */ +struct link_station_parameters { + const u8 *mld_mac; + int link_id; + const u8 *link_mac; + const u8 *supported_rates; + u8 supported_rates_len; + const struct ieee80211_ht_cap *ht_capa; + const struct ieee80211_vht_cap *vht_capa; + u8 opmode_notif; + bool opmode_notif_used; + const struct ieee80211_he_cap_elem *he_capa; + u8 he_capa_len; + struct sta_txpwr txpwr; + bool txpwr_set; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + const struct ieee80211_eht_cap_elem *eht_capa; + u8 eht_capa_len; +}; + +/** + * struct link_station_del_parameters - link station deletion parameters + * + * Used to delete a link station entry (or all stations). + * + * @mld_mac: MAC address of the station + * @link_id: the link id + */ +struct link_station_del_parameters { + const u8 *mld_mac; + u32 link_id; +}; + +/** + * struct cfg80211_ttlm_params: TID to link mapping parameters + * + * Used for setting a TID to link mapping. + * + * @dlink: Downlink TID to link mapping, as defined in section 9.4.2.314 + * (TID-To-Link Mapping element) in Draft P802.11be_D4.0. + * @ulink: Uplink TID to link mapping, as defined in section 9.4.2.314 + * (TID-To-Link Mapping element) in Draft P802.11be_D4.0. + */ +struct cfg80211_ttlm_params { + u16 dlink[8]; + u16 ulink[8]; +}; + +/** + * struct station_parameters - station parameters + * + * Used to change and create a new station. + * + * @vlan: vlan interface station should belong to * @sta_flags_mask: station flags that changed * (bitmask of BIT(%NL80211_STA_FLAG_...)) * @sta_flags_set: station flags values @@ -1320,8 +1717,6 @@ struct sta_txpwr { * @peer_aid: mesh peer AID or zero for no change * @plink_action: plink action to take * @plink_state: set the peer link state for a station - * @ht_capa: HT capabilities of station - * @vht_capa: VHT capabilities of station * @uapsd_queues: bitmap of queues configured for uapsd. same format * as the AC bitmap in the QoS info field * @max_sp: max Service Period. 
same format as the MAX_SP in the @@ -1338,17 +1733,14 @@ struct sta_txpwr { * @supported_channels_len: number of supported channels * @supported_oper_classes: supported oper classes in IEEE 802.11 format * @supported_oper_classes_len: number of supported operating classes - * @opmode_notif: operating mode field from Operating Mode Notification - * @opmode_notif_used: information if operating mode field is used * @support_p2p_ps: information if station supports P2P PS mechanism - * @he_capa: HE capabilities of station - * @he_capa_len: the length of the HE capabilities * @airtime_weight: airtime scheduler weight for this station - * @txpwr: transmit power for an associated station - * @he_6ghz_capa: HE 6 GHz Band capabilities of station + * @eml_cap_present: Specifies if EML capabilities field (@eml_cap) is + * present/updated + * @eml_cap: EML capabilities of this station + * @link_sta_params: link related params. */ struct station_parameters { - const u8 *supported_rates; struct net_device *vlan; u32 sta_flags_mask, sta_flags_set; u32 sta_modify_mask; @@ -1356,11 +1748,8 @@ struct station_parameters { u16 aid; u16 vlan_id; u16 peer_aid; - u8 supported_rates_len; u8 plink_action; u8 plink_state; - const struct ieee80211_ht_cap *ht_capa; - const struct ieee80211_vht_cap *vht_capa; u8 uapsd_queues; u8 max_sp; enum nl80211_mesh_power_mode local_pm; @@ -1371,14 +1760,11 @@ struct station_parameters { u8 supported_channels_len; const u8 *supported_oper_classes; u8 supported_oper_classes_len; - u8 opmode_notif; - bool opmode_notif_used; int support_p2p_ps; - const struct ieee80211_he_cap_elem *he_capa; - u8 he_capa_len; u16 airtime_weight; - struct sta_txpwr txpwr; - const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + bool eml_cap_present; + u16 eml_cap; + struct link_station_parameters link_sta_params; }; /** @@ -1390,11 +1776,15 @@ struct station_parameters { * @subtype: Management frame subtype to use for indicating removal * (10 = Disassociation, 12 = Deauthentication) * @reason_code: Reason code for the Disassociation/Deauthentication frame + * @link_id: Link ID indicating a link that stations to be flushed must be + * using; valid only for MLO, but can also be -1 for MLO to really + * remove all stations. */ struct station_del_parameters { const u8 *mac; u8 subtype; u16 reason_code; + int link_id; }; /** @@ -1435,16 +1825,18 @@ enum cfg80211_station_type { * * Utility function for the @change_station driver method. Call this function * with the appropriate station type looking up the station (and checking that - * it exists). It will verify whether the station change is acceptable, and if - * not will return an error code. Note that it may modify the parameters for - * backward compatibility reasons, so don't use them before calling this. + * it exists). It will verify whether the station change is acceptable. + * + * Return: 0 if the change is acceptable, otherwise an error code. Note that + * it may modify the parameters for backward compatibility reasons, so don't + * use them before calling this. */ int cfg80211_check_station_change(struct wiphy *wiphy, struct station_parameters *params, enum cfg80211_station_type statype); /** - * enum station_info_rate_flags - bitrate info flags + * enum rate_info_flags - bitrate info flags * * Used by the driver to indicate the specific rate transmission * type for 802.11n transmissions. 
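Each @dlink/@ulink entry is the TID-to-link mapping for one of the eight TIDs, expressed as a bitmap of link IDs (following the TID-To-Link Mapping element cited above). A hedged sketch mapping TIDs 0-3 to link 0 only, and TIDs 4-7 to links 0 and 1, in both directions:

	struct cfg80211_ttlm_params ttlm = {};
	int tid;

	for (tid = 0; tid < 8; tid++) {
		u16 links = tid < 4 ? BIT(0) : BIT(0) | BIT(1);

		ttlm.dlink[tid] = links;
		ttlm.ulink[tid] = links;
	}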
@@ -1455,6 +1847,9 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 * @RATE_INFO_FLAGS_DMG: 60GHz MCS
 * @RATE_INFO_FLAGS_HE_MCS: HE MCS information
 * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode
+ * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS
+ * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information
+ * @RATE_INFO_FLAGS_S1G_MCS: MCS field filled with S1G MCS
 */
 enum rate_info_flags {
 	RATE_INFO_FLAGS_MCS			= BIT(0),
@@ -1463,6 +1858,9 @@ enum rate_info_flags {
 	RATE_INFO_FLAGS_DMG			= BIT(3),
 	RATE_INFO_FLAGS_HE_MCS			= BIT(4),
 	RATE_INFO_FLAGS_EDMG			= BIT(5),
+	RATE_INFO_FLAGS_EXTENDED_SC_DMG		= BIT(6),
+	RATE_INFO_FLAGS_EHT_MCS			= BIT(7),
+	RATE_INFO_FLAGS_S1G_MCS			= BIT(8),
 };

 /**
@@ -1477,6 +1875,13 @@ enum rate_info_flags {
 * @RATE_INFO_BW_80: 80 MHz bandwidth
 * @RATE_INFO_BW_160: 160 MHz bandwidth
 * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation
+ * @RATE_INFO_BW_320: 320 MHz bandwidth
+ * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation
+ * @RATE_INFO_BW_1: 1 MHz bandwidth
+ * @RATE_INFO_BW_2: 2 MHz bandwidth
+ * @RATE_INFO_BW_4: 4 MHz bandwidth
+ * @RATE_INFO_BW_8: 8 MHz bandwidth
+ * @RATE_INFO_BW_16: 16 MHz bandwidth
 */
 enum rate_info_bw {
 	RATE_INFO_BW_20 = 0,
@@ -1486,6 +1891,13 @@ enum rate_info_bw {
 	RATE_INFO_BW_80,
 	RATE_INFO_BW_160,
 	RATE_INFO_BW_HE_RU,
+	RATE_INFO_BW_320,
+	RATE_INFO_BW_EHT_RU,
+	RATE_INFO_BW_1,
+	RATE_INFO_BW_2,
+	RATE_INFO_BW_4,
+	RATE_INFO_BW_8,
+	RATE_INFO_BW_16,
 };

 /**
@@ -1494,8 +1906,8 @@ enum rate_info_bw {
 * Information about a receiving or transmitting bitrate
 *
 * @flags: bitflag of flags from &enum rate_info_flags
- * @mcs: mcs index if struct describes an HT/VHT/HE rate
 * @legacy: bitrate in 100kbit/s for 802.11abg
+ * @mcs: mcs index if struct describes an HT/VHT/HE/EHT/S1G rate
 * @nss: number of streams (VHT & HE only)
 * @bw: bandwidth (from &enum rate_info_bw)
 * @he_gi: HE guard interval (from &enum nl80211_he_gi)
@@ -1503,21 +1915,26 @@ enum rate_info_bw {
 * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc,
 *	only valid if bw is %RATE_INFO_BW_HE_RU)
 * @n_bonded_ch: In case of EDMG the number of bonded channels (1-4)
+ * @eht_gi: EHT guard interval (from &enum nl80211_eht_gi)
+ * @eht_ru_alloc: EHT RU allocation (from &enum nl80211_eht_ru_alloc,
+ *	only valid if bw is %RATE_INFO_BW_EHT_RU)
 */
 struct rate_info {
-	u8 flags;
-	u8 mcs;
+	u16 flags;
 	u16 legacy;
+	u8 mcs;
 	u8 nss;
 	u8 bw;
 	u8 he_gi;
 	u8 he_dcm;
 	u8 he_ru_alloc;
 	u8 n_bonded_ch;
+	u8 eht_gi;
+	u8 eht_ru_alloc;
 };

 /**
- * enum station_info_rate_flags - bitrate info flags
+ * enum bss_param_flags - BSS parameter flags
 *
 * Used by the driver to indicate which BSS parameters
 * (CTS protection, preamble, slot time) are in effect.
@@ -1527,9 +1944,9 @@ struct rate_info {
 * @BSS_PARAM_FLAGS_SHORT_SLOT_TIME: whether short slot time is enabled
 */
 enum bss_param_flags {
-	BSS_PARAM_FLAGS_CTS_PROT	= 1<<0,
-	BSS_PARAM_FLAGS_SHORT_PREAMBLE	= 1<<1,
-	BSS_PARAM_FLAGS_SHORT_SLOT_TIME	= 1<<2,
+	BSS_PARAM_FLAGS_CTS_PROT	= BIT(0),
+	BSS_PARAM_FLAGS_SHORT_PREAMBLE	= BIT(1),
+	BSS_PARAM_FLAGS_SHORT_SLOT_TIME	= BIT(2),
 };

 /**
@@ -1612,9 +2029,6 @@ struct cfg80211_tid_stats {
 * @assoc_at: bootime (ns) of the last association
 * @rx_bytes: bytes (size of MPDUs) received from this station
 * @tx_bytes: bytes (size of MPDUs) transmitted to this station
- * @llid: mesh local link id
- * @plid: mesh peer link id
- * @plink_state: mesh peer link state
 * @signal: The signal strength, type depends on the wiphy's signal_type.
 *	For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
* @signal_avg: Average signal strength, type depends on the wiphy's signal_type. @@ -1634,14 +2048,20 @@ struct cfg80211_tid_stats { * This number should increase every time the list of stations * changes, i.e. when a station is added or removed, so that * userspace can tell whether it got a consistent snapshot. + * @beacon_loss_count: Number of times beacon loss event has triggered. * @assoc_req_ies: IEs from (Re)Association Request. * This is used only when in AP mode with drivers that do not use * user space MLME/SME implementation. The information is provided for * the cfg80211_new_sta() calls to notify user space of the IEs. * @assoc_req_ies_len: Length of assoc_req_ies buffer in octets. * @sta_flags: station flags mask & values - * @beacon_loss_count: Number of times beacon loss event has triggered. * @t_offset: Time offset of the station relative to this host. + * @llid: mesh local link id + * @plid: mesh peer link id + * @plink_state: mesh peer link state + * @connected_to_gate: true if mesh STA has a path to mesh gate + * @connected_to_as: true if mesh STA has a path to authentication server + * @airtime_link_metric: mesh airtime link metric. * @local_pm: local mesh STA power save mode * @peer_pm: peer mesh STA power save mode * @nonpeer_pm: non-peer mesh STA power save mode @@ -1650,7 +2070,6 @@ struct cfg80211_tid_stats { * @rx_beacon: number of beacons received from this peer * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received * from this peer - * @connected_to_gate: true if mesh STA has a path to mesh gate * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer * @airtime_weight: current airtime scheduling weight @@ -1664,8 +2083,24 @@ struct cfg80211_tid_stats { * @fcs_err_count: number of packets (MPDUs) received from this station with * an FCS error. This counter should be incremented only when TA of the * received packet with an FCS error matches the peer MAC address. - * @airtime_link_metric: mesh airtime link metric. - * @connected_to_as: true if mesh STA has a path to authentication server + * @mlo_params_valid: Indicates @assoc_link_id and @mld_addr fields are filled + * by driver. Drivers use this only in cfg80211_new_sta() calls when AP + * MLD's MLME/SME is offload to driver. Drivers won't fill this + * information in cfg80211_del_sta_sinfo(), get_station() and + * dump_station() callbacks. + * @assoc_link_id: Indicates MLO link ID of the AP, with which the station + * completed (re)association. This information filled for both MLO + * and non-MLO STA connections when the AP affiliated with an MLD. + * @mld_addr: For MLO STA connection, filled with MLD address of the station. + * For non-MLO STA connection, filled with all zeros. + * @assoc_resp_ies: IEs from (Re)Association Response. + * This is used only when in AP mode with drivers that do not use user + * space MLME/SME implementation. The information is provided only for the + * cfg80211_new_sta() calls to notify user space of the IEs. Drivers won't + * fill this information in cfg80211_del_sta_sinfo(), get_station() and + * dump_station() callbacks. User space needs this information to determine + * the accepted and rejected affiliated links of the connected station. + * @assoc_resp_ies_len: Length of @assoc_resp_ies buffer in octets. 
*/ struct station_info { u64 filled; @@ -1674,9 +2109,6 @@ struct station_info { u64 assoc_at; u64 rx_bytes; u64 tx_bytes; - u16 llid; - u16 plid; - u8 plink_state; s8 signal; s8 signal_avg; @@ -1696,35 +2128,91 @@ struct station_info { int generation; + u32 beacon_loss_count; + const u8 *assoc_req_ies; size_t assoc_req_ies_len; - u32 beacon_loss_count; s64 t_offset; + u16 llid; + u16 plid; + u8 plink_state; + u8 connected_to_gate; + u8 connected_to_as; + u32 airtime_link_metric; enum nl80211_mesh_power_mode local_pm; enum nl80211_mesh_power_mode peer_pm; enum nl80211_mesh_power_mode nonpeer_pm; u32 expected_throughput; - u64 tx_duration; - u64 rx_duration; - u64 rx_beacon; - u8 rx_beacon_signal_avg; - u8 connected_to_gate; + u16 airtime_weight; - struct cfg80211_tid_stats *pertid; s8 ack_signal; s8 avg_ack_signal; + struct cfg80211_tid_stats *pertid; - u16 airtime_weight; + u64 tx_duration; + u64 rx_duration; + u64 rx_beacon; + u8 rx_beacon_signal_avg; u32 rx_mpdu_count; u32 fcs_err_count; - u32 airtime_link_metric; + bool mlo_params_valid; + u8 assoc_link_id; + u8 mld_addr[ETH_ALEN] __aligned(2); + const u8 *assoc_resp_ies; + size_t assoc_resp_ies_len; +}; - u8 connected_to_as; +/** + * struct cfg80211_sar_sub_specs - sub specs limit + * @power: power limitation in 0.25dbm + * @freq_range_index: index the power limitation applies to + */ +struct cfg80211_sar_sub_specs { + s32 power; + u32 freq_range_index; +}; + +/** + * struct cfg80211_sar_specs - sar limit specs + * @type: it's set with power in 0.25dbm or other types + * @num_sub_specs: number of sar sub specs + * @sub_specs: memory to hold the sar sub specs + */ +struct cfg80211_sar_specs { + enum nl80211_sar_type type; + u32 num_sub_specs; + struct cfg80211_sar_sub_specs sub_specs[] __counted_by(num_sub_specs); +}; + + +/** + * struct cfg80211_sar_freq_ranges - sar frequency ranges + * @start_freq: start range edge frequency + * @end_freq: end range edge frequency + */ +struct cfg80211_sar_freq_ranges { + u32 start_freq; + u32 end_freq; +}; + +/** + * struct cfg80211_sar_capa - sar limit capability + * @type: it's set via power in 0.25dbm or other types + * @num_freq_ranges: number of frequency ranges + * @freq_ranges: memory to hold the freq ranges. + * + * Note: WLAN driver may append new ranges or split an existing + * range to small ones and then append them. + */ +struct cfg80211_sar_capa { + enum nl80211_sar_type type; + u32 num_freq_ranges; + const struct cfg80211_sar_freq_ranges *freq_ranges; }; #if IS_ENABLED(CONFIG_CFG80211) @@ -1734,7 +2222,7 @@ struct station_info { * @mac_addr: the mac address of the station of interest * @sinfo: pointer to the structure to fill with the information * - * Returns 0 on success and sinfo is filled with the available information + * Return: 0 on success and sinfo is filled with the available information * otherwise returns a negative error code and the content of sinfo has to be * considered undefined. 
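Per the Return text above, @sinfo must be treated as undefined on error; a minimal usage sketch (assumes cfg80211_sinfo_release_content() is available to free the @pertid allocation, and that dev/peer_mac exist in the caller):

	struct station_info sinfo = {};
	int err;

	err = cfg80211_get_station(dev, peer_mac, &sinfo);
	if (err)
		return err;	/* sinfo contents undefined here */

	if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))
		pr_debug("peer signal: %d dBm\n", sinfo.signal);

	cfg80211_sinfo_release_content(&sinfo);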
*/ @@ -1760,17 +2248,19 @@ static inline int cfg80211_get_station(struct net_device *dev, * @MONITOR_FLAG_PLCPFAIL: pass frames with bad PLCP * @MONITOR_FLAG_CONTROL: pass control frames * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering - * @MONITOR_FLAG_COOK_FRAMES: report frames after processing + * @MONITOR_FLAG_COOK_FRAMES: deprecated, will unconditionally be refused * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address + * @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames */ enum monitor_flags { - MONITOR_FLAG_CHANGED = 1<<__NL80211_MNTR_FLAG_INVALID, - MONITOR_FLAG_FCSFAIL = 1<<NL80211_MNTR_FLAG_FCSFAIL, - MONITOR_FLAG_PLCPFAIL = 1<<NL80211_MNTR_FLAG_PLCPFAIL, - MONITOR_FLAG_CONTROL = 1<<NL80211_MNTR_FLAG_CONTROL, - MONITOR_FLAG_OTHER_BSS = 1<<NL80211_MNTR_FLAG_OTHER_BSS, - MONITOR_FLAG_COOK_FRAMES = 1<<NL80211_MNTR_FLAG_COOK_FRAMES, - MONITOR_FLAG_ACTIVE = 1<<NL80211_MNTR_FLAG_ACTIVE, + MONITOR_FLAG_CHANGED = BIT(__NL80211_MNTR_FLAG_INVALID), + MONITOR_FLAG_FCSFAIL = BIT(NL80211_MNTR_FLAG_FCSFAIL), + MONITOR_FLAG_PLCPFAIL = BIT(NL80211_MNTR_FLAG_PLCPFAIL), + MONITOR_FLAG_CONTROL = BIT(NL80211_MNTR_FLAG_CONTROL), + MONITOR_FLAG_OTHER_BSS = BIT(NL80211_MNTR_FLAG_OTHER_BSS), + MONITOR_FLAG_COOK_FRAMES = BIT(NL80211_MNTR_FLAG_COOK_FRAMES), + MONITOR_FLAG_ACTIVE = BIT(NL80211_MNTR_FLAG_ACTIVE), + MONITOR_FLAG_SKIP_TX = BIT(NL80211_MNTR_FLAG_SKIP_TX), }; /** @@ -1811,7 +2301,7 @@ enum mpath_info_flags { * @sn: target sequence number * @metric: metric (cost) of this mesh path * @exptime: expiration time for the mesh path from now, in msecs - * @flags: mesh path flags + * @flags: mesh path flags from &enum mesh_path_flags * @discovery_timeout: total mesh path discovery timeout, in msecs * @discovery_retries: mesh path discovery retries * @generation: generation number for nl80211 dumps. @@ -1841,6 +2331,7 @@ struct mpath_info { * * Used to change BSS parameters (mainly for AP mode). * + * @link_id: link_id or -1 for non-MLD * @use_cts_prot: Whether to use CTS protection * (0 = no, 1 = yes, -1 = do not change) * @use_short_preamble: Whether the use of short preambles is allowed @@ -1858,6 +2349,7 @@ struct mpath_info { * @p2p_opp_ps: P2P opportunistic PS (-1 = no change) */ struct bss_parameters { + int link_id; int use_cts_prot; int use_short_preamble; int use_short_slot_time; @@ -1937,6 +2429,9 @@ struct bss_parameters { * @plink_timeout: If no tx activity is seen from a STA we've established * peering with for longer than this time (in seconds), then remove it * from the STA's list of peers. Default is 30 minutes. + * @dot11MeshConnectedToAuthServer: if set to true then this mesh STA + * will advertise that it is connected to a authentication server + * in the mesh formation field. * @dot11MeshConnectedToMeshGate: if set to true, advertise that this STA is * connected to a mesh gate in mesh formation info. If false, the * value in mesh formation is determined by the presence of root paths @@ -1997,7 +2492,7 @@ struct mesh_config { * @user_mpm: userspace handles all MPM functions * @dtim_period: DTIM period to use * @beacon_interval: beacon interval to use - * @mcast_rate: multicat rate for Mesh Node [6Mbps is the default for 802.11a] + * @mcast_rate: multicast rate for Mesh Node [6Mbps is the default for 802.11a] * @basic_rates: basic rates to use when creating the mesh * @beacon_rate: bitrate to be used for beacons * @userspace_handles_dfs: whether user space controls DFS operation, i.e. 
@@ -2049,6 +2544,7 @@ struct ocb_setup { * @cwmax: Maximum contention window [a value of the form 2^n-1 in the range * 1..32767] * @aifs: Arbitration interframe space [0..255] + * @link_id: link_id or -1 for non-MLD */ struct ieee80211_txq_params { enum nl80211_ac ac; @@ -2056,6 +2552,7 @@ struct ieee80211_txq_params { u16 cwmin; u16 cwmax; u8 aifs; + int link_id; }; /** @@ -2109,14 +2606,15 @@ struct cfg80211_scan_info { /** * struct cfg80211_scan_6ghz_params - relevant for 6 GHz only * - * @short_bssid: short ssid to scan for + * @short_ssid: short ssid to scan for * @bssid: bssid to scan for * @channel_idx: idx of the channel in the channel array in the scan request - * which the above info relvant to + * which the above info is relevant to * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU - * @short_ssid_valid: short_ssid is valid and can be used + * @short_ssid_valid: @short_ssid is valid and can be used * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait * 20 TUs before starting to send probe requests. + * @psd_20: The AP's 20 MHz PSD value. */ struct cfg80211_scan_6ghz_params { u32 short_ssid; @@ -2125,6 +2623,7 @@ struct cfg80211_scan_6ghz_params { bool unsolicited_probe; bool short_ssid_valid; bool psc_no_listen; + s8 psd_20; }; /** @@ -2134,7 +2633,6 @@ struct cfg80211_scan_6ghz_params { * @n_ssids: number of SSIDs * @channels: channels to scan on. * @n_channels: total number of channels to scan - * @scan_width: channel width for scanning * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets * @duration: how long to listen on each channel, in TUs. If @@ -2142,7 +2640,7 @@ struct cfg80211_scan_6ghz_params { * the actual dwell time may be shorter. * @duration_mandatory: if set, the scan duration must be as specified by the * %duration field. - * @flags: bit field of flags controlling operation + * @flags: control flags from &enum nl80211_scan_flags * @rates: bitmap of rates to advertise for each band * @wiphy: the wiphy this was for * @scan_start: time (in jiffies) when the scan started @@ -2159,12 +2657,13 @@ struct cfg80211_scan_6ghz_params { * @n_6ghz_params: number of 6 GHz params * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) + * @tsf_report_link_id: for MLO, indicates the link ID of the BSS that should be + * used for TSF reporting. Can be set to -1 to indicate no preference. */ struct cfg80211_scan_request { struct cfg80211_ssid *ssids; int n_ssids; u32 n_channels; - enum nl80211_bss_scan_width scan_width; const u8 *ie; size_t ie_len; u16 duration; @@ -2188,9 +2687,10 @@ struct cfg80211_scan_request { bool scan_6ghz; u32 n_6ghz_params; struct cfg80211_scan_6ghz_params *scan_6ghz_params; + s8 tsf_report_link_id; /* keep last */ - struct ieee80211_channel *channels[]; + struct ieee80211_channel *channels[] __counted_by(n_channels); }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) @@ -2212,19 +2712,11 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match * or no match (RSSI only) * @rssi_thold: don't report scan results below this threshold (in s32 dBm) - * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied - * for filtering out scan results received. 
Drivers advertize this support - * of band specific rssi based filtering through the feature capability - * %NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band - * specific rssi thresholds take precedence over rssi_thold, if specified. - * If not specified for any band, it will be assigned with rssi_thold of - * corresponding matchset. */ struct cfg80211_match_set { struct cfg80211_ssid ssid; u8 bssid[ETH_ALEN]; s32 rssi_thold; - s32 per_band_rssi_thold[NUM_NL80211_BANDS]; }; /** @@ -2259,14 +2751,13 @@ struct cfg80211_bss_select_adjust { * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans) * @n_ssids: number of SSIDs * @n_channels: total number of channels to scan - * @scan_width: channel width for scanning * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets - * @flags: bit field of flags controlling operation + * @flags: control flags from &enum nl80211_scan_flags * @match_sets: sets of parameters to be matched for a scan result * entry to be considered valid and to be passed to the host * (others are filtered out). - * If ommited, all results are passed. + * If omitted, all results are passed. * @n_match_sets: number of match sets * @report_results: indicates that results were reported for this request * @wiphy: the wiphy this was for @@ -2300,14 +2791,13 @@ struct cfg80211_bss_select_adjust { * to the specified band while deciding whether a better BSS is reported * using @relative_rssi. If delta is a negative number, the BSSs that * belong to the specified band will be penalized by delta dB in relative - * comparisions. + * comparisons. */ struct cfg80211_sched_scan_request { u64 reqid; struct cfg80211_ssid *ssids; int n_ssids; u32 n_channels; - enum nl80211_bss_scan_width scan_width; const u8 *ie; size_t ie_len; u32 flags; @@ -2336,7 +2826,7 @@ struct cfg80211_sched_scan_request { struct list_head list; /* keep last */ - struct ieee80211_channel *channels[]; + struct ieee80211_channel *channels[] __counted_by(n_channels); }; /** @@ -2355,7 +2845,6 @@ enum cfg80211_signal_type { /** * struct cfg80211_inform_bss - BSS inform data * @chan: channel the frame was received on - * @scan_width: scan width that was used * @signal: signal strength value, according to the wiphy's * signal type * @boottime_ns: timestamp (CLOCK_BOOTTIME) when the information was @@ -2371,16 +2860,28 @@ enum cfg80211_signal_type { * the BSS that requested the scan in which the beacon/probe was received. * @chains: bitmask for filled values in @chain_signal. * @chain_signal: per-chain signal strength of last received BSS in dBm. + * @restrict_use: restrict usage, if not set, assume @use_for is + * %NL80211_BSS_USE_FOR_NORMAL. + * @use_for: bitmap of possible usage for this BSS, see + * &enum nl80211_bss_use_for + * @cannot_use_reasons: the reasons (bitmap) for not being able to connect, + * if @restrict_use is set and @use_for is zero (empty); may be 0 for + * unspecified reasons; see &enum nl80211_bss_cannot_use_reasons + * @drv_data: Data to be passed through to @inform_bss */ struct cfg80211_inform_bss { struct ieee80211_channel *chan; - enum nl80211_bss_scan_width scan_width; s32 signal; u64 boottime_ns; u64 parent_tsf; u8 parent_bssid[ETH_ALEN] __aligned(2); u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; + + u8 restrict_use:1, use_for:7; + u8 cannot_use_reasons; + + void *drv_data; }; /** @@ -2406,7 +2907,6 @@ struct cfg80211_bss_ies { * for use in scan results and similar. 
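The @drv_data pointer travels from the driver's scan report into the new @inform_bss op; a hedged sketch of a driver attaching its own per-BSS context when reporting a received frame (cfg80211_inform_bss_frame_data() is the existing entry point; rx_channel, signal_mbm, mgmt, mgmt_len and my_bss_ctx are assumed driver state):

	struct cfg80211_inform_bss data = {
		.chan = rx_channel,
		.signal = signal_mbm,
		.boottime_ns = ktime_get_boottime_ns(),
		.drv_data = my_bss_ctx,	/* handed to the inform_bss op */
	};
	struct cfg80211_bss *bss;

	bss = cfg80211_inform_bss_frame_data(wiphy, &data, mgmt, mgmt_len,
					     GFP_ATOMIC);
	if (bss)
		cfg80211_put_bss(wiphy, bss);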
* * @channel: channel this BSS is on - * @scan_width: width of the control channel * @bssid: BSSID of the BSS * @beacon_interval: the beacon interval as from the frame * @capability: the capability field in host byte order @@ -2419,6 +2919,8 @@ struct cfg80211_bss_ies { * own the beacon_ies, but they're just pointers to the ones from the * @hidden_beacon_bss struct) * @proberesp_ies: the information elements from the last Probe Response frame + * @proberesp_ecsa_stuck: ECSA element is stuck in the Probe Response frame, + * cannot rely on it having valid data * @hidden_beacon_bss: in case this BSS struct represents a probe response from * a BSS that hides the SSID in its beacon, this points to the BSS struct * that holds the beacon data. @beacon_ies is still valid, of course, and @@ -2428,15 +2930,20 @@ struct cfg80211_bss_ies { * @nontrans_list: list of non-transmitted BSS, if this is a transmitted one * (multi-BSSID support) * @signal: signal strength value (type depends on the wiphy's signal_type) + * @ts_boottime: timestamp of the last BSS update in nanoseconds since boot * @chains: bitmask for filled values in @chain_signal. * @chain_signal: per-chain signal strength of last received BSS in dBm. * @bssid_index: index in the multiple BSS set * @max_bssid_indicator: max number of members in the BSS set + * @use_for: bitmap of possible usage for this BSS, see + * &enum nl80211_bss_use_for + * @cannot_use_reasons: the reasons (bitmap) for not being able to connect, + * if @restrict_use is set and @use_for is zero (empty); may be 0 for + * unspecified reasons; see &enum nl80211_bss_cannot_use_reasons * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes */ struct cfg80211_bss { struct ieee80211_channel *channel; - enum nl80211_bss_scan_width scan_width; const struct cfg80211_bss_ies __rcu *ies; const struct cfg80211_bss_ies __rcu *beacon_ies; @@ -2448,6 +2955,8 @@ struct cfg80211_bss { s32 signal; + u64 ts_boottime; + u16 beacon_interval; u16 capability; @@ -2455,9 +2964,14 @@ struct cfg80211_bss { u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; + u8 proberesp_ecsa_stuck:1; + u8 bssid_index; u8 max_bssid_indicator; + u8 use_for; + u8 cannot_use_reasons; + u8 priv[] __aligned(sizeof(void *)); }; @@ -2483,7 +2997,7 @@ const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id); */ static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id) { - return (void *)ieee80211_bss_get_elem(bss, id); + return (const void *)ieee80211_bss_get_elem(bss, id); } @@ -2495,6 +3009,10 @@ static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id) * * @bss: The BSS to authenticate with, the callee must obtain a reference * to it if it needs to keep it. + * @supported_selectors: List of selectors that should be assumed to be + * supported by the station. + * SAE_H2E must be assumed supported if set to %NULL. + * @supported_selectors_len: Length of supported_selectors in octets. * @auth_type: Authentication type (algorithm) * @ie: Extra IEs to add to Authentication frame or %NULL * @ie_len: Length of ie buffer in octets @@ -2506,16 +3024,59 @@ static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id) * Authentication algorithm number, i.e., starting at the Authentication * transaction sequence number field. 
* @auth_data_len: Length of auth_data buffer in octets + * @link_id: if >= 0, indicates authentication should be done as an MLD, + * the interface address is included as the MLD address and the + * necessary link (with the given link_id) will be created (and + * given an MLD address) by the driver + * @ap_mld_addr: AP MLD address in case of authentication request with + * an AP MLD, valid iff @link_id >= 0 */ struct cfg80211_auth_request { struct cfg80211_bss *bss; const u8 *ie; size_t ie_len; + const u8 *supported_selectors; + u8 supported_selectors_len; enum nl80211_auth_type auth_type; const u8 *key; - u8 key_len, key_idx; + u8 key_len; + s8 key_idx; const u8 *auth_data; size_t auth_data_len; + s8 link_id; + const u8 *ap_mld_addr; +}; + +/** + * struct cfg80211_assoc_link - per-link information for MLO association + * @bss: the BSS pointer, see also &struct cfg80211_assoc_request::bss; + * if this is %NULL for a link, that link is not requested + * @elems: extra elements for the per-STA profile for this link + * @elems_len: length of the elements + * @disabled: If set this link should be included during association etc. but it + * should not be used until enabled by the AP MLD. + * @error: per-link error code, must be <= 0. If there is an error, then the + * operation as a whole must fail. + */ +struct cfg80211_assoc_link { + struct cfg80211_bss *bss; + const u8 *elems; + size_t elems_len; + bool disabled; + int error; +}; + +/** + * struct cfg80211_ml_reconf_req - MLO link reconfiguration request + * @add_links: data for links to add, see &struct cfg80211_assoc_link + * @rem_links: bitmap of links to remove + * @ext_mld_capa_ops: extended MLD capabilities and operations set by + * userspace for the ML reconfiguration action frame + */ +struct cfg80211_ml_reconf_req { + struct cfg80211_assoc_link add_links[IEEE80211_MLD_MAX_NUM_LINKS]; + u16 rem_links; + u16 ext_mld_capa_ops; }; /** @@ -2528,12 +3089,22 @@ struct cfg80211_auth_request { * authentication capability. Drivers can offload authentication to * userspace if this flag is set. Only applicable for cfg80211_connect() * request (connect callback). + * @ASSOC_REQ_DISABLE_HE: Disable HE + * @ASSOC_REQ_DISABLE_EHT: Disable EHT + * @CONNECT_REQ_MLO_SUPPORT: Userspace indicates support for handling MLD links. + * Drivers shall disable MLO features for the current association if this + * flag is not set. + * @ASSOC_REQ_SPP_AMSDU: SPP A-MSDUs will be used on this connection (if any) */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), ASSOC_REQ_DISABLE_VHT = BIT(1), ASSOC_REQ_USE_RRM = BIT(2), CONNECT_REQ_EXTERNAL_AUTH_SUPPORT = BIT(3), + ASSOC_REQ_DISABLE_HE = BIT(4), + ASSOC_REQ_DISABLE_EHT = BIT(5), + CONNECT_REQ_MLO_SUPPORT = BIT(6), + ASSOC_REQ_SPP_AMSDU = BIT(7), }; /** @@ -2545,6 +3116,8 @@ enum cfg80211_assoc_req_flags { * given a reference that it must give back to cfg80211_send_rx_assoc() * or to cfg80211_assoc_timeout(). To ensure proper refcounting, new * association requests while already associating must be rejected. + * This also applies to the @links.bss parameter, which is used instead + * of this one (it is %NULL) for MLO associations. * @ie: Extra IEs to add to (Re)Association Request frame or %NULL * @ie_len: Length of ie buffer in octets * @use_mfp: Use management frame protection (IEEE 802.11w) in this association @@ -2556,6 +3129,10 @@ enum cfg80211_assoc_req_flags { * included in the Current AP address field of the Reassociation Request * frame. 
* @flags: See &enum cfg80211_assoc_req_flags + * @supported_selectors: supported BSS selectors in IEEE 802.11 format + * (or %NULL for no change). + * If %NULL, then support for SAE_H2E should be assumed. + * @supported_selectors_len: number of supported BSS selectors * @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask * will be used in ht_capa. Un-supported values will be ignored. * @ht_capa_mask: The bits of ht_capa which are to be used. @@ -2569,6 +3146,13 @@ enum cfg80211_assoc_req_flags { * with 16 octets of STA Nonce followed by 16 octets of AP Nonce. * @s1g_capa: S1G capability override * @s1g_capa_mask: S1G capability override mask + * @links: per-link information for MLO connections + * @link_id: >= 0 for MLO connections, where links are given, and indicates + * the link on which the association request should be sent + * @ap_mld_addr: AP MLD address in case of MLO association request, + * valid iff @link_id >= 0 + * @ext_mld_capa_ops: extended MLD capabilities and operations set by + * userspace for the association */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -2577,6 +3161,8 @@ struct cfg80211_assoc_request { struct cfg80211_crypto_settings crypto; bool use_mfp; u32 flags; + const u8 *supported_selectors; + u8 supported_selectors_len; struct ieee80211_ht_cap ht_capa; struct ieee80211_ht_cap ht_capa_mask; struct ieee80211_vht_cap vht_capa, vht_capa_mask; @@ -2584,6 +3170,10 @@ struct cfg80211_assoc_request { size_t fils_kek_len; const u8 *fils_nonces; struct ieee80211_s1g_cap s1g_capa, s1g_capa_mask; + struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS]; + const u8 *ap_mld_addr; + s8 link_id; + u16 ext_mld_capa_ops; }; /** @@ -2592,7 +3182,7 @@ struct cfg80211_assoc_request { * This structure provides information needed to complete IEEE 802.11 * deauthentication. * - * @bssid: the BSSID of the BSS to deauthenticate from + * @bssid: the BSSID or AP MLD address to deauthenticate from * @ie: Extra IEs to add to Deauthentication frame or %NULL * @ie_len: Length of ie buffer in octets * @reason_code: The reason code for the deauthentication @@ -2613,7 +3203,7 @@ struct cfg80211_deauth_request { * This structure provides information needed to complete IEEE 802.11 * disassociation. * - * @bss: the BSS to disassociate from + * @ap_addr: the BSSID or AP MLD address to disassociate from * @ie: Extra IEs to add to Disassociation frame or %NULL * @ie_len: Length of ie buffer in octets * @reason_code: The reason code for the disassociation @@ -2621,7 +3211,7 @@ struct cfg80211_deauth_request { * Disassociation frame is to be transmitted. */ struct cfg80211_disassoc_request { - struct cfg80211_bss *bss; + const u8 *ap_addr; const u8 *ie; size_t ie_len; u16 reason_code; @@ -2689,8 +3279,8 @@ struct cfg80211_ibss_params { * * @behaviour: requested BSS selection behaviour. * @param: parameters for requestion behaviour. - * @band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF. - * @adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST. + * @param.band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF. + * @param.adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST. 
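Tying the MLO pieces of the association request above together: the per-link BSS pointers go into @links, @link_id selects the link carrying the (Re)Association Request, and @ap_mld_addr must be set whenever @link_id >= 0. A hedged sketch (bss_link0/bss_link1 and ap_mld_addr assumed; reference ownership as documented above):

	struct cfg80211_assoc_request req = {};

	req.link_id = 0;		/* transmit assoc on link 0 */
	req.ap_mld_addr = ap_mld_addr;
	req.links[0].bss = bss_link0;
	req.links[1].bss = bss_link1;
	req.links[1].disabled = true;	/* include, but await AP enablement */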
*/ struct cfg80211_bss_selection { enum nl80211_bss_select_attr behaviour; @@ -2828,15 +3418,15 @@ enum cfg80211_connect_params_changed { * @WIPHY_PARAM_TXQ_QUANTUM: TXQ scheduler quantum */ enum wiphy_params_flags { - WIPHY_PARAM_RETRY_SHORT = 1 << 0, - WIPHY_PARAM_RETRY_LONG = 1 << 1, - WIPHY_PARAM_FRAG_THRESHOLD = 1 << 2, - WIPHY_PARAM_RTS_THRESHOLD = 1 << 3, - WIPHY_PARAM_COVERAGE_CLASS = 1 << 4, - WIPHY_PARAM_DYN_ACK = 1 << 5, - WIPHY_PARAM_TXQ_LIMIT = 1 << 6, - WIPHY_PARAM_TXQ_MEMORY_LIMIT = 1 << 7, - WIPHY_PARAM_TXQ_QUANTUM = 1 << 8, + WIPHY_PARAM_RETRY_SHORT = BIT(0), + WIPHY_PARAM_RETRY_LONG = BIT(1), + WIPHY_PARAM_FRAG_THRESHOLD = BIT(2), + WIPHY_PARAM_RTS_THRESHOLD = BIT(3), + WIPHY_PARAM_COVERAGE_CLASS = BIT(4), + WIPHY_PARAM_DYN_ACK = BIT(5), + WIPHY_PARAM_TXQ_LIMIT = BIT(6), + WIPHY_PARAM_TXQ_MEMORY_LIMIT = BIT(7), + WIPHY_PARAM_TXQ_QUANTUM = BIT(8), }; #define IEEE80211_DEFAULT_AIRTIME_WEIGHT 256 @@ -2995,8 +3585,8 @@ struct cfg80211_coalesce_rules { * @n_rules: number of rules */ struct cfg80211_coalesce { - struct cfg80211_coalesce_rules *rules; int n_rules; + struct cfg80211_coalesce_rules rules[] __counted_by(n_rules); }; /** @@ -3011,7 +3601,7 @@ struct cfg80211_coalesce { struct cfg80211_wowlan_nd_match { struct cfg80211_ssid ssid; int n_channels; - u32 channels[]; + u32 channels[] __counted_by(n_channels); }; /** @@ -3025,7 +3615,7 @@ struct cfg80211_wowlan_nd_match { */ struct cfg80211_wowlan_nd_info { int n_matches; - struct cfg80211_wowlan_nd_match *matches[]; + struct cfg80211_wowlan_nd_match *matches[] __counted_by(n_matches); }; /** @@ -3048,12 +3638,15 @@ struct cfg80211_wowlan_nd_info { * @tcp_connlost: TCP connection lost or failed to establish * @tcp_nomoretokens: TCP data ran out of tokens * @net_detect: if not %NULL, woke up because of net detect + * @unprot_deauth_disassoc: woke up due to unprotected deauth or + * disassoc frame (in MFP). 
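A driver reporting the new wakeup reason might fill the structure below as in this hedged sketch (cfg80211_report_wowlan_wakeup() is the existing reporting call; -1 in @pattern_idx meaning "no pattern matched" is an assumption of this example):

	struct cfg80211_wowlan_wakeup wakeup = {
		.unprot_deauth_disassoc = true,
		.pattern_idx = -1,
	};

	cfg80211_report_wowlan_wakeup(wdev, &wakeup, GFP_KERNEL);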
*/ struct cfg80211_wowlan_wakeup { bool disconnect, magic_pkt, gtk_rekey_failure, eap_identity_req, four_way_handshake, rfkill_release, packet_80211, - tcp_match, tcp_connlost, tcp_nomoretokens; + tcp_match, tcp_connlost, tcp_nomoretokens, + unprot_deauth_disassoc; s32 pattern_idx; u32 packet_present_len, packet_len; const void *packet; @@ -3066,7 +3659,7 @@ struct cfg80211_wowlan_wakeup { * @kck: key confirmation key (@kck_len bytes) * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes) * @kek_len: length of kek - * @kck_len length of kck + * @kck_len: length of kck * @akm: akm (oui, id) */ struct cfg80211_gtk_rekey_data { @@ -3096,7 +3689,7 @@ struct cfg80211_update_ft_ies_params { * This structure provides information needed to transmit a mgmt frame * * @chan: channel to use - * @offchan: indicates wether off channel operation is required + * @offchan: indicates whether off channel operation is required * @wait: duration for ROC * @buf: buffer to transmit * @len: buffer length @@ -3104,6 +3697,9 @@ struct cfg80211_update_ft_ies_params { * @dont_wait_for_ack: tells the low level not to wait for an ack * @n_csa_offsets: length of csa_offsets array * @csa_offsets: array of all the csa offsets in the frame + * @link_id: for MLO, the link ID to transmit on, -1 if not given; note + * that the link ID isn't validated (much), it's in range but the + * link might not exist (or be used by the receiver STA) */ struct cfg80211_mgmt_tx_params { struct ieee80211_channel *chan; @@ -3115,6 +3711,7 @@ struct cfg80211_mgmt_tx_params { bool dont_wait_for_ack; int n_csa_offsets; const u16 *csa_offsets; + int link_id; }; /** @@ -3210,7 +3807,7 @@ struct cfg80211_nan_func_filter { * @publish_bcast: if true, the solicited publish should be broadcasted * @subscribe_active: if true, the subscribe is active * @followup_id: the instance ID for follow up - * @followup_reqid: the requestor instance ID for follow up + * @followup_reqid: the requester instance ID for follow up * @followup_dest: MAC address of the recipient of the follow up * @ttl: time to live counter in DW. * @serv_spec_info: Service Specific Info @@ -3291,6 +3888,17 @@ struct cfg80211_pmk_conf { * the real status code for failures. Used only for the authentication * response command interface (user space to driver). * @pmkid: The identifier to refer a PMKSA. + * @mld_addr: MLD address of the peer. Used by the authentication request event + * interface. Driver indicates this to enable MLO during the authentication + * offload to user space. Driver shall look at %NL80211_ATTR_MLO_SUPPORT + * flag capability in NL80211_CMD_CONNECT to know whether the user space + * supports enabling MLO during the authentication offload. + * User space should use the address of the interface (on which the + * authentication request event reported) as self MLD address. User space + * and driver should use MLD addresses in RA, TA and BSSID fields of + * authentication frames sent or received via cfg80211. The driver + * translates the MLD addresses to/from link addresses based on the link + * chosen for the authentication. 
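A hedged sketch of a driver kicking off SAE offload with the MLO addition described above (cfg80211_external_auth_request() is the existing trigger; bss_addr, ssid/ssid_len and ap_mld_addr are assumed driver state):

	struct cfg80211_external_auth_params params = {
		.action = NL80211_EXTERNAL_AUTH_START,
		.key_mgmt_suite = WLAN_AKM_SUITE_SAE,
	};

	memcpy(params.bssid, bss_addr, ETH_ALEN);
	memcpy(params.ssid.ssid, ssid, ssid_len);
	params.ssid.ssid_len = ssid_len;
	memcpy(params.mld_addr, ap_mld_addr, ETH_ALEN); /* enables MLO */

	return cfg80211_external_auth_request(netdev, &params, GFP_KERNEL);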
*/ struct cfg80211_external_auth_params { enum nl80211_external_auth_action action; @@ -3299,6 +3907,7 @@ struct cfg80211_external_auth_params { unsigned int key_mgmt_suite; u16 status; const u8 *pmkid; + u8 mld_addr[ETH_ALEN] __aligned(2); }; /** @@ -3428,6 +4037,7 @@ struct cfg80211_pmsr_ftm_result { * @type: type of the measurement reported, note that we only support reporting * one type at a time, but you can report multiple results separately and * they're all aggregated for userspace. + * @ftm: FTM result */ struct cfg80211_pmsr_result { u64 host_time, ap_tsf; @@ -3463,6 +4073,11 @@ struct cfg80211_pmsr_result { * @non_trigger_based: use non trigger based ranging for the measurement * If neither @trigger_based nor @non_trigger_based is set, * EDCA based ranging will be used. + * @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either + * @trigger_based or @non_trigger_based is set. + * @bss_color: the bss color of the responder. Optional. Set to zero to + * indicate the driver should set the BSS color. Only valid if + * @non_trigger_based or @trigger_based is set. * * See also nl80211 for the respective attribute documentation. */ @@ -3474,11 +4089,13 @@ struct cfg80211_pmsr_ftm_request_peer { request_lci:1, request_civicloc:1, trigger_based:1, - non_trigger_based:1; + non_trigger_based:1, + lmr_feedback:1; u8 num_bursts_exp; u8 burst_duration; u8 ftms_per_burst; u8 ftmr_retries; + u8 bss_color; }; /** @@ -3524,7 +4141,7 @@ struct cfg80211_pmsr_request { struct list_head list; - struct cfg80211_pmsr_request_peer peers[]; + struct cfg80211_pmsr_request_peer peers[] __counted_by(n_peers); }; /** @@ -3545,12 +4162,22 @@ struct cfg80211_pmsr_request { * the IEs of the remote peer in the event from the host driver and * the constructed IEs by the user space in the request interface. * @ie_len: Length of IEs in octets. + * @assoc_link_id: MLO link ID of the AP, with which (re)association requested + * by peer. This will be filled by driver for both MLO and non-MLO station + * connections when the AP affiliated with an MLD. For non-MLD AP mode, it + * will be -1. Used only with OWE update event (driver to user space). + * @peer_mld_addr: For MLO connection, MLD address of the peer. For non-MLO + * connection, it will be all zeros. This is applicable only when + * @assoc_link_id is not -1, i.e., the AP affiliated with an MLD. Used only + * with OWE update event (driver to user space). */ struct cfg80211_update_owe_info { u8 peer[ETH_ALEN] __aligned(2); u16 status; const u8 *ie; size_t ie_len; + int assoc_link_id; + u8 peer_mld_addr[ETH_ALEN] __aligned(2); }; /** @@ -3559,7 +4186,7 @@ struct cfg80211_update_owe_info { * for the entire device * @interface_stypes: bitmap of management frame subtypes registered * for the given interface - * @global_mcast_rx: mcast RX is needed globally for these subtypes + * @global_mcast_stypes: mcast RX is needed globally for these subtypes * @interface_mcast_stypes: mcast RX is needed on this interface * for these subtypes */ @@ -3577,9 +4204,10 @@ struct mgmt_frame_regs { * All callbacks except where otherwise noted should return 0 * on success or a negative error code. * - * All operations are currently invoked under rtnl for consistency with the - * wireless extensions but this is subject to reevaluation as soon as this - * code is used more widely and we have a first user without wext. + * All operations are invoked with the wiphy mutex held. 
The RTNL may be + * held in addition (due to wireless extensions) but this cannot be relied + * upon except in cases where documented below. Note that due to ordering, + * the RTNL also cannot be acquired in any handlers. * * @suspend: wiphy device needs to be suspended. The variable @wow will * be %NULL or contain the enabled Wake-on-Wireless triggers that are @@ -3594,29 +4222,48 @@ struct mgmt_frame_regs { * the new netdev in the wiphy's network namespace! Returns the struct * wireless_dev, or an ERR_PTR. For P2P device wdevs, the driver must * also set the address member in the wdev. + * This additionally holds the RTNL to be able to do netdev changes. * * @del_virtual_intf: remove the virtual interface + * This additionally holds the RTNL to be able to do netdev changes. * * @change_virtual_intf: change type/configuration of virtual interface, * keep the struct wireless_dev's iftype updated. + * This additionally holds the RTNL to be able to do netdev changes. + * + * @add_intf_link: Add a new MLO link to the given interface. Note that + * the wdev->link[] data structure has been updated, so the new link + * address is available. + * @del_intf_link: Remove an MLO link from the given interface. * * @add_key: add a key with the given parameters. @mac_addr will be %NULL - * when adding a group key. + * when adding a group key. @link_id will be -1 for non-MLO connection. + * For MLO connection, @link_id will be >= 0 for group key and -1 for + * pairwise key, @mac_addr will be peer's MLD address for MLO pairwise key. * * @get_key: get information about the key with the given parameters. * @mac_addr will be %NULL when requesting information for a group * key. All pointers given to the @callback function need not be valid * after it returns. This function should return an error if it is * not possible to retrieve the key, -ENOENT if it doesn't exist. + * @link_id will be -1 for non-MLO connection. For MLO connection, + * @link_id will be >= 0 for group key and -1 for pairwise key, @mac_addr + * will be peer's MLD address for MLO pairwise key. * * @del_key: remove a key given the @mac_addr (%NULL for a group key) - * and @key_index, return -ENOENT if the key doesn't exist. + * and @key_index, return -ENOENT if the key doesn't exist. @link_id will + * be -1 for non-MLO connection. For MLO connection, @link_id will be >= 0 + * for group key and -1 for pairwise key, @mac_addr will be peer's MLD + * address for MLO pairwise key. * - * @set_default_key: set the default key on an interface + * @set_default_key: set the default key on an interface. @link_id will be >= 0 + * for MLO connection and -1 for non-MLO connection. * - * @set_default_mgmt_key: set the default management frame key on an interface + * @set_default_mgmt_key: set the default management frame key on an interface. + * @link_id will be >= 0 for MLO connection and -1 for non-MLO connection. * - * @set_default_beacon_key: set the default Beacon frame key on an interface + * @set_default_beacon_key: set the default Beacon frame key on an interface. + * @link_id will be >= 0 for MLO connection and -1 for non-MLO connection. * * @set_rekey_data: give the data necessary for GTK rekeying to the driver * @@ -3655,6 +4302,13 @@ struct mgmt_frame_regs { * * @change_bss: Modify parameters for a given BSS. * + * @inform_bss: Called by cfg80211 while being informed about new BSS data + * for every BSS found within the reported data or frame. This is called + * from within the cfg8011 inform_bss handlers while holding the bss_lock. 
+ * The data parameter is passed through from drv_data inside + * struct cfg80211_inform_bss. + * The new IE data for the BSS is explicitly passed. + * * @set_txq_params: Set TX queue parameters * * @libertas_set_mesh_channel: Only for backward compatibility for libertas, @@ -3736,8 +4390,6 @@ struct mgmt_frame_regs { * @get_tx_power: store the current TX power into the dbm variable; * return 0 if successful * - * @set_wds_peer: set the WDS peer for a WDS interface - * * @rfkill_poll: polls the hw rfkill line, use cfg80211 reporting * functions to adjust rfkill hw state * @@ -3921,6 +4573,41 @@ struct mgmt_frame_regs { * This callback may sleep. * @reset_tid_config: Reset TID specific configuration for the peer, for the * given TIDs. This callback may sleep. + * + * @set_sar_specs: Update the SAR (TX power) settings. + * + * @color_change: Initiate a color change. + * + * @set_fils_aad: Set FILS AAD data to the AP driver so that the driver can use + * them to decrypt (Re)Association Request and encrypt (Re)Association + * Response frame. + * + * @set_radar_background: Configure dedicated offchannel chain available for + * radar/CAC detection on some hw. This chain can't be used to transmit + * or receive frames and it is bound to a running wdev. + * Background radar/CAC detection makes it possible to avoid the CAC + * downtime of switching to a different channel during CAC detection on + * the selected radar channel. + * The caller is expected to set the chandef pointer to NULL in order to + * disable background CAC/radar detection. + * @add_link_station: Add a link to a station. + * @mod_link_station: Modify a link of a station. + * @del_link_station: Remove a link of a station. + * + * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. + * @set_ttlm: set the TID to link mapping. + * @set_epcs: Enable/Disable EPCS for station mode. + * @get_radio_mask: get bitmask of radios in use. + * (invoked with the wiphy mutex held) + * @assoc_ml_reconf: Request a non-AP MLO connection to perform ML + * reconfiguration, i.e., add and/or remove links to/from the + * association using ML reconfiguration action frames. Successfully added + * links will be added to the set of valid links. Successfully removed + * links will be removed from the set of valid links. The driver must + * indicate removed links by calling cfg80211_links_removed() and added + * links by calling cfg80211_mlo_reconf_add_done(). When calling + * cfg80211_mlo_reconf_add_done() the bss pointer must be given for each + * link for which MLO reconfiguration 'add' operation was requested. 
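+ *
+ * Editorial example (a sketch, not part of the original header): drivers
+ * advertise these callbacks through a table that is typically static and
+ * passed to wiphy_new(); all drv_* names below are hypothetical:
+ *
+ *	static const struct cfg80211_ops drv_cfg_ops = {
+ *		.scan = drv_scan,
+ *		.connect = drv_connect,
+ *		.disconnect = drv_disconnect,
+ *		.add_key = drv_add_key,
+ *		.del_key = drv_del_key,
+ *	};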
*/ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3939,30 +4626,40 @@ struct cfg80211_ops { enum nl80211_iftype type, struct vif_params *params); + int (*add_intf_link)(struct wiphy *wiphy, + struct wireless_dev *wdev, + unsigned int link_id); + void (*del_intf_link)(struct wiphy *wiphy, + struct wireless_dev *wdev, + unsigned int link_id); + int (*add_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params); int (*get_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - void *cookie, + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)); int (*del_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr); int (*set_default_key)(struct wiphy *wiphy, - struct net_device *netdev, + struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast); int (*set_default_mgmt_key)(struct wiphy *wiphy, - struct net_device *netdev, + struct net_device *netdev, int link_id, u8 key_index); int (*set_default_beacon_key)(struct wiphy *wiphy, struct net_device *netdev, + int link_id, u8 key_index); int (*start_ap)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_settings *settings); int (*change_beacon)(struct wiphy *wiphy, struct net_device *dev, - struct cfg80211_beacon_data *info); - int (*stop_ap)(struct wiphy *wiphy, struct net_device *dev); + struct cfg80211_ap_update *info); + int (*stop_ap)(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id); int (*add_station)(struct wiphy *wiphy, struct net_device *dev, @@ -4012,6 +4709,9 @@ struct cfg80211_ops { int (*change_bss)(struct wiphy *wiphy, struct net_device *dev, struct bss_parameters *params); + void (*inform_bss)(struct wiphy *wiphy, struct cfg80211_bss *bss, + const struct cfg80211_bss_ies *ies, void *data); + int (*set_txq_params)(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_txq_params *params); @@ -4020,6 +4720,7 @@ struct cfg80211_ops { struct ieee80211_channel *chan); int (*set_monitor_channel)(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef); int (*scan)(struct wiphy *wiphy, @@ -4056,10 +4757,7 @@ struct cfg80211_ops { int (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, enum nl80211_tx_power_setting type, int mbm); int (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, - int *dbm); - - int (*set_wds_peer)(struct wiphy *wiphy, struct net_device *dev, - const u8 *addr); + unsigned int link_id, int *dbm); void (*rfkill_poll)(struct wiphy *wiphy); @@ -4073,6 +4771,7 @@ struct cfg80211_ops { int (*set_bitrate_mask)(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, const u8 *peer, const struct cfg80211_bitrate_mask *mask); @@ -4133,9 +4832,10 @@ struct cfg80211_ops { struct cfg80211_gtk_rekey_data *data); int (*tdls_mgmt)(struct wiphy *wiphy, struct net_device *dev, - const u8 *peer, u8 action_code, u8 dialog_token, - u16 status_code, u32 peer_capability, - bool initiator, const u8 *buf, size_t len); + const u8 *peer, int link_id, + u8 action_code, u8 dialog_token, u16 status_code, + u32 peer_capability, bool initiator, + const u8 *buf, size_t len); 
int (*tdls_oper)(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper); @@ -4148,6 +4848,7 @@ struct cfg80211_ops { int (*get_channel)(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef); int (*start_p2p_device)(struct wiphy *wiphy, @@ -4161,9 +4862,9 @@ struct cfg80211_ops { int (*start_radar_detection)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef, - u32 cac_time_ms); + u32 cac_time_ms, int link_id); void (*end_cac)(struct wiphy *wiphy, - struct net_device *dev); + struct net_device *dev, unsigned int link_id); int (*update_ft_ies)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_update_ft_ies_params *ftie); int (*crit_proto_start)(struct wiphy *wiphy, @@ -4184,6 +4885,7 @@ struct cfg80211_ops { struct cfg80211_qos_map *qos_map); int (*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, struct cfg80211_chan_def *chandef); int (*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev, @@ -4230,7 +4932,7 @@ struct cfg80211_ops { struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, const __be16 proto, - const bool noencrypt, + const bool noencrypt, int link_id, u64 *cookie); int (*get_ftm_responder_stats)(struct wiphy *wiphy, @@ -4249,6 +4951,30 @@ struct cfg80211_ops { struct cfg80211_tid_config *tid_conf); int (*reset_tid_config)(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u8 tids); + int (*set_sar_specs)(struct wiphy *wiphy, + struct cfg80211_sar_specs *sar); + int (*color_change)(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_color_change_settings *params); + int (*set_fils_aad)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_fils_aad *fils_aad); + int (*set_radar_background)(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef); + int (*add_link_station)(struct wiphy *wiphy, struct net_device *dev, + struct link_station_parameters *params); + int (*mod_link_station)(struct wiphy *wiphy, struct net_device *dev, + struct link_station_parameters *params); + int (*del_link_station)(struct wiphy *wiphy, struct net_device *dev, + struct link_station_del_parameters *params); + int (*set_hw_timestamp)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_set_hw_timestamp *hwts); + int (*set_ttlm)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ttlm_params *params); + u32 (*get_radio_mask)(struct wiphy *wiphy, struct net_device *dev); + int (*assoc_ml_reconf)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ml_reconf_req *req); + int (*set_epcs)(struct wiphy *wiphy, struct net_device *dev, + bool val); }; /* @@ -4260,7 +4986,7 @@ struct cfg80211_ops { * enum wiphy_flags - wiphy capability flags * * @WIPHY_FLAG_SPLIT_SCAN_6GHZ: if set to true, the scan request will be split - * into two, first for legacy bands and second for UHB. + * into two, first for legacy bands and second for 6 GHz. * @WIPHY_FLAG_NETNS_OK: if not set, do not allow changing the netns of this * wiphy at all * @WIPHY_FLAG_PS_ON_BY_DEFAULT: if set to true, powersave will be enabled @@ -4297,13 +5023,23 @@ struct cfg80211_ops { * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels. * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in * beaconing mode (AP, IBSS, Mesh, ...). - * @WIPHY_FLAG_HAS_STATIC_WEP: The device supports static WEP key installation - * before connection. 
* @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports bigger kek and kck keys + * @WIPHY_FLAG_SUPPORTS_MLO: This is a temporary flag gating the MLO APIs, + * in order to not have them reachable in normal drivers, until we have + * complete feature/interface combinations/etc. advertisement. No driver + * should set this flag for now. + * @WIPHY_FLAG_SUPPORTS_EXT_KCK_32: The device supports 32-byte KCK keys. + * @WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER: The device could handle reg notify for + * NL80211_REGDOM_SET_BY_DRIVER. + * @WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON: reg_call_notifier() is called if driver + * set this flag to update channels on beacon hints. + * @WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY: support connection to non-primary link + * of an NSTR mobile AP MLD. + * @WIPHY_FLAG_DISABLE_WEXT: disable wireless extensions for this device */ enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), - /* use hole at 1 */ + WIPHY_FLAG_SUPPORTS_MLO = BIT(1), WIPHY_FLAG_SPLIT_SCAN_6GHZ = BIT(2), WIPHY_FLAG_NETNS_OK = BIT(3), WIPHY_FLAG_PS_ON_BY_DEFAULT = BIT(4), @@ -4311,9 +5047,10 @@ enum wiphy_flags { WIPHY_FLAG_4ADDR_STATION = BIT(6), WIPHY_FLAG_CONTROL_PORT_PROTOCOL = BIT(7), WIPHY_FLAG_IBSS_RSN = BIT(8), + WIPHY_FLAG_DISABLE_WEXT = BIT(9), WIPHY_FLAG_MESH_AUTH = BIT(10), - /* use hole at 11 */ - /* use hole at 12 */ + WIPHY_FLAG_SUPPORTS_EXT_KCK_32 = BIT(11), + WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY = BIT(12), WIPHY_FLAG_SUPPORTS_FW_ROAM = BIT(13), WIPHY_FLAG_AP_UAPSD = BIT(14), WIPHY_FLAG_SUPPORTS_TDLS = BIT(15), @@ -4325,7 +5062,8 @@ enum wiphy_flags { WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL = BIT(21), WIPHY_FLAG_SUPPORTS_5_10_MHZ = BIT(22), WIPHY_FLAG_HAS_CHANNEL_SWITCH = BIT(23), - WIPHY_FLAG_HAS_STATIC_WEP = BIT(24), + WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER = BIT(24), + WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON = BIT(25), }; /** @@ -4342,7 +5080,9 @@ struct ieee80211_iface_limit { * struct ieee80211_iface_combination - possible interface combination * * With this structure the driver can describe which interface - * combinations it supports concurrently. + * combinations it supports concurrently. When set in a struct wiphy_radio, + * the combinations refer to combinations of interfaces currently active on + * that radio. * * Examples: * @@ -4352,7 +5092,7 @@ struct ieee80211_iface_limit { * * struct ieee80211_iface_limit limits1[] = { * { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), }, - * { .max = 1, .types = BIT(NL80211_IFTYPE_AP}, }, + * { .max = 1, .types = BIT(NL80211_IFTYPE_AP), }, * }; * struct ieee80211_iface_combination combination1 = { * .limits = limits1, @@ -4624,19 +5364,34 @@ struct wiphy_vendor_command { * 802.11-2012 8.4.2.29 for the defined fields. 
* @extended_capabilities_mask: mask of the valid values * @extended_capabilities_len: length of the extended capabilities + * @eml_capabilities: EML capabilities (for MLO) + * @mld_capa_and_ops: MLD capabilities and operations (for MLO) */ struct wiphy_iftype_ext_capab { enum nl80211_iftype iftype; const u8 *extended_capabilities; const u8 *extended_capabilities_mask; u8 extended_capabilities_len; + u16 eml_capabilities; + u16 mld_capa_and_ops; }; /** + * cfg80211_get_iftype_ext_capa - lookup interface type extended capability + * @wiphy: the wiphy to look up from + * @type: the interface type to look up + * + * Return: The extended capability for the given interface @type, may be %NULL + */ +const struct wiphy_iftype_ext_capab * +cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type); + +/** * struct cfg80211_pmsr_capabilities - cfg80211 peer measurement capabilities * @max_peers: maximum number of peers in a single measurement * @report_ap_tsf: can report assoc AP's TSF for radio resource measurement * @randomize_mac_addr: can randomize MAC address for measurement + * @ftm: FTM measurement data * @ftm.supported: FTM measurement is supported * @ftm.asap: ASAP-mode is supported * @ftm.non_asap: non-ASAP-mode is supported @@ -4688,7 +5443,46 @@ struct wiphy_iftype_akm_suites { }; /** + * struct wiphy_radio_freq_range - wiphy frequency range + * @start_freq: start range edge frequency (kHz) + * @end_freq: end range edge frequency (kHz) + */ +struct wiphy_radio_freq_range { + u32 start_freq; + u32 end_freq; +}; + + +/** + * struct wiphy_radio - physical radio of a wiphy + * This structure describes a physical radio belonging to a wiphy. + * It is used to describe concurrent-channel capabilities. Only one channel + * can be active on the radio described by struct wiphy_radio. + * + * @freq_range: frequency range that the radio can operate on. + * @n_freq_range: number of elements in @freq_range + * + * @iface_combinations: Valid interface combinations array, should not + * list single interface types. + * @n_iface_combinations: number of entries in @iface_combinations array. + * + * @antenna_mask: bitmask of antennas connected to this radio. + */ +struct wiphy_radio { + const struct wiphy_radio_freq_range *freq_range; + int n_freq_range; + + const struct ieee80211_iface_combination *iface_combinations; + int n_iface_combinations; + + u32 antenna_mask; +}; + +#define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff + +/** * struct wiphy - wireless hardware description + * @mtx: mutex for the data (structures) of this device * @reg_notifier: the driver's regulatory notification callback, * note that if your driver uses wiphy_apply_custom_regulatory() * the reg_notifier's request can be passed as NULL @@ -4879,8 +5673,36 @@ struct wiphy_iftype_akm_suites { * @max_data_retry_count: maximum supported per TID retry count for * configuration through the %NL80211_TID_CONFIG_ATTR_RETRY_SHORT and * %NL80211_TID_CONFIG_ATTR_RETRY_LONG attributes + * @sar_capa: SAR control capabilities + * @rfkill: a pointer to the rfkill structure + * + * @mbssid_max_interfaces: maximum number of interfaces supported by the driver + * in a multiple BSSID set. This field must be set to a non-zero value + * by the driver to advertise MBSSID support. + * @ema_max_profile_periodicity: maximum profile periodicity supported by + * the driver. Setting this field to a non-zero value indicates that the + * driver supports enhanced multi-BSSID advertisements (EMA AP). 
+ * @max_num_akm_suites: maximum number of AKM suites allowed for + * configuration through %NL80211_CMD_CONNECT, %NL80211_CMD_ASSOCIATE and + * %NL80211_CMD_START_AP. Set to NL80211_MAX_NR_AKM_SUITES if not set by the + * driver. If set by the driver, the minimum allowed value is + * NL80211_MAX_NR_AKM_SUITES in order to avoid compatibility issues with + * legacy userspace and the maximum allowed value is + * CFG80211_MAX_NUM_AKM_SUITES. + * + * @hw_timestamp_max_peers: maximum number of peers that the driver supports + * enabling HW timestamping for concurrently. Setting this field to a + * non-zero value indicates that the driver supports HW timestamping. + * A value of %CFG80211_HW_TIMESTAMP_ALL_PEERS indicates the driver + * supports enabling HW timestamping for all peers (i.e. no need to + * specify a mac address). + * + * @radio: radios belonging to this wiphy + * @n_radio: number of radios */ struct wiphy { + struct mutex mtx; + /* assign these fields before you register the wiphy */ u8 perm_addr[ETH_ALEN]; @@ -5017,6 +5839,19 @@ struct wiphy { u8 max_data_retry_count; + const struct cfg80211_sar_capa *sar_capa; + + struct rfkill *rfkill; + + u8 mbssid_max_interfaces; + u8 ema_max_profile_periodicity; + u16 max_num_akm_suites; + + u16 hw_timestamp_max_peers; + + int n_radio; + const struct wiphy_radio *radio; + char priv[] __aligned(NETDEV_ALIGN); }; @@ -5131,6 +5966,41 @@ static inline struct wiphy *wiphy_new(const struct cfg80211_ops *ops, */ int wiphy_register(struct wiphy *wiphy); +/* this is a define for better error reporting (file/line) */ +#define lockdep_assert_wiphy(wiphy) lockdep_assert_held(&(wiphy)->mtx) + +/** + * rcu_dereference_wiphy - rcu_dereference with debug checking + * @wiphy: the wiphy to check the locking on + * @p: The pointer to read, prior to dereferencing + * + * Do an rcu_dereference(p), but check that the caller either holds + * rcu_read_lock() or the wiphy mutex. Note: Please prefer + * wiphy_dereference() or rcu_dereference(). + */ +#define rcu_dereference_wiphy(wiphy, p) \ + rcu_dereference_check(p, lockdep_is_held(&wiphy->mtx)) + +/** + * wiphy_dereference - fetch RCU pointer when updates are prevented by wiphy mtx + * @wiphy: the wiphy to check the locking on + * @p: The pointer to read, prior to dereferencing + * + * Return: the value of the specified RCU-protected pointer, but omit the + * READ_ONCE(), because caller holds the wiphy mutex used for updates. + */ +#define wiphy_dereference(wiphy, p) \ + rcu_dereference_protected(p, lockdep_is_held(&wiphy->mtx)) + +/** + * get_wiphy_regdom - get custom regdomain for the given wiphy + * @wiphy: the wiphy to get the regdomain from + * + * Context: Requires any of RTNL, wiphy mutex or RCU protection. + * + * Return: pointer to the regulatory domain associated with the wiphy + */ +const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy); + /** * wiphy_unregister - deregister a wiphy from cfg80211 * @@ -5156,13 +6026,216 @@ struct cfg80211_cached_keys; struct cfg80211_cqm_config; /** + * wiphy_lock - lock the wiphy + * @wiphy: the wiphy to lock + * + * This is needed around registering and unregistering netdevs that + * aren't created through cfg80211 calls, since that requires locking + * in cfg80211 when the notifier is called, but that cannot + * differentiate which way it's called. + * + * It can also be used by drivers for their own purposes. + * + * When cfg80211 ops are called, the wiphy is already locked. + * + * Note that this makes sure that no workers that have been queued + * with wiphy_work_queue() are running. 
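+ *
+ * Editorial example (a sketch): a driver that pre-creates a netdev would
+ * wrap the registration roughly like this (drv_netdev is hypothetical;
+ * register_netdevice() additionally requires the RTNL):
+ *
+ *	rtnl_lock();
+ *	wiphy_lock(wiphy);
+ *	err = register_netdevice(drv_netdev);
+ *	wiphy_unlock(wiphy);
+ *	rtnl_unlock();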
+ */ +static inline void wiphy_lock(struct wiphy *wiphy) + __acquires(&wiphy->mtx) +{ + mutex_lock(&wiphy->mtx); + __acquire(&wiphy->mtx); +} + +/** + * wiphy_unlock - unlock the wiphy again + * @wiphy: the wiphy to unlock + */ +static inline void wiphy_unlock(struct wiphy *wiphy) + __releases(&wiphy->mtx) +{ + __release(&wiphy->mtx); + mutex_unlock(&wiphy->mtx); +} + +DEFINE_GUARD(wiphy, struct wiphy *, + mutex_lock(&_T->mtx), + mutex_unlock(&_T->mtx)) + +struct wiphy_work; +typedef void (*wiphy_work_func_t)(struct wiphy *, struct wiphy_work *); + +struct wiphy_work { + struct list_head entry; + wiphy_work_func_t func; +}; + +static inline void wiphy_work_init(struct wiphy_work *work, + wiphy_work_func_t func) +{ + INIT_LIST_HEAD(&work->entry); + work->func = func; +} + +/** + * wiphy_work_queue - queue work for the wiphy + * @wiphy: the wiphy to queue for + * @work: the work item + * + * This is useful for work that must be done asynchronously, and work + * queued here has the special property that the wiphy mutex will be + * held as if wiphy_lock() was called, and that it cannot be running + * after wiphy_lock() was called. Therefore, wiphy_work_cancel() can + * use just cancel_work() instead of cancel_work_sync(), but it requires + * being in a section protected by wiphy_lock(). + */ +void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work); + +/** + * wiphy_work_cancel - cancel previously queued work + * @wiphy: the wiphy, for debug purposes + * @work: the work to cancel + * + * Cancel the work *without* waiting for it; this assumes it is + * called under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work); + +/** + * wiphy_work_flush - flush previously queued work + * @wiphy: the wiphy, for debug purposes + * @work: the work to flush, this can be %NULL to flush all work + * + * Flush the work (i.e. run it if pending). This must be called + * under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_work_flush(struct wiphy *wiphy, struct wiphy_work *work); + +struct wiphy_delayed_work { + struct wiphy_work work; + struct wiphy *wiphy; + struct timer_list timer; +}; + +void wiphy_delayed_work_timer(struct timer_list *t); + +static inline void wiphy_delayed_work_init(struct wiphy_delayed_work *dwork, + wiphy_work_func_t func) +{ + timer_setup(&dwork->timer, wiphy_delayed_work_timer, 0); + wiphy_work_init(&dwork->work, func); +} + +/** + * wiphy_delayed_work_queue - queue delayed work for the wiphy + * @wiphy: the wiphy to queue for + * @dwork: the delayable worker + * @delay: number of jiffies to wait before queueing + * + * This is useful for work that must be done asynchronously, and work + * queued here has the special property that the wiphy mutex will be + * held as if wiphy_lock() was called, and that it cannot be running + * after wiphy_lock() was called. Therefore, wiphy_delayed_work_cancel() + * can use just cancel_work() instead of cancel_work_sync(), but it + * requires being in a section protected by wiphy_lock(). + */ +void wiphy_delayed_work_queue(struct wiphy *wiphy, + struct wiphy_delayed_work *dwork, + unsigned long delay); + +/** + * wiphy_delayed_work_cancel - cancel previously queued delayed work + * @wiphy: the wiphy, for debug purposes + * @dwork: the delayed work to cancel + * + * Cancel the work *without* waiting for it; this assumes it is + * called under the wiphy mutex acquired by wiphy_lock(). 
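+ *
+ * Editorial example (a sketch): a periodic poll that is re-armed from its
+ * own work function and torn down from a wiphy-locked section; drv_poll()
+ * and the poll_wk member are hypothetical:
+ *
+ *	wiphy_delayed_work_init(&drv->poll_wk, drv_poll);
+ *	wiphy_delayed_work_queue(wiphy, &drv->poll_wk, HZ);
+ *	...
+ *	wiphy_delayed_work_cancel(wiphy, &drv->poll_wk);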
+ */ +void wiphy_delayed_work_cancel(struct wiphy *wiphy, + struct wiphy_delayed_work *dwork); + +/** + * wiphy_delayed_work_flush - flush previously queued delayed work + * @wiphy: the wiphy, for debug purposes + * @dwork: the delayed work to flush + * + * Flush the work (i.e. run it if pending). This must be called + * under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_delayed_work_flush(struct wiphy *wiphy, + struct wiphy_delayed_work *dwork); + +/** + * wiphy_delayed_work_pending - Find out whether a wiphy delayable + * work item is currently pending. + * + * @wiphy: the wiphy, for debug purposes + * @dwork: the delayed work in question + * + * Return: true if timer is pending, false otherwise + * + * How wiphy_delayed_work_queue() works is by setting a timer which + * when it expires calls wiphy_work_queue() to queue the wiphy work. + * Because wiphy_delayed_work_queue() uses mod_timer(), if it is + * called twice and the second call happens before the first call + * deadline, the work will be rescheduled for the second deadline and + * won't run before that. + * + * wiphy_delayed_work_pending() can be used to detect if calling + * wiphy_delayed_work_queue() would start a new work schedule + * or delay a previous one. As seen below it cannot be used to + * detect precisely whether the work has finished executing or is + * currently executing. + * + * CPU0 CPU1 + * wiphy_delayed_work_queue(wk) + * mod_timer(wk->timer) + * wiphy_delayed_work_pending(wk) -> true + * + * [...] + * expire_timers(wk->timer) + * detach_timer(wk->timer) + * wiphy_delayed_work_pending(wk) -> false + * wk->timer->function() | + * wiphy_work_queue(wk) | delayed work pending + * list_add_tail() | returns false but + * queue_work(cfg80211_wiphy_work) | wk->func() has not + * | been run yet + * [...] | + * cfg80211_wiphy_work() | + * wk->func() V + * + */ +bool wiphy_delayed_work_pending(struct wiphy *wiphy, + struct wiphy_delayed_work *dwork); + +/** + * enum ieee80211_ap_reg_power - regulatory power for an Access Point + * + * @IEEE80211_REG_UNSET_AP: Access Point has no regulatory power mode + * @IEEE80211_REG_LPI_AP: Indoor Access Point + * @IEEE80211_REG_SP_AP: Standard power Access Point + * @IEEE80211_REG_VLP_AP: Very low power Access Point + */ +enum ieee80211_ap_reg_power { + IEEE80211_REG_UNSET_AP, + IEEE80211_REG_LPI_AP, + IEEE80211_REG_SP_AP, + IEEE80211_REG_VLP_AP, +}; + +/** * struct wireless_dev - wireless device state * * For netdevs, this structure must be allocated by the driver * that uses the ieee80211_ptr field in struct net_device (this * is intentional so it can be allocated along with the netdev.) * It need not be registered then as netdev registration will - * be intercepted by cfg80211 to see the new wireless device. + * be intercepted by cfg80211 to see the new wireless device; + * however, drivers must lock the wiphy before registering or + * unregistering netdevs if they pre-create any netdevs (in ops + * called from cfg80211, the wiphy is already locked.) 
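+ *
+ * For illustration (an editorial sketch; struct drv_priv and drv_setup
+ * are hypothetical), such a driver might set this up as:
+ *
+ *	dev = alloc_netdev(sizeof(struct drv_priv), "wlan%d",
+ *			   NET_NAME_ENUM, drv_setup);
+ *	priv = netdev_priv(dev);
+ *	priv->wdev.wiphy = wiphy;
+ *	priv->wdev.iftype = NL80211_IFTYPE_STATION;
+ *	dev->ieee80211_ptr = &priv->wdev;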
* * For non-netdev uses, it must also be allocated by the driver * in response to the cfg80211 callbacks that require it, as @@ -5171,20 +6244,15 @@ struct cfg80211_cqm_config; * * @wiphy: pointer to hardware description * @iftype: interface type + * @registered: is this wdev already registered with cfg80211 + * @registering: indicates we're doing registration under wiphy lock + * for the notifier * @list: (private) Used to collect the interfaces * @netdev: (private) Used to reference back to the netdev, may be %NULL * @identifier: (private) Identifier used in nl80211 to identify this * wireless device if it has no netdev - * @current_bss: (private) Used by the internal configuration code - * @chandef: (private) Used by the internal configuration code to track - * the user-set channel definition. - * @preset_chandef: (private) Used by the internal configuration code to - * track the channel to be used for AP later - * @bssid: (private) Used by the internal configuration code - * @ssid: (private) Used by the internal configuration code - * @ssid_len: (private) Used by the internal configuration code - * @mesh_id_len: (private) Used by the internal configuration code - * @mesh_id_up_len: (private) Used by the internal configuration code + * @u: union containing data specific to @iftype + * @connected: indicates if connected or not (STA mode) * @wext: (private) Used by the internal wireless extensions compat code * @wext.ibss: (private) IBSS data part of wext handling * @wext.connect: (private) connection handling data @@ -5202,19 +6270,11 @@ struct cfg80211_cqm_config; * netdev and may otherwise be used by driver read-only, will be update * by cfg80211 on change_interface * @mgmt_registrations: list of registrations for management frames - * @mgmt_registrations_lock: lock for the list * @mgmt_registrations_need_update: mgmt registrations were updated, * need to propagate the update to the driver - * @mtx: mutex used to lock data in this struct, may be used by drivers - * and some API functions require it held - * @beacon_interval: beacon interval used on this device for transmitting - * beacons, 0 when not valid * @address: The address for this device, valid only if @netdev is %NULL * @is_running: true if this is a non-netdev device that has been started, e.g. * the P2P Device. - * @cac_started: true if DFS channel availability check has been started - * @cac_start_time: timestamp (jiffies) when the dfs state was entered. 
- * @cac_time_ms: CAC time in ms * @ps: powersave mode is enabled * @ps_timeout: dynamic powersave timeout * @ap_unexpected_nlportid: (private) netlink port ID of application @@ -5225,18 +6285,26 @@ struct cfg80211_cqm_config; * @conn_owner_nlportid: (private) connection owner socket port ID * @disconnect_wk: (private) auto-disconnect work * @disconnect_bssid: (private) the BSSID to use for auto-disconnect - * @ibss_fixed: (private) IBSS is using fixed BSSID - * @ibss_dfs_possible: (private) IBSS may change to a DFS channel * @event_list: (private) list for internal event processing * @event_lock: (private) lock for event list * @owner_nlportid: (private) owner socket port ID * @nl_owner_dead: (private) owner socket went away + * @cqm_rssi_work: (private) CQM RSSI reporting work * @cqm_config: (private) nl80211 RSSI monitor state * @pmsr_list: (private) peer measurement requests * @pmsr_lock: (private) peer measurements requests/results lock * @pmsr_free_wk: (private) peer measurements cleanup work * @unprot_beacon_reported: (private) timestamp of last * unprotected beacon report + * @links: array of %IEEE80211_MLD_MAX_NUM_LINKS elements containing @addr + * @ap and @client for each link + * @links.cac_started: true if DFS channel availability check has been + * started + * @links.cac_start_time: timestamp (jiffies) when the dfs state was + * entered. + * @links.cac_time_ms: CAC time in ms + * @valid_links: bitmap describing what elements of @links are valid + * @radio_mask: Bitmask of radios that this interface is allowed to operate on. */ struct wireless_dev { struct wiphy *wiphy; @@ -5249,18 +6317,13 @@ struct wireless_dev { u32 identifier; struct list_head mgmt_registrations; - spinlock_t mgmt_registrations_lock; u8 mgmt_registrations_need_update:1; - struct mutex mtx; - - bool use_4addr, is_running; + bool use_4addr, is_running, registered, registering; u8 address[ETH_ALEN] __aligned(sizeof(u16)); /* currently used for IBSS and SME - might be rearranged later */ - u8 ssid[IEEE80211_MAX_SSID_LEN]; - u8 ssid_len, mesh_id_len, mesh_id_up_len; struct cfg80211_conn *conn; struct cfg80211_cached_keys *connect_keys; enum ieee80211_bss_type conn_bss_type; @@ -5272,27 +6335,16 @@ struct wireless_dev { struct list_head event_list; spinlock_t event_lock; - struct cfg80211_internal_bss *current_bss; /* associated / joined */ - struct cfg80211_chan_def preset_chandef; - struct cfg80211_chan_def chandef; - - bool ibss_fixed; - bool ibss_dfs_possible; + u8 connected:1; bool ps; int ps_timeout; - int beacon_interval; - u32 ap_unexpected_nlportid; u32 owner_nlportid; bool nl_owner_dead; - bool cac_started; - unsigned long cac_start_time; - unsigned int cac_time_ms; - #ifdef CONFIG_CFG80211_WEXT /* wext data */ struct { @@ -5309,16 +6361,67 @@ struct wireless_dev { } wext; #endif - struct cfg80211_cqm_config *cqm_config; + struct wiphy_work cqm_rssi_work; + struct cfg80211_cqm_config __rcu *cqm_config; struct list_head pmsr_list; spinlock_t pmsr_lock; struct work_struct pmsr_free_wk; unsigned long unprot_beacon_reported; -}; -static inline u8 *wdev_address(struct wireless_dev *wdev) + union { + struct { + u8 connected_addr[ETH_ALEN] __aligned(2); + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + } client; + struct { + int beacon_interval; + struct cfg80211_chan_def preset_chandef; + struct cfg80211_chan_def chandef; + u8 id[IEEE80211_MAX_MESH_ID_LEN]; + u8 id_len, id_up_len; + } mesh; + struct { + struct cfg80211_chan_def preset_chandef; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + } ap; + 
struct { + struct cfg80211_internal_bss *current_bss; + struct cfg80211_chan_def chandef; + int beacon_interval; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + } ibss; + struct { + struct cfg80211_chan_def chandef; + } ocb; + } u; + + struct { + u8 addr[ETH_ALEN] __aligned(2); + union { + struct { + unsigned int beacon_interval; + struct cfg80211_chan_def chandef; + } ap; + struct { + struct cfg80211_internal_bss *current_bss; + } client; + }; + + bool cac_started; + unsigned long cac_start_time; + unsigned int cac_time_ms; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; + u16 valid_links; + + u32 radio_mask; +}; + +static inline const u8 *wdev_address(struct wireless_dev *wdev) { if (wdev->netdev) return wdev->netdev->dev_addr; @@ -5345,6 +6448,32 @@ static inline void *wdev_priv(struct wireless_dev *wdev) } /** + * wdev_chandef - return chandef pointer from wireless_dev + * @wdev: the wdev + * @link_id: the link ID for MLO + * + * Return: The chandef depending on the mode, or %NULL. + */ +struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev, + unsigned int link_id); + +static inline void WARN_INVALID_LINK_ID(struct wireless_dev *wdev, + unsigned int link_id) +{ + WARN_ON(link_id && !wdev->valid_links); + WARN_ON(wdev->valid_links && + !(wdev->valid_links & BIT(link_id))); +} + +#define for_each_valid_link(link_info, link_id) \ + for (link_id = 0; \ + link_id < ((link_info)->valid_links ? \ + ARRAY_SIZE((link_info)->links) : 1); \ + link_id++) \ + if (!(link_info)->valid_links || \ + ((link_info)->valid_links & BIT(link_id))) + +/** * DOC: Utility functions * * cfg80211 offers a number of utility functions that can be useful. @@ -5453,6 +6582,8 @@ ieee80211_get_channel(struct wiphy *wiphy, int freq) * * The Preferred Scanning Channels (PSC) are defined in * Draft IEEE P802.11ax/D5.0, 26.17.2.3.3 + * + * Return: %true if channel is a PSC, %false otherwise */ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) { @@ -5463,6 +6594,28 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) } /** + * cfg80211_radio_chandef_valid - Check if the radio supports the chandef + * + * @radio: wiphy radio + * @chandef: chandef for current channel + * + * Return: whether or not the given chandef is valid for the given radio + */ +bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, + const struct cfg80211_chan_def *chandef); + +/** + * cfg80211_wdev_channel_allowed - Check if the wdev may use the channel + * + * @wdev: the wireless device + * @chan: channel to check + * + * Return: whether or not the wdev may use the channel + */ +bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan); + +/** * ieee80211_get_response_rate - get basic rate for a given rate * * @sband: the band to look for rates in @@ -5474,20 +6627,18 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) * which is, for this function, given as a bitmap of indices of * rates in the band's bitrate table. */ -struct ieee80211_rate * +const struct ieee80211_rate * ieee80211_get_response_rate(struct ieee80211_supported_band *sband, u32 basic_rates, int bitrate); /** * ieee80211_mandatory_rates - get mandatory rates for a given band * @sband: the band to look for rates in - * @scan_width: width of the control channel * - * This function returns a bitmap of the mandatory rates for the given - * band, bits are set according to the rate position in the bitrates array. 
+ * Return: a bitmap of the mandatory rates for the given band, bits + * are set according to the rate position in the bitrates array. */ -u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, - enum nl80211_bss_scan_width scan_width); +u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband); /* * Radiotap parsing functions -- for controlled injection support @@ -5620,11 +6771,12 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr); * @addr: the device MAC address * @iftype: the virtual interface type * @data_offset: offset of payload after the 802.11 header + * @is_amsdu: true if the 802.11 header is A-MSDU * Return: 0 on success. Non-zero on error. */ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, const u8 *addr, enum nl80211_iftype iftype, - u8 data_offset); + u8 data_offset, bool is_amsdu); /** * ieee80211_data_to_8023 - convert an 802.11 data frame to 802.3 @@ -5636,10 +6788,26 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, enum nl80211_iftype iftype) { - return ieee80211_data_to_8023_exthdr(skb, NULL, addr, iftype, 0); + return ieee80211_data_to_8023_exthdr(skb, NULL, addr, iftype, 0, false); } /** + * ieee80211_is_valid_amsdu - check if subframe lengths of an A-MSDU are valid + * + * This is used to detect non-standard A-MSDU frames, e.g. the ones generated + * by ath10k and ath11k, where the subframe length includes the length of the + * mesh control field. + * + * @skb: The input A-MSDU frame without any headers. + * @mesh_hdr: the type of mesh header to test + * 0: non-mesh A-MSDU length field + * 1: big-endian mesh A-MSDU length field + * 2: little-endian mesh A-MSDU length field + * Returns: true if subframe header lengths are valid for the @mesh_hdr mode + */ +bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr); + +/** * ieee80211_amsdu_to_8023s - decode an IEEE 802.11n A-MSDU frame * * Decode an IEEE 802.11 A-MSDU and convert it to a list of 802.3 frames. @@ -5654,11 +6822,38 @@ static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, * @extra_headroom: The hardware extra headroom for SKBs in the @list. * @check_da: DA to check in the inner ethernet header, or NULL * @check_sa: SA to check in the inner ethernet header, or NULL + * @mesh_control: see mesh_hdr in ieee80211_is_valid_amsdu */ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, - const u8 *check_da, const u8 *check_sa); + const u8 *check_da, const u8 *check_sa, + u8 mesh_control); + +/** + * ieee80211_get_8023_tunnel_proto - get RFC1042 or bridge tunnel encap protocol + * + * Check for RFC1042 or bridge tunnel header and fetch the encapsulated + * protocol. + * + * @hdr: pointer to the MSDU payload + * @proto: destination pointer to store the protocol + * Return: true if encapsulation was found + */ +bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto); + +/** + * ieee80211_strip_8023_mesh_hdr - strip mesh header from converted 802.3 frames + * + * Strip the mesh header, which was left in by ieee80211_data_to_8023 as part + * of the MSDU data. Also move any source/destination addresses from the mesh + * header to the ethernet header (if present). + * + * @skb: The 802.3 frame with embedded mesh header + * + * Return: 0 on success. Non-zero on error. 
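+ *
+ * Editorial example (a sketch of the surrounding mesh RX path):
+ *
+ *	ret = ieee80211_data_to_8023(skb, addr, NL80211_IFTYPE_MESH_POINT);
+ *	if (!ret)
+ *		ret = ieee80211_strip_8023_mesh_hdr(skb);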
+ */ +int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb); /** * cfg80211_classify8021d - determine the 802.1p/1d tag for a data frame @@ -5732,9 +6927,9 @@ cfg80211_find_ie_match(u8 eid, const u8 *ies, unsigned int len, (!match_len && match_offset))) return NULL; - return (void *)cfg80211_find_elem_match(eid, ies, len, - match, match_len, - match_offset ? + return (const void *)cfg80211_find_elem_match(eid, ies, len, + match, match_len, + match_offset ? match_offset - 2 : 0); } @@ -5786,7 +6981,7 @@ static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) * @ies: data consisting of IEs * @len: length of data * - * Return: %NULL if the etended element could not be found or if + * Return: %NULL if the extended element could not be found or if * the element is invalid (claims to be longer than the given * data) or if the byte array doesn't match; otherwise return the * requested element struct. @@ -5861,10 +7056,64 @@ static inline const u8 * cfg80211_find_vendor_ie(unsigned int oui, int oui_type, const u8 *ies, unsigned int len) { - return (void *)cfg80211_find_vendor_elem(oui, oui_type, ies, len); + return (const void *)cfg80211_find_vendor_elem(oui, oui_type, ies, len); } /** + * enum cfg80211_rnr_iter_ret - reduced neighbor report iteration state + * @RNR_ITER_CONTINUE: continue iterating with the next entry + * @RNR_ITER_BREAK: break iteration and return success + * @RNR_ITER_ERROR: break iteration and return error + */ +enum cfg80211_rnr_iter_ret { + RNR_ITER_CONTINUE, + RNR_ITER_BREAK, + RNR_ITER_ERROR, +}; + +/** + * cfg80211_iter_rnr - iterate reduced neighbor report entries + * @elems: the frame elements to iterate RNR elements and then + * their entries in + * @elems_len: length of the elements + * @iter: iteration function, see also &enum cfg80211_rnr_iter_ret + * for the return value + * @iter_data: additional data passed to the iteration function + * Return: %true on success (after successfully iterating all entries + * or if the iteration function returned %RNR_ITER_BREAK), + * %false on error (iteration function returned %RNR_ITER_ERROR + * or elements were malformed.) + */ +bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len, + enum cfg80211_rnr_iter_ret + (*iter)(void *data, u8 type, + const struct ieee80211_neighbor_ap_info *info, + const u8 *tbtt_info, u8 tbtt_info_len), + void *iter_data); + +/** + * cfg80211_defragment_element - Defrag the given element data into a buffer + * + * @elem: the element to defragment + * @ies: elements where @elem is contained + * @ieslen: length of @ies + * @data: buffer to store element data, or %NULL to just determine size + * @data_len: length of @data, or 0 + * @frag_id: the element ID of fragments + * + * Return: length of @data, or -EINVAL on error + * + * Copy out all data from an element that may be fragmented into @data, while + * skipping all headers. + * + * The function uses memmove() internally. It is acceptable to defragment an + * element in-place. 
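+ *
+ * Editorial example (a sketch): pass %NULL first to determine the size,
+ * then defragment into a buffer of that size:
+ *
+ *	len = cfg80211_defragment_element(elem, ies, ieslen, NULL, 0,
+ *					  WLAN_EID_FRAGMENT);
+ *	if (len > 0) {
+ *		u8 *buf = kmalloc(len, GFP_KERNEL);
+ *
+ *		if (buf)
+ *			len = cfg80211_defragment_element(elem, ies, ieslen,
+ *							  buf, len,
+ *							  WLAN_EID_FRAGMENT);
+ *	}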
+ */ +ssize_t cfg80211_defragment_element(const struct element *elem, const u8 *ies, + size_t ieslen, u8 *data, size_t data_len, + u8 frag_id); + +/** * cfg80211_send_layer2_update - send layer 2 update frame * * @dev: network device @@ -5911,7 +7160,7 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2); /** * regulatory_set_wiphy_regd - set regdom info for self managed drivers * @wiphy: the wireless device we want to process the regulatory domain on - * @rd: the regulatory domain informatoin to use for this wiphy + * @rd: the regulatory domain information to use for this wiphy * * Set the regulatory domain information for self-managed wiphys; only they * may use this function. See %REGULATORY_WIPHY_SELF_MANAGED for more @@ -5923,18 +7172,18 @@ int regulatory_set_wiphy_regd(struct wiphy *wiphy, struct ieee80211_regdomain *rd); /** - * regulatory_set_wiphy_regd_sync_rtnl - set regdom for self-managed drivers + * regulatory_set_wiphy_regd_sync - set regdom for self-managed drivers * @wiphy: the wireless device we want to process the regulatory domain on * @rd: the regulatory domain information to use for this wiphy * - * This functions requires the RTNL to be held and applies the new regdomain - * synchronously to this wiphy. For more details see - * regulatory_set_wiphy_regd(). + * This function requires the RTNL and the wiphy mutex to be held and + * applies the new regdomain synchronously to this wiphy. For more details + * see regulatory_set_wiphy_regd(). * * Return: 0 on success; -EINVAL or -EPERM on failure */ -int regulatory_set_wiphy_regd_sync_rtnl(struct wiphy *wiphy, - struct ieee80211_regdomain *rd); +int regulatory_set_wiphy_regd_sync(struct wiphy *wiphy, + struct ieee80211_regdomain *rd); /** * wiphy_apply_custom_regulatory - apply a custom driver regulatory domain @@ -5981,6 +7230,8 @@ const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, * * You can use this to map the regulatory request initiator enum to a * proper string representation. + * + * Return: pointer to string representation of the initiator */ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); @@ -5989,6 +7240,8 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); * @wiphy: wiphy for which pre-CAC capability is checked. * * Pre-CAC is allowed only in some regdomains (notably ETSI). + * + * Return: %true if allowed, %false otherwise */ bool regulatory_pre_cac_allowed(struct wiphy *wiphy); @@ -6002,7 +7255,7 @@ bool regulatory_pre_cac_allowed(struct wiphy *wiphy); * Regulatory self-managed drivers can use it to proactively query the * WMM rule for a given country code and frequency. * * @alpha2: the ISO/IEC 3166 alpha2 wmm rule to be queried. - * @freq: the freqency(in MHz) to be queried. + * @freq: the frequency (in MHz) to be queried. * @rule: pointer to store the wmm rule from the regulatory db. * * Self-managed wireless drivers can use this function to query @@ -6052,7 +7305,7 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid); void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid); /** - * cfg80211_sched_scan_stopped_rtnl - notify that the scheduled scan has stopped + * cfg80211_sched_scan_stopped_locked - notify that the scheduled scan has stopped * * @wiphy: the wiphy on which the scheduled scan stopped * @reqid: identifier for the related scheduled scan request * * The driver can call this function to inform cfg80211 that the * scheduled scan had to be stopped, for whatever reason. 
The driver * is then called back via the sched_scan_stop operation when done. - * This function should be called with rtnl locked. + * This function should be called with the wiphy mutex held. */ -void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy, u64 reqid); +void cfg80211_sched_scan_stopped_locked(struct wiphy *wiphy, u64 reqid); /** * cfg80211_inform_bss_frame_data - inform cfg80211 of a received BSS frame @@ -6085,22 +7338,6 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, gfp_t gfp); static inline struct cfg80211_bss * __must_check -cfg80211_inform_bss_width_frame(struct wiphy *wiphy, - struct ieee80211_channel *rx_channel, - enum nl80211_bss_scan_width scan_width, - struct ieee80211_mgmt *mgmt, size_t len, - s32 signal, gfp_t gfp) -{ - struct cfg80211_inform_bss data = { - .chan = rx_channel, - .scan_width = scan_width, - .signal = signal, - }; - - return cfg80211_inform_bss_frame_data(wiphy, &data, mgmt, len, gfp); -} - -static inline struct cfg80211_bss * __must_check cfg80211_inform_bss_frame(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, struct ieee80211_mgmt *mgmt, size_t len, @@ -6108,7 +7345,6 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, { struct cfg80211_inform_bss data = { .chan = rx_channel, - .scan_width = NL80211_BSS_CHAN_WIDTH_20, .signal = signal, }; @@ -6140,6 +7376,8 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid, * cfg80211_is_element_inherited - returns if element ID should be inherited * @element: element to check * @non_inherit_element: non inheritance element + * + * Return: %true if should be inherited, %false otherwise */ bool cfg80211_is_element_inherited(const struct element *element, const struct element *non_inherit_element); @@ -6152,6 +7390,8 @@ bool cfg80211_is_element_inherited(const struct element *element, * @sub_elem: current MBSSID subelement (profile) * @merged_ie: location of the merged profile * @max_copy_len: max merged profile length + * + * Return: the number of bytes merged */ size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, const struct element *mbssid_elem, @@ -6164,14 +7404,44 @@ size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, * from a beacon or probe response * @CFG80211_BSS_FTYPE_BEACON: data comes from a beacon * @CFG80211_BSS_FTYPE_PRESP: data comes from a probe response + * @CFG80211_BSS_FTYPE_S1G_BEACON: data comes from an S1G beacon */ enum cfg80211_bss_frame_type { CFG80211_BSS_FTYPE_UNKNOWN, CFG80211_BSS_FTYPE_BEACON, CFG80211_BSS_FTYPE_PRESP, + CFG80211_BSS_FTYPE_S1G_BEACON, }; /** + * cfg80211_get_ies_channel_number - returns the channel number from ies + * @ie: IEs + * @ielen: length of IEs + * @band: enum nl80211_band of the channel + * + * Return: the channel number, or -1 if none could be determined. + */ +int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen, + enum nl80211_band band); + +/** + * cfg80211_ssid_eq - compare two SSIDs + * @a: first SSID + * @b: second SSID + * + * Return: %true if SSIDs are equal, %false otherwise. + */ +static inline bool +cfg80211_ssid_eq(struct cfg80211_ssid *a, struct cfg80211_ssid *b) +{ + if (WARN_ON(!a || !b)) + return false; + if (a->ssid_len != b->ssid_len) + return false; + return memcmp(a->ssid, b->ssid, a->ssid_len) ? 
false : true; +} + +/** * cfg80211_inform_bss_data - inform cfg80211 of a new BSS * * @wiphy: the wiphy reporting the BSS @@ -6200,26 +7470,6 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, gfp_t gfp); static inline struct cfg80211_bss * __must_check -cfg80211_inform_bss_width(struct wiphy *wiphy, - struct ieee80211_channel *rx_channel, - enum nl80211_bss_scan_width scan_width, - enum cfg80211_bss_frame_type ftype, - const u8 *bssid, u64 tsf, u16 capability, - u16 beacon_interval, const u8 *ie, size_t ielen, - s32 signal, gfp_t gfp) -{ - struct cfg80211_inform_bss data = { - .chan = rx_channel, - .scan_width = scan_width, - .signal = signal, - }; - - return cfg80211_inform_bss_data(wiphy, &data, ftype, bssid, tsf, - capability, beacon_interval, ie, ielen, - gfp); -} - -static inline struct cfg80211_bss * __must_check cfg80211_inform_bss(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, enum cfg80211_bss_frame_type ftype, @@ -6229,7 +7479,6 @@ cfg80211_inform_bss(struct wiphy *wiphy, { struct cfg80211_inform_bss data = { .chan = rx_channel, - .scan_width = NL80211_BSS_CHAN_WIDTH_20, .signal = signal, }; @@ -6239,6 +7488,27 @@ cfg80211_inform_bss(struct wiphy *wiphy, } /** + * __cfg80211_get_bss - get a BSS reference + * @wiphy: the wiphy this BSS struct belongs to + * @channel: the channel to search on (or %NULL) + * @bssid: the desired BSSID (or %NULL) + * @ssid: the desired SSID (or %NULL) + * @ssid_len: length of the SSID (or 0) + * @bss_type: type of BSS, see &enum ieee80211_bss_type + * @privacy: privacy filter, see &enum ieee80211_privacy + * @use_for: indicates which use is intended + * + * Return: Reference-counted BSS on success. %NULL on error. + */ +struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy, + u32 use_for); + +/** * cfg80211_get_bss - get a BSS reference * @wiphy: the wiphy this BSS struct belongs to * @channel: the channel to search on (or %NULL) @@ -6247,13 +7517,22 @@ cfg80211_inform_bss(struct wiphy *wiphy, * @ssid_len: length of the SSID (or 0) * @bss_type: type of BSS, see &enum ieee80211_bss_type * @privacy: privacy filter, see &enum ieee80211_privacy + * + * This version implies regular usage, %NL80211_BSS_USE_FOR_NORMAL. + * + * Return: Reference-counted BSS on success. %NULL on error. 
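+ *
+ * Editorial example (a sketch): the returned reference must be released
+ * with cfg80211_put_bss():
+ *
+ *	bss = cfg80211_get_bss(wiphy, chan, bssid, NULL, 0,
+ *			       IEEE80211_BSS_TYPE_ESS,
+ *			       IEEE80211_PRIVACY_ANY);
+ *	if (bss) {
+ *		... use bss ...
+ *		cfg80211_put_bss(wiphy, bss);
+ *	}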
*/ -struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, - struct ieee80211_channel *channel, - const u8 *bssid, - const u8 *ssid, size_t ssid_len, - enum ieee80211_bss_type bss_type, - enum ieee80211_privacy privacy); +static inline struct cfg80211_bss * +cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, + const u8 *bssid, const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy) +{ + return __cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, + bss_type, privacy, + NL80211_BSS_USE_FOR_NORMAL); +} + static inline struct cfg80211_bss * cfg80211_get_ibss(struct wiphy *wiphy, struct ieee80211_channel *channel, @@ -6314,19 +7593,6 @@ void cfg80211_bss_iter(struct wiphy *wiphy, void *data), void *iter_data); -static inline enum nl80211_bss_scan_width -cfg80211_chandef_to_scan_width(const struct cfg80211_chan_def *chandef) -{ - switch (chandef->width) { - case NL80211_CHAN_WIDTH_5: - return NL80211_BSS_CHAN_WIDTH_5; - case NL80211_CHAN_WIDTH_10: - return NL80211_BSS_CHAN_WIDTH_10; - default: - return NL80211_BSS_CHAN_WIDTH_20; - } -} - /** * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame * @dev: network device @@ -6359,16 +7625,39 @@ void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); /** - * cfg80211_rx_assoc_resp - notification of processed association response - * @dev: network device - * @bss: the BSS that association was requested with, ownership of the pointer - * moves to cfg80211 in this call + * struct cfg80211_rx_assoc_resp_data - association response data * @buf: (Re)Association Response frame (header + body) * @len: length of the frame data * @uapsd_queues: bitmap of queues configured for uapsd. Same format * as the AC bitmap in the QoS info field * @req_ies: information elements from the (Re)Association Request frame * @req_ies_len: length of req_ies data + * @ap_mld_addr: AP MLD address (in case of MLO) + * @links: per-link information indexed by link ID, use links[0] for + * non-MLO connections + * @links.bss: the BSS that association was requested with, ownership of the + * pointer moves to cfg80211 in the call to cfg80211_rx_assoc_resp() + * @links.status: Set this (along with a BSS pointer) for links that + * were rejected by the AP. + */ +struct cfg80211_rx_assoc_resp_data { + const u8 *buf; + size_t len; + const u8 *req_ies; + size_t req_ies_len; + int uapsd_queues; + const u8 *ap_mld_addr; + struct { + u8 addr[ETH_ALEN] __aligned(2); + struct cfg80211_bss *bss; + u16 status; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; +}; + +/** + * cfg80211_rx_assoc_resp - notification of processed association response + * @dev: network device + * @data: association response data, &struct cfg80211_rx_assoc_resp_data * * After being asked to associate via cfg80211_ops::assoc() the driver must * call either this function or cfg80211_auth_timeout(). @@ -6376,43 +7665,47 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); * This function may sleep. The caller must hold the corresponding wdev's mutex. */ void cfg80211_rx_assoc_resp(struct net_device *dev, - struct cfg80211_bss *bss, - const u8 *buf, size_t len, - int uapsd_queues, - const u8 *req_ies, size_t req_ies_len); + const struct cfg80211_rx_assoc_resp_data *data); /** - * cfg80211_assoc_timeout - notification of timed out association - * @dev: network device - * @bss: The BSS entry with which association timed out. 
- * - * This function may sleep. The caller must hold the corresponding wdev's mutex. + * struct cfg80211_assoc_failure - association failure data + * @ap_mld_addr: AP MLD address, or %NULL + * @bss: list of BSSes, must use entry 0 for non-MLO connections + * (@ap_mld_addr is %NULL) + * @timeout: indicates the association failed due to timeout, otherwise + * the association was abandoned for a reason reported through some + * other API (e.g. deauth RX) */ -void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss); +struct cfg80211_assoc_failure { + const u8 *ap_mld_addr; + struct cfg80211_bss *bss[IEEE80211_MLD_MAX_NUM_LINKS]; + bool timeout; +}; /** - * cfg80211_abandon_assoc - notify cfg80211 of abandoned association attempt + * cfg80211_assoc_failure - notification of association failure * @dev: network device - * @bss: The BSS entry with which association was abandoned. + * @data: data describing the association failure * - * Call this whenever - for reasons reported through other API, like deauth RX, - * an association attempt was abandoned. * This function may sleep. The caller must hold the corresponding wdev's mutex. */ -void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss); +void cfg80211_assoc_failure(struct net_device *dev, + struct cfg80211_assoc_failure *data); /** * cfg80211_tx_mlme_mgmt - notification of transmitted deauth/disassoc frame * @dev: network device * @buf: 802.11 frame (header + body) * @len: length of the frame data + * @reconnect: immediate reconnect is desired (include the nl80211 attribute) * * This function is called whenever deauthentication has been processed in * station mode. This includes both received deauthentication frames and * locally generated ones. This function may sleep. The caller must hold the * corresponding wdev's mutex. */ -void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); +void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len, + bool reconnect); /** * cfg80211_rx_unprot_mlme_mgmt - notification of unprotected mlme mgmt frame @@ -6467,12 +7760,14 @@ void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, struct ieee80211_channel *channel, gfp_t gfp); /** - * cfg80211_notify_new_candidate - notify cfg80211 of a new mesh peer candidate + * cfg80211_notify_new_peer_candidate - notify cfg80211 of a new mesh peer + * candidate * * @dev: network device * @macaddr: the MAC address of the new candidate * @ie: information elements advertised by the peer candidate * @ie_len: length of the information elements buffer + * @sig_dbm: signal level in dBm * @gfp: allocation flags * * This function notifies cfg80211 that the mesh peer candidate has been @@ -6489,7 +7784,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, * RFkill integration in cfg80211 is almost invisible to drivers, * as cfg80211 automatically registers an rfkill instance for each * wireless device it knows about. Soft kill is also translated - * into disconnecting and turning all interfaces off, drivers are + * into disconnecting and turning all interfaces off. Drivers are * expected to turn off the device when all interfaces are down. 
* * However, devices may have a hard RFkill line, in which case they @@ -6498,11 +7793,19 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, */ /** - * wiphy_rfkill_set_hw_state - notify cfg80211 about hw block state + * wiphy_rfkill_set_hw_state_reason - notify cfg80211 about hw block state * @wiphy: the wiphy * @blocked: block status + * @reason: one of reasons in &enum rfkill_hard_block_reasons */ -void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked); +void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, + enum rfkill_hard_block_reasons reason); + +static inline void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked) +{ + wiphy_rfkill_set_hw_state_reason(wiphy, blocked, + RFKILL_HARD_BLOCK_SIGNAL); +} /** * wiphy_rfkill_start_polling - start polling rfkill @@ -6514,7 +7817,10 @@ void wiphy_rfkill_start_polling(struct wiphy *wiphy); * wiphy_rfkill_stop_polling - stop polling rfkill * @wiphy: the wiphy */ -void wiphy_rfkill_stop_polling(struct wiphy *wiphy); +static inline void wiphy_rfkill_stop_polling(struct wiphy *wiphy) +{ + rfkill_pause_polling(wiphy->rfkill); +} /** * DOC: Vendor commands @@ -6526,7 +7832,7 @@ void wiphy_rfkill_stop_polling(struct wiphy *wiphy); * the configuration mechanism. * * A driver supporting vendor commands must register them as an array - * in struct wiphy, with handlers for each one, each command has an + * in struct wiphy, with handlers for each one. Each command has an * OUI and sub command ID to identify it. * * Note that this feature should not be (ab)used to implement protocol @@ -6596,11 +7902,12 @@ cfg80211_vendor_cmd_alloc_reply_skb(struct wiphy *wiphy, int approxlen) int cfg80211_vendor_cmd_reply(struct sk_buff *skb); /** - * cfg80211_vendor_cmd_get_sender + * cfg80211_vendor_cmd_get_sender - get the current sender netlink ID * @wiphy: the wiphy * - * Return the current netlink port ID in a vendor command handler. - * Valid to call only there. + * Return: the current netlink port ID in a vendor command handler. + * + * Context: May only be called from a vendor command handler */ unsigned int cfg80211_vendor_cmd_get_sender(struct wiphy *wiphy); @@ -6690,7 +7997,7 @@ static inline void cfg80211_vendor_event(struct sk_buff *skb, gfp_t gfp) * interact with driver-specific tools to aid, for instance, * factory programming. * - * This chapter describes how drivers interact with it, for more + * This chapter describes how drivers interact with it. For more * information see the nl80211 book's chapter on it. */ @@ -6826,13 +8133,6 @@ struct cfg80211_fils_resp_params { * indicate that this is a failure, but without a status code. * @timeout_reason is used to report the reason for the timeout in that * case. - * @bssid: The BSSID of the AP (may be %NULL) - * @bss: Entry of bss to which STA got connected to, can be obtained through - * cfg80211_get_bss() (may be %NULL). But it is recommended to store the - * bss from the connect_request and hold a reference to it and return - * through this param to avoid a warning if the bss is expired during the - * connection, esp. for those drivers implementing connect op. - * Only one parameter among @bssid and @bss needs to be specified. * @req_ie: Association request IEs (may be %NULL) * @req_ie_len: Association request IEs length * @resp_ie: Association response IEs (may be %NULL) @@ -6844,17 +8144,45 @@ struct cfg80211_fils_resp_params { * not known. 
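As a usage sketch of the reply helpers above, a hypothetical vendor command handler; the attribute ID and value are placeholders, not real nl80211 definitions:

static int drv_vendor_get_version(struct wiphy *wiphy,
				  struct wireless_dev *wdev,
				  const void *data, int data_len)
{
	struct sk_buff *skb;

	skb = cfg80211_vendor_cmd_alloc_reply_skb(wiphy, sizeof(u32));
	if (!skb)
		return -ENOMEM;

	/* 1 is a placeholder vendor attribute ID */
	if (nla_put_u32(skb, 1, 42)) {
		kfree_skb(skb);
		return -ENOBUFS;
	}

	return cfg80211_vendor_cmd_reply(skb);
}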
This value is used only if @status < 0 to indicate that the * failure is due to a timeout and not due to explicit rejection by the AP. * This value is ignored in other cases (@status >= 0). + * @valid_links: For MLO connection, BIT mask of the valid link ids. Otherwise + * zero. + * @ap_mld_addr: For MLO connection, MLD address of the AP. Otherwise %NULL. + * @links: For MLO connection, contains link info for the valid links indicated + * using @valid_links. For non-MLO connection, links[0] contains the + * connected AP info. + * @links.addr: For MLO connection, MAC address of the STA link. Otherwise + * %NULL. + * @links.bssid: For MLO connection, MAC address of the AP link. For non-MLO + * connection, links[0].bssid points to the BSSID of the AP (may be %NULL). + * @links.bss: For MLO connection, entry of bss to which STA link is connected. + * For non-MLO connection, links[0].bss points to entry of bss to which STA + * is connected. It can be obtained through cfg80211_get_bss() (may be + * %NULL). It is recommended to store the bss from the connect_request and + * hold a reference to it and return through this param to avoid a warning + * if the bss is expired during the connection, esp. for those drivers + * implementing connect op. Only one parameter among @links.bssid and + * @links.bss needs to be specified. + * @links.status: per-link status code, to report a status code that's not + * %WLAN_STATUS_SUCCESS for a given link, it must also be in the + * @valid_links bitmap and may have a BSS pointer (which is then released) */ struct cfg80211_connect_resp_params { int status; - const u8 *bssid; - struct cfg80211_bss *bss; const u8 *req_ie; size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; struct cfg80211_fils_resp_params fils; enum nl80211_timeout_reason timeout_reason; + + const u8 *ap_mld_addr; + u16 valid_links; + struct { + const u8 *addr; + const u8 *bssid; + struct cfg80211_bss *bss; + u16 status; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; /** @@ -6924,8 +8252,8 @@ cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, memset(&params, 0, sizeof(params)); params.status = status; - params.bssid = bssid; - params.bss = bss; + params.links[0].bssid = bssid; + params.links[0].bss = bss; params.req_ie = req_ie; params.req_ie_len = req_ie_len; params.resp_ie = resp_ie; @@ -6996,24 +8324,40 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, /** * struct cfg80211_roam_info - driver initiated roaming information * - * @channel: the channel of the new AP - * @bss: entry of bss to which STA got roamed (may be %NULL if %bssid is set) - * @bssid: the BSSID of the new AP (may be %NULL if %bss is set) * @req_ie: association request IEs (may be %NULL) * @req_ie_len: association request IEs length * @resp_ie: association response IEs (may be %NULL) * @resp_ie_len: assoc response IEs length * @fils: FILS related roaming information. + * @valid_links: For MLO roaming, BIT mask of the new valid links is set. + * Otherwise zero. + * @ap_mld_addr: For MLO roaming, MLD address of the new AP. Otherwise %NULL. + * @links: For MLO roaming, contains new link info for the valid links set in + * @valid_links. For non-MLO roaming, links[0] contains the new AP info. + * @links.addr: For MLO roaming, MAC address of the STA link. Otherwise %NULL. + * @links.bssid: For MLO roaming, MAC address of the new AP link. For non-MLO + * roaming, links[0].bssid points to the BSSID of the new AP. May be + * %NULL if %links.bss is set. + * @links.channel: the channel of the new AP. 
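A sketch of reporting a successful non-MLO connection with the reworked per-link layout above; drv_report_connected() is hypothetical, while cfg80211_connect_done() is the existing entry point that consumes these params:

static void drv_report_connected(struct net_device *dev,
				 struct cfg80211_bss *bss, const u8 *bssid)
{
	struct cfg80211_connect_resp_params params = {
		.status = WLAN_STATUS_SUCCESS,
	};

	params.links[0].bssid = bssid;
	params.links[0].bss = bss;	/* ownership passes to cfg80211 */

	cfg80211_connect_done(dev, &params, GFP_KERNEL);
}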
+ * @links.bss: For MLO roaming, entry of new bss to which STA link got + * roamed. For non-MLO roaming, links[0].bss points to entry of bss to + * which STA got roamed (may be %NULL if %links.bssid is set) */ struct cfg80211_roam_info { - struct ieee80211_channel *channel; - struct cfg80211_bss *bss; - const u8 *bssid; const u8 *req_ie; size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; struct cfg80211_fils_resp_params fils; + + const u8 *ap_mld_addr; + u16 valid_links; + struct { + const u8 *addr; + const u8 *bssid; + struct ieee80211_channel *channel; + struct cfg80211_bss *bss; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; /** @@ -7041,7 +8385,10 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * cfg80211_port_authorized - notify cfg80211 of successful security association * * @dev: network device - * @bssid: the BSSID of the AP + * @peer_addr: BSSID of the AP/P2P GO in case of STA/GC or STA/GC MAC address + * in case of AP/P2P GO + * @td_bitmap: transition disable policy + * @td_bitmap_len: Length of transition disable policy * @gfp: allocation flags * * This function should be called by a driver that supports 4 way handshake @@ -7050,9 +8397,12 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * should be preceded with a call to cfg80211_connect_result(), * cfg80211_connect_done(), cfg80211_connect_bss() or cfg80211_roamed() to * indicate the 802.11 association. + * This function can also be called by AP/P2P GO driver that supports + * authentication offload. In this case the @peer_addr passed is that of the + * associated STA/GC. */ -void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, - gfp_t gfp); +void cfg80211_port_authorized(struct net_device *dev, const u8 *peer_addr, + const u8 *td_bitmap, u8 td_bitmap_len, gfp_t gfp); /** * cfg80211_disconnected - notify cfg80211 that connection was dropped @@ -7110,6 +8460,8 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie, * * @sinfo: the station information * @gfp: allocation flags + * + * Return: 0 on success. Non-zero on error. */ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); @@ -7140,7 +8492,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, /** * cfg80211_del_sta_sinfo - notify userspace about deletion of a station * @dev: the netdev - * @mac_addr: the station's address + * @mac_addr: the station's address. For MLD station, MLD address is used. * @sinfo: the station information/statistics * @gfp: allocation flags */ @@ -7151,7 +8503,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, * cfg80211_del_sta - notify userspace about deletion of a station * * @dev: the netdev - * @mac_addr: the station's address + * @mac_addr: the station's address. For MLD station, MLD address is used. * @gfp: allocation flags */ static inline void cfg80211_del_sta(struct net_device *dev, @@ -7180,6 +8532,48 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, gfp_t gfp); /** + * struct cfg80211_rx_info - received management frame info + * + * @freq: Frequency on which the frame was received in kHz + * @sig_dbm: signal strength in dBm, or 0 if unknown + * @have_link_id: indicates the frame was received on a link of + * an MLD, i.e. 
the @link_id field is valid + * @link_id: the ID of the link the frame was received on + * @buf: Management frame (header + body) + * @len: length of the frame data + * @flags: flags, as defined in &enum nl80211_rxmgmt_flags + * @rx_tstamp: Hardware timestamp of frame RX in nanoseconds + * @ack_tstamp: Hardware timestamp of ack TX in nanoseconds + */ +struct cfg80211_rx_info { + int freq; + int sig_dbm; + bool have_link_id; + u8 link_id; + const u8 *buf; + size_t len; + u32 flags; + u64 rx_tstamp; + u64 ack_tstamp; +}; + +/** + * cfg80211_rx_mgmt_ext - management frame notification with extended info + * @wdev: wireless device receiving the frame + * @info: RX info as defined in struct cfg80211_rx_info + * + * This function is called whenever an Action frame is received for a station + * mode interface, but is not processed in kernel. + * + * Return: %true if a user space application has registered for this frame. + * For action frames, that makes it responsible for rejecting unrecognized + * action frames; %false otherwise, in which case for action frames the + * driver is responsible for rejecting the frame. + */ +bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev, + struct cfg80211_rx_info *info); + +/** * cfg80211_rx_mgmt_khz - notification of received, unprocessed management frame * @wdev: wireless device receiving the frame * @freq: Frequency on which the frame was received in KHz @@ -7196,8 +8590,20 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * action frames; %false otherwise, in which case for action frames the * driver is responsible for rejecting the frame. */ -bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags); +static inline bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, + int sig_dbm, const u8 *buf, size_t len, + u32 flags) +{ + struct cfg80211_rx_info info = { + .freq = freq, + .sig_dbm = sig_dbm, + .buf = buf, + .len = len, + .flags = flags + }; + + return cfg80211_rx_mgmt_ext(wdev, &info); +} /** * cfg80211_rx_mgmt - notification of received, unprocessed management frame @@ -7220,11 +8626,50 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, const u8 *buf, size_t len, u32 flags) { - return cfg80211_rx_mgmt_khz(wdev, MHZ_TO_KHZ(freq), sig_dbm, buf, len, - flags); + struct cfg80211_rx_info info = { + .freq = MHZ_TO_KHZ(freq), + .sig_dbm = sig_dbm, + .buf = buf, + .len = len, + .flags = flags + }; + + return cfg80211_rx_mgmt_ext(wdev, &info); } /** + * struct cfg80211_tx_status - TX status for management frame information + * + * @cookie: Cookie returned by cfg80211_ops::mgmt_tx() + * @tx_tstamp: hardware TX timestamp in nanoseconds + * @ack_tstamp: hardware ack RX timestamp in nanoseconds + * @buf: Management frame (header + body) + * @len: length of the frame data + * @ack: Whether frame was acknowledged + */ +struct cfg80211_tx_status { + u64 cookie; + u64 tx_tstamp; + u64 ack_tstamp; + const u8 *buf; + size_t len; + bool ack; +}; + +/** + * cfg80211_mgmt_tx_status_ext - TX status notification with extended info + * @wdev: wireless device receiving the frame + * @status: TX status data + * @gfp: context flags + * + * This function is called whenever a management frame was requested to be + * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the + * transmission attempt with extended info. 
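A sketch of a driver with RX hardware timestamping reporting a frame through the extended entry point rather than the cfg80211_rx_mgmt_khz() wrapper; drv_report_mgmt_rx() is hypothetical:

static bool drv_report_mgmt_rx(struct wireless_dev *wdev, int freq_khz,
			       int sig_dbm, const u8 *buf, size_t len,
			       u64 hw_rx_ns)
{
	struct cfg80211_rx_info info = {
		.freq = freq_khz,
		.sig_dbm = sig_dbm,
		.buf = buf,
		.len = len,
		.rx_tstamp = hw_rx_ns,	/* leave 0 if no HW timestamp */
	};

	return cfg80211_rx_mgmt_ext(wdev, &info);
}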
+ */ +void cfg80211_mgmt_tx_status_ext(struct wireless_dev *wdev, + struct cfg80211_tx_status *status, gfp_t gfp); + +/** * cfg80211_mgmt_tx_status - notification of TX status for management frame * @wdev: wireless device receiving the frame * @cookie: Cookie returned by cfg80211_ops::mgmt_tx() @@ -7237,8 +8682,19 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the * transmission attempt. */ -void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp); +static inline void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, + u64 cookie, const u8 *buf, + size_t len, bool ack, gfp_t gfp) +{ + struct cfg80211_tx_status status = { + .cookie = cookie, + .buf = buf, + .len = len, + .ack = ack + }; + + cfg80211_mgmt_tx_status_ext(wdev, &status, gfp); +} /** * cfg80211_control_port_tx_status - notification of TX status for control @@ -7267,6 +8723,7 @@ void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, * responsible for any cleanup. The caller must also ensure that * skb->protocol is set appropriately. * @unencrypted: Whether the frame was received unencrypted + * @link_id: the link the frame was received on, -1 if not applicable or unknown * * This function is used to inform userspace about a received control port * frame. It should only be used if userspace indicated it wants to receive * * Return: %true if the frame was passed to userspace */ -bool cfg80211_rx_control_port(struct net_device *dev, - struct sk_buff *skb, bool unencrypted); +bool cfg80211_rx_control_port(struct net_device *dev, struct sk_buff *skb, + bool unencrypted, int link_id); /** * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event @@ -7331,15 +8788,33 @@ void cfg80211_cqm_txe_notify(struct net_device *dev, const u8 *peer, void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp); /** - * cfg80211_radar_event - radar detection event + * __cfg80211_radar_event - radar detection event * @wiphy: the wiphy * @chandef: chandef for the current channel + * @offchan: the radar has been detected on the offchannel chain * @gfp: context flags * * This function is called when a radar is detected on the current channel. */ -void cfg80211_radar_event(struct wiphy *wiphy, - struct cfg80211_chan_def *chandef, gfp_t gfp); +void __cfg80211_radar_event(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + bool offchan, gfp_t gfp); + +static inline void +cfg80211_radar_event(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + gfp_t gfp) +{ + __cfg80211_radar_event(wiphy, chandef, false, gfp); +} + +static inline void +cfg80211_background_radar_event(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + gfp_t gfp) +{ + __cfg80211_radar_event(wiphy, chandef, true, gfp); +} /** * cfg80211_sta_opmode_change_notify - STA's ht/vht operation mode change event @@ -7361,6 +8836,7 @@ void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, * @chandef: chandef for the current channel * @event: type of event * @gfp: context flags + * @link_id: valid link_id for MLO operation or 0 otherwise. * * This function is called when a Channel availability check (CAC) is finished * or aborted. 
This must be called to notify the completion of a CAC process, @@ -7368,8 +8844,17 @@ void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, */ void cfg80211_cac_event(struct net_device *netdev, const struct cfg80211_chan_def *chandef, - enum nl80211_radar_event event, gfp_t gfp); + enum nl80211_radar_event event, gfp_t gfp, + unsigned int link_id); +/** + * cfg80211_background_cac_abort - Channel Availability Check offchan abort event + * @wiphy: the wiphy + * + * This function is called by the driver when a Channel Availability Check + * (CAC) is aborted by an offchannel dedicated chain. + */ +void cfg80211_background_cac_abort(struct wiphy *wiphy); /** * cfg80211_gtk_rekey_notify - notify userspace about driver rekeying @@ -7473,6 +8958,34 @@ static inline void cfg80211_report_obss_beacon(struct wiphy *wiphy, } /** + * struct cfg80211_beaconing_check_config - beacon check configuration + * @iftype: the interface type to check for + * @relax: allow IR-relaxation conditions to apply (e.g. another + * interface connected already on the same channel) + * NOTE: If this is set, wiphy mutex must be held. + * @reg_power: &enum ieee80211_ap_reg_power value indicating the + * advertised/used 6 GHz regulatory power setting + */ +struct cfg80211_beaconing_check_config { + enum nl80211_iftype iftype; + enum ieee80211_ap_reg_power reg_power; + bool relax; +}; + +/** + * cfg80211_reg_check_beaconing - check if beaconing is allowed + * @wiphy: the wiphy + * @chandef: the channel definition + * @cfg: additional parameters for the checking + * + * Return: %true if there is no secondary channel or the secondary channel(s) + * can be used for beaconing (i.e. is not a radar channel etc.) + */ +bool cfg80211_reg_check_beaconing(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + struct cfg80211_beaconing_check_config *cfg); + +/** * cfg80211_reg_can_beacon - check if beaconing is allowed * @wiphy: the wiphy * @chandef: the channel definition @@ -7481,9 +8994,17 @@ static inline void cfg80211_report_obss_beacon(struct wiphy *wiphy, * Return: %true if there is no secondary channel or the secondary channel(s) * can be used for beaconing (i.e. is not a radar channel etc.) */ -bool cfg80211_reg_can_beacon(struct wiphy *wiphy, - struct cfg80211_chan_def *chandef, - enum nl80211_iftype iftype); +static inline bool +cfg80211_reg_can_beacon(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype) +{ + struct cfg80211_beaconing_check_config config = { + .iftype = iftype, + }; + + return cfg80211_reg_check_beaconing(wiphy, chandef, &config); +} /** * cfg80211_reg_can_beacon_relax - check if beaconing is allowed with relaxation @@ -7496,28 +9017,41 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy, * also checks if IR-relaxation conditions apply, to allow beaconing under * more permissive conditions. * - * Requires the RTNL to be held. + * Context: Requires the wiphy mutex to be held. 
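A sketch of the config-based check; unlike the boolean-style wrappers, it can carry a 6 GHz regulatory power setting. drv_ap_can_beacon() is hypothetical and the LPI power mode is an assumption:

static bool drv_ap_can_beacon(struct wiphy *wiphy,
			      struct cfg80211_chan_def *chandef)
{
	struct cfg80211_beaconing_check_config cfg = {
		.iftype = NL80211_IFTYPE_AP,
		.reg_power = IEEE80211_REG_LPI_AP,
		/* .relax left false: no IR-relaxation, no wiphy mutex needed */
	};

	return cfg80211_reg_check_beaconing(wiphy, chandef, &cfg);
}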
*/ -bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, - struct cfg80211_chan_def *chandef, - enum nl80211_iftype iftype); +static inline bool +cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype) +{ + struct cfg80211_beaconing_check_config config = { + .iftype = iftype, + .relax = true, + }; -/* + return cfg80211_reg_check_beaconing(wiphy, chandef, &config); +} + +/** * cfg80211_ch_switch_notify - update wdev channel and notify userspace * @dev: the device which switched channels * @chandef: the new channel definition + * @link_id: the link ID for MLO, must be 0 for non-MLO * - * Caller must acquire wdev_lock, therefore must only be called from sleepable + * Caller must hold wiphy mutex, therefore must only be called from sleepable * driver context! */ void cfg80211_ch_switch_notify(struct net_device *dev, - struct cfg80211_chan_def *chandef); + struct cfg80211_chan_def *chandef, + unsigned int link_id); -/* +/** * cfg80211_ch_switch_started_notify - notify channel switch start * @dev: the device on which the channel switch started * @chandef: the future channel definition + * @link_id: the link ID for MLO, must be 0 for non-MLO * @count: the number of TBTTs until the channel switch happens + * @quiet: whether or not immediate quiet was requested by the AP * * Inform the userspace about the channel switch that has just * started, so that it can take appropriate actions (eg. starting @@ -7525,7 +9059,8 @@ void cfg80211_ch_switch_notify(struct net_device *dev, */ void cfg80211_ch_switch_started_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, - u8 count); + unsigned int link_id, u8 count, + bool quiet); /** * ieee80211_operating_class_to_band - convert operating class to band @@ -7533,18 +9068,31 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev, * @operating_class: the operating class to convert * @band: band pointer to fill * - * Returns %true if the conversion was successful, %false otherwise. + * Return: %true if the conversion was successful, %false otherwise. */ bool ieee80211_operating_class_to_band(u8 operating_class, enum nl80211_band *band); /** + * ieee80211_operating_class_to_chandef - convert operating class to chandef + * + * @operating_class: the operating class to convert + * @chan: the ieee80211_channel to convert + * @chandef: a pointer to the resulting chandef + * + * Return: %true if the conversion was successful, %false otherwise. + */ +bool ieee80211_operating_class_to_chandef(u8 operating_class, + struct ieee80211_channel *chan, + struct cfg80211_chan_def *chandef); + +/** * ieee80211_chandef_to_operating_class - convert chandef to operation class * * @chandef: the chandef to convert * @op_class: a pointer to the resulting operating class * - * Returns %true if the conversion was successful, %false otherwise. + * Return: %true if the conversion was successful, %false otherwise. */ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class); @@ -7554,7 +9102,7 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, * * @chandef: the chandef to convert * - * Returns the center frequency of chandef (1st segment) in KHz. + * Return: the center frequency of chandef (1st segment) in KHz. 
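A sketch of the per-link CSA notification pair introduced here, using link_id 0 for a non-MLO interface; drv_do_csa() is hypothetical:

static void drv_do_csa(struct net_device *dev,
		       struct cfg80211_chan_def *new_chandef, u8 count)
{
	/* announce: switch happens in @count TBTTs, no quiet requested */
	cfg80211_ch_switch_started_notify(dev, new_chandef, 0, count, false);

	/* ... later, once the device operates on the new channel ... */
	cfg80211_ch_switch_notify(dev, new_chandef, 0);
}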
*/ static inline u32 ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef) @@ -7562,7 +9110,7 @@ ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef) { return MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; } -/* +/** * cfg80211_tdls_oper_request - request userspace to perform TDLS operation * @dev: the device on which the operation is requested * @peer: the MAC address of the peer device @@ -7581,11 +9129,11 @@ void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper, u16 reason_code, gfp_t gfp); -/* +/** * cfg80211_calculate_bitrate - calculate actual bitrate (in 100Kbps units) * @rate: given rate_info to calculate bitrate from * - * return 0 if MCS index >= 32 + * Return: calculated bitrate */ u32 cfg80211_calculate_bitrate(struct rate_info *rate); @@ -7593,20 +9141,51 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate); * cfg80211_unregister_wdev - remove the given wdev * @wdev: struct wireless_dev to remove * - * Call this function only for wdevs that have no netdev assigned, - * e.g. P2P Devices. It removes the device from the list so that - * it can no longer be used. It is necessary to call this function - * even when cfg80211 requests the removal of the interface by - * calling the del_virtual_intf() callback. The function must also - * be called when the driver wishes to unregister the wdev, e.g. - * when the device is unbound from the driver. + * This function removes the device so it can no longer be used. It is necessary + * to call this function even when cfg80211 requests the removal of the device + * by calling the del_virtual_intf() callback. The function must also be called + * when the driver wishes to unregister the wdev, e.g. when the hardware device + * is unbound from the driver. * - * Requires the RTNL to be held. + * Context: Requires the RTNL and wiphy mutex to be held. */ void cfg80211_unregister_wdev(struct wireless_dev *wdev); /** - * struct cfg80211_ft_event - FT Information Elements + * cfg80211_register_netdevice - register the given netdev + * @dev: the netdev to register + * + * Note: In contexts coming from cfg80211 callbacks, you must call this rather + * than register_netdevice(); unregister_netdev() is impossible as the RTNL is + * held. Otherwise, both register_netdevice() and register_netdev() are usable + * instead as well. + * + * Context: Requires the RTNL and wiphy mutex to be held. + * + * Return: 0 on success. Non-zero on error. + */ +int cfg80211_register_netdevice(struct net_device *dev); + +/** + * cfg80211_unregister_netdevice - unregister the given netdev + * @dev: the netdev to unregister + * + * Note: In contexts coming from cfg80211 callbacks, you must call this rather + * than unregister_netdevice(); unregister_netdev() is impossible as the RTNL + * is held. Otherwise, both unregister_netdevice() and unregister_netdev() are + * usable instead as well. + * + * Context: Requires the RTNL and wiphy mutex to be held. + */ +static inline void cfg80211_unregister_netdevice(struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_CFG80211) + cfg80211_unregister_wdev(dev->ieee80211_ptr); +#endif +} + +/** + * struct cfg80211_ft_event_params - FT Information Elements * @ies: FT IEs * @ies_len: length of the FT IE in bytes * @target_ap: target AP's MAC address @@ -7672,9 +9251,9 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, * correctly, if not the result of using this function will not * be ordered correctly either, i.e. it does no reordering. 
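A sketch of the rule the Note above spells out: from within a cfg80211 callback (RTNL and wiphy mutex already held) the wrapper must be used instead of register_netdev(); drv_register_if() is hypothetical:

static int drv_register_if(struct net_device *ndev)
{
	int err;

	err = cfg80211_register_netdevice(ndev);
	if (err)
		netdev_err(ndev, "netdev registration failed: %d\n", err);

	return err;
}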
* - * The function returns the offset where the next part of the - * buffer starts, which may be @ielen if the entire (remainder) - * of the buffer should be used. + * Return: The offset where the next part of the buffer starts, which + * may be @ielen if the entire (remainder) of the buffer should be + * used. */ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, @@ -7702,9 +9281,9 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, * correctly, if not the result of using this function will not * be ordered correctly either, i.e. it does no reordering. * - * The function returns the offset where the next part of the - * buffer starts, which may be @ielen if the entire (remainder) - * of the buffer should be used. + * Return: The offset where the next part of the buffer starts, which + * may be @ielen if the entire (remainder) of the buffer should be + * used. */ static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, size_t offset) @@ -7713,6 +9292,18 @@ static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen, } /** + * ieee80211_fragment_element - fragment the last element in skb + * @skb: The skbuf that the element was added to + * @len_pos: Pointer to length of the element to fragment + * @frag_id: The element ID to use for fragments + * + * This function fragments all data after @len_pos, adding fragmentation + * elements with the given ID as appropriate. The SKB will grow in size + * accordingly. + */ +void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id); + +/** * cfg80211_report_wowlan_wakeup - report wakeup from WoWLAN * @wdev: the wireless device reporting the wakeup * @wakeup: the wakeup report @@ -7756,6 +9347,8 @@ unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy); * This function can be called by the driver to check whether a * combination of interfaces and their types are allowed according to * the interface combinations. + * + * Return: 0 if combinations are allowed. Non-zero on error. */ int cfg80211_check_combinations(struct wiphy *wiphy, struct iface_combination_params *params); @@ -7771,6 +9364,8 @@ int cfg80211_check_combinations(struct wiphy *wiphy, * This function can be called by the driver to check what possible * combinations it fits in at a given moment, e.g. for channel switching * purposes. + * + * Return: 0 on success. Non-zero on error. */ int cfg80211_iter_combinations(struct wiphy *wiphy, struct iface_combination_params *params, @@ -7778,7 +9373,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, void *data), void *data); -/* +/** * cfg80211_stop_iface - trigger interface disconnection * * @wiphy: the wiphy @@ -7833,6 +9428,8 @@ static inline void wiphy_ext_feature_set(struct wiphy *wiphy, * * The extended features are flagged in multiple bytes (see * &struct wiphy.@ext_features) + * + * Return: %true if extended feature flag is set, %false otherwise */ static inline bool wiphy_ext_feature_isset(struct wiphy *wiphy, @@ -7954,11 +9551,25 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev, * Check whether the interface is allowed to operate; additionally, this API * can be used to check iftype against the software interfaces when * check_swif is '1'. 
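A sketch of the intended calling pattern for ieee80211_fragment_element(), modeled (as an assumption) on mac80211's usage: emit the element, remember where its length byte sits, then let the helper split it:

static void drv_put_big_element(struct sk_buff *skb, const u8 *data, int len)
{
	u8 *len_pos;

	skb_put_u8(skb, WLAN_EID_EXTENSION);
	len_pos = skb_put(skb, 1);	/* length, patched by the helper */
	skb_put_u8(skb, WLAN_EID_EXT_EHT_MULTI_LINK);
	skb_put_data(skb, data, len);

	ieee80211_fragment_element(skb, len_pos, WLAN_EID_FRAGMENT);
}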
+ + * Return: %true if allowed, %false otherwise */ bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, bool is_4addr, u8 check_swif); +/** + * cfg80211_assoc_comeback - notification of association that was + * temporarily rejected with a comeback + * @netdev: network device + * @ap_addr: AP (MLD) address that rejected the association + * @timeout: timeout interval value in TUs. + * + * This function may sleep. The caller must hold the corresponding wdev's mutex. + */ +void cfg80211_assoc_comeback(struct net_device *netdev, + const u8 *ap_addr, u32 timeout); + /* Logging, debugging and troubleshooting/diagnostic helpers. */ /* wiphy_printk helpers, similar to dev_printk */ @@ -7979,6 +9590,8 @@ bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, dev_notice(&(wiphy)->dev, format, ##args) #define wiphy_info(wiphy, format, args...) \ dev_info(&(wiphy)->dev, format, ##args) +#define wiphy_info_once(wiphy, format, args...) \ + dev_info_once(&(wiphy)->dev, format, ##args) #define wiphy_err_ratelimited(wiphy, format, args...) \ dev_err_ratelimited(&(wiphy)->dev, format, ##args) @@ -8026,4 +9639,198 @@ void cfg80211_update_owe_info_event(struct net_device *netdev, */ void cfg80211_bss_flush(struct wiphy *wiphy); +/** + * cfg80211_bss_color_notify - notify about bss color event + * @dev: network device + * @cmd: the actual event we want to notify + * @count: the number of TBTTs until the color change happens + * @color_bitmap: representations of the colors that the local BSS is aware of + * @link_id: valid link_id in case of MLO or 0 for non-MLO. + * + * Return: 0 on success. Non-zero on error. + */ +int cfg80211_bss_color_notify(struct net_device *dev, + enum nl80211_commands cmd, u8 count, + u64 color_bitmap, u8 link_id); + +/** + * cfg80211_obss_color_collision_notify - notify about bss color collision + * @dev: network device + * @color_bitmap: representations of the colors that the local BSS is aware of + * @link_id: valid link_id in case of MLO or 0 for non-MLO. + * + * Return: 0 on success. Non-zero on error. + */ +static inline int cfg80211_obss_color_collision_notify(struct net_device *dev, + u64 color_bitmap, + u8 link_id) +{ + return cfg80211_bss_color_notify(dev, NL80211_CMD_OBSS_COLOR_COLLISION, + 0, color_bitmap, link_id); +} + +/** + * cfg80211_color_change_started_notify - notify color change start + * @dev: the device on which the color is switched + * @count: the number of TBTTs until the color change happens + * @link_id: valid link_id in case of MLO or 0 for non-MLO. + * + * Inform the userspace about the color change that has started. + * + * Return: 0 on success. Non-zero on error. + */ +static inline int cfg80211_color_change_started_notify(struct net_device *dev, + u8 count, u8 link_id) +{ + return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_STARTED, + count, 0, link_id); +} + +/** + * cfg80211_color_change_aborted_notify - notify color change abort + * @dev: the device on which the color is switched + * @link_id: valid link_id in case of MLO or 0 for non-MLO. + * + * Inform the userspace about the color change that was aborted. + * + * Return: 0 on success. Non-zero on error. 
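A sketch of a driver-initiated BSS color change built on the wrappers defined here, link 0 for non-MLO; drv_color_change() is hypothetical:

static int drv_color_change(struct net_device *dev, u8 count)
{
	int err;

	err = cfg80211_color_change_started_notify(dev, count, 0);
	if (err)
		return err;

	/* ... after @count TBTTs, once beacons carry the new color ... */
	return cfg80211_color_change_notify(dev, 0);
}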
+ */ +static inline int cfg80211_color_change_aborted_notify(struct net_device *dev, + u8 link_id) +{ + return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_ABORTED, + 0, 0, link_id); +} + +/** + * cfg80211_color_change_notify - notify color change completion + * @dev: the device on which the color was switched + * @link_id: valid link_id in case of MLO or 0 for non-MLO. + * + * Inform the userspace about the color change that has completed. + * + * Return: 0 on success. Non-zero on error. + */ +static inline int cfg80211_color_change_notify(struct net_device *dev, + u8 link_id) +{ + return cfg80211_bss_color_notify(dev, + NL80211_CMD_COLOR_CHANGE_COMPLETED, + 0, 0, link_id); +} + +/** + * cfg80211_links_removed - Notify about removed STA MLD setup links. + * @dev: network device. + * @link_mask: BIT mask of removed STA MLD setup link IDs. + * + * Inform cfg80211 and the userspace about removed STA MLD setup links due to + * AP MLD removing the corresponding affiliated APs with Multi-Link + * reconfiguration. Note that it's not valid to remove all links, in this + * case disconnect instead. + * Also note that the wdev mutex must be held. + */ +void cfg80211_links_removed(struct net_device *dev, u16 link_mask); + +/** + * struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data + * @buf: MLO Reconfiguration Response frame (header + body) + * @len: length of the frame data + * @added_links: BIT mask of links successfully added to the association + * @links: per-link information indexed by link ID + * @links.bss: the BSS that MLO reconfiguration was requested for, ownership of + * the pointer moves to cfg80211 in the call to + * cfg80211_mlo_reconf_add_done(). + * + * The BSS pointer must be set for each link for which 'add' operation was + * requested in the assoc_ml_reconf callback. + */ +struct cfg80211_mlo_reconf_done_data { + const u8 *buf; + size_t len; + u16 added_links; + struct { + struct cfg80211_bss *bss; + u8 *addr; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; +}; + +/** + * cfg80211_mlo_reconf_add_done - Notify about MLO reconfiguration result + * @dev: network device. + * @data: MLO reconfiguration done data, &struct cfg80211_mlo_reconf_done_data + * + * Inform cfg80211 and the userspace that processing of ML reconfiguration + * request to add links to the association is done. + */ +void cfg80211_mlo_reconf_add_done(struct net_device *dev, + struct cfg80211_mlo_reconf_done_data *data); + +/** + * cfg80211_schedule_channels_check - schedule regulatory check if needed + * @wdev: the wireless device to check + * + * In case the device supports NO_IR or DFS relaxations, schedule regulatory + * channels check, as previous concurrent operation conditions may not + * hold anymore. + */ +void cfg80211_schedule_channels_check(struct wireless_dev *wdev); + +/** + * cfg80211_epcs_changed - Notify about a change in EPCS state + * @netdev: the wireless device whose EPCS state changed + * @enabled: set to true if EPCS was enabled, otherwise set to false. 
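A minimal sketch of the link-removal notification: the AP MLD removed its second affiliated AP, so the driver reports setup link ID 1 as gone; drv_handle_ml_reconf() is hypothetical and is assumed to run with the wdev mutex held:

static void drv_handle_ml_reconf(struct net_device *dev)
{
	cfg80211_links_removed(dev, BIT(1));
}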
+ */ +void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); + +#ifdef CONFIG_CFG80211_DEBUGFS +/** + * wiphy_locked_debugfs_read - do a locked read in debugfs + * @wiphy: the wiphy to use + * @file: the file being read + * @buf: the buffer to fill and then read from + * @bufsize: size of the buffer + * @userbuf: the user buffer to copy to + * @count: read count + * @ppos: read position + * @handler: the read handler to call (under wiphy lock) + * @data: additional data to pass to the read handler + * + * Return: the number of characters read, or a negative errno + */ +ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file, + char *buf, size_t bufsize, + char __user *userbuf, size_t count, + loff_t *ppos, + ssize_t (*handler)(struct wiphy *wiphy, + struct file *file, + char *buf, + size_t bufsize, + void *data), + void *data); + +/** + * wiphy_locked_debugfs_write - do a locked write in debugfs + * @wiphy: the wiphy to use + * @file: the file being written to + * @buf: the buffer to copy the user data to + * @bufsize: size of the buffer + * @userbuf: the user buffer to copy from + * @count: read count + * @handler: the write handler to call (under wiphy lock) + * @data: additional data to pass to the write handler + * + * Return: the number of characters written, or a negative errno + */ +ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file, + char *buf, size_t bufsize, + const char __user *userbuf, size_t count, + ssize_t (*handler)(struct wiphy *wiphy, + struct file *file, + char *buf, + size_t count, + void *data), + void *data); +#endif + #endif /* __NET_CFG80211_H */ diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 6ed07844eb24..76d2cd2e2b30 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -11,13 +11,16 @@ #include <linux/ieee802154.h> #include <linux/netdevice.h> -#include <linux/mutex.h> +#include <linux/spinlock.h> #include <linux/bug.h> #include <net/nl802154.h> struct wpan_phy; struct wpan_phy_cca; +struct cfg802154_scan_request; +struct cfg802154_beacon_request; +struct ieee802154_addr; #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL struct ieee802154_llsec_device_key; @@ -67,6 +70,20 @@ struct cfg802154_ops { struct wpan_dev *wpan_dev, bool mode); int (*set_ackreq_default)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool ackreq); + int (*trigger_scan)(struct wpan_phy *wpan_phy, + struct cfg802154_scan_request *request); + int (*abort_scan)(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev); + int (*send_beacons)(struct wpan_phy *wpan_phy, + struct cfg802154_beacon_request *request); + int (*stop_beacons)(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev); + int (*associate)(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *coord); + int (*disassociate)(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *target); #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL void (*get_llsec_table)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, @@ -160,17 +177,24 @@ wpan_phy_cca_cmp(const struct wpan_phy_cca *a, const struct wpan_phy_cca *b) } /** - * @WPAN_PHY_FLAG_TRANSMIT_POWER: Indicates that transceiver will support + * enum wpan_phy_flags - WPAN PHY state flags + * @WPAN_PHY_FLAG_TXPOWER: Indicates that transceiver will support * transmit power setting. * @WPAN_PHY_FLAG_CCA_ED_LEVEL: Indicates that transceiver will support cca ed * level setting. 
* @WPAN_PHY_FLAG_CCA_MODE: Indicates that transceiver will support cca mode * setting. + * @WPAN_PHY_FLAG_STATE_QUEUE_STOPPED: Indicates that the transmit queue was + * temporarily stopped. + * @WPAN_PHY_FLAG_DATAGRAMS_ONLY: Indicates that transceiver is only able to + * send/receive datagrams. */ enum wpan_phy_flags { WPAN_PHY_FLAG_TXPOWER = BIT(1), WPAN_PHY_FLAG_CCA_ED_LEVEL = BIT(2), WPAN_PHY_FLAG_CCA_MODE = BIT(3), + WPAN_PHY_FLAG_STATE_QUEUE_STOPPED = BIT(4), + WPAN_PHY_FLAG_DATAGRAMS_ONLY = BIT(5), }; struct wpan_phy { @@ -182,7 +206,7 @@ struct wpan_phy { */ const void *privid; - u32 flags; + unsigned long flags; /* * This is a PIB according to 802.15.4-2011. @@ -203,8 +227,8 @@ struct wpan_phy { /* PHY depended MAC PIB values */ - /* 802.15.4 acronym: Tdsym in usec */ - u8 symbol_duration; + /* 802.15.4 acronym: Tdsym in nsec */ + u32 symbol_duration; /* lifs and sifs periods timing */ u16 lifs_period; u16 sifs_period; @@ -214,6 +238,17 @@ struct wpan_phy { /* the network namespace this phy lives in currently */ possible_net_t _net; + /* Transmission monitoring and control */ + spinlock_t queue_lock; + atomic_t ongoing_txs; + atomic_t hold_txs; + wait_queue_head_t sync_txq; + + /* Current filtering level on reception. + * Only allowed to be changed if phy is not operational. + */ + enum ieee802154_filtering_level filtering; + char priv[] __aligned(NETDEV_ALIGN); }; @@ -227,6 +262,27 @@ static inline void wpan_phy_net_set(struct wpan_phy *wpan_phy, struct net *net) write_pnet(&wpan_phy->_net, net); } +static inline bool ieee802154_chan_is_valid(struct wpan_phy *phy, + u8 page, u8 channel) +{ + if (page > IEEE802154_MAX_PAGE || + channel > IEEE802154_MAX_CHANNEL || + !(phy->supported.channels[page] & BIT(channel))) + return false; + + return true; +} + +/** + * struct ieee802154_addr - IEEE802.15.4 device address + * @mode: Address mode from frame header. 
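A sketch of a hypothetical trigger_scan() handler validating a request against the phy's supported channel bitmap with the new helper; struct cfg802154_scan_request is introduced further below:

static int drv_trigger_scan(struct wpan_phy *wpan_phy,
			    struct cfg802154_scan_request *request)
{
	int chan;

	for (chan = 0; chan <= IEEE802154_MAX_CHANNEL; chan++) {
		if (!(request->channels & BIT(chan)))
			continue;
		if (!ieee802154_chan_is_valid(wpan_phy, request->page, chan))
			return -EINVAL;
	}

	/* ... program the transceiver and start stepping channels ... */
	return 0;
}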
Can be one of: + * - @IEEE802154_ADDR_NONE + * - @IEEE802154_ADDR_SHORT + * - @IEEE802154_ADDR_LONG + * @pan_id: The PAN ID this address belongs to + * @short_addr: address if @mode is @IEEE802154_ADDR_SHORT + * @extended_addr: address if @mode is @IEEE802154_ADDR_LONG + */ struct ieee802154_addr { u8 mode; __le16 pan_id; @@ -236,6 +292,94 @@ struct ieee802154_addr { }; }; +/** + * struct ieee802154_coord_desc - Coordinator descriptor + * @addr: PAN ID and coordinator address + * @page: page this coordinator is using + * @channel: channel this coordinator is using + * @superframe_spec: SuperFrame specification as received + * @link_quality: link quality indicator at which the beacon was received + * @gts_permit: the coordinator accepts GTS requests + */ +struct ieee802154_coord_desc { + struct ieee802154_addr addr; + u8 page; + u8 channel; + u16 superframe_spec; + u8 link_quality; + bool gts_permit; +}; + +/** + * struct ieee802154_pan_device - PAN device information + * @pan_id: the PAN ID of this device + * @mode: the preferred mode to reach the device + * @short_addr: the short address of this device + * @extended_addr: the extended address of this device + * @node: the list node + */ +struct ieee802154_pan_device { + __le16 pan_id; + u8 mode; + __le16 short_addr; + __le64 extended_addr; + struct list_head node; +}; + +/** + * struct cfg802154_scan_request - Scan request + * + * @type: type of scan to be performed + * @page: page on which to perform the scan + * @channels: channels in the %page to be scanned + * @duration: time spent on each channel, calculated with: + * aBaseSuperframeDuration * (2 ^ duration + 1) + * @wpan_dev: the wpan device on which to perform the scan + * @wpan_phy: the wpan phy on which to perform the scan + */ +struct cfg802154_scan_request { + enum nl802154_scan_types type; + u8 page; + u32 channels; + u8 duration; + struct wpan_dev *wpan_dev; + struct wpan_phy *wpan_phy; +}; + +/** + * struct cfg802154_beacon_request - Beacon request descriptor + * + * @interval: interval n between transmissions, expressed as an order of the + * superframe duration: aBaseSuperframeDuration * (2^n), unless the interval + * order is greater than or equal to 15, in which case beacons won't be + * passively sent out at a fixed rate but instead inform the device + * that it should answer beacon requests as part of active scan + * procedures + * @wpan_dev: the concerned wpan device + * @wpan_phy: the wpan phy this was for + */ +struct cfg802154_beacon_request { + u8 interval; + struct wpan_dev *wpan_dev; + struct wpan_phy *wpan_phy; +}; + +/** + * struct cfg802154_mac_pkt - MAC packet descriptor (beacon/command) + * @node: MAC packets to process list member + * @skb: the received sk_buff + * @sdata: the interface on which @skb was received + * @page: page configuration when @skb was received + * @channel: channel configuration when @skb was received + */ +struct cfg802154_mac_pkt { + struct list_head node; + struct sk_buff *skb; + struct ieee802154_sub_if_data *sdata; + u8 page; + u8 channel; +}; + struct ieee802154_llsec_key_id { u8 mode; u8 id; @@ -257,6 +401,7 @@ struct ieee802154_llsec_key { struct ieee802154_llsec_key_entry { struct list_head list; + struct rcu_head rcu; struct ieee802154_llsec_key_id id; struct ieee802154_llsec_key *key; @@ -355,14 +500,20 @@ struct wpan_dev { bool lbt; - bool promiscuous_mode; - /* fallback for acknowledgment bit setting */ bool ackreq; + + /* Associations */ + struct mutex association_lock; + struct ieee802154_pan_device *parent; + struct list_head 
children; + unsigned int max_associations; + unsigned int nchildren; }; #define to_phy(_dev) container_of(_dev, struct wpan_phy, dev) +#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) static inline int wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, const struct ieee802154_addr *daddr, @@ -373,6 +524,7 @@ wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, return wpan_dev->header_ops->create(skb, dev, daddr, saddr, len); } +#endif struct wpan_phy * wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size); @@ -405,4 +557,49 @@ static inline const char *wpan_phy_name(struct wpan_phy *phy) return dev_name(&phy->dev); } +void ieee802154_configure_durations(struct wpan_phy *phy, + unsigned int page, unsigned int channel); + +/** + * cfg802154_device_is_associated - Checks whether we are associated to any device + * @wpan_dev: the wpan device + * @return: true if we are associated + */ +bool cfg802154_device_is_associated(struct wpan_dev *wpan_dev); + +/** + * cfg802154_device_is_parent - Checks if a device is our coordinator + * @wpan_dev: the wpan device + * @target: the expected parent + * @return: true if @target is our coordinator + */ +bool cfg802154_device_is_parent(struct wpan_dev *wpan_dev, + struct ieee802154_addr *target); + +/** + * cfg802154_device_is_child - Checks whether a device is associated to us + * @wpan_dev: the wpan device + * @target: the expected child + * @return: the PAN device + */ +struct ieee802154_pan_device * +cfg802154_device_is_child(struct wpan_dev *wpan_dev, + struct ieee802154_addr *target); + +/** + * cfg802154_set_max_associations - Limit the number of future associations + * @wpan_dev: the wpan device + * @max: the maximum number of devices we accept to associate + * @return: the old maximum value + */ +unsigned int cfg802154_set_max_associations(struct wpan_dev *wpan_dev, + unsigned int max); + +/** + * cfg802154_get_free_short_addr - Get a free address among the known devices + * @wpan_dev: the wpan device + * @return: a random short address expectedly unused on our PAN + */ +__le16 cfg802154_get_free_short_addr(struct wpan_dev *wpan_dev); + #endif /* __NET_CFG802154_H */ diff --git a/include/net/checksum.h b/include/net/checksum.h index 0d05b9e8690b..3cbab35de5ab 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -18,11 +18,13 @@ #include <linux/errno.h> #include <asm/types.h> #include <asm/byteorder.h> -#include <linux/uaccess.h> #include <asm/checksum.h> +#if !defined(_HAVE_ARCH_COPY_AND_CSUM_FROM_USER) || !defined(HAVE_CSUM_COPY_USER) +#include <linux/uaccess.h> +#endif #ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER -static inline +static __always_inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len) { @@ -33,7 +35,7 @@ __wsum csum_and_copy_from_user (const void __user *src, void *dst, #endif #ifndef HAVE_CSUM_COPY_USER -static __inline__ __wsum csum_and_copy_to_user +static __always_inline __wsum csum_and_copy_to_user (const void *src, void __user *dst, int len) { __wsum sum = csum_partial(src, len, ~0U); @@ -45,7 +47,7 @@ static __inline__ __wsum csum_and_copy_to_user #endif #ifndef _HAVE_ARCH_CSUM_AND_COPY -static inline __wsum +static __always_inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) { memcpy(dst, src, len); @@ -54,7 +56,7 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len) #endif #ifndef HAVE_ARCH_CSUM_ADD -static inline __wsum csum_add(__wsum csum, __wsum addend) +static __always_inline __wsum 
csum_add(__wsum csum, __wsum addend) { u32 res = (__force u32)csum; res += (__force u32)addend; @@ -62,12 +64,12 @@ static inline __wsum csum_add(__wsum csum, __wsum addend) } #endif -static inline __wsum csum_sub(__wsum csum, __wsum addend) +static __always_inline __wsum csum_sub(__wsum csum, __wsum addend) { return csum_add(csum, ~addend); } -static inline __sum16 csum16_add(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend) { u16 res = (__force u16)csum; @@ -75,53 +77,46 @@ static inline __sum16 csum16_add(__sum16 csum, __be16 addend) return (__force __sum16)(res + (res < (__force u16)addend)); } -static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) { return csum16_add(csum, ~addend); } -static inline __wsum -csum_block_add(__wsum csum, __wsum csum2, int offset) +#ifndef HAVE_ARCH_CSUM_SHIFT +static __always_inline __wsum csum_shift(__wsum sum, int offset) { - u32 sum = (__force u32)csum2; - /* rotate sum to align it with a 16b boundary */ if (offset & 1) - sum = ror32(sum, 8); - - return csum_add(csum, (__force __wsum)sum); + return (__force __wsum)ror32((__force u32)sum, 8); + return sum; } +#endif -static inline __wsum -csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) +static __always_inline __wsum +csum_block_add(__wsum csum, __wsum csum2, int offset) { - return csum_block_add(csum, csum2, offset); + return csum_add(csum, csum_shift(csum2, offset)); } -static inline __wsum +static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); } -static inline __wsum csum_unfold(__sum16 n) +static __always_inline __wsum csum_unfold(__sum16 n) { return (__force __wsum)n; } -static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) -{ - return csum_partial(buff, len, sum); -} - #define CSUM_MANGLED_0 ((__force __sum16)0xffff) -static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) { *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); } -static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) +static __always_inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); @@ -134,11 +129,22 @@ static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) * m : old value of a 16bit field * m' : new value of a 16bit field */ -static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) +static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } +static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) +{ + *csum = csum_add(csum_sub(*csum, old), new); +} + +static inline unsigned short csum_from32to16(unsigned int sum) +{ + sum += (sum >> 16) | (sum << 16); + return (unsigned short)(sum >> 16); +} + struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); @@ -146,18 +152,18 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr); + __wsum diff, bool pseudohdr, bool ipv6); -static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, - 
__be16 from, __be16 to, - bool pseudohdr) +static __always_inline +void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, + __be16 from, __be16 to, bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); } -static inline __wsum remcsum_adjust(void *ptr, __wsum csum, - int start, int offset) +static __always_inline __wsum remcsum_adjust(void *ptr, __wsum csum, + int start, int offset) { __sum16 *psum = (__sum16 *)(ptr + offset); __wsum delta; @@ -173,9 +179,13 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum, return delta; } -static inline void remcsum_unadjust(__sum16 *psum, __wsum delta) +static __always_inline void remcsum_unadjust(__sum16 *psum, __wsum delta) { *psum = csum_fold(csum_sub(delta, (__force __wsum)*psum)); } +static __always_inline __wsum wsum_negate(__wsum val) +{ + return (__force __wsum)-((__force u32)val); +} #endif diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h index 53dd7d988a2d..d6780d7903f4 100644 --- a/include/net/cipso_ipv4.h +++ b/include/net/cipso_ipv4.h @@ -28,7 +28,7 @@ #include <net/request_sock.h> #include <linux/atomic.h> #include <linux/refcount.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /* known doi values */ #define CIPSO_V4_DOI_UNKNOWN 0x00000000 @@ -183,7 +183,8 @@ int cipso_v4_getattr(const unsigned char *cipso, struct netlbl_lsm_secattr *secattr); int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, - const struct netlbl_lsm_secattr *secattr); + const struct netlbl_lsm_secattr *secattr, + bool sk_locked); void cipso_v4_sock_delattr(struct sock *sk); int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); int cipso_v4_req_setattr(struct request_sock *req, @@ -214,7 +215,8 @@ static inline int cipso_v4_getattr(const unsigned char *cipso, static inline int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, - const struct netlbl_lsm_secattr *secattr) + const struct netlbl_lsm_secattr *secattr, + bool sk_locked) { return -ENOSYS; } diff --git a/include/net/codel.h b/include/net/codel.h index a6e428f80135..aa80f744826c 100644 --- a/include/net/codel.h +++ b/include/net/codel.h @@ -44,8 +44,6 @@ #include <linux/types.h> #include <linux/ktime.h> #include <linux/skbuff.h> -#include <net/pkt_sched.h> -#include <net/inet_ecn.h> /* Controlling Queue Delay (CoDel) algorithm * ========================================= @@ -102,6 +100,9 @@ static inline u32 codel_time_to_us(codel_time_t val) * @interval: width of moving time window * @mtu: device mtu, or minimal queue backlog in bytes. 
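A worked example of the incremental-update helpers above (csum_replace2() implements the RFC 1624 style adjustment): rewrite a 16-bit IPv4 header field and patch the checksum in place; example_set_frag_off() is illustrative only:

static void example_set_frag_off(struct iphdr *iph, __be16 new_frag_off)
{
	csum_replace2(&iph->check, iph->frag_off, new_frag_off);
	iph->frag_off = new_frag_off;
}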
* @ecn: is Explicit Congestion Notification enabled + * @ce_threshold_selector: apply ce_threshold to packets matching this value + * in the diffserv/ECN byte of the IP header + * @ce_threshold_mask: mask to apply to ce_threshold_selector comparison */ struct codel_params { codel_time_t target; @@ -109,6 +110,8 @@ struct codel_params { codel_time_t interval; u32 mtu; bool ecn; + u8 ce_threshold_selector; + u8 ce_threshold_mask; }; /** @@ -142,8 +145,8 @@ struct codel_vars { * @maxpacket: largest packet we've seen so far * @drop_count: temp count of dropped packets in dequeue() * @drop_len: bytes of dropped packets in dequeue() - * ecn_mark: number of packets we ECN marked instead of dropping - * ce_mark: number of packets CE marked because sojourn time was above ce_threshold + * @ecn_mark: number of packets we ECN marked instead of dropping + * @ce_mark: number of packets CE marked because sojourn time was above ce_threshold */ struct codel_stats { u32 maxpacket; diff --git a/include/net/codel_impl.h b/include/net/codel_impl.h index d289b91dcd65..78a27ac73070 100644 --- a/include/net/codel_impl.h +++ b/include/net/codel_impl.h @@ -49,11 +49,15 @@ * Implemented on linux by Dave Taht and Eric Dumazet */ +#include <net/inet_ecn.h> + static void codel_params_init(struct codel_params *params) { params->interval = MS2TIME(100); params->target = MS2TIME(5); params->ce_threshold = CODEL_DISABLED_THRESHOLD; + params->ce_threshold_mask = 0; + params->ce_threshold_selector = 0; params->ecn = false; } @@ -246,9 +250,19 @@ static struct sk_buff *codel_dequeue(void *ctx, vars->rec_inv_sqrt); } end: - if (skb && codel_time_after(vars->ldelay, params->ce_threshold) && - INET_ECN_set_ce(skb)) - stats->ce_mark++; + if (skb && codel_time_after(vars->ldelay, params->ce_threshold)) { + bool set_ce = true; + + if (params->ce_threshold_mask) { + int dsfield = skb_get_dsfield(skb); + + set_ce = (dsfield >= 0 && + (((u8)dsfield & params->ce_threshold_mask) == + params->ce_threshold_selector)); + } + if (set_ce && INET_ECN_set_ce(skb)) + stats->ce_mark++; + } return skb; } diff --git a/include/net/codel_qdisc.h b/include/net/codel_qdisc.h index 098630f83a55..7d3d9219f4fe 100644 --- a/include/net/codel_qdisc.h +++ b/include/net/codel_qdisc.h @@ -49,6 +49,9 @@ * Implemented on linux by Dave Taht and Eric Dumazet */ +#include <net/codel.h> +#include <net/pkt_sched.h> + /* Qdiscs using codel plugin must use codel_skb_cb in their own cb[] */ struct codel_skb_cb { codel_time_t enqueue_time; diff --git a/include/net/compat.h b/include/net/compat.h index 745db0d605b6..84c163f40f38 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -5,8 +5,6 @@ struct sock; -#if defined(CONFIG_COMPAT) - #include <linux/compat.h> struct compat_msghdr { @@ -48,17 +46,8 @@ struct compat_rtentry { unsigned short rt_irtt; /* Initial RTT */ }; -#else /* defined(CONFIG_COMPAT) */ -/* - * To avoid compiler warnings: - */ -#define compat_msghdr msghdr -#define compat_mmsghdr mmsghdr -#endif /* defined(CONFIG_COMPAT) */ - -int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, compat_uptr_t *ptr, - compat_size_t *len); +int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr *msg, + struct sockaddr __user **save_addr); int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, struct sockaddr __user **, struct iovec **); int put_cmsg_compat(struct msghdr*, int, int, int, void *); @@ -81,13 +70,26 @@ struct compat_group_source_req { } __packed; struct 
compat_group_filter { - __u32 gf_interface; - struct __kernel_sockaddr_storage gf_group - __aligned(4); - __u32 gf_fmode; - __u32 gf_numsrc; - struct __kernel_sockaddr_storage gf_slist[1] - __aligned(4); + union { + struct { + __u32 gf_interface_aux; + struct __kernel_sockaddr_storage gf_group_aux + __aligned(4); + __u32 gf_fmode_aux; + __u32 gf_numsrc_aux; + struct __kernel_sockaddr_storage gf_slist[1] + __aligned(4); + } __packed; + struct { + __u32 gf_interface; + struct __kernel_sockaddr_storage gf_group + __aligned(4); + __u32 gf_fmode; + __u32 gf_numsrc; + struct __kernel_sockaddr_storage gf_slist_flex[] + __aligned(4); + } __packed; + }; } __packed; #endif /* NET_COMPAT_H */ diff --git a/include/net/datalink.h b/include/net/datalink.h index a9663229b913..6c529a40e00d 100644 --- a/include/net/datalink.h +++ b/include/net/datalink.h @@ -2,6 +2,13 @@ #ifndef _NET_INET_DATALINK_H_ #define _NET_INET_DATALINK_H_ +#include <linux/list.h> + +struct llc_sap; +struct net_device; +struct packet_type; +struct sk_buff; + struct datalink_proto { unsigned char type[8]; @@ -12,10 +19,8 @@ struct datalink_proto { int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); int (*request)(struct datalink_proto *, struct sk_buff *, - unsigned char *); + const unsigned char *); struct list_head node; }; -struct datalink_proto *make_EII_client(void); -void destroy_EII_client(struct datalink_proto *dl); #endif diff --git a/include/net/dcbevent.h b/include/net/dcbevent.h index 43e34131a53f..02700262f71a 100644 --- a/include/net/dcbevent.h +++ b/include/net/dcbevent.h @@ -8,6 +8,8 @@ #ifndef _DCB_EVENT_H #define _DCB_EVENT_H +struct notifier_block; + enum dcbevent_notif_type { DCB_APP_EVENT = 1, }; diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index e4ad58c4062c..42207fc44660 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -10,6 +10,8 @@ #include <linux/dcbnl.h> +struct net_device; + struct dcb_app_type { int ifindex; struct dcb_app app; @@ -17,18 +19,32 @@ struct dcb_app_type { u8 dcbx; }; +u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app); +int dcb_setrewr(struct net_device *dev, struct dcb_app *app); +int dcb_delrewr(struct net_device *dev, struct dcb_app *app); + int dcb_setapp(struct net_device *, struct dcb_app *); u8 dcb_getapp(struct net_device *, struct dcb_app *); int dcb_ieee_setapp(struct net_device *, struct dcb_app *); int dcb_ieee_delapp(struct net_device *, struct dcb_app *); u8 dcb_ieee_getapp_mask(struct net_device *, struct dcb_app *); +struct dcb_rewr_prio_pcp_map { + u16 map[IEEE_8021QAZ_MAX_TCS]; +}; + +void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev, + struct dcb_rewr_prio_pcp_map *p_map); + struct dcb_ieee_app_prio_map { u64 map[IEEE_8021QAZ_MAX_TCS]; }; void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map); +void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev, + struct dcb_ieee_app_prio_map *p_map); + struct dcb_ieee_app_dscp_map { u8 map[64]; }; @@ -107,6 +123,14 @@ struct dcbnl_rtnl_ops { /* buffer settings */ int (*dcbnl_getbuffer)(struct net_device *, struct dcbnl_buffer *); int (*dcbnl_setbuffer)(struct net_device *, struct dcbnl_buffer *); + + /* apptrust */ + int (*dcbnl_setapptrust)(struct net_device *, u8 *, int); + int (*dcbnl_getapptrust)(struct net_device *, u8 *, int *); + + /* rewrite */ + int (*dcbnl_setrewr)(struct net_device *dev, struct dcb_app *app); + int (*dcbnl_delrewr)(struct net_device *dev, struct dcb_app 
*app); }; #endif /* __NET_DCBNL_H__ */ diff --git a/include/net/devlink.h b/include/net/devlink.h index b01bb9bca5a2..0091f23a40f7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -19,44 +19,10 @@ #include <net/flow_offload.h> #include <uapi/linux/devlink.h> #include <linux/xarray.h> +#include <linux/firmware.h> -#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ - (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) - -struct devlink_dev_stats { - u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; - u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; -}; - -struct devlink_ops; - -struct devlink { - struct list_head list; - struct list_head port_list; - struct list_head sb_list; - struct list_head dpipe_table_list; - struct list_head resource_list; - struct list_head param_list; - struct list_head region_list; - struct list_head reporter_list; - struct mutex reporters_lock; /* protects reporter_list */ - struct devlink_dpipe_headers *dpipe_headers; - struct list_head trap_list; - struct list_head trap_group_list; - struct list_head trap_policer_list; - const struct devlink_ops *ops; - struct xarray snapshot_ids; - struct devlink_dev_stats stats; - struct device *dev; - possible_net_t _net; - struct mutex lock; /* Serializes access to devlink instance specific objects such as - * port, sb, dpipe, resource, params, region, traps and more. - */ - u8 reload_failed:1, - reload_enabled:1, - registered:1; - char priv[0] __aligned(NETDEV_ALIGN); -}; +struct devlink; +struct devlink_linecard; struct devlink_port_phys_attrs { u32 port_number; /* Same value as "split group". @@ -69,7 +35,7 @@ struct devlink_port_phys_attrs { /** * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes * @controller: Associated controller number - * @pf: Associated PCI PF number for this port. + * @pf: associated PCI function number for the devlink port instance * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_pf_attrs { @@ -81,8 +47,9 @@ struct devlink_port_pci_pf_attrs { /** * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes * @controller: Associated controller number - * @pf: Associated PCI PF number for this port. - * @vf: Associated PCI VF for of the PCI PF for this port. 
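The new rewrite API mirrors the existing app-table helpers but maps in the egress direction. A sketch of installing a DSCP rewrite entry — following the app-table convention that @priority is the key and @protocol carries the value to write; the function name and error handling are illustrative:

static int example_add_dscp_rewrite(struct net_device *dev)
{
	struct dcb_app app = {
		.selector = IEEE_8021QAZ_APP_SEL_DSCP,
		.priority = 3,		/* traffic at priority 3 ...     */
		.protocol = 46,		/* ... gets DSCP 46 (EF) stamped */
	};

	return dcb_setrewr(dev, &app);
}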
+ * @pf: associated PCI function number for the devlink port instance + * @vf: associated PCI VF number of a PF for the devlink port instance; + * VF number starts from 0 for the first PCI virtual function * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_vf_attrs { @@ -93,6 +60,20 @@ struct devlink_port_pci_vf_attrs { }; /** + * struct devlink_port_pci_sf_attrs - devlink port's PCI SF attributes + * @controller: Associated controller number + * @sf: associated SF number of a PF for the devlink port instance + * @pf: associated PCI function number for the devlink port instance + * @external: when set, indicates if a port is for an external controller + */ +struct devlink_port_pci_sf_attrs { + u32 controller; + u32 sf; + u16 pf; + u8 external:1; +}; + +/** * struct devlink_port_attrs - devlink port object * @flavour: flavour of the port * @split: indicates if this is split port @@ -102,6 +83,7 @@ struct devlink_port_pci_vf_attrs { * @phys: physical port attributes * @pci_pf: PCI PF port attributes * @pci_vf: PCI VF port attributes + * @pci_sf: PCI SF port attributes */ struct devlink_port_attrs { u8 split:1, @@ -113,28 +95,108 @@ struct devlink_port_attrs { struct devlink_port_phys_attrs phys; struct devlink_port_pci_pf_attrs pci_pf; struct devlink_port_pci_vf_attrs pci_vf; + struct devlink_port_pci_sf_attrs pci_sf; }; }; +struct devlink_rate { + struct list_head list; + enum devlink_rate_type type; + struct devlink *devlink; + void *priv; + u64 tx_share; + u64 tx_max; + + struct devlink_rate *parent; + union { + struct devlink_port *devlink_port; + struct { + char *name; + refcount_t refcnt; + }; + }; + + u32 tx_priority; + u32 tx_weight; +}; + struct devlink_port { struct list_head list; - struct list_head param_list; struct list_head region_list; struct devlink *devlink; + const struct devlink_port_ops *ops; unsigned int index; - bool registered; - spinlock_t type_lock; /* Protects type and type_dev - * pointer consistency. + spinlock_t type_lock; /* Protects type and type_eth/ib + * structures consistency. */ enum devlink_port_type type; enum devlink_port_type desired_type; - void *type_dev; + union { + struct { + struct net_device *netdev; + int ifindex; + char ifname[IFNAMSIZ]; + } type_eth; + struct { + struct ib_device *ibdev; + } type_ib; + }; struct devlink_port_attrs attrs; u8 attrs_set:1, - switch_port:1; + switch_port:1, + registered:1, + initialized:1; struct delayed_work type_warn_dw; struct list_head reporter_list; - struct mutex reporters_lock; /* Protects reporter_list */ + + struct devlink_rate *devlink_rate; + struct devlink_linecard *linecard; + u32 rel_index; +}; + +struct devlink_port_new_attrs { + enum devlink_port_flavour flavour; + unsigned int port_index; + u32 controller; + u32 sfnum; + u16 pfnum; + u8 port_index_valid:1, + controller_valid:1, + sfnum_valid:1; +}; + +/** + * struct devlink_linecard_ops - Linecard operations + * @provision: callback to provision the linecard slot with certain + * type of linecard. As a result of this operation, + * driver is expected to eventually (could be after + * the function call returns) call one of: + * devlink_linecard_provision_set() + * devlink_linecard_provision_fail() + * @unprovision: callback to unprovision the linecard slot. 
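Putting the new SF attributes to use is a two-step driver affair: set the PCI SF identity on the port, then register it. A sketch assuming controller 0 and PF 0, using devlink_port_attrs_pci_sf_set() and the locked devl_port_register() variant declared further down (the caller is expected to hold the devlink instance lock for devl_-prefixed calls):

static int example_register_sf_port(struct devlink *devlink,
				    struct devlink_port *port,
				    unsigned int port_index, u32 sfnum)
{
	/* controller 0, PF 0, SF number sfnum, not on an external controller */
	devlink_port_attrs_pci_sf_set(port, 0, 0, sfnum, false);
	return devl_port_register(devlink, port, port_index);
}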
As a result + * of this operation, driver is expected to eventually + * (could be after the function call returns) call + * devlink_linecard_provision_clear() + * devlink_linecard_provision_fail() + * @same_provision: callback to ask the driver if linecard is already + * provisioned in the same way user asks this linecard to be + * provisioned. + * @types_count: callback to get number of supported types + * @types_get: callback to get next type in list + */ +struct devlink_linecard_ops { + int (*provision)(struct devlink_linecard *linecard, void *priv, + const char *type, const void *type_priv, + struct netlink_ext_ack *extack); + int (*unprovision)(struct devlink_linecard *linecard, void *priv, + struct netlink_ext_ack *extack); + bool (*same_provision)(struct devlink_linecard *linecard, void *priv, + const char *type, const void *type_priv); + unsigned int (*types_count)(struct devlink_linecard *linecard, + void *priv); + void (*types_get)(struct devlink_linecard *linecard, + void *priv, unsigned int index, const char **type, + const void **type_priv); }; struct devlink_sb_pool_info { @@ -161,7 +223,7 @@ struct devlink_dpipe_field { /** * struct devlink_dpipe_header - dpipe header object * @name: header name - * @id: index, global/local detrmined by global bit + * @id: index, global/local determined by global bit * @fields: fields * @fields_count: number of fields * @global: indicates if header is shared like most protocol header @@ -181,7 +243,7 @@ struct devlink_dpipe_header { * @header_index: header index (packets can have several headers of same * type like in case of tunnels) * @header: header - * @fieled_id: field index + * @field_id: field index */ struct devlink_dpipe_match { enum devlink_dpipe_match_type type; @@ -196,7 +258,7 @@ struct devlink_dpipe_match { * @header_index: header index (packets can have several headers of same * type like in case of tunnels) * @header: header - * @fieled_id: field index + * @field_id: field index */ struct devlink_dpipe_action { enum devlink_dpipe_action_type type; @@ -232,7 +294,7 @@ struct devlink_dpipe_value { * struct devlink_dpipe_entry - table entry object * @index: index of the entry in the table * @match_values: match values - * @matche_values_count: count of matches tuples + * @match_values_count: count of matches tuples * @action_values: actions values * @action_values_count: count of actions values * @counter: value of counter @@ -282,26 +344,28 @@ struct devlink_dpipe_table_ops; */ struct devlink_dpipe_table { void *priv; + /* private: */ struct list_head list; + /* public: */ const char *name; bool counters_enabled; bool counter_control_extern; bool resource_valid; u64 resource_id; u64 resource_units; - struct devlink_dpipe_table_ops *table_ops; + const struct devlink_dpipe_table_ops *table_ops; struct rcu_head rcu; }; /** * struct devlink_dpipe_table_ops - dpipe_table ops - * @actions_dump - dumps all tables actions - * @matches_dump - dumps all tables matches - * @entries_dump - dumps all active entries in the table - * @counters_set_update - when changing the counter status hardware sync + * @actions_dump: dumps all tables actions + * @matches_dump: dumps all tables matches + * @entries_dump: dumps all active entries in the table + * @counters_set_update: when changing the counter status hardware sync * maybe needed to allocate/free counter related * resources - * @size_get - get size + * @size_get: get size */ struct devlink_dpipe_table_ops { int (*actions_dump)(void *priv, struct sk_buff *skb); @@ -314,8 +378,8 @@ struct 
devlink_dpipe_table_ops { /** * struct devlink_dpipe_headers - dpipe headers - * @headers - header array can be shared (global bit) or driver specific - * @headers_count - count of headers + * @headers: header array can be shared (global bit) or driver specific + * @headers_count: count of headers */ struct devlink_dpipe_headers { struct devlink_dpipe_header **headers; @@ -327,7 +391,7 @@ struct devlink_dpipe_headers { * @size_min: minimum size which can be set * @size_max: maximum size which can be set * @size_granularity: size granularity - * @size_unit: resource's basic unit + * @unit: resource's basic unit */ struct devlink_resource_size_params { u64 size_min; @@ -350,42 +414,17 @@ devlink_resource_size_params_init(struct devlink_resource_size_params *size_para typedef u64 devlink_resource_occ_get_t(void *priv); -/** - * struct devlink_resource - devlink resource - * @name: name of the resource - * @id: id, per devlink instance - * @size: size of the resource - * @size_new: updated size of the resource, reload is needed - * @size_valid: valid in case the total size of the resource is valid - * including its children - * @parent: parent resource - * @size_params: size parameters - * @list: parent list - * @resource_list: list of child resources - */ -struct devlink_resource { - const char *name; - u64 id; - u64 size; - u64 size_new; - bool size_valid; - struct devlink_resource *parent; - struct devlink_resource_size_params size_params; - struct list_head list; - struct list_head resource_list; - devlink_resource_occ_get_t *occ_get; - void *occ_get_priv; -}; - #define DEVLINK_RESOURCE_ID_PARENT_TOP 0 +#define DEVLINK_RESOURCE_GENERIC_NAME_PORTS "physical_ports" + #define __DEVLINK_PARAM_MAX_STRING_VALUE 32 enum devlink_param_type { - DEVLINK_PARAM_TYPE_U8, - DEVLINK_PARAM_TYPE_U16, - DEVLINK_PARAM_TYPE_U32, - DEVLINK_PARAM_TYPE_STRING, - DEVLINK_PARAM_TYPE_BOOL, + DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8, + DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16, + DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32, + DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING, + DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG, }; union devlink_param_value { @@ -422,6 +461,7 @@ struct devlink_flash_notify { /** * struct devlink_param - devlink configuration parameter data + * @id: devlink parameter id number * @name: name of the parameter * @generic: indicates if the parameter is generic or driver specific * @type: parameter type @@ -444,7 +484,8 @@ struct devlink_param { int (*get)(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx); int (*set)(struct devlink *devlink, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int (*validate)(struct devlink *devlink, u32 id, union devlink_param_value val, struct netlink_ext_ack *extack); @@ -455,7 +496,10 @@ struct devlink_param_item { const struct devlink_param *param; union devlink_param_value driverinit_value; bool driverinit_value_valid; - bool published; + union devlink_param_value driverinit_value_new; /* Not reachable + * until reload. 
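Note the .set callback now takes an extack argument. A driver-side sketch of a generic parameter against the updated signature, using the DEVLINK_PARAM_GENERIC() helper defined below; the callback bodies are illustrative:

static int ex_roce_get(struct devlink *devlink, u32 id,
		       struct devlink_param_gset_ctx *ctx)
{
	ctx->val.vbool = true;		/* report the current state */
	return 0;
}

static int ex_roce_set(struct devlink *devlink, u32 id,
		       struct devlink_param_gset_ctx *ctx,
		       struct netlink_ext_ack *extack)
{
	/* apply ctx->val.vbool to the device here */
	return 0;
}

static const struct devlink_param ex_params[] = {
	DEVLINK_PARAM_GENERIC(ENABLE_ROCE, BIT(DEVLINK_PARAM_CMODE_RUNTIME),
			      ex_roce_get, ex_roce_set, NULL),
};

The array registers with devl_params_register(devlink, ex_params, ARRAY_SIZE(ex_params)) under the instance lock; DRIVERINIT-cmode values are fetched during reload via devl_param_driverinit_value_get(), both declared further down.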
+ */ + bool driverinit_value_new_valid; }; enum devlink_param_generic_id { @@ -470,6 +514,12 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, + DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, + DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, + DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, + DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, + DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -510,6 +560,24 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME "enable_remote_dev_reset" #define DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE DEVLINK_PARAM_TYPE_BOOL +#define DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME "enable_eth" +#define DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE DEVLINK_PARAM_TYPE_BOOL + +#define DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME "enable_rdma" +#define DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE DEVLINK_PARAM_TYPE_BOOL + +#define DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME "enable_vnet" +#define DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE DEVLINK_PARAM_TYPE_BOOL + +#define DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME "enable_iwarp" +#define DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE DEVLINK_PARAM_TYPE_BOOL + +#define DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME "io_eq_size" +#define DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE DEVLINK_PARAM_TYPE_U32 + +#define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME "event_eq_size" +#define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ @@ -533,12 +601,14 @@ enum devlink_param_generic_id { .validate = _validate, \ } -/* Part number, identifier of board design */ +/* Identifier of board design */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_ID "board.id" /* Revision of board design */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_REV "board.rev" /* Maker of the board */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_MANUFACTURE "board.manufacture" +/* Part number of the board and its components */ +#define DEVLINK_INFO_VERSION_GENERIC_BOARD_PART_NUMBER "board.part_number" /* Part number, identifier of asic design */ #define DEVLINK_INFO_VERSION_GENERIC_ASIC_ID "asic.id" @@ -563,24 +633,26 @@ enum devlink_param_generic_id { #define DEVLINK_INFO_VERSION_GENERIC_FW_ROCE "fw.roce" /* Firmware bundle identifier */ #define DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID "fw.bundle_id" +/* Bootloader */ +#define DEVLINK_INFO_VERSION_GENERIC_FW_BOOTLOADER "fw.bootloader" /** * struct devlink_flash_update_params - Flash Update parameters - * @file_name: the name of the flash firmware file to update from + * @fw: pointer to the firmware data to update from * @component: the flash component to update + * @overwrite_mask: which types of flash update are supported (may be %0) * - * With the exception of file_name, drivers must opt-in to parameters by + * With the exception of fw, drivers must opt-in to parameters by * setting the appropriate bit in the supported_flash_update_params field in * their devlink_ops structure. 
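Since devlink core now hands the driver a pre-requested firmware object rather than a file name, a .flash_update implementation reduces to a write loop over params->fw->data. A sketch against the new parameters — the chunked write is illustrative, progress reporting uses devlink_flash_update_status_notify() as declared below:

static int example_flash_update(struct devlink *devlink,
				struct devlink_flash_update_params *params,
				struct netlink_ext_ack *extack)
{
	const struct firmware *fw = params->fw;
	size_t off, chunk = 4096;

	for (off = 0; off < fw->size; off += chunk) {
		/* ... write min(chunk, fw->size - off) bytes at off ... */
		devlink_flash_update_status_notify(devlink, "Flashing",
						   params->component,
						   off, fw->size);
	}
	return 0;
}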
*/ struct devlink_flash_update_params { - const char *file_name; + const struct firmware *fw; const char *component; u32 overwrite_mask; }; -#define DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT BIT(0) -#define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(1) +#define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(0) struct devlink_region; struct devlink_info_req; @@ -593,6 +665,10 @@ struct devlink_info_req; * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @read: callback to directly read a portion of the region. On success, + * the data pointer will be updated with the contents of the + * requested portion of the region. The function will be called + * while the devlink instance lock is held. * @priv: Pointer to driver private data for the region operation */ struct devlink_region_ops { @@ -602,6 +678,10 @@ struct devlink_region_ops { const struct devlink_region_ops *ops, struct netlink_ext_ack *extack, u8 **data); + int (*read)(struct devlink *devlink, + const struct devlink_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data); void *priv; }; @@ -613,6 +693,10 @@ struct devlink_region_ops { * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @read: callback to directly read a portion of the region. On success, + * the data pointer will be updated with the contents of the + * requested portion of the region. The function will be called + * while the devlink instance lock is held. * @priv: Pointer to driver private data for the region operation */ struct devlink_port_region_ops { @@ -622,6 +706,10 @@ struct devlink_port_region_ops { const struct devlink_port_region_ops *ops, struct netlink_ext_ack *extack, u8 **data); + int (*read)(struct devlink_port *port, + const struct devlink_port_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data); void *priv; }; @@ -663,13 +751,17 @@ struct devlink_health_reporter_ops { * @trap_name: Trap name. * @trap_group_name: Trap group name. * @input_dev: Input netdevice. + * @dev_tracker: refcount tracker for @input_dev. * @fa_cookie: Flow action user cookie. * @trap_type: Trap type. 
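The new .read callback lets userspace dump a window of a region directly, without first taking a snapshot. A sketch of a region backed by a flat buffer; the region name and backing store are illustrative, and devlink core is expected to validate @offset/@size against the size passed at region creation:

static u8 ex_device_state[4096];	/* hypothetical device state mirror */

static int ex_region_read(struct devlink *devlink,
			  const struct devlink_region_ops *ops,
			  struct netlink_ext_ack *extack,
			  u64 offset, u32 size, u8 *data)
{
	memcpy(data, ex_device_state + offset, size);
	return 0;
}

static const struct devlink_region_ops ex_region_ops = {
	.name = "ex-state",
	.read = ex_region_read,
};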
*/ struct devlink_trap_metadata { const char *trap_name; const char *trap_group_name; + struct net_device *input_dev; + netdevice_tracker dev_tracker; + const struct flow_action_cookie *fa_cookie; enum devlink_trap_type trap_type; }; @@ -834,6 +926,10 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_DCCP_PARSING, DEVLINK_TRAP_GENERIC_ID_GTP_PARSING, DEVLINK_TRAP_GENERIC_ID_ESP_PARSING, + DEVLINK_TRAP_GENERIC_ID_BLACKHOLE_NEXTHOP, + DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER, + DEVLINK_TRAP_GENERIC_ID_EAPOL, + DEVLINK_TRAP_GENERIC_ID_LOCKED_PORT, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -870,6 +966,7 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_SAMPLE, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_TRAP, DEVLINK_TRAP_GROUP_GENERIC_ID_PARSER_ERROR_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_EAPOL, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -1057,7 +1154,14 @@ enum devlink_trap_group_generic_id { "gtp_parsing" #define DEVLINK_TRAP_GENERIC_NAME_ESP_PARSING \ "esp_parsing" - +#define DEVLINK_TRAP_GENERIC_NAME_BLACKHOLE_NEXTHOP \ + "blackhole_nexthop" +#define DEVLINK_TRAP_GENERIC_NAME_DMAC_FILTER \ + "dmac_filter" +#define DEVLINK_TRAP_GENERIC_NAME_EAPOL \ + "eapol" +#define DEVLINK_TRAP_GENERIC_NAME_LOCKED_PORT \ + "locked_port" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -1111,6 +1215,8 @@ enum devlink_trap_group_generic_id { "acl_trap" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_PARSER_ERROR_DROPS \ "parser_error_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_EAPOL \ + "eapol" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ @@ -1156,11 +1262,28 @@ enum devlink_trap_group_generic_id { .min_burst = _min_burst, \ } +#define devlink_fmsg_put(fmsg, name, value) ( \ + _Generic((value), \ + bool : devlink_fmsg_bool_pair_put, \ + u8 : devlink_fmsg_u8_pair_put, \ + u16 : devlink_fmsg_u32_pair_put, \ + u32 : devlink_fmsg_u32_pair_put, \ + u64 : devlink_fmsg_u64_pair_put, \ + int : devlink_fmsg_u32_pair_put, \ + char * : devlink_fmsg_string_pair_put, \ + const char * : devlink_fmsg_string_pair_put) \ + (fmsg, name, (value))) + +enum { + /* device supports reload operations */ + DEVLINK_F_RELOAD = 1UL << 0, +}; + struct devlink_ops { /** * @supported_flash_update_params: * mask of parameters supported by the driver's .flash_update - * implemementation. + * implementation. */ u32 supported_flash_update_params; unsigned long reload_actions; @@ -1172,12 +1295,6 @@ struct devlink_ops { int (*reload_up)(struct devlink *devlink, enum devlink_reload_action action, enum devlink_reload_limit limit, u32 *actions_performed, struct netlink_ext_ack *extack); - int (*port_type_set)(struct devlink_port *devlink_port, - enum devlink_port_type port_type); - int (*port_split)(struct devlink *devlink, unsigned int port_index, - unsigned int count, struct netlink_ext_ack *extack); - int (*port_unsplit)(struct devlink *devlink, unsigned int port_index, - struct netlink_ext_ack *extack); int (*sb_pool_get)(struct devlink *devlink, unsigned int sb_index, u16 pool_index, struct devlink_sb_pool_info *pool_info); @@ -1292,6 +1409,16 @@ struct devlink_ops { enum devlink_trap_action action, struct netlink_ext_ack *extack); /** + * @trap_drop_counter_get: Trap drop counter get function. + * + * Should be used by device drivers to report number of packets + * that have been dropped, and cannot be passed to the devlink + * subsystem by the underlying device. 
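The devlink_fmsg_put() macro above uses _Generic to dispatch on the value's type, so one spelling covers all the pair-put helpers. A sketch of it inside a health reporter's dump callback — field names are illustrative; note the fmsg helpers now return void, with errors collected internally and reported when the message is finalized:

static int ex_reporter_dump(struct devlink_health_reporter *reporter,
			    struct devlink_fmsg *fmsg, void *priv_ctx,
			    struct netlink_ext_ack *extack)
{
	const char *queue = "txq-3";

	devlink_fmsg_obj_nest_start(fmsg);
	devlink_fmsg_put(fmsg, "tx_timeouts", (u32)42);	/* -> u32 pair    */
	devlink_fmsg_put(fmsg, "stalled", true);	/* -> bool pair   */
	devlink_fmsg_put(fmsg, "queue", queue);		/* -> string pair */
	devlink_fmsg_obj_nest_end(fmsg);
	return 0;
}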
+ */ + int (*trap_drop_counter_get)(struct devlink *devlink, + const struct devlink_trap *trap, + u64 *p_drops); + /** * @trap_policer_init: Trap policer initialization function. * * Should be used by device drivers to initialize the trap policer in @@ -1324,74 +1451,272 @@ struct devlink_ops { const struct devlink_trap_policer *policer, u64 *p_drops); /** - * @port_function_hw_addr_get: Port function's hardware address get function. + * port_new() - Add a new port function of a specified flavor + * @devlink: Devlink instance + * @attrs: attributes of the new port + * @extack: extack for reporting error messages + * @devlink_port: pointer to store new devlink port pointer + * + * Devlink core will call this device driver function upon user request + * to create a new port function of a specified flavor and optional + * attributes * - * Should be used by device drivers to report the hardware address of a function managed - * by the devlink port. Driver should return -EOPNOTSUPP if it doesn't support port - * function handling for a particular port. + * Notes: + * - On success, drivers must register a port with devlink core * - * Note: @extack can be NULL when port notifier queries the port function. + * Return: 0 on success, negative value otherwise. */ - int (*port_function_hw_addr_get)(struct devlink *devlink, struct devlink_port *port, - u8 *hw_addr, int *hw_addr_len, - struct netlink_ext_ack *extack); + int (*port_new)(struct devlink *devlink, + const struct devlink_port_new_attrs *attrs, + struct netlink_ext_ack *extack, + struct devlink_port **devlink_port); + /** - * @port_function_hw_addr_set: Port function's hardware address set function. + * Rate control callbacks. + */ + int (*rate_leaf_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, + u64 tx_share, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, + u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_priority, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, + u64 tx_share, struct netlink_ext_ack *extack); + int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, + u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_node_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_priority, struct netlink_ext_ack *extack); + int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, + struct netlink_ext_ack *extack); + int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, + struct netlink_ext_ack *extack); + int (*rate_leaf_parent_set)(struct devlink_rate *child, + struct devlink_rate *parent, + void *priv_child, void *priv_parent, + struct netlink_ext_ack *extack); + int (*rate_node_parent_set)(struct devlink_rate *child, + struct devlink_rate *parent, + void *priv_child, void *priv_parent, + struct netlink_ext_ack *extack); + /** + * selftests_check() - queries if selftest is supported + * @devlink: devlink instance + * @id: test index + * @extack: extack for reporting error messages * - * Should be used by device drivers to set the hardware address of a function managed - * by the devlink port. 
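For the rate-object callbacks above, the driver receives its own priv pointer alongside the requested value. A minimal tx_max sketch — struct ex_vport, EX_MIN_RATE_BYTES_PS and ex_hw_set_max_rate() are hypothetical driver internals, not devlink API:

struct ex_vport;			/* hypothetical driver object */
static int ex_hw_set_max_rate(struct ex_vport *vport, u64 bps);

#define EX_MIN_RATE_BYTES_PS	125000	/* hypothetical 1 Mbit/s floor */

static int ex_rate_leaf_tx_max_set(struct devlink_rate *devlink_rate,
				   void *priv, u64 tx_max,
				   struct netlink_ext_ack *extack)
{
	struct ex_vport *vport = priv;

	if (tx_max && tx_max < EX_MIN_RATE_BYTES_PS) {
		NL_SET_ERR_MSG_MOD(extack, "tx_max below device minimum");
		return -EINVAL;
	}
	return ex_hw_set_max_rate(vport, tx_max);
}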
Driver should return -EOPNOTSUPP if it doesn't support port - * function handling for a particular port. + * Return: true if test is supported by the driver */ - int (*port_function_hw_addr_set)(struct devlink *devlink, struct devlink_port *port, - const u8 *hw_addr, int hw_addr_len, - struct netlink_ext_ack *extack); + bool (*selftest_check)(struct devlink *devlink, unsigned int id, + struct netlink_ext_ack *extack); + /** + * selftest_run() - Runs a selftest + * @devlink: devlink instance + * @id: test index + * @extack: extack for reporting error messages + * + * Return: status of the test + */ + enum devlink_selftest_status + (*selftest_run)(struct devlink *devlink, unsigned int id, + struct netlink_ext_ack *extack); }; -static inline void *devlink_priv(struct devlink *devlink) -{ - BUG_ON(!devlink); - return &devlink->priv; -} +void *devlink_priv(struct devlink *devlink); +struct devlink *priv_to_devlink(void *priv); +struct device *devlink_to_dev(const struct devlink *devlink); -static inline struct devlink *priv_to_devlink(void *priv) +/* Devlink instance explicit locking */ +void devl_lock(struct devlink *devlink); +int devl_trylock(struct devlink *devlink); +void devl_unlock(struct devlink *devlink); +void devl_assert_locked(struct devlink *devlink); +bool devl_lock_is_held(struct devlink *devlink); +DEFINE_GUARD(devl, struct devlink *, devl_lock(_T), devl_unlock(_T)); + +struct ib_device; + +struct net *devlink_net(const struct devlink *devlink); +/* This call is intended for software devices that can create + * devlink instances in other namespaces than init_net. + * + * Drivers that operate on real HW must use devlink_alloc() instead. + */ +struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, + size_t priv_size, struct net *net, + struct device *dev); +static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, + size_t priv_size, + struct device *dev) { - BUG_ON(!priv); - return container_of(priv, struct devlink, priv); + return devlink_alloc_ns(ops, priv_size, &init_net, dev); } -static inline struct devlink_port * -netdev_to_devlink_port(struct net_device *dev) +int devl_register(struct devlink *devlink); +void devl_unregister(struct devlink *devlink); +void devlink_register(struct devlink *devlink); +void devlink_unregister(struct devlink *devlink); +void devlink_free(struct devlink *devlink); + +/** + * struct devlink_port_ops - Port operations + * @port_split: Callback used to split the port into multiple ones. + * @port_unsplit: Callback used to unsplit the port group back into + * a single port. + * @port_type_set: Callback used to set a type of a port. + * @port_del: Callback used to delete selected port along with related function. + * Devlink core calls this upon user request to delete + * a port previously created by devlink_ops->port_new(). + * @port_fn_hw_addr_get: Callback used to set port function's hardware address. + * Should be used by device drivers to report + * the hardware address of a function managed + * by the devlink port. + * @port_fn_hw_addr_set: Callback used to set port function's hardware address. + * Should be used by device drivers to set the hardware + * address of a function managed by the devlink port. + * @port_fn_roce_get: Callback used to get port function's RoCE capability. + * Should be used by device drivers to report + * the current state of RoCE capability of a function + * managed by the devlink port. + * @port_fn_roce_set: Callback used to set port function's RoCE capability. 
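With devlink_priv()/priv_to_devlink() now out-of-line and registration split into locked (devl_) and self-locking variants, a probe path looks roughly like the sketch below. The ops table and priv layout are illustrative; devlink_alloc() pins the instance to init_net via devlink_alloc_ns(), as the comment above notes:

static const struct devlink_ops ex_devlink_ops = { /* no ops needed */ };

struct ex_priv {
	struct devlink_port port;
};

static int ex_probe(struct device *dev)
{
	struct devlink *devlink;
	struct ex_priv *priv;

	devlink = devlink_alloc(&ex_devlink_ops, sizeof(*priv), dev);
	if (!devlink)
		return -ENOMEM;
	priv = devlink_priv(devlink);	/* driver-private area follows */

	/* ... set up priv, ports, params ... */

	devlink_register(devlink);	/* self-locking variant, returns void */
	return 0;
}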
+ * Should be used by device drivers to enable/disable + * RoCE capability of a function managed + * by the devlink port. + * @port_fn_migratable_get: Callback used to get port function's migratable + * capability. Should be used by device drivers + * to report the current state of migratable capability + * of a function managed by the devlink port. + * @port_fn_migratable_set: Callback used to set port function's migratable + * capability. Should be used by device drivers + * to enable/disable migratable capability of + * a function managed by the devlink port. + * @port_fn_state_get: Callback used to get port function's state. + * Should be used by device drivers to report + * the current admin and operational state of a + * function managed by the devlink port. + * @port_fn_state_set: Callback used to get port function's state. + * Should be used by device drivers set + * the admin state of a function managed + * by the devlink port. + * @port_fn_ipsec_crypto_get: Callback used to get port function's ipsec_crypto + * capability. Should be used by device drivers + * to report the current state of ipsec_crypto + * capability of a function managed by the devlink + * port. + * @port_fn_ipsec_crypto_set: Callback used to set port function's ipsec_crypto + * capability. Should be used by device drivers to + * enable/disable ipsec_crypto capability of a + * function managed by the devlink port. + * @port_fn_ipsec_packet_get: Callback used to get port function's ipsec_packet + * capability. Should be used by device drivers + * to report the current state of ipsec_packet + * capability of a function managed by the devlink + * port. + * @port_fn_ipsec_packet_set: Callback used to set port function's ipsec_packet + * capability. Should be used by device drivers to + * enable/disable ipsec_packet capability of a + * function managed by the devlink port. + * @port_fn_max_io_eqs_get: Callback used to get port function's maximum number + * of event queues. Should be used by device drivers to + * report the maximum event queues of a function + * managed by the devlink port. + * @port_fn_max_io_eqs_set: Callback used to set port function's maximum number + * of event queues. Should be used by device drivers to + * configure maximum number of event queues + * of a function managed by the devlink port. + * + * Note: Driver should return -EOPNOTSUPP if it doesn't support + * port function (@port_fn_*) handling for a particular port. 
+ */ +struct devlink_port_ops { + int (*port_split)(struct devlink *devlink, struct devlink_port *port, + unsigned int count, struct netlink_ext_ack *extack); + int (*port_unsplit)(struct devlink *devlink, struct devlink_port *port, + struct netlink_ext_ack *extack); + int (*port_type_set)(struct devlink_port *devlink_port, + enum devlink_port_type port_type); + int (*port_del)(struct devlink *devlink, struct devlink_port *port, + struct netlink_ext_ack *extack); + int (*port_fn_hw_addr_get)(struct devlink_port *port, u8 *hw_addr, + int *hw_addr_len, + struct netlink_ext_ack *extack); + int (*port_fn_hw_addr_set)(struct devlink_port *port, + const u8 *hw_addr, int hw_addr_len, + struct netlink_ext_ack *extack); + int (*port_fn_roce_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + int (*port_fn_roce_set)(struct devlink_port *devlink_port, + bool enable, struct netlink_ext_ack *extack); + int (*port_fn_migratable_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + int (*port_fn_migratable_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); + int (*port_fn_state_get)(struct devlink_port *port, + enum devlink_port_fn_state *state, + enum devlink_port_fn_opstate *opstate, + struct netlink_ext_ack *extack); + int (*port_fn_state_set)(struct devlink_port *port, + enum devlink_port_fn_state state, + struct netlink_ext_ack *extack); + int (*port_fn_ipsec_crypto_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + int (*port_fn_ipsec_crypto_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); + int (*port_fn_ipsec_packet_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + int (*port_fn_ipsec_packet_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); + int (*port_fn_max_io_eqs_get)(struct devlink_port *devlink_port, + u32 *max_eqs, + struct netlink_ext_ack *extack); + int (*port_fn_max_io_eqs_set)(struct devlink_port *devlink_port, + u32 max_eqs, + struct netlink_ext_ack *extack); +}; + +void devlink_port_init(struct devlink *devlink, + struct devlink_port *devlink_port); +void devlink_port_fini(struct devlink_port *devlink_port); + +int devl_port_register_with_ops(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index, + const struct devlink_port_ops *ops); + +static inline int devl_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index) { - if (dev->netdev_ops->ndo_get_devlink_port) - return dev->netdev_ops->ndo_get_devlink_port(dev); - return NULL; + return devl_port_register_with_ops(devlink, devlink_port, + port_index, NULL); } -static inline struct devlink *netdev_to_devlink(struct net_device *dev) -{ - struct devlink_port *devlink_port = netdev_to_devlink_port(dev); +int devlink_port_register_with_ops(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index, + const struct devlink_port_ops *ops); - if (devlink_port) - return devlink_port->devlink; - return NULL; +static inline int devlink_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index) +{ + return devlink_port_register_with_ops(devlink, devlink_port, + port_index, NULL); } -struct ib_device; - -struct net *devlink_net(const struct devlink *devlink); -void devlink_net_set(struct devlink *devlink, 
struct net *net); -struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); -int devlink_register(struct devlink *devlink, struct device *dev); -void devlink_unregister(struct devlink *devlink); -void devlink_reload_enable(struct devlink *devlink); -void devlink_reload_disable(struct devlink *devlink); -void devlink_free(struct devlink *devlink); -int devlink_port_register(struct devlink *devlink, - struct devlink_port *devlink_port, - unsigned int port_index); +void devl_port_unregister(struct devlink_port *devlink_port); void devlink_port_unregister(struct devlink_port *devlink_port); -void devlink_port_type_eth_set(struct devlink_port *devlink_port, - struct net_device *netdev); +void devlink_port_type_eth_set(struct devlink_port *devlink_port); void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); @@ -1401,20 +1726,52 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro u16 pf, bool external); void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external); +void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, + u32 controller, u16 pf, u32 sf, + bool external); +int devl_port_fn_devlink_set(struct devlink_port *devlink_port, + struct devlink *fn_devlink); +struct devlink_rate * +devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, + struct devlink_rate *parent); +int +devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, + struct devlink_rate *parent); +void devl_rate_leaf_destroy(struct devlink_port *devlink_port); +void devl_rate_nodes_destroy(struct devlink *devlink); +void devlink_port_linecard_set(struct devlink_port *devlink_port, + struct devlink_linecard *linecard); +struct devlink_linecard * +devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv); +void devl_linecard_destroy(struct devlink_linecard *linecard); +void devlink_linecard_provision_set(struct devlink_linecard *linecard, + const char *type); +void devlink_linecard_provision_clear(struct devlink_linecard *linecard); +void devlink_linecard_provision_fail(struct devlink_linecard *linecard); +void devlink_linecard_activate(struct devlink_linecard *linecard); +void devlink_linecard_deactivate(struct devlink_linecard *linecard); +int devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, + struct devlink *nested_devlink); +int devl_sb_register(struct devlink *devlink, unsigned int sb_index, + u32 size, u16 ingress_pools_count, + u16 egress_pools_count, u16 ingress_tc_count, + u16 egress_tc_count); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, u16 egress_tc_count); +void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index); void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index); -int devlink_dpipe_table_register(struct devlink *devlink, - const char *table_name, - struct devlink_dpipe_table_ops *table_ops, - void *priv, bool counter_control_extern); -void devlink_dpipe_table_unregister(struct devlink *devlink, - const char *table_name); -int devlink_dpipe_headers_register(struct devlink *devlink, - struct devlink_dpipe_headers *dpipe_headers); -void devlink_dpipe_headers_unregister(struct devlink *devlink); +int 
devl_dpipe_table_register(struct devlink *devlink, + const char *table_name, + const struct devlink_dpipe_table_ops *table_ops, + void *priv, bool counter_control_extern); +void devl_dpipe_table_unregister(struct devlink *devlink, + const char *table_name); +void devl_dpipe_headers_register(struct devlink *devlink, + struct devlink_dpipe_headers *dpipe_headers); +void devl_dpipe_headers_unregister(struct devlink *devlink); bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, const char *table_name); int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx); @@ -1430,56 +1787,47 @@ extern struct devlink_dpipe_header devlink_dpipe_header_ethernet; extern struct devlink_dpipe_header devlink_dpipe_header_ipv4; extern struct devlink_dpipe_header devlink_dpipe_header_ipv6; -int devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params); -void devlink_resources_unregister(struct devlink *devlink, - struct devlink_resource *resource); -int devlink_resource_size_get(struct devlink *devlink, - u64 resource_id, - u64 *p_resource_size); -int devlink_dpipe_table_resource_set(struct devlink *devlink, - const char *table_name, u64 resource_id, - u64 resource_units); -void devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv); -void devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id); +int devl_resource_register(struct devlink *devlink, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params); +void devl_resources_unregister(struct devlink *devlink); +void devlink_resources_unregister(struct devlink *devlink); +int devl_resource_size_get(struct devlink *devlink, + u64 resource_id, + u64 *p_resource_size); +int devl_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units); +void devl_resource_occ_get_register(struct devlink *devlink, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv); +void devl_resource_occ_get_unregister(struct devlink *devlink, + u64 resource_id); +int devl_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count); int devlink_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count); +void devl_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count); void devlink_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count); -void devlink_params_publish(struct devlink *devlink); -void devlink_params_unpublish(struct devlink *devlink); -int devlink_port_params_register(struct devlink_port *devlink_port, - const struct devlink_param *params, - size_t params_count); -void devlink_port_params_unregister(struct devlink_port *devlink_port, - const struct devlink_param *params, - size_t params_count); -int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, - union devlink_param_value *init_val); -int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, - union devlink_param_value init_val); -int -devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port, - u32 param_id, - union devlink_param_value 
*init_val); -int devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port, - u32 param_id, - union devlink_param_value init_val); -void devlink_param_value_changed(struct devlink *devlink, u32 param_id); -void devlink_port_param_value_changed(struct devlink_port *devlink_port, - u32 param_id); -void devlink_param_value_str_fill(union devlink_param_value *dst_val, - const char *src); +int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *val); +void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val); +void devl_param_value_changed(struct devlink *devlink, u32 param_id); +struct devlink_region *devl_region_create(struct devlink *devlink, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, + u64 region_size); struct devlink_region * devlink_region_create(struct devlink *devlink, const struct devlink_region_ops *ops, @@ -1488,78 +1836,98 @@ struct devlink_region * devlink_port_region_create(struct devlink_port *port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size); +void devl_region_destroy(struct devlink_region *region); void devlink_region_destroy(struct devlink_region *region); -void devlink_port_region_destroy(struct devlink_region *region); - int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id); void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id); int devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id); int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn); -int devlink_info_driver_name_put(struct devlink_info_req *req, - const char *name); int devlink_info_board_serial_number_put(struct devlink_info_req *req, const char *bsn); + +enum devlink_info_version_type { + DEVLINK_INFO_VERSION_TYPE_NONE, + DEVLINK_INFO_VERSION_TYPE_COMPONENT, /* May be used as flash update + * component by name. 
+ */ +}; + int devlink_info_version_fixed_put(struct devlink_info_req *req, const char *version_name, const char *version_value); int devlink_info_version_stored_put(struct devlink_info_req *req, const char *version_name, const char *version_value); +int devlink_info_version_stored_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type); int devlink_info_version_running_put(struct devlink_info_req *req, const char *version_name, const char *version_value); - -int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg); -int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg); - -int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name); -int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg); - -int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, - const char *name); -int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg); -int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, - const char *name); -int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg); - -int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value); -int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value); -int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value); -int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value); -int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value); -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len); - -int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, - bool value); -int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, - u8 value); -int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, - u32 value); -int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, - u64 value); -int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, - const char *value); -int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u32 value_len); +int devlink_info_version_running_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type); + +void devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg); +void devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg); + +void devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name); +void devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg); + +void devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name); +void devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg); +void devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name); +void devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg); + +void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value); +void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value); +void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len); + +void devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, + bool value); +void devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, + u8 value); +void devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, + u32 value); +void devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, + u64 value); +void devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, 
const char *name, + const char *value); +void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, + const void *value, u32 value_len); struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); +devl_port_health_reporter_create(struct devlink_port *port, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, u64 graceful_period, void *priv); +struct devlink_health_reporter * +devl_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv); + +struct devlink_health_reporter * +devlink_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, void *priv); + void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter); +devl_health_reporter_destroy(struct devlink_health_reporter *reporter); void -devlink_port_health_reporter_destroy(struct devlink_health_reporter *reporter); +devlink_health_reporter_destroy(struct devlink_health_reporter *reporter); void * devlink_health_reporter_priv(struct devlink_health_reporter *reporter); @@ -1571,13 +1939,13 @@ devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter); +int devl_nested_devlink_set(struct devlink *devlink, + struct devlink *nested_devlink); bool devlink_is_reload_failed(const struct devlink *devlink); void devlink_remote_reload_actions_performed(struct devlink *devlink, enum devlink_reload_limit limit, u32 actions_performed); -void devlink_flash_update_begin_notify(struct devlink *devlink); -void devlink_flash_update_end_notify(struct devlink *devlink); void devlink_flash_update_status_notify(struct devlink *devlink, const char *status_msg, const char *component, @@ -1588,9 +1956,15 @@ void devlink_flash_update_timeout_notify(struct devlink *devlink, const char *component, unsigned long timeout); +int devl_traps_register(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count, void *priv); int devlink_traps_register(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count, void *priv); +void devl_traps_unregister(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count); void devlink_traps_unregister(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count); @@ -1598,40 +1972,62 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, void *trap_ctx, struct devlink_port *in_devlink_port, const struct flow_action_cookie *fa_cookie); void *devlink_trap_ctx_priv(void *trap_ctx); +int devl_trap_groups_register(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count); int devlink_trap_groups_register(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count); +void devl_trap_groups_unregister(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count); void devlink_trap_groups_unregister(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count); int -devlink_trap_policers_register(struct devlink *devlink, - const struct devlink_trap_policer *policers, - 
size_t policers_count); +devl_trap_policers_register(struct devlink *devlink, + const struct devlink_trap_policer *policers, + size_t policers_count); void -devlink_trap_policers_unregister(struct devlink *devlink, - const struct devlink_trap_policer *policers, - size_t policers_count); +devl_trap_policers_unregister(struct devlink *devlink, + const struct devlink_trap_policer *policers, + size_t policers_count); #if IS_ENABLED(CONFIG_NET_DEVLINK) -void devlink_compat_running_version(struct net_device *dev, +struct devlink *__must_check devlink_try_get(struct devlink *devlink); +void devlink_put(struct devlink *devlink); + +void devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len); -int devlink_compat_flash_update(struct net_device *dev, const char *file_name); +int devlink_compat_flash_update(struct devlink *devlink, const char *file_name); int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len); int devlink_compat_switch_id_get(struct net_device *dev, struct netdev_phys_item_id *ppid); +int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port); +size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port); +void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb); + #else +static inline struct devlink *devlink_try_get(struct devlink *devlink) +{ + return NULL; +} + +static inline void devlink_put(struct devlink *devlink) +{ +} + static inline void -devlink_compat_running_version(struct net_device *dev, char *buf, size_t len) +devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len) { } static inline int -devlink_compat_flash_update(struct net_device *dev, const char *file_name) +devlink_compat_flash_update(struct devlink *devlink, const char *file_name) { return -EOPNOTSUPP; } @@ -1650,6 +2046,17 @@ devlink_compat_switch_id_get(struct net_device *dev, return -EOPNOTSUPP; } +static inline int +devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) +{ + return 0; +} + +static inline size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/net/dn.h b/include/net/dn.h deleted file mode 100644 index 56ab0726c641..000000000000 --- a/include/net/dn.h +++ /dev/null @@ -1,231 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_H -#define _NET_DN_H - -#include <linux/dn.h> -#include <net/sock.h> -#include <net/flow.h> -#include <asm/byteorder.h> -#include <asm/unaligned.h> - -struct dn_scp /* Session Control Port */ -{ - unsigned char state; -#define DN_O 1 /* Open */ -#define DN_CR 2 /* Connect Receive */ -#define DN_DR 3 /* Disconnect Reject */ -#define DN_DRC 4 /* Discon. Rej. 
Complete*/ -#define DN_CC 5 /* Connect Confirm */ -#define DN_CI 6 /* Connect Initiate */ -#define DN_NR 7 /* No resources */ -#define DN_NC 8 /* No communication */ -#define DN_CD 9 /* Connect Delivery */ -#define DN_RJ 10 /* Rejected */ -#define DN_RUN 11 /* Running */ -#define DN_DI 12 /* Disconnect Initiate */ -#define DN_DIC 13 /* Disconnect Complete */ -#define DN_DN 14 /* Disconnect Notificat */ -#define DN_CL 15 /* Closed */ -#define DN_CN 16 /* Closed Notification */ - - __le16 addrloc; - __le16 addrrem; - __u16 numdat; - __u16 numoth; - __u16 numoth_rcv; - __u16 numdat_rcv; - __u16 ackxmt_dat; - __u16 ackxmt_oth; - __u16 ackrcv_dat; - __u16 ackrcv_oth; - __u8 flowrem_sw; - __u8 flowloc_sw; -#define DN_SEND 2 -#define DN_DONTSEND 1 -#define DN_NOCHANGE 0 - __u16 flowrem_dat; - __u16 flowrem_oth; - __u16 flowloc_dat; - __u16 flowloc_oth; - __u8 services_rem; - __u8 services_loc; - __u8 info_rem; - __u8 info_loc; - - __u16 segsize_rem; - __u16 segsize_loc; - - __u8 nonagle; - __u8 multi_ireq; - __u8 accept_mode; - unsigned long seg_total; /* Running total of current segment */ - - struct optdata_dn conndata_in; - struct optdata_dn conndata_out; - struct optdata_dn discdata_in; - struct optdata_dn discdata_out; - struct accessdata_dn accessdata; - - struct sockaddr_dn addr; /* Local address */ - struct sockaddr_dn peer; /* Remote address */ - - /* - * In this case the RTT estimation is not specified in the - * docs, nor is any back off algorithm. Here we follow well - * known tcp algorithms with a few small variations. - * - * snd_window: Max number of packets we send before we wait for - * an ack to come back. This will become part of a - * more complicated scheme when we support flow - * control. - * - * nsp_srtt: Round-Trip-Time (x8) in jiffies. This is a rolling - * average. - * nsp_rttvar: Round-Trip-Time-Varience (x4) in jiffies. This is the - * varience of the smoothed average (but calculated in - * a simpler way than for normal statistical varience - * calculations). - * - * nsp_rxtshift: Backoff counter. Value is zero normally, each time - * a packet is lost is increases by one until an ack - * is received. Its used to index an array of backoff - * multipliers. 
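
The fields above implement the classic Van Jacobson estimator in fixed point: nsp_srtt holds the smoothed RTT scaled by 8, nsp_rttvar holds the smoothed deviation scaled by 4, and nsp_rxtshift indexes the table of backoff multipliers. A minimal sketch of what such an update could look like; only the struct dn_scp fields come from this header, while the function name and the clamping are illustrative:

static void dn_nsp_rtt_sample(struct dn_scp *scp, long rtt)
{
        long srtt = (long)scp->nsp_srtt;
        long rttvar = (long)scp->nsp_rttvar;
        long delta = rtt - (srtt >> 3);         /* error vs. smoothed RTT (stored x8) */

        srtt += delta;                          /* srtt += error/8, in x8 units */
        if (delta < 0)
                delta = -delta;
        delta -= (rttvar >> 2);                 /* compare |error| with rttvar (stored x4) */
        rttvar += delta;                        /* rttvar += (|error| - rttvar)/4 */

        scp->nsp_srtt = srtt > 0 ? srtt : 1;    /* keep both estimates positive */
        scp->nsp_rttvar = rttvar > 0 ? rttvar : 1;
}
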
- */ -#define NSP_MIN_WINDOW 1 -#define NSP_MAX_WINDOW (0x07fe) - unsigned long max_window; - unsigned long snd_window; -#define NSP_INITIAL_SRTT (HZ) - unsigned long nsp_srtt; -#define NSP_INITIAL_RTTVAR (HZ*3) - unsigned long nsp_rttvar; -#define NSP_MAXRXTSHIFT 12 - unsigned long nsp_rxtshift; - - /* - * Output queues, one for data, one for otherdata/linkservice - */ - struct sk_buff_head data_xmit_queue; - struct sk_buff_head other_xmit_queue; - - /* - * Input queue for other data - */ - struct sk_buff_head other_receive_queue; - int other_report; - - /* - * Stuff to do with the slow timer - */ - unsigned long stamp; /* time of last transmit */ - unsigned long persist; - int (*persist_fxn)(struct sock *sk); - unsigned long keepalive; - void (*keepalive_fxn)(struct sock *sk); - -}; - -static inline struct dn_scp *DN_SK(struct sock *sk) -{ - return (struct dn_scp *)(sk + 1); -} - -/* - * src,dst : Source and Destination DECnet addresses - * hops : Number of hops through the network - * dst_port, src_port : NSP port numbers - * services, info : Useful data extracted from conninit messages - * rt_flags : Routing flags byte - * nsp_flags : NSP layer flags byte - * segsize : Size of segment - * segnum : Number, for data, otherdata and linkservice - * xmit_count : Number of times we've transmitted this skb - * stamp : Time stamp of most recent transmission, used in RTT calculations - * iif: Input interface number - * - * As a general policy, this structure keeps all addresses in network - * byte order, and all else in host byte order. Thus dst, src, dst_port - * and src_port are in network order. All else is in host order. - * - */ -#define DN_SKB_CB(skb) ((struct dn_skb_cb *)(skb)->cb) -struct dn_skb_cb { - __le16 dst; - __le16 src; - __u16 hops; - __le16 dst_port; - __le16 src_port; - __u8 services; - __u8 info; - __u8 rt_flags; - __u8 nsp_flags; - __u16 segsize; - __u16 segnum; - __u16 xmit_count; - unsigned long stamp; - int iif; -}; - -static inline __le16 dn_eth2dn(unsigned char *ethaddr) -{ - return get_unaligned((__le16 *)(ethaddr + 4)); -} - -static inline __le16 dn_saddr2dn(struct sockaddr_dn *saddr) -{ - return *(__le16 *)saddr->sdn_nodeaddr; -} - -static inline void dn_dn2eth(unsigned char *ethaddr, __le16 addr) -{ - __u16 a = le16_to_cpu(addr); - ethaddr[0] = 0xAA; - ethaddr[1] = 0x00; - ethaddr[2] = 0x04; - ethaddr[3] = 0x00; - ethaddr[4] = (__u8)(a & 0xff); - ethaddr[5] = (__u8)(a >> 8); -} - -static inline void dn_sk_ports_copy(struct flowidn *fld, struct dn_scp *scp) -{ - fld->fld_sport = scp->addrloc; - fld->fld_dport = scp->addrrem; -} - -unsigned int dn_mss_from_pmtu(struct net_device *dev, int mtu); -void dn_register_sysctl(void); -void dn_unregister_sysctl(void); - -#define DN_MENUVER_ACC 0x01 -#define DN_MENUVER_USR 0x02 -#define DN_MENUVER_PRX 0x04 -#define DN_MENUVER_UIC 0x08 - -struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr); -struct sock *dn_find_by_skb(struct sk_buff *skb); -#define DN_ASCBUF_LEN 9 -char *dn_addr2asc(__u16, char *); -int dn_destroy_timer(struct sock *sk); - -int dn_sockaddr2username(struct sockaddr_dn *addr, unsigned char *buf, - unsigned char type); -int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *addr, - unsigned char *type); - -void dn_start_slow_timer(struct sock *sk); -void dn_stop_slow_timer(struct sock *sk); - -extern __le16 decnet_address; -extern int decnet_debug_level; -extern int decnet_time_wait; -extern int decnet_dn_count; -extern int decnet_di_count; -extern int decnet_dr_count; -extern int 
decnet_no_fc_max_cwnd; - -extern long sysctl_decnet_mem[3]; -extern int sysctl_decnet_wmem[3]; -extern int sysctl_decnet_rmem[3]; - -#endif /* _NET_DN_H */ diff --git a/include/net/dn_dev.h b/include/net/dn_dev.h deleted file mode 100644 index 595b4f6c1eb1..000000000000 --- a/include/net/dn_dev.h +++ /dev/null @@ -1,199 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_DEV_H -#define _NET_DN_DEV_H - - -struct dn_dev; - -struct dn_ifaddr { - struct dn_ifaddr __rcu *ifa_next; - struct dn_dev *ifa_dev; - __le16 ifa_local; - __le16 ifa_address; - __u32 ifa_flags; - __u8 ifa_scope; - char ifa_label[IFNAMSIZ]; - struct rcu_head rcu; -}; - -#define DN_DEV_S_RU 0 /* Run - working normally */ -#define DN_DEV_S_CR 1 /* Circuit Rejected */ -#define DN_DEV_S_DS 2 /* Data Link Start */ -#define DN_DEV_S_RI 3 /* Routing Layer Initialize */ -#define DN_DEV_S_RV 4 /* Routing Layer Verify */ -#define DN_DEV_S_RC 5 /* Routing Layer Complete */ -#define DN_DEV_S_OF 6 /* Off */ -#define DN_DEV_S_HA 7 /* Halt */ - - -/* - * The dn_dev_parms structure contains the set of parameters - * for each device (hence inclusion in the dn_dev structure) - * and an array is used to store the default types of supported - * device (in dn_dev.c). - * - * The type field matches the ARPHRD_ constants and is used in - * searching the list for supported devices when new devices - * come up. - * - * The mode field is used to find out if a device is broadcast, - * multipoint, or pointopoint. Please note that DECnet thinks - * different ways about devices to the rest of the kernel - * so the normal IFF_xxx flags are invalid here. For devices - * which can be any combination of the previously mentioned - * attributes, you can set this on a per device basis by - * installing an up() routine. - * - * The device state field, defines the initial state in which the - * device will come up. In the dn_dev structure, it is the actual - * state. - * - * Things have changed here. I've killed timer1 since it's a user space - * issue for a user space routing deamon to sort out. The kernel does - * not need to be bothered with it. - * - * Timers: - * t2 - Rate limit timer, min time between routing and hello messages - * t3 - Hello timer, send hello messages when it expires - * - * Callbacks: - * up() - Called to initialize device, return value can veto use of - * device with DECnet. 
- * down() - Called to turn device off when it goes down - * timer3() - Called once for each ifaddr when timer 3 goes off - * - * sysctl - Hook for sysctl things - * - */ -struct dn_dev_parms { - int type; /* ARPHRD_xxx */ - int mode; /* Broadcast, Unicast, Mulitpoint */ -#define DN_DEV_BCAST 1 -#define DN_DEV_UCAST 2 -#define DN_DEV_MPOINT 4 - int state; /* Initial state */ - int forwarding; /* 0=EndNode, 1=L1Router, 2=L2Router */ - unsigned long t2; /* Default value of t2 */ - unsigned long t3; /* Default value of t3 */ - int priority; /* Priority to be a router */ - char *name; /* Name for sysctl */ - int (*up)(struct net_device *); - void (*down)(struct net_device *); - void (*timer3)(struct net_device *, struct dn_ifaddr *ifa); - void *sysctl; -}; - - -struct dn_dev { - struct dn_ifaddr __rcu *ifa_list; - struct net_device *dev; - struct dn_dev_parms parms; - char use_long; - struct timer_list timer; - unsigned long t3; - struct neigh_parms *neigh_parms; - __u8 addr[ETH_ALEN]; - struct neighbour *router; /* Default router on circuit */ - struct neighbour *peer; /* Peer on pointopoint links */ - unsigned long uptime; /* Time device went up in jiffies */ -}; - -struct dn_short_packet { - __u8 msgflg; - __le16 dstnode; - __le16 srcnode; - __u8 forward; -} __packed; - -struct dn_long_packet { - __u8 msgflg; - __u8 d_area; - __u8 d_subarea; - __u8 d_id[6]; - __u8 s_area; - __u8 s_subarea; - __u8 s_id[6]; - __u8 nl2; - __u8 visit_ct; - __u8 s_class; - __u8 pt; -} __packed; - -/*------------------------- DRP - Routing messages ---------------------*/ - -struct endnode_hello_message { - __u8 msgflg; - __u8 tiver[3]; - __u8 id[6]; - __u8 iinfo; - __le16 blksize; - __u8 area; - __u8 seed[8]; - __u8 neighbor[6]; - __le16 timer; - __u8 mpd; - __u8 datalen; - __u8 data[2]; -} __packed; - -struct rtnode_hello_message { - __u8 msgflg; - __u8 tiver[3]; - __u8 id[6]; - __u8 iinfo; - __le16 blksize; - __u8 priority; - __u8 area; - __le16 timer; - __u8 mpd; -} __packed; - - -void dn_dev_init(void); -void dn_dev_cleanup(void); - -int dn_dev_ioctl(unsigned int cmd, void __user *arg); - -void dn_dev_devices_off(void); -void dn_dev_devices_on(void); - -void dn_dev_init_pkt(struct sk_buff *skb); -void dn_dev_veri_pkt(struct sk_buff *skb); -void dn_dev_hello(struct sk_buff *skb); - -void dn_dev_up(struct net_device *); -void dn_dev_down(struct net_device *); - -int dn_dev_set_default(struct net_device *dev, int force); -struct net_device *dn_dev_get_default(void); -int dn_dev_bind_default(__le16 *addr); - -int register_dnaddr_notifier(struct notifier_block *nb); -int unregister_dnaddr_notifier(struct notifier_block *nb); - -static inline int dn_dev_islocal(struct net_device *dev, __le16 addr) -{ - struct dn_dev *dn_db; - struct dn_ifaddr *ifa; - int res = 0; - - rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); - if (dn_db == NULL) { - printk(KERN_DEBUG "dn_dev_islocal: Called for non DECnet device\n"); - goto out; - } - - for (ifa = rcu_dereference(dn_db->ifa_list); - ifa != NULL; - ifa = rcu_dereference(ifa->ifa_next)) - if ((addr ^ ifa->ifa_local) == 0) { - res = 1; - break; - } -out: - rcu_read_unlock(); - return res; -} - -#endif /* _NET_DN_DEV_H */ diff --git a/include/net/dn_fib.h b/include/net/dn_fib.h deleted file mode 100644 index ccc6e9df178b..000000000000 --- a/include/net/dn_fib.h +++ /dev/null @@ -1,167 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_FIB_H -#define _NET_DN_FIB_H - -#include <linux/netlink.h> -#include <linux/refcount.h> - -extern const struct nla_policy 
rtm_dn_policy[]; - -struct dn_fib_res { - struct fib_rule *r; - struct dn_fib_info *fi; - unsigned char prefixlen; - unsigned char nh_sel; - unsigned char type; - unsigned char scope; -}; - -struct dn_fib_nh { - struct net_device *nh_dev; - unsigned int nh_flags; - unsigned char nh_scope; - int nh_weight; - int nh_power; - int nh_oif; - __le16 nh_gw; -}; - -struct dn_fib_info { - struct dn_fib_info *fib_next; - struct dn_fib_info *fib_prev; - int fib_treeref; - refcount_t fib_clntref; - int fib_dead; - unsigned int fib_flags; - int fib_protocol; - __le16 fib_prefsrc; - __u32 fib_priority; - __u32 fib_metrics[RTAX_MAX]; - int fib_nhs; - int fib_power; - struct dn_fib_nh fib_nh[0]; -#define dn_fib_dev fib_nh[0].nh_dev -}; - - -#define DN_FIB_RES_RESET(res) ((res).nh_sel = 0) -#define DN_FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel]) - -#define DN_FIB_RES_PREFSRC(res) ((res).fi->fib_prefsrc ? : __dn_fib_res_prefsrc(&res)) -#define DN_FIB_RES_GW(res) (DN_FIB_RES_NH(res).nh_gw) -#define DN_FIB_RES_DEV(res) (DN_FIB_RES_NH(res).nh_dev) -#define DN_FIB_RES_OIF(res) (DN_FIB_RES_NH(res).nh_oif) - -typedef struct { - __le16 datum; -} dn_fib_key_t; - -typedef struct { - __le16 datum; -} dn_fib_hash_t; - -typedef struct { - __u16 datum; -} dn_fib_idx_t; - -struct dn_fib_node { - struct dn_fib_node *fn_next; - struct dn_fib_info *fn_info; -#define DN_FIB_INFO(f) ((f)->fn_info) - dn_fib_key_t fn_key; - u8 fn_type; - u8 fn_scope; - u8 fn_state; -}; - - -struct dn_fib_table { - struct hlist_node hlist; - u32 n; - - int (*insert)(struct dn_fib_table *t, struct rtmsg *r, - struct nlattr *attrs[], struct nlmsghdr *n, - struct netlink_skb_parms *req); - int (*delete)(struct dn_fib_table *t, struct rtmsg *r, - struct nlattr *attrs[], struct nlmsghdr *n, - struct netlink_skb_parms *req); - int (*lookup)(struct dn_fib_table *t, const struct flowidn *fld, - struct dn_fib_res *res); - int (*flush)(struct dn_fib_table *t); - int (*dump)(struct dn_fib_table *t, struct sk_buff *skb, struct netlink_callback *cb); - - unsigned char data[]; -}; - -#ifdef CONFIG_DECNET_ROUTER -/* - * dn_fib.c - */ -void dn_fib_init(void); -void dn_fib_cleanup(void); - -int dn_fib_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); -struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, - struct nlattr *attrs[], - const struct nlmsghdr *nlh, int *errp); -int dn_fib_semantic_match(int type, struct dn_fib_info *fi, - const struct flowidn *fld, struct dn_fib_res *res); -void dn_fib_release_info(struct dn_fib_info *fi); -void dn_fib_flush(void); -void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res); - -/* - * dn_tables.c - */ -struct dn_fib_table *dn_fib_get_table(u32 n, int creat); -struct dn_fib_table *dn_fib_empty_table(void); -void dn_fib_table_init(void); -void dn_fib_table_cleanup(void); - -/* - * dn_rules.c - */ -void dn_fib_rules_init(void); -void dn_fib_rules_cleanup(void); -unsigned int dnet_addr_type(__le16 addr); -int dn_fib_lookup(struct flowidn *fld, struct dn_fib_res *res); - -int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb); - -void dn_fib_free_info(struct dn_fib_info *fi); - -static inline void dn_fib_info_put(struct dn_fib_info *fi) -{ - if (refcount_dec_and_test(&fi->fib_clntref)) - dn_fib_free_info(fi); -} - -static inline void dn_fib_res_put(struct dn_fib_res *res) -{ - if (res->fi) - dn_fib_info_put(res->fi); - if (res->r) - fib_rule_put(res->r); -} - -#else /* Endnode */ - -#define dn_fib_init() do { } while(0) -#define dn_fib_cleanup() do { } while(0) - 
-#define dn_fib_lookup(fl, res) (-ESRCH) -#define dn_fib_info_put(fi) do { } while(0) -#define dn_fib_select_multipath(fl, res) do { } while(0) -#define dn_fib_rules_policy(saddr,res,flags) (0) -#define dn_fib_res_put(res) do { } while(0) - -#endif /* CONFIG_DECNET_ROUTER */ - -static inline __le16 dnet_make_mask(int n) -{ - if (n) - return cpu_to_le16(~((1 << (16 - n)) - 1)); - return cpu_to_le16(0); -} - -#endif /* _NET_DN_FIB_H */ diff --git a/include/net/dn_neigh.h b/include/net/dn_neigh.h deleted file mode 100644 index 2e3e7793973a..000000000000 --- a/include/net/dn_neigh.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_NEIGH_H -#define _NET_DN_NEIGH_H - -/* - * The position of the first two fields of - * this structure are critical - SJW - */ -struct dn_neigh { - struct neighbour n; - __le16 addr; - unsigned long flags; -#define DN_NDFLAG_R1 0x0001 /* Router L1 */ -#define DN_NDFLAG_R2 0x0002 /* Router L2 */ -#define DN_NDFLAG_P3 0x0004 /* Phase III Node */ - unsigned long blksize; - __u8 priority; -}; - -void dn_neigh_init(void); -void dn_neigh_cleanup(void); -int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb); -int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb); -void dn_neigh_pointopoint_hello(struct sk_buff *skb); -int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n); -int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb); - -extern struct neigh_table dn_neigh_table; - -#endif /* _NET_DN_NEIGH_H */ diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h deleted file mode 100644 index f83932b864a9..000000000000 --- a/include/net/dn_nsp.h +++ /dev/null @@ -1,195 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _NET_DN_NSP_H -#define _NET_DN_NSP_H -/****************************************************************************** - (c) 1995-1998 E.M. 
Serrat emserrat@geocities.com - -*******************************************************************************/ -/* dn_nsp.c functions prototyping */ - -void dn_nsp_send_data_ack(struct sock *sk); -void dn_nsp_send_oth_ack(struct sock *sk); -void dn_send_conn_ack(struct sock *sk); -void dn_send_conn_conf(struct sock *sk, gfp_t gfp); -void dn_nsp_send_disc(struct sock *sk, unsigned char type, - unsigned short reason, gfp_t gfp); -void dn_nsp_return_disc(struct sk_buff *skb, unsigned char type, - unsigned short reason); -void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval); -void dn_nsp_send_conninit(struct sock *sk, unsigned char flags); - -void dn_nsp_output(struct sock *sk); -int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *q, unsigned short acknum); -void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, gfp_t gfp, - int oob); -unsigned long dn_nsp_persist(struct sock *sk); -int dn_nsp_xmit_timeout(struct sock *sk); - -int dn_nsp_rx(struct sk_buff *); -int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb); - -struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri); -struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int noblock, - long timeo, int *err); - -#define NSP_REASON_OK 0 /* No error */ -#define NSP_REASON_NR 1 /* No resources */ -#define NSP_REASON_UN 2 /* Unrecognised node name */ -#define NSP_REASON_SD 3 /* Node shutting down */ -#define NSP_REASON_ID 4 /* Invalid destination end user */ -#define NSP_REASON_ER 5 /* End user lacks resources */ -#define NSP_REASON_OB 6 /* Object too busy */ -#define NSP_REASON_US 7 /* Unspecified error */ -#define NSP_REASON_TP 8 /* Third-Party abort */ -#define NSP_REASON_EA 9 /* End user has aborted the link */ -#define NSP_REASON_IF 10 /* Invalid node name format */ -#define NSP_REASON_LS 11 /* Local node shutdown */ -#define NSP_REASON_LL 32 /* Node lacks logical-link resources */ -#define NSP_REASON_LE 33 /* End user lacks logical-link resources */ -#define NSP_REASON_UR 34 /* Unacceptable RQSTRID or PASSWORD field */ -#define NSP_REASON_UA 36 /* Unacceptable ACCOUNT field */ -#define NSP_REASON_TM 38 /* End user timed out logical link */ -#define NSP_REASON_NU 39 /* Node unreachable */ -#define NSP_REASON_NL 41 /* No-link message */ -#define NSP_REASON_DC 42 /* Disconnect confirm */ -#define NSP_REASON_IO 43 /* Image data field overflow */ - -#define NSP_DISCINIT 0x38 -#define NSP_DISCCONF 0x48 - -/*------------------------- NSP - messages ------------------------------*/ -/* Data Messages */ -/*---------------*/ - -/* Data Messages (data segment/interrupt/link service) */ - -struct nsp_data_seg_msg { - __u8 msgflg; - __le16 dstaddr; - __le16 srcaddr; -} __packed; - -struct nsp_data_opt_msg { - __le16 acknum; - __le16 segnum; - __le16 lsflgs; -} __packed; - -struct nsp_data_opt_msg1 { - __le16 acknum; - __le16 segnum; -} __packed; - - -/* Acknowledgment Message (data/other data) */ -struct nsp_data_ack_msg { - __u8 msgflg; - __le16 dstaddr; - __le16 srcaddr; - __le16 acknum; -} __packed; - -/* Connect Acknowledgment Message */ -struct nsp_conn_ack_msg { - __u8 msgflg; - __le16 dstaddr; -} __packed; - - -/* Connect Initiate/Retransmit Initiate/Connect Confirm */ -struct nsp_conn_init_msg { - __u8 msgflg; -#define NSP_CI 0x18 /* Connect Initiate */ -#define NSP_RCI 0x68 /* Retrans. Conn Init */ - __le16 dstaddr; - __le16 srcaddr; - __u8 services; -#define NSP_FC_NONE 0x00 /* Flow Control None */ -#define NSP_FC_SRC 0x04 /* Seg Req. 
Count */ -#define NSP_FC_SCMC 0x08 /* Sess. Control Mess */ -#define NSP_FC_MASK 0x0c /* FC type mask */ - __u8 info; - __le16 segsize; -} __packed; - -/* Disconnect Initiate/Disconnect Confirm */ -struct nsp_disconn_init_msg { - __u8 msgflg; - __le16 dstaddr; - __le16 srcaddr; - __le16 reason; -} __packed; - - - -struct srcobj_fmt { - __u8 format; - __u8 task; - __le16 grpcode; - __le16 usrcode; - __u8 dlen; -} __packed; - -/* - * A collection of functions for manipulating the sequence - * numbers used in NSP. Similar in operation to the functions - * of the same name in TCP. - */ -static __inline__ int dn_before(__u16 seq1, __u16 seq2) -{ - seq1 &= 0x0fff; - seq2 &= 0x0fff; - - return (int)((seq1 - seq2) & 0x0fff) > 2048; -} - - -static __inline__ int dn_after(__u16 seq1, __u16 seq2) -{ - seq1 &= 0x0fff; - seq2 &= 0x0fff; - - return (int)((seq2 - seq1) & 0x0fff) > 2048; -} - -static __inline__ int dn_equal(__u16 seq1, __u16 seq2) -{ - return ((seq1 ^ seq2) & 0x0fff) == 0; -} - -static __inline__ int dn_before_or_equal(__u16 seq1, __u16 seq2) -{ - return (dn_before(seq1, seq2) || dn_equal(seq1, seq2)); -} - -static __inline__ void seq_add(__u16 *seq, __u16 off) -{ - (*seq) += off; - (*seq) &= 0x0fff; -} - -static __inline__ int seq_next(__u16 seq1, __u16 seq2) -{ - return dn_equal(seq1 + 1, seq2); -} - -/* - * Can we delay the ack ? - */ -static __inline__ int sendack(__u16 seq) -{ - return (int)((seq & 0x1000) ? 0 : 1); -} - -/* - * Is socket congested ? - */ -static __inline__ int dn_congested(struct sock *sk) -{ - return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); -} - -#define DN_MAX_NSP_DATA_HEADER (11) - -#endif /* _NET_DN_NSP_H */ diff --git a/include/net/dn_route.h b/include/net/dn_route.h deleted file mode 100644 index 6f1e94ac0bdf..000000000000 --- a/include/net/dn_route.h +++ /dev/null @@ -1,115 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _NET_DN_ROUTE_H -#define _NET_DN_ROUTE_H - -/****************************************************************************** - (c) 1995-1998 E.M. 
Serrat emserrat@geocities.com - -*******************************************************************************/ - -struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri); -int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *, - struct sock *sk, int flags); -int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb); -void dn_rt_cache_flush(int delay); -int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); - -/* Masks for flags field */ -#define DN_RT_F_PID 0x07 /* Mask for packet type */ -#define DN_RT_F_PF 0x80 /* Padding Follows */ -#define DN_RT_F_VER 0x40 /* Version =0 discard packet if ==1 */ -#define DN_RT_F_IE 0x20 /* Intra Ethernet, Reserved in short pkt */ -#define DN_RT_F_RTS 0x10 /* Packet is being returned to sender */ -#define DN_RT_F_RQR 0x08 /* Return packet to sender upon non-delivery */ - -/* Mask for types of routing packets */ -#define DN_RT_PKT_MSK 0x06 -/* Types of routing packets */ -#define DN_RT_PKT_SHORT 0x02 /* Short routing packet */ -#define DN_RT_PKT_LONG 0x06 /* Long routing packet */ - -/* Mask for control/routing selection */ -#define DN_RT_PKT_CNTL 0x01 /* Set to 1 if a control packet */ -/* Types of control packets */ -#define DN_RT_CNTL_MSK 0x0f /* Mask for control packets */ -#define DN_RT_PKT_INIT 0x01 /* Initialisation packet */ -#define DN_RT_PKT_VERI 0x03 /* Verification Message */ -#define DN_RT_PKT_HELO 0x05 /* Hello and Test Message */ -#define DN_RT_PKT_L1RT 0x07 /* Level 1 Routing Message */ -#define DN_RT_PKT_L2RT 0x09 /* Level 2 Routing Message */ -#define DN_RT_PKT_ERTH 0x0b /* Ethernet Router Hello */ -#define DN_RT_PKT_EEDH 0x0d /* Ethernet EndNode Hello */ - -/* Values for info field in hello message */ -#define DN_RT_INFO_TYPE 0x03 /* Type mask */ -#define DN_RT_INFO_L1RT 0x02 /* L1 Router */ -#define DN_RT_INFO_L2RT 0x01 /* L2 Router */ -#define DN_RT_INFO_ENDN 0x03 /* EndNode */ -#define DN_RT_INFO_VERI 0x04 /* Verification Reqd. */ -#define DN_RT_INFO_RJCT 0x08 /* Reject Flag, Reserved */ -#define DN_RT_INFO_VFLD 0x10 /* Verification Failed, Reserved */ -#define DN_RT_INFO_NOML 0x20 /* No Multicast traffic accepted */ -#define DN_RT_INFO_BLKR 0x40 /* Blocking Requested */ - -/* - * The fl structure is what we used to look up the route. - * The rt_saddr & rt_daddr entries are the same as key.saddr & key.daddr - * except for local input routes, where the rt_saddr = fl.fld_dst and - * rt_daddr = fl.fld_src to allow the route to be used for returning - * packets to the originating host. 
- */ -struct dn_route { - struct dst_entry dst; - struct dn_route __rcu *dn_next; - - struct neighbour *n; - - struct flowidn fld; - - __le16 rt_saddr; - __le16 rt_daddr; - __le16 rt_gateway; - __le16 rt_local_src; /* Source used for forwarding packets */ - __le16 rt_src_map; - __le16 rt_dst_map; - - unsigned int rt_flags; - unsigned int rt_type; -}; - -static inline bool dn_is_input_route(struct dn_route *rt) -{ - return rt->fld.flowidn_iif != 0; -} - -static inline bool dn_is_output_route(struct dn_route *rt) -{ - return rt->fld.flowidn_iif == 0; -} - -void dn_route_init(void); -void dn_route_cleanup(void); - -#include <net/sock.h> -#include <linux/if_arp.h> - -static inline void dn_rt_send(struct sk_buff *skb) -{ - dev_queue_xmit(skb); -} - -static inline void dn_rt_finish_output(struct sk_buff *skb, char *dst, char *src) -{ - struct net_device *dev = skb->dev; - - if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) - dst = NULL; - - if (dev_hard_header(skb, dev, ETH_P_DNA_RT, dst, src, skb->len) >= 0) - dn_rt_send(skb); - else - kfree_skb(skb); -} - -#endif /* _NET_DN_ROUTE_H */ diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h new file mode 100644 index 000000000000..bcf9d7467e1a --- /dev/null +++ b/include/net/dropreason-core.h @@ -0,0 +1,604 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_DROPREASON_CORE_H +#define _LINUX_DROPREASON_CORE_H + +#define DEFINE_DROP_REASON(FN, FNe) \ + FN(NOT_SPECIFIED) \ + FN(NO_SOCKET) \ + FN(SOCKET_CLOSE) \ + FN(SOCKET_FILTER) \ + FN(SOCKET_RCVBUFF) \ + FN(UNIX_DISCONNECT) \ + FN(UNIX_SKIP_OOB) \ + FN(PKT_TOO_SMALL) \ + FN(TCP_CSUM) \ + FN(UDP_CSUM) \ + FN(NETFILTER_DROP) \ + FN(OTHERHOST) \ + FN(IP_CSUM) \ + FN(IP_INHDR) \ + FN(IP_RPFILTER) \ + FN(UNICAST_IN_L2_MULTICAST) \ + FN(XFRM_POLICY) \ + FN(IP_NOPROTO) \ + FN(PROTO_MEM) \ + FN(TCP_AUTH_HDR) \ + FN(TCP_MD5NOTFOUND) \ + FN(TCP_MD5UNEXPECTED) \ + FN(TCP_MD5FAILURE) \ + FN(TCP_AONOTFOUND) \ + FN(TCP_AOUNEXPECTED) \ + FN(TCP_AOKEYNOTFOUND) \ + FN(TCP_AOFAILURE) \ + FN(SOCKET_BACKLOG) \ + FN(TCP_FLAGS) \ + FN(TCP_ABORT_ON_DATA) \ + FN(TCP_ZEROWINDOW) \ + FN(TCP_OLD_DATA) \ + FN(TCP_OVERWINDOW) \ + FN(TCP_OFOMERGE) \ + FN(TCP_RFC7323_PAWS) \ + FN(TCP_RFC7323_PAWS_ACK) \ + FN(TCP_RFC7323_TW_PAWS) \ + FN(TCP_RFC7323_TSECR) \ + FN(TCP_LISTEN_OVERFLOW) \ + FN(TCP_OLD_SEQUENCE) \ + FN(TCP_INVALID_SEQUENCE) \ + FN(TCP_INVALID_ACK_SEQUENCE) \ + FN(TCP_RESET) \ + FN(TCP_INVALID_SYN) \ + FN(TCP_CLOSE) \ + FN(TCP_FASTOPEN) \ + FN(TCP_OLD_ACK) \ + FN(TCP_TOO_OLD_ACK) \ + FN(TCP_ACK_UNSENT_DATA) \ + FN(TCP_OFO_QUEUE_PRUNE) \ + FN(TCP_OFO_DROP) \ + FN(IP_OUTNOROUTES) \ + FN(BPF_CGROUP_EGRESS) \ + FN(IPV6DISABLED) \ + FN(NEIGH_CREATEFAIL) \ + FN(NEIGH_FAILED) \ + FN(NEIGH_QUEUEFULL) \ + FN(NEIGH_DEAD) \ + FN(NEIGH_HH_FILLFAIL) \ + FN(TC_EGRESS) \ + FN(SECURITY_HOOK) \ + FN(QDISC_DROP) \ + FN(QDISC_OVERLIMIT) \ + FN(QDISC_CONGESTED) \ + FN(CAKE_FLOOD) \ + FN(FQ_BAND_LIMIT) \ + FN(FQ_HORIZON_LIMIT) \ + FN(FQ_FLOW_LIMIT) \ + FN(CPU_BACKLOG) \ + FN(XDP) \ + FN(TC_INGRESS) \ + FN(UNHANDLED_PROTO) \ + FN(SKB_CSUM) \ + FN(SKB_GSO_SEG) \ + FN(SKB_UCOPY_FAULT) \ + FN(DEV_HDR) \ + FN(DEV_READY) \ + FN(FULL_RING) \ + FN(NOMEM) \ + FN(HDR_TRUNC) \ + FN(TAP_FILTER) \ + FN(TAP_TXFILTER) \ + FN(ICMP_CSUM) \ + FN(INVALID_PROTO) \ + FN(IP_INADDRERRORS) \ + FN(IP_INNOROUTES) \ + FN(IP_LOCAL_SOURCE) \ + FN(IP_INVALID_SOURCE) \ + FN(IP_LOCALNET) \ + FN(IP_INVALID_DEST) \ + FN(PKT_TOO_BIG) \ + FN(DUP_FRAG) \ + FN(FRAG_REASM_TIMEOUT) \ + FN(FRAG_TOO_FAR) \ + FN(TCP_MINTTL) 
\
+	FN(IPV6_BAD_EXTHDR)		\
+	FN(IPV6_NDISC_FRAG)		\
+	FN(IPV6_NDISC_HOP_LIMIT)	\
+	FN(IPV6_NDISC_BAD_CODE)		\
+	FN(IPV6_NDISC_BAD_OPTIONS)	\
+	FN(IPV6_NDISC_NS_OTHERHOST)	\
+	FN(QUEUE_PURGE)			\
+	FN(TC_COOKIE_ERROR)		\
+	FN(PACKET_SOCK_ERROR)		\
+	FN(TC_CHAIN_NOTFOUND)		\
+	FN(TC_RECLASSIFY_LOOP)		\
+	FN(VXLAN_INVALID_HDR)		\
+	FN(VXLAN_VNI_NOT_FOUND)		\
+	FN(MAC_INVALID_SOURCE)		\
+	FN(VXLAN_ENTRY_EXISTS)		\
+	FN(NO_TX_TARGET)		\
+	FN(IP_TUNNEL_ECN)		\
+	FN(TUNNEL_TXINFO)		\
+	FN(LOCAL_MAC)			\
+	FN(ARP_PVLAN_DISABLE)		\
+	FN(MAC_IEEE_MAC_CONTROL)	\
+	FN(BRIDGE_INGRESS_STP_STATE)	\
+	FNe(MAX)
+
+/**
+ * enum skb_drop_reason - the reasons for skb drops
+ *
+ * The reason an skb is dropped, as passed to kfree_skb_reason().
+ */
+enum skb_drop_reason {
+	/**
+	 * @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case)
+	 */
+	SKB_NOT_DROPPED_YET = 0,
+	/** @SKB_CONSUMED: packet has been consumed */
+	SKB_CONSUMED,
+	/** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */
+	SKB_DROP_REASON_NOT_SPECIFIED,
+	/**
+	 * @SKB_DROP_REASON_NO_SOCKET: no valid socket that can be used.
+	 * The reason could be one of three cases:
+	 * 1) no established/listening socket found during the lookup process
+	 * 2) no valid request socket during the 3WHS process
+	 * 3) no valid child socket during the 3WHS process
+	 */
+	SKB_DROP_REASON_NO_SOCKET,
+	/** @SKB_DROP_REASON_SOCKET_CLOSE: socket is close()d */
+	SKB_DROP_REASON_SOCKET_CLOSE,
+	/** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */
+	SKB_DROP_REASON_SOCKET_FILTER,
+	/** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buffer is full */
+	SKB_DROP_REASON_SOCKET_RCVBUFF,
+	/**
+	 * @SKB_DROP_REASON_UNIX_DISCONNECT: recv queue is purged when a SOCK_DGRAM
+	 * or SOCK_SEQPACKET socket re-connect()s to another socket or notices
+	 * during send() that the peer has been close()d.
+	 */
+	SKB_DROP_REASON_UNIX_DISCONNECT,
+	/**
+	 * @SKB_DROP_REASON_UNIX_SKIP_OOB: Out-Of-Band data was skipped by a
+	 * recv() call without MSG_OOB, so it is dropped.
+	 */
+	SKB_DROP_REASON_UNIX_SKIP_OOB,
+	/** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */
+	SKB_DROP_REASON_PKT_TOO_SMALL,
+	/** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */
+	SKB_DROP_REASON_TCP_CSUM,
+	/** @SKB_DROP_REASON_UDP_CSUM: UDP checksum error */
+	SKB_DROP_REASON_UDP_CSUM,
+	/** @SKB_DROP_REASON_NETFILTER_DROP: dropped by netfilter */
+	SKB_DROP_REASON_NETFILTER_DROP,
+	/**
+	 * @SKB_DROP_REASON_OTHERHOST: the packet doesn't belong to the current
+	 * host (the interface is in promiscuous mode)
+	 */
+	SKB_DROP_REASON_OTHERHOST,
+	/** @SKB_DROP_REASON_IP_CSUM: IP checksum error */
+	SKB_DROP_REASON_IP_CSUM,
+	/**
+	 * @SKB_DROP_REASON_IP_INHDR: there is something wrong with the IP header
+	 * (see IPSTATS_MIB_INHDRERRORS)
+	 */
+	SKB_DROP_REASON_IP_INHDR,
+	/**
+	 * @SKB_DROP_REASON_IP_RPFILTER: IP rpfilter validation failed; see the
+	 * documentation for rp_filter in ip-sysctl.rst for more information
+	 */
+	SKB_DROP_REASON_IP_RPFILTER,
+	/**
+	 * @SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST: the L2 destination address
+	 * is multicast, but the L3 address is unicast.
+	 */
+	SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST,
+	/** @SKB_DROP_REASON_XFRM_POLICY: xfrm policy check failed */
+	SKB_DROP_REASON_XFRM_POLICY,
+	/** @SKB_DROP_REASON_IP_NOPROTO: no support for the IP protocol */
+	SKB_DROP_REASON_IP_NOPROTO,
+	/**
+	 * @SKB_DROP_REASON_PROTO_MEM: protocol memory limitation, such as a
+	 * UDP packet dropped because udp_memory_allocated was exceeded.
+	 */
+	SKB_DROP_REASON_PROTO_MEM,
+	/**
+	 * @SKB_DROP_REASON_TCP_AUTH_HDR: TCP-MD5 or TCP-AO hashes appear
+	 * twice or are set incorrectly.
+	 */
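
The FN/FNe parameters make DEFINE_DROP_REASON an X-macro: the same list can be expanded several times with different definitions of FN, with FNe marking the final entry (no trailing comma or backslash). The enum here is one expansion; a matching string table can be generated from the identical list. A sketch of that second expansion, assuming only the macro above (the array name is illustrative):

#define FN(reason)  [SKB_DROP_REASON_##reason] = #reason,
#define FNe(reason) [SKB_DROP_REASON_##reason] = #reason

static const char * const drop_reason_names[] = {
        DEFINE_DROP_REASON(FN, FNe)
};

#undef FN
#undef FNe
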
+	SKB_DROP_REASON_TCP_AUTH_HDR,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5NOTFOUND: no MD5 hash where one was expected,
+	 * corresponding to LINUX_MIB_TCPMD5NOTFOUND
+	 */
+	SKB_DROP_REASON_TCP_MD5NOTFOUND,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5UNEXPECTED: MD5 hash and we're not expecting
+	 * one, corresponding to LINUX_MIB_TCPMD5UNEXPECTED
+	 */
+	SKB_DROP_REASON_TCP_MD5UNEXPECTED,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5FAILURE: MD5 hash is present and it's wrong,
+	 * corresponding to LINUX_MIB_TCPMD5FAILURE
+	 */
+	SKB_DROP_REASON_TCP_MD5FAILURE,
+	/**
+	 * @SKB_DROP_REASON_TCP_AONOTFOUND: no TCP-AO hash where one was expected,
+	 * corresponding to LINUX_MIB_TCPAOREQUIRED
+	 */
+	SKB_DROP_REASON_TCP_AONOTFOUND,
+	/**
+	 * @SKB_DROP_REASON_TCP_AOUNEXPECTED: TCP-AO hash is present and it
+	 * was not expected, corresponding to LINUX_MIB_TCPAOKEYNOTFOUND
+	 */
+	SKB_DROP_REASON_TCP_AOUNEXPECTED,
+	/**
+	 * @SKB_DROP_REASON_TCP_AOKEYNOTFOUND: TCP-AO key is unknown,
+	 * corresponding to LINUX_MIB_TCPAOKEYNOTFOUND
+	 */
+	SKB_DROP_REASON_TCP_AOKEYNOTFOUND,
+	/**
+	 * @SKB_DROP_REASON_TCP_AOFAILURE: TCP-AO hash is wrong,
+	 * corresponding to LINUX_MIB_TCPAOBAD
+	 */
+	SKB_DROP_REASON_TCP_AOFAILURE,
+	/**
+	 * @SKB_DROP_REASON_SOCKET_BACKLOG: failed to add the skb to the socket
+	 * backlog (see LINUX_MIB_TCPBACKLOGDROP)
+	 */
+	SKB_DROP_REASON_SOCKET_BACKLOG,
+	/** @SKB_DROP_REASON_TCP_FLAGS: invalid TCP flags */
+	SKB_DROP_REASON_TCP_FLAGS,
+	/**
+	 * @SKB_DROP_REASON_TCP_ABORT_ON_DATA: abort on data, corresponding to
+	 * LINUX_MIB_TCPABORTONDATA
+	 */
+	SKB_DROP_REASON_TCP_ABORT_ON_DATA,
+	/**
+	 * @SKB_DROP_REASON_TCP_ZEROWINDOW: TCP receive window size is zero,
+	 * see LINUX_MIB_TCPZEROWINDOWDROP
+	 */
+	SKB_DROP_REASON_TCP_ZEROWINDOW,
+	/**
+	 * @SKB_DROP_REASON_TCP_OLD_DATA: the received TCP data was already
+	 * received before (a spurious retransmission may have happened); see
+	 * LINUX_MIB_DELAYEDACKLOST
+	 */
+	SKB_DROP_REASON_TCP_OLD_DATA,
+	/**
+	 * @SKB_DROP_REASON_TCP_OVERWINDOW: the TCP data is out of window;
+	 * the sequence number of the first byte exceeds the right edge of
+	 * the receive window
+	 */
+	SKB_DROP_REASON_TCP_OVERWINDOW,
+	/**
+	 * @SKB_DROP_REASON_TCP_OFOMERGE: the skb's data is already present in
+	 * the out-of-order queue, corresponding to LINUX_MIB_TCPOFOMERGE
+	 */
+	SKB_DROP_REASON_TCP_OFOMERGE,
+	/**
+	 * @SKB_DROP_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to
+	 * LINUX_MIB_PAWSESTABREJECTED and LINUX_MIB_PAWSACTIVEREJECTED
+	 */
+	SKB_DROP_REASON_TCP_RFC7323_PAWS,
+	/**
+	 * @SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK: PAWS check, old ACK packet.
+	 * Corresponds to LINUX_MIB_PAWS_OLD_ACK.
+	 */
+	SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK,
+	/**
+	 * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in
+	 * TIME_WAIT state.
+	 * Corresponds to LINUX_MIB_PAWS_TW_REJECTED.
+	 */
+	SKB_DROP_REASON_TCP_RFC7323_TW_PAWS,
+	/**
+	 * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr.
+	 * Corresponds to LINUX_MIB_TSECRREJECTED.
+	 */
+	SKB_DROP_REASON_TCP_RFC7323_TSECR,
+	/** @SKB_DROP_REASON_TCP_LISTEN_OVERFLOW: listener queue full.
*/ + SKB_DROP_REASON_TCP_LISTEN_OVERFLOW, + /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ + SKB_DROP_REASON_TCP_OLD_SEQUENCE, + /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ + SKB_DROP_REASON_TCP_INVALID_SEQUENCE, + /** + * @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ + * field because ack sequence is not in the window between snd_una + * and snd_nxt + */ + SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE, + /** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */ + SKB_DROP_REASON_TCP_RESET, + /** + * @SKB_DROP_REASON_TCP_INVALID_SYN: Incoming packet has unexpected + * SYN flag + */ + SKB_DROP_REASON_TCP_INVALID_SYN, + /** @SKB_DROP_REASON_TCP_CLOSE: TCP socket in CLOSE state */ + SKB_DROP_REASON_TCP_CLOSE, + /** @SKB_DROP_REASON_TCP_FASTOPEN: dropped by FASTOPEN request socket */ + SKB_DROP_REASON_TCP_FASTOPEN, + /** @SKB_DROP_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */ + SKB_DROP_REASON_TCP_OLD_ACK, + /** @SKB_DROP_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */ + SKB_DROP_REASON_TCP_TOO_OLD_ACK, + /** + * @SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't + * sent yet + */ + SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, + /** @SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE: pruned from TCP OFO queue */ + SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE, + /** @SKB_DROP_REASON_TCP_OFO_DROP: data already in receive queue */ + SKB_DROP_REASON_TCP_OFO_DROP, + /** @SKB_DROP_REASON_IP_OUTNOROUTES: route lookup failed */ + SKB_DROP_REASON_IP_OUTNOROUTES, + /** + * @SKB_DROP_REASON_BPF_CGROUP_EGRESS: dropped by BPF_PROG_TYPE_CGROUP_SKB + * eBPF program + */ + SKB_DROP_REASON_BPF_CGROUP_EGRESS, + /** @SKB_DROP_REASON_IPV6DISABLED: IPv6 is disabled on the device */ + SKB_DROP_REASON_IPV6DISABLED, + /** @SKB_DROP_REASON_NEIGH_CREATEFAIL: failed to create neigh entry */ + SKB_DROP_REASON_NEIGH_CREATEFAIL, + /** @SKB_DROP_REASON_NEIGH_FAILED: neigh entry in failed state */ + SKB_DROP_REASON_NEIGH_FAILED, + /** @SKB_DROP_REASON_NEIGH_QUEUEFULL: arp_queue for neigh entry is full */ + SKB_DROP_REASON_NEIGH_QUEUEFULL, + /** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */ + SKB_DROP_REASON_NEIGH_DEAD, + /** @SKB_DROP_REASON_NEIGH_HH_FILLFAIL: failed to fill the device hard header */ + SKB_DROP_REASON_NEIGH_HH_FILLFAIL, + /** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */ + SKB_DROP_REASON_TC_EGRESS, + /** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */ + SKB_DROP_REASON_SECURITY_HOOK, + /** + * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting ( + * failed to enqueue to current qdisc) + */ + SKB_DROP_REASON_QDISC_DROP, + /** + * @SKB_DROP_REASON_QDISC_OVERLIMIT: dropped by qdisc when a qdisc + * instance exceeds its total buffer size limit. + */ + SKB_DROP_REASON_QDISC_OVERLIMIT, + /** + * @SKB_DROP_REASON_QDISC_CONGESTED: dropped by a qdisc AQM algorithm + * due to congestion. + */ + SKB_DROP_REASON_QDISC_CONGESTED, + /** + * @SKB_DROP_REASON_CAKE_FLOOD: dropped by the flood protection part of + * CAKE qdisc AQM algorithm (BLUE). + */ + SKB_DROP_REASON_CAKE_FLOOD, + /** + * @SKB_DROP_REASON_FQ_BAND_LIMIT: dropped by fq qdisc when per band + * limit is reached. + */ + SKB_DROP_REASON_FQ_BAND_LIMIT, + /** + * @SKB_DROP_REASON_FQ_HORIZON_LIMIT: dropped by fq qdisc when packet + * timestamp is too far in the future. + */ + SKB_DROP_REASON_FQ_HORIZON_LIMIT, + /** + * @SKB_DROP_REASON_FQ_FLOW_LIMIT: dropped by fq qdisc when a flow + * exceeds its limits. 
+	 */
+	SKB_DROP_REASON_FQ_FLOW_LIMIT,
+	/**
+	 * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb onto the per-CPU
+	 * backlog queue. This can be caused by the backlog queue being full (see
+	 * netdev_max_backlog in net.rst) or by the RPS flow limit
+	 */
+	SKB_DROP_REASON_CPU_BACKLOG,
+	/** @SKB_DROP_REASON_XDP: dropped by XDP in input path */
+	SKB_DROP_REASON_XDP,
+	/** @SKB_DROP_REASON_TC_INGRESS: dropped in TC ingress HOOK */
+	SKB_DROP_REASON_TC_INGRESS,
+	/** @SKB_DROP_REASON_UNHANDLED_PROTO: protocol not implemented or not supported */
+	SKB_DROP_REASON_UNHANDLED_PROTO,
+	/** @SKB_DROP_REASON_SKB_CSUM: sk_buff checksum computation error */
+	SKB_DROP_REASON_SKB_CSUM,
+	/** @SKB_DROP_REASON_SKB_GSO_SEG: GSO segmentation error */
+	SKB_DROP_REASON_SKB_GSO_SEG,
+	/**
+	 * @SKB_DROP_REASON_SKB_UCOPY_FAULT: failed to copy data from user space,
+	 * e.g., via zerocopy_sg_from_iter() or skb_orphan_frags_rx()
+	 */
+	SKB_DROP_REASON_SKB_UCOPY_FAULT,
+	/** @SKB_DROP_REASON_DEV_HDR: device driver specific header/metadata is invalid */
+	SKB_DROP_REASON_DEV_HDR,
+	/**
+	 * @SKB_DROP_REASON_DEV_READY: the device is not ready to xmit/recv because
+	 * some of its data structures are not up/ready/initialized,
+	 * e.g., IFF_UP is not set, or the driver-specific tun->tfiles[txq]
+	 * is not initialized
+	 */
+	SKB_DROP_REASON_DEV_READY,
+	/** @SKB_DROP_REASON_FULL_RING: ring buffer is full */
+	SKB_DROP_REASON_FULL_RING,
+	/** @SKB_DROP_REASON_NOMEM: error due to OOM */
+	SKB_DROP_REASON_NOMEM,
+	/**
+	 * @SKB_DROP_REASON_HDR_TRUNC: failed to truncate/extract the header from
+	 * the networking data, e.g., failed to pull the protocol header from
+	 * fragments via pskb_may_pull()
+	 */
+	SKB_DROP_REASON_HDR_TRUNC,
+	/**
+	 * @SKB_DROP_REASON_TAP_FILTER: dropped by an (eBPF) filter directly attached
+	 * to tun/tap, e.g., via TUNSETFILTEREBPF
+	 */
+	SKB_DROP_REASON_TAP_FILTER,
+	/**
+	 * @SKB_DROP_REASON_TAP_TXFILTER: dropped by the TX filter implemented in
+	 * tun/tap, e.g., check_filter()
+	 */
+	SKB_DROP_REASON_TAP_TXFILTER,
+	/** @SKB_DROP_REASON_ICMP_CSUM: ICMP checksum error */
+	SKB_DROP_REASON_ICMP_CSUM,
+	/**
+	 * @SKB_DROP_REASON_INVALID_PROTO: the packet doesn't follow RFC 2211,
+	 * such as a broadcast ICMP_TIMESTAMP
+	 */
+	SKB_DROP_REASON_INVALID_PROTO,
+	/**
+	 * @SKB_DROP_REASON_IP_INADDRERRORS: host unreachable, corresponding to
+	 * IPSTATS_MIB_INADDRERRORS
+	 */
+	SKB_DROP_REASON_IP_INADDRERRORS,
+	/**
+	 * @SKB_DROP_REASON_IP_INNOROUTES: network unreachable, corresponding to
+	 * IPSTATS_MIB_INNOROUTES
+	 */
+	SKB_DROP_REASON_IP_INNOROUTES,
+	/** @SKB_DROP_REASON_IP_LOCAL_SOURCE: the source ip is local */
+	SKB_DROP_REASON_IP_LOCAL_SOURCE,
+	/**
+	 * @SKB_DROP_REASON_IP_INVALID_SOURCE: the source ip is invalid:
+	 * 1) source ip is multicast or limited broadcast
+	 * 2) source ip is zero and not IGMP
+	 */
+	SKB_DROP_REASON_IP_INVALID_SOURCE,
+	/** @SKB_DROP_REASON_IP_LOCALNET: source or dest ip is local net */
+	SKB_DROP_REASON_IP_LOCALNET,
+	/**
+	 * @SKB_DROP_REASON_IP_INVALID_DEST: the dest ip is invalid:
+	 * 1) dest ip is 0
+	 */
+	SKB_DROP_REASON_IP_INVALID_DEST,
+	/**
+	 * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (it may exceed
+	 * the MTU)
+	 */
+	SKB_DROP_REASON_PKT_TOO_BIG,
+	/** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */
+	SKB_DROP_REASON_DUP_FRAG,
+	/** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */
+	SKB_DROP_REASON_FRAG_REASM_TIMEOUT,
+	/**
+	 * @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far.
+ * (/proc/sys/net/ipv4/ipfrag_max_dist) + */ + SKB_DROP_REASON_FRAG_TOO_FAR, + /** + * @SKB_DROP_REASON_TCP_MINTTL: ipv4 ttl or ipv6 hoplimit below + * the threshold (IP_MINTTL or IPV6_MINHOPCOUNT). + */ + SKB_DROP_REASON_TCP_MINTTL, + /** @SKB_DROP_REASON_IPV6_BAD_EXTHDR: Bad IPv6 extension header. */ + SKB_DROP_REASON_IPV6_BAD_EXTHDR, + /** @SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). */ + SKB_DROP_REASON_IPV6_NDISC_FRAG, + /** @SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. */ + SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT, + /** @SKB_DROP_REASON_IPV6_NDISC_BAD_CODE: invalid NDISC icmp6 code. */ + SKB_DROP_REASON_IPV6_NDISC_BAD_CODE, + /** @SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS: invalid NDISC options. */ + SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS, + /** + * @SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST: NEIGHBOUR SOLICITATION + * for another host. + */ + SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST, + /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */ + SKB_DROP_REASON_QUEUE_PURGE, + /** + * @SKB_DROP_REASON_TC_COOKIE_ERROR: An error occurred whilst + * processing a tc ext cookie. + */ + SKB_DROP_REASON_TC_COOKIE_ERROR, + /** + * @SKB_DROP_REASON_PACKET_SOCK_ERROR: generic packet socket errors + * after its filter matches an incoming packet. + */ + SKB_DROP_REASON_PACKET_SOCK_ERROR, + /** @SKB_DROP_REASON_TC_CHAIN_NOTFOUND: tc chain lookup failed. */ + SKB_DROP_REASON_TC_CHAIN_NOTFOUND, + /** + * @SKB_DROP_REASON_TC_RECLASSIFY_LOOP: tc exceeded max reclassify loop + * iterations. + */ + SKB_DROP_REASON_TC_RECLASSIFY_LOOP, + /** + * @SKB_DROP_REASON_VXLAN_INVALID_HDR: VXLAN header is invalid. E.g.: + * 1) reserved fields are not zero + * 2) "I" flag is not set + */ + SKB_DROP_REASON_VXLAN_INVALID_HDR, + /** @SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND: no VXLAN device found for VNI */ + SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND, + /** @SKB_DROP_REASON_MAC_INVALID_SOURCE: source mac is invalid */ + SKB_DROP_REASON_MAC_INVALID_SOURCE, + /** + * @SKB_DROP_REASON_VXLAN_ENTRY_EXISTS: trying to migrate a static + * entry or an entry pointing to a nexthop. + */ + SKB_DROP_REASON_VXLAN_ENTRY_EXISTS, + /** @SKB_DROP_REASON_NO_TX_TARGET: no target found for xmit */ + SKB_DROP_REASON_NO_TX_TARGET, + /** + * @SKB_DROP_REASON_IP_TUNNEL_ECN: skb is dropped according to + * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. + */ + SKB_DROP_REASON_IP_TUNNEL_ECN, + /** + * @SKB_DROP_REASON_TUNNEL_TXINFO: packet without necessary metadata + * reached a device which is in "external" mode. + */ + SKB_DROP_REASON_TUNNEL_TXINFO, + /** + * @SKB_DROP_REASON_LOCAL_MAC: the source MAC address is equal to + * the MAC address of the local netdev. + */ + SKB_DROP_REASON_LOCAL_MAC, + /** + * @SKB_DROP_REASON_ARP_PVLAN_DISABLE: packet which is not IP is + * forwarded to the in_dev, and the proxy_arp_pvlan is not + * enabled. + */ + SKB_DROP_REASON_ARP_PVLAN_DISABLE, + /** + * @SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL: the destination MAC address + * is an IEEE MAC Control address. + */ + SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL, + /** + * @SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE: the STP state of the + * ingress bridge port does not allow frames to be forwarded. 
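
Everything in this enum is ultimately consumed through kfree_skb_reason(), which feeds the value into the kfree_skb tracepoint so drop monitors can attribute the loss. A hedged sketch of the usual pattern in a receive handler; the handler itself is hypothetical, while kfree_skb_reason(), pskb_may_pull() and the NET_RX_* codes are standard kernel APIs:

static int example_rcv(struct sk_buff *skb)
{
        enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;

        if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
                reason = SKB_DROP_REASON_PKT_TOO_SMALL;
                goto drop;
        }
        /* ... normal processing ... */
        return NET_RX_SUCCESS;

drop:
        kfree_skb_reason(skb, reason);  /* reason is exported via the tracepoint */
        return NET_RX_DROP;
}
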
+ */ + SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE, + /** + * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which + * shouldn't be used as a real 'reason' - only for tracing code gen + */ + SKB_DROP_REASON_MAX, + + /** + * @SKB_DROP_REASON_SUBSYS_MASK: subsystem mask in drop reasons, + * see &enum skb_drop_reason_subsys + */ + SKB_DROP_REASON_SUBSYS_MASK = 0xffff0000, +}; + +#define SKB_DROP_REASON_SUBSYS_SHIFT 16 + +#define SKB_DR_INIT(name, reason) \ + enum skb_drop_reason name = SKB_DROP_REASON_##reason +#define SKB_DR(name) \ + SKB_DR_INIT(name, NOT_SPECIFIED) +#define SKB_DR_SET(name, reason) \ + (name = SKB_DROP_REASON_##reason) +#define SKB_DR_OR(name, reason) \ + do { \ + if (name == SKB_DROP_REASON_NOT_SPECIFIED || \ + name == SKB_NOT_DROPPED_YET) \ + SKB_DR_SET(name, reason); \ + } while (0) + +#endif diff --git a/include/net/dropreason.h b/include/net/dropreason.h new file mode 100644 index 000000000000..7d3b1a2a6fec --- /dev/null +++ b/include/net/dropreason.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_DROPREASON_H +#define _LINUX_DROPREASON_H +#include <net/dropreason-core.h> + +/** + * enum skb_drop_reason_subsys - subsystem tag for (extended) drop reasons + */ +enum skb_drop_reason_subsys { + /** @SKB_DROP_REASON_SUBSYS_CORE: core drop reasons defined above */ + SKB_DROP_REASON_SUBSYS_CORE, + + /** + * @SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE: mac80211 drop reasons + * for unusable frames, see net/mac80211/drop.h + */ + SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE, + + /** + * @SKB_DROP_REASON_SUBSYS_OPENVSWITCH: openvswitch drop reasons, + * see net/openvswitch/drop.h + */ + SKB_DROP_REASON_SUBSYS_OPENVSWITCH, + + /** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */ + SKB_DROP_REASON_SUBSYS_NUM +}; + +struct drop_reason_list { + const char * const *reasons; + size_t n_reasons; +}; + +/* Note: due to dynamic registrations, access must be under RCU */ +extern const struct drop_reason_list __rcu * +drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM]; + +void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys, + const struct drop_reason_list *list); +void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys); + +#endif diff --git a/include/net/dsa.h b/include/net/dsa.h index 35429a140dfa..55e2d97f247e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -22,10 +22,8 @@ #include <net/devlink.h> #include <net/switchdev.h> +struct dsa_8021q_context; struct tc_action; -struct phy_device; -struct fixed_phy_status; -struct phylink_link_state; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 @@ -45,10 +43,22 @@ struct phylink_link_state; #define DSA_TAG_PROTO_OCELOT_VALUE 15 #define DSA_TAG_PROTO_AR9331_VALUE 16 #define DSA_TAG_PROTO_RTL4_A_VALUE 17 +#define DSA_TAG_PROTO_HELLCREEK_VALUE 18 +#define DSA_TAG_PROTO_XRS700X_VALUE 19 +#define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 +#define DSA_TAG_PROTO_SEVILLE_VALUE 21 +#define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 +#define DSA_TAG_PROTO_SJA1110_VALUE 23 +#define DSA_TAG_PROTO_RTL8_4_VALUE 24 +#define DSA_TAG_PROTO_RTL8_4T_VALUE 25 +#define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 +#define DSA_TAG_PROTO_LAN937X_VALUE 27 +#define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, + DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = 
DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, @@ -65,65 +75,53 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE, + DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE, + DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE, + DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE, + DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, + DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, + DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE, + DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, + DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, + DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, + DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, }; -struct packet_type; struct dsa_switch; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); - struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); - /* Used to determine which traffic should match the DSA filter in - * eth_type_trans, and which, if any, should bypass it and be processed - * as regular on the master net device. - */ - bool (*filter)(const struct sk_buff *skb, struct net_device *dev); - unsigned int overhead; + int (*connect)(struct dsa_switch *ds); + void (*disconnect)(struct dsa_switch *ds); + unsigned int needed_headroom; + unsigned int needed_tailroom; const char *name; enum dsa_tag_protocol proto; /* Some tagging protocols either mangle or shift the destination MAC - * address, in which case the DSA master would drop packets on ingress + * address, in which case the DSA conduit would drop packets on ingress * if what it understands out of the destination MAC address is not in * its RX filter. */ - bool promisc_on_master; - bool tail_tag; -}; - -/* This structure defines the control interfaces that are overlayed by the - * DSA layer on top of the DSA CPU/management net_device instance. This is - * used by the core net_device layer while calling various net_device_ops - * function pointers. - */ -struct dsa_netdevice_ops { - int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, - int cmd); -}; - -#define DSA_TAG_DRIVER_ALIAS "dsa_tag-" -#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ - MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) - -struct dsa_skb_cb { - struct sk_buff *clone; + bool promisc_on_conduit; }; -struct __dsa_skb_cb { - struct dsa_skb_cb cb; - u8 priv[48 - sizeof(struct dsa_skb_cb)]; +struct dsa_lag { + struct net_device *dev; + unsigned int id; + struct mutex fdb_lock; + struct list_head fdbs; + refcount_t refcount; }; -#define DSA_SKB_CB(skb) ((struct dsa_skb_cb *)((skb)->cb)) - -#define DSA_SKB_CB_PRIV(skb) \ - ((void *)(skb)->cb + offsetof(struct __dsa_skb_cb, priv)) - struct dsa_switch_tree { struct list_head list; + /* List of switch ports */ + struct list_head ports; + /* Notifier chain for switch-wide events */ struct raw_notifier_head nh; @@ -133,6 +131,19 @@ struct dsa_switch_tree { /* Number of switches attached to this tree */ struct kref refcount; + /* Maps offloaded LAG netdevs to a zero-based linear ID for + * drivers that need it. 
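
Together with the helpers defined just below, the lags array gives drivers a stable ID for each offloaded LAG netdev (the IDs are one-based; the array index is the ID minus one). A sketch of how a driver could resolve that ID when one of its ports joins a bond; the function name and the hardware step are illustrative:

static int example_port_lag_join(struct dsa_switch *ds, int port,
                                 struct net_device *lag_dev)
{
        int id = dsa_lag_id(ds->dst, lag_dev); /* -ENODEV if not offloaded */

        if (id < 0)
                return id;

        /* program the hardware trunk table entry for 'port' using 'id' */
        return 0;
}
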
+ */ + struct dsa_lag **lags; + + /* Tagging protocol operations */ + const struct dsa_device_ops *tag_ops; + + /* Default tagging protocol preferred by the switches in this + * tree. + */ + enum dsa_tag_protocol default_proto; + /* Has this tree been applied to the hardware? */ bool setup; @@ -142,13 +153,51 @@ struct dsa_switch_tree { */ struct dsa_platform_data *pd; - /* List of switch ports */ - struct list_head ports; - /* List of DSA links composing the routing table */ struct list_head rtable; + + /* Length of "lags" array */ + unsigned int lags_len; + + /* Track the largest switch index within a tree */ + unsigned int last_switch; }; +/* LAG IDs are one-based, the dst->lags array is zero-based */ +#define dsa_lags_foreach_id(_id, _dst) \ + for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++) \ + if ((_dst)->lags[(_id) - 1]) + +#define dsa_lag_foreach_port(_dp, _dst, _lag) \ + list_for_each_entry((_dp), &(_dst)->ports, list) \ + if (dsa_port_offloads_lag((_dp), (_lag))) + +#define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ + list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ + if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) + +static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst, + unsigned int id) +{ + /* DSA LAG IDs are one-based, dst->lags is zero-based */ + return dst->lags[id - 1]; +} + +static inline int dsa_lag_id(struct dsa_switch_tree *dst, + struct net_device *lag_dev) +{ + unsigned int id; + + dsa_lags_foreach_id(id, dst) { + struct dsa_lag *lag = dsa_lag_by_id(dst, id); + + if (lag->dev == lag_dev) + return lag->id; + } + + return -ENODEV; +} + /* TC matchall action types */ enum dsa_port_mall_action_type { DSA_PORT_MALL_MIRROR, @@ -178,24 +227,34 @@ struct dsa_mall_tc_entry { }; }; +struct dsa_bridge { + struct net_device *dev; + unsigned int num; + bool tx_fwd_offload; + refcount_t refcount; +}; struct dsa_port { - /* A CPU port is physically connected to a master device. - * A user port exposed to userspace has a slave device. + /* A CPU port is physically connected to a conduit device. A user port + * exposes a network device to user-space, called 'user' here. */ union { - struct net_device *master; - struct net_device *slave; + struct net_device *conduit; + struct net_device *user; }; - /* CPU port tagging operations used by master or slave devices */ + /* Copy of the tagging protocol operations, for quicker access + * in the data path. Valid only for the CPU ports. + */ const struct dsa_device_ops *tag_ops; - /* Copies for faster access in master receive hot path */ + /* Copies for faster access in conduit receive hot path */ struct dsa_switch_tree *dst; - struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); - bool (*filter)(const struct sk_buff *skb, struct net_device *dev); + struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev); + + struct dsa_switch *ds; + + unsigned int index; enum { DSA_PORT_TYPE_UNUSED = 0, @@ -204,42 +263,75 @@ struct dsa_port { DSA_PORT_TYPE_USER, } type; - struct dsa_switch *ds; - unsigned int index; const char *name; struct dsa_port *cpu_dp; - const char *mac; + u8 mac[ETH_ALEN]; + + u8 stp_state; + + /* Warning: the following bit fields are not atomic, and updating them + * can only be done from code paths where concurrency is not possible + * (probe time or under rtnl_lock). 
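
Because C bit fields sharing a word are updated with a read-modify-write, two unserialized writers to the flag fields that follow could clobber each other's bits; that is why writes are confined to probe time or rtnl. A sketch of the expected writer discipline, with an illustrative helper name (ASSERT_RTNL() is the standard annotation):

static void example_set_learning(struct dsa_port *dp, bool learn)
{
        ASSERT_RTNL();          /* serializes RMW on the shared bit-field word */
        dp->learning = learn;
}
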
+ */ + u8 vlan_filtering:1; + + /* Managed by DSA on user ports and by drivers on CPU and DSA ports */ + u8 learning:1; + + u8 lag_tx_enabled:1; + + /* conduit state bits, valid only on CPU ports */ + u8 conduit_admin_up:1; + u8 conduit_oper_up:1; + + /* Valid only on user ports */ + u8 cpu_port_in_lag:1; + + u8 setup:1; + struct device_node *dn; unsigned int ageing_time; - bool vlan_filtering; - u8 stp_state; - struct net_device *bridge_dev; + + struct dsa_bridge *bridge; struct devlink_port devlink_port; - bool devlink_port_setup; struct phylink *pl; struct phylink_config pl_config; + struct dsa_lag *lag; + struct net_device *hsr_dev; struct list_head list; /* - * Give the switch driver somewhere to hang its per-port private data - * structures (accessible from the tagger). - */ - void *priv; - - /* - * Original copy of the master netdev ethtool_ops + * Original copy of the conduit netdev ethtool_ops */ const struct ethtool_ops *orig_ethtool_ops; - /* - * Original copy of the master netdev net_device_ops + /* List of MAC addresses that must be forwarded on this port. + * These are only valid on CPU ports and DSA links. */ - const struct dsa_netdevice_ops *netdev_ops; + struct mutex addr_lists_lock; + struct list_head fdbs; + struct list_head mdbs; - bool setup; + struct mutex vlans_lock; + union { + /* List of VLANs that CPU and DSA ports are members of. + * Access to this is serialized by the sleepable @vlans_lock. + */ + struct list_head vlans; + /* List of VLANs that user ports are members of. + * Access to this is serialized by netif_addr_lock_bh(). + */ + struct list_head user_vlans; + }; }; +static inline struct dsa_port * +dsa_phylink_to_port(struct phylink_config *config) +{ + return container_of(config, struct dsa_port, pl_config); +} + /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, @@ -251,9 +343,37 @@ struct dsa_link { struct list_head list; }; -struct dsa_switch { - bool setup; +enum dsa_db_type { + DSA_DB_PORT, + DSA_DB_LAG, + DSA_DB_BRIDGE, +}; + +struct dsa_db { + enum dsa_db_type type; + + union { + const struct dsa_port *dp; + struct dsa_lag lag; + struct dsa_bridge bridge; + }; +}; + +struct dsa_mac_addr { + unsigned char addr[ETH_ALEN]; + u16 vid; + refcount_t refcount; + struct list_head list; + struct dsa_db db; +}; +struct dsa_vlan { + u16 vid; + refcount_t refcount; + struct list_head list; +}; + +struct dsa_switch { struct device *dev; /* @@ -262,6 +382,68 @@ struct dsa_switch { struct dsa_switch_tree *dst; unsigned int index; + /* Warning: the following bit fields are not atomic, and updating them + * can only be done from code paths where concurrency is not possible + * (probe time or under rtnl_lock). + */ + u32 setup:1; + + /* Disallow bridge core from requesting different VLAN awareness + * settings on ports if not hardware-supported + */ + u32 vlan_filtering_is_global:1; + + /* Keep VLAN filtering enabled on ports not offloading any upper */ + u32 needs_standalone_vlan_filtering:1; + + /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges + * that have vlan_filtering=0. All drivers should ideally set this (and + * then the option would get removed), but it is unknown whether this + * would break things or not. + */ + u32 configure_vlan_while_not_filtering:1; + + /* Pop the default_pvid of VLAN-unaware bridge ports from tagged frames. + * DEPRECATED: Do NOT set this field in new drivers. 
Instead look at + * the dsa_software_vlan_untag() comments. + */ + u32 untag_bridge_pvid:1; + /* Pop the default_pvid of VLAN-aware bridge ports from tagged frames. + * Useful if the switch cannot preserve the VLAN tag as seen on the + * wire for user port ingress, and chooses to send all frames as + * VLAN-tagged to the CPU, including those which were originally + * untagged. + */ + u32 untag_vlan_aware_bridge_pvid:1; + + /* Let DSA manage the FDB entries towards the + * CPU, based on the software bridge database. + */ + u32 assisted_learning_on_cpu_port:1; + + /* In case vlan_filtering_is_global is set, the VLAN awareness state + * should be retrieved from here and not from the per-port settings. + */ + u32 vlan_filtering:1; + + /* For switches that only have the MRU configurable. To ensure the + * configured MTU is not exceeded, normalization of MRU on all bridged + * interfaces is needed. + */ + u32 mtu_enforcement_ingress:1; + + /* Drivers that isolate the FDBs of multiple bridges must set this + * to true to receive the bridge as an argument in .port_fdb_{add,del} + * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be + * passed as zero. + */ + u32 fdb_isolation:1; + + /* Drivers that have global DSCP mapping settings must set this to + * true to automatically apply the settings to all ports. + */ + u32 dscp_prio_mapping_is_global:1; + /* Listener for switch fabric events */ struct notifier_block nb; @@ -271,6 +453,8 @@ struct dsa_switch { */ void *priv; + void *tagger_data; + /* * Configuration data for this switch. */ @@ -282,58 +466,45 @@ struct dsa_switch { const struct dsa_switch_ops *ops; /* - * Slave mii_bus and devices for the individual ports. + * Allow a DSA switch driver to override the phylink MAC ops + */ + const struct phylink_mac_ops *phylink_mac_ops; + + /* + * User mii_bus and devices for the individual ports. */ u32 phys_mii_mask; - struct mii_bus *slave_mii_bus; + struct mii_bus *user_mii_bus; /* Ageing Time limits in msecs */ unsigned int ageing_time_min; unsigned int ageing_time_max; + /* Storage for drivers using tag_8021q */ + struct dsa_8021q_context *tag_8021q_ctx; + /* devlink used to represent this switch device */ struct devlink *devlink; /* Number of switch port queues */ unsigned int num_tx_queues; - /* Disallow bridge core from requesting different VLAN awareness - * settings on ports if not hardware-supported - */ - bool vlan_filtering_is_global; - - /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges - * that have vlan_filtering=0. All drivers should ideally set this (and - * then the option would get removed), but it is unknown whether this - * would break things or not. - */ - bool configure_vlan_while_not_filtering; - - /* If the switch driver always programs the CPU port as egress tagged - * despite the VLAN configuration indicating otherwise, then setting - * @untag_bridge_pvid will force the DSA receive path to pop the bridge's - * default_pvid VLAN tagged frames to offer a consistent behavior - * between a vlan_filtering=0 and vlan_filtering=1 bridge device. + /* Drivers that benefit from having an ID associated with each + * offloaded LAG should set this to the maximum number of + * supported IDs. DSA will then maintain a mapping of _at + * least_ these many IDs, accessible to drivers via + * dsa_lag_id(). */ - bool untag_bridge_pvid; + unsigned int num_lag_ids; - /* In case vlan_filtering_is_global is set, the VLAN awareness state - * should be retrieved from here and not from the per-port settings. 
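A sketch of how these capability bits are typically declared in a driver's .setup method; which bits apply is entirely hardware-specific and foo_setup() is hypothetical:

static int foo_setup(struct dsa_switch *ds)
{
	/* One VLAN awareness setting for the whole switch */
	ds->vlan_filtering_is_global = true;
	/* Accept .port_vlan_add/.port_vlan_del while not filtering */
	ds->configure_vlan_while_not_filtering = true;
	/* Let DSA mirror bridge FDB entries onto the CPU port */
	ds->assisted_learning_on_cpu_port = true;
	/* Hardware FDBs are partitioned per bridge */
	ds->fdb_isolation = true;

	return 0;
}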
- */ - bool vlan_filtering; - - /* MAC PCS does not provide link state change interrupt, and requires - * polling. Flag passed on to PHYLINK. + /* Drivers that support bridge forwarding offload or FDB isolation + * should set this to the maximum number of bridges spanning the same + * switch tree (or all trees, in the case of cross-tree bridging + * support) that can be offloaded. */ - bool pcs_poll; + unsigned int max_num_bridges; - /* For switches that only have the MRU configurable. To ensure the - * configured MTU is not exceeded, normalization of MRU on all bridged - * interfaces is needed. - */ - bool mtu_enforcement_ingress; - - size_t num_ports; + unsigned int num_ports; }; static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) @@ -348,6 +519,32 @@ static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) return NULL; } +static inline bool dsa_port_is_dsa(struct dsa_port *port) +{ + return port->type == DSA_PORT_TYPE_DSA; +} + +static inline bool dsa_port_is_cpu(struct dsa_port *port) +{ + return port->type == DSA_PORT_TYPE_CPU; +} + +static inline bool dsa_port_is_user(struct dsa_port *dp) +{ + return dp->type == DSA_PORT_TYPE_USER; +} + +static inline bool dsa_port_is_unused(struct dsa_port *dp) +{ + return dp->type == DSA_PORT_TYPE_UNUSED; +} + +static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp) +{ + return dsa_port_is_cpu(dp) && dp->conduit_admin_up && + dp->conduit_oper_up; +} + static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; @@ -368,14 +565,68 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } +#define dsa_tree_for_each_user_port(_dp, _dst) \ + list_for_each_entry((_dp), &(_dst)->ports, list) \ + if (dsa_port_is_user((_dp))) + +#define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \ + list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \ + if (dsa_port_is_user((_dp))) + +#define dsa_tree_for_each_cpu_port(_dp, _dst) \ + list_for_each_entry((_dp), &(_dst)->ports, list) \ + if (dsa_port_is_cpu((_dp))) + +#define dsa_switch_for_each_port(_dp, _ds) \ + list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ + if ((_dp)->ds == (_ds)) + +#define dsa_switch_for_each_port_safe(_dp, _next, _ds) \ + list_for_each_entry_safe((_dp), (_next), &(_ds)->dst->ports, list) \ + if ((_dp)->ds == (_ds)) + +#define dsa_switch_for_each_port_continue_reverse(_dp, _ds) \ + list_for_each_entry_continue_reverse((_dp), &(_ds)->dst->ports, list) \ + if ((_dp)->ds == (_ds)) + +#define dsa_switch_for_each_available_port(_dp, _ds) \ + dsa_switch_for_each_port((_dp), (_ds)) \ + if (!dsa_port_is_unused((_dp))) + +#define dsa_switch_for_each_user_port(_dp, _ds) \ + dsa_switch_for_each_port((_dp), (_ds)) \ + if (dsa_port_is_user((_dp))) + +#define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \ + dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ + if (dsa_port_is_user((_dp))) + +#define dsa_switch_for_each_cpu_port(_dp, _ds) \ + dsa_switch_for_each_port((_dp), (_ds)) \ + if (dsa_port_is_cpu((_dp))) + +#define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \ + dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ + if (dsa_port_is_cpu((_dp))) + static inline u32 dsa_user_ports(struct dsa_switch *ds) { + struct dsa_port *dp; + u32 mask = 0; + + dsa_switch_for_each_user_port(dp, ds) + mask |= BIT(dp->index); + + return mask; +} + +static inline u32 
dsa_cpu_ports(struct dsa_switch *ds) +{ + struct dsa_port *cpu_dp; u32 mask = 0; - int p; - for (p = 0; p < ds->num_ports; p++) - if (dsa_is_user_port(ds, p)) - mask |= BIT(p); + dsa_switch_for_each_cpu_port(cpu_dp, ds) + mask |= BIT(cpu_dp->index); return mask; } @@ -415,6 +666,50 @@ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } +/* Return true if this is the local port used to reach the CPU port */ +static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) +{ + if (dsa_is_unused_port(ds, port)) + return false; + + return port == dsa_upstream_port(ds, port); +} + +/* Return true if this is a DSA port leading away from the CPU */ +static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port) +{ + return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port); +} + +/* Return the local port used to reach the CPU port */ +static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) +{ + struct dsa_port *dp; + + dsa_switch_for_each_available_port(dp, ds) { + return dsa_upstream_port(ds, dp->index); + } + + return ds->num_ports; +} + +/* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning + * that the routing port from @downstream_ds to @upstream_ds is also the port + * which @downstream_ds uses to reach its dedicated CPU. + */ +static inline bool dsa_switch_is_upstream_of(struct dsa_switch *upstream_ds, + struct dsa_switch *downstream_ds) +{ + int routing_port; + + if (upstream_ds == downstream_ds) + return true; + + routing_port = dsa_routing_port(downstream_ds, upstream_ds->index); + + return dsa_is_upstream_port(downstream_ds, routing_port); +} + static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) { const struct dsa_switch *ds = dp->ds; @@ -425,15 +720,157 @@ static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) return dp->vlan_filtering; } +static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp) +{ + return dp->lag ? dp->lag->id : 0; +} + +static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp) +{ + return dp->lag ? dp->lag->dev : NULL; +} + +static inline bool dsa_port_offloads_lag(struct dsa_port *dp, + const struct dsa_lag *lag) +{ + return dsa_port_lag_dev_get(dp) == lag->dev; +} + +static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp) +{ + if (dp->cpu_port_in_lag) + return dsa_port_lag_dev_get(dp->cpu_dp); + + return dp->cpu_dp->conduit; +} + +static inline +struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) +{ + if (!dp->bridge) + return NULL; + + if (dp->lag) + return dp->lag->dev; + else if (dp->hsr_dev) + return dp->hsr_dev; + + return dp->user; +} + +static inline struct net_device * +dsa_port_bridge_dev_get(const struct dsa_port *dp) +{ + return dp->bridge ? dp->bridge->dev : NULL; +} + +static inline unsigned int dsa_port_bridge_num_get(struct dsa_port *dp) +{ + return dp->bridge ? dp->bridge->num : 0; +} + +static inline bool dsa_port_bridge_same(const struct dsa_port *a, + const struct dsa_port *b) +{ + struct net_device *br_a = dsa_port_bridge_dev_get(a); + struct net_device *br_b = dsa_port_bridge_dev_get(b); + + /* Standalone ports are not in the same bridge with one another */ + return (!br_a || !br_b) ? 
false : (br_a == br_b); +} + +static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, + const struct net_device *dev) +{ + return dsa_port_to_bridge_port(dp) == dev; +} + +static inline bool +dsa_port_offloads_bridge_dev(struct dsa_port *dp, + const struct net_device *bridge_dev) +{ + /* DSA ports connected to a bridge, and event was emitted + * for the bridge. + */ + return dsa_port_bridge_dev_get(dp) == bridge_dev; +} + +static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, + const struct dsa_bridge *bridge) +{ + return dsa_port_bridge_dev_get(dp) == bridge->dev; +} + +/* Returns true if any port of this tree offloads the given net_device */ +static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, + const struct net_device *dev) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_offloads_bridge_port(dp, dev)) + return true; + + return false; +} + +/* Returns true if any port of this tree offloads the given bridge */ +static inline bool +dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst, + const struct net_device *bridge_dev) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_offloads_bridge_dev(dp, bridge_dev)) + return true; + + return false; +} + +static inline bool dsa_port_tree_same(const struct dsa_port *a, + const struct dsa_port *b) +{ + return a->ds->dst == b->ds->dst; +} + typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { + /* + * Tagging protocol helpers called for the CPU ports and DSA links. + * @get_tag_protocol retrieves the initial tagging protocol and is + * mandatory. Switches which can operate using multiple tagging + * protocols should implement @change_tag_protocol and report in + * @get_tag_protocol the tagger in current use. + */ enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); + int (*change_tag_protocol)(struct dsa_switch *ds, + enum dsa_tag_protocol proto); + /* + * Method for switch drivers to connect to the tagging protocol driver + * in current use. The switch driver can provide handlers for certain + * types of packets for switch management. + */ + int (*connect_tag_protocol)(struct dsa_switch *ds, + enum dsa_tag_protocol proto); + + int (*port_change_conduit)(struct dsa_switch *ds, int port, + struct net_device *conduit, + struct netlink_ext_ack *extack); + /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); + + /* Per-port initialization and destruction methods. Mandatory if the + * driver registers devlink port regions, optional otherwise. 
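A sketch of the pairing described above for a hypothetical switch that supports two taggers; the DSA_TAG_PROTO_FOO* values and foo_priv are illustrative:

static enum dsa_tag_protocol
foo_get_tag_protocol(struct dsa_switch *ds, int port,
		     enum dsa_tag_protocol mprot)
{
	struct foo_priv *priv = ds->priv;

	/* Report the tagger in current use, not just the default */
	return priv->tag_proto;
}

static int foo_change_tag_protocol(struct dsa_switch *ds,
				   enum dsa_tag_protocol proto)
{
	struct foo_priv *priv = ds->priv;

	if (proto != DSA_TAG_PROTO_FOO &&
	    proto != DSA_TAG_PROTO_FOO_LEGACY)
		return -EOPNOTSUPP;

	/* reprogram the CPU and DSA ports here, then remember the choice */
	priv->tag_proto = proto;

	return 0;
}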
+ */ + int (*port_setup)(struct dsa_switch *ds, int port); + void (*port_teardown)(struct dsa_switch *ds, int port); + u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* @@ -444,38 +881,14 @@ struct dsa_switch_ops { int regnum, u16 val); /* - * Link state adjustment (called from libphy) - */ - void (*adjust_link)(struct dsa_switch *ds, int port, - struct phy_device *phydev); - void (*fixed_link_update)(struct dsa_switch *ds, int port, - struct fixed_phy_status *st); - - /* * PHYLINK integration */ - void (*phylink_validate)(struct dsa_switch *ds, int port, - unsigned long *supported, - struct phylink_link_state *state); - int (*phylink_mac_link_state)(struct dsa_switch *ds, int port, - struct phylink_link_state *state); - void (*phylink_mac_config)(struct dsa_switch *ds, int port, - unsigned int mode, - const struct phylink_link_state *state); - void (*phylink_mac_an_restart)(struct dsa_switch *ds, int port); - void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface); - void (*phylink_mac_link_up)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev, - int speed, int duplex, - bool tx_pause, bool rx_pause); + void (*phylink_get_caps)(struct dsa_switch *ds, int port, + struct phylink_config *config); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* - * ethtool hardware statistics. + * Port statistics counters. */ void (*get_strings)(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data); @@ -484,6 +897,23 @@ struct dsa_switch_ops { int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); void (*get_ethtool_phy_stats)(struct dsa_switch *ds, int port, uint64_t *data); + void (*get_eth_phy_stats)(struct dsa_switch *ds, int port, + struct ethtool_eth_phy_stats *phy_stats); + void (*get_eth_mac_stats)(struct dsa_switch *ds, int port, + struct ethtool_eth_mac_stats *mac_stats); + void (*get_eth_ctrl_stats)(struct dsa_switch *ds, int port, + struct ethtool_eth_ctrl_stats *ctrl_stats); + void (*get_rmon_stats)(struct dsa_switch *ds, int port, + struct ethtool_rmon_stats *rmon_stats, + const struct ethtool_rmon_hist_range **ranges); + void (*get_ts_stats)(struct dsa_switch *ds, int port, + struct ethtool_ts_stats *ts_stats); + void (*get_stats64)(struct dsa_switch *ds, int port, + struct rtnl_link_stats64 *s); + void (*get_pause_stats)(struct dsa_switch *ds, int port, + struct ethtool_pause_stats *pause_stats); + void (*self_test)(struct dsa_switch *ds, int port, + struct ethtool_test *etest, u64 *data); /* * ethtool Wake-on-LAN @@ -497,7 +927,34 @@ struct dsa_switch_ops { * ethtool timestamp info */ int (*get_ts_info)(struct dsa_switch *ds, int port, - struct ethtool_ts_info *ts); + struct kernel_ethtool_ts_info *ts); + + /* + * ethtool MAC merge layer + */ + int (*get_mm)(struct dsa_switch *ds, int port, + struct ethtool_mm_state *state); + int (*set_mm)(struct dsa_switch *ds, int port, + struct ethtool_mm_cfg *cfg, + struct netlink_ext_ack *extack); + void (*get_mm_stats)(struct dsa_switch *ds, int port, + struct ethtool_mm_stats *stats); + + /* + * DCB ops + */ + int (*port_get_default_prio)(struct dsa_switch *ds, int port); + int (*port_set_default_prio)(struct dsa_switch *ds, int port, + u8 prio); + int (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp); + int (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, + u8 prio); + int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, 
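A sketch of a typical @phylink_get_caps implementation, assuming a port that supports RGMII at up to 1G; the exact capabilities are hardware-specific:

static void foo_phylink_get_caps(struct dsa_switch *ds, int port,
				 struct phylink_config *config)
{
	config->mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE |
				   MAC_10 | MAC_100 | MAC_1000FD;

	__set_bit(PHY_INTERFACE_MODE_RGMII,
		  config->supported_interfaces);
	__set_bit(PHY_INTERFACE_MODE_INTERNAL,
		  config->supported_interfaces);
}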
u8 dscp, + u8 prio); + int (*port_set_apptrust)(struct dsa_switch *ds, int port, + const u8 *sel, int nsel); + int (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel, + int *nsel); /* * Suspend and resume @@ -512,13 +969,30 @@ struct dsa_switch_ops { struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port); + + /* + * Notification for MAC address changes on user ports. Drivers can + * currently only veto operations. They should not use the method to + * program the hardware, since the operation is not rolled back in case + * of other errors. + */ + int (*port_set_mac_address)(struct dsa_switch *ds, int port, + const unsigned char *addr); + + /* + * Compatibility between device trees defining multiple CPU ports and + * drivers which are not OK to use by default the numerically smallest + * CPU port of a switch for its local ports. This can return NULL, + * meaning "don't know/don't care". + */ + struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); + /* * Port's MAC EEE settings */ + bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, - struct ethtool_eee *e); - int (*get_mac_eee)(struct dsa_switch *ds, int port, - struct ethtool_eee *e); + struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); @@ -535,50 +1009,77 @@ struct dsa_switch_ops { struct ethtool_regs *regs, void *p); /* + * Upper device tracking. + */ + int (*port_prechangeupper)(struct dsa_switch *ds, int port, + struct netdev_notifier_changeupper_info *info); + + /* * Bridge integration */ int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, - struct net_device *bridge); + struct dsa_bridge bridge, + bool *tx_fwd_offload, + struct netlink_ext_ack *extack); void (*port_bridge_leave)(struct dsa_switch *ds, int port, - struct net_device *bridge); + struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); + int (*port_mst_state_set)(struct dsa_switch *ds, int port, + const struct switchdev_mst_state *state); void (*port_fast_age)(struct dsa_switch *ds, int port); - int (*port_egress_floods)(struct dsa_switch *ds, int port, - bool unicast, bool multicast); + int (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid); + int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); + int (*port_bridge_flags)(struct dsa_switch *ds, int port, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); + void (*port_set_host_flood)(struct dsa_switch *ds, int port, + bool uc, bool mc); /* * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering, - struct switchdev_trans *trans); - int (*port_vlan_prepare)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); - void (*port_vlan_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan); + struct netlink_ext_ack *extack); + int (*port_vlan_add)(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); + int (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge, + const struct switchdev_vlan_msti *msti); + /* * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, - 
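A sketch of an FDB handler keyed by the database argument (see the dsa_db-taking signatures just below), assuming ds->fdb_isolation is set; foo_fdb_write() and the FID constants are hypothetical:

static int foo_port_fdb_add(struct dsa_switch *ds, int port,
			    const unsigned char *addr, u16 vid,
			    struct dsa_db db)
{
	u16 fid;

	switch (db.type) {
	case DSA_DB_PORT:
		fid = FOO_STANDALONE_FID;		/* hypothetical */
		break;
	case DSA_DB_BRIDGE:
		fid = FOO_BRIDGE_FID_BASE + db.bridge.num;
		break;
	default:
		return -EOPNOTSUPP;
	}

	return foo_fdb_write(ds, port, addr, vid, fid);	/* hypothetical */
}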
const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + struct dsa_db db); int (*port_fdb_del)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + struct dsa_db db); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); + int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, + const unsigned char *addr, u16 vid, + struct dsa_db db); + int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, + const unsigned char *addr, u16 vid, + struct dsa_db db); /* * Multicast database */ - int (*port_mdb_prepare)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); - void (*port_mdb_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); + int (*port_mdb_add)(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db); int (*port_mdb_del)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db); /* * RXNFC */ @@ -598,7 +1099,7 @@ struct dsa_switch_ops { struct flow_cls_offload *cls, bool ingress); int (*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, - bool ingress); + bool ingress, struct netlink_ext_ack *extack); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, @@ -612,20 +1113,30 @@ struct dsa_switch_ops { */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, int sw_index, int port, - struct net_device *br); + struct dsa_bridge bridge, + struct netlink_ext_ack *extack); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, - struct net_device *br); + struct dsa_bridge bridge); + int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, + int port); + int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, + int port, struct dsa_lag lag, + struct netdev_lag_upper_info *info, + struct netlink_ext_ack *extack); + int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, + int port, struct dsa_lag lag); /* * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, - struct ifreq *ifr); - bool (*port_txtstamp)(struct dsa_switch *ds, int port, - struct sk_buff *clone, unsigned int type); + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); + void (*port_txtstamp)(struct dsa_switch *ds, int port, + struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); @@ -637,6 +1148,40 @@ struct dsa_switch_ops { int (*devlink_info_get)(struct dsa_switch *ds, struct devlink_info_req *req, struct netlink_ext_ack *extack); + int (*devlink_sb_pool_get)(struct dsa_switch *ds, + unsigned int sb_index, u16 pool_index, + struct devlink_sb_pool_info *pool_info); + int (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack); + int (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold); + int (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 pool_index, + u32 
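A sketch of the kernel_hwtstamp_config-based timestamping path, for a hypothetical switch that can only timestamp L2 PTP event packets; foo_hwtstamp_apply() is illustrative:

static int foo_port_hwtstamp_set(struct dsa_switch *ds, int port,
				 struct kernel_hwtstamp_config *config,
				 struct netlink_ext_ack *extack)
{
	if (config->tx_type != HWTSTAMP_TX_ON &&
	    config->tx_type != HWTSTAMP_TX_OFF) {
		NL_SET_ERR_MSG_MOD(extack, "TX timestamping mode not supported");
		return -ERANGE;
	}

	switch (config->rx_filter) {
	case HWTSTAMP_FILTER_NONE:
	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
		break;
	default:
		/* Report back the filter actually programmed */
		config->rx_filter = HWTSTAMP_FILTER_NONE;
		break;
	}

	return foo_hwtstamp_apply(ds, port, config); /* hypothetical */
}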
threshold, + struct netlink_ext_ack *extack); + int (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold); + int (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack); + int (*devlink_sb_occ_snapshot)(struct dsa_switch *ds, + unsigned int sb_index); + int (*devlink_sb_occ_max_clear)(struct dsa_switch *ds, + unsigned int sb_index); + int (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 pool_index, + u32 *p_cur, u32 *p_max); + int (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max); /* * MTU change functionality. Switches can also adjust their MRU through @@ -647,6 +1192,52 @@ struct dsa_switch_ops { int (*port_change_mtu)(struct dsa_switch *ds, int port, int new_mtu); int (*port_max_mtu)(struct dsa_switch *ds, int port); + + /* + * LAG integration + */ + int (*port_lag_change)(struct dsa_switch *ds, int port); + int (*port_lag_join)(struct dsa_switch *ds, int port, + struct dsa_lag lag, + struct netdev_lag_upper_info *info, + struct netlink_ext_ack *extack); + int (*port_lag_leave)(struct dsa_switch *ds, int port, + struct dsa_lag lag); + + /* + * HSR integration + */ + int (*port_hsr_join)(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack); + int (*port_hsr_leave)(struct dsa_switch *ds, int port, + struct net_device *hsr); + + /* + * MRP integration + */ + int (*port_mrp_add)(struct dsa_switch *ds, int port, + const struct switchdev_obj_mrp *mrp); + int (*port_mrp_del)(struct dsa_switch *ds, int port, + const struct switchdev_obj_mrp *mrp); + int (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port, + const struct switchdev_obj_ring_role_mrp *mrp); + int (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port, + const struct switchdev_obj_ring_role_mrp *mrp); + + /* + * tag_8021q operations + */ + int (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid, + u16 flags); + int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); + + /* + * DSA conduit tracking operations + */ + void (*conduit_state_change)(struct dsa_switch *ds, + const struct net_device *conduit, + bool operational); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ @@ -656,7 +1247,8 @@ struct dsa_switch_ops { int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx); int dsa_devlink_param_set(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int dsa_devlink_params_register(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); @@ -721,7 +1313,12 @@ struct dsa_switch_driver { const struct dsa_switch_ops *ops; }; -struct net_device *dsa_dev_to_net_device(struct device *dev); +bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid, + struct dsa_db db); +bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) @@ -732,15 +1329,6 
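A sketch of the MTU pair named above, assuming a tagging header of FOO_TAG_LEN bytes that the CPU port must absorb; all foo_* names are illustrative:

static int foo_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
{
	/* The CPU port carries the tag on top of the user MTU */
	if (dsa_is_cpu_port(ds, port))
		new_mtu += FOO_TAG_LEN;

	return foo_write_max_frame_len(ds, port,
				       new_mtu + ETH_HLEN + ETH_FCS_LEN);
}

static int foo_port_max_mtu(struct dsa_switch *ds, int port)
{
	return FOO_MAX_FRAME_LEN - ETH_HLEN - ETH_FCS_LEN;
}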
@@ static inline bool netdev_uses_dsa(const struct net_device *dev) return false; } -static inline bool dsa_can_decode(const struct sk_buff *skb, - struct net_device *dev) -{ -#if IS_ENABLED(CONFIG_NET_DSA) - return !dev->dsa_ptr->filter || dev->dsa_ptr->filter(skb, dev); -#endif - return false; -} - /* All DSA tags that push the EtherType to the right (basically all except tail * tags, which don't break dissection) can be treated the same from the * perspective of the flow dissector. @@ -760,52 +1348,18 @@ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb, { #if IS_ENABLED(CONFIG_NET_DSA) const struct dsa_device_ops *ops = skb->dev->dsa_ptr->tag_ops; - int tag_len = ops->overhead; + int tag_len = ops->needed_headroom; *offset = tag_len; *proto = ((__be16 *)skb->data)[(tag_len / 2) - 1]; #endif } -#if IS_ENABLED(CONFIG_NET_DSA) -static inline int __dsa_netdevice_ops_check(struct net_device *dev) -{ - int err = -EOPNOTSUPP; - - if (!dev->dsa_ptr) - return err; - - if (!dev->dsa_ptr->netdev_ops) - return err; - - return 0; -} - -static inline int dsa_ndo_do_ioctl(struct net_device *dev, struct ifreq *ifr, - int cmd) -{ - const struct dsa_netdevice_ops *ops; - int err; - - err = __dsa_netdevice_ops_check(dev); - if (err) - return err; - - ops = dev->dsa_ptr->netdev_ops; - - return ops->ndo_do_ioctl(dev, ifr, cmd); -} -#else -static inline int dsa_ndo_do_ioctl(struct net_device *dev, struct ifreq *ifr, - int cmd) -{ - return -EOPNOTSUPP; -} -#endif - void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); +void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); +void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); @@ -820,127 +1374,17 @@ static inline int dsa_switch_resume(struct dsa_switch *ds) } #endif /* CONFIG_PM_SLEEP */ -enum dsa_notifier_type { - DSA_PORT_REGISTER, - DSA_PORT_UNREGISTER, -}; - -struct dsa_notifier_info { - struct net_device *dev; -}; - -struct dsa_notifier_register_info { - struct dsa_notifier_info info; /* must be first */ - struct net_device *master; - unsigned int port_number; - unsigned int switch_number; -}; - -static inline struct net_device * -dsa_notifier_info_to_dev(const struct dsa_notifier_info *info) -{ - return info->dev; -} - #if IS_ENABLED(CONFIG_NET_DSA) -int register_dsa_notifier(struct notifier_block *nb); -int unregister_dsa_notifier(struct notifier_block *nb); -int call_dsa_notifiers(unsigned long val, struct net_device *dev, - struct dsa_notifier_info *info); +bool dsa_user_dev_check(const struct net_device *dev); #else -static inline int register_dsa_notifier(struct notifier_block *nb) -{ - return 0; -} - -static inline int unregister_dsa_notifier(struct notifier_block *nb) -{ - return 0; -} - -static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, - struct dsa_notifier_info *info) +static inline bool dsa_user_dev_check(const struct net_device *dev) { - return NOTIFY_DONE; + return false; } #endif -/* Broadcom tag specific helpers to insert and extract queue/port number */ -#define BRCM_TAG_SET_PORT_QUEUE(p, q) ((p) << 8 | q) -#define BRCM_TAG_GET_PORT(v) ((v) >> 8) -#define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) - - netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); -int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data); -int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, 
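A sketch of the driver lifecycle around dsa_register_switch() and dsa_switch_shutdown(); the platform bus and foo_* names are illustrative, but wiring up a shutdown hook is the expected pattern, so the conduit cannot outlive the switch it serves:

static int foo_probe(struct platform_device *pdev)
{
	struct dsa_switch *ds;

	ds = devm_kzalloc(&pdev->dev, sizeof(*ds), GFP_KERNEL);
	if (!ds)
		return -ENOMEM;

	ds->dev = &pdev->dev;
	ds->num_ports = FOO_NUM_PORTS;	/* hypothetical */
	ds->ops = &foo_switch_ops;	/* hypothetical */
	platform_set_drvdata(pdev, ds);

	return dsa_register_switch(ds);
}

static void foo_shutdown(struct platform_device *pdev)
{
	struct dsa_switch *ds = platform_get_drvdata(pdev);

	if (ds)
		dsa_switch_shutdown(ds);

	platform_set_drvdata(pdev, NULL);
}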
uint64_t *data); -int dsa_port_get_phy_sset_count(struct dsa_port *dp); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); +bool dsa_supports_eee(struct dsa_switch *ds, int port); -struct dsa_tag_driver { - const struct dsa_device_ops *ops; - struct list_head list; - struct module *owner; -}; - -void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count, - struct module *owner); -void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count); - -#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ -static int __init dsa_tag_driver_module_init(void) \ -{ \ - dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ - THIS_MODULE); \ - return 0; \ -} \ -module_init(dsa_tag_driver_module_init); \ - \ -static void __exit dsa_tag_driver_module_exit(void) \ -{ \ - dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ -} \ -module_exit(dsa_tag_driver_module_exit) - -/** - * module_dsa_tag_drivers() - Helper macro for registering DSA tag - * drivers - * @__ops_array: Array of tag driver strucutres - * - * Helper macro for DSA tag drivers which do not do anything special - * in module init/exit. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit(). - */ -#define module_dsa_tag_drivers(__ops_array) \ -dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) - -#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops - -/* Create a static structure we can build a linked list of dsa_tag - * drivers - */ -#define DSA_TAG_DRIVER(__ops) \ -static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ - .ops = &__ops, \ -} - -/** - * module_dsa_tag_driver() - Helper macro for registering a single DSA tag - * driver - * @__ops: Single tag driver structures - * - * Helper macro for DSA tag drivers which do not do anything special - * in module init/exit. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit(). - */ -#define module_dsa_tag_driver(__ops) \ -DSA_TAG_DRIVER(__ops); \ - \ -static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ - &DSA_TAG_DRIVER_NAME(__ops) \ -}; \ -module_dsa_tag_drivers(dsa_tag_driver_array) #endif - diff --git a/include/net/dsa_stubs.h b/include/net/dsa_stubs.h new file mode 100644 index 000000000000..6f384897f287 --- /dev/null +++ b/include/net/dsa_stubs.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * include/net/dsa_stubs.h - Stubs for the Distributed Switch Architecture framework + */ + +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/net_tstamp.h> +#include <net/dsa.h> + +#if IS_ENABLED(CONFIG_NET_DSA) + +extern const struct dsa_stubs *dsa_stubs; + +struct dsa_stubs { + int (*conduit_hwtstamp_validate)(struct net_device *dev, + const struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); +}; + +static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev, + const struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) +{ + if (!netdev_uses_dsa(dev)) + return 0; + + /* rtnl_lock() is a sufficient guarantee, because as long as + * netdev_uses_dsa() returns true, the dsa_core module is still + * registered, and so, dsa_unregister_stubs() couldn't have run. + * For netdev_uses_dsa() to start returning false, it would imply that + * dsa_conduit_teardown() has executed, which requires rtnl_lock(). 
+ */ + ASSERT_RTNL(); + + return dsa_stubs->conduit_hwtstamp_validate(dev, config, extack); +} + +#else + +static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev, + const struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) +{ + return 0; +} + +#endif diff --git a/include/net/dscp.h b/include/net/dscp.h new file mode 100644 index 000000000000..ba40540868c9 --- /dev/null +++ b/include/net/dscp.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> */ + +#ifndef __DSCP_H__ +#define __DSCP_H__ + +/* + * DSCP Pools and Codepoint Space Division: + * + * The Differentiated Services (Diffserv) architecture defines a method for + * classifying and managing network traffic using the DS field in IPv4 and IPv6 + * packet headers. This field can carry one of 64 distinct DSCP (Differentiated + * Services Code Point) values, which are divided into three pools based on + * their Least Significant Bits (LSB) patterns and intended usage. Each pool has + * a specific registration procedure for assigning DSCP values: + * + * Pool 1 (Standards Action Pool): + * - Codepoint Space: xxxxx0 + * This pool includes DSCP values ending in '0' (binary), allocated via + * Standards Action. It is intended for globally recognized traffic classes, + * ensuring interoperability across the internet. This pool encompasses + * well-known DSCP values such as CS0-CS7, AFxx, EF, and VOICE-ADMIT. + * + * Pool 2 (Experimental/Local Use Pool): + * - Codepoint Space: xxxx11 + * Reserved for DSCP values ending in '11' (binary), this pool is designated + * for Experimental or Local Use. It allows for private or temporary traffic + * marking schemes not intended for standardized global use, facilitating + * testing and network-specific configurations without impacting + * interoperability. + * + * Pool 3 (Preferential Standardization Pool): + * - Codepoint Space: xxxx01 + * Initially reserved for experimental or local use, this pool now serves as + * a secondary standardization resource should Pool 1 become exhausted. DSCP + * values ending in '01' (binary) are assigned via Standards Action, with a + * focus on adopting new, standardized traffic classes as the need arises. 
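A minimal sketch that classifies a codepoint into the three pools by its low-order bits, following the division above; dscp_pool() is illustrative and not part of the header:

static inline int dscp_pool(u8 dscp)
{
	if (!(dscp & 0x1))
		return 1;	/* xxxxx0: Standards Action */
	if ((dscp & 0x3) == 0x3)
		return 2;	/* xxxx11: Experimental/Local Use */
	return 3;		/* xxxx01: Preferential Standardization */
}

For instance, DSCP_EF (46, 101110) falls in Pool 1, while DSCP_LE (1, 000001) comes from Pool 3.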
+ * + * For pool updates see: + * https://www.iana.org/assignments/dscp-registry/dscp-registry.xhtml + */ + +/* Pool 1: Standardized DSCP values as per [RFC8126] */ +#define DSCP_CS0 0 /* 000000, [RFC2474] */ +/* CS0 is some times called default (DF) */ +#define DSCP_DF 0 /* 000000, [RFC2474] */ +#define DSCP_CS1 8 /* 001000, [RFC2474] */ +#define DSCP_CS2 16 /* 010000, [RFC2474] */ +#define DSCP_CS3 24 /* 011000, [RFC2474] */ +#define DSCP_CS4 32 /* 100000, [RFC2474] */ +#define DSCP_CS5 40 /* 101000, [RFC2474] */ +#define DSCP_CS6 48 /* 110000, [RFC2474] */ +#define DSCP_CS7 56 /* 111000, [RFC2474] */ +#define DSCP_AF11 10 /* 001010, [RFC2597] */ +#define DSCP_AF12 12 /* 001100, [RFC2597] */ +#define DSCP_AF13 14 /* 001110, [RFC2597] */ +#define DSCP_AF21 18 /* 010010, [RFC2597] */ +#define DSCP_AF22 20 /* 010100, [RFC2597] */ +#define DSCP_AF23 22 /* 010110, [RFC2597] */ +#define DSCP_AF31 26 /* 011010, [RFC2597] */ +#define DSCP_AF32 28 /* 011100, [RFC2597] */ +#define DSCP_AF33 30 /* 011110, [RFC2597] */ +#define DSCP_AF41 34 /* 100010, [RFC2597] */ +#define DSCP_AF42 36 /* 100100, [RFC2597] */ +#define DSCP_AF43 38 /* 100110, [RFC2597] */ +#define DSCP_EF 46 /* 101110, [RFC3246] */ +#define DSCP_VOICE_ADMIT 44 /* 101100, [RFC5865] */ + +/* Pool 3: Standardized assignments, previously available for experimental/local + * use + */ +#define DSCP_LE 1 /* 000001, [RFC8622] */ + +#define DSCP_MAX 64 + +#endif /* __DSCP_H__ */ diff --git a/include/net/dst.h b/include/net/dst.h index 8ea8812b0b41..78c78cdce0e9 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -16,8 +16,10 @@ #include <linux/bug.h> #include <linux/jiffies.h> #include <linux/refcount.h> +#include <linux/rcuref.h> #include <net/neighbour.h> #include <asm/processor.h> +#include <linux/indirect_call_wrapper.h> struct sk_buff; @@ -60,21 +62,35 @@ struct dst_entry { unsigned short trailer_len; /* space to reserve at tail */ /* - * __refcnt wants to be on a different cache line from + * __rcuref wants to be on a different cache line from * input/output/ops or performance tanks badly */ #ifdef CONFIG_64BIT - atomic_t __refcnt; /* 64-bit offset 64 */ + rcuref_t __rcuref; /* 64-bit offset 64 */ #endif int __use; unsigned long lastuse; - struct lwtunnel_state *lwtstate; struct rcu_head rcu_head; short error; short __pad; __u32 tclassid; #ifndef CONFIG_64BIT - atomic_t __refcnt; /* 32-bit offset 64 */ + struct lwtunnel_state *lwtstate; + rcuref_t __rcuref; /* 32-bit offset 64 */ +#endif + netdevice_tracker dev_tracker; + + /* + * Used by rtable and rt6_info. Moves lwtstate into the next cache + * line on 64bit so that lwtstate does not cause false sharing with + * __rcuref under contention of __rcuref. This also puts the + * frequently accessed members of rtable and rt6_info out of the + * __rcuref cache line. 
+ */ + struct list_head rt_uncached; + struct uncached_list *rt_uncached_list; +#ifdef CONFIG_64BIT + struct lwtunnel_state *lwtstate; #endif }; @@ -193,9 +209,11 @@ dst_feature(const struct dst_entry *dst, u32 feature) return dst_metric(dst, RTAX_FEATURES) & feature; } +INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *)); +INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *)); static inline u32 dst_mtu(const struct dst_entry *dst) { - return dst->ops->mtu(dst); + return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst); } /* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */ @@ -204,13 +222,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr return msecs_to_jiffies(dst_metric(dst, metric)); } -static inline u32 -dst_allfrag(const struct dst_entry *dst) -{ - int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG); - return ret; -} - static inline int dst_metric_locked(const struct dst_entry *dst, int metric) { @@ -221,10 +232,10 @@ static inline void dst_hold(struct dst_entry *dst) { /* * If your kernel compilation stops here, please check - * the placement of __refcnt in struct dst_entry + * the placement of __rcuref in struct dst_entry */ - BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); - WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); + BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63); + WARN_ON(!rcuref_get(&dst->__rcuref)); } static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) @@ -235,12 +246,6 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) } } -static inline void dst_hold_and_use(struct dst_entry *dst, unsigned long time) -{ - dst_hold(dst); - dst_use_noref(dst, time); -} - static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) @@ -274,6 +279,7 @@ static inline void skb_dst_drop(struct sk_buff *skb) static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst) { + nskb->slow_gro |= !!refdst; nskb->_skb_refdst = refdst; if (!(nskb->_skb_refdst & SKB_DST_NOREF)) dst_clone(skb_dst(nskb)); @@ -293,7 +299,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb */ static inline bool dst_hold_safe(struct dst_entry *dst) { - return atomic_inc_not_zero(&dst->__refcnt); + return rcuref_get(&dst->__rcuref); } /** @@ -301,7 +307,7 @@ static inline bool dst_hold_safe(struct dst_entry *dst) * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. - * Returns true if dst is refcounted. + * Returns: true if dst is refcounted. */ static inline bool skb_dst_force(struct sk_buff *skb) { @@ -313,6 +319,7 @@ static inline bool skb_dst_force(struct sk_buff *skb) dst = NULL; skb->_skb_refdst = (unsigned long)dst; + skb->slow_gro |= !!dst; } return skb->_skb_refdst != 0UL; @@ -334,7 +341,7 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; /* - * Clear hash so that we can recalulate the hash for the + * Clear hash so that we can recalculate the hash for the * encapsulated packet, unless we have already determine the hash * over the L4 4-tuple. 
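A sketch of the lockless lookup pattern the rcuref conversion serves: take a reference on a dst found under RCU and treat an exhausted refcount as a cache miss. The foo_cache structure is hypothetical:

struct foo_cache {
	struct dst_entry __rcu *dst;
};

static struct dst_entry *foo_cache_get_dst(struct foo_cache *c)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(c->dst);
	if (dst && !dst_hold_safe(dst))
		dst = NULL;	/* being released concurrently */
	rcu_read_unlock();

	return dst;
}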
*/ @@ -356,9 +363,8 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { - /* TODO : stats should be SMP safe */ - dev->stats.rx_packets++; - dev->stats.rx_bytes += skb->len; + DEV_STATS_INC(dev, rx_packets); + DEV_STATS_ADD(dev, rx_bytes, skb->len); __skb_tunnel_rx(skb, dev, net); } @@ -379,12 +385,11 @@ static inline int dst_discard(struct sk_buff *skb) { return dst_discard_out(&init_net, skb->sk, skb); } -void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); void dst_init(struct dst_entry *dst, struct dst_ops *ops, - struct net_device *dev, int initial_ref, int initial_obsolete, + struct net_device *dev, int initial_obsolete, unsigned short flags); -struct dst_entry *dst_destroy(struct dst_entry *dst); void dst_dev_put(struct dst_entry *dst); static inline void dst_confirm(struct dst_entry *dst) @@ -400,14 +405,12 @@ static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, co static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst, struct sk_buff *skb) { - struct neighbour *n = NULL; + struct neighbour *n; - /* The packets from tunnel devices (eg bareudp) may have only - * metadata in the dst pointer of skb. Hence a pointer check of - * neigh_lookup is needed. - */ - if (dst->ops->neigh_lookup) - n = dst->ops->neigh_lookup(dst, skb, NULL); + if (WARN_ON_ONCE(!dst->ops->neigh_lookup)) + return NULL; + + n = dst->ops->neigh_lookup(dst, skb, NULL); return IS_ERR(n) ? NULL : n; } @@ -437,22 +440,45 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout) dst->expires = expires; } +static inline unsigned int dst_dev_overhead(struct dst_entry *dst, + struct sk_buff *skb) +{ + if (likely(dst)) + return LL_RESERVED_SPACE(dst->dev); + + return skb->mac_len; +} + +INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, + struct sk_buff *)); /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return skb_dst(skb)->output(net, sk, skb); + return INDIRECT_CALL_INET(skb_dst(skb)->output, + ip6_output, ip_output, + net, sk, skb); } +INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. 
*/ static inline int dst_input(struct sk_buff *skb) { - return skb_dst(skb)->input(skb); + return INDIRECT_CALL_INET(skb_dst(skb)->input, + ip6_input, ip_local_deliver, skb); } +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, + u32)); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { if (dst->obsolete) - dst = dst->ops->check(dst, cookie); + dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, + ipv4_dst_check, dst, cookie); return dst; } @@ -535,4 +561,15 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } +struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); +void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu, bool confirm_neigh); +void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); +u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old); +struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr); +unsigned int dst_blackhole_mtu(const struct dst_entry *dst); + #endif /* _NET_DST_H */ diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h index 67634675e919..1961699598e2 100644 --- a/include/net/dst_cache.h +++ b/include/net/dst_cache.h @@ -76,10 +76,21 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, */ static inline void dst_cache_reset(struct dst_cache *dst_cache) { - dst_cache->reset_ts = jiffies; + WRITE_ONCE(dst_cache->reset_ts, jiffies); } /** + * dst_cache_reset_now - invalidate the cache contents immediately + * @dst_cache: the cache + * + * The caller must be sure there are no concurrent users, as this frees + * all dst_cache users immediately, rather than waiting for the next + * per-cpu usage like dst_cache_reset does. Most callers should use the + * higher speed lazily-freed dst_cache_reset function instead. + */ +void dst_cache_reset_now(struct dst_cache *dst_cache); + +/** * dst_cache_init - initialize the cache, allocating the required storage * @dst_cache: the cache * @gfp: allocation flags @@ -91,7 +102,7 @@ int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp); * @dst_cache: the cache * * No synchronization is enforced: it must be called only when the cache - * is unsed. + * is unused. 
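A sketch of typical transmit-side dst_cache usage for an IPv4 tunnel, combining the per-cpu cache with a routing lookup on a miss; only the foo_* wrapper is invented:

static struct rtable *foo_tunnel_get_rt(struct net *net, struct flowi4 *fl4,
					struct dst_cache *cache)
{
	struct rtable *rt;
	__be32 saddr;

	rt = dst_cache_get_ip4(cache, &saddr);
	if (rt) {
		fl4->saddr = saddr;
		return rt;
	}

	rt = ip_route_output_key(net, fl4);
	if (IS_ERR(rt))
		return rt;

	dst_cache_set_ip4(cache, &rt->dst, fl4->saddr);

	return rt;
}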
*/ void dst_cache_destroy(struct dst_cache *dst_cache); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 56cb3c38569a..4160731dcb6e 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -4,11 +4,14 @@ #include <linux/skbuff.h> #include <net/ip_tunnels.h> +#include <net/macsec.h> #include <net/dst.h> enum metadata_type { METADATA_IP_TUNNEL, METADATA_HW_PORT_MUX, + METADATA_MACSEC, + METADATA_XFRM, }; struct hw_port_info { @@ -16,12 +19,24 @@ struct hw_port_info { u32 port_id; }; +struct macsec_info { + sci_t sci; +}; + +struct xfrm_md_info { + u32 if_id; + int link; + struct dst_entry *dst_orig; +}; + struct metadata_dst { struct dst_entry dst; enum metadata_type type; union { struct ip_tunnel_info tun_info; struct hw_port_info port_info; + struct macsec_info macsec_info; + struct xfrm_md_info xfrm_info; } u; }; @@ -45,12 +60,35 @@ skb_tunnel_info(const struct sk_buff *skb) return &md_dst->u.tun_info; dst = skb_dst(skb); - if (dst && dst->lwtstate) + if (dst && dst->lwtstate && + (dst->lwtstate->type == LWTUNNEL_ENCAP_IP || + dst->lwtstate->type == LWTUNNEL_ENCAP_IP6)) return lwt_tun_info(dst->lwtstate); return NULL; } +static inline struct xfrm_md_info *lwt_xfrm_info(struct lwtunnel_state *lwt) +{ + return (struct xfrm_md_info *)lwt->data; +} + +static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + struct dst_entry *dst; + + if (md_dst && md_dst->type == METADATA_XFRM) + return &md_dst->u.xfrm_info; + + dst = skb_dst(skb); + if (dst && dst->lwtstate && + dst->lwtstate->type == LWTUNNEL_ENCAP_XFRM) + return lwt_xfrm_info(dst->lwtstate); + + return NULL; +} + static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -80,6 +118,12 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, return memcmp(&a->u.tun_info, &b->u.tun_info, sizeof(a->u.tun_info) + a->u.tun_info.options_len); + case METADATA_MACSEC: + return memcmp(&a->u.macsec_info, &b->u.macsec_info, + sizeof(a->u.macsec_info)); + case METADATA_XFRM: + return memcmp(&a->u.xfrm_info, &b->u.xfrm_info, + sizeof(a->u.xfrm_info)); default: return 1; } @@ -121,8 +165,20 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size); +#ifdef CONFIG_DST_CACHE + /* Unclone the dst cache if there is one */ + if (new_md->u.tun_info.dst_cache.cache) { + int ret; + + ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); + if (ret) { + metadata_dst_free(new_md); + return ERR_PTR(ret); + } + } +#endif + skb_dst_drop(skb); - dst_hold(&new_md->dst); skb_dst_set(skb, &new_md->dst); return new_md; } @@ -142,7 +198,7 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, __be32 daddr, __u8 tos, __u8 ttl, __be16 tp_dst, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { @@ -159,7 +215,7 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, } static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { @@ -174,7 +230,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad __u8 tos, __u8 ttl, __be16 tp_dst, __be32 label, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { @@ -187,7 +243,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct 
in6_addr *sad info = &tun_dst->u.tun_info; info->mode = IP_TUNNEL_INFO_IPV6; - info->key.tun_flags = flags; + ip_tunnel_flags_copy(info->key.tun_flags, flags); info->key.tun_id = tunnel_id; info->key.tp_src = 0; info->key.tp_dst = tp_dst; @@ -203,7 +259,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad } static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 88ff7bb2bb9b..3a9001a042a5 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -16,15 +16,15 @@ struct dst_ops { unsigned short family; unsigned int gc_thresh; - int (*gc)(struct dst_ops *ops); + void (*gc)(struct dst_ops *ops); struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); unsigned int (*default_advmss)(const struct dst_entry *); unsigned int (*mtu)(const struct dst_entry *); u32 * (*cow_metrics)(struct dst_entry *, unsigned long); void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, - struct net_device *dev, int how); - struct dst_entry * (*negative_advice)(struct dst_entry *); + struct net_device *dev); + void (*negative_advice)(struct sock *sk, struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, diff --git a/include/net/eee.h b/include/net/eee.h new file mode 100644 index 000000000000..cfab1b8bc46a --- /dev/null +++ b/include/net/eee.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _EEE_H +#define _EEE_H + +#include <linux/types.h> + +struct eee_config { + u32 tx_lpi_timer; + bool tx_lpi_enabled; + bool eee_enabled; +}; + +static inline bool eeecfg_mac_can_tx_lpi(const struct eee_config *eeecfg) +{ + /* eee_enabled is the master on/off */ + return eeecfg->eee_enabled && eeecfg->tx_lpi_enabled; +} + +static inline void eeecfg_to_eee(struct ethtool_keee *eee, + const struct eee_config *eeecfg) +{ + eee->tx_lpi_timer = eeecfg->tx_lpi_timer; + eee->tx_lpi_enabled = eeecfg->tx_lpi_enabled; + eee->eee_enabled = eeecfg->eee_enabled; +} + +static inline void eee_to_eeecfg(struct eee_config *eeecfg, + const struct ethtool_keee *eee) +{ + eeecfg->tx_lpi_timer = eee->tx_lpi_timer; + eeecfg->tx_lpi_enabled = eee->tx_lpi_enabled; + eeecfg->eee_enabled = eee->eee_enabled; +} + +#endif diff --git a/include/net/erspan.h b/include/net/erspan.h index 0d9e86bd9893..c6209e7b6c96 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -58,6 +58,9 @@ * GRE proto ERSPAN type I/II = 0x88BE, type III = 0x22EB */ +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/skbuff.h> #include <uapi/linux/erspan.h> #define ERSPAN_VERSION 0x1 /* ERSPAN type II */ @@ -86,7 +89,7 @@ enum erspan_encap_type { ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ ERSPAN_ENCAP_8021Q = 0x2, /* originally 802.1Q encapsulated */ - ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag perserved in frame */ + ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag preserved in frame */ }; #define ERSPAN_V1_MDSIZE 4 @@ -189,7 +192,7 @@ static inline void erspan_build_header(struct sk_buff *skb, enc_type = ERSPAN_ENCAP_NOVLAN; /* If mirrored packet has vlan tag, extract tci and - * perserve vlan header in the mirrored frame. + * preserve vlan header in the mirrored frame. 
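A sketch of how a MAC driver might consume the saved EEE configuration on link-up; foo_priv (holding a struct eee_config) and the foo_*_tx_lpi() helpers are hypothetical:

static void foo_mac_link_up(struct foo_priv *priv)
{
	/* LPI is only armed when both the EEE master switch and the
	 * TX LPI sub-switch are enabled.
	 */
	if (eeecfg_mac_can_tx_lpi(&priv->eee_cfg))
		foo_enable_tx_lpi(priv, priv->eee_cfg.tx_lpi_timer);
	else
		foo_disable_tx_lpi(priv);
}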
*/ if (eth->h_proto == htons(ETH_P_8021Q)) { qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); diff --git a/include/net/esp.h b/include/net/esp.h index 9c5637d41d95..322950727dd0 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -5,6 +5,7 @@ #include <linux/skbuff.h> struct ip_esp_hdr; +struct xfrm_state; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) { diff --git a/include/net/espintcp.h b/include/net/espintcp.h index 0335bbd76552..c70efd704b6d 100644 --- a/include/net/espintcp.h +++ b/include/net/espintcp.h @@ -32,7 +32,7 @@ struct espintcp_ctx { static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* RCU is only needed for diag */ return (__force void *)icsk->icsk_ulp_data; diff --git a/include/net/ethoc.h b/include/net/ethoc.h index 78519ed42ab4..73810f3ca492 100644 --- a/include/net/ethoc.h +++ b/include/net/ethoc.h @@ -10,6 +10,9 @@ #ifndef LINUX_NET_ETHOC_H #define LINUX_NET_ETHOC_H 1 +#include <linux/if.h> +#include <linux/types.h> + struct ethoc_platform_data { u8 hwaddr[IFHWADDRLEN]; s8 phy_id; diff --git a/include/net/failover.h b/include/net/failover.h index bb15438f39c7..f2b42b4b9cd6 100644 --- a/include/net/failover.h +++ b/include/net/failover.h @@ -25,6 +25,7 @@ struct failover_ops { struct failover { struct list_head list; struct net_device __rcu *failover_dev; + netdevice_tracker dev_tracker; struct failover_ops __rcu *ops; }; diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 6d59221ff05a..48aad6128fea 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -28,7 +28,7 @@ enum fib_event_type { struct fib_notifier_ops { int family; struct list_head list; - unsigned int (*fib_seq_read)(struct net *net); + unsigned int (*fib_seq_read)(const struct net *net); int (*fib_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); struct module *owner; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 4b10676c69d1..6e68e359ad18 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -43,6 +43,10 @@ struct fib_rule { struct fib_kuid_range uid_range; struct fib_rule_port_range sport_range; struct fib_rule_port_range dport_range; + u16 sport_mask; + u16 dport_mask; + u8 iif_is_l3_master; + u8 oif_is_l3_master; struct rcu_head rcu; }; @@ -69,7 +73,7 @@ struct fib_rules_ops { int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); - bool (*suppress)(struct fib_rule *, + bool (*suppress)(struct fib_rule *, int, struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); @@ -91,7 +95,6 @@ struct fib_rules_ops { void (*flush_cache)(struct fib_rules_ops *ops); int nlgroup; - const struct nla_policy *policy; struct list_head rules_list; struct module *owner; struct net *fro_net; @@ -103,26 +106,6 @@ struct fib_rule_notifier_info { struct fib_rule *rule; }; -#define FRA_GENERIC_POLICY \ - [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, \ - [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \ - [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \ - [FRA_PRIORITY] = { .type = NLA_U32 }, \ - [FRA_FWMARK] = { .type = NLA_U32 }, \ - [FRA_TUN_ID] = { .type = NLA_U64 }, \ - [FRA_FWMASK] = { .type = NLA_U32 }, \ - [FRA_TABLE] = { .type = NLA_U32 }, \ - [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ - [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ - 
[FRA_GOTO] = { .type = NLA_U32 }, \ - [FRA_L3MDEV] = { .type = NLA_U8 }, \ - [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, \ - [FRA_PROTOCOL] = { .type = NLA_U8 }, \ - [FRA_IP_PROTO] = { .type = NLA_U8 }, \ - [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, \ - [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } - - static inline void fib_rule_get(struct fib_rule *rule) { refcount_inc(&rule->refcnt); @@ -167,6 +150,17 @@ static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, ntohs(port) <= a->end; } +static inline bool fib_rule_port_match(const struct fib_rule_port_range *range, + u16 port_mask, __be16 port) +{ + if ((range->start ^ ntohs(port)) & port_mask) + return false; + if (!port_mask && fib_rule_port_range_set(range) && + !fib_rule_port_inrange(range, port)) + return false; + return true; +} + static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) { return a->start != 0 && a->end != 0 && a->end < 0xffff && @@ -180,6 +174,12 @@ static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, a->end == b->end; } +static inline bool +fib_rule_port_is_range(const struct fib_rule_port_range *range) +{ + return range->start != range->end; +} + static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) { return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto || @@ -193,17 +193,16 @@ void fib_rules_unregister(struct fib_rules_ops *); int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); -int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, - u32 flags); +int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); -unsigned int fib_rules_seq_read(struct net *net, int family); +unsigned int fib_rules_seq_read(const struct net *net, int family); -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack); +int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held); +int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held); INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); @@ -218,7 +217,9 @@ INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg)); #endif diff --git a/include/net/firewire.h b/include/net/firewire.h index 299e5df38552..8fbff8d77865 100644 --- a/include/net/firewire.h +++ b/include/net/firewire.h @@ -2,6 +2,8 @@ #ifndef _NET_FIREWIRE_H #define _NET_FIREWIRE_H +#include <linux/types.h> + /* Pseudo L2 address */ #define FWNET_ALEN 16 union fwnet_hwaddr { @@ -11,8 +13,7 @@ union fwnet_hwaddr { __be64 uniq_id; /* EUI-64 */ u8 max_rec; /* max packet size */ u8 sspd; /* max speed */ - __be16 fifo_hi; /* hi 16bits of FIFO addr */ - __be32 fifo_lo; /* lo 32bits of FIFO addr */ + u8 fifo[6]; /* FIFO addr */ } __packed uc; }; diff --git 
a/include/net/flow.h b/include/net/flow.h index b2531df3f65f..a1839c278d87 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -8,12 +8,13 @@ #ifndef _NET_FLOW_H #define _NET_FLOW_H -#include <linux/socket.h> #include <linux/in6.h> #include <linux/atomic.h> -#include <net/flow_dissector.h> +#include <linux/container_of.h> #include <linux/uidgid.h> +struct flow_keys; + /* * ifindex generation is per-net namespace, and loopback is * always the 1st device in ns (see net_dev_init), thus any @@ -29,6 +30,7 @@ struct flowi_tunnel { struct flowi_common { int flowic_oif; int flowic_iif; + int flowic_l3mdev; __u32 flowic_mark; __u8 flowic_tos; __u8 flowic_scope; @@ -36,11 +38,12 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 -#define FLOWI_FLAG_SKIP_NH_OIF 0x04 +#define FLOWI_FLAG_L3MDEV_OIF 0x04 +#define FLOWI_FLAG_ANY_SPORT 0x08 __u32 flowic_secid; kuid_t flowic_uid; - struct flowi_tunnel flowic_tun_key; __u32 flowic_multipath_hash; + struct flowi_tunnel flowic_tun_key; }; union flowi_uli { @@ -54,12 +57,6 @@ union flowi_uli { __u8 code; } icmpt; - struct { - __le16 dport; - __le16 sport; - } dnports; - - __be32 spi; __be32 gre_key; struct { @@ -71,6 +68,7 @@ struct flowi4 { struct flowi_common __fl_common; #define flowi4_oif __fl_common.flowic_oif #define flowi4_iif __fl_common.flowic_iif +#define flowi4_l3mdev __fl_common.flowic_l3mdev #define flowi4_mark __fl_common.flowic_mark #define flowi4_tos __fl_common.flowic_tos #define flowi4_scope __fl_common.flowic_scope @@ -90,7 +88,6 @@ struct flowi4 { #define fl4_dport uli.ports.dport #define fl4_icmp_type uli.icmpt.type #define fl4_icmp_code uli.icmpt.code -#define fl4_ipsec_spi uli.spi #define fl4_mh_type uli.mht.type #define fl4_gre_key uli.gre_key } __attribute__((__aligned__(BITS_PER_LONG/8))); @@ -104,6 +101,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; + fl4->flowi4_l3mdev = 0; fl4->flowi4_mark = mark; fl4->flowi4_tos = tos; fl4->flowi4_scope = scope; @@ -120,11 +118,10 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, } /* Reset some input parameters after previous lookup */ -static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __u8 tos, +static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __be32 daddr, __be32 saddr) { fl4->flowi4_oif = oif; - fl4->flowi4_tos = tos; fl4->daddr = daddr; fl4->saddr = saddr; } @@ -134,6 +131,7 @@ struct flowi6 { struct flowi_common __fl_common; #define flowi6_oif __fl_common.flowic_oif #define flowi6_iif __fl_common.flowic_iif +#define flowi6_l3mdev __fl_common.flowic_l3mdev #define flowi6_mark __fl_common.flowic_mark #define flowi6_scope __fl_common.flowic_scope #define flowi6_proto __fl_common.flowic_proto @@ -150,36 +148,20 @@ struct flowi6 { #define fl6_dport uli.ports.dport #define fl6_icmp_type uli.icmpt.type #define fl6_icmp_code uli.icmpt.code -#define fl6_ipsec_spi uli.spi #define fl6_mh_type uli.mht.type #define fl6_gre_key uli.gre_key __u32 mp_hash; } __attribute__((__aligned__(BITS_PER_LONG/8))); -struct flowidn { - struct flowi_common __fl_common; -#define flowidn_oif __fl_common.flowic_oif -#define flowidn_iif __fl_common.flowic_iif -#define flowidn_mark __fl_common.flowic_mark -#define flowidn_scope __fl_common.flowic_scope -#define flowidn_proto __fl_common.flowic_proto -#define flowidn_flags __fl_common.flowic_flags - __le16 daddr; - __le16 saddr; - union flowi_uli uli; -#define 
fld_sport uli.ports.sport -#define fld_dport uli.ports.dport -} __attribute__((__aligned__(BITS_PER_LONG/8))); - struct flowi { union { struct flowi_common __fl_common; struct flowi4 ip4; struct flowi6 ip6; - struct flowidn dn; } u; #define flowi_oif u.__fl_common.flowic_oif #define flowi_iif u.__fl_common.flowic_iif +#define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark #define flowi_tos u.__fl_common.flowic_tos #define flowi_scope u.__fl_common.flowic_scope @@ -195,14 +177,19 @@ static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) return container_of(fl4, struct flowi, u.ip4); } +static inline struct flowi_common *flowi4_to_flowi_common(struct flowi4 *fl4) +{ + return &(fl4->__fl_common); +} + static inline struct flowi *flowi6_to_flowi(struct flowi6 *fl6) { return container_of(fl6, struct flowi, u.ip6); } -static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn) +static inline struct flowi_common *flowi6_to_flowi_common(struct flowi6 *fl6) { - return container_of(fldn, struct flowi, u.dn); + return &(fl6->__fl_common); } __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index cc10b10dc3a1..ced79dc8e856 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -7,6 +7,7 @@ #include <linux/siphash.h> #include <linux/string.h> #include <uapi/linux/if_ether.h> +#include <uapi/linux/pkt_cls.h> struct bpf_prog; struct net; @@ -14,7 +15,10 @@ struct sk_buff; /** * struct flow_dissector_key_control: - * @thoff: Transport header offset + * @thoff: Transport header offset + * @addr_type: Type of key. One of FLOW_DISSECTOR_KEY_* + * @flags: Key flags. + * Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAG|ENCAPSULATION|F_*) */ struct flow_dissector_key_control { u16 thoff; @@ -22,9 +26,20 @@ struct flow_dissector_key_control { u32 flags; }; -#define FLOW_DIS_IS_FRAGMENT BIT(0) -#define FLOW_DIS_FIRST_FRAG BIT(1) -#define FLOW_DIS_ENCAPSULATION BIT(2) +/* The control flags are kept in sync with TCA_FLOWER_KEY_FLAGS_*, as those + * flags are exposed to userspace in some error paths, ie. unsupported flags. + */ +enum flow_dissector_ctrl_flags { + FLOW_DIS_IS_FRAGMENT = TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, + FLOW_DIS_FIRST_FRAG = TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, + FLOW_DIS_F_TUNNEL_CSUM = TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM, + FLOW_DIS_F_TUNNEL_DONT_FRAGMENT = TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT, + FLOW_DIS_F_TUNNEL_OAM = TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM, + FLOW_DIS_F_TUNNEL_CRIT_OPT = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT, + + /* These flags are internal to the kernel */ + FLOW_DIS_ENCAPSULATION = (TCA_FLOWER_KEY_FLAGS_MAX << 1), +}; enum flow_dissect_ret { FLOW_DISSECT_RET_OUT_GOOD, @@ -36,8 +51,9 @@ enum flow_dissect_ret { /** * struct flow_dissector_key_basic: - * @n_proto: Network header protocol (eg. IPv4/IPv6) + * @n_proto: Network header protocol (eg. IPv4/IPv6) * @ip_proto: Transport header protocol (eg. 
TCP/UDP) + * @padding: Unused */ struct flow_dissector_key_basic { __be16 n_proto; @@ -59,6 +75,8 @@ struct flow_dissector_key_vlan { __be16 vlan_tci; }; __be16 vlan_tpid; + __be16 vlan_eth_type; + u16 padding; }; struct flow_dissector_mpls_lse { @@ -92,7 +110,7 @@ struct flow_dissector_key_enc_opts { * here but seems difficult to #include */ u8 len; - __be16 dst_opt_type; + u32 dst_opt_type; }; struct flow_dissector_key_keyid { @@ -133,6 +151,7 @@ struct flow_dissector_key_tipc { * struct flow_dissector_key_addrs: * @v4addrs: IPv4 addresses * @v6addrs: IPv6 addresses + * @tipckey: TIPC key */ struct flow_dissector_key_addrs { union { @@ -143,14 +162,12 @@ struct flow_dissector_key_addrs { }; /** - * flow_dissector_key_arp: - * @ports: Operation, source and target addresses for an ARP header - * for Ethernet hardware addresses and IPv4 protocol addresses - * sip: Sender IP address - * tip: Target IP address - * op: Operation - * sha: Sender hardware address - * tpa: Target hardware address + * struct flow_dissector_key_arp: + * @sip: Sender IP address + * @tip: Target IP address + * @op: Operation + * @sha: Sender hardware address + * @tha: Target hardware address */ struct flow_dissector_key_arp { __u32 sip; @@ -161,10 +178,10 @@ struct flow_dissector_key_arp { }; /** - * flow_dissector_key_tp_ports: - * @ports: port numbers of Transport header - * src: source port number - * dst: destination port number + * struct flow_dissector_key_ports: + * @ports: port numbers of Transport header + * @src: source port number + * @dst: destination port number */ struct flow_dissector_key_ports { union { @@ -177,10 +194,26 @@ struct flow_dissector_key_ports { }; /** - * flow_dissector_key_icmp: - * type: ICMP type - * code: ICMP code - * id: session identifier + * struct flow_dissector_key_ports_range + * @tp: port number from packet + * @tp_min: min port number in range + * @tp_max: max port number in range + */ +struct flow_dissector_key_ports_range { + union { + struct flow_dissector_key_ports tp; + struct { + struct flow_dissector_key_ports tp_min; + struct flow_dissector_key_ports tp_max; + }; + }; +}; + +/** + * struct flow_dissector_key_icmp: + * @type: ICMP type + * @code: ICMP code + * @id: Session identifier */ struct flow_dissector_key_icmp { struct { @@ -223,10 +256,12 @@ struct flow_dissector_key_ip { * struct flow_dissector_key_meta: * @ingress_ifindex: ingress ifindex * @ingress_iftype: ingress interface type + * @l2_miss: packet did not match an L2 entry during forwarding */ struct flow_dissector_key_meta { int ingress_ifindex; u16 ingress_iftype; + u8 l2_miss; }; /** @@ -251,6 +286,62 @@ struct flow_dissector_key_hash { u32 hash; }; +/** + * struct flow_dissector_key_num_of_vlans: + * @num_of_vlans: num_of_vlans value + */ +struct flow_dissector_key_num_of_vlans { + u8 num_of_vlans; +}; + +/** + * struct flow_dissector_key_pppoe: + * @session_id: pppoe session id + * @ppp_proto: ppp protocol + * @type: pppoe eth type + */ +struct flow_dissector_key_pppoe { + __be16 session_id; + __be16 ppp_proto; + __be16 type; +}; + +/** + * struct flow_dissector_key_l2tpv3: + * @session_id: identifier for a l2tp session + */ +struct flow_dissector_key_l2tpv3 { + __be32 session_id; +}; + +/** + * struct flow_dissector_key_ipsec: + * @spi: identifier for a ipsec connection + */ +struct flow_dissector_key_ipsec { + __be32 spi; +}; + +/** + * struct flow_dissector_key_cfm + * @mdl_ver: maintenance domain level (mdl) and cfm protocol version + * @opcode: code specifying a type of cfm protocol packet + * + 
* See 802.1ag, ITU-T G.8013/Y.1731 + * 1 2 + * |7 6 5 4 3 2 1 0|7 6 5 4 3 2 1 0| + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | mdl | version | opcode | + * +-----+---------+-+-+-+-+-+-+-+-+ + */ +struct flow_dissector_key_cfm { + u8 mdl_ver; + u8 opcode; +}; + +#define FLOW_DIS_CFM_MDL_MASK GENMASK(7, 5) +#define FLOW_DIS_CFM_MDL_MAX 7 + enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ @@ -280,6 +371,11 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */ FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */ FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */ + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */ + FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */ + FLOW_DISSECTOR_KEY_L2TPV3, /* struct flow_dissector_key_l2tpv3 */ + FLOW_DISSECTOR_KEY_CFM, /* struct flow_dissector_key_cfm */ + FLOW_DISSECTOR_KEY_IPSEC, /* struct flow_dissector_key_ipsec */ FLOW_DISSECTOR_KEY_MAX, }; @@ -287,6 +383,7 @@ enum flow_dissector_key_id { #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(1) #define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(2) +#define FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP BIT(3) struct flow_dissector_key { enum flow_dissector_key_id key_id; @@ -295,7 +392,8 @@ struct flow_dissector_key { }; struct flow_dissector { - unsigned int used_keys; /* each bit repesents presence of one key id */ + unsigned long long used_keys; + /* each bit represents presence of one key id */ unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; @@ -348,14 +446,16 @@ static inline bool flow_keys_have_l4(const struct flow_keys *keys) } u32 flow_hash_from_keys(struct flow_keys *keys); +u32 flow_hash_from_keys_seed(struct flow_keys *keys, + const siphash_key_t *keyval); void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, - void *data, int thoff, int hlen); + const void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { - return flow_dissector->used_keys & (1 << key_id); + return flow_dissector->used_keys & (1ULL << key_id); } static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, @@ -368,8 +468,8 @@ static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissec struct bpf_flow_dissector { struct bpf_flow_keys *flow_keys; const struct sk_buff *skb; - void *data; - void *data_end; + const void *data; + const void *data_end; }; static inline void diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 123b1e9ea304..596ab9791e4d 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -32,6 +32,10 @@ struct flow_match_vlan { struct flow_dissector_key_vlan *key, *mask; }; +struct flow_match_arp { + struct flow_dissector_key_arp *key, *mask; +}; + struct flow_match_ipv4_addrs { struct flow_dissector_key_ipv4_addrs *key, *mask; }; @@ -48,6 +52,10 @@ struct flow_match_ports { struct flow_dissector_key_ports *key, *mask; }; +struct flow_match_ports_range { + struct flow_dissector_key_ports_range *key, *mask; +}; + struct flow_match_icmp { struct flow_dissector_key_icmp *key, *mask; }; @@ -56,6 +64,10 @@ struct flow_match_tcp { struct flow_dissector_key_tcp *key, *mask; }; +struct flow_match_ipsec { + struct flow_dissector_key_ipsec *key, *mask; +}; + struct 
flow_match_mpls { struct flow_dissector_key_mpls *key, *mask; }; @@ -72,6 +84,14 @@ struct flow_match_ct { struct flow_dissector_key_ct *key, *mask; }; +struct flow_match_pppoe { + struct flow_dissector_key_pppoe *key, *mask; +}; + +struct flow_match_l2tpv3 { + struct flow_dissector_key_l2tpv3 *key, *mask; +}; + struct flow_rule; void flow_rule_match_meta(const struct flow_rule *rule, @@ -86,6 +106,8 @@ void flow_rule_match_vlan(const struct flow_rule *rule, struct flow_match_vlan *out); void flow_rule_match_cvlan(const struct flow_rule *rule, struct flow_match_vlan *out); +void flow_rule_match_arp(const struct flow_rule *rule, + struct flow_match_arp *out); void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out); void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, @@ -94,8 +116,12 @@ void flow_rule_match_ip(const struct flow_rule *rule, struct flow_match_ip *out); void flow_rule_match_ports(const struct flow_rule *rule, struct flow_match_ports *out); +void flow_rule_match_ports_range(const struct flow_rule *rule, + struct flow_match_ports_range *out); void flow_rule_match_tcp(const struct flow_rule *rule, struct flow_match_tcp *out); +void flow_rule_match_ipsec(const struct flow_rule *rule, + struct flow_match_ipsec *out); void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out); void flow_rule_match_mpls(const struct flow_rule *rule, @@ -116,6 +142,10 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule, struct flow_match_enc_opts *out); void flow_rule_match_ct(const struct flow_rule *rule, struct flow_match_ct *out); +void flow_rule_match_pppoe(const struct flow_rule *rule, + struct flow_match_pppoe *out); +void flow_rule_match_l2tpv3(const struct flow_rule *rule, + struct flow_match_l2tpv3 *out); enum flow_action_id { FLOW_ACTION_ACCEPT = 0, @@ -137,6 +167,7 @@ enum flow_action_id { FLOW_ACTION_MARK, FLOW_ACTION_PTYPE, FLOW_ACTION_PRIORITY, + FLOW_ACTION_RX_QUEUE_MAPPING, FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, FLOW_ACTION_SAMPLE, @@ -147,6 +178,12 @@ enum flow_action_id { FLOW_ACTION_MPLS_POP, FLOW_ACTION_MPLS_MANGLE, FLOW_ACTION_GATE, + FLOW_ACTION_PPPOE_PUSH, + FLOW_ACTION_JUMP, + FLOW_ACTION_PIPE, + FLOW_ACTION_VLAN_PUSH_ETH, + FLOW_ACTION_VLAN_POP_ETH, + FLOW_ACTION_CONTINUE, NUM_FLOW_ACTIONS, }; @@ -196,6 +233,9 @@ void flow_action_cookie_destroy(struct flow_action_cookie *cookie); struct flow_action_entry { enum flow_action_id id; + u32 hw_index; + unsigned long cookie; + u64 miss_cookie; enum flow_action_hw_stats hw_stats; action_destr destructor; void *destructor_priv; @@ -207,6 +247,10 @@ struct flow_action_entry { __be16 proto; u8 prio; } vlan; + struct { /* FLOW_ACTION_VLAN_PUSH_ETH */ + unsigned char dst[ETH_ALEN]; + unsigned char src[ETH_ALEN]; + } vlan_push_eth; struct { /* FLOW_ACTION_MANGLE */ /* FLOW_ACTION_ADD */ enum flow_action_mangle_base htype; @@ -218,6 +262,7 @@ struct flow_action_entry { u32 csum_flags; /* FLOW_ACTION_CSUM */ u32 mark; /* FLOW_ACTION_MARK */ u16 ptype; /* FLOW_ACTION_PTYPE */ + u16 rx_queue; /* FLOW_ACTION_RX_QUEUE_MAPPING */ u32 priority; /* FLOW_ACTION_PRIORITY */ struct { /* FLOW_ACTION_QUEUE */ u32 ctx; @@ -231,10 +276,18 @@ struct flow_action_entry { bool truncate; } sample; struct { /* FLOW_ACTION_POLICE */ - u32 index; u32 burst; u64 rate_bytes_ps; + u64 peakrate_bytes_ps; + u32 avrate; + u16 overhead; + u64 burst_pkt; + u64 rate_pkt_ps; u32 mtu; + struct { + enum flow_action_id act_id; + u32 extval; + } exceed, notexceed; } police; struct { /* 
FLOW_ACTION_CT */ int action; @@ -245,6 +298,7 @@ struct flow_action_entry { unsigned long cookie; u32 mark; u32 labels[4]; + bool orig_dir; } ct_metadata; struct { /* FLOW_ACTION_MPLS_PUSH */ u32 label; @@ -263,7 +317,6 @@ struct flow_action_entry { u8 ttl; } mpls_mangle; struct { - u32 index; s32 prio; u64 basetime; u64 cycletime; @@ -271,13 +324,16 @@ struct flow_action_entry { u32 num_entries; struct action_gate_entry *entries; } gate; + struct { /* FLOW_ACTION_PPPOE_PUSH */ + u16 sid; + } pppoe; }; - struct flow_action_cookie *cookie; /* user defined action cookie */ + struct flow_action_cookie *user_cookie; /* user defined action cookie */ }; struct flow_action { unsigned int num_entries; - struct flow_action_entry entries[]; + struct flow_action_entry entries[] __counted_by(num_entries); }; static inline bool flow_action_has_entries(const struct flow_action *action) @@ -286,16 +342,22 @@ static inline bool flow_action_has_entries(const struct flow_action *action) } /** - * flow_action_has_one_action() - check if exactly one action is present + * flow_offload_has_one_action() - check if exactly one action is present * @action: tc filter flow offload action * - * Returns true if exactly one action is present. + * Return: true if exactly one action is present. */ static inline bool flow_offload_has_one_action(const struct flow_action *action) { return action->num_entries == 1; } +static inline bool flow_action_is_last_entry(const struct flow_action *action, + const struct flow_action_entry *entry) +{ + return entry == &action->entries[action->num_entries - 1]; +} + #define flow_action_for_each(__i, __act, __actions) \ for (__i = 0, __act = &(__actions)->entries[0]; \ __i < (__actions)->num_entries; \ @@ -387,6 +449,96 @@ static inline bool flow_rule_match_key(const struct flow_rule *rule, return dissector_uses_key(rule->match.dissector, key); } +/** + * flow_rule_is_supp_control_flags() - check for supported control flags + * @supp_flags: control flags supported by driver + * @ctrl_flags: control flags present in rule + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if only supported control flags are set, false otherwise. + */ +static inline bool flow_rule_is_supp_control_flags(const u32 supp_flags, + const u32 ctrl_flags, + struct netlink_ext_ack *extack) +{ + if (likely((ctrl_flags & ~supp_flags) == 0)) + return true; + + NL_SET_ERR_MSG_FMT_MOD(extack, + "Unsupported match on control.flags %#x", + ctrl_flags); + + return false; +} + +/** + * flow_rule_is_supp_enc_control_flags() - check for supported control flags + * @supp_enc_flags: encapsulation control flags supported by driver + * @enc_ctrl_flags: encapsulation control flags present in rule + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if only supported control flags are set, false otherwise. + */ +static inline bool flow_rule_is_supp_enc_control_flags(const u32 supp_enc_flags, + const u32 enc_ctrl_flags, + struct netlink_ext_ack *extack) +{ + if (likely((enc_ctrl_flags & ~supp_enc_flags) == 0)) + return true; + + NL_SET_ERR_MSG_FMT_MOD(extack, + "Unsupported match on enc_control.flags %#x", + enc_ctrl_flags); + + return false; +} + +/** + * flow_rule_has_control_flags() - check for presence of any control flags + * @ctrl_flags: control flags present in rule + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if control flags are set, false otherwise. 
+ */ +static inline bool flow_rule_has_control_flags(const u32 ctrl_flags, + struct netlink_ext_ack *extack) +{ + return !flow_rule_is_supp_control_flags(0, ctrl_flags, extack); +} + +/** + * flow_rule_has_enc_control_flags() - check for presence of any control flags + * @enc_ctrl_flags: encapsulation control flags present in rule + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if control flags are set, false otherwise. + */ +static inline bool flow_rule_has_enc_control_flags(const u32 enc_ctrl_flags, + struct netlink_ext_ack *extack) +{ + return !flow_rule_is_supp_enc_control_flags(0, enc_ctrl_flags, extack); +} + +/** + * flow_rule_match_has_control_flags() - match and check for any control flags + * @rule: The flow_rule under evaluation. + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if control flags are set, false otherwise. + */ +static inline bool flow_rule_match_has_control_flags(struct flow_rule *rule, + struct netlink_ext_ack *extack) +{ + struct flow_match_control match; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) + return false; + + flow_rule_match_control(rule, &match); + + return flow_rule_has_control_flags(match.mask->flags, extack); +} + struct flow_stats { u64 pkts; u64 bytes; @@ -444,6 +596,7 @@ struct flow_block_offload { struct list_head *driver_block_list; struct netlink_ext_ack *extack; struct Qdisc *sch; + struct list_head *cb_list_head; }; enum tc_setup_type; @@ -532,18 +685,38 @@ struct flow_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; + bool skip_sw; struct netlink_ext_ack *extack; }; struct flow_cls_offload { struct flow_cls_common_offload common; enum flow_cls_command command; + bool use_act_stats; unsigned long cookie; struct flow_rule *rule; struct flow_stats stats; u32 classid; }; +enum offload_act_command { + FLOW_ACT_REPLACE, + FLOW_ACT_DESTROY, + FLOW_ACT_STATS, +}; + +struct flow_offload_action { + struct netlink_ext_ack *extack; /* NULL in FLOW_ACT_STATS process*/ + enum offload_act_command command; + enum flow_action_id id; + u32 index; + unsigned long cookie; + struct flow_stats stats; + struct flow_action action; +}; + +struct flow_offload_action *offload_action_alloc(unsigned int num_actions); + static inline struct flow_rule * flow_cls_offload_flow_rule(struct flow_cls_offload *flow_cmd) { @@ -567,5 +740,6 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)); +bool flow_indr_dev_exists(void); #endif /* _NET_FLOW_OFFLOAD_H */ diff --git a/include/net/fou.h b/include/net/fou.h index 80f56e275b08..824eb4b231fd 100644 --- a/include/net/fou.h +++ b/include/net/fou.h @@ -17,4 +17,6 @@ int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type); +int register_fou_bpf(void); + #endif diff --git a/include/net/fq.h b/include/net/fq.h index e39f3f8d5f8a..99fbe4127b95 100644 --- a/include/net/fq.h +++ b/include/net/fq.h @@ -7,6 +7,10 @@ #ifndef __NET_SCHED_FQ_H #define __NET_SCHED_FQ_H +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/types.h> + struct fq_tin; /** @@ -19,8 +23,6 @@ struct fq_tin; * @flowchain: can be linked to fq_tin's new_flows or old_flows. 
Used for DRR++ * (deficit round robin) based round robin queuing similar to the one * found in net/sched/sch_fq_codel.c - * @backlogchain: can be linked to other fq_flow and fq. Used to keep track of - * fat flows and efficient head-dropping if packet limit is reached * @queue: sk_buff queue to hold packets * @backlog: number of bytes pending in the queue. The number of packets can be * found in @queue.qlen @@ -29,7 +31,6 @@ struct fq_tin; struct fq_flow { struct fq_tin *tin; struct list_head flowchain; - struct list_head backlogchain; struct sk_buff_head queue; u32 backlog; int deficit; @@ -47,6 +48,8 @@ struct fq_flow { struct fq_tin { struct list_head new_flows; struct list_head old_flows; + struct list_head tin_list; + struct fq_flow default_flow; u32 backlog_bytes; u32 backlog_packets; u32 overlimit; @@ -59,14 +62,14 @@ struct fq_tin { /** * struct fq - main container for fair queuing purposes * - * @backlogs: linked to fq_flows. Used to maintain fat flows for efficient - * head-dropping when @backlog reaches @limit * @limit: max number of packets that can be queued across all flows * @backlog: number of packets queued across all flows */ struct fq { struct fq_flow *flows; - struct list_head backlogs; + unsigned long *flows_bitmap; + + struct list_head tin_backlog; spinlock_t lock; u32 flows_cnt; u32 limit; @@ -95,9 +98,4 @@ typedef bool fq_skb_filter_t(struct fq *, struct sk_buff *, void *); -typedef struct fq_flow *fq_flow_get_default_t(struct fq *, - struct fq_tin *, - int idx, - struct sk_buff *); - #endif diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index e73d74d2fabf..9467e33dfb36 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -11,35 +11,37 @@ /* functions that are embedded into includer */ -static void fq_adjust_removal(struct fq *fq, - struct fq_flow *flow, - struct sk_buff *skb) + +static void +__fq_adjust_removal(struct fq *fq, struct fq_flow *flow, unsigned int packets, + unsigned int bytes, unsigned int truesize) { struct fq_tin *tin = flow->tin; + int idx; - tin->backlog_bytes -= skb->len; - tin->backlog_packets--; - flow->backlog -= skb->len; - fq->backlog--; - fq->memory_usage -= skb->truesize; -} + tin->backlog_bytes -= bytes; + tin->backlog_packets -= packets; + flow->backlog -= bytes; + fq->backlog -= packets; + fq->memory_usage -= truesize; -static void fq_rejigger_backlog(struct fq *fq, struct fq_flow *flow) -{ - struct fq_flow *i; + if (flow->backlog) + return; - if (flow->backlog == 0) { - list_del_init(&flow->backlogchain); - } else { - i = flow; + if (flow == &tin->default_flow) { + list_del_init(&tin->tin_list); + return; + } - list_for_each_entry_continue(i, &fq->backlogs, backlogchain) - if (i->backlog < flow->backlog) - break; + idx = flow - fq->flows; + __clear_bit(idx, fq->flows_bitmap); +} - list_move_tail(&flow->backlogchain, - &i->backlogchain); - } +static void fq_adjust_removal(struct fq *fq, + struct fq_flow *flow, + struct sk_buff *skb) +{ + __fq_adjust_removal(fq, flow, 1, skb->len, skb->truesize); } static struct sk_buff *fq_flow_dequeue(struct fq *fq, @@ -54,11 +56,37 @@ static struct sk_buff *fq_flow_dequeue(struct fq *fq, return NULL; fq_adjust_removal(fq, flow, skb); - fq_rejigger_backlog(fq, flow); return skb; } +static int fq_flow_drop(struct fq *fq, struct fq_flow *flow, + fq_skb_free_t free_func) +{ + unsigned int packets = 0, bytes = 0, truesize = 0; + struct fq_tin *tin = flow->tin; + struct sk_buff *skb; + int pending; + + lockdep_assert_held(&fq->lock); + + pending = min_t(int, 32, 
skb_queue_len(&flow->queue) / 2); + do { + skb = __skb_dequeue(&flow->queue); + if (!skb) + break; + + packets++; + bytes += skb->len; + truesize += skb->truesize; + free_func(fq, tin, flow, skb); + } while (packets < pending); + + __fq_adjust_removal(fq, flow, packets, bytes, truesize); + + return packets; +} + static struct sk_buff *fq_tin_dequeue(struct fq *fq, struct fq_tin *tin, fq_tin_dequeue_t dequeue_func) @@ -115,8 +143,7 @@ static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb) static struct fq_flow *fq_flow_classify(struct fq *fq, struct fq_tin *tin, u32 idx, - struct sk_buff *skb, - fq_flow_get_default_t get_default_func) + struct sk_buff *skb) { struct fq_flow *flow; @@ -124,7 +151,7 @@ static struct fq_flow *fq_flow_classify(struct fq *fq, flow = &fq->flows[idx]; if (flow->tin && flow->tin != tin) { - flow = get_default_func(fq, tin, idx, skb); + flow = &tin->default_flow; tin->collisions++; fq->collisions++; } @@ -135,45 +162,68 @@ static struct fq_flow *fq_flow_classify(struct fq *fq, return flow; } -static void fq_recalc_backlog(struct fq *fq, - struct fq_tin *tin, - struct fq_flow *flow) +static struct fq_flow *fq_find_fattest_flow(struct fq *fq) { - struct fq_flow *i; + struct fq_tin *tin; + struct fq_flow *flow = NULL; + u32 len = 0; + int i; - if (list_empty(&flow->backlogchain)) - list_add_tail(&flow->backlogchain, &fq->backlogs); + for_each_set_bit(i, fq->flows_bitmap, fq->flows_cnt) { + struct fq_flow *cur = &fq->flows[i]; + unsigned int cur_len; - i = flow; - list_for_each_entry_continue_reverse(i, &fq->backlogs, - backlogchain) - if (i->backlog > flow->backlog) - break; + cur_len = cur->backlog; + if (cur_len <= len) + continue; + + flow = cur; + len = cur_len; + } + + list_for_each_entry(tin, &fq->tin_backlog, tin_list) { + unsigned int cur_len = tin->default_flow.backlog; + + if (cur_len <= len) + continue; + + flow = &tin->default_flow; + len = cur_len; + } - list_move(&flow->backlogchain, &i->backlogchain); + return flow; } static void fq_tin_enqueue(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb, - fq_skb_free_t free_func, - fq_flow_get_default_t get_default_func) + fq_skb_free_t free_func) { struct fq_flow *flow; + struct sk_buff *next; bool oom; lockdep_assert_held(&fq->lock); - flow = fq_flow_classify(fq, tin, idx, skb, get_default_func); + flow = fq_flow_classify(fq, tin, idx, skb); - flow->tin = tin; - flow->backlog += skb->len; - tin->backlog_bytes += skb->len; - tin->backlog_packets++; - fq->memory_usage += skb->truesize; - fq->backlog++; + if (!flow->backlog) { + if (flow != &tin->default_flow) + __set_bit(idx, fq->flows_bitmap); + else if (list_empty(&tin->tin_list)) + list_add(&tin->tin_list, &fq->tin_backlog); + } - fq_recalc_backlog(fq, tin, flow); + flow->tin = tin; + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); + flow->backlog += skb->len; + tin->backlog_bytes += skb->len; + tin->backlog_packets++; + fq->memory_usage += skb->truesize; + fq->backlog++; + __skb_queue_tail(&flow->queue, skb); + } if (list_empty(&flow->flowchain)) { flow->deficit = fq->quantum; @@ -181,21 +231,15 @@ static void fq_tin_enqueue(struct fq *fq, &tin->new_flows); } - __skb_queue_tail(&flow->queue, skb); oom = (fq->memory_usage > fq->memory_limit); while (fq->backlog > fq->limit || oom) { - flow = list_first_entry_or_null(&fq->backlogs, - struct fq_flow, - backlogchain); + flow = fq_find_fattest_flow(fq); if (!flow) return; - skb = fq_flow_dequeue(fq, flow); - if (!skb) + if (!fq_flow_drop(fq, flow, free_func)) return; - 
free_func(fq, flow->tin, flow, skb); - flow->tin->overlimit++; fq->overlimit++; if (oom) { @@ -224,8 +268,6 @@ static void fq_flow_filter(struct fq *fq, fq_adjust_removal(fq, flow, skb); free_func(fq, tin, flow, skb); } - - fq_rejigger_backlog(fq, flow); } static void fq_tin_filter(struct fq *fq, @@ -248,16 +290,18 @@ static void fq_flow_reset(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { + struct fq_tin *tin = flow->tin; struct sk_buff *skb; while ((skb = fq_flow_dequeue(fq, flow))) - free_func(fq, flow->tin, flow, skb); + free_func(fq, tin, flow, skb); - if (!list_empty(&flow->flowchain)) + if (!list_empty(&flow->flowchain)) { list_del_init(&flow->flowchain); - - if (!list_empty(&flow->backlogchain)) - list_del_init(&flow->backlogchain); + if (list_empty(&tin->new_flows) && + list_empty(&tin->old_flows)) + list_del_init(&tin->tin_list); + } flow->tin = NULL; @@ -283,6 +327,7 @@ static void fq_tin_reset(struct fq *fq, fq_flow_reset(fq, flow, free_func); } + WARN_ON_ONCE(!list_empty(&tin->tin_list)); WARN_ON_ONCE(tin->backlog_bytes); WARN_ON_ONCE(tin->backlog_packets); } @@ -290,7 +335,6 @@ static void fq_tin_reset(struct fq *fq, static void fq_flow_init(struct fq_flow *flow) { INIT_LIST_HEAD(&flow->flowchain); - INIT_LIST_HEAD(&flow->backlogchain); __skb_queue_head_init(&flow->queue); } @@ -298,6 +342,8 @@ static void fq_tin_init(struct fq_tin *tin) { INIT_LIST_HEAD(&tin->new_flows); INIT_LIST_HEAD(&tin->old_flows); + INIT_LIST_HEAD(&tin->tin_list); + fq_flow_init(&tin->default_flow); } static int fq_init(struct fq *fq, int flows_cnt) @@ -305,8 +351,8 @@ static int fq_init(struct fq *fq, int flows_cnt) int i; memset(fq, 0, sizeof(fq[0])); - INIT_LIST_HEAD(&fq->backlogs); spin_lock_init(&fq->lock); + INIT_LIST_HEAD(&fq->tin_backlog); fq->flows_cnt = max_t(u32, flows_cnt, 1); fq->quantum = 300; fq->limit = 8192; @@ -316,6 +362,13 @@ static int fq_init(struct fq *fq, int flows_cnt) if (!fq->flows) return -ENOMEM; + fq->flows_bitmap = bitmap_zalloc(fq->flows_cnt, GFP_KERNEL); + if (!fq->flows_bitmap) { + kvfree(fq->flows); + fq->flows = NULL; + return -ENOMEM; + } + for (i = 0; i < fq->flows_cnt; i++) fq_flow_init(&fq->flows[i]); @@ -332,6 +385,9 @@ static void fq_reset(struct fq *fq, kvfree(fq->flows); fq->flows = NULL; + + bitmap_free(fq->flows_bitmap); + fq->flows_bitmap = NULL; } #endif diff --git a/include/net/garp.h b/include/net/garp.h index 4d9a0c6a2e5f..59a07b171def 100644 --- a/include/net/garp.h +++ b/include/net/garp.h @@ -2,6 +2,8 @@ #ifndef _NET_GARP_H #define _NET_GARP_H +#include <linux/if_ether.h> +#include <linux/types.h> #include <net/stp.h> #define GARP_PROTOCOL_ID 0x1 diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 1424e02cef90..7aa2b8e1fb29 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -7,14 +7,17 @@ #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> -/* Note: this used to be in include/uapi/linux/gen_stats.h */ -struct gnet_stats_basic_packed { - __u64 bytes; - __u64 packets; -}; - -struct gnet_stats_basic_cpu { - struct gnet_stats_basic_packed bstats; +/* Throughput stats. + * Must be initialized beforehand with gnet_stats_basic_sync_init(). + * + * If no reads can ever occur parallel to writes (e.g. stack-allocated + * bstats), then the internal stat values can be written to and read + * from directly. Otherwise, use _bstats_set/update() for writes and + * gnet_stats_add_basic() for reads. 
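+ *
+ * Example (illustrative sketch, not from this patch; foo_count_skb() is a
+ * hypothetical name). Writes are bracketed by @syncp so that a concurrent
+ * gnet_stats_add_basic() reader sees a consistent bytes/packets pair:
+ *
+ *	static void foo_count_skb(struct gnet_stats_basic_sync *b,	// hypothetical
+ *				  const struct sk_buff *skb)
+ *	{
+ *		u64_stats_update_begin(&b->syncp);
+ *		u64_stats_add(&b->bytes, skb->len);
+ *		u64_stats_inc(&b->packets);
+ *		u64_stats_update_end(&b->syncp);
+ *	}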
+ */ +struct gnet_stats_basic_sync { + u64_stats_t bytes; + u64_stats_t packets; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); @@ -34,6 +37,7 @@ struct gnet_dump { struct tc_stats tc_stats; }; +void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b); int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d, int padattr); @@ -42,41 +46,38 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d, int padattr); -int gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b); -void __gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b); -int gnet_stats_copy_basic_hw(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b); +int gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running); +void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running); +int gnet_stats_copy_basic_hw(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running); int gnet_stats_copy_rate_est(struct gnet_dump *d, struct net_rate_estimator __rcu **ptr); int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen); -void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu_q, - const struct gnet_stats_queue *q, __u32 qlen); +void gnet_stats_add_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu_q, + const struct gnet_stats_queue *q); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); -int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_new_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt); + bool running, struct nlattr *opt); void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); -int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **ptr, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt); + bool running, struct nlattr *opt); bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, struct gnet_stats_rate_est64 *sample); diff --git a/include/net/genetlink.h b/include/net/genetlink.h index e55ec1597ce7..a03d56765832 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -2,26 +2,40 @@ #ifndef __NET_GENERIC_NETLINK_H #define __NET_GENERIC_NETLINK_H -#include <linux/genetlink.h> +#include <linux/net.h> #include <net/netlink.h> #include <net/net_namespace.h> +#include <uapi/linux/genetlink.h> #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) +/* 
Non-parallel generic netlink requests are serialized by a global lock. */ +void genl_lock(void); +void genl_unlock(void); + +#define MODULE_ALIAS_GENL_FAMILY(family) \ + MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family) + +/* Binding to multicast group requires %CAP_NET_ADMIN */ +#define GENL_MCAST_CAP_NET_ADMIN BIT(0) +/* Binding to multicast group requires %CAP_SYS_ADMIN */ +#define GENL_MCAST_CAP_SYS_ADMIN BIT(1) + /** * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family + * @flags: GENL_MCAST_* flags */ struct genl_multicast_group { char name[GENL_NAMSIZ]; + u8 flags; }; -struct genl_ops; +struct genl_split_ops; struct genl_info; /** * struct genl_family - generic netlink family - * @id: protocol family identifier (private) * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version @@ -35,65 +49,101 @@ struct genl_info; * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks + * @bind: called when family multicast group is added to a netlink socket + * @unbind: called when family multicast group is removed from a netlink socket + * @module: pointer to the owning module (set to THIS_MODULE) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups - * @mcgrp_offset: starting number of multicast group IDs in this family - * (private) + * @resv_start_op: first operation for which reserved fields of the header + * can be validated and policies are required (see below); + * new families should leave this field at zero * @ops: the operations supported by this family * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family + * @split_ops: the split do/dump form of operation definition + * @n_split_ops: number of entries in @split_ops, note that with split do/dump + * ops the number of entries is not the same as number of commands + * @sock_priv_size: the size of per-socket private memory + * @sock_priv_init: the per-socket private memory initializer + * @sock_priv_destroy: the per-socket private memory destructor + * + * Attribute policies (the combination of @policy and @maxattr fields) + * can be attached at the family level or at the operation level. + * If both are present the per-operation policy takes precedence. + * For operations before @resv_start_op lack of policy means that the core + * will perform no attribute parsing or validation. For newer operations + * if policy is not provided core will reject all TLV attributes. 
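+ *
+ * Example (illustrative sketch, not from this patch; every "foo"/"FOO"
+ * identifier is hypothetical). The per-operation policy attached here
+ * takes precedence over any family-level policy:
+ *
+ *	static const struct nla_policy foo_get_policy[FOO_A_MAX + 1] = {
+ *		[FOO_A_ID] = { .type = NLA_U32 },
+ *	};
+ *
+ *	static const struct genl_ops foo_ops[] = {
+ *		{
+ *			.cmd     = FOO_CMD_GET,
+ *			.doit    = foo_get_doit,	// hypothetical handler
+ *			.policy  = foo_get_policy,
+ *			.maxattr = FOO_A_MAX,
+ *		},
+ *	};
+ *
+ *	static struct genl_family foo_family = {
+ *		.name    = "foo",
+ *		.version = 1,
+ *		.ops     = foo_ops,
+ *		.n_ops   = ARRAY_SIZE(foo_ops),
+ *		.module  = THIS_MODULE,
+ *	};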
*/ struct genl_family { - int id; /* private */ unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; unsigned int maxattr; - unsigned int mcgrp_offset; /* private */ u8 netnsok:1; u8 parallel_ops:1; u8 n_ops; u8 n_small_ops; + u8 n_split_ops; u8 n_mcgrps; + u8 resv_start_op; const struct nla_policy *policy; - int (*pre_doit)(const struct genl_ops *ops, + int (*pre_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); - void (*post_doit)(const struct genl_ops *ops, + void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); + int (*bind)(int mcgrp); + void (*unbind)(int mcgrp); const struct genl_ops * ops; const struct genl_small_ops *small_ops; + const struct genl_split_ops *split_ops; const struct genl_multicast_group *mcgrps; struct module *module; + + size_t sock_priv_size; + void (*sock_priv_init)(void *priv); + void (*sock_priv_destroy)(void *priv); + +/* private: internal use only */ + /* protocol family identifier */ + int id; + /* starting number of multicast group IDs in this family */ + unsigned int mcgrp_offset; + /* list of per-socket privs */ + struct xarray *sock_privs; }; /** * struct genl_info - receiving information * @snd_seq: sending sequence number * @snd_portid: netlink portid of sender + * @family: generic netlink family * @nlhdr: netlink message header * @genlhdr: generic netlink message header - * @userhdr: user specific header * @attrs: netlink attributes * @_net: network namespace - * @user_ptr: user pointers + * @ctx: storage space for the use by the family + * @user_ptr: user pointers (deprecated, use ctx instead) * @extack: extended ACK report struct */ struct genl_info { u32 snd_seq; u32 snd_portid; - struct nlmsghdr * nlhdr; + const struct genl_family *family; + const struct nlmsghdr * nlhdr; struct genlmsghdr * genlhdr; - void * userhdr; struct nlattr ** attrs; possible_net_t _net; - void * user_ptr[2]; + union { + u8 ctx[NETLINK_CTX_SIZE]; + void * user_ptr[2]; + }; struct netlink_ext_ack *extack; }; -static inline struct net *genl_info_net(struct genl_info *info) +static inline struct net *genl_info_net(const struct genl_info *info) { return read_pnet(&info->_net); } @@ -103,8 +153,23 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) write_pnet(&info->_net, net); } +static inline void *genl_info_userhdr(const struct genl_info *info) +{ + return (u8 *)info->genlhdr + GENL_HDRLEN; +} + #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) +#define GENL_SET_ERR_MSG_FMT(info, msg, args...) 
\ + NL_SET_ERR_MSG_FMT((info)->extack, msg, ##args) + +/* Report that a root attribute is missing */ +#define GENL_REQ_ATTR_CHECK(info, attr) ({ \ + const struct genl_info *__info = (info); \ + \ + NL_REQ_ATTR_CHECK(__info->extack, NULL, __info->attrs, (attr)); \ +}) + enum genl_validate_flags { GENL_DONT_VALIDATE_STRICT = BIT(0), GENL_DONT_VALIDATE_DUMP = BIT(1), @@ -115,7 +180,7 @@ enum genl_validate_flags { * struct genl_small_ops - generic netlink operations (small version) * @cmd: command identifier * @internal_flags: flags used by the family - * @flags: flags + * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @dumpit: callback for dumpers @@ -136,7 +201,7 @@ struct genl_small_ops { * struct genl_ops - generic netlink operations * @cmd: command identifier * @internal_flags: flags used by the family - * @flags: flags + * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @maxattr: maximum number of attributes supported * @policy: netlink policy (takes precedence over family policy) * @validate: validation flags from enum genl_validate_flags @@ -161,15 +226,66 @@ struct genl_ops { }; /** - * struct genl_info - info that is available during dumpit op call - * @family: generic netlink family - for internal genl code usage - * @ops: generic netlink ops - for internal genl code usage + * struct genl_split_ops - generic netlink operations (do/dump split version) + * @cmd: command identifier + * @internal_flags: flags used by the family + * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) + * @validate: validation flags from enum genl_validate_flags + * @policy: netlink policy (takes precedence over family policy) + * @maxattr: maximum number of attributes supported + * + * Do callbacks: + * @pre_doit: called before an operation's @doit callback, it may + * do additional, common, filtering and return an error + * @doit: standard command callback + * @post_doit: called after an operation's @doit callback, it may + * undo operations done by pre_doit, for example release locks + * + * Dump callbacks: + * @start: start callback for dumps + * @dumpit: callback for dumpers + * @done: completion callback for dumps + * + * Do callbacks can be used if %GENL_CMD_CAP_DO is set in @flags. + * Dump callbacks can be used if %GENL_CMD_CAP_DUMP is set in @flags. + * Exactly one of those flags must be set. 
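+ *
+ * Example (illustrative sketch, not from this patch; the "foo"/"FOO"
+ * identifiers are hypothetical). The same command appears twice, once per
+ * capability, since exactly one CAP flag may be set per entry:
+ *
+ *	static const struct genl_split_ops foo_split_ops[] = {
+ *		{
+ *			.cmd    = FOO_CMD_GET,
+ *			.flags  = GENL_CMD_CAP_DO,
+ *			.doit   = foo_get_doit,		// hypothetical handler
+ *		},
+ *		{
+ *			.cmd    = FOO_CMD_GET,
+ *			.flags  = GENL_CMD_CAP_DUMP,
+ *			.dumpit = foo_get_dumpit,	// hypothetical handler
+ *		},
+ *	};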
+ */ +struct genl_split_ops { + union { + struct { + int (*pre_doit)(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); + int (*doit)(struct sk_buff *skb, + struct genl_info *info); + void (*post_doit)(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); + }; + struct { + int (*start)(struct netlink_callback *cb); + int (*dumpit)(struct sk_buff *skb, + struct netlink_callback *cb); + int (*done)(struct netlink_callback *cb); + }; + }; + const struct nla_policy *policy; + unsigned int maxattr; + u8 cmd; + u8 internal_flags; + u8 flags; + u8 validate; +}; + +/** + * struct genl_dumpit_info - info that is available during dumpit op call + * @op: generic netlink ops - for internal genl code usage * @attrs: netlink attributes + * @info: struct genl_info describing the request */ struct genl_dumpit_info { - const struct genl_family *family; - struct genl_ops op; - struct nlattr **attrs; + struct genl_split_ops op; + struct genl_info info; }; static inline const struct genl_dumpit_info * @@ -178,6 +294,40 @@ genl_dumpit_info(struct netlink_callback *cb) return cb->data; } +static inline const struct genl_info * +genl_info_dump(struct netlink_callback *cb) +{ + return &genl_dumpit_info(cb)->info; +} + +/** + * genl_info_init_ntf() - initialize genl_info for notifications + * @info: genl_info struct to set up + * @family: pointer to the genetlink family + * @cmd: command to be used in the notification + * + * Initialize a locally declared struct genl_info to pass to various APIs. + * Intended to be used when creating notifications. + */ +static inline void +genl_info_init_ntf(struct genl_info *info, const struct genl_family *family, + u8 cmd) +{ + struct genlmsghdr *hdr = (void *) &info->user_ptr[0]; + + memset(info, 0, sizeof(*info)); + info->family = family; + info->genlhdr = hdr; + hdr->cmd = cmd; +} + +static inline bool genl_info_is_ntf(const struct genl_info *info) +{ + return !info->nlhdr; +} + +void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk); +void *genl_sk_priv_get(struct genl_family *family, struct sock *sk); int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, @@ -186,11 +336,37 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb, void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, const struct genl_family *family, int flags, u8 cmd); +static inline void * +__genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags) +{ + return genlmsg_put(skb, info->snd_portid, info->snd_seq, info->family, + flags, info->genlhdr->cmd); +} + +/** + * genlmsg_iput - start genetlink message based on genl_info + * @skb: skb in which message header will be placed + * @info: genl_info as provided to do/dump handlers + * + * Convenience wrapper which starts a genetlink message based on + * information in user request. @info should be either the struct passed + * by genetlink core to do/dump handlers (when constructing replies to + * such requests) or a struct initialized by genl_info_init_ntf() + * when constructing notifications. + * + * Returns: pointer to new genetlink header. 
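+ *
+ * Example (illustrative notification-building sketch, not from this patch;
+ * foo_family and FOO_CMD_NTF are hypothetical, and attribute filling plus
+ * genlmsg_end() are omitted):
+ *
+ *	struct genl_info info;
+ *	struct sk_buff *skb;
+ *
+ *	genl_info_init_ntf(&info, &foo_family, FOO_CMD_NTF);
+ *	skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ *	if (skb && !genlmsg_iput(skb, &info)) {
+ *		nlmsg_free(skb);
+ *		skb = NULL;
+ *	}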
+ */ +static inline void * +genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) +{ + return __genlmsg_iput(skb, info, 0); +} + /** * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() * - * Returns pointer to netlink header. + * Returns: pointer to netlink header. */ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) { @@ -259,7 +435,7 @@ static inline void genl_dump_check_consistent(struct netlink_callback *cb, * @flags: netlink message flags * @cmd: generic netlink command * - * Returns pointer to user specific header + * Returns: pointer to user specific header */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, @@ -292,6 +468,35 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) } /** + * genlmsg_multicast_netns_filtered - multicast a netlink message + * to a specific netns with filter + * function + * @family: the generic netlink family + * @net: the net namespace + * @skb: netlink message as socket buffer + * @portid: own netlink portid to avoid sending to yourself + * @group: offset of multicast group in groups array + * @flags: allocation flags + * @filter: filter function + * @filter_data: filter function private data + * + * Return: 0 on success, negative error code for failure. + */ +static inline int +genlmsg_multicast_netns_filtered(const struct genl_family *family, + struct net *net, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags, + netlink_filter_fn filter, + void *filter_data) +{ + if (WARN_ON_ONCE(group >= family->n_mcgrps)) + return -EINVAL; + group = family->mcgrp_offset + group; + return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, + flags, filter, filter_data); +} + +/** * genlmsg_multicast_netns - multicast a netlink message to a specific netns * @family: the generic netlink family * @net: the net namespace @@ -304,10 +509,8 @@ static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) - return -EINVAL; - group = family->mcgrp_offset + group; - return nlmsg_multicast(net->genl_sock, skb, portid, group, flags); + return genlmsg_multicast_netns_filtered(family, net, skb, portid, + group, flags, NULL, NULL); } /** @@ -332,16 +535,16 @@ static inline int genlmsg_multicast(const struct genl_family *family, * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array - * @flags: allocation flags * * This function must hold the RTNL or rcu_read_lock(). 
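 *
 * Example (illustrative caller sketch, not from this patch; foo_family,
 * FOO_MCGRP_EVENT and the prebuilt skb are hypothetical), honouring the
 * locking rule stated above:
 *
 *	rcu_read_lock();
 *	genlmsg_multicast_allns(&foo_family, skb, 0, FOO_MCGRP_EVENT);
 *	rcu_read_unlock();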
*/ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, - unsigned int group, gfp_t flags); + unsigned int group); /** * genlmsg_unicast - unicast a netlink message + * @net: network namespace to look up @portid in * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ @@ -361,7 +564,7 @@ static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) } /** - * gennlmsg_data - head of message payload + * genlmsg_data - head of message payload * @gnlh: genetlink message header */ static inline void *genlmsg_data(const struct genlmsghdr *gnlh) diff --git a/include/net/geneve.h b/include/net/geneve.h index bced0b1d9fe4..5c96827a487e 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -59,7 +59,7 @@ struct genevehdr { __be16 proto_type; u8 vni[3]; u8 rsvd2; - struct geneve_opt options[]; + u8 options[]; }; static inline bool netif_is_geneve(const struct net_device *dev) diff --git a/include/net/gre.h b/include/net/gre.h index b60f212c16c6..ccd293203284 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -49,78 +49,61 @@ static inline bool netif_is_ip6gretap(const struct net_device *dev) !strcmp(dev->rtnl_link_ops->kind, "ip6gretap"); } -static inline int gre_calc_hlen(__be16 o_flags) +static inline int gre_calc_hlen(const unsigned long *o_flags) { int addend = 4; - if (o_flags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, o_flags)) addend += 4; - if (o_flags & TUNNEL_KEY) + if (test_bit(IP_TUNNEL_KEY_BIT, o_flags)) addend += 4; - if (o_flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, o_flags)) addend += 4; return addend; } -static inline __be16 gre_flags_to_tnl_flags(__be16 flags) +static inline void gre_flags_to_tnl_flags(unsigned long *dst, __be16 flags) { - __be16 tflags = 0; - - if (flags & GRE_CSUM) - tflags |= TUNNEL_CSUM; - if (flags & GRE_ROUTING) - tflags |= TUNNEL_ROUTING; - if (flags & GRE_KEY) - tflags |= TUNNEL_KEY; - if (flags & GRE_SEQ) - tflags |= TUNNEL_SEQ; - if (flags & GRE_STRICT) - tflags |= TUNNEL_STRICT; - if (flags & GRE_REC) - tflags |= TUNNEL_REC; - if (flags & GRE_VERSION) - tflags |= TUNNEL_VERSION; - - return tflags; + IP_TUNNEL_DECLARE_FLAGS(res) = { }; + + __assign_bit(IP_TUNNEL_CSUM_BIT, res, flags & GRE_CSUM); + __assign_bit(IP_TUNNEL_ROUTING_BIT, res, flags & GRE_ROUTING); + __assign_bit(IP_TUNNEL_KEY_BIT, res, flags & GRE_KEY); + __assign_bit(IP_TUNNEL_SEQ_BIT, res, flags & GRE_SEQ); + __assign_bit(IP_TUNNEL_STRICT_BIT, res, flags & GRE_STRICT); + __assign_bit(IP_TUNNEL_REC_BIT, res, flags & GRE_REC); + __assign_bit(IP_TUNNEL_VERSION_BIT, res, flags & GRE_VERSION); + + ip_tunnel_flags_copy(dst, res); } -static inline __be16 gre_tnl_flags_to_gre_flags(__be16 tflags) +static inline __be16 gre_tnl_flags_to_gre_flags(const unsigned long *tflags) { __be16 flags = 0; - if (tflags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, tflags)) flags |= GRE_CSUM; - if (tflags & TUNNEL_ROUTING) + if (test_bit(IP_TUNNEL_ROUTING_BIT, tflags)) flags |= GRE_ROUTING; - if (tflags & TUNNEL_KEY) + if (test_bit(IP_TUNNEL_KEY_BIT, tflags)) flags |= GRE_KEY; - if (tflags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, tflags)) flags |= GRE_SEQ; - if (tflags & TUNNEL_STRICT) + if (test_bit(IP_TUNNEL_STRICT_BIT, tflags)) flags |= GRE_STRICT; - if (tflags & TUNNEL_REC) + if (test_bit(IP_TUNNEL_REC_BIT, tflags)) flags |= GRE_REC; - if (tflags & TUNNEL_VERSION) + if (test_bit(IP_TUNNEL_VERSION_BIT, tflags)) flags |= GRE_VERSION; return flags; } -static inline __sum16 
gre_checksum(struct sk_buff *skb) -{ - __wsum csum; - - if (skb->ip_summed == CHECKSUM_PARTIAL) - csum = lco_csum(skb); - else - csum = skb_checksum(skb, 0, skb->len, 0); - return csum_fold(csum); -} - static inline void gre_build_header(struct sk_buff *skb, int hdr_len, - __be16 flags, __be16 proto, + const unsigned long *flags, __be16 proto, __be32 key, __be32 seq) { + IP_TUNNEL_DECLARE_FLAGS(cond) = { }; struct gre_base_hdr *greh; skb_push(skb, hdr_len); @@ -131,22 +114,32 @@ static inline void gre_build_header(struct sk_buff *skb, int hdr_len, greh->flags = gre_tnl_flags_to_gre_flags(flags); greh->protocol = proto; - if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { + __set_bit(IP_TUNNEL_KEY_BIT, cond); + __set_bit(IP_TUNNEL_CSUM_BIT, cond); + __set_bit(IP_TUNNEL_SEQ_BIT, cond); + + if (ip_tunnel_flags_intersect(flags, cond)) { __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - if (flags & TUNNEL_SEQ) { + if (test_bit(IP_TUNNEL_SEQ_BIT, flags)) { *ptr = seq; ptr--; } - if (flags & TUNNEL_KEY) { + if (test_bit(IP_TUNNEL_KEY_BIT, flags)) { *ptr = key; ptr--; } - if (flags & TUNNEL_CSUM && + if (test_bit(IP_TUNNEL_CSUM_BIT, flags) && !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; - *(__sum16 *)ptr = gre_checksum(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) { + *(__sum16 *)ptr = csum_fold(lco_csum(skb)); + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = sizeof(*greh); + } } } } diff --git a/include/net/gro.h b/include/net/gro.h new file mode 100644 index 000000000000..22d3a69e4404 --- /dev/null +++ b/include/net/gro.h @@ -0,0 +1,598 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_GRO_H +#define _NET_GRO_H + +#include <linux/indirect_call_wrapper.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/ip6_checksum.h> +#include <linux/skbuff.h> +#include <net/udp.h> +#include <net/hotdata.h> + +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + +struct napi_gro_cb { + union { + struct { + /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ + void *frag0; + + /* Length of frag0. */ + unsigned int frag0_len; + }; + + struct { + /* used in skb_gro_receive() slow path */ + struct sk_buff *last; + + /* jiffies when first packet was created/queued */ + unsigned long age; + }; + }; + + /* This indicates where we are processing relative to skb->data. */ + int data_offset; + + /* This is non-zero if the packet cannot be merged with the new skb. */ + u16 flush; + + /* Number of segments aggregated. */ + u16 count; + + /* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */ + u16 proto; + + u16 pad; + +/* Used in napi_gro_cb::free */ +#define NAPI_GRO_FREE 1 +#define NAPI_GRO_FREE_STOLEN_HEAD 2 + /* portion of the cb set to zero at every gro iteration */ + struct_group(zeroed, + + /* Start offset for remote checksum offload */ + u16 gro_remcsum_start; + + /* This is non-zero if the packet may be of the same flow. */ + u8 same_flow:1; + + /* Used in tunnel GRO receive */ + u8 encap_mark:1; + + /* GRO checksum is valid */ + u8 csum_valid:1; + + /* Number of checksums via CHECKSUM_UNNECESSARY */ + u8 csum_cnt:3; + + /* Free the skb? 
*/ + u8 free:2; + + /* Used in foo-over-udp, set in udp[46]_gro_receive */ + u8 is_ipv6:1; + + /* Used in GRE, set in fou/gue_gro_receive */ + u8 is_fou:1; + + /* Used to determine if ipid_offset can be ignored */ + u8 ip_fixedid:1; + + /* Number of gro_receive callbacks this packet already went through */ + u8 recursion_counter:4; + + /* GRO is done by frag_list pointer chaining. */ + u8 is_flist:1; + ); + + /* used to support CHECKSUM_COMPLETE for tunneling protocols */ + __wsum csum; + + /* L3 offsets */ + union { + struct { + u16 network_offset; + u16 inner_network_offset; + }; + u16 network_offsets[2]; + }; +}; + +#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) + +#define GRO_RECURSION_LIMIT 15 +static inline int gro_recursion_inc_test(struct sk_buff *skb) +{ + return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; +} + +typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); +static inline struct sk_buff *call_gro_receive(gro_receive_t cb, + struct list_head *head, + struct sk_buff *skb) +{ + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + + return cb(head, skb); +} + +typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, + struct sk_buff *); +static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, + struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + + return cb(sk, head, skb); +} + +static inline unsigned int skb_gro_offset(const struct sk_buff *skb) +{ + return NAPI_GRO_CB(skb)->data_offset; +} + +static inline unsigned int skb_gro_len(const struct sk_buff *skb) +{ + return skb->len - NAPI_GRO_CB(skb)->data_offset; +} + +static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) +{ + NAPI_GRO_CB(skb)->data_offset += len; +} + +static inline void *skb_gro_header_fast(const struct sk_buff *skb, + unsigned int offset) +{ + return NAPI_GRO_CB(skb)->frag0 + offset; +} + +static inline bool skb_gro_may_pull(const struct sk_buff *skb, + unsigned int hlen) +{ + return likely(hlen <= NAPI_GRO_CB(skb)->frag0_len); +} + +static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, + unsigned int offset) +{ + if (!pskb_may_pull(skb, hlen)) + return NULL; + + return skb->data + offset; +} + +static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen, + unsigned int offset) +{ + void *ptr; + + ptr = skb_gro_header_fast(skb, offset); + if (!skb_gro_may_pull(skb, hlen)) + ptr = skb_gro_header_slow(skb, hlen, offset); + return ptr; +} + +static inline int skb_gro_receive_network_offset(const struct sk_buff *skb) +{ + return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark]; +} + +static inline void *skb_gro_network_header(const struct sk_buff *skb) +{ + if (skb_gro_may_pull(skb, skb_gro_offset(skb))) + return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb)); + + return skb->data + skb_gro_receive_network_offset(skb); +} + +static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb, + int proto) +{ + const struct iphdr *iph = skb_gro_network_header(skb); + + return csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb_gro_len(skb), proto, 0); +} + +static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, + const void *start, unsigned int len) +{ + if (NAPI_GRO_CB(skb)->csum_valid) + NAPI_GRO_CB(skb)->csum = wsum_negate(csum_partial(start, len, + 
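
/*
 * Illustrative sketch, not part of the patch: the usual shape of a
 * gro_receive callback built on skb_gro_header() above -- try the frag0
 * fast path, transparently falling back to pskb_may_pull() via the slow
 * path. "foohdr" and "foo_gro_receive" are hypothetical names.
 */
struct foohdr {
	__be16 id;
};

static struct sk_buff *foo_gro_receive(struct list_head *head,
				       struct sk_buff *skb)
{
	unsigned int off = skb_gro_offset(skb);
	unsigned int hlen = off + sizeof(struct foohdr);
	const struct foohdr *fh;

	fh = skb_gro_header(skb, hlen, off);
	if (unlikely(!fh)) {
		/* header neither linear nor pullable: don't aggregate */
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	/* ... compare fh->id against the packets queued on @head ... */
	return NULL;
}
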
wsum_negate(NAPI_GRO_CB(skb)->csum))); +} + +/* GRO checksum functions. These are logical equivalents of the normal + * checksum functions (in skbuff.h) except that they operate on the GRO + * offsets and fields in sk_buff. + */ + +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb); + +static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) +{ + return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); +} + +static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, + bool zero_okay, + __sum16 check) +{ + return ((skb->ip_summed != CHECKSUM_PARTIAL || + skb_checksum_start_offset(skb) < + skb_gro_offset(skb)) && + !skb_at_gro_remcsum_start(skb) && + NAPI_GRO_CB(skb)->csum_cnt == 0 && + (!zero_okay || check)); +} + +static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, + __wsum psum) +{ + if (NAPI_GRO_CB(skb)->csum_valid && + !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) + return 0; + + NAPI_GRO_CB(skb)->csum = psum; + + return __skb_gro_checksum_complete(skb); +} + +static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) +{ + if (NAPI_GRO_CB(skb)->csum_cnt > 0) { + /* Consume a checksum from CHECKSUM_UNNECESSARY */ + NAPI_GRO_CB(skb)->csum_cnt--; + } else { + /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we + * verified a new top level checksum or an encapsulated one + * during GRO. This saves work if we fallback to normal path. + */ + __skb_incr_checksum_unnecessary(skb); + } +} + +#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ + compute_pseudo) \ +({ \ + __sum16 __ret = 0; \ + if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ + __ret = __skb_gro_checksum_validate_complete(skb, \ + compute_pseudo(skb, proto)); \ + if (!__ret) \ + skb_gro_incr_csum_unnecessary(skb); \ + __ret; \ +}) + +#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ + __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) + +#define skb_gro_checksum_validate_zero_check(skb, proto, check, \ + compute_pseudo) \ + __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) + +#define skb_gro_checksum_simple_validate(skb) \ + __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) + +static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) +{ + return (NAPI_GRO_CB(skb)->csum_cnt == 0 && + !NAPI_GRO_CB(skb)->csum_valid); +} + +static inline void __skb_gro_checksum_convert(struct sk_buff *skb, + __wsum pseudo) +{ + NAPI_GRO_CB(skb)->csum = ~pseudo; + NAPI_GRO_CB(skb)->csum_valid = 1; +} + +#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \ +do { \ + if (__skb_gro_checksum_convert_check(skb)) \ + __skb_gro_checksum_convert(skb, \ + compute_pseudo(skb, proto)); \ +} while (0) + +struct gro_remcsum { + int offset; + __wsum delta; +}; + +static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) +{ + grc->offset = 0; + grc->delta = 0; +} + +static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, + unsigned int off, size_t hdrlen, + int start, int offset, + struct gro_remcsum *grc, + bool nopartial) +{ + __wsum delta; + size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + + BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); + + if (!nopartial) { + NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; + return ptr; + } + + ptr = skb_gro_header(skb, off + plen, off); + if (!ptr) + return NULL; + + delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, + start, offset); + + /* Adjust 
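
/*
 * Illustrative sketch, not part of the patch: how a transport gro_receive
 * typically drives the validation macros above, mirroring what the TCP/UDP
 * offloads do. example_gro_receive() is a hypothetical name.
 */
static struct sk_buff *example_gro_receive(struct list_head *head,
					   struct sk_buff *skb)
{
	/* Validate the transport checksum against the IPv4 pseudo-header;
	 * on success this either consumes a CHECKSUM_UNNECESSARY level or
	 * records the verified CHECKSUM_COMPLETE value for inner headers.
	 */
	if (skb_gro_checksum_validate(skb, IPPROTO_TCP,
				      inet_gro_compute_pseudo)) {
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	/* ... header pull and flow matching would follow ... */
	return NULL;
}
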
skb->csum since we changed the packet */ + NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); + + grc->offset = off + hdrlen + offset; + grc->delta = delta; + + return ptr; +} + +static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, + struct gro_remcsum *grc) +{ + void *ptr; + size_t plen = grc->offset + sizeof(u16); + + if (!grc->delta) + return; + + ptr = skb_gro_header(skb, plen, grc->offset); + if (!ptr) + return; + + remcsum_unadjust((__sum16 *)ptr, grc->delta); +} + +#ifdef CONFIG_XFRM_OFFLOAD +static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) +{ + if (PTR_ERR(pp) != -EINPROGRESS) + NAPI_GRO_CB(skb)->flush |= flush; +} +static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, + struct sk_buff *pp, + int flush, + struct gro_remcsum *grc) +{ + if (PTR_ERR(pp) != -EINPROGRESS) { + NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_remcsum_cleanup(skb, grc); + skb->remcsum_offload = 0; + } +} +#else +static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) +{ + NAPI_GRO_CB(skb)->flush |= flush; +} +static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, + struct sk_buff *pp, + int flush, + struct gro_remcsum *grc) +{ + NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_remcsum_cleanup(skb, grc); + skb->remcsum_offload = 0; +} +#endif + +INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); + +INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); + +INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, + struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); + +#define indirect_call_gro_receive_inet(cb, f2, f1, head, skb) \ +({ \ + unlikely(gro_recursion_inc_test(skb)) ? \ + NAPI_GRO_CB(skb)->flush |= 1, NULL : \ + INDIRECT_CALL_INET(cb, f2, f1, head, skb); \ +}) + +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, struct sock *sk); +int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); + +static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) +{ + struct udphdr *uh; + unsigned int hlen, off; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*uh); + uh = skb_gro_header(skb, hlen, off); + + return uh; +} + +static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, + int proto) +{ + const struct ipv6hdr *iph = skb_gro_network_header(skb); + + return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb_gro_len(skb), proto, 0)); +} + +static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, + struct sk_buff *p, bool outer) +{ + const u32 id = ntohl(*(__be32 *)&iph->id); + const u32 id2 = ntohl(*(__be32 *)&iph2->id); + const u16 ipid_offset = (id >> 16) - (id2 >> 16); + const u16 count = NAPI_GRO_CB(p)->count; + const u32 df = id & IP_DF; + int flush; + + /* All fields must match except length and checksum. 
*/ + flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF)); + + if (flush | (outer && df)) + return flush; + + /* When we receive our second frame we can make a decision on if we + * continue this flow as an atomic flow with a fixed ID or if we use + * an incrementing ID. + */ + if (count == 1 && df && !ipid_offset) + NAPI_GRO_CB(p)->ip_fixedid = true; + + return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid); +} + +static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) +{ + /* <Version:4><Traffic_Class:8><Flow_Label:20> */ + __be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2; + + /* Flush if Traffic Class fields are different. */ + return !!((first_word & htonl(0x0FF00000)) | + (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); +} + +static inline int __gro_receive_network_flush(const void *th, const void *th2, + struct sk_buff *p, const u16 diff, + bool outer) +{ + const void *nh = th - diff; + const void *nh2 = th2 - diff; + + if (((struct iphdr *)nh)->version == 6) + return ipv6_gro_flush(nh, nh2); + else + return inet_gro_flush(nh, nh2, p, outer); +} + +static inline int gro_receive_network_flush(const void *th, const void *th2, + struct sk_buff *p) +{ + const bool encap_mark = NAPI_GRO_CB(p)->encap_mark; + int off = skb_transport_offset(p); + int flush; + + flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark); + if (encap_mark) + flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false); + + return flush; +} + +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); +void __gro_flush(struct gro_node *gro, bool flush_old); + +static inline void gro_flush(struct gro_node *gro, bool flush_old) +{ + if (!gro->bitmask) + return; + + __gro_flush(gro, flush_old); +} + +static inline void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + gro_flush(&napi->gro, flush_old); +} + +/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ +static inline void gro_normal_list(struct gro_node *gro) +{ + if (!gro->rx_count) + return; + netif_receive_skb_list_internal(&gro->rx_list); + INIT_LIST_HEAD(&gro->rx_list); + gro->rx_count = 0; +} + +/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, + * pass the whole batch up to the stack. + */ +static inline void gro_normal_one(struct gro_node *gro, struct sk_buff *skb, + int segs) +{ + list_add_tail(&skb->list, &gro->rx_list); + gro->rx_count += segs; + if (gro->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) + gro_normal_list(gro); +} + +void gro_init(struct gro_node *gro); +void gro_cleanup(struct gro_node *gro); + +/* This function is the alternative of 'inet_iif' and 'inet_sdif' + * functions in case we can not rely on fields of IPCB. + * + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. + * The caller must hold the RCU read lock. + */ +static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) +{ + *iif = inet_iif(skb) ?: skb->dev->ifindex; + *sdif = 0; + +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (netif_is_l3_slave(skb->dev)) { + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); + + *sdif = *iif; + *iif = master ? master->ifindex : 0; + } +#endif +} + +/* This function is the alternative of 'inet6_iif' and 'inet6_sdif' + * functions in case we can not rely on fields of IP6CB. 
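
/*
 * Worked note on ipv6_gro_flush() above (sketch, not from the patch): the
 * first 32 bits of an IPv6 header are <Version:4><Traffic Class:8>
 * <Flow Label:20>, so XOR-ing the first words of two headers and masking
 * with htonl(0x0FF00000) isolates exactly the Traffic Class bits --
 * Version is constant and the Flow Label is deliberately ignored when
 * deciding whether two packets may be aggregated.
 */
static inline bool example_ipv6_tclass_differs(const struct ipv6hdr *a,
					       const struct ipv6hdr *b)
{
	__be32 xored = *(const __be32 *)a ^ *(const __be32 *)b;

	return !!(xored & htonl(0x0FF00000));
}
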
+ * + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. + * The caller must hold the RCU read lock. + */ +static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) +{ + /* using skb->dev->ifindex because skb_dst(skb) is not initialized */ + *iif = skb->dev->ifindex; + *sdif = 0; + +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (netif_is_l3_slave(skb->dev)) { + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); + + *sdif = *iif; + *iif = master ? master->ifindex : 0; + } +#endif +} + +struct packet_offload *gro_find_receive_by_type(__be16 type); +struct packet_offload *gro_find_complete_by_type(__be16 type); + +#endif /* _NET_GRO_H */ diff --git a/include/net/gso.h b/include/net/gso.h new file mode 100644 index 000000000000..29975440cad5 --- /dev/null +++ b/include/net/gso.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_GSO_H +#define _NET_GSO_H + +#include <linux/skbuff.h> + +/* Keeps track of mac header offset relative to skb->head. + * It is useful for TSO of Tunneling protocol. e.g. GRE. + * For non-tunnel skb it points to skb_mac_header() and for + * tunnel skb it points to outer mac header. + * Keeps track of level of encapsulation of network headers. + */ +struct skb_gso_cb { + union { + int mac_offset; + int data_offset; + }; + int encap_level; + __wsum csum; + __u16 csum_start; +}; +#define SKB_GSO_CB_OFFSET 32 +#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET)) + +static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) +{ + return (skb_mac_header(inner_skb) - inner_skb->head) - + SKB_GSO_CB(inner_skb)->mac_offset; +} + +static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) +{ + int new_headroom, headroom; + int ret; + + headroom = skb_headroom(skb); + ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); + if (ret) + return ret; + + new_headroom = skb_headroom(skb); + SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); + return 0; +} + +static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res) +{ + /* Do not update partial checksums if remote checksum is enabled. */ + if (skb->remcsum_offload) + return; + + SKB_GSO_CB(skb)->csum = res; + SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head; +} + +/* Compute the checksum for a gso segment. First compute the checksum value + * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and + * then add in skb->csum (checksum from csum_start to end of packet). + * skb->csum and csum_start are then updated to reflect the checksum of the + * resultant packet starting from the transport header-- the resultant checksum + * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo + * header. 
+ */ +static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) +{ + unsigned char *csum_start = skb_transport_header(skb); + int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start; + __wsum partial = SKB_GSO_CB(skb)->csum; + + SKB_GSO_CB(skb)->csum = res; + SKB_GSO_CB(skb)->csum_start = csum_start - skb->head; + + return csum_fold(csum_partial(csum_start, plen, partial)); +} + +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path); + +static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + return __skb_gso_segment(skb, features, true); +} + +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type); + +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features); + +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); + +bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); + +static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, + int pulled_hlen, u16 mac_offset, + int mac_len) +{ + skb->protocol = protocol; + skb->encapsulation = 1; + skb_push(skb, pulled_hlen); + skb_reset_transport_header(skb); + skb->mac_header = mac_offset; + skb->network_header = skb->mac_header + mac_len; + skb->mac_len = mac_len; +} + +#endif /* _NET_GSO_H */ diff --git a/include/net/gtp.h b/include/net/gtp.h index 0e16ebb2a82d..c0253c8702d0 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -2,13 +2,22 @@ #ifndef _GTP_H_ #define _GTP_H_ +#include <linux/netdevice.h> +#include <linux/types.h> +#include <net/rtnetlink.h> + /* General GTP protocol related definitions. */ #define GTP0_PORT 3386 #define GTP1U_PORT 2152 +/* GTP messages types */ +#define GTP_ECHO_REQ 1 /* Echo Request */ +#define GTP_ECHO_RSP 2 /* Echo Response */ #define GTP_TPDU 255 +#define GTPIE_RECOVERY 14 + struct gtp0_header { /* According to GSM TS 09.60. */ __u8 flags; __u8 type; @@ -27,9 +36,51 @@ struct gtp1_header { /* According to 3GPP TS 29.060. */ __be32 tid; } __attribute__ ((packed)); +struct gtp1_header_long { /* According to 3GPP TS 29.060. */ + __u8 flags; + __u8 type; + __be16 length; + __be32 tid; + __be16 seq; + __u8 npdu; + __u8 next; +} __packed; + +/* GTP Information Element */ +struct gtp_ie { + __u8 tag; + __u8 val; +} __packed; + +struct gtp0_packet { + struct gtp0_header gtp0_h; + struct gtp_ie ie; +} __packed; + +struct gtp1u_packet { + struct gtp1_header_long gtp1u_h; + struct gtp_ie ie; +} __packed; + +struct gtp_pdu_session_info { /* According to 3GPP TS 38.415. */ + u8 pdu_type; + u8 qfi; +}; + +static inline bool netif_is_gtp(const struct net_device *dev) +{ + return dev->rtnl_link_ops && + !strcmp(dev->rtnl_link_ops->kind, "gtp"); +} + #define GTP1_F_NPDU 0x01 #define GTP1_F_SEQ 0x02 #define GTP1_F_EXTHDR 0x04 #define GTP1_F_MASK 0x07 +struct gtp_ext_hdr { + __u8 len; + __u8 data[]; +}; + #endif diff --git a/include/net/gue.h b/include/net/gue.h index e42402f180b7..dfca298bec9c 100644 --- a/include/net/gue.h +++ b/include/net/gue.h @@ -30,6 +30,9 @@ * may refer to options placed after this field. */ +#include <asm/byteorder.h> +#include <linux/types.h> + struct guehdr { union { struct { diff --git a/include/net/handshake.h b/include/net/handshake.h new file mode 100644 index 000000000000..8ebd4f9ed26e --- /dev/null +++ b/include/net/handshake.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generic netlink HANDSHAKE service. 
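
/*
 * Illustrative sketch, not part of the patch: the common consumer pattern
 * for skb_gso_segment() above -- segment, then transmit each resulting
 * skb. A NULL return means no segmentation was needed and the original
 * skb is still usable; example_xmit() is a hypothetical output hook that
 * consumes the skbs it is given.
 */
static int example_segment_and_xmit(struct sk_buff *skb,
				    netdev_features_t features,
				    int (*example_xmit)(struct sk_buff *))
{
	struct sk_buff *segs, *seg, *next;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return example_xmit(skb);	/* no segmentation needed */

	consume_skb(skb);	/* ownership moved to the segment list */

	skb_list_walk_safe(segs, seg, next) {
		skb_mark_not_on_list(seg);
		example_xmit(seg);
	}
	return 0;
}
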
+ * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2023, Oracle and/or its affiliates. + */ + +#ifndef _NET_HANDSHAKE_H +#define _NET_HANDSHAKE_H + +enum { + TLS_NO_KEYRING = 0, + TLS_NO_PEERID = 0, + TLS_NO_CERT = 0, + TLS_NO_PRIVKEY = 0, +}; + +typedef void (*tls_done_func_t)(void *data, int status, + key_serial_t peerid); + +struct tls_handshake_args { + struct socket *ta_sock; + tls_done_func_t ta_done; + void *ta_data; + const char *ta_peername; + unsigned int ta_timeout_ms; + key_serial_t ta_keyring; + key_serial_t ta_my_cert; + key_serial_t ta_my_privkey; + unsigned int ta_num_peerids; + key_serial_t ta_my_peerids[5]; +}; + +int tls_client_hello_anon(const struct tls_handshake_args *args, gfp_t flags); +int tls_client_hello_x509(const struct tls_handshake_args *args, gfp_t flags); +int tls_client_hello_psk(const struct tls_handshake_args *args, gfp_t flags); +int tls_server_hello_x509(const struct tls_handshake_args *args, gfp_t flags); +int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags); + +bool tls_handshake_cancel(struct sock *sk); +void tls_handshake_close(struct socket *sock); + +u8 tls_get_record_type(const struct sock *sk, const struct cmsghdr *msg); +void tls_alert_recv(const struct sock *sk, const struct msghdr *msg, + u8 *level, u8 *description); + +#endif /* _NET_HANDSHAKE_H */ diff --git a/include/net/hotdata.h b/include/net/hotdata.h new file mode 100644 index 000000000000..fda94b2647ff --- /dev/null +++ b/include/net/hotdata.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_HOTDATA_H +#define _NET_HOTDATA_H + +#include <linux/types.h> +#include <linux/netdevice.h> +#include <net/protocol.h> + +/* Read mostly data used in network fast paths. */ +struct net_hotdata { +#if IS_ENABLED(CONFIG_INET) + struct packet_offload ip_packet_offload; + struct net_offload tcpv4_offload; + struct net_protocol tcp_protocol; + struct net_offload udpv4_offload; + struct net_protocol udp_protocol; + struct packet_offload ipv6_packet_offload; + struct net_offload tcpv6_offload; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_protocol tcpv6_protocol; + struct inet6_protocol udpv6_protocol; +#endif + struct net_offload udpv6_offload; +#endif + struct list_head offload_base; + struct kmem_cache *skbuff_cache; + struct kmem_cache *skbuff_fclone_cache; + struct kmem_cache *skb_small_head_cache; +#ifdef CONFIG_RPS + struct rps_sock_flow_table __rcu *rps_sock_flow_table; + u32 rps_cpu_mask; +#endif + int gro_normal_batch; + int netdev_budget; + int netdev_budget_usecs; + int tstamp_prequeue; + int max_backlog; + int dev_tx_weight; + int dev_rx_weight; + int sysctl_max_skb_frags; + int sysctl_skb_defer_max; + int sysctl_mem_pcpu_rsv; +}; + +#define inet_ehash_secret net_hotdata.tcp_protocol.secret +#define udp_ehash_secret net_hotdata.udp_protocol.secret +#define inet6_ehash_secret net_hotdata.tcpv6_protocol.secret +#define tcp_ipv6_hash_secret net_hotdata.tcpv6_offload.secret +#define udp6_ehash_secret net_hotdata.udpv6_protocol.secret +#define udp_ipv6_hash_secret net_hotdata.udpv6_offload.secret + +extern struct net_hotdata net_hotdata; + +#endif /* _NET_HOTDATA_H */ diff --git a/include/net/hwbm.h b/include/net/hwbm.h index c81444611a22..bdbe91c609ff 100644 --- a/include/net/hwbm.h +++ b/include/net/hwbm.h @@ -2,6 +2,8 @@ #ifndef _HWBM_H #define _HWBM_H +#include <linux/mutex.h> + struct hwbm_pool { /* Capacity of the pool */ int size; @@ -9,9 +11,9 @@ struct hwbm_pool { int frag_size; /* Number of buffers currently 
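
/*
 * Illustrative sketch, not part of the patch: a minimal x509 client-hello
 * request against the handshake API declared above. The done callback,
 * timeout, and the keyring/cert key serials are placeholders; a real
 * consumer (e.g. an in-kernel RPC transport) supplies its own.
 */
static void example_handshake_done(void *data, int status,
				   key_serial_t peerid)
{
	/* runs when user space completes (or fails) the handshake */
}

static int example_start_tls(struct socket *sock, key_serial_t keyring,
			     key_serial_t cert, key_serial_t privkey)
{
	struct tls_handshake_args args = {
		.ta_sock	= sock,
		.ta_done	= example_handshake_done,
		.ta_data	= NULL,
		.ta_timeout_ms	= 5 * MSEC_PER_SEC,
		.ta_keyring	= keyring,
		.ta_my_cert	= cert,
		.ta_my_privkey	= privkey,
	};

	return tls_client_hello_x509(&args, GFP_KERNEL);
}
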
used by this pool */ int buf_num; - /* constructor called during alocation */ + /* constructor called during allocation */ int (*construct)(struct hwbm_pool *bm_pool, void *buf); - /* protect acces to the buffer counter*/ + /* protect access to the buffer counter*/ struct mutex buf_lock; /* private data */ void *priv; diff --git a/include/net/icmp.h b/include/net/icmp.h index 9ac2d2672a93..caddf4a59ad1 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -46,12 +46,17 @@ static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 #if IS_ENABLED(CONFIG_NF_NAT) void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); #else -#define icmp_ndo_send icmp_send +static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + struct ip_options opts = { 0 }; + __icmp_send(skb_in, type, code, info, &opts); +} #endif int icmp_rcv(struct sk_buff *skb); int icmp_err(struct sk_buff *skb, u32 info); int icmp_init(void); void icmp_out_count(struct net *net, unsigned char type); +bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr); #endif /* _ICMP_H */ diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index 19c00d100096..813e163ce27c 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2017 Intel Deutschland GmbH - * Copyright (c) 2018-2019 Intel Corporation + * Copyright (c) 2018-2019, 2021-2022 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -18,33 +18,43 @@ #define __RADIOTAP_H #include <linux/kernel.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /** - * struct ieee82011_radiotap_header - base radiotap header + * struct ieee80211_radiotap_header - base radiotap header */ struct ieee80211_radiotap_header { - /** - * @it_version: radiotap version, always 0 - */ - uint8_t it_version; - - /** - * @it_pad: padding (or alignment) - */ - uint8_t it_pad; + __struct_group(ieee80211_radiotap_header_fixed, hdr, __packed, + /** + * @it_version: radiotap version, always 0 + */ + uint8_t it_version; + + /** + * @it_pad: padding (or alignment) + */ + uint8_t it_pad; + + /** + * @it_len: overall radiotap header length + */ + __le16 it_len; + + /** + * @it_present: (first) present word + */ + __le32 it_present; + ); /** - * @it_len: overall radiotap header length + * @it_optional: all remaining presence bitmaps */ - __le16 it_len; - - /** - * @it_present: (first) present word - */ - __le32 it_present; + __le32 it_optional[]; } __packed; +static_assert(offsetof(struct ieee80211_radiotap_header, it_optional) == sizeof(struct ieee80211_radiotap_header_fixed), + "struct member likely outside of __struct_group()"); + /* version is always 0 */ #define PKTHDR_RADIOTAP_VERSION 0 @@ -77,11 +87,14 @@ enum ieee80211_radiotap_presence { IEEE80211_RADIOTAP_HE_MU = 24, IEEE80211_RADIOTAP_ZERO_LEN_PSDU = 26, IEEE80211_RADIOTAP_LSIG = 27, + IEEE80211_RADIOTAP_TLV = 28, /* valid in every it_present bitmap, even vendor namespaces */ IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE = 29, IEEE80211_RADIOTAP_VENDOR_NAMESPACE = 30, - IEEE80211_RADIOTAP_EXT = 31 + IEEE80211_RADIOTAP_EXT = 31, + IEEE80211_RADIOTAP_EHT_USIG = 33, + IEEE80211_RADIOTAP_EHT = 34, }; /* for IEEE80211_RADIOTAP_FLAGS */ @@ -118,6 +131,7 @@ enum ieee80211_radiotap_tx_flags { IEEE80211_RADIOTAP_F_TX_RTS = 0x0004, IEEE80211_RADIOTAP_F_TX_NOACK = 0x0008, 
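
/*
 * Illustrative sketch, not part of the patch: with the fixed header split
 * out by __struct_group() above, the presence words are contiguous --
 * it_present first, then it_optional[] (the static_assert guarantees the
 * layout) -- so the chain can be walked by following bit 31
 * (IEEE80211_RADIOTAP_EXT) of each word. A real parser must also bound
 * the walk by it_len; the example_* name is hypothetical.
 */
static inline int
example_radiotap_n_present_words(const struct ieee80211_radiotap_header *hdr)
{
	const __le32 *word = &hdr->it_present;
	int n = 1;

	while (le32_to_cpu(*word) & BIT(IEEE80211_RADIOTAP_EXT)) {
		word++;		/* steps into it_optional[] */
		n++;
	}
	return n;
}
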
IEEE80211_RADIOTAP_F_TX_NOSEQNO = 0x0010, + IEEE80211_RADIOTAP_F_TX_ORDER = 0x0020, }; /* for IEEE80211_RADIOTAP_MCS "have" flags */ @@ -354,12 +368,230 @@ enum ieee80211_radiotap_zero_len_psdu_type { IEEE80211_RADIOTAP_ZERO_LEN_PSDU_VENDOR = 0xff, }; +struct ieee80211_radiotap_tlv { + __le16 type; + __le16 len; + u8 data[]; +} __packed; + +/** + * struct ieee80211_radiotap_vendor_content - radiotap vendor data content + * @oui: radiotap vendor namespace OUI + * @oui_subtype: radiotap vendor sub namespace + * @vendor_type: radiotap vendor type + * @reserved: should always be set to zero (to avoid leaking memory) + * @data: the actual vendor namespace data + */ +struct ieee80211_radiotap_vendor_content { + u8 oui[3]; + u8 oui_subtype; + __le16 vendor_type; + __le16 reserved; + u8 data[]; +} __packed; + +/** + * struct ieee80211_radiotap_vendor_tlv - vendor radiotap data information + * @type: should always be set to IEEE80211_RADIOTAP_VENDOR_NAMESPACE + * @len: length of data + * @content: vendor content see @ieee80211_radiotap_vendor_content + */ +struct ieee80211_radiotap_vendor_tlv { + __le16 type; /* IEEE80211_RADIOTAP_VENDOR_NAMESPACE */ + __le16 len; + struct ieee80211_radiotap_vendor_content content; +}; + +/* ieee80211_radiotap_eht_usig - content of U-SIG tlv (type 33) + * see www.radiotap.org/fields/U-SIG.html for details + */ +struct ieee80211_radiotap_eht_usig { + __le32 common; + __le32 value; + __le32 mask; +} __packed; + +/* ieee80211_radiotap_eht - content of EHT tlv (type 34) + * see www.radiotap.org/fields/EHT.html for details + */ +struct ieee80211_radiotap_eht { + __le32 known; + __le32 data[9]; + __le32 user_info[]; +} __packed; + +/* Known field for EHT TLV + * The ending defines for what the field applies as following + * O - OFDMA (including TB), M - MU-MIMO, S - EHT sounding. 
+ */ +enum ieee80211_radiotap_eht_known { + IEEE80211_RADIOTAP_EHT_KNOWN_SPATIAL_REUSE = 0x00000002, + IEEE80211_RADIOTAP_EHT_KNOWN_GI = 0x00000004, + IEEE80211_RADIOTAP_EHT_KNOWN_EHT_LTF = 0x00000010, + IEEE80211_RADIOTAP_EHT_KNOWN_LDPC_EXTRA_SYM_OM = 0x00000020, + IEEE80211_RADIOTAP_EHT_KNOWN_PRE_PADD_FACOR_OM = 0x00000040, + IEEE80211_RADIOTAP_EHT_KNOWN_PE_DISAMBIGUITY_OM = 0x00000080, + IEEE80211_RADIOTAP_EHT_KNOWN_DISREGARD_O = 0x00000100, + IEEE80211_RADIOTAP_EHT_KNOWN_DISREGARD_S = 0x00000200, + IEEE80211_RADIOTAP_EHT_KNOWN_CRC1 = 0x00002000, + IEEE80211_RADIOTAP_EHT_KNOWN_TAIL1 = 0x00004000, + IEEE80211_RADIOTAP_EHT_KNOWN_CRC2_O = 0x00008000, + IEEE80211_RADIOTAP_EHT_KNOWN_TAIL2_O = 0x00010000, + IEEE80211_RADIOTAP_EHT_KNOWN_NSS_S = 0x00020000, + IEEE80211_RADIOTAP_EHT_KNOWN_BEAMFORMED_S = 0x00040000, + IEEE80211_RADIOTAP_EHT_KNOWN_NR_NON_OFDMA_USERS_M = 0x00080000, + IEEE80211_RADIOTAP_EHT_KNOWN_ENCODING_BLOCK_CRC_M = 0x00100000, + IEEE80211_RADIOTAP_EHT_KNOWN_ENCODING_BLOCK_TAIL_M = 0x00200000, + IEEE80211_RADIOTAP_EHT_KNOWN_RU_MRU_SIZE_OM = 0x00400000, + IEEE80211_RADIOTAP_EHT_KNOWN_RU_MRU_INDEX_OM = 0x00800000, + IEEE80211_RADIOTAP_EHT_KNOWN_RU_ALLOC_TB_FMT = 0x01000000, + IEEE80211_RADIOTAP_EHT_KNOWN_PRIMARY_80 = 0x02000000, +}; + +enum ieee80211_radiotap_eht_data { + /* Data 0 */ + IEEE80211_RADIOTAP_EHT_DATA0_SPATIAL_REUSE = 0x00000078, + IEEE80211_RADIOTAP_EHT_DATA0_GI = 0x00000180, + IEEE80211_RADIOTAP_EHT_DATA0_LTF = 0x00000600, + IEEE80211_RADIOTAP_EHT_DATA0_EHT_LTF = 0x00003800, + IEEE80211_RADIOTAP_EHT_DATA0_LDPC_EXTRA_SYM_OM = 0x00004000, + IEEE80211_RADIOTAP_EHT_DATA0_PRE_PADD_FACOR_OM = 0x00018000, + IEEE80211_RADIOTAP_EHT_DATA0_PE_DISAMBIGUITY_OM = 0x00020000, + IEEE80211_RADIOTAP_EHT_DATA0_DISREGARD_S = 0x000c0000, + IEEE80211_RADIOTAP_EHT_DATA0_DISREGARD_O = 0x003c0000, + IEEE80211_RADIOTAP_EHT_DATA0_CRC1_O = 0x03c00000, + IEEE80211_RADIOTAP_EHT_DATA0_TAIL1_O = 0xfc000000, + /* Data 1 */ + IEEE80211_RADIOTAP_EHT_DATA1_RU_SIZE = 0x0000001f, + IEEE80211_RADIOTAP_EHT_DATA1_RU_INDEX = 0x00001fe0, + IEEE80211_RADIOTAP_EHT_DATA1_RU_ALLOC_CC_1_1_1 = 0x003fe000, + IEEE80211_RADIOTAP_EHT_DATA1_RU_ALLOC_CC_1_1_1_KNOWN = 0x00400000, + IEEE80211_RADIOTAP_EHT_DATA1_PRIMARY_80 = 0xc0000000, + /* Data 2 */ + IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_1 = 0x000001ff, + IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_1_KNOWN = 0x00000200, + IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_1_1_2 = 0x0007fc00, + IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_1_1_2_KNOWN = 0x00080000, + IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_2 = 0x1ff00000, + IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_2_KNOWN = 0x20000000, + /* Data 3 */ + IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_1 = 0x000001ff, + IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_1_KNOWN = 0x00000200, + IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_2_2_1 = 0x0007fc00, + IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_2_2_1_KNOWN = 0x00080000, + IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_2 = 0x1ff00000, + IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_2_KNOWN = 0x20000000, + /* Data 4 */ + IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_2 = 0x000001ff, + IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_2_KNOWN = 0x00000200, + IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_1_2_3 = 0x0007fc00, + IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_1_2_3_KNOWN = 0x00080000, + IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_3 = 0x1ff00000, + IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_3_KNOWN = 0x20000000, + /* Data 5 */ + IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_4 = 0x000001ff, 
+ IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_4_KNOWN = 0x00000200, + IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_2_2_4 = 0x0007fc00, + IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_2_2_4_KNOWN = 0x00080000, + IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_5 = 0x1ff00000, + IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_5_KNOWN = 0x20000000, + /* Data 6 */ + IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_5 = 0x000001ff, + IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_5_KNOWN = 0x00000200, + IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_1_2_6 = 0x0007fc00, + IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_1_2_6_KNOWN = 0x00080000, + IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_6 = 0x1ff00000, + IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_6_KNOWN = 0x20000000, + /* Data 7 */ + IEEE80211_RADIOTAP_EHT_DATA7_CRC2_O = 0x0000000f, + IEEE80211_RADIOTAP_EHT_DATA7_TAIL_2_O = 0x000003f0, + IEEE80211_RADIOTAP_EHT_DATA7_NSS_S = 0x0000f000, + IEEE80211_RADIOTAP_EHT_DATA7_BEAMFORMED_S = 0x00010000, + IEEE80211_RADIOTAP_EHT_DATA7_NUM_OF_NON_OFDMA_USERS = 0x000e0000, + IEEE80211_RADIOTAP_EHT_DATA7_USER_ENCODING_BLOCK_CRC = 0x00f00000, + IEEE80211_RADIOTAP_EHT_DATA7_USER_ENCODING_BLOCK_TAIL = 0x3f000000, + /* Data 8 */ + IEEE80211_RADIOTAP_EHT_DATA8_RU_ALLOC_TB_FMT_PS_160 = 0x00000001, + IEEE80211_RADIOTAP_EHT_DATA8_RU_ALLOC_TB_FMT_B0 = 0x00000002, + IEEE80211_RADIOTAP_EHT_DATA8_RU_ALLOC_TB_FMT_B7_B1 = 0x000001fc, +}; + +enum ieee80211_radiotap_eht_user_info { + IEEE80211_RADIOTAP_EHT_USER_INFO_STA_ID_KNOWN = 0x00000001, + IEEE80211_RADIOTAP_EHT_USER_INFO_MCS_KNOWN = 0x00000002, + IEEE80211_RADIOTAP_EHT_USER_INFO_CODING_KNOWN = 0x00000004, + IEEE80211_RADIOTAP_EHT_USER_INFO_NSS_KNOWN_O = 0x00000010, + IEEE80211_RADIOTAP_EHT_USER_INFO_BEAMFORMING_KNOWN_O = 0x00000020, + IEEE80211_RADIOTAP_EHT_USER_INFO_SPATIAL_CONFIG_KNOWN_M = 0x00000040, + IEEE80211_RADIOTAP_EHT_USER_INFO_DATA_FOR_USER = 0x00000080, + IEEE80211_RADIOTAP_EHT_USER_INFO_STA_ID = 0x0007ff00, + IEEE80211_RADIOTAP_EHT_USER_INFO_CODING = 0x00080000, + IEEE80211_RADIOTAP_EHT_USER_INFO_MCS = 0x00f00000, + IEEE80211_RADIOTAP_EHT_USER_INFO_NSS_O = 0x0f000000, + IEEE80211_RADIOTAP_EHT_USER_INFO_BEAMFORMING_O = 0x20000000, + IEEE80211_RADIOTAP_EHT_USER_INFO_SPATIAL_CONFIG_M = 0x3f000000, + IEEE80211_RADIOTAP_EHT_USER_INFO_RESEVED_c0000000 = 0xc0000000, +}; + +enum ieee80211_radiotap_eht_usig_common { + IEEE80211_RADIOTAP_EHT_USIG_COMMON_PHY_VER_KNOWN = 0x00000001, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_KNOWN = 0x00000002, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_UL_DL_KNOWN = 0x00000004, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BSS_COLOR_KNOWN = 0x00000008, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_TXOP_KNOWN = 0x00000010, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BAD_USIG_CRC = 0x00000020, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_VALIDATE_BITS_CHECKED = 0x00000040, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_VALIDATE_BITS_OK = 0x00000080, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_PHY_VER = 0x00007000, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW = 0x00038000, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_20MHZ = 0, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_40MHZ = 1, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_80MHZ = 2, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_160MHZ = 3, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_320MHZ_1 = 4, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_320MHZ_2 = 5, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_UL_DL = 0x00040000, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_BSS_COLOR = 0x01f80000, + IEEE80211_RADIOTAP_EHT_USIG_COMMON_TXOP = 0xfe000000, +}; + +enum ieee80211_radiotap_eht_usig_mu { + /* MU-USIG-1 */ + 
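
/*
 * Illustrative sketch, not part of the patch: the *_USIG_COMMON_* values
 * above are bit masks over the __le32 "common" word of
 * struct ieee80211_radiotap_eht_usig, so extracting a field is a single
 * le32_get_bits() call (<linux/bitfield.h>). example_usig_bw() is a
 * hypothetical name.
 */
static inline u32
example_usig_bw(const struct ieee80211_radiotap_eht_usig *usig)
{
	/* yields e.g. IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_160MHZ (3) */
	return le32_get_bits(usig->common,
			     IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW);
}
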
IEEE80211_RADIOTAP_EHT_USIG1_MU_B20_B24_DISREGARD = 0x0000001f, + IEEE80211_RADIOTAP_EHT_USIG1_MU_B25_VALIDATE = 0x00000020, + /* MU-USIG-2 */ + IEEE80211_RADIOTAP_EHT_USIG2_MU_B0_B1_PPDU_TYPE = 0x000000c0, + IEEE80211_RADIOTAP_EHT_USIG2_MU_B2_VALIDATE = 0x00000100, + IEEE80211_RADIOTAP_EHT_USIG2_MU_B3_B7_PUNCTURED_INFO = 0x00003e00, + IEEE80211_RADIOTAP_EHT_USIG2_MU_B8_VALIDATE = 0x00004000, + IEEE80211_RADIOTAP_EHT_USIG2_MU_B9_B10_SIG_MCS = 0x00018000, + IEEE80211_RADIOTAP_EHT_USIG2_MU_B11_B15_EHT_SIG_SYMBOLS = 0x003e0000, + IEEE80211_RADIOTAP_EHT_USIG2_MU_B16_B19_CRC = 0x03c00000, + IEEE80211_RADIOTAP_EHT_USIG2_MU_B20_B25_TAIL = 0xfc000000, +}; + +enum ieee80211_radiotap_eht_usig_tb { + /* TB-USIG-1 */ + IEEE80211_RADIOTAP_EHT_USIG1_TB_B20_B25_DISREGARD = 0x0000001f, + + /* TB-USIG-2 */ + IEEE80211_RADIOTAP_EHT_USIG2_TB_B0_B1_PPDU_TYPE = 0x000000c0, + IEEE80211_RADIOTAP_EHT_USIG2_TB_B2_VALIDATE = 0x00000100, + IEEE80211_RADIOTAP_EHT_USIG2_TB_B3_B6_SPATIAL_REUSE_1 = 0x00001e00, + IEEE80211_RADIOTAP_EHT_USIG2_TB_B7_B10_SPATIAL_REUSE_2 = 0x0001e000, + IEEE80211_RADIOTAP_EHT_USIG2_TB_B11_B15_DISREGARD = 0x003e0000, + IEEE80211_RADIOTAP_EHT_USIG2_TB_B16_B19_CRC = 0x03c00000, + IEEE80211_RADIOTAP_EHT_USIG2_TB_B20_B25_TAIL = 0xfc000000, +}; + /** * ieee80211_get_radiotap_len - get radiotap header length + * @data: pointer to the header + * Return: the radiotap header length */ static inline u16 ieee80211_get_radiotap_len(const char *data) { - struct ieee80211_radiotap_header *hdr = (void *)data; + const struct ieee80211_radiotap_header *hdr = (const void *)data; return get_unaligned_le16(&hdr->it_len); } diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h index d0d188c3294b..4de858f9929e 100644 --- a/include/net/ieee802154_netdev.h +++ b/include/net/ieee802154_netdev.h @@ -15,6 +15,22 @@ #ifndef IEEE802154_NETDEVICE_H #define IEEE802154_NETDEVICE_H +#define IEEE802154_REQUIRED_SIZE(struct_type, member) \ + (offsetof(typeof(struct_type), member) + \ + sizeof(((typeof(struct_type) *)(NULL))->member)) + +#define IEEE802154_ADDR_OFFSET \ + offsetof(typeof(struct sockaddr_ieee802154), addr) + +#define IEEE802154_MIN_NAMELEN (IEEE802154_ADDR_OFFSET + \ + IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, addr_type)) + +#define IEEE802154_NAMELEN_SHORT (IEEE802154_ADDR_OFFSET + \ + IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, short_addr)) + +#define IEEE802154_NAMELEN_LONG (IEEE802154_ADDR_OFFSET + \ + IEEE802154_REQUIRED_SIZE(struct ieee802154_addr_sa, hwaddr)) + #include <net/af_ieee802154.h> #include <linux/netdevice.h> #include <linux/skbuff.h> @@ -22,6 +38,46 @@ #include <net/cfg802154.h> +struct ieee802154_beacon_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u16 beacon_order:4, + superframe_order:4, + final_cap_slot:4, + battery_life_ext:1, + reserved0:1, + pan_coordinator:1, + assoc_permit:1; + u8 gts_count:3, + gts_reserved:4, + gts_permit:1; + u8 pend_short_addr_count:3, + reserved1:1, + pend_ext_addr_count:3, + reserved2:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + u16 assoc_permit:1, + pan_coordinator:1, + reserved0:1, + battery_life_ext:1, + final_cap_slot:4, + superframe_order:4, + beacon_order:4; + u8 gts_permit:1, + gts_reserved:4, + gts_count:3; + u8 reserved2:1, + pend_ext_addr_count:3, + reserved1:1, + pend_short_addr_count:3; +#else +#error "Please fix <asm/byteorder.h>" +#endif +} __packed; + +struct ieee802154_mac_cmd_pl { + u8 cmd_id; +} __packed; + struct ieee802154_sechdr { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 level:3, @@ -69,6 +125,63 @@ 
struct ieee802154_hdr_fc { #endif }; +struct ieee802154_assoc_req_pl { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved1:1, + device_type:1, + power_source:1, + rx_on_when_idle:1, + assoc_type:1, + reserved2:1, + security_cap:1, + alloc_addr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 alloc_addr:1, + security_cap:1, + reserved2:1, + assoc_type:1, + rx_on_when_idle:1, + power_source:1, + device_type:1, + reserved1:1; +#else +#error "Please fix <asm/byteorder.h>" +#endif +} __packed; + +struct ieee802154_assoc_resp_pl { + __le16 short_addr; + u8 status; +} __packed; + +enum ieee802154_frame_version { + IEEE802154_2003_STD, + IEEE802154_2006_STD, + IEEE802154_STD, + IEEE802154_RESERVED_STD, + IEEE802154_MULTIPURPOSE_STD = IEEE802154_2003_STD, +}; + +enum ieee802154_addressing_mode { + IEEE802154_NO_ADDRESSING, + IEEE802154_RESERVED, + IEEE802154_SHORT_ADDRESSING, + IEEE802154_EXTENDED_ADDRESSING, +}; + +enum ieee802154_association_status { + IEEE802154_ASSOCIATION_SUCCESSFUL = 0x00, + IEEE802154_PAN_AT_CAPACITY = 0x01, + IEEE802154_PAN_ACCESS_DENIED = 0x02, + IEEE802154_HOPPING_SEQUENCE_OFFSET_DUP = 0x03, + IEEE802154_FAST_ASSOCIATION_SUCCESSFUL = 0x80, +}; + +enum ieee802154_disassociation_reason { + IEEE802154_COORD_WISHES_DEVICE_TO_LEAVE = 0x1, + IEEE802154_DEVICE_WISHES_TO_LEAVE = 0x2, +}; + struct ieee802154_hdr { struct ieee802154_hdr_fc fc; u8 seq; @@ -77,6 +190,39 @@ struct ieee802154_hdr { struct ieee802154_sechdr sec; }; +struct ieee802154_beacon_frame { + struct ieee802154_hdr mhr; + struct ieee802154_beacon_hdr mac_pl; +}; + +struct ieee802154_mac_cmd_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; +}; + +struct ieee802154_beacon_req_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; +}; + +struct ieee802154_association_req_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; + struct ieee802154_assoc_req_pl assoc_req_pl; +}; + +struct ieee802154_association_resp_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; + struct ieee802154_assoc_resp_pl assoc_resp_pl; +}; + +struct ieee802154_disassociation_notif_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; + u8 disassoc_pl; +}; + /* pushes hdr onto the skb. fields of hdr->fc that can be calculated from * the contents of hdr will be, and the actual value of those bits in * hdr->fc will be ignored. 
this includes the INTRA_PAN bit and the frame @@ -102,6 +248,14 @@ int ieee802154_hdr_peek_addrs(const struct sk_buff *skb, */ int ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr); +/* pushes/pulls various frame types into/from an skb */ +int ieee802154_beacon_push(struct sk_buff *skb, + struct ieee802154_beacon_frame *beacon); +int ieee802154_mac_cmd_push(struct sk_buff *skb, void *frame, + const void *pl, unsigned int pl_len); +int ieee802154_mac_cmd_pl_pull(struct sk_buff *skb, + struct ieee802154_mac_cmd_pl *mac_pl); + int ieee802154_max_payload(const struct ieee802154_hdr *hdr); static inline int @@ -165,6 +319,33 @@ static inline void ieee802154_devaddr_to_raw(void *raw, __le64 addr) memcpy(raw, &temp, IEEE802154_ADDR_LEN); } +static inline int +ieee802154_sockaddr_check_size(struct sockaddr_ieee802154 *daddr, int len) +{ + struct ieee802154_addr_sa *sa; + int ret = 0; + + sa = &daddr->addr; + if (len < IEEE802154_MIN_NAMELEN) + return -EINVAL; + switch (sa->addr_type) { + case IEEE802154_ADDR_NONE: + break; + case IEEE802154_ADDR_SHORT: + if (len < IEEE802154_NAMELEN_SHORT) + ret = -EINVAL; + break; + case IEEE802154_ADDR_LONG: + if (len < IEEE802154_NAMELEN_LONG) + ret = -EINVAL; + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + static inline void ieee802154_addr_from_sa(struct ieee802154_addr *a, const struct ieee802154_addr_sa *sa) { diff --git a/include/net/ieee8021q.h b/include/net/ieee8021q.h new file mode 100644 index 000000000000..8bfe903dd3d0 --- /dev/null +++ b/include/net/ieee8021q.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> */ + +#ifndef _NET_IEEE8021Q_H +#define _NET_IEEE8021Q_H + +#include <linux/errno.h> + +/** + * enum ieee8021q_traffic_type - 802.1Q traffic type priority values (802.1Q-2022) + * + * @IEEE8021Q_TT_BK: Background + * @IEEE8021Q_TT_BE: Best Effort (default). According to 802.1Q-2022, BE is 0 + * but has higher priority than BK which is 1. 
+ * @IEEE8021Q_TT_EE: Excellent Effort + * @IEEE8021Q_TT_CA: Critical Applications + * @IEEE8021Q_TT_VI: Video, < 100 ms latency and jitter + * @IEEE8021Q_TT_VO: Voice, < 10 ms latency and jitter + * @IEEE8021Q_TT_IC: Internetwork Control + * @IEEE8021Q_TT_NC: Network Control + */ +enum ieee8021q_traffic_type { + IEEE8021Q_TT_BK = 0, + IEEE8021Q_TT_BE = 1, + IEEE8021Q_TT_EE = 2, + IEEE8021Q_TT_CA = 3, + IEEE8021Q_TT_VI = 4, + IEEE8021Q_TT_VO = 5, + IEEE8021Q_TT_IC = 6, + IEEE8021Q_TT_NC = 7, + + /* private: */ + IEEE8021Q_TT_MAX, +}; + +#define SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp) ((dscp >> 3) & 0x7) + +#if IS_ENABLED(CONFIG_NET_IEEE8021Q_HELPERS) + +int ietf_dscp_to_ieee8021q_tt(u8 dscp); +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues); + +#else + +static inline int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ + return -EOPNOTSUPP; +} + +static inline int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, + unsigned int num_queues) +{ + return -EOPNOTSUPP; +} + +#endif +#endif /* _NET_IEEE8021Q_H */ diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 8bf5906073bc..238ad3349456 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -22,10 +22,6 @@ #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 -/* prefix flags */ -#define IF_PREFIX_ONLINK 0x01 -#define IF_PREFIX_AUTOCONF 0x02 - enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, @@ -64,6 +60,14 @@ struct inet6_ifaddr { struct hlist_node addr_lst; struct list_head if_list; + /* + * Used to safely traverse idev->addr_list in process context + * if the idev->lock needed to protect idev->addr_list cannot be held. + * In that case, add the items to this list temporarily and iterate + * without holding idev->lock. + * See addrconf_ifdown and dev_forward_change. + */ + struct list_head if_list_aux; struct list_head tmp_list; struct inet6_ifaddr *ifpub; @@ -71,6 +75,8 @@ struct inet6_ifaddr { bool tokenized; + u8 ifa_proto; + struct rcu_head rcu; struct in6_addr peer_addr; }; @@ -78,12 +84,10 @@ struct inet6_ifaddr { struct ip6_sf_socklist { unsigned int sl_max; unsigned int sl_count; - struct in6_addr sl_addr[]; + struct rcu_head rcu; + struct in6_addr sl_addr[] __counted_by(sl_max); }; -#define IP6_SFLSIZE(count) (sizeof(struct ip6_sf_socklist) + \ - (count) * sizeof(struct in6_addr)) - #define IP6_SFBLOCK 10 /* allocate this many at once */ struct ipv6_mc_socklist { @@ -91,18 +95,18 @@ struct ipv6_mc_socklist { int ifindex; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ipv6_mc_socklist __rcu *next; - rwlock_t sflock; - struct ip6_sf_socklist *sflist; + struct ip6_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip6_sf_list { - struct ip6_sf_list *sf_next; + struct ip6_sf_list __rcu *sf_next; struct in6_addr sf_addr; unsigned long sf_count[2]; /* include/exclude counts */ unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans. 
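
/*
 * Illustrative sketch, not part of the patch: mapping an IETF DSCP value
 * to a hardware queue with the 802.1Q helpers above. As a worked example,
 * DSCP 46 (Expedited Forwarding) gives 46 >> 3 == 5, i.e. IEEE8021Q_TT_VO
 * under the simple mapping. example_dscp_to_queue() is a hypothetical name.
 */
static inline int example_dscp_to_queue(u8 dscp, unsigned int num_queues)
{
	int tt = ietf_dscp_to_ieee8021q_tt(dscp);

	if (tt < 0)	/* -EOPNOTSUPP without CONFIG_NET_IEEE8021Q_HELPERS */
		return tt;

	return ieee8021q_tt_to_tc(tt, num_queues);
}
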
left to send */ + struct rcu_head rcu; }; #define MAF_TIMER_RUNNING 0x01 @@ -114,19 +118,19 @@ struct ip6_sf_list { struct ifmcaddr6 { struct in6_addr mca_addr; struct inet6_dev *idev; - struct ifmcaddr6 *next; - struct ip6_sf_list *mca_sources; - struct ip6_sf_list *mca_tomb; + struct ifmcaddr6 __rcu *next; + struct ip6_sf_list __rcu *mca_sources; + struct ip6_sf_list __rcu *mca_tomb; unsigned int mca_sfmode; unsigned char mca_crcount; unsigned long mca_sfcount[2]; - struct timer_list mca_timer; + struct delayed_work mca_work; unsigned int mca_flags; int mca_users; refcount_t mca_refcnt; - spinlock_t mca_lock; unsigned long mca_cstamp; unsigned long mca_tstamp; + struct rcu_head rcu; }; /* Anycast stuff */ @@ -140,7 +144,7 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; struct fib6_info *aca_rt; - struct ifacaddr6 *aca_next; + struct ifacaddr6 __rcu *aca_next; struct hlist_node aca_addr_lst; int aca_users; refcount_t aca_refcnt; @@ -162,12 +166,12 @@ struct ipv6_devstat { struct inet6_dev { struct net_device *dev; + netdevice_tracker dev_tracker; struct list_head addr_list; - struct ifmcaddr6 *mc_list; - struct ifmcaddr6 *mc_tomb; - spinlock_t mc_lock; + struct ifmcaddr6 __rcu *mc_list; + struct ifmcaddr6 __rcu *mc_tomb; unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; @@ -179,11 +183,20 @@ struct inet6_dev { unsigned long mc_qri; /* Query Response Interval */ unsigned long mc_maxdelay; - struct timer_list mc_gq_timer; /* general query timer */ - struct timer_list mc_ifc_timer; /* interface change timer */ - struct timer_list mc_dad_timer; /* dad complete mc timer */ + struct delayed_work mc_gq_work; /* general query work */ + struct delayed_work mc_ifc_work; /* interface change work */ + struct delayed_work mc_dad_work; /* dad complete mc work */ + struct delayed_work mc_query_work; /* mld query work */ + struct delayed_work mc_report_work; /* mld report work */ + + struct sk_buff_head mc_query_queue; /* mld query queue */ + struct sk_buff_head mc_report_queue; /* mld report queue */ - struct ifacaddr6 *ac_list; + spinlock_t mc_query_lock; /* mld query queue lock */ + spinlock_t mc_report_lock; /* mld query report lock */ + struct mutex mc_lock; /* mld global lock */ + + struct ifacaddr6 __rcu *ac_list; rwlock_t lock; refcount_t refcnt; __u32 if_flags; @@ -204,6 +217,8 @@ struct inet6_dev { unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; + + unsigned int ra_mtu; }; static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf) diff --git a/include/net/ila.h b/include/net/ila.h deleted file mode 100644 index f98dcd5791b0..000000000000 --- a/include/net/ila.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ILA kernel interface - * - * Copyright (c) 2015 Tom Herbert <tom@herbertland.com> - */ - -#ifndef _NET_ILA_H -#define _NET_ILA_H - -int ila_xlat_outgoing(struct sk_buff *skb); -int ila_xlat_incoming(struct sk_buff *skb); - -#endif /* _NET_ILA_H */ diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index 7392f959a405..745891d2e113 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -11,6 +11,8 @@ #include <linux/types.h> +struct flowi; +struct flowi6; struct request_sock; struct sk_buff; struct sock; @@ -19,8 +21,6 @@ struct sockaddr; struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, const struct request_sock *req, u8 proto); -void 
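
/*
 * Illustrative sketch, not part of the patch: with sl_addr[] annotated
 * __counted_by(sl_max) above (replacing the old IP6_SFLSIZE() macro),
 * allocations are sized with struct_size() from <linux/overflow.h>, and
 * sl_max must be set before the flexible array is touched so that
 * fortified accesses see a valid bound. example_sflist_alloc() is a
 * hypothetical name; kzalloc() assumes <linux/slab.h>.
 */
static inline struct ip6_sf_socklist *example_sflist_alloc(unsigned int max)
{
	struct ip6_sf_socklist *sl;

	sl = kzalloc(struct_size(sl, sl_addr, max), GFP_KERNEL);
	if (sl)
		sl->sl_max = max;
	return sl;
}
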
inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); - int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu); diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 81b965953036..c32878c69179 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -40,7 +40,7 @@ static inline unsigned int __inet6_ehashfn(const u32 lhash, * * The sockhash lock must be held as a reader here. */ -struct sock *__inet6_lookup_established(struct net *net, +struct sock *__inet6_lookup_established(const struct net *net, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, @@ -48,7 +48,23 @@ struct sock *__inet6_lookup_established(struct net *net, const u16 hnum, const int dif, const int sdif); -struct sock *inet6_lookup_listener(struct net *net, +typedef u32 (inet6_ehashfn_t)(const struct net *net, + const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const __be16 fport); + +inet6_ehashfn_t inet6_ehashfn; + +INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn); + +struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum, + inet6_ehashfn_t *ehashfn); + +struct sock *inet6_lookup_listener(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, @@ -57,7 +73,16 @@ struct sock *inet6_lookup_listener(struct net *net, const unsigned short hnum, const int dif, const int sdif); -static inline struct sock *__inet6_lookup(struct net *net, +struct sock *inet6_lookup_run_sk_lookup(const struct net *net, + int protocol, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, const int dif, + inet6_ehashfn_t *ehashfn); + +static inline struct sock *__inet6_lookup(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, @@ -78,6 +103,46 @@ static inline struct sock *__inet6_lookup(struct net *net, daddr, hnum, dif, sdif); } +static inline +struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, + const struct in6_addr *saddr, const __be16 sport, + const struct in6_addr *daddr, const __be16 dport, + bool *refcounted, inet6_ehashfn_t *ehashfn) +{ + struct sock *sk, *reuse_sk; + bool prefetched; + + sk = skb_steal_sock(skb, refcounted, &prefetched); + if (!sk) + return NULL; + + if (!prefetched || !sk_fullsock(sk)) + return sk; + + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_LISTEN) + return sk; + } else if (sk->sk_protocol == IPPROTO_UDP) { + if (sk->sk_state != TCP_CLOSE) + return sk; + } else { + return sk; + } + + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, ntohs(dport), + ehashfn); + if (!reuse_sk) + return sk; + + /* We've chosen a new reuseport sock which is never refcounted. This + * implies that sk also isn't refcounted. 
+ */ + WARN_ON_ONCE(*refcounted); + + return reuse_sk; +} + static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, @@ -85,33 +150,48 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, int iif, int sdif, bool *refcounted) { - struct sock *sk = skb_steal_sock(skb, refcounted); - + struct net *net = dev_net_rcu(skb_dst(skb)->dev); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct sock *sk; + + sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport, + refcounted, inet6_ehashfn); + if (IS_ERR(sk)) + return NULL; if (sk) return sk; - return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, - doff, &ipv6_hdr(skb)->saddr, sport, - &ipv6_hdr(skb)->daddr, ntohs(dport), + return __inet6_lookup(net, hashinfo, skb, + doff, &ip6h->saddr, sport, + &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } -struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, +struct sock *inet6_lookup(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); int inet6_hash(struct sock *sk); -#endif /* IS_ENABLED(CONFIG_IPV6) */ -#define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif, __sdif) \ - (((__sk)->sk_portpair == (__ports)) && \ - ((__sk)->sk_family == AF_INET6) && \ - ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr)) && \ - ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr)) && \ - (((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ - net_eq(sock_net(__sk), (__net))) +static inline bool inet6_match(const struct net *net, const struct sock *sk, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + const __portpair ports, + const int dif, const int sdif) +{ + if (!net_eq(sock_net(sk), net) || + sk->sk_family != AF_INET6 || + sk->sk_portpair != ports || + !ipv6_addr_equal(&sk->sk_v6_daddr, saddr) || + !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return false; + + /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ + return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, + sdif); +} +#endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _INET6_HASHTABLES_H */ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index cb2818862919..c17a6585d0b0 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -3,6 +3,10 @@ #define _INET_COMMON_H #include <linux/indirect_call_wrapper.h> +#include <linux/net.h> +#include <linux/netdev_features.h> +#include <linux/types.h> +#include <net/sock.h> extern const struct proto_ops inet_stream_ops; extern const struct proto_ops inet_dgram_ops; @@ -12,6 +16,8 @@ extern const struct proto_ops inet_dgram_ops; */ struct msghdr; +struct net; +struct page; struct sock; struct sockaddr; struct socket; @@ -23,24 +29,29 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); -int inet_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern); +int inet_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg); +void __inet_accept(struct socket *sock, struct socket *newsock, + struct sock *newsk); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct 
msghdr *msg, size_t size); -ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags); +void inet_splice_eof(struct socket *sock); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); +int __inet_listen_sk(struct sock *sk, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) /* Called from BPF program. */ #define BIND_FROM_BPF (1 << 2) +/* Skip CAP_NET_BIND_SERVICE check. */ +#define BIND_NO_CAP_NET_BIND_SERVICE (1 << 3) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 7338b3865a2a..1735db332aab 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -25,6 +25,7 @@ #undef INET_CSK_CLEAR_TIMERS struct inet_bind_bucket; +struct inet_bind2_bucket; struct tcp_congestion_ops; /* @@ -43,13 +44,10 @@ struct inet_connection_sock_af_ops { struct request_sock *req_unhash, bool *own_req); u16 net_header_len; - u16 net_frag_header_len; - u16 sockaddr_len; int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); - void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); void (*mtu_reduced)(struct sock *sk); }; @@ -57,7 +55,7 @@ struct inet_connection_sock_af_ops { * * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node - * @icsk_timeout: Timeout + * @icsk_bind2_hash: Bind node in the bhash2 table * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout * @icsk_pmtu_cookie: Last pmtu seen by socket @@ -65,8 +63,6 @@ struct inet_connection_sock_af_ops { * @icsk_af_ops: Operations which are AF_INET{4,6} specific * @icsk_ulp_ops: Pluggable ULP control hook * @icsk_ulp_data: ULP private data - * @icsk_clean_acked Clean acked data hook - * @icsk_listen_portaddr_node hash to the portaddr listener hashtable * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -76,25 +72,26 @@ struct inet_connection_sock_af_ops { * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data * @icsk_mtup: MTU probing control data + * @icsk_probes_tstamp: Probe timestamp (cleared by non-zero window ack) + * @icsk_user_timeout: TCP_USER_TIMEOUT value */ struct inet_connection_sock { /* inet_sock has to be the first member! 
*/ struct inet_sock icsk_inet; struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; - unsigned long icsk_timeout; + struct inet_bind2_bucket *icsk_bind2_hash; struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; __u32 icsk_rto_min; + u32 icsk_rto_max; __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; - void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); - struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, icsk_ca_initialized:1, @@ -111,41 +108,41 @@ struct inet_connection_sock { __u8 quick; /* Scheduled number of quick acks */ __u8 pingpong; /* The session is interactive */ __u8 retry; /* Number of attempts */ - __u32 ato; /* Predicted tick of soft clock */ - unsigned long timeout; /* Currently scheduled timeout */ + #define ATO_BITS 8 + __u32 ato:ATO_BITS, /* Predicted tick of soft clock */ + lrcv_flowlabel:20, /* last received ipv6 flowlabel */ + dst_quick_ack:1, /* cache dst RTAX_QUICKACK */ + unused:3; __u32 lrcvtime; /* timestamp of last received data packet */ __u16 last_seg_size; /* Size of last incoming segment */ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ } icsk_ack; struct { - int enabled; - /* Range of MTUs to search */ int search_high; int search_low; /* Information on the current probe. */ - int probe_size; + u32 probe_size:31, + /* Is the MTUP feature enabled for this connection? */ + enabled:1; u32 probe_timestamp; } icsk_mtup; + u32 icsk_probes_tstamp; u32 icsk_user_timeout; u64 icsk_ca_priv[104 / sizeof(u64)]; -#define ICSK_CA_PRIV_SIZE (13 * sizeof(u64)) +#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ #define ICSK_TIME_DACK 2 /* Delayed ack timer */ #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ -#define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */ #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ #define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */ -static inline struct inet_connection_sock *inet_csk(const struct sock *sk) -{ - return (struct inet_connection_sock *)sk; -} +#define inet_csk(ptr) container_of_const(ptr, struct inet_connection_sock, icsk_inet.sk) static inline void *inet_csk_ca(const struct sock *sk) { @@ -161,7 +158,8 @@ enum inet_csk_ack_state_t { ICSK_ACK_TIMER = 2, ICSK_ACK_PUSHED = 4, ICSK_ACK_PUSHED2 = 8, - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ + ICSK_ACK_NOW = 16, /* Send the next ACK immediately (once) */ + ICSK_ACK_NOMEM = 32, }; void inet_csk_init_xmit_timers(struct sock *sk, @@ -169,6 +167,7 @@ void inet_csk_init_xmit_timers(struct sock *sk, void (*delack_handler)(struct timer_list *), void (*keepalive_handler)(struct timer_list *)); void inet_csk_clear_xmit_timers(struct sock *sk); +void inet_csk_clear_xmit_timers_sync(struct sock *sk); static inline void inet_csk_schedule_ack(struct sock *sk) { @@ -185,20 +184,29 @@ static inline void inet_csk_delack_init(struct sock *sk) memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); } -void inet_csk_delete_keepalive_timer(struct sock *sk); -void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); +static inline unsigned long +icsk_timeout(const struct inet_connection_sock *icsk) +{ + return 
READ_ONCE(icsk->icsk_retransmit_timer.expires); +} + +static inline unsigned long +icsk_delack_timeout(const struct inet_connection_sock *icsk) +{ + return READ_ONCE(icsk->icsk_delack_timer.expires); +} static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { struct inet_connection_sock *icsk = inet_csk(sk); if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { - icsk->icsk_pending = 0; + smp_store_release(&icsk->icsk_pending, 0); #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_ack.pending, 0); icsk->icsk_ack.retry = 0; #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); @@ -223,16 +231,15 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, when = max_when; } + when += jiffies; if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || - what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE || - what == ICSK_TIME_REO_TIMEOUT) { - icsk->icsk_pending = what; - icsk->icsk_timeout = jiffies + when; - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { + smp_store_release(&icsk->icsk_pending, what); + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, when); } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending |= ICSK_ACK_TIMER; - icsk->icsk_ack.timeout = jiffies + when; - sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + smp_store_release(&icsk->icsk_ack.pending, + icsk->icsk_ack.pending | ICSK_ACK_TIMER); + sk_reset_timer(sk, &icsk->icsk_delack_timer, when); } else { pr_debug("inet_csk BUG: unknown timer value\n"); } @@ -247,7 +254,7 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk, return (unsigned long)min_t(u64, when, max_when); } -struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern); +struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg); int inet_csk_get_port(struct sock *sk, unsigned short snum); @@ -260,7 +267,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child); -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, @@ -278,17 +285,25 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog; + return inet_csk_reqsk_queue_len(sk) > READ_ONCE(sk->sk_max_ack_backlog); } -void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); +bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); +static inline unsigned long +reqsk_timeout(struct request_sock *req, unsigned long max_timeout) +{ + u64 timeout = (u64)req->timeout << req->num_timeout; + + return (unsigned long)min_t(u64, timeout, max_timeout); +} + static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) { /* The below has to be done to allow calling inet_csk_destroy_sock */ sock_set_flag(sk, SOCK_DEAD); - 
percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(*sk->sk_prot->orphan_count); } void inet_csk_destroy_sock(struct sock *sk); @@ -303,22 +318,19 @@ static inline __poll_t inet_csk_listen_poll(const struct sock *sk) (EPOLLIN | EPOLLRDNORM) : 0; } -int inet_csk_listen_start(struct sock *sk, int backlog); +int inet_csk_listen_start(struct sock *sk); void inet_csk_listen_stop(struct sock *sk); -void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); - /* update the fast reuse flag when adding a socket */ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct sock *sk); struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); -#define TCP_PINGPONG_THRESH 3 - static inline void inet_csk_enter_pingpong_mode(struct sock *sk) { - inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH; + inet_csk(sk)->icsk_ack.pingpong = + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh); } static inline void inet_csk_exit_pingpong_mode(struct sock *sk) @@ -328,7 +340,8 @@ static inline void inet_csk_exit_pingpong_mode(struct sock *sk) static inline bool inet_csk_in_pingpong_mode(struct sock *sk) { - return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH; + return inet_csk(sk)->icsk_ack.pingpong >= + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh); } static inline void inet_csk_inc_pingpong_cnt(struct sock *sk) @@ -339,9 +352,17 @@ static inline void inet_csk_inc_pingpong_cnt(struct sock *sk) icsk->icsk_ack.pingpong++; } -static inline bool inet_csk_has_ulp(struct sock *sk) +static inline bool inet_csk_has_ulp(const struct sock *sk) { - return inet_sk(sk)->is_icsk && !!inet_csk(sk)->icsk_ulp_ops; + return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; +} + +static inline void inet_init_csk_locks(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + spin_lock_init(&icsk->icsk_accept_queue.rskq_lock); + spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock); } #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h new file mode 100644 index 000000000000..72f250dffada --- /dev/null +++ b/include/net/inet_dscp.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * inet_dscp.h: helpers for handling differentiated services codepoints (DSCP) + * + * DSCP is defined in RFC 2474: + * + * 0 1 2 3 4 5 6 7 + * +---+---+---+---+---+---+---+---+ + * | DSCP | CU | + * +---+---+---+---+---+---+---+---+ + * + * DSCP: differentiated services codepoint + * CU: currently unused + * + * The whole DSCP + CU bits form the DS field. + * The DS field is also commonly called TOS or Traffic Class (for IPv6). + * + * Note: the CU bits are now used for Explicit Congestion Notification + * (RFC 3168). + */ + +#ifndef _INET_DSCP_H +#define _INET_DSCP_H + +#include <linux/types.h> + +/* Special type for storing DSCP values. + * + * A dscp_t variable stores a DS field with the CU (ECN) bits cleared. + * Using dscp_t allows one to strictly separate DSCP and ECN bits, thus avoiding + * bugs where ECN bits are erroneously taken into account during FIB lookups + * or policy routing. + * + * Note: to get the real DSCP value contained in a dscp_t variable one would + * have to do a bit shift after calling inet_dscp_to_dsfield(). We could have + * a helper for that, but there are currently no users. 
+ */ +typedef u8 __bitwise dscp_t; + +#define INET_DSCP_MASK 0xfc + +static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield) +{ + return (__force dscp_t)(dsfield & INET_DSCP_MASK); +} + +static inline __u8 inet_dscp_to_dsfield(dscp_t dscp) +{ + return (__force __u8)dscp; +} + +static inline bool inet_validate_dscp(__u8 val) +{ + return !(val & ~INET_DSCP_MASK); +} + +#endif /* _INET_DSCP_H */ diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index e1eaf1780288..ea32393464a2 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -8,6 +8,7 @@ #include <net/inet_sock.h> #include <net/dsfield.h> +#include <net/checksum.h> enum { INET_ECN_NOT_ECT = 0, @@ -75,8 +76,8 @@ static inline void INET_ECN_dontxmit(struct sock *sk) static inline int IP_ECN_set_ce(struct iphdr *iph) { - u32 check = (__force u32)iph->check; u32 ecn = (iph->tos + 1) & INET_ECN_MASK; + __be16 check_add; /* * After the last operation we have (in binary): @@ -93,23 +94,20 @@ static inline int IP_ECN_set_ce(struct iphdr *iph) * INET_ECN_ECT_1 => check += htons(0xFFFD) * INET_ECN_ECT_0 => check += htons(0xFFFE) */ - check += (__force u16)htons(0xFFFB) + (__force u16)htons(ecn); + check_add = (__force __be16)((__force u16)htons(0xFFFB) + + (__force u16)htons(ecn)); - iph->check = (__force __sum16)(check + (check>=0xFFFF)); + iph->check = csum16_add(iph->check, check_add); iph->tos |= INET_ECN_CE; return 1; } static inline int IP_ECN_set_ect1(struct iphdr *iph) { - u32 check = (__force u32)iph->check; - if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; - check += (__force u16)htons(0x100); - - iph->check = (__force __sum16)(check + (check>=0xFFFF)); + iph->check = csum16_add(iph->check, htons(0x1)); iph->tos ^= INET_ECN_MASK; return 1; } @@ -190,6 +188,23 @@ static inline int INET_ECN_set_ce(struct sk_buff *skb) return 0; } +static inline int skb_get_dsfield(struct sk_buff *skb) +{ + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + break; + return ipv4_get_dsfield(ip_hdr(skb)); + + case cpu_to_be16(ETH_P_IPV6): + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + break; + return ipv6_get_dsfield(ipv6_hdr(skb)); + } + + return -1; +} + static inline int INET_ECN_set_ect1(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index bac79e817776..0eccd9c3a883 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -4,6 +4,10 @@ #include <linux/rhashtable-types.h> #include <linux/completion.h> +#include <linux/in6.h> +#include <linux/rbtree_types.h> +#include <linux/refcount.h> +#include <net/dropreason-core.h> /* Per netns frag queues directory */ struct fqdir { @@ -21,21 +25,24 @@ struct fqdir { /* Keep atomic mem on separate cachelines in structs that include it */ atomic_long_t mem ____cacheline_aligned_in_smp; struct work_struct destroy_work; + struct llist_node free_list; }; /** - * fragment queue flags + * enum: fragment queue flags * * @INET_FRAG_FIRST_IN: first fragment has arrived * @INET_FRAG_LAST_IN: final fragment has arrived * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable + * @INET_FRAG_DROP: if skbs must be dropped (instead of being consumed) */ enum { INET_FRAG_FIRST_IN = BIT(0), INET_FRAG_LAST_IN = BIT(1), INET_FRAG_COMPLETE = BIT(2), INET_FRAG_HASH_DEAD = BIT(3), + INET_FRAG_DROP = BIT(4), }; struct 
frag_v4_compare_key { @@ -69,6 +76,7 @@ struct frag_v6_compare_key { * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far + * @tstamp_type: stamp has a mono delivery time (EDT) * @flags: fragment queue flags * @max_size: maximum received fragment size * @fqdir: pointer to struct fqdir @@ -89,6 +97,7 @@ struct inet_frag_queue { ktime_t stamp; int len; int meat; + u8 tstamp_type; __u8 flags; u16 max_size; struct fqdir *fqdir; @@ -116,21 +125,29 @@ int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net); static inline void fqdir_pre_exit(struct fqdir *fqdir) { - fqdir->high_thresh = 0; /* prevent creation of new frags */ - fqdir->dead = true; + /* Prevent creation of new frags. + * Pairs with READ_ONCE() in inet_frag_find(). + */ + WRITE_ONCE(fqdir->high_thresh, 0); + + /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire() + * and ip6frag_expire_frag_queue(). + */ + WRITE_ONCE(fqdir->dead, true); } void fqdir_exit(struct fqdir *fqdir); -void inet_frag_kill(struct inet_frag_queue *q); +void inet_frag_kill(struct inet_frag_queue *q, int *refs); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); /* Free all skbs in the queue; return the sum of their truesizes. */ -unsigned int inet_frag_rbtree_purge(struct rb_root *root); +unsigned int inet_frag_rbtree_purge(struct rb_root *root, + enum skb_drop_reason reason); -static inline void inet_frag_put(struct inet_frag_queue *q) +static inline void inet_frag_putn(struct inet_frag_queue *q, int refs) { - if (refcount_dec_and_test(&q->refcnt)) + if (refs && refcount_sub_and_test(refs, &q->refcnt)) inet_frag_destroy(q); } diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 92560974ea67..4564b5d348b1 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -23,6 +23,7 @@ #include <net/inet_connection_sock.h> #include <net/inet_sock.h> +#include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> @@ -87,10 +88,34 @@ struct inet_bind_bucket { unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; + struct hlist_head bhash2; + struct rcu_head rcu; +}; + +struct inet_bind2_bucket { + possible_net_t ib_net; + int l3mdev; + unsigned short port; +#if IS_ENABLED(CONFIG_IPV6) + unsigned short addr_type; + struct in6_addr v6_rcv_saddr; +#define rcv_saddr v6_rcv_saddr.s6_addr32[3] +#else + __be32 rcv_saddr; +#endif + /* Node in the bhash2 inet_bind_hashbucket chain */ + struct hlist_node node; + struct hlist_node bhash_node; + /* List of sockets hashed to this bucket */ struct hlist_head owners; }; -static inline struct net *ib_net(struct inet_bind_bucket *ib) +static inline struct net *ib_net(const struct inet_bind_bucket *ib) +{ + return read_pnet(&ib->ib_net); +} + +static inline struct net *ib2_net(const struct inet_bind2_bucket *ib) { return read_pnet(&ib->ib_net); } @@ -111,11 +136,7 @@ struct inet_bind_hashbucket { #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; - unsigned int count; - union { - struct hlist_head head; - struct hlist_nulls_head nulls_head; - }; + struct hlist_nulls_head nulls_head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ @@ -137,31 +158,27 @@ struct inet_hashinfo { * TCP hash as well as the others for fast bind/connect. 
*/ struct kmem_cache *bind_bucket_cachep; + /* This bind table is hashed by local port */ struct inet_bind_hashbucket *bhash; + struct kmem_cache *bind2_bucket_cachep; + /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4) + * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used + * primarily for expediting bind conflict resolution. + */ + struct inet_bind_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; - /* All the above members are written once at bootup and - * never written again _or_ are predominantly read-access. - * - * Now align to a new cache line as all the following members - * might be often dirty. - */ - /* All sockets in TCP_LISTEN state will be in listening_hash. - * This is the only table where wildcard'd TCP sockets can - * exist. listening_hash is only hashed by local port number. - * If lhash2 is initialized, the same socket will also be hashed - * to lhash2 by port and address. - */ - struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE] - ____cacheline_aligned_in_smp; -}; + bool pernet; +} ____cacheline_aligned_in_smp; -#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \ - hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node) +static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk) +{ + return sock_net(sk)->ipv4.tcp_death_row.hashinfo; +} static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) @@ -197,23 +214,38 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) hashinfo->ehash_locks = NULL; } -static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if, - int dif, int sdif) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - return inet_bound_dev_eq(!!net->ipv4.sysctl_tcp_l3mdev_accept, - bound_dev_if, dif, sdif); -#else - return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); -#endif -} +struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, + unsigned int ehash_entries); +void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo); struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); -void inet_bind_bucket_destroy(struct kmem_cache *cachep, - struct inet_bind_bucket *tb); +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb); + +bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, + const struct net *net, unsigned short port, + int l3mdev); + +struct inet_bind2_bucket * +inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, + struct inet_bind_hashbucket *head, + struct inet_bind_bucket *tb, + const struct sock *sk); + +void inet_bind2_bucket_destroy(struct kmem_cache *cachep, + struct inet_bind2_bucket *tb); + +struct inet_bind2_bucket * +inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, + const struct net *net, + unsigned short port, int l3mdev, + const struct sock *sk); + +bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, + const struct net *net, unsigned short port, + int l3mdev, const struct sock *sk); static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) @@ -221,39 +253,53 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, return (lport + net_hash_mix(net)) & (bhash_size - 1); } -void inet_bind_hash(struct sock *sk, 
struct inet_bind_bucket *tb, - const unsigned short snum); - -/* These can have wildcards, don't try too hard. */ -static inline u32 inet_lhashfn(const struct net *net, const unsigned short num) +static inline struct inet_bind_hashbucket * +inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, + const struct net *net, unsigned short port) { - return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1); -} + u32 hash; -static inline int inet_sk_listen_hashfn(const struct sock *sk) -{ - return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); + else +#endif + hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); + return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } +struct inet_bind_hashbucket * +inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); + +/* This should be called whenever a socket's sk_rcv_saddr (ipv4) or + * sk_v6_rcv_saddr (ipv6) changes after it has been bound. The socket's + * rcv_saddr field should already have been updated when this is called. + */ +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); +void inet_bhash2_reset_saddr(struct sock *sk); + +void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2, unsigned short port); + /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); -void inet_hashinfo_init(struct inet_hashinfo *h); void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit); int inet_hashinfo2_init_mod(struct inet_hashinfo *h); -bool inet_ehash_insert(struct sock *sk, struct sock *osk); -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, + bool *found_dup_sk); int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); -struct sock *__inet_lookup_listener(struct net *net, +struct sock *__inet_lookup_listener(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, @@ -288,7 +334,6 @@ static inline struct sock *inet_lookup_listener(struct net *net, ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport))) #endif -#if (BITS_PER_LONG == 64) #ifdef __BIG_ENDIAN #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ @@ -300,34 +345,51 @@ static inline struct sock *inet_lookup_listener(struct net *net, (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ -#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \ - (((__sk)->sk_portpair == (__ports)) && \ - 
((__sk)->sk_daddr == (__saddr)) && \ - ((__sk)->sk_rcv_saddr == (__daddr)) && \ - (((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ - net_eq(sock_net(__sk), (__net))) -#endif /* 64-bit arch */ +static inline bool inet_match(const struct net *net, const struct sock *sk, + const __addrpair cookie, const __portpair ports, + int dif, int sdif) +{ + if (!net_eq(sock_net(sk), net) || + sk->sk_portpair != ports || + sk->sk_addrpair != cookie) + return false; + + /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ + return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, + sdif); +} /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. -DaveM */ -struct sock *__inet_lookup_established(struct net *net, +struct sock *__inet_lookup_established(const struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); +typedef u32 (inet_ehashfn_t)(const struct net *net, + const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); + +inet_ehashfn_t inet_ehashfn; + +INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); + +struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum, + inet_ehashfn_t *ehashfn); + +struct sock *inet_lookup_run_sk_lookup(const struct net *net, + int protocol, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum, const int dif, + inet_ehashfn_t *ehashfn); + static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, @@ -377,6 +439,46 @@ static inline struct sock *inet_lookup(struct net *net, return sk; } +static inline +struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + bool *refcounted, inet_ehashfn_t *ehashfn) +{ + struct sock *sk, *reuse_sk; + bool prefetched; + + sk = skb_steal_sock(skb, refcounted, &prefetched); + if (!sk) + return NULL; + + if (!prefetched || !sk_fullsock(sk)) + return sk; + + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_LISTEN) + return sk; + } else if (sk->sk_protocol == IPPROTO_UDP) { + if (sk->sk_state != TCP_CLOSE) + return sk; + } else { + return sk; + } + + reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, ntohs(dport), + ehashfn); + if (!reuse_sk) + return sk; + + /* We've chosen a new reuseport sock which is never refcounted. This + * implies that sk also isn't refcounted. 
+ */ + WARN_ON_ONCE(*refcounted); + + return reuse_sk; +} + static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -385,22 +487,23 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const int sdif, bool *refcounted) { - struct sock *sk = skb_steal_sock(skb, refcounted); + struct net *net = dev_net_rcu(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); + struct sock *sk; + sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport, + refcounted, inet_ehashfn); + if (IS_ERR(sk)) + return NULL; if (sk) return sk; - return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, + return __inet_lookup(net, hashinfo, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } -u32 inet6_ehashfn(const struct net *net, - const struct in6_addr *laddr, const u16 lport, - const struct in6_addr *faddr, const __be16 fport); - static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ @@ -418,10 +521,13 @@ static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) } int __inet_hash_connect(struct inet_timewait_death_row *death_row, - struct sock *sk, u32 port_offset, + struct sock *sk, u64 port_offset, + u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, - struct inet_timewait_sock **)); + struct inet_timewait_sock **, + bool rcu_lookup, + u32 hash)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 89163ef8cf4b..1086256549fa 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -19,6 +19,7 @@ #include <linux/netdevice.h> #include <net/flow.h> +#include <net/inet_dscp.h> #include <net/sock.h> #include <net/request_sock.h> #include <net/netns/hash.h> @@ -107,23 +108,26 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { - if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) + u32 mark = READ_ONCE(sk->sk_mark); + + if (!mark && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) return skb->mark; - return sk->sk_mark; + return mark; } static inline int inet_request_bound_dev_if(const struct sock *sk, struct sk_buff *skb) { + int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); - if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept) + if (!bound_dev_if && READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept)) return l3mdev_master_ifindex_by_index(net, skb->skb_iif); #endif - return sk->sk_bound_dev_if; + return bound_dev_if; } static inline int inet_sk_bound_l3mdev(const struct sock *sk) @@ -131,7 +135,7 @@ static inline int inet_sk_bound_l3mdev(const struct sock *sk) #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); - if (!net->ipv4.sysctl_tcp_l3mdev_accept) + if (!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept)) return l3mdev_master_ifindex_by_index(net, sk->sk_bound_dev_if); #endif @@ -147,6 +151,18 @@ static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if, return bound_dev_if == dif || bound_dev_if == sdif; } +static inline bool inet_sk_bound_dev_eq(const struct net *net, + int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept), 
+ bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); +#endif +} + struct inet_cork { unsigned int flags; __be32 addr; @@ -157,8 +173,9 @@ struct inet_cork { u8 tx_flags; __u8 ttl; __s16 tos; - char priority; + u32 priority; __u16 gso_size; + u32 ts_opt_id; u64 transmit_time; u32 mark; }; @@ -180,13 +197,13 @@ struct rtable; * @inet_rcv_saddr - Bound local IPv4 addr * @inet_dport - Destination port * @inet_num - Local port + * @inet_flags - various atomic flags * @inet_saddr - Sending source * @uc_ttl - Unicast TTL * @inet_sport - Source port * @inet_id - ID counter for DF pkts * @tos - TOS * @mc_ttl - Multicasting TTL - * @is_icsk - is this an inet_connection_sock? * @uc_index - Unicast outgoing device index * @mc_index - Multicast device index * @mc_list - Group array @@ -204,54 +221,101 @@ struct inet_sock { #define inet_dport sk.__sk_common.skc_dport #define inet_num sk.__sk_common.skc_num + unsigned long inet_flags; __be32 inet_saddr; __s16 uc_ttl; - __u16 cmsg_flags; __be16 inet_sport; - __u16 inet_id; - struct ip_options_rcu __rcu *inet_opt; - int rx_dst_ifindex; + atomic_t inet_id; + __u8 tos; __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; - __u8 recverr:1, - is_icsk:1, - freebind:1, - hdrincl:1, - mc_loop:1, - transparent:1, - mc_all:1, - nodefrag:1; - __u8 bind_address_no_port:1, - recverr_rfc4884:1, - defer_connect:1; /* Indicates that fastopen_connect is set - * and cookie exists so we defer connect - * until first data frame is written - */ __u8 rcv_tos; __u8 convert_csum; int uc_index; int mc_index; __be32 mc_addr; + u32 local_port_range; /* high << 16 | low */ + struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; }; -#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ -#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ +#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_TS_OPT_ID 2 /* ts_opt_id field is valid, overriding sk_tskey */ + +enum { + INET_FLAGS_PKTINFO = 0, + INET_FLAGS_TTL = 1, + INET_FLAGS_TOS = 2, + INET_FLAGS_RECVOPTS = 3, + INET_FLAGS_RETOPTS = 4, + INET_FLAGS_PASSSEC = 5, + INET_FLAGS_ORIGDSTADDR = 6, + INET_FLAGS_CHECKSUM = 7, + INET_FLAGS_RECVFRAGSIZE = 8, + + INET_FLAGS_RECVERR = 9, + INET_FLAGS_RECVERR_RFC4884 = 10, + INET_FLAGS_FREEBIND = 11, + INET_FLAGS_HDRINCL = 12, + INET_FLAGS_MC_LOOP = 13, + INET_FLAGS_MC_ALL = 14, + INET_FLAGS_TRANSPARENT = 15, + INET_FLAGS_IS_ICSK = 16, + INET_FLAGS_NODEFRAG = 17, + INET_FLAGS_BIND_ADDRESS_NO_PORT = 18, + INET_FLAGS_DEFER_CONNECT = 19, + INET_FLAGS_MC6_LOOP = 20, + INET_FLAGS_RECVERR6_RFC4884 = 21, + INET_FLAGS_MC6_ALL = 22, + INET_FLAGS_AUTOFLOWLABEL_SET = 23, + INET_FLAGS_AUTOFLOWLABEL = 24, + INET_FLAGS_DONTFRAG = 25, + INET_FLAGS_RECVERR6 = 26, + INET_FLAGS_REPFLOW = 27, + INET_FLAGS_RTALERT_ISOLATE = 28, + INET_FLAGS_SNDFLOW = 29, + INET_FLAGS_RTALERT = 30, +}; /* cmsg flags for inet */ -#define IP_CMSG_PKTINFO BIT(0) -#define IP_CMSG_TTL BIT(1) -#define IP_CMSG_TOS BIT(2) -#define IP_CMSG_RECVOPTS BIT(3) -#define IP_CMSG_RETOPTS BIT(4) -#define IP_CMSG_PASSSEC BIT(5) -#define IP_CMSG_ORIGDSTADDR BIT(6) -#define IP_CMSG_CHECKSUM BIT(7) -#define IP_CMSG_RECVFRAGSIZE BIT(8) +#define IP_CMSG_PKTINFO BIT(INET_FLAGS_PKTINFO) +#define IP_CMSG_TTL BIT(INET_FLAGS_TTL) +#define IP_CMSG_TOS BIT(INET_FLAGS_TOS) +#define IP_CMSG_RECVOPTS BIT(INET_FLAGS_RECVOPTS) +#define IP_CMSG_RETOPTS BIT(INET_FLAGS_RETOPTS) +#define IP_CMSG_PASSSEC BIT(INET_FLAGS_PASSSEC) +#define IP_CMSG_ORIGDSTADDR 
BIT(INET_FLAGS_ORIGDSTADDR) +#define IP_CMSG_CHECKSUM BIT(INET_FLAGS_CHECKSUM) +#define IP_CMSG_RECVFRAGSIZE BIT(INET_FLAGS_RECVFRAGSIZE) + +#define IP_CMSG_ALL (IP_CMSG_PKTINFO | IP_CMSG_TTL | \ + IP_CMSG_TOS | IP_CMSG_RECVOPTS | \ + IP_CMSG_RETOPTS | IP_CMSG_PASSSEC | \ + IP_CMSG_ORIGDSTADDR | IP_CMSG_CHECKSUM | \ + IP_CMSG_RECVFRAGSIZE) + +static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) +{ + return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL; +} + +static inline dscp_t inet_sk_dscp(const struct inet_sock *inet) +{ + return inet_dsfield_to_dscp(READ_ONCE(inet->tos)); +} + +#define inet_test_bit(nr, sk) \ + test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet_set_bit(nr, sk) \ + set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet_clear_bit(nr, sk) \ + clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet_assign_bit(nr, sk, val) \ + assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) /** * sk_to_full_sk - Access to a full socket @@ -263,8 +327,10 @@ struct inet_sock { static inline struct sock *sk_to_full_sk(struct sock *sk) { #ifdef CONFIG_INET - if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = inet_reqsk(sk)->rsk_listener; + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) + sk = NULL; #endif return sk; } @@ -273,8 +339,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk) static inline const struct sock *sk_const_to_full_sk(const struct sock *sk) { #ifdef CONFIG_INET - if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = ((const struct request_sock *)sk)->rsk_listener; + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) + sk = NULL; #endif return sk; } @@ -284,10 +352,7 @@ static inline struct sock *skb_to_full_sk(const struct sk_buff *skb) return sk_to_full_sk(skb->sk); } -static inline struct inet_sock *inet_sk(const struct sock *sk) -{ - return (struct inet_sock *)sk; -} +#define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk) static inline void __inet_sk_copy_descendant(struct sock *sk_to, const struct sock *sk_from, @@ -344,7 +409,7 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { __u8 flags = 0; - if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl) + if (inet_test_bit(TRANSPARENT, sk) || inet_test_bit(HDRINCL, sk)) flags |= FLOWI_FLAG_ANYSRC; return flags; } @@ -369,8 +434,21 @@ static inline bool inet_get_convert_csum(struct sock *sk) static inline bool inet_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { - return net->ipv4.sysctl_ip_nonlocal_bind || - inet->freebind || inet->transparent; + return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) || + test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || + test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); +} + +static inline bool inet_addr_valid_or_nonlocal(struct net *net, + struct inet_sock *inet, + __be32 addr, + int addr_type) +{ + return inet_can_nonlocal_bind(net, inet) || + addr == htonl(INADDR_ANY) || + addr_type == RTN_LOCAL || + addr_type == RTN_MULTICAST || + addr_type == RTN_BROADCAST; } #endif /* _INET_SOCK_H */ diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index dfd919b3119e..67a313575780 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -45,6 +45,8 @@ struct inet_timewait_sock { #define tw_node __tw_common.skc_nulls_node #define tw_bind_node __tw_common.skc_bind_node #define tw_refcnt __tw_common.skc_refcnt 
+#define tw_tx_queue_mapping __tw_common.skc_tx_queue_mapping +#define tw_rx_queue_mapping __tw_common.skc_rx_queue_mapping #define tw_hash __tw_common.skc_hash #define tw_prot __tw_common.skc_prot #define tw_net __tw_common.skc_net @@ -58,22 +60,27 @@ struct inet_timewait_sock { #define tw_dr __tw_common.skc_tw_dr __u32 tw_mark; - volatile unsigned char tw_substate; + unsigned char tw_substate; unsigned char tw_rcv_wscale; /* Socket demultiplex comparisons on incoming packets. */ /* these three are in inet_sock */ __be16 tw_sport; /* And these are ours. */ - unsigned int tw_kill : 1, - tw_transparent : 1, + unsigned int tw_transparent : 1, tw_flowlabel : 20, + tw_usec_ts : 1, tw_pad : 2, /* 2 bits hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; + /** + * @tw_entry_stamp: Time of entry into %TCP_TIME_WAIT state in msec. + */ + u32 tw_entry_stamp; struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; + struct inet_bind2_bucket *tw_tb2; }; #define tw_tclass tw_tos @@ -92,17 +99,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, const int state); -void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, - struct inet_hashinfo *hashinfo); +void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo, + int timeo); void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm); -static inline void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) -{ - __inet_twsk_schedule(tw, timeo, false); -} - static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo) { __inet_twsk_schedule(tw, timeo, true); @@ -110,7 +114,7 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family); +void inet_twsk_purge(struct inet_hashinfo *hashinfo); static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 74ff688568a0..f475757daafb 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -96,30 +96,28 @@ static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, - const struct inetpeer_addr *daddr, - int create); + const struct inetpeer_addr *daddr); static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, - int vif, int create) + int vif) { struct inetpeer_addr daddr; daddr.a4.addr = v4daddr; daddr.a4.vif = vif; daddr.family = AF_INET; - return inet_getpeer(base, &daddr, create); + return inet_getpeer(base, &daddr); } static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, - const struct in6_addr *v6daddr, - int create) + const struct in6_addr *v6daddr) { struct inetpeer_addr daddr; daddr.a6 = *v6daddr; daddr.family = AF_INET6; - return inet_getpeer(base, &daddr, create); + return inet_getpeer(base, &daddr); } static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, diff --git a/include/net/ioam6.h b/include/net/ioam6.h new file mode 100644 index 000000000000..2cbbee6e806a --- /dev/null +++ b/include/net/ioam6.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * IPv6 IOAM implementation + * + * Author: + * Justin Iurman <justin.iurman@uliege.be> + */ + +#ifndef 
_NET_IOAM6_H +#define _NET_IOAM6_H + +#include <linux/net.h> +#include <linux/ipv6.h> +#include <linux/ioam6.h> +#include <linux/ioam6_genl.h> +#include <linux/rhashtable-types.h> + +struct ioam6_namespace { + struct rhash_head head; + struct rcu_head rcu; + + struct ioam6_schema __rcu *schema; + + __be16 id; + __be32 data; + __be64 data_wide; +}; + +struct ioam6_schema { + struct rhash_head head; + struct rcu_head rcu; + + struct ioam6_namespace __rcu *ns; + + u32 id; + int len; + __be32 hdr; + + u8 data[]; +}; + +struct ioam6_pernet_data { + struct mutex lock; + struct rhashtable namespaces; + struct rhashtable schemas; +}; + +static inline struct ioam6_pernet_data *ioam6_pernet(struct net *net) +{ +#if IS_ENABLED(CONFIG_IPV6) + return net->ipv6.ioam6_data; +#else + return NULL; +#endif +} + +struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id); +void ioam6_fill_trace_data(struct sk_buff *skb, + struct ioam6_namespace *ns, + struct ioam6_trace_hdr *trace, + bool is_input); + +int ioam6_init(void); +void ioam6_exit(void); + +int ioam6_iptunnel_init(void); +void ioam6_iptunnel_exit(void); + +void ioam6_event(enum ioam6_event_type type, struct net *net, gfp_t gfp, + void *opt, unsigned int opt_len); + +#endif /* _NET_IOAM6_H */ diff --git a/include/net/ip.h b/include/net/ip.h index 2d6b985d11cc..47ed6d23853d 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -24,6 +24,7 @@ #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/sockptr.h> +#include <linux/static_key.h> #include <net/inet_sock.h> #include <net/route.h> @@ -31,6 +32,8 @@ #include <net/flow.h> #include <net/flow_dissector.h> #include <net/netns/hash.h> +#include <net/lwtunnel.h> +#include <net/inet_dscp.h> #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ @@ -54,6 +57,8 @@ struct inet_skb_parm { #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_L3SLAVE BIT(7) +#define IPSKB_NOPOLICY BIT(8) +#define IPSKB_MULTIPATH BIT(9) u16 frag_max_size; }; @@ -73,9 +78,9 @@ struct ipcm_cookie { __be32 addr; int oif; struct ip_options_rcu *opt; + __u8 protocol; __u8 ttl; __s16 tos; - char priority; __u16 gso_size; }; @@ -87,19 +92,22 @@ static inline void ipcm_init(struct ipcm_cookie *ipcm) static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, const struct inet_sock *inet) { - ipcm_init(ipcm); + *ipcm = (struct ipcm_cookie) { + .tos = READ_ONCE(inet->tos), + }; + + sockcm_init(&ipcm->sockc, &inet->sk); - ipcm->sockc.mark = inet->sk.sk_mark; - ipcm->sockc.tsflags = inet->sk.sk_tsflags; - ipcm->oif = inet->sk.sk_bound_dev_if; + ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; + ipcm->protocol = inet->inet_num; } #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) #define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb)) /* return enslaved device index if relevant */ -static inline int inet_sdif(struct sk_buff *skb) +static inline int inet_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) @@ -217,8 +225,6 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4, unsigned int flags); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb); -ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, - int offset, size_t size, int flags); struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork); @@ -239,14 +245,17 
@@ static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4) return __ip_make_skb(sk, fl4, &sk->sk_write_queue, &inet_sk(sk)->cork.base); } -static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet) +/* Get the route scope that should be used when sending a packet. */ +static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, + const struct ipcm_cookie *ipc, + const struct msghdr *msg) { - return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(inet->tos); -} + if (sock_flag(&inet->sk, SOCK_LOCALROUTE) || + msg->msg_flags & MSG_DONTROUTE || + (ipc->opt && ipc->opt->opt.is_strictroute)) + return RT_SCOPE_LINK; -static inline __u8 get_rtconn_flags(struct ipcm_cookie* ipc, struct sock* sk) -{ - return (ipc->tos != -1) ? RT_CONN_FLAGS_TOS(sk, ipc->tos) : RT_CONN_FLAGS(sk); + return RT_SCOPE_UNIVERSE; } /* datagram.c */ @@ -273,11 +282,12 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, + struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, - unsigned int len, u64 transmit_time); + unsigned int len, u64 transmit_time, u32 txhash); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define __IP_INC_STATS(net, field) __SNMP_INC_STATS64((net)->mib.ip_statistics, field) @@ -290,7 +300,11 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, #define NET_ADD_STATS(net, field, adnd) SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd) #define __NET_ADD_STATS(net, field, adnd) __SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd) -u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offct); +static inline u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) +{ + return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); +} + unsigned long snmp_fold_field(void __percpu *mib, int offt); #if BITS_PER_LONG==32 u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, @@ -333,10 +347,17 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } -void inet_get_local_port_range(struct net *net, int *low, int *high); +static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) +{ + u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); + + *low = range & 0xffff; + *high = range >> 16; +} +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL -static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) +static inline bool inet_is_local_reserved_port(const struct net *net, unsigned short port) { if (!net->ipv4.sysctl_local_reserved_ports) return false; @@ -350,7 +371,7 @@ static inline bool sysctl_dev_name_is_allowed(const char *name) static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { - return port < net->ipv4.sysctl_ip_prot_sock; + return port < READ_ONCE(net->ipv4.sysctl_ip_prot_sock); } #else @@ -377,7 +398,7 @@ void ipfrag_init(void); void ip_static_sysctl_init(void); #define IP4_REPLY_MARK(net, mark) \ - ((net)->ipv4.sysctl_fwmark_reflect ? (mark) : 0) + (READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? 
(mark) : 0) static inline bool ip_is_fragment(const struct iphdr *iph) { @@ -398,9 +419,14 @@ int ip_decrease_ttl(struct iphdr *iph) return --iph->ttl; } +static inline dscp_t ip4h_dscp(const struct iphdr *ip4h) +{ + return inet_dsfield_to_dscp(ip4h->tos); +} + static inline int ip_mtu_locked(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *)dst; + const struct rtable *rt = dst_rtable(dst); return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU); } @@ -417,54 +443,80 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst) static inline bool ip_sk_accept_pmtu(const struct sock *sk) { - return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE && - inet_sk(sk)->pmtudisc != IP_PMTUDISC_OMIT; + u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); + + return pmtudisc != IP_PMTUDISC_INTERFACE && + pmtudisc != IP_PMTUDISC_OMIT; } static inline bool ip_sk_use_pmtu(const struct sock *sk) { - return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE; + return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE; } static inline bool ip_sk_ignore_df(const struct sock *sk) { - return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO || - inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT; + u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); + + return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT; } static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { - struct net *net = dev_net(dst->dev); - unsigned int mtu; + const struct rtable *rt = dst_rtable(dst); + unsigned int mtu, res; + struct net *net; + + rcu_read_lock(); - if (net->ipv4.sysctl_ip_fwd_use_pmtu || + net = dev_net_rcu(dst->dev); + if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || - !forwarding) - return dst_mtu(dst); + !forwarding) { + mtu = rt->rt_pmtu; + if (mtu && time_before(jiffies, rt->dst.expires)) + goto out; + } /* 'forwarding = true' case should always honour route mtu */ mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) - return mtu; + goto out; + + mtu = READ_ONCE(dst->dev->mtu); - return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU); + if (unlikely(ip_mtu_locked(dst))) { + if (rt->rt_uses_gateway && mtu > 576) + mtu = 576; + } + +out: + mtu = min_t(unsigned int, mtu, IP_MAX_MTU); + + res = mtu - lwtunnel_headroom(dst->lwtstate, mtu); + + rcu_read_unlock(); + + return res; } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, const struct sk_buff *skb) { + unsigned int mtu; + if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) { bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); } - return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); + mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); + return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); } -struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, - int fc_mx_len, +struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack); static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics) { @@ -494,7 +546,6 @@ void ip_dst_metrics_put(struct dst_entry *dst) kfree(p); } -u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, @@ -502,19 +553,29 @@ static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); - if ((iph->frag_off & htons(IP_DF)) 
&& !skb->ignore_df) {
-		/* This is only to work around buggy Windows95/2000
-		 * VJ compression implementations. If the ID field
-		 * does not change, they drop every other packet in
-		 * a TCP stream using header compression.
+	/* We had many attacks based on IPID, use the private
+	 * generator as much as we can.
+	 */
+	if (sk && inet_sk(sk)->inet_daddr) {
+		int val;
+
+		/* avoid atomic operations for TCP,
+		 * as we hold socket lock at this point.
 		 */
-		if (sk && inet_sk(sk)->inet_daddr) {
-			iph->id = htons(inet_sk(sk)->inet_id);
-			inet_sk(sk)->inet_id += segs;
+		if (sk_is_tcp(sk)) {
+			sock_owned_by_me(sk);
+			val = atomic_read(&inet_sk(sk)->inet_id);
+			atomic_set(&inet_sk(sk)->inet_id, val + segs);
 		} else {
-			iph->id = 0;
+			val = atomic_add_return(segs, &inet_sk(sk)->inet_id);
 		}
+		iph->id = htons(val);
+		return;
+	}
+	if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) {
+		iph->id = 0;
 	} else {
+		/* Unfortunately we need the big hammer to get a suitable IPID */
 		__ip_select_ident(net, iph, segs);
 	}
 }
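The ip_select_ident_segs() rework above reserves one IP ID per GSO segment from a per-socket counter. A C11 sketch of just the reservation arithmetic (counter name and start value are invented; the kernel also special-cases TCP, which skips the atomic RMW because the socket lock already serializes writers):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Each segment consumes one IP ID, so a packet of 'segs' segments
 * reserves a contiguous block; atomic_add_return() in the kernel hands
 * back the post-add value, modelled here by fetch_add + segs.
 */
static atomic_int inet_id = 1000;

static uint16_t reserve_ids(int segs)
{
	int val = atomic_fetch_add(&inet_id, segs) + segs;

	return (uint16_t)val;	/* the kernel stores htons(val) in iph->id */
}

int main(void)
{
	printf("%u\n", reserve_ids(3));	/* 1003 */
	printf("%u\n", reserve_ids(1));	/* 1004 */
	return 0;
}

@@ -541,18 +602,10 @@ static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow,
 	BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) !=
 		     offsetof(typeof(flow->addrs), v4addrs.src) +
 		     sizeof(flow->addrs.v4addrs.src));
-	memcpy(&flow->addrs.v4addrs, &iph->saddr, sizeof(flow->addrs.v4addrs));
+	memcpy(&flow->addrs.v4addrs, &iph->addrs, sizeof(flow->addrs.v4addrs));
 	flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
 }

-static inline __wsum inet_gro_compute_pseudo(struct sk_buff *skb, int proto)
-{
-	const struct iphdr *iph = skb_gro_network_header(skb);
-
-	return csum_tcpudp_nofold(iph->saddr, iph->daddr,
-				  skb_gro_len(skb), proto, 0);
-}
-
 /*
  *	Map a multicast IP onto multicast MAC for type ethernet.
  */
@@ -633,11 +686,24 @@ static __inline__ void inet_reset_saddr(struct sock *sk)

 #endif

+#if IS_MODULE(CONFIG_IPV6)
+#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X)
+#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X)
+#else
+#define EXPORT_IPV6_MOD(X)
+#define EXPORT_IPV6_MOD_GPL(X)
+#endif
+
 static inline unsigned int ipv4_addr_hash(__be32 ip)
 {
 	return (__force unsigned int) ip;
 }

+static inline u32 __ipv4_addr_hash(const __be32 ip, const u32 initval)
+{
+	return jhash_1word((__force u32)ip, initval);
+}
+
 static inline u32 ipv4_portaddr_hash(const struct net *net,
 				     __be32 saddr,
 				     unsigned int port)
@@ -698,7 +764,7 @@ int ip_forward(struct sk_buff *skb);
 */
 void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
-		      __be32 daddr, struct rtable *rt, int is_frag);
+		      __be32 daddr, struct rtable *rt);

 int __ip_options_echo(struct net *net, struct ip_options *dopt,
 		      struct sk_buff *skb, const struct ip_options *sopt);
@@ -723,13 +789,18 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev);
 *	Functions provided by ip_sockglue.c
 */

-void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb);
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst);
 void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
 			 struct sk_buff *skb, int tlen, int offset);
 int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
 		 struct ipcm_cookie *ipc, bool allow_ipv6);
+DECLARE_STATIC_KEY_FALSE(ip4_min_ttl);
+int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+		     unsigned int optlen);
 int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 		  unsigned int optlen);
+int do_ip_getsockopt(struct sock *sk, int level, int optname,
+		     sockptr_t optval, sockptr_t optlen);
 int ip_getsockopt(struct sock *sk, int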
level, int optname, char __user *optval, int __user *optlen); int ip_ra_control(struct sock *sk, unsigned char on, @@ -746,9 +817,8 @@ static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0); } -bool icmp_global_allow(void); -extern int sysctl_icmp_msgs_per_sec; -extern int sysctl_icmp_msgs_burst; +bool icmp_global_allow(struct net *net); +void icmp_global_consume(struct net *net); #ifdef CONFIG_PROC_FS int ip_misc_proc_init(void); @@ -767,5 +837,6 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val); void ip_sock_set_pktinfo(struct sock *sk); void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); +void __ip_sock_set_tos(struct sock *sk, int val); #endif /* _IP_H */ diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h index b3f4eaa88672..c8a96b888277 100644 --- a/include/net/ip6_checksum.h +++ b/include/net/ip6_checksum.h @@ -43,14 +43,6 @@ static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto) skb->len, proto, 0)); } -static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto) -{ - const struct ipv6hdr *iph = skb_gro_network_header(skb); - - return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr, - skb_gro_len(skb), proto, 0)); -} - static __inline__ __sum16 tcp_v6_check(int len, const struct in6_addr *saddr, const struct in6_addr *daddr, @@ -65,15 +57,9 @@ static inline void __tcp_v6_send_check(struct sk_buff *skb, { struct tcphdr *th = tcp_hdr(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - } else { - th->check = tcp_v6_check(skb->len, saddr, daddr, - csum_partial(th, th->doff << 2, - skb->csum)); - } + th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); } static inline void tcp_v6_gso_csum_prep(struct sk_buff *skb) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ac5ff3c3afb1..88b0dd4d8e09 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -20,6 +20,7 @@ #include <net/inetpeer.h> #include <net/fib_notifier.h> #include <linux/indirect_call_wrapper.h> +#include <uapi/linux/bpf.h> #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_HASHSZ 256 @@ -29,12 +30,6 @@ #define RT6_DEBUG 2 -#if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) pr_debug(x) -#else -#define RT6_TRACE(x...) 
do { ; } while (0) -#endif - struct rt6_info; struct fib6_info; @@ -178,6 +173,9 @@ struct fib6_info { refcount_t fib6_ref; unsigned long expires; + + struct hlist_node gc_link; + struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] @@ -189,14 +187,18 @@ struct fib6_info { u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; + + u8 offload; + u8 trap; + u8 offload_failed; + u8 should_flush:1, dst_nocount:1, dst_nopolicy:1, fib6_destroying:1, - offload:1, - trap:1, - unused:2; + unused:4; + struct list_head purge_link; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; @@ -213,9 +215,6 @@ struct rt6_info { struct inet6_dev *rt6i_idev; u32 rt6i_flags; - struct list_head rt6i_uncached; - struct uncached_list *rt6i_uncached_list; - /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; }; @@ -236,9 +235,11 @@ struct fib6_result { for (rt = (w)->leaf; rt; \ rt = rcu_dereference_protected(rt->fib6_next, 1)) -static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) +#define dst_rt6_info(_ptr) container_of_const(_ptr, struct rt6_info, dst) + +static inline struct inet6_dev *ip6_dst_idev(const struct dst_entry *dst) { - return ((struct rt6_info *)dst)->rt6i_idev; + return dst_rt6_info(dst)->rt6i_idev; } static inline bool fib6_requires_src(const struct fib6_info *rt) @@ -246,12 +247,18 @@ static inline bool fib6_requires_src(const struct fib6_info *rt) return rt->fib6_src.plen > 0; } +/* The callers should hold f6i->fib6_table->tb6_lock if a route has ever + * been added to a table before. + */ static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } +/* The callers should hold f6i->fib6_table->tb6_lock if a route has ever + * been added to a table before. + */ static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { @@ -266,7 +273,7 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) return false; } -/* Function to safely get fn->sernum for passed in rt +/* Function to safely get fn->fn_sernum for passed in rt * and store result in passed in cookie. 
* Return true if we can get cookie safely * Return false if not @@ -280,8 +287,8 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, fn = rcu_dereference(f6i->fib6_node); if (fn) { - *cookie = fn->fn_sernum; - /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ + *cookie = READ_ONCE(fn->fn_sernum); + /* pairs with smp_wmb() in __fib6_update_sernum_upto_root() */ smp_rmb(); status = true; } @@ -332,15 +339,10 @@ static inline bool fib6_info_hold_safe(struct fib6_info *f6i) static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) - call_rcu(&f6i->rcu, fib6_info_destroy_rcu); -} - -static inline void fib6_info_hw_flags_set(struct fib6_info *f6i, bool offload, - bool trap) -{ - f6i->offload = offload; - f6i->trap = trap; + if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { + DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); + call_rcu_hurry(&f6i->rcu, fib6_info_destroy_rcu); + } } enum fib6_walk_state { @@ -372,9 +374,8 @@ struct rt6_statistics { __u32 fib_rt_cache; /* cached rt entries in exception table */ __u32 fib_discarded_routes; /* total number of routes delete */ - /* The following stats are not protected by any lock */ + /* The following stat is not protected by any lock */ atomic_t fib_rt_alloc; /* total number of routes alloced */ - atomic_t fib_rt_uncache; /* rt entries in uncached list */ }; #define RTN_TL_ROOT 0x0001 @@ -394,7 +395,8 @@ struct fib6_table { struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; - unsigned int fib_seq; + unsigned int fib_seq; /* writes protected by rtnl_mutex */ + struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; @@ -476,13 +478,10 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr) rcu_read_lock(); from = rcu_dereference(rt->from); - if (from) { + if (from) *addr = from->fib6_prefsrc.addr; - } else { - struct in6_addr in6_zero = {}; - - *addr = in6_zero; - } + else + *addr = in6addr_any; rcu_read_unlock(); } @@ -491,6 +490,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib6_nh_release(struct fib6_nh *fib6_nh); +void fib6_nh_release_dsts(struct fib6_nh *fib6_nh); int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, @@ -513,6 +513,38 @@ void fib6_gc_cleanup(void); int fib6_init(void); +/* Add the route to the gc list if it is not already there + * + * The callers should hold f6i->fib6_table->tb6_lock. + */ +static inline void fib6_add_gc_list(struct fib6_info *f6i) +{ + /* If fib6_node is null, the f6i is not in (or removed from) the + * table. + * + * There is a gap between finding the f6i from the table and + * calling this function without the protection of the tb6_lock. + * This check makes sure the f6i is not added to the gc list when + * it is not on the table. + */ + if (!rcu_dereference_protected(f6i->fib6_node, + lockdep_is_held(&f6i->fib6_table->tb6_lock))) + return; + + if (hlist_unhashed(&f6i->gc_link)) + hlist_add_head(&f6i->gc_link, &f6i->fib6_table->tb6_gc_hlist); +} + +/* Remove the route from the gc list if it is on the list. + * + * The callers should hold f6i->fib6_table->tb6_lock. 
+ */
+static inline void fib6_remove_gc_list(struct fib6_info *f6i)
+{
+	if (!hlist_unhashed(&f6i->gc_link))
+		hlist_del_init(&f6i->gc_link);
+}
+
 struct ipv6_route_iter {
 	struct seq_net_private p;
 	struct fib6_walker w;
@@ -532,7 +564,7 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
 int __net_init fib6_notifier_init(struct net *net);
 void __net_exit fib6_notifier_exit(struct net *net);

-unsigned int fib6_tables_seq_read(struct net *net);
+unsigned int fib6_tables_seq_read(const struct net *net);
 int fib6_tables_dump(struct net *net, struct notifier_block *nb,
 		     struct netlink_ext_ack *extack);
@@ -545,6 +577,8 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
 {
 	return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric));
 }
+void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
+			    bool offload, bool trap, bool offload_failed);

 #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
 struct bpf_iter__ipv6_route {
@@ -599,7 +633,7 @@ void fib6_rules_cleanup(void);
 bool fib6_rule_default(const struct fib_rule *rule);
 int fib6_rules_dump(struct net *net, struct notifier_block *nb,
 		    struct netlink_ext_ack *extack);
-unsigned int fib6_rules_seq_read(struct net *net);
+unsigned int fib6_rules_seq_read(const struct net *net);

 static inline bool fib6_rules_early_flow_dissect(struct net *net,
						  struct sk_buff *skb,
@@ -611,7 +645,10 @@ static inline bool fib6_rules_early_flow_dissect(struct net *net,
 	if (!net->ipv6.fib6_rules_require_fldissect)
 		return false;

-	skb_flow_dissect_flow_keys(skb, flkeys, flag);
+	memset(flkeys, 0, sizeof(*flkeys));
+	__skb_flow_dissect(net, skb, &flow_keys_dissector,
+			   flkeys, NULL, 0, 0, 0, flag);
+
 	fl6->fl6_sport = flkeys->ports.src;
 	fl6->fl6_dport = flkeys->ports.dst;
 	fl6->flowi6_proto = flkeys->basic.ip_proto;
@@ -640,7 +677,7 @@ static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb,
 {
 	return 0;
 }
-static inline unsigned int fib6_rules_seq_read(struct net *net)
+static inline unsigned int fib6_rules_seq_read(const struct net *net)
 {
 	return 0;
 }
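Before the ip6_route.h changes: the fib6_add_gc_list()/fib6_remove_gc_list() helpers above hinge on hlist_unhashed() — a node whose pprev is NULL is off the list — which makes add and remove idempotent under tb6_lock. A single-threaded userspace model of that idiom (types and names are simplified stand-ins for the kernel's hlist, not its real implementation):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Minimal intrusive hlist: pprev points back at whatever points at us,
 * and a NULL pprev marks the node as "unhashed" (not on any list).
 */
struct hnode {
	struct hnode *next;
	struct hnode **pprev;
};

static bool unhashed(const struct hnode *n)
{
	return !n->pprev;
}

static void hnode_add_head(struct hnode *n, struct hnode **head)
{
	n->next = *head;
	if (*head)
		(*head)->pprev = &n->next;
	*head = n;
	n->pprev = head;
}

static void hnode_del_init(struct hnode *n)
{
	*n->pprev = n->next;
	if (n->next)
		n->next->pprev = n->pprev;
	n->next = NULL;
	n->pprev = NULL;	/* back to "unhashed" */
}

int main(void)
{
	struct hnode *head = NULL;
	struct hnode a = { NULL, NULL };

	if (unhashed(&a))	/* first add goes through */
		hnode_add_head(&a, &head);
	if (unhashed(&a))	/* second add is skipped: already linked */
		hnode_add_head(&a, &head);
	if (!unhashed(&a))	/* remove only if linked */
		hnode_del_init(&a);
	printf("on list: %d\n", !unhashed(&a));	/* 0 */
	return 0;
}

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 2a5277758379..6dbdf60b342f 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -2,6 +2,16 @@
 #ifndef _NET_IP6_ROUTE_H
 #define _NET_IP6_ROUTE_H

+#include <net/addrconf.h>
+#include <net/flow.h>
+#include <net/ip6_fib.h>
+#include <net/sock.h>
+#include <net/lwtunnel.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/route.h>
+#include <net/nexthop.h>
+
 struct route_info {
 	__u8			type;
 	__u8			length;
@@ -19,16 +29,6 @@ struct route_info {
 	__u8			prefix[];	/* 0,8 or 16 */
 };

-#include <net/addrconf.h>
-#include <net/flow.h>
-#include <net/ip6_fib.h>
-#include <net/sock.h>
-#include <net/lwtunnel.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/route.h>
-#include <net/nexthop.h>
-
 #define RT6_LOOKUP_F_IFACE		0x00000001
 #define RT6_LOOKUP_F_REACHABLE		0x00000002
 #define RT6_LOOKUP_F_HAS_SADDR		0x00000004
@@ -53,13 +53,12 @@ struct route_info {
 */
 static inline int rt6_srcprefs2flags(unsigned int srcprefs)
 {
-	/* No need to bitmask because srcprefs have only 3 bits.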
*/ - return srcprefs << 3; + return (srcprefs & IPV6_PREFER_SRC_MASK) << 3; } static inline unsigned int rt6_flags2srcprefs(int flags) { - return (flags >> 3) & 7; + return (flags >> 3) & IPV6_PREFER_SRC_MASK; } static inline bool rt6_need_strict(const struct in6_addr *daddr) @@ -84,10 +83,6 @@ struct dst_entry *ip6_route_input_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags); -struct dst_entry *ip6_route_output_flags_noref(struct net *net, - const struct sock *sk, - struct flowi6 *fl6, int flags); - struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); @@ -104,7 +99,7 @@ static inline struct dst_entry *ip6_route_output(struct net *net, static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags) { if (!(flags & RT6_LOOKUP_F_DST_NOREF) || - !list_empty(&rt->rt6i_uncached)) + !list_empty(&rt->dst.rt_uncached)) ip6_rt_put(rt); } @@ -132,18 +127,26 @@ void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, - unsigned int prefs, + unsigned int prefs, int l3mdev_index, struct in6_addr *saddr) { + struct net_device *l3mdev; + struct net_device *dev; + bool same_vrf; int err = 0; - if (f6i && f6i->fib6_prefsrc.plen) { + rcu_read_lock(); + + l3mdev = dev_get_by_index_rcu(net, l3mdev_index); + if (!f6i || !f6i->fib6_prefsrc.plen || l3mdev) + dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + same_vrf = !l3mdev || l3mdev_master_dev_rcu(dev) == l3mdev; + if (f6i && f6i->fib6_prefsrc.plen && same_vrf) *saddr = f6i->fib6_prefsrc.addr; - } else { - struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + else + err = ipv6_dev_get_saddr(net, same_vrf ? 
dev : l3mdev, daddr, prefs, saddr); - err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr); - } + rcu_read_unlock(); return err; } @@ -160,7 +163,7 @@ void fib6_force_start_gc(struct net *net); struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, - gfp_t gfp_flags); + gfp_t gfp_flags, struct netlink_ext_ack *extack); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); @@ -174,7 +177,9 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, struct net_device *dev); struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, - struct net_device *dev, unsigned int pref); + struct net_device *dev, unsigned int pref, + u32 defrtr_usr_metric, + int lifetime); void rt6_purge_dflt_routers(struct net *net); @@ -213,12 +218,11 @@ void rt6_uncached_list_del(struct rt6_info *rt); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { const struct dst_entry *dst = skb_dst(skb); - const struct rt6_info *rt6 = NULL; if (dst) - rt6 = container_of(dst, struct rt6_info, dst); + return dst_rt6_info(dst); - return rt6; + return NULL; } /* @@ -230,7 +234,7 @@ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, { struct ipv6_pinfo *np = inet6_sk(sk); - np->dst_cookie = rt6_get_cookie((struct rt6_info *)dst); + np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES @@ -243,7 +247,7 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, static inline bool ipv6_unicast_destination(const struct sk_buff *skb) { - struct rt6_info *rt = (struct rt6_info *) skb_dst(skb); + const struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); return rt->rt6i_flags & RTF_LOCAL; } @@ -251,7 +255,7 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb) static inline bool ipv6_anycast_destination(const struct dst_entry *dst, const struct in6_addr *daddr) { - struct rt6_info *rt = (struct rt6_info *)dst; + const struct rt6_info *rt = dst_rt6_info(dst); return rt->rt6i_flags & RTF_ANYCAST || (rt->rt6i_dst.plen < 127 && @@ -262,25 +266,36 @@ static inline bool ipv6_anycast_destination(const struct dst_entry *dst, int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); -static inline int ip6_skb_dst_mtu(struct sk_buff *skb) +static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) { - struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + const struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; + const struct dst_entry *dst = skb_dst(skb); + unsigned int mtu; - return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ? 
-		skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
+	if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) {
+		mtu = READ_ONCE(dst->dev->mtu);
+		mtu -= lwtunnel_headroom(dst->lwtstate, mtu);
+	} else {
+		mtu = dst_mtu(dst);
+	}
+	return mtu;
 }

 static inline bool ip6_sk_accept_pmtu(const struct sock *sk)
 {
-	return inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_INTERFACE &&
-	       inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_OMIT;
+	u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc);
+
+	return pmtudisc != IPV6_PMTUDISC_INTERFACE &&
+	       pmtudisc != IPV6_PMTUDISC_OMIT;
 }

 static inline bool ip6_sk_ignore_df(const struct sock *sk)
 {
-	return inet6_sk(sk)->pmtudisc < IPV6_PMTUDISC_DO ||
-	       inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT;
+	u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc);
+
+	return pmtudisc < IPV6_PMTUDISC_DO ||
+	       pmtudisc == IPV6_PMTUDISC_OMIT;
 }
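Note the pattern in ip6_sk_accept_pmtu()/ip6_sk_ignore_df() above: pmtudisc is snapshotted once into a local, so both comparisons test the same value even if a concurrent setsockopt() changes it. A small C11 model (a relaxed atomic load stands in for the kernel's READ_ONCE; the numeric values are assumed to mirror the Linux UAPI constants):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PMTUDISC_DO		2
#define PMTUDISC_INTERFACE	4
#define PMTUDISC_OMIT		5

static _Atomic unsigned char sk_pmtudisc = PMTUDISC_DO;

/* Load once, then test twice against the same snapshot: without the
 * local, the two reads could observe different values mid-update.
 */
static bool sk_accept_pmtu(void)
{
	unsigned char pmtudisc =
		atomic_load_explicit(&sk_pmtudisc, memory_order_relaxed);

	return pmtudisc != PMTUDISC_INTERFACE && pmtudisc != PMTUDISC_OMIT;
}

int main(void)
{
	printf("%d\n", sk_accept_pmtu());	/* 1 */
	return 0;
}

 static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt,
@@ -308,25 +323,27 @@ static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *
 	       !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws);
 }

-static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
+static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst,
+						     bool forwarding)
 {
 	struct inet6_dev *idev;
 	unsigned int mtu;

-	if (dst_metric_locked(dst, RTAX_MTU)) {
+	if (!forwarding || dst_metric_locked(dst, RTAX_MTU)) {
 		mtu = dst_metric_raw(dst, RTAX_MTU);
 		if (mtu)
-			return mtu;
+			goto out;
 	}

 	mtu = IPV6_MIN_MTU;
 	rcu_read_lock();
 	idev = __in6_dev_get(dst->dev);
 	if (idev)
-		mtu = idev->cnf.mtu6;
+		mtu = READ_ONCE(idev->cnf.mtu6);
 	rcu_read_unlock();

-	return mtu;
+out:
+	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
 }

 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 028eaea1c854..399592405c72 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -30,8 +30,8 @@ struct __ip6_tnl_parm {
 	struct in6_addr laddr;	/* local tunnel end-point address */
 	struct in6_addr raddr;	/* remote tunnel end-point address */

-	__be16			i_flags;
-	__be16			o_flags;
+	IP_TUNNEL_DECLARE_FLAGS(i_flags);
+	IP_TUNNEL_DECLARE_FLAGS(o_flags);
 	__be32			i_key;
 	__be32			o_key;

@@ -46,6 +46,7 @@ struct __ip6_tnl_parm {
 struct ip6_tnl {
 	struct ip6_tnl __rcu *next;	/* next tunnel in list */
 	struct net_device *dev;	/* virtual device associated with tunnel */
+	netdevice_tracker dev_tracker;
 	struct net *net;	/* netns for packet i/o */
 	struct __ip6_tnl_parm parms;	/* tunnel configuration parameters */
 	struct flowi fl;	/* flowi template for xmit */
@@ -57,7 +58,7 @@ struct ip6_tnl {

 	/* These fields used only by GRE */
 	__u32 i_seqno;	/* The last seen seqno */
-	__u32 o_seqno;	/* The last output seqno */
+	atomic_t o_seqno;	/* The last output seqno */
 	int hlen;       /* tun_hlen + encap_hlen */
 	int tun_hlen;	/* Precalculated header length */
 	int encap_hlen;	/* Encap header length (FOU,GUE) */
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 2ec062aaa978..48bb3cf41469 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -17,14 +17,17 @@
 #include <linux/rcupdate.h>
 #include <net/fib_notifier.h>
 #include <net/fib_rules.h>
+#include <net/inet_dscp.h>
 #include <net/inetpeer.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/refcount.h>
+#include <linux/ip.h>
+#include <linux/in_route.h>

 struct fib_config {
 	u8			fc_dst_len;
-	u8			fc_tos;
+	dscp_t			fc_dscp;
 	u8			fc_protocol;
 	u8			fc_scope;
 	u8			fc_type;
@@ -79,6 +82,7 @@ struct fnhe_hash_bucket {

 struct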
fib_nh_common { struct net_device *nhc_dev; + netdevice_tracker nhc_dev_tracker; int nhc_oif; unsigned char nhc_scope; u8 nhc_family; @@ -111,6 +115,7 @@ struct fib_nh { int nh_saddr_genid; #define fib_nh_family nh_common.nhc_family #define fib_nh_dev nh_common.nhc_dev +#define fib_nh_dev_tracker nh_common.nhc_dev_tracker #define fib_nh_oif nh_common.nhc_oif #define fib_nh_flags nh_common.nhc_flags #define fib_nh_lws nh_common.nhc_lwtstate @@ -133,7 +138,7 @@ struct fib_info { struct hlist_node fib_lhash; struct list_head nh_list; struct net *fib_net; - int fib_treeref; + refcount_t fib_treeref; refcount_t fib_clntref; unsigned int fib_flags; unsigned char fib_dead; @@ -151,11 +156,14 @@ struct fib_info { int fib_nhs; bool fib_nh_is_v6; bool nh_updated; + bool pfsrc_removed; struct nexthop *nh; struct rcu_head rcu; - struct fib_nh fib_nh[]; + struct fib_nh fib_nh[] __counted_by(fib_nhs); }; +int __net_init fib4_semantics_init(struct net *net); +void __net_exit fib4_semantics_exit(struct net *net); #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; @@ -169,6 +177,7 @@ struct fib_result { unsigned char type; unsigned char scope; u32 tclassid; + dscp_t dscp; struct fib_nh_common *nhc; struct fib_info *fi; struct fib_table *table; @@ -209,11 +218,12 @@ struct fib_rt_info { u32 tb_id; __be32 dst; int dst_len; - u8 tos; + dscp_t dscp; u8 type; u8 offload:1, trap:1, - unused:6; + offload_failed:1, + unused:5; }; struct fib_entry_notifier_info { @@ -221,7 +231,7 @@ struct fib_entry_notifier_info { u32 dst; int dst_len; struct fib_info *fi; - u8 tos; + dscp_t dscp; u8 type; u32 tb_id; }; @@ -259,6 +269,7 @@ struct fib_dump_filter { bool filter_set; bool dump_routes; bool dump_exceptions; + bool rtnl_held; unsigned char protocol; unsigned char rt_type; unsigned int flags; @@ -338,7 +349,7 @@ static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, return 0; } -static inline unsigned int fib4_rules_seq_read(struct net *net) +static inline unsigned int fib4_rules_seq_read(const struct net *net) { return 0; } @@ -402,7 +413,7 @@ static inline bool fib4_has_custom_rules(const struct net *net) bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -unsigned int fib4_rules_seq_read(struct net *net); +unsigned int fib4_rules_seq_read(const struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, @@ -414,7 +425,10 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, if (!net->ipv4.fib_rules_require_fldissect) return false; - skb_flow_dissect_flow_keys(skb, flkeys, flag); + memset(flkeys, 0, sizeof(*flkeys)); + __skb_flow_dissect(net, skb, &flow_keys_dissector, + flkeys, NULL, 0, 0, 0, flag); + fl4->fl4_sport = flkeys->ports.src; fl4->fl4_dport = flkeys->ports.dst; fl4->flowi4_proto = flkeys->basic.ip_proto; @@ -424,6 +438,11 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, #endif /* CONFIG_IP_MULTIPLE_TABLES */ +static inline bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4) +{ + return dscp == inet_dsfield_to_dscp(RT_TOS(fl4->flowi4_tos)); +} + /* Exported by fib_frontend.c */ extern const struct nla_policy rtm_ipv4_policy[]; void ip_fib_init(void); @@ -432,12 +451,25 @@ int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct 
sk_buff *skb, __be32 src, __be32 dst,
-			u8 tos, int oif, struct net_device *dev,
+			dscp_t dscp, int oif, struct net_device *dev,
 			struct in_device *idev, u32 *itag);
+
+static inline enum skb_drop_reason
+fib_validate_source_reason(struct sk_buff *skb, __be32 src, __be32 dst,
+			   dscp_t dscp, int oif, struct net_device *dev,
+			   struct in_device *idev, u32 *itag)
+{
+	int err = fib_validate_source(skb, src, dst, dscp, oif, dev, idev,
+				      itag);
+	if (err < 0)
+		return -err;
+	return SKB_NOT_DROPPED_YET;
+}
+
 #ifdef CONFIG_IP_ROUTE_CLASSID
 static inline int fib_num_tclassid_users(struct net *net)
 {
-	return net->ipv4.fib_num_tclassid_users;
+	return atomic_read(&net->ipv4.fib_num_tclassid_users);
 }
 #else
 static inline int fib_num_tclassid_users(struct net *net)
@@ -465,13 +497,85 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags);
 void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
 void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig);

+/* Fields used for sysctl_fib_multipath_hash_fields.
+ * Common to IPv4 and IPv6.
+ *
+ * Add new fields at the end. This is user API.
+ */
+#define FIB_MULTIPATH_HASH_FIELD_SRC_IP		BIT(0)
+#define FIB_MULTIPATH_HASH_FIELD_DST_IP		BIT(1)
+#define FIB_MULTIPATH_HASH_FIELD_IP_PROTO	BIT(2)
+#define FIB_MULTIPATH_HASH_FIELD_FLOWLABEL	BIT(3)
+#define FIB_MULTIPATH_HASH_FIELD_SRC_PORT	BIT(4)
+#define FIB_MULTIPATH_HASH_FIELD_DST_PORT	BIT(5)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP	BIT(6)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP	BIT(7)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO	BIT(8)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL BIT(9)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT	BIT(10)
+#define FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT	BIT(11)
+
+#define FIB_MULTIPATH_HASH_FIELD_OUTER_MASK	\
+	(FIB_MULTIPATH_HASH_FIELD_SRC_IP |	\
+	 FIB_MULTIPATH_HASH_FIELD_DST_IP |	\
+	 FIB_MULTIPATH_HASH_FIELD_IP_PROTO |	\
+	 FIB_MULTIPATH_HASH_FIELD_FLOWLABEL |	\
+	 FIB_MULTIPATH_HASH_FIELD_SRC_PORT |	\
+	 FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+
+#define FIB_MULTIPATH_HASH_FIELD_INNER_MASK	\
+	(FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
+
+#define FIB_MULTIPATH_HASH_FIELD_ALL_MASK	\
+	(FIB_MULTIPATH_HASH_FIELD_OUTER_MASK |	\
+	 FIB_MULTIPATH_HASH_FIELD_INNER_MASK)
+
+#define FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK	\
+	(FIB_MULTIPATH_HASH_FIELD_SRC_IP |	\
+	 FIB_MULTIPATH_HASH_FIELD_DST_IP |	\
+	 FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 		       const struct sk_buff *skb, struct flow_keys *flkeys);
+
+static inline void
+fib_multipath_hash_construct_key(siphash_key_t *key, u32 mp_seed)
+{
+	u64 mp_seed_64 = mp_seed;
+
+	key->key[0] = (mp_seed_64 << 32) | mp_seed_64;
+	key->key[1] = key->key[0];
+}
+
+static inline u32 fib_multipath_hash_from_keys(const struct net *net,
+					       struct flow_keys *keys)
+{
+	siphash_aligned_key_t hash_key;
+	u32 mp_seed;
+
+	mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed;
+	fib_multipath_hash_construct_key(&hash_key, mp_seed);
+
+	return flow_hash_from_keys_seed(keys, &hash_key);
+}
+#else
+static inline u32 fib_multipath_hash_from_keys(const struct net *net,
+					       struct flow_keys *keys)
+{
+	return flow_hash_from_keys(keys);
+}
 #endif
+
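fib_multipath_hash_construct_key() above simply replicates the 32-bit sysctl seed into both 64-bit halves of the 128-bit siphash key. The expansion in isolation, as a runnable sketch (struct and function names are improvised):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Model of the seed expansion: duplicate the 32-bit seed into each half
 * of the key, so the full 128 bits are a deterministic function of the
 * one user-visible seed value.
 */
struct sip_key {
	uint64_t key[2];
};

static void construct_key(struct sip_key *key, uint32_t mp_seed)
{
	uint64_t mp_seed_64 = mp_seed;

	key->key[0] = (mp_seed_64 << 32) | mp_seed_64;
	key->key[1] = key->key[0];
}

int main(void)
{
	struct sip_key k;

	construct_key(&k, 0xabcd1234);
	printf("%" PRIx64 " %" PRIx64 "\n", k.key[0], k.key[1]);
	/* abcd1234abcd1234 abcd1234abcd1234 */
	return 0;
}

 int fib_check_nh(struct net *net, struct fib_nh *nh,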
u32 table, u8 scope, struct netlink_ext_ack *extack); -void fib_select_multipath(struct fib_result *res, int hash); +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); @@ -553,5 +657,5 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, u8 rt_family, unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, - int nh_weight, u8 rt_family); + int nh_weight, u8 rt_family, u32 nh_tclassid); #endif /* _NET_FIB_H */ diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 02ccd32542d0..0c3d571a04a1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -36,6 +36,24 @@ (sizeof_field(struct ip_tunnel_key, u) - \ sizeof_field(struct ip_tunnel_key, u.ipv4)) +#define __ipt_flag_op(op, ...) \ + op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM) + +#define IP_TUNNEL_DECLARE_FLAGS(...) \ + __ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__) + +#define ip_tunnel_flags_zero(...) __ipt_flag_op(bitmap_zero, __VA_ARGS__) +#define ip_tunnel_flags_copy(...) __ipt_flag_op(bitmap_copy, __VA_ARGS__) +#define ip_tunnel_flags_and(...) __ipt_flag_op(bitmap_and, __VA_ARGS__) +#define ip_tunnel_flags_or(...) __ipt_flag_op(bitmap_or, __VA_ARGS__) + +#define ip_tunnel_flags_empty(...) \ + __ipt_flag_op(bitmap_empty, __VA_ARGS__) +#define ip_tunnel_flags_intersect(...) \ + __ipt_flag_op(bitmap_intersects, __VA_ARGS__) +#define ip_tunnel_flags_subset(...) \ + __ipt_flag_op(bitmap_subset, __VA_ARGS__) + struct ip_tunnel_key { __be64 tun_id; union { @@ -48,12 +66,21 @@ struct ip_tunnel_key { struct in6_addr dst; } ipv6; } u; - __be16 tun_flags; + IP_TUNNEL_DECLARE_FLAGS(tun_flags); + __be32 label; /* Flow Label for IPv6 */ + u32 nhid; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ - __be32 label; /* Flow Label for IPv6 */ __be16 tp_src; __be16 tp_dst; + __u8 flow_flags; +}; + +struct ip_tunnel_encap { + u16 type; + u16 flags; + __be16 sport; + __be16 dport; }; /* Flags for ip_tunnel_info mode. 
*/
@@ -66,13 +93,21 @@ struct ip_tunnel_key {
 	GENMASK((sizeof_field(struct ip_tunnel_info,	\
 			      options_len) * BITS_PER_BYTE) - 1, 0)

+#define ip_tunnel_info_opts(info)				\
+	_Generic(info,						\
+		 const struct ip_tunnel_info * : ((const void *)(info)->options),\
+		 struct ip_tunnel_info * : ((void *)(info)->options)\
+	)
+
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
+	struct ip_tunnel_encap	encap;
 #ifdef CONFIG_DST_CACHE
 	struct dst_cache	dst_cache;
 #endif
 	u8			options_len;
 	u8			mode;
+	u8			options[] __aligned_largest __counted_by(options_len);
 };

 /* 6rd prefix/relay information */
@@ -85,13 +120,6 @@ struct ip_tunnel_6rd_parm {
 };
 #endif

-struct ip_tunnel_encap {
-	u16			type;
-	u16			flags;
-	__be16			sport;
-	__be16			dport;
-};
-
 struct ip_tunnel_prl_entry {
 	struct ip_tunnel_prl_entry __rcu *next;
 	__be32				addr;
@@ -101,10 +129,24 @@ struct ip_tunnel_prl_entry {

 struct metadata_dst;

+/* Kernel-side variant of ip_tunnel_parm */
+struct ip_tunnel_parm_kern {
+	char			name[IFNAMSIZ];
+	IP_TUNNEL_DECLARE_FLAGS(i_flags);
+	IP_TUNNEL_DECLARE_FLAGS(o_flags);
+	__be32			i_key;
+	__be32			o_key;
+	int			link;
+	struct iphdr		iph;
+};
+
 struct ip_tunnel {
 	struct ip_tunnel __rcu	*next;
 	struct hlist_node hash_node;

+	struct net_device	*dev;
+	netdevice_tracker	dev_tracker;
+
 	struct net		*net;	/* netns for packet i/o */

 	unsigned long	err_time;	/* Time when the last ICMP error
@@ -113,7 +155,7 @@ struct ip_tunnel {

 	/* These four fields used only by GRE */
 	u32		i_seqno;	/* The last seen seqno */
-	u32		o_seqno;	/* The last output seqno */
+	atomic_t	o_seqno;	/* The last output seqno */
 	int		tun_hlen;	/* Precalculated header length */

 	/* These four fields used only by ERSPAN */
@@ -124,7 +166,7 @@ struct ip_tunnel {

 	struct dst_cache dst_cache;

-	struct ip_tunnel_parm parms;
+	struct ip_tunnel_parm_kern parms;

 	int		mlink;
 	int		encap_hlen;	/* Encap header length (FOU,GUE) */
@@ -145,7 +187,7 @@ struct ip_tunnel {
 };

 struct tnl_ptk_info {
-	__be16 flags;
+	IP_TUNNEL_DECLARE_FLAGS(flags);
 	__be16 proto;
 	__be32 key;
 	__be32 seq;
@@ -167,11 +209,80 @@ struct ip_tunnel_net {
 	int type;
 };

+static inline void ip_tunnel_set_options_present(unsigned long *flags)
+{
+	IP_TUNNEL_DECLARE_FLAGS(present) = { };
+
+	__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_GTP_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_PFCP_OPT_BIT, present);
+
+	ip_tunnel_flags_or(flags, flags, present);
+}
+
+static inline void ip_tunnel_clear_options_present(unsigned long *flags)
+{
+	IP_TUNNEL_DECLARE_FLAGS(present) = { };
+
+	__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_GTP_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_PFCP_OPT_BIT, present);
+
+	__ipt_flag_op(bitmap_andnot, flags, flags, present);
+}
+
+static inline bool ip_tunnel_is_options_present(const unsigned long *flags)
+{
+	IP_TUNNEL_DECLARE_FLAGS(present) = { };
+
+	__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_GTP_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_PFCP_OPT_BIT, present);
+
+	return ip_tunnel_flags_intersect(flags, present);
+}
+
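The three helpers above all build the same "present" mask of every option-bearing tunnel type and then OR, ANDNOT, or intersect it with the flag bitmap. A userspace model with the bitmap collapsed to a single word (bit names are illustrative; the kernel uses DECLARE_BITMAP precisely so the flag space can grow past one word):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum {
	GENEVE_OPT_BIT,
	VXLAN_OPT_BIT,
	ERSPAN_OPT_BIT,
	GTP_OPT_BIT,
	PFCP_OPT_BIT,
};

/* One shared mask of all option-carrying types */
#define OPT_MASK ((1u << GENEVE_OPT_BIT) | (1u << VXLAN_OPT_BIT) | \
		  (1u << ERSPAN_OPT_BIT) | (1u << GTP_OPT_BIT) | \
		  (1u << PFCP_OPT_BIT))

static void set_options_present(uint32_t *flags)   { *flags |= OPT_MASK;  }
static void clear_options_present(uint32_t *flags) { *flags &= ~OPT_MASK; }
static bool is_options_present(uint32_t flags)     { return flags & OPT_MASK; }

int main(void)
{
	uint32_t flags = 0;

	set_options_present(&flags);
	printf("%d\n", is_options_present(flags));	/* 1 */
	clear_options_present(&flags);
	printf("%d\n", is_options_present(flags));	/* 0 */
	return 0;
}

+static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags)
+{
+	IP_TUNNEL_DECLARE_FLAGS(supp) = { };
+
+	bitmap_set(supp, 0, BITS_PER_TYPE(__be16));
+	__set_bit(IP_TUNNEL_VTI_BIT, supp);
+
+	return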
ip_tunnel_flags_subset(flags, supp); +} + +static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags) +{ + ip_tunnel_flags_zero(dst); + + bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16)); + __assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI); +} + +static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags) +{ + __be16 ret; + + ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16))); + if (test_bit(IP_TUNNEL_VTI_BIT, flags)) + ret |= VTI_ISVTI; + + return ret; +} + static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, - __be64 tun_id, __be16 tun_flags) + __be64 tun_id, + const unsigned long *tun_flags) { key->tun_id = tun_id; key->u.ipv4.src = saddr; @@ -181,7 +292,7 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, key->tos = tos; key->ttl = ttl; key->label = label; - key->tun_flags = tun_flags; + ip_tunnel_flags_copy(key->tun_flags, tun_flags); /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. @@ -202,12 +313,8 @@ ip_tunnel_dst_cache_usable(const struct sk_buff *skb, { if (skb->mark) return false; - if (!info) - return true; - if (info->key.tun_flags & TUNNEL_NOCACHE) - return false; - return true; + return !info || !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info @@ -240,11 +347,19 @@ static inline __be32 tunnel_id_to_key32(__be64 tun_id) static inline void ip_tunnel_init_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif, - __u32 mark, __u32 tun_inner_hash) + __be32 key, __u8 tos, + struct net *net, int oif, + __u32 mark, __u32 tun_inner_hash, + __u8 flow_flags) { memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; + + if (oif) { + fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index(net, oif); + /* Legacy VRF/l3mdev use case */ + fl4->flowi4_oif = fl4->flowi4_l3mdev ? 
0 : oif; + } + fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_tos = tos; @@ -252,6 +367,7 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4, fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; fl4->flowi4_multipath_hash = tun_inner_hash; + fl4->flowi4_flags = flow_flags; } int ip_tunnel_init(struct net_device *dev); @@ -261,35 +377,46 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); - -void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, - struct rtnl_link_ops *ops); +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); -int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); -int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd); +bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, + const void __user *data); +bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp); +int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); -void ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot); struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, - int link, __be16 flags, + int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key); +void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark); -int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark); + struct ip_tunnel_parm_kern *p, __u32 fwmark); +int ip_tunnel_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct ip_tunnel_parm_kern *p, + __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); +bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *encap); + +void ip_tunnel_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm_kern *parms); + extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); @@ -313,7 +440,8 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); -static inline bool pskb_inet_may_pull(struct sk_buff *skb) +static inline enum skb_drop_reason +pskb_inet_may_pull_reason(struct sk_buff *skb) { int nhlen; @@ -330,7 +458,49 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) nhlen = 0; } - return pskb_network_may_pull(skb, nhlen); + return pskb_network_may_pull_reason(skb, nhlen); +} + +static inline bool pskb_inet_may_pull(struct sk_buff *skb) +{ + return 
pskb_inet_may_pull_reason(skb) == SKB_NOT_DROPPED_YET; +} + +/* Variant of pskb_inet_may_pull(). + */ +static inline enum skb_drop_reason +skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit) +{ + int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; + __be16 type = skb->protocol; + enum skb_drop_reason reason; + + /* Essentially this is skb_protocol(skb, true) + * And we get MAC len. + */ + if (eth_type_vlan(type)) + type = __vlan_get_protocol(skb, type, &maclen); + + switch (type) { +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + nhlen = sizeof(struct ipv6hdr); + break; +#endif + case htons(ETH_P_IP): + nhlen = sizeof(struct iphdr); + break; + } + /* For ETH_P_IPV6/ETH_P_IP we make sure to pull + * a base network header in skb->head. + */ + reason = pskb_may_pull_reason(skb, maclen + nhlen); + if (reason) + return reason; + + skb_set_network_header(skb, maclen); + + return SKB_NOT_DROPPED_YET; } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) @@ -353,22 +523,23 @@ static inline int ip_encap_hlen(struct ip_tunnel_encap *e) return hlen; } -static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, +static inline int ip_tunnel_encap(struct sk_buff *skb, + struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { const struct ip_tunnel_encap_ops *ops; int ret = -EINVAL; - if (t->encap.type == TUNNEL_ENCAP_NONE) + if (e->type == TUNNEL_ENCAP_NONE) return 0; - if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) + if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); - ops = rcu_dereference(iptun_encaps[t->encap.type]); + ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->build_header)) - ret = ops->build_header(skb, &t->encap, protocol, fl4); + ret = ops->build_header(skb, e, protocol, fl4); rcu_read_unlock(); return ret; @@ -378,26 +549,41 @@ static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, const struct sk_buff *skb) { - if (skb->protocol == htons(ETH_P_IP)) + __be16 payload_protocol = skb_protocol(skb, true); + + if (payload_protocol == htons(ETH_P_IP)) return iph->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + else if (payload_protocol == htons(ETH_P_IPV6)) return ipv6_get_dsfield((const struct ipv6hdr *)iph); else return 0; } +static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, + const struct sk_buff *skb) +{ + __be16 payload_protocol = skb_protocol(skb, true); + + if (payload_protocol == htons(ETH_P_IPV6)) + return ip6_flowlabel((const struct ipv6hdr *)iph); + else + return 0; +} + static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { - if (skb->protocol == htons(ETH_P_IP)) + __be16 payload_protocol = skb_protocol(skb, true); + + if (payload_protocol == htons(ETH_P_IP)) return iph->ttl; - else if (skb->protocol == htons(ETH_P_IPV6)) + else if (payload_protocol == htons(ETH_P_IPV6)) return ((const struct ipv6hdr *)iph)->hop_limit; else return 0; } -/* Propogate ECN bits out */ +/* Propagate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) { @@ -447,40 +633,37 @@ static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += pkt_len; - tstats->tx_packets++; + u64_stats_add(&tstats->tx_bytes, pkt_len); + u64_stats_inc(&tstats->tx_packets); 
u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); - } else { - struct net_device_stats *err_stats = &dev->stats; - - if (pkt_len < 0) { - err_stats->tx_errors++; - err_stats->tx_aborted_errors++; - } else { - err_stats->tx_dropped++; - } + return; } -} -static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info) -{ - return info + 1; + if (pkt_len < 0) { + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_aborted_errors); + } else { + DEV_STATS_INC(dev, tx_dropped); + } } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { - memcpy(to, info + 1, info->options_len); + memcpy(to, ip_tunnel_info_opts(info), info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, - __be16 flags) + const unsigned long *flags) { - memcpy(ip_tunnel_info_opts(info), from, len); info->options_len = len; - info->key.tun_flags |= flags; + if (len > 0) { + memcpy(ip_tunnel_info_opts(info), from, len); + ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags, + flags); + } } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) @@ -523,10 +706,9 @@ static inline void ip_tunnel_info_opts_get(void *to, static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, - __be16 flags) + const unsigned long *flags) { info->options_len = 0; - info->key.tun_flags |= flags; } #endif /* CONFIG_INET */ diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d609e957a3ec..ff406ef4fd4a 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -29,6 +29,7 @@ #include <net/netfilter/nf_conntrack.h> #endif #include <net/net_namespace.h> /* Netw namespace */ +#include <linux/sched/isolation.h> #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 @@ -42,6 +43,8 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; +extern struct mutex __ip_vs_mutex; + struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -262,26 +265,6 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, pr_err(msg, ##__VA_ARGS__); \ } while (0) -#ifdef CONFIG_IP_VS_DEBUG -#define EnterFunction(level) \ - do { \ - if (level <= ip_vs_get_debug_level()) \ - printk(KERN_DEBUG \ - pr_fmt("Enter: %s, %s line %i\n"), \ - __func__, __FILE__, __LINE__); \ - } while (0) -#define LeaveFunction(level) \ - do { \ - if (level <= ip_vs_get_debug_level()) \ - printk(KERN_DEBUG \ - pr_fmt("Leave: %s, %s line %i\n"), \ - __func__, __FILE__, __LINE__); \ - } while (0) -#else -#define EnterFunction(level) do {} while (0) -#define LeaveFunction(level) do {} while (0) -#endif - /* The port number of FTP service (in network order). 
*/ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) @@ -351,11 +334,11 @@ struct ip_vs_seq { /* counters per cpu */ struct ip_vs_counters { - __u64 conns; /* connections scheduled */ - __u64 inpkts; /* incoming packets */ - __u64 outpkts; /* outgoing packets */ - __u64 inbytes; /* incoming bytes */ - __u64 outbytes; /* outgoing bytes */ + u64_stats_t conns; /* connections scheduled */ + u64_stats_t inpkts; /* incoming packets */ + u64_stats_t outpkts; /* outgoing packets */ + u64_stats_t inbytes; /* incoming bytes */ + u64_stats_t outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { @@ -363,9 +346,12 @@ struct ip_vs_cpu_stats { struct u64_stats_sync syncp; }; +/* Default nice for estimator kthreads */ +#define IPVS_EST_NICE 0 + /* IPVS statistics objects */ struct ip_vs_estimator { - struct list_head list; + struct hlist_node list; u64 last_inbytes; u64 last_outbytes; @@ -378,6 +364,10 @@ struct ip_vs_estimator { u64 outpps; u64 inbps; u64 outbps; + + s32 ktid:16, /* kthread ID, -1=temp list */ + ktrow:8, /* row/tick ID for kthread */ + ktcid:8; /* chain ID for kthread tick */ }; /* @@ -405,6 +395,77 @@ struct ip_vs_stats { struct ip_vs_kstats kstats0; /* reset values */ }; +struct ip_vs_stats_rcu { + struct ip_vs_stats s; + struct rcu_head rcu_head; +}; + +int ip_vs_stats_init_alloc(struct ip_vs_stats *s); +struct ip_vs_stats *ip_vs_stats_alloc(void); +void ip_vs_stats_release(struct ip_vs_stats *stats); +void ip_vs_stats_free(struct ip_vs_stats *stats); + +/* Process estimators in multiple timer ticks (20/50/100, see ktrow) */ +#define IPVS_EST_NTICKS 50 +/* Estimation uses a 2-second period containing ticks (in jiffies) */ +#define IPVS_EST_TICK ((2 * HZ) / IPVS_EST_NTICKS) + +/* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C). + * Value of 4 and above ensures kthreads will take work without exceeding + * the CPU capacity under different circumstances. 
+ */
+#define IPVS_EST_LOAD_DIVISOR 8

+/* Kthreads should not have work that exceeds the CPU load above 50% */
+#define IPVS_EST_CPU_KTHREADS (IPVS_EST_LOAD_DIVISOR / 2)
+
+/* Desired number of chains per timer tick (chain load factor in 100us units),
+ * 48=4.8ms of 40ms tick (12% CPU usage):
+ * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50
+ */
+#define IPVS_EST_CHAIN_FACTOR \
+	ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8)
+
+/* Compiled number of chains per tick
+ * The defines should match cond_resched_rcu
+ */
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
+#define IPVS_EST_TICK_CHAINS IPVS_EST_CHAIN_FACTOR
+#else
+#define IPVS_EST_TICK_CHAINS 1
+#endif
+
+#if IPVS_EST_NTICKS > 127
+#error Too many timer ticks for ktrow
+#endif
+
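The IPVS_EST_CHAIN_FACTOR arithmetic in the comment above, worked out in plain C: 2 s * 1000 ms * 10 (100us units per ms), divided by the load divisor (8, i.e. 12.5% CPU) and by the 50 ticks of the 2-second period, then rounded down to a multiple of 8 (ALIGN_DOWN here is a local stand-in for the kernel macro):

#include <stdio.h>

#define ALIGN_DOWN(x, a) ((x) / (a) * (a))

int main(void)
{
	/* 20000 time units / 8 / 50 ticks = 50 chains, aligned down to 48,
	 * i.e. 4.8ms of each 40ms tick as the comment states.
	 */
	int raw = 2 * 1000 * 10 / 8 / 50;

	printf("%d -> %d\n", raw, ALIGN_DOWN(raw, 8));	/* 50 -> 48 */
	return 0;
}

+/* Multiple chains processed in same tick */
+struct ip_vs_est_tick_data {
+	struct rcu_head rcu_head;
+	struct hlist_head chains[IPVS_EST_TICK_CHAINS];
+	DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS);
+	DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS);
+	int chain_len[IPVS_EST_TICK_CHAINS];
+};
+
+/* Context for estimation kthread */
+struct ip_vs_est_kt_data {
+	struct netns_ipvs *ipvs;
+	struct task_struct *task;	/* task if running */
+	struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS];
+	DECLARE_BITMAP(avail, IPVS_EST_NTICKS);	/* tick has space for ests */
+	unsigned long est_timer;	/* estimation timer (jiffies) */
+	struct ip_vs_stats *calc_stats;	/* Used for calculation */
+	int tick_len[IPVS_EST_NTICKS];	/* est count */
+	int id;			/* ktid per netns */
+	int chain_max;		/* max ests per tick chain */
+	int tick_max;		/* max ests per tick */
+	int est_count;		/* attached ests to kthread */
+	int est_max_count;	/* max ests per kthread */
+	int add_row;		/* row for new ests */
+	int est_row;		/* estimated row */
+};
+
 struct dst_entry;
 struct iphdr;
 struct ip_vs_conn;
@@ -523,7 +584,7 @@ struct ip_vs_conn {
 	spinlock_t              lock;           /* lock for state transition */
 	volatile __u16          state;          /* state info */
 	volatile __u16          old_state;      /* old state, to be used for
-						 * state transition triggerd
+						 * state transition triggered
 						 * synchronization
 						 */
 	__u32			fwmark;		/* Fire wall mark from skb */
@@ -549,8 +610,10 @@ struct ip_vs_conn {
 	 */
 	struct ip_vs_app        *app;           /* bound ip_vs_app object */
 	void                    *app_data;      /* Application private data */
-	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
-	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
+	struct_group(sync_conn_opt,
+		struct ip_vs_seq  in_seq;       /* incoming seq. struct */
+		struct ip_vs_seq  out_seq;      /* outgoing seq. struct */
+	);

 	const struct ip_vs_pe	*pe;
 	char			*pe_data;
@@ -572,7 +635,7 @@ struct ip_vs_service_user_kern {
 	u16			protocol;
 	union nf_inet_addr	addr;		/* virtual ip address */
 	__be16			port;
-	u32			fwmark;		/* firwall mark of service */
+	u32			fwmark;		/* firewall mark of service */

 	/* virtual service options */
 	char			*sched_name;
@@ -688,6 +751,7 @@ struct ip_vs_dest {
 	union nf_inet_addr	vaddr;		/* virtual IP address */
 	__u32			vfwmark;	/* firewall mark of service */

+	struct rcu_head		rcu_head;
 	struct list_head	t_list;		/* in dest_trash */
 	unsigned int		in_rs_table:1;	/* we are in rs_table */
 };
@@ -869,7 +933,7 @@ struct netns_ipvs {
 	atomic_t		conn_count;	/* connection counter */

 	/* ip_vs_ctl */
-	struct ip_vs_stats	tot_stats;	/* Statistics & est.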
*/ int num_services; /* no of virtual services */ int num_services6; /* IPv6 virtual services */ @@ -931,6 +995,13 @@ struct netns_ipvs { int sysctl_conn_reuse_mode; int sysctl_schedule_icmp; int sysctl_ignore_tunneled; + int sysctl_run_estimation; +#ifdef CONFIG_SYSCTL + cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */ + int est_cpulist_valid; /* cpulist set */ + int sysctl_est_nice; /* kthread nice */ + int est_stopped; /* stop tasks */ +#endif /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -941,9 +1012,17 @@ struct netns_ipvs { struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; /* ip_vs_est */ - struct list_head est_list; /* estimator list */ - spinlock_t est_lock; - struct timer_list est_timer; /* Estimation timer */ + struct delayed_work est_reload_work;/* Reload kthread tasks */ + struct mutex est_mutex; /* protect kthread tasks */ + struct hlist_head est_temp_list; /* Ests during calc phase */ + struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */ + unsigned long est_max_threads;/* Hard limit of kthreads */ + int est_calc_phase; /* Calculation phase */ + int est_chain_max; /* Calculated chain_max */ + int est_kt_count; /* Allocated ptrs */ + int est_add_ktid; /* ktid where to add ests */ + atomic_t est_genid; /* kthreads reload genid */ + atomic_t est_genid_done; /* applied genid */ /* ip_vs_sync */ spinlock_t sync_lock; struct ipvs_master_sync_state *ms; @@ -957,7 +1036,7 @@ struct netns_ipvs { struct ipvs_sync_daemon_cfg bcfg; /* Backup Configuration */ /* net name space ptr */ struct net *net; /* Needed by timer routines */ - /* Number of heterogeneous destinations, needed becaus heterogeneous + /* Number of heterogeneous destinations, needed because heterogeneous * are not supported when synchronization is enabled. 
*/ unsigned int mixed_address_family_dests; @@ -1071,6 +1150,24 @@ static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) return ipvs->sysctl_cache_bypass; } +static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_run_estimation; +} + +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + if (ipvs->est_cpulist_valid) + return ipvs->sysctl_est_cpulist; + else + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_est_nice; +} + #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) @@ -1163,6 +1260,21 @@ static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) return 0; } +static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) +{ + return 1; +} + +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return IPVS_EST_NICE; +} + #endif /* IPVS core functions @@ -1464,10 +1576,41 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state); void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts); /* IPVS rate estimator prototypes (from ip_vs_est.c) */ -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); +void ip_vs_est_reload_start(struct netns_ipvs *ipvs); +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd); +void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); + +static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + /* Stop tasks while cpulist is empty or if disabled with flag */ + ipvs->est_stopped = !sysctl_run_estimation(ipvs) || + (ipvs->est_cpulist_valid && + cpumask_empty(sysctl_est_cpulist(ipvs))); +#endif +} + +static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + return ipvs->est_stopped; +#else + return false; +#endif +} + +static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) +{ + unsigned int limit = IPVS_EST_CPU_KTHREADS * + cpumask_weight(sysctl_est_cpulist(ipvs)); + + return max(1U, limit); +} /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, @@ -1712,4 +1855,15 @@ ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) atomic_read(&dest->inactconns); } +#ifdef CONFIG_IP_VS_PROTO_TCP +INDIRECT_CALLABLE_DECLARE(int + tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP +INDIRECT_CALLABLE_DECLARE(int + udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); +#endif #endif /* _NET_IP_VS_H */ diff --git a/include/net/ipcomp.h b/include/net/ipcomp.h index fee6fc451597..51401f01e2a5 100644 --- a/include/net/ipcomp.h +++ b/include/net/ipcomp.h @@ -2,25 +2,16 @@ #ifndef _NET_IPCOMP_H #define _NET_IPCOMP_H -#include <linux/types.h> - -#define IPCOMP_SCRATCH_SIZE 65400 - -struct crypto_comp; - -struct ipcomp_data { - u16 threshold; - struct crypto_comp * __percpu 
*tfms; -}; +#include <linux/skbuff.h> struct ip_comp_hdr; -struct sk_buff; +struct netlink_ext_ack; struct xfrm_state; int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb); int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb); void ipcomp_destroy(struct xfrm_state *x); -int ipcomp_init_state(struct xfrm_state *x); +int ipcomp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack); static inline struct ip_comp_hdr *ip_comp_hdr(const struct sk_buff *skb) { diff --git a/include/net/ipconfig.h b/include/net/ipconfig.h index e3534299bd2a..8276897d0c2e 100644 --- a/include/net/ipconfig.h +++ b/include/net/ipconfig.h @@ -7,6 +7,8 @@ /* The following are initdata: */ +#include <linux/types.h> + extern int ic_proto_enabled; /* Protocols enabled (see IC_xxx) */ extern int ic_set_manually; /* IPconfig parameters set manually */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index bd1f396cc9c7..2ccdf85f34f1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -15,12 +15,14 @@ #include <linux/refcount.h> #include <linux/jump_label_ratelimit.h> #include <net/if_inet6.h> -#include <net/ndisc.h> #include <net/flow.h> #include <net/flow_dissector.h> +#include <net/inet_dscp.h> #include <net/snmp.h> #include <net/netns/hash.h> +struct ip_tunnel_info; + #define SIN6_LEN_RFC2133 24 #define IPV6_MAXPLEN 65535 @@ -30,6 +32,7 @@ */ #define NEXTHDR_HOP 0 /* Hop-by-hop option header. */ +#define NEXTHDR_IPV4 4 /* IPv4 in IPv6 */ #define NEXTHDR_TCP 6 /* TCP segment. */ #define NEXTHDR_UDP 17 /* UDP message. */ #define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */ @@ -148,6 +151,17 @@ struct frag_hdr { __be32 identification; }; +/* + * Jumbo payload option, as described in RFC 2675 2. + */ +struct hop_jumbo_hdr { + u8 nexthdr; + u8 hdrlen; + u8 tlv_type; /* IPV6_TLV_JUMBO, 0xC2 */ + u8 tlv_len; /* 4 */ + __be32 jumbo_payload_len; +}; + #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 @@ -232,17 +246,20 @@ extern int sysctl_mld_qrv; #define _DEVADD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ + unsigned long _field = (field); \ + unsigned long _val = (val); \ if (likely(_idev != NULL)) \ - mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \ - mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\ + mod##SNMP_ADD_STATS((_idev)->stats.statname, _field, _val); \ + mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, _field, _val);\ }) #define _DEVUPD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ + unsigned long _val = (val); \ if (likely(_idev != NULL)) \ - mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \ - mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\ + mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, _val); \ + mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, _val);\ }) /* MIBs */ @@ -344,28 +361,21 @@ struct ipcm6_cookie { struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; + __u16 gso_size; __s8 dontfrag; struct ipv6_txoptions *opt; - __u16 gso_size; }; -static inline void ipcm6_init(struct ipcm6_cookie *ipc6) -{ - *ipc6 = (struct ipcm6_cookie) { - .hlimit = -1, - .tclass = -1, - .dontfrag = -1, - }; -} - static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, - const struct ipv6_pinfo *np) + const struct sock *sk) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, - .tclass = np->tclass, - .dontfrag = np->dontfrag, + .tclass = inet6_sk(sk)->tclass, + .dontfrag = inet6_test_bit(DONTFRAG, sk), }; + + 
sockcm_init(&ipc6->sockc, sk); } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) @@ -390,17 +400,20 @@ static inline void txopt_put(struct ipv6_txoptions *opt) kfree_rcu(opt, rcu); } +#if IS_ENABLED(CONFIG_IPV6) struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label); extern struct static_key_false_deferred ipv6_flowlabel_exclusive; static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { - if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key)) + if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key) && + READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl)) return __fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); return NULL; } +#endif struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, @@ -411,7 +424,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); -bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); +bool ip6_autoflowlabel(struct net *net, const struct sock *sk); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { @@ -419,7 +432,8 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl) atomic_dec(&fl->users); } -void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); +enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, + u8 code, __be32 info); void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len); @@ -434,21 +448,97 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt); -struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, - struct ipv6_txoptions *opt); +struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, + struct ipv6_txoptions *opt); + +static inline struct ipv6_txoptions * +ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) +{ + if (!opt) + return NULL; + return __ipv6_fixup_options(opt_space, opt); +} bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt); struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt); -static inline bool ipv6_accept_ra(struct inet6_dev *idev) +/* This helper is specialized for BIG TCP needs. + * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header. + * It assumes headers are already in skb->head. + * Returns: 0, or IPPROTO_TCP if a BIG TCP packet is there. 
+ */ +static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) +{ + const struct hop_jumbo_hdr *jhdr; + const struct ipv6hdr *nhdr; + + if (likely(skb->len <= GRO_LEGACY_MAX_SIZE)) + return 0; + + if (skb->protocol != htons(ETH_P_IPV6)) + return 0; + + if (skb_network_offset(skb) + + sizeof(struct ipv6hdr) + + sizeof(struct hop_jumbo_hdr) > skb_headlen(skb)) + return 0; + + nhdr = ipv6_hdr(skb); + + if (nhdr->nexthdr != NEXTHDR_HOP) + return 0; + + jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1); + if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 || + jhdr->nexthdr != IPPROTO_TCP) + return 0; + return jhdr->nexthdr; +} + +/* Return 0 if HBH header is successfully removed + * Or if HBH removal is unnecessary (packet is not big TCP) + * Return error to indicate dropping the packet + */ +static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb) { + const int hophdr_len = sizeof(struct hop_jumbo_hdr); + int nexthdr = ipv6_has_hopopt_jumbo(skb); + struct ipv6hdr *h6; + + if (!nexthdr) + return 0; + + if (skb_cow_head(skb, 0)) + return -1; + + /* Remove the HBH header. + * Layout: [Ethernet header][IPv6 header][HBH][L4 Header] + */ + memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb), + skb_network_header(skb) - skb_mac_header(skb) + + sizeof(struct ipv6hdr)); + + __skb_pull(skb, hophdr_len); + skb->network_header += hophdr_len; + skb->mac_header += hophdr_len; + + h6 = ipv6_hdr(skb); + h6->nexthdr = nexthdr; + + return 0; +} + +static inline bool ipv6_accept_ra(const struct inet6_dev *idev) +{ + s32 accept_ra = READ_ONCE(idev->cnf.accept_ra); + /* If forwarding is enabled, RA are not accepted unless the special * hybrid mode (accept_ra=2) is enabled. */ - return idev->cnf.forwarding ? idev->cnf.accept_ra == 2 : - idev->cnf.accept_ra; + return READ_ONCE(idev->cnf.forwarding) ? accept_ra == 2 : + accept_ra; } #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ @@ -660,12 +750,8 @@ static inline u32 ipv6_addr_hash(const struct in6_addr *a) /* more secured version of ipv6_addr_hash() */ static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) { - u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1]; - - return jhash_3words(v, - (__force u32)a->s6_addr32[2], - (__force u32)a->s6_addr32[3], - initval); + return jhash2((__force const u32 *)a->s6_addr32, + ARRAY_SIZE(a->s6_addr32), initval); } static inline bool ipv6_addr_loopback(const struct in6_addr *a) @@ -761,7 +847,7 @@ static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int * we should *never* get to this point since that * would mean the addrs are equal * - * However, we do get to it 8) And exacly, when + * However, we do get to it 8) And exactly, when * addresses are equal 8) * * ip route add 1111::/128 via ... 
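
The two BIG TCP helpers above let frames larger than GRO_LEGACY_MAX_SIZE carry their real length in an RFC 2675 hop-by-hop jumbo option until the last possible moment. A hedged sketch of the expected call site, assuming the usual netdevice definitions; a transmit path whose hardware cannot parse the option strips it before DMA (example_xmit() is illustrative, ipv6_hopopt_jumbo_remove() is the real helper):

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* No-op for non-BIG-TCP frames; nonzero means skb_cow_head() failed */
	if (unlikely(ipv6_hopopt_jumbo_remove(skb)))
		goto drop;

	/* ... map the now option-less headers and kick the queue ... */
	return NETDEV_TX_OK;

drop:
	dev_kfree_skb_any(skb);
	return NETDEV_TX_OK;
}
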
@@ -821,9 +907,9 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, int hlimit; if (ipv6_addr_is_multicast(&fl6->daddr)) - hlimit = np->mcast_hops; + hlimit = READ_ONCE(np->mcast_hops); else - hlimit = np->hop_limit; + hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); return hlimit; @@ -839,7 +925,7 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != offsetof(typeof(flow->addrs), v6addrs.src) + sizeof(flow->addrs.v6addrs.src)); - memcpy(&flow->addrs.v6addrs, &iph->saddr, sizeof(flow->addrs.v6addrs)); + memcpy(&flow->addrs.v6addrs, &iph->addrs, sizeof(flow->addrs.v6addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } @@ -849,7 +935,8 @@ static inline bool ipv6_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return net->ipv6.sysctl.ip_nonlocal_bind || - inet->freebind || inet->transparent; + test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || + test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } /* Sysctl settings for net ipv6.auto_flowlabels */ @@ -882,7 +969,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, hash = skb_get_hash_flowi6(skb, fl6); /* Since this is being sent on the wire obfuscate hash a bit - * to minimize possbility that any useful information to an + * to minimize possibility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. */ hash = rol32(hash, 16); @@ -925,11 +1012,19 @@ static inline int ip6_multipath_hash_policy(const struct net *net) { return net->ipv6.sysctl.multipath_hash_policy; } +static inline u32 ip6_multipath_hash_fields(const struct net *net) +{ + return net->ipv6.sysctl.multipath_hash_fields; +} #else static inline int ip6_multipath_hash_policy(const struct net *net) { return 0; } +static inline u32 ip6_multipath_hash_fields(const struct net *net) +{ + return 0; +} #endif /* @@ -956,6 +1051,11 @@ static inline u8 ip6_tclass(__be32 flowinfo) return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT; } +static inline dscp_t ip6_dscp(__be32 flowinfo) +{ + return inet_dsfield_to_dscp(ip6_tclass(flowinfo)); +} + static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) { return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; @@ -992,7 +1092,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, + void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags); @@ -1008,8 +1108,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, - struct ipcm6_cookie *ipc6, struct flowi6 *fl6, + void *from, size_t length, int transhdrlen, + struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); @@ -1026,12 +1126,6 @@ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, st struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected); -struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb, - struct net_device *dev, - struct net *net, 
struct socket *sock, - struct in6_addr *saddr, - const struct ip_tunnel_info *info, - u8 protocol, bool use_cache); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *orig_dst); @@ -1083,9 +1177,14 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, /* * socket options (ipv6_sockglue.c) */ +DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount); +int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); +int do_ipv6_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); @@ -1106,8 +1205,11 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); +void inet6_cleanup_sock(struct sock *sk); +void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); @@ -1135,7 +1237,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p); + sockptr_t optval, size_t ss_offset); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); @@ -1162,7 +1264,9 @@ static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; } #ifdef CONFIG_SYSCTL struct ctl_table *ipv6_icmp_sysctl_init(struct net *net); +size_t ipv6_icmp_sysctl_table_size(void); struct ctl_table *ipv6_route_sysctl_init(struct net *net); +size_t ipv6_route_sysctl_table_size(struct net *net); int ipv6_sysctl_register(void); void ipv6_sysctl_unregister(void); #endif @@ -1186,15 +1290,16 @@ static inline int ip6_sock_set_v6only(struct sock *sk) static inline void ip6_sock_set_recverr(struct sock *sk) { - lock_sock(sk); - inet6_sk(sk)->recverr = true; - release_sock(sk); + inet6_set_bit(RECVERR6, sk); } -static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) +#define IPV6_PREFER_SRC_MASK (IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBLIC | \ + IPV6_PREFER_SRC_COA) + +static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val) { + unsigned int prefmask = ~IPV6_PREFER_SRC_MASK; unsigned int pref = 0; - unsigned int prefmask = ~0; /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ switch (val & (IPV6_PREFER_SRC_PUBLIC | @@ -1244,25 +1349,28 @@ static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) return -EINVAL; } - inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref; + WRITE_ONCE(inet6_sk(sk)->srcprefs, + (READ_ONCE(inet6_sk(sk)->srcprefs) & prefmask) | pref); return 0; } -static inline int ip6_sock_set_addr_preferences(struct sock *sk, bool val) +static inline void ip6_sock_set_recvpktinfo(struct sock *sk) { - int ret; - lock_sock(sk); - ret = __ip6_sock_set_addr_preferences(sk, val); + inet6_sk(sk)->rxopt.bits.rxinfo = true; release_sock(sk); - return ret; } -static inline void 
ip6_sock_set_recvpktinfo(struct sock *sk) +#define IPV6_ADDR_WORDS 4 + +static inline void ipv6_addr_cpu_to_be32(__be32 *dst, const u32 *src) { - lock_sock(sk); - inet6_sk(sk)->rxopt.bits.rxinfo = true; - release_sock(sk); + cpu_to_be32_array(dst, src, IPV6_ADDR_WORDS); +} + +static inline void ipv6_addr_be32_to_cpu(u32 *dst, const __be32 *src) +{ + be32_to_cpu_array(dst, src, IPV6_ADDR_WORDS); } #endif /* _NET_IPV6_H */ diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index a21e8b1381a1..38ef66826939 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_FRAG_H #define _IPV6_FRAG_H +#include <linux/icmpv6.h> #include <linux/kernel.h> #include <net/addrconf.h> #include <net/ipv6.h> @@ -65,16 +66,19 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; struct sk_buff *head; + int refs = 1; rcu_read_lock(); - if (fq->q.fqdir->dead) + /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ + if (READ_ONCE(fq->q.fqdir->dead)) goto out_rcu_unlock; spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q); + fq->q.flags |= INET_FRAG_DROP; + inet_frag_kill(&fq->q, &refs); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) @@ -99,14 +103,44 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) spin_unlock(&fq->q.lock); icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); - kfree_skb(head); + kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); goto out_rcu_unlock; out: spin_unlock(&fq->q.lock); out_rcu_unlock: rcu_read_unlock(); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); } + +/* Check if the upper layer header is truncated in the first fragment. 
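
Note the new refs accounting in ip6frag_expire_frag_queue() above: instead of touching the atomic once per operation, inet_frag_kill() records the references it drops in a local counter and a single inet_frag_putn() releases the whole batch. A minimal sketch of the pattern, assuming the matching net/inet_frag.h prototypes from the same series (example_frag_release() is illustrative):

static void example_frag_release(struct inet_frag_queue *q)
{
	int refs = 1;			/* the caller's own reference */

	inet_frag_kill(q, &refs);	/* unhashing adds to refs */
	inet_frag_putn(q, refs);	/* one atomic sub for the batch */
}
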
*/ +static inline bool +ipv6frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp) +{ + u8 nexthdr = *nexthdrp; + __be16 frag_off; + int offset; + + offset = ipv6_skip_exthdr(skb, start, &nexthdr, &frag_off); + if (offset < 0 || (frag_off & htons(IP6_OFFSET))) + return false; + switch (nexthdr) { + case NEXTHDR_TCP: + offset += sizeof(struct tcphdr); + break; + case NEXTHDR_UDP: + offset += sizeof(struct udphdr); + break; + case NEXTHDR_ICMP: + offset += sizeof(struct icmp6hdr); + break; + default: + offset += 1; + } + if (offset > skb->len) + return true; + return false; +} + #endif #endif diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 8fce558b5fea..8a3465c8c2c5 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -9,6 +9,7 @@ #include <net/flow.h> #include <net/neighbour.h> #include <net/sock.h> +#include <net/ipv6.h> /* structs from net/ip6_fib.h */ struct fib6_info; @@ -47,6 +48,7 @@ struct ipv6_stub { struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void (*fib6_nh_release)(struct fib6_nh *fib6_nh); + void (*fib6_nh_release_dsts)(struct fib6_nh *fib6_nh); void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt); int (*ip6_del_rt)(struct net *net, struct fib6_info *rt, bool skip_notify); void (*fib6_rt_update)(struct net *net, struct fib6_info *rt, @@ -59,6 +61,9 @@ struct ipv6_stub { #if IS_ENABLED(CONFIG_XFRM) void (*xfrm6_local_rxpmtu)(struct sk_buff *skb, u32 mtu); int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); + struct sk_buff *(*xfrm6_gro_udp_encap_rcv)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); int (*xfrm6_rcv_encap)(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); #endif @@ -66,6 +71,10 @@ struct ipv6_stub { int (*ipv6_fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); + struct net_device *(*ipv6_dev_find)(struct net *net, const struct in6_addr *addr, + struct net_device *dev); + int (*ip6_xmit)(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); }; extern const struct ipv6_stub *ipv6_stub __read_mostly; @@ -73,11 +82,20 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly; struct ipv6_bpf_stub { int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); - struct sock *(*udp6_lib_lookup)(struct net *net, + struct sock *(*udp6_lib_lookup)(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); + int (*ipv6_setsockopt)(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); + int (*ipv6_getsockopt)(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); + int (*ipv6_dev_get_saddr)(struct net *net, + const struct net_device *dst_dev, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr); }; extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; diff --git a/include/net/ipx.h b/include/net/ipx.h deleted file mode 100644 index 9d1342807b59..000000000000 --- a/include/net/ipx.h +++ /dev/null @@ -1,171 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_INET_IPX_H_ -#define _NET_INET_IPX_H_ -/* - * The following information is in its entirety obtained from: - * - * Novell 'IPX Router Specification' Version 1.10 - * Part No. 
107-000029-001 - * - * Which is available from ftp.novell.com - */ - -#include <linux/netdevice.h> -#include <net/datalink.h> -#include <linux/ipx.h> -#include <linux/list.h> -#include <linux/slab.h> -#include <linux/refcount.h> - -struct ipx_address { - __be32 net; - __u8 node[IPX_NODE_LEN]; - __be16 sock; -}; - -#define ipx_broadcast_node "\377\377\377\377\377\377" -#define ipx_this_node "\0\0\0\0\0\0" - -#define IPX_MAX_PPROP_HOPS 8 - -struct ipxhdr { - __be16 ipx_checksum __packed; -#define IPX_NO_CHECKSUM cpu_to_be16(0xFFFF) - __be16 ipx_pktsize __packed; - __u8 ipx_tctrl; - __u8 ipx_type; -#define IPX_TYPE_UNKNOWN 0x00 -#define IPX_TYPE_RIP 0x01 /* may also be 0 */ -#define IPX_TYPE_SAP 0x04 /* may also be 0 */ -#define IPX_TYPE_SPX 0x05 /* SPX protocol */ -#define IPX_TYPE_NCP 0x11 /* $lots for docs on this (SPIT) */ -#define IPX_TYPE_PPROP 0x14 /* complicated flood fill brdcast */ - struct ipx_address ipx_dest __packed; - struct ipx_address ipx_source __packed; -}; - -/* From af_ipx.c */ -extern int sysctl_ipx_pprop_broadcasting; - -struct ipx_interface { - /* IPX address */ - __be32 if_netnum; - unsigned char if_node[IPX_NODE_LEN]; - refcount_t refcnt; - - /* physical device info */ - struct net_device *if_dev; - struct datalink_proto *if_dlink; - __be16 if_dlink_type; - - /* socket support */ - unsigned short if_sknum; - struct hlist_head if_sklist; - spinlock_t if_sklist_lock; - - /* administrative overhead */ - int if_ipx_offset; - unsigned char if_internal; - unsigned char if_primary; - - struct list_head node; /* node in ipx_interfaces list */ -}; - -struct ipx_route { - __be32 ir_net; - struct ipx_interface *ir_intrfc; - unsigned char ir_routed; - unsigned char ir_router_node[IPX_NODE_LEN]; - struct list_head node; /* node in ipx_routes list */ - refcount_t refcnt; -}; - -struct ipx_cb { - u8 ipx_tctrl; - __be32 ipx_dest_net; - __be32 ipx_source_net; - struct { - __be32 netnum; - int index; - } last_hop; -}; - -#include <net/sock.h> - -struct ipx_sock { - /* struct sock has to be the first member of ipx_sock */ - struct sock sk; - struct ipx_address dest_addr; - struct ipx_interface *intrfc; - __be16 port; -#ifdef CONFIG_IPX_INTERN - unsigned char node[IPX_NODE_LEN]; -#endif - unsigned short type; - /* - * To handle special ncp connection-handling sockets for mars_nwe, - * the connection number must be stored in the socket. 
- */ - unsigned short ipx_ncp_conn; -}; - -static inline struct ipx_sock *ipx_sk(struct sock *sk) -{ - return (struct ipx_sock *)sk; -} - -#define IPX_SKB_CB(__skb) ((struct ipx_cb *)&((__skb)->cb[0])) - -#define IPX_MIN_EPHEMERAL_SOCKET 0x4000 -#define IPX_MAX_EPHEMERAL_SOCKET 0x7fff - -extern struct list_head ipx_routes; -extern rwlock_t ipx_routes_lock; - -extern struct list_head ipx_interfaces; -struct ipx_interface *ipx_interfaces_head(void); -extern spinlock_t ipx_interfaces_lock; - -extern struct ipx_interface *ipx_primary_net; - -int ipx_proc_init(void); -void ipx_proc_exit(void); - -const char *ipx_frame_name(__be16); -const char *ipx_device_name(struct ipx_interface *intrfc); - -static __inline__ void ipxitf_hold(struct ipx_interface *intrfc) -{ - refcount_inc(&intrfc->refcnt); -} - -void ipxitf_down(struct ipx_interface *intrfc); -struct ipx_interface *ipxitf_find_using_net(__be32 net); -int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node); -__be16 ipx_cksum(struct ipxhdr *packet, int length); -int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc, - unsigned char *node); -void ipxrtr_del_routes(struct ipx_interface *intrfc); -int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, - struct msghdr *msg, size_t len, int noblock); -int ipxrtr_route_skb(struct sk_buff *skb); -struct ipx_route *ipxrtr_lookup(__be32 net); -int ipxrtr_ioctl(unsigned int cmd, void __user *arg); - -static __inline__ void ipxitf_put(struct ipx_interface *intrfc) -{ - if (refcount_dec_and_test(&intrfc->refcnt)) - ipxitf_down(intrfc); -} - -static __inline__ void ipxrtr_hold(struct ipx_route *rt) -{ - refcount_inc(&rt->refcnt); -} - -static __inline__ void ipxrtr_put(struct ipx_route *rt) -{ - if (refcount_dec_and_test(&rt->refcnt)) - kfree(rt); -} -#endif /* _NET_INET_IPX_H_ */ diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index 9259ce2b22f3..df85d19fbf84 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -112,10 +112,12 @@ enum iucv_tx_notify { struct iucv_sock { struct sock sk; - char src_user_id[8]; - char src_name[8]; - char dst_user_id[8]; - char dst_name[8]; + struct_group(init, + char src_user_id[8]; + char src_name[8]; + char dst_user_id[8]; + char dst_name[8]; + ); struct list_head accept_q; spinlock_t accept_q_lock; struct sock *parent; @@ -128,11 +130,12 @@ struct iucv_sock { u8 flags; u16 msglimit; u16 msglimit_peer; + atomic_t skbs_in_xmit; atomic_t msg_sent; atomic_t msg_recv; atomic_t pendings; int transport; - void (*sk_txnotify)(struct sk_buff *skb, + void (*sk_txnotify)(struct sock *sk, enum iucv_tx_notify n); }; diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index f9e88401d7da..9804fa5d9c67 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -15,7 +15,7 @@ * To explore any of the IUCV functions, one must first register their * program using iucv_register(). Once your program has successfully * completed a register, it can exploit the other functions. - * For furthur reference on all IUCV functionality, refer to the + * For further reference on all IUCV functionality, refer to the * CP Programming Services book, also available on the web thru * www.vm.ibm.com/pubs, manual # SC24-6084 * @@ -30,6 +30,7 @@ #include <linux/types.h> #include <linux/slab.h> +#include <asm/dma-types.h> #include <asm/debug.h> /* @@ -76,12 +77,17 @@ * and iucv_message_reply if IUCV_IPBUFLST or IUCV_IPANSLST are used. 
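
The struct_group(init, ...) wrapper in struct iucv_sock above exists so the four peer-ID arrays can be handled as one object: FORTIFY_SOURCE then sees a single bounded access instead of a write spanning separate fields. A hedged sketch (example_clone_ids() is illustrative, the grouped member name init comes from the hunk):

static void example_clone_ids(struct iucv_sock *niucv,
			      const struct iucv_sock *iucv)
{
	/* copies src/dst user IDs and names in one bounded memcpy */
	memcpy(&niucv->init, &iucv->init, sizeof(iucv->init));
}
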
*/ struct iucv_array { - u32 address; + dma32_t address; u32 length; } __attribute__ ((aligned (8))); -extern struct bus_type iucv_bus; -extern struct device *iucv_root; +extern const struct bus_type iucv_bus; + +struct device_driver; + +struct device *iucv_alloc_device(const struct attribute_group **attrs, + struct device_driver *driver, void *priv, + const char *fmt, ...) __printf(4, 5); /* * struct iucv_path @@ -196,7 +202,7 @@ struct iucv_handler { * * Registers a driver with IUCV. * - * Returns 0 on success, -ENOMEM if the memory allocation for the pathid + * Returns: 0 on success, -ENOMEM if the memory allocation for the pathid * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus. */ int iucv_register(struct iucv_handler *handler, int smp); @@ -218,7 +224,7 @@ void iucv_unregister(struct iucv_handler *handle, int smp); * * Allocate a new path structure for use with iucv_connect. * - * Returns NULL if the memory allocation failed or a pointer to the + * Returns: NULL if the memory allocation failed or a pointer to the * path structure. */ static inline struct iucv_path *iucv_path_alloc(u16 msglim, u8 flags, gfp_t gfp) @@ -254,7 +260,7 @@ static inline void iucv_path_free(struct iucv_path *path) * This function is issued after the user received a connection pending * external interrupt and now wishes to complete the IUCV communication path. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, u8 *userdata, void *private); @@ -272,7 +278,7 @@ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, * successfully, you are not able to use the path until you receive an IUCV * Connection Complete external interrupt. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, u8 *userid, u8 *system, u8 *userdata, @@ -286,7 +292,7 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, * This function temporarily suspends incoming messages on an IUCV path. * You can later reactivate the path by invoking the iucv_resume function. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); @@ -298,7 +304,7 @@ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); * This function resumes incoming messages on an IUCV path that has * been stopped with iucv_path_quiesce. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_resume(struct iucv_path *path, u8 *userdata); @@ -309,7 +315,7 @@ int iucv_path_resume(struct iucv_path *path, u8 *userdata); * * This function terminates an IUCV path. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_sever(struct iucv_path *path, u8 *userdata); @@ -321,7 +327,7 @@ int iucv_path_sever(struct iucv_path *path, u8 *userdata); * * Cancels a message you have sent. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, u32 srccls); @@ -341,7 +347,7 @@ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. 
+ * Returns: the result from the CP IUCV call. */ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual); @@ -361,7 +367,7 @@ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, * * Locking: no locking. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, @@ -376,7 +382,7 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, * are notified of a message and the time that you complete the message, * the message may be rejected. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); @@ -393,7 +399,7 @@ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into * the parameter list. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *reply, size_t size); @@ -413,7 +419,7 @@ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); @@ -433,7 +439,7 @@ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, * * Locking: no locking. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); @@ -455,7 +461,7 @@ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, * reply to the message and a buffer is provided into which IUCV moves * the reply to this message. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size, @@ -489,7 +495,7 @@ struct iucv_interface { int (*path_sever)(struct iucv_path *path, u8 userdata[16]); int (*iucv_register)(struct iucv_handler *handler, int smp); void (*iucv_unregister)(struct iucv_handler *handler, int smp); - struct bus_type *bus; + const struct bus_type *bus; struct device *root; }; diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index d2ea5863eedc..b80e474cb0aa 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -23,7 +23,7 @@ * to handle wireless statistics. * * The initial APIs served us well and has proven a reasonably good design. - * However, there is a few shortcommings : + * However, there are a few shortcomings : * o No events, everything is a request to the driver. * o Large ioctl function in driver with gigantic switch statement * (i.e. spaghetti code). @@ -38,13 +38,13 @@ * ------------------------------- * The new driver API is just a bunch of standard functions (handlers), * each handling a specific Wireless Extension. The driver just export - * the list of handler it supports, and those will be called apropriately. 
+ * the list of handler it supports, and those will be called appropriately. * * I tried to keep the main advantage of the previous API (simplicity, * efficiency and light weight), and also I provide a good dose of backward * compatibility (most structures are the same, driver can use both API * simultaneously, ...). - * Hopefully, I've also addressed the shortcomming of the initial API. + * Hopefully, I've also addressed the shortcoming of the initial API. * * The advantage of the new API are : * o Handling of Extensions in driver broken in small contained functions @@ -84,7 +84,7 @@ /* ---------------------- THE IMPLEMENTATION ---------------------- */ /* - * Some of the choice I've made are pretty controversials. Defining an + * Some of the choice I've made are pretty controversial. Defining an * API is very much weighting compromises. This goes into some of the * details and the thinking behind the implementation. * @@ -140,7 +140,7 @@ * example to distinguish setting max rate and basic rate), I would * break the prototype. Using iwreq_data is more flexible. * 3) Also, the above form is not generic (see above). - * 4) I don't expect driver developper using the wrong field of the + * 4) I don't expect driver developer using the wrong field of the * union (Doh !), so static typechecking doesn't add much value. * 5) Lastly, you can skip the union by doing : * static int mydriver_ioctl_setrate(struct net_device *dev, @@ -279,8 +279,6 @@ #define IW_DESCR_FLAG_RESTRICT 0x0004 /* GET : request is ROOT only */ /* SET : Omit payload from generated iwevent */ #define IW_DESCR_FLAG_NOMAX 0x0008 /* GET : no limit on request size */ -/* Driver level flags */ -#define IW_DESCR_FLAG_WAIT 0x0100 /* Wait for driver event */ /****************************** TYPES ******************************/ @@ -373,11 +371,10 @@ struct iw_handler_def { */ struct iw_ioctl_description { __u8 header_type; /* NULL, iw_point or other */ - __u8 token_type; /* Future */ + __u8 flags; /* Special handling of the request */ __u16 token_size; /* Granularity of payload */ __u16 min_tokens; /* Min acceptable token number */ __u16 max_tokens; /* Max acceptable token number */ - __u32 flags; /* Special handling of the request */ }; /* Need to think of short header translation table. Later. */ @@ -404,39 +401,12 @@ struct iw_spy_data { u_char spy_thr_under[IW_MAX_SPY]; }; -/* --------------------- DEVICE WIRELESS DATA --------------------- */ -/* - * This is all the wireless data specific to a device instance that - * is managed by the core of Wireless Extensions or the 802.11 layer. - * We only keep pointer to those structures, so that a driver is free - * to share them between instances. - * This structure should be initialised before registering the device. - * Access to this data follow the same rules as any other struct net_device - * data (i.e. valid as long as struct net_device exist, same locking rules). - */ -/* Forward declaration */ -struct libipw_device; -/* The struct */ -struct iw_public_data { - /* Driver enhanced spy support */ - struct iw_spy_data * spy_data; - /* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */ - struct libipw_device * libipw; -}; - /**************************** PROTOTYPES ****************************/ /* - * Functions part of the Wireless Extensions (defined in net/core/wireless.c). - * Those may be called only within the kernel. + * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c). + * Those may be called by driver modules. 
*/ -/* First : function strictly used inside the kernel */ - -/* Handle /proc/net/wireless, called in net/code/dev.c */ -int dev_get_wireless_info(char *buffer, char **start, off_t offset, int length); - -/* Second : functions that may be called by driver modules */ - /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); @@ -450,23 +420,7 @@ static inline void wireless_nlevent_flush(void) {} /* We may need a function to send a stream of events to user space. * More on that later... */ -/* Standard handler for SIOCSIWSPY */ -int iw_handler_set_spy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCGIWSPY */ -int iw_handler_get_spy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCSIWTHRSPY */ -int iw_handler_set_thrspy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCGIWTHRSPY */ -int iw_handler_get_thrspy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Driver call to update spy records */ -void wireless_spy_update(struct net_device *dev, unsigned char *address, - struct iw_quality *wstats); - -/************************* INLINE FUNTIONS *************************/ +/************************* INLINE FUNCTIONS *************************/ /* * Function that are so simple that it's more efficient inlining them */ diff --git a/include/net/kcm.h b/include/net/kcm.h index 2d704f8f4905..441e993be634 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -47,9 +47,9 @@ struct kcm_stats { struct kcm_tx_msg { unsigned int sent; - unsigned int fragidx; unsigned int frag_offset; unsigned int msg_flags; + bool started_tx; struct sk_buff *frag_skb; struct sk_buff *last_skb; }; @@ -70,6 +70,7 @@ struct kcm_sock { struct work_struct tx_work; struct list_head wait_psock_list; struct sk_buff *seq_skb; + struct mutex tx_mutex; u32 tx_stopped : 1; /* Don't use bit fields here, these are set under different locks */ diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 031c661aa14d..1eb8dad18f7e 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -59,6 +59,20 @@ int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); +static inline +bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) +{ + return !(fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF) && + fl->flowi_l3mdev == iifindex; +} + +static inline +bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) +{ + return fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF && + fl->flowi_l3mdev == oifindex; +} + void l3mdev_update_flow(struct net *net, struct flowi *fl); int l3mdev_master_ifindex_rcu(const struct net_device *dev); @@ -78,7 +92,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) struct net_device *dev; int rc = 0; - if (likely(ifindex)) { + if (ifindex) { rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); @@ -198,10 +212,12 @@ struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) if (netif_is_l3_slave(dev)) { struct net_device *master; + rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); if (master && master->l3mdev_ops->l3mdev_l3_out) skb = 
master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto); + rcu_read_unlock(); } return skb; @@ -325,6 +341,19 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, { return 1; } + +static inline +bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) +{ + return false; +} + +static inline +bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) +{ + return false; +} + static inline void l3mdev_update_flow(struct net *net, struct flowi *fl) { diff --git a/include/net/lapb.h b/include/net/lapb.h index ccc3d1f020b0..6c07420644e4 100644 --- a/include/net/lapb.h +++ b/include/net/lapb.h @@ -4,7 +4,7 @@ #include <linux/lapb.h> #include <linux/refcount.h> -#define LAPB_HEADER_LEN 20 /* LAPB over Ethernet + a bit more */ +#define LAPB_HEADER_LEN MAX_HEADER /* LAPB over Ethernet + a bit more */ #define LAPB_ACK_PENDING_CONDITION 0x01 #define LAPB_REJECT_CONDITION 0x02 @@ -92,6 +92,7 @@ struct lapb_cb { unsigned short n2, n2count; unsigned short t1, t2; struct timer_list t1timer, t2timer; + bool t1timer_running, t2timer_running; /* Internal control information */ struct sk_buff_head write_queue; @@ -103,6 +104,7 @@ struct lapb_cb { struct lapb_frame frmr_data; unsigned char frmr_type; + spinlock_t lock; refcount_t refcnt; }; diff --git a/include/net/lib80211.h b/include/net/lib80211.h deleted file mode 100644 index 8b47d3a51cf8..000000000000 --- a/include/net/lib80211.h +++ /dev/null @@ -1,122 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * lib80211.h -- common bits for IEEE802.11 wireless drivers - * - * Copyright (c) 2008, John W. Linville <linville@tuxdriver.com> - * - * Some bits copied from old ieee80211 component, w/ original copyright - * notices below: - * - * Original code based on Host AP (software wireless LAN access point) driver - * for Intersil Prism2/2.5/3. - * - * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen - * <j@w1.fi> - * Copyright (c) 2002-2003, Jouni Malinen <j@w1.fi> - * - * Adaption to a generic IEEE 802.11 stack by James Ketrenos - * <jketreno@linux.intel.com> - * - * Copyright (c) 2004, Intel Corporation - * - */ - -#ifndef LIB80211_H -#define LIB80211_H - -#include <linux/types.h> -#include <linux/list.h> -#include <linux/atomic.h> -#include <linux/if.h> -#include <linux/skbuff.h> -#include <linux/ieee80211.h> -#include <linux/timer.h> -#include <linux/seq_file.h> - -#define NUM_WEP_KEYS 4 - -enum { - IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), -}; - -struct module; - -struct lib80211_crypto_ops { - const char *name; - struct list_head list; - - /* init new crypto context (e.g., allocate private data space, - * select IV, etc.); returns NULL on failure or pointer to allocated - * private data on success */ - void *(*init) (int keyidx); - - /* deinitialize crypto context and free allocated private data */ - void (*deinit) (void *priv); - - /* encrypt/decrypt return < 0 on error or >= 0 on success. The return - * value from decrypt_mpdu is passed as the keyidx value for - * decrypt_msdu. skb must have enough head and tail room for the - * encryption; if not, error will be returned; these functions are - * called for all MPDUs (i.e., fragments). - */ - int (*encrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); - int (*decrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); - - /* These functions are called for full MSDUs, i.e. full frames. - * These can be NULL if full MSDU operations are not needed. 
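
Back in the l3mdev.h hunks above, the new iif/oif rule-match helpers both key off flowi_l3mdev and are kept mutually exclusive by FLOWI_FLAG_L3MDEV_OIF. A hedged sketch of a classifier built on them (the surrounding rule layout is invented, only the two helpers are from the patch):

static bool example_l3mdev_rule_match(const struct flowi *fl, int ifindex)
{
	/* true when the flow is bound to this L3 master device,
	 * whether on the input or the output side
	 */
	return l3mdev_fib_rule_iif_match(fl, ifindex) ||
	       l3mdev_fib_rule_oif_match(fl, ifindex);
}
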
*/ - int (*encrypt_msdu) (struct sk_buff * skb, int hdr_len, void *priv); - int (*decrypt_msdu) (struct sk_buff * skb, int keyidx, int hdr_len, - void *priv); - - int (*set_key) (void *key, int len, u8 * seq, void *priv); - int (*get_key) (void *key, int len, u8 * seq, void *priv); - - /* procfs handler for printing out key information and possible - * statistics */ - void (*print_stats) (struct seq_file *m, void *priv); - - /* Crypto specific flag get/set for configuration settings */ - unsigned long (*get_flags) (void *priv); - unsigned long (*set_flags) (unsigned long flags, void *priv); - - /* maximum number of bytes added by encryption; encrypt buf is - * allocated with extra_prefix_len bytes, copy of in_buf, and - * extra_postfix_len; encrypt need not use all this space, but - * the result must start at the beginning of the buffer and correct - * length must be returned */ - int extra_mpdu_prefix_len, extra_mpdu_postfix_len; - int extra_msdu_prefix_len, extra_msdu_postfix_len; - - struct module *owner; -}; - -struct lib80211_crypt_data { - struct list_head list; /* delayed deletion list */ - struct lib80211_crypto_ops *ops; - void *priv; - atomic_t refcnt; -}; - -struct lib80211_crypt_info { - char *name; - /* Most clients will already have a lock, - so just point to that. */ - spinlock_t *lock; - - struct lib80211_crypt_data *crypt[NUM_WEP_KEYS]; - int tx_keyidx; /* default TX key index (crypt[tx_keyidx]) */ - struct list_head crypt_deinit_list; - struct timer_list crypt_deinit_timer; - int crypt_quiesced; -}; - -int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name, - spinlock_t *lock); -void lib80211_crypt_info_free(struct lib80211_crypt_info *info); -int lib80211_register_crypto_ops(struct lib80211_crypto_ops *ops); -int lib80211_unregister_crypto_ops(struct lib80211_crypto_ops *ops); -struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name); -void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info, - struct lib80211_crypt_data **crypt); - -#endif /* LIB80211_H */ diff --git a/include/net/libeth/cache.h b/include/net/libeth/cache.h new file mode 100644 index 000000000000..bdb0c043ce61 --- /dev/null +++ b/include/net/libeth/cache.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_CACHE_H +#define __LIBETH_CACHE_H + +#include <linux/cache.h> + +/** + * libeth_cacheline_group_assert - make sure cacheline group size is expected + * @type: type of the structure containing the group + * @grp: group name inside the struct + * @sz: expected group size + */ +#if defined(CONFIG_64BIT) && SMP_CACHE_BYTES == 64 +#define libeth_cacheline_group_assert(type, grp, sz) \ + static_assert(offsetof(type, __cacheline_group_end__##grp) - \ + offsetofend(type, __cacheline_group_begin__##grp) == \ + (sz)) +#define __libeth_cacheline_struct_assert(type, sz) \ + static_assert(sizeof(type) == (sz)) +#else /* !CONFIG_64BIT || SMP_CACHE_BYTES != 64 */ +#define libeth_cacheline_group_assert(type, grp, sz) \ + static_assert(offsetof(type, __cacheline_group_end__##grp) - \ + offsetofend(type, __cacheline_group_begin__##grp) <= \ + (sz)) +#define __libeth_cacheline_struct_assert(type, sz) \ + static_assert(sizeof(type) <= (sz)) +#endif /* !CONFIG_64BIT || SMP_CACHE_BYTES != 64 */ + +#define __libeth_cls1(sz1) SMP_CACHE_ALIGN(sz1) +#define __libeth_cls2(sz1, sz2) (SMP_CACHE_ALIGN(sz1) + SMP_CACHE_ALIGN(sz2)) +#define __libeth_cls3(sz1, sz2, sz3) \ + (SMP_CACHE_ALIGN(sz1) + SMP_CACHE_ALIGN(sz2) 
+ SMP_CACHE_ALIGN(sz3)) +#define __libeth_cls(...) \ + CONCATENATE(__libeth_cls, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) + +/** + * libeth_cacheline_struct_assert - make sure CL-based struct size is expected + * @type: type of the struct + * @...: from 1 to 3 CL group sizes (read-mostly, read-write, cold) + * + * When a struct contains several CL groups, it's difficult to predict its size + * on different architectures. The macro instead takes sizes of all of the + * groups the structure contains and generates the final struct size. + */ +#define libeth_cacheline_struct_assert(type, ...) \ + __libeth_cacheline_struct_assert(type, __libeth_cls(__VA_ARGS__)); \ + static_assert(__alignof(type) >= SMP_CACHE_BYTES) + +/** + * libeth_cacheline_set_assert - make sure CL-based struct layout is expected + * @type: type of the struct + * @ro: expected size of the read-mostly group + * @rw: expected size of the read-write group + * @c: expected size of the cold group + * + * Check that each group size is expected and then do final struct size check. + */ +#define libeth_cacheline_set_assert(type, ro, rw, c) \ + libeth_cacheline_group_assert(type, read_mostly, ro); \ + libeth_cacheline_group_assert(type, read_write, rw); \ + libeth_cacheline_group_assert(type, cold, c); \ + libeth_cacheline_struct_assert(type, ro, rw, c) + +#endif /* __LIBETH_CACHE_H */ diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h new file mode 100644 index 000000000000..ab05024be518 --- /dev/null +++ b/include/net/libeth/rx.h @@ -0,0 +1,308 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_RX_H +#define __LIBETH_RX_H + +#include <linux/if_vlan.h> + +#include <net/page_pool/helpers.h> +#include <net/xdp.h> + +/* Rx buffer management */ + +/* Space reserved in front of each frame */ +#define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +/* Maximum headroom for worst-case calculations */ +#define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM +/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ +#define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) +/* Maximum supported L2-L4 header length */ +#define LIBETH_MAX_HEAD roundup_pow_of_two(max(MAX_HEADER, 256)) + +/* Always use order-0 pages */ +#define LIBETH_RX_PAGE_ORDER 0 +/* Pick a sane buffer stride and align to a cacheline boundary */ +#define LIBETH_RX_BUF_STRIDE SKB_DATA_ALIGN(128) +/* HW-writeable space in one buffer: truesize - headroom/tailroom, aligned */ +#define LIBETH_RX_PAGE_LEN(hr) \ + ALIGN_DOWN(SKB_MAX_ORDER(hr, LIBETH_RX_PAGE_ORDER), \ + LIBETH_RX_BUF_STRIDE) + +/** + * struct libeth_fqe - structure representing an Rx buffer (fill queue element) + * @page: page holding the buffer + * @offset: offset from the page start (to the headroom) + * @truesize: total space occupied by the buffer (w/ headroom and tailroom) + * + * Depending on the MTU, API switches between one-page-per-frame and shared + * page model (to conserve memory on bigger-page platforms). In case of the + * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```. 
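
The libeth/cache.h asserts above pair with the __cacheline_group_*_aligned() markers from <linux/cache.h>. A hedged sketch of a driver queue laid out in the three canonical groups; the struct, its members and the literal sizes are illustrative and hold as written on 64-bit with 64-byte cachelines (the non-64-bit variants of the asserts degrade to <=):

struct example_rxq {
	__cacheline_group_begin_aligned(read_mostly);
	void *desc_ring;
	u32 count;
	__cacheline_group_end_aligned(read_mostly);

	__cacheline_group_begin_aligned(read_write);
	u32 next_to_use;
	u32 next_to_clean;
	__cacheline_group_end_aligned(read_write);

	__cacheline_group_begin_aligned(cold);
	struct device *dev;
	__cacheline_group_end_aligned(cold);
};
libeth_cacheline_set_assert(struct example_rxq, 12, 8,
			    sizeof(struct device *));
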
+ */ +struct libeth_fqe { + struct page *page; + u32 offset; + u32 truesize; +} __aligned_largest; + +/** + * enum libeth_fqe_type - enum representing types of Rx buffers + * @LIBETH_FQE_MTU: buffer size is determined by MTU + * @LIBETH_FQE_SHORT: buffer size is smaller than MTU, for short frames + * @LIBETH_FQE_HDR: buffer size is ```LIBETH_MAX_HEAD```-sized, for headers + */ +enum libeth_fqe_type { + LIBETH_FQE_MTU = 0U, + LIBETH_FQE_SHORT, + LIBETH_FQE_HDR, +}; + +/** + * struct libeth_fq - structure representing a buffer (fill) queue + * @fp: hotpath part of the structure + * @pp: &page_pool for buffer management + * @fqes: array of Rx buffers + * @truesize: size to allocate per buffer, w/overhead + * @count: number of descriptors/buffers the queue has + * @type: type of the buffers this queue has + * @hsplit: flag whether header split is enabled + * @buf_len: HW-writeable length per each buffer + * @nid: ID of the closest NUMA node with memory + */ +struct libeth_fq { + struct_group_tagged(libeth_fq_fp, fp, + struct page_pool *pp; + struct libeth_fqe *fqes; + + u32 truesize; + u32 count; + ); + + /* Cold fields */ + enum libeth_fqe_type type:2; + bool hsplit:1; + + u32 buf_len; + int nid; +}; + +int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi); +void libeth_rx_fq_destroy(struct libeth_fq *fq); + +/** + * libeth_rx_alloc - allocate a new Rx buffer + * @fq: fill queue to allocate for + * @i: index of the buffer within the queue + * + * Return: DMA address to be passed to HW for Rx on successful allocation, + * ```DMA_MAPPING_ERROR``` otherwise. + */ +static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i) +{ + struct libeth_fqe *buf = &fq->fqes[i]; + + buf->truesize = fq->truesize; + buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize); + if (unlikely(!buf->page)) + return DMA_MAPPING_ERROR; + + return page_pool_get_dma_addr(buf->page) + buf->offset + + fq->pp->p.offset; +} + +void libeth_rx_recycle_slow(struct page *page); + +/** + * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA + * @fqe: buffer to process + * @len: frame length from the descriptor + * + * Process the buffer after it's written by HW. The regular path is to + * synchronize DMA for CPU, but in case of no data it will be immediately + * recycled back to its PP. + * + * Return: true when there's data to process, false otherwise. + */ +static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe, + u32 len) +{ + struct page *page = fqe->page; + + /* Very rare, but possible case. The most common reason: + * the last fragment contained FCS only, which was then + * stripped by the HW. + */ + if (unlikely(!len)) { + libeth_rx_recycle_slow(page); + return false; + } + + page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len); + + return true; +} + +/* Converting abstract packet type numbers into a software structure with + * the packet parameters to do O(1) lookup on Rx. 
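
A hedged sketch of the refill loop the fill-queue API above implies: the driver hands libeth_rx_alloc() the hotpath struct_group and writes the returned address into its own descriptor format (example_refill() and the descriptor write are illustrative, the libeth calls are as defined above):

static u32 example_refill(struct libeth_fq *fq, u32 to_fill)
{
	u32 i;

	for (i = 0; i < to_fill; i++) {
		dma_addr_t dma = libeth_rx_alloc(&fq->fp, i);

		if (dma == DMA_MAPPING_ERROR)
			break;	/* pool exhausted, retry on next poll */

		/* ... write dma into the i-th Rx descriptor ... */
	}

	return i;	/* number of buffers actually posted */
}
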
+ */ + +enum { + LIBETH_RX_PT_OUTER_L2 = 0U, + LIBETH_RX_PT_OUTER_IPV4, + LIBETH_RX_PT_OUTER_IPV6, +}; + +enum { + LIBETH_RX_PT_NOT_FRAG = 0U, + LIBETH_RX_PT_FRAG, +}; + +enum { + LIBETH_RX_PT_TUNNEL_IP_NONE = 0U, + LIBETH_RX_PT_TUNNEL_IP_IP, + LIBETH_RX_PT_TUNNEL_IP_GRENAT, + LIBETH_RX_PT_TUNNEL_IP_GRENAT_MAC, + LIBETH_RX_PT_TUNNEL_IP_GRENAT_MAC_VLAN, +}; + +enum { + LIBETH_RX_PT_TUNNEL_END_NONE = 0U, + LIBETH_RX_PT_TUNNEL_END_IPV4, + LIBETH_RX_PT_TUNNEL_END_IPV6, +}; + +enum { + LIBETH_RX_PT_INNER_NONE = 0U, + LIBETH_RX_PT_INNER_UDP, + LIBETH_RX_PT_INNER_TCP, + LIBETH_RX_PT_INNER_SCTP, + LIBETH_RX_PT_INNER_ICMP, + LIBETH_RX_PT_INNER_TIMESYNC, +}; + +#define LIBETH_RX_PT_PAYLOAD_NONE PKT_HASH_TYPE_NONE +#define LIBETH_RX_PT_PAYLOAD_L2 PKT_HASH_TYPE_L2 +#define LIBETH_RX_PT_PAYLOAD_L3 PKT_HASH_TYPE_L3 +#define LIBETH_RX_PT_PAYLOAD_L4 PKT_HASH_TYPE_L4 + +struct libeth_rx_pt { + u32 outer_ip:2; + u32 outer_frag:1; + u32 tunnel_type:3; + u32 tunnel_end_prot:2; + u32 tunnel_end_frag:1; + u32 inner_prot:3; + enum pkt_hash_types payload_layer:2; + + u32 pad:2; + enum xdp_rss_hash_type hash_type:16; +}; + +/** + * struct libeth_rx_csum - checksum offload bits decoded from the Rx descriptor + * @l3l4p: detectable L3 and L4 integrity check is processed by the hardware + * @ipe: IP checksum error + * @eipe: external (outermost) IP header (only for tunnels) + * @eudpe: external (outermost) UDP checksum error (only for tunnels) + * @ipv6exadd: IPv6 header with extension headers + * @l4e: L4 integrity error + * @pprs: set for packets that skip checksum calculation in the HW pre parser + * @nat: the packet is a UDP tunneled packet + * @raw_csum_valid: set if raw checksum is valid + * @pad: padding to naturally align raw_csum field + * @raw_csum: raw checksum + */ +struct libeth_rx_csum { + u32 l3l4p:1; + u32 ipe:1; + u32 eipe:1; + u32 eudpe:1; + u32 ipv6exadd:1; + u32 l4e:1; + u32 pprs:1; + u32 nat:1; + + u32 raw_csum_valid:1; + u32 pad:7; + u32 raw_csum:16; +}; + +/** + * struct libeth_rqe_info - receive queue element info + * @len: packet length + * @ptype: packet type based on types programmed into the device + * @eop: whether it's the last fragment of the packet + * @rxe: MAC errors: CRC, Alignment, Oversize, Undersizes, Length error + * @vlan: C-VLAN or S-VLAN tag depending on the VLAN offload configuration + */ +struct libeth_rqe_info { + u32 len; + + u32 ptype:14; + u32 eop:1; + u32 rxe:1; + + u32 vlan:16; +}; + +void libeth_rx_pt_gen_hash_type(struct libeth_rx_pt *pt); + +/** + * libeth_rx_pt_get_ip_ver - get IP version from a packet type structure + * @pt: packet type params + * + * Wrapper to compile out the IPv6 code from the drivers when not supported + * by the kernel. + * + * Return: @pt.outer_ip or stub for IPv6 when not compiled-in. + */ +static inline u32 libeth_rx_pt_get_ip_ver(struct libeth_rx_pt pt) +{ +#if !IS_ENABLED(CONFIG_IPV6) + switch (pt.outer_ip) { + case LIBETH_RX_PT_OUTER_IPV4: + return LIBETH_RX_PT_OUTER_IPV4; + default: + return LIBETH_RX_PT_OUTER_L2; + } +#else + return pt.outer_ip; +#endif +} + +/* libeth_has_*() can be used to quickly check whether the HW metadata is + * available to avoid further expensive processing such as descriptor reads. + * They already check for the corresponding netdev feature to be enabled, + * thus can be used as drop-in replacements.
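
A hedged sketch of how a driver might populate the O(1) lookup table the structures above describe. The HW ptype number (23) and the table size are invented for the example; libeth_rx_pt_gen_hash_type() then derives the XDP hash type once at init time:

static struct libeth_rx_pt example_pt_lut[256] = {
	[23] = {	/* assumption: HW ptype 23 is plain IPv4/TCP */
		.outer_ip	= LIBETH_RX_PT_OUTER_IPV4,
		.outer_frag	= LIBETH_RX_PT_NOT_FRAG,
		.tunnel_type	= LIBETH_RX_PT_TUNNEL_IP_NONE,
		.inner_prot	= LIBETH_RX_PT_INNER_TCP,
		.payload_layer	= LIBETH_RX_PT_PAYLOAD_L4,
	},
};

static void example_pt_init(void)
{
	u32 i;

	for (i = 0; i < ARRAY_SIZE(example_pt_lut); i++)
		libeth_rx_pt_gen_hash_type(&example_pt_lut[i]);
}
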
+ */ + +static inline bool libeth_rx_pt_has_checksum(const struct net_device *dev, + struct libeth_rx_pt pt) +{ + /* Non-zero _INNER* is only possible when _OUTER_IPV* is set, + * it is enough to check only for the L4 type. + */ + return likely(pt.inner_prot > LIBETH_RX_PT_INNER_NONE && + (dev->features & NETIF_F_RXCSUM)); +} + +static inline bool libeth_rx_pt_has_hash(const struct net_device *dev, + struct libeth_rx_pt pt) +{ + return likely(pt.payload_layer > LIBETH_RX_PT_PAYLOAD_NONE && + (dev->features & NETIF_F_RXHASH)); +} + +/** + * libeth_rx_pt_set_hash - fill in skb hash value based on the PT + * @skb: skb to fill the hash in + * @hash: 32-bit hash value from the descriptor + * @pt: packet type + */ +static inline void libeth_rx_pt_set_hash(struct sk_buff *skb, u32 hash, + struct libeth_rx_pt pt) +{ + skb_set_hash(skb, hash, pt.payload_layer); +} + +#endif /* __LIBETH_RX_H */ diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h new file mode 100644 index 000000000000..35614f9523f6 --- /dev/null +++ b/include/net/libeth/tx.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_TX_H +#define __LIBETH_TX_H + +#include <linux/skbuff.h> + +#include <net/libeth/types.h> + +/* Tx buffer completion */ + +/** + * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion + * @LIBETH_SQE_EMPTY: unused/empty, no action required + * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required + * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() + * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA + * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats + */ +enum libeth_sqe_type { + LIBETH_SQE_EMPTY = 0U, + LIBETH_SQE_CTX, + LIBETH_SQE_SLAB, + LIBETH_SQE_FRAG, + LIBETH_SQE_SKB, +}; + +/** + * struct libeth_sqe - represents a Send Queue Element / Tx buffer + * @type: type of the buffer, see the enum above + * @rs_idx: index of the last buffer from the batch this one was sent in + * @raw: slab buffer to free via kfree() + * @skb: &sk_buff to consume + * @dma: DMA address to unmap + * @len: length of the mapped region to unmap + * @nr_frags: number of frags in the frame this buffer belongs to + * @packets: number of physical packets sent for this frame + * @bytes: number of physical bytes sent for this frame + * @priv: driver-private scratchpad + */ +struct libeth_sqe { + enum libeth_sqe_type type:32; + u32 rs_idx; + + union { + void *raw; + struct sk_buff *skb; + }; + + DEFINE_DMA_UNMAP_ADDR(dma); + DEFINE_DMA_UNMAP_LEN(len); + + u32 nr_frags; + u32 packets; + u32 bytes; + + unsigned long priv; +} __aligned_largest; + +/** + * LIBETH_SQE_CHECK_PRIV - check the driver's private SQE data + * @p: type or name of the object the driver wants to fit into &libeth_sqe + * + * Make sure the driver's private data fits into libeth_sqe::priv. To be used + * right after its declaration. + */ +#define LIBETH_SQE_CHECK_PRIV(p) \ + static_assert(sizeof(p) <= sizeof_field(struct libeth_sqe, priv)) + +/** + * struct libeth_cq_pp - completion queue poll params + * @dev: &device to perform DMA unmapping + * @ss: onstack NAPI stats to fill + * @napi: whether it's called from the NAPI context + * + * libeth uses this structure to access objects needed for performing full + * Tx complete operation without passing lots of arguments and changing the + * prototypes each time a new one is added.
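As a usage sketch: a driver stashes its per-buffer state in the priv scratchpad (validated at compile time with LIBETH_SQE_CHECK_PRIV()) and runs one libeth_tx_complete() call per reaped SQE, with the struct libeth_cq_pp defined just below carrying the unmap/stats context. All my_* names here are hypothetical:

struct my_tx_priv {
	u32 sw_flags;
};
LIBETH_SQE_CHECK_PRIV(struct my_tx_priv);

static void my_txq_clean(struct libeth_sqe *sqes, u32 count,
			 struct device *dev,
			 struct libeth_sq_napi_stats *ss)
{
	struct libeth_cq_pp cp = {
		.dev	= dev,
		.ss	= ss,
		.napi	= true,	/* called from the NAPI poll loop */
	};
	u32 i;

	for (i = 0; i < count; i++)
		libeth_tx_complete(&sqes[i], &cp);
}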
+ */ +struct libeth_cq_pp { + struct device *dev; + struct libeth_sq_napi_stats *ss; + + bool napi; +}; + +/** + * libeth_tx_complete - perform Tx completion for one SQE + * @sqe: SQE to complete + * @cp: poll params + * + * Do Tx complete for all the types of buffers, incl. freeing, unmapping, + * updating the stats etc. + */ +static inline void libeth_tx_complete(struct libeth_sqe *sqe, + const struct libeth_cq_pp *cp) +{ + switch (sqe->type) { + case LIBETH_SQE_EMPTY: + return; + case LIBETH_SQE_SKB: + case LIBETH_SQE_FRAG: + case LIBETH_SQE_SLAB: + dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma), + dma_unmap_len(sqe, len), DMA_TO_DEVICE); + break; + default: + break; + } + + switch (sqe->type) { + case LIBETH_SQE_SKB: + cp->ss->packets += sqe->packets; + cp->ss->bytes += sqe->bytes; + + napi_consume_skb(sqe->skb, cp->napi); + break; + case LIBETH_SQE_SLAB: + kfree(sqe->raw); + break; + default: + break; + } + + sqe->type = LIBETH_SQE_EMPTY; +} + +#endif /* __LIBETH_TX_H */ diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h new file mode 100644 index 000000000000..603825e45133 --- /dev/null +++ b/include/net/libeth/types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_TYPES_H +#define __LIBETH_TYPES_H + +#include <linux/types.h> + +/** + * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @raw: alias to access all the fields as an array + */ +struct libeth_sq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +#endif /* __LIBETH_TYPES_H */ diff --git a/include/net/llc.h b/include/net/llc.h index df282d9b4017..e250dca03963 100644 --- a/include/net/llc.h +++ b/include/net/llc.h @@ -72,7 +72,9 @@ struct llc_sap { static inline struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex) { - return &sap->sk_dev_hash[ifindex % LLC_SK_DEV_HASH_ENTRIES]; + u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS); + + return &sap->sk_dev_hash[bucket]; } static inline @@ -133,7 +135,7 @@ static inline void llc_sap_put(struct llc_sap *sap) struct llc_sap *llc_sap_find(unsigned char sap_value); int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, - unsigned char *dmac, unsigned char dsap); + const unsigned char *dmac, unsigned char dsap); void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); diff --git a/include/net/llc_c_ac.h b/include/net/llc_c_ac.h index e766300b3e99..7620a9196922 100644 --- a/include/net/llc_c_ac.h +++ b/include/net/llc_c_ac.h @@ -16,6 +16,13 @@ * Connection state transition actions * (Fb = F bit; Pb = P bit; Xb = X bit) */ + +#include <linux/types.h> + +struct sk_buff; +struct sock; +struct timer_list; + #define LLC_CONN_AC_CLR_REMOTE_BUSY 1 #define LLC_CONN_AC_CONN_IND 2 #define LLC_CONN_AC_CONN_CONFIRM 3 @@ -168,7 +175,6 @@ int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_adjust_npta_by_rr(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_adjust_npta_by_rnr(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb); -int llc_conn_ac_send_i_rsp_as_ack(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb); void llc_conn_busy_tmr_cb(struct timer_list *t); diff 
--git a/include/net/llc_c_ev.h b/include/net/llc_c_ev.h index 3948cf111dd0..241889955157 100644 --- a/include/net/llc_c_ev.h +++ b/include/net/llc_c_ev.h @@ -158,7 +158,6 @@ int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb); int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb); int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb); int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb); -int llc_conn_ev_sendack_tmr_exp(struct sock *sk, struct sk_buff *skb); /* NOT_USED functions and their variations */ int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb); int llc_conn_ev_rx_xxx_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb); diff --git a/include/net/llc_c_st.h b/include/net/llc_c_st.h index 48f3f891b2f9..a4bea0f33188 100644 --- a/include/net/llc_c_st.h +++ b/include/net/llc_c_st.h @@ -11,6 +11,10 @@ * * See the GNU General Public License for more details. */ + +#include <net/llc_c_ac.h> +#include <net/llc_c_ev.h> + /* Connection component state management */ /* connection states */ #define LLC_CONN_OUT_OF_SVC 0 /* prior to allocation */ @@ -40,8 +44,8 @@ struct llc_conn_state_trans { }; struct llc_conn_state { - u8 current_state; - struct llc_conn_state_trans **transitions; + u8 current_state; + const struct llc_conn_state_trans **transitions; }; extern struct llc_conn_state llc_conn_state_table[]; diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index ea985aa7a6c5..374411b3066c 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -38,6 +38,7 @@ struct llc_sock { struct llc_addr laddr; /* lsap/mac pair */ struct llc_addr daddr; /* dsap/mac pair */ struct net_device *dev; /* device to send to remote */ + netdevice_tracker dev_tracker; u32 copied_seq; /* head of yet unread data */ u8 retry_count; /* number of retries */ u8 ack_must_be_send; @@ -110,7 +111,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit); void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit); int llc_conn_remove_acked_pdus(struct sock *conn, u8 nr, u16 *how_many_unacked); struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, - struct llc_addr *laddr); + struct llc_addr *laddr, const struct net *net); void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk); void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk); diff --git a/include/net/llc_if.h b/include/net/llc_if.h index 8d5c543cd620..c72570a21a4f 100644 --- a/include/net/llc_if.h +++ b/include/net/llc_if.h @@ -62,7 +62,8 @@ #define LLC_STATUS_CONFLICT 7 /* disconnect conn */ #define LLC_STATUS_RESET_DONE 8 /* */ -int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap); +int llc_establish_connection(struct sock *sk, const u8 *lmac, u8 *dmac, + u8 dsap); int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb); int llc_send_disc(struct sock *sk); #endif /* LLC_IF_H */ diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h index c0f0a13ed818..86681f29bda7 100644 --- a/include/net/llc_pdu.h +++ b/include/net/llc_pdu.h @@ -15,9 +15,11 @@ #include <linux/if_ether.h> /* Lengths of frame formats */ -#define LLC_PDU_LEN_I 4 /* header and 2 control bytes */ -#define LLC_PDU_LEN_S 4 -#define LLC_PDU_LEN_U 3 /* header and 1 control byte */ +#define LLC_PDU_LEN_I 4 /* header and 2 control bytes */ +#define LLC_PDU_LEN_S 4 +#define LLC_PDU_LEN_U 3 /* header and 1 control byte */ +/* header and 1 control byte and XID info */ +#define LLC_PDU_LEN_U_XID 
(LLC_PDU_LEN_U + sizeof(struct llc_xid_info)) /* Known SAP addresses */ #define LLC_GLOBAL_SAP 0xFF #define LLC_NULL_SAP 0x00 /* not network-layer visible */ @@ -50,9 +52,10 @@ #define LLC_PDU_TYPE_U_MASK 0x03 /* 8-bit control field */ #define LLC_PDU_TYPE_MASK 0x03 -#define LLC_PDU_TYPE_I 0 /* first bit */ -#define LLC_PDU_TYPE_S 1 /* first two bits */ -#define LLC_PDU_TYPE_U 3 /* first two bits */ +#define LLC_PDU_TYPE_I 0 /* first bit */ +#define LLC_PDU_TYPE_S 1 /* first two bits */ +#define LLC_PDU_TYPE_U 3 /* first two bits */ +#define LLC_PDU_TYPE_U_XID 4 /* private type for detecting XID commands */ #define LLC_PDU_TYPE_IS_I(pdu) \ ((!(pdu->ctrl_1 & LLC_PDU_TYPE_I_MASK)) ? 1 : 0) @@ -230,9 +233,18 @@ static inline struct llc_pdu_un *llc_pdu_un_hdr(struct sk_buff *skb) static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, u8 ssap, u8 dsap, u8 cr) { - const int hlen = type == LLC_PDU_TYPE_U ? 3 : 4; + int hlen = 4; /* default value for I and S types */ struct llc_pdu_un *pdu; + switch (type) { + case LLC_PDU_TYPE_U: + hlen = 3; + break; + case LLC_PDU_TYPE_U_XID: + hlen = 6; + break; + } + skb_push(skb, hlen); skb_reset_network_header(skb); pdu = llc_pdu_un_hdr(skb); @@ -242,7 +254,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, } /** - * llc_pdu_decode_sa - extracs source address (MAC) of input frame + * llc_pdu_decode_sa - extracts source address (MAC) of input frame * @skb: input skb that source address must be extracted from it. * @sa: pointer to source address (6 byte array). * @@ -250,21 +262,19 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, */ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) { - if (skb->protocol == htons(ETH_P_802_2)) - memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); + memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); } /** * llc_pdu_decode_da - extracts dest address of input frame * @skb: input skb that destination address must be extracted from it - * @sa: pointer to destination address (6 byte array). + * @da: pointer to destination address (6 byte array). * * This function extracts destination address (MAC) of input frame. */ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) { - if (skb->protocol == htons(ETH_P_802_2)) - memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); + memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); } /** @@ -309,7 +319,7 @@ static inline void llc_pdu_init_as_ui_cmd(struct sk_buff *skb) /** * llc_pdu_init_as_test_cmd - sets PDU as TEST - * @skb - Address of the skb to build + * @skb: Address of the skb to build * * Sets a PDU as TEST */ @@ -357,6 +367,8 @@ struct llc_xid_info { /** * llc_pdu_init_as_xid_cmd - sets bytes 3, 4 & 5 of LLC header as XID * @skb: input skb that header must be set into it. + * @svcs_supported: The class of the LLC (I or II) + * @rx_window: The size of the receive window of the LLC * * This function sets the third, fourth, fifth and sixth bytes of LLC header as * an XID PDU.
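With the new LLC_PDU_TYPE_U_XID case, llc_pdu_header_init() reserves all six bytes (3-byte U header plus 3-byte XID info) up front, so llc_pdu_init_as_xid_cmd() only fills fields in, as the next hunk notes. A rough sketch of a sender, assuming the skb already has sufficient headroom (LLC_PDU_CMD and LLC_XID_NULL_CLASS_2 are existing llc_pdu.h constants; my_build_xid_cmd() is hypothetical):

static void my_build_xid_cmd(struct sk_buff *skb, u8 ssap, u8 dsap)
{
	/* pushes all 6 bytes at once for the U_XID pseudo-type */
	llc_pdu_header_init(skb, LLC_PDU_TYPE_U_XID, ssap, dsap, LLC_PDU_CMD);
	/* fills bytes 3..5; no additional skb_put() is required */
	llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0);
	/* the caller then prepends the MAC header and transmits */
}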
@@ -374,7 +386,10 @@ static inline void llc_pdu_init_as_xid_cmd(struct sk_buff *skb, xid_info->fmt_id = LLC_XID_FMT_ID; /* 0x81 */ xid_info->type = svcs_supported; xid_info->rw = rx_window << 1; /* size of receive window */ - skb_put(skb, sizeof(struct llc_xid_info)); + + /* no need to push/put since llc_pdu_header_init() has already + * pushed 3 + 3 bytes + */ } /** diff --git a/include/net/llc_s_ac.h b/include/net/llc_s_ac.h index a61b98c108ee..f71790305bc9 100644 --- a/include/net/llc_s_ac.h +++ b/include/net/llc_s_ac.h @@ -11,6 +11,10 @@ * * See the GNU General Public License for more details. */ + +struct llc_sap; +struct sk_buff; + /* SAP component actions */ #define SAP_ACT_UNITDATA_IND 1 #define SAP_ACT_SEND_UI 2 diff --git a/include/net/llc_s_ev.h b/include/net/llc_s_ev.h index 84db3a59ed28..fb7df1d70af3 100644 --- a/include/net/llc_s_ev.h +++ b/include/net/llc_s_ev.h @@ -13,6 +13,7 @@ */ #include <linux/skbuff.h> +#include <net/llc.h> /* Defines SAP component events */ /* Types of events (possible values in 'ev->type') */ diff --git a/include/net/llc_s_st.h b/include/net/llc_s_st.h index c4359e203013..fca49d483d20 100644 --- a/include/net/llc_s_st.h +++ b/include/net/llc_s_st.h @@ -12,6 +12,12 @@ * See the GNU General Public License for more details. */ +#include <linux/types.h> +#include <net/llc_s_ac.h> +#include <net/llc_s_ev.h> + +struct llc_sap_state_trans; + #define LLC_NR_SAP_STATES 2 /* size of state table */ /* structures and types */ @@ -23,8 +29,8 @@ struct llc_sap_state_trans { }; struct llc_sap_state { - u8 curr_state; - struct llc_sap_state_trans **transitions; + u8 curr_state; + const struct llc_sap_state_trans **transitions; }; /* only access to SAP state table */ diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 05cfd6ff6528..c306ebe379a0 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -16,9 +16,12 @@ #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) #define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) +/* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return + * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety. + */ enum { LWTUNNEL_XMIT_DONE, - LWTUNNEL_XMIT_CONTINUE, + LWTUNNEL_XMIT_CONTINUE = 0x100, }; @@ -51,6 +54,9 @@ struct lwtunnel_encap_ops { }; #ifdef CONFIG_LWTUNNEL + +DECLARE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); + void lwtstate_free(struct lwtunnel_state *lws); static inline struct lwtunnel_state * @@ -200,6 +206,7 @@ static inline int lwtunnel_valid_encap_type(u16 encap_type, NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } + static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e8e295dae744..82617579d910 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2020 Intel Corporation + * Copyright (C) 2018 - 2025 Intel Corporation */ #ifndef MAC80211_H @@ -18,10 +18,11 @@ #include <linux/if_ether.h> #include <linux/skbuff.h> #include <linux/ieee80211.h> +#include <linux/lockdep.h> #include <net/cfg80211.h> #include <net/codel.h> #include <net/ieee80211_radiotap.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /** * DOC: Introduction @@ -78,7 +79,7 @@ * helpers for sanity checking. 
Drivers must ensure all work added onto the * mac80211 workqueue should be cancelled on the driver stop() callback. * - * mac80211 will flushed the workqueue upon interface removal and during + * mac80211 will flush the workqueue upon interface removal and during * suspend. * * All work performed on the mac80211 workqueue must not acquire the RTNL lock. @@ -88,15 +89,13 @@ /** * DOC: mac80211 software tx queueing * - * mac80211 provides an optional intermediate queueing implementation designed - * to allow the driver to keep hardware queues short and provide some fairness - * between different stations/interfaces. - * In this model, the driver pulls data frames from the mac80211 queue instead - * of letting mac80211 push them via drv_tx(). - * Other frames (e.g. control or management) are still pushed using drv_tx(). + * mac80211 uses an intermediate queueing implementation, designed to allow the + * driver to keep hardware queues short and to provide some fairness between + * different stations/interfaces. * - * Drivers indicate that they use this model by implementing the .wake_tx_queue - * driver operation. + * Drivers must provide the .wake_tx_queue driver operation by either + * linking it to ieee80211_handle_wake_tx_queue() or implementing a custom + * handler. * * Intermediate queues (struct ieee80211_txq) are kept per-sta per-tid, with * another per-sta for non-data/non-mgmt and bufferable management frames, and @@ -105,9 +104,12 @@ * The driver is expected to initialize its private per-queue data for stations * and interfaces in the .add_interface and .sta_add ops. * - * The driver can't access the queue directly. To dequeue a frame from a - * txq, it calls ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a - * queue, it calls the .wake_tx_queue driver op. + * The driver can't access the internal TX queues (iTXQs) directly. + * Whenever mac80211 adds a new frame to a queue, it calls the .wake_tx_queue + * driver op. + * Drivers implementing a custom .wake_tx_queue op can get them by calling + * ieee80211_tx_dequeue(). Drivers using ieee80211_handle_wake_tx_queue() will + * simply get the individual frames pushed via the .tx driver operation. * * Drivers can optionally delegate responsibility for scheduling queues to * mac80211, to take advantage of airtime fairness accounting. In this case, to @@ -125,6 +127,22 @@ * via the usual ieee80211_tx_dequeue). */ +/** + * DOC: HW timestamping + * + * Timing Measurement and Fine Timing Measurement require accurate timestamps + * of the action frames TX/RX and their respective acks. + * + * To report hardware timestamps for Timing Measurement or Fine Timing + * Measurement frame RX, the low level driver should set the SKB's hwtstamp + * field to the frame RX timestamp and report the ack TX timestamp in the + * ieee80211_rx_status struct. + * + * Similarly, to report hardware timestamps for Timing Measurement or Fine + * Timing Measurement frame TX, the driver should set the SKB's hwtstamp field + * to the frame TX timestamp and report the ack RX timestamp in the + * ieee80211_tx_status struct. 
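Following the convention this DOC section describes, a driver's RX handler for a (Fine) Timing Measurement frame would look roughly like the sketch below; the my_* name and the way the device delivers the two timestamps are assumptions:

static void my_rx_timing_msmt(struct sk_buff *skb, u64 rx_ts_ns,
			      u64 ack_tx_ts_ns)
{
	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);

	/* the frame RX timestamp lives in the skb itself */
	skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(rx_ts_ns);
	/* the corresponding ack TX timestamp goes into the RX status */
	status->ack_tx_hwtstamp = ns_to_ktime(ack_tx_ts_ns);
}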
+ */ struct device; /** @@ -195,14 +213,31 @@ struct ieee80211_low_level_stats { * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel, * this is used only with channel switching with CSA - * @IEEE80211_CHANCTX_CHANGE_MIN_WIDTH: The min required channel width changed + * @IEEE80211_CHANCTX_CHANGE_MIN_DEF: The min chandef changed + * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider + * bandwidth) OFDMA settings need to be changed + * @IEEE80211_CHANCTX_CHANGE_PUNCTURING: The punctured channel(s) bitmap + * was changed. */ enum ieee80211_chanctx_change { IEEE80211_CHANCTX_CHANGE_WIDTH = BIT(0), IEEE80211_CHANCTX_CHANGE_RX_CHAINS = BIT(1), IEEE80211_CHANCTX_CHANGE_RADAR = BIT(2), IEEE80211_CHANCTX_CHANGE_CHANNEL = BIT(3), - IEEE80211_CHANCTX_CHANGE_MIN_WIDTH = BIT(4), + IEEE80211_CHANCTX_CHANGE_MIN_DEF = BIT(4), + IEEE80211_CHANCTX_CHANGE_AP = BIT(5), + IEEE80211_CHANCTX_CHANGE_PUNCTURING = BIT(6), +}; + +/** + * struct ieee80211_chan_req - A channel "request" + * @oper: channel definition to use for operation + * @ap: the channel definition of the AP, if any + * (otherwise the chan member is %NULL) + */ +struct ieee80211_chan_req { + struct cfg80211_chan_def oper; + struct cfg80211_chan_def ap; }; /** @@ -213,6 +248,9 @@ enum ieee80211_chanctx_change { * * @def: the channel definition * @min_def: the minimum channel definition currently required. + * @ap: the channel definition the AP actually is operating as, + * for use with (wider bandwidth) OFDMA + * @radio_idx: index of the wiphy radio used for this channel * @rx_chains_static: The number of RX chains that must always be * active on the channel to receive MIMO transmissions * @rx_chains_dynamic: The number of RX chains that must be enabled @@ -225,7 +263,9 @@ struct ieee80211_chanctx_conf { struct cfg80211_chan_def def; struct cfg80211_chan_def min_def; + struct cfg80211_chan_def ap; + + int radio_idx; u8 rx_chains_static, rx_chains_dynamic; bool radar_enabled; @@ -261,11 +301,13 @@ enum ieee80211_chanctx_switch_mode { * done. * * @vif: the vif that should be switched from old_ctx to new_ctx + * @link_conf: the link conf that's switching * @old_ctx: the old context to which the vif was assigned * @new_ctx: the new context to which the vif must be assigned */ struct ieee80211_vif_chanctx_switch { struct ieee80211_vif *vif; + struct ieee80211_bss_conf *link_conf; struct ieee80211_chanctx_conf *old_ctx; struct ieee80211_chanctx_conf *new_ctx; }; @@ -273,8 +315,8 @@ struct ieee80211_vif_chanctx_switch { /** * enum ieee80211_bss_change - BSS change notification flags * - * These flags are used with the bss_info_changed() callback - * to indicate which BSS parameter changed. + * These flags are used with the bss_info_changed(), link_info_changed() + * and vif_cfg_changed() callbacks to indicate which parameter(s) changed. * * @BSS_CHANGED_ASSOC: association status changed (associated/disassociated), * also implies a change in the AID. @@ -320,7 +362,9 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_FILS_DISCOVERY: FILS discovery status changed. * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response * status changed. - * + * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed.
+ * @BSS_CHANGED_MLD_TTLM: negotiated TID to link mapping was changed + * @BSS_CHANGED_TPE: transmit power envelope changed */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -354,7 +398,10 @@ enum ieee80211_bss_change { BSS_CHANGED_HE_OBSS_PD = 1<<28, BSS_CHANGED_HE_BSS_COLOR = 1<<29, BSS_CHANGED_FILS_DISCOVERY = 1<<30, - BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = 1<<31, + BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = BIT_ULL(31), + BSS_CHANGED_MLD_VALID_LINKS = BIT_ULL(33), + BSS_CHANGED_MLD_TTLM = BIT_ULL(34), + BSS_CHANGED_TPE = BIT_ULL(35), /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -453,9 +500,9 @@ struct ieee80211_ba_event { /** * struct ieee80211_event - event to be sent to the driver * @type: The event itself. See &enum ieee80211_event_type. - * @rssi: relevant if &type is %RSSI_EVENT - * @mlme: relevant if &type is %AUTH_EVENT - * @ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT + * @u.rssi: relevant if &type is %RSSI_EVENT + * @u.mlme: relevant if &type is %AUTH_EVENT + * @u.ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT * @u:union holding the fields above */ struct ieee80211_event { @@ -507,17 +554,54 @@ struct ieee80211_fils_discovery { u32 max_interval; }; +#define IEEE80211_TPE_EIRP_ENTRIES_320MHZ 5 +struct ieee80211_parsed_tpe_eirp { + bool valid; + s8 power[IEEE80211_TPE_EIRP_ENTRIES_320MHZ]; + u8 count; +}; + +#define IEEE80211_TPE_PSD_ENTRIES_320MHZ 16 +struct ieee80211_parsed_tpe_psd { + bool valid; + s8 power[IEEE80211_TPE_PSD_ENTRIES_320MHZ]; + u8 count, n; +}; + +/** + * struct ieee80211_parsed_tpe - parsed transmit power envelope information + * @max_local: maximum local EIRP, one value for 20, 40, 80, 160, 320 MHz each + * (indexed by TX power category) + * @max_reg_client: maximum regulatory client EIRP, one value for 20, 40, 80, + * 160, 320 MHz each + * (indexed by TX power category) + * @psd_local: maximum local power spectral density, one value for each 20 MHz + * subchannel per bss_conf's chanreq.oper + * (indexed by TX power category) + * @psd_reg_client: maximum regulatory power spectral density, one value for + * each 20 MHz subchannel per bss_conf's chanreq.oper + * (indexed by TX power category) + */ +struct ieee80211_parsed_tpe { + struct ieee80211_parsed_tpe_eirp max_local[2], max_reg_client[2]; + struct ieee80211_parsed_tpe_psd psd_local[2], psd_reg_client[2]; +}; + /** * struct ieee80211_bss_conf - holds the BSS's changing parameters * * This structure keeps information about a BSS (and an association * to that BSS) that can change during the lifetime of the BSS. * + * @vif: reference to owning VIF + * @bss: the cfg80211 bss descriptor. Valid only for a station, and only + * when associated. Note: This contains information which is not + * necessarily authenticated. For example, information coming from probe + * responses. 
+ * @addr: (link) address used locally + * @link_id: link ID, or 0 for non-MLO * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE - * @multi_sta_back_32bit: supports BA bitmap of 32-bits in Multi-STA BACK * @uora_exists: is the UORA element advertised by AP - * @ack_enabled: indicates support to receive a multi-TID that solicits either - * ACK, BACK or both * @uora_ocw_range: UORA element's OCW Range field * @frame_time_rts_th: HE duration RTS threshold, in units of 32us * @he_support: does this BSS support HE @@ -526,11 +610,7 @@ struct ieee80211_fils_discovery { * @twt_responder: does this BSS support TWT requester (relevant for managed * mode only, set if the AP advertises TWT responder role) * @twt_protected: does this BSS support protected TWT frames - * @assoc: association status - * @ibss_joined: indicates whether this station is part of an IBSS - * or not - * @ibss_creator: indicates if a new IBSS network is being created - * @aid: association ID number, valid only when @assoc is true + * @twt_broadcast: does this BSS support broadcast TWT * @use_cts_prot: use CTS protection * @use_short_preamble: use 802.11b short preamble * @use_short_slot: use short slot time (only relevant for ERP) @@ -551,6 +631,8 @@ struct ieee80211_fils_discovery { * IMPORTANT: These three sync_* parameters would possibly be out of sync * by the time the driver will use them. The synchronized view is currently * guaranteed only in certain callbacks. + * Note also that this is not used with MLD associations, mac80211 doesn't + * know how to track beacons for all of the links for this. * @beacon_int: beacon interval * @assoc_capability: capabilities taken from assoc resp * @basic_rates: bitmap of basic rates, each bit stands for an @@ -560,7 +642,7 @@ struct ieee80211_fils_discovery { * @mcast_rate: per-band multicast rate index + 1 (0: disabled) * @bssid: The BSSID for this BSS * @enable_beacon: whether beaconing should be enabled or not - * @chandef: Channel definition for this BSS -- the hardware might be + * @chanreq: Channel request for this BSS -- the hardware might be * configured a higher bandwidth than this BSS uses, for example. * @mu_group: VHT MU-MIMO group membership data * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. @@ -576,21 +658,7 @@ struct ieee80211_fils_discovery { * threshold event and can't be enabled simultaneously with it. * @cqm_rssi_high: Connection quality monitor RSSI upper threshold. * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis - * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The - * may filter ARP queries targeted for other addresses than listed here. - * The driver must allow ARP queries targeted for all address listed here - * to pass through. An empty list implies no ARP queries need to pass. - * @arp_addr_cnt: Number of addresses currently on the list. Note that this - * may be larger than %IEEE80211_BSS_ARP_ADDR_LIST_LEN (the arp_addr_list - * array size), it's up to the driver what to do in that case. * @qos: This is a QoS-enabled BSS. - * @idle: This interface is idle. There's also a global idle flag in the - * hardware config which may be more appropriate depending on what - * your driver/device needs to do. - * @ps: power-save mode (STA only). This flag is NOT affected by - * offchannel/dynamic_ps operations. - * @ssid: The SSID of the current vif. Valid in AP and IBSS mode. - * @ssid_len: Length of SSID given in @ssid. * @hidden_ssid: The SSID of the current vif is hidden. 
Only valid in AP-mode. * @txpower: TX power in dBm. INT_MIN means not configured. * @txpower_type: TX power adjustment used to control per packet Transmit @@ -614,6 +682,9 @@ struct ieee80211_fils_discovery { * responder functionality. * @ftmr_params: configurable lci/civic parameter when enabling FTM responder. * @nontransmitted: this BSS is a nontransmitted BSS profile + * @tx_bss_conf: Pointer to the BSS configuration of transmitting interface + * if MBSSID is enabled. This pointer is RCU-protected due to CSA finish + * and BSS color change flows accessing it. * @transmitter_bssid: the address of transmitter AP * @bssid_index: index inside the multiple BSSID set * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set @@ -621,32 +692,89 @@ struct ieee80211_fils_discovery { * nontransmitted BSSIDs * @profile_periodicity: the least number of beacon frames need to be received * in order to discover all the nontransmitted BSSIDs in the set. - * @he_oper: HE operation information of the AP we are connected to + * @he_oper: HE operation information of the BSS (AP/Mesh) or of the AP we are + * connected to (STA) * @he_obss_pd: OBSS Packet Detection parameters. * @he_bss_color: BSS coloring settings, if BSS supports HE * @fils_discovery: FILS discovery configuration * @unsol_bcast_probe_resp_interval: Unsolicited broadcast probe response * interval. - * @s1g: BSS is S1G BSS (affects Association Request format). * @beacon_tx_rate: The configured beacon transmit rate that needs to be passed * to driver when rate control is offloaded to firmware. + * @power_type: power type of BSS for 6 GHz + * @tpe: transmit power envelope information + * @pwr_reduction: power constraint of BSS. + * @eht_support: does this BSS support EHT + * @epcs_support: does this BSS support EPCS + * @csa_active: marks whether a channel switch is going on. + * @mu_mimo_owner: indicates interface owns MU-MIMO capability + * @chanctx_conf: The channel context this interface is assigned to, or %NULL + * when it is not assigned. This pointer is RCU-protected due to the TX + * path needing to access it; even though the netdev carrier will always + * be off when it is %NULL there can still be races and packets could be + * processed after it switches back to %NULL. + * @color_change_active: marks whether a color change is ongoing. + * @color_change_color: the bss color that will be used after the change. + * @ht_ldpc: in AP mode, indicates interface has HT LDPC capability. + * @vht_ldpc: in AP mode, indicates interface has VHT LDPC capability. + * @he_ldpc: in AP mode, indicates interface has HE LDPC capability. 
+ * @vht_su_beamformer: in AP mode, does this BSS support operation as a VHT SU + * beamformer + * @vht_su_beamformee: in AP mode, does this BSS support operation as a VHT SU + * beamformee + * @vht_mu_beamformer: in AP mode, does this BSS support operation as a VHT MU + * beamformer + * @vht_mu_beamformee: in AP mode, does this BSS support operation as a VHT MU + * beamformee + * @he_su_beamformer: in AP mode, does this BSS support operation as an HE SU + * beamformer + * @he_su_beamformee: in AP mode, does this BSS support operation as an HE SU + * beamformee + * @he_mu_beamformer: in AP mode, does this BSS support operation as an HE MU + * beamformer + * @he_full_ul_mumimo: does this BSS support the reception (AP) or transmission + * (non-AP STA) of an HE TB PPDU on an RU that spans the entire PPDU + * bandwidth + * @eht_su_beamformer: in AP mode, does this BSS enable operation as an EHT SU + * beamformer + * @eht_su_beamformee: in AP mode, does this BSS enable operation as an EHT SU + * beamformee + * @eht_mu_beamformer: in AP mode, does this BSS enable operation as an EHT MU + * beamformer + * @eht_80mhz_full_bw_ul_mumimo: in AP mode, does this BSS support the + * reception of an EHT TB PPDU on an RU that spans the entire PPDU + * bandwidth + * @eht_disable_mcs15: disable EHT-MCS 15 reception capability. + * @bss_param_ch_cnt: in BSS mode, the BSS params change count. This + * information is the latest known value. It can come from this link's + * beacon or from a beacon sent by another link. + * @bss_param_ch_cnt_link_id: in BSS mode, the link_id to which the beacon + * that updated &bss_param_ch_cnt belongs. E.g. if link 1 doesn't hear + * its beacons, and link 2 sent a beacon with an RNR element that updated + * link 1's BSS params change count, then link 1's + * bss_param_ch_cnt_link_id will be 2. That means that link 1 knows that + * link 2 was the link that updated its bss_param_ch_cnt value. + * In case link 1 hears its beacon again, bss_param_ch_cnt_link_id will + * be updated to 1, even if bss_param_ch_cnt didn't change. This allows + * the link to know that it heard the latest value from its own beacon + * (as opposed to hearing its value from another link's beacon).
*/ struct ieee80211_bss_conf { + struct ieee80211_vif *vif; + struct cfg80211_bss *bss; + const u8 *bssid; + unsigned int link_id; + u8 addr[ETH_ALEN] __aligned(2); u8 htc_trig_based_pkt_ext; - bool multi_sta_back_32bit; bool uora_exists; - bool ack_enabled; u8 uora_ocw_range; u16 frame_time_rts_th; bool he_support; bool twt_requester; bool twt_responder; bool twt_protected; - /* association related data */ - bool assoc, ibss_joined; - bool ibss_creator; - u16 aid; + bool twt_broadcast; /* erp related data */ bool use_cts_prot; bool use_short_preamble; @@ -666,15 +794,9 @@ struct ieee80211_bss_conf { u32 cqm_rssi_hyst; s32 cqm_rssi_low; s32 cqm_rssi_high; - struct cfg80211_chan_def chandef; + struct ieee80211_chan_req chanreq; struct ieee80211_mu_group_data mu_group; - __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; - int arp_addr_cnt; bool qos; - bool idle; - bool ps; - u8 ssid[IEEE80211_MAX_SSID_LEN]; - size_t ssid_len; bool hidden_ssid; int txpower; enum nl80211_tx_power_setting txpower_type; @@ -686,6 +808,7 @@ struct ieee80211_bss_conf { struct ieee80211_ftm_responder_params *ftmr_params; /* Multiple BSSID data */ bool nontransmitted; + struct ieee80211_bss_conf __rcu *tx_bss_conf; u8 transmitter_bssid[ETH_ALEN]; u8 bssid_index; u8 bssid_indicator; @@ -699,8 +822,41 @@ struct ieee80211_bss_conf { struct cfg80211_he_bss_color he_bss_color; struct ieee80211_fils_discovery fils_discovery; u32 unsol_bcast_probe_resp_interval; - bool s1g; struct cfg80211_bitrate_mask beacon_tx_rate; + enum ieee80211_ap_reg_power power_type; + + struct ieee80211_parsed_tpe tpe; + + u8 pwr_reduction; + bool eht_support; + bool epcs_support; + bool csa_active; + + bool mu_mimo_owner; + struct ieee80211_chanctx_conf __rcu *chanctx_conf; + + bool color_change_active; + u8 color_change_color; + + bool ht_ldpc; + bool vht_ldpc; + bool he_ldpc; + bool vht_su_beamformer; + bool vht_su_beamformee; + bool vht_mu_beamformer; + bool vht_mu_beamformee; + bool he_su_beamformer; + bool he_su_beamformee; + bool he_mu_beamformer; + bool he_full_ul_mumimo; + bool eht_su_beamformer; + bool eht_su_beamformee; + bool eht_mu_beamformer; + bool eht_80mhz_full_bw_ul_mumimo; + bool eht_disable_mcs15; + + u8 bss_param_ch_cnt; + u8 bss_param_ch_cnt_link_id; }; /** @@ -856,6 +1012,20 @@ enum mac80211_tx_info_flags { * it can be sent out. * @IEEE80211_TX_CTRL_NO_SEQNO: Do not overwrite the sequence number that * has already been assigned to this frame. + * @IEEE80211_TX_CTRL_DONT_REORDER: This frame should not be reordered + * relative to other frames that have this flag set, independent + * of their QoS TID or other priority field values. + * @IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX: first MLO TX, used mostly internally + * for sequence number assignment + * @IEEE80211_TX_CTRL_DONT_USE_RATE_MASK: Don't use rate mask for this frame + * which is transmitted due to scanning or offchannel TX, not in normal + * operation on the interface. + * @IEEE80211_TX_CTRL_MLO_LINK: If not @IEEE80211_LINK_UNSPECIFIED, this + * frame should be transmitted on the specific link. This really is + * only relevant for frames that do not have data present, and is + * also not used for 802.3 format frames. Note that even if the frame + * is on a specific link, address translation might still apply if + * it's intended for an MLD. * * These flags are used in tx_info->control.flags. 
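Since IEEE80211_TX_CTRL_MLO_LINK (defined in the hunk right below) is a 4-bit field within control.flags rather than a single bit, it is meant to be accessed with the bitfield helpers; a short sketch, with hypothetical my_* wrappers:

#include <linux/bitfield.h>

static void my_set_tx_link(struct ieee80211_tx_info *info, u32 link_id)
{
	info->control.flags &= ~IEEE80211_TX_CTRL_MLO_LINK;
	info->control.flags |= u32_encode_bits(link_id,
					       IEEE80211_TX_CTRL_MLO_LINK);
}

static u32 my_get_tx_link(const struct ieee80211_tx_info *info)
{
	/* IEEE80211_LINK_UNSPECIFIED means no specific link was requested */
	return u32_get_bits(info->control.flags, IEEE80211_TX_CTRL_MLO_LINK);
}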
*/ @@ -868,6 +1038,26 @@ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP = BIT(5), IEEE80211_TX_INTCFL_NEED_TXPROCESSING = BIT(6), IEEE80211_TX_CTRL_NO_SEQNO = BIT(7), + IEEE80211_TX_CTRL_DONT_REORDER = BIT(8), + IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX = BIT(9), + IEEE80211_TX_CTRL_DONT_USE_RATE_MASK = BIT(10), + IEEE80211_TX_CTRL_MLO_LINK = 0xf0000000, +}; + +#define IEEE80211_LINK_UNSPECIFIED 0xf +#define IEEE80211_TX_CTRL_MLO_LINK_UNSPEC \ + u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, \ + IEEE80211_TX_CTRL_MLO_LINK) + +/** + * enum mac80211_tx_status_flags - flags to describe transmit status + * + * @IEEE80211_TX_STATUS_ACK_SIGNAL_VALID: ACK signal is valid + * + * These flags are used in tx_info->status.flags. + */ +enum mac80211_tx_status_flags { + IEEE80211_TX_STATUS_ACK_SIGNAL_VALID = BIT(0), }; /* @@ -977,6 +1167,11 @@ struct ieee80211_tx_rate { #define IEEE80211_MAX_TX_RETRY 31 +static inline bool ieee80211_rate_valid(struct ieee80211_tx_rate *rate) +{ + return rate->idx >= 0 && rate->count > 0; +} + static inline void ieee80211_rate_set_vht(struct ieee80211_tx_rate *rate, u8 mcs, u8 nss) { @@ -1006,9 +1201,13 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * (3) TX status information - driver tells mac80211 what happened * * @flags: transmit info flags, defined above - * @band: the band to transmit on (use for checking for races) + * @band: the band to transmit on (use e.g. for checking for races), + * not valid if the interface is an MLD since we won't know which + * link the frame will be transmitted on * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC - * @ack_frame_id: internal frame ID for TX status, used internally + * @status_data: internal data for TX status handling, assigned privately, + * see also &enum ieee80211_status_data for the internal documentation + * @status_data_idr: indicates status data is IDR allocated ID for ack frame * @tx_time_est: TX time estimate in units of 4us, used internally * @control: union part for control data * @control.rates: TX rates array to try @@ -1033,25 +1232,21 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * @status.antenna: (legacy, kept only for iwlegacy) * @status.tx_time: airtime consumed for transmission; note this is only * used for WMM AC, not for airtime fairness - * @status.is_valid_ack_signal: ACK signal is valid + * @status.flags: status flags, see &enum mac80211_tx_status_flags * @status.status_driver_data: driver use area * @ack: union part for pure ACK data * @ack.cookie: cookie for the ACK * @driver_data: array of driver_data pointers - * @ampdu_ack_len: number of acked aggregated frames. - * relevant only if IEEE80211_TX_STAT_AMPDU was set. - * @ampdu_len: number of aggregated frames. - * relevant only if IEEE80211_TX_STAT_AMPDU was set. 
- * @ack_signal: signal strength of the ACK frame */ struct ieee80211_tx_info { /* common information */ u32 flags; u32 band:3, - ack_frame_id:13, + status_data_idr:1, + status_data:13, hw_queue:4, tx_time_est:10; - /* 2 free bits */ + /* 1 free bit */ union { struct { @@ -1065,7 +1260,11 @@ struct ieee80211_tx_info { u8 use_cts_prot:1; u8 short_preamble:1; u8 skip_table:1; - /* 2 bytes free */ + + /* for injection only (bitmap) */ + u8 antennas:2; + + /* 14 bits free */ }; /* only needed before rate control */ unsigned long jiffies; @@ -1085,9 +1284,11 @@ struct ieee80211_tx_info { u8 ampdu_ack_len; u8 ampdu_len; u8 antenna; + u8 pad; u16 tx_time; - bool is_valid_ack_signal; - void *status_driver_data[19 / sizeof(void *)]; + u8 flags; + u8 pad2; + void *status_driver_data[16 / sizeof(void *)]; } status; struct { struct ieee80211_tx_rate driver_rates[ @@ -1118,20 +1319,45 @@ ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info) return info->tx_time_est << 2; } +/** + * struct ieee80211_rate_status - MRR stage for status path + * + * This struct is used in struct ieee80211_tx_status to provide drivers a + * dynamic way to report the used rates and power levels per packet. + * + * @rate_idx: The actually used rate. + * @try_count: How often the rate was tried. + * @tx_power_idx: An index into the ieee80211_hw->tx_power_levels list of the + * corresponding wifi hardware. The index shall point to the power level + * that was used when sending the packet. + */ +struct ieee80211_rate_status { + struct rate_info rate_idx; + u8 try_count; + u8 tx_power_idx; +}; + /** * struct ieee80211_tx_status - extended tx status info for rate control * * @sta: Station that the packet was transmitted for * @info: Basic tx status information * @skb: Packet skb (can be NULL if not provided by the driver) - * @rate: The TX rate that was used when sending the packet + * @rates: MRR stages that were used when sending the packet + * @n_rates: Number of MRR stages (count of instances for @rates) * @free_list: list where processed skbs are stored to be free'd by the driver + * @ack_hwtstamp: Hardware timestamp of the received ack in nanoseconds + * Only needed for Timing measurement and Fine timing measurement action + * frames. Only reported by devices that have timestamping enabled. */ struct ieee80211_tx_status { struct ieee80211_sta *sta; struct ieee80211_tx_info *info; struct sk_buff *skb; - struct rate_info *rate; + struct ieee80211_rate_status *rates; + ktime_t ack_hwtstamp; + u8 n_rates; + struct list_head *free_list; }; @@ -1175,9 +1401,9 @@ static inline struct ieee80211_rx_status *IEEE80211_SKB_RXCB(struct sk_buff *skb * in the TX status but the rate control information (it does clear * the count since you need to fill that in anyway). * - * NOTE: You can only use this function if you do NOT use - * info->driver_data! Use info->rate_driver_data - * instead if you need only the less space that allows. + * NOTE: While the rates array is kept intact, this will wipe all of the + * driver_data fields in info, so it's up to the driver to restore + * any fields it needs after calling this helper.
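Per the NOTE above, a driver keeping per-frame state in status_driver_data has to save and restore it around the wipe done by this helper (whose body follows). A minimal sketch, assuming the frame was acked; newer kernels name the final reporting call ieee80211_tx_status_skb() instead of ieee80211_tx_status():

static void my_report_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
{
	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
	void *my_state = info->status.status_driver_data[0];

	ieee80211_tx_info_clear_status(info);

	/* rate indices survive, counts and driver data were zeroed */
	info->status.rates[0].count = 1;
	info->status.status_driver_data[0] = my_state;
	info->flags |= IEEE80211_TX_STAT_ACK;

	ieee80211_tx_status(hw, skb);
}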
*/ static inline void ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) @@ -1192,12 +1418,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) /* clear the rate counts */ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) info->status.rates[i].count = 0; - - BUILD_BUG_ON( - offsetof(struct ieee80211_tx_info, status.ack_signal) != 20); - memset(&info->status.ampdu_ack_len, 0, - sizeof(struct ieee80211_tx_info) - - offsetof(struct ieee80211_tx_info, status.ampdu_ack_len)); + memset_after(&info->status, 0, rates); } @@ -1223,6 +1444,9 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * the frame. * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PCLP check failed on * the frame. + * @RX_FLAG_MACTIME: The timestamp passed in the RX status (@mactime + * field) is valid if this field is non-zero, and the position + * where the timestamp was sampled depends on the value. * @RX_FLAG_MACTIME_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the first symbol of the MPDU * was received. This is useful in monitor mode and for proper IBSS @@ -1232,6 +1456,11 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * (including FCS) was received. * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the SYNC preamble was received. + * @RX_FLAG_MACTIME_IS_RTAP_TS64: The timestamp passed in the RX status @mactime + * is only for use in the radiotap timestamp header, not otherwise a valid + * @mactime value. Note this is a separate flag so that we continue to see + * %RX_FLAG_MACTIME as unset. Also note that in this case the timestamp is + * reported to be 64 bits wide, not just 32. * @RX_FLAG_NO_SIGNAL_VAL: The signal strength value is not present. * Valid only for data frames (mainly A-MPDU) * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference @@ -1242,8 +1471,6 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU * @RX_FLAG_AMPDU_DELIM_CRC_ERROR: A delimiter CRC error has been detected * on this subframe - * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC - * is stored in the @ampdu_delimiter_crc field) * @RX_FLAG_MIC_STRIPPED: The mic was stripped of this packet. Decryption was * done by the hardware * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without @@ -1263,9 +1490,12 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * subframes share the same sequence number. Reported subframes can be * either regular MSDU or singly A-MSDUs. Subframes must not be * interleaved with other frames. - * @RX_FLAG_RADIOTAP_VENDOR_DATA: This frame contains vendor-specific - * radiotap data in the skb->data (before the frame) as described by - * the &struct ieee80211_vendor_radiotap. + * @RX_FLAG_RADIOTAP_TLV_AT_END: This frame contains radiotap TLVs in the + * skb->data (before the 802.11 header). + * If used, the SKB's mac_header pointer must be set to point + * to the 802.11 header after the TLVs, and any padding added after TLV + * data to align to 4 must be cleared by the driver putting the TLVs + * in the skb. * @RX_FLAG_ALLOW_SAME_PN: Allow the same PN as same packet before. * This is used for AMSDU subframes which can have the same PN as * the first subframe. @@ -1293,16 +1523,18 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * the "0-length PSDU" field included there. 
The value for it is * in &struct ieee80211_rx_status. Note that if this value isn't * known the frame shouldn't be reported. + * @RX_FLAG_8023: the frame has an 802.3 header (decap offload performed by + * hardware or driver) */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), RX_FLAG_DECRYPTED = BIT(1), - RX_FLAG_MACTIME_PLCP_START = BIT(2), + RX_FLAG_ONLY_MONITOR = BIT(2), RX_FLAG_MMIC_STRIPPED = BIT(3), RX_FLAG_IV_STRIPPED = BIT(4), RX_FLAG_FAILED_FCS_CRC = BIT(5), RX_FLAG_FAILED_PLCP_CRC = BIT(6), - RX_FLAG_MACTIME_START = BIT(7), + RX_FLAG_MACTIME_IS_RTAP_TS64 = BIT(7), RX_FLAG_NO_SIGNAL_VAL = BIT(8), RX_FLAG_AMPDU_DETAILS = BIT(9), RX_FLAG_PN_VALIDATED = BIT(10), @@ -1310,12 +1542,14 @@ enum mac80211_rx_flags { RX_FLAG_AMPDU_LAST_KNOWN = BIT(12), RX_FLAG_AMPDU_IS_LAST = BIT(13), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14), - RX_FLAG_AMPDU_DELIM_CRC_KNOWN = BIT(15), - RX_FLAG_MACTIME_END = BIT(16), - RX_FLAG_ONLY_MONITOR = BIT(17), + /* one free bit at 15 */ + RX_FLAG_MACTIME = BIT(16) | BIT(17), + RX_FLAG_MACTIME_PLCP_START = 1 << 16, + RX_FLAG_MACTIME_START = 2 << 16, + RX_FLAG_MACTIME_END = 3 << 16, RX_FLAG_SKIP_MONITOR = BIT(18), RX_FLAG_AMSDU_MORE = BIT(19), - RX_FLAG_RADIOTAP_VENDOR_DATA = BIT(20), + RX_FLAG_RADIOTAP_TLV_AT_END = BIT(20), RX_FLAG_MIC_STRIPPED = BIT(21), RX_FLAG_ALLOW_SAME_PN = BIT(22), RX_FLAG_ICV_STRIPPED = BIT(23), @@ -1325,6 +1559,7 @@ enum mac80211_rx_flags { RX_FLAG_RADIOTAP_HE_MU = BIT(27), RX_FLAG_RADIOTAP_LSIG = BIT(28), RX_FLAG_NO_PSDU = BIT(29), + RX_FLAG_8023 = BIT(30), }; /** @@ -1356,6 +1591,7 @@ enum mac80211_rx_encoding { RX_ENC_HT, RX_ENC_VHT, RX_ENC_HE, + RX_ENC_EHT, }; /** @@ -1369,6 +1605,9 @@ enum mac80211_rx_encoding { * (TSF) timer when the first data symbol (MPDU) arrived at the hardware. * @boottime_ns: CLOCK_BOOTTIME timestamp the frame was received at, this is * needed only for beacons and probe responses that update the scan cache. + * @ack_tx_hwtstamp: Hardware timestamp for the ack TX in nanoseconds. Only + * needed for Timing measurement and Fine timing measurement action frames. + * Only reported by devices that have timestamping enabled. * @device_timestamp: arbitrary timestamp for the device, mac80211 doesn't use * it but can store it and pass it back to the driver for synchronisation * @band: the active band when this frame was received @@ -1386,7 +1625,7 @@ enum mac80211_rx_encoding { * @antenna: antenna used * @rate_idx: index of data rate into band's supported rates or MCS index if * HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT) - * @nss: number of streams (VHT and HE only) + * @nss: number of streams (VHT, HE and EHT only) * @flag: %RX_FLAG_\* * @encoding: &enum mac80211_rx_encoding * @bw: &enum rate_info_bw @@ -1394,22 +1633,41 @@ enum mac80211_rx_encoding { * @he_ru: HE RU, from &enum nl80211_he_ru_alloc * @he_gi: HE GI, from &enum nl80211_he_gi * @he_dcm: HE DCM value + * @eht: EHT specific rate information + * @eht.ru: EHT RU, from &enum nl80211_eht_ru_alloc + * @eht.gi: EHT GI, from &enum nl80211_eht_gi * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU - * @ampdu_delimiter_crc: A-MPDU delimiter CRC * @zero_length_psdu_type: radiotap type of the 0-length PSDU + * @link_valid: if the link which is identified by @link_id is valid. This flag + * is set only when connection is MLO. + * @link_id: id of the link used to receive the packet. This is used along with + * @link_valid. 
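For MLO-capable hardware, the new link fields of the struct below are filled alongside the usual RX metadata; a sketch (using a negative link_id to stand for "unknown" is this example's own convention):

static void my_fill_rx_link(struct ieee80211_rx_status *status, int link_id)
{
	if (link_id >= 0) {
		/* the frame was received on a known MLO link */
		status->link_valid = 1;
		status->link_id = link_id;
	} else {
		status->link_valid = 0;
	}
}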
*/ struct ieee80211_rx_status { u64 mactime; - u64 boottime_ns; + union { + u64 boottime_ns; + ktime_t ack_tx_hwtstamp; + }; u32 device_timestamp; u32 ampdu_reference; u32 flag; u16 freq: 13, freq_offset: 1; u8 enc_flags; - u8 encoding:2, bw:3, he_ru:3; - u8 he_gi:2, he_dcm:1; + u8 encoding:3, bw:4; + union { + struct { + u8 he_ru:3; + u8 he_gi:2; + u8 he_dcm:1; + }; + struct { + u8 ru:4; + u8 gi:2; + } eht; + }; u8 rate_idx; u8 nss; u8 rx_flags; @@ -1418,8 +1676,8 @@ struct ieee80211_rx_status { s8 signal; u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; - u8 ampdu_delimiter_crc; u8 zero_length_psdu_type; + u8 link_valid:1, link_id:4; }; static inline u32 @@ -1430,39 +1688,6 @@ ieee80211_rx_status_to_khz(struct ieee80211_rx_status *rx_status) } /** - * struct ieee80211_vendor_radiotap - vendor radiotap data information - * @present: presence bitmap for this vendor namespace - * (this could be extended in the future if any vendor needs more - * bits, the radiotap spec does allow for that) - * @align: radiotap vendor namespace alignment. This defines the needed - * alignment for the @data field below, not for the vendor namespace - * description itself (which has a fixed 2-byte alignment) - * Must be a power of two, and be set to at least 1! - * @oui: radiotap vendor namespace OUI - * @subns: radiotap vendor sub namespace - * @len: radiotap vendor sub namespace skip length, if alignment is done - * then that's added to this, i.e. this is only the length of the - * @data field. - * @pad: number of bytes of padding after the @data, this exists so that - * the skb data alignment can be preserved even if the data has odd - * length - * @data: the actual vendor namespace data - * - * This struct, including the vendor data, goes into the skb->data before - * the 802.11 header. It's split up in mac80211 using the align/oui/subns - * data. - */ -struct ieee80211_vendor_radiotap { - u32 present; - u8 align; - u8 oui[3]; - u8 subns; - u8 pad; - u16 len; - u8 data[]; -} __packed; - -/** * enum ieee80211_conf_flags - configuration flags * * Flags to define PHY configuration options @@ -1602,8 +1827,9 @@ struct ieee80211_conf { * @chandef: the new channel to switch to * @count: the number of TBTT's until the channel switch event * @delay: maximum delay between the time the AP transmitted the last beacon in - * current channel and the expected time of the first beacon in the new - * channel, expressed in TU. + * current channel and the expected time of the first beacon in the new + * channel, expressed in TU. + * @link_id: the link ID of the link doing the channel switch, 0 for non-MLO */ struct ieee80211_channel_switch { u64 timestamp; @@ -1611,6 +1837,7 @@ struct ieee80211_channel_switch { bool block_tx; struct cfg80211_chan_def chandef; u8 count; + u8 link_id; u32 delay; }; @@ -1630,12 +1857,24 @@ struct ieee80211_channel_switch { * @IEEE80211_VIF_GET_NOA_UPDATE: request to handle NOA attributes * and send P2P_PS notification to the driver if NOA changed, even * this is not pure P2P vif. + * @IEEE80211_VIF_EML_ACTIVE: The driver indicates that EML operation is + * enabled for the interface. + * @IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW: Ignore wider bandwidth OFDMA + * operation on this interface and request a channel context without + * the AP definition. Use this e.g. because the device is able to + * handle OFDMA (downlink and trigger for uplink) on a per-AP basis. 
+ * @IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC: indicates that the AP sta should + * be removed only after setting the vif as unassociated, and not the + * opposite. Only relevant for STA vifs. */ enum ieee80211_vif_flags { IEEE80211_VIF_BEACON_FILTER = BIT(0), IEEE80211_VIF_SUPPORTS_CQM_RSSI = BIT(1), IEEE80211_VIF_SUPPORTS_UAPSD = BIT(2), IEEE80211_VIF_GET_NOA_UPDATE = BIT(3), + IEEE80211_VIF_EML_ACTIVE = BIT(4), + IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW = BIT(5), + IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC = BIT(6), }; @@ -1646,11 +1885,92 @@ enum ieee80211_vif_flags { * The driver supports sending frames passed as 802.3 frames by mac80211. * It must also support sending 802.11 packets for the same interface. * @IEEE80211_OFFLOAD_ENCAP_4ADDR: support 4-address mode encapsulation offload + * @IEEE80211_OFFLOAD_DECAP_ENABLED: rx encapsulation offload is enabled + * The driver supports passing received 802.11 frames as 802.3 frames to + * mac80211. */ enum ieee80211_offload_flags { IEEE80211_OFFLOAD_ENCAP_ENABLED = BIT(0), IEEE80211_OFFLOAD_ENCAP_4ADDR = BIT(1), + IEEE80211_OFFLOAD_DECAP_ENABLED = BIT(2), +}; + +/** + * struct ieee80211_vif_cfg - interface configuration + * @assoc: association status + * @ibss_joined: indicates whether this station is part of an IBSS or not + * @ibss_creator: indicates if a new IBSS network is being created + * @ps: power-save mode (STA only). This flag is NOT affected by + * offchannel/dynamic_ps operations. + * @aid: association ID number, valid only when @assoc is true + * @eml_cap: EML capabilities as described in P802.11be_D4.1 Figure 9-1001j. + * @eml_med_sync_delay: Medium Synchronization delay as described in + * P802.11be_D4.1 Figure 9-1001i. + * @mld_capa_op: MLD Capabilities and Operations per P802.11be_D4.1 + * Figure 9-1001k + * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The hardware + * may filter ARP queries targeted for other addresses than listed here. + * The driver must allow ARP queries targeted for all addresses listed here + * to pass through. An empty list implies no ARP queries need to pass. + * @arp_addr_cnt: Number of addresses currently on the list. Note that this + * may be larger than %IEEE80211_BSS_ARP_ADDR_LIST_LEN (the arp_addr_list + * array size), it's up to the driver what to do in that case. + * @ssid: The SSID of the current vif. Valid in AP and IBSS mode. + * @ssid_len: Length of SSID given in @ssid. + * @s1g: BSS is S1G BSS (affects Association Request format). + * @idle: This interface is idle. There's also a global idle flag in the + * hardware config which may be more appropriate depending on what + * your driver/device needs to do. + * @ap_addr: AP MLD address, or BSSID for non-MLO connections + * (station mode only) + */ +struct ieee80211_vif_cfg { + /* association related data */ + bool assoc, ibss_joined; + bool ibss_creator; + bool ps; + u16 aid; + u16 eml_cap; + u16 eml_med_sync_delay; + u16 mld_capa_op; + + __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; + int arp_addr_cnt; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + size_t ssid_len; + bool s1g; + bool idle; + u8 ap_addr[ETH_ALEN] __aligned(2); +}; + +#define IEEE80211_TTLM_NUM_TIDS 8 + +/** + * struct ieee80211_neg_ttlm - negotiated TID to link map info + * + * @downlink: bitmap of active links per TID for downlink, or 0 if mapping for + * this TID is not included. + * @uplink: bitmap of active links per TID for uplink, or 0 if mapping for this + * TID is not included. + * @valid: info is valid or not.
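A sketch of consulting the negotiated map defined just below, using the documented convention that a zero bitmap means the TID is not included and assuming that an invalid map implies the default mapping (all TIDs on all links); my_tid_mapped_dl() is hypothetical:

static bool my_tid_mapped_dl(const struct ieee80211_neg_ttlm *ttlm,
			     u8 tid, u8 link_id)
{
	if (!ttlm->valid)
		return true;	/* default mapping */

	return ttlm->downlink[tid] & BIT(link_id);
}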
+ */ +struct ieee80211_neg_ttlm { + u16 downlink[IEEE80211_TTLM_NUM_TIDS]; + u16 uplink[IEEE80211_TTLM_NUM_TIDS]; + bool valid; +}; + +/** + * enum ieee80211_neg_ttlm_res - return value for negotiated TTLM handling + * @NEG_TTLM_RES_ACCEPT: accept the request + * @NEG_TTLM_RES_REJECT: reject the request + * @NEG_TTLM_RES_SUGGEST_PREFERRED: reject and suggest a new mapping + */ +enum ieee80211_neg_ttlm_res { + NEG_TTLM_RES_ACCEPT, + NEG_TTLM_RES_REJECT, + NEG_TTLM_RES_SUGGEST_PREFERRED }; /** @@ -1660,31 +1980,44 @@ enum ieee80211_offload_flags { * use during the life of a virtual interface. * * @type: type of this virtual interface + * @cfg: vif configuration, see &struct ieee80211_vif_cfg * @bss_conf: BSS configuration for this interface, either our own * or the BSS we're associated to + * @link_conf: in case of MLD, the per-link BSS configuration, + * indexed by link ID + * @valid_links: bitmap of valid links, or 0 for non-MLO. + * @active_links: The bitmap of active links, or 0 for non-MLO. + * The driver shouldn't change this directly, but use the + * API calls meant for that purpose. + * @dormant_links: subset of the valid links that are disabled/suspended + * due to advertised or negotiated TTLM respectively. + * 0 for non-MLO. + * @suspended_links: subset of dormant_links representing links that are + * suspended due to negotiated TTLM, and could be activated in the + * future by tearing down the TTLM negotiation. + * 0 for non-MLO. + * @neg_ttlm: negotiated TID to link mapping info. + * see &struct ieee80211_neg_ttlm. * @addr: address of this interface + * @addr_valid: indicates if the address is actively used. Set to false for + * passive monitor interfaces, true in all other cases. * @p2p: indicates whether this AP or STA interface is a p2p * interface, i.e. a GO or p2p-sta respectively - * @csa_active: marks whether a channel switch is going on. Internally it is - * write-protected by sdata_lock and local->mtx so holding either is fine - * for read access. - * @mu_mimo_owner: indicates interface owns MU-MIMO capability + * @netdev_features: tx netdev features supported by the hardware for this + * vif. mac80211 initializes this to hw->netdev_features, and the driver + * can mask out specific tx features. mac80211 will handle software fixup + * for masked offloads (GSO, CSUM) * @driver_flags: flags/capabilities the driver has for this interface, * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed * at runtime, mac80211 will never touch this field - * @offloaad_flags: hardware offload capabilities/flags for this interface. + * @offload_flags: hardware offload capabilities/flags for this interface. * These are initialized by mac80211 before calling .add_interface, * .change_interface or .update_vif_offload and updated by the driver * within these ops, based on supported features or runtime change * restrictions. * @hw_queue: hardware queue for each AC * @cab_queue: content-after-beacon (DTIM beacon really) queue, AP mode only - * @chanctx_conf: The channel context this interface is assigned to, or %NULL - * when it is not assigned. This pointer is RCU-protected due to the TX - * path needing to access it; even though the netdev carrier will always - * be off when it is %NULL there can still be races and packets could be - * processed after it switches back to %NULL. * @debugfs_dir: debugfs dentry, can be used by drivers to create own per * interface debug files. 
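To make the negotiated-TTLM plumbing above concrete, here is a minimal sketch of the @can_neg_ttlm hook (added to &struct ieee80211_ops further below). The driver name and the acceptance policy are hypothetical; only the structures and constants defined above are assumed.

#include <net/mac80211.h>

/* Hypothetical driver policy: only accept negotiated TTLMs where the
 * uplink and downlink mappings agree for every TID.  A driver could
 * instead modify *ttlm with a preferred mapping and return
 * NEG_TTLM_RES_SUGGEST_PREFERRED.
 */
static enum ieee80211_neg_ttlm_res
drv_can_neg_ttlm(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
		 struct ieee80211_neg_ttlm *ttlm)
{
	int tid;

	for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++)
		if (ttlm->downlink[tid] != ttlm->uplink[tid])
			return NEG_TTLM_RES_REJECT;

	return NEG_TTLM_RES_ACCEPT;
}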
Note that it will be NULL for the virtual * monitor interface (if that is requested.) @@ -1694,27 +2027,27 @@ enum ieee80211_offload_flags { * for this interface. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*). - * @txq: the multicast data TX queue (if driver uses the TXQ abstraction) - * @txqs_stopped: per AC flag to indicate that intermediate TXQs are stopped, - * protected by fq->lock. + * @txq: the multicast data TX queue * @offload_flags: 802.3 -> 802.11 encapsulation offload flags, see * &enum ieee80211_offload_flags. */ struct ieee80211_vif { enum nl80211_iftype type; + struct ieee80211_vif_cfg cfg; struct ieee80211_bss_conf bss_conf; + struct ieee80211_bss_conf __rcu *link_conf[IEEE80211_MLD_MAX_NUM_LINKS]; + u16 valid_links, active_links, dormant_links, suspended_links; + struct ieee80211_neg_ttlm neg_ttlm; u8 addr[ETH_ALEN] __aligned(2); + bool addr_valid; bool p2p; - bool csa_active; - bool mu_mimo_owner; u8 cab_queue; u8 hw_queue[IEEE80211_NUM_ACS]; struct ieee80211_txq *txq; - struct ieee80211_chanctx_conf __rcu *chanctx_conf; - + netdev_features_t netdev_features; u32 driver_flags; u32 offload_flags; @@ -1725,12 +2058,52 @@ struct ieee80211_vif { bool probe_req_reg; bool rx_mcast_action_reg; - bool txqs_stopped[IEEE80211_NUM_ACS]; - /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; +/** + * ieee80211_vif_usable_links - Return the usable links for the vif + * @vif: the vif for which the usable links are requested + * Return: the usable link bitmap + */ +static inline u16 ieee80211_vif_usable_links(const struct ieee80211_vif *vif) +{ + return vif->valid_links & ~vif->dormant_links; +} + +/** + * ieee80211_vif_is_mld - Returns true iff the vif is an MLD one + * @vif: the vif + * Return: %true if the vif is an MLD, %false otherwise. + */ +static inline bool ieee80211_vif_is_mld(const struct ieee80211_vif *vif) +{ + /* valid_links != 0 indicates this vif is an MLD */ + return vif->valid_links != 0; +} + +/** + * ieee80211_vif_link_active - check if a given link is active + * @vif: the vif + * @link_id: the link ID to check + * Return: %true if the vif is an MLD and the link is active, or if + * the vif is not an MLD and the link ID is 0; %false otherwise. + */ +static inline bool ieee80211_vif_link_active(const struct ieee80211_vif *vif, + unsigned int link_id) +{ + if (!ieee80211_vif_is_mld(vif)) + return link_id == 0; + return vif->active_links & BIT(link_id); +} + +#define for_each_vif_active_link(vif, link, link_id) \ + for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++) \ + if ((!(vif)->active_links || \ + (vif)->active_links & BIT(link_id)) && \ + (link = link_conf_dereference_check(vif, link_id))) + static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) { #ifdef CONFIG_MAC80211_MESH @@ -1746,7 +2119,7 @@ static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that get a wdev. * - * Note that this function may return %NULL if the given wdev isn't + * Return: pointer to the wdev, or %NULL if the given wdev isn't * associated with a vif that the driver knows about (e.g. monitor * or AP_VLAN interfaces.) */ @@ -1758,13 +2131,25 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); /** * ieee80211_vif_to_wdev - return a wdev struct from a vif * * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that need to get the wdev for a vif.
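A short sketch of how the link helpers above compose; the driver function is hypothetical, and the caller is assumed to be in a context accepted by link_conf_dereference_check() (defined just below):

/* Walk every active link of a (possibly MLO) vif and log which of
 * them are also usable, i.e. valid and not dormant.
 */
static void drv_log_active_links(struct ieee80211_vif *vif)
{
	struct ieee80211_bss_conf *link_conf;
	unsigned int link_id;
	u16 usable = ieee80211_vif_usable_links(vif);

	for_each_vif_active_link(vif, link_conf, link_id)
		pr_debug("link %u active, usable=%d\n",
			 link_id, !!(usable & BIT(link_id)));
}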
+ * This can also be useful to get the netdev associated to a vif. * - * Note that this function may return %NULL if the given wdev isn't - * associated with a vif that the driver knows about (e.g. monitor - * or AP_VLAN interfaces.) + * Return: pointer to the wdev */ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); +static inline bool lockdep_vif_wiphy_mutex_held(struct ieee80211_vif *vif) +{ + return lockdep_is_held(&ieee80211_vif_to_wdev(vif)->wiphy->mtx); +} + +#define link_conf_dereference_protected(vif, link_id) \ + rcu_dereference_protected((vif)->link_conf[link_id], \ + lockdep_vif_wiphy_mutex_held(vif)) + +#define link_conf_dereference_check(vif, link_id) \ + rcu_dereference_check((vif)->link_conf[link_id], \ + lockdep_vif_wiphy_mutex_held(vif)) + /** * enum ieee80211_key_flags - key flags * @@ -1807,8 +2192,10 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); * @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key. * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation. * @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver - * for a AES_CMAC key to indicate that it requires sequence number - * generation only + * for a AES_CMAC or a AES_GMAC key to indicate that it requires sequence + * number generation only + * @IEEE80211_KEY_FLAG_SPP_AMSDU: SPP A-MSDUs can be used with this key + * (set by mac80211 from the sta->spp_amsdu flag) */ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_GENERATE_IV_MGMT = BIT(0), @@ -1822,6 +2209,7 @@ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_PUT_MIC_SPACE = BIT(8), IEEE80211_KEY_FLAG_NO_AUTO_TX = BIT(9), IEEE80211_KEY_FLAG_GENERATE_MMIE = BIT(10), + IEEE80211_KEY_FLAG_SPP_AMSDU = BIT(11), }; /** @@ -1837,7 +2225,7 @@ enum ieee80211_key_flags { * @tx_pn: PN used for TX keys, may be used by the driver as well if it * needs to do software PN assignment by itself (e.g. due to TSO) * @flags: key flags, see &enum ieee80211_key_flags. - * @keyidx: the key index (0-3) + * @keyidx: the key index (0-7) * @keylen: key material length * @key: key material. For ALG_TKIP the key is encoded as a 256-bit (32 byte) * data block: @@ -1846,6 +2234,7 @@ enum ieee80211_key_flags { * - Temporal Authenticator Rx MIC Key (64 bits) * @icv_len: The ICV length for this key type * @iv_len: The IV length for this key type + * @link_id: the link ID, 0 for non-MLO, or -1 for pairwise keys */ struct ieee80211_key_conf { atomic64_t tx_pn; @@ -1855,6 +2244,7 @@ struct ieee80211_key_conf { u8 hw_key_idx; s8 keyidx; u16 flags; + s8 link_id; u8 keylen; u8 key[]; }; @@ -1904,36 +2294,6 @@ struct ieee80211_key_seq { }; /** - * struct ieee80211_cipher_scheme - cipher scheme - * - * This structure contains a cipher scheme information defining - * the secure packet crypto handling. 
- * - * @cipher: a cipher suite selector - * @iftype: a cipher iftype bit mask indicating an allowed cipher usage - * @hdr_len: a length of a security header used the cipher - * @pn_len: a length of a packet number in the security header - * @pn_off: an offset of pn from the beginning of the security header - * @key_idx_off: an offset of key index byte in the security header - * @key_idx_mask: a bit mask of key_idx bits - * @key_idx_shift: a bit shift needed to get key_idx - * key_idx value calculation: - * (sec_header_base[key_idx_off] & key_idx_mask) >> key_idx_shift - * @mic_len: a mic length in bytes - */ -struct ieee80211_cipher_scheme { - u32 cipher; - u16 iftype; - u8 hdr_len; - u8 pn_len; - u8 pn_off; - u8 key_idx_off; - u8 key_idx_mask; - u8 key_idx_shift; - u8 mic_len; -}; - -/** * enum set_key_cmd - key command * * Used with the set_key() callback in &struct ieee80211_ops, this @@ -1972,6 +2332,7 @@ enum ieee80211_sta_state { * @IEEE80211_STA_RX_BW_80: station can receive up to 80 MHz * @IEEE80211_STA_RX_BW_160: station can receive up to 160 MHz * (including 80+80 MHz) + * @IEEE80211_STA_RX_BW_320: station can receive up to 320 MHz * * Implementation note: 20 must be zero to be initialized * correctly, the values must be sorted. @@ -1981,8 +2342,11 @@ enum ieee80211_sta_rx_bandwidth { IEEE80211_STA_RX_BW_40, IEEE80211_STA_RX_BW_80, IEEE80211_STA_RX_BW_160, + IEEE80211_STA_RX_BW_320, }; +#define IEEE80211_STA_RX_BW_MAX IEEE80211_STA_RX_BW_320 + /** * struct ieee80211_sta_rates - station rate selection table * @@ -2020,6 +2384,81 @@ struct ieee80211_sta_txpwr { }; /** + * struct ieee80211_sta_aggregates - info that is aggregated from active links + * + * Used for any per-link data that needs to be aggregated and updated in the + * main &struct ieee80211_sta when updated or the active links change. + * + * @max_amsdu_len: indicates the maximal length of an A-MSDU in bytes. + * This field is always valid for packets with a VHT preamble. + * For packets with a HT preamble, additional limits apply: + * + * * If the skb is transmitted as part of a BA agreement, the + * A-MSDU maximal size is min(max_amsdu_len, 4065) bytes. + * * If the skb is not part of a BA agreement, the A-MSDU maximal + * size is min(max_amsdu_len, 7935) bytes. + * + * Both additional HT limits must be enforced by the low level + * driver. This is defined by the spec (IEEE 802.11-2012 section + * 8.3.2.2 NOTE 2). + * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control. + * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID + */ +struct ieee80211_sta_aggregates { + u16 max_amsdu_len; + + u16 max_rc_amsdu_len; + u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS]; +}; + +/** + * struct ieee80211_link_sta - station Link specific info + * All link specific info for a STA link for a non MLD STA(single) + * or a MLD STA(multiple entries) are stored here. + * + * @sta: reference to owning STA + * @addr: MAC address of the Link STA. For non-MLO STA this is same as the addr + * in ieee80211_sta. 
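The HT limits quoted above lend themselves to a one-line computation. A sketch, with an invented helper name and BA-session flag; 4065 and 7935 are the values from the documentation above:

/* Effective A-MSDU limit a driver would apply for an HT (non-VHT)
 * transmission, per struct ieee80211_sta_aggregates.
 */
static u16 drv_ht_amsdu_limit(const struct ieee80211_sta_aggregates *agg,
			      bool in_ba_session)
{
	u16 ht_limit = in_ba_session ? 4065 : 7935;

	return min(agg->max_amsdu_len, ht_limit);
}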
For MLO Link STA this addr can be same or different + * from addr in ieee80211_sta (representing MLD STA addr) + * @link_id: the link ID for this link STA (0 for deflink) + * @smps_mode: current SMPS mode (off, static or dynamic) + * @supp_rates: Bitmap of supported rates + * @ht_cap: HT capabilities of this STA; restricted to our own capabilities + * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities + * @he_cap: HE capabilities of this STA + * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities + * @eht_cap: EHT capabilities of this STA + * @agg: per-link data for multi-link aggregation + * @bandwidth: current bandwidth the station can receive with + * @rx_nss: in HT/VHT, the maximum number of spatial streams the + * station can receive at the moment, changed by operating mode + * notifications and capabilities. The value is only valid after + * the station moves to associated state. + * @txpwr: the station tx power configuration + * + */ +struct ieee80211_link_sta { + struct ieee80211_sta *sta; + + u8 addr[ETH_ALEN]; + u8 link_id; + enum ieee80211_smps_mode smps_mode; + + u32 supp_rates[NUM_NL80211_BANDS]; + struct ieee80211_sta_ht_cap ht_cap; + struct ieee80211_sta_vht_cap vht_cap; + struct ieee80211_sta_he_cap he_cap; + struct ieee80211_he_6ghz_capa he_6ghz_capa; + struct ieee80211_sta_eht_cap eht_cap; + + struct ieee80211_sta_aggregates agg; + + u8 rx_nss; + enum ieee80211_sta_rx_bandwidth bandwidth; + struct ieee80211_sta_txpwr txpwr; +}; + +/** * struct ieee80211_sta - station table entry * * A station table entry represents a station we are possibly @@ -2028,14 +2467,11 @@ struct ieee80211_sta_txpwr { * either be protected by rcu_read_lock() explicitly or implicitly, * or you must take good care to not use such a pointer after a * call to your sta_remove callback that removed it. + * This also represents the MLD STA in case of MLO association + * and holds pointers to various link STA's * * @addr: MAC address * @aid: AID we assigned to the station if we're an AP - * @supp_rates: Bitmap of supported rates (per band) - * @ht_cap: HT capabilities of this STA; restricted to our own capabilities - * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities - * @he_cap: HE capabilities of this STA - * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. @@ -2047,75 +2483,89 @@ struct ieee80211_sta_txpwr { * if wme is supported. The bits order is like in * IEEE80211_WMM_IE_STA_QOSINFO_AC_*. * @max_sp: max Service Period. Only valid if wme is supported. - * @bandwidth: current bandwidth the station can receive with - * @rx_nss: in HT/VHT, the maximum number of spatial streams the - * station can receive at the moment, changed by operating mode - * notifications and capabilities. The value is only valid after - * the station moves to associated state. - * @smps_mode: current SMPS mode (off, static or dynamic) * @rates: rate control selection table * @tdls: indicates whether the STA is a TDLS peer * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only * valid if the STA is a TDLS peer in the first place. * @mfp: indicates whether the STA uses management frame protection or not. + * @mlo: indicates whether the STA is MLO station. * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single * A-MSDU. Taken from the Extended Capabilities element. 
0 means * unlimited. + * @eml_cap: EML capabilities of this MLO station + * @cur: currently valid data as aggregated from the active links + * For non MLO STA it will point to the deflink data. For MLO STA + * ieee80211_sta_recalc_aggregates() must be called to update it. * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not. - * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control. - * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID - * @txpwr: the station tx power configuration - * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that - * the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames + * @txq: per-TID data TX queues; note that the last entry (%IEEE80211_NUM_TIDS) + * is used for non-data frames + * @deflink: This holds the default link STA information, for non MLO STA all link + * specific STA information is accessed through @deflink or through + * link[0] which points to address of @deflink. For MLO Link STA + * the first added link STA will point to deflink. + * @link: reference to Link Sta entries. For Non MLO STA, except 1st link, + * i.e link[0] all links would be assigned to NULL by default and + * would access link information via @deflink or link[0]. For MLO + * STA, first link STA being added will point its link pointer to + * @deflink address and remaining would be allocated and the address + * would be assigned to link[link_id] where link_id is the id assigned + * by the AP. + * @valid_links: bitmap of valid links, or 0 for non-MLO + * @spp_amsdu: indicates whether the STA uses SPP A-MSDU or not. */ struct ieee80211_sta { - u32 supp_rates[NUM_NL80211_BANDS]; - u8 addr[ETH_ALEN]; + u8 addr[ETH_ALEN] __aligned(2); u16 aid; - struct ieee80211_sta_ht_cap ht_cap; - struct ieee80211_sta_vht_cap vht_cap; - struct ieee80211_sta_he_cap he_cap; - struct ieee80211_he_6ghz_capa he_6ghz_capa; u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; u8 max_sp; - u8 rx_nss; - enum ieee80211_sta_rx_bandwidth bandwidth; - enum ieee80211_smps_mode smps_mode; struct ieee80211_sta_rates __rcu *rates; bool tdls; bool tdls_initiator; bool mfp; + bool mlo; + bool spp_amsdu; u8 max_amsdu_subframes; + u16 eml_cap; + + struct ieee80211_sta_aggregates *cur; - /** - * @max_amsdu_len: - * indicates the maximal length of an A-MSDU in bytes. - * This field is always valid for packets with a VHT preamble. - * For packets with a HT preamble, additional limits apply: - * - * * If the skb is transmitted as part of a BA agreement, the - * A-MSDU maximal size is min(max_amsdu_len, 4065) bytes. - * * If the skb is not part of a BA agreement, the A-MSDU maximal - * size is min(max_amsdu_len, 7935) bytes. - * - * Both additional HT limits must be enforced by the low level - * driver. This is defined by the spec (IEEE 802.11-2012 section - * 8.3.2.2 NOTE 2). 
- */ - u16 max_amsdu_len; bool support_p2p_ps; - u16 max_rc_amsdu_len; - u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS]; - struct ieee80211_sta_txpwr txpwr; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; + u16 valid_links; + struct ieee80211_link_sta deflink; + struct ieee80211_link_sta __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; + /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; +#ifdef CONFIG_LOCKDEP +bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta); +#else +static inline bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) +{ + return true; +} +#endif + +#define link_sta_dereference_protected(sta, link_id) \ + rcu_dereference_protected((sta)->link[link_id], \ + lockdep_sta_mutex_held(sta)) + +#define link_sta_dereference_check(sta, link_id) \ + rcu_dereference_check((sta)->link[link_id], \ + lockdep_sta_mutex_held(sta)) + +#define for_each_sta_active_link(vif, sta, link_sta, link_id) \ + for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) \ + if ((!(vif)->active_links || \ + (vif)->active_links & BIT(link_id)) && \ + ((link_sta) = link_sta_dereference_check(sta, link_id))) + /** * enum sta_notify_cmd - sta notify command * @@ -2260,6 +2710,11 @@ struct ieee80211_txq { * a virtual monitor interface when monitor interfaces are the only * active interfaces. * + * @IEEE80211_HW_NO_VIRTUAL_MONITOR: The driver would like to be informed + * of any monitor interface, as well as their configured channel. + * This is useful for supporting multiple monitor interfaces on different + * channels. + * * @IEEE80211_HW_NO_AUTO_VIF: The driver would like for no wlanX to * be created. It is expected user-space will create vifs as * desired (and thus have them named as desired). @@ -2347,14 +2802,6 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on * TDLS links. * - * @IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP: The driver requires the - * mgd_prepare_tx() callback to be called before transmission of a - * deauthentication frame in case the association was completed but no - * beacon was heard. This is required in multi-channel scenarios, where the - * virtual interface might not be given air time for the transmission of - * the frame, as it is not synced with the AP/P2P GO yet, and thus the - * deauthentication frame might not be transmitted. - * * @IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP: The driver (or firmware) doesn't * support QoS NDP for AP probing - that's most likely a driver bug. * @@ -2386,6 +2833,36 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD: Hardware supports tx encapsulation * offload * + * @IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD: Hardware supports rx decapsulation + * offload + * + * @IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP: Hardware supports concurrent rx + * decapsulation offload and passing raw 802.11 frames for monitor iface. + * If this is supported, the driver must pass both 802.3 frames for real + * usage and 802.11 frames with %RX_FLAG_ONLY_MONITOR set for monitor to + * the stack. + * + * @IEEE80211_HW_DETECTS_COLOR_COLLISION: HW/driver has support for BSS color + * collision detection and doesn't need it in software. + * + * @IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX: Hardware/driver handles transmitting + * multicast frames on all links, mac80211 should not do that. 
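A sketch of the link-STA iterator above in use; the driver body is hypothetical and the caller is assumed to satisfy lockdep_sta_mutex_held():

/* Walk all active link STAs of a (possibly MLO) station and log the
 * per-link address and spatial-stream count.
 */
static void drv_log_link_stas(struct ieee80211_vif *vif,
			      struct ieee80211_sta *sta)
{
	struct ieee80211_link_sta *link_sta;
	unsigned int link_id;

	for_each_sta_active_link(vif, sta, link_sta, link_id)
		pr_debug("link sta %u: addr %pM, rx_nss %u\n",
			 link_id, link_sta->addr, link_sta->rx_nss);
}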
+ * + * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT + * and connecting with a lower bandwidth instead + * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in + * EHT in 5 GHz and connecting with a lower bandwidth instead + * + * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so + * no need to stop queues. This really should be set by a driver that + * implements MLO, so operation can continue on other links when one + * link is switching. + * + * @IEEE80211_HW_STRICT: strictly enforce certain things mandated by the spec + * but otherwise ignored/worked around for interoperability. This is a + * HW flag so drivers can opt in according to their own control, e.g. in + * testing. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2402,6 +2879,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_DYNAMIC_PS, IEEE80211_HW_MFP_CAPABLE, IEEE80211_HW_WANT_MONITOR_VIF, + IEEE80211_HW_NO_VIRTUAL_MONITOR, IEEE80211_HW_NO_AUTO_VIF, IEEE80211_HW_SW_CRYPTO_CONTROL, IEEE80211_HW_SUPPORT_FAST_XMIT, @@ -2429,7 +2907,6 @@ enum ieee80211_hw_flags { IEEE80211_HW_REPORTS_LOW_ACK, IEEE80211_HW_SUPPORTS_TX_FRAG, IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA, - IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP, IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP, IEEE80211_HW_BUFF_MMPDU_TXQ, IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW, @@ -2439,6 +2916,14 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID, IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT, IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD, + IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD, + IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP, + IEEE80211_HW_DETECTS_COLOR_COLLISION, + IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, + IEEE80211_HW_DISALLOW_PUNCTURING, + IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ, + IEEE80211_HW_HANDLES_QUIET_CSA, + IEEE80211_HW_STRICT, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS @@ -2527,8 +3012,6 @@ enum ieee80211_hw_flags { * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values. * - * @radiotap_he: HE radiotap validity flags - * * @radiotap_timestamp: Information for the radiotap timestamp field; if the * @units_pos member is set to a non-negative value then the timestamp * field will be added and populated from the &struct ieee80211_rx_status @@ -2554,9 +3037,6 @@ enum ieee80211_hw_flags { * deliver to a WMM STA during any Service Period triggered by the WMM STA. * Use IEEE80211_WMM_IE_STA_QOSINFO_SP_* for correct values. * - * @n_cipher_schemes: a size of an array of cipher schemes definitions. - * @cipher_schemes: a pointer to an array of cipher scheme definitions - * supported by HW. * @max_nan_de_entries: maximum number of NAN DE functions supported by the * device. * @@ -2568,6 +3048,12 @@ enum ieee80211_hw_flags { * refilling deficit of each TXQ. * * @max_mtu: the max mtu could be set. + * + * @tx_power_levels: a list of power levels supported by the wifi hardware. + * The power levels can be specified either as integer or fractions. + * The power level at idx 0 shall be the maximum positive power level. + * + * @max_txpwr_levels_idx: the maximum valid idx of 'tx_power_levels' list. 
*/ struct ieee80211_hw { struct ieee80211_conf conf; @@ -2600,12 +3086,12 @@ struct ieee80211_hw { netdev_features_t netdev_features; u8 uapsd_queues; u8 uapsd_max_sp_len; - u8 n_cipher_schemes; - const struct ieee80211_cipher_scheme *cipher_schemes; u8 max_nan_de_entries; u8 tx_sk_pacing_shift; u8 weight_multiplier; u32 max_mtu; + const s8 *tx_power_levels; + u8 max_txpwr_levels_idx; }; static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw, @@ -2735,6 +3221,19 @@ ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw, void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); /** + * ieee80211_purge_tx_queue - purge TX skb queue + * @hw: the hardware + * @skbs: the skbs + * + * Free a set of transmit skbs. Use this function when device is going to stop + * but some transmit skbs without TX status are still queued. + * This function does not take the list lock and the caller must hold the + * relevant locks to use it. + */ +void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, + struct sk_buff_head *skbs); + +/** * DOC: Hardware crypto acceleration * * mac80211 is capable of taking advantage of many hardware @@ -2755,7 +3254,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * The set_key() call for the %SET_KEY command should return 0 if * the key is now in use, -%EOPNOTSUPP or -%ENOSPC if it couldn't be * added; if you return 0 then hw_key_idx must be assigned to the - * hardware key index, you are free to use the full u8 range. + * hardware key index. You are free to use the full u8 range. * * Note that in the case that the @IEEE80211_HW_SW_CRYPTO_CONTROL flag is * set, mac80211 will not automatically fall back to software crypto if @@ -2765,7 +3264,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * When the cmd is %DISABLE_KEY then it must succeed. * * Note that it is permissible to not decrypt a frame even if a key - * for it has been uploaded to hardware, the stack will not make any + * for it has been uploaded to hardware. The stack will not make any * decision based on whether a key has been uploaded or not but rather * based on the receive flags. * @@ -2780,7 +3279,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * The update_tkip_key() call updates the driver with the new phase 1 key. * This happens every time the iv16 wraps around (every 65536 packets). The * set_key() call will happen only once for each key (unless the AP did - * rekeying), it will not include a valid phase 1 key. The valid phase 1 key is + * rekeying); it will not include a valid phase 1 key. The valid phase 1 key is * provided by update_tkip_key only. The trigger that makes mac80211 call this * handler is software decryption with wrap around of iv16. 
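To illustrate the new hw-level fields and the newly documented purge helper together, a probe/stop sketch follows; the driver name, private struct, locking scheme, and power-level table are all invented, and stop() uses the new bool suspend argument from the ops change below:

struct drv_priv {
	spinlock_t tx_lock;
	struct sk_buff_head pending_tx;
};

/* Made-up table; per the documentation above, idx 0 must be the
 * maximum positive power level.
 */
static const s8 drv_tx_power_levels[] = { 20, 18, 15, 12, 9 };

static void drv_setup_hw(struct ieee80211_hw *hw)
{
	/* flag names from the enum above */
	ieee80211_hw_set(hw, SUPPORTS_RX_DECAP_OFFLOAD);
	ieee80211_hw_set(hw, DETECTS_COLOR_COLLISION);

	hw->tx_power_levels = drv_tx_power_levels;
	hw->max_txpwr_levels_idx = ARRAY_SIZE(drv_tx_power_levels) - 1;
}

static void drv_stop(struct ieee80211_hw *hw, bool suspend)
{
	struct drv_priv *priv = hw->priv;

	/* ieee80211_purge_tx_queue() does not take the list lock,
	 * so hold our own lock around it
	 */
	spin_lock_bh(&priv->tx_lock);
	ieee80211_purge_tx_queue(hw, &priv->pending_tx);
	spin_unlock_bh(&priv->tx_lock);
}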
* @@ -2791,13 +3290,13 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * Mac80211 drivers should set the @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 flag * when they are able to replace in-use PTK keys according to the following * requirements: - * 1) They do not hand over frames decrypted with the old key to - mac80211 once the call to set_key() with command %DISABLE_KEY has been - completed when also setting @IEEE80211_KEY_FLAG_GENERATE_IV for any key, + * 1) They do not hand over frames decrypted with the old key to mac80211 + once the call to set_key() with command %DISABLE_KEY has been completed, 2) either drop or continue to use the old key for any outgoing frames queued at the time of the key deletion (including re-transmits), 3) never send out a frame queued prior to the set_key() %SET_KEY command - encrypted with the new key and + encrypted with the new key when also needing + @IEEE80211_KEY_FLAG_GENERATE_IV and 4) never send out a frame unencrypted when it should be encrypted. Mac80211 will not queue any new frames for a deleted key to the driver. */ @@ -2807,7 +3306,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * * mac80211 has support for various powersave implementations. * - * First, it can support hardware that handles all powersaving by itself, + * First, it can support hardware that handles all powersaving by itself; * such hardware should simply set the %IEEE80211_HW_SUPPORTS_PS hardware * flag. In that case, it will be told about the desired powersave mode * with the %IEEE80211_CONF_PS flag depending on the association status. @@ -2832,12 +3331,12 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * %IEEE80211_HW_PS_NULLFUNC_STACK flags. The hardware is of course still * required to pass up beacons. The hardware is still required to handle * waking up for multicast traffic; if it cannot the driver must handle that - * as best as it can, mac80211 is too slow to do that. + * as best as it can; mac80211 is too slow to do that. * * Dynamic powersave is an extension to normal powersave in which the * hardware stays awake for a user-specified period of time after sending a * frame so that reply frames need not be buffered and therefore delayed to - * the next wakeup. It's compromise of getting good enough latency when + * the next wakeup. It's a compromise of getting good enough latency when * there's data traffic and still saving significantly power in idle * periods. * @@ -2902,7 +3401,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * Note that change, for the sake of simplification, also includes information * elements appearing or disappearing from the beacon. * - * Some hardware supports an "ignore list" instead, just make sure nothing + * Some hardware supports an "ignore list" instead. Just make sure nothing * that was requested is on the ignore list, and include commonly changing * information element IDs in the ignore list, for example 11 (BSS load) and * the various vendor-assigned IEs with unknown contents (128, 129, 133-136, @@ -2913,7 +3412,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * In addition to these capabilities, hardware should support notifying the * host of changes in the beacon RSSI. This is relevant to implement roaming * when no traffic is flowing (when traffic is flowing we see the RSSI of - * the received data packets). This can consist in notifying the host when + * the received data packets). 
This can consist of notifying the host when * the RSSI changes significantly or when it drops below or rises above * configurable thresholds. In the future these thresholds will also be * configured by mac80211 (which gets them from userspace) to implement @@ -3060,8 +3559,8 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * period starts for any reason, @release_buffered_frames is called * with the number of frames to be released and which TIDs they are * to come from. In this case, the driver is responsible for setting - * the EOSP (for uAPSD) and MORE_DATA bits in the released frames, - * to help the @more_data parameter is passed to tell the driver if + * the EOSP (for uAPSD) and MORE_DATA bits in the released frames. + * To help the @more_data parameter is passed to tell the driver if * there is more data on other TIDs -- the TIDs to release frames * from are ignored since mac80211 doesn't know how many frames the * buffers for those TIDs contain. @@ -3110,7 +3609,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * Additionally, the driver has to then use these HW queue IDs for the queue * management functions (ieee80211_stop_queue() et al.) * - * The driver is free to set up the queue mappings as needed, multiple virtual + * The driver is free to set up the queue mappings as needed; multiple virtual * interfaces may map to the same hardware queues if needed. The setup has to * happen during add_interface or change_interface callbacks. For example, a * driver supporting station+station and station+AP modes might decide to have @@ -3311,7 +3810,7 @@ enum ieee80211_roc_type { }; /** - * enum ieee80211_reconfig_complete_type - reconfig type + * enum ieee80211_reconfig_type - reconfig type * * This enum is used by the reconfig_complete() callback to indicate what * reconfiguration type was completed. @@ -3327,6 +3826,26 @@ enum ieee80211_reconfig_type { }; /** + * struct ieee80211_prep_tx_info - prepare TX information + * @duration: if non-zero, hint about the required duration, + * only used with the mgd_prepare_tx() method. + * @subtype: frame subtype (auth, (re)assoc, deauth, disassoc) + * @success: whether the frame exchange was successful, only + * used with the mgd_complete_tx() method, and then only + * valid for auth and (re)assoc. + * @was_assoc: set if this call is due to deauth/disassoc + * while just having been associated + * @link_id: the link id on which the frame will be TX'ed. + * 0 for a non-MLO connection. + */ +struct ieee80211_prep_tx_info { + u16 duration; + u16 subtype; + u8 success:1, was_assoc:1; + int link_id; +}; + +/** * struct ieee80211_ops - callbacks from mac80211 to the driver * * This structure contains various callbacks that the driver may @@ -3416,6 +3935,22 @@ enum ieee80211_reconfig_type { * for association indication. The @changed parameter indicates which * of the bss parameters has changed when a call is made. The callback * can sleep. + * Note: this callback is called if @vif_cfg_changed or @link_info_changed + * are not implemented. + * + * @vif_cfg_changed: Handler for configuration requests related to interface + * (MLD) parameters from &struct ieee80211_vif_cfg that vary during the + * lifetime of the interface (e.g. assoc status, IP addresses, etc.) + * The @changed parameter indicates which value changed. + * The callback can sleep. 
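A minimal sketch of a driver opting into the split callbacks, reading MLD-level state from &struct ieee80211_vif_cfg and per-link state from &struct ieee80211_bss_conf (@link_info_changed is documented just below); the driver bodies are hypothetical:

static void drv_vif_cfg_changed(struct ieee80211_hw *hw,
				struct ieee80211_vif *vif,
				u64 changed)
{
	if (changed & BSS_CHANGED_ASSOC) {
		if (vif->cfg.assoc)
			pr_debug("assoc, aid %u, AP MLD %pM\n",
				 vif->cfg.aid, vif->cfg.ap_addr);
		else
			pr_debug("disassoc\n");
	}
}

static void drv_link_info_changed(struct ieee80211_hw *hw,
				  struct ieee80211_vif *vif,
				  struct ieee80211_bss_conf *info,
				  u64 changed)
{
	if (changed & BSS_CHANGED_BSSID)
		pr_debug("link BSSID now %pM\n", info->bssid);
}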
+ * + * @link_info_changed: Handler for configuration requests related to link + * parameters from &struct ieee80211_bss_conf that are related to an + * individual link. e.g. legacy/HT/VHT/... rate information. + * The @changed parameter indicates which value changed, and the @link_id + * parameter indicates the link ID. Note that the @link_id will be 0 for + * non-MLO connections. + * The callback can sleep. * * @prepare_multicast: Prepare for multicast filter configuration. * This callback is optional, and its return value is passed @@ -3531,11 +4066,28 @@ enum ieee80211_reconfig_type { * the station. See @sta_pre_rcu_remove if needed. * This callback can sleep. * + * @vif_add_debugfs: Drivers can use this callback to add a debugfs vif + * directory with its files. This callback should be within a + * CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep. + * + * @link_add_debugfs: Drivers can use this callback to add debugfs files + * when a link is added to a mac80211 vif. This callback should be within + * a CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep. + * For non-MLO the callback will be called once for the default bss_conf + * with the vif's directory rather than a separate subdirectory. + * * @sta_add_debugfs: Drivers can use this callback to add debugfs files * when a station is added to mac80211's station list. This callback * should be within a CONFIG_MAC80211_DEBUGFS conditional. This * callback can sleep. * + * @link_sta_add_debugfs: Drivers can use this callback to add debugfs files + * when a link is added to a mac80211 station. This callback + * should be within a CONFIG_MAC80211_DEBUGFS conditional. This + * callback can sleep. + * For non-MLO the callback will be called once for the deflink with the + * station's directory rather than a separate subdirectory. + * * @sta_notify: Notifies low level driver about power state transition of an * associated station, AP, IBSS/WDS/mesh peer etc. For a VIF operating * in AP mode, this callback will not be called when the flag @@ -3562,8 +4114,8 @@ enum ieee80211_reconfig_type { * in @sta_state. * The callback can sleep. * - * @sta_rc_update: Notifies the driver of changes to the bitrates that can be - * used to transmit to the station. The changes are advertised with bits + * @link_sta_rc_update: Notifies the driver of changes to the bitrates that can + * be used to transmit to the station. The changes are advertised with bits * from &enum ieee80211_rate_control_changed and the values are reflected * in the station data. This callback should only be used when the driver * uses hardware rate control (%IEEE80211_HW_HAS_RATE_CONTROL) since @@ -3639,6 +4191,10 @@ enum ieee80211_reconfig_type { * Note that vif can be NULL. * The callback can sleep. * + * @flush_sta: Flush or drop all pending frames from the hardware queue(s) for + * the given station, as it's about to be removed. + * The callback can sleep. + * * @channel_switch: Drivers that need (or want) to offload the channel * switch operation for CSAs received from the AP may implement this * callback. They must then call ieee80211_chswitch_done() to indicate @@ -3718,11 +4274,15 @@ enum ieee80211_reconfig_type { * This callback must be atomic. * * @get_et_sset_count: Ethtool API to get string-set count. + * Note that the wiphy mutex is not held for this callback since it's + * expected to return a static value. * * @get_et_stats: Ethtool API to get a set of u64 stats. 
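A sketch of the prepare/complete pairing with &struct ieee80211_prep_tx_info; drv_request_session_protection() stands in for whatever on-channel-time mechanism the hardware actually provides:

static void drv_request_session_protection(struct ieee80211_hw *hw,
					   struct ieee80211_vif *vif,
					   int link_id, u16 duration);

static void drv_mgd_prepare_tx(struct ieee80211_hw *hw,
			       struct ieee80211_vif *vif,
			       struct ieee80211_prep_tx_info *info)
{
	/* duration hint is optional; 100 TU is an arbitrary fallback */
	u16 duration = info->duration ?: 100;

	drv_request_session_protection(hw, vif, info->link_id, duration);
}

static void drv_mgd_complete_tx(struct ieee80211_hw *hw,
				struct ieee80211_vif *vif,
				struct ieee80211_prep_tx_info *info)
{
	/* success is only valid for auth and (re)assoc, per the doc */
	if (info->success)
		pr_debug("auth/assoc exchange completed\n");
}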
* * @get_et_strings: Ethtool API to get a set of strings to describe stats * and perhaps other supported types of ethtool data-sets. + * Note that the wiphy mutex is not held for this callback since it's + * expected to return a static value. * * @mgd_prepare_tx: Prepare for transmitting a management frame for association * before associated. In multi-channel scenarios, a virtual interface is @@ -3730,17 +4290,18 @@ enum ieee80211_reconfig_type { * yet it need not necessarily be given airtime, in particular since any * transmission to a P2P GO needs to be synchronized against the GO's * powersave state. mac80211 will call this function before transmitting a - * management frame prior to having successfully associated to allow the - * driver to give it channel time for the transmission, to get a response - * and to be able to synchronize with the GO. - * For drivers that set %IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP, mac80211 - * would also call this function before transmitting a deauthentication - * frame in case that no beacon was heard from the AP/P2P GO. + * management frame prior to transmitting that frame to allow the driver + * to give it channel time for the transmission, to get a response and be + * able to synchronize with the GO. * The callback will be called before each transmission and upon return * mac80211 will transmit the frame right away. - * If duration is greater than zero, mac80211 hints to the driver the - * duration for which the operation is requested. + * Additional information is passed in the &struct ieee80211_prep_tx_info + * data. If duration there is greater than zero, mac80211 hints to the + * driver the duration for which the operation is requested. * The callback is optional and can (should!) sleep. + * @mgd_complete_tx: Notify the driver that the response frame for a previously + * transmitted frame announced with @mgd_prepare_tx was received, the data + * is filled similarly to @mgd_prepare_tx though the duration is not used. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending * a TDLS discovery-request, we expect a reply to arrive on the AP's @@ -3807,7 +4368,7 @@ enum ieee80211_reconfig_type { * after a channel switch procedure is completed, allowing the * driver to go back to a normal configuration. * @abort_channel_switch: This is an optional callback that is called - * when channel switch procedure was completed, allowing the + * when channel switch procedure was aborted, allowing the * driver to go back to a normal configuration. * @channel_switch_rx_beacon: This is an optional callback that is called * when channel switch procedure is in progress and additional beacon with @@ -3877,13 +4438,64 @@ enum ieee80211_reconfig_type { * This callback may sleep. * @sta_set_4addr: Called to notify the driver when a station starts/stops using * 4-address mode + * @set_sar_specs: Update the SAR (TX power) settings. + * @sta_set_decap_offload: Called to notify the driver when a station is allowed + * to use rx decapsulation offload + * @add_twt_setup: Update hw with TWT agreement parameters received from the peer. + * This callback allows the hw to check if requested parameters + * are supported and if there is enough room for a new agreement. + * The hw is expected to set agreement result in the req_type field of + * twt structure. + * @twt_teardown_request: Update the hw with TWT teardown request received + * from the peer. + * @set_radar_background: Configure dedicated offchannel chain available for + * radar/CAC detection on some hw. 
This chain can't be used to transmit + * or receive frames and it is bounded to a running wdev. + * Background radar/CAC detection allows to avoid the CAC downtime + * switching to a different channel during CAC detection on the selected + * radar channel. + * The caller is expected to set chandef pointer to NULL in order to + * disable background CAC/radar detection. + * @net_fill_forward_path: Called from .ndo_fill_forward_path in order to + * resolve a path for hardware flow offloading + * @can_activate_links: Checks if a specific active_links bitmap is + * supported by the driver. + * @change_vif_links: Change the valid links on an interface, note that while + * removing the old link information is still valid (link_conf pointer), + * but may immediately disappear after the function returns. The old or + * new links bitmaps may be 0 if going from/to a non-MLO situation. + * The @old array contains pointers to the old bss_conf structures + * that were already removed, in case they're needed. + * This callback can sleep. + * @change_sta_links: Change the valid links of a station, similar to + * @change_vif_links. This callback can sleep. + * Note that a sta can also be inserted or removed with valid links, + * i.e. passed to @sta_add/@sta_state with sta->valid_links not zero. + * In fact, cannot change from having valid_links and not having them. + * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. This is + * not restored at HW reset by mac80211 so drivers need to take care of + * that. + * @net_setup_tc: Called from .ndo_setup_tc in order to prepare hardware + * flow offloading for flows originating from the vif. + * Note that the driver must not assume that the vif driver_data is valid + * at this point, since the callback can be called during netdev teardown. + * @can_neg_ttlm: for managed interface, requests the driver to determine + * if the requested TID-To-Link mapping can be accepted or not. + * If it's not accepted the driver may suggest a preferred mapping and + * modify @ttlm parameter with the suggested TID-to-Link mapping. + * @prep_add_interface: prepare for interface addition. This can be used by + * drivers to prepare for the addition of a new interface, e.g., allocate + * the needed resources etc. This callback doesn't guarantee that an + * interface with the specified type would be added, and thus drivers that + * implement this callback need to handle such cases. The type is the full + * &enum nl80211_iftype. 
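Tying the sketches above together, a driver's ops table might then look like this; drv_tx and drv_start are assumed to exist elsewhere in the hypothetical driver:

static const struct ieee80211_ops drv_ops = {
	.tx			= drv_tx,	/* hypothetical */
	.start			= drv_start,	/* hypothetical */
	.stop			= drv_stop,	/* note the new bool suspend arg */
	.vif_cfg_changed	= drv_vif_cfg_changed,
	.link_info_changed	= drv_link_info_changed,
	.mgd_prepare_tx		= drv_mgd_prepare_tx,
	.mgd_complete_tx	= drv_mgd_complete_tx,
	.can_neg_ttlm		= drv_can_neg_ttlm,
};

Because vif_cfg_changed and link_info_changed are implemented here, mac80211 will not call bss_info_changed for this driver, per the note above.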
*/ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb); int (*start)(struct ieee80211_hw *hw); - void (*stop)(struct ieee80211_hw *hw); + void (*stop)(struct ieee80211_hw *hw, bool suspend); #ifdef CONFIG_PM int (*suspend)(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan); int (*resume)(struct ieee80211_hw *hw); @@ -3900,10 +4512,19 @@ struct ieee80211_ops { void (*bss_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, - u32 changed); + u64 changed); + void (*vif_cfg_changed)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u64 changed); + void (*link_info_changed)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_bss_conf *info, + u64 changed); - int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); - void (*stop_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); + int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf); + void (*stop_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf); u64 (*prepare_multicast)(struct ieee80211_hw *hw, struct netdev_hw_addr_list *mc_list); @@ -3957,10 +4578,20 @@ struct ieee80211_ops { int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); #ifdef CONFIG_MAC80211_DEBUGFS + void (*vif_add_debugfs)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif); + void (*link_add_debugfs)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf, + struct dentry *dir); void (*sta_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); + void (*link_sta_add_debugfs)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + struct dentry *dir); #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); @@ -3974,10 +4605,10 @@ struct ieee80211_ops { void (*sta_pre_rcu_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); - void (*sta_rc_update)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_sta *sta, - u32 changed); + void (*link_sta_rc_update)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + u32 changed); void (*sta_rate_tbl_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); @@ -3986,7 +4617,8 @@ struct ieee80211_ops { struct ieee80211_sta *sta, struct station_info *sinfo); int (*conf_tx)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, u16 ac, + struct ieee80211_vif *vif, + unsigned int link_id, u16 ac, const struct ieee80211_tx_queue_params *params); u64 (*get_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*set_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4043,6 +4675,8 @@ struct ieee80211_ops { #endif void (*flush)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop); + void (*flush_sta)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_sta *sta); void (*channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); @@ -4088,10 +4722,14 @@ struct ieee80211_ops { void (*mgd_prepare_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - u16 duration); + struct ieee80211_prep_tx_info *info); + void 
(*mgd_complete_tx)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_prep_tx_info *info); void (*mgd_protect_tdls_discover)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + unsigned int link_id); int (*add_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); @@ -4102,9 +4740,11 @@ struct ieee80211_ops { u32 changed); int (*assign_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx); void (*unassign_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx); int (*switch_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif_chanctx_switch *vifs, @@ -4127,9 +4767,11 @@ struct ieee80211_ops { struct ieee80211_channel_switch *ch_switch); int (*post_channel_switch)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf); void (*abort_channel_switch)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf); void (*channel_switch_rx_beacon)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); @@ -4139,7 +4781,7 @@ struct ieee80211_ops { u32 (*get_expected_throughput)(struct ieee80211_hw *hw, struct ieee80211_sta *sta); int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - int *dbm); + unsigned int link_id, int *dbm); int (*tdls_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4192,6 +4834,47 @@ struct ieee80211_ops { struct ieee80211_vif *vif); void (*sta_set_4addr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, bool enabled); + int (*set_sar_specs)(struct ieee80211_hw *hw, + const struct cfg80211_sar_specs *sar); + void (*sta_set_decap_offload)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, bool enabled); + void (*add_twt_setup)(struct ieee80211_hw *hw, + struct ieee80211_sta *sta, + struct ieee80211_twt_setup *twt); + void (*twt_teardown_request)(struct ieee80211_hw *hw, + struct ieee80211_sta *sta, u8 flowid); + int (*set_radar_background)(struct ieee80211_hw *hw, + struct cfg80211_chan_def *chandef); + int (*net_fill_forward_path)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + struct net_device_path_ctx *ctx, + struct net_device_path *path); + bool (*can_activate_links)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u16 active_links); + int (*change_vif_links)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u16 old_links, u16 new_links, + struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]); + int (*change_sta_links)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + u16 old_links, u16 new_links); + int (*set_hw_timestamp)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct cfg80211_set_hw_timestamp *hwts); + int (*net_setup_tc)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct net_device *dev, + enum tc_setup_type type, + void *type_data); + enum ieee80211_neg_ttlm_res + (*can_neg_ttlm)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_neg_ttlm *ttlm); + void (*prep_add_interface)(struct ieee80211_hw *hw, + enum nl80211_iftype type); }; /** @@ -4440,7 +5123,7 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw); * for a 
single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with - * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * This function must be called with BHs disabled and RCU read lock * @@ -4465,7 +5148,7 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *sta, * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with - * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * This function must be called with BHs disabled. * @@ -4490,7 +5173,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *sta, * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with - * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * In process context use instead ieee80211_rx_ni(). * @@ -4510,7 +5193,7 @@ static inline void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) * * Calls to this function, ieee80211_rx() or ieee80211_rx_ni() may not * be mixed for a single hardware.Must not run concurrently with - * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call @@ -4525,7 +5208,7 @@ void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb); * * Calls to this function, ieee80211_rx() and ieee80211_rx_irqsafe() may * not be mixed for a single hardware. Must not run concurrently with - * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call @@ -4670,22 +5353,6 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, int max_rates); /** - * ieee80211_sta_set_expected_throughput - set the expected tpt for a station - * - * Call this function to notify mac80211 about a change in expected throughput - * to a station. A driver for a device that does rate control in firmware can - * call this function when the expected throughput estimate towards a station - * changes. The information is used to tune the CoDel AQM applied to traffic - * going towards that station (which can otherwise be too aggressive and cause - * slow stations to starve). - * - * @pubsta: the station to set throughput for. - * @thr: the current expected throughput in kbps. - */ -void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, - u32 thr); - -/** * ieee80211_tx_rate_update - transmit rate update callback * * Drivers should call this functions with a non-NULL pub sta @@ -4701,7 +5368,7 @@ void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_tx_info *info); /** - * ieee80211_tx_status - transmit status callback + * ieee80211_tx_status_skb - transmit status callback * * Call this function for all transmitted frames after they have been * transmitted. 
It is permissible to not call this function for @@ -4716,13 +5383,13 @@ void ieee80211_tx_rate_update(struct ieee80211_hw *hw, * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ -void ieee80211_tx_status(struct ieee80211_hw *hw, - struct sk_buff *skb); +void ieee80211_tx_status_skb(struct ieee80211_hw *hw, + struct sk_buff *skb); /** * ieee80211_tx_status_ext - extended transmit status callback * - * This function can be used as a replacement for ieee80211_tx_status + * This function can be used as a replacement for ieee80211_tx_status_skb() * in drivers that may want to provide extra information that does not * fit into &struct ieee80211_tx_info. * @@ -4739,7 +5406,7 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, /** * ieee80211_tx_status_noskb - transmit status callback without skb * - * This function can be used as a replacement for ieee80211_tx_status + * This function can be used as a replacement for ieee80211_tx_status_skb() * in drivers that cannot reliably map tx status information back to * specific skbs. * @@ -4767,9 +5434,9 @@ static inline void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, /** * ieee80211_tx_status_ni - transmit status callback (in process context) * - * Like ieee80211_tx_status() but can be called in process context. + * Like ieee80211_tx_status_skb() but can be called in process context. * - * Calls to this function, ieee80211_tx_status() and + * Calls to this function, ieee80211_tx_status_skb() and * ieee80211_tx_status_irqsafe() may not be mixed * for a single hardware. * @@ -4780,17 +5447,17 @@ static inline void ieee80211_tx_status_ni(struct ieee80211_hw *hw, struct sk_buff *skb) { local_bh_disable(); - ieee80211_tx_status(hw, skb); + ieee80211_tx_status_skb(hw, skb); local_bh_enable(); } /** * ieee80211_tx_status_irqsafe - IRQ-safe transmit status callback * - * Like ieee80211_tx_status() but can be called in IRQ context + * Like ieee80211_tx_status_skb() but can be called in IRQ context * (internally defers to a tasklet.) * - * Calls to this function, ieee80211_tx_status() and + * Calls to this function, ieee80211_tx_status_skb() and * ieee80211_tx_status_ni() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by @@ -4800,26 +5467,6 @@ void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb); /** - * ieee80211_tx_status_8023 - transmit status callback for 802.3 frame format - * - * Call this function for all transmitted data frames after their transmit - * completion. This callback should only be called for data frames which - * are using driver's (or hardware's) offload capability of encap/decap - * 802.11 frames. - * - * This function may not be called in IRQ context. Calls to this function - * for a single hardware must be synchronized against each other and all - * calls in the same tx status family. 
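A sketch of TX completion from process context using the renamed helper; the done-queue is hypothetical, and ieee80211_tx_status_ni() is the wrapper shown above:

/* Drain completed frames and report their status to mac80211. */
static void drv_tx_complete_work(struct ieee80211_hw *hw,
				 struct sk_buff_head *done)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(done)))
		ieee80211_tx_status_ni(hw, skb);
}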
- * - * @hw: the hardware the frame was transmitted by - * @vif: the interface for which the frame was transmitted - * @skb: the frame that was transmitted, owned by mac80211 after this call - */ -void ieee80211_tx_status_8023(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct sk_buff *skb); - -/** * ieee80211_report_low_ack - report non-responding station * * When operating in AP-mode, call this function to report a non-responding @@ -4839,12 +5486,14 @@ void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets); * @cntdwn_counter_offs: array of IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets * to countdown counters. This array can contain zero values which * should be ignored. + * @mbssid_off: position of the multiple bssid element */ struct ieee80211_mutable_offsets { u16 tim_offset; u16 tim_length; u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; + u16 mbssid_off; }; /** @@ -4853,6 +5502,8 @@ struct ieee80211_mutable_offsets { * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @offs: &struct ieee80211_mutable_offsets pointer to struct that will * receive the offsets that may be updated by the driver. + * @link_id: the link id to which the beacon belongs (or 0 for an AP STA + * that is not associated with AP MLD). * * If the driver implements beaconing modes, it must use this function to * obtain the beacon template. * @@ -4869,7 +5520,76 @@ struct ieee80211_mutable_offsets { struct sk_buff * ieee80211_beacon_get_template(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_mutable_offsets *offs); + struct ieee80211_mutable_offsets *offs, + unsigned int link_id); + +/** + * ieee80211_beacon_get_template_ema_index - EMA beacon template generation + * @hw: pointer obtained from ieee80211_alloc_hw(). + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @offs: &struct ieee80211_mutable_offsets pointer to struct that will + * receive the offsets that may be updated by the driver. + * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP). + * @ema_index: index of the beacon in the EMA set. + * + * This function follows the same rules as ieee80211_beacon_get_template() + * but returns a beacon template which includes the multiple BSSID element at + * the requested index. + * + * Return: The beacon template. %NULL indicates the end of EMA templates. + */ +struct sk_buff * +ieee80211_beacon_get_template_ema_index(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_mutable_offsets *offs, + unsigned int link_id, u8 ema_index); + +/** + * struct ieee80211_ema_beacons - List of EMA beacons + * @cnt: count of EMA beacons. + * + * @bcn: array of EMA beacons. + * @bcn.skb: the skb containing this specific beacon + * @bcn.offs: &struct ieee80211_mutable_offsets pointer to struct that will + * receive the offsets that may be updated by the driver. + */ +struct ieee80211_ema_beacons { + u8 cnt; + struct { + struct sk_buff *skb; + struct ieee80211_mutable_offsets offs; + } bcn[]; +}; + +/** + * ieee80211_beacon_get_template_ema_list - EMA beacon template generation + * @hw: pointer obtained from ieee80211_alloc_hw(). + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP) + * + * This function follows the same rules as ieee80211_beacon_get_template() + * but allocates and returns a pointer to a list of all beacon templates required + * to cover all profiles in the multiple BSSID set. 
Each template includes only + * one multiple BSSID element. + * + * Driver must call ieee80211_beacon_free_ema_list() to free the memory. + * + * Return: EMA beacon templates of type struct ieee80211_ema_beacons *. + * %NULL on error. + */ +struct ieee80211_ema_beacons * +ieee80211_beacon_get_template_ema_list(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + unsigned int link_id); + +/** + * ieee80211_beacon_free_ema_list - free an EMA beacon template list + * @ema_beacons: list of EMA beacons of type &struct ieee80211_ema_beacons pointers. + * + * This function will free a list previously acquired by calling + * ieee80211_beacon_get_template_ema_list() + */ +void ieee80211_beacon_free_ema_list(struct ieee80211_ema_beacons *ema_beacons); /** * ieee80211_beacon_get_tim - beacon generation function @@ -4880,6 +5600,8 @@ ieee80211_beacon_get_template(struct ieee80211_hw *hw, * @tim_length: pointer to variable that will receive the TIM IE length, * (including the ID and length bytes!). * Set to 0 if invalid (in non-AP modes). + * @link_id: the link id to which the beacon belongs (or 0 for an AP STA + * that is not associated with AP MLD). * * If the driver implements beaconing modes, it must use this function to * obtain the beacon frame. @@ -4895,26 +5617,31 @@ ieee80211_beacon_get_template(struct ieee80211_hw *hw, */ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - u16 *tim_offset, u16 *tim_length); + u16 *tim_offset, u16 *tim_length, + unsigned int link_id); /** * ieee80211_beacon_get - beacon generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: the link id to which the beacon belongs (or 0 for an AP STA + * that is not associated with AP MLD). * * See ieee80211_beacon_get_tim(). * * Return: See ieee80211_beacon_get_tim(). */ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, - struct ieee80211_vif *vif) + struct ieee80211_vif *vif, + unsigned int link_id) { - return ieee80211_beacon_get_tim(hw, vif, NULL, NULL); + return ieee80211_beacon_get_tim(hw, vif, NULL, NULL, link_id); } /** * ieee80211_beacon_update_cntdwn - request mac80211 to decrement the beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * * The beacon counter should be updated after each beacon transmission. * This function is called implicitly when @@ -4924,7 +5651,8 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, * * Return: new countdown value */ -u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif); +u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, + unsigned int link_id); /** * ieee80211_beacon_set_cntdwn - request mac80211 to set beacon countdown @@ -4942,20 +5670,34 @@ void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter); /** * ieee80211_csa_finish - notify mac80211 about channel switch * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * * After a channel switch announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the channel can be changed. 
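For illustration, a minimal sketch of the countdown flow described here, from a hypothetical driver's beacon-completion path (my_beacon_done() is not part of the header; link_id 0 assumes non-MLO operation):

/* Hypothetical beacon-done handler: step the countdown once per beacon
 * and complete the CSA when the counter has reached 1.
 */
static void my_beacon_done(struct ieee80211_vif *vif)
{
	if (ieee80211_beacon_cntdwn_is_complete(vif, 0))
		ieee80211_csa_finish(vif, 0);
	else
		ieee80211_beacon_update_cntdwn(vif, 0);
}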
*/ -void ieee80211_csa_finish(struct ieee80211_vif *vif); +void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1 * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * - * This function returns whether the countdown reached zero. + * Return: %true if the countdown reached 1, %false otherwise */ -bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif); +bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, + unsigned int link_id); + +/** + * ieee80211_color_change_finish - notify mac80211 about color change + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO + * + * After a color change announcement was scheduled and the counter in this + * announcement hits 1, this function must be called by the driver to + * notify mac80211 that the color can be changed + */ +void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id); /** * ieee80211_proberesp_get - retrieve a Probe Response template @@ -4993,6 +5735,9 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, * ieee80211_nullfunc_get - retrieve a nullfunc template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: If the vif is an MLD, get a frame with the link addresses + * for the given link ID. For a link_id < 0 you get a frame with + * MLD addresses, however useful that might be. * @qos_ok: QoS NDP is acceptable to the caller, this should be set * if at all possible * @@ -5010,7 +5755,7 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, */ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - bool qos_ok); + int link_id, bool qos_ok); /** * ieee80211_probereq_get - retrieve a Probe Request template @@ -5266,12 +6011,11 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, * ieee80211_remove_key - remove the given key * @keyconf: the parameter passed with the set key * + * Context: Must be called with the wiphy mutex held. + * * Remove the given key. If the key was uploaded to the hardware at the * time this function is called, it is not deleted in the hardware but * instead assumed to have been removed already. - * - * Note that due to locking considerations this function can (currently) - * only be called during key iteration (ieee80211_iter_keys().) */ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); @@ -5279,13 +6023,14 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN * @vif: the virtual interface to add the key on * @keyconf: new key data + * @link_id: the link id of the key or -1 for non-MLO * * When GTK rekeying was done while the system was suspended, (a) new * key(s) will be available. These will be needed by mac80211 for proper * RX processing, so this function allows setting them. * - * The function returns the newly allocated key structure, which will - * have similar contents to the passed key configuration but point to + * Return: the newly allocated key structure, which will have + * similar contents to the passed key configuration but point to * mac80211-owned memory. In case of errors, the function returns an * ERR_PTR(), use IS_ERR() etc. 
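To make the ERR_PTR() convention concrete, a minimal sketch of a WoWLAN resume path handing a rekeyed GTK back to mac80211 (my_drv_restore_gtk() and the prepared fw_gtk configuration are hypothetical):

/* Hypothetical resume path: add the firmware-derived GTK and check the
 * result with IS_ERR(), as the comment above prescribes.
 */
static int my_drv_restore_gtk(struct ieee80211_vif *vif,
			      struct ieee80211_key_conf *fw_gtk, int link_id)
{
	struct ieee80211_key_conf *key;

	key = ieee80211_gtk_rekey_add(vif, fw_gtk, link_id);
	if (IS_ERR(key))
		return PTR_ERR(key);

	/* 'key' now points at mac80211-owned memory */
	return 0;
}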
* @@ -5306,7 +6051,8 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); */ struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, - struct ieee80211_key_conf *keyconf); + struct ieee80211_key_conf *keyconf, + int link_id); /** * ieee80211_gtk_rekey_notify - notify userspace supplicant of rekeying @@ -5319,11 +6065,31 @@ void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid, const u8 *replay_ctr, gfp_t gfp); /** + * ieee80211_key_mic_failure - increment MIC failure counter for the key + * + * Note: this is really only safe if no other RX function is called + * at the same time. + * + * @keyconf: the key in question + */ +void ieee80211_key_mic_failure(struct ieee80211_key_conf *keyconf); + +/** + * ieee80211_key_replay - increment replay counter for the key + * + * Note: this is really only safe if no other RX function is called + * at the same time. + * + * @keyconf: the key in question + */ +void ieee80211_key_replay(struct ieee80211_key_conf *keyconf); + +/** * ieee80211_wake_queue - wake specific queue * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_wake_queue. + * Drivers must use this function instead of netif_wake_queue. */ void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue); @@ -5332,7 +6098,7 @@ void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_stop_queue. */ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue); @@ -5341,7 +6107,7 @@ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_queue_stopped. * * Return: %true if the queue is stopped. %false otherwise. */ @@ -5352,7 +6118,7 @@ int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue); * ieee80211_stop_queues - stop all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_tx_stop_all_queues. */ void ieee80211_stop_queues(struct ieee80211_hw *hw); @@ -5360,7 +6126,7 @@ void ieee80211_stop_queues(struct ieee80211_hw *hw); * ieee80211_wake_queues - wake all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * - * Drivers should use this function instead of netif_wake_queue. + * Drivers must use this function instead of netif_tx_wake_all_queues. */ void ieee80211_wake_queues(struct ieee80211_hw *hw); @@ -5487,23 +6253,23 @@ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw, void *data); /** - * ieee80211_iterate_active_interfaces_rtnl - iterate active interfaces + * ieee80211_iterate_active_interfaces_mtx - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. - * This version can only be used while holding the RTNL. + * This version can only be used while holding the wiphy mutex. 
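By way of example, a minimal sketch of the _mtx iteration pattern (my_count_active_vifs() is hypothetical; iter_flags of 0 requests no special iteration behaviour):

/* Hypothetical use of the wiphy-mutex iterator declared above. */
static void my_count_iter(void *data, u8 *mac, struct ieee80211_vif *vif)
{
	int *count = data;

	(*count)++;
}

static int my_count_active_vifs(struct ieee80211_hw *hw)
{
	int count = 0;

	/* the caller must already hold the wiphy mutex */
	ieee80211_iterate_active_interfaces_mtx(hw, 0, my_count_iter, &count);
	return count;
}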
* * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ -void ieee80211_iterate_active_interfaces_rtnl(struct ieee80211_hw *hw, - u32 iter_flags, - void (*iterator)(void *data, +void ieee80211_iterate_active_interfaces_mtx(struct ieee80211_hw *hw, + u32 iter_flags, + void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), - void *data); + void *data); /** * ieee80211_iterate_stations_atomic - iterate stations @@ -5521,6 +6287,24 @@ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data); + +/** + * ieee80211_iterate_stations_mtx - iterate stations + * + * This function iterates over all stations associated with a given + * hardware that are currently uploaded to the driver and calls the callback + * function for them. This version can only be used while holding the wiphy + * mutex. + * + * @hw: the hardware struct of which the interfaces should be iterated over + * @iterator: the iterator function to call + * @data: first argument of the iterator function + */ +void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, + void (*iterator)(void *data, + struct ieee80211_sta *sta), + void *data); + /** * ieee80211_queue_work - add work onto the mac80211 workqueue * @@ -5547,6 +6331,20 @@ void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, unsigned long delay); /** + * ieee80211_refresh_tx_agg_session_timer - Refresh a tx agg session timer. + * @sta: the station for which to start a BA session + * @tid: the TID to BA on. + * + * This function allows the low-level driver to refresh the tx agg session timer + * to maintain the BA session; the session level will still be managed by + * mac80211. + * + * Note: must be called in an RCU critical section. + */ +void ieee80211_refresh_tx_agg_session_timer(struct ieee80211_sta *sta, + u16 tid); + +/** * ieee80211_start_tx_ba_session - Start a tx Block Ack session. * @sta: the station for which to start a BA session * @tid: the TID to BA on. @@ -5641,6 +6439,24 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *localaddr); /** + * ieee80211_find_sta_by_link_addrs - find STA by link addresses + * @hw: pointer as obtained from ieee80211_alloc_hw() + * @addr: remote station's link address + * @localaddr: local link address, use %NULL for any (but avoid that) + * @link_id: pointer to obtain the link ID if the STA is found, + * may be %NULL if the link ID is not needed + * + * Obtain the STA by link address, must use RCU protection. + * + * Return: pointer to STA if found, otherwise %NULL. + */ +struct ieee80211_sta * +ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, + const u8 *addr, + const u8 *localaddr, + unsigned int *link_id); + +/** * ieee80211_sta_block_awake - block station from waking up * @hw: the hardware * @pubsta: the station @@ -5716,6 +6532,19 @@ void ieee80211_sta_eosp(struct ieee80211_sta *pubsta); void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid); /** + * ieee80211_sta_recalc_aggregates - recalculate aggregate data after a change + * @pubsta: the station + * + * Call this function after changing per-link aggregate data as referenced in + * &struct ieee80211_sta_aggregates by accessing the agg field of + * &struct ieee80211_link_sta. 
+ * + * With non MLO the data in deflink will be referenced directly. In that case + * there is no need to call this function. + */ +void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta); + +/** * ieee80211_sta_register_airtime - register airtime usage for a sta/tid * * Register airtime usage for a given sta on a given tid. The driver must call @@ -5749,8 +6578,8 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, * @hw: pointer obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface * - * Return true if the AQL's airtime limit has not been reached and the txq can - * continue to send more packets to the device. Otherwise return false. + * Return: %true if the AQL's airtime limit has not been reached and the txq can + * continue to send more packets to the device. Otherwise return %false. */ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); @@ -5762,12 +6591,12 @@ ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); * @iter: iterator function that will be called for each key * @iter_data: custom data to pass to the iterator function * + * Context: Must be called with wiphy mutex held; can sleep. + * * This function can be used to iterate all the keys known to * mac80211, even those that weren't previously programmed into * the device. This is intended for use in WoWLAN if the device - * needs reprogramming of the keys during suspend. Note that due - * to locking reasons, it is also only safe to call this at few - * spots since it must hold the RTNL and be able to sleep. + * needs reprogramming of the keys during suspend. * * The order in which the keys are iterated matches the order * in which they were originally installed and handed to the @@ -5833,6 +6662,31 @@ void ieee80211_iter_chan_contexts_atomic( void *iter_data); /** + * ieee80211_iter_chan_contexts_mtx - iterate channel contexts + * @hw: pointer obtained from ieee80211_alloc_hw(). + * @iter: iterator function + * @iter_data: data passed to iterator function + * + * Iterate all active channel contexts. This function can only be used while + * holding the wiphy mutex. + * + * The iterator will not find a context that's being added (during + * the driver callback to add it) but will find it while it's being + * removed. + * + * Note that during hardware restart, all contexts that existed + * before the restart are considered already present so will be + * found while iterating, whether they've been re-added already + * or not. + */ +void ieee80211_iter_chan_contexts_mtx( + struct ieee80211_hw *hw, + void (*iter)(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *chanctx_conf, + void *data), + void *iter_data); + +/** * ieee80211_ap_probereq_get - retrieve a Probe Request template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. @@ -5877,6 +6731,17 @@ void ieee80211_beacon_loss(struct ieee80211_vif *vif); void ieee80211_connection_loss(struct ieee80211_vif *vif); /** + * ieee80211_disconnect - request disconnection + * + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @reconnect: immediate reconnect is desired + * + * Request disconnection from the current network and, if enabled, send a + * hint to the higher layers that immediate reconnect is desired. 
+ */ +void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect); + +/** * ieee80211_resume_disconnect - disconnect from AP after resume * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @@ -5900,6 +6765,16 @@ void ieee80211_connection_loss(struct ieee80211_vif *vif); void ieee80211_resume_disconnect(struct ieee80211_vif *vif); /** + * ieee80211_hw_restart_disconnect - disconnect from AP after + * hardware restart + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * + * Instructs mac80211 to disconnect from the AP after + * hardware restart. + */ +void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif); + +/** * ieee80211_cqm_rssi_notify - inform a configured connection quality monitoring * rssi threshold triggered * @@ -5929,29 +6804,46 @@ void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp); * ieee80211_radar_detected - inform that a radar was detected * * @hw: pointer as obtained from ieee80211_alloc_hw() + * @chanctx_conf: Channel context on which radar is detected. Mandatory to + * pass a valid pointer during MLO. For non-MLO %NULL can be passed */ -void ieee80211_radar_detected(struct ieee80211_hw *hw); +void ieee80211_radar_detected(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *chanctx_conf); /** * ieee80211_chswitch_done - Complete channel switch process * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @success: make the channel switch successful or not + * @link_id: the link_id on which the switch was done. Ignored if success is + * false. * * Complete the channel switch post-process: set the new operational channel * and wake up the suspended queues. */ -void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success); +void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, + unsigned int link_id); + +/** + * ieee80211_channel_switch_disconnect - disconnect due to channel switch error + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * + * Instruct mac80211 to disconnect due to a channel switch error. The channel + * switch can request to block the tx, so we need to make sure we do not send + * a deauth frame in this case. + */ +void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif); /** * ieee80211_request_smps - request SM PS transition * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: link ID for MLO, or 0 * @smps_mode: new SM PS mode * * This allows the driver to request an SM PS transition in managed * mode. This is useful when the driver has more information than * the stack about possible interference, for example by bluetooth. */ -void ieee80211_request_smps(struct ieee80211_vif *vif, +void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id, enum ieee80211_smps_mode smps_mode); /** @@ -5996,6 +6888,7 @@ void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap, * marks frames marked in the bitmap as having been filtered. Afterwards, it * checks if any frames in the window starting from @ssn can now be released * (in case they were only waiting for frames that were filtered.) + * (Only works correctly if @max_rx_aggregation_subframes <= 64 frames) */ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, u16 ssn, u64 filtered, @@ -6127,6 +7020,11 @@ enum rate_control_capabilities { * otherwise the NSS difference doesn't bother us. 
*/ RATE_CTRL_CAPA_VHT_EXT_NSS_BW = BIT(0), + /** + * @RATE_CTRL_CAPA_AMPDU_TRIGGER: + * mac80211 should start A-MPDU sessions on tx + */ + RATE_CTRL_CAPA_AMPDU_TRIGGER = BIT(1), }; struct rate_control_ops { @@ -6167,7 +7065,7 @@ static inline int rate_supported(struct ieee80211_sta *sta, enum nl80211_band band, int index) { - return (sta == NULL || sta->supp_rates[band] & BIT(index)); + return (sta == NULL || sta->deflink.supp_rates[band] & BIT(index)); } static inline s8 @@ -6210,6 +7108,8 @@ bool rate_usable_index_exists(struct ieee80211_supported_band *sband, * @hw: pointer as obtained from ieee80211_alloc_hw() * @pubsta: &struct ieee80211_sta pointer to the target destination. * @rates: new tx rate set to be used for this station. + * + * Return: 0 on success. An error code otherwise. */ int rate_control_set_rates(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, @@ -6275,9 +7175,52 @@ ieee80211_vif_type_p2p(struct ieee80211_vif *vif) } /** + * ieee80211_get_he_iftype_cap_vif - return HE capabilities for sband/vif + * @sband: the sband to search for the iftype on + * @vif: the vif to get the iftype from + * + * Return: pointer to the struct ieee80211_sta_he_cap, or %NULL if none found + */ +static inline const struct ieee80211_sta_he_cap * +ieee80211_get_he_iftype_cap_vif(const struct ieee80211_supported_band *sband, + struct ieee80211_vif *vif) +{ + return ieee80211_get_he_iftype_cap(sband, ieee80211_vif_type_p2p(vif)); +} + +/** + * ieee80211_get_he_6ghz_capa_vif - return HE 6 GHz capabilities + * @sband: the sband to search for the STA on + * @vif: the vif to get the iftype from + * + * Return: the 6GHz capabilities + */ +static inline __le16 +ieee80211_get_he_6ghz_capa_vif(const struct ieee80211_supported_band *sband, + struct ieee80211_vif *vif) +{ + return ieee80211_get_he_6ghz_capa(sband, ieee80211_vif_type_p2p(vif)); +} + +/** + * ieee80211_get_eht_iftype_cap_vif - return EHT capabilities for sband/vif + * @sband: the sband to search for the iftype on + * @vif: the vif to get the iftype from + * + * Return: pointer to the struct ieee80211_sta_eht_cap, or %NULL if none found + */ +static inline const struct ieee80211_sta_eht_cap * +ieee80211_get_eht_iftype_cap_vif(const struct ieee80211_supported_band *sband, + struct ieee80211_vif *vif) +{ + return ieee80211_get_eht_iftype_cap(sband, ieee80211_vif_type_p2p(vif)); +} + +/** * ieee80211_update_mu_groups - set the VHT MU-MIMO group data * * @vif: the specified virtual interface + * @link_id: the link ID for MLO, otherwise 0 * @membership: 64 bits array - a bit is set if station is member of the group * @position: 2 bits per group id indicating the position in the group * * @@ -6286,7 +7229,7 @@ ieee80211_vif_type_p2p(struct ieee80211_vif *vif) * matching GroupId management frame. * Calls to this function need to be serialized with RX path. 
*/ -void ieee80211_update_mu_groups(struct ieee80211_vif *vif, +void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id, const u8 *membership, const u8 *position); void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif, @@ -6327,6 +7270,8 @@ void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif, * @band: the band to transmit on * @sta: optional pointer to get the station to send the frame to * + * Return: %true if the skb was prepared, %false otherwise + * * Note: must be called under RCU lock */ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, @@ -6334,9 +7279,17 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, int band, struct ieee80211_sta **sta); /** - * Sanity-check and parse the radiotap header of injected frames + * ieee80211_parse_tx_radiotap - Sanity-check and parse the radiotap header + * of injected frames. + * + * To accurately parse and take into account rate and retransmission fields, + * you must initialize the chandef field in the ieee80211_tx_info structure + * of the skb before calling this function. + * * @skb: packet injected by userspace * @dev: the &struct device of this 802.11 device + * + * Return: %true if the radiotap header was parsed, %false otherwise */ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, struct net_device *dev); @@ -6389,7 +7342,7 @@ int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr, void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf); /** - * ieee80211_tdls_oper - request userspace to perform a TDLS operation + * ieee80211_tdls_oper_request - request userspace to perform a TDLS operation * @vif: virtual interface * @peer: the peer's destination address * @oper: the requested TDLS operation @@ -6446,7 +7399,7 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid); * @txq: pointer obtained from station or virtual interface, or from * ieee80211_next_txq() * - * Returns the skb if successful, %NULL if no frame was available. + * Return: the skb if successful, %NULL if no frame was available. * * Note that this must be called in an rcu_read_lock() critical section, * which can only be released after the SKB was handled. Some pointers in @@ -6472,6 +7425,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface, or from * ieee80211_next_txq() + * + * Return: the skb if successful, %NULL if no frame was available. */ static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw, struct ieee80211_txq *txq) @@ -6486,12 +7441,24 @@ static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw, } /** + * ieee80211_handle_wake_tx_queue - mac80211 handler for wake_tx_queue callback + * + * @hw: pointer as obtained from wake_tx_queue() callback(). + * @txq: pointer as obtained from wake_tx_queue() callback(). + * + * Drivers can use this function for the mandatory mac80211 wake_tx_queue + * callback in struct ieee80211_ops. They should not call this function. + */ +void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, + struct ieee80211_txq *txq); + +/** * ieee80211_next_txq - get next tx queue to pull packets from * * @hw: pointer as obtained from ieee80211_alloc_hw() * @ac: AC number to return packets from. * - * Returns the next txq if successful, %NULL if no queue is eligible. If a txq + * Return: the next txq if successful, %NULL if no queue is eligible. 
If a txq * is returned, it should be returned with ieee80211_return_txq() after the * driver has finished scheduling it. */ @@ -6556,7 +7523,7 @@ ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, * * This function is used to check whether given txq is allowed to transmit by * the airtime scheduler, and can be used by drivers to access the airtime - * fairness accounting without going using the scheduling order enfored by + * fairness accounting without using the scheduling order enforced by * next_txq(). * * Returns %true if the airtime scheduler thinks the TXQ should be allowed to @@ -6574,6 +7541,8 @@ ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, * * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface + * + * Return: %true if transmission is allowed, %false otherwise */ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq); @@ -6634,6 +7603,8 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif, * @status: &struct ieee80211_rx_status containing the transmission rate * information. * @len: frame length in bytes + * + * Return: the airtime estimate */ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, struct ieee80211_rx_status *status, @@ -6648,23 +7619,13 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, * @hw: pointer as obtained from ieee80211_alloc_hw() * @info: &struct ieee80211_tx_info of the frame. * @len: frame length in bytes + * + * Return: the airtime estimate */ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int len); /** - * ieee80211_set_hw_80211_encap - enable hardware encapsulation offloading. - * - * This function is used to notify mac80211 that a vif can be passed raw 802.3 - * frames. The driver needs to then handle the 802.11 encapsulation inside the - * hardware or firmware. - * - * @vif: &struct ieee80211_vif pointer from the add_interface callback. - * @enable: indicate if the feature should be turned on or off - */ -bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable); - -/** * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. @@ -6689,4 +7650,173 @@ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, struct sk_buff * ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif); + +/** + * ieee80211_obss_color_collision_notify - notify userland about a BSS color + * collision. + * @link_id: valid link_id during MLO or 0 for non-MLO + * + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is + * aware of. + */ +void +ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, + u64 color_bitmap, u8 link_id); + +/** + * ieee80211_is_tx_data - check if frame is a data frame + * + * The function is used to check if a frame is a data frame. Frames with + * hardware encapsulation enabled are data frames. + * + * @skb: the frame to be transmitted. 
+ * + * Return: %true if @skb is a data frame, %false otherwise + */ +static inline bool ieee80211_is_tx_data(struct sk_buff *skb) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (void *) skb->data; + + return info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP || + ieee80211_is_data(hdr->frame_control); +} + +/** + * ieee80211_set_active_links - set active links in client mode + * @vif: interface to set active links on + * @active_links: the new active links bitmap + * + * Context: Must be called with wiphy mutex held; may sleep; calls + * back into the driver. + * + * This changes the active links on an interface. The interface + * must be in client mode (in AP mode, all links are always active), + * and @active_links must be a subset of the vif's valid_links. + * + * If a link is switched off and another is switched on at the same + * time (e.g. active_links going from 0x1 to 0x10) then you will get + * a sequence of calls like + * + * - change_vif_links(0x11) + * - unassign_vif_chanctx(link_id=0) + * - assign_vif_chanctx(link_id=4) + * - change_sta_links(0x11) for each affected STA (the AP) + * (TDLS connections on now inactive links should be torn down) + * - remove group keys on the old link (link_id 0) + * - add new group keys (GTK/IGTK/BIGTK) on the new link (link_id 4) + * - change_sta_links(0x10) for each affected STA (the AP) + * - change_vif_links(0x10) + * + * Return: 0 on success. An error code otherwise. + */ +int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); + +/** + * ieee80211_set_active_links_async - asynchronously set active links + * @vif: interface to set active links on + * @active_links: the new active links bitmap + * + * See ieee80211_set_active_links() for more information, the only + * difference here is that the link change is triggered async and + * can be called in any context, but the link switch will only be + * completed after it returns. + */ +void ieee80211_set_active_links_async(struct ieee80211_vif *vif, + u16 active_links); + +/** + * ieee80211_send_teardown_neg_ttlm - tear down a negotiated TTLM request + * @vif: the interface on which the tear down request should be sent. + * + * This function can be used to tear down a previously accepted negotiated + * TTLM request. + */ +void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif); + +/** + * ieee80211_chan_width_to_rx_bw - convert channel width to STA RX bandwidth + * @width: the channel width value to convert + * Return: the STA RX bandwidth value for the channel width + */ +static inline enum ieee80211_sta_rx_bandwidth +ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) +{ + switch (width) { + default: + WARN_ON_ONCE(1); + fallthrough; + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + return IEEE80211_STA_RX_BW_20; + case NL80211_CHAN_WIDTH_40: + return IEEE80211_STA_RX_BW_40; + case NL80211_CHAN_WIDTH_80: + return IEEE80211_STA_RX_BW_80; + case NL80211_CHAN_WIDTH_160: + case NL80211_CHAN_WIDTH_80P80: + return IEEE80211_STA_RX_BW_160; + case NL80211_CHAN_WIDTH_320: + return IEEE80211_STA_RX_BW_320; + } +} + +/** + * ieee80211_prepare_rx_omi_bw - prepare for sending BW RX OMI + * @link_sta: the link STA the OMI is going to be sent to + * @bw: the bandwidth requested + * + * When the driver decides to do RX OMI to change bandwidth with a STA + * it calls this function to prepare, then sends the OMI, and finally + * calls ieee80211_finalize_rx_omi_bw(). 
+ * + * Note that the (link) STA rate control is updated accordingly as well, + * but the chanctx might not be updated if there are other users. + * If the intention is to reduce the listen bandwidth, the driver must + * ensure there are no TDLS stations nor other uses of the chanctx. + * + * Also note that in order to sequence correctly, narrowing bandwidth + * will only happen in ieee80211_finalize_rx_omi_bw(), whereas widening + * again (e.g. going back to normal) will happen here. + * + * Note that we treat this symmetrically, so if the driver calls this + * and tells the peer to only send with a lower bandwidth, we assume + * that the driver also wants to only send at that lower bandwidth, to + * allow narrowing of the chanctx request for this station/interface. + * + * Finally, the driver must ensure that if the function returned %true, + * ieee80211_finalize_rx_omi_bw() is also called, even for example in + * case of HW restart. + * + * Context: Must be called with wiphy mutex held, and will call back + * into the driver, so ensure no driver locks are held. + * + * Return: %true if changes are going to be made, %false otherwise + */ +bool ieee80211_prepare_rx_omi_bw(struct ieee80211_link_sta *link_sta, + enum ieee80211_sta_rx_bandwidth bw); + +/** + * ieee80211_finalize_rx_omi_bw - finalize BW RX OMI update + * @link_sta: the link STA the OMI was sent to + * + * See ieee80211_prepare_rx_omi_bw(). Context is the same here + * as well. + */ +void ieee80211_finalize_rx_omi_bw(struct ieee80211_link_sta *link_sta); + +/* for older drivers - let's not document these ... */ +int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx); +void ieee80211_emulate_remove_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx); +void ieee80211_emulate_change_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx, + u32 changed); +int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, + struct ieee80211_vif_chanctx_switch *vifs, + int n_vifs, + enum ieee80211_chanctx_switch_mode mode); + #endif /* MAC80211_H */ diff --git a/include/net/mac802154.h b/include/net/mac802154.h index d524ffb9eb25..d72006a85f02 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -7,7 +7,7 @@ #ifndef NET_MAC802154_H #define NET_MAC802154_H -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/af_ieee802154.h> #include <linux/ieee802154.h> #include <linux/skbuff.h> @@ -111,9 +111,6 @@ struct ieee802154_hw { * promiscuous mode setting. * * @IEEE802154_HW_RX_OMIT_CKSUM: Indicates that receiver omits FCS. - * - * @IEEE802154_HW_RX_DROP_BAD_CKSUM: Indicates that receiver will not filter - * frames with bad checksum. */ enum ieee802154_hw_flags { IEEE802154_HW_TX_OMIT_CKSUM = BIT(0), @@ -123,7 +120,6 @@ enum ieee802154_hw_flags { IEEE802154_HW_AFILT = BIT(4), IEEE802154_HW_PROMISCUOUS = BIT(5), IEEE802154_HW_RX_OMIT_CKSUM = BIT(6), - IEEE802154_HW_RX_DROP_BAD_CKSUM = BIT(7), }; /* Indicates that receiver omits FCS and xmitter will add FCS on its own. */ @@ -144,7 +140,7 @@ enum ieee802154_hw_flags { * * xmit_sync: * Handler that 802.15.4 module calls for each transmitted frame. - * skb cntains the buffer starting from the IEEE 802.15.4 header. + * skb contains the buffer starting from the IEEE 802.15.4 header. * The low-level driver should send the frame based on available * configuration. This is called by a workqueue and useful for * synchronous 802.15.4 drivers. 
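As an aside, a minimal sketch of how an asynchronous 802.15.4 driver reports completion with the helpers declared below (struct my_dev, my_tx_done_irq() and their fields are hypothetical):

/* Hypothetical TX-done interrupt path for an async driver. */
struct my_dev {
	struct ieee802154_hw *hw;
	struct sk_buff *tx_skb;
	bool tx_ok;
	int tx_reason;
};

static void my_tx_done_irq(struct my_dev *dev)
{
	struct sk_buff *skb = dev->tx_skb;

	dev->tx_skb = NULL;
	if (dev->tx_ok)
		ieee802154_xmit_complete(dev->hw, skb, false);
	else
		ieee802154_xmit_error(dev->hw, skb, dev->tx_reason);
}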
@@ -156,7 +152,7 @@ enum ieee802154_hw_flags { * * xmit_async: * Handler that 802.15.4 module calls for each transmitted frame. - * skb cntains the buffer starting from the IEEE 802.15.4 header. + * skb contains the buffer starting from the IEEE 802.15.4 header. * The low-level driver should send the frame based on available * configuration. * This function should return zero or negative errno. @@ -460,30 +456,34 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw); */ void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi); + /** - * ieee802154_wake_queue - wake ieee802154 queue - * @hw: pointer as obtained from ieee802154_alloc_hw(). + * ieee802154_xmit_complete - frame transmission complete * - * Drivers should use this function instead of netif_wake_queue. + * @hw: pointer as obtained from ieee802154_alloc_hw(). + * @skb: buffer for transmission + * @ifs_handling: indicate interframe space handling */ -void ieee802154_wake_queue(struct ieee802154_hw *hw); +void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, + bool ifs_handling); /** - * ieee802154_stop_queue - stop ieee802154 queue - * @hw: pointer as obtained from ieee802154_alloc_hw(). + * ieee802154_xmit_error - offloaded frame transmission failed * - * Drivers should use this function instead of netif_stop_queue. + * @hw: pointer as obtained from ieee802154_alloc_hw(). + * @skb: buffer for transmission + * @reason: error code */ -void ieee802154_stop_queue(struct ieee802154_hw *hw); +void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, + int reason); /** - * ieee802154_xmit_complete - frame transmission complete + * ieee802154_xmit_hw_error - frame could not be offloaded to the transmitter + * because of a hardware error (bus error, timeout, etc) * * @hw: pointer as obtained from ieee802154_alloc_hw(). 
* @skb: buffer for transmission - * @ifs_handling: indicate interframe space handling */ -void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, - bool ifs_handling); +void ieee802154_xmit_hw_error(struct ieee802154_hw *hw, struct sk_buff *skb); #endif /* NET_MAC802154_H */ diff --git a/include/net/macsec.h b/include/net/macsec.h index 52874cdfe226..bc7de5b53e54 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -8,22 +8,38 @@ #define _NET_MACSEC_H_ #include <linux/u64_stats_sync.h> +#include <linux/if_vlan.h> #include <uapi/linux/if_link.h> #include <uapi/linux/if_macsec.h> #define MACSEC_DEFAULT_PN_LEN 4 #define MACSEC_XPN_PN_LEN 8 -#define MACSEC_SALT_LEN 12 #define MACSEC_NUM_AN 4 /* 2 bits for the association number */ +#define MACSEC_SCI_LEN 8 +#define MACSEC_PORT_ES (htons(0x0001)) + +#define MACSEC_TCI_VERSION 0x80 +#define MACSEC_TCI_ES 0x40 /* end station */ +#define MACSEC_TCI_SC 0x20 /* SCI present */ +#define MACSEC_TCI_SCB 0x10 /* epon */ +#define MACSEC_TCI_E 0x08 /* encryption */ +#define MACSEC_TCI_C 0x04 /* changed text */ +#define MACSEC_AN_MASK 0x03 /* association number */ +#define MACSEC_TCI_CONFID (MACSEC_TCI_E | MACSEC_TCI_C) + +#define MACSEC_DEFAULT_ICV_LEN 16 + typedef u64 __bitwise sci_t; typedef u32 __bitwise ssci_t; +struct metadata_dst; + typedef union salt { struct { - u32 ssci; - u64 pn; + ssci_t ssci; + __be64 pn; } __packed; u8 bytes[MACSEC_SALT_LEN]; } __packed salt_t; @@ -183,6 +199,7 @@ struct macsec_tx_sa { * @scb: single copy broadcast flag * @sa: array of secure associations * @stats: stats for this TXSC + * @md_dst: MACsec offload metadata dst */ struct macsec_tx_sc { bool active; @@ -193,6 +210,7 @@ struct macsec_tx_sc { bool scb; struct macsec_tx_sa __rcu *sa[MACSEC_NUM_AN]; struct pcpu_tx_sc_stats __percpu *stats; + struct metadata_dst *md_dst; }; /** @@ -229,6 +247,23 @@ struct macsec_secy { /** * struct macsec_context - MACsec context for hardware offloading + * @netdev: a valid pointer to a struct net_device if @offload == + * MACSEC_OFFLOAD_MAC + * @phydev: a valid pointer to a struct phy_device if @offload == + * MACSEC_OFFLOAD_PHY + * @offload: MACsec offload status + * @secy: pointer to a MACsec SecY + * @rx_sc: pointer to an RX SC + * @update_pn: when updating the SA, update the next PN + * @assoc_num: association number of the target SA + * @key: key of the target SA + * @rx_sa: pointer to an RX SA if an RX SA is added/updated/removed + * @tx_sa: pointer to a TX SA if a TX SA is added/updated/removed + * @tx_sc_stats: pointer to TX SC stats structure + * @tx_sa_stats: pointer to TX SA stats structure + * @rx_sc_stats: pointer to RX SC stats structure + * @rx_sa_stats: pointer to RX SA stats structure + * @dev_stats: pointer to dev stats structure */ struct macsec_context { union { @@ -240,8 +275,9 @@ struct macsec_context { struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct { + bool update_pn; unsigned char assoc_num; - u8 key[MACSEC_KEYID_LEN]; + u8 key[MACSEC_MAX_KEY_LEN]; union { struct macsec_rx_sa *rx_sa; struct macsec_tx_sa *tx_sa; @@ -254,12 +290,38 @@ struct macsec_context { struct macsec_rx_sa_stats *rx_sa_stats; struct macsec_dev_stats *dev_stats; } stats; - - u8 prepare:1; }; /** * struct macsec_ops - MACsec offloading operations + * @mdo_dev_open: called when the MACsec interface transitions to the up state + * @mdo_dev_stop: called when the MACsec interface transitions to the down + * state + * @mdo_add_secy: called when a new SecY is added + * @mdo_upd_secy: called when the SecY 
flags are changed or the MAC address of + * the MACsec interface is changed + * @mdo_del_secy: called when the hw offload is disabled or the MACsec + * interface is removed + * @mdo_add_rxsc: called when a new RX SC is added + * @mdo_upd_rxsc: called when a certain RX SC is updated + * @mdo_del_rxsc: called when a certain RX SC is removed + * @mdo_add_rxsa: called when a new RX SA is added + * @mdo_upd_rxsa: called when a certain RX SA is updated + * @mdo_del_rxsa: called when a certain RX SA is removed + * @mdo_add_txsa: called when a new TX SA is added + * @mdo_upd_txsa: called when a certain TX SA is updated + * @mdo_del_txsa: called when a certain TX SA is removed + * @mdo_get_dev_stats: called when dev stats are read + * @mdo_get_tx_sc_stats: called when TX SC stats are read + * @mdo_get_tx_sa_stats: called when TX SA stats are read + * @mdo_get_rx_sc_stats: called when RX SC stats are read + * @mdo_get_rx_sa_stats: called when RX SA stats are read + * @mdo_insert_tx_tag: called to insert the TX tag + * @needed_headroom: number of bytes reserved at the beginning of the sk_buff + * for the TX tag + * @needed_tailroom: number of bytes reserved at the end of the sk_buff for the + * TX tag + * @rx_uses_md_dst: whether MACsec device offload supports sk_buff md_dst */ struct macsec_ops { /* Device wide */ @@ -286,8 +348,37 @@ struct macsec_ops { int (*mdo_get_tx_sa_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sc_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sa_stats)(struct macsec_context *ctx); + /* Offload tag */ + int (*mdo_insert_tx_tag)(struct phy_device *phydev, + struct sk_buff *skb); + unsigned int needed_headroom; + unsigned int needed_tailroom; + bool rx_uses_md_dst; }; void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); +static inline bool macsec_send_sci(const struct macsec_secy *secy) +{ + const struct macsec_tx_sc *tx_sc = &secy->tx_sc; + + return tx_sc->send_sci || + (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); +} +struct net_device *macsec_get_real_dev(const struct net_device *dev); +bool macsec_netdev_is_offloaded(struct net_device *dev); + +static inline void *macsec_netdev_priv(const struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_VLAN_8021Q) + if (is_vlan_dev(dev)) + return netdev_priv(vlan_dev_priv(dev)->real_dev); +#endif + return netdev_priv(dev); +} + +static inline u64 sci_to_cpu(sci_t sci) +{ + return be64_to_cpu((__force __be64)sci); +} #endif /* _NET_MACSEC_H_ */ diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h new file mode 100644 index 000000000000..3ce56a816425 --- /dev/null +++ b/include/net/mana/gdma.h @@ -0,0 +1,913 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _GDMA_H +#define _GDMA_H + +#include <linux/dma-mapping.h> +#include <linux/netdevice.h> + +#include "shm_channel.h" + +#define GDMA_STATUS_MORE_ENTRIES 0x00000105 + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. 
+ */ + +enum gdma_request_type { + GDMA_VERIFY_VF_DRIVER_VERSION = 1, + GDMA_QUERY_MAX_RESOURCES = 2, + GDMA_LIST_DEVICES = 3, + GDMA_REGISTER_DEVICE = 4, + GDMA_DEREGISTER_DEVICE = 5, + GDMA_GENERATE_TEST_EQE = 10, + GDMA_CREATE_QUEUE = 12, + GDMA_DISABLE_QUEUE = 13, + GDMA_ALLOCATE_RESOURCE_RANGE = 22, + GDMA_DESTROY_RESOURCE_RANGE = 24, + GDMA_CREATE_DMA_REGION = 25, + GDMA_DMA_REGION_ADD_PAGES = 26, + GDMA_DESTROY_DMA_REGION = 27, + GDMA_CREATE_PD = 29, + GDMA_DESTROY_PD = 30, + GDMA_CREATE_MR = 31, + GDMA_DESTROY_MR = 32, + GDMA_QUERY_HWC_TIMEOUT = 84, /* 0x54 */ +}; + +#define GDMA_RESOURCE_DOORBELL_PAGE 27 + +enum gdma_queue_type { + GDMA_INVALID_QUEUE, + GDMA_SQ, + GDMA_RQ, + GDMA_CQ, + GDMA_EQ, +}; + +enum gdma_work_request_flags { + GDMA_WR_NONE = 0, + GDMA_WR_OOB_IN_SGL = BIT(0), + GDMA_WR_PAD_BY_SGE0 = BIT(1), +}; + +enum gdma_eqe_type { + GDMA_EQE_COMPLETION = 3, + GDMA_EQE_TEST_EVENT = 64, + GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, + GDMA_EQE_HWC_INIT_DATA = 130, + GDMA_EQE_HWC_INIT_DONE = 131, + GDMA_EQE_HWC_SOC_RECONFIG = 132, + GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, + GDMA_EQE_HWC_SOC_SERVICE = 134, + GDMA_EQE_RNIC_QP_FATAL = 176, +}; + +enum { + GDMA_DEVICE_NONE = 0, + GDMA_DEVICE_HWC = 1, + GDMA_DEVICE_MANA = 2, + GDMA_DEVICE_MANA_IB = 3, +}; + +enum gdma_service_type { + GDMA_SERVICE_TYPE_NONE = 0, + GDMA_SERVICE_TYPE_RDMA_SUSPEND = 1, + GDMA_SERVICE_TYPE_RDMA_RESUME = 2, +}; + +struct mana_service_work { + struct work_struct work; + struct gdma_dev *gdma_dev; + enum gdma_service_type event; +}; + +struct gdma_resource { + /* Protect the bitmap */ + spinlock_t lock; + + /* The bitmap size in bits. */ + u32 size; + + /* The bitmap tracks the resources. */ + unsigned long *map; +}; + +union gdma_doorbell_entry { + u64 as_uint64; + + struct { + u64 id : 24; + u64 reserved : 8; + u64 tail_ptr : 31; + u64 arm : 1; + } cq; + + struct { + u64 id : 24; + u64 wqe_cnt : 8; + u64 tail_ptr : 32; + } rq; + + struct { + u64 id : 24; + u64 reserved : 8; + u64 tail_ptr : 32; + } sq; + + struct { + u64 id : 16; + u64 reserved : 16; + u64 tail_ptr : 31; + u64 arm : 1; + } eq; +}; /* HW DATA */ + +struct gdma_msg_hdr { + u32 hdr_type; + u32 msg_type; + u16 msg_version; + u16 hwc_msg_id; + u32 msg_size; +}; /* HW DATA */ + +struct gdma_dev_id { + union { + struct { + u16 type; + u16 instance; + }; + + u32 as_uint32; + }; +}; /* HW DATA */ + +struct gdma_req_hdr { + struct gdma_msg_hdr req; + struct gdma_msg_hdr resp; /* The expected response */ + struct gdma_dev_id dev_id; + u32 activity_id; +}; /* HW DATA */ + +struct gdma_resp_hdr { + struct gdma_msg_hdr response; + struct gdma_dev_id dev_id; + u32 activity_id; + u32 status; + u32 reserved; +}; /* HW DATA */ + +struct gdma_general_req { + struct gdma_req_hdr hdr; +}; /* HW DATA */ + +#define GDMA_MESSAGE_V1 1 +#define GDMA_MESSAGE_V2 2 +#define GDMA_MESSAGE_V3 3 +#define GDMA_MESSAGE_V4 4 + +struct gdma_general_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +#define GDMA_STANDARD_HEADER_TYPE 0 + +static inline void mana_gd_init_req_hdr(struct gdma_req_hdr *hdr, u32 code, + u32 req_size, u32 resp_size) +{ + hdr->req.hdr_type = GDMA_STANDARD_HEADER_TYPE; + hdr->req.msg_type = code; + hdr->req.msg_version = GDMA_MESSAGE_V1; + hdr->req.msg_size = req_size; + + hdr->resp.hdr_type = GDMA_STANDARD_HEADER_TYPE; + hdr->resp.msg_type = code; + hdr->resp.msg_version = GDMA_MESSAGE_V1; + hdr->resp.msg_size = resp_size; +} + +/* The 16-byte struct is part of the GDMA work queue entry (WQE). 
*/ +struct gdma_sge { + u64 address; + u32 mem_key; + u32 size; +}; /* HW DATA */ + +struct gdma_wqe_request { + struct gdma_sge *sgl; + u32 num_sge; + + u32 inline_oob_size; + const void *inline_oob_data; + + u32 flags; + u32 client_data_unit; +}; + +enum gdma_page_type { + GDMA_PAGE_TYPE_4K, +}; + +#define GDMA_INVALID_DMA_REGION 0 + +struct gdma_mem_info { + struct device *dev; + + dma_addr_t dma_handle; + void *virt_addr; + u64 length; + + /* Allocated by the PF driver */ + u64 dma_region_handle; +}; + +#define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 + +struct gdma_dev { + struct gdma_context *gdma_context; + + struct gdma_dev_id dev_id; + + u32 pdid; + u32 doorbell; + u32 gpa_mkey; + + /* GDMA driver specific pointer */ + void *driver_data; + + struct auxiliary_device *adev; + bool is_suspended; + bool rdma_teardown; +}; + +/* MANA_PAGE_SIZE is the DMA unit */ +#define MANA_PAGE_SHIFT 12 +#define MANA_PAGE_SIZE BIT(MANA_PAGE_SHIFT) +#define MANA_PAGE_ALIGN(x) ALIGN((x), MANA_PAGE_SIZE) +#define MANA_PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), MANA_PAGE_SIZE) +#define MANA_PFN(a) ((a) >> MANA_PAGE_SHIFT) + +/* Required by HW */ +#define MANA_MIN_QSIZE MANA_PAGE_SIZE + +#define GDMA_CQE_SIZE 64 +#define GDMA_EQE_SIZE 16 +#define GDMA_MAX_SQE_SIZE 512 +#define GDMA_MAX_RQE_SIZE 256 + +#define GDMA_COMP_DATA_SIZE 0x3C + +#define GDMA_EVENT_DATA_SIZE 0xC + +/* The WQE size must be a multiple of the Basic Unit, which is 32 bytes. */ +#define GDMA_WQE_BU_SIZE 32 + +#define INVALID_PDID UINT_MAX +#define INVALID_DOORBELL UINT_MAX +#define INVALID_MEM_KEY UINT_MAX +#define INVALID_QUEUE_ID UINT_MAX +#define INVALID_PCI_MSIX_INDEX UINT_MAX + +struct gdma_comp { + u32 cqe_data[GDMA_COMP_DATA_SIZE / 4]; + u32 wq_num; + bool is_sq; +}; + +struct gdma_event { + u32 details[GDMA_EVENT_DATA_SIZE / 4]; + u8 type; +}; + +struct gdma_queue; + +struct mana_eq { + struct gdma_queue *eq; + struct dentry *mana_eq_debugfs; +}; + +typedef void gdma_eq_callback(void *context, struct gdma_queue *q, + struct gdma_event *e); + +typedef void gdma_cq_callback(void *context, struct gdma_queue *q); + +/* The 'head' is the producer index. For SQ/RQ, when the driver posts a WQE + * (Note: the WQE size must be a multiple of the 32-byte Basic Unit), the + * driver increases the 'head' in BUs rather than in bytes, and notifies + * the HW of the updated head. For EQ/CQ, the driver uses the 'head' to track + * the HW head, and increases the 'head' by 1 for every processed EQE/CQE. + * + * The 'tail' is the consumer index for SQ/RQ. After the CQE of the SQ/RQ is + * processed, the driver increases the 'tail' to indicate that WQEs have + * been consumed by the HW, so the driver can post new WQEs into the SQ/RQ. + * + * The driver doesn't use the 'tail' for EQ/CQ, because the driver ensures + * that the EQ/CQ is big enough so they can't overflow, and the driver uses + * the owner bits mechanism to detect if the queue has become empty. + */ +struct gdma_queue { + struct gdma_dev *gdma_dev; + + enum gdma_queue_type type; + u32 id; + + struct gdma_mem_info mem_info; + + void *queue_mem_ptr; + u32 queue_size; + + bool monitor_avl_buf; + + u32 head; + u32 tail; + struct list_head entry; + + /* Extra fields specific to EQ/CQ. 
*/ + union { + struct { + bool disable_needed; + + gdma_eq_callback *callback; + void *context; + + unsigned int msix_index; + + u32 log2_throttle_limit; + } eq; + + struct { + gdma_cq_callback *callback; + void *context; + + struct gdma_queue *parent; /* For CQ/EQ relationship */ + } cq; + }; +}; + +struct gdma_queue_spec { + enum gdma_queue_type type; + bool monitor_avl_buf; + unsigned int queue_size; + + /* Extra fields specific to EQ/CQ. */ + union { + struct { + gdma_eq_callback *callback; + void *context; + + unsigned long log2_throttle_limit; + unsigned int msix_index; + } eq; + + struct { + gdma_cq_callback *callback; + void *context; + + struct gdma_queue *parent_eq; + + } cq; + }; +}; + +#define MANA_IRQ_NAME_SZ 32 + +struct gdma_irq_context { + void (*handler)(void *arg); + /* Protect the eq_list */ + spinlock_t lock; + struct list_head eq_list; + char name[MANA_IRQ_NAME_SZ]; +}; + +struct gdma_context { + struct device *dev; + struct dentry *mana_pci_debugfs; + + /* Per-vPort max number of queues */ + unsigned int max_num_queues; + unsigned int max_num_msix; + unsigned int num_msix_usable; + struct gdma_irq_context *irq_contexts; + + /* L2 MTU */ + u16 adapter_mtu; + + /* This maps a CQ index to the queue structure. */ + unsigned int max_num_cqs; + struct gdma_queue **cq_table; + + /* Protect eq_test_event and test_event_eq_id */ + struct mutex eq_test_event_mutex; + struct completion eq_test_event; + u32 test_event_eq_id; + + bool is_pf; + phys_addr_t bar0_pa; + void __iomem *bar0_va; + void __iomem *shm_base; + void __iomem *db_page_base; + phys_addr_t phys_db_page_base; + u32 db_page_size; + int numa_node; + + /* Shared memory channel (used to bootstrap HWC) */ + struct shm_channel shm_channel; + + /* Hardware communication channel (HWC) */ + struct gdma_dev hwc; + + /* Azure network adapter */ + struct gdma_dev mana; + + /* Azure RDMA adapter */ + struct gdma_dev mana_ib; + + u64 pf_cap_flags1; + + struct workqueue_struct *service_wq; +}; + +static inline bool mana_gd_is_mana(struct gdma_dev *gd) +{ + return gd->dev_id.type == GDMA_DEVICE_MANA; +} + +static inline bool mana_gd_is_hwc(struct gdma_dev *gd) +{ + return gd->dev_id.type == GDMA_DEVICE_HWC; +} + +u8 *mana_gd_get_wqe_ptr(const struct gdma_queue *wq, u32 wqe_offset); +u32 mana_gd_wq_avail_space(struct gdma_queue *wq); + +int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq); + +int mana_gd_create_hwc_queue(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +int mana_gd_create_mana_eq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue); + +int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe); + +void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit); + +struct gdma_wqe { + u32 reserved :24; + u32 last_vbytes :8; + + union { + u32 flags; + + struct { + u32 num_sge :8; + u32 inline_oob_size_div4:3; + u32 client_oob_in_sgl :1; + u32 reserved1 :4; + u32 client_data_unit :14; + u32 reserved2 :2; + }; + }; +}; /* HW DATA */ + +#define INLINE_OOB_SMALL_SIZE 8 +#define INLINE_OOB_LARGE_SIZE 24 + +#define MAX_TX_WQE_SIZE 512 +#define MAX_RX_WQE_SIZE 256 + +#define MAX_TX_WQE_SGL_ENTRIES ((GDMA_MAX_SQE_SIZE - \ + sizeof(struct gdma_sge) - INLINE_OOB_SMALL_SIZE) / \ + sizeof(struct gdma_sge)) + 
+#define MAX_RX_WQE_SGL_ENTRIES ((GDMA_MAX_RQE_SIZE - \ + sizeof(struct gdma_sge)) / sizeof(struct gdma_sge)) + +struct gdma_cqe { + u32 cqe_data[GDMA_COMP_DATA_SIZE / 4]; + + union { + u32 as_uint32; + + struct { + u32 wq_num : 24; + u32 is_sq : 1; + u32 reserved : 4; + u32 owner_bits : 3; + }; + } cqe_info; +}; /* HW DATA */ + +#define GDMA_CQE_OWNER_BITS 3 + +#define GDMA_CQE_OWNER_MASK ((1 << GDMA_CQE_OWNER_BITS) - 1) + +#define SET_ARM_BIT 1 + +#define GDMA_EQE_OWNER_BITS 3 + +union gdma_eqe_info { + u32 as_uint32; + + struct { + u32 type : 8; + u32 reserved1 : 8; + u32 client_id : 2; + u32 reserved2 : 11; + u32 owner_bits : 3; + }; +}; /* HW DATA */ + +#define GDMA_EQE_OWNER_MASK ((1 << GDMA_EQE_OWNER_BITS) - 1) +#define INITIALIZED_OWNER_BIT(log2_num_entries) (1UL << (log2_num_entries)) + +struct gdma_eqe { + u32 details[GDMA_EVENT_DATA_SIZE / 4]; + u32 eqe_info; +}; /* HW DATA */ + +#define GDMA_REG_DB_PAGE_OFFSET 8 +#define GDMA_REG_DB_PAGE_SIZE 0x10 +#define GDMA_REG_SHM_OFFSET 0x18 + +#define GDMA_PF_REG_DB_PAGE_SIZE 0xD0 +#define GDMA_PF_REG_DB_PAGE_OFF 0xC8 +#define GDMA_PF_REG_SHM_OFF 0x70 + +#define GDMA_SRIOV_REG_CFG_BASE_OFF 0x108 + +#define MANA_PF_DEVICE_ID 0x00B9 +#define MANA_VF_DEVICE_ID 0x00BA + +struct gdma_posted_wqe_info { + u32 wqe_size_in_bu; +}; + +/* GDMA_GENERATE_TEST_EQE */ +struct gdma_generate_test_event_req { + struct gdma_req_hdr hdr; + u32 queue_index; +}; /* HW DATA */ + +/* GDMA_VERIFY_VF_DRIVER_VERSION */ +enum { + GDMA_PROTOCOL_V1 = 1, + GDMA_PROTOCOL_FIRST = GDMA_PROTOCOL_V1, + GDMA_PROTOCOL_LAST = GDMA_PROTOCOL_V1, +}; + +#define GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT BIT(0) + +/* Advertise to the NIC firmware: the NAPI work_done variable race is fixed, + * so the driver is able to reliably support features like busy_poll. 
+ */ +#define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2) +#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3) +#define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4) +#define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5) + +/* Driver can handle holes (zeros) in the device list */ +#define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) + +#define GDMA_DRV_CAP_FLAGS1 \ + (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ + GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ + GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ + GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ + GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP) + +#define GDMA_DRV_CAP_FLAGS2 0 + +#define GDMA_DRV_CAP_FLAGS3 0 + +#define GDMA_DRV_CAP_FLAGS4 0 + +struct gdma_verify_ver_req { + struct gdma_req_hdr hdr; + + /* Mandatory fields required for protocol establishment */ + u64 protocol_ver_min; + u64 protocol_ver_max; + + /* Gdma Driver Capability Flags */ + u64 gd_drv_cap_flags1; + u64 gd_drv_cap_flags2; + u64 gd_drv_cap_flags3; + u64 gd_drv_cap_flags4; + + /* Advisory fields */ + u64 drv_ver; + u32 os_type; /* Linux = 0x10; Windows = 0x20; Other = 0x30 */ + u32 reserved; + u32 os_ver_major; + u32 os_ver_minor; + u32 os_ver_build; + u32 os_ver_platform; + u64 reserved_2; + u8 os_ver_str1[128]; + u8 os_ver_str2[128]; + u8 os_ver_str3[128]; + u8 os_ver_str4[128]; +}; /* HW DATA */ + +struct gdma_verify_ver_resp { + struct gdma_resp_hdr hdr; + u64 gdma_protocol_ver; + u64 pf_cap_flags1; + u64 pf_cap_flags2; + u64 pf_cap_flags3; + u64 pf_cap_flags4; +}; /* HW DATA */ + +/* GDMA_QUERY_MAX_RESOURCES */ +struct gdma_query_max_resources_resp { + struct gdma_resp_hdr hdr; + u32 status; + u32 max_sq; + u32 max_rq; + u32 max_cq; + u32 max_eq; + u32 max_db; + u32 max_mst; + u32 max_cq_mod_ctx; + u32 max_mod_cq; + u32 max_msix; +}; /* HW DATA */ + +/* GDMA_LIST_DEVICES */ +#define GDMA_DEV_LIST_SIZE 64 +struct gdma_list_devices_resp { + struct gdma_resp_hdr hdr; + u32 num_of_devs; + u32 reserved; + struct gdma_dev_id devs[GDMA_DEV_LIST_SIZE]; +}; /* HW DATA */ + +/* GDMA_REGISTER_DEVICE */ +struct gdma_register_device_resp { + struct gdma_resp_hdr hdr; + u32 pdid; + u32 gpa_mkey; + u32 db_id; +}; /* HW DATA */ + +struct gdma_allocate_resource_range_req { + struct gdma_req_hdr hdr; + u32 resource_type; + u32 num_resources; + u32 alignment; + u32 allocated_resources; +}; + +struct gdma_allocate_resource_range_resp { + struct gdma_resp_hdr hdr; + u32 allocated_resources; +}; + +struct gdma_destroy_resource_range_req { + struct gdma_req_hdr hdr; + u32 resource_type; + u32 num_resources; + u32 allocated_resources; +}; + +/* GDMA_CREATE_QUEUE */ +struct gdma_create_queue_req { + struct gdma_req_hdr hdr; + u32 type; + u32 reserved1; + u32 pdid; + u32 doolbell_id; + u64 gdma_region; + u32 reserved2; + u32 queue_size; + u32 log2_throttle_limit; + u32 eq_pci_msix_index; + u32 cq_mod_ctx_id; + u32 cq_parent_eq_id; + u8 rq_drop_on_overrun; + u8 rq_err_on_wqe_overflow; + u8 rq_chain_rec_wqes; + u8 sq_hw_db; + u32 reserved3; +}; /* HW DATA */ + +struct gdma_create_queue_resp { + struct gdma_resp_hdr hdr; + u32 queue_index; +}; /* HW DATA */ + +/* GDMA_DISABLE_QUEUE */ +struct gdma_disable_queue_req { + struct gdma_req_hdr hdr; + u32 type; + u32 queue_index; + u32 alloc_res_id_on_creation; +}; /* HW DATA */ + +/* GDMA_QUERY_HWC_TIMEOUT */ +struct gdma_query_hwc_timeout_req { + struct gdma_req_hdr hdr; + u32 timeout_ms; + u32 reserved; +}; + +struct gdma_query_hwc_timeout_resp { + struct gdma_resp_hdr hdr; + u32 timeout_ms; + u32 reserved; +}; 
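[Editor's aside] To make the version-negotiation flow concrete: a sketch of what a VF driver does with gdma_verify_ver_req/resp at probe time. The real entry point is mana_gd_verify_vf_version(), declared below; this version is illustrative only. mana_gd_init_req_hdr() and the GDMA_VERIFY_VF_DRIVER_VERSION request code are assumed to be defined earlier in the driver, and gdma_resp_hdr is assumed to carry a status field.

/* Illustrative sketch of protocol-version negotiation over the HWC. */
static int example_verify_version(struct gdma_context *gc)
{
	struct gdma_verify_ver_resp resp = {};
	struct gdma_verify_ver_req req = {};
	int err;

	/* Assumed helper: fills the common gdma_req_hdr fields */
	mana_gd_init_req_hdr(&req.hdr, GDMA_VERIFY_VF_DRIVER_VERSION,
			     sizeof(req), sizeof(resp));

	/* Offer the whole range this driver implements; the PF picks one */
	req.protocol_ver_min = GDMA_PROTOCOL_FIRST;
	req.protocol_ver_max = GDMA_PROTOCOL_LAST;
	req.gd_drv_cap_flags1 = GDMA_DRV_CAP_FLAGS1;

	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
	if (err || resp.hdr.status)
		return err ? err : -EPROTO;

	/* Remember what the PF advertised back */
	gc->pf_cap_flags1 = resp.pf_cap_flags1;
	return 0;
}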
+ +enum gdma_mr_access_flags { + GDMA_ACCESS_FLAG_LOCAL_READ = BIT_ULL(0), + GDMA_ACCESS_FLAG_LOCAL_WRITE = BIT_ULL(1), + GDMA_ACCESS_FLAG_REMOTE_READ = BIT_ULL(2), + GDMA_ACCESS_FLAG_REMOTE_WRITE = BIT_ULL(3), + GDMA_ACCESS_FLAG_REMOTE_ATOMIC = BIT_ULL(4), +}; + +/* GDMA_CREATE_DMA_REGION */ +struct gdma_create_dma_region_req { + struct gdma_req_hdr hdr; + + /* The total size of the DMA region */ + u64 length; + + /* The offset in the first page */ + u32 offset_in_page; + + /* enum gdma_page_type */ + u32 gdma_page_type; + + /* The total number of pages */ + u32 page_count; + + /* If page_addr_list_len is smaller than page_count, + * the remaining page addresses will be added via the + * message GDMA_DMA_REGION_ADD_PAGES. + */ + u32 page_addr_list_len; + u64 page_addr_list[]; +}; /* HW DATA */ + +struct gdma_create_dma_region_resp { + struct gdma_resp_hdr hdr; + u64 dma_region_handle; +}; /* HW DATA */ + +/* GDMA_DMA_REGION_ADD_PAGES */ +struct gdma_dma_region_add_pages_req { + struct gdma_req_hdr hdr; + + u64 dma_region_handle; + + u32 page_addr_list_len; + u32 reserved3; + + u64 page_addr_list[]; +}; /* HW DATA */ + +/* GDMA_DESTROY_DMA_REGION */ +struct gdma_destroy_dma_region_req { + struct gdma_req_hdr hdr; + + u64 dma_region_handle; +}; /* HW DATA */ + +enum gdma_pd_flags { + GDMA_PD_FLAG_INVALID = 0, + GDMA_PD_FLAG_ALLOW_GPA_MR = 1, +}; + +struct gdma_create_pd_req { + struct gdma_req_hdr hdr; + enum gdma_pd_flags flags; + u32 reserved; +};/* HW DATA */ + +struct gdma_create_pd_resp { + struct gdma_resp_hdr hdr; + u64 pd_handle; + u32 pd_id; + u32 reserved; +};/* HW DATA */ + +struct gdma_destroy_pd_req { + struct gdma_req_hdr hdr; + u64 pd_handle; +};/* HW DATA */ + +struct gdma_destory_pd_resp { + struct gdma_resp_hdr hdr; +};/* HW DATA */ + +enum gdma_mr_type { + /* + * Guest Physical Address - MRs of this type allow access + * to any DMA-mapped memory using bus-logical address + */ + GDMA_MR_TYPE_GPA = 1, + /* Guest Virtual Address - MRs of this type allow access + * to memory mapped by PTEs associated with this MR using a virtual + * address that is set up in the MST + */ + GDMA_MR_TYPE_GVA = 2, + /* Guest zero-based address MRs */ + GDMA_MR_TYPE_ZBVA = 4, +}; + +struct gdma_create_mr_params { + u64 pd_handle; + enum gdma_mr_type mr_type; + union { + struct { + u64 dma_region_handle; + u64 virtual_address; + enum gdma_mr_access_flags access_flags; + } gva; + struct { + u64 dma_region_handle; + enum gdma_mr_access_flags access_flags; + } zbva; + }; +}; + +struct gdma_create_mr_request { + struct gdma_req_hdr hdr; + u64 pd_handle; + enum gdma_mr_type mr_type; + u32 reserved_1; + + union { + struct { + u64 dma_region_handle; + u64 virtual_address; + enum gdma_mr_access_flags access_flags; + } gva; + struct { + u64 dma_region_handle; + enum gdma_mr_access_flags access_flags; + } zbva; + }; + u32 reserved_2; +};/* HW DATA */ + +struct gdma_create_mr_response { + struct gdma_resp_hdr hdr; + u64 mr_handle; + u32 lkey; + u32 rkey; +};/* HW DATA */ + +struct gdma_destroy_mr_request { + struct gdma_req_hdr hdr; + u64 mr_handle; +};/* HW DATA */ + +struct gdma_destroy_mr_response { + struct gdma_resp_hdr hdr; +};/* HW DATA */ + +int mana_gd_verify_vf_version(struct pci_dev *pdev); + +int mana_gd_register_device(struct gdma_dev *gd); +int mana_gd_deregister_device(struct gdma_dev *gd); + +int mana_gd_post_work_request(struct gdma_queue *wq, + const struct gdma_wqe_request *wqe_req, + struct gdma_posted_wqe_info *wqe_info); + +int mana_gd_post_and_ring(struct gdma_queue *queue, + const 
struct gdma_wqe_request *wqe, + struct gdma_posted_wqe_info *wqe_info); + +int mana_gd_alloc_res_map(u32 res_avail, struct gdma_resource *r); +void mana_gd_free_res_map(struct gdma_resource *r); + +void mana_gd_wq_ring_doorbell(struct gdma_context *gc, + struct gdma_queue *queue); + +int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, + struct gdma_mem_info *gmi); + +void mana_gd_free_memory(struct gdma_mem_info *gmi); + +int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, + u32 resp_len, void *resp); + +int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); +void mana_register_debugfs(void); +void mana_unregister_debugfs(void); + +int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event); + +#endif /* _GDMA_H */ diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h new file mode 100644 index 000000000000..83cf93338eb3 --- /dev/null +++ b/include/net/mana/hw_channel.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _HW_CHANNEL_H +#define _HW_CHANNEL_H + +#define DEFAULT_LOG2_THROTTLING_FOR_ERROR_EQ 4 + +#define HW_CHANNEL_MAX_REQUEST_SIZE 0x1000 +#define HW_CHANNEL_MAX_RESPONSE_SIZE 0x1000 + +#define HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH 1 + +#define HWC_INIT_DATA_CQID 1 +#define HWC_INIT_DATA_RQID 2 +#define HWC_INIT_DATA_SQID 3 +#define HWC_INIT_DATA_QUEUE_DEPTH 4 +#define HWC_INIT_DATA_MAX_REQUEST 5 +#define HWC_INIT_DATA_MAX_RESPONSE 6 +#define HWC_INIT_DATA_MAX_NUM_CQS 7 +#define HWC_INIT_DATA_PDID 8 +#define HWC_INIT_DATA_GPA_MKEY 9 +#define HWC_INIT_DATA_PF_DEST_RQ_ID 10 +#define HWC_INIT_DATA_PF_DEST_CQ_ID 11 + +#define HWC_DATA_CFG_HWC_TIMEOUT 1 + +#define HW_CHANNEL_WAIT_RESOURCE_TIMEOUT_MS 30000 + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ + +union hwc_init_eq_id_db { + u32 as_uint32; + + struct { + u32 eq_id : 16; + u32 doorbell : 16; + }; +}; /* HW DATA */ + +union hwc_init_type_data { + u32 as_uint32; + + struct { + u32 value : 24; + u32 type : 8; + }; +}; /* HW DATA */ + +union hwc_init_soc_service_type { + u32 as_uint32; + + struct { + u32 value : 28; + u32 type : 4; + }; +}; /* HW DATA */ + +struct hwc_rx_oob { + u32 type : 6; + u32 eom : 1; + u32 som : 1; + u32 vendor_err : 8; + u32 reserved1 : 16; + + u32 src_virt_wq : 24; + u32 src_vfid : 8; + + u32 reserved2; + + union { + u32 wqe_addr_low; + u32 wqe_offset; + }; + + u32 wqe_addr_high; + + u32 client_data_unit : 14; + u32 reserved3 : 18; + + u32 tx_oob_data_size; + + u32 chunk_offset : 21; + u32 reserved4 : 11; +}; /* HW DATA */ + +struct hwc_tx_oob { + u32 reserved1; + + u32 reserved2; + + u32 vrq_id : 24; + u32 dest_vfid : 8; + + u32 vrcq_id : 24; + u32 reserved3 : 8; + + u32 vscq_id : 24; + u32 loopback : 1; + u32 lso_override: 1; + u32 dest_pf : 1; + u32 reserved4 : 5; + + u32 vsq_id : 24; + u32 reserved5 : 8; +}; /* HW DATA */ + +struct hwc_work_request { + void *buf_va; + void *buf_sge_addr; + u32 buf_len; + u32 msg_size; + + struct gdma_wqe_request wqe_req; + struct hwc_tx_oob tx_oob; + + struct gdma_sge sge; +}; +
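[Editor's aside] Before the in-flight buffer structure below, a small illustration of how the HWC_INIT_DATA_* codes and the hwc_init_type_data union above appear to be consumed: each initialization EQE carries one such word, with the 8-bit type selecting which setup parameter the low 24 bits hold. The example_ helper is hypothetical.

/* Sketch: decode one HWC initialization data word. */
static void example_parse_init_data(u32 word, u16 *queue_depth,
				    u32 *max_req_size)
{
	union hwc_init_type_data data;

	data.as_uint32 = word;

	switch (data.type) {
	case HWC_INIT_DATA_QUEUE_DEPTH:
		*queue_depth = (u16)data.value;
		break;
	case HWC_INIT_DATA_MAX_REQUEST:
		*max_req_size = data.value;
		break;
	/* ...the other HWC_INIT_DATA_* types decode the same way */
	}
}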
+/* hwc_dma_buf represents the array of in-flight WQEs. + * The GDMA-mapped memory in mem_info is partitioned among the in-flight + * WQEs. The number of WQEs is determined by the number of in-flight + * messages. + */ +struct hwc_dma_buf { + struct gdma_mem_info mem_info; + + u32 gpa_mkey; + + u32 num_reqs; + struct hwc_work_request reqs[] __counted_by(num_reqs); +}; + +typedef void hwc_rx_event_handler_t(void *ctx, u32 gdma_rxq_id, + const struct hwc_rx_oob *rx_oob); + +typedef void hwc_tx_event_handler_t(void *ctx, u32 gdma_txq_id, + const struct hwc_rx_oob *rx_oob); + +struct hwc_cq { + struct hw_channel_context *hwc; + + struct gdma_queue *gdma_cq; + struct gdma_queue *gdma_eq; + struct gdma_comp *comp_buf; + u16 queue_depth; + + hwc_rx_event_handler_t *rx_event_handler; + void *rx_event_ctx; + + hwc_tx_event_handler_t *tx_event_handler; + void *tx_event_ctx; +}; + +struct hwc_wq { + struct hw_channel_context *hwc; + + struct gdma_queue *gdma_wq; + struct hwc_dma_buf *msg_buf; + u16 queue_depth; + + struct hwc_cq *hwc_cq; +}; + +struct hwc_caller_ctx { + struct completion comp_event; + void *output_buf; + u32 output_buflen; + + u32 error; /* Linux error code */ + u32 status_code; +}; + +struct hw_channel_context { + struct gdma_dev *gdma_dev; + struct device *dev; + + u16 num_inflight_msg; + u32 max_req_msg_size; + + u16 hwc_init_q_depth_max; + u32 hwc_init_max_req_msg_size; + u32 hwc_init_max_resp_msg_size; + + struct completion hwc_init_eqe_comp; + + struct hwc_wq *rxq; + struct hwc_wq *txq; + struct hwc_cq *cq; + + struct semaphore sema; + struct gdma_resource inflight_msg_res; + + u32 pf_dest_vrq_id; + u32 pf_dest_vrcq_id; + u32 hwc_timeout; + + struct hwc_caller_ctx *caller_ctx; +}; + +int mana_hwc_create_channel(struct gdma_context *gc); +void mana_hwc_destroy_channel(struct gdma_context *gc); + +int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, + const void *req, u32 resp_len, void *resp); + +#endif /* _HW_CHANNEL_H */ diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h new file mode 100644 index 000000000000..9abb66461211 --- /dev/null +++ b/include/net/mana/mana.h @@ -0,0 +1,838 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _MANA_H +#define _MANA_H + +#include <net/xdp.h> + +#include "gdma.h" +#include "hw_channel.h" + +/* Microsoft Azure Network Adapter (MANA)'s definitions + * + * Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ + +/* MANA protocol version */ +#define MANA_MAJOR_VERSION 0 +#define MANA_MINOR_VERSION 1 +#define MANA_MICRO_VERSION 1 + +typedef u64 mana_handle_t; +#define INVALID_MANA_HANDLE ((mana_handle_t)-1) + +enum TRI_STATE { + TRI_STATE_UNKNOWN = -1, + TRI_STATE_FALSE = 0, + TRI_STATE_TRUE = 1 +}; + +/* The number of entries in the hardware indirection table must be a power of 2 */ +#define MANA_INDIRECT_TABLE_MAX_SIZE 512 +#define MANA_INDIRECT_TABLE_DEF_SIZE 64 + +/* The Toeplitz hash key's length in bytes: should be a multiple of 8 */ +#define MANA_HASH_KEY_SIZE 40 + +#define COMP_ENTRY_SIZE 64 + +/* This max value for RX buffers is derived from __alloc_page()'s max page + * allocation calculation. It allows at most 2^(MAX_ORDER - 1) pages; RX + * buffer sizes beyond this value are rejected by the __alloc_page() call. + */ +#define MAX_RX_BUFFERS_PER_QUEUE 8192 +#define DEF_RX_BUFFERS_PER_QUEUE 1024 +#define MIN_RX_BUFFERS_PER_QUEUE 128
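[Editor's aside] As a usage note on the queue-depth bounds above and below: a driver accepting a configurable ring size (for example from ethtool) would reduce it to these limits along the following lines. This is a hedged sketch; the helper name is hypothetical and clamp_t() comes from linux/minmax.h.

/* Sketch: derive a valid per-queue RX buffer count from a request. */
static u32 example_rx_buffers_per_queue(u32 requested)
{
	if (!requested)
		return DEF_RX_BUFFERS_PER_QUEUE;

	return clamp_t(u32, requested, MIN_RX_BUFFERS_PER_QUEUE,
		       MAX_RX_BUFFERS_PER_QUEUE);
}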
+/* This max value for TX buffers is derived from the maximum number of + * allocatable pages supported on the host per guest, as determined through + * testing. TX buffer sizes beyond this value are rejected by the hardware. + */ +#define MAX_TX_BUFFERS_PER_QUEUE 16384 +#define DEF_TX_BUFFERS_PER_QUEUE 256 +#define MIN_TX_BUFFERS_PER_QUEUE 128 + +#define EQ_SIZE (8 * MANA_PAGE_SIZE) + +#define LOG2_EQ_THROTTLE 3 + +#define MAX_PORTS_IN_MANA_DEV 256 + +/* Update this count whenever the respective structures are changed */ +#define MANA_STATS_RX_COUNT 5 +#define MANA_STATS_TX_COUNT 11 + +struct mana_stats_rx { + u64 packets; + u64 bytes; + u64 xdp_drop; + u64 xdp_tx; + u64 xdp_redirect; + struct u64_stats_sync syncp; +}; + +struct mana_stats_tx { + u64 packets; + u64 bytes; + u64 xdp_xmit; + u64 tso_packets; + u64 tso_bytes; + u64 tso_inner_packets; + u64 tso_inner_bytes; + u64 short_pkt_fmt; + u64 long_pkt_fmt; + u64 csum_partial; + u64 mana_map_err; + struct u64_stats_sync syncp; +}; + +struct mana_txq { + struct gdma_queue *gdma_sq; + + union { + u32 gdma_txq_id; + struct { + u32 reserved1 : 10; + u32 vsq_frame : 14; + u32 reserved2 : 8; + }; + }; + + u16 vp_offset; + + struct net_device *ndev; + + /* The SKBs are sent to the HW and we are waiting for the CQEs. */ + struct sk_buff_head pending_skbs; + struct netdev_queue *net_txq; + + atomic_t pending_sends; + + bool napi_initialized; + + struct mana_stats_tx stats; +}; + +/* skb data and frags dma mappings */ +struct mana_skb_head { + /* GSO pkts may have 2 SGEs for the linear part */ + dma_addr_t dma_handle[MAX_SKB_FRAGS + 2]; + + u32 size[MAX_SKB_FRAGS + 2]; +}; + +#define MANA_HEADROOM sizeof(struct mana_skb_head) + +enum mana_tx_pkt_format { + MANA_SHORT_PKT_FMT = 0, + MANA_LONG_PKT_FMT = 1, +}; + +struct mana_tx_short_oob { + u32 pkt_fmt : 2; + u32 is_outer_ipv4 : 1; + u32 is_outer_ipv6 : 1; + u32 comp_iphdr_csum : 1; + u32 comp_tcp_csum : 1; + u32 comp_udp_csum : 1; + u32 supress_txcqe_gen : 1; + u32 vcq_num : 24; + + u32 trans_off : 10; /* Transport header offset */ + u32 vsq_frame : 14; + u32 short_vp_offset : 8; +}; /* HW DATA */ + +struct mana_tx_long_oob { + u32 is_encap : 1; + u32 inner_is_ipv6 : 1; + u32 inner_tcp_opt : 1; + u32 inject_vlan_pri_tag : 1; + u32 reserved1 : 12; + u32 pcp : 3; /* 802.1Q */ + u32 dei : 1; /* 802.1Q */ + u32 vlan_id : 12; /* 802.1Q */ + + u32 inner_frame_offset : 10; + u32 inner_ip_rel_offset : 6; + u32 long_vp_offset : 12; + u32 reserved2 : 4; + + u32 reserved3; + u32 reserved4; +}; /* HW DATA */ + +struct mana_tx_oob { + struct mana_tx_short_oob s_oob; + struct mana_tx_long_oob l_oob; +}; /* HW DATA */ + +enum mana_cq_type { + MANA_CQ_TYPE_RX, + MANA_CQ_TYPE_TX, +}; + +enum mana_cqe_type { + CQE_INVALID = 0, + CQE_RX_OKAY = 1, + CQE_RX_COALESCED_4 = 2, + CQE_RX_OBJECT_FENCE = 3, + CQE_RX_TRUNCATED = 4, + + CQE_TX_OKAY = 32, + CQE_TX_SA_DROP = 33, + CQE_TX_MTU_DROP = 34, + CQE_TX_INVALID_OOB = 35, + CQE_TX_INVALID_ETH_TYPE = 36, + CQE_TX_HDR_PROCESSING_ERROR = 37, + CQE_TX_VF_DISABLED = 38, + CQE_TX_VPORT_IDX_OUT_OF_RANGE = 39, + CQE_TX_VPORT_DISABLED = 40, + CQE_TX_VLAN_TAGGING_VIOLATION = 41, +}; + +#define MANA_CQE_COMPLETION 1 + +struct mana_cqe_header { + u32 cqe_type : 6; + u32 client_type : 2; + u32 vendor_err : 24; +}; /* HW DATA */ + +/* NDIS HASH Types */ +#define NDIS_HASH_IPV4 BIT(0) +#define NDIS_HASH_TCP_IPV4 BIT(1) +#define NDIS_HASH_UDP_IPV4 BIT(2) +#define NDIS_HASH_IPV6 BIT(3) +#define NDIS_HASH_TCP_IPV6 BIT(4) +#define NDIS_HASH_UDP_IPV6 BIT(5) +#define NDIS_HASH_IPV6_EX BIT(6) +#define NDIS_HASH_TCP_IPV6_EX BIT(7) +#define NDIS_HASH_UDP_IPV6_EX BIT(8) + +#define MANA_HASH_L3 (NDIS_HASH_IPV4 | NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX) +#define MANA_HASH_L4 \ + (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 | \ + NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX)
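[Editor's aside] The L3/L4 groupings above exist so RX code can tell the stack how strong the hardware hash is. A minimal sketch of that mapping, assuming rx_hashtype (from struct mana_rxcomp_oob, defined next) carries the NDIS_HASH_* bits; the helper name is hypothetical.

/* Sketch: report the hardware RSS hash with the right confidence level. */
static void example_report_rx_hash(struct sk_buff *skb, u32 pkt_hash,
				   u32 hashtype)
{
	if (hashtype & MANA_HASH_L4)
		skb_set_hash(skb, pkt_hash, PKT_HASH_TYPE_L4);
	else if (hashtype & MANA_HASH_L3)
		skb_set_hash(skb, pkt_hash, PKT_HASH_TYPE_L3);
}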
+ +struct mana_rxcomp_perpkt_info { + u32 pkt_len : 16; + u32 reserved1 : 16; + u32 reserved2; + u32 pkt_hash; +}; /* HW DATA */ + +#define MANA_RXCOMP_OOB_NUM_PPI 4 + +/* Receive completion OOB */ +struct mana_rxcomp_oob { + struct mana_cqe_header cqe_hdr; + + u32 rx_vlan_id : 12; + u32 rx_vlantag_present : 1; + u32 rx_outer_iphdr_csum_succeed : 1; + u32 rx_outer_iphdr_csum_fail : 1; + u32 reserved1 : 1; + u32 rx_hashtype : 9; + u32 rx_iphdr_csum_succeed : 1; + u32 rx_iphdr_csum_fail : 1; + u32 rx_tcp_csum_succeed : 1; + u32 rx_tcp_csum_fail : 1; + u32 rx_udp_csum_succeed : 1; + u32 rx_udp_csum_fail : 1; + u32 reserved2 : 1; + + struct mana_rxcomp_perpkt_info ppi[MANA_RXCOMP_OOB_NUM_PPI]; + + u32 rx_wqe_offset; +}; /* HW DATA */ + +struct mana_tx_comp_oob { + struct mana_cqe_header cqe_hdr; + + u32 tx_data_offset; + + u32 tx_sgl_offset : 5; + u32 tx_wqe_offset : 27; + + u32 reserved[12]; +}; /* HW DATA */ + +struct mana_rxq; + +#define CQE_POLLING_BUFFER 512 + +struct mana_cq { + struct gdma_queue *gdma_cq; + + /* Cache the CQ id (used to verify if each CQE comes to the right CQ). */ + u32 gdma_id; + + /* Type of the CQ: TX or RX */ + enum mana_cq_type type; + + /* Pointer to the mana_rxq that is pushing RX CQEs to the queue. + * Non-NULL if and only if type is MANA_CQ_TYPE_RX. + */ + struct mana_rxq *rxq; + + /* Pointer to the mana_txq that is pushing TX CQEs to the queue. + * Non-NULL if and only if type is MANA_CQ_TYPE_TX. + */ + struct mana_txq *txq; + + /* Buffer which the CQ handler can copy the CQEs into. */ + struct gdma_comp gdma_comp_buf[CQE_POLLING_BUFFER]; + + /* NAPI data */ + struct napi_struct napi; + int work_done; + int work_done_since_doorbell; + int budget; +}; + +struct mana_recv_buf_oob { + /* A valid GDMA work request representing the data buffer. */ + struct gdma_wqe_request wqe_req; + + void *buf_va; + bool from_pool; /* allocated from a page pool */ + + /* SGL of the buffer to be sent as part of the work request. */ + u32 num_sge; + struct gdma_sge sgl[MAX_RX_WQE_SGL_ENTRIES]; + + /* Required to store the result of mana_gd_post_work_request. + * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the + * work queue when the WQE is consumed. + */ + struct gdma_posted_wqe_info wqe_inf; +}; + +#define MANA_RXBUF_PAD (SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) \ + + ETH_HLEN) + +#define MANA_XDP_MTU_MAX (PAGE_SIZE - MANA_RXBUF_PAD - XDP_PACKET_HEADROOM) + +struct mana_rxq { + struct gdma_queue *gdma_rq; + /* Cache the gdma receive queue id */ + u32 gdma_id; + + /* Index of RQ in the vPort, not gdma receive queue id */ + u32 rxq_idx; + + u32 datasize; + u32 alloc_size; + u32 headroom; + + mana_handle_t rxobj; + + struct mana_cq rx_cq; + + struct completion fence_event; + + struct net_device *ndev; + + /* Total number of receive buffers to be allocated */ + u32 num_rx_buf; + + u32 buf_index; + + struct mana_stats_rx stats; + + struct bpf_prog __rcu *bpf_prog; + struct xdp_rxq_info xdp_rxq; + void *xdp_save_va; /* for reusing */ + bool xdp_flush; + int xdp_rc; /* XDP redirect return code */ + + struct page_pool *page_pool; + struct dentry *mana_rx_debugfs; + + /* MUST BE THE LAST MEMBER: + * Each receive buffer has an associated mana_recv_buf_oob. 
+ */ + struct mana_recv_buf_oob rx_oobs[] __counted_by(num_rx_buf); +}; + +struct mana_tx_qp { + struct mana_txq txq; + + struct mana_cq tx_cq; + + mana_handle_t tx_object; + + struct dentry *mana_tx_debugfs; +}; + +struct mana_ethtool_stats { + u64 stop_queue; + u64 wake_queue; + u64 hc_rx_discards_no_wqe; + u64 hc_rx_err_vport_disabled; + u64 hc_rx_bytes; + u64 hc_rx_ucast_pkts; + u64 hc_rx_ucast_bytes; + u64 hc_rx_bcast_pkts; + u64 hc_rx_bcast_bytes; + u64 hc_rx_mcast_pkts; + u64 hc_rx_mcast_bytes; + u64 hc_tx_err_gf_disabled; + u64 hc_tx_err_vport_disabled; + u64 hc_tx_err_inval_vportoffset_pkt; + u64 hc_tx_err_vlan_enforcement; + u64 hc_tx_err_eth_type_enforcement; + u64 hc_tx_err_sa_enforcement; + u64 hc_tx_err_sqpdid_enforcement; + u64 hc_tx_err_cqpdid_enforcement; + u64 hc_tx_err_mtu_violation; + u64 hc_tx_err_inval_oob; + u64 hc_tx_bytes; + u64 hc_tx_ucast_pkts; + u64 hc_tx_ucast_bytes; + u64 hc_tx_bcast_pkts; + u64 hc_tx_bcast_bytes; + u64 hc_tx_mcast_pkts; + u64 hc_tx_mcast_bytes; + u64 hc_tx_err_gdma; + u64 tx_cqe_err; + u64 tx_cqe_unknown_type; + u64 rx_coalesced_err; + u64 rx_cqe_unknown_type; +}; + +struct mana_context { + struct gdma_dev *gdma_dev; + + u16 num_ports; + u8 bm_hostmode; + + struct mana_eq *eqs; + struct dentry *mana_eqs_debugfs; + + struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; +}; + +struct mana_port_context { + struct mana_context *ac; + struct net_device *ndev; + + u8 mac_addr[ETH_ALEN]; + + enum TRI_STATE rss_state; + + mana_handle_t default_rxobj; + bool tx_shortform_allowed; + u16 tx_vp_offset; + + struct mana_tx_qp *tx_qp; + + /* Indirection Table for RX & TX. The values are queue indexes */ + u32 *indir_table; + u32 indir_table_sz; + + /* Indirection table containing RxObject Handles */ + mana_handle_t *rxobj_table; + + /* Hash key used by the NIC */ + u8 hashkey[MANA_HASH_KEY_SIZE]; + + /* This points to an array of num_queues of RQ pointers. */ + struct mana_rxq **rxqs; + + /* pre-allocated rx buffer array */ + void **rxbufs_pre; + dma_addr_t *das_pre; + int rxbpre_total; + u32 rxbpre_datasize; + u32 rxbpre_alloc_size; + u32 rxbpre_headroom; + + struct bpf_prog *bpf_prog; + + /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. 
*/ + unsigned int max_queues; + unsigned int num_queues; + + unsigned int rx_queue_size; + unsigned int tx_queue_size; + + mana_handle_t port_handle; + mana_handle_t pf_filter_handle; + + /* Mutex for sharing access to vport_use_count */ + struct mutex vport_mutex; + int vport_use_count; + + u16 port_idx; + + bool port_is_up; + bool port_st_save; /* Saved port state */ + + struct mana_ethtool_stats eth_stats; + + /* Debugfs */ + struct dentry *mana_port_debugfs; +}; + +netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev); +int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx, + bool update_hash, bool update_tab); + +int mana_alloc_queues(struct net_device *ndev); +int mana_attach(struct net_device *ndev); +int mana_detach(struct net_device *ndev, bool from_close); + +int mana_probe(struct gdma_dev *gd, bool resuming); +void mana_remove(struct gdma_dev *gd, bool suspending); + +int mana_rdma_probe(struct gdma_dev *gd); +void mana_rdma_remove(struct gdma_dev *gd); + +void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev); +int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames, + u32 flags); +u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq, + struct xdp_buff *xdp, void *buf_va, uint pkt_len); +struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); +void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); +int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); +void mana_query_gf_stats(struct mana_port_context *apc); +int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues); +void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); + +extern const struct ethtool_ops mana_ethtool_ops; +extern struct dentry *mana_debugfs_root; + +/* A CQ can be created not associated with any EQ */ +#define GDMA_CQ_NO_EQ 0xffff + +struct mana_obj_spec { + u32 queue_index; + u64 gdma_region; + u32 queue_size; + u32 attached_eq; + u32 modr_ctx_id; +}; + +enum mana_command_code { + MANA_QUERY_DEV_CONFIG = 0x20001, + MANA_QUERY_GF_STAT = 0x20002, + MANA_CONFIG_VPORT_TX = 0x20003, + MANA_CREATE_WQ_OBJ = 0x20004, + MANA_DESTROY_WQ_OBJ = 0x20005, + MANA_FENCE_RQ = 0x20006, + MANA_CONFIG_VPORT_RX = 0x20007, + MANA_QUERY_VPORT_CONFIG = 0x20008, + + /* Privileged commands for the PF mode */ + MANA_REGISTER_FILTER = 0x28000, + MANA_DEREGISTER_FILTER = 0x28001, + MANA_REGISTER_HW_PORT = 0x28003, + MANA_DEREGISTER_HW_PORT = 0x28004, +}; + +/* Query Device Configuration */ +struct mana_query_device_cfg_req { + struct gdma_req_hdr hdr; + + /* MANA Nic Driver Capability flags */ + u64 mn_drv_cap_flags1; + u64 mn_drv_cap_flags2; + u64 mn_drv_cap_flags3; + u64 mn_drv_cap_flags4; + + u32 proto_major_ver; + u32 proto_minor_ver; + u32 proto_micro_ver; + + u32 reserved; +}; /* HW DATA */ + +struct mana_query_device_cfg_resp { + struct gdma_resp_hdr hdr; + + u64 pf_cap_flags1; + u64 pf_cap_flags2; + u64 pf_cap_flags3; + u64 pf_cap_flags4; + + u16 max_num_vports; + u8 bm_hostmode; /* response v3: Bare Metal Host Mode */ + u8 reserved; + u32 max_num_eqs; + + /* response v2: */ + u16 adapter_mtu; + u16 reserved2; + u32 reserved3; +}; /* HW DATA */ + +/* Query vPort Configuration */ +struct mana_query_vport_cfg_req { + struct gdma_req_hdr hdr; + u32 vport_index; +}; /* HW DATA */ + +struct mana_query_vport_cfg_resp { + struct gdma_resp_hdr hdr; + u32 max_num_sq; + u32 max_num_rq; + u32 num_indirection_ent; + u32 reserved1; + u8 mac_addr[6]; + u8 reserved2[2]; + mana_handle_t vport; +}; /* HW DATA 
*/ + +/* Configure vPort */ +struct mana_config_vport_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u32 pdid; + u32 doorbell_pageid; +}; /* HW DATA */ + +struct mana_config_vport_resp { + struct gdma_resp_hdr hdr; + u16 tx_vport_offset; + u8 short_form_allowed; + u8 reserved; +}; /* HW DATA */ + +/* Create WQ Object */ +struct mana_create_wqobj_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u32 wq_type; + u32 reserved; + u64 wq_gdma_region; + u64 cq_gdma_region; + u32 wq_size; + u32 cq_size; + u32 cq_moderation_ctx_id; + u32 cq_parent_qid; +}; /* HW DATA */ + +struct mana_create_wqobj_resp { + struct gdma_resp_hdr hdr; + u32 wq_id; + u32 cq_id; + mana_handle_t wq_obj; +}; /* HW DATA */ + +/* Destroy WQ Object */ +struct mana_destroy_wqobj_req { + struct gdma_req_hdr hdr; + u32 wq_type; + u32 reserved; + mana_handle_t wq_obj_handle; +}; /* HW DATA */ + +struct mana_destroy_wqobj_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Fence RQ */ +struct mana_fence_rq_req { + struct gdma_req_hdr hdr; + mana_handle_t wq_obj_handle; +}; /* HW DATA */ + +struct mana_fence_rq_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Query stats RQ */ +struct mana_query_gf_stat_req { + struct gdma_req_hdr hdr; + u64 req_stats; +}; /* HW DATA */ + +struct mana_query_gf_stat_resp { + struct gdma_resp_hdr hdr; + u64 reported_stats; + /* rx errors/discards */ + u64 rx_discards_nowqe; + u64 rx_err_vport_disabled; + /* rx bytes/packets */ + u64 hc_rx_bytes; + u64 hc_rx_ucast_pkts; + u64 hc_rx_ucast_bytes; + u64 hc_rx_bcast_pkts; + u64 hc_rx_bcast_bytes; + u64 hc_rx_mcast_pkts; + u64 hc_rx_mcast_bytes; + /* tx errors */ + u64 tx_err_gf_disabled; + u64 tx_err_vport_disabled; + u64 tx_err_inval_vport_offset_pkt; + u64 tx_err_vlan_enforcement; + u64 tx_err_ethtype_enforcement; + u64 tx_err_SA_enforcement; + u64 tx_err_SQPDID_enforcement; + u64 tx_err_CQPDID_enforcement; + u64 tx_err_mtu_violation; + u64 tx_err_inval_oob; + /* tx bytes/packets */ + u64 hc_tx_bytes; + u64 hc_tx_ucast_pkts; + u64 hc_tx_ucast_bytes; + u64 hc_tx_bcast_pkts; + u64 hc_tx_bcast_bytes; + u64 hc_tx_mcast_pkts; + u64 hc_tx_mcast_bytes; + /* tx error */ + u64 tx_err_gdma; +}; /* HW DATA */ + +/* Configure vPort Rx Steering */ +struct mana_cfg_rx_steer_req_v2 { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u16 num_indir_entries; + u16 indir_tab_offset; + u32 rx_enable; + u32 rss_enable; + u8 update_default_rxobj; + u8 update_hashkey; + u8 update_indir_tab; + u8 reserved; + mana_handle_t default_rxobj; + u8 hashkey[MANA_HASH_KEY_SIZE]; + u8 cqe_coalescing_enable; + u8 reserved2[7]; + mana_handle_t indir_tab[] __counted_by(num_indir_entries); +}; /* HW DATA */ + +struct mana_cfg_rx_steer_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Register HW vPort */ +struct mana_register_hw_vport_req { + struct gdma_req_hdr hdr; + u16 attached_gfid; + u8 is_pf_default_vport; + u8 reserved1; + u8 allow_all_ether_types; + u8 reserved2; + u8 reserved3; + u8 reserved4; +}; /* HW DATA */ + +struct mana_register_hw_vport_resp { + struct gdma_resp_hdr hdr; + mana_handle_t hw_vport_handle; +}; /* HW DATA */ + +/* Deregister HW vPort */ +struct mana_deregister_hw_vport_req { + struct gdma_req_hdr hdr; + mana_handle_t hw_vport_handle; +}; /* HW DATA */ + +struct mana_deregister_hw_vport_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Register filter */ +struct mana_register_filter_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u8 mac_addr[6]; + u8 reserved1; + u8 reserved2; + u8 reserved3; + 
u8 reserved4; + u16 reserved5; + u32 reserved6; + u32 reserved7; + u32 reserved8; +}; /* HW DATA */ + +struct mana_register_filter_resp { + struct gdma_resp_hdr hdr; + mana_handle_t filter_handle; +}; /* HW DATA */ + +/* Deregister filter */ +struct mana_deregister_filter_req { + struct gdma_req_hdr hdr; + mana_handle_t filter_handle; +}; /* HW DATA */ + +struct mana_deregister_filter_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Requested GF stats Flags */ +/* Rx discards/Errors */ +#define STATISTICS_FLAGS_RX_DISCARDS_NO_WQE 0x0000000000000001 +#define STATISTICS_FLAGS_RX_ERRORS_VPORT_DISABLED 0x0000000000000002 +/* Rx bytes/pkts */ +#define STATISTICS_FLAGS_HC_RX_BYTES 0x0000000000000004 +#define STATISTICS_FLAGS_HC_RX_UCAST_PACKETS 0x0000000000000008 +#define STATISTICS_FLAGS_HC_RX_UCAST_BYTES 0x0000000000000010 +#define STATISTICS_FLAGS_HC_RX_MCAST_PACKETS 0x0000000000000020 +#define STATISTICS_FLAGS_HC_RX_MCAST_BYTES 0x0000000000000040 +#define STATISTICS_FLAGS_HC_RX_BCAST_PACKETS 0x0000000000000080 +#define STATISTICS_FLAGS_HC_RX_BCAST_BYTES 0x0000000000000100 +/* Tx errors */ +#define STATISTICS_FLAGS_TX_ERRORS_GF_DISABLED 0x0000000000000200 +#define STATISTICS_FLAGS_TX_ERRORS_VPORT_DISABLED 0x0000000000000400 +#define STATISTICS_FLAGS_TX_ERRORS_INVAL_VPORT_OFFSET_PACKETS \ + 0x0000000000000800 +#define STATISTICS_FLAGS_TX_ERRORS_VLAN_ENFORCEMENT 0x0000000000001000 +#define STATISTICS_FLAGS_TX_ERRORS_ETH_TYPE_ENFORCEMENT \ + 0x0000000000002000 +#define STATISTICS_FLAGS_TX_ERRORS_SA_ENFORCEMENT 0x0000000000004000 +#define STATISTICS_FLAGS_TX_ERRORS_SQPDID_ENFORCEMENT 0x0000000000008000 +#define STATISTICS_FLAGS_TX_ERRORS_CQPDID_ENFORCEMENT 0x0000000000010000 +#define STATISTICS_FLAGS_TX_ERRORS_MTU_VIOLATION 0x0000000000020000 +#define STATISTICS_FLAGS_TX_ERRORS_INVALID_OOB 0x0000000000040000 +/* Tx bytes/pkts */ +#define STATISTICS_FLAGS_HC_TX_BYTES 0x0000000000080000 +#define STATISTICS_FLAGS_HC_TX_UCAST_PACKETS 0x0000000000100000 +#define STATISTICS_FLAGS_HC_TX_UCAST_BYTES 0x0000000000200000 +#define STATISTICS_FLAGS_HC_TX_MCAST_PACKETS 0x0000000000400000 +#define STATISTICS_FLAGS_HC_TX_MCAST_BYTES 0x0000000000800000 +#define STATISTICS_FLAGS_HC_TX_BCAST_PACKETS 0x0000000001000000 +#define STATISTICS_FLAGS_HC_TX_BCAST_BYTES 0x0000000002000000 +/* Tx error */ +#define STATISTICS_FLAGS_TX_ERRORS_GDMA_ERROR 0x0000000004000000 + +#define MANA_MAX_NUM_QUEUES 64 + +#define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1) + +struct mana_tx_package { + struct gdma_wqe_request wqe_req; + struct gdma_sge sgl_array[5]; + struct gdma_sge *sgl_ptr; + + struct mana_tx_oob tx_oob; + + struct gdma_posted_wqe_info wqe_info; +}; + +int mana_create_wq_obj(struct mana_port_context *apc, + mana_handle_t vport, + u32 wq_type, struct mana_obj_spec *wq_spec, + struct mana_obj_spec *cq_spec, + mana_handle_t *wq_obj); + +void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, + mana_handle_t wq_obj); + +int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id, + u32 doorbell_pg_id); +void mana_uncfg_vport(struct mana_port_context *apc); + +struct net_device *mana_get_primary_netdev(struct mana_context *ac, + u32 port_index, + netdevice_tracker *tracker); +#endif /* _MANA_H */ diff --git a/include/net/mana/mana_auxiliary.h b/include/net/mana/mana_auxiliary.h new file mode 100644 index 000000000000..373d59756846 --- /dev/null +++ b/include/net/mana/mana_auxiliary.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022, Microsoft Corporation. 
*/ + +#include "mana.h" +#include <linux/auxiliary_bus.h> + +struct mana_adev { + struct auxiliary_device adev; + struct gdma_dev *mdev; +}; diff --git a/include/net/mana/shm_channel.h b/include/net/mana/shm_channel.h new file mode 100644 index 000000000000..5199b41497ff --- /dev/null +++ b/include/net/mana/shm_channel.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _SHM_CHANNEL_H +#define _SHM_CHANNEL_H + +struct shm_channel { + struct device *dev; + void __iomem *base; +}; + +void mana_smc_init(struct shm_channel *sc, struct device *dev, + void __iomem *base); + +int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, + u64 cq_addr, u64 rq_addr, u64 sq_addr, + u32 eq_msix_index); + +int mana_smc_teardown_hwc(struct shm_channel *sc, bool reset_vf); + +#endif /* _SHM_CHANNEL_H */ diff --git a/include/net/mctp.h b/include/net/mctp.h new file mode 100644 index 000000000000..07d458990113 --- /dev/null +++ b/include/net/mctp.h @@ -0,0 +1,319 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Management Component Transport Protocol (MCTP) + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#ifndef __NET_MCTP_H +#define __NET_MCTP_H + +#include <linux/bits.h> +#include <linux/mctp.h> +#include <linux/netdevice.h> +#include <net/net_namespace.h> +#include <net/sock.h> + +/* MCTP packet definitions */ +struct mctp_hdr { + u8 ver; + u8 dest; + u8 src; + u8 flags_seq_tag; +}; + +#define MCTP_VER_MIN 1 +#define MCTP_VER_MAX 1 + +/* Definitions for flags_seq_tag field */ +#define MCTP_HDR_FLAG_SOM BIT(7) +#define MCTP_HDR_FLAG_EOM BIT(6) +#define MCTP_HDR_FLAG_TO BIT(3) +#define MCTP_HDR_FLAGS GENMASK(5, 3) +#define MCTP_HDR_SEQ_SHIFT 4 +#define MCTP_HDR_SEQ_MASK GENMASK(1, 0) +#define MCTP_HDR_TAG_SHIFT 0 +#define MCTP_HDR_TAG_MASK GENMASK(2, 0) + +#define MCTP_INITIAL_DEFAULT_NET 1 + +static inline bool mctp_address_unicast(mctp_eid_t eid) +{ + return eid >= 8 && eid < 255; +} + +static inline bool mctp_address_broadcast(mctp_eid_t eid) +{ + return eid == 255; +} + +static inline bool mctp_address_null(mctp_eid_t eid) +{ + return eid == 0; +} + +static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid) +{ + return match == eid || match == MCTP_ADDR_ANY; +} + +static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb) +{ + return (struct mctp_hdr *)skb_network_header(skb); +} + +/* socket implementation */ +struct mctp_sock { + struct sock sk; + + /* bind() params */ + unsigned int bind_net; + mctp_eid_t bind_addr; + __u8 bind_type; + + /* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */ + bool addr_ext; + + /* list of mctp_sk_key, for incoming tag lookup. updates protected + * by sk->net->keys_lock + */ + struct hlist_head keys; + + /* mechanism for expiring allocated keys; will release an allocated + * tag, and any netdev state for a request/response pairing + */ + struct timer_list key_expiry; +}; + +/* Key for matching incoming packets to sockets or reassembly contexts. + * Packets are matched on (peer EID, local EID, tag). + * + * Lifetime / locking requirements: + * + * - individual key data (ie, the struct itself) is protected by key->lock; + * changes must be made with that lock held. + * + * - the lookup fields: peer_addr, local_addr and tag are set before the + * key is added to lookup lists, and never updated. 
+ * + * - A ref to the key must be held (through key->refs) if a pointer to the + * key is to be accessed after key->lock is released. + * + * - a mctp_sk_key contains a reference to a struct sock; this is valid + * for the life of the key. On sock destruction (through unhash), the key is + * removed from lists (see below), and marked invalid. + * + * - these mctp_sk_keys appear on two lists: + * 1) the struct mctp_sock->keys list + * 2) the struct netns_mctp->keys list + * + * presence on these lists requires a (single) refcount to be held; both + * lists are updated as a single operation. + * + * Updates and lookups in either list are performed under the + * netns_mctp->keys lock. Lookup functions will need to lock the key and + * take a reference before unlocking the keys_lock. Consequently, the list's + * keys_lock *cannot* be acquired with the individual key->lock held. + * + * - a key may have a sk_buff attached as part of an in-progress message + * reassembly (->reasm_head). The reasm data is protected by the individual + * key->lock. + * + * - there are three destruction paths for a mctp_sk_key: + * + * - through socket unhash (see mctp_sk_unhash). This performs the list + * removal under keys_lock. + * + * - where a key is established to receive a reply message: after receiving + * the (complete) reply, or during reassembly errors. Here, we clean up + * the reassembly context (marking reasm_dead, to prevent another from + * starting), and remove the socket from the netns & socket lists. + * + * - through an expiry timeout, on a per-socket timer + */ +struct mctp_sk_key { + unsigned int net; + mctp_eid_t peer_addr; + mctp_eid_t local_addr; /* MCTP_ADDR_ANY for local owned tags */ + __u8 tag; /* incoming tag match; invert TO for local */ + + /* we hold a ref to sk when set */ + struct sock *sk; + + /* routing lookup list */ + struct hlist_node hlist; + + /* per-socket list */ + struct hlist_node sklist; + + /* lock protects against concurrent updates to the reassembly and + * expiry data below. + */ + spinlock_t lock; + + /* Keys are referenced during the output path, which may sleep */ + refcount_t refs; + + /* incoming fragment reassembly context */ + struct sk_buff *reasm_head; + struct sk_buff **reasm_tailp; + bool reasm_dead; + u8 last_seq; + + /* key validity */ + bool valid; + + /* expiry timeout; valid (above) cleared on expiry */ + unsigned long expiry; + + /* free to use for device flow state tracking. Initialised to + * zero on initial key creation + */ + unsigned long dev_flow_state; + struct mctp_dev *dev; + + /* a tag allocated with SIOCMCTPALLOCTAG ioctl will not expire + * automatically on timeout or response, instead SIOCMCTPDROPTAG + * is used. + */ + bool manual_alloc; +}; + +struct mctp_skb_cb { + unsigned int magic; + unsigned int net; + int ifindex; /* extended/direct addressing if set */ + mctp_eid_t src; + unsigned char halen; + unsigned char haddr[MAX_ADDR_LEN]; +}; +
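[Editor's aside] Since several structures here key off the packed flags_seq_tag byte, a short illustration of how the MCTP_HDR_* masks near the top of this header are applied; the example_ helpers are hypothetical.

/* Sketch: pull the per-fragment fields out of an MCTP header. */
static u8 example_mctp_seq(const struct mctp_hdr *hdr)
{
	return (hdr->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT) & MCTP_HDR_SEQ_MASK;
}

static u8 example_mctp_tag(const struct mctp_hdr *hdr)
{
	/* low three bits; the TO (tag-owner) bit is a separate flag */
	return hdr->flags_seq_tag & MCTP_HDR_TAG_MASK;
}

static bool example_mctp_is_som(const struct mctp_hdr *hdr)
{
	return !!(hdr->flags_seq_tag & MCTP_HDR_FLAG_SOM);
}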
+/* skb control-block accessors with a little extra debugging for initial + * development. + * + * TODO: remove checks & mctp_skb_cb->magic; replace callers of __mctp_cb + * with mctp_cb(). + * + * __mctp_cb() is only for the initial ingress code; we should see ->magic set + * at all times after this. + */ +static inline struct mctp_skb_cb *__mctp_cb(struct sk_buff *skb) +{ + struct mctp_skb_cb *cb = (void *)skb->cb; + + cb->magic = 0x4d435450; + return cb; +} + +static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb) +{ + struct mctp_skb_cb *cb = (void *)skb->cb; + + BUILD_BUG_ON(sizeof(struct mctp_skb_cb) > sizeof(skb->cb)); + WARN_ON(cb->magic != 0x4d435450); + return cb; +} + +/* If CONFIG_MCTP_FLOWS, we may add one of these as a SKB extension, + * indicating the flow to the device driver. + */ +struct mctp_flow { + struct mctp_sk_key *key; +}; + +/* Route definition. + * + * These are held in the pernet->mctp.routes list, with RCU protection for + * removed routes. We hold a reference to the netdev; routes need to be + * dropped on NETDEV_UNREGISTER events. + * + * Updates to the route table are performed under rtnl; all reads under RCU, + * so routes cannot be referenced over an RCU grace period. Specifically: a + * caller cannot block between mctp_route_lookup and mctp_route_release(). + */ +struct mctp_route { + mctp_eid_t min, max; + + unsigned char type; + unsigned int mtu; + struct mctp_dev *dev; + int (*output)(struct mctp_route *route, + struct sk_buff *skb); + + struct list_head list; + refcount_t refs; + struct rcu_head rcu; +}; + +/* route interfaces */ +struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, + mctp_eid_t daddr); + +/* always takes ownership of skb */ +int mctp_local_output(struct sock *sk, struct mctp_route *rt, + struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); + +void mctp_key_unref(struct mctp_sk_key *key); +struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, + unsigned int netid, + mctp_eid_t local, mctp_eid_t peer, + bool manual, u8 *tagp); + +/* routing <--> device interface */ +unsigned int mctp_default_net(struct net *net); +int mctp_default_net_set(struct net *net, unsigned int index); +int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr); +int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr); +void mctp_route_remove_dev(struct mctp_dev *mdev); + +/* neighbour definitions */ +enum mctp_neigh_source { + MCTP_NEIGH_STATIC, + MCTP_NEIGH_DISCOVER, +}; + +struct mctp_neigh { + struct mctp_dev *dev; + mctp_eid_t eid; + enum mctp_neigh_source source; + + unsigned char ha[MAX_ADDR_LEN]; + + struct list_head list; + struct rcu_head rcu; +}; + +int mctp_neigh_init(void); +void mctp_neigh_exit(void); + +// ret_hwaddr may be NULL, otherwise must have space for MAX_ADDR_LEN +int mctp_neigh_lookup(struct mctp_dev *dev, mctp_eid_t eid, + void *ret_hwaddr); +void mctp_neigh_remove_dev(struct mctp_dev *mdev); + +int mctp_routes_init(void); +void mctp_routes_exit(void); + +int mctp_device_init(void); +void mctp_device_exit(void); + +/* MCTP IDs and Codes from DMTF specification + * "DSP0239 Management Component Transport Protocol (MCTP) IDs and Codes" + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0239_1.11.1.pdf + */ +enum mctp_phys_binding { + MCTP_PHYS_BINDING_UNSPEC = 0x00, + MCTP_PHYS_BINDING_SMBUS = 0x01, + MCTP_PHYS_BINDING_PCIE_VDM = 0x02, + MCTP_PHYS_BINDING_USB = 0x03, + MCTP_PHYS_BINDING_KCS = 0x04, + MCTP_PHYS_BINDING_SERIAL = 0x05, + MCTP_PHYS_BINDING_I3C = 0x06, + MCTP_PHYS_BINDING_MMBI = 0x07, + MCTP_PHYS_BINDING_PCC = 0x08, + MCTP_PHYS_BINDING_UCIE = 0x09, + MCTP_PHYS_BINDING_VENDOR = 0xFF, +}; + +#endif /* __NET_MCTP_H */ diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h new file mode 100644 index 000000000000..957d9ef924c5 --- /dev/null +++
b/include/net/mctpdevice.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Management Component Transport Protocol (MCTP) - device + * definitions. + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#ifndef __NET_MCTPDEVICE_H +#define __NET_MCTPDEVICE_H + +#include <linux/list.h> +#include <linux/types.h> +#include <linux/refcount.h> + +struct mctp_sk_key; + +struct mctp_dev { + struct net_device *dev; + + refcount_t refs; + + unsigned int net; + enum mctp_phys_binding binding; + + const struct mctp_netdev_ops *ops; + + /* Only modified under RTNL. Reads have addrs_lock held */ + u8 *addrs; + size_t num_addrs; + spinlock_t addrs_lock; + + struct rcu_head rcu; +}; + +struct mctp_netdev_ops { + void (*release_flow)(struct mctp_dev *dev, + struct mctp_sk_key *key); +}; + +#define MCTP_INITIAL_DEFAULT_NET 1 + +struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev); +struct mctp_dev *__mctp_dev_get(const struct net_device *dev); + +int mctp_register_netdev(struct net_device *dev, + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding); +void mctp_unregister_netdev(struct net_device *dev); + +void mctp_dev_hold(struct mctp_dev *mdev); +void mctp_dev_put(struct mctp_dev *mdev); + +void mctp_dev_set_key(struct mctp_dev *dev, struct mctp_sk_key *key); +void mctp_dev_release_key(struct mctp_dev *dev, struct mctp_sk_key *key); + +#endif /* __NET_MCTPDEVICE_H */ diff --git a/include/net/mld.h b/include/net/mld.h index 496bddb59942..c07359808493 100644 --- a/include/net/mld.h +++ b/include/net/mld.h @@ -92,6 +92,9 @@ struct mld2_query { #define MLD_EXP_MIN_LIMIT 32768UL #define MLDV1_MRD_MAX_COMPAT (MLD_EXP_MIN_LIMIT - 1) +#define MLD_MAX_QUEUE 8 +#define MLD_MAX_SKBS 32 + static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2) { /* RFC3810, 5.1.3. 
Maximum Response Code */ diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h index 9deb3a3735da..0c71c27979fb 100644 --- a/include/net/mpls_iptunnel.h +++ b/include/net/mpls_iptunnel.h @@ -6,6 +6,9 @@ #ifndef _NET_MPLS_IPTUNNEL_H #define _NET_MPLS_IPTUNNEL_H 1 +#include <linux/types.h> +#include <net/lwtunnel.h> + struct mpls_iptunnel_encap { u8 labels; u8 ttl_propagate; diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 753ba7e755d6..f7263fe2a2e4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -12,6 +12,9 @@ #include <linux/tcp.h> #include <linux/types.h> +struct mptcp_info; +struct mptcp_sock; +struct mptcp_pm_addr_entry; struct seq_file; /* MPTCP sk_buff extension data */ @@ -23,43 +26,107 @@ struct mptcp_ext { u64 data_seq; u32 subflow_seq; u16 data_len; + __sum16 csum; u8 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, - __unused:2; - /* one byte hole */ + frozen:1, + reset_transient:1; + u8 reset_reason:4, + csum_reqd:1, + infinite_map:1; }; -struct mptcp_out_options { -#if IS_ENABLED(CONFIG_MPTCP) - u16 suboptions; - u64 sndr_key; - u64 rcvr_key; +#define MPTCPOPT_HMAC_LEN 20 +#define MPTCP_RM_IDS_MAX 8 + +struct mptcp_rm_list { + u8 ids[MPTCP_RM_IDS_MAX]; + u8 nr; +}; + +struct mptcp_addr_info { + u8 id; + sa_family_t family; + __be16 port; union { - struct in_addr addr; + struct in_addr addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) - struct in6_addr addr6; + struct in6_addr addr6; #endif }; - u8 addr_id; - u64 ahmac; - u8 rm_id; +}; + +struct mptcp_out_options { +#if IS_ENABLED(CONFIG_MPTCP) + u16 suboptions; + struct mptcp_rm_list rm_list; u8 join_id; u8 backup; - u32 nonce; - u64 thmac; - u32 token; - u8 hmac[20]; - struct mptcp_ext ext_copy; + u8 reset_reason:4, + reset_transient:1, + csum_reqd:1, + allow_join_id0:1; + union { + struct { + u64 sndr_key; + u64 rcvr_key; + u64 data_seq; + u32 subflow_seq; + u16 data_len; + __sum16 csum; + }; + struct { + struct mptcp_addr_info addr; + u64 ahmac; + }; + struct { + struct mptcp_ext ext_copy; + u64 fail_seq; + }; + struct { + u32 nonce; + u32 token; + u64 thmac; + u8 hmac[MPTCPOPT_HMAC_LEN]; + }; + }; #endif }; -#ifdef CONFIG_MPTCP -extern struct request_sock_ops mptcp_subflow_request_sock_ops; +#define MPTCP_SCHED_NAME_MAX 16 +#define MPTCP_SCHED_MAX 128 +#define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) +struct mptcp_sched_ops { + int (*get_send)(struct mptcp_sock *msk); + int (*get_retrans)(struct mptcp_sock *msk); + + char name[MPTCP_SCHED_NAME_MAX]; + struct module *owner; + struct list_head list; + + void (*init)(struct mptcp_sock *msk); + void (*release)(struct mptcp_sock *msk); +} ____cacheline_aligned_in_smp; + +#define MPTCP_PM_NAME_MAX 16 +#define MPTCP_PM_MAX 128 +#define MPTCP_PM_BUF_MAX (MPTCP_PM_NAME_MAX * MPTCP_PM_MAX) + +struct mptcp_pm_ops { + char name[MPTCP_PM_NAME_MAX]; + struct module *owner; + struct list_head list; + + void (*init)(struct mptcp_sock *msk); + void (*release)(struct mptcp_sock *msk); +} ____cacheline_aligned_in_smp; + +#ifdef CONFIG_MPTCP void mptcp_init(void); static inline bool sk_is_mptcp(const struct sock *sk) @@ -85,9 +152,12 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); -void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); +bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); -void mptcp_write_options(__be32 *ptr, 
struct mptcp_out_options *opts); +void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, + struct mptcp_out_options *opts); + +void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info); /* move the skb extension owership, with the assumption that 'to' is * newly allocated @@ -106,6 +176,19 @@ static inline void mptcp_skb_ext_move(struct sk_buff *to, from->active_extensions = 0; } +static inline void mptcp_skb_ext_copy(struct sk_buff *to, + struct sk_buff *from) +{ + struct mptcp_ext *from_ext; + + from_ext = skb_ext_find(from, SKB_EXT_MPTCP); + if (!from_ext) + return; + + from_ext->frozen = 1; + skb_ext_copy(to, from); +} + static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, const struct mptcp_ext *from_ext) { @@ -133,6 +216,21 @@ void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); +struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener); + +__be32 mptcp_get_reset_option(const struct sk_buff *skb); + +static inline __be32 mptcp_reset_option(const struct sk_buff *skb) +{ + if (skb_ext_exist(skb, SKB_EXT_MPTCP)) + return mptcp_get_reset_option(skb); + + return htonl(0u); +} + +void mptcp_active_detect_blackhole(struct sock *sk, bool expired); #else static inline void mptcp_init(void) @@ -154,12 +252,6 @@ static inline bool rsk_drop_req(const struct request_sock *req) return false; } -static inline void mptcp_parse_option(const struct sk_buff *skb, - const unsigned char *ptr, int opsize, - struct tcp_options_received *opt_rx) -{ -} - static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) @@ -183,9 +275,10 @@ static inline bool mptcp_established_options(struct sock *sk, return false; } -static inline void mptcp_incoming_options(struct sock *sk, +static inline bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { + return true; } static inline void mptcp_skb_ext_move(struct sk_buff *to, @@ -193,6 +286,11 @@ static inline void mptcp_skb_ext_move(struct sk_buff *to, { } +static inline void mptcp_skb_ext_copy(struct sk_buff *to, + struct sk_buff *from) +{ +} + static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { @@ -208,6 +306,17 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, { return 0; /* TCP fallback */ } + +static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + return NULL; +} + +static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } + +static inline void mptcp_active_detect_blackhole(struct sock *sk, bool expired) { } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -218,4 +327,14 @@ static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif +#if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL) +struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk); +#else +static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; } +#endif + +#if !IS_ENABLED(CONFIG_MPTCP) +struct mptcp_sock { }; +#endif + #endif /* __NET_MPTCP_H */ diff --git a/include/net/mrp.h b/include/net/mrp.h index 1c308c034e1a..b28915ffea28 100644 --- 
a/include/net/mrp.h +++ b/include/net/mrp.h @@ -2,6 +2,10 @@ #ifndef _NET_MRP_H #define _NET_MRP_H +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/types.h> + #define MRP_END_MARK 0x0 struct mrp_pdu_hdr { @@ -120,6 +124,7 @@ struct mrp_applicant { struct sk_buff *pdu; struct rb_root mad; struct rcu_head rcu; + bool active; }; struct mrp_port { diff --git a/include/net/ncsi.h b/include/net/ncsi.h index fbefe80361ee..08a50d9acb0a 100644 --- a/include/net/ncsi.h +++ b/include/net/ncsi.h @@ -2,6 +2,8 @@ #ifndef __NET_NCSI_H #define __NET_NCSI_H +#include <linux/types.h> + /* * The NCSI device states seen from external. More NCSI device states are * only visible internally (in net/ncsi/internal.h). When the NCSI device diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 38e4094960ce..3c88d5bc5eed 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -137,7 +137,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts); -void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, +void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad); #define NDISC_OPS_REDIRECT_DATA_SPACE 2 @@ -147,11 +147,6 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, * The following hooks can be defined; unless noted otherwise, they are * optional and can be filled with a null pointer. * - * int (*is_useropt)(u8 nd_opt_type): - * This function is called when IPv6 decide RA userspace options. if - * this function returns 1 then the option given by nd_opt_type will - * be handled as userspace option additional to the IPv6 options. - * * int (*parse_options)(const struct net_device *dev, * struct nd_opt_hdr *nd_opt, * struct ndisc_options *ndopts): @@ -200,7 +195,6 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, * addresses. E.g. 802.15.4 6LoWPAN. 
*/ struct ndisc_ops { - int (*is_useropt)(u8 nd_opt_type); int (*parse_options)(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts); @@ -224,15 +218,6 @@ struct ndisc_ops { }; #if IS_ENABLED(CONFIG_IPV6) -static inline int ndisc_ops_is_useropt(const struct net_device *dev, - u8 nd_opt_type) -{ - if (dev->ndisc_ops && dev->ndisc_ops->is_useropt) - return dev->ndisc_ops->is_useropt(nd_opt_type); - else - return 0; -} - static inline int ndisc_ops_parse_options(const struct net_device *dev, struct nd_opt_hdr *nd_opt, struct ndisc_options *ndopts) @@ -395,11 +380,11 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, cons { struct neighbour *n; - rcu_read_lock_bh(); + rcu_read_lock(); n = __ipv6_neigh_lookup_noref(dev, pkey); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; - rcu_read_unlock_bh(); + rcu_read_unlock(); return n; } @@ -409,16 +394,10 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev, { struct neighbour *n; - rcu_read_lock_bh(); + rcu_read_lock(); n = __ipv6_neigh_lookup_noref(dev, pkey); - if (n) { - unsigned long now = jiffies; - - /* avoid dirtying neighbour */ - if (READ_ONCE(n->confirmed) != now) - WRITE_ONCE(n->confirmed, now); - } - rcu_read_unlock_bh(); + neigh_confirm(n); + rcu_read_unlock(); } static inline void __ipv6_confirm_neigh_stub(struct net_device *dev, @@ -426,16 +405,10 @@ static inline void __ipv6_confirm_neigh_stub(struct net_device *dev, { struct neighbour *n; - rcu_read_lock_bh(); + rcu_read_lock(); n = __ipv6_neigh_lookup_noref_stub(dev, pkey); - if (n) { - unsigned long now = jiffies; - - /* avoid dirtying neighbour */ - if (READ_ONCE(n->confirmed) != now) - WRITE_ONCE(n->confirmed, now); - } - rcu_read_unlock_bh(); + neigh_confirm(n); + rcu_read_unlock(); } /* uses ipv6_stub and is meant for use outside of IPv6 core */ @@ -457,12 +430,17 @@ int ndisc_late_init(void); void ndisc_late_cleanup(void); void ndisc_cleanup(void); -int ndisc_rcv(struct sk_buff *skb); +enum skb_drop_reason ndisc_rcv(struct sk_buff *skb); +struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, + const struct in6_addr *saddr, u64 nonce); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce); +void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, + const struct in6_addr *saddr); + void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, @@ -487,17 +465,14 @@ int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); -int igmp6_event_query(struct sk_buff *skb); +void igmp6_event_query(struct sk_buff *skb); -int igmp6_event_report(struct sk_buff *skb); +void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, +int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); -int ndisc_ifinfo_sysctl_strategy(struct ctl_table *ctl, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen); #endif void inet6_ifinfo_notify(int event, struct inet6_dev *idev); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 81ee17594c32..9a832cab5b1d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -29,6 +29,7 @@ #include <linux/sysctl.h> #include 
<linux/workqueue.h> #include <net/rtnetlink.h> +#include <net/neighbour_tables.h> /* * NUD stands for "neighbor unreachability detection" @@ -48,6 +49,7 @@ enum { NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, + NEIGH_VAR_INTERVAL_PROBE_TIME_MS, NEIGH_VAR_GC_STALETIME, NEIGH_VAR_QUEUE_LEN_BYTES, NEIGH_VAR_PROXY_QLEN, @@ -70,6 +72,7 @@ enum { struct neigh_parms { possible_net_t net; struct net_device *dev; + netdevice_tracker dev_tracker; struct list_head list; int (*neigh_setup)(struct neighbour *); struct neigh_table *tbl; @@ -81,6 +84,7 @@ struct neigh_parms { struct rcu_head rcu_head; int reachable_time; + u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; @@ -132,7 +136,8 @@ struct neigh_statistics { #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { - struct neighbour __rcu *next; + struct hlist_node hash; + struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; @@ -144,20 +149,22 @@ struct neighbour { struct timer_list timer; unsigned long used; atomic_t probes; - __u8 flags; - __u8 nud_state; - __u8 type; - __u8 dead; + u8 nud_state; + u8 type; + u8 dead; u8 protocol; + u32 flags; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8); struct hh_cache hh; int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; struct list_head gc_list; + struct list_head managed_list; struct rcu_head rcu; struct net_device *dev; - u8 primary_key[0]; + netdevice_tracker dev_tracker; + u8 primary_key[]; } __randomize_layout; struct neigh_ops { @@ -172,9 +179,10 @@ struct pneigh_entry { struct pneigh_entry *next; possible_net_t net; struct net_device *dev; - u8 flags; + netdevice_tracker dev_tracker; + u32 flags; u8 protocol; - u8 key[]; + u32 key[]; }; /* @@ -184,7 +192,7 @@ struct pneigh_entry { #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { - struct neighbour __rcu **hash_buckets; + struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; @@ -204,6 +212,7 @@ struct neigh_table { int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); + int (*is_multicast)(const void *pkey); bool (*allow_add)(const struct net_device *dev, struct netlink_ext_ack *extack); char *id; @@ -215,11 +224,13 @@ struct neigh_table { int gc_thresh3; unsigned long last_flush; struct delayed_work gc_work; + struct delayed_work managed_work; struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; atomic_t gc_entries; struct list_head gc_list; + struct list_head managed_list; rwlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; @@ -227,14 +238,6 @@ struct neigh_table { struct pneigh_entry **phash_buckets; }; -enum { - NEIGH_ARP_TABLE = 0, - NEIGH_ND_TABLE = 1, - NEIGH_DN_TABLE = 2, - NEIGH_NR_TABLES, - NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ -}; - static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; @@ -249,19 +252,29 @@ static inline void *neighbour_priv(const struct neighbour *n) } /* flags for neigh_update() */ -#define NEIGH_UPDATE_F_OVERRIDE 0x00000001 -#define NEIGH_UPDATE_F_WEAK_OVERRIDE 0x00000002 -#define NEIGH_UPDATE_F_OVERRIDE_ISROUTER 0x00000004 -#define NEIGH_UPDATE_F_EXT_LEARNED 0x20000000 -#define NEIGH_UPDATE_F_ISROUTER 0x40000000 -#define 
NEIGH_UPDATE_F_ADMIN 0x80000000 +#define NEIGH_UPDATE_F_OVERRIDE BIT(0) +#define NEIGH_UPDATE_F_WEAK_OVERRIDE BIT(1) +#define NEIGH_UPDATE_F_OVERRIDE_ISROUTER BIT(2) +#define NEIGH_UPDATE_F_USE BIT(3) +#define NEIGH_UPDATE_F_MANAGED BIT(4) +#define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) +#define NEIGH_UPDATE_F_ISROUTER BIT(6) +#define NEIGH_UPDATE_F_ADMIN BIT(7) + +/* In-kernel representation for NDA_FLAGS_EXT flags: */ +#define NTF_OLD_MASK 0xff +#define NTF_EXT_SHIFT 8 +#define NTF_EXT_MASK (NTF_EXT_MANAGED) + +#define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; -static inline bool neigh_key_eq16(const struct neighbour *n, const void *pkey) -{ - return *(const u16 *)n->primary_key == *(const u16 *)pkey; -} +#define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash) +#define neigh_for_each_in_bucket_rcu(pos, head) \ + hlist_for_each_entry_rcu(pos, head, hash) +#define neigh_for_each_in_bucket_safe(pos, tmp, head) \ + hlist_for_each_entry_safe(pos, tmp, head, hash) static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { @@ -286,17 +299,14 @@ static inline struct neighbour *___neigh_lookup_noref( const void *pkey, struct net_device *dev) { - struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht); + struct neigh_hash_table *nht = rcu_dereference(tbl->nht); struct neighbour *n; u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); - for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference_bh(n->next)) { + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val]) if (n->dev == dev && key_eq(n, pkey)) return n; - } return NULL; } @@ -308,12 +318,21 @@ static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl, return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev); } +static inline void neigh_confirm(struct neighbour *n) +{ + if (n) { + unsigned long now = jiffies; + + /* avoid dirtying neighbour */ + if (READ_ONCE(n->confirmed) != now) + WRITE_ONCE(n->confirmed, now); + } +} + void neigh_table_init(int index, struct neigh_table *tbl); int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev); -struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, - const void *pkey); struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref); static inline struct neighbour *neigh_create(struct neigh_table *tbl, @@ -323,11 +342,12 @@ static inline struct neighbour *neigh_create(struct neigh_table *tbl, return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); -bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl); +bool neigh_remove_one(struct neighbour *ndel); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); @@ -371,8 +391,6 @@ void neigh_for_each(struct neigh_table *tbl, void __neigh_for_each_release(struct neigh_table 
*tbl, int (*cb)(struct neighbour *)); int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); -void pneigh_for_each(struct neigh_table *tbl, - void (*cb)(struct pneigh_entry *)); struct neigh_seq_state { struct seq_net_private p; @@ -391,12 +409,12 @@ void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, void *neigh_seq_next(struct seq_file *, void *, loff_t *); void neigh_seq_stop(struct seq_file *, void *); -int neigh_proc_dointvec(struct ctl_table *ctl, int write, +int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, +int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); -int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, +int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, @@ -433,17 +451,24 @@ static inline struct neighbour * neigh_clone(struct neighbour *neigh) #define neigh_hold(n) refcount_inc(&(n)->refcnt) -static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +static __always_inline int neigh_event_send_probe(struct neighbour *neigh, + struct sk_buff *skb, + const bool immediate_ok) { unsigned long now = jiffies; - + if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); - if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) - return __neigh_event_send(neigh, skb); + if (!(READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) + return __neigh_event_send(neigh, skb, immediate_ok); return 0; } +static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + return neigh_event_send_probe(neigh, skb, true); +} + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { @@ -503,10 +528,15 @@ static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, { const struct hh_cache *hh = &n->hh; - if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache) + /* n->nud_state and hh->hh_len could be changed under us. + * neigh_hh_output() is taking care of the race later. 
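+ * (A torn read here is harmless: neigh_hh_output() re-reads
+ * hh->hh_len under the hh->hh_lock seqlock, so losing the race
+ * only costs us the cached-header fast path, not correctness.)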
+ */ + if (!skip_cache && + (READ_ONCE(n->nud_state) & NUD_CONNECTED) && + READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); - else - return n->output(n, skb); + + return READ_ONCE(n->output)(n, skb); } static inline struct neighbour * diff --git a/include/net/neighbour_tables.h b/include/net/neighbour_tables.h new file mode 100644 index 000000000000..bcffbe8f7601 --- /dev/null +++ b/include/net/neighbour_tables.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_NEIGHBOUR_TABLES_H +#define _NET_NEIGHBOUR_TABLES_H + +enum { + NEIGH_ARP_TABLE = 0, + NEIGH_ND_TABLE = 1, + NEIGH_NR_TABLES, + NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ +}; + +#endif diff --git a/include/net/net_debug.h b/include/net/net_debug.h new file mode 100644 index 000000000000..47f7a4a878b9 --- /dev/null +++ b/include/net/net_debug.h @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NET_DEBUG_H +#define _LINUX_NET_DEBUG_H + +#include <linux/bug.h> +#include <linux/kern_levels.h> + +struct net_device; + +__printf(3, 4) __cold +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...); +__printf(2, 3) __cold +void netdev_emerg(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_alert(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_crit(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_err(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_warn(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_notice(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_info(const struct net_device *dev, const char *format, ...); + +#define netdev_level_once(level, dev, fmt, ...) \ +do { \ + static bool __section(".data..once") __print_once; \ + \ + if (!__print_once) { \ + __print_once = true; \ + netdev_printk(level, dev, fmt, ##__VA_ARGS__); \ + } \ +} while (0) + +#define netdev_emerg_once(dev, fmt, ...) \ + netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__) +#define netdev_alert_once(dev, fmt, ...) \ + netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__) +#define netdev_crit_once(dev, fmt, ...) \ + netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__) +#define netdev_err_once(dev, fmt, ...) \ + netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__) +#define netdev_warn_once(dev, fmt, ...) \ + netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__) +#define netdev_notice_once(dev, fmt, ...) \ + netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__) +#define netdev_info_once(dev, fmt, ...) \ + netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__) + +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +#define netdev_dbg(__dev, format, args...) \ +do { \ + dynamic_netdev_dbg(__dev, format, ##args); \ +} while (0) +#elif defined(DEBUG) +#define netdev_dbg(__dev, format, args...) \ + netdev_printk(KERN_DEBUG, __dev, format, ##args) +#else +#define netdev_dbg(__dev, format, args...) \ +({ \ + if (0) \ + netdev_printk(KERN_DEBUG, __dev, format, ##args); \ +}) +#endif + +#if defined(VERBOSE_DEBUG) +#define netdev_vdbg netdev_dbg +#else + +#define netdev_vdbg(dev, format, args...) 
\ +({ \ + if (0) \ + netdev_printk(KERN_DEBUG, dev, format, ##args); \ + 0; \ +}) +#endif + +/* netif printk helpers, similar to netdev_printk */ + +#define netif_printk(priv, type, level, dev, fmt, args...) \ +do { \ + if (netif_msg_##type(priv)) \ + netdev_printk(level, (dev), fmt, ##args); \ +} while (0) + +#define netif_level(level, priv, type, dev, fmt, args...) \ +do { \ + if (netif_msg_##type(priv)) \ + netdev_##level(dev, fmt, ##args); \ +} while (0) + +#define netif_emerg(priv, type, dev, fmt, args...) \ + netif_level(emerg, priv, type, dev, fmt, ##args) +#define netif_alert(priv, type, dev, fmt, args...) \ + netif_level(alert, priv, type, dev, fmt, ##args) +#define netif_crit(priv, type, dev, fmt, args...) \ + netif_level(crit, priv, type, dev, fmt, ##args) +#define netif_err(priv, type, dev, fmt, args...) \ + netif_level(err, priv, type, dev, fmt, ##args) +#define netif_warn(priv, type, dev, fmt, args...) \ + netif_level(warn, priv, type, dev, fmt, ##args) +#define netif_notice(priv, type, dev, fmt, args...) \ + netif_level(notice, priv, type, dev, fmt, ##args) +#define netif_info(priv, type, dev, fmt, args...) \ + netif_level(info, priv, type, dev, fmt, ##args) + +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +#define netif_dbg(priv, type, netdev, format, args...) \ +do { \ + if (netif_msg_##type(priv)) \ + dynamic_netdev_dbg(netdev, format, ##args); \ +} while (0) +#elif defined(DEBUG) +#define netif_dbg(priv, type, dev, format, args...) \ + netif_printk(priv, type, KERN_DEBUG, dev, format, ##args) +#else +#define netif_dbg(priv, type, dev, format, args...) \ +({ \ + if (0) \ + netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ + 0; \ +}) +#endif + +/* if @cond then downgrade to debug, else print at @level */ +#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...) \ + do { \ + if (cond) \ + netif_dbg(priv, type, netdev, fmt, ##args); \ + else \ + netif_ ## level(priv, type, netdev, fmt, ##args); \ + } while (0) + +#if defined(VERBOSE_DEBUG) +#define netif_vdbg netif_dbg +#else +#define netif_vdbg(priv, type, dev, format, args...) \ +({ \ + if (0) \ + netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ + 0; \ +}) +#endif + + +#if defined(CONFIG_DEBUG_NET) +#define DEBUG_NET_WARN_ON_ONCE(cond) ((void)WARN_ON_ONCE(cond)) +#define DEBUG_NET_WARN_ONCE(cond, format...) ((void)WARN_ONCE(cond, format)) +#else +#define DEBUG_NET_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define DEBUG_NET_WARN_ONCE(cond, format...) 
BUILD_BUG_ON_INVALID(cond) +#endif + +#endif /* _LINUX_NET_DEBUG_H */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 22bc07f4b043..025a7574b275 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -22,22 +22,27 @@ #include <net/netns/nexthop.h> #include <net/netns/ieee802154_6lowpan.h> #include <net/netns/sctp.h> -#include <net/netns/dccp.h> #include <net/netns/netfilter.h> -#include <net/netns/x_tables.h> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netns/conntrack.h> #endif +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) +#include <net/netns/flow_table.h> +#endif #include <net/netns/nftables.h> #include <net/netns/xfrm.h> #include <net/netns/mpls.h> #include <net/netns/can.h> #include <net/netns/xdp.h> +#include <net/netns/smc.h> #include <net/netns/bpf.h> +#include <net/netns/mctp.h> +#include <net/net_trackers.h> #include <linux/ns_common.h> #include <linux/idr.h> #include <linux/skbuff.h> #include <linux/notifier.h> +#include <linux/xarray.h> struct user_namespace; struct proc_dir_entry; @@ -60,15 +65,10 @@ struct net { refcount_t passive; /* To decide when the network * namespace should be freed. */ - refcount_t count; /* To decided when the network - * namespace should be shut down. - */ spinlock_t rules_mod_lock; - unsigned int dev_unreg_count; - unsigned int dev_base_seq; /* protected by rtnl_mutex */ - int ifindex; + u32 ifindex; spinlock_t nsid_lock; atomic_t fnhe_genid; @@ -80,8 +80,12 @@ struct net { * or to unregister pernet ops * (pernet_ops_rwsem write locked). */ + struct llist_node defer_free_list; struct llist_node cleanup_list; /* namespaces on death row */ + struct list_head ptype_all; + struct list_head ptype_specific; + #ifdef CONFIG_KEYS struct key_tag *key_domain; /* Key domain of operation tag */ #endif @@ -90,7 +94,10 @@ struct net { struct idr netns_ids; struct ns_common ns; - + struct ref_tracker_dir refcnt_tracker; + struct ref_tracker_dir notrefcnt_tracker; /* tracker for objects not + * refcounted against netns + */ struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; @@ -106,6 +113,7 @@ struct net { struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; + struct xarray dev_by_index; struct raw_notifier_head netdev_chain; /* Note that @hash_mix can be read millions times per second, @@ -121,7 +129,9 @@ struct net { struct netns_core core; struct netns_mib mib; struct netns_packet packet; +#if IS_ENABLED(CONFIG_UNIX) struct netns_unix unx; +#endif struct netns_nexthop nexthop; struct netns_ipv4 ipv4; #if IS_ENABLED(CONFIG_IPV6) @@ -133,29 +143,16 @@ struct net { #if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE) struct netns_sctp sctp; #endif -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) - struct netns_dccp dccp; -#endif #ifdef CONFIG_NETFILTER struct netns_nf nf; - struct netns_xt xt; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct netns_ct ct; #endif #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE) struct netns_nftables nft; #endif -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) - struct netns_nf_frag nf_frag; - struct ctl_table_header *nf_frag_frags_hdr; -#endif - struct sock *nfnl; - struct sock *nfnl_stash; -#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT) - struct list_head nfnl_acct_list; -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - struct list_head nfct_timeout_list; +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + struct netns_ft ft; 
#endif #endif #ifdef CONFIG_WEXT_CORE @@ -171,7 +168,7 @@ struct net { struct netns_xfrm xfrm; #endif - atomic64_t net_cookie; /* written once */ + u64 net_cookie; /* written once */ #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; @@ -185,10 +182,20 @@ struct net { #ifdef CONFIG_XDP_SOCKETS struct netns_xdp xdp; #endif +#if IS_ENABLED(CONFIG_MCTP) + struct netns_mctp mctp; +#endif #if IS_ENABLED(CONFIG_CRYPTO_USER) struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; +#if IS_ENABLED(CONFIG_SMC) + struct netns_smc smc; +#endif +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + /* Move to a better place when the config guard is removed. */ + struct mutex rtnl_mutex; +#endif } __randomize_layout; #include <linux/seq_file_net.h> @@ -203,6 +210,11 @@ struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); void net_ns_barrier(void); + +struct ns_common *get_net_ns(struct ns_common *ns); +struct net *get_net_ns_by_fd(int fd); +extern struct task_struct *cleanup_net_task; + #else /* CONFIG_NET_NS */ #include <linux/sched.h> #include <linux/nsproxy.h> @@ -222,15 +234,22 @@ static inline void net_ns_get_ownership(const struct net *net, } static inline void net_ns_barrier(void) {} + +static inline struct ns_common *get_net_ns(struct ns_common *ns) +{ + return ERR_PTR(-EINVAL); +} + +static inline struct net *get_net_ns_by_fd(int fd) +{ + return ERR_PTR(-EINVAL); +} #endif /* CONFIG_NET_NS */ extern struct list_head net_namespace_list; struct net *get_net_ns_by_pid(pid_t pid); -struct net *get_net_ns_by_fd(int fd); - -u64 __net_gen_cookie(struct net *net); #ifdef CONFIG_SYSCTL void ipx_register_sysctl(void); @@ -243,9 +262,10 @@ void ipx_unregister_sysctl(void); #ifdef CONFIG_NET_NS void __put_net(struct net *net); +/* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { - refcount_inc(&net->count); + refcount_inc(&net->ns.count); return net; } @@ -256,14 +276,15 @@ static inline struct net *maybe_get_net(struct net *net) * exists. If the reference count is zero this * function fails and returns NULL. */ - if (!refcount_inc_not_zero(&net->count)) + if (!refcount_inc_not_zero(&net->ns.count)) net = NULL; return net; } +/* Try using put_net_track() instead */ static inline void put_net(struct net *net) { - if (refcount_dec_and_test(&net->count)) + if (refcount_dec_and_test(&net->ns.count)) __put_net(net); } @@ -275,10 +296,11 @@ int net_eq(const struct net *net1, const struct net *net2) static inline int check_net(const struct net *net) { - return refcount_read(&net->count) != 0; + return refcount_read(&net->ns.count) != 0; } void net_drop_ns(void *); +void net_passive_dec(struct net *net); #else @@ -308,26 +330,92 @@ static inline int check_net(const struct net *net) } #define net_drop_ns NULL + +static inline void net_passive_dec(struct net *net) +{ + refcount_dec(&net->passive); +} +#endif + +static inline void net_passive_inc(struct net *net) +{ + refcount_inc(&net->passive); +} + +/* Returns true if the netns initialization is completed successfully */ +static inline bool net_initialized(const struct net *net) +{ + return READ_ONCE(net->list.next); +} + +static inline void __netns_tracker_alloc(struct net *net, + netns_tracker *tracker, + bool refcounted, + gfp_t gfp) +{ +#ifdef CONFIG_NET_NS_REFCNT_TRACKER + ref_tracker_alloc(refcounted ? 
&net->refcnt_tracker : + &net->notrefcnt_tracker, + tracker, gfp); #endif +} + +static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker, + gfp_t gfp) +{ + __netns_tracker_alloc(net, tracker, true, gfp); +} + +static inline void __netns_tracker_free(struct net *net, + netns_tracker *tracker, + bool refcounted) +{ +#ifdef CONFIG_NET_NS_REFCNT_TRACKER + ref_tracker_free(refcounted ? &net->refcnt_tracker : + &net->notrefcnt_tracker, tracker); +#endif +} + +static inline struct net *get_net_track(struct net *net, + netns_tracker *tracker, gfp_t gfp) +{ + get_net(net); + netns_tracker_alloc(net, tracker, gfp); + return net; +} +static inline void put_net_track(struct net *net, netns_tracker *tracker) +{ + __netns_tracker_free(net, tracker, true); + put_net(net); +} typedef struct { #ifdef CONFIG_NET_NS - struct net *net; + struct net __rcu *net; #endif } possible_net_t; static inline void write_pnet(possible_net_t *pnet, struct net *net) { #ifdef CONFIG_NET_NS - pnet->net = net; + rcu_assign_pointer(pnet->net, net); #endif } static inline struct net *read_pnet(const possible_net_t *pnet) { #ifdef CONFIG_NET_NS - return pnet->net; + return rcu_dereference_protected(pnet->net, true); +#else + return &init_net; +#endif +} + +static inline struct net *read_pnet_rcu(const possible_net_t *pnet) +{ +#ifdef CONFIG_NET_NS + return rcu_dereference(pnet->net); #else return &init_net; #endif @@ -386,8 +474,11 @@ struct pernet_operations { void (*pre_exit)(struct net *net); void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); - unsigned int *id; - size_t size; + /* Following method is called with RTNL held. */ + void (*exit_rtnl)(struct net *net, + struct list_head *dev_kill_list); + unsigned int * const id; + const size_t size; }; /* @@ -415,17 +506,18 @@ int register_pernet_device(struct pernet_operations *); void unregister_pernet_device(struct pernet_operations *); struct ctl_table; -struct ctl_table_header; +#define register_net_sysctl(net, path, table) \ + register_net_sysctl_sz(net, path, table, ARRAY_SIZE(table)) #ifdef CONFIG_SYSCTL int net_sysctl_init(void); -struct ctl_table_header *register_net_sysctl(struct net *net, const char *path, - struct ctl_table *table); +struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path, + struct ctl_table *table, size_t table_size); void unregister_net_sysctl_table(struct ctl_table_header *header); #else static inline int net_sysctl_init(void) { return 0; } -static inline struct ctl_table_header *register_net_sysctl(struct net *net, - const char *path, struct ctl_table *table) +static inline struct ctl_table_header *register_net_sysctl_sz(struct net *net, + const char *path, struct ctl_table *table, size_t table_size) { return NULL; } @@ -483,4 +575,10 @@ static inline void fnhe_genid_bump(struct net *net) atomic_inc(&net->fnhe_genid); } +#ifdef CONFIG_NET +void net_ns_init(void); +#else +static inline void net_ns_init(void) {} +#endif + #endif /* __NET_NET_NAMESPACE_H */ diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h new file mode 100644 index 000000000000..5c3f49b52fe9 --- /dev/null +++ b/include/net/net_shaper.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_SHAPER_H_ +#define _NET_SHAPER_H_ + +#include <linux/types.h> + +#include <uapi/linux/net_shaper.h> + +struct net_device; +struct devlink; +struct netlink_ext_ack; + +enum net_shaper_binding_type { + NET_SHAPER_BINDING_TYPE_NETDEV, + /* NET_SHAPER_BINDING_TYPE_DEVLINK_PORT 
*/ +}; + +struct net_shaper_binding { + enum net_shaper_binding_type type; + union { + struct net_device *netdev; + struct devlink *devlink; + }; +}; + +struct net_shaper_handle { + enum net_shaper_scope scope; + u32 id; +}; + +/** + * struct net_shaper - represents a shaping node on the NIC H/W + * zeroed fields are considered not set. + * @parent: Unique identifier for the shaper parent, usually implied + * @handle: Unique identifier for this shaper + * @metric: Specifies whether the rate limits refer to PPS or BPS + * @bw_min: Minimum guaranteed rate for this shaper + * @bw_max: Maximum peak rate allowed for this shaper + * @burst: Maximum burst for the peak rate of this shaper + * @priority: Scheduling priority for this shaper + * @weight: Scheduling weight for this shaper + */ +struct net_shaper { + struct net_shaper_handle parent; + struct net_shaper_handle handle; + enum net_shaper_metric metric; + u64 bw_min; + u64 bw_max; + u64 burst; + u32 priority; + u32 weight; + + /* private: */ + u32 leaves; /* accounted only for NODE scope */ + struct rcu_head rcu; +}; + +/** + * struct net_shaper_ops - Operations on device H/W shapers + * + * The operations apply to both net_device and devlink objects. + * The initial shaping configuration at device initialization is empty: + * it does not constrain the rate in any way. + * The network core keeps track of the applied user configuration in + * the net_device or devlink structure. + * The operations are serialized via a per-device lock. + * + * Devices not supporting any kind of nesting should not provide the + * group operation. + * + * Each shaper is uniquely identified within the device with a 'handle' + * comprising the shaper scope and a scope-specific id. + */ +struct net_shaper_ops { + /** + * @group: Create the specified shapers' scheduling group + * + * Nest the shapers identified by @leaves under the @node shaper. + * All the shapers belong to the device specified by @binding. + * The @leaves array size is specified by @leaves_count. + * Create both the @leaves and the @node shapers; or, if they already + * exist, link them together in the desired way. + * @leaves scope must be NET_SHAPER_SCOPE_QUEUE. + */ + int (*group)(struct net_shaper_binding *binding, int leaves_count, + const struct net_shaper *leaves, + const struct net_shaper *node, + struct netlink_ext_ack *extack); + + /** + * @set: Updates the specified shaper + * + * Updates or creates the @shaper on the device specified by @binding. + */ + int (*set)(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack); + + /** + * @delete: Removes the specified shaper + * + * Removes the shaper configuration as identified by the given @handle + * on the device specified by @binding, restoring the default behavior. + */ + int (*delete)(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle, + struct netlink_ext_ack *extack); + + /** + * @capabilities: Get the shaper features supported by the device + * + * Fills the bitmask @cap with the supported capabilities for the + * specified @scope and device specified by @binding. 
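+	 *
+	 * As an overall usage sketch, a hypothetical driver exposing
+	 * these ops (illustrative foo_* names; the ops are reached via
+	 * the net_shaper_ops pointer in struct net_device_ops) would do:
+	 *
+	 *	static const struct net_shaper_ops foo_shaper_ops = {
+	 *		.group		= foo_shaper_group,
+	 *		.set		= foo_shaper_set,
+	 *		.delete		= foo_shaper_del,
+	 *		.capabilities	= foo_shaper_caps,
+	 *	};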
+ */ + void (*capabilities)(struct net_shaper_binding *binding, + enum net_shaper_scope scope, unsigned long *cap); +}; + +#endif diff --git a/include/net/net_trackers.h b/include/net/net_trackers.h new file mode 100644 index 000000000000..d94c76cf15a9 --- /dev/null +++ b/include/net/net_trackers.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_NET_TRACKERS_H +#define __NET_NET_TRACKERS_H +#include <linux/ref_tracker.h> + +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER +typedef struct ref_tracker *netdevice_tracker; +#else +typedef struct {} netdevice_tracker; +#endif + +#ifdef CONFIG_NET_NS_REFCNT_TRACKER +typedef struct ref_tracker *netns_tracker; +#else +typedef struct {} netns_tracker; +#endif + +#endif /* __NET_NET_TRACKERS_H */ diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h new file mode 100644 index 000000000000..3d3aef80beac --- /dev/null +++ b/include/net/netdev_lock.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_NETDEV_LOCK_H +#define _NET_NETDEV_LOCK_H + +#include <linux/lockdep.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> + +static inline bool netdev_trylock(struct net_device *dev) +{ + return mutex_trylock(&dev->lock); +} + +static inline void netdev_assert_locked(const struct net_device *dev) +{ + lockdep_assert_held(&dev->lock); +} + +static inline void +netdev_assert_locked_or_invisible(const struct net_device *dev) +{ + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) + netdev_assert_locked(dev); +} + +static inline bool netdev_need_ops_lock(const struct net_device *dev) +{ + bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; + +#if IS_ENABLED(CONFIG_NET_SHAPER) + ret |= !!dev->netdev_ops->net_shaper_ops; +#endif + + return ret; +} + +static inline void netdev_lock_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); +} + +static inline void netdev_unlock_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); +} + +static inline void netdev_lock_ops_to_full(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_lock(dev); +} + +static inline void netdev_unlock_full_to_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_unlock(dev); +} + +static inline void netdev_ops_assert_locked(const struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + lockdep_assert_held(&dev->lock); + else + ASSERT_RTNL(); +} + +static inline void +netdev_ops_assert_locked_or_invisible(const struct net_device *dev) +{ + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) + netdev_ops_assert_locked(dev); +} + +static inline void netdev_lock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); + else + rtnl_lock(); +} + +static inline void netdev_unlock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); + else + rtnl_unlock(); +} + +static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) +{ + if (a == b) + return 0; + + /* Allow locking multiple devices only under rtnl_lock, + * the exact order doesn't matter. + * Note that upper devices don't lock their ops, so nesting + * mostly happens in batched device removal for now. + */ + return lockdep_rtnl_is_held() ? 
-1 : 1; +} + +#define netdev_lockdep_set_classes(dev) \ +{ \ + static struct lock_class_key qdisc_tx_busylock_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + static struct lock_class_key dev_addr_list_lock_key; \ + static struct lock_class_key dev_instance_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ + lockdep_set_class(&(dev)->addr_list_lock, \ + &dev_addr_list_lock_key); \ + lockdep_set_class(&(dev)->lock, \ + &dev_instance_lock_key); \ + lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ + lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ + &qdisc_xmit_lock_key); \ +} + +#define netdev_lock_dereference(p, dev) \ + rcu_dereference_protected(p, lockdep_is_held(&(dev)->lock)) + +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr); + +#endif diff --git a/include/net/netdev_netlink.h b/include/net/netdev_netlink.h new file mode 100644 index 000000000000..075962dbe743 --- /dev/null +++ b/include/net/netdev_netlink.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_NETDEV_NETLINK_H +#define __NET_NETDEV_NETLINK_H + +#include <linux/list.h> + +struct netdev_nl_sock { + struct mutex lock; + struct list_head bindings; +}; + +#endif /* __NET_NETDEV_NETLINK_H */ diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h new file mode 100644 index 000000000000..ba2eaf39089b --- /dev/null +++ b/include/net/netdev_queues.h @@ -0,0 +1,315 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NET_QUEUES_H +#define _LINUX_NET_QUEUES_H + +#include <linux/netdevice.h> + +/** + * struct netdev_config - queue-related configuration for a netdev + * @hds_thresh: HDS Threshold value. + * @hds_config: HDS value from userspace. + */ +struct netdev_config { + u32 hds_thresh; + u8 hds_config; +}; + +/* See the netdev.yaml spec for definition of each statistic */ +struct netdev_queue_stats_rx { + u64 bytes; + u64 packets; + u64 alloc_fail; + + u64 hw_drops; + u64 hw_drop_overruns; + + u64 csum_complete; + u64 csum_unnecessary; + u64 csum_none; + u64 csum_bad; + + u64 hw_gro_packets; + u64 hw_gro_bytes; + u64 hw_gro_wire_packets; + u64 hw_gro_wire_bytes; + + u64 hw_drop_ratelimits; +}; + +struct netdev_queue_stats_tx { + u64 bytes; + u64 packets; + + u64 hw_drops; + u64 hw_drop_errors; + + u64 csum_none; + u64 needs_csum; + + u64 hw_gso_packets; + u64 hw_gso_bytes; + u64 hw_gso_wire_packets; + u64 hw_gso_wire_bytes; + + u64 hw_drop_ratelimits; + + u64 stop; + u64 wake; +}; + +/** + * struct netdev_stat_ops - netdev ops for fine grained stats + * @get_queue_stats_rx: get stats for a given Rx queue + * @get_queue_stats_tx: get stats for a given Tx queue + * @get_base_stats: get base stats (not belonging to any live instance) + * + * Query stats for a given object. The values of the statistics are undefined + * on entry (specifically they are *not* zero-initialized). Drivers should + * assign values only to the statistics they collect. Statistics which are not + * collected must be left undefined. + * + * Queue objects are not necessarily persistent, and only currently active + * queues are queried by the per-queue callbacks. This means that per-queue + * statistics will not generally add up to the total number of events for + * the device. The @get_base_stats callback allows filling in the delta + * between events for currently live queues and overall device history. 
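+ * For example (illustrative numbers): if a queue which had counted
+ * 10 packets is destroyed when the device is reconfigured, the driver
+ * should fold those 10 packets into @get_base_stats so that the base
+ * plus live-queue totals never go backwards.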
+ * @get_base_stats can also be used to report any miscellaneous packets + * transferred outside of the main set of queues used by the networking stack. + * When the statistics for the entire device are queried, first @get_base_stats + * is issued to collect the delta, and then a series of per-queue callbacks. + * Only statistics which are set in @get_base_stats will be reported + * at the device level, meaning that unlike in queue callbacks, setting + * a statistic to zero in @get_base_stats is a legitimate thing to do. + * This is because @get_base_stats has a second function of designating which + * statistics are in fact correct for the entire device (e.g. when history + * for some of the events is not maintained, and reliable "total" cannot + * be provided). + * + * Ops are called under the instance lock if netdev_need_ops_lock() + * returns true, otherwise under rtnl_lock. + * Device drivers can assume that when collecting total device stats, + * the @get_base_stats and subsequent per-queue calls are performed + * "atomically" (without releasing the relevant lock). + * + * Device drivers are encouraged to reset the per-queue statistics when + * number of queues change. This is because the primary use case for + * per-queue statistics is currently to detect traffic imbalance. + */ +struct netdev_stat_ops { + void (*get_queue_stats_rx)(struct net_device *dev, int idx, + struct netdev_queue_stats_rx *stats); + void (*get_queue_stats_tx)(struct net_device *dev, int idx, + struct netdev_queue_stats_tx *stats); + void (*get_base_stats)(struct net_device *dev, + struct netdev_queue_stats_rx *rx, + struct netdev_queue_stats_tx *tx); +}; + +void netdev_stat_queue_sum(struct net_device *netdev, + int rx_start, int rx_end, + struct netdev_queue_stats_rx *rx_sum, + int tx_start, int tx_end, + struct netdev_queue_stats_tx *tx_sum); + +/** + * struct netdev_queue_mgmt_ops - netdev ops for queue management + * + * @ndo_queue_mem_size: Size of the struct that describes a queue's memory. + * + * @ndo_queue_mem_alloc: Allocate memory for an RX queue at the specified index. + * The new memory is written at the specified address. + * + * @ndo_queue_mem_free: Free memory from an RX queue. + * + * @ndo_queue_start: Start an RX queue with the specified memory and at the + * specified index. + * + * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped + * queue's memory is written at the specified address. + * + * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while + * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only + * be called for an interface which is open. + */ +struct netdev_queue_mgmt_ops { + size_t ndo_queue_mem_size; + int (*ndo_queue_mem_alloc)(struct net_device *dev, + void *per_queue_mem, + int idx); + void (*ndo_queue_mem_free)(struct net_device *dev, + void *per_queue_mem); + int (*ndo_queue_start)(struct net_device *dev, + void *per_queue_mem, + int idx); + int (*ndo_queue_stop)(struct net_device *dev, + void *per_queue_mem, + int idx); +}; + +/** + * DOC: Lockless queue stopping / waking helpers. + * + * The netif_txq_maybe_stop() and __netif_txq_completed_wake() + * macros are designed to safely implement stopping + * and waking netdev queues without full lock protection. + * + * We assume that there can be no concurrent stop attempts and no concurrent + * wake attempts. The try-stop should happen from the xmit handler, + * while wake up should be triggered from NAPI poll context. 
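+ *
+ * A minimal sketch of the pairing (hypothetical foo_* names, the
+ * MAX_SKB_FRAGS + 1 thresholds are only illustrative):
+ *
+ *	from ndo_start_xmit, after posting the descriptors:
+ *		netif_txq_maybe_stop(txq, foo_tx_free_desc(ring),
+ *				     MAX_SKB_FRAGS + 1, MAX_SKB_FRAGS + 1);
+ *
+ *	from NAPI poll, after reaping Tx completions:
+ *		netif_txq_completed_wake(txq, pkts, bytes,
+ *					 foo_tx_free_desc(ring),
+ *					 MAX_SKB_FRAGS + 1);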
+ * The two may run concurrently (single producer, single consumer). + * + * The try-stop side is expected to run from the xmit handler and therefore + * it does not reschedule Tx (netif_tx_start_queue() instead of + * netif_tx_wake_queue()). Uses of the ``stop`` macros outside of the xmit + * handler may lead to the xmit queue being enabled but not run. + * The waking side does not have similar context restrictions. + * + * The macros guarantee that rings will not remain stopped if there's + * space available, but they do *not* prevent false wake ups when + * the ring is full! Drivers should check for ring full at the start + * of the xmit handler. + * + * All descriptor ring indexes (and other relevant shared state) must + * be updated before invoking the macros. + */ + +#define netif_txq_try_stop(txq, get_desc, start_thrs) \ + ({ \ + int _res; \ + \ + netif_tx_stop_queue(txq); \ + /* Producer index and stop bit must be visible \ + * to consumer before we recheck. \ + * Pairs with a barrier in __netif_txq_completed_wake(). \ + */ \ + smp_mb__after_atomic(); \ + \ + /* We need to check again in case another \ + * CPU has just made room available. \ + */ \ + _res = 0; \ + if (unlikely(get_desc >= start_thrs)) { \ + netif_tx_start_queue(txq); \ + _res = -1; \ + } \ + _res; \ + }) \ + +/** + * netif_txq_maybe_stop() - locklessly stop a Tx queue, if needed + * @txq: struct netdev_queue to stop/start + * @get_desc: get current number of free descriptors (see requirements below!) + * @stop_thrs: minimal number of available descriptors for queue to be left + * enabled + * @start_thrs: minimal number of descriptors to re-enable the queue, can be + * equal to @stop_thrs or higher to avoid frequent waking + * + * All arguments may be evaluated multiple times; beware of side effects. + * @get_desc must be a formula or a function call; it must always + * return up-to-date information when evaluated! + * Expected to be used from ndo_start_xmit, see the comment on top of the file. + * + * Returns: + * 0 if the queue was stopped + * 1 if the queue was left enabled + * -1 if the queue was re-enabled (raced with waking) + */ +#define netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs) \ + ({ \ + int _res; \ + \ + _res = 1; \ + if (unlikely(get_desc < stop_thrs)) \ + _res = netif_txq_try_stop(txq, get_desc, start_thrs); \ + _res; \ + }) \ + +/* Variant of netdev_tx_completed_queue() which guarantees smp_mb() if + * @bytes != 0, regardless of kernel config. + */ +static inline void +netdev_txq_completed_mb(struct netdev_queue *dev_queue, + unsigned int pkts, unsigned int bytes) +{ + if (IS_ENABLED(CONFIG_BQL)) + netdev_tx_completed_queue(dev_queue, pkts, bytes); + else if (bytes) + smp_mb(); +} + +/** + * __netif_txq_completed_wake() - locklessly wake a Tx queue, if needed + * @txq: struct netdev_queue to stop/start + * @pkts: number of packets completed + * @bytes: number of bytes completed + * @get_desc: get current number of free descriptors (see requirements below!) + * @start_thrs: minimal number of descriptors to re-enable the queue + * @down_cond: down condition, predicate indicating that the queue should + * not be woken up even if descriptors are available + * + * All arguments may be evaluated multiple times. + * @get_desc must be a formula or a function call; it must always + * return up-to-date information when evaluated! + * Reports completed pkts/bytes to BQL. 
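+ * The BQL report also provides the memory barrier that pairs with
+ * netif_txq_try_stop() (see netdev_txq_completed_mb() above), so it
+ * must happen even when BQL is compiled out.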
+ * + * Returns: + * 0 if the queue was woken up + * 1 if the queue was already enabled (or disabled but @down_cond is true) + * -1 if the queue was left unchanged (@start_thrs not reached) + */ +#define __netif_txq_completed_wake(txq, pkts, bytes, \ + get_desc, start_thrs, down_cond) \ + ({ \ + int _res; \ + \ + /* Report to BQL and piggy back on its barrier. \ + * Barrier makes sure that anybody stopping the queue \ + * after this point sees the new consumer index. \ + * Pairs with barrier in netif_txq_try_stop(). \ + */ \ + netdev_txq_completed_mb(txq, pkts, bytes); \ + \ + _res = -1; \ + if (pkts && likely(get_desc >= start_thrs)) { \ + _res = 1; \ + if (unlikely(netif_tx_queue_stopped(txq)) && \ + !(down_cond)) { \ + netif_tx_wake_queue(txq); \ + _res = 0; \ + } \ + } \ + _res; \ + }) + +#define netif_txq_completed_wake(txq, pkts, bytes, get_desc, start_thrs) \ + __netif_txq_completed_wake(txq, pkts, bytes, get_desc, start_thrs, false) + +/* subqueue variants follow */ + +#define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs) \ + ({ \ + struct netdev_queue *_txq; \ + \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_try_stop(_txq, get_desc, start_thrs); \ + }) + +#define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \ + ({ \ + struct netdev_queue *_txq; \ + \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_maybe_stop(_txq, get_desc, stop_thrs, start_thrs); \ + }) + +#define netif_subqueue_completed_wake(dev, idx, pkts, bytes, \ + get_desc, start_thrs) \ + ({ \ + struct netdev_queue *_txq; \ + \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_completed_wake(_txq, pkts, bytes, \ + get_desc, start_thrs); \ + }) + +#endif diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h new file mode 100644 index 000000000000..8cdcd138b33f --- /dev/null +++ b/include/net/netdev_rx_queue.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NETDEV_RX_QUEUE_H +#define _LINUX_NETDEV_RX_QUEUE_H + +#include <linux/kobject.h> +#include <linux/netdevice.h> +#include <linux/sysfs.h> +#include <net/xdp.h> +#include <net/page_pool/types.h> + +/* This structure contains an instance of an RX queue. */ +struct netdev_rx_queue { + struct xdp_rxq_info xdp_rxq; +#ifdef CONFIG_RPS + struct rps_map __rcu *rps_map; + struct rps_dev_flow_table __rcu *rps_flow_table; +#endif + struct kobject kobj; + const struct attribute_group **groups; + struct net_device *dev; + netdevice_tracker dev_tracker; + + /* All fields below are "ops protected", + * see comment about net_device::lock + */ +#ifdef CONFIG_XDP_SOCKETS + struct xsk_buff_pool *pool; +#endif + struct napi_struct *napi; + struct pp_memory_provider_params mp_params; +} ____cacheline_aligned_in_smp; + +/* + * RX queue sysfs structures and functions. 
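+ * These back the per-queue attribute files exposed under
+ * /sys/class/net/<dev>/queues/rx-<n>/ (e.g. rps_cpus).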
+ */ +struct rx_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_rx_queue *queue, char *buf); + ssize_t (*store)(struct netdev_rx_queue *queue, + const char *buf, size_t len); +}; + +static inline struct netdev_rx_queue * +__netif_get_rx_queue(struct net_device *dev, unsigned int rxq) +{ + return dev->_rx + rxq; +} + +static inline unsigned int +get_netdev_rx_queue_index(struct netdev_rx_queue *queue) +{ + struct net_device *dev = queue->dev; + int index = queue - dev->_rx; + + BUG_ON(index >= dev->num_rx_queues); + return index; +} + +int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); + +#endif diff --git a/include/net/netevent.h b/include/net/netevent.h index 4107016c3bb4..1be3757a8b7f 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -14,6 +14,7 @@ struct dst_entry; struct neighbour; +struct notifier_block; struct netevent_redirect { struct dst_entry *old; diff --git a/include/net/netfilter/ipv4/nf_defrag_ipv4.h b/include/net/netfilter/ipv4/nf_defrag_ipv4.h index bcbd724cc048..7fda9ce9f694 100644 --- a/include/net/netfilter/ipv4/nf_defrag_ipv4.h +++ b/include/net/netfilter/ipv4/nf_defrag_ipv4.h @@ -3,6 +3,7 @@ #define _NF_DEFRAG_IPV4_H struct net; -int nf_defrag_ipv4_enable(struct net *); +int nf_defrag_ipv4_enable(struct net *net); +void nf_defrag_ipv4_disable(struct net *net); #endif /* _NF_DEFRAG_IPV4_H */ diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index 40e0e0623f46..c653fcb88354 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -8,8 +8,8 @@ #include <net/netfilter/nf_reject.h> void nf_send_unreach(struct sk_buff *skb_in, int code, int hook); -void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook); - +void nf_send_reset(struct net *net, struct sock *, struct sk_buff *oldskb, + int hook); const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *_oth, int hook); struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, @@ -18,4 +18,14 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, const struct tcphdr *oth); +struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code); +struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook); + + #endif /* _IPV4_NF_REJECT_H */ diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index 7b3c873f8839..e95483192d1b 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -4,7 +4,4 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6; -#include <linux/sysctl.h> -extern struct ctl_table nf_ct_ipv6_sysctl_table[]; - #endif /* _NF_CONNTRACK_IPV6_H */ diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index 6d31cd041143..ceadf8ba25a4 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -5,7 +5,8 @@ #include <linux/skbuff.h> #include <linux/types.h> -int nf_defrag_ipv6_enable(struct net *); +int nf_defrag_ipv6_enable(struct net *net); +void nf_defrag_ipv6_disable(struct net *net); int nf_ct_frag6_init(void); void nf_ct_frag6_cleanup(void); @@ -13,4 +14,9 @@ int 
nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user); struct inet_frags_ctl; +struct nft_ct_frag6_pernet { + struct ctl_table_header *nf_frag_frags_hdr; + struct fqdir *fqdir; +}; + #endif /* _NF_DEFRAG_IPV6_H */ diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index 4a3ef9ebdf6f..d729344ba644 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -7,9 +7,8 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char code, unsigned int hooknum); - -void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook); - +void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, + int hook); const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *otcph, unsigned int *otcplen, int hook); @@ -20,4 +19,13 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, const struct tcphdr *oth, unsigned int otcplen); +struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook); +struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code); + #endif /* _IPV6_NF_REJECT_H */ diff --git a/include/net/netfilter/nf_bpf_link.h b/include/net/netfilter/nf_bpf_link.h new file mode 100644 index 000000000000..6c984b0ea838 --- /dev/null +++ b/include/net/netfilter/nf_bpf_link.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +struct bpf_nf_ctx { + const struct nf_hook_state *state; + struct sk_buff *skb; +}; + +#if IS_ENABLED(CONFIG_NETFILTER_BPF_LINK) +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +#else +static inline int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +#endif diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 439379ca9ffa..3f02a45773e8 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -43,10 +43,27 @@ union nf_conntrack_expect_proto { /* insert expect proto private data here */ }; +struct nf_conntrack_net_ecache { + struct delayed_work dwork; + spinlock_t dying_lock; + struct hlist_nulls_head dying_list; +}; + struct nf_conntrack_net { + /* only used when new connection is allocated: */ + atomic_t count; + unsigned int expect_count; + + /* only used from work queues, configuration plane, and so on: */ unsigned int users4; unsigned int users6; unsigned int users_bridge; +#ifdef CONFIG_SYSCTL + struct ctl_table_header *sysctl_header; +#endif +#ifdef CONFIG_NF_CONNTRACK_EVENTS + struct nf_conntrack_net_ecache ecache; +#endif }; #include <linux/types.h> @@ -62,6 +79,8 @@ struct nf_conn { * Hint, SKB address this struct and refcnt via skb->_nfct and * helpers nf_conntrack_get() and nf_conntrack_put(). * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt, + * except that the latter uses internal indirection and does not + * result in a conntrack module dependency. * beware nf_ct_get() is different and don't inc refcnt. */ struct nf_conntrack ct_general; @@ -80,7 +99,6 @@ struct nf_conn { /* Have we seen traffic both ways yet? 
(bitset) */ unsigned long status; - u16 cpu; possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) @@ -108,6 +126,12 @@ struct nf_conn { }; static inline struct nf_conn * +nf_ct_to_nf_conn(const struct nf_conntrack *nfct) +{ + return container_of(nfct, struct nf_conn, ct_general); +} + +static inline struct nf_conn * nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash) { return container_of(hash, struct nf_conn, @@ -136,10 +160,6 @@ static inline struct net *nf_ct_net(const struct nf_conn *ct) return read_pnet(&ct->ct_net); } -/* Alter reply tuple (maybe alter helper). */ -void nf_conntrack_alter_reply(struct nf_conn *ct, - const struct nf_conntrack_tuple *newreply); - /* Is this tuple taken? (ignoring any belonging to the given conntrack). */ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, @@ -155,17 +175,17 @@ nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) return (struct nf_conn *)(nfct & NFCT_PTRMASK); } +void nf_ct_destroy(struct nf_conntrack *nfct); + +void nf_conntrack_tcp_set_closing(struct nf_conn *ct); + /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { - WARN_ON(!ct); - nf_conntrack_put(&ct->ct_general); + if (ct && refcount_dec_and_test(&ct->ct_general.use)) + nf_ct_destroy(&ct->ct_general); } -/* Protocol module loading */ -int nf_ct_l3proto_try_module_get(unsigned short l3proto); -void nf_ct_l3proto_module_put(unsigned short l3proto); - /* load module; enable/disable conntrack in this namespace */ int nf_ct_netns_get(struct net *net, u8 nfproto); void nf_ct_netns_put(struct net *net, u8 nfproto); @@ -184,8 +204,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_tuple *tuple); void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, - u32 extra_jiffies, bool do_acct); + u32 extra_jiffies, unsigned int bytes); /* Refresh conntrack for this many jiffies and do accounting */ static inline void nf_ct_refresh_acct(struct nf_conn *ct, @@ -193,15 +212,14 @@ static inline void nf_ct_refresh_acct(struct nf_conn *ct, const struct sk_buff *skb, u32 extra_jiffies) { - __nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, true); + __nf_ct_refresh_acct(ct, ctinfo, extra_jiffies, skb->len); } /* Refresh conntrack for this many jiffies */ static inline void nf_ct_refresh(struct nf_conn *ct, - const struct sk_buff *skb, u32 extra_jiffies) { - __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, false); + __nf_ct_refresh_acct(ct, 0, extra_jiffies, 0); } /* kill conntrack and do accounting */ @@ -214,13 +232,16 @@ static inline bool nf_ct_kill(struct nf_conn *ct) return nf_ct_delete(ct, 0, 0); } -/* Set all unconfirmed conntrack as dying */ -void nf_ct_unconfirmed_destroy(struct net *); +struct nf_ct_iter_data { + struct net *net; + void *data; + u32 portid; + int report; +}; /* Iterate over all conntracks: if iter returns true, it's deleted. */ -void nf_ct_iterate_cleanup_net(struct net *net, - int (*iter)(struct nf_conn *i, void *data), - void *data, u32 portid, int report); +void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), + const struct nf_ct_iter_data *iter_data); /* also set unconfirmed conntracks as dying. Only use in module exit path. 
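 *
 * A minimal sketch of an iter callback (hypothetical; the zone id is
 * illustrative), where returning true deletes the entry:
 *
 *	static int untrack_zone(struct nf_conn *ct, void *data)
 *	{
 *		return nf_ct_zone(ct)->id == 42;
 *	}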
*/ void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), @@ -257,19 +278,29 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } +static inline void nf_conntrack_alter_reply(struct nf_conn *ct, + const struct nf_conntrack_tuple *newreply) +{ + /* Must be unconfirmed, so not in hash table yet */ + if (WARN_ON(nf_ct_is_confirmed(ct))) + return; + + ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; +} + #define nfct_time_stamp ((u32)(jiffies)) /* jiffies until ct expires, 0 if already expired */ static inline unsigned long nf_ct_expires(const struct nf_conn *ct) { - s32 timeout = ct->timeout - nfct_time_stamp; + s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; - return timeout > 0 ? timeout : 0; + return max(timeout, 0); } static inline bool nf_ct_is_expired(const struct nf_conn *ct) { - return (__s32)(ct->timeout - nfct_time_stamp) <= 0; + return (__s32)(READ_ONCE(ct->timeout) - nfct_time_stamp) <= 0; } /* use after obtaining a reference count */ @@ -281,16 +312,6 @@ static inline bool nf_ct_should_gc(const struct nf_conn *ct) #define NF_CT_DAY (86400 * HZ) -/* Set an arbitrary timeout large enough not to ever expire, this save - * us a check for the IPS_OFFLOAD_BIT from the packet path via - * nf_ct_is_expired(). - */ -static inline void nf_ct_offload_timeout(struct nf_conn *ct) -{ - if (nf_ct_expires(ct) < NF_CT_DAY / 2) - ct->timeout = nfct_time_stamp + NF_CT_DAY; -} - struct kernel_param; int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp); @@ -324,6 +345,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, void nf_ct_tmpl_free(struct nf_conn *tmpl); u32 nf_ct_get_id(const struct nf_conn *ct); +u32 nf_conntrack_count(const struct net *net); static inline void nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) @@ -331,6 +353,17 @@ nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) skb_set_nfct(skb, (unsigned long)ct | info); } +extern unsigned int nf_conntrack_net_id; + +static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net) +{ + return net_generic(net, nf_conntrack_net_id); +} + +int nf_ct_skb_network_trim(struct sk_buff *skb, int family); +int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, + u16 zone, u8 family, u8 *proto, u16 *mru); + #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v)) diff --git a/include/net/netfilter/nf_conntrack_acct.h b/include/net/netfilter/nf_conntrack_acct.h index 7f44a771530e..a120685cac93 100644 --- a/include/net/netfilter/nf_conntrack_acct.h +++ b/include/net/netfilter/nf_conntrack_acct.h @@ -78,7 +78,4 @@ static inline void nf_ct_acct_update(struct nf_conn *ct, u32 dir, void nf_conntrack_acct_pernet_init(struct net *net); -int nf_conntrack_acct_init(void); -void nf_conntrack_acct_fini(void); - #endif /* _NF_CONNTRACK_ACCT_H */ diff --git a/include/net/netfilter/nf_conntrack_act_ct.h b/include/net/netfilter/nf_conntrack_act_ct.h new file mode 100644 index 000000000000..e5f2f0b73a9a --- /dev/null +++ b/include/net/netfilter/nf_conntrack_act_ct.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _NF_CONNTRACK_ACT_CT_H +#define _NF_CONNTRACK_ACT_CT_H + +#include <net/netfilter/nf_conntrack.h> +#include 
<linux/netfilter/nf_conntrack_common.h> +#include <net/netfilter/nf_conntrack_extend.h> + +struct nf_conn_act_ct_ext { + int ifindex[IP_CT_DIR_MAX]; +}; + +static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_find(const struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NET_ACT_CT) + return nf_ct_ext_find(ct, NF_CT_EXT_ACT_CT); +#else + return NULL; +#endif +} + +static inline void nf_conn_act_ct_ext_fill(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ +#if IS_ENABLED(CONFIG_NET_ACT_CT) + struct nf_conn_act_ct_ext *act_ct_ext; + + act_ct_ext = nf_conn_act_ct_ext_find(ct); + if (dev_net(skb->dev) == &init_net && act_ct_ext) + act_ct_ext->ifindex[CTINFO2DIR(ctinfo)] = skb->dev->ifindex; +#endif +} + +static inline struct +nf_conn_act_ct_ext *nf_conn_act_ct_ext_add(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ +#if IS_ENABLED(CONFIG_NET_ACT_CT) + struct nf_conn_act_ct_ext *act_ct = nf_ct_ext_find(ct, NF_CT_EXT_ACT_CT); + + if (act_ct) + return act_ct; + + act_ct = nf_ct_ext_add(ct, NF_CT_EXT_ACT_CT, GFP_ATOMIC); + nf_conn_act_ct_ext_fill(skb, ct, ctinfo); + return act_ct; +#else + return NULL; +#endif +} + +#endif /* _NF_CONNTRACK_ACT_CT_H */ diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h new file mode 100644 index 000000000000..2d0da478c8e0 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _NF_CONNTRACK_BPF_H +#define _NF_CONNTRACK_BPF_H + +#include <linux/kconfig.h> +#include <net/netfilter/nf_conntrack.h> + +struct nf_conn___init { + struct nf_conn ct; +}; + +#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +extern int register_nf_conntrack_bpf(void); +extern void cleanup_nf_conntrack_bpf(void); + +#else + +static inline int register_nf_conntrack_bpf(void) +{ + return 0; +} + +static inline void cleanup_nf_conntrack_bpf(void) +{ +} + +#endif + +#if (IS_BUILTIN(CONFIG_NF_NAT) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_NF_NAT) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +extern int register_nf_nat_bpf(void); + +#else + +static inline int register_nf_nat_bpf(void) +{ + return 0; +} + +#endif + +#endif /* _NF_CONNTRACK_BPF_H */ diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 09f2efea0b97..3384859a8921 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -30,7 +30,6 @@ void nf_conntrack_cleanup_net(struct net *net); void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list); void nf_conntrack_proto_pernet_init(struct net *net); -void nf_conntrack_proto_pernet_fini(struct net *net); int nf_conntrack_proto_init(void); void nf_conntrack_proto_fini(void); @@ -59,16 +58,20 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) int ret = NF_ACCEPT; if (ct) { - if (!nf_ct_is_confirmed(ct)) + if (!nf_ct_is_confirmed(ct)) { ret = __nf_conntrack_confirm(skb); - if (likely(ret == NF_ACCEPT)) + + if (ret == NF_ACCEPT) + ct = (struct nf_conn *)skb_nfct(skb); + } + + if (ret == NF_ACCEPT && nf_ct_ecache_exist(ct)) nf_ct_deliver_cached_events(ct); } return ret; } -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo); +unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct 
nf_hook_state *state); void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *proto); @@ -80,4 +83,21 @@ void nf_conntrack_lock(spinlock_t *lock); extern spinlock_t nf_conntrack_expect_lock; +/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ + +static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) +{ + if (timeout > INT_MAX) + timeout = INT_MAX; + + if (nf_ct_is_confirmed(ct)) + WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); + else + ct->timeout = (u32)timeout; +} + +int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout); +void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off); +int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status); + #endif /* _NF_CONNTRACK_CORE_H */ diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 9645b47fa7e4..1b58b5b91ff6 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -10,14 +10,13 @@ struct nf_conncount_data; struct nf_conncount_list { spinlock_t list_lock; + u32 last_gc; /* jiffies at most recent gc */ struct list_head head; /* connections with the same filtering key */ unsigned int count; /* length of list */ }; -struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, - unsigned int keylen); -void nf_conncount_destroy(struct net *net, unsigned int family, - struct nf_conncount_data *data); +struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen); +void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data); unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index eb81f9195e28..8dcf7c371ee9 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -12,19 +12,21 @@ #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <net/netfilter/nf_conntrack_extend.h> +#include <asm/local64.h> enum nf_ct_ecache_state { - NFCT_ECACHE_UNKNOWN, /* destroy event not sent */ NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */ NFCT_ECACHE_DESTROY_SENT, /* sent destroy event after failure */ }; struct nf_conntrack_ecache { unsigned long cache; /* bitops want long */ - u16 missed; /* missed events */ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + local64_t timestamp; /* event timestamp, in nanoseconds */ +#endif u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ - enum nf_ct_ecache_state state:8;/* ecache state */ + u32 missed; /* missed events */ u32 portid; /* netlink portid of destroyer */ }; @@ -38,28 +40,12 @@ nf_ct_ecache_find(const struct nf_conn *ct) #endif } -static inline struct nf_conntrack_ecache * -nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) +static inline bool nf_ct_ecache_exist(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - struct net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *e; - - if (!ctmask && !expmask && net->ct.sysctl_events) { - ctmask = ~0; - expmask = ~0; - } - if (!ctmask && !expmask) - return NULL; - - e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); - if (e) { - e->ctmask = ctmask; - e->expmask = expmask; - } - return e; + return nf_ct_ext_exist(ct, 
NF_CT_EXT_ECACHE); #else - return NULL; + return false; #endif } @@ -72,19 +58,26 @@ struct nf_ct_event { int report; }; +struct nf_exp_event { + struct nf_conntrack_expect *exp; + u32 portid; + int report; +}; + struct nf_ct_event_notifier { - int (*fcn)(unsigned int events, struct nf_ct_event *item); + int (*ct_event)(unsigned int events, const struct nf_ct_event *item); + int (*exp_event)(unsigned int events, const struct nf_exp_event *item); }; -int nf_conntrack_register_notifier(struct net *net, - struct nf_ct_event_notifier *nb); -void nf_conntrack_unregister_notifier(struct net *net, - struct nf_ct_event_notifier *nb); +void nf_conntrack_register_notifier(struct net *net, + const struct nf_ct_event_notifier *nb); +void nf_conntrack_unregister_notifier(struct net *net); void nf_ct_deliver_cached_events(struct nf_conn *ct); int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, u32 portid, int report); +bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp); #else static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct) @@ -99,6 +92,10 @@ static inline int nf_conntrack_eventmask_report(unsigned int eventmask, return 0; } +static inline bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) +{ + return false; +} #endif static inline void @@ -115,6 +112,14 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) if (e == NULL) return; +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + /* renew only if this is the first cached event, so that the + * timestamp reflects the first, not the last, generated event. + */ + if (local64_read(&e->timestamp) && READ_ONCE(e->cache) == 0) + local64_set(&e->timestamp, ktime_get_real_ns()); +#endif + set_bit(event, &e->cache); #endif } @@ -124,59 +129,38 @@ nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct, u32 portid, int report) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - const struct net *net = nf_ct_net(ct); - - if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) - return 0; - - return nf_conntrack_eventmask_report(1 << event, ct, portid, report); -#else - return 0; + if (nf_ct_ecache_exist(ct)) + return nf_conntrack_eventmask_report(1 << event, ct, portid, report); #endif + return 0; } static inline int nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - const struct net *net = nf_ct_net(ct); - - if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) - return 0; - - return nf_conntrack_eventmask_report(1 << event, ct, 0, 0); -#else - return 0; + if (nf_ct_ecache_exist(ct)) + return nf_conntrack_eventmask_report(1 << event, ct, 0, 0); #endif + return 0; } #ifdef CONFIG_NF_CONNTRACK_EVENTS - -struct nf_exp_event { - struct nf_conntrack_expect *exp; - u32 portid; - int report; -}; - -struct nf_exp_event_notifier { - int (*fcn)(unsigned int events, struct nf_exp_event *item); -}; - -int nf_ct_expect_register_notifier(struct net *net, - struct nf_exp_event_notifier *nb); -void nf_ct_expect_unregister_notifier(struct net *net, - struct nf_exp_event_notifier *nb); - void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp, u32 portid, int report); +void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state); + void nf_conntrack_ecache_pernet_init(struct net *net); void nf_conntrack_ecache_pernet_fini(struct net *net); -int nf_conntrack_ecache_init(void); -void nf_conntrack_ecache_fini(void); +struct 
nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net); +static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) +{ + return net->ct.ecache_dwork_pending; +} #else /* CONFIG_NF_CONNTRACK_EVENTS */ static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events e, @@ -186,43 +170,18 @@ static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events e, { } -static inline void nf_conntrack_ecache_pernet_init(struct net *net) -{ -} - -static inline void nf_conntrack_ecache_pernet_fini(struct net *net) +static inline void nf_conntrack_ecache_work(struct net *net, + enum nf_ct_ecache_state s) { } -static inline int nf_conntrack_ecache_init(void) +static inline void nf_conntrack_ecache_pernet_init(struct net *net) { - return 0; } -static inline void nf_conntrack_ecache_fini(void) +static inline void nf_conntrack_ecache_pernet_fini(struct net *net) { } - +static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return false; } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ - -static inline void nf_conntrack_ecache_delayed_work(struct net *net) -{ -#ifdef CONFIG_NF_CONNTRACK_EVENTS - if (!delayed_work_pending(&net->ct.ecache_dwork)) { - schedule_delayed_work(&net->ct.ecache_dwork, HZ); - net->ct.ecache_dwork_pending = true; - } -#endif -} - -static inline void nf_conntrack_ecache_work(struct net *net) -{ -#ifdef CONFIG_NF_CONNTRACK_EVENTS - if (net->ct.ecache_dwork_pending) { - net->ct.ecache_dwork_pending = false; - mod_delayed_work(system_wq, &net->ct.ecache_dwork, 0); - } -#endif -} - #endif /*_NF_CONNTRACK_ECACHE_H*/ diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 0855b60fba17..165e7a03b8e9 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -26,6 +26,15 @@ struct nf_conntrack_expect { struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_mask mask; + /* Usage count. */ + refcount_t use; + + /* Flags */ + unsigned int flags; + + /* Expectation class */ + unsigned int class; + /* Function to call after setup and insertion */ void (*expectfn)(struct nf_conn *new, struct nf_conntrack_expect *this); @@ -39,15 +48,6 @@ struct nf_conntrack_expect { /* Timer function; deletes the expectation. */ struct timer_list timeout; - /* Usage count. 
*/ - refcount_t use; - - /* Flags */ - unsigned int flags; - - /* Expectation class */ - unsigned int class; - #if IS_ENABLED(CONFIG_NF_NAT) union nf_inet_addr saved_addr; /* This is the original per-proto part, used to map the @@ -100,7 +100,7 @@ nf_ct_expect_find_get(struct net *net, struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple); + const struct nf_conntrack_tuple *tuple, bool unlink); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report); diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index e1e588387103..0b247248b032 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -28,24 +28,18 @@ enum nf_ct_ext_id { #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) NF_CT_EXT_SYNPROXY, #endif +#if IS_ENABLED(CONFIG_NET_ACT_CT) + NF_CT_EXT_ACT_CT, +#endif NF_CT_EXT_NUM, }; -#define NF_CT_EXT_HELPER_TYPE struct nf_conn_help -#define NF_CT_EXT_NAT_TYPE struct nf_conn_nat -#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj -#define NF_CT_EXT_ACCT_TYPE struct nf_conn_acct -#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache -#define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp -#define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout -#define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels -#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy - /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { u8 offset[NF_CT_EXT_NUM]; u8 len; - char data[]; + unsigned int gen_id; + char data[] __aligned(8); }; static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id) @@ -58,33 +52,28 @@ static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id) return (ct->ext && __nf_ct_ext_exist(ct->ext, id)); } -static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id) +void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id); + +static inline void *nf_ct_ext_find(const struct nf_conn *ct, u8 id) { - if (!nf_ct_ext_exist(ct, id)) + struct nf_ct_ext *ext = ct->ext; + + if (!ext || !__nf_ct_ext_exist(ext, id)) return NULL; + if (unlikely(ext->gen_id)) + return __nf_ct_ext_find(ext, id); + return (void *)ct->ext + ct->ext->offset[id]; } -#define nf_ct_ext_find(ext, id) \ - ((id##_TYPE *)__nf_ct_ext_find((ext), (id))) - -/* Destroy all relationships */ -void nf_ct_ext_destroy(struct nf_conn *ct); /* Add this type, returns pointer to data or NULL. */ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); -struct nf_ct_ext_type { - /* Destroys relationships (can be NULL). */ - void (*destroy)(struct nf_conn *ct); - - enum nf_ct_ext_id id; - - /* Length and min alignment. */ - u8 len; - u8 align; -}; +/* ext genid. if ext->id != ext_genid, extensions cannot be used + * anymore unless conntrack has CONFIRMED bit set. 
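With the old nf_ct_ext_find() macro and its NF_CT_EXT_*_TYPE mappings gone, callers now name the extension type themselves and take the void * result. A sketch of the usual find-or-add pattern, using the accounting extension as the example; demo_acct_ext() is an invented name:

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>

static struct nf_conn_acct *demo_acct_ext(struct nf_conn *ct)
{
        struct nf_conn_acct *acct;

        acct = nf_ct_ext_find(ct, NF_CT_EXT_ACCT);
        if (acct)
                return acct;

        /* extensions may only grow while the entry is unconfirmed */
        if (nf_ct_is_confirmed(ct))
                return NULL;

        return nf_ct_ext_add(ct, NF_CT_EXT_ACCT, GFP_ATOMIC);
}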
+ */ +extern atomic_t nf_conntrack_ext_genid; +void nf_ct_ext_bump_genid(void); -int nf_ct_extend_register(const struct nf_ct_ext_type *type); -void nf_ct_extend_unregister(const struct nf_ct_ext_type *type); #endif /* _NF_CONNTRACK_EXTEND_H */ diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 37f0fbefb060..de2f956abf34 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -115,6 +115,11 @@ struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags); +int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, u16 proto); +int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, + u8 proto, bool nat, struct nf_conntrack_helper **hp); + void nf_ct_helper_destroy(struct nf_conn *ct); static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct) @@ -131,8 +136,6 @@ static inline void *nfct_help_data(const struct nf_conn *ct) return (void *)help->data; } -void nf_conntrack_helper_pernet_init(struct net *net); - int nf_conntrack_helper_init(void); void nf_conntrack_helper_fini(void); diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 88186b95b3c2..1f47bef51722 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -32,7 +32,7 @@ struct nf_conntrack_l4proto { /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct); + struct nf_conn *ct, bool destroy); /* convert nfnetlink attributes to protoinfo */ int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct); @@ -159,22 +159,26 @@ unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL -__printf(3, 4) __cold +__printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, + const struct nf_hook_state *state, const char *fmt, ...); -__printf(5, 6) __cold +__printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, - struct net *net, - u16 pf, u8 protonum, + const struct nf_hook_state *state, + u8 protonum, const char *fmt, ...); #else -static inline __printf(5, 6) __cold -void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, - u16 pf, u8 protonum, const char *fmt, ...) {} -static inline __printf(3, 4) __cold +static inline __printf(4, 5) __cold +void nf_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_hook_state *state, + u8 protonum, + const char *fmt, ...) {} +static inline __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, + const struct nf_hook_state *state, const char *fmt, ...) { } #endif /* CONFIG_SYSCTL */ @@ -203,6 +207,20 @@ static inline struct nf_icmp_net *nf_icmpv6_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmpv6; } + +/* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ +static inline void nf_ct_set_tcp_be_liberal(struct nf_conn *ct) +{ + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; +} + +/* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. 
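A short usage sketch for nf_ct_set_tcp_be_liberal() above and nf_conntrack_tcp_established() defined just below, honouring the protocol check both comments demand; ct is assumed to be a struct nf_conn * obtained elsewhere, e.g. via nf_ct_get():

if (nf_ct_protonum(ct) == IPPROTO_TCP) {
        /* tolerate out-of-window packets on this flow */
        nf_ct_set_tcp_be_liberal(ct);

        if (nf_conntrack_tcp_established(ct))
                pr_debug("flow is established and assured\n");
}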
*/ +static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct) +{ + return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED && + test_bit(IPS_ASSURED_BIT, &ct->status); +} #endif #ifdef CONFIG_NF_CT_PROTO_DCCP diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index ba916411c4e1..6903f72bcc15 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -17,10 +17,18 @@ struct nf_conn_labels { unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)]; }; +/* Can't use nf_ct_ext_find(), flow dissector cannot use symbols + * exported by nf_conntrack module. + */ static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_LABELS - return nf_ct_ext_find(ct, NF_CT_EXT_LABELS); + struct nf_ct_ext *ext = ct->ext; + + if (!ext || !__nf_ct_ext_exist(ext, NF_CT_EXT_LABELS)) + return NULL; + + return (void *)ct->ext + ct->ext->offset[NF_CT_EXT_LABELS]; #else return NULL; #endif @@ -31,7 +39,7 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_LABELS struct net *net = nf_ct_net(ct); - if (net->ct.labels_used == 0) + if (atomic_read(&net->ct.labels_used) == 0) return NULL; return nf_ct_ext_add(ct, NF_CT_EXT_LABELS, GFP_ATOMIC); @@ -44,13 +52,9 @@ int nf_connlabels_replace(struct nf_conn *ct, const u32 *data, const u32 *mask, unsigned int words); #ifdef CONFIG_NF_CONNTRACK_LABELS -int nf_conntrack_labels_init(void); -void nf_conntrack_labels_fini(void); int nf_connlabels_get(struct net *net, unsigned int bit); void nf_connlabels_put(struct net *net); #else -static inline int nf_conntrack_labels_init(void) { return 0; } -static inline void nf_conntrack_labels_fini(void) {} static inline int nf_connlabels_get(struct net *net, unsigned int bit) { return 0; } static inline void nf_connlabels_put(struct net *net) {} #endif diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h index 0a10b50537ae..883c414b768e 100644 --- a/include/net/netfilter/nf_conntrack_seqadj.h +++ b/include/net/netfilter/nf_conntrack_seqadj.h @@ -42,7 +42,4 @@ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff); s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir, u32 seq); -int nf_conntrack_seqadj_init(void); -void nf_conntrack_seqadj_fini(void); - #endif /* _NF_CONNTRACK_SEQADJ_H */ diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 659b0ea25b4d..9fdaba911de6 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -17,14 +17,6 @@ struct nf_ct_timeout { char data[]; }; -struct ctnl_timeout { - struct list_head head; - struct rcu_head rcu_head; - refcount_t refcnt; - char name[CTNL_TIMEOUT_NAME_MAX]; - struct nf_ct_timeout timeout; -}; - struct nf_conn_timeout { struct nf_ct_timeout __rcu *timeout; }; @@ -89,23 +81,11 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -int nf_conntrack_timeout_init(void); -void nf_conntrack_timeout_fini(void); void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name); void nf_ct_destroy_timeout(struct nf_conn *ct); #else -static inline int 
nf_conntrack_timeout_init(void) -{ - return 0; -} - -static inline void nf_conntrack_timeout_fini(void) -{ - return; -} - static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name) @@ -120,8 +100,12 @@ static inline void nf_ct_destroy_timeout(struct nf_conn *ct) #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -extern struct nf_ct_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name); -extern void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout); +struct nf_ct_timeout_hooks { + struct nf_ct_timeout *(*timeout_find_get)(struct net *net, const char *name); + void (*timeout_put)(struct nf_ct_timeout *timeout); +}; + +extern const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook; #endif #endif /* _NF_CONNTRACK_TIMEOUT_H */ diff --git a/include/net/netfilter/nf_conntrack_timestamp.h b/include/net/netfilter/nf_conntrack_timestamp.h index 820ea34b6029..57138d974a9f 100644 --- a/include/net/netfilter/nf_conntrack_timestamp.h +++ b/include/net/netfilter/nf_conntrack_timestamp.h @@ -40,21 +40,8 @@ struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp) #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP void nf_conntrack_tstamp_pernet_init(struct net *net); - -int nf_conntrack_tstamp_init(void); -void nf_conntrack_tstamp_fini(void); #else static inline void nf_conntrack_tstamp_pernet_init(struct net *net) {} - -static inline int nf_conntrack_tstamp_init(void) -{ - return 0; -} - -static inline void nf_conntrack_tstamp_fini(void) -{ - return; -} #endif /* CONFIG_NF_CONNTRACK_TIMESTAMP */ #endif /* _NF_CONNTRACK_TSTAMP_H */ diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h index 9334371c94e2..f7dd950ff250 100644 --- a/include/net/netfilter/nf_conntrack_tuple.h +++ b/include/net/netfilter/nf_conntrack_tuple.h @@ -67,6 +67,9 @@ struct nf_conntrack_tuple { /* The protocol. 
*/ u_int8_t protonum; + /* The direction must be ignored for the tuplehash */ + struct { } __nfct_hash_offsetend; + /* The direction (for tuplehash) */ u_int8_t dir; } dst; diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 16e8b2f8d006..d711642e78b5 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -10,6 +10,8 @@ #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <net/flow_offload.h> #include <net/dst.h> +#include <linux/if_pppox.h> +#include <linux/ppp_defs.h> struct nf_flowtable; struct nf_flow_rule; @@ -21,6 +23,8 @@ struct nf_flow_key { struct flow_dissector_key_control control; struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; + struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_vlan cvlan; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; @@ -49,14 +53,17 @@ struct nf_flowtable_type { struct list_head list; int family; int (*init)(struct nf_flowtable *ft); + bool (*gc)(const struct flow_offload *flow); int (*setup)(struct nf_flowtable *ft, struct net_device *dev, enum flow_block_command cmd); int (*action)(struct net *net, - const struct flow_offload *flow, + struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule); void (*free)(struct nf_flowtable *ft); + void (*get)(struct nf_flowtable *ft); + void (*put)(struct nf_flowtable *ft); nf_hookfn *hook; struct module *owner; }; @@ -67,12 +74,13 @@ enum nf_flowtable_flags { }; struct nf_flowtable { - struct list_head list; - struct rhashtable rhashtable; - int priority; + unsigned int flags; /* readonly in datapath */ + int priority; /* control path (padding hole) */ + struct rhashtable rhashtable; /* datapath, read-mostly members come first */ + + struct list_head list; /* slowpath parts */ const struct nf_flowtable_type *type; struct delayed_work gc_work; - unsigned int flags; struct flow_block flow_block; struct rw_semaphore flow_block_lock; /* Guards flow_block */ possible_net_t net; @@ -86,9 +94,19 @@ static inline bool nf_flowtable_hw_offload(struct nf_flowtable *flowtable) enum flow_offload_tuple_dir { FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, - FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX +}; +#define FLOW_OFFLOAD_DIR_MAX IP_CT_DIR_MAX + +enum flow_offload_xmit_type { + FLOW_OFFLOAD_XMIT_UNSPEC = 0, + FLOW_OFFLOAD_XMIT_NEIGH, + FLOW_OFFLOAD_XMIT_XFRM, + FLOW_OFFLOAD_XMIT_DIRECT, + FLOW_OFFLOAD_XMIT_TC, }; +#define NF_FLOW_TABLE_ENCAP_MAX 2 + struct flow_offload_tuple { union { struct in_addr src_v4; @@ -107,11 +125,34 @@ struct flow_offload_tuple { u8 l3proto; u8 l4proto; - u8 dir; + struct { + u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; - u16 mtu; + /* All members above are keys for lookups, see flow_offload_hash(). 
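Both __nfct_hash_offsetend above and the __hash marker declared right after this comment rely on the same zero-size-member trick: offsetof() against the marker yields the length of the key prefix, so key fields can be appended without touching the hash function. Roughly, for a jhash-based lookup in the spirit of flow_offload_hash():

#include <linux/jhash.h>
#include <linux/stddef.h>

static u32 demo_tuple_hash(const struct flow_offload_tuple *tuple, u32 seed)
{
        /* hash everything up to, but not including, the marker */
        return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}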
*/ + struct { } __hash; - struct dst_entry *dst_cache; + u8 dir:2, + xmit_type:3, + encap_num:2, + in_vlan_ingress:2; + u16 mtu; + union { + struct { + struct dst_entry *dst_cache; + u32 dst_cookie; + }; + struct { + u32 ifidx; + u32 hw_ifidx; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + } out; + struct { + u32 iifidx; + } tc; + }; }; struct flow_offload_tuple_rhash { @@ -122,12 +163,14 @@ struct flow_offload_tuple_rhash { enum nf_flow_flags { NF_FLOW_SNAT, NF_FLOW_DNAT, + NF_FLOW_CLOSING, NF_FLOW_TEARDOWN, NF_FLOW_HW, NF_FLOW_HW_DYING, NF_FLOW_HW_DEAD, - NF_FLOW_HW_REFRESH, NF_FLOW_HW_PENDING, + NF_FLOW_HW_BIDIRECTIONAL, + NF_FLOW_HW_ESTABLISHED, }; enum flow_offload_type { @@ -147,6 +190,8 @@ struct flow_offload { #define NF_FLOW_TIMEOUT (30 * HZ) #define nf_flowtable_time_stamp (u32)jiffies +unsigned long flow_offload_get_timeout(struct flow_offload *flow); + static inline __s32 nf_flow_timeout_delta(unsigned int timeout) { return (__s32)(timeout - nf_flowtable_time_stamp); @@ -154,7 +199,23 @@ static inline __s32 nf_flow_timeout_delta(unsigned int timeout) struct nf_flow_route { struct { - struct dst_entry *dst; + struct dst_entry *dst; + struct { + u32 ifindex; + struct { + u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; + u8 num_encaps:2, + ingress_vlans:2; + } in; + struct { + u32 ifindex; + u32 hw_ifindex; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + } out; + enum flow_offload_xmit_type xmit_type; } tuple[FLOW_OFFLOAD_DIR_MAX]; }; @@ -183,6 +244,11 @@ nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, } list_add_tail(&block_cb->list, &block->cb_list); + up_write(&flow_table->flow_block_lock); + + if (flow_table->type->get) + flow_table->type->get(flow_table); + return 0; unlock: up_write(&flow_table->flow_block_lock); @@ -205,17 +271,21 @@ nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, WARN_ON(true); } up_write(&flow_table->flow_block_lock); + + if (flow_table->type->put) + flow_table->type->put(flow_table); } -int flow_offload_route_init(struct flow_offload *flow, - const struct nf_flow_route *route); +void flow_offload_route_init(struct flow_offload *flow, + struct nf_flow_route *route); int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); void flow_offload_refresh(struct nf_flowtable *flow_table, - struct flow_offload *flow); + struct flow_offload *flow, bool force); struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple); +void nf_flow_table_gc_run(struct nf_flowtable *flow_table); void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, struct net_device *dev); void nf_flow_table_cleanup(struct net_device *dev); @@ -225,22 +295,37 @@ void nf_flow_table_free(struct nf_flowtable *flow_table); void flow_offload_teardown(struct flow_offload *flow); -int nf_flow_snat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir); -int nf_flow_dnat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir); +void nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir); +void nf_flow_dnat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir); struct flow_ports { __be16 source, dest; }; +struct nf_flowtable *nf_flowtable_by_dev(const struct 
net_device *dev); +int nf_flow_offload_xdp_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd); + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); +#if (IS_BUILTIN(CONFIG_NF_FLOW_TABLE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_NF_FLOW_TABLE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) +extern int nf_flow_register_bpf(void); +#else +static inline int nf_flow_register_bpf(void) +{ + return 0; +} +#endif + #define MODULE_ALIAS_NF_FLOWTABLE(family) \ MODULE_ALIAS("nf-flowtable-" __stringify(family)) @@ -252,17 +337,66 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, struct flow_offload *flow); void nf_flow_table_offload_flush(struct nf_flowtable *flowtable); +void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable); + int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, struct net_device *dev, enum flow_block_command cmd); -int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule); -int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule); int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); +static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) +{ + __be16 proto; + + proto = *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + + sizeof(struct pppoe_hdr))); + switch (proto) { + case htons(PPP_IP): + return htons(ETH_P_IP); + case htons(PPP_IPV6): + return htons(ETH_P_IPV6); + } + + return 0; +} + +static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) +{ + if (!pskb_may_pull(skb, PPPOE_SES_HLEN)) + return false; + + *inner_proto = __nf_flow_pppoe_proto(skb); + + return true; +} + +#define NF_FLOW_TABLE_STAT_INC(net, count) __this_cpu_inc((net)->ft.stat->count) +#define NF_FLOW_TABLE_STAT_DEC(net, count) __this_cpu_dec((net)->ft.stat->count) +#define NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count) \ + this_cpu_inc((net)->ft.stat->count) +#define NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count) \ + this_cpu_dec((net)->ft.stat->count) + +#ifdef CONFIG_NF_FLOW_TABLE_PROCFS +int nf_flow_table_init_proc(struct net *net); +void nf_flow_table_fini_proc(struct net *net); +#else +static inline int nf_flow_table_init_proc(struct net *net) +{ + return 0; +} + +static inline void nf_flow_table_fini_proc(struct net *net) +{ +} +#endif /* CONFIG_NF_FLOW_TABLE_PROCFS */ + #endif /* _NF_FLOW_TABLE_H */ diff --git a/include/net/netfilter/nf_hooks_lwtunnel.h b/include/net/netfilter/nf_hooks_lwtunnel.h new file mode 100644 index 000000000000..cef7a4eb8f97 --- /dev/null +++ b/include/net/netfilter/nf_hooks_lwtunnel.h @@ -0,0 +1,7 @@ +#include <linux/sysctl.h> +#include <linux/types.h> + +#ifdef CONFIG_SYSCTL +int nf_hooks_lwtunnel_sysctl_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +#endif diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 716db4a0fed8..e55eedc84ed7 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -68,7 +68,6 @@ void nf_log_unbind_pf(struct net *net, u_int8_t 
pf); int nf_logger_find_get(int pf, enum nf_log_type type); void nf_logger_put(int pf, enum nf_log_type type); -void nf_logger_request_module(int pf, enum nf_log_type type); #define MODULE_ALIAS_NF_LOGGER(family, type) \ MODULE_ALIAS("nf-logger-" __stringify(family) "-" __stringify(type)) @@ -99,28 +98,4 @@ struct nf_log_buf; struct nf_log_buf *nf_log_buf_open(void); __printf(2, 3) int nf_log_buf_add(struct nf_log_buf *m, const char *f, ...); void nf_log_buf_close(struct nf_log_buf *m); - -/* common logging functions */ -int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset); -int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset, - unsigned int logflags); -void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, - struct sock *sk); -void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb); -void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix); -void nf_log_l2packet(struct net *net, u_int8_t pf, - __be16 protocol, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, const char *prefix); - #endif /* _NF_LOG_H */ diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 0d412dd63707..9877f064548a 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -104,9 +104,11 @@ unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); -int nf_xfrm_me_harder(struct net *n, struct sk_buff *s, unsigned int family); +int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, bool commit); -static inline int nf_nat_initialized(struct nf_conn *ct, +static inline int nf_nat_initialized(const struct nf_conn *ct, enum nf_nat_manip_type manip) { if (manip == NF_NAT_MANIP_SRC) diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h index efae84646353..44c421b9be85 100644 --- a/include/net/netfilter/nf_nat_helper.h +++ b/include/net/netfilter/nf_nat_helper.h @@ -38,4 +38,5 @@ bool nf_nat_mangle_udp_packet(struct sk_buff *skb, struct nf_conn *ct, * to port ct->master->saved_proto. 
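nf_nat_exp_find_port(), declared just below, centralises the free-port search that individual NAT helpers used to open-code. A hedged sketch of the calling pattern inside a helper's expectation setup; demo_nat_expected() and the surrounding logic are assumptions, only the nf_nat_exp_find_port() call itself reflects this diff:

static unsigned int demo_nat_expected(struct sk_buff *skb,
                                      struct nf_conntrack_expect *exp)
{
        /* try the port the peer asked for first, then scan for a free one */
        u16 port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port));

        if (port == 0)
                return NF_DROP;         /* all ports in use */

        /* ... mangle the payload to advertise htons(port) ... */
        return NF_ACCEPT;
}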
*/ void nf_nat_follow_master(struct nf_conn *ct, struct nf_conntrack_expect *this); +u16 nf_nat_exp_find_port(struct nf_conntrack_expect *exp, u16 port); #endif diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h index 2418653a66db..279380de904c 100644 --- a/include/net/netfilter/nf_nat_redirect.h +++ b/include/net/netfilter/nf_nat_redirect.h @@ -6,8 +6,7 @@ #include <uapi/linux/netfilter/nf_nat.h> unsigned int -nf_nat_redirect_ipv4(struct sk_buff *skb, - const struct nf_nat_ipv4_multi_range_compat *mr, +nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum); unsigned int nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index e770bba00066..4aeffddb7586 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -33,17 +33,16 @@ struct nf_queue_handler { void (*nf_hook_drop)(struct net *net); }; -void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh); -void nf_unregister_queue_handler(struct net *net); -void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); +void nf_register_queue_handler(const struct nf_queue_handler *qh); +void nf_unregister_queue_handler(void); -void nf_queue_entry_get_refs(struct nf_queue_entry *entry); +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) { while (*jhash_initval == 0) - *jhash_initval = prandom_u32(); + *jhash_initval = get_random_u32(); } static inline u32 hash_v4(const struct iphdr *iph, u32 initval) diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h index 9051c3a0c8e7..7c669792fb9c 100644 --- a/include/net/netfilter/nf_reject.h +++ b/include/net/netfilter/nf_reject.h @@ -5,12 +5,28 @@ #include <linux/types.h> #include <uapi/linux/in.h> -static inline bool nf_reject_verify_csum(__u8 proto) +static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff, + __u8 proto) { /* Skip protocols that don't use 16-bit one's complement checksum * of the entire payload. */ switch (proto) { + /* Protocols with optional checksums. */ + case IPPROTO_UDP: { + const struct udphdr *udp_hdr; + struct udphdr _udp_hdr; + + udp_hdr = skb_header_pointer(skb, dataoff, + sizeof(_udp_hdr), + &_udp_hdr); + if (!udp_hdr || udp_hdr->check) + return true; + + return false; + } + case IPPROTO_GRE: + /* Protocols with other integrity checks. */ case IPPROTO_AH: case IPPROTO_ESP: @@ -19,9 +35,6 @@ static inline bool nf_reject_verify_csum(__u8 proto) /* Protocols with partial checksums. */ case IPPROTO_UDPLITE: case IPPROTO_DCCP: - - /* Protocols with optional checksums. 
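nf_reject_verify_csum() now takes the skb and data offset so it can look inside the UDP header: a zero checksum field means "no checksum" on UDPv4, in which case there is nothing to verify. A sketch of the caller-side pattern; demo_can_reject() is an invented name:

#include <linux/netfilter_ipv4.h>
#include <net/netfilter/nf_reject.h>

static bool demo_can_reject(struct sk_buff *skb, unsigned int hook,
                            int dataoff, __u8 proto)
{
        /* no meaningful L4 checksum: safe to generate the reject */
        if (!nf_reject_verify_csum(skb, dataoff, proto))
                return true;

        /* otherwise only answer packets whose checksum verifies */
        return nf_ip_checksum(skb, hook, dataoff, proto) == 0;
}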
*/ - case IPPROTO_GRE: return false; } return true; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 55b4cadf290a..e4d8e451e935 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -2,7 +2,7 @@ #ifndef _NET_NF_TABLES_H #define _NET_NF_TABLES_H -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/list.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> @@ -13,6 +13,7 @@ #include <net/netfilter/nf_flow_table.h> #include <net/netlink.h> #include <net/flow_offload.h> +#include <net/netns/generic.h> #define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) @@ -20,37 +21,55 @@ struct module; #define NFT_JUMP_STACK_SIZE 16 +enum { + NFT_PKTINFO_L4PROTO = (1 << 0), + NFT_PKTINFO_INNER = (1 << 1), + NFT_PKTINFO_INNER_FULL = (1 << 2), +}; + struct nft_pktinfo { struct sk_buff *skb; - bool tprot_set; + const struct nf_hook_state *state; + u8 flags; u8 tprot; - /* for x_tables compatibility */ - struct xt_action_param xt; + u16 fragoff; + u16 thoff; + u16 inneroff; }; +static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) +{ + return pkt->state->sk; +} + +static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt) +{ + return pkt->thoff; +} + static inline struct net *nft_net(const struct nft_pktinfo *pkt) { - return pkt->xt.state->net; + return pkt->state->net; } static inline unsigned int nft_hook(const struct nft_pktinfo *pkt) { - return pkt->xt.state->hook; + return pkt->state->hook; } static inline u8 nft_pf(const struct nft_pktinfo *pkt) { - return pkt->xt.state->pf; + return pkt->state->pf; } static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt) { - return pkt->xt.state->in; + return pkt->state->in; } static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt) { - return pkt->xt.state->out; + return pkt->state->out; } static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, @@ -58,16 +77,15 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, const struct nf_hook_state *state) { pkt->skb = skb; - pkt->xt.state = state; + pkt->state = state; } -static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt) { - pkt->tprot_set = false; + pkt->flags = 0; pkt->tprot = 0; - pkt->xt.thoff = 0; - pkt->xt.fragoff = 0; + pkt->thoff = 0; + pkt->fragoff = 0; } /** @@ -88,6 +106,8 @@ struct nft_data { }; } __attribute__((aligned(__alignof__(u64)))); +#define NFT_REG32_NUM 20 + /** * struct nft_regs - nf_tables register set * @@ -98,11 +118,22 @@ struct nft_data { */ struct nft_regs { union { - u32 data[20]; + u32 data[NFT_REG32_NUM]; struct nft_verdict verdict; }; }; +struct nft_regs_track { + struct { + const struct nft_expr *selector; + const struct nft_expr *bitwise; + u8 num_reg; + } regs[NFT_REG32_NUM]; + + const struct nft_expr *cur; + const struct nft_expr *last; +}; + /* Store/load an u8, u16 or u64 integer to/from the u32 data register. 
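A sketch of how the register helpers in the lines that follow are meant to be paired, a store always matched with a same-width load; index 4 is just an arbitrary 32-bit slot and demo_reg_roundtrip() an invented name:

#include <linux/if_ether.h>
#include <net/netfilter/nf_tables.h>

static __be16 demo_reg_roundtrip(void)
{
        u32 regs[NFT_REG32_NUM] = {};

        /* a 16-bit value occupies the low half of one 32-bit slot;
         * the _be16 variants keep sparse happy about byte order
         */
        nft_reg_store_be16(&regs[4], htons(ETH_P_IP));

        return nft_reg_load_be16(&regs[4]);
}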
* * Note, when using concatenations, register allocation happens at 32-bit @@ -127,14 +158,29 @@ static inline void nft_reg_store16(u32 *dreg, u16 val) *(u16 *)dreg = val; } +static inline void nft_reg_store_be16(u32 *dreg, __be16 val) +{ + nft_reg_store16(dreg, (__force __u16)val); +} + static inline u16 nft_reg_load16(const u32 *sreg) { return *(u16 *)sreg; } -static inline void nft_reg_store64(u32 *dreg, u64 val) +static inline __be16 nft_reg_load_be16(const u32 *sreg) +{ + return (__force __be16)nft_reg_load16(sreg); +} + +static inline __be32 nft_reg_load_be32(const u32 *sreg) +{ + return *(__force __be32 *)sreg; +} + +static inline void nft_reg_store64(u64 *dreg, u64 val) { - put_unaligned(val, (u64 *)dreg); + put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) @@ -159,9 +205,11 @@ static inline void nft_data_copy(u32 *dst, const struct nft_data *src, * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number + * @flags: modifiers to new request * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message + * @reg_inited: bitmap of initialised registers */ struct nft_ctx { struct net *net; @@ -174,15 +222,21 @@ struct nft_ctx { u8 family; u8 level; bool report; + DECLARE_BITMAP(reg_inited, NFT_REG32_NUM); +}; + +enum nft_data_desc_flags { + NFT_DATA_DESC_SETELEM = (1 << 0), }; struct nft_data_desc { enum nft_data_types type; + unsigned int size; unsigned int len; + unsigned int flags; }; -int nft_data_init(const struct nft_ctx *ctx, - struct nft_data *data, unsigned int size, +int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); @@ -200,14 +254,14 @@ static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) } int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); -unsigned int nft_parse_register(const struct nlattr *attr); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); -int nft_validate_register_load(enum nft_registers reg, unsigned int len); -int nft_validate_register_store(const struct nft_ctx *ctx, - enum nft_registers reg, - const struct nft_data *data, - enum nft_data_types type, unsigned int len); +int nft_parse_register_load(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *sreg, u32 len); +int nft_parse_register_store(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *dreg, + const struct nft_data *data, + enum nft_data_types type, unsigned int len); /** * struct nft_userdata - user defined data associated with an object @@ -224,11 +278,15 @@ struct nft_userdata { unsigned char data[]; }; +/* placeholder structure for opaque set element backend representation. 
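The empty nft_elem_priv defined just below is a type-safety device: set backends embed it as the first member of their real element type and convert back with nft_elem_priv_cast() instead of passing bare void * around. A sketch with an invented demo_elem backend:

struct demo_elem {
        struct nft_elem_priv    priv;   /* must stay first for the cast below */
        struct nft_set_ext      ext;
};

static struct demo_elem *to_demo_elem(const struct nft_elem_priv *priv)
{
        return nft_elem_priv_cast(priv);
}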
*/ +struct nft_elem_priv { }; + /** * struct nft_set_elem - generic representation of set elements * * @key: element key * @key_end: closing element key + * @data: element data * @priv: element private data and extensions */ struct nft_set_elem { @@ -244,35 +302,67 @@ struct nft_set_elem { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; - void *priv; + struct nft_elem_priv *priv; +}; + +static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) +{ + return (void *)priv; +} + + +/** + * enum nft_iter_type - nftables set iterator type + * + * @NFT_ITER_UNSPEC: unspecified, to catch errors + * @NFT_ITER_READ: read-only iteration over set elements + * @NFT_ITER_UPDATE: iteration under mutex to update set element state + */ +enum nft_iter_type { + NFT_ITER_UNSPEC, + NFT_ITER_READ, + NFT_ITER_UPDATE, }; struct nft_set; struct nft_set_iter { u8 genmask; + enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; int (*fn)(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem); + struct nft_elem_priv *elem_priv); }; /** * struct nft_set_desc - description of set elements * + * @ktype: key type * @klen: key length + * @dtype: data type * @dlen: data length + * @objtype: object type * @size: number of set elements + * @policy: set policy + * @gc_int: garbage collector interval + * @timeout: element timeout * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions */ struct nft_set_desc { + u32 ktype; unsigned int klen; + u32 dtype; unsigned int dlen; + u32 objtype; unsigned int size; + u32 policy; + u32 gc_int; + u64 timeout; u8 field_len[NFT_REG32_COUNT]; u8 field_count; bool expr; @@ -281,9 +371,9 @@ struct nft_set_desc { /** * enum nft_set_class - performance class * - * @NFT_LOOKUP_O_1: constant, O(1) - * @NFT_LOOKUP_O_LOG_N: logarithmic, O(log N) - * @NFT_LOOKUP_O_N: linear, O(N) + * @NFT_SET_CLASS_O_1: constant, O(1) + * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N) + * @NFT_SET_CLASS_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, @@ -305,8 +395,39 @@ struct nft_set_estimate { enum nft_set_class space; }; +#define NFT_EXPR_MAXATTR 16 +#define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ + ALIGN(size, __alignof__(struct nft_expr))) + +/** + * struct nft_expr - nf_tables expression + * + * @ops: expression ops + * @data: expression private data + */ +struct nft_expr { + const struct nft_expr_ops *ops; + unsigned char data[] + __attribute__((aligned(__alignof__(u64)))); +}; + +static inline void *nft_expr_priv(const struct nft_expr *expr) +{ + return (void *)expr->data; +} + +struct nft_expr_info; + +int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, + struct nft_expr_info *info); +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); +void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); +int nft_expr_dump(struct sk_buff *skb, unsigned int attr, + const struct nft_expr *expr, bool reset); +bool nft_expr_reduce_bitwise(struct nft_regs_track *track, + const struct nft_expr *expr); + struct nft_set_ext; -struct nft_expr; /** * struct nft_set_ops - nf_tables set operations @@ -321,9 +442,16 @@ struct nft_expr; * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements + * @ksize: kernel set size + * @usize: userspace set size + * @adjust_maxsize: delta to adjust 
maximum set size + * @commit: commit set elements + * @abort: abort set elements * @privsize: function to return size of set private data + * @estimate: estimate the required memory size and the lookup complexity class * @init: initialize private data of new set instance * @destroy: destroy private data of set instance + * @gc_init: initialize garbage collection * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster @@ -337,7 +465,8 @@ struct nft_set_ops { const struct nft_set_ext **ext); bool (*update)(struct nft_set *set, const u32 *key, - void *(*new)(struct nft_set *, + struct nft_elem_priv * + (*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *), const struct nft_expr *expr, @@ -349,27 +478,31 @@ struct nft_set_ops { int (*insert)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext); + struct nft_elem_priv **priv); void (*activate)(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem); - void * (*deactivate)(const struct net *net, + struct nft_elem_priv *elem_priv); + struct nft_elem_priv * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); - bool (*flush)(const struct net *net, + void (*flush)(const struct net *net, const struct nft_set *set, - void *priv); + struct nft_elem_priv *priv); void (*remove)(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem); + struct nft_elem_priv *elem_priv); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); - void * (*get)(const struct net *net, + struct nft_elem_priv * (*get)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); - + u32 (*ksize)(u32 size); + u32 (*usize)(u32 size); + u32 (*adjust_maxsize)(const struct nft_set *set); + void (*commit)(struct nft_set *set); + void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, @@ -378,7 +511,8 @@ struct nft_set_ops { int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); - void (*destroy)(const struct nft_set *set); + void (*destroy)(const struct nft_ctx *ctx, + const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; @@ -396,11 +530,28 @@ struct nft_set_type { }; #define to_set_type(o) container_of(o, struct nft_set_type, ops) +struct nft_set_elem_expr { + u8 size; + unsigned char data[] + __attribute__((aligned(__alignof__(struct nft_expr)))); +}; + +#define nft_setelem_expr_at(__elem_expr, __offset) \ + ((struct nft_expr *)&__elem_expr->data[__offset]) + +#define nft_setelem_expr_foreach(__expr, __elem_expr, __size) \ + for (__expr = nft_setelem_expr_at(__elem_expr, 0), __size = 0; \ + __size < (__elem_expr)->size; \ + __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size) + +#define NFT_SET_EXPR_MAX 2 + /** * struct nft_set - nf_tables set instance * * @list: table set list node * @bindings: list of set bindings + * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set @@ -419,17 +570,22 @@ struct nft_set_type { * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data - * @expr: stateful 
expression + * @pending_update: list of pending update set element * @ops: set ops * @flags: set flags + * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length + * @num_exprs: numbers of exprs + * @exprs: stateful expression + * @catchall_list: list of catch-all set element * @data: private set data */ struct nft_set { struct list_head list; struct list_head bindings; + refcount_t refs; struct nft_table *table; possible_net_t net; char *name; @@ -448,13 +604,17 @@ struct nft_set { u16 policy; u16 udlen; unsigned char *udata; - struct nft_expr *expr; + struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:14, + u16 flags:13, + dead:1, genmask:2; u8 klen; u8 dlen; + u8 num_exprs; + struct nft_expr *exprs[NFT_SET_EXPR_MAX]; + struct list_head catchall_list; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; @@ -469,6 +629,16 @@ static inline void *nft_set_priv(const struct nft_set *set) return (void *)set->data; } +static inline enum nft_data_types nft_set_datatype(const struct nft_set *set) +{ + return set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; +} + +static inline bool nft_set_gc_is_pending(const struct nft_set *s) +{ + return refcount_read(&s->refs) != 1; +} + static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); @@ -480,9 +650,14 @@ struct nft_set *nft_set_lookup_global(const struct net *net, const struct nlattr *nla_set_id, u8 genmask); +struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, + const struct nft_set *set); + static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { - return set->gc_int ? msecs_to_jiffies(set->gc_int) : HZ; + u32 gc_int = READ_ONCE(set->gc_int); + + return gc_int ? 
msecs_to_jiffies(gc_int) : HZ; } /** @@ -502,6 +677,7 @@ struct nft_set_binding { }; enum nft_trans_phase; +void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase); @@ -517,9 +693,8 @@ void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout - * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element - * @NFT_SET_EXT_EXPR: expression assiociated with the element + * @NFT_SET_EXT_EXPRESSIONS: expressions associated with the element * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ @@ -529,9 +704,8 @@ enum nft_set_extensions { NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, - NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, - NFT_SET_EXT_EXPR, + NFT_SET_EXT_EXPRESSIONS, NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; @@ -554,24 +728,29 @@ extern const struct nft_set_ext_type nft_set_ext_types[]; * * @len: length of extension area * @offset: offsets of individual extension types + * @ext_len: length of the expected extension(used to sanity check) */ struct nft_set_ext_tmpl { u16 len; u8 offset[NFT_SET_EXT_NUM]; + u8 ext_len[NFT_SET_EXT_NUM]; }; /** * struct nft_set_ext - set extensions * - * @genmask: generation mask + * @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT) * @offset: offsets of individual extension types * @data: beginning of extension data + * + * This structure must be aligned to word size, otherwise atomic bitops + * on genmask field can cause alignment failure on some archs. 
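The template helpers changed just below now return an error instead of BUG()ing when the accumulated length would overflow U8_MAX, so building an element template becomes a checked sequence. A sketch, assuming a set whose elements carry a key plus a timeout; demo_build_tmpl() is an invented name:

static int demo_build_tmpl(const struct nft_set *set,
                           struct nft_set_ext_tmpl *tmpl)
{
        int err;

        nft_set_ext_prepare(tmpl);

        err = nft_set_ext_add_length(tmpl, NFT_SET_EXT_KEY, set->klen);
        if (err < 0)
                return err;

        /* fixed-size extensions go through the short form */
        return nft_set_ext_add(tmpl, NFT_SET_EXT_TIMEOUT);
}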
*/ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; -}; +} __aligned(BITS_PER_LONG / 8); static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { @@ -579,18 +758,23 @@ static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) tmpl->len = sizeof(struct nft_set_ext); } -static inline void nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, - unsigned int len) +static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, + unsigned int len) { tmpl->len = ALIGN(tmpl->len, nft_set_ext_types[id].align); - BUG_ON(tmpl->len > U8_MAX); + if (tmpl->len > U8_MAX) + return -EINVAL; + tmpl->offset[id] = tmpl->len; - tmpl->len += nft_set_ext_types[id].len + len; + tmpl->ext_len[id] = nft_set_ext_types[id].len + len; + tmpl->len += tmpl->ext_len[id]; + + return 0; } -static inline void nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id) +static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id) { - nft_set_ext_add_length(tmpl, id, 0); + return nft_set_ext_add_length(tmpl, id, 0); } static inline void nft_set_ext_init(struct nft_set_ext *ext, @@ -634,36 +818,45 @@ static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } -static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) +struct nft_timeout { + u64 timeout; + u64 expiration; +}; + +static inline struct nft_timeout *nft_set_ext_timeout(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } -static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext) +static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) { - return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); + return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } -static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) +static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext) { - return nft_set_ext(ext, NFT_SET_EXT_USERDATA); + return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } -static inline struct nft_expr *nft_set_ext_expr(const struct nft_set_ext *ext) +static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext, + u64 tstamp) { - return nft_set_ext(ext, NFT_SET_EXT_EXPR); + if (!nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) || + READ_ONCE(nft_set_ext_timeout(ext)->timeout) == 0) + return false; + + return time_after_eq64(tstamp, READ_ONCE(nft_set_ext_timeout(ext)->expiration)); } static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { - return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && - time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext)); + return __nft_set_elem_expired(ext, get_jiffies_64()); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, - void *elem) + const struct nft_elem_priv *elem_priv) { - return elem + set->ops->elemsize; + return (void *)elem_priv + set->ops->elemsize; } static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) @@ -675,68 +868,19 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr);
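Since nft_set_ext_add() and nft_set_ext_add_length() now return -EINVAL instead of hitting BUG_ON() when the template would exceed U8_MAX, callers are expected to check each step. A sketch of template construction under that assumption (my_build_tmpl is a made-up helper):

static int my_build_tmpl(struct nft_set_ext_tmpl *tmpl, const struct nft_set *set)
{
	int err;

	nft_set_ext_prepare(tmpl);

	err = nft_set_ext_add_length(tmpl, NFT_SET_EXT_KEY, set->klen);
	if (err < 0)
		return err;	/* extension area would exceed U8_MAX */

	return nft_set_ext_add(tmpl, NFT_SET_EXT_TIMEOUT);
}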
-void *nft_set_elem_init(const struct nft_set *set, - const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *key_end, const u32 *data, - u64 timeout, u64 expiration, gfp_t gfp); -void nft_set_elem_destroy(const struct nft_set *set, void *elem, +struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const u32 *key, const u32 *key_end, + const u32 *data, + u64 timeout, u64 expiration, gfp_t gfp); +int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_expr *expr_array[]); +void nft_set_elem_destroy(const struct nft_set *set, + const struct nft_elem_priv *elem_priv, bool destroy_expr); - -/** - * struct nft_set_gc_batch_head - nf_tables set garbage collection batch - * - * @rcu: rcu head - * @set: set the elements belong to - * @cnt: count of elements - */ -struct nft_set_gc_batch_head { - struct rcu_head rcu; - const struct nft_set *set; - unsigned int cnt; -}; - -#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \ - sizeof(struct nft_set_gc_batch_head)) / \ - sizeof(void *)) - -/** - * struct nft_set_gc_batch - nf_tables set garbage collection batch - * - * @head: GC batch head - * @elems: garbage collection elements - */ -struct nft_set_gc_batch { - struct nft_set_gc_batch_head head; - void *elems[NFT_SET_GC_BATCH_SIZE]; -}; - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp); -void nft_set_gc_batch_release(struct rcu_head *rcu); - -static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb) -{ - if (gcb != NULL) - call_rcu(&gcb->head.rcu, nft_set_gc_batch_release); -} - -static inline struct nft_set_gc_batch * -nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb, - gfp_t gfp) -{ - if (gcb != NULL) { - if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems)) - return gcb; - nft_set_gc_batch_complete(gcb); - } - return nft_set_gc_batch_alloc(set, gfp); -} - -static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb, - void *elem) -{ - gcb->elems[gcb->head.cnt++] = elem; -} +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_elem_priv *elem_priv); struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present + * @inner_ops: inner ops, used for inner packet operations * @list: used internally * @name: Identifier * @owner: module reference @@ -758,6 +903,7 @@ struct nft_expr_type { const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; + const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; @@ -772,6 +918,7 @@ struct nft_expr_type { enum nft_trans_phase { NFT_TRANS_PREPARE, + NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE @@ -784,23 +931,30 @@ struct nft_offload_ctx; /** * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function + * @clone: Expression clone function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu + * @destroy_clone: destruction clone function * @dump: function to dump parameters - * @type: expression type * @validate: validate expression, called during loop detection + * @reduce: reduce expression + * @gc: garbage collection expression + * @offload: hardware offload expression + * @offload_action: function to report whether a slot in the flow offload array + * should be allocated + * @offload_stats: function to synchronize hardware stats
by updating the counter expression + * @type: expression type * @data: extra data to attach to this expression operation */ -struct nft_expr; struct nft_expr_ops { void (*eval)(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, - const struct nft_expr *src); + const struct nft_expr *src, gfp_t gfp); unsigned int size; int (*init)(const struct nft_ctx *ctx, @@ -816,46 +970,24 @@ struct nft_expr_ops { void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, + bool reset); int (*validate)(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); + const struct nft_expr *expr); + bool (*reduce)(struct nft_regs_track *track, + const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); - u32 offload_flags; + bool (*offload_action)(const struct nft_expr *expr); + void (*offload_stats)(struct nft_expr *expr, + const struct flow_stats *stats); const struct nft_expr_type *type; void *data; }; -#define NFT_EXPR_MAXATTR 16 -#define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ - ALIGN(size, __alignof__(struct nft_expr))) - -/** - * struct nft_expr - nf_tables expression - * - * @ops: expression ops - * @data: expression private data - */ -struct nft_expr { - const struct nft_expr_ops *ops; - unsigned char data[] - __attribute__((aligned(__alignof__(u64)))); -}; - -static inline void *nft_expr_priv(const struct nft_expr *expr) -{ - return (void *)expr->data; -} - -int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); -void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); -int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr); - /** * struct nft_rule - nf_tables rule * @@ -902,17 +1034,26 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) return (void *)&rule->data[rule->dlen]; } -void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule); +void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase); +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + struct nft_set_elem_expr *elem_expr; struct nft_expr *expr; - - if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) { - expr = nft_set_ext_expr(ext); - expr->ops->eval(expr, regs, pkt); + u32 size; + + if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) { + elem_expr = nft_set_ext_expr(ext); + nft_setelem_expr_foreach(expr, elem_expr, size) { + expr->ops->eval(expr, regs, pkt); + if (regs->verdict.code == NFT_BREAK) + return; + } } } @@ -928,21 +1069,54 @@ static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, #define NFT_CHAIN_POLICY_UNSET U8_MAX +struct nft_rule_dp { + u64 is_last:1, + dlen:12, + handle:42; /* for tracing */ + unsigned char data[] + __attribute__((aligned(__alignof__(struct nft_expr)))); +}; +
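struct nft_rule_dp packs the rule handle, expression length and an is_last terminator into one 64-bit header so the datapath can walk a rule blob linearly instead of chasing pointers; the nft_rule_next() helper just below does the advance. A sketch of such a walk, hypothetical except for the fields shown above:

static void my_walk_rules(const struct nft_rule_dp *rule)
{
	/* rules sit back to back in the blob; the entry with is_last set
	 * is the nft_rule_dp_last terminator defined below */
	for (; !rule->is_last;
	     rule = (const void *)rule + sizeof(*rule) + rule->dlen)
		pr_debug("rule %llu: %u bytes of expressions\n",
			 (unsigned long long)rule->handle,
			 (unsigned int)rule->dlen);
}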
+struct nft_rule_dp_last { + struct nft_rule_dp end; /* end of nft_rule_blob marker */ + struct rcu_head h; /* call_rcu head */ + struct nft_rule_blob *blob; /* ptr to free via call_rcu */ + const struct nft_chain *chain; /* for nftables tracing */ +}; + +static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule) +{ + return (void *)rule + sizeof(*rule) + rule->dlen; +} + +struct nft_rule_blob { + unsigned long size; + unsigned char data[] + __attribute__((aligned(__alignof__(struct nft_rule_dp)))); +}; + /** * struct nft_chain - nf_tables chain * + * @blob_gen_0: rule blob pointer to the current generation + * @blob_gen_1: rule blob pointer to the future generation * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain - * @flags: bitmask of enum nft_chain_flags + * @flags: bitmask of enum NFTA_CHAIN_FLAGS + * @bound: whether the chain is bound + * @genmask: generation mask * @name: name of the chain + * @udlen: user data length + * @udata: user data in the chain + * @blob_next: rule blob pointer to the next generation */ struct nft_chain { - struct nft_rule *__rcu *rules_gen_0; - struct nft_rule *__rcu *rules_gen_1; + struct nft_rule_blob __rcu *blob_gen_0; + struct nft_rule_blob __rcu *blob_gen_1; struct list_head rules; struct list_head list; struct rhlist_head rhlhead; @@ -957,10 +1131,21 @@ struct nft_chain { u8 *udata; /* Only used during control plane commit phase: */ - struct nft_rule **rules_next; + struct nft_rule_blob *blob_next; }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_elem_priv *elem_priv); +int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); + +struct nft_hook; +void nf_tables_chain_device_notify(const struct nft_chain *chain, + const struct nft_hook *hook, + const struct net_device *dev, int event); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, @@ -997,13 +1182,19 @@ int nft_chain_validate_dependency(const struct nft_chain *chain, int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); +static inline bool nft_chain_binding(const struct nft_chain *chain) +{ + return chain->flags & NFT_CHAIN_BINDING; +} + static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } +int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); -void nf_tables_chain_destroy(struct nft_ctx *ctx); +void nf_tables_chain_destroy(struct nft_chain *chain); struct nft_stats { u64 bytes; @@ -1013,11 +1204,17 @@ struct nft_stats { struct nft_hook { struct list_head list; - bool inactive; - struct nf_hook_ops ops; + struct list_head ops_list; struct rcu_head rcu; + char ifname[IFNAMSIZ]; + u8 ifnamelen; }; +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev); +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev); + /** * struct nft_base_chain - nf_tables base chain * @@ -1025,6 +1222,7 @@ struct nft_hook { * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy + * @flags: indicate whether the base chain is disabled * @stats: per-cpu chain stats * @chain: the chain *
@flow_block: flow block (for hardware offload) @@ -1050,10 +1248,31 @@ static inline bool nft_is_base_chain(const struct nft_chain *chain) return chain->flags & NFT_CHAIN_BASE; } -int __nft_release_basechain(struct nft_ctx *ctx); - unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); +static inline bool nft_use_inc(u32 *use) +{ + if (*use == UINT_MAX) + return false; + + (*use)++; + + return true; +} + +static inline void nft_use_dec(u32 *use) +{ + WARN_ON_ONCE((*use)-- == 0); +} + +/* For error and abort path: restore use counter to previous state. */ +static inline void nft_use_inc_restore(u32 *use) +{ + WARN_ON_ONCE(!nft_use_inc(use)); +} + +#define nft_use_dec_restore nft_use_dec + /** * struct nft_table - nf_tables table * @@ -1066,10 +1285,14 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table + * @family: address family * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask - * @afinfo: address family info + * @nlpid: netlink port ID * @name: name of the table + * @udlen: length of the user data + * @udata: user data + * @validate_state: internal, set when transaction adds jumps */ struct nft_table { struct list_head list; @@ -1084,11 +1307,24 @@ struct nft_table { u16 family:6, flags:8, genmask:2; + u32 nlpid; char *name; u16 udlen; u8 *udata; + u8 validate_state; }; +static inline bool nft_table_has_owner(const struct nft_table *table) +{ + return table->flags & NFT_TABLE_F_OWNER; +} + +static inline bool nft_table_is_orphan(const struct nft_table *table) +{ + return (table->flags & (NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST)) == + NFT_TABLE_F_PERSIST; +} + static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || @@ -1119,11 +1355,13 @@ struct nft_object_hash_key { * struct nft_object - nf_tables stateful object * * @list: table stateful object list node - * @key: keys that identify this object * @rhlhead: nft_objname_ht node + * @key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle + * @udlen: length of user data + * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ @@ -1131,8 +1369,8 @@ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; - u32 genmask:2, - use:30; + u32 genmask:2; + u32 use; u64 handle; u16 udlen; u8 *udata; @@ -1156,7 +1394,7 @@ struct nft_object *nft_obj_lookup(const struct net *net, void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, - int event, int family, int report, gfp_t gfp); + int event, u16 flags, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type * @@ -1167,6 +1405,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, * @type: stateful object numeric type * @owner: module owner * @maxattr: maximum netlink attribute + * @family: address family for AF-specific object types * @policy: netlink attribute policy */ struct nft_object_type { @@ -1176,6 +1415,7 @@ struct nft_object_type { struct list_head list; u32 type; unsigned int maxattr; + u8 family; struct module *owner; const struct nla_policy *policy; }; @@ -1189,6 +1429,7 @@ struct nft_object_type { * @init: ... * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update
stateful object + * @type: pointer to object type */ struct nft_object_ops { void (*eval)(struct nft_object *obj, @@ -1224,9 +1465,8 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle - * @dev_name: array of device names + * @hook_list: hook list for hooks per net_device in flowtables * @data: rhashtable and garbage collector - * @ops: array of hooks */ struct nft_flowtable { struct list_head list; @@ -1234,15 +1474,16 @@ struct nft_flowtable { char *name; int hooknum; int ops_len; - u32 genmask:2, - use:30; + u32 genmask:2; + u32 use; u64 handle; /* runtime data below here */ struct list_head hook_list ____cacheline_aligned; struct nf_flowtable data; }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask); @@ -1256,31 +1497,29 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type); /** * struct nft_traceinfo - nft tracing information and state * - * @pkt: pktinfo currently processed - * @basechain: base chain currently processed - * @chain: chain currently processed - * @rule: rule that was evaluated - * @verdict: verdict given by rule + * @trace: other struct members are initialised + * @nf_trace: copy of skb->nf_trace before rule evaluation * @type: event type (enum nft_trace_types) + * @skbid: hash of skb to be used as trace id * @packet_dumped: packet headers sent in a previous traceinfo message - * @trace: other struct members are initialised + * @basechain: base chain currently processed */ struct nft_traceinfo { - const struct nft_pktinfo *pkt; - const struct nft_base_chain *basechain; - const struct nft_chain *chain; - const struct nft_rule *rule; - const struct nft_verdict *verdict; - enum nft_trace_types type; - bool packet_dumped; bool trace; + bool nf_trace; + bool packet_dumped; + enum nft_trace_types type:8; + u32 skbid; + const struct nft_base_chain *basechain; }; void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, - const struct nft_verdict *verdict, const struct nft_chain *basechain); -void nft_trace_notify(struct nft_traceinfo *info); +void nft_trace_notify(const struct nft_pktinfo *pkt, + const struct nft_verdict *verdict, + const struct nft_rule_dp *rule, + struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) @@ -1371,151 +1610,290 @@ static inline void nft_set_elem_change_active(const struct net *net, #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ -/* - * We use a free bit in the genmask field to indicate the element - * is busy, meaning it is currently being processed either by - * the netlink API or GC. - * - * Even though the genmask is only a single byte wide, this works - * because the extension structure if fully constant once initialized, - * so there are no non-atomic write accesses unless it is already - * marked busy. 
- */ -#define NFT_SET_ELEM_BUSY_MASK (1 << 2) +#define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT 2 +#define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif -static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext) +static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); - return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word); + set_bit(NFT_SET_ELEM_DEAD_BIT, word); } -static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) +static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; - clear_bit(NFT_SET_ELEM_BUSY_BIT, word); + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** - * struct nft_trans - nf_tables object update in transaction + * struct nft_trans - nf_tables object update in transaction * - * @list: used internally - * @msg_type: message type - * @put_net: ctx->net needs to be put - * @ctx: transaction context - * @data: internal information related to the transaction + * @list: used internally + * @net: struct net + * @table: struct nft_table the object resides in + * @msg_type: message type + * @seq: netlink sequence number + * @flags: modifiers to new request + * @report: notify via unicast netlink message + * @put_net: net needs to be put + * + * This is the information common to all objects in the transaction, + * this must always be the first member of derived sub-types. */ struct nft_trans { struct list_head list; + struct net *net; + struct nft_table *table; int msg_type; - bool put_net; - struct nft_ctx ctx; - char data[]; + u32 seq; + u16 flags; + u8 report:1; + u8 put_net:1; +}; + +/** + * struct nft_trans_binding - nf_tables object with binding support in transaction + * @nft_trans: base structure, MUST be first member + * @binding_list: list of objects with possible bindings + * + * This is the base type used by objects that can be bound to a chain. 
+ */ +struct nft_trans_binding { + struct nft_trans nft_trans; + struct list_head binding_list; }; struct nft_trans_rule { + struct nft_trans nft_trans; struct nft_rule *rule; + struct nft_chain *chain; struct nft_flow_rule *flow; u32 rule_id; + bool bound; }; -#define nft_trans_rule(trans) \ - (((struct nft_trans_rule *)trans->data)->rule) -#define nft_trans_flow_rule(trans) \ - (((struct nft_trans_rule *)trans->data)->flow) -#define nft_trans_rule_id(trans) \ - (((struct nft_trans_rule *)trans->data)->rule_id) +#define nft_trans_container_rule(trans) \ + container_of(trans, struct nft_trans_rule, nft_trans) +#define nft_trans_rule(trans) \ + nft_trans_container_rule(trans)->rule +#define nft_trans_flow_rule(trans) \ + nft_trans_container_rule(trans)->flow +#define nft_trans_rule_id(trans) \ + nft_trans_container_rule(trans)->rule_id +#define nft_trans_rule_bound(trans) \ + nft_trans_container_rule(trans)->bound +#define nft_trans_rule_chain(trans) \ + nft_trans_container_rule(trans)->chain struct nft_trans_set { + struct nft_trans_binding nft_trans_binding; + struct list_head list_trans_newset; struct nft_set *set; u32 set_id; + u32 gc_int; + u64 timeout; + bool update; bool bound; + u32 size; }; -#define nft_trans_set(trans) \ - (((struct nft_trans_set *)trans->data)->set) -#define nft_trans_set_id(trans) \ - (((struct nft_trans_set *)trans->data)->set_id) -#define nft_trans_set_bound(trans) \ - (((struct nft_trans_set *)trans->data)->bound) +#define nft_trans_container_set(t) \ + container_of(t, struct nft_trans_set, nft_trans_binding.nft_trans) +#define nft_trans_set(trans) \ + nft_trans_container_set(trans)->set +#define nft_trans_set_id(trans) \ + nft_trans_container_set(trans)->set_id +#define nft_trans_set_bound(trans) \ + nft_trans_container_set(trans)->bound +#define nft_trans_set_update(trans) \ + nft_trans_container_set(trans)->update +#define nft_trans_set_timeout(trans) \ + nft_trans_container_set(trans)->timeout +#define nft_trans_set_gc_int(trans) \ + nft_trans_container_set(trans)->gc_int +#define nft_trans_set_size(trans) \ + nft_trans_container_set(trans)->size struct nft_trans_chain { - bool update; + struct nft_trans_binding nft_trans_binding; + struct nft_chain *chain; char *name; struct nft_stats __percpu *stats; u8 policy; + bool update; + bool bound; u32 chain_id; + struct nft_base_chain *basechain; + struct list_head hook_list; }; -#define nft_trans_chain_update(trans) \ - (((struct nft_trans_chain *)trans->data)->update) -#define nft_trans_chain_name(trans) \ - (((struct nft_trans_chain *)trans->data)->name) -#define nft_trans_chain_stats(trans) \ - (((struct nft_trans_chain *)trans->data)->stats) -#define nft_trans_chain_policy(trans) \ - (((struct nft_trans_chain *)trans->data)->policy) -#define nft_trans_chain_id(trans) \ - (((struct nft_trans_chain *)trans->data)->chain_id) +#define nft_trans_container_chain(t) \ + container_of(t, struct nft_trans_chain, nft_trans_binding.nft_trans) +#define nft_trans_chain(trans) \ + nft_trans_container_chain(trans)->chain +#define nft_trans_chain_update(trans) \ + nft_trans_container_chain(trans)->update +#define nft_trans_chain_name(trans) \ + nft_trans_container_chain(trans)->name +#define nft_trans_chain_stats(trans) \ + nft_trans_container_chain(trans)->stats +#define nft_trans_chain_policy(trans) \ + nft_trans_container_chain(trans)->policy +#define nft_trans_chain_bound(trans) \ + nft_trans_container_chain(trans)->bound +#define nft_trans_chain_id(trans) \ + nft_trans_container_chain(trans)->chain_id +#define 
nft_trans_basechain(trans) \ + nft_trans_container_chain(trans)->basechain +#define nft_trans_chain_hooks(trans) \ + nft_trans_container_chain(trans)->hook_list struct nft_trans_table { + struct nft_trans nft_trans; bool update; - bool enable; }; -#define nft_trans_table_update(trans) \ - (((struct nft_trans_table *)trans->data)->update) -#define nft_trans_table_enable(trans) \ - (((struct nft_trans_table *)trans->data)->enable) +#define nft_trans_container_table(trans) \ + container_of(trans, struct nft_trans_table, nft_trans) +#define nft_trans_table_update(trans) \ + nft_trans_container_table(trans)->update + +enum nft_trans_elem_flags { + NFT_TRANS_UPD_TIMEOUT = (1 << 0), + NFT_TRANS_UPD_EXPIRATION = (1 << 1), +}; + +struct nft_elem_update { + u64 timeout; + u64 expiration; + u8 flags; +}; + +struct nft_trans_one_elem { + struct nft_elem_priv *priv; + struct nft_elem_update *update; +}; struct nft_trans_elem { + struct nft_trans nft_trans; struct nft_set *set; - struct nft_set_elem elem; bool bound; + unsigned int nelems; + struct nft_trans_one_elem elems[] __counted_by(nelems); }; -#define nft_trans_elem_set(trans) \ - (((struct nft_trans_elem *)trans->data)->set) -#define nft_trans_elem(trans) \ - (((struct nft_trans_elem *)trans->data)->elem) -#define nft_trans_elem_set_bound(trans) \ - (((struct nft_trans_elem *)trans->data)->bound) +#define nft_trans_container_elem(t) \ + container_of(t, struct nft_trans_elem, nft_trans) +#define nft_trans_elem_set(trans) \ + nft_trans_container_elem(trans)->set +#define nft_trans_elem_set_bound(trans) \ + nft_trans_container_elem(trans)->bound struct nft_trans_obj { + struct nft_trans nft_trans; struct nft_object *obj; struct nft_object *newobj; bool update; }; -#define nft_trans_obj(trans) \ - (((struct nft_trans_obj *)trans->data)->obj) -#define nft_trans_obj_newobj(trans) \ - (((struct nft_trans_obj *)trans->data)->newobj) -#define nft_trans_obj_update(trans) \ - (((struct nft_trans_obj *)trans->data)->update) +#define nft_trans_container_obj(t) \ + container_of(t, struct nft_trans_obj, nft_trans) +#define nft_trans_obj(trans) \ + nft_trans_container_obj(trans)->obj +#define nft_trans_obj_newobj(trans) \ + nft_trans_container_obj(trans)->newobj +#define nft_trans_obj_update(trans) \ + nft_trans_container_obj(trans)->update struct nft_trans_flowtable { + struct nft_trans nft_trans; struct nft_flowtable *flowtable; - bool update; struct list_head hook_list; + u32 flags; + bool update; }; -#define nft_trans_flowtable(trans) \ - (((struct nft_trans_flowtable *)trans->data)->flowtable) -#define nft_trans_flowtable_update(trans) \ - (((struct nft_trans_flowtable *)trans->data)->update) -#define nft_trans_flowtable_hooks(trans) \ - (((struct nft_trans_flowtable *)trans->data)->hook_list) +#define nft_trans_container_flowtable(t) \ + container_of(t, struct nft_trans_flowtable, nft_trans) +#define nft_trans_flowtable(trans) \ + nft_trans_container_flowtable(trans)->flowtable +#define nft_trans_flowtable_update(trans) \ + nft_trans_container_flowtable(trans)->update +#define nft_trans_flowtable_hooks(trans) \ + nft_trans_container_flowtable(trans)->hook_list +#define nft_trans_flowtable_flags(trans) \ + nft_trans_container_flowtable(trans)->flags + +#define NFT_TRANS_GC_BATCHCOUNT 256 + +struct nft_trans_gc { + struct list_head list; + struct net *net; + struct nft_set *set; + u32 seq; + u16 count; + struct nft_elem_priv *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; +}; + +static inline void nft_ctx_update(struct nft_ctx *ctx, + const struct 
nft_trans *trans) +{ + switch (trans->msg_type) { + case NFT_MSG_NEWRULE: + case NFT_MSG_DELRULE: + case NFT_MSG_DESTROYRULE: + ctx->chain = nft_trans_rule_chain(trans); + break; + case NFT_MSG_NEWCHAIN: + case NFT_MSG_DELCHAIN: + case NFT_MSG_DESTROYCHAIN: + ctx->chain = nft_trans_chain(trans); + break; + default: + ctx->chain = NULL; + break; + } + + ctx->net = trans->net; + ctx->table = trans->table; + ctx->family = trans->table->family; + ctx->report = trans->report; + ctx->flags = trans->flags; + ctx->seq = trans->seq; +} + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_destroy(struct nft_trans_gc *trans); + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); + +void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); + +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq); +struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); + +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv); int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); @@ -1523,5 +1901,65 @@ void nft_chain_filter_fini(void); void __init nft_chain_route_init(void); void nft_chain_route_fini(void); -void nf_tables_trans_destroy_flush_work(void); +void nf_tables_trans_destroy_flush_work(struct net *net); + +int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); +__be64 nf_jiffies64_to_msecs(u64 input); + +#ifdef CONFIG_MODULES +__printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...); +#else +static inline int nft_request_module(struct net *net, const char *fmt, ...) { return -ENOENT; } +#endif
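The nft_trans_gc API above batches up to NFT_TRANS_GC_BATCHCOUNT expired elements per allocation; nft_trans_gc_queue_async() queues a full batch and may hand back a fresh one. A sketch of the asynchronous pattern a set backend might follow, with the iteration over elements elided and my_gc_one_elem a made-up name:

static void my_gc_one_elem(struct nft_set *set, unsigned int gc_seq,
			   struct nft_elem_priv *elem_priv)
{
	struct nft_trans_gc *gc;

	gc = nft_trans_gc_alloc(set, gc_seq, GFP_ATOMIC);
	if (!gc)
		return;

	/* queues the batch if it is full; may return a fresh batch or NULL */
	gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
	if (!gc)
		return;

	nft_trans_gc_elem_add(gc, elem_priv);
	nft_trans_gc_queue_async_done(gc);	/* hand over the final batch */
}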
+ +struct nftables_pernet { + struct list_head tables; + struct list_head commit_list; + struct list_head destroy_list; + struct list_head commit_set_list; + struct list_head binding_list; + struct list_head module_list; + struct list_head notify_list; + struct mutex commit_mutex; + u64 table_handle; + u64 tstamp; + unsigned int base_seq; + unsigned int gc_seq; + u8 validate_state; + struct work_struct destroy_work; +}; + +extern unsigned int nf_tables_net_id; + +static inline struct nftables_pernet *nft_pernet(const struct net *net) +{ + return net_generic(net, nf_tables_net_id); +} + +static inline u64 nft_net_tstamp(const struct net *net) +{ + return nft_pernet(net)->tstamp; +} + +#define __NFT_REDUCE_READONLY 1UL +#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY + +static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) +{ + return expr->ops->reduce == NFT_REDUCE_READONLY; +} + +void nft_reg_track_update(struct nft_regs_track *track, + const struct nft_expr *expr, u8 dreg, u8 len); +void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); +void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); + +static inline bool nft_reg_track_cmp(struct nft_regs_track *track, + const struct nft_expr *expr, u8 dreg) +{ + return track->regs[dreg].selector && + track->regs[dreg].selector->ops == expr->ops && + track->regs[dreg].num_reg == 0; +} + #endif /* _NET_NF_TABLES_H */ diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 8657e6815b07..03b6165756fc 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -3,9 +3,11 @@ #define _NET_NF_TABLES_CORE_H #include <net/netfilter/nf_tables.h> +#include <linux/indirect_call_wrapper.h> extern struct nft_expr_type nft_imm_type; extern struct nft_expr_type nft_cmp_type; +extern struct nft_expr_type nft_counter_type; extern struct nft_expr_type nft_lookup_type; extern struct nft_expr_type nft_bitwise_type; extern struct nft_expr_type nft_byteorder_type; @@ -15,10 +17,14 @@ extern struct nft_expr_type nft_range_type; extern struct nft_expr_type nft_meta_type; extern struct nft_expr_type nft_rt_type; extern struct nft_expr_type nft_exthdr_type; +extern struct nft_expr_type nft_last_type; +extern struct nft_expr_type nft_objref_type; +extern struct nft_expr_type nft_inner_type; #ifdef CONFIG_NETWORK_SECMARK extern struct nft_object_type nft_secmark_obj_type; #endif +extern struct nft_object_type nft_counter_obj_type; int nf_tables_core_module_init(void); void nf_tables_core_module_exit(void); @@ -26,51 +32,50 @@ void nf_tables_core_module_exit(void); struct nft_bitwise_fast_expr { u32 mask; u32 xor; - enum nft_registers sreg:8; - enum nft_registers dreg:8; + u8 sreg; + u8 dreg; }; struct nft_cmp_fast_expr { u32 data; u32 mask; - enum nft_registers sreg:8; + u8 sreg; + u8 len; + bool inv; +}; + +struct nft_cmp16_fast_expr { + struct nft_data data; + struct nft_data mask; + u8 sreg; u8 len; bool inv; }; struct nft_immediate_expr { struct nft_data data; - enum nft_registers dreg:8; + u8 dreg; u8 dlen; }; -/* Calculate the mask for the nft_cmp_fast expression. On big endian the - * mask needs to include the *upper* bytes when interpreting that data as - * something smaller than the full u32, therefore a cpu_to_le32 is done.
- */ -static inline u32 nft_cmp_fast_mask(unsigned int len) -{ - return cpu_to_le32(~0U >> (sizeof_field(struct nft_cmp_fast_expr, - data) * BITS_PER_BYTE - len)); -} - extern const struct nft_expr_ops nft_cmp_fast_ops; +extern const struct nft_expr_ops nft_cmp16_fast_ops; -struct nft_payload { - enum nft_payload_bases base:8; - u8 offset; +struct nft_ct { + enum nft_ct_keys key:8; + enum ip_conntrack_dir dir:8; u8 len; - enum nft_registers dreg:8; + union { + u8 dreg; + u8 sreg; + }; }; -struct nft_payload_set { +struct nft_payload { enum nft_payload_bases base:8; u8 offset; u8 len; - enum nft_registers sreg:8; - u8 csum_type; - u8 csum_offset; - u8 csum_flags; + u8 dreg; }; extern const struct nft_expr_ops nft_payload_fast_ops; @@ -88,6 +93,38 @@ extern const struct nft_set_type nft_set_bitmap_type; extern const struct nft_set_type nft_set_pipapo_type; extern const struct nft_set_type nft_set_pipapo_avx2_type; +#ifdef CONFIG_MITIGATION_RETPOLINE +bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_hash_lookup_fast(const struct net *net, + const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +#else +static inline bool +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + return set->ops->lookup(net, set, key, ext); +} +#endif + +/* called from nft_pipapo_avx2.c */ +bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +/* called from nft_set_pipapo.c */ +bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); + +void nft_counter_init_seqcount(void); + struct nft_expr; struct nft_regs; struct nft_pktinfo; @@ -111,4 +148,37 @@ void nft_dynset_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_rt_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); +void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); +void nft_ct_get_fast_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt); + +enum { + NFT_PAYLOAD_CTX_INNER_TUN = (1 << 0), + NFT_PAYLOAD_CTX_INNER_LL = (1 << 1), + NFT_PAYLOAD_CTX_INNER_NH = (1 << 2), + NFT_PAYLOAD_CTX_INNER_TH = (1 << 3), +}; + +struct nft_inner_tun_ctx { + unsigned long cookie; + u16 type; + u16 inner_tunoff; + u16 inner_lloff; + u16 inner_nhoff; + u16 inner_thoff; + __be16 llproto; + u8 l4proto; + u8 flags; +}; + +int nft_payload_inner_offset(const struct nft_pktinfo *pkt); +void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx); + +void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); +void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); #endif /* _NET_NF_TABLES_CORE_H */
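On the lookup fast path, nft_set_do_lookup() either expands to direct calls into the known set backends (under CONFIG_MITIGATION_RETPOLINE) or falls back to the indirect set->ops->lookup() shown above; callers look the same either way. A sketch of an eval-style caller, with my_lookup_eval a made-up name:

static void my_lookup_eval(const struct net *net, const struct nft_set *set,
			   const u32 *key, struct nft_regs *regs)
{
	const struct nft_set_ext *ext;

	if (!nft_set_do_lookup(net, set, key, &ext)) {
		regs->verdict.code = NFT_BREAK;	/* no matching element */
		return;
	}
	/* on a match, ext points at the matching element's extensions */
}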
diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index 1f7bea39ad1b..fcf967286e37 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -5,80 +5,82 @@ #include <net/netfilter/nf_tables.h> #include <net/ip.h> -static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) { struct iphdr *ip; ip = ip_hdr(pkt->skb); - pkt->tprot_set = true; + pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = ip->protocol; - pkt->xt.thoff = ip_hdrlen(pkt->skb); - pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET; + pkt->thoff = ip_hdrlen(pkt->skb); + pkt->fragoff = ntohs(ip->frag_off) & IP_OFFSET; } -static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { struct iphdr *iph, _iph; - u32 len, thoff; + u32 len, thoff, skb_len; - iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), - &_iph); + iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), + sizeof(*iph), &_iph); if (!iph) return -1; if (iph->ihl < 5 || iph->version != 4) return -1; - len = ntohs(iph->tot_len); + len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; - if (skb->len < len) + skb_len = pkt->skb->len - skb_network_offset(pkt->skb); + + if (skb_len < len) return -1; else if (len < thoff) return -1; + else if (thoff < sizeof(*iph)) + return -1; - pkt->tprot_set = true; + pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; + pkt->thoff = skb_network_offset(pkt->skb) + thoff; + pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; } -static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { - if (__nft_set_pktinfo_ipv4_validate(pkt, skb) < 0) - nft_set_pktinfo_unspec(pkt, skb); + if (__nft_set_pktinfo_ipv4_validate(pkt) < 0) + nft_set_pktinfo_unspec(pkt); } -static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) { struct iphdr *iph; u32 len, thoff; - if (!pskb_may_pull(skb, sizeof(*iph))) + if (!pskb_may_pull(pkt->skb, sizeof(*iph))) return -1; - iph = ip_hdr(skb); + iph = ip_hdr(pkt->skb); if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; - len = ntohs(iph->tot_len); + len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; - if (skb->len < len) { + if (pkt->skb->len < len) { __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } else if (len < thoff) { goto inhdr_error; + } else if (thoff < sizeof(*iph)) { + return -1; } - pkt->tprot_set = true; + pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; + pkt->thoff = thoff; + pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; @@ -86,4 +88,5 @@ inhdr_error: __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INHDRERRORS); return -1; } + #endif diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index 867de29f3f7a..a0633eeaec97 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -6,38 
+6,36 @@ #include <net/ipv6.h> #include <net/netfilter/nf_tables.h> -static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt) { unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); - if (protohdr < 0) { - nft_set_pktinfo_unspec(pkt, skb); + if (protohdr < 0 || thoff > U16_MAX) { + nft_set_pktinfo_unspec(pkt); return; } - pkt->tprot_set = true; + pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = frag_off; + pkt->thoff = thoff; + pkt->fragoff = frag_off; } -static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; + u32 pkt_len, skb_len; int protohdr; - u32 pkt_len; - ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), - &_ip6h); + ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), + sizeof(*ip6h), &_ip6h); if (!ip6h) return -1; @@ -45,17 +43,18 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, return -1; pkt_len = ntohs(ip6h->payload_len); - if (pkt_len + sizeof(*ip6h) > skb->len) + skb_len = pkt->skb->len - skb_network_offset(pkt->skb); + if (pkt_len + sizeof(*ip6h) > skb_len) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); - if (protohdr < 0) + if (protohdr < 0 || thoff > U16_MAX) return -1; - pkt->tprot_set = true; + pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = frag_off; + pkt->thoff = thoff; + pkt->fragoff = frag_off; return 0; #else @@ -63,15 +62,13 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, #endif } -static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { - if (__nft_set_pktinfo_ipv6_validate(pkt, skb) < 0) - nft_set_pktinfo_unspec(pkt, skb); + if (__nft_set_pktinfo_ipv6_validate(pkt) < 0) + nft_set_pktinfo_unspec(pkt); } -static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; @@ -82,28 +79,28 @@ static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt, int protohdr; u32 pkt_len; - if (!pskb_may_pull(skb, sizeof(*ip6h))) + if (!pskb_may_pull(pkt->skb, sizeof(*ip6h))) return -1; - ip6h = ipv6_hdr(skb); + ip6h = ipv6_hdr(pkt->skb); if (ip6h->version != 6) goto inhdr_error; pkt_len = ntohs(ip6h->payload_len); - if (pkt_len + sizeof(*ip6h) > skb->len) { + if (pkt_len + sizeof(*ip6h) > pkt->skb->len) { idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); - if (protohdr < 0) + if (protohdr < 0 || thoff > U16_MAX) goto inhdr_error; - pkt->tprot_set = true; + pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = frag_off; + pkt->thoff = thoff; + pkt->fragoff = frag_off; return 0; diff --git a/include/net/netfilter/nf_tables_offload.h 
b/include/net/netfilter/nf_tables_offload.h index ea7d1d78b92d..3568b6a2f5f0 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -4,11 +4,16 @@ #include <net/flow_offload.h> #include <net/netfilter/nf_tables.h> +enum nft_offload_reg_flags { + NFT_OFFLOAD_F_NETWORK2HOST = (1 << 0), +}; + struct nft_offload_reg { u32 key; u32 len; u32 base_offset; u32 offset; + u32 flags; struct nft_data data; struct nft_data mask; }; @@ -37,6 +42,7 @@ void nft_offload_update_dependency(struct nft_offload_ctx *ctx, struct nft_flow_key { struct flow_dissector_key_basic basic; + struct flow_dissector_key_control control; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; @@ -44,6 +50,7 @@ struct nft_flow_key { struct flow_dissector_key_ports tp; struct flow_dissector_key_ip ip; struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_eth_addrs eth_addrs; struct flow_dissector_key_meta meta; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ @@ -60,23 +67,32 @@ struct nft_flow_rule { struct flow_rule *rule; }; -#define NFT_OFFLOAD_F_ACTION (1 << 0) +void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, + enum flow_dissector_key_id addr_type); struct nft_rule; struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule); +int nft_flow_rule_stats(const struct nft_chain *chain, const struct nft_rule *rule); void nft_flow_rule_destroy(struct nft_flow_rule *flow); int nft_flow_rule_offload_commit(struct net *net); -#define NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ +#define NFT_OFFLOAD_MATCH_FLAGS(__key, __base, __field, __len, __reg, __flags) \ (__reg)->base_offset = \ offsetof(struct nft_flow_key, __base); \ (__reg)->offset = \ offsetof(struct nft_flow_key, __base.__field); \ (__reg)->len = __len; \ (__reg)->key = __key; \ + (__reg)->flags = __flags; + +#define NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ + NFT_OFFLOAD_MATCH_FLAGS(__key, __base, __field, __len, __reg, 0) + +#define NFT_OFFLOAD_MATCH_EXACT(__key, __base, __field, __len, __reg) \ + NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ memset(&(__reg)->mask, 0xff, (__reg)->len); -int nft_chain_offload_priority(struct nft_base_chain *basechain); +bool nft_chain_offload_support(const struct nft_base_chain *basechain); int nft_offload_init(void); void nft_offload_exit(void); diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 82d0e41b76f2..06985530517b 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -17,6 +17,13 @@ static inline bool nf_tproxy_sk_is_transparent(struct sock *sk) return false; } +static inline void nf_tproxy_twsk_deschedule_put(struct inet_timewait_sock *tw) +{ + local_bh_disable(); + inet_twsk_deschedule_put(tw); + local_bh_enable(); +} + /* assign a socket to the skb -- consumes sk */ static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) { @@ -29,6 +36,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); /** * nf_tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections + * @net: The network namespace. * @skb: The skb being processed. * @laddr: IPv4 address to redirect to or zero. * @lport: TCP port to redirect to or zero. 
@@ -41,7 +49,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); * * nf_tproxy_handle_time_wait4() consumes the socket reference passed in. * - * Returns the listener socket if there's one, the TIME_WAIT socket if + * Returns: the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. */ struct sock * @@ -100,7 +108,7 @@ nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, * * nf_tproxy_handle_time_wait6() consumes the socket reference passed in. * - * Returns the listener socket if there's one, the TIME_WAIT socket if + * Returns: the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. */ struct sock * diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 628b6fa579cd..7370fba844ef 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -2,10 +2,11 @@ #ifndef _NFT_FIB_H_ #define _NFT_FIB_H_ +#include <net/l3mdev.h> #include <net/netfilter/nf_tables.h> struct nft_fib { - enum nft_registers dreg:8; + u8 dreg; u8 result; u32 flags; }; @@ -18,12 +19,39 @@ nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in) return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } -int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr); +static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt) +{ + const struct net_device *indev = nft_in(pkt); + const struct sock *sk; + + switch (nft_hook(pkt)) { + case NF_INET_PRE_ROUTING: + case NF_INET_INGRESS: + case NF_INET_LOCAL_IN: + break; + default: + return false; + } + + sk = pkt->skb->sk; + if (sk && sk_fullsock(sk)) + return sk->sk_rx_dst_ifindex == indev->ifindex; + + return nft_fib_is_loopback(pkt->skb, indev); +} + +static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt, + const struct net_device *iif) +{ + const struct net_device *dev = iif ? 
iif : pkt->skb->dev; + + return l3mdev_master_ifindex_rcu(dev); +} + +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); -int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data); - +int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr); void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); @@ -37,4 +65,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct net_device *dev); + +bool nft_fib_reduce(struct nft_regs_track *track, + const struct nft_expr *expr); #endif diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 07e2fd507963..d602263590fe 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -6,9 +6,10 @@ struct nft_meta { enum nft_meta_keys key:8; + u8 len; union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; + u8 dreg; + u8 sreg; }; }; @@ -23,10 +24,10 @@ int nft_meta_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]); int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -40,7 +41,14 @@ void nft_meta_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr); int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); + const struct nft_expr *expr); + +bool nft_meta_get_reduce(struct nft_regs_track *track, + const struct nft_expr *expr); + +struct nft_inner_tun_ctx; +void nft_meta_inner_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx); #endif diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 56b123a42220..19060212988a 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -15,14 +15,14 @@ struct nft_reject { extern const struct nla_policy nft_reject_policy[]; int nft_reject_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); + const struct nft_expr *expr); int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); -int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_reject_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset); int nft_reject_icmp_code(u8 code); int nft_reject_icmpv6_code(u8 code); diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h index 832ab69efda5..4c3809e141f4 100644 --- a/include/net/netfilter/xt_rateest.h +++ b/include/net/netfilter/xt_rateest.h @@ -6,7 +6,7 @@ struct xt_rateest { /* keep lock and bstats on same cache line to speedup xt_rateest_tg() */ - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; spinlock_t lock; diff --git a/include/net/netkit.h b/include/net/netkit.h new file mode 100644 index 000000000000..9ec0163739f4 --- /dev/null +++ b/include/net/netkit.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* 
Copyright (c) 2023 Isovalent */ +#ifndef __NET_NETKIT_H +#define __NET_NETKIT_H + +#include <linux/bpf.h> + +#ifdef CONFIG_NETKIT +int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +INDIRECT_CALLABLE_DECLARE(struct net_device *netkit_peer_dev(struct net_device *dev)); +#else +static inline int netkit_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_prog_detach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} + +static inline struct net_device *netkit_peer_dev(struct net_device *dev) +{ + return NULL; +} +#endif /* CONFIG_NETKIT */ +#endif /* __NET_NETKIT_H */ diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 43ae50337685..02914b1df38b 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -30,7 +30,7 @@ struct calipso_doi; /* * NetLabel - A management interface for maintaining network packet label - * mapping tables for explicit packet labling protocols. + * mapping tables for explicit packet labeling protocols. * * Network protocols such as CIPSO and RIPSO require a label translation layer * to convert the label on the packet into something meaningful on the host @@ -97,7 +97,7 @@ struct calipso_doi; /* NetLabel audit information */ struct netlbl_audit { - u32 secid; + struct lsm_prop prop; kuid_t loginuid; unsigned int sessionid; }; @@ -145,15 +145,14 @@ struct netlbl_lsm_cache { * processing. * */ -#define NETLBL_CATMAP_MAPTYPE u64 #define NETLBL_CATMAP_MAPCNT 4 -#define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) +#define NETLBL_CATMAP_MAPSIZE (sizeof(u64) * 8) #define NETLBL_CATMAP_SIZE (NETLBL_CATMAP_MAPSIZE * \ NETLBL_CATMAP_MAPCNT) -#define NETLBL_CATMAP_BIT (NETLBL_CATMAP_MAPTYPE)0x01 +#define NETLBL_CATMAP_BIT ((u64)0x01) struct netlbl_lsm_catmap { u32 startbit; - NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; + u64 bitmap[NETLBL_CATMAP_MAPCNT]; struct netlbl_lsm_catmap *next; }; @@ -209,6 +208,7 @@ struct netlbl_lsm_secattr { * struct netlbl_calipso_ops - NetLabel CALIPSO operations * @doi_add: add a CALIPSO DOI * @doi_free: free a CALIPSO DOI + * @doi_remove: remove a CALIPSO DOI * @doi_getdef: returns a reference to a DOI * @doi_putdef: releases a reference of a DOI * @doi_walk: enumerate the DOI list @@ -275,15 +275,17 @@ struct netlbl_calipso_ops { * on success, NULL on failure. * */ -static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc(gfp_t flags) +static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc_noprof(gfp_t flags) { struct netlbl_lsm_cache *cache; - cache = kzalloc(sizeof(*cache), flags); + cache = kzalloc_noprof(sizeof(*cache), flags); if (cache) refcount_set(&cache->refcount, 1); return cache; } +#define netlbl_secattr_cache_alloc(...) \ + alloc_hooks(netlbl_secattr_cache_alloc_noprof(__VA_ARGS__)) /** * netlbl_secattr_cache_free - Frees a netlbl_lsm_cache struct @@ -312,10 +314,11 @@ static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache) * on failure. 
* */ -static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc(gfp_t flags) +static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc_noprof(gfp_t flags) { - return kzalloc(sizeof(struct netlbl_lsm_catmap), flags); + return kzalloc_noprof(sizeof(struct netlbl_lsm_catmap), flags); } +#define netlbl_catmap_alloc(...) alloc_hooks(netlbl_catmap_alloc_noprof(__VA_ARGS__)) /** * netlbl_catmap_free - Free a LSM secattr catmap @@ -377,10 +380,11 @@ static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) * pointer on success, or NULL on failure. * */ -static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc(gfp_t flags) +static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc_noprof(gfp_t flags) { - return kzalloc(sizeof(struct netlbl_lsm_secattr), flags); + return kzalloc_noprof(sizeof(struct netlbl_lsm_secattr), flags); } +#define netlbl_secattr_alloc(...) alloc_hooks(netlbl_secattr_alloc_noprof(__VA_ARGS__)) /** * netlbl_secattr_free - Frees a netlbl_lsm_secattr struct @@ -471,7 +475,8 @@ void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state); int netlbl_enabled(void); int netlbl_sock_setattr(struct sock *sk, u16 family, - const struct netlbl_lsm_secattr *secattr); + const struct netlbl_lsm_secattr *secattr, + bool sk_locked); void netlbl_sock_delattr(struct sock *sk); int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); @@ -488,6 +493,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr); void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway); +bool netlbl_sk_lock_check(struct sock *sk); /* * LSM label mapping cache operations @@ -615,7 +621,8 @@ static inline int netlbl_enabled(void) } static inline int netlbl_sock_setattr(struct sock *sk, u16 family, - const struct netlbl_lsm_secattr *secattr) + const struct netlbl_lsm_secattr *secattr, + bool sk_locked) { return -ENOSYS; } @@ -674,6 +681,11 @@ static inline struct audit_buffer *netlbl_audit_start(int type, { return NULL; } + +static inline bool netlbl_sk_lock_check(struct sock *sk) +{ + return true; +} #endif /* CONFIG_NETLABEL */ const struct netlbl_calipso_ops * diff --git a/include/net/netlink.h b/include/net/netlink.h index 7356f41d23ba..90a560dc167a 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -41,7 +41,8 @@ * nlmsg_get_pos() return current position in message * nlmsg_trim() trim part of message * nlmsg_cancel() cancel message construction - * nlmsg_free() free a netlink message + * nlmsg_consume() free a netlink message (expected) + * nlmsg_free() free a netlink message (drop) * * Message Sending: * nlmsg_multicast() multicast message to several groups @@ -117,6 +118,7 @@ * nla_nest_start(skb, type) start a nested attribute * nla_nest_end(skb, nla) finalize a nested attribute * nla_nest_cancel(skb, nla) cancel nested attribute construction + * nla_put_empty_nest(skb, type) create an empty nest * * Attribute Length Calculations: * nla_attr_size(payload) length of attribute w/o padding @@ -128,6 +130,8 @@ * nla_len(nla) length of attribute payload * * Attribute Payload Access for Basic Types: + * nla_get_uint(nla) get payload for a uint attribute + * nla_get_sint(nla) get payload for a sint attribute * nla_get_u8(nla) get payload for a u8 attribute * nla_get_u16(nla) get payload for a u16 attribute * nla_get_u32(nla) get payload for a u32 attribute @@ -139,10 +143,12 @@ * nla_get_flag(nla) return 1 if flag is true * nla_get_msecs(nla) get payload for a msecs 
attribute * + * The same functions also exist with _default(). + * * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area - * nla_strlcpy(dst, nla, size) copy attribute to a sized string + * nla_strscpy(dst, nla, size) copy attribute to a sized string * nla_strcmp(nla, str) compare attribute with string * * Attribute Parsing: @@ -155,7 +161,11 @@ * nla_parse() parse and validate stream of attrs * nla_parse_nested() parse nested attributes * nla_for_each_attr() loop over all attributes + * nla_for_each_attr_type() loop over all attributes with the + * given type * nla_for_each_nested() loop over the nested attributes + * nla_for_each_nested_type() loop over the nested attributes with + * the given type *========================================================================= */ @@ -181,6 +191,10 @@ enum { NLA_S64, NLA_BITFIELD32, NLA_REJECT, + NLA_BE16, + NLA_BE32, + NLA_SINT, + NLA_UINT, __NLA_TYPE_MAX, }; @@ -227,10 +241,12 @@ enum nla_policy_validation { * nested header (or empty); len field is used if * nested_policy is also used, for the max attr * number in the nested policy. + * NLA_SINT, NLA_UINT, * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, * NLA_S32, NLA_S64, + * NLA_BE16, NLA_BE32, * NLA_MSECS Leaving the length field zero will verify the * given type fits, using it verifies minimum length * just like "All other" @@ -257,10 +273,14 @@ enum nla_policy_validation { * while an array has the nested attributes at another * level down and the attribute types directly in the * nesting don't matter. + * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64, + * NLA_BE16, + * NLA_BE32, + * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, @@ -271,9 +291,11 @@ enum nla_policy_validation { * Note that in the interest of code simplicity and * struct size both limits are s16, so you cannot * enforce a range that doesn't fall within the range - * of s16 - do that as usual in the code instead. + * of s16 - do that using the NLA_POLICY_FULL_RANGE() + * or NLA_POLICY_FULL_RANGE_SIGNED() macros instead. * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and * NLA_POLICY_RANGE() macros. + * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, @@ -282,6 +304,7 @@ enum nla_policy_validation { * to a struct netlink_range_validation that indicates * the min/max values. * Use NLA_POLICY_FULL_RANGE(). + * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, @@ -298,7 +321,13 @@ enum nla_policy_validation { * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: + * NLA_U8, NLA_U16, + * NLA_U32, NLA_U64, + * NLA_S8, NLA_S16, + * NLA_S32, NLA_S64, + * NLA_MSECS, * NLA_BINARY Validation function called for the attribute. 
+ * * All other Unused - but note that it's a union * * Example: @@ -317,18 +346,10 @@ struct nla_policy { u8 validation_type; u16 len; union { - const u32 bitfield32_valid; - const u32 mask; - const char *reject_message; - const struct nla_policy *nested_policy; - struct netlink_range_validation *range; - struct netlink_range_validation_signed *range_signed; - struct { - s16 min, max; - }; - int (*validate)(const struct nlattr *attr, - struct netlink_ext_ack *extack); - /* This entry is special, and used for the attribute at index 0 + /** + * @strict_start_type: first attribute to validate strictly + * + * This entry is special, and used for the attribute at index 0 * only, and specifies special data about the policy, namely it * specifies the "boundary type" where strict length validation * starts for any attribute types >= this value, also, strict @@ -347,6 +368,19 @@ struct nla_policy { * was added to enforce strict validation from thereon. */ u16 strict_start_type; + + /* private: use NLA_POLICY_*() to set */ + const u32 bitfield32_valid; + const u32 mask; + const char *reject_message; + const struct nla_policy *nested_policy; + const struct netlink_range_validation *range; + const struct netlink_range_validation_signed *range_signed; + struct { + s16 min, max; + }; + int (*validate)(const struct nlattr *attr, + struct netlink_ext_ack *extack); }; }; @@ -364,10 +398,13 @@ struct nla_policy { #define NLA_POLICY_BITFIELD32(valid) \ { .type = NLA_BITFIELD32, .bitfield32_valid = valid } -#define __NLA_IS_UINT_TYPE(tp) \ - (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || tp == NLA_U64) +#define __NLA_IS_UINT_TYPE(tp) \ + (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || \ + tp == NLA_U64 || tp == NLA_UINT || \ + tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_IS_SINT_TYPE(tp) \ - (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) + (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64 || \ + tp == NLA_SINT) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ @@ -441,6 +478,7 @@ struct nla_policy { .max = _len \ } #define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) +#define NLA_POLICY_MAX_LEN(_len) NLA_POLICY_MAX(NLA_BINARY, _len) /** * struct nl_info - netlink source information @@ -506,7 +544,7 @@ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); -size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize); +ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize); char *nla_strdup(const struct nlattr *nla, gfp_t flags); int nla_memcpy(void *dest, const struct nlattr *src, int count); int nla_memcmp(const struct nlattr *nla, const void *data, size_t size); @@ -580,6 +618,22 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) } /** + * nlmsg_payload - message payload if the data fits in the len + * @nlh: netlink message header + * @len: struct length + * + * Returns: The netlink message payload/data if the length is sufficient, + * otherwise NULL. 
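+ *
+ * A minimal sketch of the intended use, assuming a hypothetical
+ * handler whose messages carry a struct ifinfomsg header:
+ *
+ *	struct ifinfomsg *ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ *
+ *	if (!ifm)
+ *		return -EINVAL;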
+ */ +static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len) +{ + if (nlh->nlmsg_len < nlmsg_msg_size(len)) + return NULL; + + return nlmsg_data(nlh); +} + +/** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header @@ -618,7 +672,7 @@ static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) * @nlh: netlink message header * @remaining: number of bytes remaining in message stream * - * Returns the next netlink message in the message stream and + * Returns: the next netlink message in the message stream and * decrements remaining by the size of the current message. */ static inline struct nlmsghdr * @@ -645,7 +699,7 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining) * exceeding maxtype will be rejected, policy must be specified, attributes * will be validated in the strictest way possible. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, @@ -670,7 +724,7 @@ static inline int nla_parse(struct nlattr **tb, int maxtype, * exceeding maxtype will be ignored and attributes from the policy are not * always strictly validated (only for new attributes). * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, @@ -695,7 +749,7 @@ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, * exceeding maxtype will be rejected as well as trailing data, but the * policy is not completely strictly validated (only for new attributes). * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, @@ -741,6 +795,7 @@ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected + * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() @@ -760,6 +815,7 @@ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected + * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() @@ -779,6 +835,7 @@ static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected + * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated_strict() @@ -796,10 +853,10 @@ nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, /** * nlmsg_find_attr - find a specific attribute in a netlink message * @nlh: netlink message header - * @hdrlen: length of familiy specific header + * @hdrlen: length of family specific header * @attrtype: type of attribute to look for * - * Returns the first attribute which matches the specified type. + * Returns: the first attribute which matches the specified type. 
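+ *
+ * Sketch of a typical lookup; IFLA_MTU and the ifinfomsg header are
+ * only illustrative choices:
+ *
+ *	struct nlattr *nla = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg),
+ *					     IFLA_MTU);
+ *	if (nla)
+ *		mtu = nla_get_u32(nla);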
*/ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) @@ -814,14 +871,13 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy - * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in liberal mode. - * See documenation of struct nla_policy for more details. + * See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_validate_deprecated(const struct nlattr *head, int len, int maxtype, @@ -842,9 +898,9 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len, * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in strict mode. - * See documenation of struct nla_policy for more details. + * See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, @@ -857,7 +913,7 @@ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, /** * nlmsg_validate_deprecated - validate a netlink message including attributes * @nlh: netlinket message header - * @hdrlen: length of familiy specific header + * @hdrlen: length of family specific header * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct @@ -881,18 +937,29 @@ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, * nlmsg_report - need to report back to application? * @nlh: netlink message header * - * Returns 1 if a report back to the application is requested. + * Returns: 1 if a report back to the application is requested. */ static inline int nlmsg_report(const struct nlmsghdr *nlh) { - return !!(nlh->nlmsg_flags & NLM_F_ECHO); + return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0; +} + +/** + * nlmsg_seq - return the seq number of netlink message + * @nlh: netlink message header + * + * Returns: 0 if netlink message is NULL + */ +static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) +{ + return nlh ? nlh->nlmsg_seq : 0; } /** * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @nlh: netlink message header - * @hdrlen: length of familiy specific header + * @hdrlen: length of family specific header * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ @@ -908,7 +975,7 @@ static inline int nlmsg_report(const struct nlmsghdr *nlh) * @payload: length of message payload * @flags: message flags * - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. 
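+ *
+ * Typical construction sequence (sketch only; the message type and
+ * payload size are illustrative):
+ *
+ *	nlh = nlmsg_put(skb, portid, seq, RTM_NEWLINK,
+ *			sizeof(struct ifinfomsg), NLM_F_MULTI);
+ *	if (!nlh)
+ *		return -EMSGSIZE;
+ *	... fill the ifinfomsg, add attributes ...
+ *	nlmsg_end(skb, nlh);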
*/ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, @@ -921,6 +988,27 @@ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 se } /** + * nlmsg_append - Add more data to a nlmsg in a skb + * @skb: socket buffer to store message in + * @size: length of message payload + * + * Append data to an existing nlmsg, used when constructing a message + * with multiple fixed-format headers (which is rare). + * Returns: NULL if the tailroom of the skb is insufficient to store + * the extra payload. + */ +static inline void *nlmsg_append(struct sk_buff *skb, u32 size) +{ + if (unlikely(skb_tailroom(skb) < NLMSG_ALIGN(size))) + return NULL; + + if (NLMSG_ALIGN(size) - size) + memset(skb_tail_pointer(skb) + size, 0, + NLMSG_ALIGN(size) - size); + return __skb_put(skb, NLMSG_ALIGN(size)); +} + +/** * nlmsg_put_answer - Add a new callback based netlink message to an skb * @skb: socket buffer to store message in * @cb: netlink callback @@ -928,7 +1016,7 @@ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 se * @payload: length of message payload * @flags: message flags * - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, @@ -954,11 +1042,25 @@ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) } /** + * nlmsg_new_large - Allocate a new netlink message with non-contiguous + * physical memory + * @payload: size of the message payload + * + * The allocated skb is unable to have frag page for shinfo->frags*, + * as the NULL setting for skb->head in netlink_skb_destructor() will + * bypass most of the handling in skb_release_data() + */ +static inline struct sk_buff *nlmsg_new_large(size_t payload) +{ + return netlink_alloc_large_skb(nlmsg_total_size(payload), 0); +} + +/** * nlmsg_end - Finalize a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * - * Corrects the netlink message header to include the appeneded + * Corrects the netlink message header to include the appended * attributes. Only necessary if attributes have been added to * the message. */ @@ -971,7 +1073,7 @@ static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) * nlmsg_get_pos - return current position in netlink message * @skb: socket buffer the message is stored in * - * Returns a pointer to the current tail of the message. + * Returns: a pointer to the current tail of the message. 
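+ *
+ * Commonly paired with nlmsg_trim() to roll back a partially built
+ * message; fill_attrs() below is a hypothetical helper:
+ *
+ *	void *pos = nlmsg_get_pos(skb);
+ *
+ *	if (fill_attrs(skb) < 0)
+ *		nlmsg_trim(skb, pos);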
*/ static inline void *nlmsg_get_pos(struct sk_buff *skb) { @@ -1007,7 +1109,7 @@ static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh) } /** - * nlmsg_free - free a netlink message + * nlmsg_free - drop a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_free(struct sk_buff *skb) @@ -1016,21 +1118,38 @@ static inline void nlmsg_free(struct sk_buff *skb) } /** - * nlmsg_multicast - multicast a netlink message + * nlmsg_consume - free a netlink message + * @skb: socket buffer of netlink message + */ +static inline void nlmsg_consume(struct sk_buff *skb) +{ + consume_skb(skb); +} + +/** + * nlmsg_multicast_filtered - multicast a netlink message with filter function * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags + * @filter: filter function + * @filter_data: filter function private data + * + * Return: 0 on success, negative error code for failure. */ -static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, - u32 portid, unsigned int group, gfp_t flags) +static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb, + u32 portid, unsigned int group, + gfp_t flags, + netlink_filter_fn filter, + void *filter_data) { int err; NETLINK_CB(skb).dst_group = group; - err = netlink_broadcast(sk, skb, portid, group, flags); + err = netlink_broadcast_filtered(sk, skb, portid, group, flags, + filter, filter_data); if (err > 0) err = 0; @@ -1038,6 +1157,21 @@ static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, } /** + * nlmsg_multicast - multicast a netlink message + * @sk: netlink socket to spread messages to + * @skb: netlink message as socket buffer + * @portid: own netlink portid to avoid sending to yourself + * @group: multicast group id + * @flags: allocation flags + */ +static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags) +{ + return nlmsg_multicast_filtered(sk, skb, portid, group, flags, + NULL, NULL); +} + +/** * nlmsg_unicast - unicast a netlink message * @sk: netlink socket to spread message to * @skb: netlink message as socket buffer @@ -1143,7 +1277,7 @@ static inline void *nla_data(const struct nlattr *nla) * nla_len - length of payload * @nla: netlink attribute */ -static inline int nla_len(const struct nlattr *nla) +static inline u16 nla_len(const struct nlattr *nla) { return nla->nla_len - NLA_HDRLEN; } @@ -1165,7 +1299,7 @@ static inline int nla_ok(const struct nlattr *nla, int remaining) * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream * - * Returns the next netlink attribute in the attribute stream and + * Returns: the next netlink attribute in the attribute stream and * decrements remaining by the size of the current attribute. */ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) @@ -1181,7 +1315,7 @@ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) * @nla: attribute containing the nested attributes * @attrtype: type of attribute to look for * - * Returns the first attribute which matches the specified type. + * Returns: the first attribute which matches the specified type. 
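+ *
+ * Sketch only; EXAMPLE_ATTR_FOO is a made-up attribute type:
+ *
+ *	struct nlattr *foo = nla_find_nested(opts, EXAMPLE_ATTR_FOO);
+ *
+ *	if (foo)
+ *		val = nla_get_u32(foo);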
*/ static inline struct nlattr * nla_find_nested(const struct nlattr *nla, int attrtype) @@ -1312,6 +1446,22 @@ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) } /** + * nla_put_uint - Add a variable-size unsigned int to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @value: numeric value + */ +static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value) +{ + u64 tmp64 = value; + u32 tmp32 = value; + + if (tmp64 == tmp32) + return nla_put_u32(skb, attrtype, tmp32); + return nla_put(skb, attrtype, sizeof(u64), &tmp64); +} + +/** * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type @@ -1466,6 +1616,22 @@ static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, } /** + * nla_put_sint - Add a variable-size signed int to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @value: numeric value + */ +static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value) +{ + s64 tmp64 = value; + s32 tmp32 = value; + + if (tmp64 == tmp32) + return nla_put_s32(skb, attrtype, tmp32); + return nla_put(skb, attrtype, sizeof(s64), &tmp64); +} + +/** * nla_put_string - Add a string netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type @@ -1555,6 +1721,20 @@ static inline u32 nla_get_u32(const struct nlattr *nla) } /** + * nla_get_u32_default - return payload of u32 attribute or default + * @nla: u32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u32 nla_get_u32_default(const struct nlattr *nla, u32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u32(nla); +} + +/** * nla_get_be32 - return payload of __be32 attribute * @nla: __be32 netlink attribute */ @@ -1564,6 +1744,21 @@ static inline __be32 nla_get_be32(const struct nlattr *nla) } /** + * nla_get_be32_default - return payload of be32 attribute or default + * @nla: __be32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be32 nla_get_be32_default(const struct nlattr *nla, + __be32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be32(nla); +} + +/** * nla_get_le32 - return payload of __le32 attribute * @nla: __le32 netlink attribute */ @@ -1573,6 +1768,21 @@ static inline __le32 nla_get_le32(const struct nlattr *nla) } /** + * nla_get_le32_default - return payload of le32 attribute or default + * @nla: __le32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le32 nla_get_le32_default(const struct nlattr *nla, + __le32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le32(nla); +} + +/** * nla_get_u16 - return payload of u16 attribute * @nla: u16 netlink attribute */ @@ -1582,6 +1792,20 @@ static inline u16 nla_get_u16(const struct nlattr *nla) } /** + * nla_get_u16_default - return payload of u16 attribute or default + * @nla: u16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u16 
nla_get_u16_default(const struct nlattr *nla, u16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u16(nla); +} + +/** * nla_get_be16 - return payload of __be16 attribute * @nla: __be16 netlink attribute */ @@ -1591,6 +1815,21 @@ static inline __be16 nla_get_be16(const struct nlattr *nla) } /** + * nla_get_be16_default - return payload of be16 attribute or default + * @nla: __be16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be16 nla_get_be16_default(const struct nlattr *nla, + __be16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be16(nla); +} + +/** * nla_get_le16 - return payload of __le16 attribute * @nla: __le16 netlink attribute */ @@ -1600,6 +1839,21 @@ static inline __le16 nla_get_le16(const struct nlattr *nla) } /** + * nla_get_le16_default - return payload of le16 attribute or default + * @nla: __le16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le16 nla_get_le16_default(const struct nlattr *nla, + __le16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le16(nla); +} + +/** * nla_get_u8 - return payload of u8 attribute * @nla: u8 netlink attribute */ @@ -1609,6 +1863,20 @@ static inline u8 nla_get_u8(const struct nlattr *nla) } /** + * nla_get_u8_default - return payload of u8 attribute or default + * @nla: u8 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u8 nla_get_u8_default(const struct nlattr *nla, u8 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u8(nla); +} + +/** * nla_get_u64 - return payload of u64 attribute * @nla: u64 netlink attribute */ @@ -1622,6 +1890,45 @@ static inline u64 nla_get_u64(const struct nlattr *nla) } /** + * nla_get_u64_default - return payload of u64 attribute or default + * @nla: u64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u64 nla_get_u64_default(const struct nlattr *nla, u64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u64(nla); +} + +/** + * nla_get_uint - return payload of uint attribute + * @nla: uint netlink attribute + */ +static inline u64 nla_get_uint(const struct nlattr *nla) +{ + if (nla_len(nla) == sizeof(u32)) + return nla_get_u32(nla); + return nla_get_u64(nla); +} + +/** + * nla_get_uint_default - return payload of uint attribute or default + * @nla: uint netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u64 nla_get_uint_default(const struct nlattr *nla, u64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_uint(nla); +} + +/** * nla_get_be64 - return payload of __be64 attribute * @nla: __be64 netlink attribute */ @@ -1635,6 +1942,21 @@ static inline __be64 nla_get_be64(const struct nlattr *nla) } /** + * nla_get_be64_default - return payload of be64 attribute or default + * @nla: __be64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline 
__be64 nla_get_be64_default(const struct nlattr *nla, + __be64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be64(nla); +} + +/** * nla_get_le64 - return payload of __le64 attribute * @nla: __le64 netlink attribute */ @@ -1644,6 +1966,21 @@ static inline __le64 nla_get_le64(const struct nlattr *nla) } /** + * nla_get_le64_default - return payload of le64 attribute or default + * @nla: __le64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le64 nla_get_le64_default(const struct nlattr *nla, + __le64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le64(nla); +} + +/** * nla_get_s32 - return payload of s32 attribute * @nla: s32 netlink attribute */ @@ -1653,6 +1990,20 @@ static inline s32 nla_get_s32(const struct nlattr *nla) } /** + * nla_get_s32_default - return payload of s32 attribute or default + * @nla: s32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s32 nla_get_s32_default(const struct nlattr *nla, s32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s32(nla); +} + +/** * nla_get_s16 - return payload of s16 attribute * @nla: s16 netlink attribute */ @@ -1662,6 +2013,20 @@ static inline s16 nla_get_s16(const struct nlattr *nla) } /** + * nla_get_s16_default - return payload of s16 attribute or default + * @nla: s16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s16 nla_get_s16_default(const struct nlattr *nla, s16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s16(nla); +} + +/** * nla_get_s8 - return payload of s8 attribute * @nla: s8 netlink attribute */ @@ -1671,6 +2036,20 @@ static inline s8 nla_get_s8(const struct nlattr *nla) } /** + * nla_get_s8_default - return payload of s8 attribute or default + * @nla: s8 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s8 nla_get_s8_default(const struct nlattr *nla, s8 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s8(nla); +} + +/** * nla_get_s64 - return payload of s64 attribute * @nla: s64 netlink attribute */ @@ -1684,6 +2063,45 @@ static inline s64 nla_get_s64(const struct nlattr *nla) } /** + * nla_get_s64_default - return payload of s64 attribute or default + * @nla: s64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s64 nla_get_s64_default(const struct nlattr *nla, s64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s64(nla); +} + +/** + * nla_get_sint - return payload of uint attribute + * @nla: uint netlink attribute + */ +static inline s64 nla_get_sint(const struct nlattr *nla) +{ + if (nla_len(nla) == sizeof(s32)) + return nla_get_s32(nla); + return nla_get_s64(nla); +} + +/** + * nla_get_sint_default - return payload of sint attribute or default + * @nla: sint netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s64 
nla_get_sint_default(const struct nlattr *nla, s64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_sint(nla); +} + +/** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute */ @@ -1696,7 +2114,7 @@ static inline int nla_get_flag(const struct nlattr *nla) * nla_get_msecs - return payload of msecs attribute * @nla: msecs netlink attribute * - * Returns the number of milliseconds in jiffies. + * Returns: the number of milliseconds in jiffies. */ static inline unsigned long nla_get_msecs(const struct nlattr *nla) { @@ -1706,6 +2124,21 @@ static inline unsigned long nla_get_msecs(const struct nlattr *nla) } /** + * nla_get_msecs_default - return payload of msecs attribute or default + * @nla: msecs netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline unsigned long nla_get_msecs_default(const struct nlattr *nla, + unsigned long defvalue) +{ + if (!nla) + return defvalue; + return nla_get_msecs(nla); +} + +/** * nla_get_in_addr - return payload of IPv4 address attribute * @nla: IPv4 address netlink attribute */ @@ -1715,6 +2148,21 @@ static inline __be32 nla_get_in_addr(const struct nlattr *nla) } /** + * nla_get_in_addr_default - return payload of be32 attribute or default + * @nla: IPv4 address netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be32 nla_get_in_addr_default(const struct nlattr *nla, + __be32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_in_addr(nla); +} + +/** * nla_get_in6_addr - return payload of IPv6 address attribute * @nla: IPv6 address netlink attribute */ @@ -1743,10 +2191,11 @@ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) * @src: netlink attribute to duplicate from * @gfp: GFP mask */ -static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp) +static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) { - return kmemdup(nla_data(src), nla_len(src), gfp); + return kmemdup_noprof(nla_data(src), nla_len(src), gfp); } +#define nla_memdup(...) alloc_hooks(nla_memdup_noprof(__VA_ARGS__)) /** * nla_nest_start_noflag - Start a new level of nested attributes @@ -1757,7 +2206,7 @@ static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp) * marked their nest attributes with NLA_F_NESTED flag. New APIs should use * nla_nest_start() which sets the flag. * - * Returns the container attribute or NULL on error + * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, int attrtype) @@ -1778,7 +2227,7 @@ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED * flag. This is the preferred function to use in new code. * - * Returns the container attribute or NULL on error + * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) { @@ -1791,9 +2240,9 @@ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) * @start: container attribute * * Corrects the container attribute header to include the all - * appeneded attributes. + * appended attributes. * - * Returns the total data length of the skb. + * Returns: the total data length of the skb. 
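+ *
+ * Typical nesting pattern (sketch only; the EXAMPLE_ATTR_* types are
+ * made up):
+ *
+ *	struct nlattr *nest = nla_nest_start(skb, EXAMPLE_ATTR_OPTS);
+ *
+ *	if (!nest)
+ *		goto nla_put_failure;
+ *	if (nla_put_u32(skb, EXAMPLE_ATTR_VALUE, val))
+ *		goto nla_put_failure;
+ *	nla_nest_end(skb, nest);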
*/ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) { @@ -1815,6 +2264,20 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) } /** + * nla_put_empty_nest - Create an empty nest + * @skb: socket buffer the message is stored in + * @attrtype: attribute type of the container + * + * This function is a helper for creating empty nests. + * + * Returns: 0 when successful or -EMSGSIZE on failure. + */ +static inline int nla_put_empty_nest(struct sk_buff *skb, int attrtype) +{ + return nla_nest_start(skb, attrtype) ? 0 : -EMSGSIZE; +} + +/** * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute * @maxtype: maximum attribute type to be expected @@ -1824,9 +2287,9 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * * Validates all attributes in the nested attribute stream against the * specified policy. Attributes with a type exceeding maxtype will be - * ignored. See documenation of struct nla_policy for more details. + * ignored. See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, @@ -1859,7 +2322,7 @@ nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute * @skb: socket buffer the message is stored in * - * Return true if padding is needed to align the next attribute (nla_data()) to + * Return: true if padding is needed to align the next attribute (nla_data()) to * a 64-bit aligned area. */ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) @@ -1886,7 +2349,7 @@ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) * This will only be done in architectures which do not have * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined. * - * Returns zero on success or a negative error code. + * Returns: zero on success or a negative error code. 
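+ *
+ * Rarely open-coded; callers more commonly rely on helpers such as
+ * nla_put_u64_64bit(), which handle the padding. A sketch with an
+ * illustrative pad attribute type:
+ *
+ *	if (nla_align_64bit(skb, EXAMPLE_ATTR_PAD))
+ *		goto nla_put_failure;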
*/ static inline int nla_align_64bit(struct sk_buff *skb, int padattr) { @@ -1923,6 +2386,18 @@ static inline int nla_total_size_64bit(int payload) pos = nla_next(pos, &(rem))) /** + * nla_for_each_attr_type - iterate over a stream of attributes + * @pos: loop counter, set to current attribute + * @type: required attribute type for @pos + * @head: head of attribute stream + * @len: length of attribute stream + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nla_for_each_attr_type(pos, type, head, len, rem) \ + nla_for_each_attr(pos, head, len, rem) \ + if (nla_type(pos) == type) + +/** * nla_for_each_nested - iterate over nested attributes * @pos: loop counter, set to current attribute * @nla: attribute containing the nested attributes @@ -1932,6 +2407,17 @@ static inline int nla_total_size_64bit(int payload) nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem) /** + * nla_for_each_nested_type - iterate over nested attributes + * @pos: loop counter, set to current attribute + * @type: required attribute type for @pos + * @nla: attribute containing the nested attributes + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nla_for_each_nested_type(pos, type, nla, rem) \ + nla_for_each_nested(pos, nla, rem) \ + if (nla_type(pos) == type) + +/** * nla_is_last - Test if attribute is last in stream * @nla: attribute to test * @rem: bytes remaining in stream diff --git a/include/net/netmem.h b/include/net/netmem.h new file mode 100644 index 000000000000..386164fb9c18 --- /dev/null +++ b/include/net/netmem.h @@ -0,0 +1,299 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Network memory + * + * Author: Mina Almasry <almasrymina@google.com> + */ + +#ifndef _NET_NETMEM_H +#define _NET_NETMEM_H + +#include <linux/dma-mapping.h> +#include <linux/mm.h> +#include <net/net_debug.h> + +/* net_iov */ + +DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); + +/* We overload the LSB of the struct page pointer to indicate whether it's + * a page or net_iov. + */ +#define NET_IOV 0x01UL + +enum net_iov_type { + NET_IOV_DMABUF, + NET_IOV_IOURING, + + /* Force size to unsigned long to make the NET_IOV_ASSERTS below pass. + */ + NET_IOV_MAX = ULONG_MAX +}; + +struct net_iov { + enum net_iov_type type; + unsigned long pp_magic; + struct page_pool *pp; + struct net_iov_area *owner; + unsigned long dma_addr; + atomic_long_t pp_ref_count; +}; + +struct net_iov_area { + /* Array of net_iovs for this area. */ + struct net_iov *niovs; + size_t num_niovs; + + /* Offset into the dma-buf where this chunk starts. */ + unsigned long base_virtual; +}; + +/* These fields in struct page are used by the page_pool and net stack: + * + * struct { + * unsigned long pp_magic; + * struct page_pool *pp; + * unsigned long _pp_mapping_pad; + * unsigned long dma_addr; + * atomic_long_t pp_ref_count; + * }; + * + * We mirror the page_pool fields here so the page_pool can access these fields + * without worrying whether the underlying fields belong to a page or net_iov. + * + * The non-net stack fields of struct page are private to the mm stack and must + * never be mirrored to net_iov. 
+ */ +#define NET_IOV_ASSERT_OFFSET(pg, iov) \ + static_assert(offsetof(struct page, pg) == \ + offsetof(struct net_iov, iov)) +NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic); +NET_IOV_ASSERT_OFFSET(pp, pp); +NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); +NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); +#undef NET_IOV_ASSERT_OFFSET + +static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov) +{ + return niov->owner; +} + +static inline unsigned int net_iov_idx(const struct net_iov *niov) +{ + return niov - net_iov_owner(niov)->niovs; +} + +/* netmem */ + +/** + * typedef netmem_ref - a nonexistent type marking a reference to generic + * network memory. + * + * A netmem_ref currently is always a reference to a struct page. This + * abstraction is introduced so support for new memory types can be added. + * + * Use the supplied helpers to obtain the underlying memory pointer and fields. + */ +typedef unsigned long __bitwise netmem_ref; + +static inline bool netmem_is_net_iov(const netmem_ref netmem) +{ + return (__force unsigned long)netmem & NET_IOV; +} + +/** + * __netmem_to_page - unsafely get pointer to the &page backing @netmem + * @netmem: netmem reference to convert + * + * Unsafe version of netmem_to_page(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (no check for the LSB, no WARN). When @netmem points to IOV, + * provokes undefined behaviour. + * + * Return: pointer to the &page (garbage if @netmem is not page-backed). + */ +static inline struct page *__netmem_to_page(netmem_ref netmem) +{ + return (__force struct page *)netmem; +} + +/* This conversion fails (returns NULL) if the netmem_ref is not struct page + * backed. + */ +static inline struct page *netmem_to_page(netmem_ref netmem) +{ + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return NULL; + + return __netmem_to_page(netmem); +} + +static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return (struct net_iov *)((__force unsigned long)netmem & + ~NET_IOV); + + DEBUG_NET_WARN_ON_ONCE(true); + return NULL; +} + +static inline netmem_ref net_iov_to_netmem(struct net_iov *niov) +{ + return (__force netmem_ref)((unsigned long)niov | NET_IOV); +} + +static inline netmem_ref page_to_netmem(struct page *page) +{ + return (__force netmem_ref)page; +} + +/** + * virt_to_netmem - convert virtual memory pointer to a netmem reference + * @data: host memory pointer to convert + * + * Return: netmem reference to the &page backing this virtual address. + */ +static inline netmem_ref virt_to_netmem(const void *data) +{ + return page_to_netmem(virt_to_page(data)); +} + +static inline int netmem_ref_count(netmem_ref netmem) +{ + /* The non-pp refcount of net_iov is always 1. On net_iov, we only + * support pp refcounting which uses the pp_ref_count field. + */ + if (netmem_is_net_iov(netmem)) + return 1; + + return page_ref_count(netmem_to_page(netmem)); +} + +static inline unsigned long netmem_pfn_trace(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return 0; + + return page_to_pfn(netmem_to_page(netmem)); +} + +static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) +{ + return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); +} + +/** + * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem + * @netmem: netmem reference to get the pointer from + * + * Unsafe version of netmem_get_pp(). When @netmem is always page-backed, + * e.g. 
when it's a header buffer, performs faster and generates smaller + * object code (avoids clearing the LSB). When @netmem points to IOV, + * provokes invalid memory access. + * + * Return: pointer to the &page_pool (garbage if @netmem is not page-backed). + */ +static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) +{ + return __netmem_to_page(netmem)->pp; +} + +static inline struct page_pool *netmem_get_pp(netmem_ref netmem) +{ + return __netmem_clear_lsb(netmem)->pp; +} + +static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem) +{ + return &__netmem_clear_lsb(netmem)->pp_ref_count; +} + +static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid) +{ + /* NUMA node preference only makes sense if we're allocating + * system memory. Memory providers (which give us net_iovs) + * choose for us. + */ + if (netmem_is_net_iov(netmem)) + return true; + + return page_to_nid(netmem_to_page(netmem)) == pref_nid; +} + +static inline netmem_ref netmem_compound_head(netmem_ref netmem) +{ + /* niov are never compounded */ + if (netmem_is_net_iov(netmem)) + return netmem; + + return page_to_netmem(compound_head(netmem_to_page(netmem))); +} + +/** + * __netmem_address - unsafely get pointer to the memory backing @netmem + * @netmem: netmem reference to get the pointer for + * + * Unsafe version of netmem_address(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (no check for the LSB). When @netmem points to IOV, provokes + * undefined behaviour. + * + * Return: pointer to the memory (garbage if @netmem is not page-backed). + */ +static inline void *__netmem_address(netmem_ref netmem) +{ + return page_address(__netmem_to_page(netmem)); +} + +static inline void *netmem_address(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return NULL; + + return __netmem_address(netmem); +} + +/** + * netmem_is_pfmemalloc - check if @netmem was allocated under memory pressure + * @netmem: netmem reference to check + * + * Return: true if @netmem is page-backed and the page was allocated under + * memory pressure, false otherwise. 
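+ *
+ * A driver RX path might propagate this into the skb (sketch only,
+ * not a requirement of this API):
+ *
+ *	if (netmem_is_pfmemalloc(netmem))
+ *		skb->pfmemalloc = true;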
+ */ +static inline bool netmem_is_pfmemalloc(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return false; + + return page_is_pfmemalloc(netmem_to_page(netmem)); +} + +static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) +{ + return __netmem_clear_lsb(netmem)->dma_addr; +} + +void get_netmem(netmem_ref netmem); +void put_netmem(netmem_ref netmem); + +#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \ + do { \ + if (!netmem_is_net_iov(NETMEM)) \ + dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \ + else \ + dma_unmap_addr_set(PTR, ADDR_NAME, 0); \ + } while (0) + +static inline void netmem_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + if (!addr) + return; + + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +} + +#endif /* _NET_NETMEM_H */ diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index 0ca6a1b87185..2c01a278d1eb 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -6,11 +6,18 @@ #ifndef __NETNS_BPF_H__ #define __NETNS_BPF_H__ -#include <linux/bpf-netns.h> +#include <linux/list.h> struct bpf_prog; struct bpf_prog_array; +enum netns_bpf_attach_type { + NETNS_BPF_INVALID = -1, + NETNS_BPF_FLOW_DISSECTOR = 0, + NETNS_BPF_SK_LOOKUP, + MAX_NETNS_BPF_ATTACH_TYPE +}; + struct netns_bpf { /* Array of programs to run compiled from progs or links */ struct bpf_prog_array __rcu *run_array[MAX_NETNS_BPF_ATTACH_TYPE]; diff --git a/include/net/netns/can.h b/include/net/netns/can.h index 52fbd8291a96..48b79f7e6236 100644 --- a/include/net/netns/can.h +++ b/include/net/netns/can.h @@ -7,6 +7,7 @@ #define __NETNS_CAN_H__ #include <linux/spinlock.h> +#include <linux/timer.h> struct can_dev_rcv_lists; struct can_pkg_stats; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 806454e767bf..bae914815aa3 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -24,9 +24,13 @@ struct nf_generic_net { struct nf_tcp_net { unsigned int timeouts[TCP_CONNTRACK_TIMEOUT_MAX]; - int tcp_loose; - int tcp_be_liberal; - int tcp_max_retrans; + u8 tcp_loose; + u8 tcp_be_liberal; + u8 tcp_max_retrans; + u8 tcp_ignore_invalid_rst; +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + unsigned int offload_timeout; +#endif }; enum udp_conntrack { @@ -37,6 +41,9 @@ enum udp_conntrack { struct nf_udp_net { unsigned int timeouts[UDP_CT_MAX]; +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + unsigned int offload_timeout; +#endif }; struct nf_icmp_net { @@ -45,7 +52,7 @@ struct nf_icmp_net { #ifdef CONFIG_NF_CT_PROTO_DCCP struct nf_dccp_net { - int dccp_loose; + u8 dccp_loose; unsigned int dccp_timeout[CT_DCCP_MAX + 1]; }; #endif @@ -86,37 +93,21 @@ struct nf_ip_net { #endif }; -struct ct_pcpu { - spinlock_t lock; - struct hlist_nulls_head unconfirmed; - struct hlist_nulls_head dying; -}; - struct netns_ct { - atomic_t count; - unsigned int expect_count; #ifdef CONFIG_NF_CONNTRACK_EVENTS - struct delayed_work ecache_dwork; bool ecache_dwork_pending; #endif - bool auto_assign_helper_warned; -#ifdef CONFIG_SYSCTL - struct ctl_table_header *sysctl_header; -#endif - unsigned int sysctl_log_invalid; /* Log invalid packets */ - int sysctl_events; - int sysctl_acct; - int sysctl_auto_assign_helper; - int sysctl_tstamp; - int sysctl_checksum; + u8 sysctl_log_invalid; /* Log invalid packets */ + u8 sysctl_events; + u8 sysctl_acct; + u8 sysctl_tstamp; + u8 sysctl_checksum; - struct ct_pcpu __percpu *pcpu_lists; struct ip_conntrack_stat __percpu *stat; struct 
nf_ct_event_notifier __rcu *nf_conntrack_event_cb; - struct nf_exp_event_notifier __rcu *nf_expect_event_cb; struct nf_ip_net nf_ct_proto; #if defined(CONFIG_NF_CONNTRACK_LABELS) - unsigned int labels_used; + atomic_t labels_used; #endif }; #endif diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 36c2d998a43c..9b36f0ff0c20 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -2,19 +2,28 @@ #ifndef __NETNS_CORE_H__ #define __NETNS_CORE_H__ +#include <linux/types.h> + struct ctl_table_header; struct prot_inuse; +struct cpumask; struct netns_core { /* core sysctls */ struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + int sysctl_optmem_max; + u8 sysctl_txrehash; + u8 sysctl_tstamp_allow_data; #ifdef CONFIG_PROC_FS - int __percpu *sock_inuse; struct prot_inuse __percpu *prot_inuse; #endif + +#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL) + struct cpumask *rps_default_mask; +#endif }; #endif diff --git a/include/net/netns/dccp.h b/include/net/netns/dccp.h deleted file mode 100644 index cdbc4f5b8390..000000000000 --- a/include/net/netns/dccp.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NETNS_DCCP_H__ -#define __NETNS_DCCP_H__ - -struct sock; - -struct netns_dccp { - struct sock *v4_ctl_sk; - struct sock *v6_ctl_sk; -}; - -#endif diff --git a/include/net/netns/flow_table.h b/include/net/netns/flow_table.h new file mode 100644 index 000000000000..1c5fc657e267 --- /dev/null +++ b/include/net/netns/flow_table.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NETNS_FLOW_TABLE_H +#define __NETNS_FLOW_TABLE_H + +struct nf_flow_table_stat { + unsigned int count_wq_add; + unsigned int count_wq_del; + unsigned int count_wq_stats; +}; + +struct netns_ft { + struct nf_flow_table_stat __percpu *stat; +}; +#endif diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index 8a1ab47c3fb3..00c399edeed1 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -8,6 +8,7 @@ #include <linux/bug.h> #include <linux/rcupdate.h> +#include <net/net_namespace.h> /* * Generic net pointers are to be used by modules to put some private @@ -32,7 +33,7 @@ struct net_generic { struct rcu_head rcu; } s; - void *ptr[0]; + DECLARE_FLEX_ARRAY(void *, ptr); }; }; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8e4fcac4df72..6373e3f17da8 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -9,9 +9,9 @@ #include <linux/uidgid.h> #include <net/inet_frag.h> #include <linux/rcupdate.h> +#include <linux/seqlock.h> #include <linux/siphash.h> -struct tcpm_hash_bucket; struct ctl_table_header; struct ipv4_devconf; struct fib_rules_ops; @@ -19,8 +19,7 @@ struct hlist_head; struct fib_table; struct sock; struct local_ports { - seqlock_t lock; - int range[2]; + u32 range; /* high << 16 | low */ bool warned; }; @@ -32,15 +31,70 @@ struct ping_group_range { struct inet_hashinfo; struct inet_timewait_death_row { - atomic_t tw_count; + refcount_t tw_refcount; + /* Padding to avoid false sharing, tw_refcount can be often written */ struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; int sysctl_max_tw_buckets; }; struct tcp_fastopen_context; +#ifdef CONFIG_IP_ROUTE_MULTIPATH +struct sysctl_fib_multipath_hash_seed { + u32 user_seed; + u32 mp_seed; +}; +#endif + +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { + /* Cacheline organization can be found documented in + * 
Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. + * Please update the document when adding new fields. + */ + + /* TX readonly hotpath cache lines */ + __cacheline_group_begin(netns_ipv4_read_tx); + u8 sysctl_tcp_early_retrans; + u8 sysctl_tcp_tso_win_divisor; + u8 sysctl_tcp_tso_rtt_log; + u8 sysctl_tcp_autocorking; + int sysctl_tcp_min_snd_mss; + unsigned int sysctl_tcp_notsent_lowat; + int sysctl_tcp_limit_output_bytes; + int sysctl_tcp_min_rtt_wlen; + int sysctl_tcp_wmem[3]; + u8 sysctl_ip_fwd_use_pmtu; + __cacheline_group_end(netns_ipv4_read_tx); + + /* TXRX readonly hotpath cache lines */ + __cacheline_group_begin(netns_ipv4_read_txrx); + u8 sysctl_tcp_moderate_rcvbuf; + __cacheline_group_end(netns_ipv4_read_txrx); + + /* RX readonly hotpath cache line */ + __cacheline_group_begin(netns_ipv4_read_rx); + u8 sysctl_ip_early_demux; + u8 sysctl_tcp_early_demux; + u8 sysctl_tcp_l3mdev_accept; + /* 3 bytes hole, try to pack */ + int sysctl_tcp_reordering; + int sysctl_tcp_rmem[3]; + __cacheline_group_end(netns_ipv4_read_rx); + + struct inet_timewait_death_row tcp_death_row; + struct udp_table *udp_table; + +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -54,153 +108,154 @@ struct netns_ipv4 { struct mutex ra_mutex; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; - bool fib_has_custom_rules; - unsigned int fib_rules_require_fldissect; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; + unsigned int fib_rules_require_fldissect; + bool fib_has_custom_rules; #endif bool fib_has_custom_local_routes; + bool fib_offload_disabled; + u8 sysctl_tcp_shrink_window; #ifdef CONFIG_IP_ROUTE_CLASSID - int fib_num_tclassid_users; + atomic_t fib_num_tclassid_users; #endif struct hlist_head *fib_table_hash; - bool fib_offload_disabled; struct sock *fibnl; + struct hlist_head *fib_info_hash; + unsigned int fib_info_hash_bits; + unsigned int fib_info_cnt; - struct sock * __percpu *icmp_sk; struct sock *mc_autojoin_sk; struct inet_peer_base *peers; - struct sock * __percpu *tcp_sk; struct fqdir *fqdir; -#ifdef CONFIG_NETFILTER - struct xt_table *iptable_filter; - struct xt_table *iptable_mangle; - struct xt_table *iptable_raw; - struct xt_table *arptable_filter; -#ifdef CONFIG_SECURITY - struct xt_table *iptable_security; -#endif - struct xt_table *nat_table; -#endif - int sysctl_icmp_echo_ignore_all; - int sysctl_icmp_echo_ignore_broadcasts; - int sysctl_icmp_ignore_bogus_error_responses; + u8 sysctl_icmp_echo_ignore_all; + u8 sysctl_icmp_echo_enable_probe; + u8 sysctl_icmp_echo_ignore_broadcasts; + u8 sysctl_icmp_ignore_bogus_error_responses; + u8 sysctl_icmp_errors_use_inbound_ifaddr; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; - int sysctl_icmp_errors_use_inbound_ifaddr; + int sysctl_icmp_msgs_per_sec; + int sysctl_icmp_msgs_burst; + atomic_t icmp_global_credit; + u32 icmp_global_stamp; + u32 ip_rt_min_pmtu; + int ip_rt_mtu_expires; + int ip_rt_min_advmss; struct local_ports ip_local_ports; - int sysctl_tcp_ecn; - int sysctl_tcp_ecn_fallback; + u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_fallback; - int sysctl_ip_default_ttl; - int sysctl_ip_no_pmtu_disc; - int sysctl_ip_fwd_use_pmtu; - int sysctl_ip_fwd_update_priority; - int sysctl_ip_nonlocal_bind; - int sysctl_ip_autobind_reuse; + u8 sysctl_ip_default_ttl; + u8 sysctl_ip_no_pmtu_disc; + 
u8 sysctl_ip_fwd_update_priority; + u8 sysctl_ip_nonlocal_bind; + u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? */ - int sysctl_ip_dynaddr; - int sysctl_ip_early_demux; + u8 sysctl_ip_dynaddr; #ifdef CONFIG_NET_L3_MASTER_DEV - int sysctl_raw_l3mdev_accept; + u8 sysctl_raw_l3mdev_accept; #endif - int sysctl_tcp_early_demux; - int sysctl_udp_early_demux; + u8 sysctl_udp_early_demux; - int sysctl_nexthop_compat_mode; + u8 sysctl_nexthop_compat_mode; - int sysctl_fwmark_reflect; - int sysctl_tcp_fwmark_accept; -#ifdef CONFIG_NET_L3_MASTER_DEV - int sysctl_tcp_l3mdev_accept; -#endif - int sysctl_tcp_mtu_probing; + u8 sysctl_fwmark_reflect; + u8 sysctl_tcp_fwmark_accept; + u8 sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probe_floor; int sysctl_tcp_base_mss; - int sysctl_tcp_min_snd_mss; int sysctl_tcp_probe_threshold; u32 sysctl_tcp_probe_interval; int sysctl_tcp_keepalive_time; - int sysctl_tcp_keepalive_probes; int sysctl_tcp_keepalive_intvl; + u8 sysctl_tcp_keepalive_probes; - int sysctl_tcp_syn_retries; - int sysctl_tcp_synack_retries; - int sysctl_tcp_syncookies; - int sysctl_tcp_reordering; - int sysctl_tcp_retries1; - int sysctl_tcp_retries2; - int sysctl_tcp_orphan_retries; + u8 sysctl_tcp_syn_retries; + u8 sysctl_tcp_synack_retries; + u8 sysctl_tcp_syncookies; + u8 sysctl_tcp_migrate_req; + u8 sysctl_tcp_comp_sack_nr; + u8 sysctl_tcp_backlog_ack_defer; + u8 sysctl_tcp_pingpong_thresh; + + u8 sysctl_tcp_retries1; + u8 sysctl_tcp_retries2; + u8 sysctl_tcp_orphan_retries; + u8 sysctl_tcp_tw_reuse; + unsigned int sysctl_tcp_tw_reuse_delay; int sysctl_tcp_fin_timeout; - unsigned int sysctl_tcp_notsent_lowat; - int sysctl_tcp_tw_reuse; - int sysctl_tcp_sack; - int sysctl_tcp_window_scaling; - int sysctl_tcp_timestamps; - int sysctl_tcp_early_retrans; - int sysctl_tcp_recovery; - int sysctl_tcp_thin_linear_timeouts; - int sysctl_tcp_slow_start_after_idle; - int sysctl_tcp_retrans_collapse; - int sysctl_tcp_stdurg; - int sysctl_tcp_rfc1337; - int sysctl_tcp_abort_on_overflow; - int sysctl_tcp_fack; + u8 sysctl_tcp_sack; + u8 sysctl_tcp_window_scaling; + u8 sysctl_tcp_timestamps; + int sysctl_tcp_rto_min_us; + int sysctl_tcp_rto_max_ms; + u8 sysctl_tcp_recovery; + u8 sysctl_tcp_thin_linear_timeouts; + u8 sysctl_tcp_slow_start_after_idle; + u8 sysctl_tcp_retrans_collapse; + u8 sysctl_tcp_stdurg; + u8 sysctl_tcp_rfc1337; + u8 sysctl_tcp_abort_on_overflow; + u8 sysctl_tcp_fack; /* obsolete */ int sysctl_tcp_max_reordering; - int sysctl_tcp_dsack; - int sysctl_tcp_app_win; - int sysctl_tcp_adv_win_scale; - int sysctl_tcp_frto; - int sysctl_tcp_nometrics_save; - int sysctl_tcp_no_ssthresh_metrics_save; - int sysctl_tcp_moderate_rcvbuf; - int sysctl_tcp_tso_win_divisor; - int sysctl_tcp_workaround_signed_windows; - int sysctl_tcp_limit_output_bytes; + int sysctl_tcp_adv_win_scale; /* obsolete */ + u8 sysctl_tcp_dsack; + u8 sysctl_tcp_app_win; + u8 sysctl_tcp_frto; + u8 sysctl_tcp_nometrics_save; + u8 sysctl_tcp_no_ssthresh_metrics_save; + u8 sysctl_tcp_workaround_signed_windows; int sysctl_tcp_challenge_ack_limit; - int sysctl_tcp_min_tso_segs; - int sysctl_tcp_min_rtt_wlen; - int sysctl_tcp_autocorking; + u8 sysctl_tcp_min_tso_segs; + u8 sysctl_tcp_reflect_tos; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; - int sysctl_tcp_wmem[3]; - int sysctl_tcp_rmem[3]; - int sysctl_tcp_comp_sack_nr; + unsigned int sysctl_tcp_child_ehash_entries; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long 
sysctl_tcp_comp_sack_slack_ns; - struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; const struct tcp_congestion_ops __rcu *tcp_congestion_control; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; - spinlock_t tcp_fastopen_ctx_lock; unsigned int sysctl_tcp_fastopen_blackhole_timeout; atomic_t tfo_active_disable_times; unsigned long tfo_active_disable_stamp; - int sysctl_tcp_reflect_tos; + u32 tcp_challenge_timestamp; + u32 tcp_challenge_count; + u8 sysctl_tcp_plb_enabled; + u8 sysctl_tcp_plb_idle_rehash_rounds; + u8 sysctl_tcp_plb_rehash_rounds; + u8 sysctl_tcp_plb_suspend_rto_sec; + int sysctl_tcp_plb_cong_thresh; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; + u8 sysctl_fib_notify_on_flag_change; + u8 sysctl_tcp_syn_linear_timeouts; + #ifdef CONFIG_NET_L3_MASTER_DEV - int sysctl_udp_l3mdev_accept; + u8 sysctl_udp_l3mdev_accept; #endif + u8 sysctl_igmp_llm_reports; int sysctl_igmp_max_memberships; int sysctl_igmp_max_msf; - int sysctl_igmp_llm_reports; int sysctl_igmp_qrv; struct ping_group_range ping_group_range; atomic_t dev_addr_genid; + unsigned int sysctl_udp_child_hash_entries; + #ifdef CONFIG_SYSCTL unsigned long *sysctl_local_reserved_ports; int sysctl_ip_prot_sock; @@ -215,17 +270,22 @@ struct netns_ipv4 { #endif #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH - int sysctl_fib_multipath_use_neigh; - int sysctl_fib_multipath_hash_policy; + struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; + u32 sysctl_fib_multipath_hash_fields; + u8 sysctl_fib_multipath_use_neigh; + u8 sysctl_fib_multipath_hash_policy; #endif struct fib_notifier_ops *notifier_ops; - unsigned int fib_seq; /* protected by rtnl_mutex */ + unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct fib_notifier_ops *ipmr_notifier_ops; unsigned int ipmr_seq; /* protected by rtnl_mutex */ atomic_t rt_genid; siphash_key_t ip_id_key; + struct hlist_head *inet_addr_lst; + struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 5ec054473d81..47dc70d8100a 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -20,7 +20,6 @@ struct netns_sysctl_ipv6 { struct ctl_table_header *frags_hdr; struct ctl_table_header *xfrm6_hdr; #endif - int bindv6only; int flush_delay; int ip6_rt_max_size; int ip6_rt_gc_min_interval; @@ -29,60 +28,61 @@ struct netns_sysctl_ipv6 { int ip6_rt_gc_elasticity; int ip6_rt_mtu_expires; int ip6_rt_min_advmss; - int multipath_hash_policy; - int flowlabel_consistency; - int auto_flowlabels; + u32 multipath_hash_fields; + u8 multipath_hash_policy; + u8 bindv6only; + u8 flowlabel_consistency; + u8 auto_flowlabels; int icmpv6_time; - int icmpv6_echo_ignore_all; - int icmpv6_echo_ignore_multicast; - int icmpv6_echo_ignore_anycast; + u8 icmpv6_echo_ignore_all; + u8 icmpv6_echo_ignore_multicast; + u8 icmpv6_echo_ignore_anycast; DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1); unsigned long *icmpv6_ratemask_ptr; - int anycast_src_echo_reply; - int ip_nonlocal_bind; - int fwmark_reflect; + u8 anycast_src_echo_reply; + u8 ip_nonlocal_bind; + u8 fwmark_reflect; + u8 flowlabel_state_ranges; int idgen_retries; int idgen_delay; - int flowlabel_state_ranges; int flowlabel_reflect; int max_dst_opts_cnt; int max_hbh_opts_cnt; int max_dst_opts_len; int max_hbh_opts_len; int seg6_flowlabel; - bool skip_notify_on_dev_down; + u32 ioam6_id; + u64 ioam6_id_wide; + u8 skip_notify_on_dev_down; + u8 fib_notify_on_flag_change; + u8 icmpv6_error_anycast_as_unicast; }; struct 
netns_ipv6 { + /* Keep ip6_dst_ops at the beginning of netns_sysctl_ipv6 */ + struct dst_ops ip6_dst_ops; + struct netns_sysctl_ipv6 sysctl; struct ipv6_devconf *devconf_all; struct ipv6_devconf *devconf_dflt; struct inet_peer_base *peers; struct fqdir *fqdir; -#ifdef CONFIG_NETFILTER - struct xt_table *ip6table_filter; - struct xt_table *ip6table_mangle; - struct xt_table *ip6table_raw; -#ifdef CONFIG_SECURITY - struct xt_table *ip6table_security; -#endif - struct xt_table *ip6table_nat; -#endif struct fib6_info *fib6_null_entry; struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; + spinlock_t fib_table_hash_lock; struct fib6_table *fib6_main_tbl; struct list_head fib6_walkers; - struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + atomic_t ip6_rt_gc_expire; + unsigned long ip6_rt_last_gc; + unsigned char flowlabel_has_excl; #ifdef CONFIG_IPV6_MULTIPLE_TABLES - unsigned int fib6_rules_require_fldissect; bool fib6_has_custom_rules; + unsigned int fib6_rules_require_fldissect; #ifdef CONFIG_IPV6_SUBTREES unsigned int fib6_routes_require_src; #endif @@ -91,11 +91,15 @@ struct netns_ipv6 { struct fib6_table *fib6_local_tbl; struct fib_rules_ops *fib6_rules_ops; #endif - struct sock * __percpu *icmp_sk; struct sock *ndisc_sk; struct sock *tcp_sk; struct sock *igmp_sk; struct sock *mc_autojoin_sk; + + struct hlist_head *inet6_addr_lst; + spinlock_t addrconf_hash_lock; + struct delayed_work addr_chk_work; + #ifdef CONFIG_IPV6_MROUTE #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES struct mr_table *mrt6; @@ -115,6 +119,7 @@ struct netns_ipv6 { spinlock_t lock; u32 seq; } ip6addrlbl_table; + struct ioam6_pernet_data *ioam6_data; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h new file mode 100644 index 000000000000..1db8f9aaddb4 --- /dev/null +++ b/include/net/netns/mctp.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * MCTP per-net structures + */ + +#ifndef __NETNS_MCTP_H__ +#define __NETNS_MCTP_H__ + +#include <linux/mutex.h> +#include <linux/types.h> + +struct netns_mctp { + /* Only updated under RTNL, entries freed via RCU */ + struct list_head routes; + + /* Bound sockets: list of sockets bound by type. + * This list is updated from non-atomic contexts (under bind_lock), + * and read (under rcu) in packet rx + */ + struct mutex bind_lock; + struct hlist_head binds; + + /* tag allocations. 
This list is read and updated from atomic contexts, + * but elements are free()ed after a RCU grace-period + */ + spinlock_t keys_lock; + struct hlist_head keys; + + /* MCTP network */ + unsigned int default_net; + + /* neighbour table */ + struct mutex neigh_lock; + struct list_head neighbours; +}; + +#endif /* __NETNS_MCTP_H__ */ diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h index 59b2c3a3db42..7e373664b1e7 100644 --- a/include/net/netns/mib.h +++ b/include/net/netns/mib.h @@ -5,22 +5,19 @@ #include <net/snmp.h> struct netns_mib { - DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); +#if IS_ENABLED(CONFIG_IPV6) + DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); +#endif + + DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); DEFINE_SNMP_STAT(struct linux_mib, net_statistics); - DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); - DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics); - DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); - DEFINE_SNMP_STAT_ATOMIC(struct icmpmsg_mib, icmpmsg_statistics); + DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); #if IS_ENABLED(CONFIG_IPV6) - struct proc_dir_entry *proc_net_devsnmp6; DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6); - DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6); - DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); - DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); - DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib, icmpv6msg_statistics); #endif + #ifdef CONFIG_XFRM_STATISTICS DEFINE_SNMP_STAT(struct linux_xfrm_mib, xfrm_statistics); #endif @@ -30,6 +27,19 @@ struct netns_mib { #ifdef CONFIG_MPTCP DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics); #endif + + DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics); +#if IS_ENABLED(CONFIG_IPV6) + DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6); +#endif + + DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); + DEFINE_SNMP_STAT_ATOMIC(struct icmpmsg_mib, icmpmsg_statistics); +#if IS_ENABLED(CONFIG_IPV6) + DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); + DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib, icmpv6msg_statistics); + struct proc_dir_entry *proc_net_devsnmp6; +#endif }; #endif diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h index a7bdcfbb0b28..19ad2574b267 100644 --- a/include/net/netns/mpls.h +++ b/include/net/netns/mpls.h @@ -6,6 +6,8 @@ #ifndef __NETNS_MPLS_H__ #define __NETNS_MPLS_H__ +#include <linux/types.h> + struct mpls_route; struct ctl_table_header; diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index ca043342c0eb..a6a0bf4a247e 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -12,10 +12,12 @@ struct netns_nf { #if defined CONFIG_PROC_FS struct proc_dir_entry *proc_netfilter; #endif - const struct nf_queue_handler __rcu *queue_handler; const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO]; #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; +#ifdef CONFIG_LWTUNNEL + struct ctl_table_header *nf_lwtnl_dir_header; +#endif #endif struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS]; struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS]; @@ -25,14 +27,11 @@ struct netns_nf { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; #endif -#if IS_ENABLED(CONFIG_DECNET) - struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS]; -#endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) - bool defrag_ipv4; + unsigned int defrag_ipv4_users; #endif 
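The bool-to-counter change above follows the usual kernel usage-count idiom: several independent in-kernel users (e.g. conntrack and TPROXY) can each demand defragmentation in a namespace, and the defrag hooks may only be unregistered once the last of them is gone. A minimal sketch of that idiom, assuming the callers serialize on a mutex; the helper names here are illustrative, not the kernel's exact API:

	static int my_defrag_ipv4_enable(struct net *net)
	{
		/* first user registers the defrag hooks */
		if (net->nf.defrag_ipv4_users++ == 0)
			return my_register_defrag_hooks(net);	/* hypothetical */
		return 0;
	}

	static void my_defrag_ipv4_disable(struct net *net)
	{
		/* last user unregisters them again */
		if (--net->nf.defrag_ipv4_users == 0)
			my_unregister_defrag_hooks(net);	/* hypothetical */
	}
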
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) - bool defrag_ipv6; + unsigned int defrag_ipv6_users; #endif }; #endif diff --git a/include/net/netns/nexthop.h b/include/net/netns/nexthop.h index 1849e77eb68a..434239b37014 100644 --- a/include/net/netns/nexthop.h +++ b/include/net/netns/nexthop.h @@ -6,6 +6,7 @@ #ifndef __NETNS_NEXTHOP_H__ #define __NETNS_NEXTHOP_H__ +#include <linux/notifier.h> #include <linux/rbtree.h> struct netns_nexthop { diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 6c0806bd8d1e..cc8060c017d5 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -2,17 +2,8 @@ #ifndef _NETNS_NFTABLES_H_ #define _NETNS_NFTABLES_H_ -#include <linux/list.h> - struct netns_nftables { - struct list_head tables; - struct list_head commit_list; - struct list_head module_list; - struct list_head notify_list; - struct mutex commit_mutex; - unsigned int base_seq; u8 gencursor; - u8 validate_state; }; #endif diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index d8d02e4188d1..d25cd7a9c5ff 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -2,6 +2,9 @@ #ifndef __NETNS_SCTP_H__ #define __NETNS_SCTP_H__ +#include <linux/timer.h> +#include <net/snmp.h> + struct sock; struct proc_dir_entry; struct sctp_mib; @@ -22,6 +25,14 @@ struct netns_sctp { */ struct sock *ctl_sock; + /* UDP tunneling listening sock. */ + struct sock *udp4_sock; + struct sock *udp6_sock; + /* UDP tunneling listening port. */ + int udp_port; + /* UDP tunneling remote encap port. */ + int encap_port; + /* This is the global local address list. * We actively maintain this complete list of addresses on * the system by catching address add/delete events. @@ -76,6 +87,9 @@ struct netns_sctp { /* HB.interval - 30 seconds */ unsigned int hb_interval; + /* The interval for PLPMTUD probe timer */ + unsigned int probe_interval; + /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) * Max.Init.Retransmits - 8 attempts @@ -111,14 +125,14 @@ struct netns_sctp { int pf_expose; /* - * Policy for preforming sctp/socket accounting + * Policy for performing sctp/socket accounting * 0 - do socket level accounting, all assocs share sk_sndbuf * 1 - do sctp accounting, each asoc may use sk_sndbuf bytes */ int sndbuf_policy; /* - * Policy for preforming sctp/socket accounting + * Policy for performing sctp/socket accounting * 0 - do socket level accounting, all assocs share sk_rcvbuf * 1 - do sctp accounting, each asoc may use sk_rcvbuf bytes */ @@ -161,6 +175,10 @@ struct netns_sctp { /* Threshold for autoclose timeout, in seconds. 
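In practice this also caps the value a socket may request via the SCTP_AUTOCLOSE option.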
*/ unsigned long max_autoclose; + +#ifdef CONFIG_NET_L3_MASTER_DEV + int l3mdev_accept; +#endif }; #endif /* __NETNS_SCTP_H__ */ diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h new file mode 100644 index 000000000000..fc752a50f91b --- /dev/null +++ b/include/net/netns/smc.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NETNS_SMC_H__ +#define __NETNS_SMC_H__ +#include <linux/mutex.h> +#include <linux/percpu.h> + +struct smc_stats_rsn; +struct smc_stats; +struct netns_smc { + /* per cpu counters for SMC */ + struct smc_stats __percpu *smc_stats; + /* protect fback_rsn */ + struct mutex mutex_fback_rsn; + struct smc_stats_rsn *fback_rsn; + + bool limit_smc_hs; /* constraint on handshake */ +#ifdef CONFIG_SYSCTL + struct ctl_table_header *smc_hdr; +#endif + unsigned int sysctl_autocorking_size; + unsigned int sysctl_smcr_buf_type; + int sysctl_smcr_testlink_time; + int sysctl_wmem; + int sysctl_rmem; + int sysctl_max_links_per_lgr; + int sysctl_max_conns_per_lgr; +}; +#endif diff --git a/include/net/netns/unix.h b/include/net/netns/unix.h index 91a3d7e39198..9859d134d5a8 100644 --- a/include/net/netns/unix.h +++ b/include/net/netns/unix.h @@ -5,8 +5,16 @@ #ifndef __NETNS_UNIX_H__ #define __NETNS_UNIX_H__ +#include <linux/spinlock.h> + +struct unix_table { + spinlock_t *locks; + struct hlist_head *buckets; +}; + struct ctl_table_header; struct netns_unix { + struct unix_table table; int sysctl_max_dgram_qlen; struct ctl_table_header *ctl; }; diff --git a/include/net/netns/x_tables.h b/include/net/netns/x_tables.h deleted file mode 100644 index 9bc5a12fdbb0..000000000000 --- a/include/net/netns/x_tables.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NETNS_X_TABLES_H -#define __NETNS_X_TABLES_H - -#include <linux/list.h> -#include <linux/netfilter_defs.h> - -struct ebt_table; - -struct netns_xt { - struct list_head tables[NFPROTO_NUMPROTO]; - bool notrack_deprecated_warning; - bool clusterip_deprecated_warning; -#if defined(CONFIG_BRIDGE_NF_EBTABLES) || \ - defined(CONFIG_BRIDGE_NF_EBTABLES_MODULE) - struct ebt_table *broute_table; - struct ebt_table *frame_filter; - struct ebt_table *frame_nat; -#endif -}; -#endif diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h index e5734261ba0a..21a4f25a187a 100644 --- a/include/net/netns/xdp.h +++ b/include/net/netns/xdp.h @@ -2,8 +2,8 @@ #ifndef __NETNS_XDP_H__ #define __NETNS_XDP_H__ -#include <linux/rculist.h> #include <linux/mutex.h> +#include <linux/types.h> struct netns_xdp { struct mutex lock; diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 59f45b1e9dac..23dd647fe024 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -42,6 +42,8 @@ struct netns_xfrm { struct hlist_head __rcu *state_bydst; struct hlist_head __rcu *state_bysrc; struct hlist_head __rcu *state_byspi; + struct hlist_head __rcu *state_byseq; + struct hlist_head __percpu *state_cache_input; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; @@ -49,7 +51,7 @@ struct netns_xfrm { struct list_head policy_all; struct hlist_head *policy_byidx; unsigned int policy_idx_hmask; - struct hlist_head policy_inexact[XFRM_POLICY_MAX]; + unsigned int idx_generator; struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; @@ -64,6 +66,9 @@ struct netns_xfrm { u32 sysctl_aevent_rseqth; int sysctl_larval_drop; u32 sysctl_acq_expires; + + u8 
policy_default[XFRM_POLICY_MAX]; + #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_hdr; #endif @@ -72,9 +77,13 @@ struct netns_xfrm { #if IS_ENABLED(CONFIG_IPV6) struct dst_ops xfrm6_dst_ops; #endif - spinlock_t xfrm_state_lock; + spinlock_t xfrm_state_lock; + seqcount_spinlock_t xfrm_state_hash_generation; + seqcount_spinlock_t xfrm_policy_hash_generation; + spinlock_t xfrm_policy_lock; struct mutex xfrm_cfg_mutex; + struct delayed_work nat_keepalive_work; }; #endif diff --git a/include/net/netrom.h b/include/net/netrom.h index 80f15b1c1a48..f0565a5987d1 100644 --- a/include/net/netrom.h +++ b/include/net/netrom.h @@ -14,6 +14,7 @@ #include <net/sock.h> #include <linux/refcount.h> #include <linux/seq_file.h> +#include <net/ax25.h> #define NR_NETWORK_LEN 15 #define NR_TRANSPORT_LEN 5 diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 2fd76a9b6dc8..572e69cda476 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -40,6 +40,14 @@ struct nh_config { struct nlattr *nh_grp; u16 nh_grp_type; + u16 nh_grp_res_num_buckets; + unsigned long nh_grp_res_idle_timer; + unsigned long nh_grp_res_unbalanced_timer; + bool nh_grp_res_has_num_buckets; + bool nh_grp_res_has_idle_timer; + bool nh_grp_res_has_unbalanced_timer; + + bool nh_hw_stats; struct nlattr *nh_encap; u16 nh_encap_type; @@ -63,22 +71,72 @@ struct nh_info { }; }; +struct nh_res_bucket { + struct nh_grp_entry __rcu *nh_entry; + atomic_long_t used_time; + unsigned long migrated_time; + bool occupied; + u8 nh_flags; +}; + +struct nh_res_table { + struct net *net; + u32 nhg_id; + struct delayed_work upkeep_dw; + + /* List of NHGEs that have too few buckets ("uw" for underweight). + * Reclaimed buckets will be given to entries in this list. + */ + struct list_head uw_nh_entries; + unsigned long unbalanced_since; + + u32 idle_timer; + u32 unbalanced_timer; + + u16 num_nh_buckets; + struct nh_res_bucket nh_buckets[] __counted_by(num_nh_buckets); +}; + +struct nh_grp_entry_stats { + u64_stats_t packets; + struct u64_stats_sync syncp; +}; + struct nh_grp_entry { struct nexthop *nh; - u8 weight; - atomic_t upper_bound; + struct nh_grp_entry_stats __percpu *stats; + u16 weight; + + union { + struct { + atomic_t upper_bound; + } hthr; + struct { + /* Member on uw_nh_entries. 
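An entry is linked here while it holds fewer buckets than its weight calls for, so that the upkeep work can migrate idle buckets to it.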
*/ + struct list_head uw_nh_entry; + + u16 count_buckets; + u16 wants_buckets; + } res; + }; struct list_head nh_list; struct nexthop *nh_parent; /* nexthop of group with this entry */ + u64 packets_hw; }; struct nh_group { struct nh_group *spare; /* spare group for removals */ u16 num_nh; - bool mpath; + bool is_multipath; + bool hash_threshold; + bool resilient; bool fdb_nh; bool has_v4; - struct nh_grp_entry nh_entries[]; + bool hw_stats; + + struct nh_res_table __rcu *res_table; + struct nh_grp_entry nh_entries[] __counted_by(num_nh); }; struct nexthop { @@ -94,6 +152,8 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool dead; + spinlock_t lock; /* protect dead and f6i_list */ refcount_t refcnt; struct rcu_head rcu; @@ -105,11 +165,97 @@ struct nexthop { }; enum nexthop_event_type { - NEXTHOP_EVENT_DEL + NEXTHOP_EVENT_DEL, + NEXTHOP_EVENT_REPLACE, + NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, + NEXTHOP_EVENT_BUCKET_REPLACE, + NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, +}; + +enum nh_notifier_info_type { + NH_NOTIFIER_INFO_TYPE_SINGLE, + NH_NOTIFIER_INFO_TYPE_GRP, + NH_NOTIFIER_INFO_TYPE_RES_TABLE, + NH_NOTIFIER_INFO_TYPE_RES_BUCKET, + NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS, +}; + +struct nh_notifier_single_info { + struct net_device *dev; + u8 gw_family; + union { + __be32 ipv4; + struct in6_addr ipv6; + }; + u32 id; + u8 is_reject:1, + is_fdb:1, + has_encap:1; +}; + +struct nh_notifier_grp_entry_info { + u16 weight; + struct nh_notifier_single_info nh; +}; + +struct nh_notifier_grp_info { + u16 num_nh; + bool is_fdb; + bool hw_stats; + struct nh_notifier_grp_entry_info nh_entries[] __counted_by(num_nh); +}; + +struct nh_notifier_res_bucket_info { + u16 bucket_index; + unsigned int idle_timer_ms; + bool force; + struct nh_notifier_single_info old_nh; + struct nh_notifier_single_info new_nh; +}; + +struct nh_notifier_res_table_info { + u16 num_nh_buckets; + bool hw_stats; + struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets); +}; + +struct nh_notifier_grp_hw_stats_entry_info { + u32 id; + u64 packets; +}; + +struct nh_notifier_grp_hw_stats_info { + u16 num_nh; + bool hw_stats_used; + struct nh_notifier_grp_hw_stats_entry_info stats[] __counted_by(num_nh); +}; + +struct nh_notifier_info { + struct net *net; + struct netlink_ext_ack *extack; + u32 id; + enum nh_notifier_info_type type; + union { + struct nh_notifier_single_info *nh; + struct nh_notifier_grp_info *nh_grp; + struct nh_notifier_res_table_info *nh_res_table; + struct nh_notifier_res_bucket_info *nh_res_bucket; + struct nh_notifier_grp_hw_stats_info *nh_grp_hw_stats; + }; }; -int register_nexthop_notifier(struct net *net, struct notifier_block *nb); +int register_nexthop_notifier(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); +int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); +void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap); +void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, + bool offload, bool trap); +void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, + unsigned long *activity); +void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, + unsigned int nh_idx, + u64 delta_packets); /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); @@ -123,7 +269,7 @@ static inline bool 
nexthop_get(struct nexthop *nh) static inline void nexthop_put(struct nexthop *nh) { if (refcount_dec_and_test(&nh->refcnt)) - call_rcu(&nh->rcu, nexthop_free_rcu); + call_rcu_hurry(&nh->rcu, nexthop_free_rcu); } static inline bool nexthop_cmp(const struct nexthop *nh1, @@ -164,7 +310,7 @@ static inline bool nexthop_is_multipath(const struct nexthop *nh) struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); - return nh_grp->mpath; + return nh_grp->is_multipath; } return false; } @@ -179,7 +325,7 @@ static inline unsigned int nexthop_num_path(const struct nexthop *nh) struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); - if (nh_grp->mpath) + if (nh_grp->is_multipath) rc = nh_grp->num_nh; } @@ -202,7 +348,7 @@ static inline int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh, u8 rt_family) { - struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { @@ -211,7 +357,7 @@ int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh, struct fib_nh_common *nhc = &nhi->fib_nhc; int weight = nhg->nh_entries[i].weight; - if (fib_add_nexthop(skb, nhc, weight, rt_family) < 0) + if (fib_add_nexthop(skb, nhc, weight, rt_family, 0) < 0) return -EMSGSIZE; } @@ -260,7 +406,7 @@ struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel) struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); - if (nh_grp->mpath) { + if (nh_grp->is_multipath) { nh = nexthop_mpath_select(nh_grp, nhsel); if (!nh) return NULL; @@ -362,6 +508,7 @@ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel) int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack); +/* Caller should either hold rcu_read_lock(), or RTNL. 
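Either is sufficient because the group and info pointers are read with rcu_dereference_rtnl(), which is satisfied by RCU read-side or RTNL protection.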
*/ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) { struct nh_info *nhi; diff --git a/include/net/nfc/digital.h b/include/net/nfc/digital.h index 963db96bcbbb..bb3e8fdc0692 100644 --- a/include/net/nfc/digital.h +++ b/include/net/nfc/digital.h @@ -191,7 +191,7 @@ struct digital_poll_tech { struct nfc_digital_dev { struct nfc_dev *nfc_dev; - struct nfc_digital_ops *ops; + const struct nfc_digital_ops *ops; u32 protocols; @@ -236,7 +236,7 @@ struct nfc_digital_dev { void (*skb_add_crc)(struct sk_buff *skb); }; -struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops, +struct nfc_digital_dev *nfc_digital_allocate_device(const struct nfc_digital_ops *ops, __u32 supported_protocols, __u32 driver_capabilities, int tx_headroom, diff --git a/include/net/nfc/hci.h b/include/net/nfc/hci.h index b35f37a57686..756c11084f65 100644 --- a/include/net/nfc/hci.h +++ b/include/net/nfc/hci.h @@ -118,7 +118,7 @@ struct nfc_hci_dev { struct sk_buff_head msg_rx_queue; - struct nfc_hci_ops *ops; + const struct nfc_hci_ops *ops; struct nfc_llc *llc; @@ -151,7 +151,7 @@ struct nfc_hci_dev { }; /* hci device allocation */ -struct nfc_hci_dev *nfc_hci_allocate_device(struct nfc_hci_ops *ops, +struct nfc_hci_dev *nfc_hci_allocate_device(const struct nfc_hci_ops *ops, struct nfc_hci_init_data *init_data, unsigned long quirks, u32 protocols, @@ -168,7 +168,7 @@ void nfc_hci_set_clientdata(struct nfc_hci_dev *hdev, void *clientdata); void *nfc_hci_get_clientdata(struct nfc_hci_dev *hdev); static inline int nfc_hci_set_vendor_cmds(struct nfc_hci_dev *hdev, - struct nfc_vendor_cmd *cmds, + const struct nfc_vendor_cmd *cmds, int n_cmds) { return nfc_set_vendor_cmds(hdev->ndev, cmds, n_cmds); diff --git a/include/net/nfc/nci.h b/include/net/nfc/nci.h index 0550e0380b8d..09efcaed7c3f 100644 --- a/include/net/nfc/nci.h +++ b/include/net/nfc/nci.h @@ -25,6 +25,8 @@ #define NCI_MAX_PARAM_LEN 251 #define NCI_MAX_PAYLOAD_SIZE 255 #define NCI_MAX_PACKET_SIZE 258 +#define NCI_MAX_LARGE_PARAMS_NCI_v2 15 +#define NCI_VER_2_MASK 0x20 /* NCI Status Codes */ #define NCI_STATUS_OK 0x00 @@ -131,6 +133,9 @@ #define NCI_LF_CON_BITR_F_212 0x02 #define NCI_LF_CON_BITR_F_424 0x04 +/* NCI 2.x Feature Enable Bit */ +#define NCI_FEATURE_DISABLE 0x00 + /* NCI Reset types */ #define NCI_RESET_TYPE_KEEP_CONFIG 0x00 #define NCI_RESET_TYPE_RESET_CONFIG 0x01 @@ -220,6 +225,11 @@ struct nci_core_reset_cmd { } __packed; #define NCI_OP_CORE_INIT_CMD nci_opcode_pack(NCI_GID_CORE, 0x01) +/* To support NCI 2.x */ +struct nci_core_init_v2_cmd { + u8 feature1; + u8 feature2; +}; #define NCI_OP_CORE_SET_CONFIG_CMD nci_opcode_pack(NCI_GID_CORE, 0x02) struct set_config_param { @@ -322,7 +332,7 @@ struct nci_core_init_rsp_1 { __le32 nfcc_features; __u8 num_supported_rf_interfaces; __u8 supported_rf_interfaces[]; /* variable size array */ - /* continuted in nci_core_init_rsp_2 */ + /* continued in nci_core_init_rsp_2 */ } __packed; struct nci_core_init_rsp_2 { @@ -334,6 +344,20 @@ struct nci_core_init_rsp_2 { __le32 manufact_specific_info; } __packed; +/* To support NCI ver 2.x */ +struct nci_core_init_rsp_nci_ver2 { + u8 status; + __le32 nfcc_features; + u8 max_logical_connections; + __le16 max_routing_table_size; + u8 max_ctrl_pkt_payload_len; + u8 max_data_pkt_hci_payload_len; + u8 number_of_hci_credit; + __le16 max_nfc_v_frame_size; + u8 num_supported_rf_interfaces; + u8 supported_rf_interfaces[]; +} __packed; + #define NCI_OP_CORE_SET_CONFIG_RSP nci_opcode_pack(NCI_GID_CORE, 0x02) struct nci_core_set_config_rsp { __u8 
status; @@ -372,6 +396,16 @@ struct nci_nfcee_discover_rsp { /* --------------------------- */ /* ---- NCI Notifications ---- */ /* --------------------------- */ +#define NCI_OP_CORE_RESET_NTF nci_opcode_pack(NCI_GID_CORE, 0x00) +struct nci_core_reset_ntf { + u8 reset_trigger; + u8 config_status; + u8 nci_ver; + u8 manufact_id; + u8 manufacturer_specific_len; + __le32 manufact_specific_info; +} __packed; + #define NCI_OP_CORE_CONN_CREDITS_NTF nci_opcode_pack(NCI_GID_CORE, 0x06) struct conn_credit_entry { __u8 conn_id; @@ -441,7 +475,7 @@ struct nci_rf_discover_ntf { #define NCI_OP_RF_INTF_ACTIVATED_NTF nci_opcode_pack(NCI_GID_RF_MGMT, 0x05) struct activation_params_nfca_poll_iso_dep { __u8 rats_res_len; - __u8 rats_res[20]; + __u8 rats_res[NFC_ATS_MAXSIZE]; }; struct activation_params_nfcb_poll_iso_dep { diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index 43c9c5d2bedb..e180bdf2f82b 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -30,6 +30,7 @@ enum nci_flag { NCI_UP, NCI_DATA_EXCHANGE, NCI_DATA_EXCHANGE_TO, + NCI_UNREG, }; /* NCI device states */ @@ -82,10 +83,10 @@ struct nci_ops { void (*hci_cmd_received)(struct nci_dev *ndev, u8 pipe, u8 cmd, struct sk_buff *skb); - struct nci_driver_ops *prop_ops; + const struct nci_driver_ops *prop_ops; size_t n_prop_ops; - struct nci_driver_ops *core_ops; + const struct nci_driver_ops *core_ops; size_t n_core_ops; }; @@ -194,7 +195,7 @@ struct nci_hci_dev { /* NCI Core structures */ struct nci_dev { struct nfc_dev *nfc_dev; - struct nci_ops *ops; + const struct nci_ops *ops; struct nci_hci_dev *hci_dev; int tx_headroom; @@ -264,10 +265,14 @@ struct nci_dev { /* stored during intf_activated_ntf */ __u8 remote_gb[NFC_MAX_GT_LEN]; __u8 remote_gb_len; + + /* stored during intf_activated_ntf */ + __u8 target_ats[NFC_ATS_MAXSIZE]; + __u8 target_ats_len; }; /* ----- NCI Devices ----- */ -struct nci_dev *nci_allocate_device(struct nci_ops *ops, +struct nci_dev *nci_allocate_device(const struct nci_ops *ops, __u32 supported_protocols, int tx_headroom, int tx_tailroom); @@ -276,28 +281,31 @@ int nci_register_device(struct nci_dev *ndev); void nci_unregister_device(struct nci_dev *ndev); int nci_request(struct nci_dev *ndev, void (*req)(struct nci_dev *ndev, - unsigned long opt), - unsigned long opt, __u32 timeout); -int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload); -int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, __u8 *payload); + const void *opt), + const void *opt, __u32 timeout); +int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, + const __u8 *payload); +int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, + const __u8 *payload); int nci_core_reset(struct nci_dev *ndev); int nci_core_init(struct nci_dev *ndev); int nci_recv_frame(struct nci_dev *ndev, struct sk_buff *skb); int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb); -int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val); +int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, const __u8 *val); int nci_nfcee_discover(struct nci_dev *ndev, u8 action); int nci_nfcee_mode_set(struct nci_dev *ndev, u8 nfcee_id, u8 nfcee_mode); int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, u8 number_destination_params, size_t params_len, - struct core_conn_create_dest_spec_params *params); + const struct core_conn_create_dest_spec_params *params); int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id); -int nci_nfcc_loopback(struct nci_dev 
*ndev, void *data, size_t data_len, +int nci_nfcc_loopback(struct nci_dev *ndev, const void *data, size_t data_len, struct sk_buff **resp); struct nci_hci_dev *nci_hci_allocate(struct nci_dev *ndev); +void nci_hci_deallocate(struct nci_dev *ndev); int nci_hci_send_event(struct nci_dev *ndev, u8 gate, u8 event, const u8 *param, size_t param_len); int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, @@ -342,7 +350,7 @@ static inline void *nci_get_drvdata(struct nci_dev *ndev) } static inline int nci_set_vendor_cmds(struct nci_dev *ndev, - struct nfc_vendor_cmd *cmds, + const struct nfc_vendor_cmd *cmds, int n_cmds) { return nfc_set_vendor_cmds(ndev->nfc_dev, cmds, n_cmds); @@ -359,7 +367,7 @@ int nci_core_rsp_packet(struct nci_dev *ndev, __u16 opcode, int nci_core_ntf_packet(struct nci_dev *ndev, __u16 opcode, struct sk_buff *skb); void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb); -int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload); +int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, const void *payload); int nci_send_data(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb); int nci_conn_max_data_pkt_payload_size(struct nci_dev *ndev, __u8 conn_id); void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, @@ -377,7 +385,7 @@ void nci_req_complete(struct nci_dev *ndev, int result); struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev, int conn_id); int nci_get_conn_info_by_dest_type_params(struct nci_dev *ndev, u8 dest_type, - struct dest_spec_params *params); + const struct dest_spec_params *params); /* ----- NCI status code ----- */ int nci_to_errno(__u8 code); @@ -430,8 +438,6 @@ struct nci_uart_ops { int (*open)(struct nci_uart *nci_uart); void (*close)(struct nci_uart *nci_uart); int (*recv)(struct nci_uart *nci_uart, struct sk_buff *skb); - int (*recv_buf)(struct nci_uart *nci_uart, const u8 *data, char *flags, - int count); int (*send)(struct nci_uart *nci_uart, struct sk_buff *skb); void (*tx_start)(struct nci_uart *nci_uart); void (*tx_done)(struct nci_uart *nci_uart); diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 2cd3a261bcbc..127e6c7d910d 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -80,12 +80,14 @@ struct nfc_ops { #define NFC_ATR_REQ_GT_OFFSET 14 /** - * struct nfc_target - NFC target descriptiom + * struct nfc_target - NFC target description * * @sens_res: 2 bytes describing the target SENS_RES response, if the target * is a type A one. The %sens_res most significant byte must be byte 2 * as described by the NFC Forum digital specification (i.e. the platform * configuration one) while %sens_res least significant byte is byte 1. 
+ * @ats_len: length of Answer To Select in bytes + * @ats: Answer To Select returned by an ISO 14443 Type A target upon activation */ struct nfc_target { u32 idx; @@ -105,6 +107,8 @@ struct nfc_target { u8 is_iso15693; u8 iso15693_dsfid; u8 iso15693_uid[NFC_ISO15693_UID_MAXSIZE]; + u8 ats_len; + u8 ats[NFC_ATS_MAXSIZE]; }; /** @@ -188,17 +192,17 @@ struct nfc_dev { struct rfkill *rfkill; - struct nfc_vendor_cmd *vendor_cmds; + const struct nfc_vendor_cmd *vendor_cmds; int n_vendor_cmds; - struct nfc_ops *ops; + const struct nfc_ops *ops; struct genl_info *cur_cmd_info; }; #define to_nfc_dev(_dev) container_of(_dev, struct nfc_dev, dev) -extern struct class nfc_class; +extern const struct class nfc_class; -struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, +struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops, u32 supported_protocols, int tx_headroom, int tx_tailroom); @@ -230,10 +234,10 @@ static inline void nfc_set_parent_dev(struct nfc_dev *nfc_dev, } /** - * nfc_set_drvdata - set driver specifc data + * nfc_set_drvdata - set driver specific data * * @dev: The nfc device - * @data: Pointer to driver specifc data + * @data: Pointer to driver specific data */ static inline void nfc_set_drvdata(struct nfc_dev *dev, void *data) { @@ -241,11 +245,11 @@ static inline void nfc_set_drvdata(struct nfc_dev *dev, void *data) } /** - * nfc_get_drvdata - get driver specifc data + * nfc_get_drvdata - get driver specific data * * @dev: The nfc device */ -static inline void *nfc_get_drvdata(struct nfc_dev *dev) +static inline void *nfc_get_drvdata(const struct nfc_dev *dev) { return dev_get_drvdata(&dev->dev); } @@ -255,7 +259,7 @@ static inline void *nfc_get_drvdata(struct nfc_dev *dev) * * @dev: The nfc device whose name to return */ -static inline const char *nfc_device_name(struct nfc_dev *dev) +static inline const char *nfc_device_name(const struct nfc_dev *dev) { return dev_name(&dev->dev); } @@ -266,7 +270,7 @@ struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk, struct sk_buff *nfc_alloc_recv_skb(unsigned int size, gfp_t gfp); int nfc_set_remote_general_bytes(struct nfc_dev *dev, - u8 *gt, u8 gt_len); + const u8 *gt, u8 gt_len); u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len); int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name, @@ -280,7 +284,7 @@ int nfc_dep_link_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode); int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode, - u8 *gb, size_t gb_len); + const u8 *gb, size_t gb_len); int nfc_tm_deactivated(struct nfc_dev *dev); int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb); @@ -297,7 +301,7 @@ void nfc_send_to_raw_sock(struct nfc_dev *dev, struct sk_buff *skb, u8 payload_type, u8 direction); static inline int nfc_set_vendor_cmds(struct nfc_dev *dev, - struct nfc_vendor_cmd *cmds, + const struct nfc_vendor_cmd *cmds, int n_cmds) { if (dev->vendor_cmds || dev->n_vendor_cmds) diff --git a/include/net/nl802154.h b/include/net/nl802154.h index ddcee128f5d9..a994dea74596 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -19,6 +19,8 @@ * */ +#include <linux/types.h> + #define NL802154_GENL_NAME "nl802154" enum nl802154_commands { @@ -56,9 +58,6 @@ enum nl802154_commands { NL802154_CMD_SET_WPAN_PHY_NETNS, - /* add new commands above here */ - -#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL NL802154_CMD_SET_SEC_PARAMS, NL802154_CMD_GET_SEC_KEY, /* can dump */ NL802154_CMD_NEW_SEC_KEY, @@ -72,7 +71,19 @@ enum 
nl802154_commands { NL802154_CMD_GET_SEC_LEVEL, /* can dump */ NL802154_CMD_NEW_SEC_LEVEL, NL802154_CMD_DEL_SEC_LEVEL, -#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + + NL802154_CMD_SCAN_EVENT, + NL802154_CMD_TRIGGER_SCAN, + NL802154_CMD_ABORT_SCAN, + NL802154_CMD_SCAN_DONE, + NL802154_CMD_SEND_BEACONS, + NL802154_CMD_STOP_BEACONS, + NL802154_CMD_ASSOCIATE, + NL802154_CMD_DISASSOCIATE, + NL802154_CMD_SET_MAX_ASSOCIATIONS, + NL802154_CMD_LIST_ASSOCIATIONS, + + /* add new commands above here */ /* used to define NL802154_CMD_MAX below */ __NL802154_CMD_AFTER_LAST, @@ -131,6 +142,18 @@ enum nl802154_attrs { NL802154_ATTR_PID, NL802154_ATTR_NETNS_FD, + NL802154_ATTR_COORDINATOR, + NL802154_ATTR_SCAN_TYPE, + NL802154_ATTR_SCAN_FLAGS, + NL802154_ATTR_SCAN_CHANNELS, + NL802154_ATTR_SCAN_PREAMBLE_CODES, + NL802154_ATTR_SCAN_MEAN_PRF, + NL802154_ATTR_SCAN_DURATION, + NL802154_ATTR_SCAN_DONE_REASON, + NL802154_ATTR_BEACON_INTERVAL, + NL802154_ATTR_MAX_ASSOCIATIONS, + NL802154_ATTR_PEER, + /* add attributes here, update the policy in nl802154.c */ #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL @@ -150,10 +173,9 @@ enum nl802154_attrs { }; enum nl802154_iftype { - /* for backwards compatibility TODO */ - NL802154_IFTYPE_UNSPEC = -1, + NL802154_IFTYPE_UNSPEC = (~(__u32)0), - NL802154_IFTYPE_NODE, + NL802154_IFTYPE_NODE = 0, NL802154_IFTYPE_MONITOR, NL802154_IFTYPE_COORD, @@ -170,7 +192,7 @@ enum nl802154_iftype { * @NL802154_CAP_ATTR_TX_POWERS: a nested attribute for * nl802154_wpan_phy_tx_power * @NL802154_CAP_ATTR_MIN_CCA_ED_LEVEL: minimum value for cca_ed_level - * @NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL: maxmimum value for cca_ed_level + * @NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL: maximum value for cca_ed_level * @NL802154_CAP_ATTR_CCA_MODES: nl802154_cca_modes flags * @NL802154_CAP_ATTR_CCA_OPTS: nl802154_cca_opts flags * @NL802154_CAP_ATTR_MIN_MINBE: minimum of minbe value @@ -218,6 +240,93 @@ enum nl802154_wpan_phy_capability_attr { }; /** + * enum nl802154_coord - Netlink attributes for a coord + * + * @__NL802154_COORD_INVALID: invalid + * @NL802154_COORD_PANID: PANID of the coordinator (2 bytes) + * @NL802154_COORD_ADDR: coordinator address, (8 bytes or 2 bytes) + * @NL802154_COORD_CHANNEL: channel number, related to @NL802154_COORD_PAGE (u8) + * @NL802154_COORD_PAGE: channel page, related to @NL802154_COORD_CHANNEL (u8) + * @NL802154_COORD_PREAMBLE_CODE: Preamble code used when the beacon was received, + * this is PHY dependent and optional (u8) + * @NL802154_COORD_MEAN_PRF: Mean PRF used when the beacon was received, + * this is PHY dependent and optional (u8) + * @NL802154_COORD_SUPERFRAME_SPEC: superframe specification of the PAN (u16) + * @NL802154_COORD_LINK_QUALITY: signal quality of beacon in unspecified units, + * scaled to 0..255 (u8) + * @NL802154_COORD_GTS_PERMIT: set to true if GTS is permitted on this PAN + * @NL802154_COORD_PAYLOAD_DATA: binary data containing the raw data from the + * frame payload, (only if beacon or probe response had data) + * @NL802154_COORD_PAD: attribute used for padding for 64-bit alignment + * @NL802154_COORD_MAX: highest coordinator attribute + */ +enum nl802154_coord { + __NL802154_COORD_INVALID, + NL802154_COORD_PANID, + NL802154_COORD_ADDR, + NL802154_COORD_CHANNEL, + NL802154_COORD_PAGE, + NL802154_COORD_PREAMBLE_CODE, + NL802154_COORD_MEAN_PRF, + NL802154_COORD_SUPERFRAME_SPEC, + NL802154_COORD_LINK_QUALITY, + NL802154_COORD_GTS_PERMIT, + NL802154_COORD_PAYLOAD_DATA, + NL802154_COORD_PAD, + + /* keep last */ + NL802154_COORD_MAX, +}; + +/** + * 
enum nl802154_scan_types - Scan types + * + * @__NL802154_SCAN_INVALID: scan type number 0 is reserved + * @NL802154_SCAN_ED: An ED scan allows a device to obtain a measure of the peak + * energy in each requested channel + * @NL802154_SCAN_ACTIVE: Locate any coordinator transmitting Beacon frames using + * a Beacon Request command + * @NL802154_SCAN_PASSIVE: Locate any coordinator transmitting Beacon frames + * @NL802154_SCAN_ORPHAN: Relocate coordinator following a loss of synchronisation + * @NL802154_SCAN_ENHANCED_ACTIVE: Same as Active using Enhanced Beacon Request + * command instead of Beacon Request command + * @NL802154_SCAN_RIT_PASSIVE: Passive scan for RIT Data Request command frames + * instead of Beacon frames + * @NL802154_SCAN_ATTR_MAX: Maximum SCAN attribute number + */ +enum nl802154_scan_types { + __NL802154_SCAN_INVALID, + NL802154_SCAN_ED, + NL802154_SCAN_ACTIVE, + NL802154_SCAN_PASSIVE, + NL802154_SCAN_ORPHAN, + NL802154_SCAN_ENHANCED_ACTIVE, + NL802154_SCAN_RIT_PASSIVE, + + /* keep last */ + NL802154_SCAN_ATTR_MAX, +}; + +/** + * enum nl802154_scan_done_reasons - End of scan reasons + * + * @__NL802154_SCAN_DONE_REASON_INVALID: scan done reason number 0 is reserved. + * @NL802154_SCAN_DONE_REASON_FINISHED: The scan just finished naturally after + * going through all the requested and possible (complex) channels. + * @NL802154_SCAN_DONE_REASON_ABORTED: The scan was aborted upon user request. + * @NL802154_SCAN_DONE_REASON_MAX: Maximum scan done reason attribute number. + */ +enum nl802154_scan_done_reasons { + __NL802154_SCAN_DONE_REASON_INVALID, + NL802154_SCAN_DONE_REASON_FINISHED, + NL802154_SCAN_DONE_REASON_ABORTED, + + /* keep last */ + NL802154_SCAN_DONE_REASON_MAX, +}; + +/** * enum nl802154_cca_modes - cca modes * * @__NL802154_CCA_INVALID: cca mode number 0 is reserved @@ -282,8 +391,6 @@ enum nl802154_supported_bool_states { NL802154_SUPPORTED_BOOL_MAX = __NL802154_SUPPORTED_BOOL_AFTER_LAST - 1 }; -#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL - enum nl802154_dev_addr_modes { NL802154_DEV_ADDR_NONE, __NL802154_DEV_ADDR_INVALID, @@ -303,12 +410,26 @@ enum nl802154_dev_addr_attrs { NL802154_DEV_ADDR_ATTR_SHORT, NL802154_DEV_ADDR_ATTR_EXTENDED, NL802154_DEV_ADDR_ATTR_PAD, + NL802154_DEV_ADDR_ATTR_PEER_TYPE, /* keep last */ __NL802154_DEV_ADDR_ATTR_AFTER_LAST, NL802154_DEV_ADDR_ATTR_MAX = __NL802154_DEV_ADDR_ATTR_AFTER_LAST - 1 }; +enum nl802154_peer_type { + NL802154_PEER_TYPE_UNSPEC, + + NL802154_PEER_TYPE_PARENT, + NL802154_PEER_TYPE_CHILD, + + /* keep last */ + __NL802154_PEER_TYPE_AFTER_LAST, + NL802154_PEER_TYPE_MAX = __NL802154_PEER_TYPE_AFTER_LAST - 1 +}; + +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + enum nl802154_key_id_modes { NL802154_KEY_ID_MODE_IMPLICIT, NL802154_KEY_ID_MODE_INDEX, diff --git a/include/net/nsh.h b/include/net/nsh.h index 350b1ad11c7f..16a751093896 100644 --- a/include/net/nsh.h +++ b/include/net/nsh.h @@ -192,7 +192,7 @@ /** * struct nsh_md1_ctx - Keeps track of NSH context data - * @nshc<1-4>: NSH Contexts. + * @context: NSH Contexts. 
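+ * MD type 1 metadata is a fixed 16-byte block, i.e. four mandatory + * 4-byte context words (RFC 8300), hence the four __be32 entries below.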
*/ struct nsh_md1_ctx { __be32 context[4]; diff --git a/include/net/p8022.h b/include/net/p8022.h deleted file mode 100644 index c2bacc66bfbc..000000000000 --- a/include/net/p8022.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_P8022_H -#define _NET_P8022_H -struct datalink_proto * -register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)); -void unregister_8022_client(struct datalink_proto *proto); - -struct datalink_proto *make_8023_client(void); -void destroy_8023_client(struct datalink_proto *dl); -#endif diff --git a/include/net/page_pool.h b/include/net/page_pool.h deleted file mode 100644 index 81d7773f96cd..000000000000 --- a/include/net/page_pool.h +++ /dev/null @@ -1,218 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * page_pool.h - * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> - * Copyright (C) 2016 Red Hat, Inc. - */ - -/** - * DOC: page_pool allocator - * - * This page_pool allocator is optimized for the XDP mode that - * uses one-frame-per-page, but have fallbacks that act like the - * regular page allocator APIs. - * - * Basic use involve replacing alloc_pages() calls with the - * page_pool_alloc_pages() call. Drivers should likely use - * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). - * - * API keeps track of in-flight pages, in-order to let API user know - * when it is safe to dealloactor page_pool object. Thus, API users - * must make sure to call page_pool_release_page() when a page is - * "leaving" the page_pool. Or call page_pool_put_page() where - * appropiate. For maintaining correct accounting. - * - * API user must only call page_pool_put_page() once on a page, as it - * will either recycle the page, or in case of elevated refcnt, it - * will release the DMA mapping and in-flight state accounting. We - * hope to lift this requirement in the future. - */ -#ifndef _NET_PAGE_POOL_H -#define _NET_PAGE_POOL_H - -#include <linux/mm.h> /* Needed by ptr_ring */ -#include <linux/ptr_ring.h> -#include <linux/dma-direction.h> - -#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA - * map/unmap - */ -#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets - * from page_pool will be - * DMA-synced-for-device according to - * the length provided by the device - * driver. - * Please note DMA-sync-for-CPU is still - * device driver responsibility - */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) - -/* - * Fast allocation side cache array/stack - * - * The cache size and refill watermark is related to the network - * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX - * ring is usually refilled and the max consumed elements will be 64, - * thus a natural max size of objects needed in the cache. - * - * Keeping room for more objects, is due to XDP_DROP use-case. As - * XDP_DROP allows the opportunity to recycle objects directly into - * this array, as it shares the same softirq/NAPI protection. If - * cache is already full (or partly full) then the XDP_DROP recycles - * would have to take a slower code path. 
- */ -#define PP_ALLOC_CACHE_SIZE 128 -#define PP_ALLOC_CACHE_REFILL 64 -struct pp_alloc_cache { - u32 count; - void *cache[PP_ALLOC_CACHE_SIZE]; -}; - -struct page_pool_params { - unsigned int flags; - unsigned int order; - unsigned int pool_size; - int nid; /* Numa node id to allocate from pages from */ - struct device *dev; /* device, for DMA pre-mapping purposes */ - enum dma_data_direction dma_dir; /* DMA mapping direction */ - unsigned int max_len; /* max DMA sync memory size */ - unsigned int offset; /* DMA addr offset */ -}; - -struct page_pool { - struct page_pool_params p; - - struct delayed_work release_dw; - void (*disconnect)(void *); - unsigned long defer_start; - unsigned long defer_warn; - - u32 pages_state_hold_cnt; - - /* - * Data structure for allocation side - * - * Drivers allocation side usually already perform some kind - * of resource protection. Piggyback on this protection, and - * require driver to protect allocation side. - * - * For NIC drivers this means, allocate a page_pool per - * RX-queue. As the RX-queue is already protected by - * Softirq/BH scheduling and napi_schedule. NAPI schedule - * guarantee that a single napi_struct will only be scheduled - * on a single CPU (see napi_schedule). - */ - struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; - - /* Data structure for storing recycled pages. - * - * Returning/freeing pages is more complicated synchronization - * wise, because free's can happen on remote CPUs, with no - * association with allocation resource. - * - * Use ptr_ring, as it separates consumer and producer - * effeciently, it a way that doesn't bounce cache-lines. - * - * TODO: Implement bulk return pages into this structure. - */ - struct ptr_ring ring; - - atomic_t pages_state_release_cnt; - - /* A page_pool is strictly tied to a single RX-queue being - * protected by NAPI, due to above pp_alloc_cache. This - * refcnt serves purpose is to simplify drivers error handling. - */ - refcount_t user_cnt; - - u64 destroy_cnt; -}; - -struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); - -static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) -{ - gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); - - return page_pool_alloc_pages(pool, gfp); -} - -/* get the stored dma direction. 
A driver might decide to treat this locally and - * avoid the extra cache line from page_pool to determine the direction - */ -static -inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) -{ - return pool->p.dma_dir; -} - -struct page_pool *page_pool_create(const struct page_pool_params *params); - -#ifdef CONFIG_PAGE_POOL -void page_pool_destroy(struct page_pool *pool); -void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); -void page_pool_release_page(struct page_pool *pool, struct page *page); -#else -static inline void page_pool_destroy(struct page_pool *pool) -{ -} - -static inline void page_pool_use_xdp_mem(struct page_pool *pool, - void (*disconnect)(void *)) -{ -} -static inline void page_pool_release_page(struct page_pool *pool, - struct page *page) -{ -} -#endif - -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct); - -/* Same as above but will try to sync the entire area pool->max_len */ -static inline void page_pool_put_full_page(struct page_pool *pool, - struct page *page, bool allow_direct) -{ - /* When page_pool isn't compiled-in, net/core/xdp.c doesn't - * allow registering MEM_TYPE_PAGE_POOL, but shield linker. - */ -#ifdef CONFIG_PAGE_POOL - page_pool_put_page(pool, page, -1, allow_direct); -#endif -} - -/* Same as above but the caller must guarantee safe context. e.g NAPI */ -static inline void page_pool_recycle_direct(struct page_pool *pool, - struct page *page) -{ - page_pool_put_full_page(pool, page, true); -} - -static inline dma_addr_t page_pool_get_dma_addr(struct page *page) -{ - return page->dma_addr; -} - -static inline bool is_page_pool_compiled_in(void) -{ -#ifdef CONFIG_PAGE_POOL - return true; -#else - return false; -#endif -} - -static inline bool page_pool_put(struct page_pool *pool) -{ - return refcount_dec_and_test(&pool->user_cnt); -} - -/* Caller must provide appropriate safe context, e.g. NAPI. */ -void page_pool_update_nid(struct page_pool *pool, int new_nid); -static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) -{ - if (unlikely(pool->p.nid != new_nid)) - page_pool_update_nid(pool, new_nid); -} -#endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h new file mode 100644 index 000000000000..93f2c31baf9b --- /dev/null +++ b/include/net/page_pool/helpers.h @@ -0,0 +1,506 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool/helpers.h + * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> + * Copyright (C) 2016 Red Hat, Inc. + */ + +/** + * DOC: page_pool allocator + * + * The page_pool allocator is optimized for recycling pages or page fragments used + * by skb packets and xdp frames. + * + * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), + * which allocates memory with or without page splitting depending on the + * requested memory size. + * + * If the driver knows that it always requires full pages or its allocations are + * always smaller than half a page, it can use one of the more specific API + * calls: + * + * 1. page_pool_alloc_pages(): allocate memory without page splitting when the + * driver knows that the memory it needs is always bigger than half of the page + * allocated from page pool. There is no cache line dirtying for 'struct page' + * when a page is recycled back to the page pool. + * + * 2. 
page_pool_alloc_frag(): allocate memory with page splitting when the driver + * knows that the memory it needs is always smaller than or equal to half of the + * page allocated from page pool. Page splitting enables memory saving and thus + * avoids TLB/cache miss for data access, but there is also some cost to + * implement page splitting, mainly some cache line dirtying/bouncing for + * 'struct page' and atomic operation for page->pp_ref_count. + * + * The API keeps track of in-flight pages, in order to let API users know when + * it is safe to free a page_pool object; API users must call + * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or + * attach the page_pool object to a page_pool-aware object like skbs marked with + * skb_mark_for_recycle(). + * + * page_pool_put_page() may be called multiple times on the same page if a page + * is split into multiple fragments. For the last fragment, it will either + * recycle the page, or in case of page->_refcount > 1, it will release the DMA + * mapping and in-flight state accounting. + * + * dma_sync_single_range_for_device() is only called for the last fragment when + * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the + * last freed fragment to do the sync_for_device operation for all fragments in + * the same page when a page is split. The API user must set up pool->p.max_len + * and pool->p.offset correctly and ensure that page_pool_put_page() is called + * with dma_sync_size being -1 for fragment API. + */ +#ifndef _NET_PAGE_POOL_HELPERS_H +#define _NET_PAGE_POOL_HELPERS_H + +#include <linux/dma-mapping.h> + +#include <net/page_pool/types.h> +#include <net/net_debug.h> +#include <net/netmem.h> + +#ifdef CONFIG_PAGE_POOL_STATS +/* Deprecated driver-facing API, use netlink instead */ +int page_pool_ethtool_stats_get_count(void); +u8 *page_pool_ethtool_stats_get_strings(u8 *data); +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats); + +bool page_pool_get_stats(const struct page_pool *pool, + struct page_pool_stats *stats); +#else +static inline int page_pool_ethtool_stats_get_count(void) +{ + return 0; +} + +static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) +{ + return data; +} + +static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) +{ + return data; +} +#endif + +/** + * page_pool_dev_alloc_pages() - allocate a page. + * @pool: pool from which to allocate + * + * Get a page from the page allocator or page_pool caches. + */ +static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_pages(pool, gfp); +} + +/** + * page_pool_dev_alloc_frag() - allocate a page fragment. + * @pool: pool from which to allocate + * @offset: offset to the allocated page + * @size: requested size + * + * Get a page fragment from the page allocator or page_pool caches. + * + * Return: allocated page fragment, otherwise return NULL. 
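+ * + * A minimal driver-side sketch (an illustration, not kernel code: error + * handling is elided, 'pool' is assumed to exist in the caller, and the + * pool is assumed not to hand out highmem pages so page_address() is safe): + * + *	unsigned int offset; + *	struct page *page; + *	void *buf; + * + *	page = page_pool_dev_alloc_frag(pool, &offset, 1024); + *	if (page) + *		buf = page_address(page) + offset;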
+ */ +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_frag(pool, offset, size, gfp); +} + +static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + netmem_ref netmem; + + if ((*size << 1) > max_size) { + *size = max_size; + *offset = 0; + return page_pool_alloc_netmems(pool, gfp); + } + + netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); + if (unlikely(!netmem)) + return 0; + + /* There is very likely not enough space for another fragment, so append + * the remaining size to the current fragment to avoid the truesize + * underestimation problem. + */ + if (pool->frag_offset + *size > max_size) { + *size = max_size - *offset; + pool->frag_offset = max_size; + } + + return netmem; +} + +static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size) +{ + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + + return page_pool_alloc_netmem(pool, offset, size, gfp); +} + +static inline struct page *page_pool_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); +} + +/** + * page_pool_dev_alloc() - allocate a page or a page fragment. + * @pool: pool from which to allocate + * @offset: offset to the allocated page + * @size: in as the requested size, out as the allocated size + * + * Get a page or a page fragment from the page allocator or page_pool caches + * depending on the requested size, in order to allocate memory with the least + * memory utilization and performance penalty. + * + * Return: allocated page or page fragment, otherwise return NULL. + */ +static inline struct page *page_pool_dev_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc(pool, offset, size, gfp); } + +static inline void *page_pool_alloc_va(struct page_pool *pool, + unsigned int *size, gfp_t gfp) +{ + unsigned int offset; + struct page *page; + + /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ + page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); + if (unlikely(!page)) + return NULL; + + return page_address(page) + offset; +} + +/** + * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its + * va. + * @pool: pool from which to allocate + * @size: in as the requested size, out as the allocated size + * + * This is just a thin wrapper around the page_pool_alloc() API, and + * it returns the va of the allocated page or page fragment. + * + * Return: the va for the allocated page or page fragment, otherwise return NULL. + */ +static inline void *page_pool_dev_alloc_va(struct page_pool *pool, + unsigned int *size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_va(pool, size, gfp); +} + +/** + * page_pool_get_dma_dir() - Retrieve the stored DMA direction. + * @pool: pool from which page was allocated + * + * Get the stored dma direction. A driver might decide to store this locally + * and avoid the extra cache line from page_pool to determine the direction. 
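+ * + * E.g. a driver may cache the direction at ring-setup time rather than + * re-reading it per packet (sketch; "ring" is a hypothetical driver struct): + * + *	ring->dma_dir = page_pool_get_dma_dir(ring->page_pool); 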
+ */ +static inline enum dma_data_direction +page_pool_get_dma_dir(const struct page_pool *pool) +{ + return pool->p.dma_dir; +} + +static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr) +{ + atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr); +} + +/** + * page_pool_fragment_page() - split a fresh page into fragments + * @page: page to split + * @nr: references to set + * + * pp_ref_count represents the number of outstanding references to the page, + * which will be freed using page_pool APIs (rather than page allocator APIs + * like put_page()). Such references are usually held by page_pool-aware + * objects like skbs marked for page pool recycling. + * + * This helper allows the caller to take (set) multiple references to a + * freshly allocated page. The page must be freshly allocated (have a + * pp_ref_count of 1). This is commonly done by drivers and + * "fragment allocators" to save atomic operations - either when they know + * upfront how many references they will need; or to take MAX references and + * return the unused ones with a single atomic dec(), instead of performing + * multiple atomic inc() operations. + */ +static inline void page_pool_fragment_page(struct page *page, long nr) +{ + page_pool_fragment_netmem(page_to_netmem(page), nr); +} + +static inline long page_pool_unref_netmem(netmem_ref netmem, long nr) +{ + atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem); + long ret; + + /* If nr == pp_ref_count then we have cleared all remaining + * references to the page: + * 1. 'nr == 1': no need to actually overwrite it. + * 2. 'nr != 1': overwrite it with one, which is the rare case + * for pp_ref_count draining. + * + * The main advantage of doing this is that not only do we avoid an + * atomic update, as an atomic_read is generally a much cheaper operation + * than an atomic update, especially when dealing with a page that may be + * referenced by only 2 or 3 users; we also unify the pp_ref_count + * handling by ensuring all pages have been partitioned into only 1 piece + * initially, and only overwrite it when the page is partitioned into + * more than one piece. + */ + if (atomic_long_read(pp_ref_count) == nr) { + /* As we have ensured nr is always one for the constant case using + * the BUILD_BUG_ON(), we only need to handle the non-constant case + * here for pp_ref_count draining, which is a rare case. + */ + BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); + if (!__builtin_constant_p(nr)) + atomic_long_set(pp_ref_count, 1); + + return 0; + } + + ret = atomic_long_sub_return(nr, pp_ref_count); + WARN_ON(ret < 0); + + /* We are the last user here too, so reset pp_ref_count back to 1 to + * ensure all pages have been partitioned into 1 piece initially; + * this should be the rare case where the last two fragment users call + * page_pool_unref_page() concurrently. 
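+	 * E.g. with pp_ref_count == 2, two users racing to call + * page_pool_unref_page(page, 1) can both pass the atomic_long_read() + * check above; the second of the two atomic_long_sub_return() calls then + * yields ret == 0 and takes this reset path (illustrative walk-through). 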
+ */ + if (unlikely(!ret)) + atomic_long_set(pp_ref_count, 1); + + return ret; +} + +static inline long page_pool_unref_page(struct page *page, long nr) +{ + return page_pool_unref_netmem(page_to_netmem(page), nr); +} + +static inline void page_pool_ref_netmem(netmem_ref netmem) +{ + atomic_long_inc(netmem_get_pp_ref_count_ref(netmem)); +} + +static inline void page_pool_ref_page(struct page *page) +{ + page_pool_ref_netmem(page_to_netmem(page)); +} + +static inline bool page_pool_unref_and_test(netmem_ref netmem) +{ + /* If page_pool_unref_page() returns 0, we were the last user */ + return page_pool_unref_netmem(netmem, 1) == 0; +} + +static inline void page_pool_put_netmem(struct page_pool *pool, + netmem_ref netmem, + unsigned int dma_sync_size, + bool allow_direct) +{ + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. + */ +#ifdef CONFIG_PAGE_POOL + if (!page_pool_unref_and_test(netmem)) + return; + + page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct); +#endif +} + +/** + * page_pool_put_page() - release a reference to a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * @dma_sync_size: how much of the page may have been touched by the device + * @allow_direct: released by the consumer, allow lockless caching + * + * The outcome of this depends on the page refcnt. If the driver bumps + * the refcnt > 1, this will unmap the page. If the page refcnt is 1, + * the allocator owns the page and will try to recycle it in one of the pool + * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device + * using dma_sync_single_range_for_device(). + */ +static inline void page_pool_put_page(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size, + bool allow_direct) +{ + page_pool_put_netmem(pool, page_to_netmem(page), dma_sync_size, + allow_direct); +} + +static inline void page_pool_put_full_netmem(struct page_pool *pool, + netmem_ref netmem, + bool allow_direct) +{ + page_pool_put_netmem(pool, netmem, -1, allow_direct); +} + +/** + * page_pool_put_full_page() - release a reference on a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * @allow_direct: released by the consumer, allow lockless caching + * + * Similar to page_pool_put_page(), but will DMA sync the entire memory area + * as configured in &page_pool_params.max_len. + */ +static inline void page_pool_put_full_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + page_pool_put_netmem(pool, page_to_netmem(page), -1, allow_direct); +} + +/** + * page_pool_recycle_direct() - release a reference on a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * + * Similar to page_pool_put_full_page(), but the caller must guarantee a safe + * context (e.g. NAPI), since it will recycle the page directly into the pool + * fast cache. 
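+ * + * Typical use is an XDP verdict handler running in the pool's NAPI context + * (sketch; "rx_pool" is a placeholder name): + * + *	case XDP_DROP: + *		page_pool_recycle_direct(rx_pool, page); + *		break; 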
+ */ +static inline void page_pool_recycle_direct(struct page_pool *pool, + struct page *page) +{ + page_pool_put_full_page(pool, page, true); +} + +static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, + netmem_ref netmem) +{ + page_pool_put_full_netmem(pool, netmem, true); +} + +#define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ + (sizeof(dma_addr_t) > sizeof(unsigned long)) + +/** + * page_pool_free_va() - free a va into the page_pool + * @pool: pool from which va was allocated + * @va: va to be freed + * @allow_direct: freed by the consumer, allow lockless caching + * + * Free a va allocated from page_pool_alloc_va(). + */ +static inline void page_pool_free_va(struct page_pool *pool, void *va, + bool allow_direct) +{ + page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); +} + +static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) +{ + dma_addr_t ret = netmem_get_dma_addr(netmem); + + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) + ret <<= PAGE_SHIFT; + + return ret; +} + +/** + * page_pool_get_dma_addr() - Retrieve the stored DMA address. + * @page: page allocated from a page pool + * + * Fetch the DMA address of the page. The page pool to which the page belongs + * must have been created with PP_FLAG_DMA_MAP. + */ +static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) +{ + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) + ret <<= PAGE_SHIFT; + + return ret; +} + +static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const dma_addr_t dma_addr, + u32 offset, u32 dma_sync_size) +{ + dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, + offset + pool->p.offset, dma_sync_size, + page_pool_get_dma_dir(pool)); +} + +/** + * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW + * @pool: &page_pool the @page belongs to + * @page: page to sync + * @offset: offset from page start to "hard" start if using PP frags + * @dma_sync_size: size of the data written to the page + * + * Can be used as a shorthand to sync Rx pages before accessing them in the + * driver. The caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``. + * Note that this version performs DMA sync unconditionally, even if the + * associated PP doesn't perform sync-for-device. 
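+ * + * A minimal Rx sketch, assuming a pool created with PP_FLAG_DMA_MAP + * (variable names are placeholders, not from this header): + * + *	page_pool_dma_sync_for_cpu(pool, page, 0, pkt_len); + *	va = page_address(page) + pool->p.offset; 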
+ */ +static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const struct page *page, + u32 offset, u32 dma_sync_size) +{ + __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, + dma_sync_size); +} + +static inline void +page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, + const netmem_ref netmem, u32 offset, + u32 dma_sync_size) +{ + if (!pool->dma_sync_for_cpu) + return; + + __page_pool_dma_sync_for_cpu(pool, + page_pool_get_dma_addr_netmem(netmem), + offset, dma_sync_size); +} + +static inline bool page_pool_put(struct page_pool *pool) +{ + return refcount_dec_and_test(&pool->user_cnt); +} + +static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) +{ + if (unlikely(pool->p.nid != new_nid)) + page_pool_update_nid(pool, new_nid); +} + +static inline bool page_pool_is_unreadable(struct page_pool *pool) +{ + return !!pool->mp_ops; +} + +#endif /* _NET_PAGE_POOL_HELPERS_H */ diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h new file mode 100644 index 000000000000..ada4f968960a --- /dev/null +++ b/include/net/page_pool/memory_provider.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H +#define _NET_PAGE_POOL_MEMORY_PROVIDER_H + +#include <net/netmem.h> +#include <net/page_pool/types.h> + +struct netdev_rx_queue; +struct netlink_ext_ack; +struct sk_buff; + +struct memory_provider_ops { + netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp); + bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem); + int (*init)(struct page_pool *pool); + void (*destroy)(struct page_pool *pool); + int (*nl_fill)(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq); + void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq); +}; + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); +void net_mp_niov_clear_page_pool(struct net_iov *niov); + +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p); +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack); +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p); +void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *old_p); + +/** + * net_mp_netmem_place_in_cache() - give a netmem to a page pool + * @pool: the page pool to place the netmem into + * @netmem: netmem to give + * + * Push an accounted netmem into the page pool's allocation cache. The caller + * must ensure that there is space in the cache. It should only be called off + * the mp_ops->alloc_netmems() path. 
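+ * + * Expected call pattern from a provider's alloc_netmems() callback (sketch; + * the bounds check illustrates the caller's responsibility, it is not part + * of this helper): + * + *	if (pool->alloc.count < PP_ALLOC_CACHE_SIZE) + *		net_mp_netmem_place_in_cache(pool, netmem); 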
+ */ +static inline void net_mp_netmem_place_in_cache(struct page_pool *pool, + netmem_ref netmem) +{ + pool->alloc.cache[pool->alloc.count++] = netmem; +} + +#endif diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h new file mode 100644 index 000000000000..431b593de709 --- /dev/null +++ b/include/net/page_pool/types.h @@ -0,0 +1,308 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _NET_PAGE_POOL_TYPES_H +#define _NET_PAGE_POOL_TYPES_H + +#include <linux/dma-direction.h> +#include <linux/ptr_ring.h> +#include <linux/types.h> +#include <linux/xarray.h> +#include <net/netmem.h> + +#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA + * map/unmap + */ +#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets + * from page_pool will be + * DMA-synced-for-device according to + * the length provided by the device + * driver. + * Please note DMA-sync-for-CPU is still + * the device driver's responsibility + */ +#define PP_FLAG_SYSTEM_POOL BIT(2) /* Global system page_pool */ + +/* Allow unreadable (net_iov backed) netmem in this page_pool. Drivers setting + * this must be able to support unreadable netmem, where netmem_address() would + * return NULL. This flag should not be set for header page_pools. + * + * If the driver sets PP_FLAG_ALLOW_UNREADABLE_NETMEM, it should also set + * page_pool_params.slow.queue_idx. + */ +#define PP_FLAG_ALLOW_UNREADABLE_NETMEM BIT(3) + +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ + PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) + +/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */ +#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) + +/* + * Fast allocation side cache array/stack + * + * The cache size and refill watermark are related to the network + * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX + * ring is usually refilled and the max consumed elements will be 64, + * thus a natural max size of objects needed in the cache. + * + * Keeping room for more objects is due to the XDP_DROP use-case. As + * XDP_DROP allows the opportunity to recycle objects directly into + * this array, as it shares the same softirq/NAPI protection. If the + * cache is already full (or partly full) then the XDP_DROP recycles + * would have to take a slower code path. + */ +#define PP_ALLOC_CACHE_SIZE 128 +#define PP_ALLOC_CACHE_REFILL 64 +struct pp_alloc_cache { + u32 count; + netmem_ref cache[PP_ALLOC_CACHE_SIZE]; +}; + +/** + * struct page_pool_params - page pool parameters + * @fast: params accessed frequently on hotpath + * @order: 2^order pages on allocation + * @pool_size: size of the ptr_ring + * @nid: NUMA node id to allocate pages from + * @dev: device, for DMA pre-mapping purposes + * @napi: NAPI which is the sole consumer of pages, otherwise NULL + * @dma_dir: DMA mapping direction + * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV + * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV + * @slow: params with slowpath access only (initialization and Netlink) + * @netdev: netdev this pool will serve (leave as NULL if none or multiple) + * @queue_idx: queue idx this page_pool is being created for. + * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_SYSTEM_POOL, + * PP_FLAG_ALLOW_UNREADABLE_NETMEM. 
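+ * + * Typical driver-side initialization (a sketch with illustrative values; + * "pdev" is a hypothetical device, not part of this header): + * + *	struct page_pool_params pp_params = { + *		.order		= 0, + *		.pool_size	= 1024, + *		.nid		= NUMA_NO_NODE, + *		.dev		= &pdev->dev, + *		.dma_dir	= DMA_FROM_DEVICE, + *		.max_len	= PAGE_SIZE, + *		.offset		= 0, + *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, + *	}; + *	struct page_pool *pool = page_pool_create(&pp_params); 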
+ */ +struct page_pool_params { + struct_group_tagged(page_pool_params_fast, fast, + unsigned int order; + unsigned int pool_size; + int nid; + struct device *dev; + struct napi_struct *napi; + enum dma_data_direction dma_dir; + unsigned int max_len; + unsigned int offset; + ); + struct_group_tagged(page_pool_params_slow, slow, + struct net_device *netdev; + unsigned int queue_idx; + unsigned int flags; +/* private: used by test code only */ + void (*init_callback)(netmem_ref netmem, void *arg); + void *init_arg; + ); +}; + +#ifdef CONFIG_PAGE_POOL_STATS +/** + * struct page_pool_alloc_stats - allocation statistics + * @fast: successful fast path allocations + * @slow: slow path order-0 allocations + * @slow_high_order: slow path high order allocations + * @empty: ptr ring is empty, so a slow path allocation was forced + * @refill: an allocation which triggered a refill of the cache + * @waive: pages obtained from the ptr ring that cannot be added to + * the cache due to a NUMA mismatch + */ +struct page_pool_alloc_stats { + u64 fast; + u64 slow; + u64 slow_high_order; + u64 empty; + u64 refill; + u64 waive; +}; + +/** + * struct page_pool_recycle_stats - recycling (freeing) statistics + * @cached: recycling placed page in the page pool cache + * @cache_full: page pool cache was full + * @ring: page placed into the ptr ring + * @ring_full: page released from page pool because the ptr ring was full + * @released_refcnt: page released (and not recycled) because refcnt > 1 + */ +struct page_pool_recycle_stats { + u64 cached; + u64 cache_full; + u64 ring; + u64 ring_full; + u64 released_refcnt; +}; + +/** + * struct page_pool_stats - combined page pool use statistics + * @alloc_stats: see struct page_pool_alloc_stats + * @recycle_stats: see struct page_pool_recycle_stats + * + * Wrapper struct for combining page pool stats with different storage + * requirements. + */ +struct page_pool_stats { + struct page_pool_alloc_stats alloc_stats; + struct page_pool_recycle_stats recycle_stats; +}; +#endif + +/* The whole frag API block must stay within one cacheline. On 32-bit systems, + * sizeof(long) == sizeof(int), so the block size is ``3 * sizeof(long)``. + * On 64-bit systems, the actual size is ``2 * sizeof(long) + sizeof(int)``. + * The closest pow-2 to both of them is ``4 * sizeof(long)``, so just use that + * one for simplicity. + * Having it aligned to a cacheline boundary may be excessive and doesn't bring + * any benefit. 
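+ * + * Worked example: on 64-bit, 2 * 8 + 4 = 20 bytes round up into one 32-byte + * (4 * sizeof(long)) group; on 32-bit, 3 * 4 = 12 bytes fit within 16 (4 * 4). 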
+ */ +#define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long)) + +struct memory_provider_ops; + +struct pp_memory_provider_params { + void *mp_priv; + const struct memory_provider_ops *mp_ops; +}; + +struct page_pool { + struct page_pool_params_fast p; + + int cpuid; + u32 pages_state_hold_cnt; + + bool has_init_callback:1; /* slow::init_callback is set */ + bool dma_map:1; /* Perform DMA mapping */ + bool dma_sync:1; /* Perform DMA sync for device */ + bool dma_sync_for_cpu:1; /* Perform DMA sync for cpu */ +#ifdef CONFIG_PAGE_POOL_STATS + bool system:1; /* This is a global percpu pool */ +#endif + + __cacheline_group_begin_aligned(frag, PAGE_POOL_FRAG_GROUP_ALIGN); + long frag_users; + netmem_ref frag_page; + unsigned int frag_offset; + __cacheline_group_end_aligned(frag, PAGE_POOL_FRAG_GROUP_ALIGN); + + struct delayed_work release_dw; + void (*disconnect)(void *pool); + unsigned long defer_start; + unsigned long defer_warn; + +#ifdef CONFIG_PAGE_POOL_STATS + /* these stats are incremented while in softirq context */ + struct page_pool_alloc_stats alloc_stats; +#endif + u32 xdp_mem_id; + + /* + * Data structure for allocation side + * + * A driver's allocation side usually already performs some kind + * of resource protection. Piggyback on this protection, and + * require the driver to protect the allocation side. + * + * For NIC drivers this means: allocate a page_pool per + * RX-queue. The RX-queue is already protected by softirq/BH + * scheduling and napi_schedule; the NAPI schedule guarantees + * that a single napi_struct will only be scheduled on a single + * CPU (see napi_schedule). + */ + struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; + + /* Data structure for storing recycled pages. + * + * Returning/freeing pages is more complicated synchronization-wise, + * because frees can happen on remote CPUs, with no + * association with the allocation resource. + * + * Use ptr_ring, as it separates consumer and producer + * efficiently, in a way that doesn't bounce cache-lines. + * + * TODO: Implement bulk return pages into this structure. + */ + struct ptr_ring ring; + + void *mp_priv; + const struct memory_provider_ops *mp_ops; + + struct xarray dma_mapped; + +#ifdef CONFIG_PAGE_POOL_STATS + /* recycle stats are per-cpu to avoid locking */ + struct page_pool_recycle_stats __percpu *recycle_stats; +#endif + atomic_t pages_state_release_cnt; + + /* A page_pool is strictly tied to a single RX-queue being + * protected by NAPI, due to the above pp_alloc_cache. This + * refcnt's purpose is to simplify driver error handling. 
+ */ + refcount_t user_cnt; + + u64 destroy_cnt; + + /* Slow/Control-path information follows */ + struct page_pool_params_slow slow; + /* User-facing fields, protected by page_pools_lock */ + struct { + struct hlist_node list; + u64 detach_time; + u32 id; + } user; +}; + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp); +struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, + unsigned int size, gfp_t gfp); +netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, + unsigned int *offset, unsigned int size, + gfp_t gfp); +struct page_pool *page_pool_create(const struct page_pool_params *params); +struct page_pool *page_pool_create_percpu(const struct page_pool_params *params, + int cpuid); + +struct xdp_mem_info; + +#ifdef CONFIG_PAGE_POOL +void page_pool_disable_direct_recycling(struct page_pool *pool); +void page_pool_destroy(struct page_pool *pool); +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), + const struct xdp_mem_info *mem); +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count); +#else +static inline void page_pool_destroy(struct page_pool *pool) +{ +} + +static inline void page_pool_use_xdp_mem(struct page_pool *pool, + void (*disconnect)(void *), + const struct xdp_mem_info *mem) +{ +} + +static inline void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) +{ +} +#endif + +void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, + unsigned int dma_sync_size, + bool allow_direct); +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, + bool allow_direct); + +static inline bool is_page_pool_compiled_in(void) +{ +#ifdef CONFIG_PAGE_POOL + return true; +#else + return false; +#endif +} + +/* Caller must provide appropriate safe context, e.g. NAPI. 
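Typically driven via page_pool_nid_changed() from the driver's NAPI poll when it observes a node change, e.g. page_pool_nid_changed(pool, numa_node_id()) (illustrative call, not part of this declaration). 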
*/ +void page_pool_update_nid(struct page_pool *pool, int new_nid); + +#endif /* _NET_PAGE_POOL_TYPES_H */ diff --git a/include/net/pfcp.h b/include/net/pfcp.h new file mode 100644 index 000000000000..af14f970b80e --- /dev/null +++ b/include/net/pfcp.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PFCP_H_ +#define _PFCP_H_ + +#include <uapi/linux/if_ether.h> +#include <net/dst_metadata.h> +#include <linux/netdevice.h> +#include <uapi/linux/ipv6.h> +#include <net/udp_tunnel.h> +#include <uapi/linux/udp.h> +#include <uapi/linux/ip.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bits.h> + +#define PFCP_PORT 8805 + +/* PFCP protocol header */ +struct pfcphdr { + u8 flags; + u8 message_type; + __be16 message_length; +}; + +/* PFCP header flags */ +#define PFCP_SEID_FLAG BIT(0) +#define PFCP_MP_FLAG BIT(1) + +#define PFCP_VERSION_MASK GENMASK(4, 0) + +#define PFCP_HLEN (sizeof(struct udphdr) + sizeof(struct pfcphdr)) + +/* PFCP node related messages */ +struct pfcphdr_node { + u8 seq_number[3]; + u8 reserved; +}; + +/* PFCP session related messages */ +struct pfcphdr_session { + __be64 seid; + u8 seq_number[3]; +#ifdef __LITTLE_ENDIAN_BITFIELD + u8 message_priority:4, + reserved:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:4, + message_priority:4; +#else +#error "Please fix <asm/byteorder.h>" +#endif +}; + +struct pfcp_metadata { + u8 type; + __be64 seid; +} __packed; + +enum { + PFCP_TYPE_NODE = 0, + PFCP_TYPE_SESSION = 1, +}; + +#define PFCP_HEADROOM (sizeof(struct iphdr) + sizeof(struct udphdr) + \ + sizeof(struct pfcphdr) + sizeof(struct ethhdr)) +#define PFCP6_HEADROOM (sizeof(struct ipv6hdr) + sizeof(struct udphdr) + \ + sizeof(struct pfcphdr) + sizeof(struct ethhdr)) + +static inline struct pfcphdr *pfcp_hdr(struct sk_buff *skb) +{ + return (struct pfcphdr *)(udp_hdr(skb) + 1); +} + +static inline struct pfcphdr_node *pfcp_hdr_node(struct sk_buff *skb) +{ + return (struct pfcphdr_node *)(pfcp_hdr(skb) + 1); +} + +static inline struct pfcphdr_session *pfcp_hdr_session(struct sk_buff *skb) +{ + return (struct pfcphdr_session *)(pfcp_hdr(skb) + 1); +} + +static inline bool netif_is_pfcp(const struct net_device *dev) +{ + return dev->rtnl_link_ops && + !strcmp(dev->rtnl_link_ops->kind, "pfcp"); +} + +#endif diff --git a/include/net/phonet/pep.h b/include/net/phonet/pep.h index 27b1ab5e4e6d..645dddf5ce77 100644 --- a/include/net/phonet/pep.h +++ b/include/net/phonet/pep.h @@ -10,6 +10,9 @@ #ifndef NET_PHONET_PEP_H #define NET_PHONET_PEP_H +#include <linux/skbuff.h> +#include <net/phonet/phonet.h> + struct pep_sock { struct pn_sock pn_sk; diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h index a27bdc6cfeab..cf5ecae4a2fc 100644 --- a/include/net/phonet/phonet.h +++ b/include/net/phonet/phonet.h @@ -10,6 +10,10 @@ #ifndef AF_PHONET_H #define AF_PHONET_H +#include <linux/phonet.h> +#include <linux/skbuff.h> +#include <net/sock.h> + /* * The lower layers may not require more space, ever. Make sure it's * enough. 
@@ -105,4 +109,25 @@ void phonet_sysctl_exit(void); int isi_register(void); void isi_unregister(void); +static inline bool sk_is_phonet(struct sock *sk) +{ + return sk->sk_family == PF_PHONET; +} + +static inline int phonet_sk_ioctl(struct sock *sk, unsigned int cmd, + void __user *arg) +{ + int karg; + + switch (cmd) { + case SIOCPNADDRESOURCE: + case SIOCPNDELRESOURCE: + if (get_user(karg, (int __user *)arg)) + return -EFAULT; + + return sk->sk_prot->ioctl(sk, cmd, &karg); + } + /* A positive return value means that the ioctl was not processed */ + return 1; +} #endif diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index 05b49d4d2b11..37a3e83531c6 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -10,9 +10,14 @@ #ifndef PN_DEV_H #define PN_DEV_H +#include <linux/list.h> +#include <linux/spinlock.h> + +struct net; + struct phonet_device_list { struct list_head list; - struct mutex lock; + spinlock_t lock; }; struct phonet_device_list *phonet_device_list(struct net *net); @@ -33,11 +38,11 @@ int phonet_address_add(struct net_device *dev, u8 addr); int phonet_address_del(struct net_device *dev, u8 addr); u8 phonet_address_get(struct net_device *dev, u8 addr); int phonet_address_lookup(struct net *net, u8 addr); -void phonet_address_notify(int event, struct net_device *dev, u8 addr); +void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr); int phonet_route_add(struct net_device *dev, u8 daddr); int phonet_route_del(struct net_device *dev, u8 daddr); -void rtm_phonet_notify(int event, struct net_device *dev, u8 dst); +void rtm_phonet_notify(struct net *net, int event, u32 ifindex, u8 dst); struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr); struct net_device *phonet_route_output(struct net *net, u8 daddr); diff --git a/include/net/pie.h b/include/net/pie.h index 3fe2361e03b4..01cbc66825a4 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -17,7 +17,7 @@ /** * struct pie_params - contains pie parameters * @target: target delay in pschedtime - * @tudpate: interval at which drop probability is calculated + * @tupdate: interval at which drop probability is calculated * @limit: total number of packets that can be in the queue * @alpha: parameter to control drop probability * @beta: parameter to control drop probability diff --git a/include/net/ping.h b/include/net/ping.h index 2fe78874318c..bc7779262e60 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -16,14 +16,7 @@ #define PING_HTABLE_SIZE 64 #define PING_HTABLE_MASK (PING_HTABLE_SIZE-1) -#define ping_portaddr_for_each_entry(__sk, node, list) \ - hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) - -/* - * gid_t is either uint or ushort. 
We want to pass it to - * proc_dointvec_minmax(), so it must not be larger than MAX_INT - */ -#define GID_T_MAX (((gid_t)~0U) >> 1) +#define GID_T_MAX (((gid_t)~0U) - 1) /* Compatibility glue so we can support IPv6 when it's compiled as a module */ struct pingv6_ops { @@ -71,12 +64,12 @@ void ping_err(struct sk_buff *skb, int offset, u32 info); int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *); -int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, +int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, void *user_icmph, size_t icmph_len); int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -bool ping_rcv(struct sk_buff *skb); +enum skb_drop_reason ping_rcv(struct sk_buff *skb); #ifdef CONFIG_PROC_FS void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index d4d461236351..c64fd896b1f9 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -23,7 +23,9 @@ struct tcf_walker { }; int register_tcf_proto_ops(struct tcf_proto_ops *ops); -int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +void unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +#define NET_CLS_ALIAS_PREFIX "net-cls-" +#define MODULE_ALIAS_NET_CLS(kind) MODULE_ALIAS(NET_CLS_ALIAS_PREFIX kind) struct tcf_block_ext_info { enum flow_block_binder_type binder_type; @@ -48,7 +50,7 @@ void tcf_chain_put_by_act(struct tcf_chain *chain); struct tcf_chain *tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain); struct tcf_proto *tcf_get_next_proto(struct tcf_chain *chain, - struct tcf_proto *tp, bool rtnl_held); + struct tcf_proto *tp); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, @@ -59,6 +61,8 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, void tcf_block_put(struct tcf_block *block); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); +int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, + int police, struct tcf_proto *tp, u32 handle, bool used_action_miss); static inline bool tcf_block_shared(struct tcf_block *block) { @@ -70,18 +74,38 @@ static inline bool tcf_block_non_null_shared(struct tcf_block *block) return block && block->index; } +#ifdef CONFIG_NET_CLS_ACT +DECLARE_STATIC_KEY_FALSE(tcf_sw_enabled_key); + +static inline bool tcf_block_bypass_sw(struct tcf_block *block) +{ + return block && !atomic_read(&block->useswcnt); +} +#endif + static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { WARN_ON(tcf_block_shared(block)); return block->q; } -int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res, bool compat_mode); -int tcf_classify_ingress(struct sk_buff *skb, - const struct tcf_block *ingress_block, - const struct tcf_proto *tp, struct tcf_result *res, - bool compat_mode); +int tcf_classify(struct sk_buff *skb, + const struct tcf_block *block, + const struct tcf_proto *tp, struct tcf_result *res, + bool compat_mode); + +static inline bool tc_cls_stats_dump(struct tcf_proto *tp, + struct tcf_walker *arg, + void *filter) +{ + if (arg->count >= arg->skip && arg->fn(tp, filter, arg) < 0) { + arg->stop = 1; + return false; + } + + arg->count++; + return true; +} #else static inline bool 
tcf_block_shared(struct tcf_block *block) @@ -125,33 +149,14 @@ static inline struct Qdisc *tcf_block_q(struct tcf_block *block) return NULL; } -static inline -int tc_setup_cb_block_register(struct tcf_block *block, flow_setup_cb_t *cb, - void *cb_priv) -{ - return 0; -} - -static inline -void tc_setup_cb_block_unregister(struct tcf_block *block, flow_setup_cb_t *cb, - void *cb_priv) -{ -} - -static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, +static inline int tcf_classify(struct sk_buff *skb, + const struct tcf_block *block, + const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { return TC_ACT_UNSPEC; } -static inline int tcf_classify_ingress(struct sk_buff *skb, - const struct tcf_block *ingress_block, - const struct tcf_proto *tp, - struct tcf_result *res, bool compat_mode) -{ - return TC_ACT_UNSPEC; -} - #endif static inline unsigned long @@ -205,12 +210,26 @@ tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) __tcf_unbind_filter(q, r); } +static inline void tc_cls_bind_class(u32 classid, unsigned long cl, + void *q, struct tcf_result *res, + unsigned long base) +{ + if (res->classid == classid) { + if (cl) + __tcf_bind_filter(q, res, base); + else + __tcf_unbind_filter(q, res); + } +} + struct tcf_exts { #ifdef CONFIG_NET_CLS_ACT __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ int nr_actions; struct tc_action **actions; - struct net *net; + struct net *net; + netns_tracker ns_tracker; + struct tcf_exts_miss_cookie_node *miss_cookie_node; #endif /* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. @@ -222,18 +241,11 @@ struct tcf_exts { static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net, int action, int police) { -#ifdef CONFIG_NET_CLS_ACT - exts->type = 0; - exts->nr_actions = 0; - exts->net = net; - exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *), - GFP_KERNEL); - if (!exts->actions) - return -ENOMEM; +#ifdef CONFIG_NET_CLS + return tcf_exts_init_ex(exts, net, action, police, NULL, 0, false); +#else + return -EOPNOTSUPP; #endif - exts->action = action; - exts->police = police; - return 0; } /* Return false if the netns is being destroyed in cleanup_net(). 
Callers @@ -244,6 +256,8 @@ static inline bool tcf_exts_get_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT exts->net = maybe_get_net(exts->net); + if (exts->net) + netns_tracker_alloc(exts->net, &exts->ns_tracker, GFP_KERNEL); return exts->net != NULL; #else return true; @@ -254,7 +268,7 @@ static inline void tcf_exts_put_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT if (exts->net) - put_net(exts->net); + put_net_track(exts->net, &exts->ns_tracker); #endif } @@ -266,26 +280,38 @@ static inline void tcf_exts_put_net(struct tcf_exts *exts) for (; 0; (void)(i), (void)(a), (void)(exts)) #endif +#define tcf_act_for_each_action(i, a, actions) \ + for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = actions[i]); i++) + +static inline bool tc_act_in_hw(struct tc_action *act) +{ + return !!act->in_hw_count; +} + static inline void -tcf_exts_stats_update(const struct tcf_exts *exts, - u64 bytes, u64 packets, u64 drops, u64 lastuse, - u8 used_hw_stats, bool used_hw_stats_valid) +tcf_exts_hw_stats_update(const struct tcf_exts *exts, + struct flow_stats *stats, + bool use_act_stats) { #ifdef CONFIG_NET_CLS_ACT int i; - preempt_disable(); - for (i = 0; i < exts->nr_actions; i++) { struct tc_action *a = exts->actions[i]; - tcf_action_stats_update(a, bytes, packets, drops, - lastuse, true); - a->used_hw_stats = used_hw_stats; - a->used_hw_stats_valid = used_hw_stats_valid; - } + if (use_act_stats || tc_act_in_hw(a)) { + if (!tcf_action_update_hw_stats(a)) + continue; + } + + preempt_disable(); + tcf_action_stats_update(a, stats->bytes, stats->pkts, stats->drops, + stats->lastused, true); + preempt_enable(); - preempt_enable(); + a->used_hw_stats = stats->used_hw_stats; + a->used_hw_stats_valid = stats->used_hw_stats_valid; + } #endif } @@ -293,7 +319,7 @@ tcf_exts_stats_update(const struct tcf_exts *exts, * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * - * Returns true if at least one action is present. + * Returns: true if at least one action is present. 
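+ * + * E.g. (sketch, not from this header) a classifier fast path may skip + * action processing entirely when no actions are attached: + * + *	if (!tcf_exts_has_actions(exts)) + *		return TC_ACT_OK; 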
*/ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { @@ -325,10 +351,25 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, return TC_ACT_OK; } +static inline int +tcf_exts_exec_ex(struct sk_buff *skb, struct tcf_exts *exts, int act_index, + struct tcf_result *res) +{ +#ifdef CONFIG_NET_CLS_ACT + return tcf_action_exec(skb, exts->actions + act_index, + exts->nr_actions - act_index, res); +#else + return TC_ACT_OK; +#endif +} + int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, - struct tcf_exts *exts, bool ovr, bool rtnl_held, + struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack); +int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb, + struct nlattr *rate_tlv, struct tcf_exts *exts, + u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); @@ -337,6 +378,9 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); /** * struct tcf_pkt_info - packet information + * + * @ptr: start of the pkt data + * @nexthdr: offset of the next header */ struct tcf_pkt_info { unsigned char * ptr; @@ -355,6 +399,7 @@ struct tcf_ematch_ops; * @ops: the operations lookup table of the corresponding ematch module * @datalen: length of the ematch specific configuration data * @data: ematch specific data + * @net: the network namespace */ struct tcf_ematch { struct tcf_ematch_ops * ops; @@ -446,7 +491,7 @@ int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); /** - * tcf_em_tree_match - evaulate an ematch tree + * tcf_em_tree_match - evaluate an ematch tree * * @skb: socket buffer of the packet in question * @tree: ematch tree to be used for evaluation @@ -456,7 +501,7 @@ int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, * through all ematches respecting their logic relations returning * as soon as the result is obvious. * - * Returns 1 if the ematch tree as-one matches, no ematches are configured + * Returns: 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. 
*/ static inline int tcf_em_tree_match(struct sk_buff *skb, @@ -512,7 +557,7 @@ tcf_change_indev(struct net *net, struct nlattr *indev_tlv, char indev[IFNAMSIZ]; struct net_device *dev; - if (nla_strlcpy(indev, indev_tlv, IFNAMSIZ) >= IFNAMSIZ) { + if (nla_strscpy(indev, indev_tlv, IFNAMSIZ) < 0) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Interface name too long"); return -EINVAL; @@ -536,9 +581,14 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) return ifindex == skb->skb_iif; } -int tc_setup_flow_action(struct flow_action *flow_action, - const struct tcf_exts *exts); -void tc_cleanup_flow_action(struct flow_action *flow_action); +int tc_setup_offload_action(struct flow_action *flow_action, + const struct tcf_exts *exts, + struct netlink_ext_ack *extack); +void tc_cleanup_offload_action(struct flow_action *flow_action); +int tc_setup_action(struct flow_action *flow_action, + struct tc_action *actions[], + u32 miss_cookie_base, + struct netlink_ext_ack *extack); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held); @@ -705,10 +755,31 @@ tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; + cls_common->skip_sw = tc_skip_sw(flags); if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } +static inline void tcf_proto_update_usesw(struct tcf_proto *tp, u32 flags) +{ + if (tp->usesw) + return; + if (tc_skip_sw(flags) && tc_in_hw(flags)) + return; + tp->usesw = true; +} + +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) +static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) +{ + struct tc_skb_ext *tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); + + if (tc_skb_ext) + memset(tc_skb_ext, 0, sizeof(*tc_skb_ext)); + return tc_skb_ext; +} +#endif + enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, @@ -720,6 +791,7 @@ struct tc_cls_matchall_offload { enum tc_matchall_command command; struct flow_rule *rule; struct flow_stats stats; + bool use_act_stats; unsigned long cookie; }; @@ -738,16 +810,6 @@ struct tc_cls_bpf_offload { bool exts_integrated; }; -struct tc_mqprio_qopt_offload { - /* struct tc_mqprio_qopt must always be the first element */ - struct tc_mqprio_qopt qopt; - u16 mode; - u16 shaper; - u32 flags; - u64 min_rate[TC_QOPT_MAX_QUEUE]; - u64 max_rate[TC_QOPT_MAX_QUEUE]; -}; - /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers */ @@ -758,7 +820,7 @@ struct tc_cookie { }; struct tc_qopt_offload_stats { - struct gnet_stats_basic_packed *bstats; + struct gnet_stats_basic_sync *bstats; struct gnet_stats_queue *qstats; }; @@ -783,6 +845,43 @@ struct tc_mq_qopt_offload { }; }; +enum tc_htb_command { + /* Root */ + TC_HTB_CREATE, /* Initialize HTB offload. */ + TC_HTB_DESTROY, /* Destroy HTB offload. */ + + /* Classes */ + /* Allocate qid and create leaf. */ + TC_HTB_LEAF_ALLOC_QUEUE, + /* Convert leaf to inner, preserve and return qid, create new leaf. */ + TC_HTB_LEAF_TO_INNER, + /* Delete leaf, while siblings remain. */ + TC_HTB_LEAF_DEL, + /* Delete leaf, convert parent to leaf, preserving qid. */ + TC_HTB_LEAF_DEL_LAST, + /* TC_HTB_LEAF_DEL_LAST, but delete driver data on hardware errors. */ + TC_HTB_LEAF_DEL_LAST_FORCE, + /* Modify parameters of a node. */ + TC_HTB_NODE_MODIFY, + + /* Class qdisc */ + TC_HTB_LEAF_QUERY_QUEUE, /* Query qid by classid. 
*/ +}; + +struct tc_htb_qopt_offload { + struct netlink_ext_ack *extack; + enum tc_htb_command command; + u32 parent_classid; + u16 classid; + u16 qid; + u32 quantum; + u64 rate; + u64 ceil; + u8 prio; +}; + +#define TC_HTB_CLASSID_ROOT U32_MAX + enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, @@ -843,7 +942,7 @@ struct tc_gred_qopt_offload_params { }; struct tc_gred_qopt_offload_stats { - struct gnet_stats_basic_packed bstats[MAX_DPs]; + struct gnet_stats_basic_sync bstats[MAX_DPs]; struct gnet_stats_queue qstats[MAX_DPs]; struct red_stats *xstats[MAX_DPs]; }; @@ -935,6 +1034,7 @@ enum tc_tbf_command { TC_TBF_REPLACE, TC_TBF_DESTROY, TC_TBF_STATS, + TC_TBF_GRAFT, }; struct tc_tbf_qopt_offload_replace_params { @@ -950,6 +1050,7 @@ struct tc_tbf_qopt_offload { union { struct tc_tbf_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; + u32 child_handle; }; }; @@ -968,4 +1069,15 @@ struct tc_fifo_qopt_offload { }; }; +#ifdef CONFIG_NET_CLS_ACT +DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc); +void tc_skb_ext_tc_enable(void); +void tc_skb_ext_tc_disable(void); +#define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc) +#else /* CONFIG_NET_CLS_ACT */ +static inline void tc_skb_ext_tc_enable(void) { } +static inline void tc_skb_ext_tc_disable(void) { } +#define tc_skb_ext_tc_enabled() false +#endif + #endif diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 4ed32e6b0201..d7b7b6cd4aa1 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -11,6 +11,7 @@ #include <uapi/linux/pkt_sched.h> #define DEFAULT_TX_QUEUE_LEN 1000 +#define STAB_SIZE_LOG_MAX 30 struct qdisc_walker { int stop; @@ -19,9 +20,14 @@ struct qdisc_walker { int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; -static inline void *qdisc_priv(struct Qdisc *q) +#define qdisc_priv(q) \ + _Generic(q, \ + const struct Qdisc * : (const void *)&q->privdata, \ + struct Qdisc * : (void *)&q->privdata) + +static inline struct Qdisc *qdisc_from_priv(void *priv) { - return &q->privdata; + return container_of(priv, struct Qdisc, privdata); } /* @@ -57,14 +63,7 @@ static inline psched_time_t psched_get_time(void) return PSCHED_NS2TICKS(ktime_get_ns()); } -static inline psched_tdiff_t -psched_tdiff_bounded(psched_time_t tv1, psched_time_t tv2, psched_time_t bound) -{ - return min(tv1 - tv2, bound); -} - struct qdisc_watchdog { - u64 last_expires; struct hrtimer timer; struct Qdisc *qdisc; }; @@ -100,7 +99,9 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, struct netlink_ext_ack *extack); int register_qdisc(struct Qdisc_ops *qops); -int unregister_qdisc(struct Qdisc_ops *qops); +void unregister_qdisc(struct Qdisc_ops *qops); +#define NET_SCH_ALIAS_PREFIX "net-sch-" +#define MODULE_ALIAS_NET_SCH(id) MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id) void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); @@ -123,22 +124,19 @@ void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { - /* NOLOCK qdisc must check 'state' under the qdisc seqlock - * to avoid racing with dev_qdisc_reset() - */ - if (!(q->flags & TCQ_F_NOLOCK) || - likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - __qdisc_run(q); + __qdisc_run(q); qdisc_run_end(q); } } +extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; + /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. 
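For a 1500-byte MTU Ethernet device this yields 1500 + 14 = 1514 bytes (worked example; hard_header_len == ETH_HLEN). 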
*/ static inline unsigned int psched_mtu(const struct net_device *dev) { - return dev->mtu + dev->hard_header_len; + return READ_ONCE(dev->mtu) + dev->hard_header_len; } static inline struct net *qdisc_net(struct Qdisc *q) @@ -146,6 +144,11 @@ static inline struct net *qdisc_net(struct Qdisc *q) return dev_net(q->dev_queue->dev); } +struct tc_query_caps_base { + enum tc_setup_type type; + void *caps; +}; + struct tc_cbs_qopt_offload { u8 enable; s32 queue; @@ -160,6 +163,58 @@ struct tc_etf_qopt_offload { s32 queue; }; +struct tc_mqprio_caps { + bool validate_queue_counts:1; +}; + +struct tc_mqprio_qopt_offload { + /* struct tc_mqprio_qopt must always be the first element */ + struct tc_mqprio_qopt qopt; + struct netlink_ext_ack *extack; + u16 mode; + u16 shaper; + u32 flags; + u64 min_rate[TC_QOPT_MAX_QUEUE]; + u64 max_rate[TC_QOPT_MAX_QUEUE]; + unsigned long preemptible_tcs; +}; + +struct tc_taprio_caps { + bool supports_queue_max_sdu:1; + bool gate_mask_per_txq:1; + /* Device expects lower TXQ numbers to have higher priority over higher + * TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS, + * INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE. + */ + bool broken_mqprio:1; +}; + +enum tc_taprio_qopt_cmd { + TAPRIO_CMD_REPLACE, + TAPRIO_CMD_DESTROY, + TAPRIO_CMD_STATS, + TAPRIO_CMD_QUEUE_STATS, +}; + +/** + * struct tc_taprio_qopt_stats - IEEE 802.1Qbv statistics + * @window_drops: Frames that were dropped because they were too large to be + * transmitted in any of the allotted time windows (open gates) for their + * traffic class. + * @tx_overruns: Frames still being transmitted by the MAC after the + * transmission gate associated with their traffic class has closed. + * Equivalent to `12.29.1.1.2 TransmissionOverrun` from 802.1Q-2018. + */ +struct tc_taprio_qopt_stats { + u64 window_drops; + u64 tx_overruns; +}; + +struct tc_taprio_qopt_queue_stats { + int queue; + struct tc_taprio_qopt_stats stats; +}; + struct tc_taprio_sched_entry { u8 command; /* TC_TAPRIO_CMD_* */ @@ -169,18 +224,70 @@ struct tc_taprio_sched_entry { }; struct tc_taprio_qopt_offload { - u8 enable; - ktime_t base_time; - u64 cycle_time; - u64 cycle_time_extension; - - size_t num_entries; - struct tc_taprio_sched_entry entries[]; + enum tc_taprio_qopt_cmd cmd; + + union { + /* TAPRIO_CMD_STATS */ + struct tc_taprio_qopt_stats stats; + /* TAPRIO_CMD_QUEUE_STATS */ + struct tc_taprio_qopt_queue_stats queue_stats; + /* TAPRIO_CMD_REPLACE */ + struct { + struct tc_mqprio_qopt_offload mqprio; + struct netlink_ext_ack *extack; + ktime_t base_time; + u64 cycle_time; + u64 cycle_time_extension; + u32 max_sdu[TC_MAX_QUEUE]; + + size_t num_entries; + struct tc_taprio_sched_entry entries[]; + }; + }; }; +#if IS_ENABLED(CONFIG_NET_SCH_TAPRIO) + /* Reference counting */ struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload); void taprio_offload_free(struct tc_taprio_qopt_offload *offload); +#else + +/* Reference counting */ +static inline struct tc_taprio_qopt_offload * +taprio_offload_get(struct tc_taprio_qopt_offload *offload) +{ + return NULL; +} + +static inline void taprio_offload_free(struct tc_taprio_qopt_offload *offload) +{ +} + +#endif + +/* Ensure skb_mstamp_ns, which might have been populated with the txtime, is + * not mistaken for a software timestamp, because this will otherwise prevent + * the dispatch of hardware timestamps to the socket. 
+ */ +static inline void skb_txtime_consumed(struct sk_buff *skb) +{ + skb->tstamp = ktime_set(0, 0); +} + +static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, + unsigned long cl, + struct qdisc_walker *arg) +{ + if (arg->count >= arg->skip && arg->fn(sch, cl, arg) < 0) { + arg->stop = 1; + return false; + } + + arg->count++; + return true; +} + #endif diff --git a/include/net/pptp.h b/include/net/pptp.h index 383e25ca53a7..e63176bdd4c8 100644 --- a/include/net/pptp.h +++ b/include/net/pptp.h @@ -2,6 +2,9 @@ #ifndef _NET_PPTP_H #define _NET_PPTP_H +#include <linux/types.h> +#include <net/gre.h> + #define PPP_LCP_ECHOREQ 0x09 #define PPP_LCP_ECHOREP 0x0A #define SC_RCV_BITS (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP) diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h new file mode 100644 index 000000000000..a6ab2f4f5e28 --- /dev/null +++ b/include/net/proto_memory.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PROTO_MEMORY_H +#define _PROTO_MEMORY_H + +#include <net/sock.h> +#include <net/hotdata.h> + +/* 1 MB per cpu, in page units */ +#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) + +static inline bool sk_has_memory_pressure(const struct sock *sk) +{ + return sk->sk_prot->memory_pressure != NULL; +} + +static inline bool +proto_memory_pressure(const struct proto *prot) +{ + if (!prot->memory_pressure) + return false; + return !!READ_ONCE(*prot->memory_pressure); +} + +static inline bool sk_under_global_memory_pressure(const struct sock *sk) +{ + return proto_memory_pressure(sk->sk_prot); +} + +static inline bool sk_under_memory_pressure(const struct sock *sk) +{ + if (!sk->sk_prot->memory_pressure) + return false; + + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + mem_cgroup_under_socket_pressure(sk->sk_memcg)) + return true; + + return !!READ_ONCE(*sk->sk_prot->memory_pressure); +} + +static inline long +proto_memory_allocated(const struct proto *prot) +{ + return max(0L, atomic_long_read(prot->memory_allocated)); +} + +static inline long +sk_memory_allocated(const struct sock *sk) +{ + return proto_memory_allocated(sk->sk_prot); +} + +static inline void proto_memory_pcpu_drain(struct proto *proto) +{ + int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); + + if (val) + atomic_long_add(val, proto->memory_allocated); +} + +static inline void +sk_memory_allocated_add(const struct sock *sk, int val) +{ + struct proto *proto = sk->sk_prot; + + val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val >= READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); +} + +static inline void +sk_memory_allocated_sub(const struct sock *sk, int val) +{ + struct proto *proto = sk->sk_prot; + + val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val <= -READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); +} + +#endif /* _PROTO_MEMORY_H */ diff --git a/include/net/protocol.h b/include/net/protocol.h index 2b778e1d2d8f..b2499f88f8f8 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -35,26 +35,22 @@ /* This is used to register protocols. */ struct net_protocol { - int (*early_demux)(struct sk_buff *skb); - int (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); /* This returns an error if we weren't able to handle the error. 
*/ int (*err_handler)(struct sk_buff *skb, u32 info); unsigned int no_policy:1, - netns_ok:1, /* does the protocol do more stringent * icmp tag validation than simple * socket lookup? */ icmp_strict_tag_validation:1; + u32 secret; }; #if IS_ENABLED(CONFIG_IPV6) struct inet6_protocol { - void (*early_demux)(struct sk_buff *skb); - void (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); /* This returns an error if we weren't able to handle the error. */ @@ -64,6 +60,7 @@ struct inet6_protocol { __be32 info); unsigned int flags; /* INET6_PROTO_xxx */ + u32 secret; }; #define INET6_PROTO_NOPOLICY 0x1 @@ -73,6 +70,7 @@ struct inet6_protocol { struct net_offload { struct offload_callbacks callbacks; unsigned int flags; /* Flags used by IPv6 for now */ + u32 secret; }; /* This should be set for any extension header which is compatible with GSO. */ #define INET6_PROTO_GSO_EXTHDR 0x1 diff --git a/include/net/psample.h b/include/net/psample.h index 68ae16bb0a4a..5071b5fc2b59 100644 --- a/include/net/psample.h +++ b/include/net/psample.h @@ -14,22 +14,40 @@ struct psample_group { struct rcu_head rcu; }; +struct psample_metadata { + u32 trunc_size; + int in_ifindex; + int out_ifindex; + u16 out_tc; + u64 out_tc_occ; /* bytes */ + u64 latency; /* nanoseconds */ + u8 out_tc_valid:1, + out_tc_occ_valid:1, + latency_valid:1, + rate_as_probability:1, + unused:4; + const u8 *user_cookie; + u32 user_cookie_len; +}; + struct psample_group *psample_group_get(struct net *net, u32 group_num); void psample_group_take(struct psample_group *group); void psample_group_put(struct psample_group *group); +struct sk_buff; + #if IS_ENABLED(CONFIG_PSAMPLE) -void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, - u32 trunc_size, int in_ifindex, int out_ifindex, - u32 sample_rate); +void psample_sample_packet(struct psample_group *group, + const struct sk_buff *skb, u32 sample_rate, + const struct psample_metadata *md); #else static inline void psample_sample_packet(struct psample_group *group, - struct sk_buff *skb, u32 trunc_size, - int in_ifindex, int out_ifindex, - u32 sample_rate) + const struct sk_buff *skb, + u32 sample_rate, + const struct psample_metadata *md) { } diff --git a/include/net/psnap.h b/include/net/psnap.h index 7cb0c8ab4171..88802b0754ad 100644 --- a/include/net/psnap.h +++ b/include/net/psnap.h @@ -2,6 +2,11 @@ #ifndef _NET_PSNAP_H #define _NET_PSNAP_H +struct datalink_proto; +struct sk_buff; +struct packet_type; +struct net_device; + struct datalink_proto * register_snap_client(const unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, diff --git a/include/net/raw.h b/include/net/raw.h index 8ad8df594853..32a61481a253 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -15,14 +15,15 @@ #include <net/inet_sock.h> #include <net/protocol.h> +#include <net/netns/hash.h> +#include <linux/hash.h> #include <linux/icmp.h> extern struct proto raw_prot; extern struct raw_hashinfo raw_v4_hashinfo; -struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, - unsigned short num, __be32 raddr, - __be32 laddr, int dif, int sdif); +bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num, + __be32 raddr, __be32 laddr, int dif, int sdif); int raw_abort(struct sock *sk, int err); void raw_icmp_error(struct sk_buff *, int, u32); @@ -30,13 +31,29 @@ int raw_local_deliver(struct sk_buff *, int); int raw_rcv(struct sock *, struct sk_buff *); -#define RAW_HTABLE_SIZE MAX_INET_PROTOS +#define RAW_HTABLE_LOG 8 +#define 
RAW_HTABLE_SIZE (1U << RAW_HTABLE_LOG) struct raw_hashinfo { - rwlock_t lock; - struct hlist_head ht[RAW_HTABLE_SIZE]; + spinlock_t lock; + + struct hlist_head ht[RAW_HTABLE_SIZE] ____cacheline_aligned; }; +static inline u32 raw_hashfunc(const struct net *net, u32 proto) +{ + return hash_32(net_hash_mix(net) ^ proto, RAW_HTABLE_LOG); +} + +static inline void raw_hashinfo_init(struct raw_hashinfo *hashinfo) +{ + int i; + + spin_lock_init(&hashinfo->lock); + for (i = 0; i < RAW_HTABLE_SIZE; i++) + INIT_HLIST_HEAD(&hashinfo->ht[i]); +} + #ifdef CONFIG_PROC_FS int raw_proc_init(void); void raw_proc_exit(void); @@ -66,16 +83,13 @@ struct raw_sock { u32 ipmr_table; }; -static inline struct raw_sock *raw_sk(const struct sock *sk) -{ - return (struct raw_sock *)sk; -} +#define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - return inet_bound_dev_eq(!!net->ipv4.sysctl_raw_l3mdev_accept, + return inet_bound_dev_eq(READ_ONCE(net->ipv4.sysctl_raw_l3mdev_accept), bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); diff --git a/include/net/rawv6.h b/include/net/rawv6.h index 53d86b6055e8..82810cbe3798 100644 --- a/include/net/rawv6.h +++ b/include/net/rawv6.h @@ -3,11 +3,12 @@ #define _NET_RAWV6_H #include <net/protocol.h> +#include <net/raw.h> extern struct raw_hashinfo raw_v6_hashinfo; -struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, - unsigned short num, const struct in6_addr *loc_addr, - const struct in6_addr *rmt_addr, int dif, int sdif); +bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num, + const struct in6_addr *loc_addr, + const struct in6_addr *rmt_addr, int dif, int sdif); int raw_abort(struct sock *sk, int err); diff --git a/include/net/red.h b/include/net/red.h index fc455445f4b2..159a09359fc0 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -40,7 +40,7 @@ max_P should be small (not 1), usually 0.01..0.02 is good value. max_P is chosen as a number, so that max_P/(th_max-th_min) - is a negative power of two in order arithmetics to contain + is a negative power of two in order arithmetic to contain only shifts. @@ -122,7 +122,6 @@ struct red_stats { u32 forced_drop; /* Forced drops, qavg > max_thresh */ u32 forced_mark; /* Forced marks, qavg > max_thresh */ u32 pdrop; /* Drops due to queue limits */ - u32 other; /* Drops due to drop() calls */ }; struct red_parms { @@ -160,7 +159,7 @@ static inline u32 red_maxp(u8 Plog) static inline void red_set_vars(struct red_vars *v) { /* Reset average queue length, the value is strictly bound - * to the parameters below, reseting hurts a bit but leaving + * to the parameters below, resetting hurts a bit but leaving * it might result in an unreasonable qavg for a while. 
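The red_check_params() hunk just below tightens the overflow checks from "> 32" to ">= 32": red_set_parms() computes qth_min << Wlog and qth_max << Wlog into 32-bit fields, so fls(qth) + Wlog must stay strictly under 32 (the same hunk also bounds Scell_log and each stab[] byte below 32 for the same shift-safety reason). A standalone arithmetic sketch of that bound, as userspace C rather than kernel code; fls32() is a portable stand-in for the kernel's fls():

#include <stdbool.h>
#include <stdio.h>

static int fls32(unsigned int x)	/* 1-based index of highest set bit; 0 for x == 0 */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static bool red_shift_fits(unsigned int qth, unsigned int Wlog)
{
	/* mirrors the header's check: qth << Wlog must fit in a u32
	 * without reaching bit 31
	 */
	return fls32(qth) + Wlog < 32;
}

int main(void)
{
	/* fls(0x10000) = 17, and 17 + 15 = 32: rejected, since the shifted
	 * value would occupy bit 31 and later arithmetic could overflow
	 */
	printf("%d\n", red_shift_fits(0x10000, 15));	/* prints 0 */
	/* 17 + 10 = 27: fine */
	printf("%d\n", red_shift_fits(0x10000, 10));	/* prints 1 */
	return 0;
}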
--TGR */ v->qavg = 0; @@ -168,14 +167,24 @@ static inline void red_set_vars(struct red_vars *v) v->qcount = -1; } -static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog) +static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, + u8 Scell_log, u8 *stab) { - if (fls(qth_min) + Wlog > 32) + if (fls(qth_min) + Wlog >= 32) return false; - if (fls(qth_max) + Wlog > 32) + if (fls(qth_max) + Wlog >= 32) + return false; + if (Scell_log >= 32) return false; if (qth_max < qth_min) return false; + if (stab) { + int i; + + for (i = 0; i < RED_STAB_SIZE; i++) + if (stab[i] >= 32) + return false; + } return true; } @@ -224,10 +233,10 @@ static inline void red_set_parms(struct red_parms *p, int delta = qth_max - qth_min; u32 max_p_delta; - p->qth_min = qth_min << Wlog; - p->qth_max = qth_max << Wlog; - p->Wlog = Wlog; - p->Plog = Plog; + WRITE_ONCE(p->qth_min, qth_min << Wlog); + WRITE_ONCE(p->qth_max, qth_max << Wlog); + WRITE_ONCE(p->Wlog, Wlog); + WRITE_ONCE(p->Plog, Plog); if (delta <= 0) delta = 1; p->qth_delta = delta; @@ -235,7 +244,7 @@ static inline void red_set_parms(struct red_parms *p, max_P = red_maxp(Plog); max_P *= delta; /* max_P = (qth_max - qth_min)/2^Plog */ } - p->max_P = max_P; + WRITE_ONCE(p->max_P, max_P); max_p_delta = max_P / delta; max_p_delta = max(max_p_delta, 1U); p->max_P_reciprocal = reciprocal_value(max_p_delta); @@ -248,7 +257,7 @@ static inline void red_set_parms(struct red_parms *p, p->target_min = qth_min + 2*delta; p->target_max = qth_min + 3*delta; - p->Scell_log = Scell_log; + WRITE_ONCE(p->Scell_log, Scell_log); p->Scell_max = (255 << Scell_log); if (stab) @@ -285,7 +294,7 @@ static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms int shift; /* - * The problem: ideally, average length queue recalcultion should + * The problem: ideally, average length queue recalculation should * be done over constant clock intervals. This is too expensive, so * that the calculation is driven by outgoing packets. * When the queue is idle we have to model this clock by hand. @@ -331,7 +340,7 @@ static inline unsigned long red_calc_qavg_no_idle_time(const struct red_parms *p { /* * NOTE: v->qavg is fixed point number with point at Wlog. - * The formula below is equvalent to floating point + * The formula below is equivalent to floating point * version: * * qavg = qavg*(1-W) + backlog*W; @@ -354,7 +363,7 @@ static inline unsigned long red_calc_qavg(const struct red_parms *p, static inline u32 red_random(const struct red_parms *p) { - return reciprocal_divide(prandom_u32(), p->max_P_reciprocal); + return reciprocal_divide(get_random_u32(), p->max_P_reciprocal); } static inline int red_mark_probability(const struct red_parms *p, @@ -366,7 +375,7 @@ static inline int red_mark_probability(const struct red_parms *p, OK. qR is random number in the interval (0..1/max_P)*(qth_max-qth_min) i.e. 0..(2^Plog). If we used floating point - arithmetics, it would be: (2^Plog)*rnd_num, + arithmetic, it would be: (2^Plog)*rnd_num, where rnd_num is less 1. Taking into account, that qavg have fixed diff --git a/include/net/regulatory.h b/include/net/regulatory.h index 47f06f6f5a67..6633627f6e76 100644 --- a/include/net/regulatory.h +++ b/include/net/regulatory.h @@ -1,3 +1,4 @@ + #ifndef __NET_REGULATORY_H #define __NET_REGULATORY_H /* @@ -19,6 +20,8 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ +#include <linux/ieee80211.h> +#include <linux/nl80211.h> #include <linux/rcupdate.h> /** @@ -68,8 +71,6 @@ enum environment_cap { * CRDA and can be used by other regulatory requests. When * the last request is not yet processed we must yield until it * is processed before processing any new requests. - * @country_ie_checksum: checksum of the last processed and accepted - * country IE * @country_ie_env: lets us know if the AP is telling us we are outdoor, * indoor, or if it doesn't matter * @list: used to insert into the reg_requests_list linked list @@ -120,7 +121,7 @@ struct regulatory_request { * @REGULATORY_DISABLE_BEACON_HINTS: enable this if your driver needs to * ensure that passive scan flags and beaconing flags may not be lifted by * cfg80211 due to regulatory beacon hints. For more information on beacon - * hints read the documenation for regulatory_hint_found_beacon() + * hints read the documentation for regulatory_hint_found_beacon() * @REGULATORY_COUNTRY_IE_FOLLOW_POWER: for devices that have a preference * that even though they may have programmed their own custom power * setting prior to wiphy registration, they want to ensure their channel @@ -137,17 +138,6 @@ struct regulatory_request { * otherwise initiating radiation is not allowed. This will enable the * relaxations enabled under the CFG80211_REG_RELAX_NO_IR configuration * option - * @REGULATORY_IGNORE_STALE_KICKOFF: the regulatory core will _not_ make sure - * all interfaces on this wiphy reside on allowed channels. If this flag - * is not set, upon a regdomain change, the interfaces are given a grace - * period (currently 60 seconds) to disconnect or move to an allowed - * channel. Interfaces on forbidden channels are forcibly disconnected. - * Currently these types of interfaces are supported for enforcement: - * NL80211_IFTYPE_ADHOC, NL80211_IFTYPE_STATION, NL80211_IFTYPE_AP, - * NL80211_IFTYPE_AP_VLAN, NL80211_IFTYPE_MONITOR, - * NL80211_IFTYPE_P2P_CLIENT, NL80211_IFTYPE_P2P_GO, - * NL80211_IFTYPE_P2P_DEVICE. The flag will be set by default if a device - * includes any modes unsupported for enforcement checking. * @REGULATORY_WIPHY_SELF_MANAGED: for devices that employ wiphy-specific * regdom management. These devices will ignore all regdom changes not * originating from their own wiphy.
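To make the self-managed path concrete, here is a hedged sketch of how a driver might install its own rules. REG_RULE() and regulatory_set_wiphy_regd() are the existing cfg80211/regulatory helpers this assumes; example_regd, the rule values, and example_set_regd() are invented for illustration:

#include <net/cfg80211.h>

static struct ieee80211_regdomain example_regd = {
	.n_reg_rules = 2,
	.alpha2 = "99",		/* conventional marker for a custom regdomain */
	.reg_rules = {
		/* 2.4 GHz, channels 1..13, up to 40 MHz, 20 dBm EIRP */
		REG_RULE(2412 - 10, 2472 + 10, 40, 0, 20, 0),
		/* UNII-1 subset, no initiating radiation until cleared */
		REG_RULE(5170 - 10, 5250 + 10, 80, 0, 23, NL80211_RRF_NO_IR),
	},
};

static int example_set_regd(struct wiphy *wiphy)
{
	/* a self-managed wiphy ignores regdom changes from other sources */
	wiphy->regulatory_flags |= REGULATORY_WIPHY_SELF_MANAGED;
	return regulatory_set_wiphy_regd(wiphy, &example_regd);
}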
@@ -174,7 +164,7 @@ enum ieee80211_regulatory_flags { REGULATORY_COUNTRY_IE_FOLLOW_POWER = BIT(3), REGULATORY_COUNTRY_IE_IGNORE = BIT(4), REGULATORY_ENABLE_RELAX_NO_IR = BIT(5), - REGULATORY_IGNORE_STALE_KICKOFF = BIT(6), + /* reuse bit 6 next time */ REGULATORY_WIPHY_SELF_MANAGED = BIT(7), }; @@ -221,6 +211,7 @@ struct ieee80211_reg_rule { u32 flags; u32 dfs_cac_ms; bool has_wmm; + s8 psd; }; struct ieee80211_regdomain { diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 29e41ff3ec93..b07b1cd14e9f 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -18,6 +18,7 @@ #include <linux/refcount.h> #include <net/sock.h> +#include <net/rstreason.h> struct request_sock; struct sk_buff; @@ -34,7 +35,8 @@ struct request_sock_ops { void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(const struct sock *sk, - struct sk_buff *skb); + struct sk_buff *skb, + enum sk_rst_reason reason); void (*destructor)(struct request_sock *req); void (*syn_ack_timeout)(const struct request_sock *req); }; @@ -61,7 +63,11 @@ struct request_sock { struct request_sock *dl_next; u16 mss; u8 num_retrans; /* number of retransmits */ - u8 syncookie:1; /* syncookie: encode tcpopts in timestamp */ + u8 syncookie:1; /* True if + * 1) tcpopts needs to be encoded in + * TS of SYN+ACK + * 2) ACK is validated by BPF kfunc. + */ u8 num_timeout:7; /* number of timeouts */ u32 ts_recent; struct timer_list rsk_timer; @@ -70,6 +76,7 @@ struct request_sock { struct saved_syn *saved_syn; u32 secid; u32 peer_secid; + u32 timeout; }; static inline struct request_sock *inet_reqsk(const struct sock *sk) @@ -82,34 +89,43 @@ static inline struct sock *req_to_sk(struct request_sock *req) return (struct sock *)req; } -static inline struct request_sock * -reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, - bool attach_listener) +/** + * skb_steal_sock - steal a socket from an sk_buff + * @skb: sk_buff to steal the socket from + * @refcounted: is set to true if the socket is reference-counted + * @prefetched: is set to true if the socket was assigned from bpf + */ +static inline struct sock *skb_steal_sock(struct sk_buff *skb, + bool *refcounted, bool *prefetched) { - struct request_sock *req; + struct sock *sk = skb->sk; - req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN); - if (!req) + if (!sk) { + *prefetched = false; + *refcounted = false; return NULL; - req->rsk_listener = NULL; - if (attach_listener) { - if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) { - kmem_cache_free(ops->slab, req); - return NULL; + } + + *prefetched = skb_sk_is_prefetched(skb); + if (*prefetched) { +#if IS_ENABLED(CONFIG_SYN_COOKIES) + if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { + struct request_sock *req = inet_reqsk(sk); + + *refcounted = false; + sk = req->rsk_listener; + req->rsk_listener = NULL; + return sk; } - req->rsk_listener = sk_listener; +#endif + *refcounted = sk_is_refcounted(sk); + } else { + *refcounted = true; } - req->rsk_ops = ops; - req_to_sk(req)->sk_prot = sk_listener->sk_prot; - sk_node_init(&req_to_sk(req)->sk_node); - sk_tx_queue_clear(req_to_sk(req)); - req->saved_syn = NULL; - req->num_timeout = 0; - req->num_retrans = 0; - req->sk = NULL; - refcount_set(&req->rsk_refcnt, 0); - return req; + skb->destructor = NULL; + skb->sk = NULL; + return sk; } static inline void __reqsk_free(struct request_sock *req) @@ -123,14 +139,14 @@ static inline void __reqsk_free(struct 
request_sock *req) static inline void reqsk_free(struct request_sock *req) { - WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); + DEBUG_NET_WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); __reqsk_free(req); } static inline void reqsk_put(struct request_sock *req) { if (refcount_dec_and_test(&req->rsk_refcnt)) - reqsk_free(req); + __reqsk_free(req); } /* @@ -236,4 +252,16 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) return atomic_read(&queue->young); } +/* RFC 7323 2.3 Using the Window Scale Option + * The window field (SEG.WND) of every outgoing segment, with the + * exception of <SYN> segments, MUST be right-shifted by + * Rcv.Wind.Shift bits. + * + * This means the SEG.WND carried in SYNACK can not exceed 65535. + * We use this property to harden TCP stack while in NEW_SYN_RECV state. + */ +static inline u32 tcp_synack_window(const struct request_sock *req) +{ + return min(req->rsk_rcv_wnd, 65535U); +} #endif /* _REQUEST_SOCK_H */ diff --git a/include/net/rose.h b/include/net/rose.h index cf517d306a28..23267b4efcfa 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -9,6 +9,7 @@ #define _ROSE_H #include <linux/rose.h> +#include <net/ax25.h> #include <net/sock.h> #define ROSE_ADDR_LEN 5 @@ -131,7 +132,8 @@ struct rose_sock { ax25_address source_digis[ROSE_MAX_DIGIS]; ax25_address dest_digis[ROSE_MAX_DIGIS]; struct rose_neigh *neighbour; - struct net_device *device; + struct net_device *device; + netdevice_tracker dev_tracker; unsigned int lci, rand; unsigned char state, condition, qbitincl, defer; unsigned char cause, diagnostic; @@ -162,8 +164,8 @@ extern int sysctl_rose_link_fail_timeout; extern int sysctl_rose_maximum_vcs; extern int sysctl_rose_window_size; -int rosecmp(rose_address *, rose_address *); -int rosecmpm(rose_address *, rose_address *, unsigned short); +int rosecmp(const rose_address *, const rose_address *); +int rosecmpm(const rose_address *, const rose_address *, unsigned short); char *rose2asc(char *buf, const rose_address *); struct sock *rose_find_socket(unsigned int, struct rose_neigh *); void rose_kill_by_neigh(struct rose_neigh *); @@ -205,8 +207,8 @@ extern const struct seq_operations rose_node_seqops; extern struct seq_operations rose_route_seqops; void rose_add_loopback_neigh(void); -int __must_check rose_add_loopback_node(rose_address *); -void rose_del_loopback_node(rose_address *); +int __must_check rose_add_loopback_node(const rose_address *); +void rose_del_loopback_node(const rose_address *); void rose_rt_device_down(struct net_device *); void rose_link_device_down(struct net_device *); struct net_device *rose_dev_first(void); diff --git a/include/net/route.h b/include/net/route.h index ff021cab657e..8e39aa822cf9 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -27,6 +27,8 @@ #include <net/ip_fib.h> #include <net/arp.h> #include <net/ndisc.h> +#include <net/inet_dscp.h> +#include <net/sock.h> #include <linux/in_route.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> @@ -35,14 +37,20 @@ #include <linux/cache.h> #include <linux/security.h> -/* IPv4 datagram length is stored into 16bit field (tot_len) */ -#define IP_MAX_MTU 0xFFFFU +static inline __u8 ip_sock_rt_scope(const struct sock *sk) +{ + if (sock_flag(sk, SOCK_LOCALROUTE)) + return RT_SCOPE_LINK; -#define RTO_ONLINK 0x01 + return RT_SCOPE_UNIVERSE; +} -#define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) -#define RT_CONN_FLAGS_TOS(sk,tos) (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE)) +static 
inline __u8 ip_sock_rt_tos(const struct sock *sk) +{ + return READ_ONCE(inet_sk(sk)->tos) & INET_DSCP_MASK; +} +struct ip_tunnel_info; struct fib_nh; struct fib_info; struct uncached_list; @@ -67,11 +75,19 @@ struct rtable { /* Miscellaneous cached information */ u32 rt_mtu_locked:1, rt_pmtu:31; - - struct list_head rt_uncached; - struct uncached_list *rt_uncached_list; }; +#define dst_rtable(_ptr) container_of_const(_ptr, struct rtable, dst) + +/** + * skb_rtable - Returns the skb &rtable + * @skb: buffer + */ +static inline struct rtable *skb_rtable(const struct sk_buff *skb) +{ + return dst_rtable(skb_dst(skb)); +} + static inline bool rt_is_input_route(const struct rtable *rt) { return rt->rt_is_input != 0; @@ -114,6 +130,33 @@ struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); + +static inline void inet_sk_init_flowi4(const struct inet_sock *inet, + struct flowi4 *fl4) +{ + const struct ip_options_rcu *ip4_opt; + const struct sock *sk; + __be32 daddr; + + rcu_read_lock(); + ip4_opt = rcu_dereference(inet->inet_opt); + + /* Source routing option overrides the socket destination address */ + if (ip4_opt && ip4_opt->opt.srr) + daddr = ip4_opt->opt.faddr; + else + daddr = inet->inet_daddr; + rcu_read_unlock(); + + sk = &inet->sk; + flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), + sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, + inet->inet_saddr, inet->inet_dport, + inet->inet_sport, sk->sk_uid); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); +} + struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp, const struct sk_buff *skb); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp, @@ -128,12 +171,6 @@ static inline struct rtable *__ip_route_output_key(struct net *net, struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, const struct sock *sk); -struct rtable *ip_route_output_tunnel(struct sk_buff *skb, - struct net_device *dev, - struct net *net, __be32 *saddr, - const struct ip_tunnel_info *info, - u8 protocol, bool use_cache); - struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig); @@ -142,74 +179,68 @@ static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 return ip_route_output_flow(net, flp, NULL); } +/* Simplistic IPv4 route lookup function. + * This is only suitable for some particular use cases: since the flowi4 + * structure is only partially set, it may bypass some fib-rules. + */ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, - __be32 saddr, u8 tos, int oif) + __be32 saddr, dscp_t dscp, + int oif, __u8 scope) { struct flowi4 fl4 = { .flowi4_oif = oif, - .flowi4_tos = tos, + .flowi4_tos = inet_dscp_to_dsfield(dscp), + .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, }; + return ip_route_output_key(net, &fl4); } static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4, - struct sock *sk, + const struct sock *sk, __be32 daddr, __be32 saddr, __be16 dport, __be16 sport, __u8 proto, __u8 tos, int oif) { - flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos, - RT_SCOPE_UNIVERSE, proto, - sk ? inet_sk_flowi_flags(sk) : 0, + flowi4_init_output(fl4, oif, sk ? READ_ONCE(sk->sk_mark) : 0, tos, + sk ? ip_sock_rt_scope(sk) : RT_SCOPE_UNIVERSE, + proto, sk ? 
inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } -static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4, - __be32 daddr, __be32 saddr, - __be32 gre_key, __u8 tos, int oif) -{ - memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; - fl4->daddr = daddr; - fl4->saddr = saddr; - fl4->flowi4_tos = tos; - fl4->flowi4_proto = IPPROTO_GRE; - fl4->fl4_gre_key = gre_key; - return ip_route_output_key(net, fl4); -} -int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, - struct in_device *in_dev, u32 *itag); -int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin); -int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, - struct fib_result *res); - -int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, - const struct sk_buff *hint); - -static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) +enum skb_drop_reason +ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct in_device *in_dev, u32 *itag); +enum skb_drop_reason +ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev); +enum skb_drop_reason +ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + const struct sk_buff *hint); + +static inline enum skb_drop_reason +ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, + struct net_device *devin) { - int err; + enum skb_drop_reason reason; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, tos, devin); - if (!err) { + reason = ip_route_input_noref(skb, dst, src, dscp, devin); + if (!reason) { skb_dst_force(skb); if (!skb_dst(skb)) - err = -EINVAL; + reason = SKB_DROP_REASON_NOT_SPECIFIED; } rcu_read_unlock(); - return err; + return reason; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, @@ -230,8 +261,7 @@ void ip_rt_multicast_event(struct in_device *); int ip_rt_ioctl(struct net *, unsigned int cmd, struct rtentry *rt); void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); struct rtable *rt_dst_alloc(struct net_device *dev, - unsigned int flags, u16 type, - bool nopolicy, bool noxfrm); + unsigned int flags, u16 type, bool noxfrm); struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt); struct in_ifaddr; @@ -255,8 +285,6 @@ static inline void ip_rt_put(struct rtable *rt) dst_release(&rt->dst); } -#define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) - extern const __u8 ip_tos2prio[16]; static inline char rt_tos2priority(u8 tos) @@ -288,57 +316,57 @@ static inline char rt_tos2priority(u8 tos) * ip_route_newports() calls. 
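As a usage sketch of the pair being documented here, loosely following what tcp_v4_connect() does; example_connect_route() is invented, error handling is trimmed, and a real caller would cache the route on the socket instead of dropping it:

#include <net/route.h>
#include <net/inet_sock.h>

static int example_connect_route(struct sock *sk, __be32 daddr, __be16 dport)
{
	struct inet_sock *inet = inet_sk(sk);
	__be16 orig_sport = inet->inet_sport;
	struct flowi4 fl4;
	struct rtable *rt;

	rt = ip_route_connect(&fl4, daddr, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP,
			      orig_sport, dport, sk);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* ...the source port may be (re)chosen here, e.g. by
	 * inet_hash_connect()...
	 */

	/* revalidate the route if the ports moved under us */
	rt = ip_route_newports(&fl4, rt, orig_sport, dport,
			       inet->inet_sport, dport, sk);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	ip_rt_put(rt);	/* a real caller keeps this cached on the sock */
	return 0;
}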
*/ -static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src, - u32 tos, int oif, u8 protocol, +static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, + __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, - struct sock *sk) + const struct sock *sk) { __u8 flow_flags = 0; - if (inet_sk(sk)->transparent) + if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; - flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - protocol, flow_flags, dst, src, dport, sport, - sk->sk_uid); + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) + flow_flags |= FLOWI_FLAG_ANY_SPORT; + + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), + ip_sock_rt_scope(sk), protocol, flow_flags, dst, + src, dport, sport, sk->sk_uid); } -static inline struct rtable *ip_route_connect(struct flowi4 *fl4, - __be32 dst, __be32 src, u32 tos, - int oif, u8 protocol, +static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, + __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, - struct sock *sk) + const struct sock *sk) { struct net *net = sock_net(sk); struct rtable *rt; - ip_route_connect_init(fl4, dst, src, tos, oif, protocol, - sport, dport, sk); + ip_route_connect_init(fl4, dst, src, oif, protocol, sport, dport, sk); if (!dst || !src) { rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) return rt; ip_rt_put(rt); - flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr); + flowi4_update_output(fl4, oif, fl4->daddr, fl4->saddr); } - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt, __be16 orig_sport, __be16 orig_dport, __be16 sport, __be16 dport, - struct sock *sk) + const struct sock *sk) { if (sport != orig_sport || dport != orig_dport) { fl4->fl4_dport = dport; fl4->fl4_sport = sport; ip_rt_put(rt); - flowi4_update_output(fl4, sk->sk_bound_dev_if, - RT_CONN_FLAGS(sk), fl4->daddr, + flowi4_update_output(fl4, sk->sk_bound_dev_if, fl4->daddr, fl4->saddr); - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(sock_net(sk), fl4, sk); } return rt; @@ -357,10 +385,15 @@ static inline int inet_iif(const struct sk_buff *skb) static inline int ip4_dst_hoplimit(const struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); - struct net *net = dev_net(dst->dev); - if (hoplimit == 0) - hoplimit = net->ipv4.sysctl_ip_default_ttl; + if (hoplimit == 0) { + const struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); + hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); + rcu_read_unlock(); + } return hoplimit; } @@ -369,7 +402,7 @@ static inline struct neighbour *ip_neigh_gw4(struct net_device *dev, { struct neighbour *neigh; - neigh = __ipv4_neigh_lookup_noref(dev, daddr); + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &daddr, dev, false); diff --git a/include/net/rpl.h b/include/net/rpl.h index 308ef0a05cae..74734191c458 100644 --- a/include/net/rpl.h +++ b/include/net/rpl.h @@ -23,12 +23,6 @@ static inline int rpl_init(void) static inline void rpl_exit(void) {} #endif -/* Worst decompression memory usage ipv6 address (16) + pad 7 */ -#define IPV6_RPL_SRH_WORST_SWAP_SIZE (sizeof(struct in6_addr) + 7) - -size_t 
ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri, - unsigned char cmpre); - void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, const struct ipv6_rpl_sr_hdr *inhdr, const struct in6_addr *daddr, unsigned char n); diff --git a/include/net/rps.h b/include/net/rps.h new file mode 100644 index 000000000000..d8ab3a08bcc4 --- /dev/null +++ b/include/net/rps.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_RPS_H +#define _NET_RPS_H + +#include <linux/types.h> +#include <linux/static_key.h> +#include <net/sock.h> +#include <net/hotdata.h> + +#ifdef CONFIG_RPS + +extern struct static_key_false rps_needed; +extern struct static_key_false rfs_needed; + +/* + * This structure holds an RPS map which can be of variable length. The + * map is an array of CPUs. + */ +struct rps_map { + unsigned int len; + struct rcu_head rcu; + u16 cpus[]; +}; +#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) + +/* + * The rps_dev_flow structure contains the mapping of a flow to a CPU, the + * tail pointer for that CPU's input queue at the time of last enqueue, and + * a hardware filter index. + */ +struct rps_dev_flow { + u16 cpu; + u16 filter; + unsigned int last_qtail; +}; +#define RPS_NO_FILTER 0xffff + +/* + * The rps_dev_flow_table structure contains a table of flow mappings. + */ +struct rps_dev_flow_table { + u8 log; + struct rcu_head rcu; + struct rps_dev_flow flows[]; +}; +#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ + ((_num) * sizeof(struct rps_dev_flow))) + +/* + * The rps_sock_flow_table contains mappings of flows to the last CPU + * on which they were processed by the application (set in recvmsg). + * Each entry is a 32bit value. Upper part is the high-order bits + * of flow hash, lower part is CPU number. + * rps_cpu_mask is used to partition the space, depending on number of + * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 + * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, + * meaning we use 32-6=26 bits for the hash. + */ +struct rps_sock_flow_table { + struct rcu_head rcu; + u32 mask; + + u32 ents[] ____cacheline_aligned_in_smp; +}; +#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) + +#define RPS_NO_CPU 0xffff + +static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, + u32 hash) +{ + unsigned int index = hash & table->mask; + u32 val = hash & ~net_hotdata.rps_cpu_mask; + + /* We only give a hint, preemption can change CPU under us */ + val |= raw_smp_processor_id(); + + /* The following WRITE_ONCE() is paired with the READ_ONCE() + * here, and another one in get_rps_cpu(). + */ + if (READ_ONCE(table->ents[index]) != val) + WRITE_ONCE(table->ents[index], val); +} + +#endif /* CONFIG_RPS */ + +static inline void sock_rps_record_flow_hash(__u32 hash) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *sock_flow_table; + + if (!hash) + return; + rcu_read_lock(); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); + if (sock_flow_table) + rps_record_sock_flow(sock_flow_table, hash); + rcu_read_unlock(); +#endif +} + +static inline void sock_rps_record_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rfs_needed)) { + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. 
+ * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) { + /* This READ_ONCE() is paired with the WRITE_ONCE() + * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). + */ + sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); + } + } +#endif +} + +static inline void sock_rps_delete_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *table; + u32 hash, index; + + if (!static_branch_unlikely(&rfs_needed)) + return; + + hash = READ_ONCE(sk->sk_rxhash); + if (!hash) + return; + + rcu_read_lock(); + table = rcu_dereference(net_hotdata.rps_sock_flow_table); + if (table) { + index = hash & table->mask; + if (READ_ONCE(table->ents[index]) != RPS_NO_CPU) + WRITE_ONCE(table->ents[index], RPS_NO_CPU); + } + rcu_read_unlock(); +#endif +} + +static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return ++sd->input_queue_tail; +#else + return 0; +#endif +} + +static inline void rps_input_queue_tail_save(u32 *dest, u32 tail) +{ +#ifdef CONFIG_RPS + WRITE_ONCE(*dest, tail); +#endif +} + +static inline void rps_input_queue_head_add(struct softnet_data *sd, int val) +{ +#ifdef CONFIG_RPS + WRITE_ONCE(sd->input_queue_head, sd->input_queue_head + val); +#endif +} + +static inline void rps_input_queue_head_incr(struct softnet_data *sd) +{ + rps_input_queue_head_add(sd, 1); +} + +#endif /* _NET_RPS_H */ diff --git a/include/net/rsi_91x.h b/include/net/rsi_91x.h index 040f07b47f1f..b2283762e446 100644 --- a/include/net/rsi_91x.h +++ b/include/net/rsi_91x.h @@ -1,4 +1,4 @@ -/** +/* * Copyright (c) 2017 Redpine Signals Inc. * * Permission to use, copy, modify, and/or distribute this software for any diff --git a/include/net/rstreason.h b/include/net/rstreason.h new file mode 100644 index 000000000000..979ac87b5d99 --- /dev/null +++ b/include/net/rstreason.h @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_RSTREASON_H +#define _LINUX_RSTREASON_H +#include <net/dropreason-core.h> +#include <uapi/linux/mptcp.h> + +#define DEFINE_RST_REASON(FN, FNe) \ + FN(NOT_SPECIFIED) \ + FN(NO_SOCKET) \ + FN(TCP_INVALID_ACK_SEQUENCE) \ + FN(TCP_RFC7323_PAWS) \ + FN(TCP_TOO_OLD_ACK) \ + FN(TCP_ACK_UNSENT_DATA) \ + FN(TCP_FLAGS) \ + FN(TCP_OLD_ACK) \ + FN(TCP_ABORT_ON_DATA) \ + FN(TCP_TIMEWAIT_SOCKET) \ + FN(INVALID_SYN) \ + FN(TCP_ABORT_ON_CLOSE) \ + FN(TCP_ABORT_ON_LINGER) \ + FN(TCP_ABORT_ON_MEMORY) \ + FN(TCP_STATE) \ + FN(TCP_KEEPALIVE_TIMEOUT) \ + FN(TCP_DISCONNECT_WITH_DATA) \ + FN(MPTCP_RST_EUNSPEC) \ + FN(MPTCP_RST_EMPTCP) \ + FN(MPTCP_RST_ERESOURCE) \ + FN(MPTCP_RST_EPROHIBIT) \ + FN(MPTCP_RST_EWQ2BIG) \ + FN(MPTCP_RST_EBADPERF) \ + FN(MPTCP_RST_EMIDDLEBOX) \ + FN(ERROR) \ + FNe(MAX) + +/** + * enum sk_rst_reason - the reasons of socket reset + * + * The reasons of sk reset, which are used in TCP/MPTCP protocols. + * + * There are three parts in order: + * 1) skb drop reasons: relying on drop reasons for such as passive reset + * 2) independent reset reasons: such as active reset reasons + * 3) reset reasons in MPTCP: only for MPTCP use + */ +enum sk_rst_reason { + /* Refer to include/net/dropreason-core.h + * Rely on skb drop reasons because it indicates exactly why RST + * could happen. 
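A hedged sketch of how these reasons are meant to be produced at a reset site: sk_rst_convert_drop_reason() is defined at the bottom of this header, while tcp_v4_send_reset() is file-local to net/ipv4/tcp_ipv4.c and is named here only to show the call shape; example_reset_for_drop() is invented:

static void example_reset_for_drop(const struct sock *sk, struct sk_buff *skb,
				   enum skb_drop_reason drop)
{
	/* reasons with a 1:1 mapping (e.g. SKB_DROP_REASON_TCP_FLAGS) are
	 * preserved in the RST reason; anything unmapped collapses to
	 * SK_RST_REASON_NOT_SPECIFIED
	 */
	tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(drop));
}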
+ */ + /** @SK_RST_REASON_NOT_SPECIFIED: reset reason is not specified */ + SK_RST_REASON_NOT_SPECIFIED, + /** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */ + SK_RST_REASON_NO_SOCKET, + /** + * @SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ + * field because ack sequence is not in the window between snd_una + * and snd_nxt + */ + SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE, + /** + * @SK_RST_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to + * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED + */ + SK_RST_REASON_TCP_RFC7323_PAWS, + /** @SK_RST_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */ + SK_RST_REASON_TCP_TOO_OLD_ACK, + /** + * @SK_RST_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't + * sent yet + */ + SK_RST_REASON_TCP_ACK_UNSENT_DATA, + /** @SK_RST_REASON_TCP_FLAGS: TCP flags invalid */ + SK_RST_REASON_TCP_FLAGS, + /** @SK_RST_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */ + SK_RST_REASON_TCP_OLD_ACK, + /** + * @SK_RST_REASON_TCP_ABORT_ON_DATA: abort on data + * corresponding to LINUX_MIB_TCPABORTONDATA + */ + SK_RST_REASON_TCP_ABORT_ON_DATA, + + /* Here start with the independent reasons */ + /** @SK_RST_REASON_TCP_TIMEWAIT_SOCKET: happen on the timewait socket */ + SK_RST_REASON_TCP_TIMEWAIT_SOCKET, + /** + * @SK_RST_REASON_INVALID_SYN: receive bad syn packet + * RFC 793 says if the state is not CLOSED/LISTEN/SYN-SENT then + * "fourth, check the SYN bit,...If the SYN is in the window it is + * an error, send a reset" + */ + SK_RST_REASON_INVALID_SYN, + /** + * @SK_RST_REASON_TCP_ABORT_ON_CLOSE: abort on close + * corresponding to LINUX_MIB_TCPABORTONCLOSE + */ + SK_RST_REASON_TCP_ABORT_ON_CLOSE, + /** + * @SK_RST_REASON_TCP_ABORT_ON_LINGER: abort on linger + * corresponding to LINUX_MIB_TCPABORTONLINGER + */ + SK_RST_REASON_TCP_ABORT_ON_LINGER, + /** + * @SK_RST_REASON_TCP_ABORT_ON_MEMORY: abort on memory + * corresponding to LINUX_MIB_TCPABORTONMEMORY + */ + SK_RST_REASON_TCP_ABORT_ON_MEMORY, + /** + * @SK_RST_REASON_TCP_STATE: abort on tcp state + * Please see RFC 9293 for all possible reset conditions + */ + SK_RST_REASON_TCP_STATE, + /** + * @SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT: time to timeout + * When we have already run out of all the chances, which means + * keepalive timeout, we have to reset the connection + */ + SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT, + /** + * @SK_RST_REASON_TCP_DISCONNECT_WITH_DATA: disconnect when write + * queue is not empty + * It means user has written data into the write queue when doing + * disconnecting, so we have to send an RST. + */ + SK_RST_REASON_TCP_DISCONNECT_WITH_DATA, + + /* Copy from include/uapi/linux/mptcp.h. + * These reset fields will not be changed since they adhere to + * RFC 8684. So do not touch them. I'm going to list each definition + * of them respectively. + */ + /** + * @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error. + * This is the default error; it implies that the subflow is no + * longer available. The presence of this option shows that the + * RST was generated by an MPTCP-aware device. + */ + SK_RST_REASON_MPTCP_RST_EUNSPEC, + /** + * @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error. + * An error has been detected in the processing of MPTCP options. + * This is the usual reason code to return in the cases where a RST + * is being sent to close a subflow because of an invalid response. + */ + SK_RST_REASON_MPTCP_RST_EMPTCP, + /** + * @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources. 
+ * This code indicates that the sending host does not have enough + * resources to support the terminated subflow. + */ + SK_RST_REASON_MPTCP_RST_ERESOURCE, + /** + * @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited. + * This code indicates that the requested subflow is prohibited by + * the policies of the sending host. + */ + SK_RST_REASON_MPTCP_RST_EPROHIBIT, + /** + * @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data. + * This code indicates that there is an excessive amount of data + * that needs to be transmitted over the terminated subflow while + * having already been acknowledged over one or more other subflows. + * This may occur if a path has been unavailable for a short period + * and it is more efficient to reset and start again than it is to + * retransmit the queued data. + */ + SK_RST_REASON_MPTCP_RST_EWQ2BIG, + /** + * @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable performance. + * This code indicates that the performance of this subflow was + * too low compared to the other subflows of this Multipath TCP + * connection. + */ + SK_RST_REASON_MPTCP_RST_EBADPERF, + /** + * @SK_RST_REASON_MPTCP_RST_EMIDDLEBOX: Middlebox interference. + * Middlebox interference has been detected over this subflow, + * making MPTCP signaling invalid. For example, this may be sent + * if the checksum does not validate. + */ + SK_RST_REASON_MPTCP_RST_EMIDDLEBOX, + + /** @SK_RST_REASON_ERROR: unexpected error happens */ + SK_RST_REASON_ERROR, + + /** + * @SK_RST_REASON_MAX: Maximum of socket reset reasons. + * It shouldn't be used as a real 'reason'. + */ + SK_RST_REASON_MAX, +}; + +/* Convert skb drop reasons to enum sk_rst_reason type */ +static inline enum sk_rst_reason +sk_rst_convert_drop_reason(enum skb_drop_reason reason) +{ + switch (reason) { + case SKB_DROP_REASON_NOT_SPECIFIED: + return SK_RST_REASON_NOT_SPECIFIED; + case SKB_DROP_REASON_NO_SOCKET: + return SK_RST_REASON_NO_SOCKET; + case SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: + return SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE; + case SKB_DROP_REASON_TCP_RFC7323_PAWS: + return SK_RST_REASON_TCP_RFC7323_PAWS; + case SKB_DROP_REASON_TCP_TOO_OLD_ACK: + return SK_RST_REASON_TCP_TOO_OLD_ACK; + case SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: + return SK_RST_REASON_TCP_ACK_UNSENT_DATA; + case SKB_DROP_REASON_TCP_FLAGS: + return SK_RST_REASON_TCP_FLAGS; + case SKB_DROP_REASON_TCP_OLD_ACK: + return SK_RST_REASON_TCP_OLD_ACK; + case SKB_DROP_REASON_TCP_ABORT_ON_DATA: + return SK_RST_REASON_TCP_ABORT_ON_DATA; + default: + /* If we don't have our own corresponding reason */ + return SK_RST_REASON_NOT_SPECIFIED; + } +} +#endif diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index e2091bb2b3a8..ec65a8cebb99 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -3,6 +3,7 @@ #define __NET_RTNETLINK_H #include <linux/rtnetlink.h> +#include <linux/srcu.h> #include <net/netlink.h> typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, @@ -10,16 +11,56 @@ typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { - RTNL_FLAG_DOIT_UNLOCKED = 1, + RTNL_FLAG_DOIT_UNLOCKED = BIT(0), +#define RTNL_FLAG_DOIT_PERNET RTNL_FLAG_DOIT_UNLOCKED +#define RTNL_FLAG_DOIT_PERNET_WIP RTNL_FLAG_DOIT_UNLOCKED + RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), + RTNL_FLAG_DUMP_UNLOCKED = BIT(2), + RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */ +}; + +enum rtnl_kinds { + RTNL_KIND_NEW, + RTNL_KIND_DEL, 
+ RTNL_KIND_GET, + RTNL_KIND_SET +}; +#define RTNL_KIND_MASK 0x3 + +static inline enum rtnl_kinds rtnl_msgtype_kind(int msgtype) +{ + return msgtype & RTNL_KIND_MASK; +} + +/** + * struct rtnl_msg_handler - rtnetlink message type and handlers + * + * @owner: NULL for built-in, THIS_MODULE for module + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions + */ +struct rtnl_msg_handler { + struct module *owner; + int protocol; + int msgtype; + rtnl_doit_func doit; + rtnl_dumpit_func dumpit; + int flags; }; -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -int rtnl_register_module(struct module *owner, int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -int rtnl_unregister(int protocol, int msgtype); void rtnl_unregister_all(int protocol); +int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n); +void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n); + +#define rtnl_register_many(handlers) \ + __rtnl_register_many(handlers, ARRAY_SIZE(handlers)) +#define rtnl_unregister_many(handlers) \ + __rtnl_unregister_many(handlers, ARRAY_SIZE(handlers)) + static inline int rtnl_msg_family(const struct nlmsghdr *nlh) { if (nlmsg_len(nlh) >= sizeof(struct rtgenmsg)) @@ -29,13 +70,53 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) } /** + * struct rtnl_newlink_params - parameters of rtnl_link_ops::newlink() + * + * @src_net: Source netns of rtnetlink socket + * @link_net: Link netns by IFLA_LINK_NETNSID, NULL if not specified + * @peer_net: Peer netns + * @tb: IFLA_* attributes + * @data: IFLA_INFO_DATA attributes + */ +struct rtnl_newlink_params { + struct net *src_net; + struct net *link_net; + struct net *peer_net; + struct nlattr **tb; + struct nlattr **data; +}; + +/* Get effective link netns from newlink params. Generally, this is link_net + * and falls back to src_net. But for compatibility, a driver may * choose to + * use dev_net(dev) instead. + */ +static inline struct net *rtnl_newlink_link_net(struct rtnl_newlink_params *p) +{ + return p->link_net ? : p->src_net; +} + +/* Get peer netns from newlink params. Fallback to link netns if peer netns is + * not specified explicitly. + */ +static inline struct net *rtnl_newlink_peer_net(struct rtnl_newlink_params *p) +{ + return p->peer_net ? : rtnl_newlink_link_net(p); +} + +/** * struct rtnl_link_ops - rtnetlink link operations * - * @list: Used internally + * @list: Used internally, protected by link_ops_mutex and SRCU + * @srcu: Used internally * @kind: Identifier + * @netns_refund: Physical device, move to init_net on netns exit + * @peer_type: Peer device specific netlink attribute number (e.g. VETH_INFO_PEER) * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters + * @alloc: netdev allocation function, can be %NULL and is then used + * in place of alloc_netdev_mqs(), in this case @priv_size + * and @setup are unused. Returns a netdev or ERR_PTR(). 
* @priv_size: sizeof net_device private space * @setup: net_device setup function * @newlink: Function for configuring and registering a new device @@ -58,22 +139,28 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) */ struct rtnl_link_ops { struct list_head list; + struct srcu_struct srcu; const char *kind; size_t priv_size; + struct net_device *(*alloc)(struct nlattr *tb[], + const char *ifname, + unsigned char name_assign_type, + unsigned int num_tx_queues, + unsigned int num_rx_queues); void (*setup)(struct net_device *dev); + bool netns_refund; + const u16 peer_type; unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); - int (*newlink)(struct net *src_net, - struct net_device *dev, - struct nlattr *tb[], - struct nlattr *data[], + int (*newlink)(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack); int (*changelink)(struct net_device *dev, struct nlattr *tb[], @@ -112,16 +199,14 @@ struct rtnl_link_ops { int *prividx, int attr); }; -int __rtnl_link_register(struct rtnl_link_ops *ops); -void __rtnl_link_unregister(struct rtnl_link_ops *ops); - int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); /** * struct rtnl_af_ops - rtnetlink address family operations * - * @list: Used internally + * @list: Used internally, protected by RTNL and SRCU + * @srcu: Used internally * @family: Address family * @fill_link_af: Function to fill IFLA_AF_SPEC with address family * specific netlink attributes. @@ -134,6 +219,8 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops); */ struct rtnl_af_ops { struct list_head list; + struct srcu_struct srcu; + int family; int (*fill_link_af)(struct sk_buff *skb, @@ -143,16 +230,17 @@ struct rtnl_af_ops { u32 ext_filter_mask); int (*validate_link_af)(const struct net_device *dev, - const struct nlattr *attr); + const struct nlattr *attr, + struct netlink_ext_ack *extack); int (*set_link_af)(struct net_device *dev, - const struct nlattr *attr); - + const struct nlattr *attr, + struct netlink_ext_ack *extack); int (*fill_stats_af)(struct sk_buff *skb, const struct net_device *dev); size_t (*get_stats_af_size)(const struct net_device *dev); }; -void rtnl_af_register(struct rtnl_af_ops *ops); +int rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]); @@ -161,11 +249,12 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack); -int rtnl_delete_link(struct net_device *dev); -int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm); +int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh); +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, + u32 portid, const struct nlmsghdr *nlh); -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, - struct netlink_ext_ack *exterr); +int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, + struct netlink_ext_ack *exterr); struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid); #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d8fd8676fc72..638948be4c50 100644 --- a/include/net/sch_generic.h +++ 
b/include/net/sch_generic.h @@ -19,6 +19,7 @@ #include <net/gen_stats.h> #include <net/rtnetlink.h> #include <net/flow_offload.h> +#include <linux/xarray.h> struct Qdisc_ops; struct qdisc_walker; @@ -36,8 +37,23 @@ struct qdisc_rate_table { enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, + __QDISC_STATE_MISSED, + __QDISC_STATE_DRAINING, }; +enum qdisc_state2_t { + /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. + * Use qdisc_run_begin/end() or qdisc_is_running() instead. + */ + __QDISC_STATE2_RUNNING, +}; + +#define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) +#define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) + +#define QDISC_STATE_NON_EMPTY (QDISC_STATE_MISSED | \ + QDISC_STATE_DRAINING) + struct qdisc_size_table { struct rcu_head rcu; struct list_head list; @@ -89,7 +105,7 @@ struct Qdisc { struct netdev_queue *dev_queue; struct net_rate_estimator __rcu *rate_est; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; int pad; refcount_t refcnt; @@ -99,20 +115,20 @@ struct Qdisc { */ struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; struct qdisc_skb_head q; - struct gnet_stats_basic_packed bstats; - seqcount_t running; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; + int owner; unsigned long state; + unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; spinlock_t busylock ____cacheline_aligned_in_smp; spinlock_t seqlock; - /* for NOLOCK qdisc, true if there are no enqueued skbs */ - bool empty; struct rcu_head rcu; - + netdevice_tracker dev_tracker; + struct lock_class_key root_lock_key; /* private data */ long privdata[] ____cacheline_aligned; }; @@ -124,6 +140,13 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc) refcount_inc(&qdisc->refcnt); } +static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_BUILTIN) + return true; + return refcount_dec_if_one(&qdisc->refcnt); +} + /* Intended to be used by unlocked users, when concurrent qdisc release is * possible. */ @@ -137,11 +160,20 @@ static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) return NULL; } +/* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc + * root_lock section, or provide their own memory barriers -- ordering + * against qdisc_run_begin/end() atomic bit operations. + */ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); - return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; + return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); +} + +static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) +{ + return !(READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY); } static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) @@ -152,32 +184,53 @@ static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) - return READ_ONCE(qdisc->empty); + return nolock_qdisc_is_empty(qdisc); return !READ_ONCE(qdisc->q.qlen); } +/* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with + * the qdisc root lock acquired. 
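For context before the locking dance that follows: the usual caller is qdisc_run() from include/net/pkt_sched.h, which (modulo small differences across trees) looks like this:

static inline void qdisc_run(struct Qdisc *q)
{
	if (qdisc_run_begin(q)) {
		/* only one CPU wins; for NOLOCK qdiscs the losers set
		 * MISSED so the winner reschedules them on run_end()
		 */
		__qdisc_run(q);
		qdisc_run_end(q);
	}
}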
+ */ static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { - if (!spin_trylock(&qdisc->seqlock)) + if (spin_trylock(&qdisc->seqlock)) + return true; + + /* No need to insist if the MISSED flag was already set. + * Note that test_and_set_bit() also gives us memory ordering + * guarantees wrt potential earlier enqueue() and below + * spin_trylock(), both of which are necessary to prevent races + */ + if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state)) return false; - WRITE_ONCE(qdisc->empty, false); - } else if (qdisc_is_running(qdisc)) { - return false; + + /* Try to take the lock again to make sure that we will either + * grab it or the CPU that still has it will see MISSED set + * when testing it in qdisc_run_end() + */ + return spin_trylock(&qdisc->seqlock); } - /* Variant of write_seqcount_begin() telling lockdep a trylock - * was attempted. - */ - raw_write_seqcount_begin(&qdisc->running); - seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); - return true; + return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } static inline void qdisc_run_end(struct Qdisc *qdisc) { - write_seqcount_end(&qdisc->running); - if (qdisc->flags & TCQ_F_NOLOCK) + if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); + + /* spin_unlock() only has store-release semantic. The unlock + * and test_bit() ordering is a store-load ordering, so a full + * memory barrier is needed here. + */ + smp_mb(); + + if (unlikely(test_bit(__QDISC_STATE_MISSED, + &qdisc->state))) + __netif_schedule(qdisc); + } else { + __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + } } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) @@ -187,12 +240,7 @@ static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) { -#ifdef CONFIG_BQL - /* Non-BQL migrated drivers will return 0, too. 
*/ - return dql_avail(&txq->dql); -#else - return 0; -#endif + return netdev_queue_dql_avail(txq); } struct Qdisc_class_ops { @@ -210,7 +258,8 @@ struct Qdisc_class_ops { int (*change)(struct Qdisc *, u32, u32, struct nlattr **, unsigned long *, struct netlink_ext_ack *); - int (*delete)(struct Qdisc *, unsigned long); + int (*delete)(struct Qdisc *, unsigned long, + struct netlink_ext_ack *); void (*walk)(struct Qdisc *, struct qdisc_walker * arg); /* Filter manipulation */ @@ -257,6 +306,8 @@ struct Qdisc_ops { struct netlink_ext_ack *extack); void (*attach)(struct Qdisc *sch); int (*change_tx_queue_len)(struct Qdisc *, unsigned int); + void (*change_real_num_tx)(struct Qdisc *sch, + unsigned int new_real_tx); int (*dump)(struct Qdisc *, struct sk_buff *); int (*dump_stats)(struct Qdisc *, struct gnet_dump *); @@ -271,7 +322,6 @@ struct Qdisc_ops { struct module *owner; }; - struct tcf_result { union { struct { @@ -279,12 +329,6 @@ struct tcf_result { u32 classid; }; const struct tcf_proto *goto_tp; - - /* used in the skb_tc_reinsert function */ - struct { - bool ingress; - struct gnet_stats_queue *qstats; - }; }; }; @@ -306,7 +350,7 @@ struct tcf_proto_ops { int (*change)(struct net *net, struct sk_buff *, struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, - void **, bool, bool, + void **, u32, struct netlink_ext_ack *); int (*delete)(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, @@ -328,6 +372,12 @@ struct tcf_proto_ops { struct nlattr **tca, struct netlink_ext_ack *extack); void (*tmplt_destroy)(void *tmplt_priv); + void (*tmplt_reoffload)(struct tcf_chain *chain, + bool add, + flow_setup_cb_t *cb, + void *cb_priv); + struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, + u32 handle); /* rtnetlink specific */ int (*dump)(struct net*, struct tcf_proto*, void *, @@ -374,6 +424,8 @@ struct tcf_proto { */ spinlock_t lock; bool deleting; + bool counted; + bool usesw; refcount_t refcnt; struct rcu_head rcu; struct hlist_node destroy_ht_node; @@ -387,7 +439,6 @@ struct qdisc_skb_cb { }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; - u16 mru; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); @@ -409,6 +460,7 @@ struct tcf_chain { }; struct tcf_block { + struct xarray ports; /* datapath accessible */ /* Lock protects tcf_block and lifetime-management data of chains * attached to the block (refcnt, action_refcnt, explicitly_created). */ @@ -423,6 +475,7 @@ struct tcf_block { struct flow_block flow_block; struct list_head owner_list; bool keep_dst; + atomic_t useswcnt; atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */ @@ -435,7 +488,8 @@ struct tcf_block { struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. 
*/ }; -#ifdef CONFIG_PROVE_LOCKING +struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index); + static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain) { return lockdep_is_held(&chain->filter_chain_lock); @@ -445,17 +499,6 @@ static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp) { return lockdep_is_held(&tp->lock); } -#else -static inline bool lockdep_tcf_chain_is_locked(struct tcf_block *chain) -{ - return true; -} - -static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp) -{ - return true; -} -#endif /* #ifdef CONFIG_PROVE_LOCKING */ #define tcf_chain_dereference(p, chain) \ rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain)) @@ -471,11 +514,6 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) BUILD_BUG_ON(sizeof(qcb->data) < sz); } -static inline int qdisc_qlen_cpu(const struct Qdisc *q) -{ - return this_cpu_ptr(q->cpu_qstats)->qlen; -} - static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; @@ -520,26 +558,7 @@ static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc) static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) { - return qdisc->dev_queue->qdisc_sleeping; -} - -/* The qdisc root lock is a mechanism by which to top level - * of a qdisc tree can be locked from any qdisc node in the - * forest. This allows changing the configuration of some - * aspect of the qdisc tree while blocking out asynchronous - * qdisc access in the packet processing paths. - * - * It is only legal to do this when the root will not change - * on us. Otherwise we'll potentially lock the wrong qdisc - * root. This is enforced by holding the RTNL semaphore, which - * all users of this lock accessor must do. - */ -static inline spinlock_t *qdisc_root_lock(const struct Qdisc *qdisc) -{ - struct Qdisc *root = qdisc_root(qdisc); - - ASSERT_RTNL(); - return qdisc_lock(root); + return rcu_dereference_rtnl(qdisc->dev_queue->qdisc_sleeping); } static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) @@ -550,32 +569,31 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) return qdisc_lock(root); } -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) -{ - struct Qdisc *root = qdisc_root_sleeping(qdisc); - - ASSERT_RTNL(); - return &root->running; -} - static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) { return qdisc->dev_queue->dev; } -static inline void sch_tree_lock(const struct Qdisc *q) +static inline void sch_tree_lock(struct Qdisc *q) { - spin_lock_bh(qdisc_root_sleeping_lock(q)); + if (q->flags & TCQ_F_MQROOT) + spin_lock_bh(qdisc_lock(q)); + else + spin_lock_bh(qdisc_root_sleeping_lock(q)); } -static inline void sch_tree_unlock(const struct Qdisc *q) +static inline void sch_tree_unlock(struct Qdisc *q) { - spin_unlock_bh(qdisc_root_sleeping_lock(q)); + if (q->flags & TCQ_F_MQROOT) + spin_unlock_bh(qdisc_lock(q)); + else + spin_unlock_bh(qdisc_root_sleeping_lock(q)); } extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; +extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1]; extern struct Qdisc_ops mq_qdisc_ops; extern struct Qdisc_ops noqueue_qdisc_ops; extern const struct Qdisc_ops *default_qdisc_ops; @@ -588,6 +606,7 @@ get_default_qdisc_ops(const struct net_device *dev, int ntx) struct Qdisc_class_common { u32 classid; + unsigned int filter_cnt; struct hlist_node hnode; }; @@ -622,6 +641,31 @@ 
qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) return NULL; } +static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl) +{ + return cl->filter_cnt > 0; +} + +static inline void qdisc_class_get(struct Qdisc_class_common *cl) +{ + unsigned int res; + + if (check_add_overflow(cl->filter_cnt, 1, &res)) + WARN(1, "Qdisc class overflow"); + + cl->filter_cnt = res; +} + +static inline void qdisc_class_put(struct Qdisc_class_common *cl) +{ + unsigned int res; + + if (check_sub_overflow(cl->filter_cnt, 1, &res)) + WARN(1, "Qdisc class underflow"); + + cl->filter_cnt = res; +} + static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid) { u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY; @@ -638,6 +682,8 @@ void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); void qdisc_class_hash_destroy(struct Qdisc_class_hash *); int dev_qdisc_change_tx_queue_len(struct net_device *dev); +void dev_qdisc_change_real_num_tx(struct net_device *dev, + unsigned int new_real_tx); void dev_init_scheduler(struct net_device *dev); void dev_shutdown(struct net_device *dev); void dev_activate(struct net_device *dev); @@ -646,6 +692,7 @@ void dev_deactivate_many(struct list_head *head); struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc); void qdisc_reset(struct Qdisc *qdisc); +void qdisc_destroy(struct Qdisc *qdisc); void qdisc_put(struct Qdisc *qdisc); void qdisc_put_unlocked(struct Qdisc *qdisc); void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len); @@ -673,6 +720,9 @@ qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, { } #endif +void qdisc_offload_query_caps(struct net_device *dev, + enum tc_setup_type type, + void *caps, size_t caps_len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack); @@ -686,7 +736,7 @@ int skb_do_redirect(struct sk_buff *); static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS return skb->tc_at_ingress; #else return false; @@ -745,12 +795,22 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (rcu_access_pointer(txq->qdisc) != txq->qdisc_sleeping) + + if (rcu_access_pointer(txq->qdisc) != + rcu_access_pointer(txq->qdisc_sleeping)) return true; } return false; } +/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */ +static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq) +{ + struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc); + + return qdisc->enqueue == NULL; +} + /* Is the device using the noop qdisc on all queues? 
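 *
 * (A hedged aside, not part of the patch: paired with the new
 *  qdisc_txq_has_no_queue() helper above, a caller could answer the
 *  analogous "noqueue on every queue?" question like this --
 *
 *	bool all_noqueue = true;
 *	unsigned int i;
 *
 *	for (i = 0; i < dev->num_tx_queues; i++)
 *		if (!qdisc_txq_has_no_queue(netdev_get_tx_queue(dev, i)))
 *			all_noqueue = false;
 *
 *  netdev_get_tx_queue() is the existing accessor; the loop itself is
 *  illustrative only.)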
*/ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { @@ -795,18 +855,19 @@ static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - qdisc_calculate_pkt_len(skb, sch); return sch->enqueue(skb, sch, to_free); } -static inline void _bstats_update(struct gnet_stats_basic_packed *bstats, - __u64 bytes, __u32 packets) +static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, + __u64 bytes, __u64 packets) { - bstats->bytes += bytes; - bstats->packets += packets; + u64_stats_update_begin(&bstats->syncp); + u64_stats_add(&bstats->bytes, bytes); + u64_stats_add(&bstats->packets, packets); + u64_stats_update_end(&bstats->syncp); } -static inline void bstats_update(struct gnet_stats_basic_packed *bstats, +static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { _bstats_update(bstats, @@ -814,26 +875,10 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats, skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); } -static inline void _bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, - __u64 bytes, __u32 packets) -{ - u64_stats_update_begin(&bstats->syncp); - _bstats_update(&bstats->bstats, bytes, packets); - u64_stats_update_end(&bstats->syncp); -} - -static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, - const struct sk_buff *skb) -{ - u64_stats_update_begin(&bstats->syncp); - bstats_update(&bstats->bstats, skb); - u64_stats_update_end(&bstats->syncp); -} - static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, const struct sk_buff *skb) { - bstats_cpu_update(this_cpu_ptr(sch->cpu_bstats), skb); + bstats_update(this_cpu_ptr(sch->cpu_bstats), skb); } static inline void qdisc_bstats_update(struct Qdisc *sch, @@ -922,21 +967,12 @@ static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, __u32 *backlog) { struct gnet_stats_queue qstats = { 0 }; - __u32 len = qdisc_qlen_sum(sch); - __gnet_stats_copy_queue(&qstats, sch->cpu_qstats, &sch->qstats, len); - *qlen = qstats.qlen; + gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats); + *qlen = qstats.qlen + qdisc_qlen(sch); *backlog = qstats.backlog; } -static inline void qdisc_tree_flush_backlog(struct Qdisc *sch) -{ - __u32 qlen, backlog; - - qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); - qdisc_tree_reduce_backlog(sch, qlen, backlog); -} - static inline void qdisc_purge_queue(struct Qdisc *sch) { __u32 qlen, backlog; @@ -946,13 +982,6 @@ static inline void qdisc_purge_queue(struct Qdisc *sch) qdisc_tree_reduce_backlog(sch, qlen, backlog); } -static inline void qdisc_skb_head_init(struct qdisc_skb_head *qh) -{ - qh->head = NULL; - qh->tail = NULL; - qh->qlen = 0; -} - static inline void __qdisc_enqueue_tail(struct sk_buff *skb, struct qdisc_skb_head *qh) { @@ -1002,6 +1031,21 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) return skb; } +static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct) +{ + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->gso_skb); + if (skb) { + sch->q.qlen--; + return skb; + } + if (direct) + return __qdisc_dequeue_head(&sch->q); + else + return sch->dequeue(sch); +} + static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); @@ -1014,6 +1058,37 @@ static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) return skb; } +struct tc_skb_cb { 
+ struct qdisc_skb_cb qdisc_cb; + u32 drop_reason; + + u16 zone; /* Only valid if post_ct = true */ + u16 mru; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; +}; + +static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) +{ + struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; + + BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); + return cb; +} + +static inline enum skb_drop_reason +tcf_get_drop_reason(const struct sk_buff *skb) +{ + return tc_skb_cb(skb)->drop_reason; +} + +static inline void tcf_set_drop_reason(const struct sk_buff *skb, + enum skb_drop_reason reason) +{ + tc_skb_cb(skb)->drop_reason = reason; +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ @@ -1143,7 +1218,6 @@ static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh) static inline void qdisc_reset_queue(struct Qdisc *sch) { __qdisc_reset_queue(&sch->q); - sch->qstats.backlog = 0; } static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, @@ -1155,7 +1229,7 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, old = *pold; *pold = new; if (old != NULL) - qdisc_tree_flush_backlog(old); + qdisc_purge_queue(old); sch_tree_unlock(sch); return old; @@ -1185,6 +1259,14 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_DROP; } +static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free, + enum skb_drop_reason reason) +{ + tcf_set_drop_reason(skb, reason); + return qdisc_drop(skb, sch, to_free); +} + static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -1194,24 +1276,11 @@ static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_DROP; } -/* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how - long it will take to send a packet given its size. - */ -static inline u32 qdisc_l2t(struct qdisc_rate_table* rtab, unsigned int pktlen) -{ - int slot = pktlen + rtab->rate.cell_align + rtab->rate.overhead; - if (slot < 0) - slot = 0; - slot >>= rtab->rate.cell_log; - if (slot > 255) - return rtab->data[255]*(slot >> 8) + rtab->data[slot & 0xFF]; - return rtab->data[slot]; -} - struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; u16 overhead; + u16 mpu; u8 linklayer; u8 shift; }; @@ -1221,6 +1290,9 @@ static inline u64 psched_l2t_ns(const struct psched_ratecfg *r, { len += r->overhead; + if (len < r->mpu) + len = r->mpu; + if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift; @@ -1243,24 +1315,39 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, res->rate = min_t(u64, r->rate_bytes_ps, ~0U); res->overhead = r->overhead; + res->mpu = r->mpu; res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } +struct psched_pktrate { + u64 rate_pkts_ps; /* packets per second */ + u32 mult; + u8 shift; +}; + +static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r, + unsigned int pkt_num) +{ + return ((u64)pkt_num * r->mult) >> r->shift; +} + +void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); + /* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. 
* The fast path only needs to access filter list and to update stats */ struct mini_Qdisc { struct tcf_proto *filter_list; struct tcf_block *block; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; - struct rcu_head rcu; + unsigned long rcu_state; }; static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, const struct sk_buff *skb) { - bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb); + bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb); } static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) @@ -1281,9 +1368,15 @@ void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block); -static inline int skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res) +void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx); + +int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)); + +/* Make sure qdisc is no longer in SCHED state. */ +static inline void qdisc_synchronize(const struct Qdisc *q) { - return res->ingress ? netif_receive_skb(skb) : dev_queue_xmit(skb); + while (test_bit(__QDISC_STATE_SCHED, &q->state)) + msleep(1); } #endif diff --git a/include/net/scm.h b/include/net/scm.h index 1ce365f4c256..84c4707e78a5 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -5,10 +5,12 @@ #include <linux/limits.h> #include <linux/net.h> #include <linux/cred.h> +#include <linux/file.h> #include <linux/security.h> #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/sched/signal.h> +#include <net/compat.h> /* Well, we should have at least one descriptor open * to accept passed FDs 8) @@ -21,9 +23,20 @@ struct scm_creds { kgid_t gid; }; +#ifdef CONFIG_UNIX +struct unix_edge; +#endif + struct scm_fp_list { short count; + short count_unix; short max; +#ifdef CONFIG_UNIX + bool inflight; + bool dead; + struct list_head vertices; + struct unix_edge *edges; +#endif struct user_struct *user; struct file *fp[SCM_MAX_FD]; }; @@ -89,57 +102,18 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, return __scm_send(sock, msg, scm); } -#ifdef CONFIG_SECURITY_NETWORK -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ - char *secdata; - u32 seclen; - int err; - - if (test_bit(SOCK_PASSSEC, &sock->flags)) { - err = security_secid_to_secctx(scm->secid, &secdata, &seclen); - - if (!err) { - put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, seclen, secdata); - security_release_secctx(secdata, seclen); - } - } -} -#else -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ } -#endif /* CONFIG_SECURITY_NETWORK */ +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); -static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) +static inline int scm_recv_one_fd(struct file *f, int __user *ufd, + unsigned int flags) { - if (!msg->msg_control) { - if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp) - msg->msg_flags |= MSG_CTRUNC; - scm_destroy(scm); - return; - } - - if (test_bit(SOCK_PASSCRED, &sock->flags)) { - struct user_namespace *current_ns = current_user_ns(); - struct ucred ucreds = { - .pid = scm->creds.pid, - .uid = 
from_kuid_munged(current_ns, scm->creds.uid), - .gid = from_kgid_munged(current_ns, scm->creds.gid), - }; - put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); - } - - scm_destroy_cred(scm); - - scm_passec(sock, msg, scm); - - if (!scm->fp) - return; - - scm_detach_fds(msg, scm); + if (!ufd) + return -EFAULT; + return receive_fd(f, ufd, flags); } - #endif /* __LINUX_NET_SCM_H */ diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 5a9bb09f32b6..654d37ec0402 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -15,8 +15,6 @@ * Dinakaran Joseph * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> - * - * Rewritten to use libcrc32c by: * Vlad Yasevich <vladislav.yasevich@hp.com> */ @@ -24,43 +22,19 @@ #define __sctp_checksum_h__ #include <linux/types.h> -#include <net/sctp/sctp.h> -#include <linux/crc32c.h> -#include <linux/crc32.h> - -static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum) -{ - /* This uses the crypto implementation of crc32c, which is either - * implemented w/ hardware support or resolves to __crc32c_le(). - */ - return (__force __wsum)crc32c((__force __u32)sum, buff, len); -} - -static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - return (__force __wsum)__crc32c_le_combine((__force __u32)csum, - (__force __u32)csum2, len); -} - -static const struct skb_checksum_ops sctp_csum_ops = { - .update = sctp_csum_update, - .combine = sctp_csum_combine, -}; +#include <linux/sctp.h> static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, unsigned int offset) { struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); __le32 old = sh->checksum; - __wsum new; + u32 new; sh->checksum = 0; - new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0, - &sctp_csum_ops); + new = ~skb_crc32c(skb, offset, skb->len - offset, ~0); sh->checksum = old; - - return cpu_to_le32((__force __u32)new); + return cpu_to_le32(new); } #endif /* __sctp_checksum_h__ */ diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index e8df72e1627a..2058fabffbf6 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -59,6 +59,7 @@ enum sctp_verb { SCTP_CMD_HB_TIMERS_START, /* Start the heartbeat timers. */ SCTP_CMD_HB_TIMER_UPDATE, /* Update a heartbeat timers. */ SCTP_CMD_HB_TIMERS_STOP, /* Stop the heartbeat timers. */ + SCTP_CMD_PROBE_TIMER_UPDATE, /* Update a probe timer. */ SCTP_CMD_TRANSPORT_HB_SENT, /* Reset the status of a transport. */ SCTP_CMD_TRANSPORT_IDLE, /* Do manipulations on idle transport */ SCTP_CMD_TRANSPORT_ON, /* Mark the transport as active. */ @@ -68,7 +69,6 @@ enum sctp_verb { SCTP_CMD_ASSOC_FAILED, /* Handle association failure. */ SCTP_CMD_DISCARD_PACKET, /* Discard the whole packet. */ SCTP_CMD_GEN_SHUTDOWN, /* Generate a SHUTDOWN chunk. */ - SCTP_CMD_UPDATE_ASSOC, /* Update association information. */ SCTP_CMD_PURGE_OUTQUEUE, /* Purge all data waiting to be sent. */ SCTP_CMD_SETUP_T2, /* Hi-level, setup T2-shutdown parms. */ SCTP_CMD_RTO_PENDING, /* Set transport's rto_pending. 
*/ diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 122d9e2d8dfd..5859e0a16a58 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -77,6 +77,7 @@ enum sctp_event_timeout { SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, SCTP_EVENT_TIMEOUT_HEARTBEAT, SCTP_EVENT_TIMEOUT_RECONF, + SCTP_EVENT_TIMEOUT_PROBE, SCTP_EVENT_TIMEOUT_SACK, SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; @@ -200,6 +201,23 @@ enum sctp_sock_state { SCTP_SS_CLOSING = TCP_CLOSE_WAIT, }; +enum sctp_plpmtud_state { + SCTP_PL_DISABLED, + SCTP_PL_BASE, + SCTP_PL_SEARCH, + SCTP_PL_COMPLETE, + SCTP_PL_ERROR, +}; + +#define SCTP_BASE_PLPMTU 1200 +#define SCTP_MAX_PLPMTU 9000 +#define SCTP_MIN_PLPMTU 512 + +#define SCTP_MAX_PROBES 3 + +#define SCTP_PL_BIG_STEP 32 +#define SCTP_PL_MIN_STEP 4 + /* These functions map various type to printable names. */ const char *sctp_cname(const union sctp_subtype id); /* chunk types */ const char *sctp_oname(const union sctp_subtype id); /* other events */ @@ -286,6 +304,8 @@ enum { SCTP_MAX_GABS = 16 }; * functions simpler to write. */ +#define SCTP_DEFAULT_UDP_PORT 9899 /* default UDP tunneling port */ + /* These are the values for pf exposure, UNUSED is to keep compatible with old * applications by default. */ @@ -340,8 +360,7 @@ enum { #define SCTP_SCOPE_POLICY_MAX SCTP_SCOPE_POLICY_LINK /* Based on IPv4 scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>, - * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 198.18.0.0/24, - * 192.88.99.0/24. + * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 192.88.99.0/24. * Also, RFC 8.4, non-unicast addresses are not considered valid SCTP * addresses. */ @@ -349,7 +368,6 @@ enum { ((htonl(INADDR_BROADCAST) == a) || \ ipv4_is_multicast(a) || \ ipv4_is_zeronet(a) || \ - ipv4_is_test_198(a) || \ ipv4_is_anycast_6to4(a)) /* Flags used for the bind address copy functions. */ @@ -422,4 +440,6 @@ enum { */ #define SCTP_AUTH_RANDOM_LENGTH 32 +#define SCTP_PROBE_TIMER_MIN 5000 + #endif /* __sctp_constants_h__ */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 4fc747b778eb..e96d1bd087f6 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -28,7 +28,7 @@ #define __net_sctp_h__ /* Header Strategy. - * Start getting some control over the header file depencies: + * Start getting some control over the header file dependencies: * includes * constants * structs @@ -67,11 +67,6 @@ #define SCTP_PROTOSW_FLAG INET_PROTOSW_PERMANENT #endif -/* Round an int up to the next multiple of 4. */ -#define SCTP_PAD4(s) (((s)+3)&~3) -/* Truncate to the previous multiple of 4. */ -#define SCTP_TRUNC4(s) ((s)&~3) - /* * Function declarations. 
*/ @@ -84,6 +79,8 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *addr, struct sctp_pf *sctp_get_pf_specific(sa_family_t family); int sctp_register_pf(struct sctp_pf *, sa_family_t); void sctp_addr_wq_mgmt(struct net *, struct sctp_sockaddr_entry *, int); +int sctp_udp_sock_start(struct net *net); +void sctp_udp_sock_stop(struct net *net); /* * sctp/socket.c @@ -101,21 +98,20 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc); extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); -struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); +struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *); +typedef int (*sctp_callback_t)(struct sctp_endpoint *, struct sctp_transport *, void *); void sctp_transport_walk_start(struct rhashtable_iter *iter); void sctp_transport_walk_stop(struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_next(struct net *net, struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_idx(struct net *net, struct rhashtable_iter *iter, int pos); -int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), - struct net *net, +int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr, void *p); -int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), - int (*cb_done)(struct sctp_transport *, void *), - struct net *net, int *pos, void *p); + const union sctp_addr *paddr, void *p, int dif); +int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, + struct net *net, int *pos, void *p); int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, struct sctp_info *info); @@ -143,6 +139,8 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *, struct sctphdr *, struct sctp_association **, struct sctp_transport **); void sctp_err_finish(struct sock *, struct sctp_transport *); +int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb); +int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb); void sctp_icmp_frag_needed(struct sock *, struct sctp_association *, struct sctp_transport *t, __u32 pmtu); void sctp_icmp_redirect(struct sock *, struct sctp_transport *, @@ -150,8 +148,6 @@ void sctp_icmp_redirect(struct sock *, struct sctp_transport *, void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t); -void sctp_backlog_migrate(struct sctp_association *assoc, - struct sock *oldsk, struct sock *newsk); int sctp_transport_hashtable_init(void); void sctp_transport_hashtable_destroy(void); int sctp_hash_transport(struct sctp_transport *t); @@ -159,10 +155,12 @@ void sctp_unhash_transport(struct sctp_transport *t); struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr); + const union sctp_addr *paddr, + int dif, int sdif); struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr); +bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif); /* * sctp/proc.c @@ -366,8 +364,6 @@ sctp_assoc_to_state(const struct sctp_association *asoc) /* Look up the association by its id. 
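 *
 * (Hedged usage sketch, not from the patch: sockopt handlers typically
 *  resolve an application-supplied id first --
 *
 *	asoc = sctp_id2assoc(sk, params.assoc_id);
 *	if (!asoc)
 *		return -EINVAL;
 *
 *  "params" stands in for whatever option struct the caller copied in.)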
*/ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id); -int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp); - /* A macro to walk a list of skbs. */ #define sctp_skb_for_each(pos, head, tmp) \ skb_queue_walk_safe(head, pos, tmp) @@ -425,11 +421,11 @@ static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk) * the chunk length to indicate when to stop. Make sure * there is room for a param header too. */ -#define sctp_walk_params(pos, chunk, member)\ -_sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) +#define sctp_walk_params(pos, chunk)\ +_sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length)) -#define _sctp_walk_params(pos, chunk, end, member)\ -for (pos.v = chunk->member;\ +#define _sctp_walk_params(pos, chunk, end)\ +for (pos.v = (u8 *)(chunk + 1);\ (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <=\ (void *)chunk + end) &&\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ @@ -452,8 +448,8 @@ for (err = (struct sctp_errhdr *)((void *)chunk_hdr + \ _sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk)) #define _sctp_walk_fwdtsn(pos, chunk, end)\ -for (pos = chunk->subh.fwdtsn_hdr->skip;\ - (void *)pos <= (void *)chunk->subh.fwdtsn_hdr->skip + end - sizeof(struct sctp_fwdtsn_skip);\ +for (pos = (void *)(chunk->subh.fwdtsn_hdr + 1);\ + (void *)pos <= (void *)(chunk->subh.fwdtsn_hdr + 1) + end - sizeof(struct sctp_fwdtsn_skip);\ pos++) /* External references. */ @@ -506,8 +502,8 @@ static inline int sctp_ep_hashfn(struct net *net, __u16 lport) return (net_hash_mix(net) + lport) & (sctp_ep_hashsize - 1); } -#define sctp_for_each_hentry(epb, head) \ - hlist_for_each_entry(epb, head, node) +#define sctp_for_each_hentry(ep, head) \ + hlist_for_each_entry(ep, head, node) /* Is a socket of this style? */ #define sctp_style(sk, style) __sctp_style((sk), (SCTP_SOCKET_##style)) @@ -571,15 +567,19 @@ static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport * /* Calculate max payload size given a MTU, or the total overhead if * given MTU is zero */ -static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp, - __u32 mtu, __u32 extra) +static inline __u32 __sctp_mtu_payload(const struct sctp_sock *sp, + const struct sctp_transport *t, + __u32 mtu, __u32 extra) { __u32 overhead = sizeof(struct sctphdr) + extra; - if (sp) + if (sp) { overhead += sp->pf->af->net_header_len; - else + if (sp->udp_port && (!t || t->encap_port)) + overhead += sizeof(struct udphdr); + } else { overhead += sizeof(struct ipv6hdr); + } if (WARN_ON_ONCE(mtu && mtu <= overhead)) mtu = overhead; @@ -587,6 +587,12 @@ static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp, return mtu ? 
mtu - overhead : overhead; } +static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp, + __u32 mtu, __u32 extra) +{ + return __sctp_mtu_payload(sp, NULL, mtu, extra); +} + static inline __u32 sctp_dst_mtu(const struct dst_entry *dst) { return SCTP_TRUNC4(max_t(__u32, dst_mtu(dst), @@ -610,6 +616,47 @@ static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize) return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize); } +static inline int sctp_transport_pl_hlen(struct sctp_transport *t) +{ + return __sctp_mtu_payload(sctp_sk(t->asoc->base.sk), t, 0, 0) - + sizeof(struct sctphdr); +} + +static inline void sctp_transport_pl_reset(struct sctp_transport *t) +{ + if (t->probe_interval && (t->param_flags & SPP_PMTUD_ENABLE) && + (t->state == SCTP_ACTIVE || t->state == SCTP_UNKNOWN)) { + if (t->pl.state == SCTP_PL_DISABLED) { + t->pl.state = SCTP_PL_BASE; + t->pl.pmtu = SCTP_BASE_PLPMTU; + t->pl.probe_size = SCTP_BASE_PLPMTU; + sctp_transport_reset_probe_timer(t); + } + } else { + if (t->pl.state != SCTP_PL_DISABLED) { + if (timer_delete(&t->probe_timer)) + sctp_transport_put(t); + t->pl.state = SCTP_PL_DISABLED; + } + } +} + +static inline void sctp_transport_pl_update(struct sctp_transport *t) +{ + if (t->pl.state == SCTP_PL_DISABLED) + return; + + t->pl.state = SCTP_PL_BASE; + t->pl.pmtu = SCTP_BASE_PLPMTU; + t->pl.probe_size = SCTP_BASE_PLPMTU; + sctp_transport_reset_probe_timer(t); +} + +static inline bool sctp_transport_pl_enabled(struct sctp_transport *t) +{ + return t->pl.state != SCTP_PL_DISABLED; +} + static inline bool sctp_newsk_ready(const struct sock *sk) { return sock_flag(sk, SOCK_DEAD) || sk->sk_socket; diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 5c491a3bc27e..3bfd261a53cc 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -151,22 +151,19 @@ sctp_state_fn_t sctp_sf_cookie_wait_icmp_abort; /* Prototypes for timeout event state functions. */ sctp_state_fn_t sctp_sf_do_6_3_3_rtx; sctp_state_fn_t sctp_sf_send_reconf; +sctp_state_fn_t sctp_sf_send_probe; sctp_state_fn_t sctp_sf_do_6_2_sack; sctp_state_fn_t sctp_sf_autoclose_timer_expire; /* Prototypes for utility support functions. */ -__u8 sctp_get_chunk_type(struct sctp_chunk *chunk); const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype); -int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, gfp_t gfp); -__u32 sctp_generate_verification_tag(void); -void sctp_populate_tie_tags(__u8 *cookie, __u32 curTag, __u32 hisTag); /* Prototypes for chunk-building functions. 
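 *
 * (Hedged sketch of how the PLPMTUD helpers above are expected to meet
 *  the chunk builders below: on probe-timer expiry the state machine
 *  builds a HEARTBEAT sized to the current probe --
 *
 *	if (sctp_transport_pl_enabled(t))
 *		chunk = sctp_make_heartbeat(asoc, t, t->pl.probe_size);
 *
 *  sctp_make_heartbeat() and sctp_make_pad() are declared just below;
 *  the call site shown is illustrative, not copied from the patch.)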
*/ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, @@ -221,12 +218,17 @@ struct sctp_chunk *sctp_make_violation_paramlen( struct sctp_chunk *sctp_make_violation_max_retrans( const struct sctp_association *asoc, const struct sctp_chunk *chunk); +struct sctp_chunk *sctp_make_new_encap_port( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, - const struct sctp_transport *transport); + const struct sctp_transport *transport, + __u32 probe_size); struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, const size_t paylen); +struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len); struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __be16 cause_code, const void *payload, @@ -307,6 +309,7 @@ int sctp_do_sm(struct net *net, enum sctp_event_type event_type, void sctp_generate_t3_rtx_event(struct timer_list *t); void sctp_generate_heartbeat_event(struct timer_list *t); void sctp_generate_reconf_event(struct timer_list *t); +void sctp_generate_probe_event(struct timer_list *t); void sctp_generate_proto_unreach_event(struct timer_list *t); void sctp_ootb_pkt_free(struct sctp_packet *packet); @@ -377,10 +380,11 @@ sctp_vtag_verify(const struct sctp_chunk *chunk, * Verification Tag value does not match the receiver's own * tag value, the receiver shall silently discard the packet... */ - if (ntohl(chunk->sctp_hdr->vtag) == asoc->c.my_vtag) - return 1; + if (ntohl(chunk->sctp_hdr->vtag) != asoc->c.my_vtag) + return 0; - return 0; + chunk->transport->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; + return 1; } /* Check VTAG of the packet matches the sender's own tag and the T bit is diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h index 01a70b27e026..8034bf5febbe 100644 --- a/include/net/sctp/stream_sched.h +++ b/include/net/sctp/stream_sched.h @@ -26,8 +26,8 @@ struct sctp_sched_ops { int (*init)(struct sctp_stream *stream); /* Init a stream */ int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp); - /* Frees the entire thing */ - void (*free)(struct sctp_stream *stream); + /* free a stream */ + void (*free_sid)(struct sctp_stream *stream, __u16 sid); /* Enqueue a chunk */ void (*enqueue)(struct sctp_outq *q, struct sctp_datamsg *msg); @@ -35,10 +35,10 @@ struct sctp_sched_ops { struct sctp_chunk *(*dequeue)(struct sctp_outq *q); /* Called only if the chunk fit the packet */ void (*dequeue_done)(struct sctp_outq *q, struct sctp_chunk *chunk); - /* Sched all chunks already enqueued */ - void (*sched_all)(struct sctp_stream *steam); - /* Unched all chunks already enqueued */ - void (*unsched_all)(struct sctp_stream *steam); + /* Schedule all chunks already enqueued */ + void (*sched_all)(struct sctp_stream *stream); + /* Unschedule all chunks already enqueued */ + void (*unsched_all)(struct sctp_stream *stream); }; int sctp_sched_set_sched(struct sctp_association *asoc, @@ -58,5 +58,7 @@ void sctp_sched_ops_register(enum sctp_sched_type sched, struct sctp_sched_ops *sched_ops); void sctp_sched_ops_prio_init(void); void sctp_sched_ops_rr_init(void); +void sctp_sched_ops_fc_init(void); +void sctp_sched_ops_wfq_init(void); #endif /* __sctp_stream_sched_h__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 0bdff38eb4bb..1ad7ce71d0a7 100644 --- 
a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -177,6 +177,10 @@ struct sctp_sock { * will be inherited by all new associations. */ __u32 hbinterval; + __u32 probe_interval; + + __be16 udp_port; + __be16 encap_port; /* This is the max_retrans value for new associations. */ __u16 pathmaxrxt; @@ -238,10 +242,7 @@ struct sctp_sock { int do_auto_asconf; }; -static inline struct sctp_sock *sctp_sk(const struct sock *sk) -{ - return (struct sctp_sock *)sk; -} +#define sctp_sk(ptr) container_of_const(ptr, struct sctp_sock, inet.sk) static inline struct sock *sctp_opt2sk(const struct sctp_sock *sp) { @@ -328,7 +329,7 @@ struct sctp_cookie { * the association TCB is re-constructed from the cookie. */ __u32 raw_addr_list_len; - struct sctp_init_chunk peer_init[]; + /* struct sctp_init_chunk peer_init[]; */ }; @@ -382,6 +383,7 @@ struct sctp_sender_hb_info { union sctp_addr daddr; unsigned long sent_at; __u64 hb_nonce; + __u32 probe_size; }; int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, @@ -458,7 +460,7 @@ struct sctp_af { int saddr); void (*from_sk) (union sctp_addr *, struct sock *sk); - void (*from_addr_param) (union sctp_addr *, + bool (*from_addr_param) (union sctp_addr *, union sctp_addr_param *, __be16 port, int iif); int (*to_addr_param) (const union sctp_addr *, @@ -472,6 +474,7 @@ struct sctp_af { int (*available) (union sctp_addr *, struct sctp_sock *); int (*skb_iif) (const struct sk_buff *sk); + int (*skb_sdif)(const struct sk_buff *sk); int (*is_ce) (const struct sk_buff *sk); void (*seq_dump_addr)(struct seq_file *seq, union sctp_addr *addr); @@ -518,7 +521,7 @@ struct sctp_datamsg { refcount_t refcnt; /* When is this message no longer interesting to the peer? */ unsigned long expires_at; - /* Did the messenge fail to send? */ + /* Did the message fail to send? */ int send_error; u8 send_failed:1, can_delay:1, /* should this message be Nagle delayed */ @@ -653,6 +656,7 @@ struct sctp_chunk { data_accepted:1, /* At least 1 chunk accepted */ auth:1, /* IN: was auth'ed | OUT: needs auth */ has_asconf:1, /* IN: have seen an asconf before */ + pmtu_probe:1, /* Used by PLPMTUD, can be set in s HB chunk */ tsn_missing_report:2, /* Data chunk missing counter. */ fast_retransmit:2; /* Is this chunk fast retransmitted? */ }; @@ -771,6 +775,7 @@ struct sctp_transport { /* Reference counting. */ refcount_t refcnt; + __u32 dead:1, /* RTO-Pending : A flag used to track if one of the DATA * chunks sent to this address is currently being * used to compute a RTT. If this flag is 0, @@ -780,7 +785,7 @@ struct sctp_transport { * calculation completes (i.e. the DATA chunk * is SACK'd) clear this flag. */ - __u32 rto_pending:1, + rto_pending:1, /* * hb_sent : a flag that signals that we have a pending @@ -788,7 +793,7 @@ struct sctp_transport { */ hb_sent:1, - /* Is the Path MTU update pending on this tranport */ + /* Is the Path MTU update pending on this transport */ pmtu_pending:1, dst_pending_confirm:1, /* need to confirm neighbour */ @@ -855,6 +860,7 @@ struct sctp_transport { * the destination address every heartbeat interval. */ unsigned long hbinterval; + unsigned long probe_interval; /* SACK delay timeout */ unsigned long sackdelay; @@ -877,6 +883,8 @@ struct sctp_transport { */ unsigned long last_time_ecne_reduced; + __be16 encap_port; + /* This is the max_retrans value for the transport and will * be initialized from the assocs value. This can be changed * using the SCTP_SET_PEER_ADDR_PARAMS socket option. 
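The per-transport probe_interval added above is what arms the PLPMTUD machinery. A minimal userspace sketch, assuming the SCTP_PLPMTUD_PROBE_INTERVAL socket option and struct sctp_probeinterval that accompany this series in the UAPI headers:

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

/* Hedged sketch: enable PLPMTUD toward one peer by giving it a non-zero
 * probe interval (milliseconds). Field names assume the UAPI struct from
 * this series; a zero interval disables probing again.
 */
static int enable_plpmtud(int fd, const struct sockaddr_storage *peer)
{
	struct sctp_probeinterval pi = {
		.spi_assoc_id = 0,	/* assumption: 0 targets future assocs */
		.spi_interval = 10000,
	};

	memcpy(&pi.spi_address, peer, sizeof(pi.spi_address));
	return setsockopt(fd, IPPROTO_SCTP, SCTP_PLPMTUD_PROBE_INTERVAL,
			  &pi, sizeof(pi));
}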
@@ -929,6 +937,9 @@ struct sctp_transport { /* Timer to handle reconf chunk rtx */ struct timer_list reconf_timer; + /* Timer to send a probe HB packet for PLPMTUD */ + struct timer_list probe_timer; + /* Since we're using per-destination retransmission timers * (see above), we're also using per-destination "transmitted" * queues. This probably ought to be a private struct @@ -971,6 +982,14 @@ struct sctp_transport { char cacc_saw_newack; } cacc; + struct { + __u16 pmtu; + __u16 probe_size; + __u16 probe_high; + __u8 probe_count; + __u8 state; + } pl; /* plpmtud related */ + /* 64-bit random number sent with heartbeat. */ __u64 hb_nonce; @@ -988,6 +1007,8 @@ void sctp_transport_free(struct sctp_transport *); void sctp_transport_reset_t3_rtx(struct sctp_transport *); void sctp_transport_reset_hb_timer(struct sctp_transport *); void sctp_transport_reset_reconf_timer(struct sctp_transport *transport); +void sctp_transport_reset_probe_timer(struct sctp_transport *transport); +void sctp_transport_reset_raise_timer(struct sctp_transport *transport); int sctp_transport_hold(struct sctp_transport *); void sctp_transport_put(struct sctp_transport *); void sctp_transport_update_rto(struct sctp_transport *, __u32); @@ -1002,6 +1023,8 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); void sctp_transport_immediate_rtx(struct sctp_transport *); void sctp_transport_dst_release(struct sctp_transport *t); void sctp_transport_dst_confirm(struct sctp_transport *t); +void sctp_transport_pl_send(struct sctp_transport *t); +bool sctp_transport_pl_recv(struct sctp_transport *t); /* This is the structure we use to queue packets as they come into @@ -1097,8 +1120,6 @@ void sctp_outq_free(struct sctp_outq*); void sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); -void sctp_outq_restart(struct sctp_outq *); - void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); @@ -1117,13 +1138,14 @@ static inline void sctp_outq_cork(struct sctp_outq *q) */ struct sctp_input_cb { union { - struct inet_skb_parm h4; + struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; + struct inet6_skb_parm h6; #endif } header; struct sctp_chunk *chunk; struct sctp_af *af; + __be16 encap_port; }; #define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) @@ -1202,7 +1224,7 @@ enum sctp_endpoint_type { }; /* - * A common base class to bridge the implmentation view of a + * A common base class to bridge the implementation view of a * socket (usually listening) endpoint versus an association's * local endpoint. * This common structure is useful for several purposes: @@ -1218,10 +1240,6 @@ enum sctp_endpoint_type { */ struct sctp_ep_common { - /* Fields to help us manage our entries in the hash tables. */ - struct hlist_node node; - int hashent; - /* Runtime type information. What kind of endpoint is this? */ enum sctp_endpoint_type type; @@ -1273,6 +1291,10 @@ struct sctp_endpoint { /* Common substructure for endpoint and association. */ struct sctp_ep_common base; + /* Fields to help us manage our entries in the hash tables. */ + struct hlist_node node; + int hashent; + /* Associations: A list of current associations and mappings * to the data consumers for each association. 
This * may be in the form of a hash table or other @@ -1329,19 +1351,10 @@ struct sctp_endpoint { reconf_enable:1; __u8 strreset_enable; - - /* Security identifiers from incoming (INIT). These are set by - * security_sctp_assoc_request(). These will only be used by - * SCTP TCP type sockets and peeled off connections as they - * cause a new socket to be generated. security_sctp_sk_clone() - * will then plug these into the new socket. - */ - - u32 secid; - u32 peer_secid; + struct rcu_head rcu; }; -/* Recover the outter endpoint structure. */ +/* Recover the outer endpoint structure. */ static inline struct sctp_endpoint *sctp_ep(struct sctp_ep_common *base) { struct sctp_endpoint *ep; @@ -1354,7 +1367,7 @@ static inline struct sctp_endpoint *sctp_ep(struct sctp_ep_common *base) struct sctp_endpoint *sctp_endpoint_new(struct sock *, gfp_t); void sctp_endpoint_free(struct sctp_endpoint *); void sctp_endpoint_put(struct sctp_endpoint *); -void sctp_endpoint_hold(struct sctp_endpoint *); +int sctp_endpoint_hold(struct sctp_endpoint *ep); void sctp_endpoint_add_asoc(struct sctp_endpoint *, struct sctp_association *); struct sctp_association *sctp_endpoint_lookup_assoc( const struct sctp_endpoint *ep, @@ -1362,10 +1375,12 @@ struct sctp_association *sctp_endpoint_lookup_assoc( struct sctp_transport **); bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr); -struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *, - struct net *, const union sctp_addr *); +struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, + struct net *net, + const union sctp_addr *laddr, + int dif, int sdif); bool sctp_has_association(struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr); + const union sctp_addr *paddr, int dif, int sdif); int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, @@ -1393,6 +1408,7 @@ struct sctp_stream_priorities { /* The next stream in line */ struct sctp_stream_out_ext *next; __u16 prio; + __u16 users; }; struct sctp_stream_out_ext { @@ -1409,6 +1425,11 @@ struct sctp_stream_out_ext { struct { struct list_head rr_list; }; + struct { + struct list_head fc_list; + __u32 fc_length; + __u16 fc_weight; + }; }; }; @@ -1455,6 +1476,9 @@ struct sctp_stream { /* The next stream in line */ struct sctp_stream_out_ext *rr_next; }; + struct { + struct list_head fc_list; + }; }; struct sctp_stream_interleave *si; }; @@ -1683,7 +1707,6 @@ struct sctp_association { __u16 ecn_capable:1, /* Can peer do ECN? */ ipv4_address:1, /* Peer understands IPv4 addresses? */ ipv6_address:1, /* Peer understands IPv6 addresses? */ - hostname_address:1, /* Peer understands DNS addresses? */ asconf_capable:1, /* Does peer support ADDIP? */ prsctp_capable:1, /* Can peer do PR-SCTP? */ reconf_capable:1, /* Can peer do RE-CONFIG? */ @@ -1789,6 +1812,9 @@ struct sctp_association { * will be inherited by all new transports. */ unsigned long hbinterval; + unsigned long probe_interval; + + __be16 encap_port; /* This is the max_retrans value for new transports in the * association. @@ -1881,7 +1907,7 @@ struct sctp_association { __u32 rwnd_over; /* Keeps track of rwnd pressure. This happens when we have - * a window, but not recevie buffer (i.e small packets). This one + * a window, but not receive buffer (i.e small packets). This one is released slowly (1 PMTU at a time ). 
*/ __u32 rwnd_press; @@ -1969,7 +1995,7 @@ struct sctp_association { /* ADDIP Section 5.2 Upon reception of an ASCONF Chunk. * - * This is needed to implement itmes E1 - E4 of the updated + * This is needed to implement items E1 - E4 of the updated * spec. Here is the justification: * * Since the peer may bundle multiple ASCONF chunks toward us, @@ -1980,7 +2006,7 @@ struct sctp_association { /* These ASCONF chunks are waiting to be sent. * - * These chunaks can't be pushed to outqueue until receiving + * These chunks can't be pushed to outqueue until receiving * ASCONF_ACK for the previous ASCONF indicated by * addip_last_asconf, so as to guarantee that only one ASCONF * is in flight at any time. @@ -2034,13 +2060,13 @@ struct sctp_association { struct sctp_transport *new_transport; /* SCTP AUTH: list of the endpoint shared keys. These - * keys are provided out of band by the user applicaton + * keys are provided out of band by the user application * and can't change during the lifetime of the association */ struct list_head endpoint_shared_keys; /* SCTP AUTH: - * The current generated assocaition shared key (secret) + * The current generated association shared key (secret) */ struct sctp_auth_bytes *asoc_shared_key; struct sctp_shared_key *shkey; @@ -2075,6 +2101,16 @@ struct sctp_association { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + /* Security identifiers from incoming (INIT). These are set by + * security_sctp_assoc_request(). These will only be used by + * SCTP TCP type sockets and peeled off connections as they + * cause a new socket to be generated. security_sctp_sk_clone() + * will then plug these into the new socket. + */ + + u32 secid; + u32 peer_secid; + struct rcu_head rcu; }; @@ -2086,7 +2122,7 @@ enum { SCTP_ASSOC_EYECATCHER = 0xa550c123, }; -/* Recover the outter association structure. */ +/* Recover the outer association structure. */ static inline struct sctp_association *sctp_assoc(struct sctp_ep_common *base) { struct sctp_association *asoc; @@ -2116,8 +2152,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *, const union sctp_addr *address, const gfp_t gfp, const int peer_state); -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr); void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer); void sctp_assoc_control_transport(struct sctp_association *asoc, diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index 0eaf8650e3b2..60f6641290c3 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -35,8 +35,7 @@ struct sctp_ulpq { }; /* Prototypes. 
*/ -struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *, - struct sctp_association *); +void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc); void sctp_ulpq_flush(struct sctp_ulpq *ulpq); void sctp_ulpq_free(struct sctp_ulpq *); diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index d7d2495f83c2..cddebafb9f77 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -4,8 +4,10 @@ #include <linux/types.h> -u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); -u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, +struct net; + +u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); +u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport); u32 secure_tcp_seq(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport); @@ -14,9 +16,5 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport); u32 secure_tcpv6_ts_off(const struct net *net, const __be32 *saddr, const __be32 *daddr); -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport); #endif /* _NET_SECURE_SEQ */ diff --git a/include/net/seg6.h b/include/net/seg6.h index 9d19c15e8545..82b3fbbcbb93 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -52,15 +52,43 @@ static inline struct seg6_pernet_data *seg6_pernet(struct net *net) extern int seg6_init(void); extern void seg6_exit(void); +#ifdef CONFIG_IPV6_SEG6_LWTUNNEL extern int seg6_iptunnel_init(void); extern void seg6_iptunnel_exit(void); extern int seg6_local_init(void); extern void seg6_local_exit(void); +#else +static inline int seg6_iptunnel_init(void) { return 0; } +static inline void seg6_iptunnel_exit(void) {} +static inline int seg6_local_init(void) { return 0; } +static inline void seg6_local_exit(void) {} +#endif extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced); +extern struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags); +extern void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto); extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id); + +/* If the packet which invoked an ICMP error contains an SRH return + * the true destination address from within the SRH, otherwise use the + * destination address in the IP header. 
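+ *
+ * (Hedged usage sketch, not taken from the patch: the helper returns
+ *  NULL when no SRH is present, so a caller is expected to supply the
+ *  fallback itself, e.g.
+ *
+ *	daddr = seg6_get_daddr(skb, opt) ?: &ipv6_hdr(skb)->daddr;
+ *
+ *  ipv6_hdr() is the existing accessor; the one-liner is illustrative.)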
+ */ +static inline const struct in6_addr *seg6_get_daddr(struct sk_buff *skb, + struct inet6_skb_parm *opt) +{ + struct ipv6_sr_hdr *srh; + + if (opt->flags & IP6SKB_SEG6) { + srh = (struct ipv6_sr_hdr *)(skb->data + opt->srhoff); + return &srh->segments[0]; + } + + return NULL; +} + + #endif diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 2b5d2ee5613e..24f733b3e3fe 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -49,9 +49,16 @@ extern int seg6_hmac_info_del(struct net *net, u32 key); extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr, struct ipv6_sr_hdr *srh); extern bool seg6_hmac_validate_skb(struct sk_buff *skb); +#ifdef CONFIG_IPV6_SEG6_HMAC extern int seg6_hmac_init(void); extern void seg6_hmac_exit(void); extern int seg6_hmac_net_init(struct net *net); extern void seg6_hmac_net_exit(struct net *net); +#else +static inline int seg6_hmac_init(void) { return 0; } +static inline void seg6_hmac_exit(void) {} +static inline int seg6_hmac_net_init(struct net *net) { return 0; } +static inline void seg6_hmac_net_exit(struct net *net) {} +#endif #endif diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h index 3fab9dec2ec4..888c1ce6f527 100644 --- a/include/net/seg6_local.h +++ b/include/net/seg6_local.h @@ -19,6 +19,7 @@ extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, extern bool seg6_bpf_has_valid_srh(struct sk_buff *skb); struct seg6_bpf_srh_state { + local_lock_t bh_lock; struct ipv6_sr_hdr *srh; u16 hdrlen; bool valid; diff --git a/include/net/selftests.h b/include/net/selftests.h new file mode 100644 index 000000000000..e65e8d230d33 --- /dev/null +++ b/include/net/selftests.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_SELFTESTS +#define _NET_SELFTESTS + +#include <linux/ethtool.h> + +#if IS_ENABLED(CONFIG_NET_SELFTESTS) + +void net_selftest(struct net_device *ndev, struct ethtool_test *etest, + u64 *buf); +int net_selftest_get_count(void); +void net_selftest_get_strings(u8 *data); + +#else + +static inline void net_selftest(struct net_device *ndev, struct ethtool_test *etest, + u64 *buf) +{ +} + +static inline int net_selftest_get_count(void) +{ + return 0; +} + +static inline void net_selftest_get_strings(u8 *data) +{ +} + +#endif +#endif /* _NET_SELFTESTS */ diff --git a/include/net/smc.h b/include/net/smc.h index e441aa97ad61..db84e4e35080 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -11,6 +11,14 @@ #ifndef _SMC_H #define _SMC_H +#include <linux/device.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/wait.h> +#include "linux/ism.h" + +struct sock; + #define SMC_MAX_PNETID_LEN 16 /* Max. 
length of PNET id */ struct smc_hashinfo { @@ -18,9 +26,6 @@ struct smc_hashinfo { struct hlist_head ht; }; -int smc_hash_sk(struct sock *sk); -void smc_unhash_sk(struct sock *sk); - /* SMCD/ISM device driver interface */ struct smcd_dmb { u64 dmb_tok; @@ -41,39 +46,43 @@ struct smcd_dmb { #define ISM_ERROR 0xFFFF -struct smcd_event { - u32 type; - u32 code; - u64 tok; - u64 time; - u64 info; -}; - struct smcd_dev; +struct smcd_gid { + u64 gid; + u64 gid_ext; +}; + struct smcd_ops { - int (*query_remote_gid)(struct smcd_dev *dev, u64 rgid, u32 vid_valid, - u32 vid); - int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); + int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid, + u32 vid_valid, u32 vid); + int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, + void *client); int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*set_vlan_required)(struct smcd_dev *dev); - int (*reset_vlan_required)(struct smcd_dev *dev); - int (*signal_event)(struct smcd_dev *dev, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info); int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); - void (*get_system_eid)(struct smcd_dev *dev, u8 **eid); + int (*supports_v2)(void); + void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); u16 (*get_chid)(struct smcd_dev *dev); + struct device* (*get_dev)(struct smcd_dev *dev); + + /* optional operations */ + int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); + int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); + int (*set_vlan_required)(struct smcd_dev *dev); + int (*reset_vlan_required)(struct smcd_dev *dev); + int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, + u32 trigger_irq, u32 event_code, u64 info); + int (*support_dmb_nocopy)(struct smcd_dev *dev); + int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); + int (*detach_dmb)(struct smcd_dev *dev, u64 token); }; struct smcd_dev { const struct smcd_ops *ops; - struct device dev; void *priv; - u64 local_gid; + void *client; struct list_head list; spinlock_t lock; struct smc_connection **conn; @@ -88,11 +97,4 @@ struct smcd_dev { u8 going_away : 1; }; -struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs); -int smcd_register_dev(struct smcd_dev *smcd); -void smcd_unregister_dev(struct smcd_dev *smcd); -void smcd_free_dev(struct smcd_dev *smcd); -void smcd_handle_event(struct smcd_dev *dev, struct smcd_event *event); -void smcd_handle_irq(struct smcd_dev *dev, unsigned int bit); #endif /* _SMC_H */ diff --git a/include/net/snmp.h b/include/net/snmp.h index 468a67836e2f..4cb4326dfebe 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -159,7 +159,7 @@ struct linux_tls_mib { #define __SNMP_ADD_STATS64(mib, field, addend) \ do { \ - __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[field] += addend; \ u64_stats_update_end(&ptr->syncp); \ @@ -176,8 +176,7 @@ struct linux_tls_mib { #define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1) #define __SNMP_UPD_PO_STATS64(mib, basefield, addend) \ do { \ - __typeof__(*mib) *ptr; \ - ptr = raw_cpu_ptr((mib)); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ 
ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ diff --git a/include/net/sock.h b/include/net/sock.h index a5c6ae78df77..4c37015b7cf7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -56,18 +56,19 @@ #include <linux/wait.h> #include <linux/cgroup-defs.h> #include <linux/rbtree.h> -#include <linux/filter.h> #include <linux/rculist_nulls.h> #include <linux/poll.h> #include <linux/sockptr.h> - +#include <linux/indirect_call_wrapper.h> #include <linux/atomic.h> #include <linux/refcount.h> +#include <linux/llist.h> #include <net/dst.h> #include <net/checksum.h> #include <net/tcp_states.h> #include <linux/net_tstamp.h> #include <net/l3mdev.h> +#include <uapi/linux/socket.h> /* * This structure really needs to be cleaned up. @@ -75,19 +76,6 @@ * the other protocols. */ -/* Define this to get the SOCK_DBG debugging facility. */ -#define SOCK_DEBUGGING -#ifdef SOCK_DEBUGGING -#define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \ - printk(KERN_DEBUG msg); } while (0) -#else -/* Validate arguments and do nothing */ -static inline __printf(2, 3) -void SOCK_DEBUG(const struct sock *sk, const char *msg, ...) -{ -} -#endif - /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves. @@ -160,9 +148,6 @@ typedef __u64 __bitwise __addrpair; * for struct sock and struct inet_timewait_sock. */ struct sock_common { - /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned - * address on 64bit arches : cf INET_MATCH() - */ union { __addrpair skc_addrpair; struct { @@ -226,7 +211,7 @@ struct sock_common { struct hlist_nulls_node skc_nulls_node; }; unsigned short skc_tx_queue_mapping; -#ifdef CONFIG_XPS +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING unsigned short skc_rx_queue_mapping; #endif union { @@ -247,6 +232,7 @@ struct sock_common { }; struct bpf_local_storage; +struct sk_filter; /** * struct sock - network layer representation of sockets @@ -258,10 +244,11 @@ struct bpf_local_storage; * @sk_rcvbuf: size of receive buffer in bytes * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux + * @sk_rx_dst_ifindex: ifindex for @sk_rx_dst + * @sk_rx_dst_cookie: cookie for @sk_rx_dst * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy - * @sk_rx_skb_cache: cache copy of recently accessed RX skb * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags @@ -269,6 +256,7 @@ struct bpf_local_storage; * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward + * @sk_reserved_mem: space reserved and non-reclaimable for the socket * @sk_napi_id: id of the last napi context to receive data for sk * @sk_ll_usec: usecs to busypoll when there is no data * @sk_allocation: allocation mode @@ -276,14 +264,10 @@ struct bpf_local_storage; * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes - * @__sk_flags_offset: empty field used to determine location of bitfield - * @sk_padding: unused element for alignment * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities 
(e.g. %NETIF_F_TSO) - * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) - * @sk_route_forced_caps: static, forced route capabilities - * (set in tcp_init_sock()) + * @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden. * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments @@ -301,34 +285,44 @@ struct bpf_local_storage; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_prefer_busy_poll: prefer busypolling over softirq processing + * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family + * @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred * @sk_peer_pid: &struct pid for this socket's peer * @sk_peer_cred: %SO_PEERCRED setting * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit + * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only - * @sk_tsflags: SO_TIMESTAMPING socket options + * @sk_tsflags: SO_TIMESTAMPING flags + * @sk_bpf_cb_flags: used in bpf_setsockopt() + * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. + * Sockets that can be used under memory reclaim should + * set this to false. + * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock + * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals - * @sk_user_data: RPC layer private data + * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] - * @sk_tx_skb_cache: cache copy of recently accessed TX skb * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start + * @sk_disconnects: number of disconnect operations performed on this sock * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available @@ -343,6 +337,16 @@ struct bpf_local_storage; * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags + * @sk_scm_recv_flags: all flags used by scm_recv() + * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS + * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY + * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS + * @sk_scm_unused: unused flags for scm_recv() + * @ns_tracker: tracker for netns reference + * @sk_user_frags: xarray of pages the user is holding a reference on. 
+ * @sk_owner: reference to the real owner of the socket that calls + * sock_lock_init_class_and_name(). */ struct sock { /* @@ -354,7 +358,7 @@ struct sock { #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping -#ifdef CONFIG_XPS +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING #define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping #endif @@ -384,11 +388,11 @@ struct sock { #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash - socket_lock_t sk_lock; + __cacheline_group_begin(sock_write_rx); + atomic_t sk_drops; - int sk_rcvlowat; + __s32 sk_peek_off; struct sk_buff_head sk_error_queue; - struct sk_buff *sk_rx_skb_cache; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with @@ -406,12 +410,21 @@ struct sock { } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc - int sk_forward_alloc; + __cacheline_group_end(sock_write_rx); + + __cacheline_group_begin(sock_read_rx); + /* early demux fields */ + struct dst_entry __rcu *sk_rx_dst; + int sk_rx_dst_ifindex; + u32 sk_rx_dst_cookie; + #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_ll_usec; - /* ===== mostly read cache line ===== */ unsigned int sk_napi_id; + u16 sk_busy_poll_budget; + u8 sk_prefer_busy_poll; #endif + u8 sk_userlocks; int sk_rcvbuf; struct sk_filter __rcu *sk_filter; @@ -421,15 +434,33 @@ struct sock { struct socket_wq *sk_wq_raw; /* public: */ }; + + void (*sk_data_ready)(struct sock *sk); + long sk_rcvtimeo; + int sk_rcvlowat; + __cacheline_group_end(sock_read_rx); + + __cacheline_group_begin(sock_read_rxtx); + int sk_err; + struct socket *sk_socket; + struct mem_cgroup *sk_memcg; #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct dst_entry *sk_rx_dst; - struct dst_entry __rcu *sk_dst_cache; + __cacheline_group_end(sock_read_rxtx); + + __cacheline_group_begin(sock_write_rxtx); + socket_lock_t sk_lock; + u32 sk_reserved_mem; + int sk_forward_alloc; + u32 sk_tsflags; + __cacheline_group_end(sock_write_rxtx); + + __cacheline_group_begin(sock_write_tx); + int sk_write_pending; atomic_t sk_omem_alloc; int sk_sndbuf; - /* ===== cache line for TX ===== */ int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; @@ -437,89 +468,112 @@ struct sock { struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; - struct sk_buff *sk_tx_skb_cache; struct sk_buff_head sk_write_queue; - __s32 sk_peek_off; - int sk_write_pending; - __u32 sk_dst_pending_confirm; + u32 sk_dst_pending_confirm; u32 sk_pacing_status; /* see enum sk_pacing */ - long sk_sndtimeo; + struct page_frag sk_frag; struct timer_list sk_timer; - __u32 sk_priority; - __u32 sk_mark; + unsigned long sk_pacing_rate; /* bytes per second */ + atomic_t sk_zckey; + atomic_t sk_tskey; + __cacheline_group_end(sock_write_tx); + + __cacheline_group_begin(sock_read_tx); unsigned long sk_max_pacing_rate; - struct page_frag sk_frag; + long sk_sndtimeo; + u32 sk_priority; + u32 sk_mark; + struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; - netdev_features_t sk_route_nocaps; - netdev_features_t sk_route_forced_caps; - int sk_gso_type; +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); +#endif + u16 sk_gso_type; + u16 sk_gso_max_segs; unsigned int sk_gso_max_size; gfp_t sk_allocation; - __u32 sk_txhash; + u32 sk_txhash; + u8 sk_pacing_shift; + bool sk_use_task_frag; + 
__cacheline_group_end(sock_read_tx); /* * Because of non atomicity rules, all * changes are protected by socket lock. */ - u8 sk_padding : 1, + u8 sk_gso_disabled : 1, sk_kern_sock : 1, sk_no_check_tx : 1, - sk_no_check_rx : 1, - sk_userlocks : 4; - u8 sk_pacing_shift; + sk_no_check_rx : 1; + u8 sk_shutdown; u16 sk_type; u16 sk_protocol; - u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; - int sk_err, - sk_err_soft; + int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; + spinlock_t sk_peer_lock; + int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; - long sk_rcvtimeo; + ktime_t sk_stamp; #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif - u16 sk_tsflags; - u8 sk_shutdown; - u32 sk_tskey; - atomic_t sk_zckey; + int sk_disconnects; + union { + u8 sk_txrehash; + u8 sk_scm_recv_flags; + struct { + u8 sk_scm_credentials : 1, + sk_scm_security : 1, + sk_scm_pidfd : 1, + sk_scm_rights : 1, + sk_scm_unused : 4; + }; + }; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG)) + u8 sk_bpf_cb_flags; - struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY void *sk_security; #endif struct sock_cgroup_data sk_cgrp_data; - struct mem_cgroup *sk_memcg; void (*sk_state_change)(struct sock *sk); - void (*sk_data_ready)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); -#ifdef CONFIG_SOCK_VALIDATE_XMIT - struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, - struct net_device *dev, - struct sk_buff *skb); -#endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct rcu_head sk_rcu; + netns_tracker ns_tracker; + struct xarray sk_user_frags; + +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) + struct module *sk_owner; +#endif +}; + +struct sock_bh_locked { + struct sock *sock; + local_lock_t bh_lock; }; enum sk_pacing { @@ -528,14 +582,26 @@ enum sk_pacing { SK_PACING_FQ = 2, }; -/* Pointer stored in sk_user_data might not be suitable for copying - * when cloning the socket. For instance, it can point to a reference - * counted object. sk_user_data bottom bit is set if pointer must not - * be copied. +/* flag bits in sk_user_data + * + * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might + * not be suitable for copying when cloning the socket. For instance, + * it can point to a reference counted object. sk_user_data bottom + * bit is set if pointer must not be copied. + * + * - SK_USER_DATA_BPF: Mark whether sk_user_data field is + * managed/owned by a BPF reuseport array. This bit should be set + * when sk_user_data's sk is added to the bpf's reuseport_array. + * + * - SK_USER_DATA_PSOCK: Mark whether pointer stored in + * sk_user_data points to psock type. This bit should be set + * when sk_user_data is assigned to a psock object. 
 */
#define SK_USER_DATA_NOCOPY	1UL
-#define SK_USER_DATA_BPF	2UL	/* Managed by BPF */
-#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF)
+#define SK_USER_DATA_BPF	2UL
+#define SK_USER_DATA_PSOCK	4UL
+#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
+				  SK_USER_DATA_PSOCK)

/**
 * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
 * @sk: socket
@@ -548,24 +614,77 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)

#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))

+/**
+ * __locked_read_sk_user_data_with_flags - return the pointer
+ *	only if all of the argument flags are set in sk_user_data;
+ *	otherwise return NULL
+ *
+ * @sk: socket
+ * @flags: flag bits
+ *
+ * The caller must be holding sk->sk_callback_lock.
+ */
+static inline void *
+__locked_read_sk_user_data_with_flags(const struct sock *sk,
+				      uintptr_t flags)
+{
+	uintptr_t sk_user_data =
+		(uintptr_t)rcu_dereference_check(__sk_user_data(sk),
+						 lockdep_is_held(&sk->sk_callback_lock));
+
+	WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);
+
+	if ((sk_user_data & flags) == flags)
+		return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+	return NULL;
+}
+
+/**
+ * __rcu_dereference_sk_user_data_with_flags - return the pointer
+ *	only if all of the argument flags are set in sk_user_data;
+ *	otherwise return NULL
+ *
+ * @sk: socket
+ * @flags: flag bits
+ */
+static inline void *
+__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
+					  uintptr_t flags)
+{
+	uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk));
+
+	WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);
+
+	if ((sk_user_data & flags) == flags)
+		return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+	return NULL;
+}
+
 #define rcu_dereference_sk_user_data(sk)				\
+	__rcu_dereference_sk_user_data_with_flags(sk, 0)
+#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags)		\
 ({									\
-	void *__tmp = rcu_dereference(__sk_user_data((sk)));		\
-	(void *)((uintptr_t)__tmp & SK_USER_DATA_PTRMASK);		\
-})
-#define rcu_assign_sk_user_data(sk, ptr)				\
-({									\
-	uintptr_t __tmp = (uintptr_t)(ptr);				\
-	WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK);			\
-	rcu_assign_pointer(__sk_user_data((sk)), __tmp);		\
-})
-#define rcu_assign_sk_user_data_nocopy(sk, ptr)				\
-({									\
-	uintptr_t __tmp = (uintptr_t)(ptr);				\
-	WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK);			\
+	uintptr_t __tmp1 = (uintptr_t)(ptr),				\
+		  __tmp2 = (uintptr_t)(flags);				\
+	WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK);			\
+	WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK);			\
	rcu_assign_pointer(__sk_user_data((sk)),			\
-			   __tmp | SK_USER_DATA_NOCOPY);		\
+			   __tmp1 | __tmp2);				\
})
+#define rcu_assign_sk_user_data(sk, ptr)				\
+	__rcu_assign_sk_user_data_with_flags(sk, ptr, 0)
+
+static inline
+struct net *sock_net(const struct sock *sk)
+{
+	return read_pnet(&sk->sk_net);
+}
+
+static inline
+void sock_net_set(struct sock *sk, struct net *net)
+{
+	write_pnet(&sk->sk_net, net);
+}

/*
 * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
@@ -580,7 +699,7 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)

int sk_set_peek_off(struct sock *sk, int val);

-static inline int sk_peek_offset(struct sock *sk, int flags)
+static inline int sk_peek_offset(const struct sock *sk, int flags)
{
	if (unlikely(flags & MSG_PEEK)) {
		return READ_ONCE(sk->sk_peek_off);
@@ -660,11 +779,6 @@ static inline void sk_node_init(struct hlist_node *node)
	node->pprev = NULL;
}

-static inline void sk_nulls_node_init(struct hlist_nulls_node *node)
-{
-	node->pprev
= NULL; -} - static inline void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); @@ -805,6 +919,8 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) +#define sk_for_each_bound_safe(__sk, tmp, list) \ + hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset @@ -820,7 +936,7 @@ static inline void sk_add_bind_node(struct sock *sk, ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ pos = rcu_dereference(hlist_next_rcu(pos))) -static inline struct user_namespace *sk_user_ns(struct sock *sk) +static inline struct user_namespace *sk_user_ns(const struct sock *sk) { /* Careful only use this in a context where these parameters * can not change and must all be valid, such as recvmsg from @@ -861,11 +977,20 @@ enum sock_flags { SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ + SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ + SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ + SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) +/* + * The highest bit of sk_tsflags is reserved for kernel-internal + * SOCKCM_FLAG_TS_OPT_ID. There is a check in core/sock.c to control that + * SOF_TIMESTAMPING* values do not reach this reserved area + */ +#define SOCKCM_FLAG_TS_OPT_ID BIT(31) -static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) +static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { nsk->sk_flags = osk->sk_flags; } @@ -928,6 +1053,10 @@ static inline void sk_acceptq_added(struct sock *sk) WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } +/* Note: If you think the test should be: + * return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog); + * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.") + */ static inline bool sk_acceptq_is_full(const struct sock *sk) { return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); @@ -951,6 +1080,12 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val) WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } +static inline void sk_forward_alloc_add(struct sock *sk, int val) +{ + /* Paired with lockless reads of sk->sk_forward_alloc */ + WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val); +} + void sk_stream_write_space(struct sock *sk); /* OOB backlog add */ @@ -1002,12 +1137,18 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); +INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)); +INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)); + static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { if (sk_memalloc_socks() && skb_pfmemalloc(skb)) return __sk_backlog_rcv(sk, skb); - return sk->sk_backlog_rcv(sk, skb); + return INDIRECT_CALL_INET(sk->sk_backlog_rcv, + tcp_v6_do_rcv, + tcp_v4_do_rcv, + sk, skb); } static inline void sk_incoming_cpu_update(struct sock *sk) @@ -1018,56 +1159,29 @@ static inline void sk_incoming_cpu_update(struct sock *sk) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } -static inline 
void sock_rps_record_flow_hash(__u32 hash)
-{
-#ifdef CONFIG_RPS
-	struct rps_sock_flow_table *sock_flow_table;
-
-	rcu_read_lock();
-	sock_flow_table = rcu_dereference(rps_sock_flow_table);
-	rps_record_sock_flow(sock_flow_table, hash);
-	rcu_read_unlock();
-#endif
-}
-
-static inline void sock_rps_record_flow(const struct sock *sk)
-{
-#ifdef CONFIG_RPS
-	if (static_branch_unlikely(&rfs_needed)) {
-		/* Reading sk->sk_rxhash might incur an expensive cache line
-		 * miss.
-		 *
-		 * TCP_ESTABLISHED does cover almost all states where RFS
-		 * might be useful, and is cheaper [1] than testing :
-		 *	IPv4: inet_sk(sk)->inet_daddr
-		 *	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
-		 *	OR	an additional socket flag
-		 * [1] : sk_state and sk_prot are in the same cache line.
-		 */
-		if (sk->sk_state == TCP_ESTABLISHED)
-			sock_rps_record_flow_hash(sk->sk_rxhash);
-	}
-#endif
-}

static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_RPS
-	if (unlikely(sk->sk_rxhash != skb->hash))
-		sk->sk_rxhash = skb->hash;
+	/* The following WRITE_ONCE() is paired with the READ_ONCE()
+	 * here, and another one in sock_rps_record_flow().
+	 */
+	if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash))
+		WRITE_ONCE(sk->sk_rxhash, skb->hash);
#endif
}

static inline void sock_rps_reset_rxhash(struct sock *sk)
{
#ifdef CONFIG_RPS
-	sk->sk_rxhash = 0;
+	/* Paired with READ_ONCE() in sock_rps_record_flow() */
+	WRITE_ONCE(sk->sk_rxhash, 0);
#endif
}

#define sk_wait_event(__sk, __timeo, __condition, __wait)		\
-	({	int __rc;						\
+	({	int __rc, __dis = __sk->sk_disconnects;			\
		release_sock(__sk);					\
		__rc = __condition;					\
		if (!__rc) {						\
			*(__timeo) = wait_woken(__wait,			\
@@ -1077,7 +1191,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
		}							\
		sched_annotate_sleep();					\
		lock_sock(__sk);					\
-		__rc = __condition;					\
+		__rc = __dis == __sk->sk_disconnects ? __condition : -EPIPE; \
		__rc;							\
	})

@@ -1108,6 +1222,7 @@ struct inet_hashinfo;
struct raw_hashinfo;
struct smc_hashinfo;
struct module;
+struct sk_psock;

/*
 * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes
@@ -1121,6 +1236,13 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size)
	       size - offsetof(struct sock, sk_node.pprev));
}

+struct proto_accept_arg {
+	int flags;
+	int err;
+	int is_empty;
+	bool kern;
+};
+
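The accept() signature change just below bundles what used to be three separate parameters (flags, an error pointer, and kern) into struct proto_accept_arg, so later fields such as is_empty can be added without another tree-wide signature change. A rough sketch of the new calling convention only; my_proto_accept and my_wait_for_connection are invented names, not part of this diff:

static struct sock *my_proto_accept(struct sock *sk,
				    struct proto_accept_arg *arg)
{
	/* arg->flags carries the accept4() flags; errors now travel
	 * back through arg->err instead of a separate int *err.
	 */
	struct sock *newsk = my_wait_for_connection(sk, arg->flags & O_NONBLOCK);

	if (!newsk)
		arg->err = -EAGAIN;
	return newsk;
}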
/* Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 */
@@ -1135,11 +1257,11 @@ struct proto {
			int		addr_len);
	int		(*disconnect)(struct sock *sk, int flags);

-	struct sock *	(*accept)(struct sock *sk, int flags, int *err,
-				  bool kern);
+	struct sock *	(*accept)(struct sock *sk,
+				  struct proto_accept_arg *arg);

	int		(*ioctl)(struct sock *sk, int cmd,
-				 unsigned long arg);
+				 int *karg);
	int		(*init)(struct sock *sk);
	void		(*destroy)(struct sock *sk);
	void		(*shutdown)(struct sock *sk, int how);
@@ -1157,10 +1279,8 @@ struct proto {
	int		(*sendmsg)(struct sock *sk, struct msghdr *msg,
				   size_t len);
	int		(*recvmsg)(struct sock *sk, struct msghdr *msg,
-				   size_t len, int noblock, int flags,
-				   int *addr_len);
-	int		(*sendpage)(struct sock *sk, struct page *page,
-				    int offset, size_t size, int flags);
+				   size_t len, int flags, int *addr_len);
+	void		(*splice_eof)(struct socket *sock);
	int		(*bind)(struct sock *sk,
				struct sockaddr *addr, int addr_len);
	int		(*bind_add)(struct sock *sk,
@@ -1168,6 +1288,8 @@ struct proto {

	int		(*backlog_rcv) (struct sock *sk,
					struct sk_buff *skb);
+	bool		(*bpf_bypass_getsockopt)(int level,
+						 int optname);

	void		(*release_cb)(struct sock *sk);

@@ -1176,6 +1298,12 @@ struct proto {
	void		(*unhash)(struct sock *sk);
	void		(*rehash)(struct sock *sk);
	int		(*get_port)(struct sock *sk, unsigned short snum);
+	void		(*put_port)(struct sock *sk);
+#ifdef CONFIG_BPF_SYSCALL
+	int		(*psock_update_sk_prot)(struct sock *sk,
+						struct sk_psock *psock,
+						bool restore);
+#endif

	/* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
@@ -1183,15 +1311,18 @@ struct proto {
#endif

	bool		(*stream_memory_free)(const struct sock *sk, int wake);
-	bool		(*stream_memory_read)(const struct sock *sk);
+	bool		(*sock_is_readable)(struct sock *sk);
	/* Memory pressure */
	void		(*enter_memory_pressure)(struct sock *sk);
	void		(*leave_memory_pressure)(struct sock *sk);
	atomic_long_t	*memory_allocated;	/* Current allocated memory. */
+	int  __percpu	*per_cpu_fw_alloc;
	struct percpu_counter *sockets_allocated;	/* Current number of sockets. */
+
	/*
	 * Pressure flag: try to collapse.
	 * Technical note: it is used by multiple contexts non-atomically.
+	 * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes.
	 * All the __sk_mem_schedule() is of this nature: accounting
	 * is strict, actions are advisory and have some latency.
*/ @@ -1208,11 +1339,12 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; + unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ - struct percpu_counter *orphan_count; + unsigned int __percpu *orphan_count; struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; @@ -1229,9 +1361,6 @@ struct proto { char name[32]; struct list_head node; -#ifdef SOCK_REFCNT_DEBUG - atomic_t socks; -#endif int (*diag_destroy)(struct sock *sk, int err); } __randomize_layout; @@ -1239,30 +1368,7 @@ int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); int sock_load_diag_module(int family, int protocol); -#ifdef SOCK_REFCNT_DEBUG -static inline void sk_refcnt_debug_inc(struct sock *sk) -{ - atomic_inc(&sk->sk_prot->socks); -} - -static inline void sk_refcnt_debug_dec(struct sock *sk) -{ - atomic_dec(&sk->sk_prot->socks); - printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", - sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); -} - -static inline void sk_refcnt_debug_release(const struct sock *sk) -{ - if (refcount_read(&sk->sk_refcnt) != 1) - printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", - sk->sk_prot->name, sk, refcount_read(&sk->sk_refcnt)); -} -#else /* SOCK_REFCNT_DEBUG */ -#define sk_refcnt_debug_inc(sk) do { } while (0) -#define sk_refcnt_debug_dec(sk) do { } while (0) -#define sk_refcnt_debug_release(sk) do { } while (0) -#endif /* SOCK_REFCNT_DEBUG */ +INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { @@ -1270,7 +1376,8 @@ static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) return false; return sk->sk_prot->stream_memory_free ? 
- sk->sk_prot->stream_memory_free(sk, wake) : true; + INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free, + tcp_stream_memory_free, sk, wake) : true; } static inline bool sk_stream_memory_free(const struct sock *sk) @@ -1300,49 +1407,18 @@ static inline int sk_under_cgroup_hierarchy(struct sock *sk, #endif } -static inline bool sk_has_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure != NULL; -} - -static inline bool sk_under_memory_pressure(const struct sock *sk) -{ - if (!sk->sk_prot->memory_pressure) - return false; - - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) - return true; - - return !!*sk->sk_prot->memory_pressure; -} - -static inline long -sk_memory_allocated(const struct sock *sk) -{ - return atomic_long_read(sk->sk_prot->memory_allocated); -} - -static inline long -sk_memory_allocated_add(struct sock *sk, int amt) -{ - return atomic_long_add_return(amt, sk->sk_prot->memory_allocated); -} - -static inline void -sk_memory_allocated_sub(struct sock *sk, int amt) -{ - atomic_long_sub(amt, sk->sk_prot->memory_allocated); -} +#define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) { - percpu_counter_dec(sk->sk_prot->sockets_allocated); + percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1, + SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline void sk_sockets_allocated_inc(struct sock *sk) { - percpu_counter_inc(sk->sk_prot->sockets_allocated); + percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1, + SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline u64 @@ -1357,29 +1433,33 @@ proto_sockets_allocated_sum_positive(struct proto *prot) return percpu_counter_sum_positive(prot->sockets_allocated); } -static inline long -proto_memory_allocated(struct proto *prot) +#ifdef CONFIG_PROC_FS +#define PROTO_INUSE_NR 64 /* should be enough for the first time */ +struct prot_inuse { + int all; + int val[PROTO_INUSE_NR]; +}; + +static inline void sock_prot_inuse_add(const struct net *net, + const struct proto *prot, int val) { - return atomic_long_read(prot->memory_allocated); + this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } -static inline bool -proto_memory_pressure(struct proto *prot) +static inline void sock_inuse_add(const struct net *net, int val) { - if (!prot->memory_pressure) - return false; - return !!*prot->memory_pressure; + this_cpu_add(net->core.prot_inuse->all, val); } - -#ifdef CONFIG_PROC_FS -/* Called with local bh disabled */ -void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc); int sock_prot_inuse_get(struct net *net, struct proto *proto); int sock_inuse_get(struct net *net); #else -static inline void sock_prot_inuse_add(struct net *net, struct proto *prot, - int inc) +static inline void sock_prot_inuse_add(const struct net *net, + const struct proto *prot, int val) +{ +} + +static inline void sock_inuse_add(const struct net *net, int val) { } #endif @@ -1404,8 +1484,6 @@ static inline int __sk_prot_rehash(struct sock *sk) #define RCV_SHUTDOWN 1 #define SEND_SHUTDOWN 2 -#define SOCK_SNDBUF_LOCK 1 -#define SOCK_RCVBUF_LOCK 2 #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 @@ -1432,30 +1510,18 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind); void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); -/* We used to have PAGE_SIZE here, but systems with 64KB pages - * do not necessarily have 16x time more memory than 4KB ones. 
- */
-#define SK_MEM_QUANTUM 4096
-#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
#define SK_MEM_SEND	0
#define SK_MEM_RECV	1

-/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
+/* sysctl_mem values are in pages */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
-	long val = sk->sk_prot->sysctl_mem[index];
-
-#if PAGE_SIZE > SK_MEM_QUANTUM
-	val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
-#elif PAGE_SIZE < SK_MEM_QUANTUM
-	val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
-#endif
-	return val;
+	return READ_ONCE(sk->sk_prot->sysctl_mem[index]);
}

static inline int sk_mem_pages(int amt)
{
-	return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
+	return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

static inline bool sk_has_account(struct sock *sk)
@@ -1466,87 +1532,108 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
+	int delta;
+
	if (!sk_has_account(sk))
		return true;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_SEND);
+	delta = size - sk->sk_forward_alloc;
+	return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND);
}

static inline bool
-sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
+__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
{
+	int delta;
+
	if (!sk_has_account(sk))
		return true;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_RECV) ||
-		skb_pfmemalloc(skb);
+	delta = size - sk->sk_forward_alloc;
+	return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
+	       pfmemalloc;
+}
+
+static inline bool
+sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
+{
+	return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb));
+}
+
+static inline int sk_unused_reserved_mem(const struct sock *sk)
+{
+	int unused_mem;
+
+	if (likely(!sk->sk_reserved_mem))
+		return 0;
+
+	unused_mem = sk->sk_reserved_mem - sk->sk_wmem_queued -
+		     atomic_read(&sk->sk_rmem_alloc);
+
+	return unused_mem > 0 ? unused_mem : 0;
}

static inline void sk_mem_reclaim(struct sock *sk)
{
+	int reclaimable;
+
	if (!sk_has_account(sk))
		return;
-	if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
-		__sk_mem_reclaim(sk, sk->sk_forward_alloc);
+
+	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
+
+	if (reclaimable >= (int)PAGE_SIZE)
+		__sk_mem_reclaim(sk, reclaimable);
}

-static inline void sk_mem_reclaim_partial(struct sock *sk)
+static inline void sk_mem_reclaim_final(struct sock *sk)
{
-	if (!sk_has_account(sk))
-		return;
-	if (sk->sk_forward_alloc > SK_MEM_QUANTUM)
-		__sk_mem_reclaim(sk, sk->sk_forward_alloc - 1);
+	sk->sk_reserved_mem = 0;
+	sk_mem_reclaim(sk);
}

static inline void sk_mem_charge(struct sock *sk, int size)
{
	if (!sk_has_account(sk))
		return;
-	sk->sk_forward_alloc -= size;
+	sk_forward_alloc_add(sk, -size);
}

static inline void sk_mem_uncharge(struct sock *sk, int size)
{
	if (!sk_has_account(sk))
		return;
-	sk->sk_forward_alloc += size;
+	sk_forward_alloc_add(sk, size);
+	sk_mem_reclaim(sk);
+}

-	/* Avoid a possible overflow.
-	 * TCP send queues can make this happen, if sk_mem_reclaim()
-	 * is not called and more than 2 GBytes are released at once.
-	 *
-	 * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
-	 * no need to hold that much forward allocation anyway.
-	 */
-	if (unlikely(sk->sk_forward_alloc >= 1 << 21))
-		__sk_mem_reclaim(sk, 1 << 20);
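Taken together, the hunks above move forward-allocation accounting from SK_MEM_QUANTUM chunks to plain pages and route every sk_forward_alloc update through sk_forward_alloc_add(). A sketch of the intended schedule-then-charge order on the receive side; my_queue_rcv_skb is an invented helper used only for illustration, and it assumes the caller already holds the socket lock:

static int my_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	/* Reserve forward allocation first; the delta-based
	 * __sk_mem_schedule() call is hidden inside this helper.
	 */
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOMEM;

	skb_set_owner_r(skb, sk);	/* charges skb->truesize */
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	return 0;
}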
+#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
+static inline void sk_owner_set(struct sock *sk, struct module *owner)
+{
+	__module_get(owner);
+	sk->sk_owner = owner;
}

-DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
-static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
+static inline void sk_owner_clear(struct sock *sk)
{
-	sk_wmem_queued_add(sk, -skb->truesize);
-	sk_mem_uncharge(sk, skb->truesize);
-	if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
-	    !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
-		skb_ext_reset(skb);
-		skb_zcopy_clear(skb, true);
-		sk->sk_tx_skb_cache = skb;
-		return;
-	}
-	__kfree_skb(skb);
+	sk->sk_owner = NULL;
}

-static inline void sock_release_ownership(struct sock *sk)
+static inline void sk_owner_put(struct sock *sk)
+{
+	module_put(sk->sk_owner);
+}
+#else
+static inline void sk_owner_set(struct sock *sk, struct module *owner)
{
-	if (sk->sk_lock.owned) {
-		sk->sk_lock.owned = 0;
+}

-		/* The sk_lock has mutex_unlock() semantics: */
-		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
-	}
+static inline void sk_owner_clear(struct sock *sk)
+{
}

+static inline void sk_owner_put(struct sock *sk)
+{
+}
+#endif
/*
 * Macro so as to not evaluate some arguments when
 * lockdep is not enabled.
@@ -1556,23 +1643,22 @@ static inline void sock_release_ownership(struct sock *sk)
 */
#define sock_lock_init_class_and_name(sk, sname, skey, name, key)	\
do {									\
+	sk_owner_set(sk, THIS_MODULE);					\
	sk->sk_lock.owned = 0;						\
	init_waitqueue_head(&sk->sk_lock.wq);				\
	spin_lock_init(&(sk)->sk_lock.slock);				\
	debug_check_no_locks_freed((void *)&(sk)->sk_lock,		\
-			sizeof((sk)->sk_lock));				\
+				   sizeof((sk)->sk_lock));		\
	lockdep_set_class_and_name(&(sk)->sk_lock.slock,		\
-			(skey), (sname));				\
+				   (skey), (sname));			\
	lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);	\
} while (0)

-#ifdef CONFIG_LOCKDEP
static inline bool lockdep_sock_is_held(const struct sock *sk)
{
	return lockdep_is_held(&sk->sk_lock) ||
	       lockdep_is_held(&sk->sk_lock.slock);
}
-#endif

void lock_sock_nested(struct sock *sk, int subclass);

@@ -1581,6 +1667,7 @@ static inline void lock_sock(struct sock *sk)
	lock_sock_nested(sk, 0);
}

+void __lock_sock(struct sock *sk);
void __release_sock(struct sock *sk);
void release_sock(struct sock *sk);

@@ -1591,7 +1678,37 @@ void release_sock(struct sock *sk);
				SINGLE_DEPTH_NESTING)
#define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))

-bool lock_sock_fast(struct sock *sk);
+bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock);
+
+/**
+ * lock_sock_fast - fast version of lock_sock
+ * @sk: socket
+ *
+ * This version should be used for very small sections, where the process
+ * will not block. Return false if the fast path is taken:
+ *
+ *   sk_lock.slock locked, owned = 0, BH disabled
+ *
+ * Return true if the slow path is taken:
+ *
+ *   sk_lock.slock unlocked, owned = 1, BH enabled
+ */
+static inline bool lock_sock_fast(struct sock *sk)
+{
+	/* The sk_lock has mutex_lock() semantics here. */
+	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+
+	return __lock_sock_fast(sk);
+}
+
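A usage sketch for lock_sock_fast() and its unlock_sock_fast() counterpart (declared just below); my_update_stats stands in for whatever short, non-blocking work the caller does under the lock:

static void my_touch_sock(struct sock *sk)
{
	/* lock_sock_fast() reports whether the slow path was taken;
	 * that verdict must be handed back to unlock_sock_fast().
	 */
	bool slow = lock_sock_fast(sk);

	my_update_stats(sk);		/* short, must not block */

	unlock_sock_fast(sk, slow);
}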
+/* fast socket lock variant for caller already holding a [different] socket lock */
+static inline bool lock_sock_fast_nested(struct sock *sk)
+{
+	mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_);
+
+	return __lock_sock_fast(sk);
+}
+
/**
 * unlock_sock_fast - complement of lock_sock_fast
 * @sk: socket
@@ -1601,13 +1718,22 @@ bool lock_sock_fast(struct sock *sk);
 * If slow mode is on, we call regular release_sock()
 */
static inline void unlock_sock_fast(struct sock *sk, bool slow)
+	__releases(&sk->sk_lock.slock)
{
-	if (slow)
+	if (slow) {
		release_sock(sk);
-	else
+		__release(&sk->sk_lock.slock);
+	} else {
+		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
		spin_unlock_bh(&sk->sk_lock.slock);
+	}
}

+void sockopt_lock_sock(struct sock *sk);
+void sockopt_release_sock(struct sock *sk);
+bool sockopt_ns_capable(struct user_namespace *ns, int cap);
+bool sockopt_capable(int cap);
+
/* Used by processes to "lock" a socket state, so that
 * interrupts and bottom half handlers won't change it
 * from under us. It essentially blocks any incoming
@@ -1629,6 +1755,13 @@ static inline void sock_owned_by_me(const struct sock *sk)
#endif
}

+static inline void sock_not_owned_by_me(const struct sock *sk)
+{
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks);
+#endif
+}
+
static inline bool sock_owned_by_user(const struct sock *sk)
{
	sock_owned_by_me(sk);
@@ -1640,20 +1773,30 @@ static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
	return sk->sk_lock.owned;
}

+static inline void sock_release_ownership(struct sock *sk)
+{
+	DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk));
+	sk->sk_lock.owned = 0;
+
+	/* The sk_lock has mutex_unlock() semantics: */
+	mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
+}
+
/* no reclassification while locks are held */
static inline bool sock_allow_reclassification(const struct sock *csk)
{
	struct sock *sk = (struct sock *)csk;

-	return !sk->sk_lock.owned && !spin_is_locked(&sk->sk_lock.slock);
+	return !sock_owned_by_user_nocheck(sk) &&
+		!spin_is_locked(&sk->sk_lock.slock);
}

struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern);
void sk_free(struct sock *sk);
+void sk_net_refcnt_upgrade(struct sock *sk);
void sk_destruct(struct sock *sk);
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
-void sk_free_unlock_clone(struct sock *sk);

struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority);
@@ -1667,40 +1810,77 @@ void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);
+
+static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
+{
+	skb_orphan(skb);
+	if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+		skb->sk = sk;
+		skb->destructor = sock_edemux;
+	}
+}
#else
#define sock_edemux sock_efree
#endif

+int sk_setsockopt(struct sock *sk, int level, int optname,
+		  sockptr_t optval, unsigned int optlen);
int sock_setsockopt(struct socket *sock, int level, int op,
		    sockptr_t optval, unsigned int optlen);
+int do_sock_setsockopt(struct socket *sock, bool compat, int level,
+		       int optname, sockptr_t optval, int optlen);
+int do_sock_getsockopt(struct socket *sock, bool compat, int level,
+		       int optname, sockptr_t optval, sockptr_t optlen);

-int sock_getsockopt(struct socket *sock, int level, int op,
-		    char __user *optval, int __user
*optlen); +int sk_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32); -struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, - int noblock, int *errcode); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order); + +static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, + unsigned long size, + int noblock, int *errcode) +{ + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); +} + void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); +static inline void sock_replace_proto(struct sock *sk, struct proto *proto) +{ + if (sk->sk_socket) + clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); + WRITE_ONCE(sk->sk_prot, proto); +} + struct sockcm_cookie { u64 transmit_time; u32 mark; - u16 tsflags; + u32 tsflags; + u32 ts_opt_id; + u32 priority; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { - *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; + *sockc = (struct sockcm_cookie) { + .mark = READ_ONCE(sk->sk_mark), + .tsflags = READ_ONCE(sk->sk_tsflags), + .priority = READ_ONCE(sk->sk_priority), + }; } -int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, +int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); @@ -1712,7 +1892,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); -int sock_no_accept(struct socket *, struct socket *, int, bool); +int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); @@ -1722,10 +1902,6 @@ int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); -ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags); -ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags); /* * Functions to fill in entries in struct proto_ops when a protocol @@ -1744,7 +1920,12 @@ void sk_common_release(struct sock *sk); * Default socket callbacks and setup code */ -/* Initialise core socket variables */ +/* Initialise core socket variables using an explicit uid. */ +void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid); + +/* Initialise core socket variables. + * Assumes struct socket *sock is embedded in a struct socket_alloc. 
+ */
void sock_init_data(struct socket *sock, struct sock *sk);

/*
@@ -1796,54 +1977,81 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
	/* sk_tx_queue_mapping accepts only up to a 16-bit value */
	if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX))
		return;
-	sk->sk_tx_queue_mapping = tx_queue;
+	/* Paired with READ_ONCE() in sk_tx_queue_get() and
+	 * other WRITE_ONCE() because socket lock might be not held.
+	 */
+	WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
}

#define NO_QUEUE_MAPPING	USHRT_MAX

static inline void sk_tx_queue_clear(struct sock *sk)
{
-	sk->sk_tx_queue_mapping = NO_QUEUE_MAPPING;
+	/* Paired with READ_ONCE() in sk_tx_queue_get() and
+	 * other WRITE_ONCE() because socket lock might be not held.
+	 */
+	WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}

static inline int sk_tx_queue_get(const struct sock *sk)
{
-	if (sk && sk->sk_tx_queue_mapping != NO_QUEUE_MAPPING)
-		return sk->sk_tx_queue_mapping;
+	if (sk) {
+		/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
+		 * and sk_tx_queue_set().
+		 */
+		int val = READ_ONCE(sk->sk_tx_queue_mapping);

+		if (val != NO_QUEUE_MAPPING)
+			return val;
+	}
	return -1;
}

-static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
+static inline void __sk_rx_queue_set(struct sock *sk,
+				     const struct sk_buff *skb,
+				     bool force_set)
{
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
	if (skb_rx_queue_recorded(skb)) {
		u16 rx_queue = skb_get_rx_queue(skb);

-		if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING))
-			return;
-
-		sk->sk_rx_queue_mapping = rx_queue;
+		if (force_set ||
+		    unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue))
+			WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue);
	}
#endif
}

+static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
+{
+	__sk_rx_queue_set(sk, skb, true);
+}
+
+static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb)
+{
+	__sk_rx_queue_set(sk, skb, false);
+}
+
static inline void sk_rx_queue_clear(struct sock *sk)
{
-#ifdef CONFIG_XPS
-	sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+	WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING);
#endif
}

-#ifdef CONFIG_XPS
static inline int sk_rx_queue_get(const struct sock *sk)
{
-	if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
-		return sk->sk_rx_queue_mapping;
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+	if (sk) {
+		int res = READ_ONCE(sk->sk_rx_queue_mapping);
+
+		if (res != NO_QUEUE_MAPPING)
+			return res;
+	}
+#endif
	return -1;
}
-#endif

static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
@@ -1884,6 +2092,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
}

kuid_t sock_i_uid(struct sock *sk);
+unsigned long __sock_i_ino(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);

static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
@@ -1893,57 +2102,58 @@ static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)

static inline u32 net_tx_rndhash(void)
{
-	u32 v = prandom_u32();
+	u32 v = get_random_u32();

	return v ?: 1;
}

static inline void sk_set_txhash(struct sock *sk)
{
-	sk->sk_txhash = net_tx_rndhash();
+	/* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
+	WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
}

-static inline void sk_rethink_txhash(struct sock *sk)
+static inline bool sk_rethink_txhash(struct sock *sk)
{
-	if (sk->sk_txhash)
+	if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) {
		sk_set_txhash(sk);
+		return true;
+	}
+	return false;
}
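All of the queue-mapping accessors above go through READ_ONCE()/WRITE_ONCE() because they can run without the socket lock. A sketch of a consumer on the transmit side; my_pick_txq is an invented name, while netdev_get_tx_queue() is the existing netdevice accessor:

static struct netdev_queue *my_pick_txq(struct net_device *dev,
					const struct sock *sk)
{
	/* Lockless read; pairs with the WRITE_ONCE() in
	 * sk_tx_queue_set() and sk_tx_queue_clear().
	 */
	int queue = sk_tx_queue_get(sk);

	if (queue < 0 || queue >= dev->real_num_tx_queues)
		queue = 0;	/* no valid mapping: fall back */

	return netdev_get_tx_queue(dev, queue);
}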
static inline struct dst_entry *
-__sk_dst_get(struct sock *sk)
+__sk_dst_get(const struct sock *sk)
{
	return rcu_dereference_check(sk->sk_dst_cache,
				     lockdep_sock_is_held(sk));
}

static inline struct dst_entry *
-sk_dst_get(struct sock *sk)
+sk_dst_get(const struct sock *sk)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(sk->sk_dst_cache);
-	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
+	if (dst && !rcuref_get(&dst->__rcuref))
		dst = NULL;
	rcu_read_unlock();
	return dst;
}

-static inline void dst_negative_advice(struct sock *sk)
+static inline void __dst_negative_advice(struct sock *sk)
{
-	struct dst_entry *ndst, *dst = __sk_dst_get(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);

-	sk_rethink_txhash(sk);
-
-	if (dst && dst->ops->negative_advice) {
-		ndst = dst->ops->negative_advice(dst);
+	if (dst && dst->ops->negative_advice)
+		dst->ops->negative_advice(sk, dst);
+}

-		if (ndst != dst) {
-			rcu_assign_pointer(sk->sk_dst_cache, ndst);
-			sk_tx_queue_clear(sk);
-			sk->sk_dst_pending_confirm = 0;
-		}
-	}
+static inline void dst_negative_advice(struct sock *sk)
+{
+	sk_rethink_txhash(sk);
+	__dst_negative_advice(sk);
}

static inline void
@@ -1952,7 +2162,7 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst)
	struct dst_entry *old_dst;

	sk_tx_queue_clear(sk);
-	sk->sk_dst_pending_confirm = 0;
+	WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
	old_dst = rcu_dereference_protected(sk->sk_dst_cache,
					    lockdep_sock_is_held(sk));
	rcu_assign_pointer(sk->sk_dst_cache, dst);
@@ -1965,8 +2175,8 @@ sk_dst_set(struct sock *sk, struct dst_entry *dst)
	struct dst_entry *old_dst;

	sk_tx_queue_clear(sk);
-	sk->sk_dst_pending_confirm = 0;
-	old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
+	WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
+	old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst)));
	dst_release(old_dst);
}

@@ -1996,17 +2206,14 @@ static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
{
	if (skb_get_dst_pending_confirm(skb)) {
		struct sock *sk = skb->sk;
-		unsigned long now = jiffies;

-		/* avoid dirtying neighbour */
-		if (READ_ONCE(n->confirmed) != now)
-			WRITE_ONCE(n->confirmed, now);
		if (sk && READ_ONCE(sk->sk_dst_pending_confirm))
			WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
+		neigh_confirm(n);
	}
}

-bool sk_mc_loop(struct sock *sk);
+bool sk_mc_loop(const struct sock *sk);

static inline bool sk_can_gso(const struct sock *sk)
{
@@ -2015,10 +2222,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst);

-static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags)
+static inline void sk_gso_disable(struct sock *sk)
{
-	sk->sk_route_nocaps |= flags;
-	sk->sk_route_caps &= ~flags;
+	sk->sk_gso_disabled = 1;
+	sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
}

static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
@@ -2064,9 +2271,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
	if (err)
		return err;

-	skb->len	     += copy;
-	skb->data_len	     += copy;
-	skb->truesize	     += copy;
+	skb_len_add(skb, copy);
	sk_wmem_queued_add(sk, copy);
	sk_mem_charge(sk, copy);
	return 0;
@@ -2142,7 +2347,7 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq)
}

/**
- * sock_poll_wait - place memory barrier behind the poll_wait call.
+ * sock_poll_wait - wrapper for the poll_wait call.
 * @filp: file
 * @sock: socket to wait on
 * @p: poll_table
@@ -2152,22 +2357,22 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq)
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
				  poll_table *p)
{
-	if (!poll_does_not_wait(p)) {
-		poll_wait(filp, &sock->wq.wait, p);
-		/* We need to be sure we are in sync with the
-		 * socket flags modification.
-		 *
-		 * This memory barrier is paired in the wq_has_sleeper.
-		 */
-		smp_mb();
-	}
+	/* Provides a barrier; we need it to be sure we are in sync
+	 * with the socket flags modification.
+	 *
+	 * This memory barrier is paired in the wq_has_sleeper.
+	 */
+	poll_wait(filp, &sock->wq.wait, p);
}

static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
{
-	if (sk->sk_txhash) {
+	/* This pairs with WRITE_ONCE() in sk_set_txhash() */
+	u32 txhash = READ_ONCE(sk->sk_txhash);
+
+	if (txhash) {
		skb->l4_hash = 1;
-		skb->hash = sk->sk_txhash;
+		skb->hash = txhash;
	}
}

@@ -2190,6 +2395,39 @@ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
	sk_mem_charge(sk, skb->truesize);
}

+static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
+{
+	if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) {
+		skb_orphan(skb);
+		skb->destructor = sock_efree;
+		skb->sk = sk;
+		return true;
+	}
+	return false;
+}
+
+static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk)
+{
+	skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
+	if (skb) {
+		if (sk_rmem_schedule(sk, skb, skb->truesize)) {
+			skb_set_owner_r(skb, sk);
+			return skb;
+		}
+		__kfree_skb(skb);
+	}
+	return NULL;
+}
+
+static inline void skb_prepare_for_gro(struct sk_buff *skb)
+{
+	if (skb->destructor != sock_wfree) {
+		skb_orphan(skb);
+		return;
+	}
+	skb->slow_gro = 1;
+}
+
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires);

@@ -2202,7 +2440,14 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
			void (*destructor)(struct sock *sk,
					   struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
+			      enum skb_drop_reason *reason);
+
+static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	return sock_queue_rcv_skb_reason(sk, skb, NULL);
+}

int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
struct sk_buff *sock_dequeue_err_skb(struct sock *sk);
@@ -2214,12 +2459,19 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk);
static inline int sock_error(struct sock *sk)
{
	int err;
-	if (likely(!sk->sk_err))
+
+	/* Avoid an atomic operation for the common case.
+	 * This is racy since another cpu/thread can change sk_err under us.
+	 */
+	if (likely(data_race(!sk->sk_err)))
		return 0;
+
	err = xchg(&sk->sk_err, 0);
	return -err;
}

+void sk_error_report(struct sock *sk);
+
static inline unsigned long sock_wspace(struct sock *sk)
{
	int amt = 0;
@@ -2263,6 +2515,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band)
	}
}

+static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band)
+{
+	if (unlikely(sock_flag(sk, SOCK_FASYNC)))
+		sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
+}
+
/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
 * need sizeof(sk_buff) + MTU + padding, unless the net driver performs copybreak.
 * Note: for send buffers, TCP works better if we can build two skbs at
@@ -2281,31 +2539,30 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
		return;

	val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
+	val = max_t(u32, val, sk_unused_reserved_mem(sk));

	WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
}

-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
-				    bool force_schedule);
-
/**
 * sk_page_frag - return an appropriate page_frag
 * @sk: socket
 *
 * Use the per task page_frag instead of the per socket one for
 * optimization when we know that we're in process context and own
 * everything that's associated with %current.
 *
- * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest
- * inside other socket operations and end up recursing into sk_page_frag()
- * while it's already in use.
+ * Both direct reclaim and page faults can nest inside other
+ * socket operations and end up recursing into sk_page_frag()
+ * while it's already in use: explicitly avoid task page_frag
+ * when users disable sk_use_task_frag.
 *
 * Return: a per task page_frag if context allows that,
 * otherwise a per socket one.
 */
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
-	if (gfpflags_normal_context(sk->sk_allocation))
+	if (sk->sk_use_task_frag)
		return &current->task_frag;

	return &sk->sk_frag;
@@ -2326,6 +2583,11 @@ static inline gfp_t gfp_any(void)
	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
}

+static inline gfp_t gfp_memcg_charge(void)
+{
+	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
+}
+
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
	return noblock ? 0 : sk->sk_rcvtimeo;
@@ -2357,10 +2619,10 @@ struct sock_skb_cb {

/* Store sock_skb_cb at the end of skb->cb[] so protocol families
 * using skb->cb[] would keep using it directly and utilize its
 * alignment guarantee.
 */
-#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
-			    sizeof(struct sock_skb_cb)))
+#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \
+			    sizeof(struct sock_skb_cb))

#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
			    SOCK_SKB_CB_OFFSET))
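A sketch of how a protocol is meant to reach those shared fields: always through the SOCK_SKB_CB() accessor, never by hardcoding an offset, so the leading bytes of skb->cb[] stay free for the protocol's own control block. The dropcount field is assumed from the upstream struct sock_skb_cb definition, which this hunk only shows in truncated context:

static u32 my_read_dropcount(const struct sk_buff *skb)
{
	/* sock_skb_cb sits at the tail of skb->cb[]; SOCK_SKB_CB()
	 * applies SOCK_SKB_CB_OFFSET for us.
	 */
	return SOCK_SKB_CB(skb)->dropcount;
}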
@@ -2418,9 +2680,9 @@ void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
-	ktime_t kt = skb->tstamp;
	struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
-
+	u32 tsflags = READ_ONCE(sk->sk_tsflags);
+	ktime_t kt = skb->tstamp;
	/*
	 * generate control messages if
	 * - receive time stamping in software requested
	 * - software time stamp available and wanted
	 * - hardware time stamps available and wanted
	 */
	if (sock_flag(sk, SOCK_RCVTSTAMP) ||
-	    (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
-	    (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
+	    (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
+	    (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
	    (hwtstamps->hwtstamp &&
-	     (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
+	     (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
		__sock_recv_timestamp(msg, sk, skb);
	else
		sock_write_timestamp(sk, kt);

-	if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid)
+	if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb))
		__sock_recv_wifi_status(msg, sk, skb);
}

-void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
-			      struct sk_buff *skb);
+void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
+		       struct sk_buff *skb);

#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC)
-static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
-					  struct sk_buff *skb)
-{
-#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL)			| \
-			   (1UL << SOCK_RCVTSTAMP))
+static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
+				   struct sk_buff *skb)
+{
+#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL)			| \
+			  (1UL << SOCK_RCVTSTAMP)			| \
+			  (1UL << SOCK_RCVMARK)				| \
+			  (1UL << SOCK_RCVPRIORITY)			| \
+			  (1UL << SOCK_TIMESTAMPING_ANY))
#define TSFLAGS_ANY	  (SOF_TIMESTAMPING_SOFTWARE			| \
			   SOF_TIMESTAMPING_RAW_HARDWARE)

-	if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY)
-		__sock_recv_ts_and_drops(msg, sk, skb);
+	if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS)
+		__sock_recv_cmsgs(msg, sk, skb);
	else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
		sock_write_timestamp(sk, skb->tstamp);
-	else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP))
+	else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP))
		sock_write_timestamp(sk, 0);
}

-void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);
+void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags);

/**
 * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
 * @sk: socket sending this packet
- * @tsflags: timestamping flags to use
+ * @sockc: pointer to socket cmsg cookie to get timestamping info
 * @tx_flags: completed with instructions for time stamping
 * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno)
 *
 * Note: callers should take care of initial ``*tx_flags`` value (usually 0)
 */
-static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags,
+static inline void _sock_tx_timestamp(struct sock *sk,
+				      const struct sockcm_cookie *sockc,
				      __u8 *tx_flags, __u32 *tskey)
{
+	__u32 tsflags = sockc->tsflags;
+
	if (unlikely(tsflags)) {
		__sock_tx_timestamp(tsflags, tx_flags);
		if (tsflags &
SOF_TIMESTAMPING_OPT_ID && tskey && - tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = sk->sk_tskey++; + tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { + if (tsflags & SOCKCM_FLAG_TS_OPT_ID) + *tskey = sockc->ts_opt_id; + else + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; + } } - if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) - *tx_flags |= SKBTX_WIFI_STATUS; } -static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags, +static inline void sock_tx_timestamp(struct sock *sk, + const struct sockcm_cookie *sockc, __u8 *tx_flags) { - _sock_tx_timestamp(sk, tsflags, tx_flags, NULL); + _sock_tx_timestamp(sk, sockc, tx_flags, NULL); } -static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) +static inline void skb_setup_tx_timestamp(struct sk_buff *skb, + const struct sockcm_cookie *sockc) { - _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags, + _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } -DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); +static inline bool sk_is_inet(const struct sock *sk) +{ + int family = READ_ONCE(sk->sk_family); + + return family == AF_INET || family == AF_INET6; +} + +static inline bool sk_is_tcp(const struct sock *sk) +{ + return sk_is_inet(sk) && + sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static inline bool sk_is_udp(const struct sock *sk) +{ + return sk_is_inet(sk) && + sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP; +} + +static inline bool sk_is_unix(const struct sock *sk) +{ + return sk->sk_family == AF_UNIX; +} + +static inline bool sk_is_stream_unix(const struct sock *sk) +{ + return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; +} + +static inline bool sk_is_vsock(const struct sock *sk) +{ + return sk->sk_family == AF_VSOCK; +} + +static inline bool sk_may_scm_recv(const struct sock *sk) +{ + return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || + sk->sk_family == AF_NETLINK || + (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); +} + /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from @@ -2508,27 +2822,9 @@ DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); - if (static_branch_unlikely(&tcp_rx_skb_cache_key) && - !sk->sk_rx_skb_cache) { - sk->sk_rx_skb_cache = skb; - skb_orphan(skb); - return; - } __kfree_skb(skb); } -static inline -struct net *sock_net(const struct sock *sk) -{ - return read_pnet(&sk->sk_net); -} - -static inline -void sock_net_set(struct sock *sk, struct net *net) -{ - write_pnet(&sk->sk_net, net); -} - static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { @@ -2554,26 +2850,10 @@ sk_is_refcounted(struct sock *sk) return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } -/** - * skb_steal_sock - steal a socket from an sk_buff - * @skb: sk_buff to steal the socket from - * @refcounted: is set to true if the socket is reference-counted - */ -static inline struct sock * -skb_steal_sock(struct sk_buff *skb, bool *refcounted) +static inline bool +sk_requests_wifi_status(struct sock *sk) { - if (skb->sk) { - struct sock *sk = skb->sk; - - *refcounted = true; - if (skb_sk_is_prefetched(skb)) - *refcounted = sk_is_refcounted(sk); - skb->destructor = NULL; - skb->sk = NULL; - return sk; - } - *refcounted = false; - return NULL; + return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); } /* Checks if this SKB belongs to an HW 
offloaded socket @@ -2588,12 +2868,10 @@ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); -#ifdef CONFIG_TLS_DEVICE - } else if (unlikely(skb->decrypted)) { + } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; -#endif } #endif @@ -2608,6 +2886,16 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } +/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT + * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE) + * TCP RST and ACK can be attached to TIME_WAIT. + */ +static inline bool sk_listener_or_tw(const struct sock *sk) +{ + return (1 << READ_ONCE(sk->sk_state)) & + (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT); +} + void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); @@ -2632,30 +2920,28 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo); extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; -extern int sysctl_tstamp_allow_data; -extern int sysctl_optmem_max; - extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; +#define SKB_FRAG_PAGE_ORDER get_order(32768) DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) - return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset); + return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); - return *proto->sysctl_wmem; + return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? 
*/ if (proto->sysctl_rmem_offset) - return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset); + return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); - return *proto->sysctl_rmem; + return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) @@ -2676,13 +2962,14 @@ static inline void sk_pacing_shift_update(struct sock *sk, int val) */ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) { + int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); int mdif; - if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif) + if (!bound_dev_if || bound_dev_if == dif) return true; mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); - if (mdif && mdif == sk->sk_bound_dev_if) + if (mdif && mdif == bound_dev_if) return true; return false; @@ -2691,7 +2978,18 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); +void sock_set_timestamp(struct sock *sk, int optname, bool valbool); +int sock_set_timestamping(struct sock *sk, int optname, + struct so_timestamping timestamping); + void sock_enable_timestamps(struct sock *sk); +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); +#else +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ +} +#endif void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); @@ -2703,4 +3001,20 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs); int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); +int sock_get_timeout(long timeo, void *optval, bool old_timeval); +int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, + sockptr_t optval, int optlen, bool old_timeval); + +int sock_ioctl_inout(struct sock *sk, unsigned int cmd, + void __user *arg, void *karg, size_t size); +int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); +static inline bool sk_is_readable(struct sock *sk) +{ + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->sock_is_readable) + return prot->sock_is_readable(sk); + + return false; +} #endif /* _SOCK_H */ diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 505f1e18e9bf..6e4faf3ee76f 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -13,8 +13,10 @@ extern spinlock_t reuseport_lock; struct sock_reuseport { struct rcu_head rcu; - u16 max_socks; /* length of socks */ - u16 num_socks; /* elements in socks */ + u16 max_socks; /* length of socks */ + u16 num_socks; /* elements in socks */ + u16 num_closed_socks; /* closed elements in socks */ + u16 incoming_cpu; /* The last synq overflow event timestamp of this * reuse->socks[] group. 
*/ @@ -24,35 +26,39 @@ struct sock_reuseport { unsigned int bind_inany:1; unsigned int has_conns:1; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ - struct sock *socks[]; /* array of sock pointers */ + struct sock *socks[] __counted_by(max_socks); }; extern int reuseport_alloc(struct sock *sk, bool bind_inany); extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); +void reuseport_stop_listen_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb); extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); extern int reuseport_detach_prog(struct sock *sk); -static inline bool reuseport_has_conns(struct sock *sk, bool set) +static inline bool reuseport_has_conns(struct sock *sk) { struct sock_reuseport *reuse; bool ret = false; rcu_read_lock(); reuse = rcu_dereference(sk->sk_reuseport_cb); - if (reuse) { - if (set) - reuse->has_conns = 1; - ret = reuse->has_conns; - } + if (reuse && reuse->has_conns) + ret = true; rcu_read_unlock(); return ret; } +void reuseport_has_conns_set(struct sock *sk); +void reuseport_update_incoming_cpu(struct sock *sk, int val); + #endif /* _SOCK_REUSEPORT_H */ diff --git a/include/net/stp.h b/include/net/stp.h index 2914e6d53490..528103fce2c0 100644 --- a/include/net/stp.h +++ b/include/net/stp.h @@ -2,6 +2,8 @@ #ifndef _NET_STP_H #define _NET_STP_H +#include <linux/if_ether.h> + struct stp_proto { unsigned char group_address[ETH_ALEN]; void (*rcv)(const struct stp_proto *, struct sk_buff *, diff --git a/include/net/strparser.h b/include/net/strparser.h index 1d20b98493a1..0ed73e364faa 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -43,6 +43,8 @@ struct strparser; struct strp_callbacks { int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); + int (*read_sock)(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); int (*read_sock_done)(struct strparser *strp, int err); void (*abort_parser)(struct strparser *strp, int err); void (*lock)(struct strparser *strp); @@ -54,10 +56,35 @@ struct strp_msg { int offset; }; +struct _strp_msg { + /* Internal cb structure. struct strp_msg must be first for passing + * to upper layer. + */ + struct strp_msg strp; + int accum_len; +}; + +struct sk_skb_cb { +#define SK_SKB_CB_PRIV_LEN 20 + unsigned char data[SK_SKB_CB_PRIV_LEN]; + /* align strp on cache line boundary within skb->cb[] */ + unsigned char pad[4]; + struct _strp_msg strp; + + /* strp users' data follows */ + struct tls_msg { + u8 control; + } tls; + /* temp_reg is a temporary register used for bpf_convert_data_end_access + * when dst_reg == src_reg. 
+ */ + u64 temp_reg; +}; + static inline struct strp_msg *strp_msg(struct sk_buff *skb) { return (struct strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Structure for an attached lower socket */ @@ -87,8 +114,6 @@ static inline void strp_pause(struct strparser *strp) /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); -/* Must be called with process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 53e8b4994296..8346b0d29542 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -16,34 +16,36 @@ #define SWITCHDEV_F_SKIP_EOPNOTSUPP BIT(1) #define SWITCHDEV_F_DEFER BIT(2) -struct switchdev_trans { - bool ph_prepare; -}; - -static inline bool switchdev_trans_ph_prepare(struct switchdev_trans *trans) -{ - return trans && trans->ph_prepare; -} - -static inline bool switchdev_trans_ph_commit(struct switchdev_trans *trans) -{ - return trans && !trans->ph_prepare; -} - enum switchdev_attr_id { SWITCHDEV_ATTR_ID_UNDEFINED, SWITCHDEV_ATTR_ID_PORT_STP_STATE, + SWITCHDEV_ATTR_ID_PORT_MST_STATE, SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS, SWITCHDEV_ATTR_ID_PORT_MROUTER, SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, + SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, -#if IS_ENABLED(CONFIG_BRIDGE_MRP) - SWITCHDEV_ATTR_ID_MRP_PORT_STATE, + SWITCHDEV_ATTR_ID_BRIDGE_MST, SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, -#endif + SWITCHDEV_ATTR_ID_VLAN_MSTI, +}; + +struct switchdev_mst_state { + u16 msti; + u8 state; +}; + +struct switchdev_brport_flags { + unsigned long val; + unsigned long mask; +}; + +struct switchdev_vlan_msti { + u16 vid; + u16 msti; }; struct switchdev_attr { @@ -54,15 +56,16 @@ struct switchdev_attr { void (*complete)(struct net_device *dev, int err, void *priv); union { u8 stp_state; /* PORT_STP_STATE */ - unsigned long brport_flags; /* PORT_{PRE}_BRIDGE_FLAGS */ + struct switchdev_mst_state mst_state; /* PORT_MST_STATE */ + struct switchdev_brport_flags brport_flags; /* PORT_BRIDGE_FLAGS */ bool mrouter; /* PORT_MROUTER */ clock_t ageing_time; /* BRIDGE_AGEING_TIME */ bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ + u16 vlan_protocol; /* BRIDGE_VLAN_PROTOCOL */ + bool mst; /* BRIDGE_MST */ bool mc_disabled; /* MC_DISABLED */ -#if IS_ENABLED(CONFIG_BRIDGE_MRP) - u8 mrp_port_state; /* MRP_PORT_STATE */ u8 mrp_port_role; /* MRP_PORT_ROLE */ -#endif + struct switchdev_vlan_msti vlan_msti; /* VLAN_MSTI */ } u; }; @@ -71,7 +74,6 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_PORT_VLAN, SWITCHDEV_OBJ_ID_PORT_MDB, SWITCHDEV_OBJ_ID_HOST_MDB, -#if IS_ENABLED(CONFIG_BRIDGE_MRP) SWITCHDEV_OBJ_ID_MRP, SWITCHDEV_OBJ_ID_RING_TEST_MRP, SWITCHDEV_OBJ_ID_RING_ROLE_MRP, @@ -79,11 +81,10 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_IN_TEST_MRP, SWITCHDEV_OBJ_ID_IN_ROLE_MRP, SWITCHDEV_OBJ_ID_IN_STATE_MRP, - -#endif }; struct switchdev_obj { + struct list_head list; struct net_device *orig_dev; enum switchdev_obj_id id; u32 flags; @@ -95,8 +96,14 @@ struct switchdev_obj { struct switchdev_obj_port_vlan { struct switchdev_obj obj; u16 flags; - u16 vid_begin; - u16 vid_end; + u16 vid; + /* If set, the notifier signifies a change of one of the following + * flags for a VLAN that already 
exists: + * - BRIDGE_VLAN_INFO_PVID + * - BRIDGE_VLAN_INFO_UNTAGGED + * Entries with BRIDGE_VLAN_INFO_BRENTRY unset are not notified at all. + */ + bool changed; }; #define SWITCHDEV_OBJ_PORT_VLAN(OBJ) \ @@ -113,7 +120,6 @@ struct switchdev_obj_port_mdb { container_of((OBJ), struct switchdev_obj_port_mdb, obj) -#if IS_ENABLED(CONFIG_BRIDGE_MRP) /* SWITCHDEV_OBJ_ID_MRP */ struct switchdev_obj_mrp { struct switchdev_obj obj; @@ -145,6 +151,7 @@ struct switchdev_obj_ring_role_mrp { struct switchdev_obj obj; u8 ring_role; u32 ring_id; + u8 sw_backup; }; #define SWITCHDEV_OBJ_RING_ROLE_MRP(OBJ) \ @@ -179,6 +186,7 @@ struct switchdev_obj_in_role_mrp { u32 ring_id; u16 in_id; u8 in_role; + u8 sw_backup; }; #define SWITCHDEV_OBJ_IN_ROLE_MRP(OBJ) \ @@ -193,9 +201,13 @@ struct switchdev_obj_in_state_mrp { #define SWITCHDEV_OBJ_IN_STATE_MRP(OBJ) \ container_of((OBJ), struct switchdev_obj_in_state_mrp, obj) -#endif - -typedef int switchdev_obj_dump_cb_t(struct switchdev_obj *obj); +struct switchdev_brport { + struct net_device *dev; + const void *ctx; + struct notifier_block *atomic_nb; + struct notifier_block *blocking_nb; + bool tx_fwd_offload; +}; enum switchdev_notifier_type { SWITCHDEV_FDB_ADD_TO_BRIDGE = 1, @@ -214,35 +226,48 @@ enum switchdev_notifier_type { SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE, SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE, SWITCHDEV_VXLAN_FDB_OFFLOADED, + + SWITCHDEV_BRPORT_OFFLOADED, + SWITCHDEV_BRPORT_UNOFFLOADED, + SWITCHDEV_BRPORT_REPLAY, }; struct switchdev_notifier_info { struct net_device *dev; struct netlink_ext_ack *extack; + const void *ctx; }; +/* Remember to update br_switchdev_fdb_populate() when adding + * new members to this structure + */ struct switchdev_notifier_fdb_info { struct switchdev_notifier_info info; /* must be first */ const unsigned char *addr; u16 vid; u8 added_by_user:1, + is_local:1, + locked:1, offloaded:1; }; struct switchdev_notifier_port_obj_info { struct switchdev_notifier_info info; /* must be first */ const struct switchdev_obj *obj; - struct switchdev_trans *trans; bool handled; }; struct switchdev_notifier_port_attr_info { struct switchdev_notifier_info info; /* must be first */ const struct switchdev_attr *attr; - struct switchdev_trans *trans; bool handled; }; +struct switchdev_notifier_brport_info { + struct switchdev_notifier_info info; /* must be first */ + const struct switchdev_brport brport; +}; + static inline struct net_device * switchdev_notifier_info_to_dev(const struct switchdev_notifier_info *info) { @@ -255,11 +280,37 @@ switchdev_notifier_info_to_extack(const struct switchdev_notifier_info *info) return info->extack; } +static inline bool +switchdev_fdb_is_dynamically_learned(const struct switchdev_notifier_fdb_info *fdb_info) +{ + return !fdb_info->added_by_user && !fdb_info->is_local; +} + #ifdef CONFIG_NET_SWITCHDEV +int switchdev_bridge_port_offload(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack); +void switchdev_bridge_port_unoffload(struct net_device *brport_dev, + const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb); +int switchdev_bridge_port_replay(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack); + void switchdev_deferred_process(void); int switchdev_port_attr_set(struct net_device *dev, - const struct 
switchdev_attr *attr); + const struct switchdev_attr *attr, + struct netlink_ext_ack *extack); +bool switchdev_port_obj_act_is_deferred(struct net_device *dev, + enum switchdev_notifier_type nt, + const struct switchdev_obj *obj); int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack); @@ -278,37 +329,76 @@ int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info, struct netlink_ext_ack *extack); -void switchdev_port_fwd_mark_set(struct net_device *dev, - struct net_device *group_dev, - bool joining); +int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event, + const struct switchdev_notifier_fdb_info *fdb_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, + unsigned long event, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info)); int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*add_cb)(struct net_device *dev, + int (*add_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack)); +int switchdev_handle_port_obj_add_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack)); int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*del_cb)(struct net_device *dev, + int (*del_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj)); +int switchdev_handle_port_obj_del_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)); int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), - int (*set_cb)(struct net_device *dev, + int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, - struct switchdev_trans *trans)); + struct netlink_ext_ack *extack)); #else +static inline int +switchdev_bridge_port_offload(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline void +switchdev_bridge_port_unoffload(struct net_device *brport_dev, + const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ +} + static inline void switchdev_deferred_process(void) { } static inline int switchdev_port_attr_set(struct net_device *dev, - const struct switchdev_attr *attr) + const struct switchdev_attr 
*attr, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } @@ -366,12 +456,36 @@ call_switchdev_blocking_notifiers(unsigned long val, } static inline int +switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event, + const struct switchdev_notifier_fdb_info *fdb_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, + unsigned long event, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info)) +{ + return 0; +} + +static inline int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*add_cb)(struct net_device *dev, + int (*add_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack)) +{ + return 0; +} + +static inline int switchdev_handle_port_obj_add_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, - struct switchdev_trans *trans, struct netlink_ext_ack *extack)) { return 0; @@ -381,7 +495,19 @@ static inline int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*del_cb)(struct net_device *dev, + int (*del_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj)) +{ + return 0; +} + +static inline int +switchdev_handle_port_obj_del_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { return 0; @@ -391,9 +517,9 @@ static inline int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), - int (*set_cb)(struct net_device *dev, + int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, - struct switchdev_trans *trans)) + struct netlink_ext_ack *extack)) { return 0; } diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h index 1f4cb477bb5d..e8dd77a96748 100644 --- a/include/net/tc_act/tc_connmark.h +++ b/include/net/tc_act/tc_connmark.h @@ -4,10 +4,15 @@ #include <net/act_api.h> -struct tcf_connmark_info { - struct tc_action common; +struct tcf_connmark_parms { struct net *net; u16 zone; + struct rcu_head rcu; +}; + +struct tcf_connmark_info { + struct tc_action common; + struct tcf_connmark_parms __rcu *parms; }; #define to_connmark(a) ((struct tcf_connmark_info *)a) diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 8250d6f0a462..77f87c622a2e 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -10,6 +10,7 @@ #include <net/netfilter/nf_conntrack_labels.h> struct tcf_ct_params { + struct nf_conntrack_helper *helper; struct nf_conn *tmpl; u16 zone; @@ -21,6 +22,7 @@ struct tcf_ct_params { struct nf_nat_range2 range; 
bool ipv4_range; + bool put_labels; u16 ct_action; @@ -56,6 +58,11 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) return to_ct_params(a)->nf_ft; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return to_ct_params(a)->helper; +} + #else static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; } static inline int tcf_ct_action(const struct tc_action *a) { return 0; } @@ -63,6 +70,10 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) { return NULL; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return NULL; +} #endif /* CONFIG_NF_CONNTRACK */ #if IS_ENABLED(CONFIG_NET_ACT_CT) diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index eb8f01c819e6..832efd40e023 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -59,4 +59,19 @@ static inline u32 tcf_gact_goto_chain_index(const struct tc_action *a) return READ_ONCE(a->tcfa_action) & TC_ACT_EXT_VAL_MASK; } +static inline bool is_tcf_gact_continue(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_UNSPEC, false); +} + +static inline bool is_tcf_gact_reclassify(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_RECLASSIFY, false); +} + +static inline bool is_tcf_gact_pipe(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_PIPE, false); +} + #endif /* __NET_TC_GACT_H */ diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h index 8bc6be81a7ad..c8fa11ebb397 100644 --- a/include/net/tc_act/tc_gate.h +++ b/include/net/tc_act/tc_gate.h @@ -60,11 +60,6 @@ static inline bool is_tcf_gate(const struct tc_action *a) return false; } -static inline u32 tcf_gate_index(const struct tc_action *a) -{ - return a->tcfa_index; -} - static inline s32 tcf_gate_prio(const struct tc_action *a) { s32 tcfg_prio; diff --git a/include/net/tc_act/tc_ipt.h b/include/net/tc_act/tc_ipt.h deleted file mode 100644 index 4225fcb1c6ba..000000000000 --- a/include/net/tc_act/tc_ipt.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NET_TC_IPT_H -#define __NET_TC_IPT_H - -#include <net/act_api.h> - -struct xt_entry_target; - -struct tcf_ipt { - struct tc_action common; - u32 tcfi_hook; - char *tcfi_tname; - struct xt_entry_target *tcfi_t; -}; -#define to_ipt(a) ((struct tcf_ipt *)a) - -#endif /* __NET_TC_IPT_H */ diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 1cace4c69e44..75722d967bf2 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -8,8 +8,10 @@ struct tcf_mirred { struct tc_action common; int tcfm_eaction; + u32 tcfm_blockid; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; + netdevice_tracker tcfm_dev_tracker; struct list_head tcfm_list; }; #define to_mirred(a) ((struct tcf_mirred *)a) diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h index c14407160812..c869274ac529 100644 --- a/include/net/tc_act/tc_nat.h +++ b/include/net/tc_act/tc_nat.h @@ -5,13 +5,17 @@ #include <linux/types.h> #include <net/act_api.h> -struct tcf_nat { - struct tc_action common; - +struct tcf_nat_parms { __be32 old_addr; __be32 new_addr; __be32 mask; u32 flags; + struct rcu_head rcu; +}; + +struct tcf_nat { + struct tc_action common; + struct tcf_nat_parms __rcu *parms; }; #define to_tcf_nat(a) ((struct tcf_nat *)a) diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index 
748cf87a4d7e..83fe39931781 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -4,21 +4,29 @@ #include <net/act_api.h> #include <linux/tc_act/tc_pedit.h> +#include <linux/types.h> struct tcf_pedit_key_ex { enum pedit_header_type htype; enum pedit_cmd cmd; }; -struct tcf_pedit { - struct tc_action common; - unsigned char tcfp_nkeys; - unsigned char tcfp_flags; +struct tcf_pedit_parms { struct tc_pedit_key *tcfp_keys; struct tcf_pedit_key_ex *tcfp_keys_ex; + u32 tcfp_off_max_hint; + unsigned char tcfp_nkeys; + unsigned char tcfp_flags; + struct rcu_head rcu; +}; + +struct tcf_pedit { + struct tc_action common; + struct tcf_pedit_parms __rcu *parms; }; #define to_pedit(a) ((struct tcf_pedit *)a) +#define to_pedit_parms(a) (rcu_dereference(to_pedit(a)->parms)) static inline bool is_tcf_pedit(const struct tc_action *a) { @@ -31,37 +39,81 @@ static inline bool is_tcf_pedit(const struct tc_action *a) static inline int tcf_pedit_nkeys(const struct tc_action *a) { - return to_pedit(a)->tcfp_nkeys; + struct tcf_pedit_parms *parms; + int nkeys; + + rcu_read_lock(); + parms = to_pedit_parms(a); + nkeys = parms->tcfp_nkeys; + rcu_read_unlock(); + + return nkeys; } static inline u32 tcf_pedit_htype(const struct tc_action *a, int index) { - if (to_pedit(a)->tcfp_keys_ex) - return to_pedit(a)->tcfp_keys_ex[index].htype; + u32 htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; + struct tcf_pedit_parms *parms; + + rcu_read_lock(); + parms = to_pedit_parms(a); + if (parms->tcfp_keys_ex) + htype = parms->tcfp_keys_ex[index].htype; + rcu_read_unlock(); - return TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; + return htype; } static inline u32 tcf_pedit_cmd(const struct tc_action *a, int index) { - if (to_pedit(a)->tcfp_keys_ex) - return to_pedit(a)->tcfp_keys_ex[index].cmd; + struct tcf_pedit_parms *parms; + u32 cmd = __PEDIT_CMD_MAX; - return __PEDIT_CMD_MAX; + rcu_read_lock(); + parms = to_pedit_parms(a); + if (parms->tcfp_keys_ex) + cmd = parms->tcfp_keys_ex[index].cmd; + rcu_read_unlock(); + + return cmd; } static inline u32 tcf_pedit_mask(const struct tc_action *a, int index) { - return to_pedit(a)->tcfp_keys[index].mask; + struct tcf_pedit_parms *parms; + u32 mask; + + rcu_read_lock(); + parms = to_pedit_parms(a); + mask = parms->tcfp_keys[index].mask; + rcu_read_unlock(); + + return mask; } static inline u32 tcf_pedit_val(const struct tc_action *a, int index) { - return to_pedit(a)->tcfp_keys[index].val; + struct tcf_pedit_parms *parms; + u32 val; + + rcu_read_lock(); + parms = to_pedit_parms(a); + val = parms->tcfp_keys[index].val; + rcu_read_unlock(); + + return val; } static inline u32 tcf_pedit_offset(const struct tc_action *a, int index) { - return to_pedit(a)->tcfp_keys[index].off; + struct tcf_pedit_parms *parms; + u32 off; + + rcu_read_lock(); + parms = to_pedit_parms(a); + off = parms->tcfp_keys[index].off; + rcu_read_unlock(); + + return off; } #endif /* __NET_TC_PED_H */ diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index 6d1e26b709b5..283bde711a42 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -10,10 +10,13 @@ struct tcf_police_params { s64 tcfp_burst; u32 tcfp_mtu; s64 tcfp_mtu_ptoks; + s64 tcfp_pkt_burst; struct psched_ratecfg rate; bool rate_present; struct psched_ratecfg peak; bool peak_present; + struct psched_pktrate ppsrate; + bool pps_present; struct rcu_head rcu; }; @@ -24,6 +27,7 @@ struct tcf_police { spinlock_t tcfp_lock ____cacheline_aligned_in_smp; s64 tcfp_toks; s64 tcfp_ptoks; + s64 tcfp_pkttoks; 
s64 tcfp_t_c; }; @@ -97,6 +101,54 @@ static inline u32 tcf_police_burst(const struct tc_action *act) return burst; } +static inline u64 tcf_police_rate_pkt_ps(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->ppsrate.rate_pkts_ps; +} + +static inline u32 tcf_police_burst_pkt(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + u32 burst; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + + /* + * "rate" pkts "burst" nanoseconds + * ------------ * ------------------- + * 1 second 2^6 ticks + * + * ------------------------------------ + * NSEC_PER_SEC nanoseconds + * ------------------------ + * 2^6 ticks + * + * "rate" pkts "burst" nanoseconds 2^6 ticks + * = ------------ * ------------------- * ------------------------ + * 1 second 2^6 ticks NSEC_PER_SEC nanoseconds + * + * "rate" * "burst" + * = ---------------- pkts/nanosecond + * NSEC_PER_SEC^2 + * + * + * "rate" * "burst" + * = ---------------- pkts/second + * NSEC_PER_SEC + */ + burst = div_u64(params->tcfp_pkt_burst * params->ppsrate.rate_pkts_ps, + NSEC_PER_SEC); + + return burst; +} + static inline u32 tcf_police_tcfp_mtu(const struct tc_action *act) { struct tcf_police *police = to_police(act); @@ -107,4 +159,34 @@ static inline u32 tcf_police_tcfp_mtu(const struct tc_action *act) return params->tcfp_mtu; } +static inline u64 tcf_police_peakrate_bytes_ps(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->peak.rate_bytes_ps; +} + +static inline u32 tcf_police_tcfp_ewma_rate(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->tcfp_ewma_rate; +} + +static inline u16 tcf_police_rate_overhead(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->rate.overhead; +} + #endif /* __NET_TC_POLICE_H */ diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 00bfee70609e..9649600fb3dc 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -17,6 +17,7 @@ struct tcf_skbedit_params { u32 mark; u32 mask; u16 queue_mapping; + u16 mapping_mod; u16 ptype; struct rcu_head rcu; }; @@ -94,4 +95,45 @@ static inline u32 tcf_skbedit_priority(const struct tc_action *a) return priority; } +static inline u16 tcf_skbedit_rx_queue_mapping(const struct tc_action *a) +{ + u16 rx_queue; + + rcu_read_lock(); + rx_queue = rcu_dereference(to_skbedit(a)->params)->queue_mapping; + rcu_read_unlock(); + + return rx_queue; +} + +/* Return true iff action is queue_mapping */ +static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a) +{ + return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING); +} + +/* Return true if action is on ingress traffic */ +static inline bool is_tcf_skbedit_ingress(u32 flags) +{ + return flags & TCA_ACT_FLAGS_AT_INGRESS; +} + +static inline bool is_tcf_skbedit_tx_queue_mapping(const 
struct tc_action *a) +{ + return is_tcf_skbedit_queue_mapping(a) && + !is_tcf_skbedit_ingress(a->tcfa_flags); +} + +static inline bool is_tcf_skbedit_rx_queue_mapping(const struct tc_action *a) +{ + return is_tcf_skbedit_queue_mapping(a) && + is_tcf_skbedit_ingress(a->tcfa_flags); +} + +/* Return true iff action is inheritdsfield */ +static inline bool is_tcf_skbedit_inheritdsfield(const struct tc_action *a) +{ + return is_tcf_skbedit_with_flag(a, SKBEDIT_F_INHERITDSFIELD); +} + #endif /* __NET_TC_SKBEDIT_H */ diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index f051046ba034..904eddfc1826 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -16,6 +16,7 @@ struct tcf_vlan_params { u16 tcfv_push_vid; __be16 tcfv_push_proto; u8 tcfv_push_prio; + bool tcfv_push_prio_exists; struct rcu_head rcu; }; @@ -77,4 +78,14 @@ static inline u8 tcf_vlan_push_prio(const struct tc_action *a) return tcfv_push_prio; } + +static inline void tcf_vlan_push_eth(unsigned char *src, unsigned char *dest, + const struct tc_action *a) +{ + rcu_read_lock(); + memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_dst, ETH_ALEN); + memcpy(src, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN); + rcu_read_unlock(); +} + #endif /* __NET_TC_VLAN_H */ diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h new file mode 100644 index 000000000000..ffe58a02537c --- /dev/null +++ b/include/net/tc_wrapper.h @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_TC_WRAPPER_H +#define __NET_TC_WRAPPER_H + +#include <net/pkt_cls.h> + +#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) + +#include <linux/cpufeature.h> +#include <linux/static_key.h> +#include <linux/indirect_call_wrapper.h> + +#define TC_INDIRECT_SCOPE + +extern struct static_key_false tc_skip_wrapper; + +/* TC Actions */ +#ifdef CONFIG_NET_CLS_ACT + +#define TC_INDIRECT_ACTION_DECLARE(fname) \ + INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb, \ + const struct tc_action *a, \ + struct tcf_result *res)) + +TC_INDIRECT_ACTION_DECLARE(tcf_bpf_act); +TC_INDIRECT_ACTION_DECLARE(tcf_connmark_act); +TC_INDIRECT_ACTION_DECLARE(tcf_csum_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ct_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ctinfo_act); +TC_INDIRECT_ACTION_DECLARE(tcf_gact_act); +TC_INDIRECT_ACTION_DECLARE(tcf_gate_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ife_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ipt_act); +TC_INDIRECT_ACTION_DECLARE(tcf_mirred_act); +TC_INDIRECT_ACTION_DECLARE(tcf_mpls_act); +TC_INDIRECT_ACTION_DECLARE(tcf_nat_act); +TC_INDIRECT_ACTION_DECLARE(tcf_pedit_act); +TC_INDIRECT_ACTION_DECLARE(tcf_police_act); +TC_INDIRECT_ACTION_DECLARE(tcf_sample_act); +TC_INDIRECT_ACTION_DECLARE(tcf_simp_act); +TC_INDIRECT_ACTION_DECLARE(tcf_skbedit_act); +TC_INDIRECT_ACTION_DECLARE(tcf_skbmod_act); +TC_INDIRECT_ACTION_DECLARE(tcf_vlan_act); +TC_INDIRECT_ACTION_DECLARE(tunnel_key_act); + +static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + if (static_branch_likely(&tc_skip_wrapper)) + goto skip; + +#if IS_BUILTIN(CONFIG_NET_ACT_GACT) + if (a->ops->act == tcf_gact_act) + return tcf_gact_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_MIRRED) + if (a->ops->act == tcf_mirred_act) + return tcf_mirred_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_PEDIT) + if (a->ops->act == tcf_pedit_act) + return tcf_pedit_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SKBEDIT) + if (a->ops->act == tcf_skbedit_act) + return 
tcf_skbedit_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SKBMOD) + if (a->ops->act == tcf_skbmod_act) + return tcf_skbmod_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_POLICE) + if (a->ops->act == tcf_police_act) + return tcf_police_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_BPF) + if (a->ops->act == tcf_bpf_act) + return tcf_bpf_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CONNMARK) + if (a->ops->act == tcf_connmark_act) + return tcf_connmark_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CSUM) + if (a->ops->act == tcf_csum_act) + return tcf_csum_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CT) + if (a->ops->act == tcf_ct_act) + return tcf_ct_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CTINFO) + if (a->ops->act == tcf_ctinfo_act) + return tcf_ctinfo_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_GATE) + if (a->ops->act == tcf_gate_act) + return tcf_gate_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_MPLS) + if (a->ops->act == tcf_mpls_act) + return tcf_mpls_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_NAT) + if (a->ops->act == tcf_nat_act) + return tcf_nat_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_TUNNEL_KEY) + if (a->ops->act == tunnel_key_act) + return tunnel_key_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_VLAN) + if (a->ops->act == tcf_vlan_act) + return tcf_vlan_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_IFE) + if (a->ops->act == tcf_ife_act) + return tcf_ife_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SIMP) + if (a->ops->act == tcf_simp_act) + return tcf_simp_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SAMPLE) + if (a->ops->act == tcf_sample_act) + return tcf_sample_act(skb, a, res); +#endif + +skip: + return a->ops->act(skb, a, res); +} + +#endif /* CONFIG_NET_CLS_ACT */ + +/* TC Filters */ +#ifdef CONFIG_NET_CLS + +#define TC_INDIRECT_FILTER_DECLARE(fname) \ + INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb, \ + const struct tcf_proto *tp, \ + struct tcf_result *res)) + +TC_INDIRECT_FILTER_DECLARE(basic_classify); +TC_INDIRECT_FILTER_DECLARE(cls_bpf_classify); +TC_INDIRECT_FILTER_DECLARE(cls_cgroup_classify); +TC_INDIRECT_FILTER_DECLARE(fl_classify); +TC_INDIRECT_FILTER_DECLARE(flow_classify); +TC_INDIRECT_FILTER_DECLARE(fw_classify); +TC_INDIRECT_FILTER_DECLARE(mall_classify); +TC_INDIRECT_FILTER_DECLARE(route4_classify); +TC_INDIRECT_FILTER_DECLARE(u32_classify); + +static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + if (static_branch_likely(&tc_skip_wrapper)) + goto skip; + +#if IS_BUILTIN(CONFIG_NET_CLS_BPF) + if (tp->classify == cls_bpf_classify) + return cls_bpf_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_U32) + if (tp->classify == u32_classify) + return u32_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FLOWER) + if (tp->classify == fl_classify) + return fl_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FW) + if (tp->classify == fw_classify) + return fw_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_MATCHALL) + if (tp->classify == mall_classify) + return mall_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_BASIC) + if (tp->classify == basic_classify) + return basic_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_CGROUP) + if (tp->classify == cls_cgroup_classify) + return cls_cgroup_classify(skb, tp, res); +#endif +#if 
IS_BUILTIN(CONFIG_NET_CLS_FLOW) + if (tp->classify == flow_classify) + return flow_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_ROUTE4) + if (tp->classify == route4_classify) + return route4_classify(skb, tp, res); +#endif + +skip: + return tp->classify(skb, tp, res); +} + +#endif /* CONFIG_NET_CLS */ + +static inline void tc_wrapper_init(void) +{ +#ifdef CONFIG_X86 + if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + static_branch_enable(&tc_skip_wrapper); +#endif +} + +#else + +#define TC_INDIRECT_SCOPE static + +static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + return a->ops->act(skb, a, res); +} + +static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + return tp->classify(skb, tp, res); +} + +static inline void tc_wrapper_init(void) +{ +} + +#endif + +#endif /* __NET_TC_WRAPPER_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d4ef5bf94168..5078ad868fee 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -26,6 +26,7 @@ #include <linux/kref.h> #include <linux/ktime.h> #include <linux/indirect_call_wrapper.h> +#include <linux/bits.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> @@ -37,9 +38,11 @@ #include <net/snmp.h> #include <net/ip.h> #include <net/tcp_states.h> +#include <net/tcp_ao.h> #include <net/inet_ecn.h> #include <net/dst.h> #include <net/mptcp.h> +#include <net/xfrm.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> @@ -48,7 +51,11 @@ extern struct inet_hashinfo tcp_hashinfo; -extern struct percpu_counter tcp_orphan_count; +DECLARE_PER_CPU(unsigned int, tcp_orphan_count); +int tcp_orphan_count_sum(void); + +DECLARE_PER_CPU(u32, tcp_tw_isn); + void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) @@ -129,6 +136,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ +static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); + #if HZ >= 100 #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */ #define TCP_ATO_MIN ((unsigned)(HZ/25)) @@ -136,9 +145,13 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif -#define TCP_RTO_MAX ((unsigned)(120*HZ)) -#define TCP_RTO_MIN ((unsigned)(HZ/5)) +#define TCP_RTO_MAX_SEC 120 +#define TCP_RTO_MAX ((unsigned)(TCP_RTO_MAX_SEC * HZ)) +#define TCP_RTO_MIN ((unsigned)(HZ / 5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ + +#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */ + #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the @@ -159,9 +172,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 -#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ +/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds + * to avoid overflows. This assumes a clock smaller than 1 Mhz. + * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz. 
+ */ +#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC) -#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN @@ -184,6 +200,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ +#define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ @@ -251,6 +268,8 @@ extern long sysctl_tcp_mem[3]; #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ extern atomic_long_t tcp_memory_allocated; +DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); + extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; @@ -280,30 +299,19 @@ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) return seq3 - seq2 >= seq1 - seq2; } -static inline bool tcp_out_of_memory(struct sock *sk) +static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { - if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) - return true; - return false; + sk_wmem_queued_add(sk, -skb->truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, skb->truesize); + else + sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb))); + __kfree_skb(skb); } void sk_forced_mem_schedule(struct sock *sk, int size); -static inline bool tcp_too_many_orphans(struct sock *sk, int shift) -{ - struct percpu_counter *ocp = sk->sk_prot->orphan_count; - int orphans = percpu_counter_read_positive(ocp); - - if (orphans << shift > sysctl_tcp_max_orphans) { - orphans = percpu_counter_sum_positive(ocp); - if (orphans << shift > sysctl_tcp_max_orphans) - return true; - } - return false; -} - -bool tcp_check_oom(struct sock *sk, int shift); +bool tcp_check_oom(const struct sock *sk, int shift); extern struct proto tcp_prot; @@ -322,39 +330,41 @@ void tcp_shutdown(struct sock *sk, int how); int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); -int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); +void tcp_remove_empty_skb(struct sock *sk); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); -int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, - int flags); -int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, - size_t size, int flags); -ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, - size_t size, int flags); +int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, + size_t size, struct ubuf_info *uarg); +void tcp_splice_eof(struct socket *sock); int tcp_send_mss(struct sock *sk, int *size_goal, int flags); +int tcp_wmem_schedule(struct sock *sk, int copy); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); -int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); +int tcp_ioctl(struct sock *sk, int cmd, int *karg); +enum 
skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); +void tcp_twsk_purge(struct list_head *net_exit_list); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, + bool force_schedule); -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks); -static inline void tcp_dec_quickack_mode(struct sock *sk, - const unsigned int pkts) +static inline void tcp_dec_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.quick) { + /* How many ACKs S/ACKing new data have we sent? */ + const unsigned int pkts = inet_csk_ack_scheduled(sk) ? 1 : 0; + if (pkts >= icsk->icsk_ack.quick) { icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ @@ -364,48 +374,99 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, } } -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 -#define TCP_ECN_SEEN 8 +#define TCP_ECN_MODE_RFC3168 BIT(0) +#define TCP_ECN_QUEUE_CWR BIT(1) +#define TCP_ECN_DEMAND_CWR BIT(2) +#define TCP_ECN_SEEN BIT(3) +#define TCP_ECN_MODE_ACCECN BIT(4) + +#define TCP_ECN_DISABLED 0 +#define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) +#define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + +static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) +{ + return tp->ecn_flags & TCP_ECN_MODE_ANY; +} + +static inline bool tcp_ecn_mode_rfc3168(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_RFC3168; +} + +static inline bool tcp_ecn_mode_accecn(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_ACCECN; +} + +static inline bool tcp_ecn_disabled(const struct tcp_sock *tp) +{ + return !tcp_ecn_mode_any(tp); +} + +static inline bool tcp_ecn_mode_pending(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_PENDING) == TCP_ECN_MODE_PENDING; +} + +static inline void tcp_ecn_mode_set(struct tcp_sock *tp, u8 mode) +{ + tp->ecn_flags &= ~TCP_ECN_MODE_ANY; + tp->ecn_flags |= mode; +} enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, TCP_TW_ACK = 2, - TCP_TW_SYN = 3 + TCP_TW_SYN = 3, + TCP_TW_ACK_OOW = 4 }; enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th); + const struct tcphdr *th, + u32 *tw_isn, + enum skb_drop_reason *drop_reason); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, - bool *lost_race); -int tcp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb); + bool *lost_race, enum skb_drop_reason *drop_reason); +enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); -void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag); +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct 
dst_entry *dst); +void __tcp_close(struct sock *sk, long timeout); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); +int do_tcp_getsockopt(struct sock *sk, int level, + int optname, sockptr_t optval, sockptr_t optlen); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); +bool tcp_bpf_bypass_getsockopt(int level, int optname); +int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); +void tcp_reset_keepalive_timer(struct sock *sk, unsigned long timeout); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); -int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); +int tcp_set_window_clamp(struct sock *sk, int val); +void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss); +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss); void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, @@ -414,7 +475,6 @@ int tcp_mmap(struct file *file, struct socket *sock, void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); -const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); /* * BPF SKB-less helpers @@ -423,6 +483,7 @@ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie); +u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss); u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th); @@ -466,12 +527,30 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst, u32 tsoff); -int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, - u32 cookie); + struct dst_entry *dst); +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, - struct sock *sk, struct sk_buff *skb); + struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *tcp_opt, + int mss, u32 tsoff); + +#if IS_ENABLED(CONFIG_BPF) +struct bpf_tcp_req_attrs { + u32 rcv_tsval; + u32 rcv_tsecr; + u16 mss; + u8 rcv_wscale; + u8 snd_wscale; + u8 ecn_ok; + u8 wscale_ok; + u8 sack_ok; + u8 tstamp_ok; + u8 usec_ts_ok; + u8 reserved[3]; +}; +#endif + #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. 
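The hunk below rate-limits the overflow-timestamp write in tcp_synq_overflow() with time_between32(); the same wrap-safe unsigned-delta idea underlies the 60-second syncookie counter named in the context line above. As a rough user-space model of that counter (a sketch only: the helper names, COOKIE_PERIOD_SECS, and the two-epoch validity window are illustrative, not the kernel's identifiers):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    /* One cookie "epoch" lasts 60 seconds; a cookie is honoured only while
     * the counter has advanced by at most one step since it was minted.
     */
    #define COOKIE_PERIOD_SECS 60
    #define COOKIE_MAX_AGE      2   /* accept current and previous epoch */

    static uint32_t cookie_epoch(time_t now)
    {
            return (uint32_t)(now / COOKIE_PERIOD_SECS);
    }

    static bool cookie_epoch_valid(uint32_t sent_epoch, time_t now)
    {
            uint32_t cur = cookie_epoch(now);

            /* Unsigned subtraction: a future or too-old epoch yields a
             * huge delta, so the comparison is wrap-safe.
             */
            return cur - sent_epoch < COOKIE_MAX_AGE;
    }

    int main(void)
    {
            time_t now = time(NULL);

            printf("fresh cookie:   valid=%d\n",
                   cookie_epoch_valid(cookie_epoch(now), now));
            printf("3-minute-old:   valid=%d\n",
                   cookie_epoch_valid(cookie_epoch(now - 180), now));
            return 0;
    }

The unsigned-delta test is the same trick time_between32() relies on in tcp_synq_overflow() just below, where the per-socket overflow stamp is rewritten at most once per HZ to keep the listener's cache line quiet under SYN floods.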
@@ -509,7 +588,7 @@ static inline void tcp_synq_overflow(const struct sock *sk) last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); if (!time_between32(now, last_overflow, last_overflow + HZ)) - WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now); + WRITE_ONCE(tcp_sk_rw(sk)->rx_opt.ts_recent_stamp, now); } /* syncookies: no recent synqueue overflow on this listening socket? */ @@ -551,18 +630,50 @@ static inline u32 tcp_cookie_time(void) return val; } +/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ +static inline u64 tcp_ns_to_ts(bool usec_ts, u64 val) +{ + if (usec_ts) + return div_u64(val, NSEC_PER_USEC); + + return div_u64(val, NSEC_PER_MSEC); +} + u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); -bool cookie_ecn_ok(const struct tcp_options_received *opt, - const struct net *net, const struct dst_entry *dst); + +static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *dst) +{ + return READ_ONCE(net->ipv4.sysctl_tcp_ecn) || + dst_feature(dst, RTAX_FEATURE_ECN); +} + +#if IS_ENABLED(CONFIG_BPF) +static inline bool cookie_bpf_ok(struct sk_buff *skb) +{ + return skb->sk; +} + +struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb); +#else +static inline bool cookie_bpf_ok(struct sk_buff *skb) +{ + return false; +} + +static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return NULL; +} +#endif /* From net/ipv6/syncookies.c */ -int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, - u32 cookie); +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, @@ -571,6 +682,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ +void tcp_skb_entail(struct sock *sk, struct sk_buff *skb); +void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); @@ -589,13 +702,13 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); -void tcp_send_partial(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); -void tcp_send_active_reset(struct sock *sk, gfp_t priority); +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); -void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); @@ -606,9 +719,24 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); -void tcp_reset(struct sock *sk); -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); +void tcp_done_with_error(struct sock *sk, int err); 
+void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_fin(struct sock *sk); +void tcp_check_space(struct sock *sk); +void tcp_sack_compress_send_ack(struct sock *sk); + +static inline void tcp_cleanup_skb(struct sk_buff *skb) +{ + skb_dst_drop(skb); + secpath_reset(skb); +} + +static inline void tcp_add_receive_queue(struct sock *sk, struct sk_buff *skb) +{ + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(secpath_exists(skb)); + __skb_queue_tail(&sk->sk_receive_queue, skb); +} /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); @@ -625,6 +753,7 @@ static inline void tcp_clear_xmit_timers(struct sock *sk) unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); unsigned int tcp_current_mss(struct sock *sk); +u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when); /* Bound MSS / TSO packet size with the half of the window */ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) @@ -655,6 +784,12 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq); +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); +struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); +void tcp_read_done(struct sock *sk, size_t len); void tcp_initialize_rcv_mss(struct sock *sk); @@ -662,10 +797,14 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -static inline void tcp_bound_rto(const struct sock *sk) +static inline unsigned int tcp_rto_max(const struct sock *sk) +{ + return READ_ONCE(inet_csk(sk)->icsk_rto_max); +} + +static inline void tcp_bound_rto(struct sock *sk) { - if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) - inet_csk(sk)->icsk_rto = TCP_RTO_MAX; + inet_csk(sk)->icsk_rto = min(inet_csk(sk)->icsk_rto, tcp_rto_max(sk)); } static inline u32 __tcp_set_rto(const struct tcp_sock *tp) @@ -675,6 +814,10 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { + /* mptcp hooks are only on the slow path */ + if (sk_is_mptcp((struct sock *)tp)) + return; + tp->pred_flags = htonl((tp->tcp_header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd); @@ -696,18 +839,20 @@ static inline void tcp_fast_path_check(struct sock *sk) tcp_fast_path_on(tp); } +u32 tcp_delack_max(const struct sock *sk); + /* Compute the actual rto_min value */ -static inline u32 tcp_rto_min(struct sock *sk) +static inline u32 tcp_rto_min(const struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = inet_csk(sk)->icsk_rto_min; + u32 rto_min = READ_ONCE(inet_csk(sk)->icsk_rto_min); if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); return rto_min; } -static inline u32 tcp_rto_min_us(struct sock *sk) +static inline u32 tcp_rto_min_us(const struct sock *sk) { return jiffies_to_usecs(tcp_rto_min(sk)); } @@ -767,22 +912,31 @@ static inline u64 tcp_clock_us(void) return div_u64(tcp_clock_ns(), NSEC_PER_USEC); } -/* This should only be used in contexts where tp->tcp_mstamp is up to date */ -static inline u32 tcp_time_stamp(const struct tcp_sock *tp) +static inline u64 tcp_clock_ms(void) +{ + return div_u64(tcp_clock_ns(), NSEC_PER_MSEC); +} + +/* TCP Timestamp included in TS option (RFC 1323) 
can either use ms + * or usec resolution. Each socket carries a flag to select one or other + * resolution, as the route attribute could change anytime. + * Each flow must stick to initial resolution. + */ +static inline u32 tcp_clock_ts(bool usec_ts) { - return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); + return usec_ts ? tcp_clock_us() : tcp_clock_ms(); } -/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ -static inline u32 tcp_ns_to_ts(u64 ns) +static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) { - return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); + return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); } -/* Could use tcp_clock_us() / 1000, but this version uses a single divide */ -static inline u32 tcp_time_stamp_raw(void) +static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp) { - return tcp_ns_to_ts(tcp_clock_ns()); + if (tp->tcp_usec_ts) + return tp->tcp_mstamp; + return tcp_time_stamp_ms(tp); } void tcp_mstamp_refresh(struct tcp_sock *tp); @@ -792,31 +946,64 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } -static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) -{ - return tcp_ns_to_ts(skb->skb_mstamp_ns); -} - /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } +/* Provide skb TSval in usec or ms unit */ +static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb) +{ + if (usec_ts) + return tcp_skb_timestamp_us(skb); -#define tcp_flag_byte(th) (((u_int8_t *)th)[13]) + return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC); +} -#define TCPHDR_FIN 0x01 -#define TCPHDR_SYN 0x02 -#define TCPHDR_RST 0x04 -#define TCPHDR_PSH 0x08 -#define TCPHDR_ACK 0x10 -#define TCPHDR_URG 0x20 -#define TCPHDR_ECE 0x40 -#define TCPHDR_CWR 0x80 +static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) +{ + return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset; +} +static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) +{ + return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off; +} + +#define tcp_flag_byte(th) (((u_int8_t *)th)[13]) + +#define TCPHDR_FIN BIT(0) +#define TCPHDR_SYN BIT(1) +#define TCPHDR_RST BIT(2) +#define TCPHDR_PSH BIT(3) +#define TCPHDR_ACK BIT(4) +#define TCPHDR_URG BIT(5) +#define TCPHDR_ECE BIT(6) +#define TCPHDR_CWR BIT(7) +#define TCPHDR_AE BIT(8) +#define TCPHDR_FLAGS_MASK (TCPHDR_FIN | TCPHDR_SYN | TCPHDR_RST | \ + TCPHDR_PSH | TCPHDR_ACK | TCPHDR_URG | \ + TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) +#define tcp_flags_ntohs(th) (ntohs(*(__be16 *)&tcp_flag_word(th)) & \ + TCPHDR_FLAGS_MASK) + +#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +/* State flags for sacked in struct tcp_skb_cb */ +enum tcp_skb_cb_sacked_flags { + TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ + TCPCB_SACKED_RETRANS = (1 << 1), /* SKB retransmitted */ + TCPCB_LOST = (1 << 2), /* SKB is lost */ + TCPCB_TAGBITS = (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS | + TCPCB_LOST), /* All tag bits */ + TCPCB_REPAIRED = (1 << 4), /* SKB repaired (no skb_mstamp_ns) */ + TCPCB_EVER_RETRANS = (1 << 7), /* Ever retransmitted frame */ + TCPCB_RETRANS = (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS | + TCPCB_REPAIRED), +}; + /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. 
* We also store the host-order sequence numbers in here too. @@ -827,42 +1014,33 @@ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { - /* Note : tcp_tw_isn is used in input path only - * (isn chosen by tcp_timewait_state_process()) - * + /* Note : * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ - __u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; }; }; - __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ + __u16 tcp_flags; /* TCP header flags (tcp[12-13])*/ __u8 sacked; /* State flags for SACK. */ -#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ -#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ -#define TCPCB_LOST 0x04 /* SKB is lost */ -#define TCPCB_TAGBITS 0x07 /* All tag bits */ -#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */ -#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ -#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ - TCPCB_REPAIRED) - __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ - __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ +#define TSTAMP_ACK_SK 0x1 +#define TSTAMP_ACK_BPF 0x2 + __u8 txstamp_ack:2, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ has_rxtstamp:1, /* SKB has a RX timestamp */ - unused:5; + unused:4; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { +#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) /* There is space for up to 24 bytes */ - __u32 in_flight:30,/* Bytes in flight at transmit */ - is_app_limited:1, /* cwnd not fully used? */ - unused:1; + __u32 is_app_limited:1, /* cwnd not fully used? */ + delivered_ce:20, + unused:11; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -876,36 +1054,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ - struct { - __u32 flags; - struct sock *sk_redir; - void *data_end; - } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} - extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) @@ -938,7 +1091,7 @@ extern const struct inet_connection_sock_af_ops ipv6_specific; INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); -INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *skb)); +void tcp_v6_early_demux(struct sk_buff *skb); #endif @@ -984,8 +1137,18 @@ static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { + /* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */ return likely(tcp_skb_can_collapse_to(to) && - mptcp_skb_can_collapse(to, from)); + mptcp_skb_can_collapse(to, from) && + skb_pure_zcopy_same(to, from) && + skb_frags_readable(to) == skb_frags_readable(from)); +} + +static inline bool 
tcp_skb_can_collapse_rx(const struct sk_buff *to, + const struct sk_buff *from) +{ + return likely(mptcp_skb_can_collapse(to, from) && + !skb_cmp_decrypted(to, from)); } /* Events passed to congestion control interface */ @@ -1015,9 +1178,9 @@ enum tcp_ca_ack_event_flags { #define TCP_CA_UNSPEC 0 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ -#define TCP_CONG_NON_RESTRICTED 0x1 +#define TCP_CONG_NON_RESTRICTED BIT(0) /* Requires ECN/ECT set on all packets */ -#define TCP_CONG_NEEDS_ECN 0x2 +#define TCP_CONG_NEEDS_ECN BIT(1) #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; @@ -1039,7 +1202,9 @@ struct ack_sample { struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ + s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ @@ -1047,53 +1212,69 @@ struct rate_sample { int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ + u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ }; struct tcp_congestion_ops { - struct list_head list; - u32 key; - u32 flags; - - /* initialize private data (optional) */ - void (*init)(struct sock *sk); - /* cleanup private data (optional) */ - void (*release)(struct sock *sk); +/* fast path fields are put first to fill one cache line */ /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); + /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); + /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); + /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); - /* new value of cwnd after loss (required) */ - u32 (*undo_cwnd)(struct sock *sk); + /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - u32 (*sndbuf_expand)(struct sock *sk); + /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. 
(optional) */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); + + + /* new value of cwnd after loss (required) */ + u32 (*undo_cwnd)(struct sock *sk); + /* returns the multiplier used in tcp_sndbuf_expand (optional) */ + u32 (*sndbuf_expand)(struct sock *sk); + +/* control/slow paths put last */ /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); - char name[TCP_CA_NAME_MAX]; - struct module *owner; -}; + char name[TCP_CA_NAME_MAX]; + struct module *owner; + struct list_head list; + u32 key; + u32 flags; + + /* initialize private data (optional) */ + void (*init)(struct sock *sk); + /* cleanup private data (optional) */ + void (*release)(struct sock *sk); +} ____cacheline_aligned_in_smp; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); +int tcp_update_congestion_control(struct tcp_congestion_ops *type, + struct tcp_congestion_ops *old_type); +int tcp_validate_congestion_control(struct tcp_congestion_ops *ca); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); @@ -1115,7 +1296,7 @@ extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else @@ -1132,15 +1313,6 @@ static inline bool tcp_ca_needs_ecn(const struct sock *sk) return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; } -static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - if (icsk->icsk_ca_ops->set_state) - icsk->icsk_ca_ops->set_state(sk, ca_state); - icsk->icsk_ca_state = ca_state; -} - static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1149,6 +1321,9 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } +/* From tcp_cong.c */ +void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + /* From tcp_rate.c */ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, @@ -1157,6 +1332,11 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); +static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) +{ + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); +} + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. 
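The reshuffled struct tcp_congestion_ops above packs the fast-path hooks into the leading cache line and pushes registration state to the tail; only ssthresh, cong_avoid and undo_cwnd are required. For orientation, a hypothetical Reno-style module skeleton against this layout. This is a sketch only: it assumes the stock tcp_slow_start()/tcp_cong_avoid_ai() helpers and the tp->prior_cwnd field, which live in the same headers but are not shown in this diff.

#include <linux/module.h>
#include <net/tcp.h>

static u32 toy_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* Halve cwnd on loss, Reno style, but never below 2 segments. */
	return max(tcp_snd_cwnd(tp) >> 1U, 2U);
}

static void toy_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	/* Exponential growth in slow start, then additive increase. */
	if (tcp_in_slow_start(tp)) {
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}

static u32 toy_undo_cwnd(struct sock *sk)
{
	/* Restore cwnd after a loss detection proves spurious. */
	return max(tcp_snd_cwnd(tcp_sk(sk)), tcp_sk(sk)->prior_cwnd);
}

static struct tcp_congestion_ops toy_cong_ops __read_mostly = {
	/* required fast-path hooks, first cache line */
	.ssthresh	= toy_ssthresh,
	.cong_avoid	= toy_cong_avoid,
	.undo_cwnd	= toy_undo_cwnd,
	/* control/slow-path tail */
	.name		= "toy",
	.owner		= THIS_MODULE,
};

static int __init toy_init(void)
{
	return tcp_register_congestion_control(&toy_cong_ops);
}

static void __exit toy_exit(void)
{
	tcp_unregister_congestion_control(&toy_cong_ops);
}

module_init(toy_init);
module_exit(toy_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Illustrative Reno-style congestion control skeleton");

Once such a module is loaded, it would be selected the usual way, e.g. sysctl net.ipv4.tcp_congestion_control=toy, subject to the TCP_CONG_NON_RESTRICTED rules above.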
@@ -1200,9 +1380,20 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) #define TCP_INFINITE_SSTHRESH 0x7fffffff +static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp) +{ + return tp->snd_cwnd; +} + +static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val) +{ + WARN_ON_ONCE((int)val <= 0); + tp->snd_cwnd = val; +} + static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { - return tp->snd_cwnd < tp->snd_ssthresh; + return tcp_snd_cwnd(tp) < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) @@ -1228,8 +1419,8 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, - ((tp->snd_cwnd >> 1) + - (tp->snd_cwnd >> 2))); + ((tcp_snd_cwnd(tp) >> 1) + + (tcp_snd_cwnd(tp) >> 2))); } /* Use define here intentionally to get WARN_ON location shown at the caller */ @@ -1269,11 +1460,14 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + if (tp->is_cwnd_limited) + return true; + /* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) - return tp->snd_cwnd < 2 * tp->max_packets_out; + return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out; - return tp->is_cwnd_limited; + return false; } /* BBR congestion control needs pacing. @@ -1300,10 +1494,12 @@ static inline unsigned long tcp_pacing_delay(const struct sock *sk) static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, - const unsigned long max_when) + bool pace_delay) { - inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), - max_when); + if (pace_delay) + when += tcp_pacing_delay(sk); + inet_csk_reset_xmit_timer(sk, what, when, + tcp_rto_max(sk)); } /* Something is really bad, we could not queue an additional packet, @@ -1321,7 +1517,9 @@ static inline unsigned long tcp_probe0_base(const struct sock *sk) static inline unsigned long tcp_probe0_when(const struct sock *sk, unsigned long max_when) { - u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff; + u8 backoff = min_t(u8, ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1, + inet_csk(sk)->icsk_backoff); + u64 when = (u64)tcp_probe0_base(sk) << backoff; return (unsigned long)min_t(u64, when, max_when); } @@ -1330,7 +1528,7 @@ static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_base(sk), TCP_RTO_MAX); + tcp_probe0_base(sk), true); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) @@ -1358,7 +1556,10 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) __skb_checksum_complete(skb); } -bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason); + + int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); @@ -1378,8 +1579,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); s32 delta; - if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out || - ca_ops->cong_control) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) || + tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) @@ -1392,13 +1593,40 @@ void tcp_select_initial_window(const 
struct sock *sk, int __space, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); +static inline int __tcp_win_from_space(u8 scaling_ratio, int space) +{ + s64 scaled_space = (s64)space * scaling_ratio; + + return scaled_space >> TCP_RMEM_TO_WIN_SCALE; +} + static inline int tcp_win_from_space(const struct sock *sk, int space) { - int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale; + return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space); +} + +/* inverse of __tcp_win_from_space() */ +static inline int __tcp_space_from_win(u8 scaling_ratio, int win) +{ + u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; - return tcp_adv_win_scale <= 0 ? - (space>>(-tcp_adv_win_scale)) : - space - (space>>tcp_adv_win_scale); + do_div(val, scaling_ratio); + return val; +} + +static inline int tcp_space_from_win(const struct sock *sk, int win) +{ + return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); +} + +/* Assume a 50% default for skb->len/skb->truesize ratio. + * This may be adjusted later in tcp_measure_rcv_mss(). + */ +#define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1)) + +static inline void tcp_scaling_ratio_init(struct sock *sk) +{ + tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; } /* Note: caller must be prepared to deal with negative returns */ @@ -1414,7 +1642,25 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } +static inline void __tcp_adjust_rcv_ssthresh(struct sock *sk, u32 new_ssthresh) +{ + int unused_mem = sk_unused_reserved_mem(sk); + struct tcp_sock *tp = tcp_sk(sk); + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, new_ssthresh); + if (unused_mem) + tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh, + tcp_win_from_space(sk, unused_mem)); +} + +static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) +{ + __tcp_adjust_rcv_ssthresh(sk, 4U * tcp_sk(sk)->advmss); +} + void tcp_cleanup_rbuf(struct sock *sk, int copied); +void __tcp_cleanup_rbuf(struct sock *sk, int copied); + /* We provision sk_rcvbuf around 200% of sk_rcvlowat. * If 87.5 % (7/8) of the space has been consumed, we want to override @@ -1423,12 +1669,29 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied); */ static inline bool tcp_rmem_pressure(const struct sock *sk) { - int rcvbuf = READ_ONCE(sk->sk_rcvbuf); - int threshold = rcvbuf - (rcvbuf >> 3); + int rcvbuf, threshold; + + if (tcp_under_memory_pressure(sk)) + return true; + + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + threshold = rcvbuf - (rcvbuf >> 3); return atomic_read(&sk->sk_rmem_alloc) > threshold; } +static inline bool tcp_epollin_ready(const struct sock *sk, int target) +{ + const struct tcp_sock *tp = tcp_sk(sk); + int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); + + if (avail <= 0) + return false; + + return (avail >= target) || tcp_rmem_pressure(sk) || + (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss); +} + extern void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst); @@ -1439,22 +1702,38 @@ void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); + int val; - return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl; + /* Paired with WRITE_ONCE() in tcp_sock_set_keepintvl() + * and do_tcp_setsockopt(). + */ + val = READ_ONCE(tp->keepalive_intvl); + + return val ? 
: READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); } static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); + int val; + + /* Paired with WRITE_ONCE() in tcp_sock_set_keepidle_locked() */ + val = READ_ONCE(tp->keepalive_time); - return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time; + return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); + int val; - return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes; + /* Paired with WRITE_ONCE() in tcp_sock_set_keepcnt() + * and do_tcp_setsockopt(). + */ + val = READ_ONCE(tp->keepalive_probes); + + return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) @@ -1467,7 +1746,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) static inline int tcp_fin_time(const struct sock *sk) { - int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout; + int fin_timeout = tcp_sk(sk)->linger2 ? : + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout); const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) @@ -1482,7 +1762,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; if (unlikely(!time_before32(ktime_get_seconds(), - rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))) + rx_opt->ts_recent_stamp + TCP_PAWS_WRAP))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, @@ -1542,12 +1822,7 @@ static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) tp->retransmit_skb_hint = NULL; } -union tcp_md5_addr { - struct in_addr a4; -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr a6; -#endif -}; +#define tcp_md5_addr tcp_ao_addr /* - key database */ struct tcp_md5sig_key { @@ -1555,6 +1830,7 @@ struct tcp_md5sig_key { u8 keylen; u8 family; /* AF_INET or AF_INET6 */ u8 prefixlen; + u8 flags; union tcp_md5_addr addr; int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; @@ -1590,36 +1866,75 @@ union tcp_md5sum_block { #endif }; -/* - pool: digest algorithm, hash description and scratch buffer */ -struct tcp_md5sig_pool { - struct ahash_request *md5_req; - void *scratch; +/* + * struct tcp_sigpool - per-CPU pool of ahash_requests + * @scratch: per-CPU temporary area, that can be used between + * tcp_sigpool_start() and tcp_sigpool_end() to perform + * crypto request + * @req: pre-allocated ahash request + */ +struct tcp_sigpool { + void *scratch; + struct ahash_request *req; }; +int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size); +void tcp_sigpool_get(unsigned int id); +void tcp_sigpool_release(unsigned int id); +int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, + const struct sk_buff *skb, + unsigned int header_len); + +/** + * tcp_sigpool_start - disable bh and start using tcp_sigpool_ahash + * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() + * @c: returned tcp_sigpool for usage (uninitialized on failure) + * + * Returns: 0 on success, error otherwise. 
+ */ +int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); +/** + * tcp_sigpool_end - enable bh and stop using tcp_sigpool + * @c: tcp_sigpool context that was returned by tcp_sigpool_start() + */ +void tcp_sigpool_end(struct tcp_sigpool *c); +size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len); /* - functions */ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index, - const u8 *newkey, u8 newkeylen, gfp_t gfp); + int family, u8 prefixlen, int l3index, u8 flags, + const u8 *newkey, u8 newkeylen); +int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, + int family, u8 prefixlen, int l3index, + struct tcp_md5sig_key *key); + int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index); + int family, u8 prefixlen, int l3index, u8 flags); +void tcp_clear_md5_list(struct sock *sk); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG -#include <linux/jump_label.h> -extern struct static_key_false tcp_md5_needed; struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, - int family); + int family, bool any_l3index); static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { - if (!static_branch_unlikely(&tcp_md5_needed)) + if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; - return __tcp_md5_do_lookup(sk, l3index, addr, family); + return __tcp_md5_do_lookup(sk, l3index, addr, family, false); +} + +static inline struct tcp_md5sig_key * +tcp_md5_do_lookup_any_l3index(const struct sock *sk, + const union tcp_md5_addr *addr, int family) +{ + if (!static_branch_unlikely(&tcp_md5_needed.key)) + return NULL; + return __tcp_md5_do_lookup(sk, 0, addr, family, true); } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) @@ -1630,20 +1945,23 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, { return NULL; } -#define tcp_twsk_md5_key(twsk) NULL -#endif -bool tcp_alloc_md5sig_pool(void); - -struct tcp_md5sig_pool *tcp_get_md5sig_pool(void); -static inline void tcp_put_md5sig_pool(void) +static inline struct tcp_md5sig_key * +tcp_md5_do_lookup_any_l3index(const struct sock *sk, + const union tcp_md5_addr *addr, int family) { - local_bh_enable(); + return NULL; } -int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *, - unsigned int header_len); -int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, +#define tcp_twsk_md5_key(twsk) NULL +#endif + +int tcp_md5_alloc_sigpool(void); +void tcp_md5_release_sigpool(void); +void tcp_md5_add_sigpool(void); +extern int tcp_md5_sigpool_id; + +int tcp_md5_hash_key(struct tcp_sigpool *hp, const struct tcp_md5sig_key *key); /* From tcp_fastopen.c */ @@ -1688,7 +2006,6 @@ struct tcp_fastopen_context { struct rcu_head rcu; }; -extern unsigned int sysctl_tcp_fastopen_blackhole_timeout; void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); @@ -1766,11 +2083,6 @@ static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) return skb_rb_last(&sk->tcp_rtx_queue); } -static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) -{ - return 
skb_peek(&sk->sk_write_queue); -} - static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); @@ -1849,7 +2161,15 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc { list_del(&skb->tcp_tsorted_anchor); tcp_rtx_queue_unlink(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); +} + +static inline void tcp_write_collapse_fence(struct sock *sk) +{ + struct sk_buff *skb = tcp_write_queue_tail(sk); + + if (skb) + TCP_SKB_CB(skb)->eor = 1; } static inline void tcp_push_pending_frames(struct sock *sk) @@ -1909,7 +2229,7 @@ static inline bool inet_sk_transparent(const struct sock *sk) case TCP_NEW_SYN_RECV: return inet_rsk(inet_reqsk(sk))->no_srccheck; } - return inet_sk(sk)->transparent; + return inet_test_bit(TRANSPARENT, sk); } /* Determines whether this is a thin stream (which may suffer from @@ -1938,7 +2258,6 @@ struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; - struct tcp_seq_afinfo *bpf_seq_afinfo; int bucket, offset, sbucket, num; loff_t last_pos; }; @@ -1950,34 +2269,34 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb); +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)); -int tcp_gro_complete(struct sk_buff *skb); +#ifdef CONFIG_INET +void tcp_gro_complete(struct sk_buff *skb); +#else +static inline void tcp_gro_complete(struct sk_buff *skb) { } +#endif void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; -} + u32 val; -/* @wake is one when sk_stream_write_space() calls us. - * This sends EPOLLOUT only if notsent_bytes is half the limit. - * This mimics the strategy used in sock_def_write_space(). 
- */ -static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) -{ - const struct tcp_sock *tp = tcp_sk(sk); - u32 notsent_bytes = READ_ONCE(tp->write_seq) - - READ_ONCE(tp->snd_nxt); + val = READ_ONCE(tp->notsent_lowat); - return (notsent_bytes << wake) < tcp_notsent_lowat(tp); + return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } +bool tcp_stream_memory_free(const struct sock *sk, int wake); + #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); void tcp4_proc_exit(void); @@ -2002,6 +2321,18 @@ struct tcp_sock_af_ops { sockptr_t optval, int optlen); #endif +#ifdef CONFIG_TCP_AO + int (*ao_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); + struct tcp_ao_key *(*ao_lookup)(const struct sock *sk, + struct sock *addr_sk, + int sndid, int rcvid); + int (*ao_calc_key_sk)(struct tcp_ao_key *mkt, u8 *key, + const struct sock *sk, + __be32 sisn, __be32 disn, bool send); + int (*calc_ao_hash)(char *location, struct tcp_ao_key *ao, + const struct sock *sk, const struct sk_buff *skb, + const u8 *tkey, int hash_offset, u32 sne); +#endif }; struct tcp_request_sock_ops { @@ -2014,15 +2345,24 @@ struct tcp_request_sock_ops { const struct sock *sk, const struct sk_buff *skb); #endif - void (*init_req)(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb); +#ifdef CONFIG_TCP_AO + struct tcp_ao_key *(*ao_lookup)(const struct sock *sk, + struct request_sock *req, + int sndid, int rcvid); + int (*ao_calc_key)(struct tcp_ao_key *mkt, u8 *key, struct request_sock *sk); + int (*ao_synack_hash)(char *ao_hash, struct tcp_ao_key *mkt, + struct request_sock *req, const struct sk_buff *skb, + int hash_offset, u32 sne); +#endif #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif - struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, - const struct request_sock *req); + struct dst_entry *(*route_req)(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req, + u32 tw_isn); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, @@ -2055,6 +2395,70 @@ static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, } #endif +struct tcp_key { + union { + struct { + struct tcp_ao_key *ao_key; + char *traffic_key; + u32 sne; + u8 rcv_next; + }; + struct tcp_md5sig_key *md5_key; + }; + enum { + TCP_KEY_NONE = 0, + TCP_KEY_MD5, + TCP_KEY_AO, + } type; +}; + +static inline void tcp_get_current_key(const struct sock *sk, + struct tcp_key *out) +{ +#if defined(CONFIG_TCP_AO) || defined(CONFIG_TCP_MD5SIG) + const struct tcp_sock *tp = tcp_sk(sk); +#endif + +#ifdef CONFIG_TCP_AO + if (static_branch_unlikely(&tcp_ao_needed.key)) { + struct tcp_ao_info *ao; + + ao = rcu_dereference_protected(tp->ao_info, + lockdep_sock_is_held(sk)); + if (ao) { + out->ao_key = READ_ONCE(ao->current_key); + out->type = TCP_KEY_AO; + return; + } + } +#endif +#ifdef CONFIG_TCP_MD5SIG + if (static_branch_unlikely(&tcp_md5_needed.key) && + rcu_access_pointer(tp->md5sig_info)) { + out->md5_key = tp->af_specific->md5_lookup(sk, sk); + if (out->md5_key) { + out->type = TCP_KEY_MD5; + return; + } + } +#endif + out->type = TCP_KEY_NONE; +} + +static inline bool tcp_key_is_md5(const struct tcp_key *key) +{ + if (static_branch_tcp_md5()) + return key->type == TCP_KEY_MD5; + return false; +} + +static inline bool tcp_key_is_ao(const struct tcp_key *key) +{ 
+ if (static_branch_tcp_ao()) + return key->type == TCP_KEY_AO; + return false; +} + int tcpv4_offload_init(void); void tcp_v4_init(void); @@ -2065,20 +2469,69 @@ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd); -extern void tcp_rack_mark_lost(struct sock *sk); +extern bool tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); +/* tcp_plb.c */ + +/* + * Scaling factor for fractions in PLB. For example, tcp_plb_update_state + * expects cong_ratio which represents fraction of traffic that experienced + * congestion over a single RTT. In order to avoid floating point operations, + * this fraction should be mapped to (1 << TCP_PLB_SCALE) and passed in. + */ +#define TCP_PLB_SCALE 8 + +/* State for PLB (Protective Load Balancing) for a single TCP connection. */ +struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +}; + +static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +{ + plb->consec_cong_rounds = 0; + plb->pause_until = 0; +} +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio); +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); + +static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) +{ + WARN_ONCE(cond, + "%scwn:%u out:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", + str, + tcp_snd_cwnd(tcp_sk(sk)), + tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, + tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, + tcp_sk(sk)->tlp_high_seq, sk->sk_state, + inet_csk(sk)->icsk_ca_state, + tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, + inet_csk(sk)->icsk_pmtu_cookie); +} + /* At how many usecs into the future should the RTO fire? 
*/ static inline s64 tcp_rto_delta_us(const struct sock *sk) { const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; - u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); - return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; + if (likely(skb)) { + u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); + + return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; + } else { + tcp_warn_once(sk, 1, "rtx queue empty: "); + return jiffies_to_usecs(rto); + } + } /* @@ -2148,9 +2601,13 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) u16 segs_in; segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - tp->segs_in += segs_in; + + /* We update these fields while other threads might + * read them from tcp_get_info() + */ + WRITE_ONCE(tp->segs_in, tp->segs_in + segs_in); if (skb->len > tcp_hdrlen(skb)) - tp->data_segs_in += segs_in; + WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in); } /* @@ -2187,8 +2644,8 @@ struct tcp_ulp_ops { /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ - int (*get_info)(const struct sock *sk, struct sk_buff *skb); - size_t (*get_info_size)(const struct sock *sk); + int (*get_info)(struct sock *sk, struct sk_buff *skb, bool net_admin); + size_t (*get_info_size)(const struct sock *sk, bool net_admin); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, const gfp_t priority); @@ -2208,25 +2665,38 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) +#ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; -#ifdef CONFIG_BPF_STREAM_PARSER -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +#ifdef CONFIG_BPF_SYSCALL +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); +#ifdef CONFIG_BPF_STREAM_PARSER +struct strparser; +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); +#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ + +#ifdef CONFIG_INET +void tcp_eat_skb(struct sock *sk, struct sk_buff *skb); #else -static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { } -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif -#ifdef CONFIG_NET_SOCK_MSG -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, - int flags); -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags); +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) +static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ +} +#endif + #ifdef CONFIG_CGROUP_BPF static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, @@ -2257,6 +2727,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_owned_by_me(sk); } @@ -2315,7 +2786,7 @@ static inline u32 tcp_timeout_init(struct sock *sk) if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; - return timeout; + return 
min_t(int, timeout, TCP_RTO_MAX); } static inline u32 tcp_rwnd_init_bpf(struct sock *sk) @@ -2334,10 +2805,10 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } -static inline void tcp_bpf_rtt(struct sock *sk) +static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) - tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) @@ -2345,9 +2816,9 @@ extern struct static_key_false tcp_have_smc; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) -void clean_acked_data_enable(struct inet_connection_sock *icsk, +void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)); -void clean_acked_data_disable(struct inet_connection_sock *icsk); +void clean_acked_data_disable(struct tcp_sock *tp); void clean_acked_data_flush(void); #endif @@ -2373,4 +2844,59 @@ static inline u64 tcp_transmit_time(const struct sock *sk) return 0; } +static inline int tcp_parse_auth_options(const struct tcphdr *th, + const u8 **md5_hash, const struct tcp_ao_hdr **aoh) +{ + const u8 *md5_tmp, *ao_tmp; + int ret; + + ret = tcp_do_parse_auth_options(th, &md5_tmp, &ao_tmp); + if (ret) + return ret; + + if (md5_hash) + *md5_hash = md5_tmp; + + if (aoh) { + if (!ao_tmp) + *aoh = NULL; + else + *aoh = (struct tcp_ao_hdr *)(ao_tmp - 2); + } + + return 0; +} + +static inline bool tcp_ao_required(struct sock *sk, const void *saddr, + int family, int l3index, bool stat_inc) +{ +#ifdef CONFIG_TCP_AO + struct tcp_ao_info *ao_info; + struct tcp_ao_key *ao_key; + + if (!static_branch_unlikely(&tcp_ao_needed.key)) + return false; + + ao_info = rcu_dereference_check(tcp_sk(sk)->ao_info, + lockdep_sock_is_held(sk)); + if (!ao_info) + return false; + + ao_key = tcp_ao_do_lookup(sk, l3index, saddr, family, -1, -1); + if (ao_info->ao_required || ao_key) { + if (stat_inc) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOREQUIRED); + atomic64_inc(&ao_info->counters.ao_required); + } + return true; + } +#endif + return false; +} + +enum skb_drop_reason tcp_inbound_hash(struct sock *sk, + const struct request_sock *req, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif); + #endif /* _TCP_H */ diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h new file mode 100644 index 000000000000..df655ce6987d --- /dev/null +++ b/include/net/tcp_ao.h @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _TCP_AO_H +#define _TCP_AO_H + +#define TCP_AO_KEY_ALIGN 1 +#define __tcp_ao_key_align __aligned(TCP_AO_KEY_ALIGN) + +union tcp_ao_addr { + struct in_addr a4; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr a6; +#endif +}; + +struct tcp_ao_hdr { + u8 kind; + u8 length; + u8 keyid; + u8 rnext_keyid; +}; + +static inline u8 tcp_ao_hdr_maclen(const struct tcp_ao_hdr *aoh) +{ + return aoh->length - sizeof(struct tcp_ao_hdr); +} + +struct tcp_ao_counters { + atomic64_t pkt_good; + atomic64_t pkt_bad; + atomic64_t key_not_found; + atomic64_t ao_required; + atomic64_t dropped_icmp; +}; + +struct tcp_ao_key { + struct hlist_node node; + union tcp_ao_addr addr; + u8 key[TCP_AO_MAXKEYLEN] __tcp_ao_key_align; + unsigned int tcp_sigpool_id; + unsigned int digest_size; + int l3index; + u8 prefixlen; + u8 family; + u8 keylen; + u8 keyflags; + u8 sndid; + u8 rcvid; + u8 maclen; + struct rcu_head rcu; + atomic64_t pkt_good; + atomic64_t pkt_bad; + u8 
traffic_keys[]; +}; + +static inline u8 *rcv_other_key(struct tcp_ao_key *key) +{ + return key->traffic_keys; +} + +static inline u8 *snd_other_key(struct tcp_ao_key *key) +{ + return key->traffic_keys + key->digest_size; +} + +static inline int tcp_ao_maclen(const struct tcp_ao_key *key) +{ + return key->maclen; +} + +/* Use tcp_ao_len_aligned() for TCP header calculations */ +static inline int tcp_ao_len(const struct tcp_ao_key *key) +{ + return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr); +} + +static inline int tcp_ao_len_aligned(const struct tcp_ao_key *key) +{ + return round_up(tcp_ao_len(key), 4); +} + +static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key) +{ + return key->digest_size; +} + +static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key) +{ + return sizeof(struct tcp_ao_key) + (key->digest_size << 1); +} + +struct tcp_ao_info { + /* List of tcp_ao_key's */ + struct hlist_head head; + /* current_key and rnext_key are maintained on sockets + * in TCP_AO_ESTABLISHED states. + * Their purpose is to cache keys on established connections, + * saving needless lookups. Never dereference any of them from + * listen sockets. + * ::current_key may change in RX to the key that was requested by + * the peer, please use READ_ONCE()/WRITE_ONCE() in order to avoid + * load/store tearing. + * Do the same for ::rnext_key, if you don't hold socket lock + * (it's changed only by userspace request in setsockopt()). + */ + struct tcp_ao_key *current_key; + struct tcp_ao_key *rnext_key; + struct tcp_ao_counters counters; + u32 ao_required :1, + accept_icmps :1, + __unused :30; + __be32 lisn; + __be32 risn; + /* Sequence Number Extension (SNE) are upper 4 bytes for SEQ, + * that protect TCP-AO connection from replayed old TCP segments. + * See RFC5925 (6.2). + * In order to get correct SNE, there's a helper tcp_ao_compute_sne(). + * It needs SEQ basis to understand whereabouts are lower SEQ numbers. + * According to that basis vector, it can provide incremented SNE + * when SEQ rolls over or provide decremented SNE when there's + * a retransmitted segment from before-rolling over. + * - for request sockets such basis is rcv_isn/snt_isn, which seems + * good enough as it's unexpected to receive 4 Gbytes on reqsk. + * - for full sockets the basis is rcv_nxt/snd_una. snd_una is + * taken instead of snd_nxt as currently it's easier to track + * in tcp_snd_una_update(), rather than updating SNE in all + * WRITE_ONCE(tp->snd_nxt, ...) + * - for time-wait sockets the basis is tw_rcv_nxt/tw_snd_nxt. + * tw_snd_nxt is not expected to change, while tw_rcv_nxt may. 
+ */ + u32 snd_sne; + u32 rcv_sne; + refcount_t refcnt; /* Protects twsk destruction */ + struct rcu_head rcu; +}; + +#ifdef CONFIG_TCP_MD5SIG +#include <linux/jump_label.h> +extern struct static_key_false_deferred tcp_md5_needed; +#define static_branch_tcp_md5() static_branch_unlikely(&tcp_md5_needed.key) +#else +#define static_branch_tcp_md5() false +#endif +#ifdef CONFIG_TCP_AO +/* TCP-AO structures and functions */ +#include <linux/jump_label.h> +extern struct static_key_false_deferred tcp_ao_needed; +#define static_branch_tcp_ao() static_branch_unlikely(&tcp_ao_needed.key) +#else +#define static_branch_tcp_ao() false +#endif + +#ifdef CONFIG_TCP_AO +/* TCP-AO structures and functions */ +struct tcp4_ao_context { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + __be32 sisn; + __be32 disn; +}; + +struct tcp6_ao_context { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + __be32 sisn; + __be32 disn; +}; + +struct tcp_sigpool; +/* Established states are fast-path and there always is current_key/rnext_key */ +#define TCP_AO_ESTABLISHED (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | \ + TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSING) + +int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, + struct tcp_ao_key *key, struct tcphdr *th, + __u8 *hash_location); +int tcp_ao_hash_skb(unsigned short int family, + char *ao_hash, struct tcp_ao_key *key, + const struct sock *sk, const struct sk_buff *skb, + const u8 *tkey, int hash_offset, u32 sne); +int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, + sockptr_t optval, int optlen); +struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, + struct tcp_ao_info *ao, + int sndid, int rcvid); +int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, + struct request_sock *req, struct sk_buff *skb, + int family); +int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx, + unsigned int len, struct tcp_sigpool *hp); +void tcp_ao_destroy_sock(struct sock *sk, bool twsk); +void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp); +bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code); +int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen); +int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen); +int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen); +int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen); +enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, + const struct sk_buff *skb, unsigned short int family, + const struct request_sock *req, int l3index, + const struct tcp_ao_hdr *aoh); +u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq); +struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index, + const union tcp_ao_addr *addr, + int family, int sndid, int rcvid); +int tcp_ao_hash_hdr(unsigned short family, char *ao_hash, + struct tcp_ao_key *key, const u8 *tkey, + const union tcp_ao_addr *daddr, + const union tcp_ao_addr *saddr, + const struct tcphdr *th, u32 sne); +int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb, + const struct tcp_ao_hdr *aoh, int l3index, u32 seq, + struct tcp_ao_key **key, char **traffic_key, + bool *allocated_traffic_key, u8 *keyid, u32 *sne); + +/* ipv4 specific functions */ +int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); +struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk, + 
int sndid, int rcvid); +int tcp_v4_ao_synack_hash(char *ao_hash, struct tcp_ao_key *mkt, + struct request_sock *req, const struct sk_buff *skb, + int hash_offset, u32 sne); +int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, + const struct sock *sk, + __be32 sisn, __be32 disn, bool send); +int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, + struct request_sock *req); +struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk, + struct request_sock *req, + int sndid, int rcvid); +int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, + const struct sock *sk, const struct sk_buff *skb, + const u8 *tkey, int hash_offset, u32 sne); +/* ipv6 specific functions */ +int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp, + const struct in6_addr *daddr, + const struct in6_addr *saddr, int nbytes); +int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key, + const struct sk_buff *skb, __be32 sisn, __be32 disn); +int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, + const struct sock *sk, __be32 sisn, + __be32 disn, bool send); +int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, + struct request_sock *req); +struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk, + struct sock *addr_sk, int sndid, int rcvid); +struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk, + struct request_sock *req, + int sndid, int rcvid); +int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, + const struct sock *sk, const struct sk_buff *skb, + const u8 *tkey, int hash_offset, u32 sne); +int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); +int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key, + struct request_sock *req, const struct sk_buff *skb, + int hash_offset, u32 sne); +void tcp_ao_established(struct sock *sk); +void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb); +void tcp_ao_connect_init(struct sock *sk); +void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, + struct request_sock *req, unsigned short int family); +#else /* CONFIG_TCP_AO */ + +static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, + struct tcp_ao_key *key, struct tcphdr *th, + __u8 *hash_location) +{ + return 0; +} + +static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, + struct request_sock *req, unsigned short int family) +{ +} + +static inline bool tcp_ao_ignore_icmp(const struct sock *sk, int family, + int type, int code) +{ + return false; +} + +static inline enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, + const struct sk_buff *skb, unsigned short int family, + const struct request_sock *req, int l3index, + const struct tcp_ao_hdr *aoh) +{ + return SKB_NOT_DROPPED_YET; +} + +static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, + int l3index, const union tcp_ao_addr *addr, + int family, int sndid, int rcvid) +{ + return NULL; +} + +static inline void tcp_ao_destroy_sock(struct sock *sk, bool twsk) +{ +} + +static inline void tcp_ao_established(struct sock *sk) +{ +} + +static inline void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) +{ +} + +static inline void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, + struct tcp_sock *tp) +{ +} + +static inline void tcp_ao_connect_init(struct sock *sk) +{ +} + +static inline int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen) +{ + return -ENOPROTOOPT; +} + +static inline int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) +{ + return 
-ENOPROTOOPT; +} + +static inline int tcp_ao_get_repair(struct sock *sk, + sockptr_t optval, sockptr_t optlen) +{ + return -ENOPROTOOPT; +} + +static inline int tcp_ao_set_repair(struct sock *sk, + sockptr_t optval, unsigned int optlen) +{ + return -ENOPROTOOPT; +} +#endif + +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) +int tcp_do_parse_auth_options(const struct tcphdr *th, + const u8 **md5_hash, const u8 **ao_hash); +#else +static inline int tcp_do_parse_auth_options(const struct tcphdr *th, + const u8 **md5_hash, const u8 **ao_hash) +{ + *md5_hash = NULL; + *ao_hash = NULL; + return 0; +} +#endif + +#endif /* _TCP_AO_H */ diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h index cc00118acca1..d60e8148ff4c 100644 --- a/include/net/tcp_states.h +++ b/include/net/tcp_states.h @@ -22,6 +22,7 @@ enum { TCP_LISTEN, TCP_CLOSING, /* Now a valid state */ TCP_NEW_SYN_RECV, + TCP_BOUND_INACTIVE, /* Pseudo-state for inet_diag */ TCP_MAX_STATES /* Leave at the end! */ }; @@ -43,6 +44,7 @@ enum { TCPF_LISTEN = (1 << TCP_LISTEN), TCPF_CLOSING = (1 << TCP_CLOSING), TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV), + TCPF_BOUND_INACTIVE = (1 << TCP_BOUND_INACTIVE), }; #endif /* _LINUX_TCP_STATES_H */ diff --git a/include/net/tcx.h b/include/net/tcx.h new file mode 100644 index 000000000000..5ce0ce9e0c02 --- /dev/null +++ b/include/net/tcx.h @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2023 Isovalent */ +#ifndef __NET_TCX_H +#define __NET_TCX_H + +#include <linux/bpf.h> +#include <linux/bpf_mprog.h> + +#include <net/sch_generic.h> + +struct mini_Qdisc; + +struct tcx_entry { + struct mini_Qdisc __rcu *miniq; + struct bpf_mprog_bundle bundle; + u32 miniq_active; + struct rcu_head rcu; +}; + +struct tcx_link { + struct bpf_link link; + struct net_device *dev; + u32 location; +}; + +static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) +{ +#ifdef CONFIG_NET_XGRESS + skb->tc_at_ingress = ingress; +#endif +} + +#ifdef CONFIG_NET_XGRESS +static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry) +{ + struct bpf_mprog_bundle *bundle = entry->parent; + + return container_of(bundle, struct tcx_entry, bundle); +} + +static inline struct tcx_link *tcx_link(const struct bpf_link *link) +{ + return container_of(link, struct tcx_link, link); +} + +void tcx_inc(void); +void tcx_dec(void); + +static inline void tcx_entry_sync(void) +{ + /* bpf_mprog_entry got a/b swapped, therefore ensure that + * there are no inflight users on the old one anymore. + */ + synchronize_rcu(); +} + +static inline void +tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry, + bool ingress) +{ + ASSERT_RTNL(); + if (ingress) + rcu_assign_pointer(dev->tcx_ingress, entry); + else + rcu_assign_pointer(dev->tcx_egress, entry); +} + +static inline struct bpf_mprog_entry * +tcx_entry_fetch(struct net_device *dev, bool ingress) +{ + ASSERT_RTNL(); + if (ingress) + return rcu_dereference_rtnl(dev->tcx_ingress); + else + return rcu_dereference_rtnl(dev->tcx_egress); +} + +static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void) +{ + struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL); + + if (tcx) { + bpf_mprog_bundle_init(&tcx->bundle); + return &tcx->bundle.a; + } + return NULL; +} +#define tcx_entry_create(...) 
alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__)) + +static inline void tcx_entry_free(struct bpf_mprog_entry *entry) +{ + kfree_rcu(tcx_entry(entry), rcu); +} + +static inline struct bpf_mprog_entry * +tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created) +{ + struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress); + + *created = false; + if (!entry) { + entry = tcx_entry_create(); + if (!entry) + return NULL; + *created = true; + } + return entry; +} + +static inline void tcx_skeys_inc(bool ingress) +{ + tcx_inc(); + if (ingress) + net_inc_ingress_queue(); + else + net_inc_egress_queue(); +} + +static inline void tcx_skeys_dec(bool ingress) +{ + if (ingress) + net_dec_ingress_queue(); + else + net_dec_egress_queue(); + tcx_dec(); +} + +static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry) +{ + ASSERT_RTNL(); + tcx_entry(entry)->miniq_active++; +} + +static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry) +{ + ASSERT_RTNL(); + tcx_entry(entry)->miniq_active--; +} + +static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) +{ + ASSERT_RTNL(); + return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active; +} + +static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb, + int code) +{ + switch (code) { + case TCX_PASS: + skb->tc_index = qdisc_skb_cb(skb)->tc_classid; + fallthrough; + case TCX_DROP: + case TCX_REDIRECT: + return code; + case TCX_NEXT: + default: + return TCX_NEXT; + } +} +#endif /* CONFIG_NET_XGRESS */ + +#if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL) +int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); +void tcx_uninstall(struct net_device *dev, bool ingress); + +int tcx_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); + +static inline void dev_tcx_uninstall(struct net_device *dev) +{ + ASSERT_RTNL(); + tcx_uninstall(dev, true); + tcx_uninstall(dev, false); +} +#else +static inline int tcx_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_prog_detach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} + +static inline void dev_tcx_uninstall(struct net_device *dev) +{ +} +#endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */ +#endif /* __NET_TCX_H */ diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 74d2b463cc95..62b3e9f2aed4 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -15,18 +15,9 @@ struct timewait_sock_ops { struct kmem_cache *twsk_slab; char *twsk_slab_name; unsigned int twsk_obj_size; - int (*twsk_unique)(struct sock *sk, - struct sock *sktw, void *twp); void (*twsk_destructor)(struct sock *sk); }; -static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) -{ - if (sk->sk_prot->twsk_prot->twsk_unique != NULL) - return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp); - return 0; -} - static inline void twsk_destructor(struct sock *sk) { if (sk->sk_prot->twsk_prot->twsk_destructor != NULL) diff --git a/include/net/tls.h b/include/net/tls.h index baf1e99d8193..857340338b69 100644 --- 
a/include/net/tls.h +++ b/include/net/tls.h @@ -39,7 +39,6 @@ #include <linux/crypto.h> #include <linux/socket.h> #include <linux/tcp.h> -#include <linux/skmsg.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/rcupdate.h> @@ -50,6 +49,7 @@ #include <crypto/aead.h> #include <uapi/linux/tls.h> +struct tls_rec; /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) @@ -59,14 +59,17 @@ #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) -#define TLS_RECORD_TYPE_DATA 0x17 +#define TLS_HANDSHAKE_KEYUPDATE 24 /* rfc8446 B.3: Key update */ #define TLS_AAD_SPACE_SIZE 13 -#define MAX_IV_SIZE 16 +#define TLS_MAX_IV_SIZE 16 +#define TLS_MAX_SALT_SIZE 4 +#define TLS_TAG_SIZE 16 #define TLS_MAX_REC_SEQ_SIZE 8 +#define TLS_MAX_AAD_SIZE TLS_AAD_SPACE_SIZE -/* For AES-CCM, the full 16-bytes of IV is made of '4' fields of given sizes. +/* For CCM mode, the full 16-bytes of IV is made of '4' fields of given sizes. * * IV[16] = b0[1] || implicit nonce[4] || explicit nonce[8] || length[3] * @@ -74,15 +77,7 @@ * Hence b0 contains (3 - 1) = 2. */ #define TLS_AES_CCM_IV_B0_BYTE 2 - -#define __TLS_INC_STATS(net, field) \ - __SNMP_INC_STATS((net)->mib.tls_statistics, field) -#define TLS_INC_STATS(net, field) \ - SNMP_INC_STATS((net)->mib.tls_statistics, field) -#define __TLS_DEC_STATS(net, field) \ - __SNMP_DEC_STATS((net)->mib.tls_statistics, field) -#define TLS_DEC_STATS(net, field) \ - SNMP_DEC_STATS((net)->mib.tls_statistics, field) +#define TLS_SM4_CCM_IV_B0_BYTE 2 enum { TLS_BASE, @@ -92,37 +87,6 @@ enum { TLS_NUM_CONFIG, }; -/* TLS records are maintained in 'struct tls_rec'. It stores the memory pages - * allocated or mapped for each TLS record. After encryption, the records are - * stores in a linked list. 
- */ -struct tls_rec { - struct list_head list; - int tx_ready; - int tx_flags; - - struct sk_msg msg_plaintext; - struct sk_msg msg_encrypted; - - /* AAD | msg_plaintext.sg.data | sg_tag */ - struct scatterlist sg_aead_in[2]; - /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */ - struct scatterlist sg_aead_out[2]; - - char content_type; - struct scatterlist sg_content_type; - - char aad_space[TLS_AAD_SPACE_SIZE]; - u8 iv_data[MAX_IV_SIZE]; - struct aead_request aead_req; - u8 aead_req_ctx[]; -}; - -struct tls_msg { - struct strp_msg rxm; - u8 control; -}; - struct tx_work { struct delayed_work work; struct sock *sk; @@ -135,9 +99,6 @@ struct tls_sw_context_tx { struct tls_rec *open_rec; struct list_head tx_list; atomic_t encrypt_pending; - /* protect crypto_wait with encrypt_pending */ - spinlock_t encrypt_compl_lock; - int async_notify; u8 async_capable:1; #define BIT_TX_SCHEDULED 0 @@ -145,21 +106,39 @@ struct tls_sw_context_tx { unsigned long tx_bitmask; }; +struct tls_strparser { + struct sock *sk; + + u32 mark : 8; + u32 stopped : 1; + u32 copy_mode : 1; + u32 mixed_decrypted : 1; + + bool msg_ready; + + struct strp_msg stm; + + struct sk_buff *anchor; + struct work_struct work; +}; + struct tls_sw_context_rx { struct crypto_aead *aead_recv; struct crypto_wait async_wait; - struct strparser strp; struct sk_buff_head rx_list; /* list of decrypted 'data' records */ void (*saved_data_ready)(struct sock *sk); - struct sk_buff *recv_pkt; - u8 control; + u8 reader_present; u8 async_capable:1; - u8 decrypted:1; + u8 zc_capable:1; + u8 reader_contended:1; + bool key_update_pending; + + struct tls_strparser strp; + atomic_t decrypt_pending; - /* protect crypto_wait with decrypt_pending*/ - spinlock_t decrypt_compl_lock; - bool async_notify; + struct sk_buff_head async_hold; + struct wait_queue_head wq; }; struct tls_record_info { @@ -170,6 +149,7 @@ struct tls_record_info { skb_frag_t frags[MAX_SKB_FRAGS]; }; +#define TLS_DRIVER_STATE_SIZE_TX 16 struct tls_offload_context_tx { struct crypto_aead *aead_send; spinlock_t lock; /* protects records list */ @@ -181,29 +161,37 @@ struct tls_offload_context_tx { struct scatterlist sg_tx_data[MAX_SKB_FRAGS]; void (*sk_destruct)(struct sock *sk); - u8 driver_state[] __aligned(8); + struct work_struct destruct_work; + struct tls_context *ctx; /* The TLS layer reserves room for driver specific state * Currently the belief is that there is not enough * driver specific state to justify another layer of indirection */ -#define TLS_DRIVER_STATE_SIZE_TX 16 + u8 driver_state[TLS_DRIVER_STATE_SIZE_TX] __aligned(8); }; -#define TLS_OFFLOAD_CONTEXT_SIZE_TX \ - (sizeof(struct tls_offload_context_tx) + TLS_DRIVER_STATE_SIZE_TX) - enum tls_context_flags { - TLS_RX_SYNC_RUNNING = 0, + /* tls_device_down was called after the netdev went down, device state + * was released, and kTLS works in software, even though rx_conf is + * still TLS_HW (needed for transition). + */ + TLS_RX_DEV_DEGRADED = 0, /* Unlike RX where resync is driven entirely by the core in TX only * the driver knows when things went out of sync, so we need the flag * to be atomic. */ TLS_TX_SYNC_SCHED = 1, + /* tls_dev_del was called for the RX side, device state was released, + * but tls_ctx->netdev might still be kept, because TX-side driver + * resources might not be released yet. Used to prevent the second + * tls_dev_del call in tls_device_down if it happens simultaneously. 
+ */ + TLS_RX_DEV_CLOSED = 2, }; struct cipher_context { - char *iv; - char *rec_seq; + char iv[TLS_MAX_IV_SIZE + TLS_MAX_SALT_SIZE]; + char rec_seq[TLS_MAX_REC_SEQ_SIZE]; }; union tls_crypto_context { @@ -211,6 +199,9 @@ union tls_crypto_context { union { struct tls12_crypto_info_aes_gcm_128 aes_gcm_128; struct tls12_crypto_info_aes_gcm_256 aes_gcm_256; + struct tls12_crypto_info_chacha20_poly1305 chacha20_poly1305; + struct tls12_crypto_info_sm4_gcm sm4_gcm; + struct tls12_crypto_info_sm4_ccm sm4_ccm; }; }; @@ -233,6 +224,8 @@ struct tls_context { u8 tx_conf:3; u8 rx_conf:3; + u8 zerocopy_sendfile:1; + u8 rx_no_pad:1; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); @@ -240,7 +233,7 @@ struct tls_context { void *priv_ctx_tx; void *priv_ctx_rx; - struct net_device *netdev; + struct net_device __rcu *netdev; /* rw cache line */ struct cipher_context tx; @@ -249,7 +242,7 @@ struct tls_context { struct scatterlist *partially_sent_record; u16 partially_sent_offset; - bool in_tcp_sendpages; + bool splicing_pages; bool pending_open_record_frags; struct mutex tx_lock; /* protects partially_sent_* fields and @@ -259,6 +252,7 @@ struct tls_context { /* cache cold stuff */ struct proto *sk_proto; + struct sock *sk; void (*sk_destruct)(struct sock *sk); @@ -300,10 +294,12 @@ enum tls_offload_sync_type { #define TLS_DEVICE_RESYNC_ASYNC_LOGMAX 13 struct tls_offload_resync_async { atomic64_t req; - u32 loglen; + u16 loglen; + u16 rcd_delta; u32 log[TLS_DEVICE_RESYNC_ASYNC_LOGMAX]; }; +#define TLS_DRIVER_STATE_SIZE_RX 8 struct tls_offload_context_rx { /* sw must be the first member of tls_offload_context_rx */ struct tls_sw_context_rx sw; @@ -327,53 +323,13 @@ struct tls_offload_context_rx { struct tls_offload_resync_async *resync_async; }; }; - u8 driver_state[] __aligned(8); /* The TLS layer reserves room for driver specific state * Currently the belief is that there is not enough * driver specific state to justify another layer of indirection */ -#define TLS_DRIVER_STATE_SIZE_RX 8 + u8 driver_state[TLS_DRIVER_STATE_SIZE_RX] __aligned(8); }; -#define TLS_OFFLOAD_CONTEXT_SIZE_RX \ - (sizeof(struct tls_offload_context_rx) + TLS_DRIVER_STATE_SIZE_RX) - -struct tls_context *tls_ctx_create(struct sock *sk); -void tls_ctx_free(struct sock *sk, struct tls_context *ctx); -void update_sk_prot(struct sock *sk, struct tls_context *ctx); - -int wait_on_pending_writer(struct sock *sk, long *timeo); -int tls_sk_query(struct sock *sk, int optname, char __user *optval, - int __user *optlen); -int tls_sk_attach(struct sock *sk, int optname, char __user *optval, - unsigned int optlen); - -int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); -void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); -void tls_sw_strparser_done(struct tls_context *tls_ctx); -int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -int tls_sw_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -int tls_sw_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); -void tls_sw_release_resources_tx(struct sock *sk); -void tls_sw_free_ctx_tx(struct tls_context *tls_ctx); -void tls_sw_free_resources_rx(struct sock *sk); -void tls_sw_release_resources_rx(struct sock *sk); -void tls_sw_free_ctx_rx(struct tls_context *tls_ctx); -int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int 
*addr_len); -bool tls_sw_stream_read(const struct sock *sk); -ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, unsigned int flags); - -int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -int tls_device_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -int tls_tx_records(struct sock *sk, int flags); - struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn); @@ -387,64 +343,19 @@ static inline u32 tls_record_start_seq(struct tls_record_info *rec) return rec->end_seq - rec->len; } -int tls_push_sg(struct sock *sk, struct tls_context *ctx, - struct scatterlist *sg, u16 first_offset, - int flags); -int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, - int flags); -void tls_free_partial_record(struct sock *sk, struct tls_context *ctx); - -static inline struct tls_msg *tls_msg(struct sk_buff *skb) -{ - return (struct tls_msg *)strp_msg(skb); -} - -static inline bool tls_is_partially_sent_record(struct tls_context *ctx) -{ - return !!ctx->partially_sent_record; -} - -static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) -{ - return tls_ctx->pending_open_record_frags; -} - -static inline bool is_tx_ready(struct tls_sw_context_tx *ctx) -{ - struct tls_rec *rec; - - rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); - if (!rec) - return false; - - return READ_ONCE(rec->tx_ready); -} - -static inline u16 tls_user_config(struct tls_context *ctx, bool tx) -{ - u16 config = tx ? ctx->tx_conf : ctx->rx_conf; - - switch (config) { - case TLS_BASE: - return TLS_CONF_BASE; - case TLS_SW: - return TLS_CONF_SW; - case TLS_HW: - return TLS_CONF_HW; - case TLS_HW_RECORD: - return TLS_CONF_HW_RECORD; - } - return 0; -} - struct sk_buff * tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, struct sk_buff *skb); +struct sk_buff * +tls_validate_xmit_skb_sw(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); -static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) +static inline bool tls_is_skb_tx_device_offloaded(const struct sk_buff *skb) { -#ifdef CONFIG_SOCK_VALIDATE_XMIT - return sk_fullsock(sk) && +#ifdef CONFIG_TLS_DEVICE + struct sock *sk = skb->sk; + + return sk && sk_fullsock(sk) && (smp_load_acquire(&sk->sk_validate_xmit_skb) == &tls_validate_xmit_skb); #else @@ -452,28 +363,9 @@ static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) #endif } -static inline void tls_err_abort(struct sock *sk, int err) -{ - sk->sk_err = err; - sk->sk_error_report(sk); -} - -static inline bool tls_bigint_increment(unsigned char *seq, int len) -{ - int i; - - for (i = len - 1; i >= 0; i--) { - ++seq[i]; - if (seq[i] != 0) - break; - } - - return (i == -1); -} - static inline struct tls_context *tls_get_ctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code, * TLS data path doesn't need rcu_dereference(). 
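The hunk above converts the TX-offload test from the sock-based tls_is_sk_tx_device_offloaded() to an skb-based form that does the NULL and sk_fullsock() checks itself. A minimal sketch of how a driver transmit path might consume it, as it would appear in a file that already includes <net/tls.h>; foo_xmit(), foo_tx_offload() and foo_tx_sw() are invented names, and only the two tls_* helpers and TLS_OFFLOAD_CTX_DIR_TX come from the TLS header:

/* Hypothetical driver xmit path; illustrative only, not from this patch. */
static netdev_tx_t foo_tx_offload(struct sk_buff *skb, struct net_device *dev,
				  void *priv);
static netdev_tx_t foo_tx_sw(struct sk_buff *skb, struct net_device *dev);

static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (tls_is_skb_tx_device_offloaded(skb)) {
		/* kTLS TX offload is armed on this socket; fetch the
		 * driver state stashed when the offload was installed.
		 */
		void *priv = tls_driver_ctx(skb->sk, TLS_OFFLOAD_CTX_DIR_TX);

		return foo_tx_offload(skb, dev, priv);
	}
	return foo_tx_sw(skb, dev);
}

Passing the skb rather than a sock lets callers on the xmit path skip a separate skb->sk / sk_fullsock() dance before the smp_load_acquire() comparison against tls_validate_xmit_skb.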
@@ -481,81 +373,6 @@ static inline struct tls_context *tls_get_ctx(const struct sock *sk) return (__force void *)icsk->icsk_ulp_data; } -static inline void tls_advance_record_sn(struct sock *sk, - struct tls_prot_info *prot, - struct cipher_context *ctx) -{ - if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size)) - tls_err_abort(sk, EBADMSG); - - if (prot->version != TLS_1_3_VERSION) - tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - prot->iv_size); -} - -static inline void tls_fill_prepend(struct tls_context *ctx, - char *buf, - size_t plaintext_len, - unsigned char record_type, - int version) -{ - struct tls_prot_info *prot = &ctx->prot_info; - size_t pkt_len, iv_size = prot->iv_size; - - pkt_len = plaintext_len + prot->tag_size; - if (version != TLS_1_3_VERSION) { - pkt_len += iv_size; - - memcpy(buf + TLS_NONCE_OFFSET, - ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); - } - - /* we cover nonce explicit here as well, so buf should be of - * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE - */ - buf[0] = version == TLS_1_3_VERSION ? - TLS_RECORD_TYPE_DATA : record_type; - /* Note that VERSION must be TLS_1_2 for both TLS1.2 and TLS1.3 */ - buf[1] = TLS_1_2_VERSION_MINOR; - buf[2] = TLS_1_2_VERSION_MAJOR; - /* we can use IV for nonce explicit according to spec */ - buf[3] = pkt_len >> 8; - buf[4] = pkt_len & 0xFF; -} - -static inline void tls_make_aad(char *buf, - size_t size, - char *record_sequence, - int record_sequence_size, - unsigned char record_type, - int version) -{ - if (version != TLS_1_3_VERSION) { - memcpy(buf, record_sequence, record_sequence_size); - buf += 8; - } else { - size += TLS_CIPHER_AES_GCM_128_TAG_SIZE; - } - - buf[0] = version == TLS_1_3_VERSION ? - TLS_RECORD_TYPE_DATA : record_type; - buf[1] = TLS_1_2_VERSION_MAJOR; - buf[2] = TLS_1_2_VERSION_MINOR; - buf[3] = size >> 8; - buf[4] = size & 0xFF; -} - -static inline void xor_iv_with_seq(int version, char *iv, char *seq) -{ - int i; - - if (version == TLS_1_3_VERSION) { - for (i = 0; i < 8; i++) - iv[i + 4] ^= seq[i]; - } -} - - static inline struct tls_sw_context_rx *tls_sw_ctx_rx( const struct tls_context *tls_ctx) { @@ -576,8 +393,12 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) static inline bool tls_sw_has_ctx_tx(const struct sock *sk) { - struct tls_context *ctx = tls_get_ctx(sk); + struct tls_context *ctx; + if (!sk_is_inet(sk) || !inet_test_bit(IS_ICSK, sk)) + return false; + + ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_tx(ctx); @@ -585,23 +406,23 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk) static inline bool tls_sw_has_ctx_rx(const struct sock *sk) { - struct tls_context *ctx = tls_get_ctx(sk); + struct tls_context *ctx; + + if (!sk_is_inet(sk) || !inet_test_bit(IS_ICSK, sk)) + return false; + ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_rx(ctx); } -void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); -void tls_device_write_space(struct sock *sk, struct tls_context *ctx); - static inline struct tls_offload_context_rx * tls_offload_ctx_rx(const struct tls_context *tls_ctx) { return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx; } -#if IS_ENABLED(CONFIG_TLS_DEVICE) static inline void *__tls_driver_ctx(struct tls_context *tls_ctx, enum tls_offload_ctx_dir direction) { @@ -616,7 +437,6 @@ tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction) { return __tls_driver_ctx(tls_get_ctx(sk), direction); } -#endif #define RESYNC_REQ BIT(0) #define 
RESYNC_REQ_ASYNC BIT(1) @@ -639,6 +459,7 @@ tls_offload_rx_resync_async_request_start(struct sock *sk, __be32 seq, u16 len) atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) | ((u64)len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC); rx_ctx->resync_async->loglen = 0; + rx_ctx->resync_async->rcd_delta = 0; } static inline void @@ -670,31 +491,11 @@ static inline bool tls_offload_tx_resync_pending(struct sock *sk) return ret; } -int __net_init tls_proc_init(struct net *net); -void __net_exit tls_proc_fini(struct net *net); - -int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, - unsigned char *record_type); -int decrypt_skb(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout); struct sk_buff *tls_encrypt_skb(struct sk_buff *skb); -int tls_sw_fallback_init(struct sock *sk, - struct tls_offload_context_tx *offload_ctx, - struct tls_crypto_info *crypto_info); - #ifdef CONFIG_TLS_DEVICE -void tls_device_init(void); -void tls_device_cleanup(void); void tls_device_sk_destruct(struct sock *sk); -int tls_set_device_offload(struct sock *sk, struct tls_context *ctx); -void tls_device_free_resources_tx(struct sock *sk); -int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); -void tls_device_offload_cleanup_rx(struct sock *sk); -void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq); -int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, - struct sk_buff *skb, struct strp_msg *rxm); static inline bool tls_is_sk_rx_device_offloaded(struct sock *sk) { @@ -703,33 +504,5 @@ static inline bool tls_is_sk_rx_device_offloaded(struct sock *sk) return false; return tls_get_ctx(sk)->rx_conf == TLS_HW; } -#else -static inline void tls_device_init(void) {} -static inline void tls_device_cleanup(void) {} - -static inline int -tls_set_device_offload(struct sock *sk, struct tls_context *ctx) -{ - return -EOPNOTSUPP; -} - -static inline void tls_device_free_resources_tx(struct sock *sk) {} - -static inline int -tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) -{ - return -EOPNOTSUPP; -} - -static inline void tls_device_offload_cleanup_rx(struct sock *sk) {} -static inline void -tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {} - -static inline int -tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, - struct sk_buff *skb, struct strp_msg *rxm) -{ - return 0; -} #endif #endif /* _TLS_OFFLOAD_H */ diff --git a/include/net/tls_prot.h b/include/net/tls_prot.h new file mode 100644 index 000000000000..68a40756440b --- /dev/null +++ b/include/net/tls_prot.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. 
+ * + * TLS Protocol definitions + * + * From https://www.iana.org/assignments/tls-parameters/tls-parameters.xhtml + */ + +#ifndef _TLS_PROT_H +#define _TLS_PROT_H + +/* + * TLS Record protocol: ContentType + */ +enum { + TLS_RECORD_TYPE_CHANGE_CIPHER_SPEC = 20, + TLS_RECORD_TYPE_ALERT = 21, + TLS_RECORD_TYPE_HANDSHAKE = 22, + TLS_RECORD_TYPE_DATA = 23, + TLS_RECORD_TYPE_HEARTBEAT = 24, + TLS_RECORD_TYPE_TLS12_CID = 25, + TLS_RECORD_TYPE_ACK = 26, +}; + +/* + * TLS Alert protocol: AlertLevel + */ +enum { + TLS_ALERT_LEVEL_WARNING = 1, + TLS_ALERT_LEVEL_FATAL = 2, +}; + +/* + * TLS Alert protocol: AlertDescription + */ +enum { + TLS_ALERT_DESC_CLOSE_NOTIFY = 0, + TLS_ALERT_DESC_UNEXPECTED_MESSAGE = 10, + TLS_ALERT_DESC_BAD_RECORD_MAC = 20, + TLS_ALERT_DESC_RECORD_OVERFLOW = 22, + TLS_ALERT_DESC_HANDSHAKE_FAILURE = 40, + TLS_ALERT_DESC_BAD_CERTIFICATE = 42, + TLS_ALERT_DESC_UNSUPPORTED_CERTIFICATE = 43, + TLS_ALERT_DESC_CERTIFICATE_REVOKED = 44, + TLS_ALERT_DESC_CERTIFICATE_EXPIRED = 45, + TLS_ALERT_DESC_CERTIFICATE_UNKNOWN = 46, + TLS_ALERT_DESC_ILLEGAL_PARAMETER = 47, + TLS_ALERT_DESC_UNKNOWN_CA = 48, + TLS_ALERT_DESC_ACCESS_DENIED = 49, + TLS_ALERT_DESC_DECODE_ERROR = 50, + TLS_ALERT_DESC_DECRYPT_ERROR = 51, + TLS_ALERT_DESC_TOO_MANY_CIDS_REQUESTED = 52, + TLS_ALERT_DESC_PROTOCOL_VERSION = 70, + TLS_ALERT_DESC_INSUFFICIENT_SECURITY = 71, + TLS_ALERT_DESC_INTERNAL_ERROR = 80, + TLS_ALERT_DESC_INAPPROPRIATE_FALLBACK = 86, + TLS_ALERT_DESC_USER_CANCELED = 90, + TLS_ALERT_DESC_MISSING_EXTENSION = 109, + TLS_ALERT_DESC_UNSUPPORTED_EXTENSION = 110, + TLS_ALERT_DESC_UNRECOGNIZED_NAME = 112, + TLS_ALERT_DESC_BAD_CERTIFICATE_STATUS_RESPONSE = 113, + TLS_ALERT_DESC_UNKNOWN_PSK_IDENTITY = 115, + TLS_ALERT_DESC_CERTIFICATE_REQUIRED = 116, + TLS_ALERT_DESC_NO_APPLICATION_PROTOCOL = 120, +}; + +#endif /* _TLS_PROT_H */ diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index da06613c9603..1a97e3f32029 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -3,6 +3,7 @@ #define _TRANSP_V6_H #include <net/checksum.h> +#include <net/sock.h> /* IPv6 transport protocols */ extern struct proto rawv6_prot; @@ -12,6 +13,7 @@ extern struct proto tcpv6_prot; extern struct proto pingv6_prot; struct flowi6; +struct ipcm6_cookie; /* extension headers */ int ipv6_exthdrs_init(void); @@ -31,8 +33,6 @@ void udplitev6_exit(void); int tcpv6_init(void); void tcpv6_exit(void); -int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); - /* this does all the common and the specific ctl work */ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb); @@ -56,8 +56,6 @@ ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, #define LOOPBACK4_IPV6 cpu_to_be32(0x7f000006) -void inet6_destroy_sock(struct sock *sk); - #define IPV6_SEQ_DGRAM_HEADER \ " sl " \ "local_address " \ diff --git a/include/net/tso.h b/include/net/tso.h index 62c98a9c60f1..e7e157ae0526 100644 --- a/include/net/tso.h +++ b/include/net/tso.h @@ -2,6 +2,7 @@ #ifndef _TSO_H #define _TSO_H +#include <linux/skbuff.h> #include <net/ip.h> #define TSO_HEADER_SIZE 256 @@ -16,7 +17,12 @@ struct tso_t { u32 tcp_seq; }; -int tso_count_descs(const struct sk_buff *skb); +/* Calculate the worst case buffer count */ +static inline int tso_count_descs(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; +} + void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last); void tso_build_data(const struct 
sk_buff *skb, struct tso_t *tso, int size); diff --git a/include/net/tun_proto.h b/include/net/tun_proto.h index 2ea3deba4c99..7b0de7852908 100644 --- a/include/net/tun_proto.h +++ b/include/net/tun_proto.h @@ -1,7 +1,8 @@ #ifndef __NET_TUN_PROTO_H #define __NET_TUN_PROTO_H -#include <linux/kernel.h> +#include <linux/if_ether.h> +#include <linux/types.h> /* One byte protocol values as defined by VXLAN-GPE and NSH. These will * hopefully get a shared IANA registry. diff --git a/include/net/udp.h b/include/net/udp.h index 295d52a73598..a772510b2aa5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -21,6 +21,7 @@ #include <linux/list.h> #include <linux/bug.h> #include <net/inet_sock.h> +#include <net/gso.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> @@ -49,39 +50,68 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** - * struct udp_hslot - UDP hash slot + * struct udp_hslot - UDP hash slot used by udp_table.hash/hash4 * * @head: head of list of sockets + * @nulls_head: head of list of sockets, only used by hash4 * @count: number of sockets in 'head' list * @lock: spinlock protecting changes to head/count */ struct udp_hslot { - struct hlist_head head; + union { + struct hlist_head head; + /* hash4 uses hlist_nulls to avoid moving wrongly onto another + * hlist, because rehash() can happen with lookup(). + */ + struct hlist_nulls_head nulls_head; + }; int count; spinlock_t lock; -} __attribute__((aligned(2 * sizeof(long)))); +} __aligned(2 * sizeof(long)); + +/** + * struct udp_hslot_main - UDP hash slot used by udp_table.hash2 + * + * @hslot: basic hash slot + * @hash4_cnt: number of sockets in hslot4 of the same + * (local port, local address) + */ +struct udp_hslot_main { + struct udp_hslot hslot; /* must be the first member */ +#if !IS_ENABLED(CONFIG_BASE_SMALL) + u32 hash4_cnt; +#endif +} __aligned(2 * sizeof(long)); +#define UDP_HSLOT_MAIN(__hslot) ((struct udp_hslot_main *)(__hslot)) /** * struct udp_table - UDP table * * @hash: hash table, sockets are hashed on (local port) * @hash2: hash table, sockets are hashed on (local port, local address) + * @hash4: hash table, connected sockets are hashed on + * (local port, local address, remote port, remote address) * @mask: number of slots in hash tables, minus 1 * @log: log2(number of slots in hash table) */ struct udp_table { struct udp_hslot *hash; - struct udp_hslot *hash2; + struct udp_hslot_main *hash2; +#if !IS_ENABLED(CONFIG_BASE_SMALL) + struct udp_hslot *hash4; +#endif unsigned int mask; unsigned int log; }; extern struct udp_table udp_table; void udp_table_init(struct udp_table *, const char *); static inline struct udp_hslot *udp_hashslot(struct udp_table *table, - struct net *net, unsigned int num) + const struct net *net, + unsigned int num) { return &table->hash[udp_hashfn(net, num, table->mask)]; } + /* * For secondary hash, net_hash_mix() is performed before calling * udp_hashslot2(), this explains difference with udp_hashslot() @@ -89,12 +119,94 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table, static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, unsigned int hash) { - return &table->hash2[hash & table->mask]; + return &table->hash2[hash & table->mask].hslot; +} + +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_table_hash4_init(struct udp_table *table) +{ +} + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + BUILD_BUG(); + return NULL; +} + +static inline 
bool udp_hashed4(const struct sock *sk) +{ + return false; } +static inline unsigned int udp_hash4_slot_size(void) +{ + return 0; +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return false; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ +} +#else /* !CONFIG_BASE_SMALL */ + +/* Must be called with table->hash2 initialized */ +static inline void udp_table_hash4_init(struct udp_table *table) +{ + table->hash4 = (void *)(table->hash2 + (table->mask + 1)); + for (int i = 0; i <= table->mask; i++) { + table->hash2[i].hash4_cnt = 0; + + INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i); + table->hash4[i].count = 0; + spin_lock_init(&table->hash4[i].lock); + } +} + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + return &table->hash4[hash & table->mask]; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return !hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node); +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return sizeof(struct udp_hslot); +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return UDP_HSLOT_MAIN(hslot2)->hash4_cnt; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt++; +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt--; +} +#endif /* CONFIG_BASE_SMALL */ + extern struct proto udp_prot; extern atomic_long_t udp_memory_allocated; +DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); /* sysctl variables for udp */ extern long sysctl_udp_mem[3]; @@ -164,34 +276,23 @@ static inline void udp_csum_pull_header(struct sk_buff *skb) UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr); } -typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport, +typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport, __be16 dport); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); -struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, - struct udphdr *uh, struct sock *sk); -int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); +void udp_v6_early_demux(struct sk_buff *skb); +INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, - netdev_features_t features); + netdev_features_t features, bool is_ipv6); -static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) +static inline void udp_lib_init_sock(struct sock *sk) { - struct udphdr *uh; - unsigned int hlen, off; + struct udp_sock *up = udp_sk(sk); - off = skb_gro_offset(skb); - hlen = off + sizeof(*uh); - uh = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) - uh = skb_gro_header_slow(skb, hlen, off); - - return uh; + skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); + up->forward_threshold = sk->sk_rcvbuf >> 2; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ @@ -202,13 +303,29 @@ static inline int udp_lib_hash(struct sock *sk) } void udp_lib_unhash(struct sock *sk); -void 
udp_lib_rehash(struct sock *sk, u16 new_hash); +void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4); +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); static inline void udp_lib_close(struct sock *sk, long timeout) { sk_common_release(sk); } +/* hash4 routines shared between UDPv4/6 */ +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_lib_hash4(struct sock *sk, u16 hash) +{ +} + +static inline void udp4_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +void udp_lib_hash4(struct sock *sk, u16 hash); +void udp4_hash4(struct sock *sk); +#endif /* CONFIG_BASE_SMALL */ + int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr); @@ -241,7 +358,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, } /* Since this is being sent on the wire obfuscate hash a bit - * to minimize possbility that any useful information to an + * to minimize possibility that any useful information to an * attacker is leaked. Only upper 16 bits are relevant in the * computation for 16 bit port value. */ @@ -255,11 +372,11 @@ static inline int udp_rqueue_get(struct sock *sk) return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } -static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, +static inline bool udp_sk_bound_dev_eq(const struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - return inet_bound_dev_eq(!!net->ipv4.sysctl_udp_l3mdev_accept, + return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_udp_l3mdev_accept), bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); @@ -267,34 +384,32 @@ static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, } /* net/ipv4/udp.c */ -void udp_destruct_sock(struct sock *sk); +void udp_destruct_common(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); -struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, - int noblock, int *off, int *err); +struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off, + int *err); static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, - int noblock, int *err) + int *err) { int off = 0; - return __skb_recv_udp(sk, flags, noblock, &off, err); + return __skb_recv_udp(sk, flags, &off, err); } int udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); -int udp_get_port(struct sock *sk, unsigned short snum, - int (*saddr_cmp)(const struct sock *, - const struct sock *)); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +void udp_splice_eof(struct socket *sock); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); -int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); +int udp_ioctl(struct sock *sk, int cmd, int *karg); int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); @@ -308,24 
+423,26 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)); -struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, +struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); -struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, +struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, + __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); -struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, +struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); -struct sock *udp6_lib_lookup(struct net *net, +struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif); -struct sock *__udp6_lib_lookup(struct net *net, +struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); -struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, +struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); +int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() @@ -390,14 +507,7 @@ static inline bool udp_skb_is_linear(struct sk_buff *skb) static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { - int n; - - n = copy_to_iter(skb->data + off, len, to); - if (n == len) - return 0; - - iov_iter_revert(to, n); - return -EFAULT; + return copy_to_iter_full(skb->data + off, len, to) ? 0 : -EFAULT; } /* @@ -447,7 +557,6 @@ struct udp_seq_afinfo { struct udp_iter_state { struct seq_net_private p; int bucket; - struct udp_seq_afinfo *bpf_seq_afinfo; }; void *udp_seq_start(struct seq_file *seq, loff_t *pos); @@ -467,6 +576,7 @@ void udp_init(void); DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void); +void udp_encap_disable(void); #if IS_ENABLED(CONFIG_IPV6) DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void); @@ -488,8 +598,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial * packets in udp_gro_complete_segment. As does UDP GSO, verified by * udp_send_skb. But when those packets are looped in dev_loopback_xmit - * their ip_summed is set to CHECKSUM_UNNECESSARY. Reset in this - * specific case, where PARTIAL is both correct and required. + * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY. + * Reset in this specific case, where PARTIAL is both correct and + * required. 
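+ * (Illustrative example, not part of the original comment: a
+ * UDP_SEGMENT sender whose multicast datagram is looped back locally
+ * goes through dev_loopback_xmit(), so the GSO skb arrives here as
+ * CHECKSUM_UNNECESSARY and must be flipped back to CHECKSUM_PARTIAL
+ * before it can be resegmented with valid per-segment checksums.)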
*/ if (skb->pkt_type == PACKET_LOOPBACK) skb->ip_summed = CHECKSUM_PARTIAL; @@ -511,9 +622,32 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; } -#ifdef CONFIG_BPF_STREAM_PARSER +static inline void udp_post_segment_fix_csum(struct sk_buff *skb) +{ + /* UDP-lite can't land here - no GRO */ + WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov); + + /* UDP packets generated with UDP_SEGMENT and traversing: + * + * UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx) + * + * can reach an UDP socket with CHECKSUM_NONE, because + * __iptunnel_pull_header() converts CHECKSUM_PARTIAL into NONE. + * SKB_GSO_UDP_L4 or SKB_GSO_FRAGLIST packets with no UDP tunnel will + * have a valid checksum, as the GRO engine validates the UDP csum + * before the aggregation and nobody strips such info in between. + * Instead of adding another check in the tunnel fastpath, we can force + * a valid csum after the segmentation. + * Additionally fixup the UDP CB. + */ + UDP_SKB_CB(skb)->cscov = skb->len; + if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid) + skb->csum_valid = 1; +} + +#ifdef CONFIG_BPF_SYSCALL struct sk_psock; -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -#endif /* BPF_STREAM_PARSER */ +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +#endif #endif /* _UDP_H */ diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 2ea453dac876..2df3b8344eb5 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -67,6 +67,9 @@ static inline int udp_sock_create(struct net *net, typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, struct sk_buff *skb); +typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk, + struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, struct list_head *head, @@ -80,6 +83,7 @@ struct udp_tunnel_sock_cfg { __u8 encap_type; udp_tunnel_encap_rcv_t encap_rcv; udp_tunnel_encap_err_lookup_t encap_err_lookup; + udp_tunnel_encap_err_rcv_t encap_err_rcv; udp_tunnel_encap_destroy_t encap_destroy; udp_tunnel_gro_receive_t gro_receive; udp_tunnel_gro_complete_t gro_complete; @@ -129,12 +133,16 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); static inline void udp_tunnel_get_rx_info(struct net_device *dev) { ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); } static inline void udp_tunnel_drop_rx_info(struct net_device *dev) { ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); } @@ -146,16 +154,33 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, - struct net_device *dev, struct in6_addr *saddr, - struct in6_addr *daddr, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, __be16 src_port, __be16 dst_port, bool nocheck); void udp_tunnel_sock_release(struct socket *sock); +struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, + struct net_device *dev, + struct net *net, int oif, + __be32 *saddr, + const struct ip_tunnel_key *key, + __be16 
sport, __be16 dport, u8 tos, + struct dst_cache *dst_cache); +struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, + struct net_device *dev, + struct net *net, + struct socket *sock, int oif, + struct in6_addr *saddr, + const struct ip_tunnel_key *key, + __be16 sport, __be16 dport, u8 dsfield, + struct dst_cache *dst_cache); + struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, - __be16 flags, __be64 tunnel_id, - int md_size); + const unsigned long *flags, + __be64 tunnel_id, int md_size); #ifdef CONFIG_INET static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) @@ -166,20 +191,31 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) } #endif -static inline void udp_tunnel_encap_enable(struct socket *sock) +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) { - struct udp_sock *up = udp_sk(sock->sk); + udp_tunnel_update_gro_rcv(sk, false); + udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); +} - if (up->encap_enabled) +static inline void udp_tunnel_encap_enable(struct sock *sk) +{ + if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) return; - up->encap_enabled = 1; #if IS_ENABLED(CONFIG_IPV6) - if (sock->sk->sk_family == PF_INET6) + if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); - else #endif - udp_encap_enable(); + udp_encap_enable(); } #define UDP_TUNNEL_NIC_MAX_TABLES 4 @@ -323,6 +359,8 @@ udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, static inline void udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->add_port(dev, ti); } @@ -330,6 +368,8 @@ udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) static inline void udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->del_port(dev, ti); } diff --git a/include/net/udplite.h b/include/net/udplite.h index 9185e45b997f..786919d29f8d 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -6,6 +6,7 @@ #define _UDPLITE_H #include <net/ip6_checksum.h> +#include <net/udp.h> /* UDP-Lite socket options */ #define UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */ @@ -24,14 +25,6 @@ static __inline__ int udplite_getfrag(void *from, char *to, int offset, return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT; } -/* Designate sk as UDP-Lite socket */ -static inline int udplite_sk_init(struct sock *sk) -{ - udp_init_sock(sk); - udp_sk(sk)->pcflag = UDPLITE_BIT; - return 0; -} - /* * Checksumming routines */ @@ -70,60 +63,21 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) return 0; } -/* Slow-path computation of checksum. Socket is locked. 
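Both this deleted slow path and the reworked udplite_csum() kept below apply the same UDP-Lite coverage rule: pcslen == 0 signifies full coverage, a positive pcslen shorter than the payload truncates the checksummed span, and an over-long (illegal) pcslen falls back to full coverage. A standalone model of just that rule, assuming nothing beyond libc; udplite_cov() is an invented name, not kernel code:

#include <stdio.h>

/* Model of the send-side coverage clamp described in the hunk. */
static int udplite_cov(int pcslen, int len)
{
	int cscov = len;

	if (pcslen > 0 && pcslen < len)
		cscov = pcslen;
	return cscov;
}

int main(void)
{
	printf("%d\n", udplite_cov(0, 1200));    /* 1200: full coverage */
	printf("%d\n", udplite_cov(20, 1200));   /* 20: headers-only coverage */
	printf("%d\n", udplite_cov(2000, 1200)); /* 1200: illegal value, fall back */
	return 0;
}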
*/ -static inline __wsum udplite_csum_outgoing(struct sock *sk, struct sk_buff *skb) -{ - const struct udp_sock *up = udp_sk(skb->sk); - int cscov = up->len; - __wsum csum = 0; - - if (up->pcflag & UDPLITE_SEND_CC) { - /* - * Sender has set `partial coverage' option on UDP-Lite socket. - * The special case "up->pcslen == 0" signifies full coverage. - */ - if (up->pcslen < up->len) { - if (0 < up->pcslen) - cscov = up->pcslen; - udp_hdr(skb)->len = htons(up->pcslen); - } - /* - * NOTE: Causes for the error case `up->pcslen > up->len': - * (i) Application error (will not be penalized). - * (ii) Payload too big for send buffer: data is split - * into several packets, each with its own header. - * In this case (e.g. last segment), coverage may - * exceed packet length. - * Since packets with coverage length > packet length are - * illegal, we fall back to the defaults here. - */ - } - - skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ - - skb_queue_walk(&sk->sk_write_queue, skb) { - const int off = skb_transport_offset(skb); - const int len = skb->len - off; - - csum = skb_checksum(skb, off, (cscov > len)? len : cscov, csum); - - if ((cscov -= len) <= 0) - break; - } - return csum; -} - /* Fast-path computation of checksum. Socket may not be locked. */ static inline __wsum udplite_csum(struct sk_buff *skb) { - const struct udp_sock *up = udp_sk(skb->sk); const int off = skb_transport_offset(skb); + const struct sock *sk = skb->sk; int len = skb->len - off; - if ((up->pcflag & UDPLITE_SEND_CC) && up->pcslen < len) { - if (0 < up->pcslen) - len = up->pcslen; - udp_hdr(skb)->len = htons(up->pcslen); + if (udp_test_bit(UDPLITE_SEND_CC, sk)) { + u16 pcslen = READ_ONCE(udp_sk(sk)->pcslen); + + if (pcslen < len) { + if (pcslen > 0) + len = pcslen; + udp_hdr(skb)->len = htons(pcslen); + } } skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ @@ -131,6 +85,4 @@ static inline __wsum udplite_csum(struct sk_buff *skb) } void udplite4_register(void); -int udplite_get_port(struct sock *sk, unsigned short snum, - int (*scmp)(const struct sock *, const struct sock *)); #endif /* _UDPLITE_H */ diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 08537aa14f7c..e2f7ca045d3e 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -3,6 +3,7 @@ #define __NET_VXLAN_H 1 #include <linux/if_vlan.h> +#include <linux/rhashtable-types.h> #include <net/udp_tunnel.h> #include <net/dst_metadata.h> #include <net/rtnetlink.h> @@ -10,6 +11,7 @@ #include <net/nexthop.h> #define IANA_VXLAN_UDP_PORT 4789 +#define IANA_VXLAN_GPE_UDP_PORT 4790 /* VXLAN protocol (RFC 7348) header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ @@ -208,22 +210,49 @@ struct vxlan_rdst { }; struct vxlan_config { - union vxlan_addr remote_ip; - union vxlan_addr saddr; - __be32 vni; - int remote_ifindex; - int mtu; - __be16 dst_port; - u16 port_min; - u16 port_max; - u8 tos; - u8 ttl; - __be32 label; - u32 flags; - unsigned long age_interval; - unsigned int addrmax; - bool no_share; - enum ifla_vxlan_df df; + union vxlan_addr remote_ip; + union vxlan_addr saddr; + __be32 vni; + int remote_ifindex; + int mtu; + __be16 dst_port; + u16 port_min; + u16 port_max; + u8 tos; + u8 ttl; + __be32 label; + enum ifla_vxlan_label_policy label_policy; + u32 flags; + unsigned long age_interval; + unsigned int addrmax; + bool no_share; + enum ifla_vxlan_df df; + struct vxlanhdr reserved_bits; +}; + +enum { + VXLAN_VNI_STATS_RX, + VXLAN_VNI_STATS_RX_DROPS, + VXLAN_VNI_STATS_RX_ERRORS, + 
VXLAN_VNI_STATS_TX, + VXLAN_VNI_STATS_TX_DROPS, + VXLAN_VNI_STATS_TX_ERRORS, +}; + +struct vxlan_vni_stats { + u64 rx_packets; + u64 rx_bytes; + u64 rx_drops; + u64 rx_errors; + u64 tx_packets; + u64 tx_bytes; + u64 tx_drops; + u64 tx_errors; +}; + +struct vxlan_vni_stats_pcpu { + struct vxlan_vni_stats stats; + struct u64_stats_sync syncp; }; struct vxlan_dev_node { @@ -231,6 +260,26 @@ struct vxlan_dev_node { struct vxlan_dev *vxlan; }; +struct vxlan_vni_node { + struct rhash_head vnode; + struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ +#if IS_ENABLED(CONFIG_IPV6) + struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */ +#endif + struct list_head vlist; + __be32 vni; + union vxlan_addr remote_ip; /* default remote ip for this vni */ + struct vxlan_vni_stats_pcpu __percpu *stats; + + struct rcu_head rcu; +}; + +struct vxlan_vni_group { + struct rhashtable vni_hash; + struct list_head vni_list; + u32 num_vnis; +}; + /* Pseudo network device */ struct vxlan_dev { struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ @@ -247,13 +296,20 @@ struct vxlan_dev { struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; - spinlock_t hash_lock[FDB_HASH_SIZE]; + spinlock_t hash_lock; unsigned int addrcnt; struct gro_cells gro_cells; struct vxlan_config cfg; - struct hlist_head fdb_head[FDB_HASH_SIZE]; + struct vxlan_vni_group __rcu *vnigrp; + + struct rhashtable fdb_hash_tbl; + + struct rhashtable mdb_tbl; + struct hlist_head fdb_list; + struct hlist_head mdb_list; + unsigned int mdb_seq; }; #define VXLAN_F_LEARN 0x01 @@ -273,6 +329,9 @@ struct vxlan_dev { #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 #define VXLAN_F_TTL_INHERIT 0x10000 +#define VXLAN_F_VNIFILTER 0x20000 +#define VXLAN_F_MDB 0x40000 +#define VXLAN_F_LOCALBYPASS 0x80000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -282,7 +341,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ - VXLAN_F_COLLECT_METADATA) + VXLAN_F_COLLECT_METADATA | \ + VXLAN_F_VNIFILTER) /* Flags that can be set together with VXLAN_F_GPE. */ #define VXLAN_F_ALLOWED_GPE (VXLAN_F_GPE | \ @@ -291,7 +351,9 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM_TX | \ VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ - VXLAN_F_COLLECT_METADATA) + VXLAN_F_COLLECT_METADATA | \ + VXLAN_F_VNIFILTER | \ + VXLAN_F_LOCALBYPASS) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); @@ -327,10 +389,15 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, return features; } -/* IP header + UDP + VXLAN + Ethernet header */ -#define VXLAN_HEADROOM (20 + 8 + 8 + 14) -/* IPv6 header + UDP + VXLAN + Ethernet header */ -#define VXLAN6_HEADROOM (40 + 8 + 8 + 14) +static inline int vxlan_headroom(u32 flags) +{ + /* VXLAN: IP4/6 header + UDP + VXLAN + Ethernet header */ + /* VXLAN-GPE: IP4/6 header + UDP + VXLAN */ + return (flags & VXLAN_F_IPV6 ? sizeof(struct ipv6hdr) : + sizeof(struct iphdr)) + + sizeof(struct udphdr) + sizeof(struct vxlanhdr) + + (flags & VXLAN_F_GPE ? 
0 : ETH_HLEN); +} static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) { @@ -492,12 +559,12 @@ static inline void vxlan_flag_attr_error(int attrtype, } static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, - int hash, + u32 hash, struct vxlan_rdst *rdst) { struct fib_nh_common *nhc; - nhc = nexthop_path_fdb_result(nh, hash); + nhc = nexthop_path_fdb_result(nh, hash >> 1); if (unlikely(!nhc)) return false; @@ -515,4 +582,23 @@ static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, return true; } +static inline void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, const struct vxlan_metadata *md) +{ + struct vxlanhdr_gbp *gbp; + + if (!md->gbp) + return; + + gbp = (struct vxlanhdr_gbp *)vxh; + vxh->vx_flags |= VXLAN_HF_GBP; + + if (md->gbp & VXLAN_GBP_DONT_LEARN) + gbp->dont_learn = 1; + + if (md->gbp & VXLAN_GBP_POLICY_APPLIED) + gbp->policy_applied = 1; + + gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); +} + #endif diff --git a/include/net/wimax.h b/include/net/wimax.h deleted file mode 100644 index f6e31d2f47aa..000000000000 --- a/include/net/wimax.h +++ /dev/null @@ -1,503 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Linux WiMAX - * Kernel space API for accessing WiMAX devices - * - * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com> - * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> - * - * The WiMAX stack provides an API for controlling and managing the - * system's WiMAX devices. This API affects the control plane; the - * data plane is accessed via the network stack (netdev). - * - * Parts of the WiMAX stack API and notifications are exported to - * user space via Generic Netlink. In user space, libwimax (part of - * the wimax-tools package) provides a shim layer for accessing those - * calls. - * - * The API is standarized for all WiMAX devices and different drivers - * implement the backend support for it. However, device-specific - * messaging pipes are provided that can be used to issue commands and - * receive notifications in free form. - * - * Currently the messaging pipes are the only means of control as it - * is not known (due to the lack of more devices in the market) what - * will be a good abstraction layer. Expect this to change as more - * devices show in the market. This API is designed to be growable in - * order to address this problem. - * - * USAGE - * - * Embed a `struct wimax_dev` at the beginning of the device's - * private structure, initialize and register it. For details, see - * `struct wimax_dev`s documentation. - * - * Once this is done, wimax-tools's libwimaxll can be used to - * communicate with the driver from user space. You user space - * application does not have to forcibily use libwimaxll and can talk - * the generic netlink protocol directly if desired. - * - * Remember this is a very low level API that will to provide all of - * WiMAX features. Other daemons and services running in user space - * are the expected clients of it. They offer a higher level API that - * applications should use (an example of this is the Intel's WiMAX - * Network Service for the i2400m). - * - * DESIGN - * - * Although not set on final stone, this very basic interface is - * mostly completed. Remember this is meant to grow as new common - * operations are decided upon. New operations will be added to the - * interface, intent being on keeping backwards compatibility as much - * as possible. 
- * - * This layer implements a set of calls to control a WiMAX device, - * exposing a frontend to the rest of the kernel and user space (via - * generic netlink) and a backend implementation in the driver through - * function pointers. - * - * WiMAX devices have a state, and a kernel-only API allows the - * drivers to manipulate that state. State transitions are atomic, and - * only some of them are allowed (see `enum wimax_st`). - * - * Most API calls will set the state automatically; in most cases - * drivers have to only report state changes due to external - * conditions. - * - * All API operations are 'atomic', serialized through a mutex in the - * `struct wimax_dev`. - * - * EXPORTING TO USER SPACE THROUGH GENERIC NETLINK - * - * The API is exported to user space using generic netlink (other - * methods can be added as needed). - * - * There is a Generic Netlink Family named "WiMAX", where interfaces - * supporting the WiMAX interface receive commands and broadcast their - * signals over a multicast group named "msg". - * - * Mapping to the source/destination interface is done by an interface - * index attribute. - * - * For user-to-kernel traffic (commands) we use a function call - * marshalling mechanism, where a message X with attributes A, B, C - * sent from user space to kernel space means executing the WiMAX API - * call wimax_X(A, B, C), sending the results back as a message. - * - * Kernel-to-user (notifications or signals) communication is sent - * over multicast groups. This allows to have multiple applications - * monitoring them. - * - * Each command/signal gets assigned it's own attribute policy. This - * way the validator will verify that all the attributes in there are - * only the ones that should be for each command/signal. Thing of an - * attribute mapping to a type+argumentname for each command/signal. - * - * If we had a single policy for *all* commands/signals, after running - * the validator we'd have to check "does this attribute belong in - * here"? for each one. It can be done manually, but it's just easier - * to have the validator do that job with multiple policies. As well, - * it makes it easier to later expand each command/signal signature - * without affecting others and keeping the namespace more or less - * sane. Not that it is too complicated, but it makes it even easier. - * - * No state information is maintained in the kernel for each user - * space connection (the connection is stateless). - * - * TESTING FOR THE INTERFACE AND VERSIONING - * - * If network interface X is a WiMAX device, there will be a Generic - * Netlink family named "WiMAX X" and the device will present a - * "wimax" directory in it's network sysfs directory - * (/sys/class/net/DEVICE/wimax) [used by HAL]. - * - * The inexistence of any of these means the device does not support - * this WiMAX API. - * - * By querying the generic netlink controller, versioning information - * and the multicast groups available can be found. Applications using - * the interface can either rely on that or use the generic netlink - * controller to figure out which generic netlink commands/signals are - * supported. - * - * NOTE: this versioning is a last resort to avoid hard - * incompatibilities. It is the intention of the design of this - * stack not to introduce backward incompatible changes. - * - * The version code has to fit in one byte (restrictions imposed by - * generic netlink); we use `version / 10` for the major version and - * `version % 10` for the minor. 
This gives 9 minors for each major - * and 25 majors. - * - * The version change protocol is as follow: - * - * - Major versions: needs to be increased if an existing message/API - * call is changed or removed. Doesn't need to be changed if a new - * message is added. - * - * - Minor version: needs to be increased if new messages/API calls are - * being added or some other consideration that doesn't impact the - * user-kernel interface too much (like some kind of bug fix) and - * that is kind of left up in the air to common sense. - * - * User space code should not try to work if the major version it was - * compiled for differs from what the kernel offers. As well, if the - * minor version of the kernel interface is lower than the one user - * space is expecting (the one it was compiled for), the kernel - * might be missing API calls; user space shall be ready to handle - * said condition. Use the generic netlink controller operations to - * find which ones are supported and which not. - * - * libwimaxll:wimaxll_open() takes care of checking versions. - * - * THE OPERATIONS: - * - * Each operation is defined in its on file (drivers/net/wimax/op-*.c) - * for clarity. The parts needed for an operation are: - * - * - a function pointer in `struct wimax_dev`: optional, as the - * operation might be implemented by the stack and not by the - * driver. - * - * All function pointers are named wimax_dev->op_*(), and drivers - * must implement them except where noted otherwise. - * - * - When exported to user space, a `struct nla_policy` to define the - * attributes of the generic netlink command and a `struct genl_ops` - * to define the operation. - * - * All the declarations for the operation codes (WIMAX_GNL_OP_<NAME>) - * and generic netlink attributes (WIMAX_GNL_<NAME>_*) are declared in - * include/linux/wimax.h; this file is intended to be cloned by user - * space to gain access to those declarations. - * - * A few caveats to remember: - * - * - Need to define attribute numbers starting in 1; otherwise it - * fails. - * - * - the `struct genl_family` requires a maximum attribute id; when - * defining the `struct nla_policy` for each message, it has to have - * an array size of WIMAX_GNL_ATTR_MAX+1. - * - * The op_*() function pointers will not be called if the wimax_dev is - * in a state <= %WIMAX_ST_UNINITIALIZED. The exception is: - * - * - op_reset: can be called at any time after wimax_dev_add() has - * been called. - * - * THE PIPE INTERFACE: - * - * This interface is kept intentionally simple. The driver can send - * and receive free-form messages to/from user space through a - * pipe. See drivers/net/wimax/op-msg.c for details. - * - * The kernel-to-user messages are sent with - * wimax_msg(). user-to-kernel messages are delivered via - * wimax_dev->op_msg_from_user(). - * - * RFKILL: - * - * RFKILL support is built into the wimax_dev layer; the driver just - * needs to call wimax_report_rfkill_{hw,sw}() to inform of changes in - * the hardware or software RF kill switches. When the stack wants to - * turn the radio off, it will call wimax_dev->op_rfkill_sw_toggle(), - * which the driver implements. - * - * User space can set the software RF Kill switch by calling - * wimax_rfkill(). - * - * The code for now only supports devices that don't require polling; - * If the device needs to be polled, create a self-rearming delayed - * work struct for polling or look into adding polled support to the - * WiMAX stack. 
- * - * When initializing the hardware (_probe), after calling - * wimax_dev_add(), query the device for it's RF Kill switches status - * and feed it back to the WiMAX stack using - * wimax_report_rfkill_{hw,sw}(). If any switch is missing, always - * report it as ON. - * - * NOTE: the wimax stack uses an inverted terminology to that of the - * RFKILL subsystem: - * - * - ON: radio is ON, RFKILL is DISABLED or OFF. - * - OFF: radio is OFF, RFKILL is ENABLED or ON. - * - * MISCELLANEOUS OPS: - * - * wimax_reset() can be used to reset the device to power on state; by - * default it issues a warm reset that maintains the same device - * node. If that is not possible, it falls back to a cold reset - * (device reconnect). The driver implements the backend to this - * through wimax_dev->op_reset(). - */ - -#ifndef __NET__WIMAX_H__ -#define __NET__WIMAX_H__ - -#include <linux/wimax.h> -#include <net/genetlink.h> -#include <linux/netdevice.h> - -struct net_device; -struct genl_info; -struct wimax_dev; - -/** - * struct wimax_dev - Generic WiMAX device - * - * @net_dev: [fill] Pointer to the &struct net_device this WiMAX - * device implements. - * - * @op_msg_from_user: [fill] Driver-specific operation to - * handle a raw message from user space to the driver. The - * driver can send messages to user space using with - * wimax_msg_to_user(). - * - * @op_rfkill_sw_toggle: [fill] Driver-specific operation to act on - * userspace (or any other agent) requesting the WiMAX device to - * change the RF Kill software switch (WIMAX_RF_ON or - * WIMAX_RF_OFF). - * If such hardware support is not present, it is assumed the - * radio cannot be switched off and it is always on (and the stack - * will error out when trying to switch it off). In such case, - * this function pointer can be left as NULL. - * - * @op_reset: [fill] Driver specific operation to reset the - * device. - * This operation should always attempt first a warm reset that - * does not disconnect the device from the bus and return 0. - * If that fails, it should resort to some sort of cold or bus - * reset (even if it implies a bus disconnection and device - * disappearance). In that case, -ENODEV should be returned to - * indicate the device is gone. - * This operation has to be synchronous, and return only when the - * reset is complete. In case of having had to resort to bus/cold - * reset implying a device disconnection, the call is allowed to - * return immediately. - * NOTE: wimax_dev->mutex is NOT locked when this op is being - * called; however, wimax_dev->mutex_reset IS locked to ensure - * serialization of calls to wimax_reset(). - * See wimax_reset()'s documentation. - * - * @name: [fill] A way to identify this device. We need to register a - * name with many subsystems (rfkill, workqueue creation, etc). - * We can't use the network device name as that - * might change and in some instances we don't know it yet (until - * we don't call register_netdev()). So we generate an unique one - * using the driver name and device bus id, place it here and use - * it across the board. Recommended naming: - * DRIVERNAME-BUSNAME:BUSID (dev->bus->name, dev->bus_id). - * - * @id_table_node: [private] link to the list of wimax devices kept by - * id-table.c. Protected by it's own spinlock. - * - * @mutex: [private] Serializes all concurrent access and execution of - * operations. - * - * @mutex_reset: [private] Serializes reset operations. 
Needs to be a - * different mutex because as part of the reset operation, the - * driver has to call back into the stack to do things such as - * state change, that require wimax_dev->mutex. - * - * @state: [private] Current state of the WiMAX device. - * - * @rfkill: [private] integration into the RF-Kill infrastructure. - * - * @rf_sw: [private] State of the software radio switch (OFF/ON) - * - * @rf_hw: [private] State of the hardware radio switch (OFF/ON) - * - * @debugfs_dentry: [private] Used to hook up a debugfs entry. This - * shows up in the debugfs root as wimax\:DEVICENAME. - * - * Description: - * This structure defines a common interface to access all WiMAX - * devices from different vendors and provides a common API as well as - * a free-form device-specific messaging channel. - * - * Usage: - * 1. Embed a &struct wimax_dev at *the beginning* the network - * device structure so that netdev_priv() points to it. - * - * 2. memset() it to zero - * - * 3. Initialize with wimax_dev_init(). This will leave the WiMAX - * device in the %__WIMAX_ST_NULL state. - * - * 4. Fill all the fields marked with [fill]; once called - * wimax_dev_add(), those fields CANNOT be modified. - * - * 5. Call wimax_dev_add() *after* registering the network - * device. This will leave the WiMAX device in the %WIMAX_ST_DOWN - * state. - * Protect the driver's net_device->open() against succeeding if - * the wimax device state is lower than %WIMAX_ST_DOWN. - * - * 6. Select when the device is going to be turned on/initialized; - * for example, it could be initialized on 'ifconfig up' (when the - * netdev op 'open()' is called on the driver). - * - * When the device is initialized (at `ifconfig up` time, or right - * after calling wimax_dev_add() from _probe(), make sure the - * following steps are taken - * - * a. Move the device to %WIMAX_ST_UNINITIALIZED. This is needed so - * some API calls that shouldn't work until the device is ready - * can be blocked. - * - * b. Initialize the device. Make sure to turn the SW radio switch - * off and move the device to state %WIMAX_ST_RADIO_OFF when - * done. When just initialized, a device should be left in RADIO - * OFF state until user space devices to turn it on. - * - * c. Query the device for the state of the hardware rfkill switch - * and call wimax_rfkill_report_hw() and wimax_rfkill_report_sw() - * as needed. See below. - * - * wimax_dev_rm() undoes before unregistering the network device. Once - * wimax_dev_add() is called, the driver can get called on the - * wimax_dev->op_* function pointers - * - * CONCURRENCY: - * - * The stack provides a mutex for each device that will disallow API - * calls happening concurrently; thus, op calls into the driver - * through the wimax_dev->op*() function pointers will always be - * serialized and *never* concurrent. - * - * For locking, take wimax_dev->mutex is taken; (most) operations in - * the API have to check for wimax_dev_is_ready() to return 0 before - * continuing (this is done internally). - * - * REFERENCE COUNTING: - * - * The WiMAX device is reference counted by the associated network - * device. The only operation that can be used to reference the device - * is wimax_dev_get_by_genl_info(), and the reference it acquires has - * to be released with dev_put(wimax_dev->net_dev). - * - * RFKILL: - * - * At startup, both HW and SW radio switchess are assumed to be off. 
- * - * At initialization time [after calling wimax_dev_add()], have the - * driver query the device for the status of the software and hardware - * RF kill switches and call wimax_report_rfkill_hw() and - * wimax_rfkill_report_sw() to indicate their state. If any is - * missing, just call it to indicate it is ON (radio always on). - * - * Whenever the driver detects a change in the state of the RF kill - * switches, it should call wimax_report_rfkill_hw() or - * wimax_report_rfkill_sw() to report it to the stack. - */ -struct wimax_dev { - struct net_device *net_dev; - struct list_head id_table_node; - struct mutex mutex; /* Protects all members and API calls */ - struct mutex mutex_reset; - enum wimax_st state; - - int (*op_msg_from_user)(struct wimax_dev *wimax_dev, - const char *, - const void *, size_t, - const struct genl_info *info); - int (*op_rfkill_sw_toggle)(struct wimax_dev *wimax_dev, - enum wimax_rf_state); - int (*op_reset)(struct wimax_dev *wimax_dev); - - struct rfkill *rfkill; - unsigned int rf_hw; - unsigned int rf_sw; - char name[32]; - - struct dentry *debugfs_dentry; -}; - - - -/* - * WiMAX stack public API for device drivers - * ----------------------------------------- - * - * These functions are not exported to user space. - */ -void wimax_dev_init(struct wimax_dev *); -int wimax_dev_add(struct wimax_dev *, struct net_device *); -void wimax_dev_rm(struct wimax_dev *); - -static inline -struct wimax_dev *net_dev_to_wimax(struct net_device *net_dev) -{ - return netdev_priv(net_dev); -} - -static inline -struct device *wimax_dev_to_dev(struct wimax_dev *wimax_dev) -{ - return wimax_dev->net_dev->dev.parent; -} - -void wimax_state_change(struct wimax_dev *, enum wimax_st); -enum wimax_st wimax_state_get(struct wimax_dev *); - -/* - * Radio Switch state reporting. - * - * enum wimax_rf_state is declared in linux/wimax.h so the exports - * to user space can use it. - */ -void wimax_report_rfkill_hw(struct wimax_dev *, enum wimax_rf_state); -void wimax_report_rfkill_sw(struct wimax_dev *, enum wimax_rf_state); - - -/* - * Free-form messaging to/from user space - * - * Sending a message: - * - * wimax_msg(wimax_dev, pipe_name, buf, buf_size, GFP_KERNEL); - * - * Broken up: - * - * skb = wimax_msg_alloc(wimax_dev, pipe_name, buf_size, GFP_KERNEL); - * ...fill up skb... - * wimax_msg_send(wimax_dev, pipe_name, skb); - * - * Be sure not to modify skb->data in the middle (ie: don't use - * skb_push()/skb_pull()/skb_reserve() on the skb). - * - * "pipe_name" is any string, that can be interpreted as the name of - * the pipe or recipient; the interpretation of it is driver - * specific, so the recipient can multiplex it as wished. It can be - * NULL, it won't be used - an example is using a "diagnostics" tag to - * send diagnostics information that a device-specific diagnostics - * tool would be interested in. - */ -struct sk_buff *wimax_msg_alloc(struct wimax_dev *, const char *, const void *, - size_t, gfp_t); -int wimax_msg_send(struct wimax_dev *, struct sk_buff *); -int wimax_msg(struct wimax_dev *, const char *, const void *, size_t, gfp_t); - -const void *wimax_msg_data_len(struct sk_buff *, size_t *); -const void *wimax_msg_data(struct sk_buff *); -ssize_t wimax_msg_len(struct sk_buff *); - - -/* - * WiMAX stack user space API - * -------------------------- - * - * This API is what gets exported to user space for general - * operations. As well, they can be called from within the kernel, - * (with a properly referenced `struct wimax_dev`). 
- * - * Properly referenced means: the 'struct net_device' that embeds the - * device's control structure and (as such) the 'struct wimax_dev' is - * referenced by the caller. - */ -int wimax_rfkill(struct wimax_dev *, enum wimax_rf_state); -int wimax_reset(struct wimax_dev *); - -#endif /* #ifndef __NET__WIMAX_H__ */ diff --git a/include/net/x25.h b/include/net/x25.h index d7d6c2b4ffa7..5e833cfc864e 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -81,7 +81,7 @@ enum { #define X25_DEFAULT_WINDOW_SIZE 2 /* Default Window Size */ #define X25_DEFAULT_PACKET_SIZE X25_PS128 /* Default Packet Size */ -#define X25_DEFAULT_THROUGHPUT 0x0A /* Deafult Throughput */ +#define X25_DEFAULT_THROUGHPUT 0x0A /* Default Throughput */ #define X25_DEFAULT_REVERSE 0x00 /* Default Reverse Charging */ #define X25_SMODULUS 8 @@ -177,10 +177,7 @@ struct x25_forward { atomic_t refcnt; }; -static inline struct x25_sock *x25_sk(const struct sock *sk) -{ - return (struct x25_sock *)sk; -} +#define x25_sk(ptr) container_of_const(ptr, struct x25_sock, sk) /* af_x25.c */ extern int sysctl_x25_restart_request_timeout; diff --git a/include/net/xdp.h b/include/net/xdp.h index 3814fb631d52..b40f1f96cb11 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -6,22 +6,27 @@ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ +#include <linux/bitfield.h> +#include <linux/filter.h> +#include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ +#include <net/page_pool/types.h> + /** * DOC: XDP RX-queue information * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how - * the driver have configured a given RX-ring queue. + * the driver has configured a given RX-ring queue. * - * Each xdp_buff frame received in the driver carry a (pointer) + * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP * data-path read-access to RX-info for both kernel and bpf-side * (limited subset). * * For now, direct access is only safe while running in NAPI/softirq - * context. Contents is read-mostly and must not be updated during + * context. Contents are read-mostly and must not be updated during * driver NAPI/softirq poll. * * The driver usage API is a register and unregister API. @@ -29,9 +34,9 @@ * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver - * naturally need to stop the RX-ring before purging and reallocating - * memory. In that process the driver MUST call unregistor (which - * also apply for driver shutdown and unload). The register API is + * naturally needs to stop the RX-ring before purging and reallocating + * memory. In that process the driver MUST call unregister (which + * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. 
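To make the register/unregister contract concrete from the driver side: a minimal sketch, assuming a hypothetical my_ring structure whose netdev, queue_index, napi and page_pool fields are illustrative and not part of this patch (the helpers used are declared further down in this file).

static int my_ring_setup_xdp(struct my_ring *ring)
{
	int err;

	/* mandatory during RX-ring setup */
	err = xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
			       ring->queue_index, ring->napi.napi_id);
	if (err)
		return err;

	/* describe how returned buffers are recycled */
	err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
					 MEM_TYPE_PAGE_POOL,
					 ring->page_pool);
	if (err)
		xdp_rxq_info_unreg(&ring->xdp_rxq);

	return err;
}

The matching xdp_rxq_info_unreg() call is what the text above marks as mandatory when purging the RX-ring and on driver shutdown and unload.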
*/ @@ -59,12 +64,20 @@ struct xdp_rxq_info { u32 queue_index; u32 reg_state; struct xdp_mem_info mem; + u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { struct net_device *dev; }; +enum xdp_buff_flags { + XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ + XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under + * pressure + */ +}; + struct xdp_buff { void *data; void *data_end; @@ -73,8 +86,55 @@ struct xdp_buff { struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ + u32 flags; /* supported values defined in xdp_buff_flags */ }; +static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); +} + +static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) +{ + xdp->flags |= XDP_FLAGS_HAS_FRAGS; +} + +static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; +} + +static __always_inline bool +xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); +} + +static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) +{ + xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; +} + +static __always_inline void +xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) +{ + xdp->frame_sz = frame_sz; + xdp->rxq = rxq; + xdp->flags = 0; +} + +static __always_inline void +xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, + int headroom, int data_len, const bool meta_valid) +{ + unsigned char *data = hard_start + headroom; + + xdp->data_hard_start = hard_start; + xdp->data = data; + xdp->data_end = data + data_len; + xdp->data_meta = meta_valid ? data : data + 1; +} + /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the @@ -86,27 +146,151 @@ struct xdp_buff { SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * -xdp_get_shared_info_from_buff(struct xdp_buff *xdp) +xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } +static __always_inline unsigned int +xdp_get_buff_len(const struct xdp_buff *xdp) +{ + unsigned int len = xdp->data_end - xdp->data; + const struct skb_shared_info *sinfo; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + len += sinfo->xdp_frags_size; +out: + return len; +} + +void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp); + +/** + * __xdp_buff_add_frag - attach frag to &xdp_buff + * @xdp: XDP buffer to attach the frag to + * @netmem: network memory containing the frag + * @offset: offset at which the frag starts + * @size: size of the frag + * @truesize: total memory size occupied by the frag + * @try_coalesce: whether to try coalescing the frags (not valid for XSk) + * + * Attach frag to the XDP buffer. If it currently has no frags attached, + * initialize the related fields, otherwise check that the frag number + * didn't reach the limit of ``MAX_SKB_FRAGS``. If possible, try coalescing + * the frag with the previous one. + * The function doesn't check/update the pfmemalloc bit. Please use the + * non-underscored wrapper in drivers. + * + * Return: true on success, false if there's no space for the frag in + * the shared info struct. 
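xdp_init_buff() and xdp_prepare_buff() above split buffer setup into a per-NAPI-cycle part and a per-packet part. A hedged sketch of the intended call pattern; my_ring, page and pkt_len are placeholders, not part of this patch.

static void my_ring_fill_buff(struct my_ring *ring, struct xdp_buff *xdp,
			      struct page *page, u32 pkt_len)
{
	/* frame_sz and rxq are invariant across one NAPI cycle */
	xdp_init_buff(xdp, PAGE_SIZE, &ring->xdp_rxq);

	/* per packet: data starts after the reserved headroom */
	xdp_prepare_buff(xdp, page_address(page), XDP_PACKET_HEADROOM,
			 pkt_len, true /* metadata area valid */);
}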
+ */ +static inline bool __xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, + u32 offset, u32 size, u32 truesize, + bool try_coalesce) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + skb_frag_t *prev; + u32 nr_frags; + + if (!xdp_buff_has_frags(xdp)) { + xdp_buff_set_frags_flag(xdp); + + nr_frags = 0; + sinfo->xdp_frags_size = 0; + sinfo->xdp_frags_truesize = 0; + + goto fill; + } + + nr_frags = sinfo->nr_frags; + prev = &sinfo->frags[nr_frags - 1]; + + if (try_coalesce && netmem == skb_frag_netmem(prev) && + offset == skb_frag_off(prev) + skb_frag_size(prev)) { + skb_frag_size_add(prev, size); + /* Guaranteed to only decrement the refcount */ + xdp_return_frag(netmem, xdp); + } else if (unlikely(nr_frags == MAX_SKB_FRAGS)) { + return false; + } else { +fill: + __skb_fill_netmem_desc_noacc(sinfo, nr_frags++, netmem, + offset, size); + } + + sinfo->nr_frags = nr_frags; + sinfo->xdp_frags_size += size; + sinfo->xdp_frags_truesize += truesize; + + return true; +} + +/** + * xdp_buff_add_frag - attach frag to &xdp_buff + * @xdp: XDP buffer to attach the frag to + * @netmem: network memory containing the frag + * @offset: offset at which the frag starts + * @size: size of the frag + * @truesize: total memory size occupied by the frag + * + * Version of __xdp_buff_add_frag() which takes care of the pfmemalloc bit. + * + * Return: true on success, false if there's no space for the frag in + * the shared info struct. + */ +static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, + u32 offset, u32 size, u32 truesize) +{ + if (!__xdp_buff_add_frag(xdp, netmem, offset, size, truesize, true)) + return false; + + if (unlikely(netmem_is_pfmemalloc(netmem))) + xdp_buff_set_frag_pfmemalloc(xdp); + + return true; +} + struct xdp_frame { void *data; - u16 len; - u16 headroom; - u32 metasize:8; - u32 frame_sz:24; + u32 len; + u32 headroom; + u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem info is valid on remote CPU. + * while mem_type is valid on remote CPU. */ - struct xdp_mem_info mem; + enum xdp_mem_type mem_type:32; struct net_device *dev_rx; /* used by cpumap */ + u32 frame_sz; + u32 flags; /* supported values defined in xdp_buff_flags */ }; +static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) +{ + return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); +} + +static __always_inline bool +xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) +{ + return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); +} + +#define XDP_BULK_QUEUE_SIZE 16 +struct xdp_frame_bulk { + int count; + netmem_ref q[XDP_BULK_QUEUE_SIZE]; +}; + +static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) +{ + bq->count = 0; +} static inline struct skb_shared_info * -xdp_get_shared_info_from_frame(struct xdp_frame *frame) +xdp_get_shared_info_from_frame(const struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); @@ -127,24 +311,54 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) frame->dev_rx = NULL; } +static inline void +xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, + unsigned int size, unsigned int truesize, + bool pfmemalloc) +{ + struct skb_shared_info *sinfo = skb_shinfo(skb); + + sinfo->nr_frags = nr_frags; + /* + * ``destructor_arg`` is unionized with ``xdp_frags_{,true}size``, + * reset it after that these fields aren't used anymore. 
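If __xdp_buff_add_frag()/xdp_buff_add_frag() return false, the caller owns cleanup of the whole multi-buffer frame. A sketch of how an RX path might wrap the helper; the mydrv naming and the descriptor-derived arguments are illustrative.

static bool mydrv_add_rx_frag(struct xdp_buff *xdp, netmem_ref netmem,
			      u32 off, u32 len, u32 truesize)
{
	if (xdp_buff_add_frag(xdp, netmem, off, len, truesize))
		return true;

	/* shared info already holds MAX_SKB_FRAGS: drop the frame */
	xdp_return_buff(xdp);
	return false;
}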
+ */ + sinfo->destructor_arg = NULL; + + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; + skb->pfmemalloc |= pfmemalloc; +} + /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) +struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); +struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev); +struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct net_device *dev); +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline -void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) +void xdp_convert_frame_to_buff(const struct xdp_frame *frame, + struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; + xdp->flags = frame->flags; } static inline -int xdp_update_frame_from_buff(struct xdp_buff *xdp, +int xdp_update_frame_from_buff(const struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; @@ -167,6 +381,7 @@ int xdp_update_frame_from_buff(struct xdp_buff *xdp, xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; + xdp_frame->flags = xdp->flags; return 0; } @@ -185,39 +400,96 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; - /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ - xdp_frame->mem = xdp->rxq->mem; + /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */ + xdp_frame->mem_type = xdp->rxq->mem.type; return xdp_frame; } +void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, + bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq); -/* When sending xdp_frame into the network stack, then there is no - * return point callback, which is needed to release e.g. DMA-mapping - * resources with page_pool. Thus, have explicit function to release - * frame resources. 
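Note that the xdp_frame lives in the buffer's own headroom (xdp_get_shared_info_from_frame() and xdp_update_frame_from_buff() both subtract sizeof(*frame) for this reason), so the conversion never copies packet data. A sketch of a producer, with my_queue and my_enqueue() as hypothetical stand-ins:

static int mydrv_queue_frame(struct my_queue *q, struct xdp_buff *xdp)
{
	struct xdp_frame *xdpf;

	xdpf = xdp_convert_buff_to_frame(xdp);	/* in-place, no copy */
	if (unlikely(!xdpf))
		return -EOVERFLOW;

	/* the consumer can unpack with xdp_convert_frame_to_buff() */
	return my_enqueue(q, xdpf);
}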
- */ -void __xdp_release_frame(void *data, struct xdp_mem_info *mem); -static inline void xdp_release_frame(struct xdp_frame *xdpf) +static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + if (unlikely(!bq->count)) + return; + + page_pool_put_netmem_bulk(bq->q, bq->count); + bq->count = 0; +} + +static __always_inline unsigned int +xdp_get_frame_len(const struct xdp_frame *xdpf) { - struct xdp_mem_info *mem = &xdpf->mem; + const struct skb_shared_info *sinfo; + unsigned int len = xdpf->len; - /* Curr only page_pool needs this */ - if (mem->type == MEM_TYPE_PAGE_POOL) - __xdp_release_frame(xdpf->data, mem); + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + len += sinfo->xdp_frags_size; +out: + return len; +} + +int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index, + unsigned int napi_id, u32 frag_size); +static inline int +xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index, + unsigned int napi_id) +{ + return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); } -int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); +int xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, void *allocator); +void xdp_unreg_mem_model(struct xdp_mem_info *mem); +int xdp_reg_page_pool(struct page_pool *pool); +void xdp_unreg_page_pool(const struct page_pool *pool); +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool); + +/** + * xdp_rxq_info_attach_mem_model - attach registered mem info to RxQ info + * @xdp_rxq: XDP RxQ info to attach the memory info to + * @mem: already registered memory info + * + * If the driver registers its memory providers manually, it must use this + * function instead of xdp_rxq_info_reg_mem_model(). + */ +static inline void +xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq, + const struct xdp_mem_info *mem) +{ + xdp_rxq->mem = *mem; +} + +/** + * xdp_rxq_info_detach_mem_model - detach registered mem info from RxQ info + * @xdp_rxq: XDP RxQ info to detach the memory info from + * + * If the driver registers its memory providers manually and then attaches it + * via xdp_rxq_info_attach_mem_model(), it must call this function before + * xdp_rxq_info_unreg(). + */ +static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->mem = (struct xdp_mem_info){ }; +} /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. 
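xdp_return_frame_bulk()/xdp_flush_frame_bulk() above batch page_pool returns instead of freeing frames one by one. A sketch of the typical TX-completion usage; the ring layout is illustrative.

static void mydrv_clean_xdp_tx(struct mydrv_tx_ring *ring, u32 done)
{
	struct xdp_frame_bulk bq;
	u32 i;

	xdp_frame_bulk_init(&bq);

	for (i = 0; i < done; i++)
		xdp_return_frame_bulk(ring->xdpf[i], &bq);

	/* one bulk put instead of "done" individual returns */
	xdp_flush_frame_bulk(&bq);
}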
@@ -234,17 +506,161 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) return unlikely(xdp->data_meta > xdp->data); } +static inline bool xdp_metalen_invalid(unsigned long metalen) +{ + unsigned long meta_max; + + meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); + BUILD_BUG_ON(!__builtin_constant_p(meta_max)); + + return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; +} + struct xdp_attachment_info { struct bpf_prog *prog; u32 flags; }; struct netdev_bpf; -bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, - struct netdev_bpf *bpf); void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); -#define DEV_MAP_BULK_SIZE 16 +#define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE + +/* Define the relationship between xdp-rx-metadata kfunc and + * various other entities: + * - xdp_rx_metadata enum + * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) + * - kfunc name + * - xdp_metadata_ops field + */ +#define XDP_METADATA_KFUNC_xxx \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ + NETDEV_XDP_RX_METADATA_TIMESTAMP, \ + bpf_xdp_metadata_rx_timestamp, \ + xmo_rx_timestamp) \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ + NETDEV_XDP_RX_METADATA_HASH, \ + bpf_xdp_metadata_rx_hash, \ + xmo_rx_hash) \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ + NETDEV_XDP_RX_METADATA_VLAN_TAG, \ + bpf_xdp_metadata_rx_vlan_tag, \ + xmo_rx_vlan_tag) \ + +enum xdp_rx_metadata { +#define XDP_METADATA_KFUNC(name, _, __, ___) name, +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC +MAX_XDP_METADATA_KFUNC, +}; + +enum xdp_rss_hash_type { + /* First part: Individual bits for L3/L4 types */ + XDP_RSS_L3_IPV4 = BIT(0), + XDP_RSS_L3_IPV6 = BIT(1), + + /* The fixed (L3) IPv4 and IPv6 headers can both be followed by + * variable/dynamic headers, IPv4 called Options and IPv6 called + * Extension Headers. HW RSS type can contain this info. + */ + XDP_RSS_L3_DYNHDR = BIT(2), + + /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in + * addition to the protocol specific bit. This eases interaction with + * SKBs and avoids reserving a fixed mask for future L4 protocol bits. 
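For the program side of the kfunc table above, a BPF C sketch (assuming the usual vmlinux.h/bpf_helpers.h setup; the extern mirrors the kfunc prototype named in the table):

extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash,
				    enum xdp_rss_hash_type *rss_type) __ksym;

SEC("xdp")
int rx_meta(struct xdp_md *ctx)
{
	enum xdp_rss_hash_type rss_type;
	__u32 hash;

	if (bpf_xdp_metadata_rx_hash(ctx, &hash, &rss_type))
		return XDP_PASS;	/* driver exposes no hash metadata */

	/* XDP_RSS_L4 set means ports were hashed: usable for flow keys */
	if (rss_type & XDP_RSS_L4)
		bpf_printk("l4 flow hash 0x%x", hash);

	return XDP_PASS;
}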
+ */ + XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ + XDP_RSS_L4_TCP = BIT(4), + XDP_RSS_L4_UDP = BIT(5), + XDP_RSS_L4_SCTP = BIT(6), + XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ + XDP_RSS_L4_ICMP = BIT(8), + + /* Second part: RSS hash type combinations used for driver HW mapping */ + XDP_RSS_TYPE_NONE = 0, + XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, + + XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, + XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, + XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, + XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, + + XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, + XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, + XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, + XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, + XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, + + XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, + XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, + XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, + XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, + + XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, + XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, + XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, +}; + +struct xdp_metadata_ops { + int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); + int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type); + int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, + u16 *vlan_tci); +}; + +#ifdef CONFIG_NET +u32 bpf_xdp_metadata_kfunc_id(int id); +bool bpf_dev_bound_kfunc_id(u32 btf_id); +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); +void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg); +void xdp_features_clear_redirect_target(struct net_device *dev); +void xdp_features_clear_redirect_target_locked(struct net_device *dev); +#else +static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } +static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } + +static inline void +xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ +} + +static inline void +xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +{ +} + +static inline void +xdp_features_clear_redirect_target(struct net_device *dev) +{ +} +#endif + +static inline void xdp_clear_features_flag(struct net_device *dev) +{ + xdp_set_features_flag(dev, 0); +} + +static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, + struct xdp_buff *xdp) +{ + /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus + * under local_bh_disable(), which provides the needed RCU protection + * for accessing map entries. 
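On the driver side, the ctx pointer handed to these ops is the driver's own xdp_buff, so drivers embed the buff in a private context and cast back. A sketch with every mydrv_* type invented for illustration:

struct mydrv_rx_desc {
	__le64 ts;
	u8 ts_valid;
};

struct mydrv_xdp_buff {
	struct xdp_buff xdp;	/* must be first: ctx casts back to this */
	const struct mydrv_rx_desc *rx_desc;
};

static int mydrv_xmo_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
{
	const struct mydrv_xdp_buff *mctx = (const void *)ctx;

	if (!mctx->rx_desc->ts_valid)
		return -ENODATA;

	*timestamp = le64_to_cpu(mctx->rx_desc->ts);
	return 0;
}

static const struct xdp_metadata_ops mydrv_xdp_metadata_ops = {
	.xmo_rx_timestamp = mydrv_xmo_rx_timestamp,
};

The table is then hung off the net_device (dev->xdp_metadata_ops) so the kfuncs above can be resolved per driver.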
+ */ + u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); + + if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { + if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) + act = xdp_master_redirect(xdp); + } + return act; +} #endif /* __LINUX_NET_XDP_H__ */ diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h index a9d5b7603b89..c9df68d5f258 100644 --- a/include/net/xdp_priv.h +++ b/include/net/xdp_priv.h @@ -3,6 +3,7 @@ #define __LINUX_NET_XDP_PRIV_H__ #include <linux/rhashtable.h> +#include <net/xdp.h> /* Private to net/core/xdp.c, but used by trace/events/xdp.h */ struct xdp_mem_allocator { @@ -10,7 +11,6 @@ struct xdp_mem_allocator { union { void *allocator; struct page_pool *page_pool; - struct zero_copy_allocator *zc_alloc; }; struct rhash_head node; struct rcu_head rcu; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 1a9559c0cbdd..e8bd6ddb7b12 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -6,6 +6,7 @@ #ifndef _LINUX_XDP_SOCK_H #define _LINUX_XDP_SOCK_H +#include <linux/bpf.h> #include <linux/workqueue.h> #include <linux/if_xdp.h> #include <linux/mutex.h> @@ -13,6 +14,8 @@ #include <linux/mm.h> #include <net/sock.h> +#define XDP_UMEM_SG_FLAG (1 << 1) + struct net_device; struct xsk_queue; struct xdp_buff; @@ -27,16 +30,19 @@ struct xdp_umem { struct user_struct *user; refcount_t users; u8 flags; + u8 tx_metadata_len; bool zc; struct page **pgs; int id; struct list_head xsk_dma_list; + struct work_struct work; }; struct xsk_map { struct bpf_map map; spinlock_t lock; /* Synchronize map updates */ - struct xdp_sock *xsk_map[]; + atomic_t count; + struct xdp_sock __rcu *xsk_map[]; }; struct xdp_sock { @@ -49,6 +55,7 @@ struct xdp_sock { struct xsk_buff_pool *pool; u16 queue_id; bool zc; + bool sg; enum { XSK_READY = 0, XSK_BOUND, @@ -57,17 +64,23 @@ struct xdp_sock { struct xsk_queue *tx ____cacheline_aligned_in_smp; struct list_head tx_list; - /* Mutual exclusion of NAPI TX thread and sendmsg error paths - * in the SKB destructor callback. + /* record the number of tx descriptors sent by this xsk and + * when it exceeds MAX_PER_SOCKET_BUDGET, an opportunity needs + * to be given to other xsks for sending tx descriptors, thereby + * preventing other XSKs from being starved. */ - spinlock_t tx_completion_lock; - /* Protects generic receive. */ - spinlock_t rx_lock; + u32 tx_budget_spent; /* Statistics */ u64 rx_dropped; u64 rx_queue_full; + /* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current + * packet, the partially built skb is saved here so that packet building can resume in next + * call of __xsk_generic_xmit(). + */ + struct sk_buff *skb; + struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; @@ -77,23 +90,115 @@ struct xdp_sock { struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; +/* + * AF_XDP TX metadata hooks for network devices. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. + * + * void (*tmo_request_timestamp)(void *priv) + * Called when AF_XDP frame requested egress timestamp. + * + * u64 (*tmo_fill_timestamp)(void *priv) + * Called when AF_XDP frame, that had requested egress timestamp, + * received a completion. The hook needs to return the actual HW timestamp. + * + * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv) + * Called when AF_XDP frame requested HW checksum offload. 
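Pulling the pieces together, a hedged sketch of per-packet dispatch around bpf_prog_run_xdp() in a driver's NAPI poll; the mydrv_* transmit/pass helpers are placeholders and error handling is abbreviated.

static void mydrv_run_xdp(struct mydrv_rx_ring *ring,
			  struct bpf_prog *xdp_prog, struct xdp_buff *xdp)
{
	u32 act = bpf_prog_run_xdp(xdp_prog, xdp);

	switch (act) {
	case XDP_PASS:
		mydrv_pass_to_stack(ring, xdp_build_skb_from_buff(xdp));
		break;
	case XDP_TX:
		mydrv_xmit_back(ring, xdp_convert_buff_to_frame(xdp));
		break;
	case XDP_REDIRECT:
		if (xdp_do_redirect(ring->netdev, xdp, xdp_prog))
			xdp_return_buff(xdp);
		break;
	default:	/* XDP_ABORTED, XDP_DROP, unknown verdicts */
		xdp_return_buff(xdp);
		break;
	}
}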
csum_start + * indicates position where checksumming should start. + * csum_offset indicates position where checksum should be stored. + * + * void (*tmo_request_launch_time)(u64 launch_time, void *priv) + * Called when AF_XDP frame requested launch time HW offload support. + * launch_time indicates the PTP time at which the device can schedule the + * packet for transmission. + */ +struct xsk_tx_metadata_ops { + void (*tmo_request_timestamp)(void *priv); + u64 (*tmo_fill_timestamp)(void *priv); + void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv); + void (*tmo_request_launch_time)(u64 launch_time, void *priv); +}; + #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); -void __xsk_map_flush(void); +void __xsk_map_flush(struct list_head *flush_list); + +/** + * xsk_tx_metadata_to_compl - Save enough relevant metadata information + * to perform tx completion in the future. + * @meta: pointer to AF_XDP metadata area + * @compl: pointer to output struct xsk_tx_metadata_compl + * + * This function should be called by the networking device when + * it prepares an AF_XDP egress packet. The value of @compl should be stored + * and passed to xsk_tx_metadata_complete upon TX completion. + */ +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ + if (!meta) + return; - -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + compl->tx_timestamp = &meta->completion.tx_timestamp; + else + compl->tx_timestamp = NULL; +} + +/** + * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission + * and call appropriate xsk_tx_metadata_ops operation. + * @meta: pointer to AF_XDP metadata area + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private area + * + * This function should be called by the networking device when + * it prepares an AF_XDP egress packet. + */ +static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) { - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs; + if (!meta) + return; + + if (ops->tmo_request_launch_time) + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + ops->tmo_request_launch_time(meta->request.launch_time, + priv); + + if (ops->tmo_request_timestamp) + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + ops->tmo_request_timestamp(priv); + + if (ops->tmo_request_checksum) + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) + ops->tmo_request_checksum(meta->request.csum_start, + meta->request.csum_offset, priv); +} - if (key >= map->max_entries) - return NULL; +/** + * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion + * and call appropriate xsk_tx_metadata_ops operation. + * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private area + * + * This function should be called by the networking device upon + * AF_XDP egress completion. 
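A sketch of the submission/completion pairing these kdocs describe. mydrv_tmo_ops is the driver's xsk_tx_metadata_ops instance and the slot is illustrative per-descriptor state; xsk_buff_get_metadata() is declared below in xdp_sock_drv.h.

extern const struct xsk_tx_metadata_ops mydrv_tmo_ops;

struct mydrv_tx_slot {
	struct xsk_tx_metadata_compl compl;
	void *priv;
};

static void mydrv_xsk_xmit_one(struct xsk_buff_pool *pool,
			       const struct xdp_desc *desc,
			       struct mydrv_tx_slot *slot)
{
	struct xsk_tx_metadata *meta;

	meta = xsk_buff_get_metadata(pool, desc->addr);
	xsk_tx_metadata_request(meta, &mydrv_tmo_ops, slot->priv);

	/* keep only what the completion path needs */
	xsk_tx_metadata_to_compl(meta, &slot->compl);
}

static void mydrv_xsk_complete_one(struct mydrv_tx_slot *slot)
{
	xsk_tx_metadata_complete(&slot->compl, &mydrv_tmo_ops, slot->priv);
}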
+ */ +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!compl) + return; + if (!compl->tx_timestamp) + return; - xs = READ_ONCE(m->xsk_map[key]); - return xs; + *compl->tx_timestamp = ops->tmo_fill_timestamp(priv); } #else @@ -108,16 +213,26 @@ static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) return -EOPNOTSUPP; } -static inline void __xsk_map_flush(void) +static inline void __xsk_map_flush(struct list_head *flush_list) { } -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) { - return NULL; } -#endif /* CONFIG_XDP_SOCKETS */ +static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + +#endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 5b1ee8a9976d..513c8e9704f6 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -9,10 +9,20 @@ #include <net/xdp_sock.h> #include <net/xsk_buff_pool.h> +#define XDP_UMEM_MIN_CHUNK_SHIFT 11 +#define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) + +struct xsk_cb_desc { + void *src; + u8 off; + u8 bytes; +}; + #ifdef CONFIG_XDP_SOCKETS void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); @@ -43,6 +53,12 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, xp_set_rxq_info(pool, rxq); } +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) +{ + xp_fill_cb(pool, desc); +} + static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { @@ -76,6 +92,17 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); } +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) +{ + return !xp_mb_desc(desc); +} + +/* Returns as many entries as possible up to max. 0 <= N <= max. 
*/ +static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) +{ + return xp_alloc_batch(pool, xdp, max); +} + static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return xp_can_alloc(pool, count); @@ -84,10 +111,80 @@ static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) static inline void xsk_buff_free(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + struct list_head *xskb_list = &xskb->pool->xskb_list; + struct xdp_buff_xsk *pos, *tmp; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { + list_del(&pos->list_node); + xp_free(pos); + } + xdp_get_shared_info_from_buff(xdp)->nr_frags = 0; +out: xp_free(xskb); } +static inline bool xsk_buff_add_frag(struct xdp_buff *head, + struct xdp_buff *xdp) +{ + const void *data = xdp->data; + struct xdp_buff_xsk *frag; + + if (!__xdp_buff_add_frag(head, virt_to_netmem(data), + offset_in_page(data), xdp->data_end - data, + xdp->frame_sz, false)) + return false; + + frag = container_of(xdp, struct xdp_buff_xsk, xdp); + list_add_tail(&frag->list_node, &frag->pool->xskb_list); + + return true; +} + +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff *ret = NULL; + struct xdp_buff_xsk *frag; + + frag = list_first_entry_or_null(&xskb->pool->xskb_list, + struct xdp_buff_xsk, list_node); + if (frag) { + list_del(&frag->list_node); + ret = &frag->xdp; + } + + return ret; +} + +static inline void xsk_buff_del_tail(struct xdp_buff *tail) +{ + struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + + list_del(&xskb->list_node); +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + list_node); + return &frag->xdp; +} + +static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) +{ + xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; + xdp->data_meta = xdp->data; + xdp->data_end = xdp->data + size; + xdp->flags = 0; +} + static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { @@ -99,12 +196,59 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) +/** + * xsk_buff_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for + * details. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). 
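Putting the pool helpers together: a sketch of completing a two-descriptor zero-copy frame. Both buffers were handed to the NIC at fill time; head_len and frag_len come from the completion descriptors and are illustrative.

static struct xdp_buff *mydrv_zc_complete_mbuf(struct xdp_buff *first,
					       u32 head_len,
					       struct xdp_buff *frag,
					       u32 frag_len)
{
	xsk_buff_set_size(first, head_len);
	xsk_buff_dma_sync_for_cpu(first);

	xsk_buff_set_size(frag, frag_len);
	xsk_buff_dma_sync_for_cpu(frag);

	if (!xsk_buff_add_frag(first, frag)) {
		xsk_buff_free(frag);
		xsk_buff_free(first);	/* also frees queued frags */
		return NULL;
	}

	return first;
}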
+ */ +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) { - struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + return xp_raw_get_ctx(pool, addr); +} - if (!pool->dma_need_sync) - return; +#define XDP_TXMD_FLAGS_VALID ( \ + XDP_TXMD_FLAGS_TIMESTAMP | \ + XDP_TXMD_FLAGS_CHECKSUM | \ + XDP_TXMD_FLAGS_LAUNCH_TIME | \ + 0) + +static inline bool +xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) +{ + return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); +} + +static inline struct xsk_tx_metadata * +__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) +{ + struct xsk_tx_metadata *meta; + + if (!pool->tx_metadata_len) + return NULL; + + meta = data - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + return NULL; /* no way to signal the error to the user */ + + return meta; +} + +static inline struct xsk_tx_metadata * +xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr)); +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); xp_dma_sync_for_cpu(xskb); } @@ -128,6 +272,11 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, return false; } +static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max) +{ + return 0; +} + static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } @@ -179,6 +328,11 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, { } +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) +{ +} + static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { @@ -205,6 +359,16 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; } +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) +{ + return false; +} + +static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) +{ + return 0; +} + static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return false; @@ -214,6 +378,30 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } +static inline bool xsk_buff_add_frag(struct xdp_buff *head, + struct xdp_buff *xdp) +{ + return false; +} + +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) +{ + return NULL; +} + +static inline void xsk_buff_del_tail(struct xdp_buff *tail) +{ +} + +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) +{ + return NULL; +} + +static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) +{ +} + static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { @@ -225,7 +413,30 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + return (struct xdp_desc_ctx){ }; +} + +static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +{ + return false; +} + +static inline struct xsk_tx_metadata * +__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) +{ + return NULL; +} + +static inline struct xsk_tx_metadata * +xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return 
NULL; +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b2a06f10b62c..a21e276dbe44 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -19,6 +19,7 @@ #include <net/sock.h> #include <net/dst.h> +#include <net/inet_dscp.h> #include <net/ip.h> #include <net/route.h> #include <net/ipv6.h> @@ -37,6 +38,7 @@ #define XFRM_PROTO_COMP 108 #define XFRM_PROTO_IPIP 4 #define XFRM_PROTO_IPV6 41 +#define XFRM_PROTO_IPTFS IPPROTO_AGGFRAG #define XFRM_PROTO_ROUTING IPPROTO_ROUTING #define XFRM_PROTO_DSTOPTS IPPROTO_DSTOPTS @@ -51,8 +53,10 @@ #ifdef CONFIG_XFRM_STATISTICS #define XFRM_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.xfrm_statistics, field) +#define XFRM_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.xfrm_statistics, field, val) #else #define XFRM_INC_STATS(net, field) ((void)(net)) +#define XFRM_ADD_STATS(net, field, val) ((void)(net)) #endif @@ -65,27 +69,27 @@ - instance of a transformer, struct xfrm_state (=SA) - template to clone xfrm_state, struct xfrm_tmpl - SPD is plain linear list of xfrm_policy rules, ordered by priority. + SPD is organized as hash table (for policies that meet minimum address prefix + length setting, net->xfrm.policy_hthresh). Other policies are stored in + lists, sorted into rbtree ordered by destination and source address networks. + See net/xfrm/xfrm_policy.c for details. + (To be compatible with existing pfkeyv2 implementations, many rules with priority of 0x7fffffff are allowed to exist and such rules are ordered in an unpredictable way, thanks to bsd folks.) - Lookup is plain linear search until the first match with selector. - If "action" is "block", then we prohibit the flow, otherwise: if "xfrms_nr" is zero, the flow passes untransformed. Otherwise, policy entry has list of up to XFRM_MAX_DEPTH transformations, described by templates xfrm_tmpl. Each template is resolved to a complete xfrm_state (see below) and we pack bundle of transformations - to a dst_entry returned to requestor. + to a dst_entry returned to requester. dst -. xfrm .-> xfrm_state #1 |---. child .-> dst -. xfrm .-> xfrm_state #2 |---. child .-> dst -. xfrm .-> xfrm_state #3 |---. child .-> NULL - Bundles are cached at xrfm_policy struct (field ->bundles). - Resolution of xrfm_tmpl ----------------------- @@ -126,12 +130,41 @@ struct xfrm_state_walk { struct xfrm_address_filter *filter; }; -struct xfrm_state_offload { +enum { + XFRM_DEV_OFFLOAD_IN = 1, + XFRM_DEV_OFFLOAD_OUT, + XFRM_DEV_OFFLOAD_FWD, +}; + +enum { + XFRM_DEV_OFFLOAD_UNSPECIFIED, + XFRM_DEV_OFFLOAD_CRYPTO, + XFRM_DEV_OFFLOAD_PACKET, +}; + +enum { + XFRM_DEV_OFFLOAD_FLAG_ACQ = 1, +}; + +struct xfrm_dev_offload { + /* The device for this offload. + * Device drivers should not use this directly, as that will prevent + * them from working with bonding device. Instead, the device passed + * to the add/delete callbacks should be used. + */ struct net_device *dev; + netdevice_tracker dev_tracker; + /* This is a private pointer used by the bonding driver (and eventually + * should be moved there). Device drivers should not use it. + * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases, + * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock + * is held. 
+ */ struct net_device *real_dev; unsigned long offload_handle; - unsigned int num_exthdrs; - u8 flags; + u8 dir : 2; + u8 type : 2; + u8 flags : 2; }; struct xfrm_mode { @@ -145,6 +178,12 @@ enum { XFRM_MODE_FLAG_TUNNEL = 1, }; +enum xfrm_replay_mode { + XFRM_REPLAY_MODE_LEGACY, + XFRM_REPLAY_MODE_BMP, + XFRM_REPLAY_MODE_ESN, +}; + /* Full description of state of transformer. */ struct xfrm_state { possible_net_t xs_net; @@ -152,12 +191,19 @@ struct xfrm_state { struct hlist_node gclist; struct hlist_node bydst; }; - struct hlist_node bysrc; + union { + struct hlist_node dev_gclist; + struct hlist_node bysrc; + }; struct hlist_node byspi; + struct hlist_node byseq; + struct hlist_node state_cache; + struct hlist_node state_cache_input; refcount_t refcnt; spinlock_t lock; + u32 pcpu_num; struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; @@ -179,6 +225,7 @@ struct xfrm_state { u16 family; xfrm_address_t saddr; int header_len; + int enc_hdr_len; int trailer_len; u32 extra_flags; struct xfrm_mark smark; @@ -193,9 +240,17 @@ struct xfrm_state { struct xfrm_algo_aead *aead; const char *geniv; + /* mapping change rate limiting */ + __be16 new_mapping_sport; + u32 new_mapping; /* seconds */ + u32 mapping_maxage; /* seconds for input SA */ + /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; - struct sock __rcu *encap_sk; + + /* NAT keepalive */ + u32 nat_keepalive_interval; /* seconds */ + time64_t nat_keepalive_expiration; /* Data for care-of address */ xfrm_address_t *coaddr; @@ -214,9 +269,8 @@ struct xfrm_state { struct xfrm_replay_state preplay; struct xfrm_replay_state_esn *preplay_esn; - /* The functions for replay detection. */ - const struct xfrm_replay *repl; - + /* replay detection mode */ + enum xfrm_replay_mode repl_mode; /* internal flag that only holds state for delayed aevent at the * moment */ @@ -235,7 +289,7 @@ struct xfrm_state { struct xfrm_lifetime_cur curlft; struct hrtimer mtimer; - struct xfrm_state_offload xso; + struct xfrm_dev_offload xso; /* used to fix curlft->add_time when changing date */ long saved_tmo; @@ -260,6 +314,10 @@ struct xfrm_state { /* Private data of this transformer, format is opaque, * interpreted by xfrm_type methods. 
*/ void *data; + u8 dir; + + const struct xfrm_mode_cbs *mode_cbs; + void *mode_data; }; static inline struct net *xs_net(struct xfrm_state *x) @@ -296,40 +354,39 @@ struct km_event { struct net *net; }; -struct xfrm_replay { - void (*advance)(struct xfrm_state *x, __be32 net_seq); - int (*check)(struct xfrm_state *x, - struct sk_buff *skb, - __be32 net_seq); - int (*recheck)(struct xfrm_state *x, - struct sk_buff *skb, - __be32 net_seq); - void (*notify)(struct xfrm_state *x, int event); - int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); +struct xfrm_if_decode_session_result { + struct net *net; + u32 if_id; }; struct xfrm_if_cb { - struct xfrm_if *(*decode_session)(struct sk_buff *skb, - unsigned short family); + bool (*decode_session)(struct sk_buff *skb, + unsigned short family, + struct xfrm_if_decode_session_result *res); }; void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); void xfrm_if_unregister_cb(void); +struct xfrm_dst_lookup_params { + struct net *net; + dscp_t dscp; + int oif; + xfrm_address_t *saddr; + xfrm_address_t *daddr; + u32 mark; + __u8 ipproto; + union flowi_uli uli; +}; + struct net_device; struct xfrm_type; struct xfrm_dst; struct xfrm_policy_afinfo { struct dst_ops *dst_ops; - struct dst_entry *(*dst_lookup)(struct net *net, - int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - u32 mark); - int (*get_saddr)(struct net *net, int oif, - xfrm_address_t *saddr, - xfrm_address_t *daddr, - u32 mark); + struct dst_entry *(*dst_lookup)(const struct xfrm_dst_lookup_params *params); + int (*get_saddr)(xfrm_address_t *saddr, + const struct xfrm_dst_lookup_params *params); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); @@ -387,7 +444,6 @@ void xfrm_flush_gc(void); void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { - char *description; struct module *owner; u8 proto; u8 flags; @@ -396,20 +452,19 @@ struct xfrm_type { #define XFRM_TYPE_LOCAL_COADDR 4 #define XFRM_TYPE_REMOTE_COADDR 8 - int (*init_state)(struct xfrm_state *x); + int (*init_state)(struct xfrm_state *x, + struct netlink_ext_ack *extack); void (*destructor)(struct xfrm_state *); int (*input)(struct xfrm_state *, struct sk_buff *skb); int (*output)(struct xfrm_state *, struct sk_buff *pskb); int (*reject)(struct xfrm_state *, struct sk_buff *, const struct flowi *); - int (*hdr_offset)(struct xfrm_state *, struct sk_buff *, u8 **); }; int xfrm_register_type(const struct xfrm_type *type, unsigned short family); void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family); struct xfrm_type_offload { - char *description; struct module *owner; u8 proto; void (*encap)(struct xfrm_state *, struct sk_buff *pskb); @@ -419,6 +474,54 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); +void xfrm_set_type_offload(struct xfrm_state *x); +static inline void xfrm_unset_type_offload(struct xfrm_state *x) +{ + if (!x->type_offload) + return; + + module_put(x->type_offload->owner); + x->type_offload = NULL; +} + +/** + * struct xfrm_mode_cbs - XFRM mode callbacks + * @owner: module owner or NULL + * @init_state: Add/init mode specific state in `xfrm_state *x` + * @clone_state: Copy mode specific values from `orig` to new state `x` + * @destroy_state: Cleanup mode specific state from `xfrm_state *x` + * @user_init: Process mode specific netlink 
attributes from user + * @copy_to_user: Add netlink attributes to `attrs` based on state in `x` + * @sa_len: Return space required to store mode specific netlink attributes + * @get_inner_mtu: Return available payload space after removing encap overhead + * @input: Process received packet from SA using mode + * @output: Output given packet using mode + * @prepare_output: Add mode specific encapsulation to packet in skb. On return + * `transport_header` should point at ESP header, `network_header` should + * point at outer IP header and `mac_header` should point at the + * protocol/nexthdr field of the outer IP. + * + * One should examine and understand the specific uses of these callbacks in + * xfrm for further detail on how and when these functions are called. RTSL. + */ +struct xfrm_mode_cbs { + struct module *owner; + int (*init_state)(struct xfrm_state *x); + int (*clone_state)(struct xfrm_state *x, struct xfrm_state *orig); + void (*destroy_state)(struct xfrm_state *x); + int (*user_init)(struct net *net, struct xfrm_state *x, + struct nlattr **attrs, + struct netlink_ext_ack *extack); + int (*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb); + unsigned int (*sa_len)(const struct xfrm_state *x); + u32 (*get_inner_mtu)(struct xfrm_state *x, int outer_mtu); + int (*input)(struct xfrm_state *x, struct sk_buff *skb); + int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); + int (*prepare_output)(struct xfrm_state *x, struct sk_buff *skb); +}; + +int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs); +void xfrm_unregister_mode_cbs(u8 mode); static inline int xfrm_af2proto(unsigned int family) { @@ -495,11 +598,44 @@ struct xfrm_policy_queue { unsigned long timeout; }; +/** + * struct xfrm_policy - xfrm policy + * @xp_net: network namespace the policy lives in + * @bydst: hlist node for SPD hash table or rbtree list + * @byidx: hlist node for index hash table + * @state_cache_list: hlist head for policy cached xfrm states + * @lock: serialize changes to policy structure members + * @refcnt: reference count, freed once it reaches 0 + * @pos: kernel internal tie-breaker to determine age of policy + * @timer: policy expiration timer + * @genid: generation, used to invalidate old policies + * @priority: priority, set by userspace + * @index: policy index (autogenerated) + * @if_id: virtual xfrm interface id + * @mark: packet mark + * @selector: selector + * @lft: lifetime configuration data + * @curlft: lifetime state + * @walk: list head on pernet policy list + * @polq: queue to hold packets while acquire operation is in progress + * @bydst_reinsert: policy tree node needs to be merged + * @type: XFRM_POLICY_TYPE_MAIN or _SUB + * @action: XFRM_POLICY_ALLOW or _BLOCK + * @flags: XFRM_POLICY_LOCALOK, XFRM_POLICY_ICMP + * @xfrm_nr: number of used templates in @xfrm_vec + * @family: protocol family + * @security: SELinux security label + * @xfrm_vec: array of templates to resolve state + * @rcu: rcu head, used to defer memory release + * @xdo: hardware offload state + */ struct xfrm_policy { possible_net_t xp_net; struct hlist_node bydst; struct hlist_node byidx; + struct hlist_head state_cache_list; + /* This lock only affects elements except for entry. 
*/ rwlock_t lock; refcount_t refcnt; @@ -524,8 +660,9 @@ struct xfrm_policy { u16 family; struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; - struct hlist_node bydst_inexact_list; struct rcu_head rcu; + + struct xfrm_dev_offload xdo; }; static inline struct net *xp_net(const struct xfrm_policy *xp) @@ -582,8 +719,8 @@ struct xfrm_mgr { bool (*is_alive)(const struct km_event *c); }; -int xfrm_register_km(struct xfrm_mgr *km); -int xfrm_unregister_km(struct xfrm_mgr *km); +void xfrm_register_km(struct xfrm_mgr *km); +void xfrm_unregister_km(struct xfrm_mgr *km); struct xfrm_tunnel_skb_cb { union { @@ -983,7 +1120,8 @@ void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); struct xfrm_if_parms { int link; /* ifindex of underlying L2 interface */ - u32 if_id; /* interface identifyer */ + u32 if_id; /* interface identifier */ + bool collect_md; }; struct xfrm_if { @@ -1009,7 +1147,7 @@ struct xfrm_offload { #define CRYPTO_FALLBACK 8 #define XFRM_GSO_SEGMENT 16 #define XFRM_GRO 32 -#define XFRM_ESP_NO_TRAILER 64 +/* 64 is free */ #define XFRM_DEV_RESUME 128 #define XFRM_XMIT 256 @@ -1023,12 +1161,17 @@ struct xfrm_offload { #define CRYPTO_INVALID_PACKET_SYNTAX 64 #define CRYPTO_INVALID_PROTOCOL 128 + /* Used to keep whole l2 header for transport mode GRO */ + __u32 orig_mac_len; + __u8 proto; + __u8 inner_ipproto; }; struct sec_path { int len; int olen; + int verified_cnt; struct xfrm_state *xvec[XFRM_MAX_DEPTH]; struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; @@ -1083,22 +1226,85 @@ xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, un } #ifdef CONFIG_XFRM +static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) +{ + struct sec_path *sp = skb_sec_path(skb); + + return sp->xvec[sp->len - 1]; +} +#endif + +static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) +{ +#ifdef CONFIG_XFRM + struct sec_path *sp = skb_sec_path(skb); + + if (!sp || !sp->olen || sp->len != sp->olen) + return NULL; + + return &sp->ovec[sp->olen - 1]; +#else + return NULL; +#endif +} + +#ifdef CONFIG_XFRM int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family); +static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb, + int dir) +{ + if (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) + return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT; + + return false; +} + +static inline bool __xfrm_check_dev_nopolicy(struct sk_buff *skb, + int dir, unsigned short family) +{ + if (dir != XFRM_POLICY_OUT && family == AF_INET) { + /* same dst may be used for traffic originating from + * devices with different policy settings. + */ + return IPCB(skb)->flags & IPSKB_NOPOLICY; + } + return skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY); +} + static inline int __xfrm_policy_check2(struct sock *sk, int dir, struct sk_buff *skb, unsigned int family, int reverse) { struct net *net = dev_net(skb->dev); int ndir = dir | (reverse ? 
XFRM_POLICY_MASK + 1 : 0); + struct xfrm_offload *xo = xfrm_offload(skb); + struct xfrm_state *x; if (sk && sk->sk_policy[XFRM_POLICY_IN]) return __xfrm_policy_check(sk, ndir, skb, family); - return (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) || - (skb_dst(skb)->flags & DST_NOPOLICY) || - __xfrm_policy_check(sk, ndir, skb, family); + if (xo) { + x = xfrm_input_state(skb); + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) { + bool check = (xo->flags & CRYPTO_DONE) && + (xo->status & CRYPTO_SUCCESS); + + /* The packets here are plain ones: the secpath was only + * needed to indicate that hardware already handled + * them, and there is no need to do anything in addition. + * + * Consume the secpath which was set by the drivers. + */ + secpath_reset(skb); + return check; + } + } + + return __xfrm_check_nopolicy(net, skb, dir) || + __xfrm_check_dev_nopolicy(skb, dir, family) || + __xfrm_policy_check(sk, ndir, skb, family); } static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) @@ -1128,20 +1334,20 @@ static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir, return __xfrm_policy_check2(sk, dir, skb, AF_INET6, 1); } -int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, +int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse); -static inline int xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, +static inline int xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { - return __xfrm_decode_session(skb, fl, family, 0); + return __xfrm_decode_session(net, skb, fl, family, 0); } -static inline int xfrm_decode_session_reverse(struct sk_buff *skb, +static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { - return __xfrm_decode_session(skb, fl, family, 1); + return __xfrm_decode_session(net, skb, fl, family, 1); } int __xfrm_route_forward(struct sk_buff *skb, unsigned short family); @@ -1150,9 +1356,12 @@ static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); - return !net->xfrm.policy_count[XFRM_POLICY_OUT] || - (skb_dst(skb)->flags & DST_NOXFRM) || - __xfrm_route_forward(skb, family); + if (!net->xfrm.policy_count[XFRM_POLICY_OUT] && + net->xfrm.policy_default[XFRM_POLICY_OUT] == XFRM_USERPOLICY_ACCEPT) + return true; + + return (skb_dst(skb)->flags & DST_NOXFRM) || + __xfrm_route_forward(skb, family); } static inline int xfrm4_route_forward(struct sk_buff *skb) @@ -1169,6 +1378,8 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk); static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { + if (!sk_fullsock(osk)) + return 0; sk->sk_policy[0] = NULL; sk->sk_policy[1] = NULL; if (unlikely(osk->sk_policy[0] || osk->sk_policy[1])) @@ -1212,7 +1423,7 @@ static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *sk { return 1; } -static inline int xfrm_decode_session_reverse(struct sk_buff *skb, +static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family) { @@ -1493,12 +1704,31 @@ struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family); int xfrm_state_check_expire(struct xfrm_state *x); +void xfrm_state_update_stats(struct net *net); +#ifdef CONFIG_XFRM_OFFLOAD 
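The helper below pairs with xfrm_state_update_stats() above: while walking the SAD, the core lets a driver that offloads SAs refresh the state's lifetime counters from hardware via the new xdo_dev_state_update_stats op. A minimal sketch of the driver side of that op, assuming a hypothetical my_hw_read_sa_counters() helper (not part of this diff):

	static void my_xdo_dev_state_update_stats(struct xfrm_state *x)
	{
		u64 bytes, packets;

		/* hypothetical: read this SA's counters out of the NIC */
		my_hw_read_sa_counters(x, &bytes, &packets);

		/* mirror them into the software lifetime accounting */
		x->curlft.bytes = bytes;
		x->curlft.packets = packets;
	}
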
+static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) +{ + struct xfrm_dev_offload *xdo = &x->xso; + struct net_device *dev = READ_ONCE(xdo->dev); + + if (dev && dev->xfrmdev_ops && + dev->xfrmdev_ops->xdo_dev_state_update_stats) + dev->xfrmdev_ops->xdo_dev_state_update_stats(x); + +} +#else +static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) {} +#endif void xfrm_state_insert(struct xfrm_state *x); int xfrm_state_add(struct xfrm_state *x); int xfrm_state_update(struct xfrm_state *x); struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family); +struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family); struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, @@ -1538,16 +1768,18 @@ struct xfrmk_spdinfo { u32 spdhmcnt; }; -struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); +int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, + bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); -int xfrm_init_replay(struct xfrm_state *x); +int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); +int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); @@ -1557,20 +1789,27 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)); -int xfrm_output_resume(struct sk_buff *skb, int err); +int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); +int xfrm4_tunnel_check_size(struct sk_buff *skb); +#if IS_ENABLED(CONFIG_IPV6) +int xfrm6_tunnel_check_size(struct sk_buff *skb); +#else +static inline int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + return -EMSGSIZE; +} +#endif #if IS_ENABLED(CONFIG_NET_PKTGEN) int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); #endif void xfrm_local_error(struct sk_buff *skb, int mtu); -int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm4_transport_finish(struct sk_buff *skb, int async); int xfrm4_rcv(struct sk_buff *skb); -int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq); static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) { @@ -1581,13 +1820,11 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) } int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb); -int 
xfrm4_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); -int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t); int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, @@ -1605,14 +1842,15 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb); -int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb); -int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, - u8 **prevhdr); #ifdef CONFIG_XFRM void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu); int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); +struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, + struct sk_buff *skb); +struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, + struct sk_buff *skb); int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen); #else @@ -1623,10 +1861,7 @@ static inline int xfrm_user_policy(struct sock *sk, int optname, } #endif -struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - int family, u32 mark); +struct dst_entry *__xfrm_dst_lookup(int family, const struct xfrm_dst_lookup_params *params); struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); @@ -1649,10 +1884,11 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); -int verify_spi_info(u8 proto, u32 min, u32 max); -int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); +int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); +int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, + struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u32 if_id, u8 proto, + u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); @@ -1663,14 +1899,20 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net); +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, + struct net *net, + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, 
struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, u32 if_id, + struct netlink_ext_ack *extack, + struct xfrm_user_offload *xuo); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); @@ -1721,6 +1963,12 @@ static inline int xfrm_policy_id2dir(u32 index) } #ifdef CONFIG_XFRM +void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq); +int xfrm_replay_check(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); +void xfrm_replay_notify(struct xfrm_state *x, int event); +int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb); +int xfrm_replay_recheck(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); + static inline int xfrm_aevent_is_on(struct net *net) { struct sock *nlsk; @@ -1819,29 +2067,6 @@ static inline void xfrm_states_delete(struct xfrm_state **states, int n) } #endif -#ifdef CONFIG_XFRM -static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) -{ - struct sec_path *sp = skb_sec_path(skb); - - return sp->xvec[sp->len - 1]; -} -#endif - -static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) -{ -#ifdef CONFIG_XFRM - struct sec_path *sp = skb_sec_path(skb); - - if (!sp || !sp->olen || sp->len != sp->olen) - return NULL; - - return &sp->ovec[sp->olen - 1]; -#else - return NULL; -#endif -} - void __init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD @@ -1849,15 +2074,22 @@ void xfrm_dev_resume(struct sk_buff *skb); void xfrm_dev_backlog(struct softnet_data *sd); struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, - struct xfrm_user_offload *xuo); + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack); +int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); +void xfrm_dev_state_delete(struct xfrm_state *x); +void xfrm_dev_state_free(struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) { - struct xfrm_state_offload *xso = &x->xso; + struct xfrm_dev_offload *xso = &x->xso; + struct net_device *dev = READ_ONCE(xso->dev); - if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn) - xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); + if (dev && dev->xfrmdev_ops->xdo_dev_state_advance_esn) + dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); } static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) @@ -1878,24 +2110,25 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) return false; } -static inline void xfrm_dev_state_delete(struct xfrm_state *x) +static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) { - struct xfrm_state_offload *xso = &x->xso; + struct xfrm_dev_offload *xdo = &x->xdo; + struct net_device *dev = xdo->dev; - if (xso->dev) - xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); + if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_policy_delete) + dev->xfrmdev_ops->xdo_dev_policy_delete(x); } -static inline void xfrm_dev_state_free(struct xfrm_state *x) +static inline void xfrm_dev_policy_free(struct xfrm_policy *x) { - struct xfrm_state_offload *xso = &x->xso; - struct net_device *dev = xso->dev; + struct xfrm_dev_offload *xdo = &x->xdo; + struct net_device *dev = xdo->dev; if (dev && dev->xfrmdev_ops) { - if 
(dev->xfrmdev_ops->xdo_dev_state_free) - dev->xfrmdev_ops->xdo_dev_state_free(x); - xso->dev = NULL; - dev_put(dev); + if (dev->xfrmdev_ops->xdo_dev_policy_free) + dev->xfrmdev_ops->xdo_dev_policy_free(x); + xdo->dev = NULL; + netdev_put(dev, &xdo->dev_tracker); } } #else @@ -1912,7 +2145,7 @@ static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_fea return skb; } -static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo) +static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack) { return 0; } @@ -1925,6 +2158,21 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) { } +static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) +{ +} + +static inline void xfrm_dev_policy_free(struct xfrm_policy *x) +{ +} + static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { return false; @@ -2039,9 +2287,41 @@ static inline bool xfrm6_local_dontfrag(const struct sock *sk) proto = sk->sk_protocol; if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) - return inet6_sk(sk)->dontfrag; + return inet6_test_bit(DONTFRAG, sk); return false; } #endif + +#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +extern struct metadata_dst __percpu *xfrm_bpf_md_dst; + +int register_xfrm_interface_bpf(void); + +#else + +static inline int register_xfrm_interface_bpf(void) +{ + return 0; +} + +#endif + +#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF) +int register_xfrm_state_bpf(void); +#else +static inline int register_xfrm_state_bpf(void) +{ + return 0; +} +#endif + +int xfrm_nat_keepalive_init(unsigned short family); +void xfrm_nat_keepalive_fini(unsigned short family); +int xfrm_nat_keepalive_net_init(struct net *net); +int xfrm_nat_keepalive_net_fini(struct net *net); +void xfrm_nat_keepalive_state_updated(struct xfrm_state *x); + #endif /* _NET_XFRM_H */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 0140d086dc84..cac56e6b0869 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -7,10 +7,12 @@ #include <linux/if_xdp.h> #include <linux/types.h> #include <linux/dma-mapping.h> +#include <linux/bpf.h> #include <net/xdp.h> struct xsk_buff_pool; struct xdp_rxq_info; +struct xsk_cb_desc; struct xsk_queue; struct xdp_desc; struct xdp_umem; @@ -18,15 +20,19 @@ struct xdp_sock; struct device; struct page; +#define XSK_PRIV_MAX 24 + struct xdp_buff_xsk { struct xdp_buff xdp; + u8 cb[XSK_PRIV_MAX]; dma_addr_t dma; dma_addr_t frame_dma; struct xsk_buff_pool *pool; - bool unaligned; - u64 orig_addr; - struct list_head free_list_node; -}; + struct list_head list_node; +} __aligned_largest; + +#define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) +#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) struct xsk_dma_map { dma_addr_t *dma_pages; @@ -35,7 +41,6 @@ struct xsk_dma_map { refcount_t users; struct list_head list; /* Protected by the RTNL_LOCK */ u32 dma_pages_cnt; - bool dma_need_sync; }; struct xsk_buff_pool { @@ -48,7 +53,10 @@ struct xsk_buff_pool { refcount_t users; struct xdp_umem *umem; struct work_struct work; + /* 
Protects generic receive in shared and non-shared umem mode. */ + spinlock_t rx_lock; struct list_head free_list; + struct list_head xskb_list; u32 heads_cnt; u16 queue_id; @@ -60,6 +68,7 @@ */ dma_addr_t *dma_pages; struct xdp_buff_xsk *heads; + struct xdp_desc *tx_descs; u64 chunk_mask; u64 addrs_cnt; u32 free_list_cnt; @@ -67,26 +76,41 @@ u32 free_heads_cnt; u32 headroom; u32 chunk_size; + u32 chunk_shift; u32 frame_len; + u32 xdp_zc_max_segs; + u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; - bool dma_need_sync; bool unaligned; + bool tx_sw_csum; void *addrs; + /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: + * the NAPI TX thread racing with the sendmsg error paths in the SKB destructor + * callback, and sockets sharing a single cq because they use the same netdev + * and queue id. + */ + spinlock_t cq_lock; struct xdp_buff_xsk *free_heads[]; }; +/* Masks for xdp_umem_page flags. + * The low 12 bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + /* AF_XDP core. */ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem); int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, u16 queue_id, u16 flags); -int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, +int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct net_device *dev, u16 queue_id); +int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_destroy(struct xsk_buff_pool *pool); -void xp_release(struct xdp_buff_xsk *xskb); void xp_get_pool(struct xsk_buff_pool *pool); -void xp_put_pool(struct xsk_buff_pool *pool); +bool xp_put_pool(struct xsk_buff_pool *pool); void xp_clear_dev(struct xsk_buff_pool *pool); void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs);
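The cb[] scratch area added to struct xdp_buff_xsk earlier in this diff gives zero-copy drivers XSK_PRIV_MAX (24) bytes of per-buffer private storage directly behind the embedded xdp_buff, with XSK_CHECK_PRIV_TYPE() enforcing the size limit at build time. A minimal sketch of how a driver might wrap it; struct my_xdp_buff and its rx_desc field are hypothetical:

	struct my_xdp_buff {
		struct xdp_buff xdp;	/* must stay the first member */
		const void *rx_desc;	/* driver-private state, lives in cb[] */
	};

	static struct my_xdp_buff *my_xdp_buff(struct xdp_buff *xdp)
	{
		/* build-time assert that the wrapper fits into cb[] */
		XSK_CHECK_PRIV_TYPE(struct my_xdp_buff);

		/* recover the wrapper from the xdp_buff handed out by xp_alloc() */
		return container_of(xdp, struct my_xdp_buff, xdp);
	}
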
@@ -94,15 +118,39 @@ void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); +static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, + u64 addr) +{ + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; +} + +static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, + dma_addr_t *dma_pages, u64 addr) +{ + xskb->frame_dma = (dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); + xskb->dma = xskb->frame_dma + pool->headroom + XDP_PACKET_HEADROOM; +} + /* AF_XDP ZC drivers, via xdp_sock_drv.h */ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +void xp_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc); int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages); void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); +u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); + +struct xdp_desc_ctx { + dma_addr_t dma; + struct xsk_tx_metadata *meta; +}; + +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr); + static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) { return xskb->dma; @@ -113,21 +161,17 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) { return xskb->frame_dma; } -void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { - xp_dma_sync_for_cpu_slow(xskb); + dma_sync_single_for_cpu(xskb->pool->dev, xskb->dma, + xskb->pool->frame_len, + DMA_BIDIRECTIONAL); } -void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, - size_t size); static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { - if (!pool->dma_need_sync) - return; - - xp_dma_sync_for_device_slow(pool, dma, size); + dma_sync_single_for_device(pool->dev, dma, size, DMA_BIDIRECTIONAL); }
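xp_raw_get_ctx() above bundles what previously took xp_raw_get_dma() plus hand-rolled pointer math: the frame's DMA address and, when the umem was created with a tx_metadata_len, the xsk_tx_metadata sitting in front of the packet. A sketch of a zero-copy TX path consuming it; my_fill_tx_desc() is a hypothetical driver function, and xp_tx_metadata_enabled() is defined at the end of this header:

	static void my_xmit_one(struct xsk_buff_pool *pool,
				const struct xdp_desc *desc)
	{
		struct xdp_desc_ctx ctx = xp_raw_get_ctx(pool, desc->addr);

		/* ctx.meta is only meaningful when TX metadata is enabled
		 * for this pool, i.e. tx_metadata_len > 0 */
		my_fill_tx_desc(ctx.dma, desc->len,
				xp_tx_metadata_enabled(pool) ? ctx.meta : NULL);
	}
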
/* Masks for xdp_umem_page flags. @@ -142,11 +186,16 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, { bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; - if (pool->dma_pages_cnt && cross_pg) { - return !(pool->dma_pages[addr >> PAGE_SHIFT] & - XSK_NEXT_PG_CONTIG_MASK); - } - return false; + if (likely(!cross_pg)) + return false; + + return pool->dma_pages && + !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); +} + +static inline bool xp_mb_desc(const struct xdp_desc *desc) +{ + return desc->options & XDP_PKT_CONTD; } static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) @@ -170,4 +219,35 @@ static inline u64 xp_unaligned_add_offset_to_addr(u64 addr) xp_unaligned_extract_offset(addr); } +static inline u32 xp_aligned_extract_idx(struct xsk_buff_pool *pool, u64 addr) +{ + return xp_aligned_extract_addr(pool, addr) >> pool->chunk_shift; +} + +static inline void xp_release(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->unaligned) + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} + +static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, + struct xsk_buff_pool *pool) +{ + u64 orig_addr = xskb->xdp.data - pool->addrs; + u64 offset; + + if (!pool->unaligned) + return orig_addr; + + offset = xskb->xdp.data - xskb->xdp.data_hard_start; + offset += pool->headroom; + orig_addr -= offset; + return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); +} + +static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) +{ + return pool->tx_metadata_len > 0; +} + #endif /* XSK_BUFF_POOL_H_ */
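For the unaligned case, the reworked xp_get_handle() above recomputes the chunk base from xdp.data instead of caching it in the xskb, and packs two values into one u64: the base address in the low 48 bits and the offset of the data from that base in the upper 16, via XSK_UNALIGNED_BUF_OFFSET_SHIFT (48) from the <linux/if_xdp.h> uapi. A short worked example, with the two constants assumed from that uapi header:

	u64 orig_addr = 0x3000;	/* chunk base within the umem */
	u64 offset = 0x110;	/* headroom etc. in front of the data */
	u64 handle = orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);

	/* consumers split the handle again, mirroring the extract helpers: */
	u64 base = handle & XSK_UNALIGNED_BUF_ADDR_MASK;	/* 0x3000 */
	u64 off = handle >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;	/* 0x110 */
	u64 data = xp_unaligned_add_offset_to_addr(handle);	/* 0x3110 */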