diff options
Diffstat (limited to 'include/net/tcp.h')
-rw-r--r-- | include/net/tcp.h | 427 |
1 files changed, 270 insertions, 157 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h index a5ea27df3c2b..14d45661a84d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -23,9 +23,9 @@ #include <linux/cache.h> #include <linux/percpu.h> #include <linux/skbuff.h> -#include <linux/cryptohash.h> #include <linux/kref.h> #include <linux/ktime.h> +#include <linux/indirect_call_wrapper.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> @@ -48,10 +48,12 @@ extern struct inet_hashinfo tcp_hashinfo; -extern struct percpu_counter tcp_orphan_count; +DECLARE_PER_CPU(unsigned int, tcp_orphan_count); +int tcp_orphan_count_sum(void); + void tcp_time_wait(struct sock *sk, int state, int timeo); -#define MAX_TCP_HEADER (128 + MAX_HEADER) +#define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) #define MAX_TCP_OPTION_SPACE 40 #define TCP_MIN_SND_MSS 48 #define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE) @@ -126,6 +128,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer. */ +#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ #if HZ >= 100 @@ -250,6 +253,8 @@ extern long sysctl_tcp_mem[3]; #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ extern atomic_long_t tcp_memory_allocated; +DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); + extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; @@ -287,21 +292,18 @@ static inline bool tcp_out_of_memory(struct sock *sk) return false; } -void sk_forced_mem_schedule(struct sock *sk, int size); - -static inline bool tcp_too_many_orphans(struct sock *sk, int shift) +static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { - struct percpu_counter *ocp = sk->sk_prot->orphan_count; - int orphans = percpu_counter_read_positive(ocp); - - if (orphans << shift > sysctl_tcp_max_orphans) { - orphans = percpu_counter_sum_positive(ocp); - if (orphans << shift > sysctl_tcp_max_orphans) - return true; - } - return false; + sk_wmem_queued_add(sk, -skb->truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, skb->truesize); + else + sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb))); + __kfree_skb(skb); } +void sk_forced_mem_schedule(struct sock *sk, int size); + bool tcp_check_oom(struct sock *sk, int shift); @@ -321,9 +323,12 @@ void tcp_shutdown(struct sock *sk, int how); int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); +void tcp_remove_empty_skb(struct sock *sk); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); +int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, + size_t size, struct ubuf_info *uarg); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, @@ -343,9 +348,12 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); +void tcp_twsk_purge(struct list_head *net_exit_list, int family); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + bool force_schedule); void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks); static inline void tcp_dec_quickack_mode(struct sock *sk, @@ -385,30 +393,37 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); -void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag); +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); +void __tcp_close(struct sock *sk, long timeout); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); -void tcp_init_transfer(struct sock *sk, int bpf_op); +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); +int do_tcp_getsockopt(struct sock *sk, int level, + int optname, sockptr_t optval, sockptr_t optlen); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -int tcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -int compat_tcp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -int compat_tcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); +bool tcp_bpf_bypass_getsockopt(int level, int optname); +int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); +int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); -int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); +int tcp_set_window_clamp(struct sock *sk, int val); +void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss); +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss); void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, @@ -426,6 +441,7 @@ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie); +u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss); u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th); @@ -436,6 +452,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); +void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, @@ -457,7 +474,8 @@ enum tcp_synack_type { struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -471,6 +489,9 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); +struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. @@ -570,6 +591,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ +void tcp_skb_entail(struct sock *sk, struct sk_buff *skb); +void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); @@ -605,9 +628,10 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); -void tcp_reset(struct sock *sk); +void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); +void tcp_check_space(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); @@ -624,6 +648,7 @@ static inline void tcp_clear_xmit_timers(struct sock *sk) unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); unsigned int tcp_current_mss(struct sock *sk); +u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when); /* Bound MSS / TSO packet size with the half of the window */ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) @@ -654,13 +679,15 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); +struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); +void tcp_read_done(struct sock *sk, size_t len); void tcp_initialize_rcv_mss(struct sock *sk); int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -void tcp_init_buffer_space(struct sock *sk); static inline void tcp_bound_rto(const struct sock *sk) { @@ -675,6 +702,10 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { + /* mptcp hooks are only on the slow path */ + if (sk_is_mptcp((struct sock *)tp)) + return; + tp->pred_flags = htonl((tp->tcp_header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd); @@ -700,7 +731,7 @@ static inline void tcp_fast_path_check(struct sock *sk) static inline u32 tcp_rto_min(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = TCP_RTO_MIN; + u32 rto_min = inet_csk(sk)->icsk_rto_min; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); @@ -859,10 +890,11 @@ struct tcp_skb_cb { __u32 ack_seq; /* Sequence number ACK'd */ union { struct { +#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) /* There is space for up to 24 bytes */ - __u32 in_flight:30,/* Bytes in flight at transmit */ - is_app_limited:1, /* cwnd not fully used? */ - unused:1; + __u32 is_app_limited:1, /* cwnd not fully used? */ + delivered_ce:20, + unused:11; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -876,35 +908,12 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ - struct { - __u32 flags; - struct sock *sk_redir; - void *data_end; - } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} +extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, @@ -931,17 +940,14 @@ static inline int tcp_v6_sdif(const struct sk_buff *skb) #endif return 0; } -#endif -static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_tcp_l3mdev_accept && - skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) - return true; +extern const struct inet_connection_sock_af_ops ipv6_specific; + +INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); +INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); +void tcp_v6_early_demux(struct sk_buff *skb); + #endif - return false; -} /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v4_sdif(struct sk_buff *skb) @@ -986,7 +992,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return likely(tcp_skb_can_collapse_to(to) && - mptcp_skb_can_collapse(to, from)); + mptcp_skb_can_collapse(to, from) && + skb_pure_zcopy_same(to, from)); } /* Events passed to congestion control interface */ @@ -1040,7 +1047,9 @@ struct ack_sample { struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ + s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ @@ -1048,50 +1057,63 @@ struct rate_sample { int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ + u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ }; struct tcp_congestion_ops { - struct list_head list; - u32 key; - u32 flags; - - /* initialize private data (optional) */ - void (*init)(struct sock *sk); - /* cleanup private data (optional) */ - void (*release)(struct sock *sk); +/* fast path fields are put first to fill one cache line */ /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); + /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); + /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); + /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); - /* new value of cwnd after loss (required) */ - u32 (*undo_cwnd)(struct sock *sk); + /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - u32 (*sndbuf_expand)(struct sock *sk); + /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + + + /* new value of cwnd after loss (required) */ + u32 (*undo_cwnd)(struct sock *sk); + /* returns the multiplier used in tcp_sndbuf_expand (optional) */ + u32 (*sndbuf_expand)(struct sock *sk); + +/* control/slow paths put last */ /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); - char name[TCP_CA_NAME_MAX]; - struct module *owner; -}; + char name[TCP_CA_NAME_MAX]; + struct module *owner; + struct list_head list; + u32 key; + u32 flags; + + /* initialize private data (optional) */ + void (*init)(struct sock *sk); + /* cleanup private data (optional) */ + void (*release)(struct sock *sk); +} ____cacheline_aligned_in_smp; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); @@ -1105,7 +1127,7 @@ void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin); + bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); @@ -1133,15 +1155,6 @@ static inline bool tcp_ca_needs_ecn(const struct sock *sk) return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; } -static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - if (icsk->icsk_ca_ops->set_state) - icsk->icsk_ca_ops->set_state(sk, ca_state); - icsk->icsk_ca_state = ca_state; -} - static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1150,6 +1163,9 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } +/* From tcp_cong.c */ +void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + /* From tcp_rate.c */ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, @@ -1158,6 +1174,11 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); +static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) +{ + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); +} + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. @@ -1201,9 +1222,20 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) #define TCP_INFINITE_SSTHRESH 0x7fffffff +static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp) +{ + return tp->snd_cwnd; +} + +static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val) +{ + WARN_ON_ONCE((int)val <= 0); + tp->snd_cwnd = val; +} + static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { - return tp->snd_cwnd < tp->snd_ssthresh; + return tcp_snd_cwnd(tp) < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) @@ -1229,8 +1261,8 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, - ((tp->snd_cwnd >> 1) + - (tp->snd_cwnd >> 2))); + ((tcp_snd_cwnd(tp) >> 1) + + (tcp_snd_cwnd(tp) >> 2))); } /* Use define here intentionally to get WARN_ON location shown at the caller */ @@ -1270,11 +1302,14 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + if (tp->is_cwnd_limited) + return true; + /* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) - return tp->snd_cwnd < 2 * tp->max_packets_out; + return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out; - return tp->is_cwnd_limited; + return false; } /* BBR congestion control needs pacing. @@ -1288,26 +1323,22 @@ static inline bool tcp_needs_internal_pacing(const struct sock *sk) return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; } -/* Return in jiffies the delay before one skb is sent. - * If @skb is NULL, we look at EDT for next packet being sent on the socket. +/* Estimates in how many jiffies next packet for this flow can be sent. + * Scheduling a retransmit timer too early would be silly. */ -static inline unsigned long tcp_pacing_delay(const struct sock *sk, - const struct sk_buff *skb) +static inline unsigned long tcp_pacing_delay(const struct sock *sk) { - s64 pacing_delay = skb ? skb->tstamp : tcp_sk(sk)->tcp_wstamp_ns; - - pacing_delay -= tcp_sk(sk)->tcp_clock_cache; + s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache; - return pacing_delay > 0 ? nsecs_to_jiffies(pacing_delay) : 0; + return delay > 0 ? nsecs_to_jiffies(delay) : 0; } static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, - const unsigned long max_when, - const struct sk_buff *skb) + const unsigned long max_when) { - inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk, skb), + inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), max_when); } @@ -1326,7 +1357,9 @@ static inline unsigned long tcp_probe0_base(const struct sock *sk) static inline unsigned long tcp_probe0_when(const struct sock *sk, unsigned long max_when) { - u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff; + u8 backoff = min_t(u8, ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1, + inet_csk(sk)->icsk_backoff); + u64 when = (u64)tcp_probe0_base(sk) << backoff; return (unsigned long)min_t(u64, when, max_when); } @@ -1335,8 +1368,7 @@ static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_base(sk), TCP_RTO_MAX, - NULL); + tcp_probe0_base(sk), TCP_RTO_MAX); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) @@ -1364,7 +1396,10 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) __skb_checksum_complete(skb); } -bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason); + + int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); @@ -1376,7 +1411,6 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) rx_opt->num_sacks = 0; } -u32 tcp_default_init_rwnd(u32 mss); void tcp_cwnd_restart(struct sock *sk, s32 delta); static inline void tcp_slow_start_after_idle_check(struct sock *sk) @@ -1385,8 +1419,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); s32 delta; - if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out || - ca_ops->cong_control) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) || + tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) @@ -1401,7 +1435,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, static inline int tcp_win_from_space(const struct sock *sk, int space) { - int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale; + int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale); return tcp_adv_win_scale <= 0 ? (space>>(-tcp_adv_win_scale)) : @@ -1421,6 +1455,49 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } +static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) +{ + int unused_mem = sk_unused_reserved_mem(sk); + struct tcp_sock *tp = tcp_sk(sk); + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + if (unused_mem) + tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh, + tcp_win_from_space(sk, unused_mem)); +} + +void tcp_cleanup_rbuf(struct sock *sk, int copied); + +/* We provision sk_rcvbuf around 200% of sk_rcvlowat. + * If 87.5 % (7/8) of the space has been consumed, we want to override + * SO_RCVLOWAT constraint, since we are receiving skbs with too small + * len/truesize ratio. + */ +static inline bool tcp_rmem_pressure(const struct sock *sk) +{ + int rcvbuf, threshold; + + if (tcp_under_memory_pressure(sk)) + return true; + + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + threshold = rcvbuf - (rcvbuf >> 3); + + return atomic_read(&sk->sk_rmem_alloc) > threshold; +} + +static inline bool tcp_epollin_ready(const struct sock *sk, int target) +{ + const struct tcp_sock *tp = tcp_sk(sk); + int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); + + if (avail <= 0) + return false; + + return (avail >= target) || tcp_rmem_pressure(sk) || + (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss); +} + extern void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst); @@ -1432,21 +1509,24 @@ static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl; + return tp->keepalive_intvl ? : + READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); } static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time; + return tp->keepalive_time ? : + READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes; + return tp->keepalive_probes ? : + READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) @@ -1459,7 +1539,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) static inline int tcp_fin_time(const struct sock *sk) { - int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout; + int fin_timeout = tcp_sk(sk)->linger2 ? : + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout); const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) @@ -1547,6 +1628,7 @@ struct tcp_md5sig_key { u8 keylen; u8 family; /* AF_INET or AF_INET6 */ u8 prefixlen; + u8 flags; union tcp_md5_addr addr; int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; @@ -1592,10 +1674,10 @@ struct tcp_md5sig_pool { int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index, + int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen, gfp_t gfp); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index); + int family, u8 prefixlen, int l3index, u8 flags); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); @@ -1614,6 +1696,12 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, return __tcp_md5_do_lookup(sk, l3index, addr, family); } +enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif); + + #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else static inline struct tcp_md5sig_key * @@ -1622,6 +1710,14 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, { return NULL; } + +static inline enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif) +{ + return SKB_NOT_DROPPED_YET; +} #define tcp_twsk_md5_key(twsk) NULL #endif @@ -1657,6 +1753,8 @@ void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key); +int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, + u64 *key); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -1678,7 +1776,6 @@ struct tcp_fastopen_context { struct rcu_head rcu; }; -extern unsigned int sysctl_tcp_fastopen_blackhole_timeout; void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); @@ -1756,11 +1853,6 @@ static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) return skb_rb_last(&sk->tcp_rtx_queue); } -static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); @@ -1839,7 +1931,7 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc { list_del(&skb->tcp_tsorted_anchor); tcp_rtx_queue_unlink(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } static inline void tcp_push_pending_frames(struct sock *sk) @@ -1940,6 +2032,10 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); +INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); +INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)); int tcp_gro_complete(struct sk_buff *skb); void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); @@ -1947,21 +2043,10 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; + return tp->notsent_lowat ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } -/* @wake is one when sk_stream_write_space() calls us. - * This sends EPOLLOUT only if notsent_bytes is half the limit. - * This mimics the strategy used in sock_def_write_space(). - */ -static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) -{ - const struct tcp_sock *tp = tcp_sk(sk); - u32 notsent_bytes = READ_ONCE(tp->write_seq) - - READ_ONCE(tp->snd_nxt); - - return (notsent_bytes << wake) < tcp_notsent_lowat(tp); -} +bool tcp_stream_memory_free(const struct sock *sk, int wake); #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); @@ -1984,7 +2069,7 @@ struct tcp_sock_af_ops { const struct sk_buff *skb); int (*md5_parse)(struct sock *sk, int optname, - char __user *optval, + sockptr_t optval, int optlen); #endif }; @@ -1999,21 +2084,21 @@ struct tcp_request_sock_ops { const struct sock *sk, const struct sk_buff *skb); #endif - void (*init_req)(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif - struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, - const struct request_sock *req); + struct dst_entry *(*route_req)(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; @@ -2049,7 +2134,7 @@ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd); -extern void tcp_rack_mark_lost(struct sock *sk); +extern bool tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); @@ -2132,9 +2217,13 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) u16 segs_in; segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - tp->segs_in += segs_in; + + /* We update these fields while other threads might + * read them from tcp_get_info() + */ + WRITE_ONCE(tp->segs_in, tp->segs_in + segs_in); if (skb->len > tcp_hdrlen(skb)) - tp->data_segs_in += segs_in; + WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in); } /* @@ -2192,17 +2281,41 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) +#ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; -int tcp_bpf_init(struct sock *sk); -void tcp_bpf_reinit(struct sock *sk); +#ifdef CONFIG_BPF_SYSCALL +struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); +#endif /* CONFIG_BPF_SYSCALL */ + int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); -int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len); -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags); +#endif /* CONFIG_NET_SOCK_MSG */ + +#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) +static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ +} +#endif + +#ifdef CONFIG_CGROUP_BPF +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ + skops->skb = skb; + skops->skb_data_end = skb->data + end_offset; +} +#else +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ +} +#endif /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF @@ -2276,7 +2389,7 @@ static inline u32 tcp_timeout_init(struct sock *sk) if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; - return timeout; + return min_t(int, timeout, TCP_RTO_MAX); } static inline u32 tcp_rwnd_init_bpf(struct sock *sk) |