aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux/skbuff.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/skbuff.h')
-rw-r--r--include/linux/skbuff.h738
1 files changed, 515 insertions, 223 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c8cb7e697d47..7be5bb4c94b6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -36,104 +36,121 @@
#include <linux/splice.h>
#include <linux/in6.h>
#include <linux/if_packet.h>
+#include <linux/llist.h>
#include <net/flow.h>
#include <net/page_pool.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <linux/netfilter/nf_conntrack_common.h>
#endif
+#include <net/net_debug.h>
+#include <net/dropreason.h>
-/* The interface for checksum offload between the stack and networking drivers
+/**
+ * DOC: skb checksums
+ *
+ * The interface for checksum offload between the stack and networking drivers
* is as follows...
*
- * A. IP checksum related features
+ * IP checksum related features
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* Drivers advertise checksum offload capabilities in the features of a device.
* From the stack's point of view these are capabilities offered by the driver.
* A driver typically only advertises features that it is capable of offloading
* to its device.
*
- * The checksum related features are:
- *
- * NETIF_F_HW_CSUM - The driver (or its device) is able to compute one
- * IP (one's complement) checksum for any combination
- * of protocols or protocol layering. The checksum is
- * computed and set in a packet per the CHECKSUM_PARTIAL
- * interface (see below).
- *
- * NETIF_F_IP_CSUM - Driver (device) is only able to checksum plain
- * TCP or UDP packets over IPv4. These are specifically
- * unencapsulated packets of the form IPv4|TCP or
- * IPv4|UDP where the Protocol field in the IPv4 header
- * is TCP or UDP. The IPv4 header may contain IP options.
- * This feature cannot be set in features for a device
- * with NETIF_F_HW_CSUM also set. This feature is being
- * DEPRECATED (see below).
- *
- * NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain
- * TCP or UDP packets over IPv6. These are specifically
- * unencapsulated packets of the form IPv6|TCP or
- * IPv6|UDP where the Next Header field in the IPv6
- * header is either TCP or UDP. IPv6 extension headers
- * are not supported with this feature. This feature
- * cannot be set in features for a device with
- * NETIF_F_HW_CSUM also set. This feature is being
- * DEPRECATED (see below).
- *
- * NETIF_F_RXCSUM - Driver (device) performs receive checksum offload.
- * This flag is only used to disable the RX checksum
- * feature for a device. The stack will accept receive
- * checksum indication in packets received on a device
- * regardless of whether NETIF_F_RXCSUM is set.
- *
- * B. Checksumming of received packets by device. Indication of checksum
- * verification is set in skb->ip_summed. Possible values are:
- *
- * CHECKSUM_NONE:
+ * .. flat-table:: Checksum related device features
+ * :widths: 1 10
+ *
+ * * - %NETIF_F_HW_CSUM
+ * - The driver (or its device) is able to compute one
+ * IP (one's complement) checksum for any combination
+ * of protocols or protocol layering. The checksum is
+ * computed and set in a packet per the CHECKSUM_PARTIAL
+ * interface (see below).
+ *
+ * * - %NETIF_F_IP_CSUM
+ * - Driver (device) is only able to checksum plain
+ * TCP or UDP packets over IPv4. These are specifically
+ * unencapsulated packets of the form IPv4|TCP or
+ * IPv4|UDP where the Protocol field in the IPv4 header
+ * is TCP or UDP. The IPv4 header may contain IP options.
+ * This feature cannot be set in features for a device
+ * with NETIF_F_HW_CSUM also set. This feature is being
+ * DEPRECATED (see below).
+ *
+ * * - %NETIF_F_IPV6_CSUM
+ * - Driver (device) is only able to checksum plain
+ * TCP or UDP packets over IPv6. These are specifically
+ * unencapsulated packets of the form IPv6|TCP or
+ * IPv6|UDP where the Next Header field in the IPv6
+ * header is either TCP or UDP. IPv6 extension headers
+ * are not supported with this feature. This feature
+ * cannot be set in features for a device with
+ * NETIF_F_HW_CSUM also set. This feature is being
+ * DEPRECATED (see below).
+ *
+ * * - %NETIF_F_RXCSUM
+ * - Driver (device) performs receive checksum offload.
+ * This flag is only used to disable the RX checksum
+ * feature for a device. The stack will accept receive
+ * checksum indication in packets received on a device
+ * regardless of whether NETIF_F_RXCSUM is set.
+ *
+ * Checksumming of received packets by device
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * Indication of checksum verification is set in &sk_buff.ip_summed.
+ * Possible values are:
+ *
+ * - %CHECKSUM_NONE
*
* Device did not checksum this packet e.g. due to lack of capabilities.
* The packet contains full (though not verified) checksum in packet but
* not in skb->csum. Thus, skb->csum is undefined in this case.
*
- * CHECKSUM_UNNECESSARY:
+ * - %CHECKSUM_UNNECESSARY
*
* The hardware you're dealing with doesn't calculate the full checksum
- * (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums
- * for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY
- * if their checksums are okay. skb->csum is still undefined in this case
+ * (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums
+ * for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY
+ * if their checksums are okay. &sk_buff.csum is still undefined in this case
* though. A driver or device must never modify the checksum field in the
* packet even if checksum is verified.
*
- * CHECKSUM_UNNECESSARY is applicable to following protocols:
- * TCP: IPv6 and IPv4.
- * UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a
+ * %CHECKSUM_UNNECESSARY is applicable to following protocols:
+ *
+ * - TCP: IPv6 and IPv4.
+ * - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a
* zero UDP checksum for either IPv4 or IPv6, the networking stack
* may perform further validation in this case.
- * GRE: only if the checksum is present in the header.
- * SCTP: indicates the CRC in SCTP header has been validated.
- * FCOE: indicates the CRC in FC frame has been validated.
+ * - GRE: only if the checksum is present in the header.
+ * - SCTP: indicates the CRC in SCTP header has been validated.
+ * - FCOE: indicates the CRC in FC frame has been validated.
*
- * skb->csum_level indicates the number of consecutive checksums found in
- * the packet minus one that have been verified as CHECKSUM_UNNECESSARY.
+ * &sk_buff.csum_level indicates the number of consecutive checksums found in
+ * the packet minus one that have been verified as %CHECKSUM_UNNECESSARY.
* For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet
* and a device is able to verify the checksums for UDP (possibly zero),
- * GRE (checksum flag is set) and TCP, skb->csum_level would be set to
+ * GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to
* two. If the device were only able to verify the UDP checksum and not
* GRE, either because it doesn't support GRE checksum or because GRE
* checksum is bad, skb->csum_level would be set to zero (TCP checksum is
* not considered in this case).
*
- * CHECKSUM_COMPLETE:
+ * - %CHECKSUM_COMPLETE
*
* This is the most generic way. The device supplied checksum of the _whole_
- * packet as seen by netif_rx() and fills in skb->csum. This means the
+ * packet as seen by netif_rx() and fills in &sk_buff.csum. This means the
* hardware doesn't need to parse L3/L4 headers to implement this.
*
* Notes:
+ *
* - Even if device supports only some protocols, but is able to produce
* skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY.
* - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols.
*
- * CHECKSUM_PARTIAL:
+ * - %CHECKSUM_PARTIAL
*
* A checksum is set up to be offloaded to a device as described in the
* output description for CHECKSUM_PARTIAL. This may occur on a packet
@@ -145,14 +162,18 @@
* packet that are after the checksum being offloaded are not considered to
* be verified.
*
- * C. Checksumming on transmit for non-GSO. The stack requests checksum offload
- * in the skb->ip_summed for a packet. Values are:
+ * Checksumming on transmit for non-GSO
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
- * CHECKSUM_PARTIAL:
+ * The stack requests checksum offload in the &sk_buff.ip_summed for a packet.
+ * Values are:
+ *
+ * - %CHECKSUM_PARTIAL
*
* The driver is required to checksum the packet as seen by hard_start_xmit()
- * from skb->csum_start up to the end, and to record/write the checksum at
- * offset skb->csum_start + skb->csum_offset. A driver may verify that the
+ * from &sk_buff.csum_start up to the end, and to record/write the checksum at
+ * offset &sk_buff.csum_start + &sk_buff.csum_offset.
+ * A driver may verify that the
* csum_start and csum_offset values are valid values given the length and
* offset of the packet, but it should not attempt to validate that the
* checksum refers to a legitimate transport layer checksum -- it is the
@@ -164,55 +185,66 @@
* checksum calculation to the device, or call skb_checksum_help (in the case
* that the device does not support offload for a particular checksum).
*
- * NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM are being deprecated in favor of
- * NETIF_F_HW_CSUM. New devices should use NETIF_F_HW_CSUM to indicate
+ * %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of
+ * %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate
* checksum offload capability.
- * skb_csum_hwoffload_help() can be called to resolve CHECKSUM_PARTIAL based
+ * skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based
* on network device checksumming capabilities: if a packet does not match
- * them, skb_checksum_help or skb_crc32c_help (depending on the value of
- * csum_not_inet, see item D.) is called to resolve the checksum.
+ * them, skb_checksum_help() or skb_crc32c_help() (depending on the value of
+ * &sk_buff.csum_not_inet, see :ref:`crc`)
+ * is called to resolve the checksum.
*
- * CHECKSUM_NONE:
+ * - %CHECKSUM_NONE
*
* The skb was already checksummed by the protocol, or a checksum is not
* required.
*
- * CHECKSUM_UNNECESSARY:
+ * - %CHECKSUM_UNNECESSARY
*
* This has the same meaning as CHECKSUM_NONE for checksum offload on
* output.
*
- * CHECKSUM_COMPLETE:
+ * - %CHECKSUM_COMPLETE
+ *
* Not used in checksum output. If a driver observes a packet with this value
- * set in skbuff, it should treat the packet as if CHECKSUM_NONE were set.
- *
- * D. Non-IP checksum (CRC) offloads
- *
- * NETIF_F_SCTP_CRC - This feature indicates that a device is capable of
- * offloading the SCTP CRC in a packet. To perform this offload the stack
- * will set csum_start and csum_offset accordingly, set ip_summed to
- * CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in
- * the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
- * A driver that supports both IP checksum offload and SCTP CRC32c offload
- * must verify which offload is configured for a packet by testing the
- * value of skb->csum_not_inet; skb_crc32c_csum_help is provided to resolve
- * CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1.
- *
- * NETIF_F_FCOE_CRC - This feature indicates that a device is capable of
- * offloading the FCOE CRC in a packet. To perform this offload the stack
- * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset
- * accordingly. Note that there is no indication in the skbuff that the
- * CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports
- * both IP checksum offload and FCOE CRC offload must verify which offload
- * is configured for a packet, presumably by inspecting packet headers.
- *
- * E. Checksumming on output with GSO.
- *
- * In the case of a GSO packet (skb_is_gso(skb) is true), checksum offload
+ * set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set.
+ *
+ * .. _crc:
+ *
+ * Non-IP checksum (CRC) offloads
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * .. flat-table::
+ * :widths: 1 10
+ *
+ * * - %NETIF_F_SCTP_CRC
+ * - This feature indicates that a device is capable of
+ * offloading the SCTP CRC in a packet. To perform this offload the stack
+ * will set csum_start and csum_offset accordingly, set ip_summed to
+ * %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication
+ * in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c.
+ * A driver that supports both IP checksum offload and SCTP CRC32c offload
+ * must verify which offload is configured for a packet by testing the
+ * value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to
+ * resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1.
+ *
+ * * - %NETIF_F_FCOE_CRC
+ * - This feature indicates that a device is capable of offloading the FCOE
+ * CRC in a packet. To perform this offload the stack will set ip_summed
+ * to %CHECKSUM_PARTIAL and set csum_start and csum_offset
+ * accordingly. Note that there is no indication in the skbuff that the
+ * %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports
+ * both IP checksum offload and FCOE CRC offload must verify which offload
+ * is configured for a packet, presumably by inspecting packet headers.
+ *
+ * Checksumming on output with GSO
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * In the case of a GSO packet (skb_is_gso() is true), checksum offload
* is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the
- * gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as
+ * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as
* part of the GSO operation is implied. If a checksum is being offloaded
- * with GSO then ip_summed is CHECKSUM_PARTIAL, and both csum_start and
+ * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and
* csum_offset are set to refer to the outermost checksum being offloaded
* (two offloaded checksums are possible with UDP encapsulation).
*/
@@ -286,14 +318,19 @@ struct nf_bridge_info {
struct tc_skb_ext {
__u32 chain;
__u16 mru;
- bool post_ct;
+ __u16 zone;
+ u8 post_ct:1;
+ u8 post_ct_snat:1;
+ u8 post_ct_dnat:1;
};
#endif
struct sk_buff_head {
- /* These two members must be first. */
- struct sk_buff *next;
- struct sk_buff *prev;
+ /* These two members must be first to match sk_buff. */
+ struct_group_tagged(sk_buff_list, list,
+ struct sk_buff *next;
+ struct sk_buff *prev;
+ );
__u32 qlen;
spinlock_t lock;
@@ -405,8 +442,10 @@ static inline bool skb_frag_must_loop(struct page *p)
/**
* struct skb_shared_hwtstamps - hardware time stamps
- * @hwtstamp: hardware time stamp transformed into duration
- * since arbitrary point in time
+ * @hwtstamp: hardware time stamp transformed into duration
+ * since arbitrary point in time
+ * @netdev_data: address/cookie of network device driver used as
+ * reference to actual hardware time stamp
*
* Software time stamps generated by ktime_get_real() are stored in
* skb->tstamp.
@@ -418,7 +457,10 @@ static inline bool skb_frag_must_loop(struct page *p)
* &skb_shared_info. Use skb_hwtstamps() to get a pointer.
*/
struct skb_shared_hwtstamps {
- ktime_t hwtstamp;
+ union {
+ ktime_t hwtstamp;
+ void *netdev_data;
+ };
};
/* Definitions for tx_flags in struct skb_shared_info */
@@ -432,16 +474,24 @@ enum {
/* device driver is going to provide hardware time stamp */
SKBTX_IN_PROGRESS = 1 << 2,
+ /* generate hardware time stamp based on cycles if supported */
+ SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3,
+
/* generate wifi status information (where possible) */
SKBTX_WIFI_STATUS = 1 << 4,
+ /* determine hardware time stamp based on time or cycles */
+ SKBTX_HW_TSTAMP_NETDEV = 1 << 5,
+
/* generate software time stamp when entering packet scheduling */
SKBTX_SCHED_TSTAMP = 1 << 6,
};
#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
SKBTX_SCHED_TSTAMP)
-#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
+#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \
+ SKBTX_HW_TSTAMP_USE_CYCLES | \
+ SKBTX_ANY_SW_TSTAMP)
/* Definitions for flags in struct skb_shared_info */
enum {
@@ -459,10 +509,18 @@ enum {
* charged to the kernel memory.
*/
SKBFL_PURE_ZEROCOPY = BIT(2),
+
+ SKBFL_DONT_ORPHAN = BIT(3),
+
+ /* page references are managed by the ubuf_info, so it's safe to
+ * use frags only up until ubuf_info is released
+ */
+ SKBFL_MANAGED_FRAG_REFS = BIT(4),
};
#define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
-#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
+#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
+ SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)
/*
* The callback notifies userspace to release buffers when skb DMA is done in
@@ -475,6 +533,13 @@ enum {
struct ubuf_info {
void (*callback)(struct sk_buff *, struct ubuf_info *,
bool zerocopy_success);
+ refcount_t refcnt;
+ u8 flags;
+};
+
+struct ubuf_info_msgzc {
+ struct ubuf_info ubuf;
+
union {
struct {
unsigned long desc;
@@ -487,8 +552,6 @@ struct ubuf_info {
u32 bytelen;
};
};
- refcount_t refcnt;
- u8 flags;
struct mmpin {
struct user_struct *user;
@@ -497,24 +560,12 @@ struct ubuf_info {
};
#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
+#define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \
+ ubuf)
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);
-struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size);
-struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg);
-
-void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
-
-void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
- bool success);
-
-int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
-int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
- struct msghdr *msg, int len,
- struct ubuf_info *uarg);
-
/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
*/
@@ -535,6 +586,7 @@ struct skb_shared_info {
* Warning : all fields before dataref are cleared in __alloc_skb()
*/
atomic_t dataref;
+ unsigned int xdp_frags_size;
/* Intermediate layers must ensure that destructor_arg
* remains valid until skb destructor */
@@ -544,16 +596,32 @@ struct skb_shared_info {
skb_frag_t frags[MAX_SKB_FRAGS];
};
-/* We divide dataref into two halves. The higher 16 bits hold references
- * to the payload part of skb->data. The lower 16 bits hold references to
- * the entire skb->data. A clone of a headerless skb holds the length of
- * the header in skb->hdr_len.
- *
- * All users must obey the rule that the skb->data reference count must be
- * greater than or equal to the payload reference count.
- *
- * Holding a reference to the payload part means that the user does not
- * care about modifications to the header part of skb->data.
+/**
+ * DOC: dataref and headerless skbs
+ *
+ * Transport layers send out clones of payload skbs they hold for
+ * retransmissions. To allow lower layers of the stack to prepend their headers
+ * we split &skb_shared_info.dataref into two halves.
+ * The lower 16 bits count the overall number of references.
+ * The higher 16 bits indicate how many of the references are payload-only.
+ * skb_header_cloned() checks if skb is allowed to add / write the headers.
+ *
+ * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr
+ * (via __skb_header_release()). Any clone created from marked skb will get
+ * &sk_buff.hdr_len populated with the available headroom.
+ * If there's the only clone in existence it's able to modify the headroom
+ * at will. The sequence of calls inside the transport layer is::
+ *
+ * <alloc skb>
+ * skb_reserve()
+ * __skb_header_release()
+ * skb_clone()
+ * // send the clone down the stack
+ *
+ * This is not a very generic construct and it depends on the transport layers
+ * doing the right thing. In practice there's usually only one payload-only skb.
+ * Having multiple payload-only skbs with different lengths of hdr_len is not
+ * possible. The payload-only skbs should never leave their owner.
*/
#define SKB_DATAREF_SHIFT 16
#define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)
@@ -618,6 +686,46 @@ typedef unsigned char *sk_buff_data_t;
#endif
/**
+ * DOC: Basic sk_buff geometry
+ *
+ * struct sk_buff itself is a metadata structure and does not hold any packet
+ * data. All the data is held in associated buffers.
+ *
+ * &sk_buff.head points to the main "head" buffer. The head buffer is divided
+ * into two parts:
+ *
+ * - data buffer, containing headers and sometimes payload;
+ * this is the part of the skb operated on by the common helpers
+ * such as skb_put() or skb_pull();
+ * - shared info (struct skb_shared_info) which holds an array of pointers
+ * to read-only data in the (page, offset, length) format.
+ *
+ * Optionally &skb_shared_info.frag_list may point to another skb.
+ *
+ * Basic diagram may look like this::
+ *
+ * ---------------
+ * | sk_buff |
+ * ---------------
+ * ,--------------------------- + head
+ * / ,----------------- + data
+ * / / ,----------- + tail
+ * | | | , + end
+ * | | | |
+ * v v v v
+ * -----------------------------------------------
+ * | headroom | data | tailroom | skb_shared_info |
+ * -----------------------------------------------
+ * + [page frag]
+ * + [page frag]
+ * + [page frag]
+ * + [page frag] ---------
+ * + frag_list --> | sk_buff |
+ * ---------
+ *
+ */
+
+/**
* struct sk_buff - socket buffer
* @next: Next buffer in list
* @prev: Previous buffer in list
@@ -626,6 +734,7 @@ typedef unsigned char *sk_buff_data_t;
* for retransmit timer
* @rbnode: RB tree node, alternative to next/prev for netem/tcp
* @list: queue head
+ * @ll_node: anchor in an llist (eg socket defer_list)
* @sk: Socket we are owned by
* @ip_defrag_offset: (aka @sk) alternate use of @sk, used in
* fragmentation management
@@ -694,11 +803,17 @@ typedef unsigned char *sk_buff_data_t;
* @csum_level: indicates the number of consecutive checksums found in
* the packet minus one that have been verified as
* CHECKSUM_UNNECESSARY (max 3)
+ * @scm_io_uring: SKB holds io_uring registered files
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
* @slow_gro: state present at GRO time, slower prepare step required
+ * @mono_delivery_time: When set, skb->tstamp has the
+ * delivery_time in mono clock base (i.e. EDT). Otherwise, the
+ * skb->tstamp has the (rcv) timestamp at ingress and
+ * delivery_time at egress.
* @napi_id: id of the NAPI struct this skb came from
* @sender_cpu: (aka @napi_id) source CPU in XPS
+ * @alloc_cpu: CPU which did the skb allocation.
* @secmark: security marking
* @mark: Generic packet mark
* @reserved_tailroom: (aka @mark) number of bytes of free space available
@@ -728,7 +843,7 @@ typedef unsigned char *sk_buff_data_t;
struct sk_buff {
union {
struct {
- /* These two members must be first. */
+ /* These two members must be first to match sk_buff_head. */
struct sk_buff *next;
struct sk_buff *prev;
@@ -743,6 +858,7 @@ struct sk_buff {
};
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
struct list_head list;
+ struct llist_node ll_node;
};
union {
@@ -792,7 +908,7 @@ struct sk_buff {
#else
#define CLONED_MASK 1
#endif
-#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset)
+#define CLONED_OFFSET offsetof(struct sk_buff, __cloned_offset)
/* private: */
__u8 __cloned_offset[0];
@@ -808,25 +924,15 @@ struct sk_buff {
__u8 active_extensions;
#endif
- /* fields enclosed in headers_start/headers_end are copied
+ /* Fields enclosed in headers group are copied
* using a single memcpy() in __copy_skb_header()
*/
- /* private: */
- __u32 headers_start[0];
- /* public: */
-
-/* if you move pkt_type around you also must adapt those constants */
-#ifdef __BIG_ENDIAN_BITFIELD
-#define PKT_TYPE_MAX (7 << 5)
-#else
-#define PKT_TYPE_MAX 7
-#endif
-#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)
+ struct_group(headers,
/* private: */
__u8 __pkt_type_offset[0];
/* public: */
- __u8 pkt_type:3;
+ __u8 pkt_type:3; /* see PKT_TYPE_MAX */
__u8 ignore_df:1;
__u8 nf_trace:1;
__u8 ip_summed:2;
@@ -842,20 +948,18 @@ struct sk_buff {
__u8 encap_hdr_csum:1;
__u8 csum_valid:1;
-#ifdef __BIG_ENDIAN_BITFIELD
-#define PKT_VLAN_PRESENT_BIT 7
-#else
-#define PKT_VLAN_PRESENT_BIT 0
-#endif
-#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset)
/* private: */
__u8 __pkt_vlan_present_offset[0];
/* public: */
- __u8 vlan_present:1;
+ __u8 vlan_present:1; /* See PKT_VLAN_PRESENT_BIT */
__u8 csum_complete_sw:1;
__u8 csum_level:2;
- __u8 csum_not_inet:1;
__u8 dst_pending_confirm:1;
+ __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */
+#ifdef CONFIG_NET_CLS_ACT
+ __u8 tc_skip_classify:1;
+ __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */
+#endif
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
@@ -867,10 +971,6 @@ struct sk_buff {
__u8 offload_fwd_mark:1;
__u8 offload_l3_fwd_mark:1;
#endif
-#ifdef CONFIG_NET_CLS_ACT
- __u8 tc_skip_classify:1;
- __u8 tc_at_ingress:1;
-#endif
__u8 redirected:1;
#ifdef CONFIG_NET_REDIRECT
__u8 from_ingress:1;
@@ -882,6 +982,8 @@ struct sk_buff {
__u8 decrypted:1;
#endif
__u8 slow_gro:1;
+ __u8 csum_not_inet:1;
+ __u8 scm_io_uring:1;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
@@ -905,6 +1007,7 @@ struct sk_buff {
unsigned int sender_cpu;
};
#endif
+ u16 alloc_cpu;
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
@@ -932,9 +1035,7 @@ struct sk_buff {
u64 kcov_handle;
#endif
- /* private: */
- __u32 headers_end[0];
- /* public: */
+ ); /* end headers group */
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail;
@@ -950,6 +1051,28 @@ struct sk_buff {
#endif
};
+/* if you move pkt_type around you also must adapt those constants */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define PKT_TYPE_MAX (7 << 5)
+#else
+#define PKT_TYPE_MAX 7
+#endif
+#define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset)
+
+/* if you move pkt_vlan_present, tc_at_ingress, or mono_delivery_time
+ * around, you also must adapt these constants.
+ */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define PKT_VLAN_PRESENT_BIT 7
+#define TC_AT_INGRESS_MASK (1 << 0)
+#define SKB_MONO_DELIVERY_TIME_MASK (1 << 2)
+#else
+#define PKT_VLAN_PRESENT_BIT 0
+#define TC_AT_INGRESS_MASK (1 << 7)
+#define SKB_MONO_DELIVERY_TIME_MASK (1 << 5)
+#endif
+#define PKT_VLAN_PRESENT_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset)
+
#ifdef __KERNEL__
/*
* Handling routines are only of interest to the kernel
@@ -1081,12 +1204,29 @@ static inline bool skb_unref(struct sk_buff *skb)
return true;
}
+void __fix_address
+kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason);
+
+/**
+ * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason
+ * @skb: buffer to free
+ */
+static inline void kfree_skb(struct sk_buff *skb)
+{
+ kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+}
+
void skb_release_head_state(struct sk_buff *skb);
-void kfree_skb(struct sk_buff *skb);
-void kfree_skb_list(struct sk_buff *segs);
+void kfree_skb_list_reason(struct sk_buff *segs,
+ enum skb_drop_reason reason);
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt);
void skb_tx_error(struct sk_buff *skb);
+static inline void kfree_skb_list(struct sk_buff *segs)
+{
+ kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED);
+}
+
#ifdef CONFIG_TRACEPOINTS
void consume_skb(struct sk_buff *skb);
#else
@@ -1110,6 +1250,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb_around(struct sk_buff *skb,
void *data, unsigned int frag_size);
+void skb_attempt_defer_free(struct sk_buff *skb);
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
@@ -1329,8 +1470,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
unsigned int key_count);
struct bpf_flow_dissector;
-bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
- __be16 proto, int nhoff, int hlen, unsigned int flags);
+u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
+ __be16 proto, int nhoff, int hlen, unsigned int flags);
bool __skb_flow_dissect(const struct net *net,
const struct sk_buff *skb,
@@ -1380,7 +1521,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container,
u16 *ctinfo_map, size_t mapsize,
- bool post_ct);
+ bool post_ct, u16 zone);
void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
@@ -1443,6 +1584,11 @@ static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
return skb->end;
}
+
+static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
+{
+ skb->end = offset;
+}
#else
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
@@ -1453,8 +1599,35 @@ static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
return skb->end - skb->head;
}
+
+static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
+{
+ skb->end = skb->head + offset;
+}
#endif
+struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg);
+
+void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
+
+void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
+ bool success);
+
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, struct iov_iter *from,
+ size_t length);
+
+static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
+ struct msghdr *msg, int len)
+{
+ return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
+}
+
+int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg, int len,
+ struct ubuf_info *uarg);
+
/* Internal */
#define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB)))
@@ -1475,6 +1648,11 @@ static inline bool skb_zcopy_pure(const struct sk_buff *skb)
return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
}
+static inline bool skb_zcopy_managed(const struct sk_buff *skb)
+{
+ return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS;
+}
+
static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
const struct sk_buff *skb2)
{
@@ -1549,6 +1727,14 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
}
}
+void __skb_zcopy_downgrade_managed(struct sk_buff *skb);
+
+static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
+{
+ if (unlikely(skb_zcopy_managed(skb)))
+ __skb_zcopy_downgrade_managed(skb);
+}
+
static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
skb->next = NULL;
@@ -1692,19 +1878,19 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
return 0;
}
-/* This variant of skb_unclone() makes sure skb->truesize is not changed */
+/* This variant of skb_unclone() makes sure skb->truesize
+ * and skb_end_offset() are not changed, whenever a new skb->head is needed.
+ *
+ * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X))
+ * when various debugging features are in place.
+ */
+int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri);
static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
might_sleep_if(gfpflags_allow_blocking(pri));
- if (skb_cloned(skb)) {
- unsigned int save = skb->truesize;
- int res;
-
- res = pskb_expand_head(skb, 0, 0, pri);
- skb->truesize = save;
- return res;
- }
+ if (skb_cloned(skb))
+ return __skb_unclone_keeptruesize(skb, pri);
return 0;
}
@@ -1738,8 +1924,10 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
}
/**
- * __skb_header_release - release reference to header
- * @skb: buffer to operate on
+ * __skb_header_release() - allow clones to use the headroom
+ * @skb: buffer to operate on
+ *
+ * See "DOC: dataref and headerless skbs".
*/
static inline void __skb_header_release(struct sk_buff *skb)
{
@@ -1975,8 +2163,8 @@ static inline void __skb_insert(struct sk_buff *newsk,
*/
WRITE_ONCE(newsk->next, next);
WRITE_ONCE(newsk->prev, prev);
- WRITE_ONCE(next->prev, newsk);
- WRITE_ONCE(prev->next, newsk);
+ WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk);
+ WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk);
WRITE_ONCE(list->qlen, list->qlen + 1);
}
@@ -2072,7 +2260,7 @@ static inline void __skb_queue_after(struct sk_buff_head *list,
struct sk_buff *prev,
struct sk_buff *newsk)
{
- __skb_insert(newsk, prev, prev->next, list);
+ __skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list);
}
void skb_append(struct sk_buff *old, struct sk_buff *newsk,
@@ -2082,7 +2270,7 @@ static inline void __skb_queue_before(struct sk_buff_head *list,
struct sk_buff *next,
struct sk_buff *newsk)
{
- __skb_insert(newsk, next->prev, next, list);
+ __skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list);
}
/**
@@ -2195,6 +2383,34 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
return skb_headlen(skb) + __skb_pagelen(skb);
}
+static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
+ int i, struct page *page,
+ int off, int size)
+{
+ skb_frag_t *frag = &shinfo->frags[i];
+
+ /*
+ * Propagate page pfmemalloc to the skb if we can. The problem is
+ * that not all callers have unique ownership of the page but rely
+ * on page_is_pfmemalloc doing the right thing(tm).
+ */
+ frag->bv_page = page;
+ frag->bv_offset = off;
+ skb_frag_size_set(frag, size);
+}
+
+/**
+ * skb_len_add - adds a number to len fields of skb
+ * @skb: buffer to add len to
+ * @delta: number of bytes to add
+ */
+static inline void skb_len_add(struct sk_buff *skb, int delta)
+{
+ skb->len += delta;
+ skb->data_len += delta;
+ skb->truesize += delta;
+}
+
/**
* __skb_fill_page_desc - initialise a paged fragment in an skb
* @skb: buffer containing fragment to be initialised
@@ -2211,17 +2427,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
struct page *page, int off, int size)
{
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
- /*
- * Propagate page pfmemalloc to the skb if we can. The problem is
- * that not all callers have unique ownership of the page but rely
- * on page_is_pfmemalloc doing the right thing(tm).
- */
- frag->bv_page = page;
- frag->bv_offset = off;
- skb_frag_size_set(frag, size);
-
+ __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
page = compound_head(page);
if (page_is_pfmemalloc(page))
skb->pfmemalloc = true;
@@ -2248,6 +2454,27 @@ static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
skb_shinfo(skb)->nr_frags = i + 1;
}
+/**
+ * skb_fill_page_desc_noacc - initialise a paged fragment in an skb
+ * @skb: buffer containing fragment to be initialised
+ * @i: paged fragment index to initialise
+ * @page: the page to use for this fragment
+ * @off: the offset to the data with @page
+ * @size: the length of the data
+ *
+ * Variant of skb_fill_page_desc() which does not deal with
+ * pfmemalloc, if page is not owned by us.
+ */
+static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i,
+ struct page *page, int off,
+ int size)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ __skb_fill_page_desc_noacc(shinfo, i, page, off, size);
+ shinfo->nr_frags = i + 1;
+}
+
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size, unsigned int truesize);
@@ -2291,6 +2518,14 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
#endif /* NET_SKBUFF_DATA_USES_OFFSET */
+static inline void skb_assert_len(struct sk_buff *skb)
+{
+#ifdef CONFIG_DEBUG_NET
+ if (WARN_ONCE(!skb->len, "%s\n", __func__))
+ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+#endif /* CONFIG_DEBUG_NET */
+}
+
/*
* Add data to an sk_buff
*/
@@ -2363,7 +2598,14 @@ void *skb_pull(struct sk_buff *skb, unsigned int len);
static inline void *__skb_pull(struct sk_buff *skb, unsigned int len)
{
skb->len -= len;
- BUG_ON(skb->len < skb->data_len);
+ if (unlikely(skb->len < skb->data_len)) {
+#if defined(CONFIG_DEBUG_NET)
+ skb->len += len;
+ pr_err("__skb_pull(len=%u)\n", len);
+ skb_dump(KERN_ERR, skb, false);
+#endif
+ BUG();
+ }
return skb->data += len;
}
@@ -2372,21 +2614,9 @@ static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len)
return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len);
}
-void *__pskb_pull_tail(struct sk_buff *skb, int delta);
+void *skb_pull_data(struct sk_buff *skb, size_t len);
-static inline void *__pskb_pull(struct sk_buff *skb, unsigned int len)
-{
- if (len > skb_headlen(skb) &&
- !__pskb_pull_tail(skb, len - skb_headlen(skb)))
- return NULL;
- skb->len -= len;
- return skb->data += len;
-}
-
-static inline void *pskb_pull(struct sk_buff *skb, unsigned int len)
-{
- return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len);
-}
+void *__pskb_pull_tail(struct sk_buff *skb, int delta);
static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
@@ -2397,6 +2627,15 @@ static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len)
return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;
}
+static inline void *pskb_pull(struct sk_buff *skb, unsigned int len)
+{
+ if (!pskb_may_pull(skb, len))
+ return NULL;
+
+ skb->len -= len;
+ return skb->data += len;
+}
+
void skb_condense(struct sk_buff *skb);
/**
@@ -2566,6 +2805,7 @@ static inline bool skb_transport_header_was_set(const struct sk_buff *skb)
static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
{
+ DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb));
return skb->head + skb->transport_header;
}
@@ -2597,8 +2837,14 @@ static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
skb->network_header += offset;
}
+static inline int skb_mac_header_was_set(const struct sk_buff *skb)
+{
+ return skb->mac_header != (typeof(skb->mac_header))~0U;
+}
+
static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
{
+ DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));
return skb->head + skb->mac_header;
}
@@ -2609,14 +2855,10 @@ static inline int skb_mac_offset(const struct sk_buff *skb)
static inline u32 skb_mac_header_len(const struct sk_buff *skb)
{
+ DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));
return skb->network_header - skb->mac_header;
}
-static inline int skb_mac_header_was_set(const struct sk_buff *skb)
-{
- return skb->mac_header != (typeof(skb->mac_header))~0U;
-}
-
static inline void skb_unset_mac_header(struct sk_buff *skb)
{
skb->mac_header = (typeof(skb->mac_header))~0U;
@@ -2839,8 +3081,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
if (likely(!skb_zcopy(skb)))
return 0;
- if (!skb_zcopy_is_nouarg(skb) &&
- skb_uarg(skb)->callback == msg_zerocopy_callback)
+ if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN)
return 0;
return skb_copy_ubufs(skb, gfp_mask);
}
@@ -3153,7 +3394,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
*/
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
- __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (!skb_zcopy_managed(skb))
+ __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}
/**
@@ -3484,7 +3728,12 @@ __skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
static inline void skb_postpull_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
{
- __skb_postpull_rcsum(skb, start, len, 0);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = wsum_negate(csum_partial(start, len,
+ wsum_negate(skb->csum)));
+ else if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_start_offset(skb) < 0)
+ skb->ip_summed = CHECKSUM_NONE;
}
static __always_inline void
@@ -3645,8 +3894,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
struct sk_buff *__skb_recv_datagram(struct sock *sk,
struct sk_buff_head *sk_queue,
unsigned int flags, int *off, int *err);
-struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
- int *err);
+struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err);
__poll_t datagram_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
@@ -3695,7 +3943,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features,
unsigned int offset);
struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
-int skb_ensure_writable(struct sk_buff *skb, int write_len);
+int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len);
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
int skb_vlan_pop(struct sk_buff *skb);
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
@@ -3852,6 +4100,7 @@ static inline void skb_get_new_timestampns(const struct sk_buff *skb,
static inline void __net_timestamp(struct sk_buff *skb)
{
skb->tstamp = ktime_get_real();
+ skb->mono_delivery_time = 0;
}
static inline ktime_t net_timedelta(ktime_t t)
@@ -3859,8 +4108,53 @@ static inline ktime_t net_timedelta(ktime_t t)
return ktime_sub(ktime_get_real(), t);
}
-static inline ktime_t net_invalid_timestamp(void)
+static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt,
+ bool mono)
+{
+ skb->tstamp = kt;
+ skb->mono_delivery_time = kt && mono;
+}
+
+DECLARE_STATIC_KEY_FALSE(netstamp_needed_key);
+
+/* It is used in the ingress path to clear the delivery_time.
+ * If needed, set the skb->tstamp to the (rcv) timestamp.
+ */
+static inline void skb_clear_delivery_time(struct sk_buff *skb)
+{
+ if (skb->mono_delivery_time) {
+ skb->mono_delivery_time = 0;
+ if (static_branch_unlikely(&netstamp_needed_key))
+ skb->tstamp = ktime_get_real();
+ else
+ skb->tstamp = 0;
+ }
+}
+
+static inline void skb_clear_tstamp(struct sk_buff *skb)
{
+ if (skb->mono_delivery_time)
+ return;
+
+ skb->tstamp = 0;
+}
+
+static inline ktime_t skb_tstamp(const struct sk_buff *skb)
+{
+ if (skb->mono_delivery_time)
+ return 0;
+
+ return skb->tstamp;
+}
+
+static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond)
+{
+ if (!skb->mono_delivery_time && skb->tstamp)
+ return skb->tstamp;
+
+ if (static_branch_unlikely(&netstamp_needed_key) || cond)
+ return ktime_get_real();
+
return 0;
}
@@ -4658,9 +4952,7 @@ static inline void skb_forward_csum(struct sk_buff *skb)
*/
static inline void skb_checksum_none_assert(const struct sk_buff *skb)
{
-#ifdef DEBUG
- BUG_ON(skb->ip_summed != CHECKSUM_NONE);
-#endif
+ DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE);
}
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
@@ -4720,7 +5012,7 @@ static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
#ifdef CONFIG_NET_REDIRECT
skb->from_ingress = from_ingress;
if (skb->from_ingress)
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
#endif
}