15 files changed, 1469 insertions, 189 deletions
diff --git a/Documentation/networking/.gitignore b/Documentation/networking/.gitignore
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/Documentation/networking/.gitignore
+++ /dev/null
diff --git a/Documentation/networking/Makefile b/Documentation/networking/Makefile
index 0aa1ac98fc2b..4c5d7c485439 100644
--- a/Documentation/networking/Makefile
+++ b/Documentation/networking/Makefile
@@ -1,7 +1 @@
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
-
-# Tell kbuild to always build the programs
-always := $(hostprogs-y)
-
-obj-m := timestamping/
+subdir-y := timestamping
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index eeb5b2e97bed..83bf4986baea 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -2230,11 +2230,8 @@ balance-rr: This mode is the only mode that will permit a single
 
 	It is possible to adjust TCP/IP's congestion limits by
 	altering the net.ipv4.tcp_reordering sysctl parameter.  The
-	usual default value is 3, and the maximum useful value is 127.
-	For a four interface balance-rr bond, expect that a single
-	TCP/IP stream will utilize no more than approximately 2.3
-	interface's worth of throughput, even after adjusting
-	tcp_reordering.
+	usual default value is 3. But keep in mind TCP stack is able
+	to automatically increase this when it detects reorders.
 
 	Note that the fraction of packets that will be delivered out of
 	order is highly variable, and is unlikely to be zero.  The level
diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt
new file mode 100644
index 000000000000..0d5dfbc89ec9
--- /dev/null
+++ b/Documentation/networking/dctcp.txt
@@ -0,0 +1,43 @@
+DCTCP (DataCenter TCP)
+----------------------
+
+DCTCP is an enhancement to the TCP congestion control algorithm for data
+center networks and leverages Explicit Congestion Notification (ECN) in
+the data center network to provide multi-bit feedback to the end hosts.
+
+To enable it on end hosts:
+
+  sysctl -w net.ipv4.tcp_congestion_control=dctcp
+
+All switches in the data center network running DCTCP must support ECN
+marking and be configured for marking when reaching defined switch buffer
+thresholds. The default ECN marking threshold heuristic for DCTCP on
+switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps,
+but might need further careful tweaking.
+
+For more details, see below documents:
+
+Paper:
+
+The algorithm is further described in detail in the following two
+SIGCOMM/SIGMETRICS papers:
+
+ i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
+    Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
+      "Data Center TCP (DCTCP)", Data Center Networks session
+      Proc. ACM SIGCOMM, New Delhi, 2010.
+    http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+    http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192
+
+ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
+      "Analysis of DCTCP: Stability, Convergence, and Fairness"
+      Proc. ACM SIGMETRICS, San Jose, 2011.
+    http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
+
+IETF informational draft:
+
+  http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00
+
+DCTCP site:
+
+  http://simula.stanford.edu/~alizade/Site/DCTCP.html
diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index d16f424c5e8d..58d08f8d8d80 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -462,9 +462,9 @@ JIT compiler
 ------------
 
 The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, PowerPC,
-ARM, MIPS and s390 and can be enabled through CONFIG_BPF_JIT. The JIT compiler
-is transparently invoked for each attached filter from user space or for
-internal kernel users if it has been previously enabled by root:
+ARM, ARM64, MIPS and s390 and can be enabled through CONFIG_BPF_JIT. The JIT
+compiler is transparently invoked for each attached filter from user space
+or for internal kernel users if it has been previously enabled by root:
 
   echo 1 > /proc/sys/net/core/bpf_jit_enable
 
@@ -700,11 +700,11 @@ Some core changes of the new internal format:
     bpf_exit
 
   If f2 is JITed and the pointer stored to '_f2'. The calls f1 -> f2 -> f3 and
-  returns will be seamless. Without JIT, __sk_run_filter() interpreter needs to
+  returns will be seamless. Without JIT, __bpf_prog_run() interpreter needs to
   be used to call into f2.
 
   For practical reasons all eBPF programs have only one argument 'ctx' which is
-  already placed into R1 (e.g. on __sk_run_filter() startup) and the programs
+  already placed into R1 (e.g. on __bpf_prog_run() startup) and the programs
   can call kernel functions with up to 5 arguments. Calls with 6 or more arguments
   are currently not supported, but these restrictions can be lifted if necessary
   in the future.
@@ -951,7 +951,7 @@ Size modifier is one of ...
 
 Mode modifier is one of:
 
-  BPF_IMM  0x00  /* classic BPF only, reserved in eBPF */
+  BPF_IMM  0x00  /* used for 32-bit mov in classic BPF and 64-bit in eBPF */
   BPF_ABS  0x20
   BPF_IND  0x40
   BPF_MEM  0x60
@@ -995,6 +995,275 @@ BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg
 Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and
 2 byte atomic increments are not supported.
 
+eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists
+of two consecutive 'struct bpf_insn' 8-byte blocks and interpreted as single
+instruction that loads 64-bit immediate value into a dst_reg.
+Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
+32-bit immediate value into a register.
+
+eBPF verifier
+-------------
+The safety of the eBPF program is determined in two steps.
+
+First step does DAG check to disallow loops and other CFG validation.
+In particular it will detect programs that have unreachable instructions.
+(though classic BPF checker allows them)
+
+Second step starts from the first insn and descends all possible paths.
+It simulates execution of every insn and observes the state change of
+registers and stack.
+
+At the start of the program the register R1 contains a pointer to context
+and has type PTR_TO_CTX.
+If verifier sees an insn that does R2=R1, then R2 has now type
+PTR_TO_CTX as well and can be used on the right hand side of expression.
+If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=UNKNOWN_VALUE,
+since addition of two valid pointers makes invalid pointer.
+(In 'secure' mode verifier will reject any type of pointer arithmetic to make
+sure that kernel addresses don't leak to unprivileged users)
+
+If register was never written to, it's not readable:
+  bpf_mov R0 = R2
+  bpf_exit
+will be rejected, since R2 is unreadable at the start of the program.
+
+After kernel function call, R1-R5 are reset to unreadable and
+R0 has a return type of the function.
+
+Since R6-R9 are callee saved, their state is preserved across the call.
+  bpf_mov R6 = 1
+  bpf_call foo
+  bpf_mov R0 = R6
+  bpf_exit
+is a correct program. If there was R1 instead of R6, it would have
+been rejected.
+
+load/store instructions are allowed only with registers of valid types, which
+are PTR_TO_CTX, PTR_TO_MAP, FRAME_PTR. They are bounds and alignment checked.
+For example:
+ bpf_mov R1 = 1
+ bpf_mov R2 = 2
+ bpf_xadd *(u32 *)(R1 + 3) += R2
+ bpf_exit
+will be rejected, since R1 doesn't have a valid pointer type at the time of
+execution of instruction bpf_xadd.
+
+At the start R1 type is PTR_TO_CTX (a pointer to generic 'struct bpf_context')
+A callback is used to customize verifier to restrict eBPF program access to only
+certain fields within ctx structure with specified size and alignment.
+
+For example, the following insn:
+  bpf_ld R0 = *(u32 *)(R6 + 8)
+intends to load a word from address R6 + 8 and store it into R0
+If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know
+that offset 8 of size 4 bytes can be accessed for reading, otherwise
+the verifier will reject the program.
+If R6=FRAME_PTR, then access should be aligned and be within
+stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8,
+so it will fail verification, since it's out of bounds.
+
+The verifier will allow eBPF program to read data from stack only after
+it wrote into it.
+Classic BPF verifier does similar check with M[0-15] memory slots.
+For example:
+  bpf_ld R0 = *(u32 *)(R10 - 4)
+  bpf_exit
+is invalid program.
+Though R10 is correct read-only register and has type FRAME_PTR
+and R10 - 4 is within stack bounds, there were no stores into that location.
+
+Pointer register spill/fill is tracked as well, since four (R6-R9)
+callee saved registers may not be enough for some programs.
+
+Allowed function calls are customized with bpf_verifier_ops->get_func_proto()
+The eBPF verifier will check that registers match argument constraints.
+After the call register R0 will be set to return type of the function.
+
+Function calls is a main mechanism to extend functionality of eBPF programs.
+Socket filters may let programs to call one set of functions, whereas tracing
+filters may allow completely different set.
+
+If a function made accessible to eBPF program, it needs to be thought through
+from safety point of view. The verifier will guarantee that the function is
+called with valid arguments.
+
+seccomp vs socket filters have different security restrictions for classic BPF.
+Seccomp solves this by two stage verifier: classic BPF verifier is followed
+by seccomp verifier. In case of eBPF one configurable verifier is shared for
+all use cases.
+
+See details of eBPF verifier in kernel/bpf/verifier.c
+
+eBPF maps
+---------
+'maps' is a generic storage of different types for sharing data between kernel
+and userspace.
+
+The maps are accessed from user space via BPF syscall, which has commands:
+- create a map with given type and attributes
+  map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
+  using attr->map_type, attr->key_size, attr->value_size, attr->max_entries
+  returns process-local file descriptor or negative error
+
+- lookup key in a given map
+  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
+  using attr->map_fd, attr->key, attr->value
+  returns zero and stores found elem into value or negative error
+
+- create or update key/value pair in a given map
+  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
+  using attr->map_fd, attr->key, attr->value
+  returns zero or negative error
+
+- find and delete element by key in a given map
+  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
+  using attr->map_fd, attr->key
+
+- to delete map: close(fd)
+  Exiting process will delete maps automatically
+
+userspace programs use this syscall to create/access maps that eBPF programs
+are concurrently updating.
+
+maps can have different types: hash, array, bloom filter, radix-tree, etc.
+
+The map is defined by:
+  . type
+  . max number of elements
+  . key size in bytes
+  . value size in bytes
+
+Understanding eBPF verifier messages
+------------------------------------
+
+The following are few examples of invalid eBPF programs and verifier error
+messages as seen in the log:
+
+Program with unreachable instructions:
+static struct bpf_insn prog[] = {
+  BPF_EXIT_INSN(),
+  BPF_EXIT_INSN(),
+};
+Error:
+  unreachable insn 1
+
+Program that reads uninitialized register:
+  BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+  BPF_EXIT_INSN(),
+Error:
+  0: (bf) r0 = r2
+  R2 !read_ok
+
+Program that doesn't initialize R0 before exiting:
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+  BPF_EXIT_INSN(),
+Error:
+  0: (bf) r2 = r1
+  1: (95) exit
+  R0 !read_ok
+
+Program that accesses stack out of bounds:
+  BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0),
+  BPF_EXIT_INSN(),
+Error:
+  0: (7a) *(u64 *)(r10 +8) = 0
+  invalid stack off=8 size=8
+
+Program that doesn't initialize stack before passing its address into function:
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_LD_MAP_FD(BPF_REG_1, 0),
+  BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+  BPF_EXIT_INSN(),
+Error:
+  0: (bf) r2 = r10
+  1: (07) r2 += -8
+  2: (b7) r1 = 0x0
+  3: (85) call 1
+  invalid indirect read from stack off -8+0 size 8
+
+Program that uses invalid map_fd=0 while calling to map_lookup_elem() function:
+  BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_LD_MAP_FD(BPF_REG_1, 0),
+  BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+  BPF_EXIT_INSN(),
+Error:
+  0: (7a) *(u64 *)(r10 -8) = 0
+  1: (bf) r2 = r10
+  2: (07) r2 += -8
+  3: (b7) r1 = 0x0
+  4: (85) call 1
+  fd 0 is not pointing to valid bpf_map
+
+Program that doesn't check return value of map_lookup_elem() before accessing
+map element:
+  BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_LD_MAP_FD(BPF_REG_1, 0),
+  BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+  BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+  BPF_EXIT_INSN(),
+Error:
+  0: (7a) *(u64 *)(r10 -8) = 0
+  1: (bf) r2 = r10
+  2: (07) r2 += -8
+  3: (b7) r1 = 0x0
+  4: (85) call 1
+  5: (7a) *(u64 *)(r0 +0) = 0
+  R0 invalid mem access 'map_value_or_null'
+
+Program that correctly checks map_lookup_elem() returned value for NULL, but
+accesses the memory with incorrect alignment:
+  BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_LD_MAP_FD(BPF_REG_1, 0),
+  BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+  BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+  BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
+  BPF_EXIT_INSN(),
+Error:
+  0: (7a) *(u64 *)(r10 -8) = 0
+  1: (bf) r2 = r10
+  2: (07) r2 += -8
+  3: (b7) r1 = 1
+  4: (85) call 1
+  5: (15) if r0 == 0x0 goto pc+1
+   R0=map_ptr R10=fp
+  6: (7a) *(u64 *)(r0 +4) = 0
+  misaligned access off 4 size 8
+
+Program that correctly checks map_lookup_elem() returned value for NULL and
+accesses memory with correct alignment in one side of 'if' branch, but fails
+to do so in the other side of 'if' branch:
+  BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_LD_MAP_FD(BPF_REG_1, 0),
+  BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+  BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+  BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+  BPF_EXIT_INSN(),
+  BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
+  BPF_EXIT_INSN(),
+Error:
+  0: (7a) *(u64 *)(r10 -8) = 0
+  1: (bf) r2 = r10
+  2: (07) r2 += -8
+  3: (b7) r1 = 1
+  4: (85) call 1
+  5: (15) if r0 == 0x0 goto pc+2
+   R0=map_ptr R10=fp
+  6: (7a) *(u64 *)(r0 +0) = 0
+  7: (95) exit
+
+  from 5 to 8: R0=imm0 R10=fp
+  8: (7a) *(u64 *)(r0 +0) = 1
+  R0 invalid mem access 'imm'
+
 Testing
 -------
 
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 29a93518bf18..9bffdfc648dc 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -56,6 +56,13 @@ ip_forward_use_pmtu - BOOLEAN
 	0 - disabled
 	1 - enabled
 
+fwmark_reflect - BOOLEAN
+	Controls the fwmark of kernel-generated IPv4 reply packets that are not
+	associated with a socket for example, TCP RSTs or ICMP echo replies).
+	If unset, these packets have a fwmark of zero. If set, they have the
+	fwmark of the packet they are replying to.
+	Default: 0
+
 route/max_size - INTEGER
 	Maximum number of routes allowed in the kernel.  Increase
 	this when using large numbers of interfaces and/or routes.
@@ -65,6 +72,12 @@ neigh/default/gc_thresh1 - INTEGER
 	purge entries if there are fewer than this number.
 	Default: 128
 
+neigh/default/gc_thresh2 - INTEGER
+	Threshold when garbage collector becomes more aggressive about
+	purging entries. Entries older than 5 seconds will be cleared
+	when over this number.
+	Default: 512
+
 neigh/default/gc_thresh3 - INTEGER
 	Maximum number of neighbor entries allowed.  Increase this
 	when using large numbers of interfaces and when communicating
@@ -370,9 +383,17 @@ tcp_orphan_retries - INTEGER
 	may consume significant resources. Cf. tcp_max_orphans.
 
 tcp_reordering - INTEGER
-	Maximal reordering of packets in a TCP stream.
+	Initial reordering level of packets in a TCP stream.
+	TCP stack can then dynamically adjust flow reordering level
+	between this initial value and tcp_max_reordering
 	Default: 3
 
+tcp_max_reordering - INTEGER
+	Maximal reordering level of packets in a TCP stream.
+	300 is a fairly conservative value, but you might increase it
+	if paths are using per packet load balancing (like bonding rr mode)
+	Default: 300
+
 tcp_retrans_collapse - BOOLEAN
 	Bug-to-bug compatibility with some broken printers.
 	On retransmit try to send bigger packets to work around bugs in
@@ -580,12 +601,6 @@ tcp_workaround_signed_windows - BOOLEAN
 	not receive a window scaling option from them.
 	Default: 0
 
-tcp_dma_copybreak - INTEGER
-	Lower limit, in bytes, of the size of socket reads that will be
-	offloaded to a DMA copy engine, if one is present in the system
-	and CONFIG_NET_DMA is enabled.
-	Default: 4096
-
 tcp_thin_linear_timeouts - BOOLEAN
 	Enable dynamic triggering of linear timeouts for thin streams.
 	If set, a check is performed upon retransmission by timeout to
@@ -763,8 +778,21 @@ icmp_ratelimit - INTEGER
 	icmp_ratemask (see below) to specific targets.
 	0 to disable any limiting,
 	otherwise the minimal space between responses in milliseconds.
+	Note that another sysctl, icmp_msgs_per_sec limits the number
+	of ICMP packets	sent on all targets.
+	Default: 1000
+
+icmp_msgs_per_sec - INTEGER
+	Limit maximal number of ICMP packets sent per second from this host.
+	Only messages whose type matches icmp_ratemask (see below) are
+	controlled by this limit.
 	Default: 1000
 
+icmp_msgs_burst - INTEGER
+	icmp_msgs_per_sec controls number of ICMP packets sent per second,
+	while icmp_msgs_burst controls the burst size of these packets.
+	Default: 50
+
 icmp_ratemask - INTEGER
 	Mask made of ICMP types for which rates are being limited.
 	Significant bits: IHGFEDCBA9876543210
@@ -838,6 +866,11 @@ igmp_max_memberships - INTEGER
 
 	conf/all/*	  is special, changes the settings for all interfaces
 
+igmp_qrv - INTEGER
+	 Controls the IGMP query robustness variable (see RFC2236 8.1).
+	 Default: 2 (as specified by RFC2236 8.1)
+	 Minimum: 1 (as specified by RFC6636 4.5)
+
 log_martians - BOOLEAN
 	Log packets with impossible addresses to kernel log.
 	log_martians for the interface will be enabled if at least one of
@@ -941,14 +974,9 @@ accept_source_route - BOOLEAN
 		FALSE (host)
 
 accept_local - BOOLEAN
-	Accept packets with local source addresses. In combination
-	with suitable routing, this can be used to direct packets
-	between two local interfaces over the wire and have them
-	accepted properly.
-
-	rp_filter must be set to a non-zero value in order for
-	accept_local to have an effect.
-
+	Accept packets with local source addresses. In combination with
+	suitable routing, this can be used to direct packets between two
+	local interfaces over the wire and have them accepted properly.
 	default FALSE
 
 route_localnet - BOOLEAN
@@ -1146,6 +1174,11 @@ anycast_src_echo_reply - BOOLEAN
 	FALSE: disabled
 	Default: FALSE
 
+mld_qrv - INTEGER
+	Controls the MLD query robustness variable (see RFC3810 9.1).
+	Default: 2 (as specified by RFC3810 9.1)
+	Minimum: 1 (as specified by RFC6636 4.5)
+
 IPv6 Fragmentation:
 
 ip6frag_high_thresh - INTEGER
@@ -1183,6 +1216,13 @@ conf/all/forwarding - BOOLEAN
 proxy_ndp - BOOLEAN
 	Do proxy ndp.
 
+fwmark_reflect - BOOLEAN
+	Controls the fwmark of kernel-generated IPv6 reply packets that are not
+	associated with a socket for example, TCP RSTs or ICMPv6 echo replies).
+	If unset, these packets have a fwmark of zero. If set, they have the
+	fwmark of the packet they are replying to.
+	Default: 0
+
 conf/interface/*:
 	Change special settings per interface.
 
@@ -1434,6 +1474,19 @@ suppress_frag_ndisc - INTEGER
 	1 - (default) discard fragmented neighbor discovery packets
 	0 - allow fragmented neighbor discovery packets
 
+optimistic_dad - BOOLEAN
+	Whether to perform Optimistic Duplicate Address Detection (RFC 4429).
+		0: disabled (default)
+		1: enabled
+
+use_optimistic - BOOLEAN
+	If enabled, do not classify optimistic addresses as deprecated during
+	source address selection.  Preferred addresses will still be chosen
+	before optimistic addresses, subject to other ranking in the source
+	address selection algorithm.
+		0: disabled (default)
+		1: enabled
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt
new file mode 100644
index 000000000000..cf996394e466
--- /dev/null
+++ b/Documentation/networking/ipvlan.txt
@@ -0,0 +1,107 @@
+
+                            IPVLAN Driver HOWTO
+
+Initial Release:
+	Mahesh Bandewar <maheshb AT google.com>
+
+1. Introduction:
+	This is conceptually very similar to the macvlan driver with one major
+exception of using L3 for mux-ing /demux-ing among slaves. This property makes
+the master device share the L2 with it's slave devices. I have developed this
+driver in conjuntion with network namespaces and not sure if there is use case
+outside of it.
+
+
+2. Building and Installation:
+	In order to build the driver, please select the config item CONFIG_IPVLAN.
+The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module
+(CONFIG_IPVLAN=m).
+
+
+3. Configuration:
+	There are no module parameters for this driver and it can be configured
+using IProute2/ip utility.
+
+	ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | L3 }
+
+	e.g. ip link add link ipvl0 eth0 type ipvlan mode l2
+
+
+4. Operating modes:
+	IPvlan has two modes of operation - L2 and L3. For a given master device,
+you can select one of these two modes and all slaves on that master will
+operate in the same (selected) mode. The RX mode is almost identical except
+that in L3 mode the slaves wont receive any multicast / broadcast traffic.
+L3 mode is more restrictive since routing is controlled from the other (mostly)
+default namespace.
+
+4.1 L2 mode:
+	In this mode TX processing happens on the stack instance attached to the
+slave device and packets are switched and queued to the master device to send
+out. In this mode the slaves will RX/TX multicast and broadcast (if applicable)
+as well.
+
+4.2 L3 mode:
+	In this mode TX processing upto L3 happens on the stack instance attached
+to the slave device and packets are switched to the stack instance of the
+master device for the L2 processing and routing from that instance will be
+used before packets are queued on the outbound device. In this mode the slaves
+will not receive nor can send multicast / broadcast traffic.
+
+
+5. What to choose (macvlan vs. ipvlan)?
+	These two devices are very similar in many regards and the specific use
+case could very well define which device to choose. if one of the following
+situations defines your use case then you can choose to use ipvlan -
+	(a) The Linux host that is connected to the external switch / router has
+policy configured that allows only one mac per port.
+	(b) No of virtual devices created on a master exceed the mac capacity and
+puts the NIC in promiscous mode and degraded performance is a concern.
+	(c) If the slave device is to be put into the hostile / untrusted network
+namespace where L2 on the slave could be changed / misused.
+
+
+6. Example configuration:
+
+  +=============================================================+
+  |  Host: host1                                                |
+  |                                                             |
+  |   +----------------------+      +----------------------+    |
+  |   |   NS:ns0             |      |  NS:ns1              |    |
+  |   |                      |      |                      |    |
+  |   |                      |      |                      |    |
+  |   |        ipvl0         |      |         ipvl1        |    |
+  |   +----------#-----------+      +-----------#----------+    |
+  |              #                              #               |
+  |              ################################               |
+  |                              # eth0                         |
+  +==============================#==============================+
+
+
+	(a) Create two network namespaces - ns0, ns1
+		ip netns add ns0
+		ip netns add ns1
+
+	(b) Create two ipvlan slaves on eth0 (master device)
+		ip link add link eth0 ipvl0 type ipvlan mode l2
+		ip link add link eth0 ipvl1 type ipvlan mode l2
+
+	(c) Assign slaves to the respective network namespaces
+		ip link set dev ipvl0 netns ns0
+		ip link set dev ipvl1 netns ns1
+
+	(d) Now switch to the namespace (ns0 or ns1) to configure the slave devices
+		- For ns0
+			(1) ip netns exec ns0 bash
+			(2) ip link set dev ipvl0 up
+			(3) ip link set dev lo up
+			(4) ip -4 addr add 127.0.0.1 dev lo
+			(5) ip -4 addr add $IPADDR dev ipvl0
+			(6) ip -4 route add default via $ROUTER dev ipvl0
+		- For ns1
+			(1) ip netns exec ns1 bash
+			(2) ip link set dev ipvl1 up
+			(3) ip link set dev lo up
+			(4) ip -4 addr add 127.0.0.1 dev lo
+			(5) ip -4 addr add $IPADDR dev ipvl1
+			(6) ip -4 route add default via $ROUTER dev ipvl1
diff --git a/Documentation/networking/ixgbe.txt b/Documentation/networking/ixgbe.txt
index 96cccebb839b..0ace6e776ac8 100644
--- a/Documentation/networking/ixgbe.txt
+++ b/Documentation/networking/ixgbe.txt
@@ -138,7 +138,7 @@ Other ethtool Commands:
 To enable Flow Director
 	ethtool -K ethX ntuple on
 To add a filter
-	Use -U switch. e.g., ethtool -U ethX flow-type tcp4 src-ip 0x178000a
+	Use -U switch. e.g., ethtool -U ethX flow-type tcp4 src-ip 10.0.128.23
         action 1
 To see the list of filters currently present:
 	ethtool -u ethX
diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt
index 0dffc6e37902..6915c6b27869 100644
--- a/Documentation/networking/pktgen.txt
+++ b/Documentation/networking/pktgen.txt
@@ -99,6 +99,9 @@ Examples:
 
  pgset "clone_skb 1"     sets the number of copies of the same packet
  pgset "clone_skb 0"     use single SKB for all transmits
+ pgset "burst 8"         uses xmit_more API to queue 8 copies of the same
+                         packet and update HW tx queue tail pointer once.
+                         "burst 1" is the default
  pgset "pkt_size 9014"   sets packet size to 9014
  pgset "frags 5"         packet will consist of 5 fragments
  pgset "count 200000"    sets number of packets to send, set to zero
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index 2090895b08d4..e655e2453c98 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -1,12 +1,12 @@
        STMicroelectronics 10/100/1000 Synopsys Ethernet driver
 
-Copyright (C) 2007-2013  STMicroelectronics Ltd
+Copyright (C) 2007-2014  STMicroelectronics Ltd
 Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 
 This is the driver for the MAC 10/100/1000 on-chip Ethernet controllers
 (Synopsys IP blocks).
 
-Currently this network device driver is for all STM embedded MAC/GMAC
+Currently this network device driver is for all STi embedded MAC/GMAC
 (i.e. 7xxx/5xxx SoCs), SPEAr (arm), Loongson1B (mips) and XLINX XC2V3000
 FF1152AMT0221 D1215994A VIRTEX FPGA board.
 
@@ -22,6 +22,9 @@ The kernel configuration option is STMMAC_ETH:
  Device Drivers ---> Network device support ---> Ethernet (1000 Mbit) --->
  STMicroelectronics 10/100/1000 Ethernet driver (STMMAC_ETH)
 
+CONFIG_STMMAC_PLATFORM: is to enable the platform driver.
+CONFIG_STMMAC_PCI: is to enable the pci driver.
+
 2) Driver parameters list:
 	debug: message level (0: no output, 16: all);
 	phyaddr: to manually provide the physical address to the PHY device;
@@ -45,10 +48,11 @@ Driver parameters can be also passed in command line by using:
 The xmit method is invoked when the kernel needs to transmit a packet; it sets
 the descriptors in the ring and informs the DMA engine that there is a packet
 ready to be transmitted.
-Once the controller has finished transmitting the packet, an interrupt is
-triggered; So the driver will be able to release the socket buffers.
 By default, the driver sets the NETIF_F_SG bit in the features field of the
-net_device structure enabling the scatter/gather feature.
+net_device structure enabling the scatter-gather feature. This is true on
+chips and configurations where the checksum can be done in hardware.
+Once the controller has finished transmitting the packet, napi will be
+scheduled to release the transmit resources.
 
 4.2) Receive process
 When one or more packets are received, an interrupt happens. The interrupts
@@ -58,20 +62,12 @@ This is based on NAPI so the interrupt handler signals only if there is work
 to be done, and it exits.
 Then the poll method will be scheduled at some future point.
 The incoming packets are stored, by the DMA, in a list of pre-allocated socket
-buffers in order to avoid the memcpy (Zero-copy).
+buffers in order to avoid the memcpy (zero-copy).
 
 4.3) Interrupt Mitigation
 The driver is able to mitigate the number of its DMA interrupts
 using NAPI for the reception on chips older than the 3.50.
 New chips have an HW RX-Watchdog used for this mitigation.
-
-On Tx-side, the mitigation schema is based on a SW timer that calls the
-tx function (stmmac_tx) to reclaim the resource after transmitting the
-frames.
-Also there is another parameter (like a threshold) used to program
-the descriptors avoiding to set the interrupt on completion bit in
-when the frame is sent (xmit).
-
 Mitigation parameters can be tuned by ethtool.
 
 4.4) WOL
@@ -79,7 +75,7 @@ Wake up on Lan feature through Magic and Unicast frames are supported for the
 GMAC core.
 
 4.5) DMA descriptors
-Driver handles both normal and enhanced descriptors. The latter has been only
+Driver handles both normal and alternate descriptors. The latter has been only
 tested on DWC Ether MAC 10/100/1000 Universal version 3.41a and later.
 
 STMMAC supports DMA descriptor to operate both in dual buffer (RING)
@@ -91,9 +87,20 @@ In CHAINED mode each descriptor will have pointer to next descriptor in
 the list, hence creating the explicit chaining in the descriptor itself,
 whereas such explicit chaining is not possible in RING mode.
 
+4.5.1) Extended descriptors
+	The extended descriptors give us information about the Ethernet payload
+	when it is carrying PTP packets or TCP/UDP/ICMP over IP.
+	These are not available on GMAC Synopsys chips older than the 3.50.
+	At probe time the driver will decide if these can be actually used.
+	This support also is mandatory for PTPv2 because the extra descriptors
+	are used for saving the hardware timestamps and Extended Status.
+
 4.6) Ethtool support
-Ethtool is supported. Driver statistics and internal errors can be taken using:
-ethtool -S ethX command. It is possible to dump registers etc.
+Ethtool is supported.
+
+For example, driver statistics (including RMON), internal errors can be taken
+using:
+  # ethtool -S ethX command
 
 4.7) Jumbo and Segmentation Offloading
 Jumbo frames are supported and tested for the GMAC.
@@ -101,12 +108,11 @@ The GSO has been also added but it's performed in software.
 LRO is not supported.
 
 4.8) Physical
-The driver is compatible with PAL to work with PHY and GPHY devices.
+The driver is compatible with Physical Abstraction Layer to be connected with
+PHY and GPHY devices.
 
 4.9) Platform information
-Several driver's information can be passed through the platform
-These are included in the include/linux/stmmac.h header file
-and detailed below as well:
+Several information can be passed through the platform and device-tree.
 
 struct plat_stmmacenet_data {
 	char *phy_bus_name;
@@ -125,15 +131,18 @@ struct plat_stmmacenet_data {
 	int force_sf_dma_mode;
 	int force_thresh_dma_mode;
 	int riwt_off;
+	int max_speed;
+	int maxmtu;
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
 	void (*bus_setup)(void __iomem *ioaddr);
 	void *(*setup)(struct platform_device *pdev);
+	void (*free)(struct platform_device *pdev, void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *custom_cfg;
 	void *custom_data;
 	void *bsp_priv;
- };
+};
 
 Where:
  o phy_bus_name: phy bus name to attach to the stmmac.
@@ -258,32 +267,43 @@ and the second one, with a real PHY device attached to the bus,
 by using the stmmac_mdio_bus_data structure (to provide the id, the
 reset procedure etc).
 
-4.10) List of source files:
- o Kconfig
- o Makefile
- o stmmac_main.c: main network device driver;
- o stmmac_mdio.c: mdio functions;
- o stmmac_pci: PCI driver;
- o stmmac_platform.c: platform driver
- o stmmac_ethtool.c: ethtool support;
- o stmmac_timer.[ch]: timer code used for mitigating the driver dma interrupts
-		      (only tested on ST40 platforms based);
+Note that, starting from new chips, where it is available the HW capability
+register, many configurations are discovered at run-time for example to
+understand if EEE, HW csum, PTP, enhanced descriptor etc are actually
+available. As strategy adopted in this driver, the information from the HW
+capability register can replace what has been passed from the platform.
+
+4.10) Device-tree support.
+
+Please see the following document:
+	Documentation/devicetree/bindings/net/stmmac.txt
+
+and the stmmac_of_data structure inside the include/linux/stmmac.h header file.
+
+4.11) This is a summary of the content of some relevant files:
+ o stmmac_main.c: to implement the main network device driver;
+ o stmmac_mdio.c: to provide mdio functions;
+ o stmmac_pci: this the PCI driver;
+ o stmmac_platform.c: this the platform driver (OF supported)
+ o stmmac_ethtool.c: to implement the ethtool support;
  o stmmac.h: private driver structure;
  o common.h: common definitions and VFTs;
  o descs.h: descriptor structure definitions;
- o dwmac1000_core.c: GMAC core functions;
- o dwmac1000_dma.c:  dma functions for the GMAC chip;
- o dwmac1000.h: specific header file for the GMAC;
- o dwmac100_core: MAC 100 core and dma code;
- o dwmac100_dma.c: dma functions for the MAC chip;
+ o dwmac1000_core.c: dwmac GiGa core functions;
+ o dwmac1000_dma.c: dma functions for the GMAC chip;
+ o dwmac1000.h: specific header file for the dwmac GiGa;
+ o dwmac100_core: dwmac 100 core code;
+ o dwmac100_dma.c: dma functions for the dwmac 100 chip;
  o dwmac1000.h: specific header file for the MAC;
- o dwmac_lib.c: generic DMA functions shared among chips;
+ o dwmac_lib.c: generic DMA functions;
  o enh_desc.c: functions for handling enhanced descriptors;
  o norm_desc.c: functions for handling normal descriptors;
  o chain_mode.c/ring_mode.c:: functions to manage RING/CHAINED modes;
  o mmc_core.c/mmc.h: Management MAC Counters;
- o stmmac_hwtstamp.c: HW timestamp support for PTP
- o stmmac_ptp.c: PTP 1588 clock
+ o stmmac_hwtstamp.c: HW timestamp support for PTP;
+ o stmmac_ptp.c: PTP 1588 clock;
+ o dwmac-<XXX>.c: these are for the platform glue-logic file; e.g. dwmac-sti.c
+   for STMicroelectronics SoCs.
 
 5) Debug Information
 
@@ -298,23 +318,14 @@ to get statistics: e.g. using: ethtool -S ethX
 (that shows the Management counters (MMC) if supported)
 or sees the MAC/DMA registers: e.g. using: ethtool -d ethX
 
-Compiling the Kernel with CONFIG_DEBUG_FS and enabling the
-STMMAC_DEBUG_FS option the driver will export the following
+Compiling the Kernel with CONFIG_DEBUG_FS the driver will export the following
 debugfs entries:
 
 /sys/kernel/debug/stmmaceth/descriptors_status
   To show the DMA TX/RX descriptor rings
 
-Developer can also use the "debug" module parameter to get
-further debug information.
-
-In the end, there are other macros (that cannot be enabled
-via menuconfig) to turn-on the RX/TX DMA debugging,
-specific MAC core debug printk etc. Others to enable the
-debug in the TX and RX processes.
-All these are only useful during the developing stage
-and should never enabled inside the code for general usage.
-In fact, these can generate an huge amount of debug messages.
+Developer can also use the "debug" module parameter to get further debug
+information (please see: NETIF Msg Level).
 
 6) Energy Efficient Ethernet
 
@@ -337,15 +348,7 @@ To enter in Tx LPI mode the driver needs to have a software timer
 that enable and disable the LPI mode when there is nothing to be
 transmitted.
 
-7) Extended descriptors
-The extended descriptors give us information about the receive Ethernet payload
-when it is carrying PTP packets or TCP/UDP/ICMP over IP.
-These are not available on GMAC Synopsys chips older than the 3.50.
-At probe time the driver will decide if these can be actually used.
-This support also is mandatory for PTPv2 because the extra descriptors 6 and 7
-are used for saving the hardware timestamps.
-
-8) Precision Time Protocol (PTP)
+7) Precision Time Protocol (PTP)
 The driver supports the IEEE 1588-2002, Precision Time Protocol (PTP),
 which enables precise synchronization of clocks in measurement and
 control systems implemented with technologies such as network
@@ -355,7 +358,7 @@ In addition to the basic timestamp features mentioned in IEEE 1588-2002
 Timestamps, new GMAC cores support the advanced timestamp features.
 IEEE 1588-2008 that can be enabled when configure the Kernel.
 
-9) SGMII/RGMII supports
+8) SGMII/RGMII supports
 New GMAC devices provide own way to manage RGMII/SGMII.
 This information is available at run-time by looking at the
 HW capability register. This means that the stmmac can manage
@@ -364,8 +367,3 @@ In fact, the HW provides a subset of extended registers to
 restart the ANE, verify Full/Half duplex mode and Speed.
 Also thanks to these registers it is possible to look at the
 Auto-negotiated Link Parter Ability.
-
-10) TODO:
- o XGMAC is not supported.
- o Complete the TBI & RTBI support.
- o extend VLAN support for 3.70a SYNP GMAC.
diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt
new file mode 100644
index 000000000000..f981a9295a39
--- /dev/null
+++ b/Documentation/networking/switchdev.txt
@@ -0,0 +1,59 @@
+Switch (and switch-ish) device drivers HOWTO
+===========================
+
+Please note that the word "switch" is here used in very generic meaning.
+This include devices supporting L2/L3 but also various flow offloading chips,
+including switches embedded into SR-IOV NICs.
+
+Lets describe a topology a bit. Imagine the following example:
+
+       +----------------------------+    +---------------+
+       |     SOME switch chip       |    |      CPU      |
+       +----------------------------+    +---------------+
+       port1 port2 port3 port4 MNGMNT    |     PCI-E     |
+         |     |     |     |     |       +---------------+
+        PHY   PHY    |     |     |         |  NIC0 NIC1
+                     |     |     |         |   |    |
+                     |     |     +- PCI-E -+   |    |
+                     |     +------- MII -------+    |
+                     +------------- MII ------------+
+
+In this example, there are two independent lines between the switch silicon
+and CPU. NIC0 and NIC1 drivers are not aware of a switch presence. They are
+separate from the switch driver. SOME switch chip is by managed by a driver
+via PCI-E device MNGMNT. Note that MNGMNT device, NIC0 and NIC1 may be
+connected to some other type of bus.
+
+Now, for the previous example show the representation in kernel:
+
+       +----------------------------+    +---------------+
+       |     SOME switch chip       |    |      CPU      |
+       +----------------------------+    +---------------+
+       sw0p0 sw0p1 sw0p2 sw0p3 MNGMNT    |     PCI-E     |
+         |     |     |     |     |       +---------------+
+        PHY   PHY    |     |     |         |  eth0 eth1
+                     |     |     |         |   |    |
+                     |     |     +- PCI-E -+   |    |
+                     |     +------- MII -------+    |
+                     +------------- MII ------------+
+
+Lets call the example switch driver for SOME switch chip "SOMEswitch". This
+driver takes care of PCI-E device MNGMNT. There is a netdevice instance sw0pX
+created for each port of a switch. These netdevices are instances
+of "SOMEswitch" driver. sw0pX netdevices serve as a "representation"
+of the switch chip. eth0 and eth1 are instances of some other existing driver.
+
+The only difference of the switch-port netdevice from the ordinary netdevice
+is that is implements couple more NDOs:
+
+  ndo_switch_parent_id_get - This returns the same ID for two port netdevices
+			     of the same physical switch chip. This is
+			     mandatory to be implemented by all switch drivers
+			     and serves the caller for recognition of a port
+			     netdevice.
+  ndo_switch_parent_* - Functions that serve for a manipulation of the switch
+			chip itself (it can be though of as a "parent" of the
+			port, therefore the name). They are not port-specific.
+			Caller might use arbitrary port netdevice of the same
+			switch and it will make no difference.
+  ndo_switch_port_* - Functions that serve for a port-specific manipulation.
diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
index 897f942b976b..a5c784c89312 100644
--- a/Documentation/networking/timestamping.txt
+++ b/Documentation/networking/timestamping.txt
@@ -1,102 +1,324 @@
-The existing interfaces for getting network packages time stamped are:
+
+1. Control Interfaces
+
+The interfaces for receiving network packages timestamps are:
 
 * SO_TIMESTAMP
-  Generate time stamp for each incoming packet using the (not necessarily
-  monotonous!) system time. Result is returned via recv_msg() in a
-  control message as timeval (usec resolution).
+  Generates a timestamp for each incoming packet in (not necessarily
+  monotonic) system time. Reports the timestamp via recvmsg() in a
+  control message as struct timeval (usec resolution).
 
 * SO_TIMESTAMPNS
-  Same time stamping mechanism as SO_TIMESTAMP, but returns result as
-  timespec (nsec resolution).
+  Same timestamping mechanism as SO_TIMESTAMP, but reports the
+  timestamp as struct timespec (nsec resolution).
 
 * IP_MULTICAST_LOOP + SO_TIMESTAMP[NS]
-  Only for multicasts: approximate send time stamp by receiving the looped
-  packet and using its receive time stamp.
+  Only for multicast:approximate transmit timestamp obtained by
+  reading the looped packet receive timestamp.
 
-The following interface complements the existing ones: receive time
-stamps can be generated and returned for arbitrary packets and much
-closer to the point where the packet is really sent. Time stamps can
-be generated in software (as before) or in hardware (if the hardware
-has such a feature).
+* SO_TIMESTAMPING
+  Generates timestamps on reception, transmission or both. Supports
+  multiple timestamp sources, including hardware. Supports generating
+  timestamps for stream sockets.
 
-SO_TIMESTAMPING:
 
-Instructs the socket layer which kind of information should be collected
-and/or reported.  The parameter is an integer with some of the following
-bits set. Setting other bits is an error and doesn't change the current
-state.
+1.1 SO_TIMESTAMP:
 
-Four of the bits are requests to the stack to try to generate
-timestamps.  Any combination of them is valid.
+This socket option enables timestamping of datagrams on the reception
+path. Because the destination socket, if any, is not known early in
+the network stack, the feature has to be enabled for all packets. The
+same is true for all early receive timestamp options.
 
-SOF_TIMESTAMPING_TX_HARDWARE:  try to obtain send time stamps in hardware
-SOF_TIMESTAMPING_TX_SOFTWARE:  try to obtain send time stamps in software
-SOF_TIMESTAMPING_RX_HARDWARE:  try to obtain receive time stamps in hardware
-SOF_TIMESTAMPING_RX_SOFTWARE:  try to obtain receive time stamps in software
+For interface details, see `man 7 socket`.
+
+
+1.2 SO_TIMESTAMPNS:
+
+This option is identical to SO_TIMESTAMP except for the returned data type.
+Its struct timespec allows for higher resolution (ns) timestamps than the
+timeval of SO_TIMESTAMP (ms).
+
+
+1.3 SO_TIMESTAMPING:
+
+Supports multiple types of timestamp requests. As a result, this
+socket option takes a bitmap of flags, not a boolean. In
+
+  err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, (void *) val, &val);
+
+val is an integer with any of the following bits set. Setting other
+bit returns EINVAL and does not change the current state.
 
-The other three bits control which timestamps will be reported in a
-generated control message.  If none of these bits are set or if none of
-the set bits correspond to data that is available, then the control
-message will not be generated:
 
-SOF_TIMESTAMPING_SOFTWARE:     report systime if available
-SOF_TIMESTAMPING_SYS_HARDWARE: report hwtimetrans if available (deprecated)
-SOF_TIMESTAMPING_RAW_HARDWARE: report hwtimeraw if available
+1.3.1 Timestamp Generation
 
-It is worth noting that timestamps may be collected for reasons other
-than being requested by a particular socket with
-SOF_TIMESTAMPING_[TR]X_(HARD|SOFT)WARE.  For example, most drivers that
-can generate hardware receive timestamps ignore
-SOF_TIMESTAMPING_RX_HARDWARE.  It is still a good idea to set that flag
-in case future drivers pay attention.
+Some bits are requests to the stack to try to generate timestamps. Any
+combination of them is valid. Changes to these bits apply to newly
+created packets, not to packets already in the stack. As a result, it
+is possible to selectively request timestamps for a subset of packets
+(e.g., for sampling) by embedding an send() call within two setsockopt
+calls, one to enable timestamp generation and one to disable it.
+Timestamps may also be generated for reasons other than being
+requested by a particular socket, such as when receive timestamping is
+enabled system wide, as explained earlier.
 
-If timestamps are reported, they will appear in a control message with
-cmsg_level==SOL_SOCKET, cmsg_type==SO_TIMESTAMPING, and a payload like
-this:
+SOF_TIMESTAMPING_RX_HARDWARE:
+  Request rx timestamps generated by the network adapter.
+
+SOF_TIMESTAMPING_RX_SOFTWARE:
+  Request rx timestamps when data enters the kernel. These timestamps
+  are generated just after a device driver hands a packet to the
+  kernel receive stack.
+
+SOF_TIMESTAMPING_TX_HARDWARE:
+  Request tx timestamps generated by the network adapter.
+
+SOF_TIMESTAMPING_TX_SOFTWARE:
+  Request tx timestamps when data leaves the kernel. These timestamps
+  are generated in the device driver as close as possible, but always
+  prior to, passing the packet to the network interface. Hence, they
+  require driver support and may not be available for all devices.
+
+SOF_TIMESTAMPING_TX_SCHED:
+  Request tx timestamps prior to entering the packet scheduler. Kernel
+  transmit latency is, if long, often dominated by queuing delay. The
+  difference between this timestamp and one taken at
+  SOF_TIMESTAMPING_TX_SOFTWARE will expose this latency independent
+  of protocol processing. The latency incurred in protocol
+  processing, if any, can be computed by subtracting a userspace
+  timestamp taken immediately before send() from this timestamp. On
+  machines with virtual devices where a transmitted packet travels
+  through multiple devices and, hence, multiple packet schedulers,
+  a timestamp is generated at each layer. This allows for fine
+  grained measurement of queuing delay.
+
+SOF_TIMESTAMPING_TX_ACK:
+  Request tx timestamps when all data in the send buffer has been
+  acknowledged. This only makes sense for reliable protocols. It is
+  currently only implemented for TCP. For that protocol, it may
+  over-report measurement, because the timestamp is generated when all
+  data up to and including the buffer at send() was acknowledged: the
+  cumulative acknowledgment. The mechanism ignores SACK and FACK.
+
+
+1.3.2 Timestamp Reporting
+
+The other three bits control which timestamps will be reported in a
+generated control message. Changes to the bits take immediate
+effect at the timestamp reporting locations in the stack. Timestamps
+are only reported for packets that also have the relevant timestamp
+generation request set.
+
+SOF_TIMESTAMPING_SOFTWARE:
+  Report any software timestamps when available.
+
+SOF_TIMESTAMPING_SYS_HARDWARE:
+  This option is deprecated and ignored.
+
+SOF_TIMESTAMPING_RAW_HARDWARE:
+  Report hardware timestamps as generated by
+  SOF_TIMESTAMPING_TX_HARDWARE when available.
+
+
+1.3.3 Timestamp Options
+
+The interface supports the options
+
+SOF_TIMESTAMPING_OPT_ID:
+
+  Generate a unique identifier along with each packet. A process can
+  have multiple concurrent timestamping requests outstanding. Packets
+  can be reordered in the transmit path, for instance in the packet
+  scheduler. In that case timestamps will be queued onto the error
+  queue out of order from the original send() calls. It is not always
+  possible to uniquely match timestamps to the original send() calls
+  based on timestamp order or payload inspection alone, then.
+
+  This option associates each packet at send() with a unique
+  identifier and returns that along with the timestamp. The identifier
+  is derived from a per-socket u32 counter (that wraps). For datagram
+  sockets, the counter increments with each sent packet. For stream
+  sockets, it increments with every byte.
+
+  The counter starts at zero. It is initialized the first time that
+  the socket option is enabled. It is reset each time the option is
+  enabled after having been disabled. Resetting the counter does not
+  change the identifiers of existing packets in the system.
+
+  This option is implemented only for transmit timestamps. There, the
+  timestamp is always looped along with a struct sock_extended_err.
+  The option modifies field ee_data to pass an id that is unique
+  among all possibly concurrently outstanding timestamp requests for
+  that socket.
+
+
+SOF_TIMESTAMPING_OPT_CMSG:
+
+  Support recv() cmsg for all timestamped packets. Control messages
+  are already supported unconditionally on all packets with receive
+  timestamps and on IPv6 packets with transmit timestamp. This option
+  extends them to IPv4 packets with transmit timestamp. One use case
+  is to correlate packets with their egress device, by enabling socket
+  option IP_PKTINFO simultaneously.
+
+
+1.4 Bytestream Timestamps
+
+The SO_TIMESTAMPING interface supports timestamping of bytes in a
+bytestream. Each request is interpreted as a request for when the
+entire contents of the buffer has passed a timestamping point. That
+is, for streams option SOF_TIMESTAMPING_TX_SOFTWARE will record
+when all bytes have reached the device driver, regardless of how
+many packets the data has been converted into.
+
+In general, bytestreams have no natural delimiters and therefore
+correlating a timestamp with data is non-trivial. A range of bytes
+may be split across segments, any segments may be merged (possibly
+coalescing sections of previously segmented buffers associated with
+independent send() calls). Segments can be reordered and the same
+byte range can coexist in multiple segments for protocols that
+implement retransmissions.
+
+It is essential that all timestamps implement the same semantics,
+regardless of these possible transformations, as otherwise they are
+incomparable. Handling "rare" corner cases differently from the
+simple case (a 1:1 mapping from buffer to skb) is insufficient
+because performance debugging often needs to focus on such outliers.
+
+In practice, timestamps can be correlated with segments of a
+bytestream consistently, if both semantics of the timestamp and the
+timing of measurement are chosen correctly. This challenge is no
+different from deciding on a strategy for IP fragmentation. There, the
+definition is that only the first fragment is timestamped. For
+bytestreams, we chose that a timestamp is generated only when all
+bytes have passed a point. SOF_TIMESTAMPING_TX_ACK as defined is easy to
+implement and reason about. An implementation that has to take into
+account SACK would be more complex due to possible transmission holes
+and out of order arrival.
+
+On the host, TCP can also break the simple 1:1 mapping from buffer to
+skbuff as a result of Nagle, cork, autocork, segmentation and GSO. The
+implementation ensures correctness in all cases by tracking the
+individual last byte passed to send(), even if it is no longer the
+last byte after an skbuff extend or merge operation. It stores the
+relevant sequence number in skb_shinfo(skb)->tskey. Because an skbuff
+has only one such field, only one timestamp can be generated.
+
+In rare cases, a timestamp request can be missed if two requests are
+collapsed onto the same skb. A process can detect this situation by
+enabling SOF_TIMESTAMPING_OPT_ID and comparing the byte offset at
+send time with the value returned for each timestamp. It can prevent
+the situation by always flushing the TCP stack in between requests,
+for instance by enabling TCP_NODELAY and disabling TCP_CORK and
+autocork.
+
+These precautions ensure that the timestamp is generated only when all
+bytes have passed a timestamp point, assuming that the network stack
+itself does not reorder the segments. The stack indeed tries to avoid
+reordering. The one exception is under administrator control: it is
+possible to construct a packet scheduler configuration that delays
+segments from the same stream differently. Such a setup would be
+unusual.
+
+
+2 Data Interfaces
+
+Timestamps are read using the ancillary data feature of recvmsg().
+See `man 3 cmsg` for details of this interface. The socket manual
+page (`man 7 socket`) describes how timestamps generated with
+SO_TIMESTAMP and SO_TIMESTAMPNS records can be retrieved.
+
+
+2.1 SCM_TIMESTAMPING records
+
+These timestamps are returned in a control message with cmsg_level
+SOL_SOCKET, cmsg_type SCM_TIMESTAMPING, and payload of type
 
 struct scm_timestamping {
-	struct timespec systime;
-	struct timespec hwtimetrans;
-	struct timespec hwtimeraw;
+	struct timespec ts[3];
 };
 
-recvmsg() can be used to get this control message for regular incoming
-packets. For send time stamps the outgoing packet is looped back to
-the socket's error queue with the send time stamp(s) attached. It can
-be received with recvmsg(flags=MSG_ERRQUEUE). The call returns the
-original outgoing packet data including all headers preprended down to
-and including the link layer, the scm_timestamping control message and
-a sock_extended_err control message with ee_errno==ENOMSG and
-ee_origin==SO_EE_ORIGIN_TIMESTAMPING. A socket with such a pending
-bounced packet is ready for reading as far as select() is concerned.
-If the outgoing packet has to be fragmented, then only the first
-fragment is time stamped and returned to the sending socket.
-
-All three values correspond to the same event in time, but were
-generated in different ways. Each of these values may be empty (= all
-zero), in which case no such value was available. If the application
-is not interested in some of these values, they can be left blank to
-avoid the potential overhead of calculating them.
-
-systime is the value of the system time at that moment. This
-corresponds to the value also returned via SO_TIMESTAMP[NS]. If the
-time stamp was generated by hardware, then this field is
-empty. Otherwise it is filled in if SOF_TIMESTAMPING_SOFTWARE is
-set.
-
-hwtimeraw is the original hardware time stamp. Filled in if
-SOF_TIMESTAMPING_RAW_HARDWARE is set. No assumptions about its
-relation to system time should be made.
-
-hwtimetrans is always zero. This field is deprecated. It used to hold
-hw timestamps converted to system time. Instead, expose the hardware
-clock device on the NIC directly as a HW PTP clock source, to allow
-time conversion in userspace and optionally synchronize system time
-with a userspace PTP stack such as linuxptp. For the PTP clock API,
-see Documentation/ptp/ptp.txt.
-
-
-SIOCSHWTSTAMP, SIOCGHWTSTAMP:
+The structure can return up to three timestamps. This is a legacy
+feature. Only one field is non-zero at any time. Most timestamps
+are passed in ts[0]. Hardware timestamps are passed in ts[2].
+
+ts[1] used to hold hardware timestamps converted to system time.
+Instead, expose the hardware clock device on the NIC directly as
+a HW PTP clock source, to allow time conversion in userspace and
+optionally synchronize system time with a userspace PTP stack such
+as linuxptp. For the PTP clock API, see Documentation/ptp/ptp.txt.
+
+2.1.1 Transmit timestamps with MSG_ERRQUEUE
+
+For transmit timestamps the outgoing packet is looped back to the
+socket's error queue with the send timestamp(s) attached. A process
+receives the timestamps by calling recvmsg() with flag MSG_ERRQUEUE
+set and with a msg_control buffer sufficiently large to receive the
+relevant metadata structures. The recvmsg call returns the original
+outgoing data packet with two ancillary messages attached.
+
+A message of cm_level SOL_IP(V6) and cm_type IP(V6)_RECVERR
+embeds a struct sock_extended_err. This defines the error type. For
+timestamps, the ee_errno field is ENOMSG. The other ancillary message
+will have cm_level SOL_SOCKET and cm_type SCM_TIMESTAMPING. This
+embeds the struct scm_timestamping.
+
+
+2.1.1.2 Timestamp types
+
+The semantics of the three struct timespec are defined by field
+ee_info in the extended error structure. It contains a value of
+type SCM_TSTAMP_* to define the actual timestamp passed in
+scm_timestamping.
+
+The SCM_TSTAMP_* types are 1:1 matches to the SOF_TIMESTAMPING_*
+control fields discussed previously, with one exception. For legacy
+reasons, SCM_TSTAMP_SND is equal to zero and can be set for both
+SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE. It
+is the first if ts[2] is non-zero, the second otherwise, in which
+case the timestamp is stored in ts[0].
+
+
+2.1.1.3 Fragmentation
+
+Fragmentation of outgoing datagrams is rare, but is possible, e.g., by
+explicitly disabling PMTU discovery. If an outgoing packet is fragmented,
+then only the first fragment is timestamped and returned to the sending
+socket.
+
+
+2.1.1.4 Packet Payload
+
+The calling application is often not interested in receiving the whole
+packet payload that it passed to the stack originally: the socket
+error queue mechanism is just a method to piggyback the timestamp on.
+In this case, the application can choose to read datagrams with a
+smaller buffer, possibly even of length 0. The payload is truncated
+accordingly. Until the process calls recvmsg() on the error queue,
+however, the full packet is queued, taking up budget from SO_RCVBUF.
+
+
+2.1.1.5 Blocking Read
+
+Reading from the error queue is always a non-blocking operation. To
+block waiting on a timestamp, use poll or select. poll() will return
+POLLERR in pollfd.revents if any data is ready on the error queue.
+There is no need to pass this flag in pollfd.events. This flag is
+ignored on request. See also `man 2 poll`.
+
+
+2.1.2 Receive timestamps
+
+On reception, there is no reason to read from the socket error queue.
+The SCM_TIMESTAMPING ancillary data is sent along with the packet data
+on a normal recvmsg(). Since this is not a socket error, it is not
+accompanied by a message SOL_IP(V6)/IP(V6)_RECVERROR. In this case,
+the meaning of the three fields in struct scm_timestamping is
+implicitly defined. ts[0] holds a software timestamp if set, ts[1]
+is again deprecated and ts[2] holds a hardware timestamp if set.
+
+
+3. Hardware Timestamping configuration: SIOCSHWTSTAMP and SIOCGHWTSTAMP
 
 Hardware time stamping must also be initialized for each device driver
 that is expected to do hardware time stamping. The parameter is defined in
@@ -167,8 +389,7 @@ enum {
 	 */
 };
 
-
-DEVICE IMPLEMENTATION
+3.1 Hardware Timestamping Implementation: Device Drivers
 
 A driver which supports hardware time stamping must support the
 SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with
diff --git a/Documentation/networking/timestamping/.gitignore b/Documentation/networking/timestamping/.gitignore
index a380159765ce..9e69e982fb38 100644
--- a/Documentation/networking/timestamping/.gitignore
+++ b/Documentation/networking/timestamping/.gitignore
@@ -1,2 +1,3 @@
 timestamping
+txtimestamp
 hwtstamp_config
diff --git a/Documentation/networking/timestamping/Makefile b/Documentation/networking/timestamping/Makefile
index d934afc8306a..8c20dfaa4d6e 100644
--- a/Documentation/networking/timestamping/Makefile
+++ b/Documentation/networking/timestamping/Makefile
@@ -1,14 +1,14 @@
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
+# To compile, from the source root
+#
+#    make headers_install
+#    make M=documentation
 
 # List of programs to build
-hostprogs-y := timestamping hwtstamp_config
+hostprogs-y := hwtstamp_config timestamping txtimestamp
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 
 HOSTCFLAGS_timestamping.o += -I$(objtree)/usr/include
+HOSTCFLAGS_txtimestamp.o += -I$(objtree)/usr/include
 HOSTCFLAGS_hwtstamp_config.o += -I$(objtree)/usr/include
-
-clean:
-	rm -f timestamping hwtstamp_config
diff --git a/Documentation/networking/timestamping/txtimestamp.c b/Documentation/networking/timestamping/txtimestamp.c
new file mode 100644
index 000000000000..876f71c5625a
--- /dev/null
+++ b/Documentation/networking/timestamping/txtimestamp.c
@@ -0,0 +1,535 @@
+/*
+ * Copyright 2014 Google Inc.
+ * Author: willemb@google.com (Willem de Bruijn)
+ *
+ * Test software tx timestamping, including
+ *
+ * - SCHED, SND and ACK timestamps
+ * - RAW, UDP and TCP
+ * - IPv4 and IPv6
+ * - various packet sizes (to test GSO and TSO)
+ *
+ * Consult the command line arguments for help on running
+ * the various testcases.
+ *
+ * This test requires a dummy TCP server.
+ * A simple `nc6 [-u] -l -p $DESTPORT` will do
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <arpa/inet.h>
+#include <asm/types.h>
+#include <error.h>
+#include <errno.h>
+#include <linux/errqueue.h>
+#include <linux/if_ether.h>
+#include <linux/net_tstamp.h>
+#include <netdb.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <netinet/tcp.h>
+#include <netpacket/packet.h>
+#include <poll.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/select.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+/* ugly hack to work around netinet/in.h and linux/ipv6.h conflicts */
+#ifndef in6_pktinfo
+struct in6_pktinfo {
+	struct in6_addr	ipi6_addr;
+	int		ipi6_ifindex;
+};
+#endif
+
+/* command line parameters */
+static int cfg_proto = SOCK_STREAM;
+static int cfg_ipproto = IPPROTO_TCP;
+static int cfg_num_pkts = 4;
+static int do_ipv4 = 1;
+static int do_ipv6 = 1;
+static int cfg_payload_len = 10;
+static bool cfg_show_payload;
+static bool cfg_do_pktinfo;
+static uint16_t dest_port = 9000;
+
+static struct sockaddr_in daddr;
+static struct sockaddr_in6 daddr6;
+static struct timespec ts_prev;
+
+static void __print_timestamp(const char *name, struct timespec *cur,
+			      uint32_t key, int payload_len)
+{
+	if (!(cur->tv_sec | cur->tv_nsec))
+		return;
+
+	fprintf(stderr, "  %s: %lu s %lu us (seq=%u, len=%u)",
+			name, cur->tv_sec, cur->tv_nsec / 1000,
+			key, payload_len);
+
+	if ((ts_prev.tv_sec | ts_prev.tv_nsec)) {
+		int64_t cur_ms, prev_ms;
+
+		cur_ms = (long) cur->tv_sec * 1000 * 1000;
+		cur_ms += cur->tv_nsec / 1000;
+
+		prev_ms = (long) ts_prev.tv_sec * 1000 * 1000;
+		prev_ms += ts_prev.tv_nsec / 1000;
+
+		fprintf(stderr, "  (%+ld us)", cur_ms - prev_ms);
+	}
+
+	ts_prev = *cur;
+	fprintf(stderr, "\n");
+}
+
+static void print_timestamp_usr(void)
+{
+	struct timespec ts;
+	struct timeval tv;	/* avoid dependency on -lrt */
+
+	gettimeofday(&tv, NULL);
+	ts.tv_sec = tv.tv_sec;
+	ts.tv_nsec = tv.tv_usec * 1000;
+
+	__print_timestamp("  USR", &ts, 0, 0);
+}
+
+static void print_timestamp(struct scm_timestamping *tss, int tstype,
+			    int tskey, int payload_len)
+{
+	const char *tsname;
+
+	switch (tstype) {
+	case SCM_TSTAMP_SCHED:
+		tsname = "  ENQ";
+		break;
+	case SCM_TSTAMP_SND:
+		tsname = "  SND";
+		break;
+	case SCM_TSTAMP_ACK:
+		tsname = "  ACK";
+		break;
+	default:
+		error(1, 0, "unknown timestamp type: %u",
+		tstype);
+	}
+	__print_timestamp(tsname, &tss->ts[0], tskey, payload_len);
+}
+
+/* TODO: convert to check_and_print payload once API is stable */
+static void print_payload(char *data, int len)
+{
+	int i;
+
+	if (len > 70)
+		len = 70;
+
+	fprintf(stderr, "payload: ");
+	for (i = 0; i < len; i++)
+		fprintf(stderr, "%02hhx ", data[i]);
+	fprintf(stderr, "\n");
+}
+
+static void print_pktinfo(int family, int ifindex, void *saddr, void *daddr)
+{
+	char sa[INET6_ADDRSTRLEN], da[INET6_ADDRSTRLEN];
+
+	fprintf(stderr, "         pktinfo: ifindex=%u src=%s dst=%s\n",
+		ifindex,
+		saddr ? inet_ntop(family, saddr, sa, sizeof(sa)) : "unknown",
+		daddr ? inet_ntop(family, daddr, da, sizeof(da)) : "unknown");
+}
+
+static void __poll(int fd)
+{
+	struct pollfd pollfd;
+	int ret;
+
+	memset(&pollfd, 0, sizeof(pollfd));
+	pollfd.fd = fd;
+	ret = poll(&pollfd, 1, 100);
+	if (ret != 1)
+		error(1, errno, "poll");
+}
+
+static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len)
+{
+	struct sock_extended_err *serr = NULL;
+	struct scm_timestamping *tss = NULL;
+	struct cmsghdr *cm;
+
+	for (cm = CMSG_FIRSTHDR(msg);
+	     cm && cm->cmsg_len;
+	     cm = CMSG_NXTHDR(msg, cm)) {
+		if (cm->cmsg_level == SOL_SOCKET &&
+		    cm->cmsg_type == SCM_TIMESTAMPING) {
+			tss = (void *) CMSG_DATA(cm);
+		} else if ((cm->cmsg_level == SOL_IP &&
+			    cm->cmsg_type == IP_RECVERR) ||
+			   (cm->cmsg_level == SOL_IPV6 &&
+			    cm->cmsg_type == IPV6_RECVERR)) {
+			serr = (void *) CMSG_DATA(cm);
+			if (serr->ee_errno != ENOMSG ||
+			    serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING) {
+				fprintf(stderr, "unknown ip error %d %d\n",
+						serr->ee_errno,
+						serr->ee_origin);
+				serr = NULL;
+			}
+		} else if (cm->cmsg_level == SOL_IP &&
+			   cm->cmsg_type == IP_PKTINFO) {
+			struct in_pktinfo *info = (void *) CMSG_DATA(cm);
+			print_pktinfo(AF_INET, info->ipi_ifindex,
+				      &info->ipi_spec_dst, &info->ipi_addr);
+		} else if (cm->cmsg_level == SOL_IPV6 &&
+			   cm->cmsg_type == IPV6_PKTINFO) {
+			struct in6_pktinfo *info6 = (void *) CMSG_DATA(cm);
+			print_pktinfo(AF_INET6, info6->ipi6_ifindex,
+				      NULL, &info6->ipi6_addr);
+		} else
+			fprintf(stderr, "unknown cmsg %d,%d\n",
+					cm->cmsg_level, cm->cmsg_type);
+	}
+
+	if (serr && tss)
+		print_timestamp(tss, serr->ee_info, serr->ee_data, payload_len);
+}
+
+static int recv_errmsg(int fd)
+{
+	static char ctrl[1024 /* overprovision*/];
+	static struct msghdr msg;
+	struct iovec entry;
+	static char *data;
+	int ret = 0;
+
+	data = malloc(cfg_payload_len);
+	if (!data)
+		error(1, 0, "malloc");
+
+	memset(&msg, 0, sizeof(msg));
+	memset(&entry, 0, sizeof(entry));
+	memset(ctrl, 0, sizeof(ctrl));
+
+	entry.iov_base = data;
+	entry.iov_len = cfg_payload_len;
+	msg.msg_iov = &entry;
+	msg.msg_iovlen = 1;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = ctrl;
+	msg.msg_controllen = sizeof(ctrl);
+
+	ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+	if (ret == -1 && errno != EAGAIN)
+		error(1, errno, "recvmsg");
+
+	if (ret > 0) {
+		__recv_errmsg_cmsg(&msg, ret);
+		if (cfg_show_payload)
+			print_payload(data, cfg_payload_len);
+	}
+
+	free(data);
+	return ret == -1;
+}
+
+static void do_test(int family, unsigned int opt)
+{
+	char *buf;
+	int fd, i, val = 1, total_len;
+
+	if (family == AF_INET6 && cfg_proto != SOCK_STREAM) {
+		/* due to lack of checksum generation code */
+		fprintf(stderr, "test: skipping datagram over IPv6\n");
+		return;
+	}
+
+	total_len = cfg_payload_len;
+	if (cfg_proto == SOCK_RAW) {
+		total_len += sizeof(struct udphdr);
+		if (cfg_ipproto == IPPROTO_RAW)
+			total_len += sizeof(struct iphdr);
+	}
+
+	buf = malloc(total_len);
+	if (!buf)
+		error(1, 0, "malloc");
+
+	fd = socket(family, cfg_proto, cfg_ipproto);
+	if (fd < 0)
+		error(1, errno, "socket");
+
+	if (cfg_proto == SOCK_STREAM) {
+		if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
+			       (char*) &val, sizeof(val)))
+			error(1, 0, "setsockopt no nagle");
+
+		if (family == PF_INET) {
+			if (connect(fd, (void *) &daddr, sizeof(daddr)))
+				error(1, errno, "connect ipv4");
+		} else {
+			if (connect(fd, (void *) &daddr6, sizeof(daddr6)))
+				error(1, errno, "connect ipv6");
+		}
+	}
+
+	if (cfg_do_pktinfo) {
+		if (family == AF_INET6) {
+			if (setsockopt(fd, SOL_IPV6, IPV6_RECVPKTINFO,
+				       &val, sizeof(val)))
+				error(1, errno, "setsockopt pktinfo ipv6");
+		} else {
+			if (setsockopt(fd, SOL_IP, IP_PKTINFO,
+				       &val, sizeof(val)))
+				error(1, errno, "setsockopt pktinfo ipv4");
+		}
+	}
+
+	opt |= SOF_TIMESTAMPING_SOFTWARE |
+	       SOF_TIMESTAMPING_OPT_CMSG |
+	       SOF_TIMESTAMPING_OPT_ID;
+	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
+		       (char *) &opt, sizeof(opt)))
+		error(1, 0, "setsockopt timestamping");
+
+	for (i = 0; i < cfg_num_pkts; i++) {
+		memset(&ts_prev, 0, sizeof(ts_prev));
+		memset(buf, 'a' + i, total_len);
+
+		if (cfg_proto == SOCK_RAW) {
+			struct udphdr *udph;
+			int off = 0;
+
+			if (cfg_ipproto == IPPROTO_RAW) {
+				struct iphdr *iph = (void *) buf;
+
+				memset(iph, 0, sizeof(*iph));
+				iph->ihl      = 5;
+				iph->version  = 4;
+				iph->ttl      = 2;
+				iph->daddr    = daddr.sin_addr.s_addr;
+				iph->protocol = IPPROTO_UDP;
+				/* kernel writes saddr, csum, len */
+
+				off = sizeof(*iph);
+			}
+
+			udph = (void *) buf + off;
+			udph->source = ntohs(9000); 	/* random spoof */
+			udph->dest   = ntohs(dest_port);
+			udph->len    = ntohs(sizeof(*udph) + cfg_payload_len);
+			udph->check  = 0;	/* not allowed for IPv6 */
+		}
+
+		print_timestamp_usr();
+		if (cfg_proto != SOCK_STREAM) {
+			if (family == PF_INET)
+				val = sendto(fd, buf, total_len, 0, (void *) &daddr, sizeof(daddr));
+			else
+				val = sendto(fd, buf, total_len, 0, (void *) &daddr6, sizeof(daddr6));
+		} else {
+			val = send(fd, buf, cfg_payload_len, 0);
+		}
+		if (val != total_len)
+			error(1, errno, "send");
+
+		/* wait for all errors to be queued, else ACKs arrive OOO */
+		usleep(50 * 1000);
+
+		__poll(fd);
+
+		while (!recv_errmsg(fd)) {}
+	}
+
+	if (close(fd))
+		error(1, errno, "close");
+
+	free(buf);
+	usleep(400 * 1000);
+}
+
+static void __attribute__((noreturn)) usage(const char *filepath)
+{
+	fprintf(stderr, "\nUsage: %s [options] hostname\n"
+			"\nwhere options are:\n"
+			"  -4:   only IPv4\n"
+			"  -6:   only IPv6\n"
+			"  -h:   show this message\n"
+			"  -I:   request PKTINFO\n"
+			"  -l N: send N bytes at a time\n"
+			"  -r:   use raw\n"
+			"  -R:   use raw (IP_HDRINCL)\n"
+			"  -p N: connect to port N\n"
+			"  -u:   use udp\n"
+			"  -x:   show payload (up to 70 bytes)\n",
+			filepath);
+	exit(1);
+}
+
+static void parse_opt(int argc, char **argv)
+{
+	int proto_count = 0;
+	char c;
+
+	while ((c = getopt(argc, argv, "46hIl:p:rRux")) != -1) {
+		switch (c) {
+		case '4':
+			do_ipv6 = 0;
+			break;
+		case '6':
+			do_ipv4 = 0;
+			break;
+		case 'I':
+			cfg_do_pktinfo = true;
+			break;
+		case 'r':
+			proto_count++;
+			cfg_proto = SOCK_RAW;
+			cfg_ipproto = IPPROTO_UDP;
+			break;
+		case 'R':
+			proto_count++;
+			cfg_proto = SOCK_RAW;
+			cfg_ipproto = IPPROTO_RAW;
+			break;
+		case 'u':
+			proto_count++;
+			cfg_proto = SOCK_DGRAM;
+			cfg_ipproto = IPPROTO_UDP;
+			break;
+		case 'l':
+			cfg_payload_len = strtoul(optarg, NULL, 10);
+			break;
+		case 'p':
+			dest_port = strtoul(optarg, NULL, 10);
+			break;
+		case 'x':
+			cfg_show_payload = true;
+			break;
+		case 'h':
+		default:
+			usage(argv[0]);
+		}
+	}
+
+	if (!cfg_payload_len)
+		error(1, 0, "payload may not be nonzero");
+	if (cfg_proto != SOCK_STREAM && cfg_payload_len > 1472)
+		error(1, 0, "udp packet might exceed expected MTU");
+	if (!do_ipv4 && !do_ipv6)
+		error(1, 0, "pass -4 or -6, not both");
+	if (proto_count > 1)
+		error(1, 0, "pass -r, -R or -u, not multiple");
+
+	if (optind != argc - 1)
+		error(1, 0, "missing required hostname argument");
+}
+
+static void resolve_hostname(const char *hostname)
+{
+	struct addrinfo *addrs, *cur;
+	int have_ipv4 = 0, have_ipv6 = 0;
+
+	if (getaddrinfo(hostname, NULL, NULL, &addrs))
+		error(1, errno, "getaddrinfo");
+
+	cur = addrs;
+	while (cur && !have_ipv4 && !have_ipv6) {
+		if (!have_ipv4 && cur->ai_family == AF_INET) {
+			memcpy(&daddr, cur->ai_addr, sizeof(daddr));
+			daddr.sin_port = htons(dest_port);
+			have_ipv4 = 1;
+		}
+		else if (!have_ipv6 && cur->ai_family == AF_INET6) {
+			memcpy(&daddr6, cur->ai_addr, sizeof(daddr6));
+			daddr6.sin6_port = htons(dest_port);
+			have_ipv6 = 1;
+		}
+		cur = cur->ai_next;
+	}
+	if (addrs)
+		freeaddrinfo(addrs);
+
+	do_ipv4 &= have_ipv4;
+	do_ipv6 &= have_ipv6;
+}
+
+static void do_main(int family)
+{
+	fprintf(stderr, "family:       %s\n",
+			family == PF_INET ? "INET" : "INET6");
+
+	fprintf(stderr, "test SND\n");
+	do_test(family, SOF_TIMESTAMPING_TX_SOFTWARE);
+
+	fprintf(stderr, "test ENQ\n");
+	do_test(family, SOF_TIMESTAMPING_TX_SCHED);
+
+	fprintf(stderr, "test ENQ + SND\n");
+	do_test(family, SOF_TIMESTAMPING_TX_SCHED |
+			SOF_TIMESTAMPING_TX_SOFTWARE);
+
+	if (cfg_proto == SOCK_STREAM) {
+		fprintf(stderr, "\ntest ACK\n");
+		do_test(family, SOF_TIMESTAMPING_TX_ACK);
+
+		fprintf(stderr, "\ntest SND + ACK\n");
+		do_test(family, SOF_TIMESTAMPING_TX_SOFTWARE |
+				SOF_TIMESTAMPING_TX_ACK);
+
+		fprintf(stderr, "\ntest ENQ + SND + ACK\n");
+		do_test(family, SOF_TIMESTAMPING_TX_SCHED |
+				SOF_TIMESTAMPING_TX_SOFTWARE |
+				SOF_TIMESTAMPING_TX_ACK);
+	}
+}
+
+const char *sock_names[] = { NULL, "TCP", "UDP", "RAW" };
+
+int main(int argc, char **argv)
+{
+	if (argc == 1)
+		usage(argv[0]);
+
+	parse_opt(argc, argv);
+	resolve_hostname(argv[argc - 1]);
+
+	fprintf(stderr, "protocol:     %s\n", sock_names[cfg_proto]);
+	fprintf(stderr, "payload:      %u\n", cfg_payload_len);
+	fprintf(stderr, "server port:  %u\n", dest_port);
+	fprintf(stderr, "\n");
+
+	if (do_ipv4)
+		do_main(PF_INET);
+	if (do_ipv6)
+		do_main(PF_INET6);
+
+	return 0;
+}