tools/testing/selftests/netfilter/nft_flowtable.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This tests basic flowtable functionality.
# Creates following topology:
#
# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
# Router1 is the one doing flow offloading. Router2 has no special
# purpose other than providing a link whose MTU is smaller than that of
# both Originator and Responder, i.e. the announced TCPMSS values are too
# large and will still result in fragmentation and/or PMTU discovery.

# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
ret=0

# Temp-file paths. They stay empty until mktemp fills them in, so that
# cleanup() can safely run before the files exist.
ns1in=""
ns2in=""
ns1out=""
ns2out=""

# Remember the current setting; cleanup() uses it to restore the sysctl.
log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)

# Skip (rather than fail) when a required tool is not available.
if ! nft --version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without nft tool"
	exit $ksft_skip
fi

if ! ip -Version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without ip tool"
	exit $ksft_skip
fi

# 'command -v' is a shell builtin; 'which' is an optional external tool
# and its absence would wrongly be reported as a missing nc.
if ! command -v nc > /dev/null 2>&1; then
	echo "SKIP: Could not run test without nc (netcat)"
	exit $ksft_skip
fi

# Creating the first namespace doubles as the "may we create namespaces
# at all" check.
if ! ip netns add nsr1; then
	echo "SKIP: Could not create net namespace"
	exit $ksft_skip
fi

ip netns add ns1
ip netns add ns2

ip netns add nsr2

# Tear down everything the test created.
#
# Globals read:
#   ns1in, ns1out, ns2in, ns2out - temp-file paths (may still be empty)
#   log_netns - value of net.netfilter.nf_log_all_netns saved at startup;
#               may be empty if it could not be read
cleanup() {
	local i

	for i in 1 2; do
		ip netns del "ns$i"
		ip netns del "nsr$i"
	done

	rm -f "$ns1in" "$ns1out"
	rm -f "$ns2in" "$ns2out"

	# The script switches the sysctl to 1, so it only needs restoring when
	# it was 0 before.  The quoted string comparison keeps an empty
	# (unreadable) value from turning into a '[' syntax error.
	if [ "$log_netns" = "0" ]; then
		sysctl -q net.netfilter.nf_log_all_netns=0
	fi
}

# From this point on, always run cleanup() when the script exits.
trap cleanup EXIT

# Enable nf_log for all network namespaces while the test runs; the
# ruleset loaded into nsr1 uses a 'log' statement.
sysctl -q net.netfilter.nf_log_all_netns=1

# Wire up the chain ns1 <-> nsr1 <-> nsr2 <-> ns2 with veth pairs.
ip link add veth0 netns nsr1 type veth peer name eth0 netns ns1
ip link add veth1 netns nsr1 type veth peer name veth0 netns nsr2
ip link add veth1 netns nsr2 type veth peer name eth0 netns ns2

# Bring up loopback and both veth ends inside each router namespace.
for router in nsr1 nsr2; do
	for dev in lo veth0 veth1; do
		ip -net $router link set $dev up
	done
done

# Host-facing router addresses: nsr1/veth0 serves ns1, nsr2/veth1 serves ns2.
for addr in 10.0.1.1/24 dead:1::1/64; do
	ip -net nsr1 addr add $addr dev veth0
done

for addr in 10.0.2.1/24 dead:2::1/64; do
	ip -net nsr2 addr add $addr dev veth1
done

# Use different MTUs so that packets travelling from ns1 (large MTU) to
# ns2 (smaller MTU) must be pushed up to the stack, either to be fragmented
# (ip_no_pmtu_disc=1) or to trigger PMTU discovery (an ICMP error is sent
# back to the originator).
# ns2 is reached via nsr2 over a link with a smaller MTU, so the TCPMSS
# announced by both peers is NOT the lowest link MTU on the path.

# ns1 <-> nsr1 link
ip -net nsr1 link set veth0 mtu 9000
ip -net ns1 link set eth0 mtu 9000

# nsr2 <-> ns2 link
ip -net nsr2 link set veth1 mtu 2000
ip -net ns2 link set eth0 mtu 2000

# Transfer network between nsr1 and nsr2.
# These addresses are not used as connection endpoints.
for addr in 192.168.10.1/24 fee1:2::1/64; do
	ip -net nsr1 addr add $addr dev veth1
done

for addr in 192.168.10.2/24 fee1:2::2/64; do
	ip -net nsr2 addr add $addr dev veth0
done

# Per-side setup: enable IPv4 forwarding on router nsr$i and configure the
# end host ns$i that sits behind it.
for i in 1 2; do
	router=nsr$i
	host=ns$i

	for dev in veth0 veth1; do
		ip netns exec $router sysctl net.ipv4.conf.$dev.forwarding=1 > /dev/null
	done

	ip -net $host link set lo up
	ip -net $host link set eth0 up

	# Each address is added before the default route that depends on it.
	ip -net $host addr add 10.0.$i.99/24 dev eth0
	ip -net $host route add default via 10.0.$i.1
	ip -net $host addr add dead:$i::99/64 dev eth0
	ip -net $host route add default via dead:$i::1

	ip netns exec $host sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null

	# The first two tests run without the IP DF bit set.
	ip netns exec $host sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
done

# The routers reach the remote side through each other, via the transfer net.
ip -net nsr1 route add default via 192.168.10.2
ip -net nsr2 route add default via 192.168.10.1

# Load the flowtable and the forward chain (policy drop) into nsr1.
# If nft rejects the ruleset the test is skipped, not failed.
if ! ip netns exec nsr1 nft -f - <<EOF
table inet filter {
  flowtable f1 {
     hook ingress priority 0
     devices = { veth0, veth1 }
   }

   chain forward {
      type filter hook forward priority 0; policy drop;

      # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
      meta oif "veth1" tcp dport 12345 flow offload @f1 counter

      # use packet size to trigger 'should be offloaded by now'.
      # otherwise, if 'flow offload' expression never offloads, the
      # test will pass.
      tcp dport 12345 meta length gt 200 ct mark set 1 counter

      # this turns off flow offloading internally, so expect packets again
      tcp flags fin,rst ct mark set 0 accept

      # this allows large packets from responder, we need this as long
      # as PMTUd is off.
      # This rule is deleted for the last test, when we expect PMTUd
      # to kick in and ensure all packets meet mtu requirements.
      meta length gt 1500 accept comment something-to-grep-for

      # next line blocks connection w.o. working offload.
      # we only do this for reverse dir, because we expect packets to
      # enter slow path due to MTU mismatch of veth0 and veth1.
      tcp sport 12345 ct mark 1 counter log prefix "mark failure " drop

      ct state established,related accept

      # for packets that we can't offload yet, i.e. SYN (any ct that is not confirmed)
      meta length lt 200 oif "veth1" tcp dport 12345 counter accept

      meta nfproto ipv4 meta l4proto icmp accept
      meta nfproto ipv6 meta l4proto icmpv6 accept
   }
}
EOF
then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi

# Test basic connectivity in both directions before running any transfer.
# A failure here is fatal: without working routing the later results would
# be meaningless.  (No interactive shell is spawned on failure, so
# unattended runs cannot hang.)
if ! ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then
  echo "ERROR: ns1 cannot reach ns2" 1>&2
  exit 1
fi

if ! ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then
  echo "ERROR: ns2 cannot reach ns1" 1>&2
  exit 1
fi

if [ "$ret" -eq 0 ]; then
	echo "PASS: netns routing/connectivity: ns1 can reach ns2"
fi

# Input/output files for the transfers; cleanup() removes them on exit.
ns1in=$(mktemp)
ns1out=$(mktemp)
ns2in=$(mktemp)
ns2out=$(mktemp)

# Fill a file with a random amount of random data.
#
# The file receives 0..8191 blocks of 1024 bytes, followed by a tail of
# 128..1151 single bytes.
#
# Arguments:
#   $1 - path of the file to (over)write
#   $2 - label of the owner; accepted for the callers' sake, not used
# Globals:
#   TSIZE - set to the total number of bytes written
make_file()
{
	local name=$1
	local blocks tail

	blocks=$((RANDOM % (1024 * 8)))
	dd if=/dev/urandom of="$name" bs=1024 count="$blocks" 2> /dev/null

	tail=$((RANDOM % 1024 + 128))
	# Append via '>>': writing through of= would either truncate the file
	# or, with conv=notrunc, overwrite its beginning instead of extending it.
	dd if=/dev/urandom bs=1 count="$tail" 2> /dev/null >> "$name"

	TSIZE=$((blocks * 1024 + tail))
}

# Verify that a transferred file arrived intact.
#
# Arguments:
#   $1 - file that was sent
#   $2 - file that was received
#   $3 - human-readable description of the direction, used in the message
# Outputs:
#   On mismatch, a FAIL line on stderr and 'ls -l' of both files on stdout.
# Returns:
#   0 if both files are byte-identical, 1 otherwise (including when either
#   file cannot be read).
check_transfer()
{
	local sent=$1
	local received=$2
	local what=$3

	# -s: only the exit status matters, cmp's own output is not wanted.
	if ! cmp -s "$sent" "$received"; then
		echo "FAIL: file mismatch for $what" 1>&2
		ls -l "$sent"
		ls -l "$received"
		return 1
	fi

	return 0
}

# Run one bidirectional TCP transfer between two namespaces and verify it.
#
# A netcat listener on port 12345 is started in the second namespace; it
# sends $ns2in and stores what it receives in $ns2out.  A netcat client in
# the first namespace connects to 10.0.2.99:12345, sends $ns1in and stores
# what it receives in $ns1out.  Afterwards both directions are compared
# with check_transfer.
#
# Arguments:
#   $1 - namespace running the client
#   $2 - namespace running the listener
# Globals read:
#   ns1in, ns1out, ns2in, ns2out
# Returns:
#   0 if both files arrived intact, 1 otherwise.
test_tcp_forwarding()
{
	local nsa=$1
	local nsb=$2
	local lret=0
	local lpid cpid

	ip netns exec "$nsb" nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" &
	lpid=$!

	# Give the listener time to start before the client connects.
	sleep 1
	ip netns exec "$nsa" nc -w 4 10.0.2.99 12345 < "$ns1in" > "$ns1out" &
	cpid=$!

	sleep 3

	# Either nc may already have exited on its own; a failing kill is
	# then expected and not worth reporting.
	kill "$lpid" 2> /dev/null
	kill "$cpid" 2> /dev/null
	wait

	if ! check_transfer "$ns1in" "$ns2out" "ns1 -> ns2"; then
		lret=1
	fi

	if ! check_transfer "$ns2in" "$ns1out" "ns1 <- ns2"; then
		lret=1
	fi

	return $lret
}

# Generate the payloads sent by each side.
make_file "$ns1in" "ns1"
make_file "$ns2in" "ns2"

# First test:
# PMTU discovery is off, so nsr1 is expected to fragment packets from ns1
# to ns2 as needed.
if test_tcp_forwarding ns1 ns2; then
	echo "PASS: flow offloaded for ns1/ns2"
else
	echo "FAIL: flow offload for ns1/ns2:" 1>&2
	ip netns exec nsr1 nft list ruleset
	ret=1
fi

# Remove ns2's default routes: ns2 can then no longer reach ns1 directly
# and depends on nsr1 masquerading ns1's traffic.  Only nsr1's transfer-net
# address stays reachable, which is the address ns1 is expected to appear
# with.
ip -net ns2 route del default via 10.0.2.1
ip -net ns2 route del default via dead:2::1
ip -net ns2 route add 192.168.10.1 via 10.0.2.1

# Second test:
# Same transfer as before, but with NAT (masquerade) enabled on nsr1.
ip netns exec nsr1 nft -f - <<EOF
table ip nat {
   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" masquerade
   }
}
EOF

if test_tcp_forwarding ns1 ns2; then
	echo "PASS: flow offloaded for ns1/ns2 with NAT"
else
	echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
	ip netns exec nsr1 nft list ruleset
	ret=1
fi

# Third test:
# Same as second test, but with PMTU discovery enabled.

# Look up the rule that accepts large packets via its comment.  The text
# after the '#' in the listing is "handle <N>"; $handle is deliberately
# left unquoted below so that it splits into those two words.
handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2)

if ! ip netns exec nsr1 nft delete rule inet filter forward $handle; then
	echo "FAIL: Could not delete large-packet accept rule" 1>&2
	exit 1
fi

ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null

if test_tcp_forwarding ns1 ns2; then
	echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery"
else
	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
	ip netns exec nsr1 nft list ruleset
	# Record the failure so it is reflected in the exit status.
	ret=1
fi

exit $ret