aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2022-07-06 12:56:35 +0100
committerDavid S. Miller <davem@davemloft.net>2022-07-06 12:56:35 +0100
commit4874fb9484be4cee78d8b3b0f0209cd16e5ae35d (patch)
treec3759309c9c88b17a1f87d0bcbb4b939491e2ef7
parentMerge branch 'octeontx2-af-next' (diff)
parenttls: rx: periodically flush socket backlog (diff)
downloadlinux-dev-4874fb9484be4cee78d8b3b0f0209cd16e5ae35d.tar.xz
linux-dev-4874fb9484be4cee78d8b3b0f0209cd16e5ae35d.zip
Merge branch 'tls-rx-nopad-and-backlog-flushing'
Jakub Kicinski says: ==================== tls: rx: nopad and backlog flushing This small series contains the two changes I've been working towards in the previous ~50 patches a couple of months ago. The first major change is the optional "nopad" optimization. Currently TLS 1.3 Rx performs quite poorly because it does not support the "zero-copy" or rather direct decrypt to a user space buffer. Because of TLS 1.3 record padding we don't know if a record contains data or a control message until we decrypt it. Most records will contain data, tho, so the optimization is to try the decryption hoping its data and retry if it wasn't. The performance gain from doing that is significant (~40%) but if I'm completely honest the major reason is that we call skb_cow_data() on the non-"zc" path. The next series will remove the CoW, dropping the gain to only ~10%. The second change is to flush the backlog every 128kB. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/tls.rst18
-rw-r--r--include/linux/sockptr.h8
-rw-r--r--include/net/tls.h3
-rw-r--r--include/uapi/linux/snmp.h1
-rw-r--r--include/uapi/linux/tls.h2
-rw-r--r--net/core/sock.c1
-rw-r--r--net/tls/tls_main.c75
-rw-r--r--net/tls/tls_proc.c1
-rw-r--r--net/tls/tls_sw.c84
-rw-r--r--tools/testing/selftests/net/tls.c15
10 files changed, 191 insertions, 17 deletions
diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index be8e10c14b05..7a6643836e42 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -239,6 +239,19 @@ for the original TCP transmission and TCP retransmissions. To the receiver
this will look like TLS records had been tampered with and will result
in record authentication failures.
+TLS_RX_EXPECT_NO_PAD
+~~~~~~~~~~~~~~~~~~~~
+
+TLS 1.3 only. Expect the sender to not pad records. This allows the data
+to be decrypted directly into user space buffers with TLS 1.3.
+
+This optimization is safe to enable only if the remote end is trusted,
+otherwise it is an attack vector to doubling the TLS processing cost.
+
+If the record decrypted turns out to had been padded or is not a data
+record it will be decrypted again into a kernel buffer without zero copy.
+Such events are counted in the ``TlsDecryptRetry`` statistic.
+
Statistics
==========
@@ -264,3 +277,8 @@ TLS implementation exposes the following per-namespace statistics
- ``TlsDeviceRxResync`` -
number of RX resyncs sent to NICs handling cryptography
+
+- ``TlsDecryptRetry`` -
+ number of RX records which had to be re-decrypted due to
+ ``TLS_RX_EXPECT_NO_PAD`` mis-prediction. Note that this counter will
+ also increment for non-data records.
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index ea193414298b..d45902fb4cad 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -102,4 +102,12 @@ static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count)
return strncpy_from_user(dst, src.user, count);
}
+static inline int check_zeroed_sockptr(sockptr_t src, size_t offset,
+ size_t size)
+{
+ if (!sockptr_is_kernel(src))
+ return check_zeroed_user(src.user + offset, size);
+ return memchr_inv(src.kernel + offset, 0, size) == NULL;
+}
+
#endif /* _LINUX_SOCKPTR_H */
diff --git a/include/net/tls.h b/include/net/tls.h
index 8017f1703447..4fc16ca5f469 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -149,6 +149,7 @@ struct tls_sw_context_rx {
struct sk_buff *recv_pkt;
u8 async_capable:1;
+ u8 zc_capable:1;
atomic_t decrypt_pending;
/* protect crypto_wait with decrypt_pending*/
spinlock_t decrypt_compl_lock;
@@ -239,6 +240,7 @@ struct tls_context {
u8 tx_conf:3;
u8 rx_conf:3;
u8 zerocopy_sendfile:1;
+ u8 rx_no_pad:1;
int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
@@ -358,6 +360,7 @@ int tls_sk_attach(struct sock *sk, int optname, char __user *optval,
void tls_err_abort(struct sock *sk, int err);
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
+void tls_update_rx_zc_capable(struct tls_context *tls_ctx);
void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx);
void tls_sw_strparser_done(struct tls_context *tls_ctx);
int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 904909d020e2..1c9152add663 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -344,6 +344,7 @@ enum
LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */
LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */
LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */
+ LINUX_MIN_TLSDECRYPTRETRY, /* TlsDecryptRetry */
__LINUX_MIB_TLSMAX
};
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index bb8f80812b0b..f1157d8f4acd 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -40,6 +40,7 @@
#define TLS_TX 1 /* Set transmit parameters */
#define TLS_RX 2 /* Set receive parameters */
#define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */
+#define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */
/* Supported versions */
#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -162,6 +163,7 @@ enum {
TLS_INFO_TXCONF,
TLS_INFO_RXCONF,
TLS_INFO_ZC_RO_TX,
+ TLS_INFO_RX_NO_PAD,
__TLS_INFO_MAX,
};
#define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/core/sock.c b/net/core/sock.c
index 92a0296ccb18..4cb957d934a2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2870,6 +2870,7 @@ void __sk_flush_backlog(struct sock *sk)
__release_sock(sk);
spin_unlock_bh(&sk->sk_lock.slock);
}
+EXPORT_SYMBOL_GPL(__sk_flush_backlog);
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 2ffede463e4a..1b3efc96db0b 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -533,6 +533,37 @@ static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval,
return 0;
}
+static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval,
+ int __user *optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ unsigned int value;
+ int err, len;
+
+ if (ctx->prot_info.version != TLS_1_3_VERSION)
+ return -EINVAL;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < sizeof(value))
+ return -EINVAL;
+
+ lock_sock(sk);
+ err = -EINVAL;
+ if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW)
+ value = ctx->rx_no_pad;
+ release_sock(sk);
+ if (err)
+ return err;
+
+ if (put_user(sizeof(value), optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &value, sizeof(value)))
+ return -EFAULT;
+
+ return 0;
+}
+
static int do_tls_getsockopt(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
@@ -547,6 +578,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
case TLS_TX_ZEROCOPY_RO:
rc = do_tls_getsockopt_tx_zc(sk, optval, optlen);
break;
+ case TLS_RX_EXPECT_NO_PAD:
+ rc = do_tls_getsockopt_no_pad(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -718,6 +752,38 @@ static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval,
return 0;
}
+static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ u32 val;
+ int rc;
+
+ if (ctx->prot_info.version != TLS_1_3_VERSION ||
+ sockptr_is_null(optval) || optlen < sizeof(val))
+ return -EINVAL;
+
+ rc = copy_from_sockptr(&val, optval, sizeof(val));
+ if (rc)
+ return -EFAULT;
+ if (val > 1)
+ return -EINVAL;
+ rc = check_zeroed_sockptr(optval, sizeof(val), optlen - sizeof(val));
+ if (rc < 1)
+ return rc == 0 ? -EINVAL : rc;
+
+ lock_sock(sk);
+ rc = -EINVAL;
+ if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) {
+ ctx->rx_no_pad = val;
+ tls_update_rx_zc_capable(ctx);
+ rc = 0;
+ }
+ release_sock(sk);
+
+ return rc;
+}
+
static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
@@ -736,6 +802,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
rc = do_tls_setsockopt_tx_zc(sk, optval, optlen);
release_sock(sk);
break;
+ case TLS_RX_EXPECT_NO_PAD:
+ rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -976,6 +1045,11 @@ static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
if (err)
goto nla_failure;
}
+ if (ctx->rx_no_pad) {
+ err = nla_put_flag(skb, TLS_INFO_RX_NO_PAD);
+ if (err)
+ goto nla_failure;
+ }
rcu_read_unlock();
nla_nest_end(skb, start);
@@ -997,6 +1071,7 @@ static size_t tls_get_info_size(const struct sock *sk)
nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */
+ nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */
0;
return size;
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index feeceb0e4cb4..0c200000cc45 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -18,6 +18,7 @@ static const struct snmp_mib tls_mib_list[] = {
SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE),
SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR),
SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC),
+ SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIN_TLSDECRYPTRETRY),
SNMP_MIB_SENTINEL
};
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 0513f82b8537..79043bc3da39 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -47,6 +47,7 @@
struct tls_decrypt_arg {
bool zc;
bool async;
+ u8 tail;
};
noinline void tls_err_abort(struct sock *sk, int err)
@@ -133,7 +134,8 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len)
return __skb_nsg(skb, offset, len, 0);
}
-static int padding_length(struct tls_prot_info *prot, struct sk_buff *skb)
+static int tls_padding_length(struct tls_prot_info *prot, struct sk_buff *skb,
+ struct tls_decrypt_arg *darg)
{
struct strp_msg *rxm = strp_msg(skb);
struct tls_msg *tlm = tls_msg(skb);
@@ -142,7 +144,7 @@ static int padding_length(struct tls_prot_info *prot, struct sk_buff *skb)
/* Determine zero-padding length */
if (prot->version == TLS_1_3_VERSION) {
int offset = rxm->full_len - TLS_TAG_SIZE - 1;
- char content_type = 0;
+ char content_type = darg->zc ? darg->tail : 0;
int err;
while (content_type == 0) {
@@ -1418,18 +1420,18 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
struct strp_msg *rxm = strp_msg(skb);
struct tls_msg *tlm = tls_msg(skb);
int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0;
+ u8 *aad, *iv, *tail, *mem = NULL;
struct aead_request *aead_req;
struct sk_buff *unused;
- u8 *aad, *iv, *mem = NULL;
struct scatterlist *sgin = NULL;
struct scatterlist *sgout = NULL;
- const int data_len = rxm->full_len - prot->overhead_size +
- prot->tail_size;
+ const int data_len = rxm->full_len - prot->overhead_size;
+ int tail_pages = !!prot->tail_size;
int iv_offset = 0;
if (darg->zc && (out_iov || out_sg)) {
if (out_iov)
- n_sgout = 1 +
+ n_sgout = 1 + tail_pages +
iov_iter_npages_cap(out_iov, INT_MAX, data_len);
else
n_sgout = sg_nents(out_sg);
@@ -1453,9 +1455,10 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
mem_size = aead_size + (nsg * sizeof(struct scatterlist));
mem_size = mem_size + prot->aad_size;
mem_size = mem_size + MAX_IV_SIZE;
+ mem_size = mem_size + prot->tail_size;
/* Allocate a single block of memory which contains
- * aead_req || sgin[] || sgout[] || aad || iv.
+ * aead_req || sgin[] || sgout[] || aad || iv || tail.
* This order achieves correct alignment for aead_req, sgin, sgout.
*/
mem = kmalloc(mem_size, sk->sk_allocation);
@@ -1468,6 +1471,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
sgout = sgin + n_sgin;
aad = (u8 *)(sgout + n_sgout);
iv = aad + prot->aad_size;
+ tail = iv + MAX_IV_SIZE;
/* For CCM based ciphers, first byte of nonce+iv is a constant */
switch (prot->cipher_type) {
@@ -1521,9 +1525,16 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
err = tls_setup_from_iter(out_iov, data_len,
&pages, &sgout[1],
- (n_sgout - 1));
+ (n_sgout - 1 - tail_pages));
if (err < 0)
goto fallback_to_reg_recv;
+
+ if (prot->tail_size) {
+ sg_unmark_end(&sgout[pages]);
+ sg_set_buf(&sgout[pages + 1], tail,
+ prot->tail_size);
+ sg_mark_end(&sgout[pages + 1]);
+ }
} else if (out_sg) {
memcpy(sgout, out_sg, n_sgout * sizeof(*sgout));
} else {
@@ -1538,10 +1549,13 @@ fallback_to_reg_recv:
/* Prepare and submit AEAD request */
err = tls_do_decryption(sk, skb, sgin, sgout, iv,
- data_len, aead_req, darg);
+ data_len + prot->tail_size, aead_req, darg);
if (darg->async)
return 0;
+ if (prot->tail_size)
+ darg->tail = *tail;
+
/* Release the pages in case iov was mapped to pages */
for (; pages > 0; pages--)
put_page(sg_page(&sgout[pages]));
@@ -1583,9 +1597,16 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
return err;
if (darg->async)
goto decrypt_next;
+ /* If opportunistic TLS 1.3 ZC failed retry without ZC */
+ if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION &&
+ darg->tail != TLS_RECORD_TYPE_DATA)) {
+ darg->zc = false;
+ TLS_INC_STATS(sock_net(sk), LINUX_MIN_TLSDECRYPTRETRY);
+ return decrypt_skb_update(sk, skb, dest, darg);
+ }
decrypt_done:
- pad = padding_length(prot, skb);
+ pad = tls_padding_length(prot, skb, darg);
if (pad < 0)
return pad;
@@ -1717,6 +1738,24 @@ out:
return copied ? : err;
}
+static void
+tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot,
+ size_t len_left, size_t decrypted, ssize_t done,
+ size_t *flushed_at)
+{
+ size_t max_rec;
+
+ if (len_left <= decrypted)
+ return;
+
+ max_rec = prot->overhead_size - prot->tail_size + TLS_MAX_PAYLOAD_SIZE;
+ if (done - *flushed_at < SZ_128K && tcp_inq(sk) > max_rec)
+ return;
+
+ *flushed_at = done;
+ sk_flush_backlog(sk);
+}
+
int tls_sw_recvmsg(struct sock *sk,
struct msghdr *msg,
size_t len,
@@ -1729,6 +1768,7 @@ int tls_sw_recvmsg(struct sock *sk,
struct sk_psock *psock;
unsigned char control = 0;
ssize_t decrypted = 0;
+ size_t flushed_at = 0;
struct strp_msg *rxm;
struct tls_msg *tlm;
struct sk_buff *skb;
@@ -1767,7 +1807,7 @@ int tls_sw_recvmsg(struct sock *sk,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
zc_capable = !bpf_strp_enabled && !is_kvec && !is_peek &&
- prot->version != TLS_1_3_VERSION;
+ ctx->zc_capable;
decrypted = 0;
while (len && (decrypted + copied < target || ctx->recv_pkt)) {
struct tls_decrypt_arg darg = {};
@@ -1818,6 +1858,10 @@ int tls_sw_recvmsg(struct sock *sk,
if (err <= 0)
goto recv_end;
+ /* periodically flush backlog, and feed strparser */
+ tls_read_flush_backlog(sk, prot, len, to_decrypt,
+ decrypted + copied, &flushed_at);
+
ctx->recv_pkt = NULL;
__strp_unpause(&ctx->strp);
__skb_queue_tail(&ctx->rx_list, skb);
@@ -2249,6 +2293,14 @@ void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx)
strp_check_rcv(&rx_ctx->strp);
}
+void tls_update_rx_zc_capable(struct tls_context *tls_ctx)
+{
+ struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx);
+
+ rx_ctx->zc_capable = tls_ctx->rx_no_pad ||
+ tls_ctx->prot_info.version != TLS_1_3_VERSION;
+}
+
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
@@ -2484,12 +2536,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
if (sw_ctx_rx) {
tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv);
- if (crypto_info->version == TLS_1_3_VERSION)
- sw_ctx_rx->async_capable = 0;
- else
- sw_ctx_rx->async_capable =
- !!(tfm->__crt_alg->cra_flags &
- CRYPTO_ALG_ASYNC);
+ tls_update_rx_zc_capable(ctx);
+ sw_ctx_rx->async_capable =
+ crypto_info->version != TLS_1_3_VERSION &&
+ !!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC);
/* Set up strparser */
memset(&cb, 0, sizeof(cb));
diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 5d70b04c482c..e71ec5846be9 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -235,6 +235,7 @@ FIXTURE_VARIANT(tls)
{
uint16_t tls_version;
uint16_t cipher_type;
+ bool nopad;
};
FIXTURE_VARIANT_ADD(tls, 12_aes_gcm)
@@ -297,9 +298,17 @@ FIXTURE_VARIANT_ADD(tls, 13_aes_gcm_256)
.cipher_type = TLS_CIPHER_AES_GCM_256,
};
+FIXTURE_VARIANT_ADD(tls, 13_nopad)
+{
+ .tls_version = TLS_1_3_VERSION,
+ .cipher_type = TLS_CIPHER_AES_GCM_128,
+ .nopad = true,
+};
+
FIXTURE_SETUP(tls)
{
struct tls_crypto_info_keys tls12;
+ int one = 1;
int ret;
tls_crypto_info_init(variant->tls_version, variant->cipher_type,
@@ -315,6 +324,12 @@ FIXTURE_SETUP(tls)
ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len);
ASSERT_EQ(ret, 0);
+
+ if (variant->nopad) {
+ ret = setsockopt(self->cfd, SOL_TLS, TLS_RX_EXPECT_NO_PAD,
+ (void *)&one, sizeof(one));
+ ASSERT_EQ(ret, 0);
+ }
}
FIXTURE_TEARDOWN(tls)