aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
-rw-r--r--Documentation/bpf/prog_cgroup_sockopt.rst14
-rw-r--r--include/uapi/linux/bpf.h2
-rw-r--r--kernel/bpf/cgroup.c53
-rw-r--r--kernel/bpf/devmap.c10
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--net/core/xdp.c1
-rw-r--r--samples/bpf/xdp_monitor_user.c8
-rw-r--r--samples/bpf/xdp_redirect_cpu_user.c7
-rw-r--r--samples/bpf/xdp_rxq_info_user.c13
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-map.rst2
-rw-r--r--tools/bpf/bpftool/map.c3
-rw-r--r--tools/include/uapi/linux/bpf.h2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockopt_sk.c46
-rw-r--r--tools/testing/selftests/bpf/progs/sockopt_sk.c54
14 files changed, 158 insertions, 59 deletions
diff --git a/Documentation/bpf/prog_cgroup_sockopt.rst b/Documentation/bpf/prog_cgroup_sockopt.rst
index c47d974629ae..172f957204bf 100644
--- a/Documentation/bpf/prog_cgroup_sockopt.rst
+++ b/Documentation/bpf/prog_cgroup_sockopt.rst
@@ -86,6 +86,20 @@ then the next program in the chain (A) will see those changes,
*not* the original input ``setsockopt`` arguments. The potentially
modified values will be then passed down to the kernel.
+Large optval
+============
+When the ``optval`` is greater than the ``PAGE_SIZE``, the BPF program
+can access only the first ``PAGE_SIZE`` of that data. So it has to options:
+
+* Set ``optlen`` to zero, which indicates that the kernel should
+ use the original buffer from the userspace. Any modifications
+ done by the BPF program to the ``optval`` are ignored.
+* Set ``optlen`` to the value less than ``PAGE_SIZE``, which
+ indicates that the kernel should use BPF's trimmed ``optval``.
+
+When the BPF program returns with the ``optlen`` greater than
+``PAGE_SIZE``, the userspace will receive ``EFAULT`` errno.
+
Example
=======
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 19684813faae..974a71342aea 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3168,7 +3168,7 @@ union bpf_attr {
* Return
* The id is returned or 0 in case the id could not be retrieved.
*
- * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
+ * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
* Description
* Copy *size* bytes from *data* into a ring buffer *ringbuf*.
* If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4d76f16524cc..ac53102e244a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1276,16 +1276,23 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
{
- if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+ if (unlikely(max_optlen < 0))
return -EINVAL;
+ if (unlikely(max_optlen > PAGE_SIZE)) {
+ /* We don't expose optvals that are greater than PAGE_SIZE
+ * to the BPF program.
+ */
+ max_optlen = PAGE_SIZE;
+ }
+
ctx->optval = kzalloc(max_optlen, GFP_USER);
if (!ctx->optval)
return -ENOMEM;
ctx->optval_end = ctx->optval + max_optlen;
- return 0;
+ return max_optlen;
}
static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
@@ -1319,13 +1326,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
*/
max_optlen = max_t(int, 16, *optlen);
- ret = sockopt_alloc_buf(&ctx, max_optlen);
- if (ret)
- return ret;
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ if (max_optlen < 0)
+ return max_optlen;
ctx.optlen = *optlen;
- if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+ if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
ret = -EFAULT;
goto out;
}
@@ -1353,8 +1360,14 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
/* export any potential modifications */
*level = ctx.level;
*optname = ctx.optname;
- *optlen = ctx.optlen;
- *kernel_optval = ctx.optval;
+
+ /* optlen == 0 from BPF indicates that we should
+ * use original userspace data.
+ */
+ if (ctx.optlen != 0) {
+ *optlen = ctx.optlen;
+ *kernel_optval = ctx.optval;
+ }
}
out:
@@ -1385,12 +1398,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
return retval;
- ret = sockopt_alloc_buf(&ctx, max_optlen);
- if (ret)
- return ret;
-
ctx.optlen = max_optlen;
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ if (max_optlen < 0)
+ return max_optlen;
+
if (!retval) {
/* If kernel getsockopt finished successfully,
* copy whatever was returned to the user back
@@ -1404,10 +1417,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
- if (ctx.optlen > max_optlen)
- ctx.optlen = max_optlen;
-
- if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+ if (copy_from_user(ctx.optval, optval,
+ min(ctx.optlen, max_optlen)) != 0) {
ret = -EFAULT;
goto out;
}
@@ -1436,10 +1447,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
- if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
- put_user(ctx.optlen, optlen)) {
- ret = -EFAULT;
- goto out;
+ if (ctx.optlen != 0) {
+ if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+ put_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
ret = ctx.retval;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 0cbb72cdaf63..5fdbc776a760 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -86,12 +86,13 @@ static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
-static struct hlist_head *dev_map_create_hash(unsigned int entries)
+static struct hlist_head *dev_map_create_hash(unsigned int entries,
+ int numa_node)
{
int i;
struct hlist_head *hash;
- hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
+ hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node);
if (hash != NULL)
for (i = 0; i < entries; i++)
INIT_HLIST_HEAD(&hash[i]);
@@ -145,7 +146,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
return -EINVAL;
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
+ dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
+ dtab->map.numa_node);
if (!dtab->dev_index_head)
goto free_charge;
@@ -232,7 +234,7 @@ static void dev_map_free(struct bpf_map *map)
}
}
- kfree(dtab->dev_index_head);
+ bpf_map_area_free(dtab->dev_index_head);
} else {
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e729c9e587a0..a3ac7de98baa 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -241,7 +241,7 @@ bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
if (unlikely(ret < 0))
goto fail;
- return 0;
+ return ret;
fail:
memset(dst, 0, size);
return ret;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 90f44f382115..3c45f99e26d5 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -462,6 +462,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
xdpf->len = totsize - metasize;
xdpf->headroom = 0;
xdpf->metasize = metasize;
+ xdpf->frame_sz = PAGE_SIZE;
xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
xsk_buff_free(xdp);
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index dd558cbb2309..ef53b93db573 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -509,11 +509,8 @@ static void *alloc_rec_per_cpu(int record_size)
{
unsigned int nr_cpus = bpf_num_possible_cpus();
void *array;
- size_t size;
- size = record_size * nr_cpus;
- array = malloc(size);
- memset(array, 0, size);
+ array = calloc(nr_cpus, record_size);
if (!array) {
fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
exit(EXIT_FAIL_MEM);
@@ -528,8 +525,7 @@ static struct stats_record *alloc_stats_record(void)
int i;
/* Alloc main stats_record structure */
- rec = malloc(sizeof(*rec));
- memset(rec, 0, sizeof(*rec));
+ rec = calloc(1, sizeof(*rec));
if (!rec) {
fprintf(stderr, "Mem alloc error\n");
exit(EXIT_FAIL_MEM);
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
index f3468168982e..f4e755e0dd73 100644
--- a/samples/bpf/xdp_redirect_cpu_user.c
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -207,11 +207,8 @@ static struct datarec *alloc_record_per_cpu(void)
{
unsigned int nr_cpus = bpf_num_possible_cpus();
struct datarec *array;
- size_t size;
- size = sizeof(struct datarec) * nr_cpus;
- array = malloc(size);
- memset(array, 0, size);
+ array = calloc(nr_cpus, sizeof(struct datarec));
if (!array) {
fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
exit(EXIT_FAIL_MEM);
@@ -226,11 +223,11 @@ static struct stats_record *alloc_stats_record(void)
size = sizeof(*rec) + n_cpus * sizeof(struct record);
rec = malloc(size);
- memset(rec, 0, size);
if (!rec) {
fprintf(stderr, "Mem alloc error\n");
exit(EXIT_FAIL_MEM);
}
+ memset(rec, 0, size);
rec->rx_cnt.cpu = alloc_record_per_cpu();
rec->redir_err.cpu = alloc_record_per_cpu();
rec->kthread.cpu = alloc_record_per_cpu();
diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c
index 4fe47502ebed..caa4e7ffcfc7 100644
--- a/samples/bpf/xdp_rxq_info_user.c
+++ b/samples/bpf/xdp_rxq_info_user.c
@@ -198,11 +198,8 @@ static struct datarec *alloc_record_per_cpu(void)
{
unsigned int nr_cpus = bpf_num_possible_cpus();
struct datarec *array;
- size_t size;
- size = sizeof(struct datarec) * nr_cpus;
- array = malloc(size);
- memset(array, 0, size);
+ array = calloc(nr_cpus, sizeof(struct datarec));
if (!array) {
fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
exit(EXIT_FAIL_MEM);
@@ -214,11 +211,8 @@ static struct record *alloc_record_per_rxq(void)
{
unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
struct record *array;
- size_t size;
- size = sizeof(struct record) * nr_rxqs;
- array = malloc(size);
- memset(array, 0, size);
+ array = calloc(nr_rxqs, sizeof(struct record));
if (!array) {
fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs);
exit(EXIT_FAIL_MEM);
@@ -232,8 +226,7 @@ static struct stats_record *alloc_stats_record(void)
struct stats_record *rec;
int i;
- rec = malloc(sizeof(*rec));
- memset(rec, 0, sizeof(*rec));
+ rec = calloc(1, sizeof(struct stats_record));
if (!rec) {
fprintf(stderr, "Mem alloc error\n");
exit(EXIT_FAIL_MEM);
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 31101643e57c..70c78faa47ab 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -49,7 +49,7 @@ MAP COMMANDS
| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps**
| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
-| | **queue** | **stack** | **sk_storage** | **struct_ops** }
+| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** }
DESCRIPTION
===========
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index c5fac8068ba1..1d3b60651078 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -49,6 +49,7 @@ const char * const map_type_name[] = {
[BPF_MAP_TYPE_STACK] = "stack",
[BPF_MAP_TYPE_SK_STORAGE] = "sk_storage",
[BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops",
+ [BPF_MAP_TYPE_RINGBUF] = "ringbuf",
};
const size_t map_type_name_size = ARRAY_SIZE(map_type_name);
@@ -1590,7 +1591,7 @@ static int do_help(int argc, char **argv)
" lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n"
" devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
" cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
- " queue | stack | sk_storage | struct_ops }\n"
+ " queue | stack | sk_storage | struct_ops | ringbuf }\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, argv[-2]);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 19684813faae..974a71342aea 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3168,7 +3168,7 @@ union bpf_attr {
* Return
* The id is returned or 0 in case the id could not be retrieved.
*
- * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
+ * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
* Description
* Copy *size* bytes from *data* into a ring buffer *ringbuf*.
* If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
index 2061a6beac0f..5f54c6aec7f0 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
@@ -13,6 +13,7 @@ static int getsetsockopt(void)
char cc[16]; /* TCP_CA_NAME_MAX */
} buf = {};
socklen_t optlen;
+ char *big_buf = NULL;
fd = socket(AF_INET, SOCK_STREAM, 0);
if (fd < 0) {
@@ -22,24 +23,31 @@ static int getsetsockopt(void)
/* IP_TOS - BPF bypass */
- buf.u8[0] = 0x08;
- err = setsockopt(fd, SOL_IP, IP_TOS, &buf, 1);
+ optlen = getpagesize() * 2;
+ big_buf = calloc(1, optlen);
+ if (!big_buf) {
+ log_err("Couldn't allocate two pages");
+ goto err;
+ }
+
+ *(int *)big_buf = 0x08;
+ err = setsockopt(fd, SOL_IP, IP_TOS, big_buf, optlen);
if (err) {
log_err("Failed to call setsockopt(IP_TOS)");
goto err;
}
- buf.u8[0] = 0x00;
+ memset(big_buf, 0, optlen);
optlen = 1;
- err = getsockopt(fd, SOL_IP, IP_TOS, &buf, &optlen);
+ err = getsockopt(fd, SOL_IP, IP_TOS, big_buf, &optlen);
if (err) {
log_err("Failed to call getsockopt(IP_TOS)");
goto err;
}
- if (buf.u8[0] != 0x08) {
- log_err("Unexpected getsockopt(IP_TOS) buf[0] 0x%02x != 0x08",
- buf.u8[0]);
+ if (*(int *)big_buf != 0x08) {
+ log_err("Unexpected getsockopt(IP_TOS) optval 0x%x != 0x08",
+ *(int *)big_buf);
goto err;
}
@@ -78,6 +86,28 @@ static int getsetsockopt(void)
goto err;
}
+ /* IP_FREEBIND - BPF can't access optval past PAGE_SIZE */
+
+ optlen = getpagesize() * 2;
+ memset(big_buf, 0, optlen);
+
+ err = setsockopt(fd, SOL_IP, IP_FREEBIND, big_buf, optlen);
+ if (err != 0) {
+ log_err("Failed to call setsockopt, ret=%d", err);
+ goto err;
+ }
+
+ err = getsockopt(fd, SOL_IP, IP_FREEBIND, big_buf, &optlen);
+ if (err != 0) {
+ log_err("Failed to call getsockopt, ret=%d", err);
+ goto err;
+ }
+
+ if (optlen != 1 || *(__u8 *)big_buf != 0x55) {
+ log_err("Unexpected IP_FREEBIND getsockopt, optlen=%d, optval=0x%x",
+ optlen, *(__u8 *)big_buf);
+ }
+
/* SO_SNDBUF is overwritten */
buf.u32 = 0x01010101;
@@ -124,9 +154,11 @@ static int getsetsockopt(void)
goto err;
}
+ free(big_buf);
close(fd);
return 0;
err:
+ free(big_buf);
close(fd);
return -1;
}
diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c
index d5a5eeb5fb52..712df7b49cb1 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c
@@ -8,6 +8,10 @@
char _license[] SEC("license") = "GPL";
__u32 _version SEC("version") = 1;
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
#define SOL_CUSTOM 0xdeadbeef
struct sockopt_sk {
@@ -28,12 +32,14 @@ int _getsockopt(struct bpf_sockopt *ctx)
__u8 *optval = ctx->optval;
struct sockopt_sk *storage;
- if (ctx->level == SOL_IP && ctx->optname == IP_TOS)
+ if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
/* Not interested in SOL_IP:IP_TOS;
* let next BPF program in the cgroup chain or kernel
* handle it.
*/
+ ctx->optlen = 0; /* bypass optval>PAGE_SIZE */
return 1;
+ }
if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
/* Not interested in SOL_SOCKET:SO_SNDBUF;
@@ -51,6 +57,26 @@ int _getsockopt(struct bpf_sockopt *ctx)
return 1;
}
+ if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ /* Always export 0x55 */
+ optval[0] = 0x55;
+ ctx->optlen = 1;
+
+ /* Userspace buffer is PAGE_SIZE * 2, but BPF
+ * program can only see the first PAGE_SIZE
+ * bytes of data.
+ */
+ if (optval_end - optval != PAGE_SIZE)
+ return 0; /* EPERM, unexpected data size */
+
+ return 1;
+ }
+
if (ctx->level != SOL_CUSTOM)
return 0; /* EPERM, deny everything except custom level */
@@ -81,12 +107,14 @@ int _setsockopt(struct bpf_sockopt *ctx)
__u8 *optval = ctx->optval;
struct sockopt_sk *storage;
- if (ctx->level == SOL_IP && ctx->optname == IP_TOS)
+ if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
/* Not interested in SOL_IP:IP_TOS;
* let next BPF program in the cgroup chain or kernel
* handle it.
*/
+ ctx->optlen = 0; /* bypass optval>PAGE_SIZE */
return 1;
+ }
if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
/* Overwrite SO_SNDBUF value */
@@ -112,6 +140,28 @@ int _setsockopt(struct bpf_sockopt *ctx)
return 1;
}
+ if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+ /* Original optlen is larger than PAGE_SIZE. */
+ if (ctx->optlen != PAGE_SIZE * 2)
+ return 0; /* EPERM, unexpected data size */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ /* Make sure we can trim the buffer. */
+ optval[0] = 0;
+ ctx->optlen = 1;
+
+ /* Usepace buffer is PAGE_SIZE * 2, but BPF
+ * program can only see the first PAGE_SIZE
+ * bytes of data.
+ */
+ if (optval_end - optval != PAGE_SIZE)
+ return 0; /* EPERM, unexpected data size */
+
+ return 1;
+ }
+
if (ctx->level != SOL_CUSTOM)
return 0; /* EPERM, deny everything except custom level */