From 95da0cdb723260362fc126a563285ac66a193da7 Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Tue, 6 Mar 2018 10:55:01 -0800 Subject: bpf: add support to read sample address in bpf program This commit adds a new field "addr" to bpf_perf_event_data which can be read and used by bpf programs attached to perf events. The value of the field is copied from bpf_perf_event_data_kern.addr and contains the address value recorded by specifying PERF_SAMPLE_ADDR in sample_type when calling perf_event_open. Signed-off-by: Teng Qin Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf_perf_event.h | 1 + kernel/trace/bpf_trace.c | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h index 8f95303f9d80..eb1b9d21250c 100644 --- a/include/uapi/linux/bpf_perf_event.h +++ b/include/uapi/linux/bpf_perf_event.h @@ -13,6 +13,7 @@ struct bpf_perf_event_data { bpf_user_pt_regs_t regs; __u64 sample_period; + __u64 addr; }; #endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c0a9e310d715..c634e093951f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -726,8 +726,7 @@ const struct bpf_prog_ops tracepoint_prog_ops = { static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, - sample_period); + const int size_u64 = sizeof(u64); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; @@ -738,8 +737,13 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): - bpf_ctx_record_field_size(info, size_sp); - if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) + return false; + break; + case bpf_ctx_range(struct bpf_perf_event_data, addr): + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; default: @@ -766,6 +770,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct perf_sample_data, period, 8, target_size)); break; + case offsetof(struct bpf_perf_event_data, addr): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), si->dst_reg, si->src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct perf_sample_data, addr, 8, + target_size)); + break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs), si->dst_reg, si->src_reg, -- cgit v1.2.3-59-g8ed1b From 12fe12253c56a26e591ceefbdf0998b391022003 Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Tue, 6 Mar 2018 10:55:02 -0800 Subject: samples/bpf: add example to test reading address This commit adds an additional test to the trace_event example: it attaches the bpf program to the MEM_UOPS_RETIRED.LOCK_LOADS event with PERF_SAMPLE_ADDR requested, and prints the lock address value read by the bpf program to trace_pipe.
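With the program attached, each sampled lock load should show up in trace_pipe as a line of the form below, per the addr_fmt format string added in the sample (the address value here is illustrative):

    Address recorded on event: ffff880123456f00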
Signed-off-by: Teng Qin Signed-off-by: Daniel Borkmann --- samples/bpf/trace_event_kern.c | 4 ++++ samples/bpf/trace_event_user.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index a77a583d94d4..7068fbdde951 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -39,6 +39,7 @@ int bpf_prog1(struct bpf_perf_event_data *ctx) { char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu"; char time_fmt2[] = "Get Time Failed, ErrCode: %d"; + char addr_fmt[] = "Address recorded on event: %llx"; char fmt[] = "CPU-%d period %lld ip %llx"; u32 cpu = bpf_get_smp_processor_id(); struct bpf_perf_event_value value_buf; @@ -64,6 +65,9 @@ int bpf_prog1(struct bpf_perf_event_data *ctx) else bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret); + if (ctx->addr != 0) + bpf_trace_printk(addr_fmt, sizeof(addr_fmt), ctx->addr); + val = bpf_map_lookup_elem(&counts, &key); if (val) (*val)++; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index bf4f1b6d9a52..56f7a259a7c9 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -215,6 +215,17 @@ static void test_bpf_perf_event(void) /* Intel Instruction Retired */ .config = 0xc0, }; + struct perf_event_attr attr_type_raw_lock_load = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_RAW, + /* Intel MEM_UOPS_RETIRED.LOCK_LOADS */ + .config = 0x21d0, + /* Request to record lock address from PEBS */ + .sample_type = PERF_SAMPLE_ADDR, + /* Record address value requires precise event */ + .precise_ip = 2, + }; printf("Test HW_CPU_CYCLES\n"); test_perf_event_all_cpu(&attr_type_hw); @@ -236,6 +247,10 @@ static void test_bpf_perf_event(void) test_perf_event_all_cpu(&attr_type_raw); test_perf_event_task(&attr_type_raw); + printf("Test Lock Load\n"); + test_perf_event_all_cpu(&attr_type_raw_lock_load); + test_perf_event_task(&attr_type_raw_lock_load); + printf("*** PASS ***\n"); } -- cgit v1.2.3-59-g8ed1b From 72ab55e96ebc00d68501f5cda59478ed7cf7e484 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 8 Mar 2018 23:00:35 +0100 Subject: tools: bpftool: silence 'missing initializer' warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building bpf tool, gcc emits piles of warnings: prog.c: In function ‘prog_fd_by_tag’: prog.c:101:9: warning: missing initializer for field ‘type’ of ‘struct bpf_prog_info’ [-Wmissing-field-initializers] struct bpf_prog_info info = {}; ^ In file included from /home/storage/jbenc/git/net-next/tools/lib/bpf/bpf.h:26:0, from prog.c:47: /home/storage/jbenc/git/net-next/tools/include/uapi/linux/bpf.h:925:8: note: ‘type’ declared here __u32 type; ^ As these warnings are not useful, switch them off. 
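For reference, the pattern gcc flags here is plain C zero-initialization; an empty initializer list still initializes every member of the struct, so the code is already correct:

    struct bpf_prog_info info = {};	/* all members, 'type' included, are zeroed */

The warning therefore carries no useful signal for this codebase.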
Signed-off-by: Jiri Benc Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 26901ec87361..4c2867481f5c 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -38,7 +38,7 @@ bash_compdir ?= /usr/share/bash-completion/completions CC = gcc CFLAGS += -O2 -CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow +CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/ CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' LIBS = -lelf -lbfd -lopcodes $(LIBBPF) -- cgit v1.2.3-59-g8ed1b From 5a8997f207154826c7bf1a97acf75ffb44159c50 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 8 Mar 2018 23:00:36 +0100 Subject: tools: bpf: respect output directory during build Currently, the programs under tools/bpf (with the notable exception of bpftool) do not respect the output directory (make O=dir). Fix that. Signed-off-by: Jiri Benc Signed-off-by: Daniel Borkmann --- tools/bpf/Makefile | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index c8ec0ae16bf0..e7b15967492e 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +include ../scripts/Makefile.include + prefix = /usr CC = gcc @@ -7,7 +9,7 @@ YACC = bison MAKE = make CFLAGS += -Wall -O2 -CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include +CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/include/uapi -I$(srctree)/include ifeq ($(srctree),) srctree := $(patsubst %/,%,$(dir $(CURDIR))) @@ -38,32 +40,36 @@ ifeq ($(feature-disassembler-four-args), 1) CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE endif -%.yacc.c: %.y +$(OUTPUT)%.yacc.c: $(srctree)/tools/bpf/%.y $(YACC) -o $@ -d $< -%.lex.c: %.l +$(OUTPUT)%.lex.c: $(srctree)/tools/bpf/%.l $(LEX) -o $@ $< -all: bpf_jit_disasm bpf_dbg bpf_asm bpftool +$(OUTPUT)%.o: $(srctree)/tools/bpf/%.c + $(COMPILE.c) -o $@ $< + +all: $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm bpftool -bpf_jit_disasm : CFLAGS += -DPACKAGE='bpf_jit_disasm' -bpf_jit_disasm : LDLIBS = -lopcodes -lbfd -ldl -bpf_jit_disasm : bpf_jit_disasm.o +$(OUTPUT)bpf_jit_disasm: CFLAGS += -DPACKAGE='bpf_jit_disasm' +$(OUTPUT)bpf_jit_disasm: LDLIBS = -lopcodes -lbfd -ldl +$(OUTPUT)bpf_jit_disasm: $(OUTPUT)bpf_jit_disasm.o -bpf_dbg : LDLIBS = -lreadline -bpf_dbg : bpf_dbg.o +$(OUTPUT)bpf_dbg: LDLIBS = -lreadline +$(OUTPUT)bpf_dbg: $(OUTPUT)bpf_dbg.o -bpf_asm : LDLIBS = -bpf_asm : bpf_asm.o bpf_exp.yacc.o bpf_exp.lex.o -bpf_exp.lex.o : bpf_exp.yacc.c +$(OUTPUT)bpf_asm: LDLIBS = +$(OUTPUT)bpf_asm: $(OUTPUT)bpf_asm.o $(OUTPUT)bpf_exp.yacc.o $(OUTPUT)bpf_exp.lex.o +$(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.yacc.c clean: bpftool_clean - rm -rf *.o bpf_jit_disasm bpf_dbg bpf_asm bpf_exp.yacc.* bpf_exp.lex.* + rm -rf $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \ + $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.* install: bpftool_install - install bpf_jit_disasm $(prefix)/bin/bpf_jit_disasm - install bpf_dbg $(prefix)/bin/bpf_dbg - install bpf_asm $(prefix)/bin/bpf_asm + install $(OUTPUT)bpf_jit_disasm $(prefix)/bin/bpf_jit_disasm + install $(OUTPUT)bpf_dbg $(prefix)/bin/bpf_dbg + install $(OUTPUT)bpf_asm 
$(prefix)/bin/bpf_asm bpftool: $(MAKE) -C bpftool -- cgit v1.2.3-59-g8ed1b From fde68c5becb552ce96fcf1d123624b5c11baa187 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 8 Mar 2018 23:00:37 +0100 Subject: tools: bpf: consistent make bpf_install Currently, make bpf_install in tools/ does not respect DESTDIR. Moreover, it installs to /usr/bin/ unconditionally. Let it respect DESTDIR and allow prefix to be specified. Also, to be more consistent with bpftool and with the usual customs, default the prefix to /usr/local instead of /usr. Signed-off-by: Jiri Benc Signed-off-by: Daniel Borkmann --- tools/bpf/Makefile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index e7b15967492e..c42ca24a072d 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -1,12 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 include ../scripts/Makefile.include -prefix = /usr +prefix ?= /usr/local CC = gcc LEX = flex YACC = bison MAKE = make +INSTALL ?= install CFLAGS += -Wall -O2 CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/include/uapi -I$(srctree)/include @@ -67,9 +68,10 @@ clean: bpftool_clean $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.* install: bpftool_install - install $(OUTPUT)bpf_jit_disasm $(prefix)/bin/bpf_jit_disasm - install $(OUTPUT)bpf_dbg $(prefix)/bin/bpf_dbg - install $(OUTPUT)bpf_asm $(prefix)/bin/bpf_asm + $(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/bin + $(INSTALL) $(OUTPUT)bpf_jit_disasm $(DESTDIR)$(prefix)/bin/bpf_jit_disasm + $(INSTALL) $(OUTPUT)bpf_dbg $(DESTDIR)$(prefix)/bin/bpf_dbg + $(INSTALL) $(OUTPUT)bpf_asm $(DESTDIR)$(prefix)/bin/bpf_asm bpftool: $(MAKE) -C bpftool -- cgit v1.2.3-59-g8ed1b From 6c0710084e6bf5337c9af8075e266069b79ac2c2 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 8 Mar 2018 23:00:38 +0100 Subject: tools: bpf: make install should build first Make the 'install' target depend on the 'all' target to build the binaries first. Signed-off-by: Jiri Benc Signed-off-by: Daniel Borkmann --- tools/bpf/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index c42ca24a072d..e8d9e4125bf3 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -50,7 +50,9 @@ $(OUTPUT)%.lex.c: $(srctree)/tools/bpf/%.l $(OUTPUT)%.o: $(srctree)/tools/bpf/%.c $(COMPILE.c) -o $@ $< -all: $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm bpftool +PROGS = $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm + +all: $(PROGS) bpftool $(OUTPUT)bpf_jit_disasm: CFLAGS += -DPACKAGE='bpf_jit_disasm' $(OUTPUT)bpf_jit_disasm: LDLIBS = -lopcodes -lbfd -ldl @@ -67,7 +69,7 @@ clean: bpftool_clean rm -rf $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \ $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.* -install: bpftool_install +install: $(PROGS) bpftool_install $(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/bin $(INSTALL) $(OUTPUT)bpf_jit_disasm $(DESTDIR)$(prefix)/bin/bpf_jit_disasm $(INSTALL) $(OUTPUT)bpf_dbg $(DESTDIR)$(prefix)/bin/bpf_dbg -- cgit v1.2.3-59-g8ed1b From 58416c37d0fe1c46aecb293283f2d558526cac19 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 8 Mar 2018 23:00:39 +0100 Subject: tools: bpf: call descend in Makefile Use the descend macro to properly propagate $(subdir) to bpftool. 
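As a usage sketch (output path illustrative), this is what lets an out-of-tree build such as

    make -C tools/bpf O=/tmp/bpf-build

propagate its output and subdir settings into the bpftool sub-make, rather than invoking a bare $(MAKE) -C bpftool.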
Signed-off-by: Jiri Benc Signed-off-by: Daniel Borkmann --- tools/bpf/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index e8d9e4125bf3..daca0a4277d1 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -76,12 +76,12 @@ install: $(PROGS) bpftool_install $(INSTALL) $(OUTPUT)bpf_asm $(DESTDIR)$(prefix)/bin/bpf_asm bpftool: - $(MAKE) -C bpftool + $(call descend,bpftool) bpftool_install: - $(MAKE) -C bpftool install + $(call descend,bpftool,install) bpftool_clean: - $(MAKE) -C bpftool clean + $(call descend,bpftool,clean) .PHONY: bpftool FORCE -- cgit v1.2.3-59-g8ed1b From a50b7f8c618a7953cd0bef407b390b6db60d0fc3 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 8 Mar 2018 23:00:40 +0100 Subject: tools: bpf: respect quiet/verbose build Default to quiet build, with V=1 enabling verbose build as is usual. Signed-off-by: Jiri Benc Signed-off-by: Daniel Borkmann --- tools/bpf/Makefile | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index daca0a4277d1..757ea22c428a 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -17,6 +17,12 @@ srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) endif +ifeq ($(V),1) + Q = +else + Q = @ +endif + FEATURE_USER = .bpf FEATURE_TESTS = libbfd disassembler-four-args FEATURE_DISPLAY = libbfd disassembler-four-args @@ -42,38 +48,48 @@ CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE endif $(OUTPUT)%.yacc.c: $(srctree)/tools/bpf/%.y - $(YACC) -o $@ -d $< + $(QUIET_BISON)$(YACC) -o $@ -d $< $(OUTPUT)%.lex.c: $(srctree)/tools/bpf/%.l - $(LEX) -o $@ $< + $(QUIET_FLEX)$(LEX) -o $@ $< $(OUTPUT)%.o: $(srctree)/tools/bpf/%.c - $(COMPILE.c) -o $@ $< + $(QUIET_CC)$(COMPILE.c) -o $@ $< + +$(OUTPUT)%.yacc.o: $(OUTPUT)%.yacc.c + $(QUIET_CC)$(COMPILE.c) -o $@ $< +$(OUTPUT)%.lex.o: $(OUTPUT)%.lex.c + $(QUIET_CC)$(COMPILE.c) -o $@ $< PROGS = $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm all: $(PROGS) bpftool $(OUTPUT)bpf_jit_disasm: CFLAGS += -DPACKAGE='bpf_jit_disasm' -$(OUTPUT)bpf_jit_disasm: LDLIBS = -lopcodes -lbfd -ldl $(OUTPUT)bpf_jit_disasm: $(OUTPUT)bpf_jit_disasm.o + $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ -lopcodes -lbfd -ldl -$(OUTPUT)bpf_dbg: LDLIBS = -lreadline $(OUTPUT)bpf_dbg: $(OUTPUT)bpf_dbg.o + $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ -lreadline -$(OUTPUT)bpf_asm: LDLIBS = $(OUTPUT)bpf_asm: $(OUTPUT)bpf_asm.o $(OUTPUT)bpf_exp.yacc.o $(OUTPUT)bpf_exp.lex.o + $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ + $(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.yacc.c clean: bpftool_clean - rm -rf $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \ + $(call QUIET_CLEAN, bpf-progs) + $(Q)rm -rf $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \ $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.* install: $(PROGS) bpftool_install - $(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/bin - $(INSTALL) $(OUTPUT)bpf_jit_disasm $(DESTDIR)$(prefix)/bin/bpf_jit_disasm - $(INSTALL) $(OUTPUT)bpf_dbg $(DESTDIR)$(prefix)/bin/bpf_dbg - $(INSTALL) $(OUTPUT)bpf_asm $(DESTDIR)$(prefix)/bin/bpf_asm + $(call QUIET_INSTALL, bpf_jit_disasm) + $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/bin + $(Q)$(INSTALL) $(OUTPUT)bpf_jit_disasm $(DESTDIR)$(prefix)/bin/bpf_jit_disasm + $(call QUIET_INSTALL, bpf_dbg) + $(Q)$(INSTALL) $(OUTPUT)bpf_dbg $(DESTDIR)$(prefix)/bin/bpf_dbg + $(call QUIET_INSTALL, bpf_asm) + $(Q)$(INSTALL) $(OUTPUT)bpf_asm $(DESTDIR)$(prefix)/bin/bpf_asm bpftool: $(call 
descend,bpftool) -- cgit v1.2.3-59-g8ed1b From ef8ba83b7cf6c530fbb2c83cf87f9755b8424a84 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 8 Mar 2018 23:00:41 +0100 Subject: tools: bpf: silence make by not deleting intermediate file Even in quiet mode, make finishes with "rm tools/bpf/bpf_exp.lex.c". That's because it considers the file to be intermediate. Silence that by mentioning the lex.c file instead of the lex.o file; the dependency still stays. Signed-off-by: Jiri Benc Signed-off-by: Daniel Borkmann --- tools/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 757ea22c428a..c07b4e718eeb 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -75,7 +75,7 @@ $(OUTPUT)bpf_dbg: $(OUTPUT)bpf_dbg.o $(OUTPUT)bpf_asm: $(OUTPUT)bpf_asm.o $(OUTPUT)bpf_exp.yacc.o $(OUTPUT)bpf_exp.lex.o $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ -$(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.yacc.c +$(OUTPUT)bpf_exp.lex.c: $(OUTPUT)bpf_exp.yacc.c clean: bpftool_clean $(call QUIET_CLEAN, bpf-progs) -- cgit v1.2.3-59-g8ed1b From 6d8cb045cde681e64a5ed80a2ab70be831a7f9b0 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 8 Mar 2018 23:46:33 -0800 Subject: bpf: comment why dots in filenames under BPF virtual FS are not allowed When pinning a file under the BPF virtual file system (traditionally /sys/fs/bpf), using a dot in the name of the location to pin at is not allowed. For example, trying to pin at "/sys/fs/bpf/foo.bar" will be rejected with -EPERM. This check was introduced at the same time as the BPF file system itself, with commit b2197755b263 ("bpf: add support for persistent maps/progs"). At this time, it was checked in a function called "bpf_dname_reserved()", which made clear that using a dot was reserved for future extensions. This function disappeared and the check was moved elsewhere with commit 0c93b7d85d40 ("bpf: reject invalid names right in ->lookup()"), and the meaning of the dot ban was lost. The present commit simply adds a comment in the source to explain to the reader that the use of dots in names is reserved for future extensions. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 81e2f6995adb..bf6da59ae0d0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -178,6 +178,9 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { + /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future + * extensions. + */ if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); -- cgit v1.2.3-59-g8ed1b From 615755a77b2461ed78dfafb8a6649456201949c7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 14 Mar 2018 10:23:21 -0700 Subject: bpf: extend stackmap to save binary_build_id+offset instead of address Currently, the bpf stackmap stores an address for each entry in the call trace. To map these addresses to user space files, it is necessary to maintain the mapping from these virtual addresses to symbols in the binary. Usually, the user space profiler (such as perf) has to scan /proc/pid/maps at the beginning of profiling, and monitor mmap2() calls afterwards. Given the cost of maintaining the address map, this solution is not practical for system-wide profiling that is always on. This patch tries to solve this problem with a variation of stackmap.
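As a rough user-space sketch of the variation described below (map geometry borrowed from the selftest added later in this series; the flag and struct are introduced in the following paragraphs), such a map is created like a regular stack trace map, but with value_size counted in units of the new struct:

    union bpf_attr attr = {
    	.map_type    = BPF_MAP_TYPE_STACK_TRACE,
    	.key_size    = sizeof(__u32),
    	.value_size  = sizeof(struct bpf_stack_build_id)
    			* PERF_MAX_STACK_DEPTH,
    	.max_entries = 128,
    	.map_flags   = BPF_F_STACK_BUILD_ID,
    };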
This variation is enabled by flag BPF_F_STACK_BUILD_ID. Instead of storing addresses, the variation stores ELF file build_id + offset. Build ID is a 20-byte unique identifier for ELF files. The following command shows the Build ID of /bin/bash: [user@]$ readelf -n /bin/bash ... Build ID: XXXXXXXXXX ... With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse the Build ID for each entry in the call trace, and translates it into the following struct: struct bpf_stack_build_id { __s32 status; unsigned char build_id[BPF_BUILD_ID_SIZE]; union { __u64 offset; __u64 ip; }; }; The search for the build_id is limited to the first page of the file, and this page should be in the page cache. Otherwise, we fall back to storing the ip for this entry (the ip field in struct bpf_stack_build_id). This requires the build_id to be stored in the first page. A quick survey of binary and dynamic library files in a few different systems shows that almost all binary and dynamic library files have the build_id in the first page. Build_id is only meaningful for user stacks. If a kernel stack is added to a stackmap with BPF_F_STACK_BUILD_ID, it will automatically fall back to only storing the ip (status == BPF_STACK_BUILD_ID_IP). Similarly, if the build_id lookup fails for some reason, it will also fall back to storing the ip. User space can access struct bpf_stack_build_id with the bpf syscall BPF_MAP_LOOKUP_ELEM. It is necessary for user space to maintain the mapping from build id to binary files. This mostly static mapping is much easier to maintain than per-process address maps. Note: Stackmap with build_id only works in non-nmi context at this time. This is because we need to take mm->mmap_sem for find_vma(). If this changes, we would like to allow build_id lookup in nmi context. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 22 ++++ kernel/bpf/stackmap.c | 257 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 257 insertions(+), 22 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a66769e5875..1e15d1724d89 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -231,6 +231,28 @@ enum bpf_attach_type { #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) +/* Flag for stack_map, store build_id+offset instead of pointer */ +#define BPF_F_STACK_BUILD_ID (1U << 5) + +enum bpf_stack_build_id_status { + /* user space need an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b0ecf43f5894..57eeb1234b67 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,16 +9,19 @@ #include #include #include +#include +#include #include "percpu_freelist.h" -#define STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ + BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; - u64 ip[]; + u64 data[]; }; struct bpf_stack_map { @@ -29,6 +32,17 @@ struct 
bpf_stack_map { struct stack_map_bucket *buckets[]; }; +static inline bool stack_map_use_build_id(struct bpf_map *map) +{ + return (map->map_flags & BPF_F_STACK_BUILD_ID); +} + +static inline int stack_map_data_size(struct bpf_map *map) +{ + return stack_map_use_build_id(map) ? + sizeof(struct bpf_stack_build_id) : sizeof(u64); +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; @@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - value_size < 8 || value_size % 8 || - value_size / 8 > sysctl_perf_event_max_stack) + value_size < 8 || value_size % 8) + return ERR_PTR(-EINVAL); + + BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); + if (attr->map_flags & BPF_F_STACK_BUILD_ID) { + if (value_size % sizeof(struct bpf_stack_build_id) || + value_size / sizeof(struct bpf_stack_build_id) + > sysctl_perf_event_max_stack) + return ERR_PTR(-EINVAL); + } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ @@ -114,13 +136,184 @@ free_smap: return ERR_PTR(err); } +#define BPF_BUILD_ID 3 +/* + * Parse build id from the note segment. This logic can be shared between + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are + * identical. + */ +static inline int stack_map_parse_build_id(void *page_addr, + unsigned char *build_id, + void *note_start, + Elf32_Word note_size) +{ + Elf32_Word note_offs = 0, new_offs; + + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + + if (nhdr->n_type == BPF_BUILD_ID && + nhdr->n_namesz == sizeof("GNU") && + nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + memcpy(build_id, + note_start + note_offs + + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), + BPF_BUILD_ID_SIZE); + return 0; + } + new_offs = note_offs + sizeof(Elf32_Nhdr) + + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + if (new_offs <= note_offs) /* overflow */ + break; + note_offs = new_offs; + } + return -EINVAL; +} + +/* Parse build ID from 32-bit ELF */ +static int stack_map_get_build_id_32(void *page_addr, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; + Elf32_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + return -EINVAL; + + phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID from 64-bit ELF */ +static int stack_map_get_build_id_64(void *page_addr, + unsigned char *build_id) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; + Elf64_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + return -EINVAL; + + phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return 
stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID of ELF file mapped to vma */ +static int stack_map_get_build_id(struct vm_area_struct *vma, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr; + struct page *page; + void *page_addr; + int ret; + + /* only works for page backed storage */ + if (!vma->vm_file) + return -EINVAL; + + page = find_get_page(vma->vm_file->f_mapping, 0); + if (!page) + return -EFAULT; /* page not mapped */ + + ret = -EINVAL; + page_addr = page_address(page); + ehdr = (Elf32_Ehdr *)page_addr; + + /* compare magic x7f "ELF" */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* only support executable file and shared object file */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) + goto out; + + if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) + ret = stack_map_get_build_id_32(page_addr, build_id); + else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) + ret = stack_map_get_build_id_64(page_addr, build_id); +out: + put_page(page); + return ret; +} + +static void stack_map_get_build_id_offset(struct bpf_map *map, + struct stack_map_bucket *bucket, + u64 *ips, u32 trace_nr, bool user) +{ + int i; + struct vm_area_struct *vma; + struct bpf_stack_build_id *id_offs; + + bucket->nr = trace_nr; + id_offs = (struct bpf_stack_build_id *)bucket->data; + + /* + * We cannot do up_read() in nmi context, so build_id lookup is + * only supported for non-nmi events. If at some point, it is + * possible to run find_vma() without taking the semaphore, we + * would like to allow build_id lookup in nmi context. + * + * Same fallback is used for kernel stack (!user) on a stackmap + * with build_id. + */ + if (!user || !current || !current->mm || in_nmi() || + down_read_trylock(¤t->mm->mmap_sem) == 0) { + /* cannot access current->mm, fall back to ips */ + for (i = 0; i < trace_nr; i++) { + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + } + return; + } + + for (i = 0; i < trace_nr; i++) { + vma = find_vma(current->mm, ips[i]); + if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + /* per entry fall back to ips */ + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + continue; + } + id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] + - vma->vm_start; + id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + } + up_read(¤t->mm->mmap_sem); +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / 8; + u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, bool user = flags & BPF_F_USER_STACK; bool kernel = !user; u64 *ips; + bool hash_matches; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) @@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); - if (bucket && bucket->hash == hash) { - if (flags & BPF_F_FAST_STACK_CMP) + hash_matches = bucket && 
bucket->hash == hash; + /* fast cmp */ + if (hash_matches && flags & BPF_F_FAST_STACK_CMP) + return id; + + if (stack_map_use_build_id(map)) { + /* for build_id+offset, pop a bucket before slow cmp */ + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + stack_map_get_build_id_offset(map, new_bucket, ips, + trace_nr, user); + trace_len = trace_nr * sizeof(struct bpf_stack_build_id); + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, new_bucket->data, trace_len) == 0) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; - if (bucket->nr == trace_nr && - memcmp(bucket->ip, ips, trace_len) == 0) + } + if (bucket && !(flags & BPF_F_REUSE_STACKID)) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); + return -EEXIST; + } + } else { + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, ips, trace_len) == 0) return id; + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + memcpy(new_bucket->data, ips, trace_len); } - /* this call stack is not in the map, try to add it */ - if (bucket && !(flags & BPF_F_REUSE_STACKID)) - return -EEXIST; - - new_bucket = (struct stack_map_bucket *) - pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; - - memcpy(new_bucket->ip, ips, trace_len); new_bucket->hash = hash; new_bucket->nr = trace_nr; @@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) if (!bucket) return -ENOENT; - trace_len = bucket->nr * sizeof(u64); - memcpy(value, bucket->ip, trace_len); + trace_len = bucket->nr * stack_map_data_size(map); + memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); -- cgit v1.2.3-59-g8ed1b From 81f77fd0deeb866d65ccc79733df8d2af5217c82 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 14 Mar 2018 10:23:22 -0700 Subject: bpf: add selftest for stackmap with BPF_F_STACK_BUILD_ID test_stacktrace_build_id() is added. It exercises the urandom_read tracepoint with "dd" and "urandom_read" and gathers stack traces. Then it reads the stack traces from the stackmap. urandom_read is a statically linked binary that reads from /dev/urandom. test_stacktrace_build_id() calls readelf to read the build ID of urandom_read and compares it with the build ID from the stackmap.
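For reference, the readelf output the test parses has the following shape (the build ID value here is illustrative):

    $ readelf -n ./urandom_read | grep 'Build ID'
        Build ID: 28a8a79abe99b6bbb27f5f0e6a6dcbedbdb7e4d5

The 40 hex characters correspond to the 20-byte build_id stored in each map entry.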
Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 22 +++ tools/testing/selftests/bpf/Makefile | 12 +- tools/testing/selftests/bpf/test_progs.c | 164 ++++++++++++++++++++- .../selftests/bpf/test_stacktrace_build_id.c | 60 ++++++++ tools/testing/selftests/bpf/urandom_read.c | 22 +++ 5 files changed, 278 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/test_stacktrace_build_id.c create mode 100644 tools/testing/selftests/bpf/urandom_read.c diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index db6bdc375126..1944d0aee7e8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -231,6 +231,28 @@ enum bpf_attach_type { #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) +/* Flag for stack_map, store build_id+offset instead of pointer */ +#define BPF_F_STACK_BUILD_ID (1U << 5) + +enum bpf_stack_build_id_status { + /* user space need an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8567a858b789..b0d29fd5e51c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -13,6 +13,14 @@ endif CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include LDLIBS += -lcap -lelf -lrt -lpthread +TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read +all: $(TEST_CUSTOM_PROGS) + +$(TEST_CUSTOM_PROGS): urandom_read + +urandom_read: urandom_read.c + $(CC) -o $(TEST_CUSTOM_PROGS) -static $< + # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user @@ -21,7 +29,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ - sample_map_ret0.o test_tcpbpf_kern.o + sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -74,3 +82,5 @@ $(OUTPUT)/%.o: %.c $(CLANG) $(CLANG_FLAGS) \ -O2 -target bpf -emit-llvm -c $< -o - | \ $(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@ + +EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 27ad5404389e..e9df48b306df 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -841,7 +841,8 @@ static void test_tp_attach_query(void) static int compare_map_keys(int map1_fd, int map2_fd) { __u32 key, next_key; - char val_buf[PERF_MAX_STACK_DEPTH * sizeof(__u64)]; + char val_buf[PERF_MAX_STACK_DEPTH * + sizeof(struct bpf_stack_build_id)]; int err; err = bpf_map_get_next_key(map1_fd, NULL, &key); @@ -964,6 +965,166 @@ out: return; } +static int 
extract_build_id(char *build_id, size_t size) +{ + FILE *fp; + char *line = NULL; + size_t len = 0; + + fp = popen("readelf -n ./urandom_read | grep 'Build ID'", "r"); + if (fp == NULL) + return -1; + + if (getline(&line, &len, fp) == -1) + goto err; + fclose(fp); + + if (len > size) + len = size; + memcpy(build_id, line, len); + build_id[len] = '\0'; + return 0; +err: + fclose(fp); + return -1; +} + +static void test_stacktrace_build_id(void) +{ + int control_map_fd, stackid_hmap_fd, stackmap_fd; + const char *file = "./test_stacktrace_build_id.o"; + int bytes, efd, err, pmu_fd, prog_fd; + struct perf_event_attr attr = {}; + __u32 key, previous_key, val, duration = 0; + struct bpf_object *obj; + char buf[256]; + int i, j; + struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH]; + int build_id_matches = 0; + + err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); + if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) + goto out; + + /* Get the ID for the random/urandom_read tracepoint */ + snprintf(buf, sizeof(buf), + "/sys/kernel/debug/tracing/events/random/urandom_read/id"); + efd = open(buf, O_RDONLY, 0); + if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) + goto close_prog; + + bytes = read(efd, buf, sizeof(buf)); + close(efd); + if (CHECK(bytes <= 0 || bytes >= sizeof(buf), + "read", "bytes %d errno %d\n", bytes, errno)) + goto close_prog; + + /* Open the perf event and attach the bpf program */ + attr.config = strtol(buf, NULL, 0); + attr.type = PERF_TYPE_TRACEPOINT; + attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN; + attr.sample_period = 1; + attr.wakeup_events = 1; + pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, + 0 /* cpu 0 */, -1 /* group id */, + 0 /* flags */); + if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", + pmu_fd, errno)) + goto close_prog; + + err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); + if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", + err, errno)) + goto close_pmu; + + err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); + if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", + err, errno)) + goto disable_pmu; + + /* find map fds */ + control_map_fd = bpf_find_map(__func__, obj, "control_map"); + if (CHECK(control_map_fd < 0, "bpf_find_map control_map", + "err %d errno %d\n", err, errno)) + goto disable_pmu; + + stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); + if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap", + "err %d errno %d\n", err, errno)) + goto disable_pmu; + + stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); + if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n", + err, errno)) + goto disable_pmu; + + assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null") + == 0); + assert(system("./urandom_read if=/dev/urandom of=/dev/zero count=4 2> /dev/null") == 0); + /* disable stack trace collection */ + key = 0; + val = 1; + bpf_map_update_elem(control_map_fd, &key, &val, 0); + + /* for every element in stackid_hmap, we can find a corresponding one + * in stackmap, and vice versa. + */ + err = compare_map_keys(stackid_hmap_fd, stackmap_fd); + if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", + "err %d errno %d\n", err, errno)) + goto disable_pmu; + + err = compare_map_keys(stackmap_fd, stackid_hmap_fd); + if (CHECK(err, "compare_map_keys stackmap vs. 
stackid_hmap", + "err %d errno %d\n", err, errno)) + goto disable_pmu; + + err = extract_build_id(buf, 256); + + if (CHECK(err, "get build_id with readelf", + "err %d errno %d\n", err, errno)) + goto disable_pmu; + + err = bpf_map_get_next_key(stackmap_fd, NULL, &key); + if (CHECK(err, "get_next_key from stackmap", + "err %d, errno %d\n", err, errno)) + goto disable_pmu; + + do { + char build_id[64]; + + err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs); + if (CHECK(err, "lookup_elem from stackmap", + "err %d, errno %d\n", err, errno)) + goto disable_pmu; + for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i) + if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID && + id_offs[i].offset != 0) { + for (j = 0; j < 20; ++j) + sprintf(build_id + 2 * j, "%02x", + id_offs[i].build_id[j] & 0xff); + if (strstr(buf, build_id) != NULL) + build_id_matches = 1; + } + previous_key = key; + } while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0); + + CHECK(build_id_matches < 1, "build id match", + "Didn't find expected build ID from the map"); + +disable_pmu: + ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE); + +close_pmu: + close(pmu_fd); + +close_prog: + bpf_object__close(obj); + +out: + return; +} + int main(void) { test_pkt_access(); @@ -976,6 +1137,7 @@ int main(void) test_obj_name(); test_tp_attach_query(); test_stacktrace_map(); + test_stacktrace_build_id(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; diff --git a/tools/testing/selftests/bpf/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/test_stacktrace_build_id.c new file mode 100644 index 000000000000..b755bd783ce5 --- /dev/null +++ b/tools/testing/selftests/bpf/test_stacktrace_build_id.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include +#include "bpf_helpers.h" + +#ifndef PERF_MAX_STACK_DEPTH +#define PERF_MAX_STACK_DEPTH 127 +#endif + +struct bpf_map_def SEC("maps") control_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") stackid_hmap = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 10000, +}; + +struct bpf_map_def SEC("maps") stackmap = { + .type = BPF_MAP_TYPE_STACK_TRACE, + .key_size = sizeof(__u32), + .value_size = sizeof(struct bpf_stack_build_id) + * PERF_MAX_STACK_DEPTH, + .max_entries = 128, + .map_flags = BPF_F_STACK_BUILD_ID, +}; + +/* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */ +struct random_urandom_args { + unsigned long long pad; + int got_bits; + int pool_left; + int input_left; +}; + +SEC("tracepoint/random/urandom_read") +int oncpu(struct random_urandom_args *args) +{ + __u32 key = 0, val = 0, *value_p; + + value_p = bpf_map_lookup_elem(&control_map, &key); + if (value_p && *value_p) + return 0; /* skip if non-zero *value_p */ + + /* The size of stackmap and stackid_hmap should be the same */ + key = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK); + if ((int)key >= 0) + bpf_map_update_elem(&stackid_hmap, &key, &val, 0); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */ diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c new file mode 100644 index 000000000000..4acfdebf36fa --- /dev/null +++ b/tools/testing/selftests/bpf/urandom_read.c @@ -0,0 +1,22 @@ +#include 
+#include +#include +#include +#include +#include + +#define BUF_SIZE 256 +int main(void) +{ + int fd = open("/dev/urandom", O_RDONLY); + int i; + char buf[BUF_SIZE]; + + if (fd < 0) + return 1; + for (i = 0; i < 4; ++i) + read(fd, buf, BUF_SIZE); + + close(fd); + return 0; +} -- cgit v1.2.3-59-g8ed1b From 90126e3a40843de369a496db160ab22bdd6f4c48 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Mar 2018 23:26:14 -0700 Subject: tools: bpftool: fix dependency file path Auto-generated dependency files are in the OUTPUT directory, we need to include them from there. This fixes object files not being rebuilt after header changes. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 4c2867481f5c..50a715ac9e8b 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -70,7 +70,7 @@ ifeq ($(feature-disassembler-four-args), 1) CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE endif -include $(wildcard *.d) +include $(wildcard $(OUTPUT)*.d) all: $(OUTPUT)bpftool -- cgit v1.2.3-59-g8ed1b From d5fc73dceb060688f01c2ee225a2b42cf47352ba Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Mar 2018 23:26:15 -0700 Subject: tools: bpftool: fix potential format truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC 7 complains: xlated_dumper.c: In function ‘print_call’: xlated_dumper.c:179:10: warning: ‘%s’ directive output may be truncated writing up to 255 bytes into a region of size between 249 and 253 [-Wformat-truncation=] "%+d#%s", insn->off, sym->name); Add a bit more space to the buffer so it can handle the entire string and integer without truncation. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/xlated_dumper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/xlated_dumper.h b/tools/bpf/bpftool/xlated_dumper.h index 51c935d38ae2..b34affa7ef2d 100644 --- a/tools/bpf/bpftool/xlated_dumper.h +++ b/tools/bpf/bpftool/xlated_dumper.h @@ -49,7 +49,7 @@ struct dump_data { unsigned long address_call_base; struct kernel_sym *sym_mapping; __u32 sym_count; - char scratch_buff[SYM_MAX_NAME]; + char scratch_buff[SYM_MAX_NAME + 8]; }; void kernel_syms_load(struct dump_data *dd); -- cgit v1.2.3-59-g8ed1b From 8050ea4653c21e14681d9e00390c6886556a2640 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Mar 2018 23:26:16 -0700 Subject: tools: bpf: cleanup PHONY target There is no FORCE target in the Makefile and some of the PHONY targets are missing, update the list. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index c07b4e718eeb..acfaf4c8cc32 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -100,4 +100,4 @@ bpftool_install: bpftool_clean: $(call descend,bpftool,clean) -.PHONY: bpftool FORCE +.PHONY: all install clean bpftool bpftool_install bpftool_clean -- cgit v1.2.3-59-g8ed1b From cc5b3403f0248ca5f40539d33f0e127ccca25dd2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Mar 2018 23:26:17 -0700 Subject: tools: bpf: remove feature detection output bpf tools use feature detection for libbfd dependency, clean up the output files on make clean. 
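For context, the feature probes print a summary and record their results in FEATURE-DUMP files in the output directory; the build output looks roughly like this (abridged, from memory of the tools/build feature display):

    Auto-detecting system features:
    ...                        libbfd: [ on  ]
    ...       disassembler-four-args: [ on  ]

It is these FEATURE-DUMP.bpf / FEATURE-DUMP.bpftool files that the clean targets below now remove.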
Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/bpf/Makefile | 2 ++ tools/bpf/bpftool/Makefile | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index acfaf4c8cc32..1ea545965ee3 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -81,6 +81,8 @@ clean: bpftool_clean $(call QUIET_CLEAN, bpf-progs) $(Q)rm -rf $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \ $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.* + $(call QUIET_CLEAN, core-gen) + $(Q)rm -f $(OUTPUT)FEATURE-DUMP.bpf install: $(PROGS) bpftool_install $(call QUIET_INSTALL, bpf_jit_disasm) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 50a715ac9e8b..4e69782c4a79 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -89,6 +89,8 @@ $(OUTPUT)%.o: %.c clean: $(LIBBPF)-clean $(call QUIET_CLEAN, bpftool) $(Q)$(RM) $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d + $(call QUIET_CLEAN, core-gen) + $(Q)$(RM) $(OUTPUT)FEATURE-DUMP.bpftool install: $(OUTPUT)bpftool $(call QUIET_INSTALL, bpftool) -- cgit v1.2.3-59-g8ed1b From 2c3682f0be97a5f57c6c8b40fa154dfc77efb461 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:56:49 -0700 Subject: sock: make static tls function alloc_sg generic sock helper The TLS ULP module builds scatterlists from a sock using page_frag_refill(). This is going to be useful for other ULPs so move it into sock file for more general use. In the process remove useless goto at end of while loop. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/sock.h | 4 ++++ net/core/sock.c | 56 ++++++++++++++++++++++++++++++++++++++++++++ net/tls/tls_sw.c | 69 ++++++------------------------------------------------ 3 files changed, 67 insertions(+), 62 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index b9624581d639..447150c51feb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2141,6 +2141,10 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); +int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, + int *sg_num_elem, unsigned int *sg_size, + int first_coalesce); + /* * Default write policy as shown to user space via poll/select/SIGIO */ diff --git a/net/core/sock.c b/net/core/sock.c index 27f218bba43f..f68dff0d7bc4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2239,6 +2239,62 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); +int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, + int *sg_num_elem, unsigned int *sg_size, + int first_coalesce) +{ + struct page_frag *pfrag; + unsigned int size = *sg_size; + int num_elem = *sg_num_elem, use = 0, rc = 0; + struct scatterlist *sge; + unsigned int orig_offset; + + len -= size; + pfrag = sk_page_frag(sk); + + while (len > 0) { + if (!sk_page_frag_refill(sk, pfrag)) { + rc = -ENOMEM; + goto out; + } + + use = min_t(int, len, pfrag->size - pfrag->offset); + + if (!sk_wmem_schedule(sk, use)) { + rc = -ENOMEM; + goto out; + } + + sk_mem_charge(sk, use); + size += use; + orig_offset = pfrag->offset; + pfrag->offset += use; + + sge = sg + num_elem - 1; + if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && + sg->offset + sg->length == orig_offset) { + sg->length += use; + } else { + sge++; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); 
+ ++num_elem; + if (num_elem == MAX_SKB_FRAGS) { + rc = -ENOSPC; + break; + } + } + + len -= use; + } +out: + *sg_size = size; + *sg_num_elem = num_elem; + return rc; +} +EXPORT_SYMBOL(sk_alloc_sg); + static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f26376e954ae..0fc8a24c6473 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -87,71 +87,16 @@ static void trim_both_sgl(struct sock *sk, int target_size) target_size); } -static int alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, - int first_coalesce) -{ - struct page_frag *pfrag; - unsigned int size = *sg_size; - int num_elem = *sg_num_elem, use = 0, rc = 0; - struct scatterlist *sge; - unsigned int orig_offset; - - len -= size; - pfrag = sk_page_frag(sk); - - while (len > 0) { - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + num_elem - 1; - if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && - sg->offset + sg->length == orig_offset) { - sg->length += use; - } else { - sge++; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - ++num_elem; - if (num_elem == MAX_SKB_FRAGS) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } - goto out; - -out: - *sg_size = size; - *sg_num_elem = num_elem; - return rc; -} - static int alloc_encrypted_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = alloc_sg(sk, len, ctx->sg_encrypted_data, - &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0); + rc = sk_alloc_sg(sk, len, + ctx->sg_encrypted_data, + &ctx->sg_encrypted_num_elem, + &ctx->sg_encrypted_size, 0); return rc; } @@ -162,9 +107,9 @@ static int alloc_plaintext_sg(struct sock *sk, int len) struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = alloc_sg(sk, len, ctx->sg_plaintext_data, - &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, - tls_ctx->pending_open_record_frags); + rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, + &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, + tls_ctx->pending_open_record_frags); return rc; } -- cgit v1.2.3-59-g8ed1b From ffa35660017161524b7db79b0ce293fa1af16c4d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:56:54 -0700 Subject: sockmap: convert refcnt to an atomic refcnt The sockmap refcnt has up until now been wrapped in the sk_callback_lock(), so it has not actually needed any locking of its own. The counter itself tracks the lifetime of the psock object. Sockets in a sockmap have a lifetime that is independent of the map they are part of. This is possible because a single socket may be in multiple maps. When this happens we can only release the psock data associated with the socket when the refcnt reaches zero. There are three possible paths that decrement a sock reference: first, through the normal sockmap process, where the user deletes the socket from the map. Second, the map is removed and all sockets in the map are removed; this delete path is similar to case 1. The third case is an asynchronous socket event, such as the socket being closed.
The last case handles removing sockets that are no longer available. For completeness, although inc does not pose any problems in this patch series, the inc case only happens when a psock is added to a map. Next we plan to add another socket prog type to handle policy and monitoring on the TX path. When we do this however we will need to keep a reference count open across the sendmsg/sendpage call and holding the sk_callback_lock() here (on every send) seems less than ideal, also it may sleep in cases where we hit memory pressure. Instead of dealing with these issues in some clever way simply make the reference counting a refcnt_t type and do proper atomic ops. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a927e89dad6e..051b224270e3 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -62,8 +62,7 @@ struct smap_psock_map_entry { struct smap_psock { struct rcu_head rcu; - /* refcnt is used inside sk_callback_lock */ - u32 refcnt; + refcount_t refcnt; /* datapath variables */ struct sk_buff_head rxqueue; @@ -373,15 +372,13 @@ static void smap_destroy_psock(struct rcu_head *rcu) static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { - psock->refcnt--; - if (psock->refcnt) - return; - - tcp_cleanup_ulp(sock); - smap_stop_sock(psock, sock); - clear_bit(SMAP_TX_RUNNING, &psock->state); - rcu_assign_sk_user_data(sock, NULL); - call_rcu_sched(&psock->rcu, smap_destroy_psock); + if (refcount_dec_and_test(&psock->refcnt)) { + tcp_cleanup_ulp(sock); + smap_stop_sock(psock, sock); + clear_bit(SMAP_TX_RUNNING, &psock->state); + rcu_assign_sk_user_data(sock, NULL); + call_rcu_sched(&psock->rcu, smap_destroy_psock); + } } static int smap_parse_func_strparser(struct strparser *strp, @@ -511,7 +508,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); INIT_LIST_HEAD(&psock->maps); - psock->refcnt = 1; + refcount_set(&psock->refcnt, 1); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -772,7 +769,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - psock->refcnt++; + refcount_inc(&psock->refcnt); } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { -- cgit v1.2.3-59-g8ed1b From 312fc2b4c82e96a48cb2d0da2bd4816eb253c499 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:00 -0700 Subject: net: do_tcp_sendpages flag to avoid SKBTX_SHARED_FRAG When calling do_tcp_sendpages() from in kernel and we know the data has no references from user side we can omit SKBTX_SHARED_FRAG flag. This patch adds an internal flag, NO_SKBTX_SHARED_FRAG that can be used to omit setting SKBTX_SHARED_FRAG. The flag is not exposed to userspace because the sendpage call from the splice logic masks out all bits except MSG_MORE. Signed-off-by: John Fastabend Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- include/linux/socket.h | 1 + net/ipv4/tcp.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 1ce1f768a58c..60e01482a9c4 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -287,6 +287,7 @@ struct ucred { #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN +#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fb350f740f69..f90ec24c2cc8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -994,7 +994,9 @@ new_segment: get_page(page); skb_fill_page_desc(skb, i, page, offset, copy); } - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; skb->len += copy; skb->data_len += copy; -- cgit v1.2.3-59-g8ed1b From 8c05dbf04b2882c3c0bc43fe7668c720210877f3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:05 -0700 Subject: net: generalize sk_alloc_sg to work with scatterlist rings The current implementation of sk_alloc_sg expects scatterlist to always start at entry 0 and complete at entry MAX_SKB_FRAGS. Future patches will want to support starting at arbitrary offset into scatterlist so add an additional sg_start parameters and then default to the current values in TLS code paths. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/sock.h | 2 +- net/core/sock.c | 27 ++++++++++++++++----------- net/tls/tls_sw.c | 4 ++-- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 447150c51feb..b7c75e024e37 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2142,7 +2142,7 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, + int sg_start, int *sg_curr, unsigned int *sg_size, int first_coalesce); /* diff --git a/net/core/sock.c b/net/core/sock.c index f68dff0d7bc4..4f92c2910200 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2240,19 +2240,20 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) EXPORT_SYMBOL(sk_page_frag_refill); int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, + int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, int first_coalesce) { + int sg_curr = *sg_curr_index, use = 0, rc = 0; + unsigned int size = *sg_curr_size; struct page_frag *pfrag; - unsigned int size = *sg_size; - int num_elem = *sg_num_elem, use = 0, rc = 0; struct scatterlist *sge; - unsigned int orig_offset; len -= size; pfrag = sk_page_frag(sk); while (len > 0) { + unsigned int orig_offset; + if (!sk_page_frag_refill(sk, pfrag)) { rc = -ENOMEM; goto out; @@ -2270,17 +2271,21 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, orig_offset = pfrag->offset; pfrag->offset += use; - sge = sg + num_elem - 1; - if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && + sge = sg + sg_curr - 1; + if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page 
&& sg->offset + sg->length == orig_offset) { sg->length += use; } else { - sge++; + sge = sg + sg_curr; sg_unmark_end(sge); sg_set_page(sge, pfrag->page, use, orig_offset); get_page(pfrag->page); - ++num_elem; - if (num_elem == MAX_SKB_FRAGS) { + sg_curr++; + + if (sg_curr == MAX_SKB_FRAGS) + sg_curr = 0; + + if (sg_curr == sg_start) { rc = -ENOSPC; break; } @@ -2289,8 +2294,8 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, len -= use; } out: - *sg_size = size; - *sg_num_elem = num_elem; + *sg_curr_size = size; + *sg_curr_index = sg_curr; return rc; } EXPORT_SYMBOL(sk_alloc_sg); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0fc8a24c6473..057a558ed6d7 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -94,7 +94,7 @@ static int alloc_encrypted_sg(struct sock *sk, int len) int rc = 0; rc = sk_alloc_sg(sk, len, - ctx->sg_encrypted_data, + ctx->sg_encrypted_data, 0, &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, @@ -107,7 +107,7 @@ static int alloc_plaintext_sg(struct sock *sk, int len) struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, + rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, tls_ctx->pending_open_record_frags); -- cgit v1.2.3-59-g8ed1b From 4f738adba30a7cfc006f605707e7aee847ffefa0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:10 -0700 Subject: bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data This implements a BPF ULP layer to allow policy enforcement and monitoring at the socket layer. In order to support this, a new program type BPF_PROG_TYPE_SK_MSG is used to run the policy at the sendmsg/sendpage hook. To attach the policy to sockets, a sockmap is used with a new program attach type BPF_SK_MSG_VERDICT. Similar to previous sockmap usages, when a sock is added to a sockmap via a map update and the map has a BPF_SK_MSG_VERDICT program type attached, then the BPF ULP layer is created on the socket and the attached BPF_PROG_TYPE_SK_MSG program is run for every msg in the sendmsg case and for every page/offset in the sendpage case. BPF_PROG_TYPE_SK_MSG Semantics/API: BPF_PROG_TYPE_SK_MSG supports only two return codes, SK_PASS and SK_DROP. Returning SK_DROP frees the copied data in the sendmsg case and in the sendpage case leaves the data untouched. Both cases return -EACCES to the user. Returning SK_PASS will allow the msg to be sent. In the sendmsg case data is copied into kernel space buffers before running the BPF program. The kernel space buffers are stored in a scatterlist object where each element is a kernel memory buffer. Some effort is made to coalesce data from the sendmsg call here. For example a sendmsg call with many one byte iov entries will likely be pushed into a single entry. The BPF program is run with data pointers (start/end) pointing to the first sg element. In the sendpage case data is not copied. We opt not to copy the data by default here, because the BPF infrastructure does not know what bytes will be needed nor when they will be needed. So copying all bytes may be wasteful. Because of this the initial start/end data pointers are (0,0), meaning no data can be read or written. This avoids reading data that may be modified by the user. A new helper is added later in this series if reading and writing the data is needed. The helper call will do a copy by default so that the page is exclusively owned by the BPF call.
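As an illustration of these data-pointer semantics, a minimal SK_MSG program could look like the following sketch (it relies only on the sk_msg_md context and the SK_PASS/SK_DROP codes this patch introduces; the section name and the drop policy are invented for the example):

  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  SEC("sk_msg")
  int msg_verdict(struct sk_msg_md *msg)
  {
  	void *data = (void *)(long)msg->data;
  	void *data_end = (void *)(long)msg->data_end;

  	/* On the sendpage path data == data_end == 0, so this bounds
  	 * check fails and the msg passes without touching the payload.
  	 */
  	if (data + 4 > data_end)
  		return SK_PASS;

  	/* Invented policy: drop messages whose first byte is 0xff */
  	if (((unsigned char *)data)[0] == 0xff)
  		return SK_DROP;

  	return SK_PASS;
  }

  char _license[] SEC("license") = "GPL";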
The verdict from the BPF_PROG_TYPE_SK_MSG applies to the entire msg in the sendmsg() case and the entire page/offset in the sendpage case. This avoids ambiguity on how to handle mixed return codes in the sendmsg case. Again a helper is added later in the series if a verdict needs to apply to multiple system calls and/or only a subpart of the currently being processed message. The helper msg_redirect_map() can be used to select the socket to send the data on. This is used similar to existing redirect use cases. This allows policy to redirect msgs. Pseudo code simple example: The basic logic to attach a program to a socket is as follows, // load the programs bpf_prog_load(SOCKMAP_TCP_MSG_PROG, BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog); // lookup the sockmap bpf_map_msg = bpf_object__find_map_by_name(obj, "my_sock_map"); // get fd for sockmap map_fd_msg = bpf_map__fd(bpf_map_msg); // attach program to sockmap bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0); Adding sockets to the map is done in the normal way, // Add a socket 'fd' to sockmap at location 'i' bpf_map_update_elem(map_fd_msg, &i, fd, BPF_ANY); After the above any socket attached to "my_sock_map", in this case 'fd', will run the BPF msg verdict program (msg_prog) on every sendmsg and sendpage system call. For a complete example see BPF selftests or sockmap samples. Implementation notes: It seemed the simplest, to me at least, to use a refcnt to ensure psock is not lost across the sendmsg copy into the sg, the bpf program running on the data in sg_data, and the final pass to the TCP stack. Some performance testing may show a better method to do this and avoid the refcnt cost, but for now use the simpler method. Another item that will come after basic support is in place is supporting MSG_MORE flag. At the moment we call sendpages even if the MSG_MORE flag is set. An enhancement would be to collect the pages into a larger scatterlist and pass down the stack. Notice that bpf_tcp_sendmsg() could support this with some additional state saved across sendmsg calls. I built the code to support this without having to do refactoring work. Other features TBD include ZEROCOPY and the TCP_RECV_QUEUE/TCP_NO_QUEUE support. This will follow initial series shortly. Future work could improve size limits on the scatterlist rings used here. Currently, we use MAX_SKB_FRAGS simply because this was being used already in the TLS case. Future work could extend the kernel sk APIs to tune this depending on workload. This is a trade-off between memory usage and throughput performance. Signed-off-by: John Fastabend Acked-by: David S. 
Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/linux/filter.h | 17 ++ include/uapi/linux/bpf.h | 22 +- kernel/bpf/sockmap.c | 712 +++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 14 +- kernel/bpf/verifier.c | 5 +- net/core/filter.c | 106 +++++++ 8 files changed, 857 insertions(+), 21 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 66df387106de..819229c80eca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -21,6 +21,7 @@ struct bpf_verifier_env; struct perf_event; struct bpf_prog; struct bpf_map; +struct sock; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 19b8349a3809..5e2e8a49fb21 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -13,6 +13,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) diff --git a/include/linux/filter.h b/include/linux/filter.h index fdb691b520c0..109d05ccea9a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -507,6 +507,22 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct sk_msg_buff { + void *data; + void *data_end; + __u32 apply_bytes; + __u32 cork_bytes; + int sg_copybreak; + int sg_start; + int sg_curr; + int sg_end; + struct scatterlist sg_data[MAX_SKB_FRAGS]; + bool sg_copy[MAX_SKB_FRAGS]; + __u32 key; + __u32 flags; + struct bpf_map *map; +}; + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) @@ -771,6 +787,7 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); +struct sock *do_msg_redirect_map(struct sk_msg_buff *md); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1e15d1724d89..ef529539abde 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, }; enum bpf_attach_type { @@ -143,6 +144,7 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -718,6 +720,15 @@ union bpf_attr { * int bpf_override_return(pt_regs, rc) * @pt_regs: pointer to struct pt_regs * @rc: the return value to set + * + * int bpf_msg_redirect_map(map, key, flags) + * Redirect msg to a sock in map using key as a lookup key for the + * sock in map. 
+ * @map: pointer to sockmap + * @key: key to lookup sock in map + * @flags: reserved for future use + * Return: SK_PASS + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -779,7 +790,8 @@ union bpf_attr { FN(perf_prog_read_value), \ FN(getsockopt), \ FN(override_return), \ - FN(sock_ops_cb_flags_set), + FN(sock_ops_cb_flags_set), \ + FN(msg_redirect_map), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -942,6 +954,14 @@ enum sk_action { SK_PASS, }; +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + void *data; + void *data_end; +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 051b224270e3..69c5bccabd22 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; @@ -73,7 +75,16 @@ struct smap_psock { int save_off; struct sk_buff *save_skb; + /* datapath variables for tx_msg ULP */ + struct sock *sk_redir; + int apply_bytes; + int cork_bytes; + int sg_size; + int eval; + struct sk_msg_buff *cork; + struct strparser strp; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; @@ -91,6 +102,11 @@ struct smap_psock { void (*save_write_space)(struct sock *sk); }; +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); + static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { return rcu_dereference_sk_user_data(sk); @@ -115,27 +131,41 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; + + if (psock->bpf_tx_msg) { + tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; + tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + } + sk->sk_prot = &tcp_bpf_proto; rcu_read_unlock(); return 0; } +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); + static void bpf_tcp_release(struct sock *sk) { struct smap_psock *psock; rcu_read_lock(); psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; - if (likely(psock)) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; } + + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; +out: rcu_read_unlock(); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -174,6 +204,7 @@ enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, + __SK_NONE, }; static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { @@ -185,10 +216,621 @@ static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { .release = bpf_tcp_release, }; +static int memcopy_from_iter(struct sock *sk, + struct sk_msg_buff *md, + struct iov_iter *from, int bytes) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_curr, rc = -ENOSPC; + + do { + int copy; + 
char *to; + + if (md->sg_copybreak >= sg[i].length) { + md->sg_copybreak = 0; + + if (++i == MAX_SKB_FRAGS) + i = 0; + + if (i == md->sg_end) + break; + } + + copy = sg[i].length - md->sg_copybreak; + to = sg_virt(&sg[i]) + md->sg_copybreak; + md->sg_copybreak += copy; + + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + rc = copy_from_iter_nocache(to, copy, from); + else + rc = copy_from_iter(to, copy, from); + + if (rc != copy) { + rc = -EFAULT; + goto out; + } + + bytes -= copy; + if (!bytes) + break; + + md->sg_copybreak = 0; + if (++i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +out: + md->sg_curr = i; + return rc; +} + +static int bpf_tcp_push(struct sock *sk, int apply_bytes, + struct sk_msg_buff *md, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sg; + int offset, ret = 0; + struct page *p; + size_t size; + + while (1) { + sg = md->sg_data + md->sg_start; + size = (apply && apply_bytes < sg->length) ? + apply_bytes : sg->length; + offset = sg->offset; + + tcp_rate_check_app_limited(sk); + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret != size) { + if (ret > 0) { + if (apply) + apply_bytes -= ret; + size -= ret; + offset += ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + goto retry; + } + + sg->length = size; + sg->offset = offset; + return ret; + } + + if (apply) + apply_bytes -= ret; + sg->offset += ret; + sg->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + + if (!sg->length) { + put_page(p); + md->sg_start++; + if (md->sg_start == MAX_SKB_FRAGS) + md->sg_start = 0; + memset(sg, 0, sizeof(*sg)); + + if (md->sg_start == md->sg_end) + break; + } + + if (apply && !apply_bytes) + break; + } + return 0; +} + +static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data + md->sg_start; + + if (md->sg_copy[md->sg_start]) { + md->data = md->data_end = 0; + } else { + md->data = sg_virt(sg); + md->data_end = md->data + sg->length; + } +} + +static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start; + + do { + int uncharge = (bytes < sg[i].length) ? 
bytes : sg[i].length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + if (!bytes) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +} + +static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start, free; + + while (bytes && sg[i].length) { + free = sg[i].length; + if (bytes < free) { + sg[i].length -= bytes; + sg[i].offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + bytes -= sg[i].length; + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } +} + +static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = start, free = 0; + + while (sg[i].length) { + free += sg[i].length; + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } + + return free; +} + +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +{ + int free = free_sg(sk, md->sg_start, md); + + md->sg_start = md->sg_end; + return free; +} + +static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) +{ + return free_sg(sk, md->sg_curr, md); +} + +static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) +{ + return ((_rc == SK_PASS) ? + (md->map ? __SK_REDIRECT : __SK_PASS) : + __SK_DROP); +} + +static unsigned int smap_do_tx_msg(struct sock *sk, + struct smap_psock *psock, + struct sk_msg_buff *md) +{ + struct bpf_prog *prog; + unsigned int rc, _rc; + + preempt_disable(); + rcu_read_lock(); + + /* If the policy was removed mid-send then default to 'accept' */ + prog = READ_ONCE(psock->bpf_tx_msg); + if (unlikely(!prog)) { + _rc = SK_PASS; + goto verdict; + } + + bpf_compute_data_pointers_sg(md); + rc = (*prog->bpf_func)(md, prog->insnsi); + psock->apply_bytes = md->apply_bytes; + + /* Moving return codes from UAPI namespace into internal namespace */ + _rc = bpf_map_msg_verdict(rc, md); + + /* The psock has a refcount on the sock but not on the map and because + * we need to drop rcu read lock here its possible the map could be + * removed between here and when we need it to execute the sock + * redirect. So do the map lookup now for future use. 
+ */ + if (_rc == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = do_msg_redirect_map(md); + if (!psock->sk_redir) { + _rc = __SK_DROP; + goto verdict; + } + sock_hold(psock->sk_redir); + } +verdict: + rcu_read_unlock(); + preempt_enable(); + + return _rc; +} + +static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, + struct sk_msg_buff *md, + int flags) +{ + struct smap_psock *psock; + struct scatterlist *sg; + int i, err, free = 0; + + sg = md->sg_data; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out_rcu; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto out_rcu; + + rcu_read_unlock(); + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + smap_release_sock(psock, sk); + if (unlikely(err)) + goto out; + return 0; +out_rcu: + rcu_read_unlock(); +out: + i = md->sg_start; + while (sg[i].length) { + free += sg[i].length; + put_page(sg_page(&sg[i])); + sg[i].length = 0; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } + return free; +} + +static inline void bpf_md_init(struct smap_psock *psock) +{ + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } +} + +static void apply_bytes_dec(struct smap_psock *psock, int i) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < i) + psock->apply_bytes = 0; + else + psock->apply_bytes -= i; + } +} + +static int bpf_exec_tx_verdict(struct smap_psock *psock, + struct sk_msg_buff *m, + struct sock *sk, + int *copied, int flags) +{ + bool cork = false, enospc = (m->sg_start == m->sg_end); + struct sock *redir; + int err = 0; + int send; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = smap_do_tx_msg(sk, psock, m); + + if (m->cork_bytes && + m->cork_bytes > psock->sg_size && !enospc) { + psock->cork_bytes = m->cork_bytes - psock->sg_size; + if (!psock->cork) { + psock->cork = kcalloc(1, + sizeof(struct sk_msg_buff), + GFP_ATOMIC | __GFP_NOWARN); + + if (!psock->cork) { + err = -ENOMEM; + goto out_err; + } + } + memcpy(psock->cork, m, sizeof(*m)); + goto out_err; + } + + send = psock->sg_size; + if (psock->apply_bytes && psock->apply_bytes < send) + send = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = bpf_tcp_push(sk, send, m, flags, true); + if (unlikely(err)) { + *copied -= free_start_sg(sk, m); + break; + } + + apply_bytes_dec(psock, send); + psock->sg_size -= send; + break; + case __SK_REDIRECT: + redir = psock->sk_redir; + apply_bytes_dec(psock, send); + + if (psock->cork) { + cork = true; + psock->cork = NULL; + } + + return_mem_sg(sk, send, m); + release_sock(sk); + + err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); + lock_sock(sk); + + if (cork) { + free_start_sg(sk, m); + kfree(m); + m = NULL; + } + if (unlikely(err)) + *copied -= err; + else + psock->sg_size -= send; + break; + case __SK_DROP: + default: + free_bytes_sg(sk, send, m); + apply_bytes_dec(psock, send); + *copied -= send; + psock->sg_size -= send; + err = -EACCES; + break; + } + + if (likely(!err)) { + bpf_md_init(psock); + if (m && + m->sg_data[m->sg_start].page_link && + m->sg_data[m->sg_start].length) + goto more_data; + } + +out_err: + return err; +} + +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + struct sk_msg_buff md = {0}; + unsigned int sg_copy = 0; + struct smap_psock *psock; + int copied = 0, err = 0; + struct scatterlist *sg; 
+ long timeo; + + /* Its possible a sock event or user removed the psock _but_ the ops + * have not been reprogrammed yet so we get here. In this case fallback + * to tcp_sendmsg. Note this only works because we _only_ ever allow + * a single ULP there is no hierarchy here. + */ + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + /* Increment the psock refcnt to ensure its not released while sending a + * message. Required because sk lookup and bpf programs are used in + * separate rcu critical sections. Its OK if we lose the map entry + * but we can't lose the sock reference. + */ + if (!refcount_inc_not_zero(&psock->refcnt)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + sg = md.sg_data; + sg_init_table(sg, MAX_SKB_FRAGS); + rcu_read_unlock(); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + while (msg_data_left(msg)) { + struct sk_msg_buff *m; + bool enospc = false; + int copy; + + if (sk->sk_err) { + err = sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + m = psock->cork_bytes ? psock->cork : &md; + m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; + err = sk_alloc_sg(sk, copy, m->sg_data, + m->sg_start, &m->sg_end, &sg_copy, + m->sg_end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = sg_copy; + } + + err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); + if (err < 0) { + free_curr_sg(sk, m); + goto out_err; + } + + psock->sg_size += copy; + copied += copy; + sg_copy = 0; + + /* When bytes are being corked skip running BPF program and + * applying verdict unless there is no more buffer space. In + * the ENOSPC case simply run BPF prorgram with currently + * accumulated data. We don't have much choice at this point + * we could try extending the page frags or chaining complex + * frags but even in these cases _eventually_ we will hit an + * OOM scenario. More complex recovery schemes may be + * implemented in the future, but BPF programs must handle + * the case where apply_cork requests are not honored. The + * canonical method to verify this is to check data length. + */ + if (psock->cork_bytes) { + if (copy > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= copy; + + if (psock->cork_bytes && !enospc) + goto out_cork; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_err; + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); +out_cork: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? 
copied : err; +} + +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct sk_msg_buff md = {0}, *m = NULL; + int err = 0, copied = 0; + struct smap_psock *psock; + struct scatterlist *sg; + bool enospc = false; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto accept; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto accept; + rcu_read_unlock(); + + lock_sock(sk); + + if (psock->cork_bytes) + m = psock->cork; + else + m = &md; + + /* Catch case where ring is full and sendpage is stalled. */ + if (unlikely(m->sg_end == m->sg_start && + m->sg_data[m->sg_end].length)) + goto out_err; + + psock->sg_size += size; + sg = &m->sg_data[m->sg_end]; + sg_set_page(sg, page, size, offset); + get_page(page); + m->sg_copy[m->sg_end] = true; + sk_mem_charge(sk, size); + m->sg_end++; + copied = size; + + if (m->sg_end == MAX_SKB_FRAGS) + m->sg_end = 0; + + if (m->sg_end == m->sg_start) + enospc = true; + + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + + if (psock->cork_bytes && !enospc) + goto out_err; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); +out_err: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +accept: + rcu_read_unlock(); + return tcp_sendpage(sk, page, offset, size, flags); +} + +static void bpf_tcp_msg_add(struct smap_psock *psock, + struct sock *sk, + struct bpf_prog *tx_msg) +{ + struct bpf_prog *orig_tx_msg; + + orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); + if (orig_tx_msg) + bpf_prog_put(orig_tx_msg); +} + static int bpf_tcp_ulp_register(void) { tcp_bpf_proto = tcp_prot; tcp_bpf_proto.close = bpf_tcp_close; + /* Once BPF TX ULP is registered it is never unregistered. It + * will be in the ULP list for the lifetime of the system. Doing + * duplicate registers is not a problem. 
+ */ return tcp_register_ulp(&bpf_tcp_ulp_ops); } @@ -412,7 +1054,6 @@ static int smap_parse_func_strparser(struct strparser *strp, return rc; } - static int smap_read_sock_done(struct strparser *strp, int err) { return err; @@ -482,12 +1123,22 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_parse); if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); + if (psock->bpf_tx_msg) + bpf_prog_put(psock->bpf_tx_msg); + + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + } list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); } + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sock); kfree(psock); } @@ -503,6 +1154,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, if (!psock) return ERR_PTR(-ENOMEM); + psock->eval = __SK_NONE; psock->sock = sock; skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); @@ -711,10 +1363,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct smap_psock_map_entry *e = NULL; - struct bpf_prog *verdict, *parse; + struct bpf_prog *verdict, *parse, *tx_msg; struct sock *osock, *sock; struct smap_psock *psock; u32 i = *(u32 *)key; + bool new = false; int err; if (unlikely(flags > BPF_EXIST)) @@ -737,6 +1390,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, */ verdict = READ_ONCE(stab->bpf_verdict); parse = READ_ONCE(stab->bpf_parse); + tx_msg = READ_ONCE(stab->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -755,6 +1409,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } + if (tx_msg) { + tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + if (IS_ERR(tx_msg)) { + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(tx_msg); + } + } + write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); @@ -769,7 +1434,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - refcount_inc(&psock->refcnt); + if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { + err = -EBUSY; + goto out_progs; + } + if (!refcount_inc_not_zero(&psock->refcnt)) { + err = -EAGAIN; + goto out_progs; + } } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { @@ -777,11 +1449,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); - if (err) - goto out_progs; - set_bit(SMAP_TX_RUNNING, &psock->state); + new = true; } e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); @@ -794,6 +1463,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. 
*/ + if (tx_msg) + bpf_tcp_msg_add(psock, sock, tx_msg); + if (new) { + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_free; + } + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) @@ -815,8 +1492,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - if (osock != sock && parse) - smap_stop_sock(opsock, osock); smap_list_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); @@ -829,6 +1504,8 @@ out_progs: bpf_prog_put(verdict); if (parse) bpf_prog_put(parse); + if (tx_msg) + bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; @@ -843,6 +1520,9 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return -EINVAL; switch (type) { + case BPF_SK_MSG_VERDICT: + orig = xchg(&stab->bpf_tx_msg, prog); + break; case BPF_SK_SKB_STREAM_PARSER: orig = xchg(&stab->bpf_parse, prog); break; @@ -904,6 +1584,10 @@ static void sock_map_release(struct bpf_map *map, struct file *map_file) orig = xchg(&stab->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); + + orig = xchg(&stab->bpf_tx_msg, NULL); + if (orig) + bpf_prog_put(orig); } const struct bpf_map_ops sock_map_ops = { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e24aa3241387..3aeb4ea2a93a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1315,7 +1315,8 @@ static int bpf_obj_get(const union bpf_attr *attr) #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) +static int sockmap_get_from_fd(const union bpf_attr *attr, + int type, bool attach) { struct bpf_prog *prog = NULL; int ufd = attr->target_fd; @@ -1329,8 +1330,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return PTR_ERR(map); if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_SK_SKB); + prog = bpf_prog_get_type(attr->attach_bpf_fd, type); if (IS_ERR(prog)) { fdput(f); return PTR_ERR(prog); @@ -1382,9 +1382,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, true); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); default: return -EINVAL; } @@ -1437,9 +1439,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb79a34359c0..e9f7c20691c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1248,6 +1248,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: if (meta) return meta->pkt_access; @@ -2071,7 +2072,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != 
BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_map) goto error; break; default: @@ -2109,6 +2111,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_sk_redirect_map: + case BPF_FUNC_msg_redirect_map: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; diff --git a/net/core/filter.c b/net/core/filter.c index 33edfa8372fd..2b6c47597eab 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1890,6 +1890,44 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + /* If user passes invalid input drop the packet. */ + if (unlikely(flags)) + return SK_DROP; + + msg->key = key; + msg->flags = flags; + msg->map = map; + + return SK_PASS; +} + +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ + struct sock *sk = NULL; + + if (msg->map) { + sk = __sock_map_lookup_elem(msg->map, msg->key); + + msg->key = 0; + msg->map = NULL; + } + + return sk; +} + +static const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3591,6 +3629,16 @@ static const struct bpf_func_proto * } } +static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_msg_redirect_map: + return &bpf_msg_redirect_map_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -3980,6 +4028,32 @@ static bool sk_skb_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } +static bool sk_msg_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct sk_msg_md, data): + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct sk_msg_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + if (off < 0 || off >= sizeof(struct sk_msg_md)) + return false; + if (off % size != 0) + return false; + if (size != sizeof(__u64)) + return false; + + return true; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -4778,6 +4852,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_msg_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data)); + break; + case offsetof(struct sk_msg_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data_end)); + break; + } + + return insn - insn_buf; +} + const struct 
bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -4868,6 +4965,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = { const struct bpf_prog_ops sk_skb_prog_ops = { }; +const struct bpf_verifier_ops sk_msg_verifier_ops = { + .get_func_proto = sk_msg_func_proto, + .is_valid_access = sk_msg_is_valid_access, + .convert_ctx_access = sk_msg_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_msg_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; -- cgit v1.2.3-59-g8ed1b From 2a100317c9ebc204a166f16294884fbf9da074ce Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:15 -0700 Subject: bpf: sockmap, add bpf_msg_apply_bytes() helper A single sendmsg or sendfile system call can contain multiple logical messages that a BPF program may want to read and apply a verdict to. But, without an apply_bytes helper, any verdict on the data applies to all bytes in the sendmsg/sendfile. Alternatively, a BPF program may only care to read the first N bytes of a msg. If the payload is large, say MBs or even GBs, setting up and calling the BPF program repeatedly for all bytes, even though the verdict is already known, creates unnecessary overhead. To allow BPF programs to control how many bytes a given verdict applies to, we implement a bpf_msg_apply_bytes() helper. When called from within a BPF program this sets a counter, internal to the BPF infrastructure, that applies the last verdict to the next N bytes. If the N is smaller than the current data being processed from a sendmsg/sendfile call, the first N bytes will be sent and the BPF program will be re-run with start_data pointing to the N+1 byte. If N is larger than the current data being processed, the BPF verdict will be applied to multiple sendmsg/sendfile calls until N bytes are consumed. Note: if a socket closes with the apply_bytes counter non-zero, this is not a problem, because data is not being buffered for N bytes and is sent as it's received. Signed-off-by: John Fastabend Acked-by: David S.
Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ef529539abde..a557a2a5d72d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -791,7 +791,8 @@ union bpf_attr { FN(getsockopt), \ FN(override_return), \ FN(sock_ops_cb_flags_set), \ - FN(msg_redirect_map), + FN(msg_redirect_map), \ + FN(msg_apply_bytes), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 2b6c47597eab..17d6775f5431 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1928,6 +1928,20 @@ static const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes) +{ + msg->apply_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { + .func = bpf_msg_apply_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3634,6 +3648,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_apply_bytes: + return &bpf_msg_apply_bytes_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-59-g8ed1b From 91843d540a139eb8070bcff8aa10089164436deb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:20 -0700 Subject: bpf: sockmap, add msg_cork_bytes() helper In the case where we need a specific number of bytes before a verdict can be assigned, even if the data spans multiple sendmsg or sendfile calls, the BPF program may use msg_cork_bytes(). The extreme case is a user can call sendmsg repeatedly with 1-byte msg segments. Obviously, this is bad for performance but is still valid. If the BPF program needs N bytes to validate a header, it can use msg_cork_bytes() to specify N bytes, and the BPF program will not be called again until N bytes have been accumulated. The infrastructure will attempt to coalesce data if possible, so in many cases (most of my use cases at least) the data will be in a single scatterlist element with data pointers pointing to start/end of the element. However, this is dependent on available memory so is not guaranteed. So BPF programs must validate data pointer ranges, but this is the case anyway to convince the verifier the accesses are valid. Signed-off-by: John Fastabend Acked-by: David S.
Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a557a2a5d72d..1765cfb16c99 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -792,7 +792,8 @@ union bpf_attr { FN(override_return), \ FN(sock_ops_cb_flags_set), \ FN(msg_redirect_map), \ - FN(msg_apply_bytes), + FN(msg_apply_bytes), \ + FN(msg_cork_bytes), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 17d6775f5431..0c9daf6ee555 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1942,6 +1942,20 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) +{ + msg->cork_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { + .func = bpf_msg_cork_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3650,6 +3664,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; + case BPF_FUNC_msg_cork_bytes: + return &bpf_msg_cork_bytes_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-59-g8ed1b From 015632bb30daaaee64e1bcac07570860e0bf3092 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:25 -0700 Subject: bpf: sk_msg program helper bpf_sk_msg_pull_data Currently, if a bpf sk msg program is run, the program can only parse data that the (start,end) pointers have already consumed. For sendmsg hooks this is likely the first scatterlist element. For sendpage this will be the range (0,0) because the data is shared with userspace and by default we want to avoid allowing userspace to modify data while (or after) the BPF verdict is being decided. To support pulling in additional bytes for parsing, use a new helper bpf_msg_pull_data(start, end, flags), which works similar to the cls tc logic. This helper will attempt to point the data start pointer at 'start' bytes offset into the msg and the data end pointer at 'end' bytes offset into the message. After basic sanity checks to ensure 'start' <= 'end' and 'end' <= msg_length, there are a few cases we need to handle. First, the sendmsg hook has already copied the data from userspace and has exclusive access to it. Therefore, it is not necessary to copy the data. However, it may be required. After finding the scatterlist element with the 'start' offset byte in it, there are two cases. One: the range (start, end) is entirely contained in that sg element and is already linear. All that is needed is to update the data pointers; no allocation/copy is needed. The other case is that (start, end) crosses sg element boundaries. In this case we allocate a block of size 'end - start' and copy the data to linearize it. Next, the sendpage hook has not copied any data in its initial state, so the data pointers are (0,0). In this case we handle it similar to the above sendmsg case, except the allocation/copy must always happen.
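To make the revalidation requirement concrete, a sk_msg program might use the helper roughly as follows (a sketch, not the committed selftest; it assumes a bpf_msg_pull_data() helper declaration of the kind bpf_helpers.h gains later in the series, and the 8-byte header length is invented):

  SEC("sk_msg")
  int msg_parse(struct sk_msg_md *msg)
  {
  	void *data, *data_end;

  	/* Linearize bytes [0, 8) of the msg; on the sendpage path this
  	 * forces the copy, since the data pointers start out as (0,0).
  	 */
  	if (bpf_msg_pull_data(msg, 0, 8, 0))
  		return SK_DROP;

  	/* The pull invalidates any earlier checks, so re-read and
  	 * re-bound the data pointers before touching the payload.
  	 */
  	data = (void *)(long)msg->data;
  	data_end = (void *)(long)msg->data_end;
  	if (data + 8 > data_end)
  		return SK_DROP;

  	return SK_PASS;
  }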
Then when sending the data we have possibly three memory regions that need to be sent, (0, start - 1), (start, end), and (end + 1, msg_length). This is required to ensure any writes by the BPF program are correctly transmitted. Lastly this operation will invalidate any previous data checks so BPF programs will have to revalidate pointers after making this BPF call. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 +- net/core/filter.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1765cfb16c99..18b7c510c511 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -793,7 +793,8 @@ union bpf_attr { FN(sock_ops_cb_flags_set), \ FN(msg_redirect_map), \ FN(msg_apply_bytes), \ - FN(msg_cork_bytes), + FN(msg_cork_bytes), \ + FN(msg_pull_data), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 0c9daf6ee555..c86f03fd9ea5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1956,6 +1956,136 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_pull_data, + struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) +{ + unsigned int len = 0, offset = 0, copy = 0; + struct scatterlist *sg = msg->sg_data; + int first_sg, last_sg, i, shift; + unsigned char *p, *to, *from; + int bytes = end - start; + struct page *page; + + if (unlikely(flags || end <= start)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg_start; + do { + len = sg[i].length; + offset += len; + if (start < offset + len) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != msg->sg_end); + + if (unlikely(start >= offset + len)) + return -EINVAL; + + if (!msg->sg_copy[i] && bytes <= len) + goto out; + + first_sg = i; + + /* At this point we need to linearize multiple scatterlist + * elements or a single shared page. Either way we need to + * copy into a linear buffer exclusively owned by BPF. Then + * place the buffer in the scatterlist and fixup the original + * entries by removing the entries now in the linear buffer + * and shifting the remaining entries. For now we do not try + * to copy partial entries to avoid complexity of running out + * of sg_entry slots. The downside is reading a single byte + * will copy the entire sg entry. + */ + do { + copy += sg[i].length; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + if (bytes < copy) + break; + } while (i != msg->sg_end); + last_sg = i; + + if (unlikely(copy < end - start)) + return -EINVAL; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); + if (unlikely(!page)) + return -ENOMEM; + p = page_address(page); + offset = 0; + + i = first_sg; + do { + from = sg_virt(&sg[i]); + len = sg[i].length; + to = p + offset; + + memcpy(to, from, len); + offset += len; + sg[i].length = 0; + put_page(sg_page(&sg[i])); + + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != last_sg); + + sg[first_sg].length = copy; + sg_set_page(&sg[first_sg], page, copy, 0); + + /* To repair sg ring we need to shift entries. If we only + * had a single entry though we can just replace it and + * be done. Otherwise walk the ring and shift the entries. 
+ */ + shift = last_sg - first_sg - 1; + if (!shift) + goto out; + + i = first_sg + 1; + do { + int move_from; + + if (i + shift >= MAX_SKB_FRAGS) + move_from = i + shift - MAX_SKB_FRAGS; + else + move_from = i + shift; + + if (move_from == msg->sg_end) + break; + + sg[i] = sg[move_from]; + sg[move_from].length = 0; + sg[move_from].page_link = 0; + sg[move_from].offset = 0; + + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (1); + msg->sg_end -= shift; + if (msg->sg_end < 0) + msg->sg_end += MAX_SKB_FRAGS; +out: + msg->data = sg_virt(&sg[i]) + start - offset; + msg->data_end = msg->data + bytes; + + return 0; +} + +static const struct bpf_func_proto bpf_msg_pull_data_proto = { + .func = bpf_msg_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2897,7 +3027,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta) + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data) return true; return false; @@ -3666,6 +3797,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; + case BPF_FUNC_msg_pull_data: + return &bpf_msg_pull_data_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-59-g8ed1b From 82a8616889d506cb690cfc0afb2ccadda120461d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:31 -0700 Subject: bpf: add map tests for BPF_PROG_TYPE_SK_MSG Add map tests to attach BPF_PROG_TYPE_SK_MSG types to a sockmap. Signed-off-by: John Fastabend Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 10 ++++ tools/testing/selftests/bpf/Makefile | 3 +- tools/testing/selftests/bpf/bpf_helpers.h | 2 + tools/testing/selftests/bpf/sockmap_parse_prog.c | 15 +++++- tools/testing/selftests/bpf/sockmap_tcp_msg_prog.c | 33 +++++++++++++ tools/testing/selftests/bpf/sockmap_verdict_prog.c | 7 +++ tools/testing/selftests/bpf/test_maps.c | 55 ++++++++++++++++++++-- 7 files changed, 118 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/bpf/sockmap_tcp_msg_prog.c diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1944d0aee7e8..13d9c59d79ce 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -133,6 +133,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, }; enum bpf_attach_type { @@ -143,6 +144,7 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -941,6 +943,14 @@ enum sk_action { SK_PASS, }; +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + void *data; + void *data_end; +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index b0d29fd5e51c..f35fb02bdf56 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -29,7 +29,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ - sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o + sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \ + sockmap_tcp_msg_prog.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index dde2c11d7771..1558fe8bcf3e 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -123,6 +123,8 @@ static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) = (void *) BPF_FUNC_skb_under_cgroup; static int (*bpf_skb_change_head)(void *, int len, int flags) = (void *) BPF_FUNC_skb_change_head; +static int (*bpf_skb_pull_data)(void *, int len) = + (void *) BPF_FUNC_skb_pull_data; /* Scan the ARCH passed in from ARCH env variable (see Makefile) */ #if defined(__TARGET_ARCH_x86) diff --git a/tools/testing/selftests/bpf/sockmap_parse_prog.c b/tools/testing/selftests/bpf/sockmap_parse_prog.c index a1dec2b6d9c5..0f92858f6226 100644 --- a/tools/testing/selftests/bpf/sockmap_parse_prog.c +++ b/tools/testing/selftests/bpf/sockmap_parse_prog.c @@ -20,14 +20,25 @@ int bpf_prog1(struct __sk_buff *skb) __u32 lport = skb->local_port; __u32 rport = skb->remote_port; __u8 *d = data; + __u32 len = (__u32) data_end - (__u32) data; + int err; - if (data + 10 > data_end) - return skb->len; + if (data + 10 > data_end) { + err = bpf_skb_pull_data(skb, 10); + if (err) + return SK_DROP; + + data_end = (void *)(long)skb->data_end; + data = (void *)(long)skb->data; + if (data + 10 > data_end) + return SK_DROP; + } /* This write/read is a bit pointless but tests the verifier and * strparser 
handler for read/write pkt data and access into sk * fields. */ + d = data; d[7] = 1; return skb->len; } diff --git a/tools/testing/selftests/bpf/sockmap_tcp_msg_prog.c b/tools/testing/selftests/bpf/sockmap_tcp_msg_prog.c new file mode 100644 index 000000000000..12a7b5c82ed6 --- /dev/null +++ b/tools/testing/selftests/bpf/sockmap_tcp_msg_prog.c @@ -0,0 +1,33 @@ +#include +#include "bpf_helpers.h" +#include "bpf_util.h" +#include "bpf_endian.h" + +int _version SEC("version") = 1; + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sk_msg1") +int bpf_prog1(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + + char *d; + + if (data + 8 > data_end) + return SK_DROP; + + bpf_printk("data length %i\n", (__u64)msg->data_end - (__u64)msg->data); + d = (char *)data; + bpf_printk("hello sendmsg hook %i %i\n", d[0], d[1]); + + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/sockmap_verdict_prog.c b/tools/testing/selftests/bpf/sockmap_verdict_prog.c index d7bea972cb21..2ce7634a4012 100644 --- a/tools/testing/selftests/bpf/sockmap_verdict_prog.c +++ b/tools/testing/selftests/bpf/sockmap_verdict_prog.c @@ -26,6 +26,13 @@ struct bpf_map_def SEC("maps") sock_map_tx = { .max_entries = 20, }; +struct bpf_map_def SEC("maps") sock_map_msg = { + .type = BPF_MAP_TYPE_SOCKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 20, +}; + struct bpf_map_def SEC("maps") sock_map_break = { .type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 1238733c5b33..6c253343a6f9 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -464,15 +464,17 @@ static void test_devmap(int task, void *data) #include #define SOCKMAP_PARSE_PROG "./sockmap_parse_prog.o" #define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o" +#define SOCKMAP_TCP_MSG_PROG "./sockmap_tcp_msg_prog.o" static void test_sockmap(int tasks, void *data) { - int one = 1, map_fd_rx = 0, map_fd_tx = 0, map_fd_break, s, sc, rc; - struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_break; + struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_msg, *bpf_map_break; + int map_fd_msg = 0, map_fd_rx = 0, map_fd_tx = 0, map_fd_break; int ports[] = {50200, 50201, 50202, 50204}; int err, i, fd, udp, sfd[6] = {0xdeadbeef}; u8 buf[20] = {0x0, 0x5, 0x3, 0x2, 0x1, 0x0}; - int parse_prog, verdict_prog; + int parse_prog, verdict_prog, msg_prog; struct sockaddr_in addr; + int one = 1, s, sc, rc; struct bpf_object *obj; struct timeval to; __u32 key, value; @@ -584,6 +586,12 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } + err = bpf_prog_attach(-1, fd, BPF_SK_MSG_VERDICT, 0); + if (!err) { + printf("Failed invalid msg verdict prog attach\n"); + goto out_sockmap; + } + err = bpf_prog_attach(-1, fd, __MAX_BPF_ATTACH_TYPE, 0); if (!err) { printf("Failed unknown prog attach\n"); @@ -602,6 +610,12 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } + err = bpf_prog_detach(fd, BPF_SK_MSG_VERDICT); + if (err) { + printf("Failed empty msg verdict prog detach\n"); + goto out_sockmap; + } + err = bpf_prog_detach(fd, __MAX_BPF_ATTACH_TYPE); if (!err) { printf("Detach invalid prog successful\n"); @@ -616,6 +630,13 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } + 
err = bpf_prog_load(SOCKMAP_TCP_MSG_PROG, + BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog); + if (err) { + printf("Failed to load SK_SKB msg prog\n"); + goto out_sockmap; + } + err = bpf_prog_load(SOCKMAP_VERDICT_PROG, BPF_PROG_TYPE_SK_SKB, &obj, &verdict_prog); if (err) { @@ -631,7 +652,7 @@ static void test_sockmap(int tasks, void *data) map_fd_rx = bpf_map__fd(bpf_map_rx); if (map_fd_rx < 0) { - printf("Failed to get map fd\n"); + printf("Failed to get map rx fd\n"); goto out_sockmap; } @@ -647,6 +668,18 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } + bpf_map_msg = bpf_object__find_map_by_name(obj, "sock_map_msg"); + if (IS_ERR(bpf_map_msg)) { + printf("Failed to load map msg from msg_verdict prog\n"); + goto out_sockmap; + } + + map_fd_msg = bpf_map__fd(bpf_map_msg); + if (map_fd_msg < 0) { + printf("Failed to get map msg fd\n"); + goto out_sockmap; + } + bpf_map_break = bpf_object__find_map_by_name(obj, "sock_map_break"); if (IS_ERR(bpf_map_break)) { printf("Failed to load map tx from verdict prog\n"); goto out_sockmap; } @@ -680,6 +713,12 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } + err = bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0); + if (err) { + printf("Failed msg verdict bpf prog attach\n"); + goto out_sockmap; + } + err = bpf_prog_attach(verdict_prog, map_fd_rx, __MAX_BPF_ATTACH_TYPE, 0); if (!err) { @@ -719,6 +758,14 @@ static void test_sockmap(int tasks, void *data) } } + /* Put sfd[2] (sending fd below) into msg map to test sendmsg bpf */ + i = 0; + err = bpf_map_update_elem(map_fd_msg, &i, &sfd[2], BPF_ANY); + if (err) { + printf("Failed map_fd_msg update sockmap %i\n", err); + goto out_sockmap; + } + /* Test map send/recv */ for (i = 0; i < 2; i++) { buf[0] = i; -- cgit v1.2.3-59-g8ed1b From 1acc60b6a41bd4e43c3cb10e5395a5c812e158f5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:36 -0700 Subject: bpf: add verifier tests for BPF_PROG_TYPE_SK_MSG Test reads and writes for BPF_PROG_TYPE_SK_MSG. Signed-off-by: John Fastabend Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_verifier.c | 54 +++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 86d7ff491b6f..3e7718b1a9ae 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1596,6 +1596,60 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SK_SKB, }, + { + "direct packet read for SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, + offsetof(struct sk_msg_md, data)), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, + offsetof(struct sk_msg_md, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, + { + "direct packet write for SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, + offsetof(struct sk_msg_md, data)), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, + offsetof(struct sk_msg_md, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, + { + "overlapping checks for direct packet access SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, + offsetof(struct sk_msg_md, data)), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, + offsetof(struct sk_msg_md, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), + BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_2, 6), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, { "check skb->mark is not writeable by sockets", .insns = { -- cgit v1.2.3-59-g8ed1b From 4c4c3c276c099f265c8b11e0132ce826ee718e2c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:41 -0700 Subject: bpf: sockmap sample, add option to attach SK_MSG program Add sockmap option to use SK_MSG program types. Signed-off-by: John Fastabend Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- samples/bpf/bpf_load.c | 8 +++- samples/sockmap/sockmap_kern.c | 52 ++++++++++++++++++++++++ samples/sockmap/sockmap_user.c | 67 ++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 13 +++++- tools/lib/bpf/libbpf.c | 1 + tools/testing/selftests/bpf/bpf_helpers.h | 3 ++ 6 files changed, 135 insertions(+), 9 deletions(-) diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 69806d74fa53..b1a310c3ae89 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -67,6 +67,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; bool is_sockops = strncmp(event, "sockops", 7) == 0; bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; + bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -96,6 +97,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_SOCK_OPS; } else if (is_sk_skb) { prog_type = BPF_PROG_TYPE_SK_SKB; + } else if (is_sk_msg) { + prog_type = BPF_PROG_TYPE_SK_MSG; } else { printf("Unknown event '%s'\n", event); return -1; @@ -113,7 +116,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) return 0; - if (is_socket || is_sockops || is_sk_skb) { + if (is_socket || is_sockops || is_sk_skb || is_sk_msg) { if (is_socket) event += 6; else @@ -589,7 +592,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "socket", 6) == 0 || memcmp(shname, "cgroup/", 7) == 0 || memcmp(shname, "sockops", 7) == 0 || - memcmp(shname, "sk_skb", 6) == 0) { + memcmp(shname, "sk_skb", 6) == 0 || + memcmp(shname, "sk_msg", 6) == 0) { ret = load_and_attach(shname, data->d_buf, data->d_size); if (ret != 0) diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index 52b0053274f4..75edb2f151d4 100644 --- a/samples/sockmap/sockmap_kern.c +++ b/samples/sockmap/sockmap_kern.c @@ -43,6 +43,20 @@ struct bpf_map_def SEC("maps") sock_map = { .max_entries = 20, }; +struct bpf_map_def SEC("maps") sock_map_txmsg = { + .type = BPF_MAP_TYPE_SOCKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 20, +}; + +struct bpf_map_def SEC("maps") sock_map_redir = { + .type = BPF_MAP_TYPE_SOCKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1, +}; + SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { @@ -105,4 +119,42 @@ int bpf_sockmap(struct bpf_sock_ops *skops) return 0; } + +SEC("sk_msg1") +int bpf_prog4(struct sk_msg_md *msg) +{ + return SK_PASS; +} + +SEC("sk_msg2") +int bpf_prog5(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + + bpf_printk("sk_msg2: data length %i\n", (__u32)data_end - (__u32)data); + return SK_PASS; +} + +SEC("sk_msg3") +int bpf_prog6(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + int ret = 0; + + return bpf_msg_redirect_map(msg, &sock_map_redir, ret, 0); +} + +SEC("sk_msg4") +int bpf_prog7(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + int ret = 0; + + bpf_printk("sk_msg3: redirect(%iB)\n", (__u32)data_end - (__u32)data); + return bpf_msg_redirect_map(msg, &sock_map_redir, ret, 0); +} + char 
_license[] SEC("license") = "GPL"; diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 95a54a89a532..bbfe3a2b9885 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -54,6 +54,11 @@ void running_handler(int a); /* global sockets */ int s1, s2, c1, c2, p1, p2; +int txmsg_pass; +int txmsg_noisy; +int txmsg_redir; +int txmsg_redir_noisy; + static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, {"cgroup", required_argument, NULL, 'c' }, @@ -62,6 +67,10 @@ static const struct option long_options[] = { {"iov_count", required_argument, NULL, 'i' }, {"length", required_argument, NULL, 'l' }, {"test", required_argument, NULL, 't' }, + {"txmsg", no_argument, &txmsg_pass, 1 }, + {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, + {"txmsg_redir", no_argument, &txmsg_redir, 1 }, + {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, {0, 0, NULL, 0 } }; @@ -447,13 +456,13 @@ enum { int main(int argc, char **argv) { - int iov_count = 1, length = 1024, rate = 1, verbose = 0; + int iov_count = 1, length = 1024, rate = 1, verbose = 0, tx_prog_fd; struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; int opt, longindex, err, cg_fd = 0; int test = PING_PONG; char filename[256]; - while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:", + while ((opt = getopt_long(argc, argv, ":hvc:r:i:l:t:", long_options, &longindex)) != -1) { switch (opt) { /* Cgroup configuration */ @@ -490,6 +499,8 @@ int main(int argc, char **argv) return -1; } break; + case 0: + break; case 'h': default: usage(argv); @@ -515,16 +526,16 @@ int main(int argc, char **argv) /* catch SIGINT */ signal(SIGINT, running_handler); - /* If base test skip BPF setup */ - if (test == BASE) - goto run; - if (load_bpf_file(filename)) { fprintf(stderr, "load_bpf_file: (%s) %s\n", filename, strerror(errno)); return 1; } + /* If base test skip BPF setup */ + if (test == BASE) + goto run; + /* Attach programs to sockmap */ err = bpf_prog_attach(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER, 0); @@ -557,6 +568,50 @@ run: goto out; } + /* Attach txmsg program to sockmap */ + if (txmsg_pass) + tx_prog_fd = prog_fd[3]; + else if (txmsg_noisy) + tx_prog_fd = prog_fd[4]; + else if (txmsg_redir) + tx_prog_fd = prog_fd[5]; + else if (txmsg_redir_noisy) + tx_prog_fd = prog_fd[6]; + else + tx_prog_fd = 0; + + if (tx_prog_fd) { + int redir_fd, i = 0; + + err = bpf_prog_attach(tx_prog_fd, + map_fd[1], BPF_SK_MSG_VERDICT, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (txmsg): %d (%s)\n", + err, strerror(errno)); + return err; + } + + err = bpf_map_update_elem(map_fd[1], &i, &c1, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg): %d (%s\n", + err, strerror(errno)); + return err; + } + if (test == SENDMSG) + redir_fd = c2; + else + redir_fd = c1; + + err = bpf_map_update_elem(map_fd[2], &i, &redir_fd, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg): %d (%s\n", + err, strerror(errno)); + return err; + } + } if (test == PING_PONG) err = forever_ping_pong(rate, verbose); else if (test == SENDMSG) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 13d9c59d79ce..01b9c97b172e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -720,6 +720,15 @@ union bpf_attr { * int bpf_override_return(pt_regs, rc) * @pt_regs: pointer to struct pt_regs * @rc: the return value to set + * + * int bpf_msg_redirect_map(map, key, flags) + * Redirect msg to a sock in map 
using key as a lookup key for the + * sock in map. + * @map: pointer to sockmap + * @key: key to lookup sock in map + * @flags: reserved for future use + * Return: SK_PASS + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -781,7 +790,9 @@ union bpf_attr { FN(perf_prog_read_value), \ FN(getsockopt), \ FN(override_return), \ - FN(sock_ops_cb_flags_set), + FN(sock_ops_cb_flags_set), \ + FN(msg_redirect_map), \ + FN(msg_apply_bytes), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 5bbbf285af74..64a8fc384186 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1857,6 +1857,7 @@ static const struct { BPF_PROG_SEC("lwt_xmit", BPF_PROG_TYPE_LWT_XMIT), BPF_PROG_SEC("sockops", BPF_PROG_TYPE_SOCK_OPS), BPF_PROG_SEC("sk_skb", BPF_PROG_TYPE_SK_SKB), + BPF_PROG_SEC("sk_msg", BPF_PROG_TYPE_SK_MSG), }; #undef BPF_PROG_SEC diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 1558fe8bcf3e..bba7ee6d93ab 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -86,6 +86,9 @@ static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, (void *) BPF_FUNC_perf_prog_read_value; static int (*bpf_override_return)(void *ctx, unsigned long rc) = (void *) BPF_FUNC_override_return; +static int (*bpf_msg_redirect_map)(void *ctx, void *map, int key, int flags) = + (void *) BPF_FUNC_msg_redirect_map; + /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3-59-g8ed1b From e67463cb5d9345d762aa7325e58aaeff5bc68ee1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:46 -0700 Subject: bpf: sockmap sample, add sendfile test To exercise TX ULP sendpage implementation we need a test that does a sendfile. Add sendfile test option here. Signed-off-by: John Fastabend Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_user.c | 70 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index bbfe3a2b9885..ec624a801306 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -29,6 +29,7 @@ #include #include #include +#include <sys/sendfile.h> #include #include @@ -67,10 +68,10 @@ static const struct option long_options[] = { {"iov_count", required_argument, NULL, 'i' }, {"length", required_argument, NULL, 'l' }, {"test", required_argument, NULL, 't' }, - {"txmsg", no_argument, &txmsg_pass, 1 }, - {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, - {"txmsg_redir", no_argument, &txmsg_redir, 1 }, - {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, + {"txmsg", no_argument, &txmsg_pass, 1 }, + {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, + {"txmsg_redir", no_argument, &txmsg_redir, 1 }, + {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, {0, 0, NULL, 0 } }; @@ -204,6 +205,35 @@ struct msg_stats { struct timespec end; }; +static int msg_loop_sendpage(int fd, int iov_length, int cnt, + struct msg_stats *s) +{ + off_t offset = 0; + FILE *file; + int i, fp; + + file = fopen(".sendpage_tst.tmp", "w+"); + fseek(file, iov_length * cnt, SEEK_CUR); + fprintf(file, "A"); + fseek(file, 0, SEEK_SET); + + fp = fileno(file); + clock_gettime(CLOCK_MONOTONIC, &s->start); + for (i = 0; i < cnt; i++) { + int sent = sendfile(fd, fp, &offset, iov_length); + + if (sent < 0) { + perror("send loop error:"); + fclose(file); + return sent; + } + s->bytes_sent += sent; + } + clock_gettime(CLOCK_MONOTONIC, &s->end); + fclose(file); + return 0; +} + static int msg_loop(int fd, int iov_count, int iov_length, int cnt, struct msg_stats *s, bool tx) { @@ -309,7 +339,7 @@ static inline float recvdBps(struct msg_stats s) } static int sendmsg_test(int iov_count, int iov_buf, int cnt, - int verbose, bool base) + int verbose, bool base, bool sendpage) { float sent_Bps = 0, recvd_Bps = 0; int rx_fd, txpid, rxpid, err = 0; @@ -325,6 +355,8 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, rxpid = fork(); if (rxpid == 0) { + if (sendpage) + iov_count = 1; err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false); if (err) fprintf(stderr, @@ -348,7 +380,11 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, txpid = fork(); if (txpid == 0) { - err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); + if (sendpage) + err = msg_loop_sendpage(c1, iov_buf, cnt, &s); + else + err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); + if (err) fprintf(stderr, "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", @@ -452,6 +488,8 @@ enum { PING_PONG, SENDMSG, BASE, + BASE_SENDPAGE, + SENDPAGE, }; int main(int argc, char **argv) @@ -494,6 +532,10 @@ int main(int argc, char **argv) test = SENDMSG; } else if (strcmp(optarg, "base") == 0) { test = BASE; + } else if (strcmp(optarg, "base_sendpage") == 0) { + test = BASE_SENDPAGE; + } else if (strcmp(optarg, "sendpage") == 0) { + test = SENDPAGE; } else { usage(argv); return -1; } @@ -533,7 +575,7 @@ int main(int argc, char **argv) } /* If base test skip BPF setup */ - if (test == BASE) + if (test == BASE || test == BASE_SENDPAGE) goto run; /* Attach programs to sockmap */ @@ -599,7 +641,7 @@ run: err, strerror(errno)); return err; } - if (test == SENDMSG) + if (txmsg_redir || txmsg_redir_noisy) redir_fd = c2; else redir_fd = c1; @@ -615,9 +657,17 @@ run: if (test == PING_PONG) err = 
forever_ping_pong(rate, verbose); else if (test == SENDMSG) - err = sendmsg_test(iov_count, length, rate, verbose, false); + err = sendmsg_test(iov_count, length, rate, + verbose, false, false); + else if (test == SENDPAGE) + err = sendmsg_test(iov_count, length, rate, + verbose, false, true); else if (test == BASE) - err = sendmsg_test(iov_count, length, rate, verbose, true); + err = sendmsg_test(iov_count, length, rate, + verbose, true, false); + else if (test == BASE_SENDPAGE) + err = sendmsg_test(iov_count, length, rate, + verbose, true, true); else fprintf(stderr, "unknown test\n"); out: -- cgit v1.2.3-59-g8ed1b From 6bce9d2ca65238d37890186bf41acd7f7d9a9820 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:51 -0700 Subject: bpf: sockmap sample, add data verification option To verify data is not being dropped or corrupted this adds an option to verify test-patterns on recv. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_user.c | 118 +++++++++++++++++++++++++++++------------ 1 file changed, 84 insertions(+), 34 deletions(-) diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index ec624a801306..8017ad7afea9 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -68,6 +68,7 @@ static const struct option long_options[] = { {"iov_count", required_argument, NULL, 'i' }, {"length", required_argument, NULL, 'l' }, {"test", required_argument, NULL, 't' }, + {"data_test", no_argument, NULL, 'd' }, {"txmsg", no_argument, &txmsg_pass, 1 }, {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, {"txmsg_redir", no_argument, &txmsg_redir, 1 }, @@ -208,45 +209,49 @@ struct msg_stats { static int msg_loop_sendpage(int fd, int iov_length, int cnt, struct msg_stats *s) { - off_t offset = 0; + unsigned char k = 0; FILE *file; int i, fp; file = fopen(".sendpage_tst.tmp", "w+"); - fseek(file, iov_length * cnt, SEEK_CUR); - fprintf(file, "A"); + for (i = 0; i < iov_length * cnt; i++, k++) + fwrite(&k, sizeof(char), 1, file); + fflush(file); fseek(file, 0, SEEK_SET); + fclose(file); - fp = fileno(file); + fp = open(".sendpage_tst.tmp", O_RDONLY); clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendfile(fd, fp, &offset, iov_length); + int sent = sendfile(fd, fp, NULL, iov_length); if (sent < 0) { perror("send loop error:"); - fclose(file); + close(fp); return sent; } s->bytes_sent += sent; } clock_gettime(CLOCK_MONOTONIC, &s->end); - fclose(file); + close(fp); return 0; } static int msg_loop(int fd, int iov_count, int iov_length, int cnt, - struct msg_stats *s, bool tx) + struct msg_stats *s, bool tx, bool data_test) { struct msghdr msg = {0}; int err, i, flags = MSG_NOSIGNAL; struct iovec *iov; + unsigned char k; iov = calloc(iov_count, sizeof(struct iovec)); if (!iov) return errno; + k = 0; for (i = 0; i < iov_count; i++) { - char *d = calloc(iov_length, sizeof(char)); + unsigned char *d = calloc(iov_length, sizeof(char)); if (!d) { fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count); @@ -254,10 +259,18 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, } iov[i].iov_base = d; iov[i].iov_len = iov_length; + + if (data_test && tx) { + int j; + + for (j = 0; j < iov_length; j++) + d[j] = k++; + } } msg.msg_iov = iov; msg.msg_iovlen = iov_count; + k = 0; if (tx) { clock_gettime(CLOCK_MONOTONIC, &s->start); @@ -311,6 +324,26 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, } s->bytes_recvd += 
recv; + + if (data_test) { + int j; + + for (i = 0; i < msg.msg_iovlen; i++) { + unsigned char *d = iov[i].iov_base; + + for (j = 0; + j < iov[i].iov_len && recv; j++) { + if (d[j] != k++) { + errno = -EIO; + fprintf(stderr, + "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n", + i, j, d[j], k - 1, d[j+1], k + 1); + goto out_errno; + } + recv--; + } + } + } } clock_gettime(CLOCK_MONOTONIC, &s->end); } @@ -338,8 +371,15 @@ static inline float recvdBps(struct msg_stats s) return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec); } +struct sockmap_options { + int verbose; + bool base; + bool sendpage; + bool data_test; +}; + static int sendmsg_test(int iov_count, int iov_buf, int cnt, - int verbose, bool base, bool sendpage) + struct sockmap_options *opt) { float sent_Bps = 0, recvd_Bps = 0; int rx_fd, txpid, rxpid, err = 0; @@ -348,16 +388,17 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, errno = 0; - if (base) + if (opt->base) rx_fd = p1; else rx_fd = p2; rxpid = fork(); if (rxpid == 0) { - if (sendpage) + if (opt->sendpage) iov_count = 1; - err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false); + err = msg_loop(rx_fd, iov_count, iov_buf, + cnt, &s, false, opt->data_test); if (err) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", @@ -380,10 +421,11 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, txpid = fork(); if (txpid == 0) { - if (sendpage) + if (opt->sendpage) err = msg_loop_sendpage(c1, iov_buf, cnt, &s); else - err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); + err = msg_loop(c1, iov_count, iov_buf, + cnt, &s, true, opt->data_test); if (err) fprintf(stderr, @@ -409,7 +451,7 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, return err; } -static int forever_ping_pong(int rate, int verbose) +static int forever_ping_pong(int rate, struct sockmap_options *opt) { struct timeval timeout; char buf[1024] = {0}; @@ -474,7 +516,7 @@ static int forever_ping_pong(int rate, int verbose) if (rate) sleep(rate); - if (verbose) { + if (opt->verbose) { printf("."); fflush(stdout); @@ -494,13 +536,14 @@ enum { int main(int argc, char **argv) { - int iov_count = 1, length = 1024, rate = 1, verbose = 0, tx_prog_fd; + int iov_count = 1, length = 1024, rate = 1, tx_prog_fd; struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; int opt, longindex, err, cg_fd = 0; + struct sockmap_options options = {0}; int test = PING_PONG; char filename[256]; - while ((opt = getopt_long(argc, argv, ":hvc:r:i:l:t:", + while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:", long_options, &longindex)) != -1) { switch (opt) { /* Cgroup configuration */ @@ -517,7 +560,7 @@ int main(int argc, char **argv) rate = atoi(optarg); break; case 'v': - verbose = 1; + options.verbose = 1; break; case 'i': iov_count = atoi(optarg); @@ -525,6 +568,9 @@ int main(int argc, char **argv) case 'l': length = atoi(optarg); break; + case 'd': + options.data_test = true; + break; case 't': if (strcmp(optarg, "ping") == 0) { test = PING_PONG; @@ -655,20 +701,24 @@ run: } } if (test == PING_PONG) - err = forever_ping_pong(rate, verbose); - else if (test == SENDMSG) - err = sendmsg_test(iov_count, length, rate, - verbose, false, false); - else if (test == SENDPAGE) - err = sendmsg_test(iov_count, length, rate, - verbose, false, true); - else if (test == BASE) - err = sendmsg_test(iov_count, length, rate, - verbose, true, false); - else if (test == BASE_SENDPAGE) - err = sendmsg_test(iov_count, length, rate, - verbose, true, true); - else + err = 
forever_ping_pong(rate, &options); + else if (test == SENDMSG) { + options.base = false; + options.sendpage = false; + err = sendmsg_test(iov_count, length, rate, &options); + } else if (test == SENDPAGE) { + options.base = false; + options.sendpage = true; + err = sendmsg_test(iov_count, length, rate, &options); + } else if (test == BASE) { + options.base = true; + options.sendpage = false; + err = sendmsg_test(iov_count, length, rate, &options); + } else if (test == BASE_SENDPAGE) { + options.base = true; + options.sendpage = true; + err = sendmsg_test(iov_count, length, rate, &options); + } else fprintf(stderr, "unknown test\n"); out: bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS); -- cgit v1.2.3-59-g8ed1b From 1c16c3126ac938562a68ab7041fe74ce0de53166 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:56 -0700 Subject: bpf: sockmap, add sample option to test apply_bytes helper This adds an option to test the apply_bytes helper. This option lets the user specify an int on the command line specifying how much data each verdict should apply to. When this is set a map entry is set with the bytes input by the user and then the specified program --txmsg or --txmsg_redir will use the value and set the applied data. If no other option is set then a default --txmsg_apply program is run. This program will drop pkts if an error is detected on the bytes map lookup. Useful to verify the map lookup and apply helper are working and causing a hard error if it is not. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_kern.c | 54 +++++++++++++++++++++++++++---- samples/sockmap/sockmap_user.c | 19 ++++++++++- tools/testing/selftests/bpf/bpf_helpers.h | 3 +- 3 files changed, 68 insertions(+), 8 deletions(-) diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index 75edb2f151d4..205ec36449d1 100644 --- a/samples/sockmap/sockmap_kern.c +++ b/samples/sockmap/sockmap_kern.c @@ -57,6 +57,13 @@ struct bpf_map_def SEC("maps") sock_map_redir = { .max_entries = 1, }; +struct bpf_map_def SEC("maps") sock_apply_bytes = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { @@ -123,6 +130,11 @@ int bpf_sockmap(struct bpf_sock_ops *skops) SEC("sk_msg1") int bpf_prog4(struct sk_msg_md *msg) { + int *bytes, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); return SK_PASS; } @@ -131,8 +143,13 @@ int bpf_prog5(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; void *data = (void *)(long) msg->data; + int *bytes, err = 0, zero = 0; - bpf_printk("sk_msg2: data length %i\n", (__u32)data_end - (__u32)data); + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + err = bpf_msg_apply_bytes(msg, *bytes); + bpf_printk("sk_msg2: data length %i err %i\n", + (__u64)data_end - (__u64)data, err); return SK_PASS; } @@ -141,9 +158,12 @@ int bpf_prog6(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; void *data = (void *)(long) msg->data; - int ret = 0; + int *bytes, zero = 0; - return bpf_msg_redirect_map(msg, &sock_map_redir, ret, 0); + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + return bpf_msg_redirect_map(msg, &sock_map_redir, zero, 0); } SEC("sk_msg4") @@ -151,10 +171,32 @@ int bpf_prog7(struct sk_msg_md *msg) { void 
*data_end = (void *)(long) msg->data_end; void *data = (void *)(long) msg->data; - int ret = 0; + int *bytes, err = 0, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + err = bpf_msg_apply_bytes(msg, *bytes); + bpf_printk("sk_msg3: redirect(%iB) err=%i\n", + (__u64)data_end - (__u64)data, err); + return bpf_msg_redirect_map(msg, &sock_map_redir, zero, 0); +} - bpf_printk("sk_msg3: redirect(%iB)\n", (__u32)data_end - (__u32)data); - return bpf_msg_redirect_map(msg, &sock_map_redir, ret, 0); +SEC("sk_msg5") +int bpf_prog8(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + int ret = 0, *bytes, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) { + ret = bpf_msg_apply_bytes(msg, *bytes); + if (ret) + return SK_DROP; + } else { + return SK_DROP; + } + return SK_PASS; } char _license[] SEC("license") = "GPL"; diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 8017ad7afea9..41774ec0f0b3 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -59,6 +59,7 @@ int txmsg_pass; int txmsg_noisy; int txmsg_redir; int txmsg_redir_noisy; +int txmsg_apply; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -73,6 +74,7 @@ static const struct option long_options[] = { {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, {"txmsg_redir", no_argument, &txmsg_redir, 1 }, {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, + {"txmsg_apply", required_argument, NULL, 'a'}, {0, 0, NULL, 0 } }; @@ -546,7 +548,9 @@ int main(int argc, char **argv) while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:", long_options, &longindex)) != -1) { switch (opt) { - /* Cgroup configuration */ + case 'a': + txmsg_apply = atoi(optarg); + break; case 'c': cg_fd = open(optarg, O_DIRECTORY, O_RDONLY); if (cg_fd < 0) { @@ -665,6 +669,8 @@ run: tx_prog_fd = prog_fd[5]; else if (txmsg_redir_noisy) tx_prog_fd = prog_fd[6]; + else if (txmsg_apply) + tx_prog_fd = prog_fd[7]; else tx_prog_fd = 0; @@ -699,6 +705,17 @@ run: err, strerror(errno)); return err; } + + if (txmsg_apply) { + err = bpf_map_update_elem(map_fd[3], + &i, &txmsg_apply, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (apply_bytes): %d (%s\n", + err, strerror(errno)); + return err; + } + } } if (test == PING_PONG) err = forever_ping_pong(rate, &options); diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index bba7ee6d93ab..4713de48cf88 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -88,7 +88,8 @@ static int (*bpf_override_return)(void *ctx, unsigned long rc) = (void *) BPF_FUNC_override_return; static int (*bpf_msg_redirect_map)(void *ctx, void *map, int key, int flags) = (void *) BPF_FUNC_msg_redirect_map; - +static int (*bpf_msg_apply_bytes)(void *ctx, int len) = + (void *) BPF_FUNC_msg_apply_bytes; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3-59-g8ed1b From 468b3fdea8826b232a570d95ba45272eb38919cb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:58:02 -0700 Subject: bpf: sockmap sample support for bpf_msg_cork_bytes() Add sample application support for the bpf_msg_cork_bytes helper. This lets the user specify how many bytes each verdict should apply to. 
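For orientation before the diff below, the minimal shape of a corking program is a map lookup feeding bpf_msg_cork_bytes(); this condensed sketch is editorial, not part of the patch, and assumes the sock_cork_bytes array map that the patch introduces:

	SEC("sk_msg")
	int cork_sketch(struct sk_msg_md *msg)
	{
		int *bytes, zero = 0;

		/* slot 0 holds the user-supplied cork size */
		bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
		if (bytes)
			/* hold data until at least *bytes are queued, then
			 * run the verdict once over the corked region
			 */
			bpf_msg_cork_bytes(msg, *bytes);
		return SK_PASS;
	}
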
Similar to the apply_bytes() tests, these can be run stand-alone when used without other options, or inline with other tests by using the txmsg_cork option along with any of the basic tests txmsg, txmsg_redir, txmsg_drop. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_kern.c | 53 ++++++++++++++++++++++++----- samples/sockmap/sockmap_user.c | 19 +++++++++++ tools/include/uapi/linux/bpf.h | 3 +- tools/testing/selftests/bpf/bpf_helpers.h | 2 ++ 4 files changed, 68 insertions(+), 9 deletions(-) diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index 205ec36449d1..735226794ce1 100644 --- a/samples/sockmap/sockmap_kern.c +++ b/samples/sockmap/sockmap_kern.c @@ -64,6 +64,13 @@ struct bpf_map_def SEC("maps") sock_apply_bytes = { .max_entries = 1 }; +struct bpf_map_def SEC("maps") sock_cork_bytes = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { @@ -135,6 +142,9 @@ int bpf_prog4(struct sk_msg_md *msg) bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); return SK_PASS; } @@ -143,13 +153,16 @@ int bpf_prog5(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; void *data = (void *)(long) msg->data; - int *bytes, err = 0, zero = 0; + int *bytes, err1 = -1, err2 = -1, zero = 0; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) - err = bpf_msg_apply_bytes(msg, *bytes); - bpf_printk("sk_msg2: data length %i err %i\n", - (__u64)data_end - (__u64)data, err); + err1 = bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + err2 = bpf_msg_cork_bytes(msg, *bytes); + bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", + (__u64)data_end - (__u64)data, err1, err2); return SK_PASS; } @@ -163,6 +176,9 @@ int bpf_prog6(struct sk_msg_md *msg) bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); return bpf_msg_redirect_map(msg, &sock_map_redir, zero, 0); } @@ -171,13 +187,17 @@ int bpf_prog7(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; void *data = (void *)(long) msg->data; - int *bytes, err = 0, zero = 0; + int *bytes, err1 = 0, err2 = 0, zero = 0; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) - err = bpf_msg_apply_bytes(msg, *bytes); - bpf_printk("sk_msg3: redirect(%iB) err=%i\n", - (__u64)data_end - (__u64)data, err); + err1 = bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + err2 = bpf_msg_cork_bytes(msg, *bytes); + + bpf_printk("sk_msg3: redirect(%iB) err1=%i err2=%i\n", + (__u64)data_end - (__u64)data, err1, err2); return bpf_msg_redirect_map(msg, &sock_map_redir, zero, 0); } @@ -198,5 +218,22 @@ int bpf_prog8(struct sk_msg_md *msg) } return SK_PASS; } +SEC("sk_msg6") +int bpf_prog9(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + int ret = 0, *bytes, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) { + if (((__u64)data_end - (__u64)data) >= *bytes) + return SK_PASS; + ret = bpf_msg_cork_bytes(msg, *bytes); + if 
(ret) + return SK_DROP; + } + return SK_PASS; +} char _license[] SEC("license") = "GPL"; diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 41774ec0f0b3..4e0a3d87881f 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -60,6 +60,7 @@ int txmsg_noisy; int txmsg_redir; int txmsg_redir_noisy; int txmsg_apply; +int txmsg_cork; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -75,6 +76,7 @@ static const struct option long_options[] = { {"txmsg_redir", no_argument, &txmsg_redir, 1 }, {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, {"txmsg_apply", required_argument, NULL, 'a'}, + {"txmsg_cork", required_argument, NULL, 'k'}, {0, 0, NULL, 0 } }; @@ -551,6 +553,9 @@ int main(int argc, char **argv) case 'a': txmsg_apply = atoi(optarg); break; + case 'k': + txmsg_cork = atoi(optarg); + break; case 'c': cg_fd = open(optarg, O_DIRECTORY, O_RDONLY); if (cg_fd < 0) { @@ -671,6 +676,8 @@ run: tx_prog_fd = prog_fd[6]; else if (txmsg_apply) tx_prog_fd = prog_fd[7]; + else if (txmsg_cork) + tx_prog_fd = prog_fd[8]; else tx_prog_fd = 0; @@ -716,6 +723,18 @@ run: return err; } } + + if (txmsg_cork) { + err = bpf_map_update_elem(map_fd[4], + &i, &txmsg_cork, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (cork_bytes): %d (%s\n", + err, strerror(errno)); + return err; + } + } + } if (test == PING_PONG) err = forever_ping_pong(rate, &options); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 01b9c97b172e..e6924ab20fc3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -792,7 +792,8 @@ union bpf_attr { FN(override_return), \ FN(sock_ops_cb_flags_set), \ FN(msg_redirect_map), \ - FN(msg_apply_bytes), + FN(msg_apply_bytes), \ + FN(msg_cork_bytes), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 4713de48cf88..b5b45ff705ff 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -90,6 +90,8 @@ static int (*bpf_msg_redirect_map)(void *ctx, void *map, int key, int flags) = (void *) BPF_FUNC_msg_redirect_map; static int (*bpf_msg_apply_bytes)(void *ctx, int len) = (void *) BPF_FUNC_msg_apply_bytes; +static int (*bpf_msg_cork_bytes)(void *ctx, int len) = + (void *) BPF_FUNC_msg_cork_bytes; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3-59-g8ed1b From e6373ce70a01292424a118d9457d927349dad51d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:58:07 -0700 Subject: bpf: sockmap add SK_DROP tests Add tests for SK_DROP. Signed-off-by: John Fastabend Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_kern.c | 15 ++++++++++ samples/sockmap/sockmap_user.c | 62 +++++++++++++++++++++++++++++++----------- 2 files changed, 61 insertions(+), 16 deletions(-) diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index 735226794ce1..8b6c34cd5f46 100644 --- a/samples/sockmap/sockmap_kern.c +++ b/samples/sockmap/sockmap_kern.c @@ -236,4 +236,19 @@ int bpf_prog9(struct sk_msg_md *msg) return SK_PASS; } +SEC("sk_msg7") +int bpf_prog10(struct sk_msg_md *msg) +{ + int *bytes, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); + return SK_DROP; +} + + char _license[] SEC("license") = "GPL"; diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 4e0a3d87881f..52c4ed7774d4 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -59,6 +59,7 @@ int txmsg_pass; int txmsg_noisy; int txmsg_redir; int txmsg_redir_noisy; +int txmsg_drop; int txmsg_apply; int txmsg_cork; @@ -75,6 +76,7 @@ static const struct option long_options[] = { {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, {"txmsg_redir", no_argument, &txmsg_redir, 1 }, {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, + {"txmsg_drop", no_argument, &txmsg_drop, 1 }, {"txmsg_apply", required_argument, NULL, 'a'}, {"txmsg_cork", required_argument, NULL, 'k'}, {0, 0, NULL, 0 } @@ -210,9 +212,19 @@ struct msg_stats { struct timespec end; }; +struct sockmap_options { + int verbose; + bool base; + bool sendpage; + bool data_test; + bool drop_expected; +}; + static int msg_loop_sendpage(int fd, int iov_length, int cnt, - struct msg_stats *s) + struct msg_stats *s, + struct sockmap_options *opt) { + bool drop = opt->drop_expected; unsigned char k = 0; FILE *file; int i, fp; @@ -229,12 +241,18 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, for (i = 0; i < cnt; i++) { int sent = sendfile(fd, fp, NULL, iov_length); - if (sent < 0) { + if (!drop && sent < 0) { perror("send loop error:"); close(fp); return sent; + } else if (drop && sent >= 0) { + printf("sendpage loop error expected: %i\n", sent); + close(fp); + return -EIO; } - s->bytes_sent += sent; + + if (sent > 0) + s->bytes_sent += sent; } clock_gettime(CLOCK_MONOTONIC, &s->end); close(fp); @@ -242,12 +260,15 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, } static int msg_loop(int fd, int iov_count, int iov_length, int cnt, - struct msg_stats *s, bool tx, bool data_test) + struct msg_stats *s, bool tx, + struct sockmap_options *opt) { struct msghdr msg = {0}; int err, i, flags = MSG_NOSIGNAL; struct iovec *iov; unsigned char k; + bool data_test = opt->data_test; + bool drop = opt->drop_expected; iov = calloc(iov_count, sizeof(struct iovec)); if (!iov) @@ -281,11 +302,16 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, for (i = 0; i < cnt; i++) { int sent = sendmsg(fd, &msg, flags); - if (sent < 0) { + if (!drop && sent < 0) { perror("send loop error:"); goto out_errno; + } else if (drop && sent >= 0) { + printf("send loop error expected: %i\n", sent); + errno = -EIO; + goto out_errno; } - s->bytes_sent += sent; + if (sent > 0) + s->bytes_sent += sent; } clock_gettime(CLOCK_MONOTONIC, &s->end); } else { @@ -375,13 +401,6 @@ static inline float recvdBps(struct msg_stats s) return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec); } 
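/* Editorial aside, not part of the patch: the core of the SK_DROP
 * handling added above is inverted error checking in the TX loops.
 * With a drop verdict attached, a send that succeeds is the failure
 * case; in condensed form:
 *
 *	sent = sendmsg(fd, &msg, flags);
 *	if (drop && sent >= 0)
 *		return -EIO;	// data passed despite SK_DROP
 */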
-struct sockmap_options { - int verbose; - bool base; - bool sendpage; - bool data_test; -}; - static int sendmsg_test(int iov_count, int iov_buf, int cnt, struct sockmap_options *opt) { @@ -399,10 +418,13 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, rxpid = fork(); if (rxpid == 0) { + if (opt->drop_expected) + exit(1); + if (opt->sendpage) iov_count = 1; err = msg_loop(rx_fd, iov_count, iov_buf, - cnt, &s, false, opt->data_test); + cnt, &s, false, opt); if (err) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", @@ -426,10 +448,10 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, txpid = fork(); if (txpid == 0) { if (opt->sendpage) - err = msg_loop_sendpage(c1, iov_buf, cnt, &s); + err = msg_loop_sendpage(c1, iov_buf, cnt, &s, opt); else err = msg_loop(c1, iov_count, iov_buf, - cnt, &s, true, opt->data_test); + cnt, &s, true, opt); if (err) fprintf(stderr, @@ -674,6 +696,9 @@ run: tx_prog_fd = prog_fd[5]; else if (txmsg_redir_noisy) tx_prog_fd = prog_fd[6]; + else if (txmsg_drop) + tx_prog_fd = prog_fd[9]; + /* apply and cork must be last */ else if (txmsg_apply) tx_prog_fd = prog_fd[7]; else if (txmsg_cork) @@ -700,6 +725,7 @@ run: err, strerror(errno)); return err; } + if (txmsg_redir || txmsg_redir_noisy) redir_fd = c2; else @@ -736,6 +762,10 @@ run: } } + + if (txmsg_drop) + options.drop_expected = true; + if (test == PING_PONG) err = forever_ping_pong(rate, &options); else if (test == SENDMSG) { -- cgit v1.2.3-59-g8ed1b From 0dcbbf6785799ba05b44df2ba6bcc1195f4f945c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:58:12 -0700 Subject: bpf: sockmap sample test for bpf_msg_pull_data This adds an option to test the msg_pull_data helper. This uses two options, txmsg_start and txmsg_end, to let the user specify start and end bytes to pull. The options can be used with the txmsg_apply and txmsg_cork options, as well as with any of the basic tests, txmsg, txmsg_redir and txmsg_drop (plus noisy variants), to run pull_data inline with those tests. By giving the user direct control over the variables we can easily do negative testing as well as positive tests. Signed-off-by: John Fastabend Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_kern.c | 79 +++++++++++++++++++++++++------ samples/sockmap/sockmap_user.c | 32 +++++++++++++ tools/include/uapi/linux/bpf.h | 3 +- tools/testing/selftests/bpf/bpf_helpers.h | 2 + 4 files changed, 101 insertions(+), 15 deletions(-) diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index 8b6c34cd5f46..9ad5ba79c85a 100644 --- a/samples/sockmap/sockmap_kern.c +++ b/samples/sockmap/sockmap_kern.c @@ -71,6 +71,14 @@ struct bpf_map_def SEC("maps") sock_cork_bytes = { .max_entries = 1 }; +struct bpf_map_def SEC("maps") sock_pull_bytes = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 2 +}; + + SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { @@ -137,7 +145,8 @@ int bpf_sockmap(struct bpf_sock_ops *skops) SEC("sk_msg1") int bpf_prog4(struct sk_msg_md *msg) { - int *bytes, zero = 0; + int *bytes, zero = 0, one = 1; + int *start, *end; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) @@ -145,15 +154,18 @@ int bpf_prog4(struct sk_msg_md *msg) bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); if (bytes) bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); return SK_PASS; } SEC("sk_msg2") int bpf_prog5(struct sk_msg_md *msg) { - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - int *bytes, err1 = -1, err2 = -1, zero = 0; + int err1 = -1, err2 = -1, zero = 0, one = 1; + int *bytes, *start, *end, len1, len2; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) @@ -161,17 +173,32 @@ int bpf_prog5(struct sk_msg_md *msg) bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); if (bytes) err2 = bpf_msg_cork_bytes(msg, *bytes); + len1 = (__u64)msg->data_end - (__u64)msg->data; + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) { + int err; + + bpf_printk("sk_msg2: pull(%i:%i)\n", + start ? *start : 0, end ? 
*end : 0); + err = bpf_msg_pull_data(msg, *start, *end, 0); + if (err) + bpf_printk("sk_msg2: pull_data err %i\n", + err); + len2 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg2: length update %i->%i\n", + len1, len2); + } bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", - (__u64)data_end - (__u64)data, err1, err2); + len1, err1, err2); return SK_PASS; } SEC("sk_msg3") int bpf_prog6(struct sk_msg_md *msg) { - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - int *bytes, zero = 0; + int *bytes, zero = 0, one = 1; + int *start, *end; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) @@ -179,15 +206,18 @@ int bpf_prog6(struct sk_msg_md *msg) bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); if (bytes) bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); return bpf_msg_redirect_map(msg, &sock_map_redir, zero, 0); } SEC("sk_msg4") int bpf_prog7(struct sk_msg_md *msg) { - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - int *bytes, err1 = 0, err2 = 0, zero = 0; + int err1 = 0, err2 = 0, zero = 0, one = 1; + int *bytes, *start, *end, len1, len2; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) @@ -195,9 +225,24 @@ int bpf_prog7(struct sk_msg_md *msg) bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); if (bytes) err2 = bpf_msg_cork_bytes(msg, *bytes); - + len1 = (__u64)msg->data_end - (__u64)msg->data; + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) { + int err; + + bpf_printk("sk_msg2: pull(%i:%i)\n", + start ? *start : 0, end ? 
*end : 0); + err = bpf_msg_pull_data(msg, *start, *end, 0); + if (err) + bpf_printk("sk_msg2: pull_data err %i\n", + err); + len2 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg2: length update %i->%i\n", + len1, len2); + } bpf_printk("sk_msg3: redirect(%iB) err1=%i err2=%i\n", - (__u64)data_end - (__u64)data, err1, err2); + len1, err1, err2); return bpf_msg_redirect_map(msg, &sock_map_redir, zero, 0); } @@ -239,7 +284,8 @@ int bpf_prog9(struct sk_msg_md *msg) SEC("sk_msg7") int bpf_prog10(struct sk_msg_md *msg) { - int *bytes, zero = 0; + int *bytes, zero = 0, one = 1; + int *start, *end; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) @@ -247,6 +293,11 @@ int bpf_prog10(struct sk_msg_md *msg) bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); if (bytes) bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); + return SK_DROP; } diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 52c4ed7774d4..07aa237221d1 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -62,6 +62,8 @@ int txmsg_redir_noisy; int txmsg_drop; int txmsg_apply; int txmsg_cork; +int txmsg_start; +int txmsg_end; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -79,6 +81,8 @@ static const struct option long_options[] = { {"txmsg_drop", no_argument, &txmsg_drop, 1 }, {"txmsg_apply", required_argument, NULL, 'a'}, {"txmsg_cork", required_argument, NULL, 'k'}, + {"txmsg_start", required_argument, NULL, 's'}, + {"txmsg_end", required_argument, NULL, 'e'}, {0, 0, NULL, 0 } }; @@ -572,6 +576,12 @@ int main(int argc, char **argv) while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:", long_options, &longindex)) != -1) { switch (opt) { + case 's': + txmsg_start = atoi(optarg); + break; + case 'e': + txmsg_end = atoi(optarg); + break; case 'a': txmsg_apply = atoi(optarg); break; @@ -761,6 +771,28 @@ run: } } + if (txmsg_start) { + err = bpf_map_update_elem(map_fd[5], + &i, &txmsg_start, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_start): %d (%s)\n", + err, strerror(errno)); + return err; + } + } + + if (txmsg_end) { + i = 1; + err = bpf_map_update_elem(map_fd[5], + &i, &txmsg_end, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_end): %d (%s)\n", + err, strerror(errno)); + return err; + } + } } if (txmsg_drop) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e6924ab20fc3..d245c41213ac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -793,7 +793,8 @@ union bpf_attr { FN(sock_ops_cb_flags_set), \ FN(msg_redirect_map), \ FN(msg_apply_bytes), \ - FN(msg_cork_bytes), + FN(msg_cork_bytes), \ + FN(msg_pull_data), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index b5b45ff705ff..7cae376d8d0c 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -92,6 +92,8 @@ static int (*bpf_msg_apply_bytes)(void *ctx, int len) = (void *) BPF_FUNC_msg_apply_bytes; static int (*bpf_msg_cork_bytes)(void *ctx, int len) = (void *) BPF_FUNC_msg_cork_bytes; +static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) = + 
(void *) BPF_FUNC_msg_pull_data; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3-59-g8ed1b From ae30727fa4beabd8403b016896eb1f714e6c0fd4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:58:17 -0700 Subject: bpf: sockmap test script This adds the test script I am currently using to validate the latest sockmap changes. Shortly sockmap will be ported to selftests and these will be run from the infrastructure there. Until then add the script here so we have a coverage checklist when porting into selftests. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_test.sh | 450 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 450 insertions(+) create mode 100755 samples/sockmap/sockmap_test.sh diff --git a/samples/sockmap/sockmap_test.sh b/samples/sockmap/sockmap_test.sh new file mode 100755 index 000000000000..6d8cc40cca22 --- /dev/null +++ b/samples/sockmap/sockmap_test.sh @@ -0,0 +1,450 @@ +#Test a bunch of positive cases to verify basic functionality +for prog in "--txmsg" "--txmsg_redir" "--txmsg_drop"; do +for t in "sendmsg" "sendpage"; do +for r in 1 10 100; do + for i in 1 10 100; do + for l in 1 10 100; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 + done + done +done +done +done + +#Test max iov +t="sendmsg" +r=1 +i=1024 +l=1 +prog="--txmsg" + +TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" +echo $TEST +$TEST +sleep 2 +prog="--txmsg_redir" +TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" +echo $TEST +$TEST + +# Test max iov with 1k send + +t="sendmsg" +r=1 +i=1024 +l=1024 +prog="--txmsg" + +TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" +echo $TEST +$TEST +sleep 2 +prog="--txmsg_redir" +TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" +echo $TEST +$TEST +sleep 2 + +# Test apply with 1B +r=1 +i=1024 +l=1024 +prog="--txmsg_apply 1" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply with larger value than send +r=1 +i=8 +l=1024 +prog="--txmsg_apply 2048" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply with apply that never reaches limit +r=1024 +i=1 +l=1 +prog="--txmsg_apply 2048" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply and redirect with 1B +r=1 +i=1024 +l=1024 +prog="--txmsg_redir --txmsg_apply 1" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply and redirect with larger value than send +r=1 +i=8 +l=1024 +prog="--txmsg_redir --txmsg_apply 2048" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply and redirect with apply that never reaches limit +r=1024 +i=1 +l=1 +prog="--txmsg_apply 2048" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with 1B not really useful but test it anyways +r=1 +i=1024 +l=1024 
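# Editor's annotation, not in the original script: across these stanzas
# r is the send count (--rate), i the iovec count (--iov_count) and l
# the buffer length (--length) handed to each sockmap invocation. A
# cork value smaller than the bytes sent should be satisfied
# immediately, so the verdict still runs on every send call.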
+prog="--txmsg_cork 1" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with a more reasonable 100B +r=1 +i=1000 +l=1000 +prog="--txmsg_cork 100" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with larger value than send +r=1 +i=8 +l=1024 +prog="--txmsg_cork 2048" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with cork that never reaches limit +r=1024 +i=1 +l=1 +prog="--txmsg_cork 2048" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +r=1 +i=1024 +l=1024 +prog="--txmsg_redir --txmsg_cork 1" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with a more reasonable 100B +r=1 +i=1000 +l=1000 +prog="--txmsg_redir --txmsg_cork 100" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with larger value than send +r=1 +i=8 +l=1024 +prog="--txmsg_redir --txmsg_cork 2048" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with cork that never reaches limit +r=1024 +i=1 +l=1 +prog="--txmsg_cork 2048" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + + +# mix and match cork and apply not really useful but valid programs + +# Test apply < cork +r=100 +i=1 +l=5 +prog="--txmsg_apply 10 --txmsg_cork 100" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Try again with larger sizes so we hit overflow case +r=100 +i=1000 +l=2048 +prog="--txmsg_apply 4096 --txmsg_cork 8096" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply > cork +r=100 +i=1 +l=5 +prog="--txmsg_apply 100 --txmsg_cork 10" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Again with larger sizes so we hit overflow cases +r=100 +i=1000 +l=2048 +prog="--txmsg_apply 8096 --txmsg_cork 4096" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + + +# Test apply = cork +r=100 +i=1 +l=5 +prog="--txmsg_apply 10 --txmsg_cork 10" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +r=100 +i=1000 +l=2048 +prog="--txmsg_apply 4096 --txmsg_cork 4096" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply < cork +r=100 +i=1 +l=5 +prog="--txmsg_redir --txmsg_apply 10 --txmsg_cork 100" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 
+done
+
+# Try again with larger sizes so we hit the overflow case
+r=100
+i=1000
+l=2048
+prog="--txmsg_redir --txmsg_apply 4096 --txmsg_cork 8096"
+for t in "sendpage" "sendmsg"; do
+ TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+ echo $TEST
+ $TEST
+ sleep 2
+done
+
+# Test apply > cork, with redirect
+r=100
+i=1
+l=5
+prog="--txmsg_redir --txmsg_apply 100 --txmsg_cork 10"
+for t in "sendpage" "sendmsg"; do
+ TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+ echo $TEST
+ $TEST
+ sleep 2
+done
+
+# Again with larger sizes so we hit the overflow cases
+r=100
+i=1000
+l=2048
+prog="--txmsg_redir --txmsg_apply 8096 --txmsg_cork 4096"
+for t in "sendpage" "sendmsg"; do
+ TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+ echo $TEST
+ $TEST
+ sleep 2
+done
+
+
+# Test apply = cork, with redirect
+r=100
+i=1
+l=5
+prog="--txmsg_redir --txmsg_apply 10 --txmsg_cork 10"
+for t in "sendpage" "sendmsg"; do
+ TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+ echo $TEST
+ $TEST
+ sleep 2
+done
+
+r=100
+i=1000
+l=2048
+prog="--txmsg_redir --txmsg_apply 4096 --txmsg_cork 4096"
+for t in "sendpage" "sendmsg"; do
+ TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+ echo $TEST
+ $TEST
+ sleep 2
+done
+
+# Tests for bpf_msg_pull_data()
+for i in `seq 99 100 1600`; do
+ TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+ --txmsg --txmsg_start 0 --txmsg_end $i --txmsg_cork 1600"
+ echo $TEST
+ $TEST
+ sleep 2
+done
+
+for i in `seq 199 100 1600`; do
+ TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+ --txmsg --txmsg_start 100 --txmsg_end $i --txmsg_cork 1600"
+ echo $TEST
+ $TEST
+ sleep 2
+done
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+ --txmsg --txmsg_start 1500 --txmsg_end 1600 --txmsg_cork 1600"
+echo $TEST
+$TEST
+sleep 2
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+ --txmsg --txmsg_start 1111 --txmsg_end 1112 --txmsg_cork 1600"
+echo $TEST
+$TEST
+sleep 2
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+ --txmsg --txmsg_start 1111 --txmsg_end 0 --txmsg_cork 1600"
+echo $TEST
+$TEST
+sleep 2
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+ --txmsg --txmsg_start 0 --txmsg_end 1601 --txmsg_cork 1600"
+echo $TEST
+$TEST
+sleep 2
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+ --txmsg --txmsg_start 0 --txmsg_end 1601 --txmsg_cork 1602"
+echo $TEST
+$TEST
+sleep 2
+
+# Run through the gamut again, this time with start and end offsets
+for prog in "--txmsg" "--txmsg_redir" "--txmsg_drop"; do
+for t in "sendmsg" "sendpage"; do
+for r in 1 10 100; do
+ for i in 1 10 100; do
+ for l in 1 10 100; do
+ TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog --txmsg_start 1 --txmsg_end 2"
+ echo $TEST
+ $TEST
+ sleep 2
+ done
+ done
+done
+done
+done
+
+# Targeted tests to cover specific code paths
+./sockmap --cgroup /mnt/cgroup2/ -t sendpage \
+ -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 3
+./sockmap --cgroup /mnt/cgroup2/ -t sendmsg \
+ -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 3
+./sockmap --cgroup /mnt/cgroup2/ -t sendpage \
+ -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 5
+./sockmap --cgroup /mnt/cgroup2/ -t sendmsg \
+ -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 5
-- cgit v1.2.3-59-g8ed1b
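A note on the pattern in the script above: every block repeats the same echo/run/sleep loop with a different (r, i, l, prog) tuple. As a sketch only -- run_sockmap_test is a hypothetical helper, not part of the patch -- the repetition could be factored out, assuming the same ./sockmap binary and /mnt/cgroup2/ cgroup path used throughout:

run_sockmap_test () {
	# First three args are receive count, iterations, length;
	# the rest is passed through as the txmsg program options.
	local r=$1 i=$2 l=$3; shift 3
	for t in sendpage sendmsg; do
		TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $*"
		echo $TEST
		$TEST
		sleep 2
	done
}

# e.g. the two "apply = cork" redirect cases collapse to:
run_sockmap_test 100 1    5    --txmsg_redir --txmsg_apply 10 --txmsg_cork 10
run_sockmap_test 100 1000 2048 --txmsg_redir --txmsg_apply 4096 --txmsg_cork 4096

The expanded form in the patch does keep each test case greppable by its literal command line, which is a reasonable trade-off for a test script.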
From 78262f4575c29f185947fe58952cd1beabc74f82 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 20 Mar 2018 00:21:15 +0100
Subject: bpf, doc: add description wrt native/bpf clang target and pointer size

As this recently came up on netdev [0], let's add it to the BPF devel doc.

[0] https://www.spinics.net/lists/netdev/msg489612.html

Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
---
 Documentation/bpf/bpf_devel_QA.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/Documentation/bpf/bpf_devel_QA.txt b/Documentation/bpf/bpf_devel_QA.txt
index 84cbb302f2b5..1a0b704e1a38 100644
--- a/Documentation/bpf/bpf_devel_QA.txt
+++ b/Documentation/bpf/bpf_devel_QA.txt
@@ -539,6 +539,18 @@ A: Although LLVM IR generation and optimization try to stay architecture
 The clang option "-fno-jump-tables" can be used to disable switch table
 generation.
 
+ - For clang -target bpf, it is guaranteed that pointer or long /
+ unsigned long types will always have a width of 64 bits, no matter
+ whether the underlying clang binary or the default target (or kernel)
+ is 32 bit. However, when the native clang target is used, it compiles
+ these types based on the underlying architecture's conventions, meaning
+ that on a 32 bit architecture, pointer or long / unsigned long types,
+ e.g. in the BPF context structure, will have a width of 32 bits while
+ the BPF LLVM back end still operates on 64 bit values. The native
+ target is mostly needed in tracing for the case of walking pt_regs
+ or other kernel structures where the CPU's register width matters.
+ Otherwise, clang -target bpf is generally recommended.
+
 You should use default target when:
 
 - Your program includes a header file, e.g., ptrace.h, which eventually
-- cgit v1.2.3-59-g8ed1b
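To make the width guarantee above concrete, here is a minimal sketch (an illustration only, not part of the patch; the file name width_check.c is hypothetical). Compiled with "clang -O2 -target bpf -c width_check.c" it builds cleanly on both 32 bit and 64 bit hosts, whereas a native 32 bit target trips both assertions:

/* width_check.c: with clang -target bpf, longs and pointers are always
 * 64 bits wide, matching the BPF VM. With the native target on a 32 bit
 * host they would be 4 bytes, and both assertions below would fail at
 * compile time.
 */
struct ctx_like {
	unsigned long val;	/* 8 bytes under -target bpf */
	void *ptr;		/* likewise 8 bytes */
};

_Static_assert(sizeof(unsigned long) == 8,
	       "long must be 64 bits wide; build with clang -target bpf");
_Static_assert(sizeof(void *) == 8,
	       "pointer width must match the 64 bit BPF VM");

The same assertions can be dropped into a real BPF program source to catch an accidental native 32 bit build at compile time rather than debugging a mis-sized context structure at run time.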