aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf/examples/bpf/augmented_raw_syscalls.c
diff options
context:
space:
mode:
Diffstat (limited to 'tools/perf/examples/bpf/augmented_raw_syscalls.c')
-rw-r--r--tools/perf/examples/bpf/augmented_raw_syscalls.c448
1 files changed, 216 insertions, 232 deletions
diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c
index 2422894a8194..b80437971d80 100644
--- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
+++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
@@ -16,17 +16,38 @@
#include <unistd.h>
#include <linux/limits.h>
+#include <linux/socket.h>
#include <pid_filter.h>
/* bpf-output associated map */
bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__);
+/*
+ * string_args_len: one per syscall arg, 0 means not a string or don't copy it,
+ * PATH_MAX for copying everything, any other value to limit
+ * it a la 'strace -s strsize'.
+ */
struct syscall {
bool enabled;
+ u16 string_args_len[6];
};
bpf_map(syscalls, ARRAY, int, struct syscall, 512);
+/*
+ * What to augment at entry?
+ *
+ * Pointer arg payloads (filenames, etc) passed from userspace to the kernel
+ */
+bpf_map(syscalls_sys_enter, PROG_ARRAY, u32, u32, 512);
+
+/*
+ * What to augment at exit?
+ *
+ * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace.
+ */
+bpf_map(syscalls_sys_exit, PROG_ARRAY, u32, u32, 512);
+
struct syscall_enter_args {
unsigned long long common_tp_fields;
long syscall_nr;
@@ -39,272 +60,235 @@ struct syscall_exit_args {
long ret;
};
-struct augmented_filename {
+struct augmented_arg {
unsigned int size;
- int reserved;
+ int err;
char value[PATH_MAX];
};
-/* syscalls where the first arg is a string */
-#define SYS_OPEN 2
-#define SYS_STAT 4
-#define SYS_LSTAT 6
-#define SYS_ACCESS 21
-#define SYS_EXECVE 59
-#define SYS_TRUNCATE 76
-#define SYS_CHDIR 80
-#define SYS_RENAME 82
-#define SYS_MKDIR 83
-#define SYS_RMDIR 84
-#define SYS_CREAT 85
-#define SYS_LINK 86
-#define SYS_UNLINK 87
-#define SYS_SYMLINK 88
-#define SYS_READLINK 89
-#define SYS_CHMOD 90
-#define SYS_CHOWN 92
-#define SYS_LCHOWN 94
-#define SYS_MKNOD 133
-#define SYS_STATFS 137
-#define SYS_PIVOT_ROOT 155
-#define SYS_CHROOT 161
-#define SYS_ACCT 163
-#define SYS_SWAPON 167
-#define SYS_SWAPOFF 168
-#define SYS_DELETE_MODULE 176
-#define SYS_SETXATTR 188
-#define SYS_LSETXATTR 189
-#define SYS_GETXATTR 191
-#define SYS_LGETXATTR 192
-#define SYS_LISTXATTR 194
-#define SYS_LLISTXATTR 195
-#define SYS_REMOVEXATTR 197
-#define SYS_LREMOVEXATTR 198
-#define SYS_MQ_OPEN 240
-#define SYS_MQ_UNLINK 241
-#define SYS_ADD_KEY 248
-#define SYS_REQUEST_KEY 249
-#define SYS_SYMLINKAT 266
-#define SYS_MEMFD_CREATE 319
-
-/* syscalls where the first arg is a string */
-
-#define SYS_PWRITE64 18
-#define SYS_EXECVE 59
-#define SYS_RENAME 82
-#define SYS_QUOTACTL 179
-#define SYS_FSETXATTR 190
-#define SYS_FGETXATTR 193
-#define SYS_FREMOVEXATTR 199
-#define SYS_MQ_TIMEDSEND 242
-#define SYS_REQUEST_KEY 249
-#define SYS_INOTIFY_ADD_WATCH 254
-#define SYS_OPENAT 257
-#define SYS_MKDIRAT 258
-#define SYS_MKNODAT 259
-#define SYS_FCHOWNAT 260
-#define SYS_FUTIMESAT 261
-#define SYS_NEWFSTATAT 262
-#define SYS_UNLINKAT 263
-#define SYS_RENAMEAT 264
-#define SYS_LINKAT 265
-#define SYS_READLINKAT 267
-#define SYS_FCHMODAT 268
-#define SYS_FACCESSAT 269
-#define SYS_UTIMENSAT 280
-#define SYS_NAME_TO_HANDLE_AT 303
-#define SYS_FINIT_MODULE 313
-#define SYS_RENAMEAT2 316
-#define SYS_EXECVEAT 322
-#define SYS_STATX 332
-
pid_filter(pids_filtered);
-struct augmented_args_filename {
+struct augmented_args_payload {
struct syscall_enter_args args;
- struct augmented_filename filename;
+ union {
+ struct {
+ struct augmented_arg arg, arg2;
+ };
+ struct sockaddr_storage saddr;
+ };
};
-bpf_map(augmented_filename_map, PERCPU_ARRAY, int, struct augmented_args_filename, 1);
+// We need more tmp space than the BPF stack can give us
+bpf_map(augmented_args_tmp, PERCPU_ARRAY, int, struct augmented_args_payload, 1);
-SEC("raw_syscalls:sys_enter")
-int sys_enter(struct syscall_enter_args *args)
+static inline struct augmented_args_payload *augmented_args_payload(void)
{
- struct augmented_args_filename *augmented_args;
- unsigned int len = sizeof(*augmented_args);
- const void *filename_arg = NULL;
- struct syscall *syscall;
int key = 0;
+ return bpf_map_lookup_elem(&augmented_args_tmp, &key);
+}
+
+static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
+{
+ /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
+ return perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
+}
+
+static inline
+unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
+{
+ unsigned int augmented_len = sizeof(*augmented_arg);
+ int string_len = probe_read_str(&augmented_arg->value, arg_len, arg);
+
+ augmented_arg->size = augmented_arg->err = 0;
+ /*
+ * probe_read_str may return < 0, e.g. -EFAULT, in which case we
+ * leave ->size at zero and report the error in augmented_arg->err.
+ */
+ if (string_len > 0) {
+ augmented_len -= sizeof(augmented_arg->value) - string_len;
+ augmented_len &= sizeof(augmented_arg->value) - 1;
+ augmented_arg->size = string_len;
+ } else {
+ /*
+ * So that userspace notices the error while still being able
+ * to skip this augmented arg record
+ */
+ augmented_arg->err = string_len;
+ augmented_len = offsetof(struct augmented_arg, value);
+ }
+
+ return augmented_len;
+}
+
+SEC("!raw_syscalls:unaugmented")
+int syscall_unaugmented(struct syscall_enter_args *args)
+{
+ return 1;
+}
+
+/*
+ * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in
+ * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go
+ * on from there, adding the syscall specific pointer arg payload, e.g.
+ * connect's sockaddr or open's filename.
+ */
+SEC("!syscalls:sys_enter_connect")
+int sys_enter_connect(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *sockaddr_arg = (const void *)args->args[1];
+ unsigned int socklen = args->args[2];
+ unsigned int len = sizeof(augmented_args->args);
- augmented_args = bpf_map_lookup_elem(&augmented_filename_map, &key);
if (augmented_args == NULL)
- return 1;
+ return 1; /* Failure: don't filter */
- if (pid_filter__has(&pids_filtered, getpid()))
- return 0;
+ if (socklen > sizeof(augmented_args->saddr))
+ socklen = sizeof(augmented_args->saddr);
- probe_read(&augmented_args->args, sizeof(augmented_args->args), args);
+ probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
- syscall = bpf_map_lookup_elem(&syscalls, &augmented_args->args.syscall_nr);
- if (syscall == NULL || !syscall->enabled)
- return 0;
+ return augmented__output(args, augmented_args, len + socklen);
+}
+
+SEC("!syscalls:sys_enter_sendto")
+int sys_enter_sendto(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *sockaddr_arg = (const void *)args->args[4];
+ unsigned int socklen = args->args[5];
+ unsigned int len = sizeof(augmented_args->args);
+
+ if (augmented_args == NULL)
+ return 1; /* Failure: don't filter */
+
+ if (socklen > sizeof(augmented_args->saddr))
+ socklen = sizeof(augmented_args->saddr);
+
+ probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
+
+ return augmented__output(args, augmented_args, len + socklen);
+}
+
+SEC("!syscalls:sys_enter_open")
+int sys_enter_open(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *filename_arg = (const void *)args->args[0];
+ unsigned int len = sizeof(augmented_args->args);
+
+ if (augmented_args == NULL)
+ return 1; /* Failure: don't filter */
+
+ len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
+
+ return augmented__output(args, augmented_args, len);
+}
+
+SEC("!syscalls:sys_enter_openat")
+int sys_enter_openat(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *filename_arg = (const void *)args->args[1];
+ unsigned int len = sizeof(augmented_args->args);
+
+ if (augmented_args == NULL)
+ return 1; /* Failure: don't filter */
+
+ len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
+
+ return augmented__output(args, augmented_args, len);
+}
+
+SEC("!syscalls:sys_enter_rename")
+int sys_enter_rename(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *oldpath_arg = (const void *)args->args[0],
+ *newpath_arg = (const void *)args->args[1];
+ unsigned int len = sizeof(augmented_args->args), oldpath_len;
+
+ if (augmented_args == NULL)
+ return 1; /* Failure: don't filter */
+
+ oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
+ len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value));
+
+ return augmented__output(args, augmented_args, len);
+}
+
+SEC("!syscalls:sys_enter_renameat")
+int sys_enter_renameat(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *oldpath_arg = (const void *)args->args[1],
+ *newpath_arg = (const void *)args->args[3];
+ unsigned int len = sizeof(augmented_args->args), oldpath_len;
+
+ if (augmented_args == NULL)
+ return 1; /* Failure: don't filter */
+
+ oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
+ len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value));
+
+ return augmented__output(args, augmented_args, len);
+}
+
+SEC("raw_syscalls:sys_enter")
+int sys_enter(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args;
/*
- * Yonghong and Edward Cree sayz:
- *
- * https://www.spinics.net/lists/netdev/msg531645.html
- *
- * >> R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
- * >> 10: (bf) r1 = r6
- * >> 11: (07) r1 += 16
- * >> 12: (05) goto pc+2
- * >> 15: (79) r3 = *(u64 *)(r1 +0)
- * >> dereference of modified ctx ptr R1 off=16 disallowed
- * > Aha, we at least got a different error message this time.
- * > And indeed llvm has done that optimisation, rather than the more obvious
- * > 11: r3 = *(u64 *)(r1 +16)
- * > because it wants to have lots of reads share a single insn. You may be able
- * > to defeat that optimisation by adding compiler barriers, idk. Maybe someone
- * > with llvm knowledge can figure out how to stop it (ideally, llvm would know
- * > when it's generating for bpf backend and not do that). -O0? ¯\_(ツ)_/¯
+ * We start len, the amount of data that will be in the perf ring
+ * buffer, if this is not filtered out by one of pid_filter__has(),
+ * syscall->enabled, etc, with the non-augmented raw syscall payload,
+ * i.e. sizeof(augmented_args->args).
*
- * The optimization mostly likes below:
- *
- * br1:
- * ...
- * r1 += 16
- * goto merge
- * br2:
- * ...
- * r1 += 20
- * goto merge
- * merge:
- * *(u64 *)(r1 + 0)
- *
- * The compiler tries to merge common loads. There is no easy way to
- * stop this compiler optimization without turning off a lot of other
- * optimizations. The easiest way is to add barriers:
- *
- * __asm__ __volatile__("": : :"memory")
- *
- * after the ctx memory access to prevent their down stream merging.
+ * We'll add to this as we add augmented syscalls right after that
+ * initial, non-augmented raw_syscalls:sys_enter payload.
*/
+ unsigned int len = sizeof(augmented_args->args);
+ struct syscall *syscall;
+
+ if (pid_filter__has(&pids_filtered, getpid()))
+ return 0;
+
+ augmented_args = augmented_args_payload();
+ if (augmented_args == NULL)
+ return 1;
+
+ probe_read(&augmented_args->args, sizeof(augmented_args->args), args);
+
/*
- * This table of what args are strings will be provided by userspace,
- * in the syscalls map, i.e. we will already have to do the lookup to
- * see if this specific syscall is filtered, so we can as well get more
- * info about what syscall args are strings or pointers, and how many
- * bytes to copy, per arg, etc.
- *
- * For now hard code it, till we have all the basic mechanisms in place
- * to automate everything and make the kernel part be completely driven
- * by information obtained in userspace for each kernel version and
- * processor architecture, making the kernel part the same no matter what
- * kernel version or processor architecture it runs on.
+ * Jump to the syscall specific augmenter, even if it is the default
+ * one, "!raw_syscalls:unaugmented", which just returns 1 so that the
+ * unaugmented tracepoint payload is recorded.
*/
- switch (augmented_args->args.syscall_nr) {
- case SYS_ACCT:
- case SYS_ADD_KEY:
- case SYS_CHDIR:
- case SYS_CHMOD:
- case SYS_CHOWN:
- case SYS_CHROOT:
- case SYS_CREAT:
- case SYS_DELETE_MODULE:
- case SYS_EXECVE:
- case SYS_GETXATTR:
- case SYS_LCHOWN:
- case SYS_LGETXATTR:
- case SYS_LINK:
- case SYS_LISTXATTR:
- case SYS_LLISTXATTR:
- case SYS_LREMOVEXATTR:
- case SYS_LSETXATTR:
- case SYS_LSTAT:
- case SYS_MEMFD_CREATE:
- case SYS_MKDIR:
- case SYS_MKNOD:
- case SYS_MQ_OPEN:
- case SYS_MQ_UNLINK:
- case SYS_PIVOT_ROOT:
- case SYS_READLINK:
- case SYS_REMOVEXATTR:
- case SYS_RENAME:
- case SYS_REQUEST_KEY:
- case SYS_RMDIR:
- case SYS_SETXATTR:
- case SYS_STAT:
- case SYS_STATFS:
- case SYS_SWAPOFF:
- case SYS_SWAPON:
- case SYS_SYMLINK:
- case SYS_SYMLINKAT:
- case SYS_TRUNCATE:
- case SYS_UNLINK:
- case SYS_ACCESS:
- case SYS_OPEN: filename_arg = (const void *)args->args[0];
- __asm__ __volatile__("": : :"memory");
- break;
- case SYS_EXECVEAT:
- case SYS_FACCESSAT:
- case SYS_FCHMODAT:
- case SYS_FCHOWNAT:
- case SYS_FGETXATTR:
- case SYS_FINIT_MODULE:
- case SYS_FREMOVEXATTR:
- case SYS_FSETXATTR:
- case SYS_FUTIMESAT:
- case SYS_INOTIFY_ADD_WATCH:
- case SYS_LINKAT:
- case SYS_MKDIRAT:
- case SYS_MKNODAT:
- case SYS_MQ_TIMEDSEND:
- case SYS_NAME_TO_HANDLE_AT:
- case SYS_NEWFSTATAT:
- case SYS_PWRITE64:
- case SYS_QUOTACTL:
- case SYS_READLINKAT:
- case SYS_RENAMEAT:
- case SYS_RENAMEAT2:
- case SYS_STATX:
- case SYS_UNLINKAT:
- case SYS_UTIMENSAT:
- case SYS_OPENAT: filename_arg = (const void *)args->args[1];
- break;
- }
+ bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
- if (filename_arg != NULL) {
- augmented_args->filename.reserved = 0;
- augmented_args->filename.size = probe_read_str(&augmented_args->filename.value,
- sizeof(augmented_args->filename.value),
- filename_arg);
- if (augmented_args->filename.size < sizeof(augmented_args->filename.value)) {
- len -= sizeof(augmented_args->filename.value) - augmented_args->filename.size;
- len &= sizeof(augmented_args->filename.value) - 1;
- }
- } else {
- len = sizeof(augmented_args->args);
- }
-
- /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
- return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, augmented_args, len);
+ // If not found on the PROG_ARRAY syscalls map, then we're filtering it:
+ return 0;
}
SEC("raw_syscalls:sys_exit")
int sys_exit(struct syscall_exit_args *args)
{
struct syscall_exit_args exit_args;
- struct syscall *syscall;
if (pid_filter__has(&pids_filtered, getpid()))
return 0;
probe_read(&exit_args, sizeof(exit_args), args);
-
- syscall = bpf_map_lookup_elem(&syscalls, &exit_args.syscall_nr);
- if (syscall == NULL || !syscall->enabled)
- return 0;
-
- return 1;
+ /*
+ * Jump to the syscall specific return augmenter, even if it is the
+ * default one, "!raw_syscalls:unaugmented", which just returns 1 so that
+ * the unaugmented tracepoint payload is recorded.
+ */
+ bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
+ /*
+ * If not found on the PROG_ARRAY syscalls map, then we're filtering it:
+ */
+ return 0;
}
license(GPL);