aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/audit.c6
-rw-r--r--kernel/audit_fsnotify.c5
-rw-r--r--kernel/audit_tree.c12
-rw-r--r--kernel/audit_watch.c7
-rw-r--r--kernel/auditfilter.c6
-rw-r--r--kernel/auditsc.c135
-rw-r--r--kernel/bpf/Makefile4
-rw-r--r--kernel/bpf/arraymap.c50
-rw-r--r--kernel/bpf/btf.c2348
-rw-r--r--kernel/bpf/cgroup.c11
-rw-r--r--kernel/bpf/core.c190
-rw-r--r--kernel/bpf/cpumap.c132
-rw-r--r--kernel/bpf/devmap.c152
-rw-r--r--kernel/bpf/hashtab.c12
-rw-r--r--kernel/bpf/helpers.c15
-rw-r--r--kernel/bpf/inode.c182
-rw-r--r--kernel/bpf/lpm_trie.c5
-rw-r--r--kernel/bpf/offload.c6
-rw-r--r--kernel/bpf/sockmap.c640
-rw-r--r--kernel/bpf/stackmap.c138
-rw-r--r--kernel/bpf/syscall.c370
-rw-r--r--kernel/bpf/tnum.c10
-rw-r--r--kernel/bpf/verifier.c437
-rw-r--r--kernel/bpf/xskmap.c232
-rw-r--r--kernel/cgroup/Makefile2
-rw-r--r--kernel/cgroup/cgroup-internal.h11
-rw-r--r--kernel/cgroup/cgroup-v1.c4
-rw-r--r--kernel/cgroup/cgroup.c109
-rw-r--r--kernel/cgroup/cpuset.c7
-rw-r--r--kernel/cgroup/rdma.c35
-rw-r--r--kernel/cgroup/rstat.c416
-rw-r--r--kernel/cgroup/stat.c338
-rw-r--r--kernel/configs/android-recommended.config2
-rw-r--r--kernel/configs/tiny.config4
-rw-r--r--kernel/crash_core.c1
-rw-r--r--kernel/debug/kdb/kdb_main.c13
-rw-r--r--kernel/dma/Kconfig50
-rw-r--r--kernel/dma/Makefile11
-rw-r--r--kernel/dma/coherent.c434
-rw-r--r--kernel/dma/contiguous.c278
-rw-r--r--kernel/dma/debug.c1773
-rw-r--r--kernel/dma/direct.c204
-rw-r--r--kernel/dma/mapping.c345
-rw-r--r--kernel/dma/noncoherent.c102
-rw-r--r--kernel/dma/swiotlb.c1087
-rw-r--r--kernel/dma/virt.c59
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/events/ring_buffer.c3
-rw-r--r--kernel/events/uprobes.c3
-rw-r--r--kernel/fail_function.c2
-rw-r--r--kernel/fork.c15
-rw-r--r--kernel/gcov/Kconfig17
-rw-r--r--kernel/gcov/Makefile2
-rw-r--r--kernel/hung_task.c11
-rw-r--r--kernel/iomem.c167
-rw-r--r--kernel/irq/debugfs.c1
-rw-r--r--kernel/irq/manage.c38
-rw-r--r--kernel/irq/migration.c31
-rw-r--r--kernel/kcov.c21
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kexec_file.c2
-rw-r--r--kernel/locking/locktorture.c14
-rw-r--r--kernel/memremap.c210
-rw-r--r--kernel/module.c7
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/power/hibernate.c7
-rw-r--r--kernel/power/main.c5
-rw-r--r--kernel/power/qos.c1
-rw-r--r--kernel/power/suspend.c27
-rw-r--r--kernel/power/swap.c6
-rw-r--r--kernel/power/user.c5
-rw-r--r--kernel/power/wakelock.c1
-rw-r--r--kernel/printk/printk.c14
-rw-r--r--kernel/printk/printk_safe.c7
-rw-r--r--kernel/rcu/rcutorture.c5
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/resource.c1
-rw-r--r--kernel/rseq.c357
-rw-r--r--kernel/sched/core.c6
-rw-r--r--kernel/sched/cpufreq_schedutil.c262
-rw-r--r--kernel/sched/fair.c4
-rw-r--r--kernel/sched/rt.c4
-rw-r--r--kernel/sched/topology.c2
-rw-r--r--kernel/seccomp.c126
-rw-r--r--kernel/signal.c24
-rw-r--r--kernel/softirq.c6
-rw-r--r--kernel/sys.c10
-rw-r--r--kernel/sys_ni.c5
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/time/hrtimer.c2
-rw-r--r--kernel/time/posix-cpu-timers.c2
-rw-r--r--kernel/time/tick-common.c2
-rw-r--r--kernel/time/time.c6
-rw-r--r--kernel/trace/Kconfig22
-rw-r--r--kernel/trace/bpf_trace.c116
-rw-r--r--kernel/trace/ftrace.c28
-rw-r--r--kernel/trace/ring_buffer.c20
-rw-r--r--kernel/trace/trace.c52
-rw-r--r--kernel/trace/trace.h9
-rw-r--r--kernel/trace/trace_entries.h6
-rw-r--r--kernel/trace/trace_events.c36
-rw-r--r--kernel/trace/trace_events_filter.c39
-rw-r--r--kernel/trace/trace_events_hist.c2
-rw-r--r--kernel/trace/trace_events_trigger.c6
-rw-r--r--kernel/trace/trace_export.c9
-rw-r--r--kernel/trace/trace_kprobe.c29
-rw-r--r--kernel/trace/trace_uprobe.c22
-rw-r--r--kernel/trace/tracing_map.c2
-rw-r--r--kernel/tracepoint.c3
-rw-r--r--kernel/umh.c124
-rw-r--r--kernel/user_namespace.c6
-rw-r--r--kernel/workqueue.c145
-rw-r--r--kernel/workqueue_internal.h3
114 files changed, 10988 insertions, 1549 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f85ae5dfa474..04bc07c2b42a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -41,6 +41,7 @@ obj-y += printk/
obj-y += irq/
obj-y += rcu/
obj-y += livepatch/
+obj-y += dma/
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
@@ -112,7 +113,9 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
-obj-$(CONFIG_HAS_IOMEM) += memremap.o
+obj-$(CONFIG_HAS_IOMEM) += iomem.o
+obj-$(CONFIG_ZONE_DEVICE) += memremap.o
+obj-$(CONFIG_RSEQ) += rseq.o
$(obj)/configs.o: $(obj)/config_data.h
diff --git a/kernel/audit.c b/kernel/audit.c
index 670665c6e2a6..e7478cb58079 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1099,8 +1099,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
if (audit_enabled == AUDIT_OFF)
return;
-
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
if (!ab)
return;
audit_log_task_info(ab, current);
@@ -2317,8 +2316,7 @@ void audit_log_link_denied(const char *operation)
return;
/* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
- ab = audit_log_start(current->audit_context, GFP_KERNEL,
- AUDIT_ANOM_LINK);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK);
if (!ab)
return;
audit_log_format(ab, "op=%s", operation);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 52f368b6561e..fba78047fb37 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -109,7 +109,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
audit_update_mark(audit_mark, dentry->d_inode);
audit_mark->rule = krule;
- ret = fsnotify_add_mark(&audit_mark->mark, inode, NULL, true);
+ ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true);
if (ret < 0) {
fsnotify_put_mark(&audit_mark->mark);
audit_mark = ERR_PTR(ret);
@@ -165,12 +165,11 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
/* Update mark data in audit rules based on fsnotify events. */
static int audit_mark_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
const unsigned char *dname, u32 cookie,
struct fsnotify_iter_info *iter_info)
{
+ struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
struct audit_fsnotify_mark *audit_mark;
const struct inode *inode = NULL;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 67e6956c0b61..c99ebaae5abc 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -288,8 +288,8 @@ static void untag_chunk(struct node *p)
if (!new)
goto Fallback;
- if (fsnotify_add_mark_locked(&new->mark, entry->connector->inode,
- NULL, 1)) {
+ if (fsnotify_add_inode_mark_locked(&new->mark, entry->connector->inode,
+ 1)) {
fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -354,7 +354,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
return -ENOMEM;
entry = &chunk->mark;
- if (fsnotify_add_mark(entry, inode, NULL, 0)) {
+ if (fsnotify_add_inode_mark(entry, inode, 0)) {
fsnotify_put_mark(entry);
return -ENOSPC;
}
@@ -434,8 +434,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
return -ENOENT;
}
- if (fsnotify_add_mark_locked(chunk_entry,
- old_entry->connector->inode, NULL, 1)) {
+ if (fsnotify_add_inode_mark_locked(chunk_entry,
+ old_entry->connector->inode, 1)) {
spin_unlock(&old_entry->lock);
mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_put_mark(chunk_entry);
@@ -989,8 +989,6 @@ static void evict_chunk(struct audit_chunk *chunk)
static int audit_tree_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
const unsigned char *file_name, u32 cookie,
struct fsnotify_iter_info *iter_info)
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 9eb8b3511636..c17c0c268436 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -160,7 +160,7 @@ static struct audit_parent *audit_init_parent(struct path *path)
fsnotify_init_mark(&parent->mark, audit_watch_group);
parent->mark.mask = AUDIT_FS_WATCH;
- ret = fsnotify_add_mark(&parent->mark, inode, NULL, 0);
+ ret = fsnotify_add_inode_mark(&parent->mark, inode, 0);
if (ret < 0) {
audit_free_parent(parent);
return ERR_PTR(ret);
@@ -274,7 +274,7 @@ static void audit_update_watch(struct audit_parent *parent,
/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
if (invalidating && !audit_dummy_context())
- audit_filter_inodes(current, current->audit_context);
+ audit_filter_inodes(current, audit_context());
/* updating ino will likely change which audit_hash_list we
* are on so we need a new watch for the new list */
@@ -472,12 +472,11 @@ void audit_remove_watch_rule(struct audit_krule *krule)
/* Update watch data in audit rules based on fsnotify events. */
static int audit_watch_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
const unsigned char *dname, u32 cookie,
struct fsnotify_iter_info *iter_info)
{
+ struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
const struct inode *inode;
struct audit_parent *parent;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index d7a807e81451..eaa320148d97 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -426,7 +426,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
return -EINVAL;
break;
case AUDIT_EXE:
- if (f->op != Audit_equal)
+ if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
if (entry->rule.listnr != AUDIT_FILTER_EXIT)
return -EINVAL;
@@ -1089,8 +1089,6 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
struct audit_buffer *ab;
- uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
- unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -1098,7 +1096,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
+ audit_log_session_info(ab);
audit_log_task_context(ab);
audit_log_format(ab, " op=%s", action);
audit_log_key(ab, rule->filterkey);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4e0a4ac803db..ceb1c4596c51 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -374,7 +374,7 @@ static int audit_field_compare(struct task_struct *tsk,
case AUDIT_COMPARE_EGID_TO_OBJ_GID:
return audit_compare_gid(cred->egid, name, f, ctx);
case AUDIT_COMPARE_AUID_TO_OBJ_UID:
- return audit_compare_uid(tsk->loginuid, name, f, ctx);
+ return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx);
case AUDIT_COMPARE_SUID_TO_OBJ_UID:
return audit_compare_uid(cred->suid, name, f, ctx);
case AUDIT_COMPARE_SGID_TO_OBJ_GID:
@@ -385,7 +385,8 @@ static int audit_field_compare(struct task_struct *tsk,
return audit_compare_gid(cred->fsgid, name, f, ctx);
/* uid comparisons */
case AUDIT_COMPARE_UID_TO_AUID:
- return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
+ return audit_uid_comparator(cred->uid, f->op,
+ audit_get_loginuid(tsk));
case AUDIT_COMPARE_UID_TO_EUID:
return audit_uid_comparator(cred->uid, f->op, cred->euid);
case AUDIT_COMPARE_UID_TO_SUID:
@@ -394,11 +395,14 @@ static int audit_field_compare(struct task_struct *tsk,
return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
/* auid comparisons */
case AUDIT_COMPARE_AUID_TO_EUID:
- return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
+ return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+ cred->euid);
case AUDIT_COMPARE_AUID_TO_SUID:
- return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
+ return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+ cred->suid);
case AUDIT_COMPARE_AUID_TO_FSUID:
- return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
+ return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+ cred->fsuid);
/* euid comparisons */
case AUDIT_COMPARE_EUID_TO_SUID:
return audit_uid_comparator(cred->euid, f->op, cred->suid);
@@ -471,6 +475,8 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_EXE:
result = audit_exe_compare(tsk, rule->exe);
+ if (f->op == Audit_not_equal)
+ result = !result;
break;
case AUDIT_UID:
result = audit_uid_comparator(cred->uid, f->op, f->uid);
@@ -511,7 +517,7 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
break;
case AUDIT_SESSIONID:
- sessionid = audit_get_sessionid(current);
+ sessionid = audit_get_sessionid(tsk);
result = audit_comparator(sessionid, f->op, f->val);
break;
case AUDIT_PERS:
@@ -609,7 +615,8 @@ static int audit_filter_rules(struct task_struct *tsk,
result = match_tree_refs(ctx, rule->tree);
break;
case AUDIT_LOGINUID:
- result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+ result = audit_uid_comparator(audit_get_loginuid(tsk),
+ f->op, f->uid);
break;
case AUDIT_LOGINUID_SET:
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
@@ -863,7 +870,7 @@ static inline struct audit_context *audit_take_context(struct task_struct *tsk,
audit_filter_inodes(tsk, context);
}
- tsk->audit_context = NULL;
+ audit_set_context(tsk, NULL);
return context;
}
@@ -950,7 +957,7 @@ int audit_alloc(struct task_struct *tsk)
}
context->filterkey = key;
- tsk->audit_context = context;
+ audit_set_context(tsk, context);
set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
return 0;
}
@@ -1507,8 +1514,7 @@ void __audit_free(struct task_struct *tsk)
void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
unsigned long a3, unsigned long a4)
{
- struct task_struct *tsk = current;
- struct audit_context *context = tsk->audit_context;
+ struct audit_context *context = audit_context();
enum audit_state state;
if (!audit_enabled || !context)
@@ -1523,7 +1529,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->dummy = !audit_n_rules;
if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
context->prio = 0;
- if (auditd_test_task(tsk))
+ if (auditd_test_task(current))
return;
}
@@ -1553,7 +1559,6 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
*/
void __audit_syscall_exit(int success, long return_code)
{
- struct task_struct *tsk = current;
struct audit_context *context;
if (success)
@@ -1561,12 +1566,12 @@ void __audit_syscall_exit(int success, long return_code)
else
success = AUDITSC_FAILURE;
- context = audit_take_context(tsk, success, return_code);
+ context = audit_take_context(current, success, return_code);
if (!context)
return;
if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit(context, tsk);
+ audit_log_exit(context, current);
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
@@ -1588,7 +1593,7 @@ void __audit_syscall_exit(int success, long return_code)
kfree(context->filterkey);
context->filterkey = NULL;
}
- tsk->audit_context = context;
+ audit_set_context(current, context);
}
static inline void handle_one(const struct inode *inode)
@@ -1600,7 +1605,7 @@ static inline void handle_one(const struct inode *inode)
int count;
if (likely(!inode->i_fsnotify_marks))
return;
- context = current->audit_context;
+ context = audit_context();
p = context->trees;
count = context->tree_count;
rcu_read_lock();
@@ -1631,7 +1636,7 @@ static void handle_path(const struct dentry *dentry)
unsigned long seq;
int count;
- context = current->audit_context;
+ context = audit_context();
p = context->trees;
count = context->tree_count;
retry:
@@ -1713,7 +1718,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
struct filename *
__audit_reusename(const __user char *uptr)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct audit_names *n;
list_for_each_entry(n, &context->names_list, list) {
@@ -1736,7 +1741,7 @@ __audit_reusename(const __user char *uptr)
*/
void __audit_getname(struct filename *name)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct audit_names *n;
if (!context->in_syscall)
@@ -1764,7 +1769,7 @@ void __audit_getname(struct filename *name)
void __audit_inode(struct filename *name, const struct dentry *dentry,
unsigned int flags)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
@@ -1863,7 +1868,7 @@ void __audit_inode_child(struct inode *parent,
const struct dentry *dentry,
const unsigned char type)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct inode *inode = d_backing_inode(dentry);
const char *dname = dentry->d_name.name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
@@ -2048,7 +2053,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
int audit_set_loginuid(kuid_t loginuid)
{
struct task_struct *task = current;
- unsigned int oldsessionid, sessionid = (unsigned int)-1;
+ unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
kuid_t oldloginuid;
int rc;
@@ -2062,7 +2067,7 @@ int audit_set_loginuid(kuid_t loginuid)
/* are we setting or clearing? */
if (uid_valid(loginuid)) {
sessionid = (unsigned int)atomic_inc_return(&session_id);
- if (unlikely(sessionid == (unsigned int)-1))
+ if (unlikely(sessionid == AUDIT_SID_UNSET))
sessionid = (unsigned int)atomic_inc_return(&session_id);
}
@@ -2082,7 +2087,7 @@ out:
*/
void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (attr)
memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
@@ -2106,7 +2111,7 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
const struct timespec64 *abs_timeout)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct timespec64 *p = &context->mq_sendrecv.abs_timeout;
if (abs_timeout)
@@ -2130,7 +2135,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (notification)
context->mq_notify.sigev_signo = notification->sigev_signo;
@@ -2149,7 +2154,7 @@ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
*/
void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->mq_getsetattr.mqdes = mqdes;
context->mq_getsetattr.mqstat = *mqstat;
context->type = AUDIT_MQ_GETSETATTR;
@@ -2162,7 +2167,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
*/
void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->ipc.uid = ipcp->uid;
context->ipc.gid = ipcp->gid;
context->ipc.mode = ipcp->mode;
@@ -2182,7 +2187,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
*/
void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->ipc.qbytes = qbytes;
context->ipc.perm_uid = uid;
@@ -2193,7 +2198,7 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
void __audit_bprm(struct linux_binprm *bprm)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->type = AUDIT_EXECVE;
context->execve.argc = bprm->argc;
@@ -2208,7 +2213,7 @@ void __audit_bprm(struct linux_binprm *bprm)
*/
int __audit_socketcall(int nargs, unsigned long *args)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
return -EINVAL;
@@ -2226,7 +2231,7 @@ int __audit_socketcall(int nargs, unsigned long *args)
*/
void __audit_fd_pair(int fd1, int fd2)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->fds[0] = fd1;
context->fds[1] = fd2;
}
@@ -2240,7 +2245,7 @@ void __audit_fd_pair(int fd1, int fd2)
*/
int __audit_sockaddr(int len, void *a)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
@@ -2256,7 +2261,7 @@ int __audit_sockaddr(int len, void *a)
void __audit_ptrace(struct task_struct *t)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->target_pid = task_tgid_nr(t);
context->target_auid = audit_get_loginuid(t);
@@ -2277,19 +2282,19 @@ void __audit_ptrace(struct task_struct *t)
int audit_signal_info(int sig, struct task_struct *t)
{
struct audit_aux_data_pids *axp;
- struct task_struct *tsk = current;
- struct audit_context *ctx = tsk->audit_context;
- kuid_t uid = current_uid(), t_uid = task_uid(t);
+ struct audit_context *ctx = audit_context();
+ kuid_t uid = current_uid(), auid, t_uid = task_uid(t);
if (auditd_test_task(t) &&
(sig == SIGTERM || sig == SIGHUP ||
sig == SIGUSR1 || sig == SIGUSR2)) {
- audit_sig_pid = task_tgid_nr(tsk);
- if (uid_valid(tsk->loginuid))
- audit_sig_uid = tsk->loginuid;
+ audit_sig_pid = task_tgid_nr(current);
+ auid = audit_get_loginuid(current);
+ if (uid_valid(auid))
+ audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_task_getsecid(tsk, &audit_sig_sid);
+ security_task_getsecid(current, &audit_sig_sid);
}
if (!audit_signals || audit_dummy_context())
@@ -2345,7 +2350,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
const struct cred *new, const struct cred *old)
{
struct audit_aux_data_bprm_fcaps *ax;
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct cpu_vfs_cap_data vcaps;
ax = kmalloc(sizeof(*ax), GFP_KERNEL);
@@ -2385,7 +2390,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
*/
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
@@ -2396,7 +2401,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
void __audit_mmap_fd(int fd, int flags)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->mmap.fd = fd;
context->mmap.flags = flags;
context->type = AUDIT_MMAP;
@@ -2404,7 +2409,7 @@ void __audit_mmap_fd(int fd, int flags)
void __audit_log_kern_module(char *name)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL);
strcpy(context->module.name, name);
@@ -2413,7 +2418,7 @@ void __audit_log_kern_module(char *name)
void __audit_fanotify(unsigned int response)
{
- audit_log(current->audit_context, GFP_KERNEL,
+ audit_log(audit_context(), GFP_KERNEL,
AUDIT_FANOTIFY, "resp=%u", response);
}
@@ -2464,7 +2469,19 @@ void audit_core_dumps(long signr)
audit_log_end(ab);
}
-void __audit_seccomp(unsigned long syscall, long signr, int code)
+/**
+ * audit_seccomp - record information about a seccomp action
+ * @syscall: syscall number
+ * @signr: signal value
+ * @code: the seccomp action
+ *
+ * Record the information associated with a seccomp action. Event filtering for
+ * seccomp actions that are not to be logged is done in seccomp_log().
+ * Therefore, this function forces auditing independent of the audit_enabled
+ * and dummy context state because seccomp actions should be logged even when
+ * audit is not in use.
+ */
+void audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
@@ -2478,9 +2495,29 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
audit_log_end(ab);
}
+void audit_seccomp_actions_logged(const char *names, const char *old_names,
+ int res)
+{
+ struct audit_buffer *ab;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(audit_context(), GFP_KERNEL,
+ AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+
+ audit_log_format(ab, "op=seccomp-logging");
+ audit_log_format(ab, " actions=%s", names);
+ audit_log_format(ab, " old-actions=%s", old_names);
+ audit_log_format(ab, " res=%d", res);
+ audit_log_end(ab);
+}
+
struct list_head *audit_killed_trees(void)
{
- struct audit_context *ctx = current->audit_context;
+ struct audit_context *ctx = audit_context();
if (likely(!ctx || !ctx->in_syscall))
return NULL;
return &ctx->killed_trees;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a713fd23ec88..f27f5496d6fe 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,9 +4,13 @@ obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+ifeq ($(CONFIG_XDP_SOCKETS),y)
+obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
+endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o
ifeq ($(CONFIG_STREAM_PARSER),y)
ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 027107f4be53..544e58f5f642 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,11 +11,13 @@
* General Public License for more details.
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
+#include <uapi/linux/btf.h>
#include "map_in_map.h"
@@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map)
bpf_map_area_free(array);
}
+static void array_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ void *value;
+
+ rcu_read_lock();
+
+ value = array_map_lookup_elem(map, key);
+ if (!value) {
+ rcu_read_unlock();
+ return;
+ }
+
+ seq_printf(m, "%u: ", *(u32 *)key);
+ btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+ seq_puts(m, "\n");
+
+ rcu_read_unlock();
+}
+
+static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+ u32 btf_key_id, u32 btf_value_id)
+{
+ const struct btf_type *key_type, *value_type;
+ u32 key_size, value_size;
+ u32 int_data;
+
+ key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+ if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+ return -EINVAL;
+
+ int_data = *(u32 *)(key_type + 1);
+ /* bpf array can only take a u32 key. This check makes
+ * sure that the btf matches the attr used during map_create.
+ */
+ if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
+ BTF_INT_OFFSET(int_data))
+ return -EINVAL;
+
+ value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
+ if (!value_type || value_size > map->value_size)
+ return -EINVAL;
+
+ return 0;
+}
+
const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
@@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = {
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_gen_lookup = array_map_gen_lookup,
+ .map_seq_show_elem = array_map_seq_show_elem,
+ .map_check_btf = array_map_check_btf,
};
const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
new file mode 100644
index 000000000000..2d49d18b793a
--- /dev/null
+++ b/kernel/bpf/btf.c
@@ -0,0 +1,2348 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <uapi/linux/btf.h>
+#include <uapi/linux/types.h>
+#include <linux/seq_file.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/sort.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+
+/* BTF (BPF Type Format) is the meta data format which describes
+ * the data types of BPF program/map. Hence, it basically focus
+ * on the C programming language which the modern BPF is primary
+ * using.
+ *
+ * ELF Section:
+ * ~~~~~~~~~~~
+ * The BTF data is stored under the ".BTF" ELF section
+ *
+ * struct btf_type:
+ * ~~~~~~~~~~~~~~~
+ * Each 'struct btf_type' object describes a C data type.
+ * Depending on the type it is describing, a 'struct btf_type'
+ * object may be followed by more data. F.e.
+ * To describe an array, 'struct btf_type' is followed by
+ * 'struct btf_array'.
+ *
+ * 'struct btf_type' and any extra data following it are
+ * 4 bytes aligned.
+ *
+ * Type section:
+ * ~~~~~~~~~~~~~
+ * The BTF type section contains a list of 'struct btf_type' objects.
+ * Each one describes a C type. Recall from the above section
+ * that a 'struct btf_type' object could be immediately followed by extra
+ * data in order to desribe some particular C types.
+ *
+ * type_id:
+ * ~~~~~~~
+ * Each btf_type object is identified by a type_id. The type_id
+ * is implicitly implied by the location of the btf_type object in
+ * the BTF type section. The first one has type_id 1. The second
+ * one has type_id 2...etc. Hence, an earlier btf_type has
+ * a smaller type_id.
+ *
+ * A btf_type object may refer to another btf_type object by using
+ * type_id (i.e. the "type" in the "struct btf_type").
+ *
+ * NOTE that we cannot assume any reference-order.
+ * A btf_type object can refer to an earlier btf_type object
+ * but it can also refer to a later btf_type object.
+ *
+ * For example, to describe "const void *". A btf_type
+ * object describing "const" may refer to another btf_type
+ * object describing "void *". This type-reference is done
+ * by specifying type_id:
+ *
+ * [1] CONST (anon) type_id=2
+ * [2] PTR (anon) type_id=0
+ *
+ * The above is the btf_verifier debug log:
+ * - Each line started with "[?]" is a btf_type object
+ * - [?] is the type_id of the btf_type object.
+ * - CONST/PTR is the BTF_KIND_XXX
+ * - "(anon)" is the name of the type. It just
+ * happens that CONST and PTR has no name.
+ * - type_id=XXX is the 'u32 type' in btf_type
+ *
+ * NOTE: "void" has type_id 0
+ *
+ * String section:
+ * ~~~~~~~~~~~~~~
+ * The BTF string section contains the names used by the type section.
+ * Each string is referred by an "offset" from the beginning of the
+ * string section.
+ *
+ * Each string is '\0' terminated.
+ *
+ * The first character in the string section must be '\0'
+ * which is used to mean 'anonymous'. Some btf_type may not
+ * have a name.
+ */
+
+/* BTF verification:
+ *
+ * To verify BTF data, two passes are needed.
+ *
+ * Pass #1
+ * ~~~~~~~
+ * The first pass is to collect all btf_type objects to
+ * an array: "btf->types".
+ *
+ * Depending on the C type that a btf_type is describing,
+ * a btf_type may be followed by extra data. We don't know
+ * how many btf_type is there, and more importantly we don't
+ * know where each btf_type is located in the type section.
+ *
+ * Without knowing the location of each type_id, most verifications
+ * cannot be done. e.g. an earlier btf_type may refer to a later
+ * btf_type (recall the "const void *" above), so we cannot
+ * check this type-reference in the first pass.
+ *
+ * In the first pass, it still does some verifications (e.g.
+ * checking the name is a valid offset to the string section).
+ *
+ * Pass #2
+ * ~~~~~~~
+ * The main focus is to resolve a btf_type that is referring
+ * to another type.
+ *
+ * We have to ensure the referring type:
+ * 1) does exist in the BTF (i.e. in btf->types[])
+ * 2) does not cause a loop:
+ * struct A {
+ * struct B b;
+ * };
+ *
+ * struct B {
+ * struct A a;
+ * };
+ *
+ * btf_type_needs_resolve() decides if a btf_type needs
+ * to be resolved.
+ *
+ * The needs_resolve type implements the "resolve()" ops which
+ * essentially does a DFS and detects backedge.
+ *
+ * During resolve (or DFS), different C types have different
+ * "RESOLVED" conditions.
+ *
+ * When resolving a BTF_KIND_STRUCT, we need to resolve all its
+ * members because a member is always referring to another
+ * type. A struct's member can be treated as "RESOLVED" if
+ * it is referring to a BTF_KIND_PTR. Otherwise, the
+ * following valid C struct would be rejected:
+ *
+ * struct A {
+ * int m;
+ * struct A *a;
+ * };
+ *
+ * When resolving a BTF_KIND_PTR, it needs to keep resolving if
+ * it is referring to another BTF_KIND_PTR. Otherwise, we cannot
+ * detect a pointer loop, e.g.:
+ * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR +
+ * ^ |
+ * +-----------------------------------------+
+ *
+ */
+
+#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
+#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
+#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
+#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
+#define BITS_ROUNDUP_BYTES(bits) \
+ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
+
+#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_INT_MASK 0x0fffffff
+#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
+#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
+
+/* 16MB for 64k structs and each has 16 members and
+ * a few MB spaces for the string section.
+ * The hard limit is S32_MAX.
+ */
+#define BTF_MAX_SIZE (16 * 1024 * 1024)
+
+#define for_each_member(i, struct_type, member) \
+ for (i = 0, member = btf_type_member(struct_type); \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
+#define for_each_member_from(i, from, struct_type, member) \
+ for (i = from, member = btf_type_member(struct_type) + from; \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
+static DEFINE_IDR(btf_idr);
+static DEFINE_SPINLOCK(btf_idr_lock);
+
+struct btf {
+ void *data;
+ struct btf_type **types;
+ u32 *resolved_ids;
+ u32 *resolved_sizes;
+ const char *strings;
+ void *nohdr_data;
+ struct btf_header hdr;
+ u32 nr_types;
+ u32 types_size;
+ u32 data_size;
+ refcount_t refcnt;
+ u32 id;
+ struct rcu_head rcu;
+};
+
+enum verifier_phase {
+ CHECK_META,
+ CHECK_TYPE,
+};
+
+struct resolve_vertex {
+ const struct btf_type *t;
+ u32 type_id;
+ u16 next_member;
+};
+
+enum visit_state {
+ NOT_VISITED,
+ VISITED,
+ RESOLVED,
+};
+
+enum resolve_mode {
+ RESOLVE_TBD, /* To Be Determined */
+ RESOLVE_PTR, /* Resolving for Pointer */
+ RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union
+ * or array
+ */
+};
+
+#define MAX_RESOLVE_DEPTH 32
+
+struct btf_sec_info {
+ u32 off;
+ u32 len;
+};
+
+struct btf_verifier_env {
+ struct btf *btf;
+ u8 *visit_states;
+ struct resolve_vertex stack[MAX_RESOLVE_DEPTH];
+ struct bpf_verifier_log log;
+ u32 log_type_id;
+ u32 top_stack;
+ enum verifier_phase phase;
+ enum resolve_mode resolve_mode;
+};
+
+static const char * const btf_kind_str[NR_BTF_KINDS] = {
+ [BTF_KIND_UNKN] = "UNKNOWN",
+ [BTF_KIND_INT] = "INT",
+ [BTF_KIND_PTR] = "PTR",
+ [BTF_KIND_ARRAY] = "ARRAY",
+ [BTF_KIND_STRUCT] = "STRUCT",
+ [BTF_KIND_UNION] = "UNION",
+ [BTF_KIND_ENUM] = "ENUM",
+ [BTF_KIND_FWD] = "FWD",
+ [BTF_KIND_TYPEDEF] = "TYPEDEF",
+ [BTF_KIND_VOLATILE] = "VOLATILE",
+ [BTF_KIND_CONST] = "CONST",
+ [BTF_KIND_RESTRICT] = "RESTRICT",
+};
+
+struct btf_kind_operations {
+ s32 (*check_meta)(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left);
+ int (*resolve)(struct btf_verifier_env *env,
+ const struct resolve_vertex *v);
+ int (*check_member)(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type);
+ void (*log_details)(struct btf_verifier_env *env,
+ const struct btf_type *t);
+ void (*seq_show)(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct seq_file *m);
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
+static struct btf_type btf_void;
+
+static bool btf_type_is_modifier(const struct btf_type *t)
+{
+ /* Some of them is not strictly a C modifier
+ * but they are grouped into the same bucket
+ * for BTF concern:
+ * A type (t) that refers to another
+ * type through t->type AND its size cannot
+ * be determined without following the t->type.
+ *
+ * ptr does not fall into this bucket
+ * because its size is always sizeof(void *).
+ */
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_TYPEDEF:
+ case BTF_KIND_VOLATILE:
+ case BTF_KIND_CONST:
+ case BTF_KIND_RESTRICT:
+ return true;
+ }
+
+ return false;
+}
+
+static bool btf_type_is_void(const struct btf_type *t)
+{
+ /* void => no type and size info.
+ * Hence, FWD is also treated as void.
+ */
+ return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_void_or_null(const struct btf_type *t)
+{
+ return !t || btf_type_is_void(t);
+}
+
+/* union is only a special case of struct:
+ * all its offsetof(member) == 0
+ */
+static bool btf_type_is_struct(const struct btf_type *t)
+{
+ u8 kind = BTF_INFO_KIND(t->info);
+
+ return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+}
+
+static bool btf_type_is_array(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
+}
+
+static bool btf_type_is_ptr(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
+}
+
+static bool btf_type_is_int(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
+}
+
+/* What types need to be resolved?
+ *
+ * btf_type_is_modifier() is an obvious one.
+ *
+ * btf_type_is_struct() because its member refers to
+ * another type (through member->type).
+
+ * btf_type_is_array() because its element (array->type)
+ * refers to another type. Array can be thought of a
+ * special case of struct while array just has the same
+ * member-type repeated by array->nelems of times.
+ */
+static bool btf_type_needs_resolve(const struct btf_type *t)
+{
+ return btf_type_is_modifier(t) ||
+ btf_type_is_ptr(t) ||
+ btf_type_is_struct(t) ||
+ btf_type_is_array(t);
+}
+
+/* t->size can be used */
+static bool btf_type_has_size(const struct btf_type *t)
+{
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_INT:
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ case BTF_KIND_ENUM:
+ return true;
+ }
+
+ return false;
+}
+
+static const char *btf_int_encoding_str(u8 encoding)
+{
+ if (encoding == 0)
+ return "(none)";
+ else if (encoding == BTF_INT_SIGNED)
+ return "SIGNED";
+ else if (encoding == BTF_INT_CHAR)
+ return "CHAR";
+ else if (encoding == BTF_INT_BOOL)
+ return "BOOL";
+ else
+ return "UNKN";
+}
+
+static u16 btf_type_vlen(const struct btf_type *t)
+{
+ return BTF_INFO_VLEN(t->info);
+}
+
+static u32 btf_type_int(const struct btf_type *t)
+{
+ return *(u32 *)(t + 1);
+}
+
+static const struct btf_array *btf_type_array(const struct btf_type *t)
+{
+ return (const struct btf_array *)(t + 1);
+}
+
+static const struct btf_member *btf_type_member(const struct btf_type *t)
+{
+ return (const struct btf_member *)(t + 1);
+}
+
+static const struct btf_enum *btf_type_enum(const struct btf_type *t)
+{
+ return (const struct btf_enum *)(t + 1);
+}
+
+static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
+{
+ return kind_ops[BTF_INFO_KIND(t->info)];
+}
+
+static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+{
+ return BTF_STR_OFFSET_VALID(offset) &&
+ offset < btf->hdr.str_len;
+}
+
+static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+ if (!offset)
+ return "(anon)";
+ else if (offset < btf->hdr.str_len)
+ return &btf->strings[offset];
+ else
+ return "(invalid-name-offset)";
+}
+
+static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+{
+ if (type_id > btf->nr_types)
+ return NULL;
+
+ return btf->types[type_id];
+}
+
+/*
+ * Regular int is not a bit field and it must be either
+ * u8/u16/u32/u64.
+ */
+static bool btf_type_int_is_regular(const struct btf_type *t)
+{
+ u16 nr_bits, nr_bytes;
+ u32 int_data;
+
+ int_data = btf_type_int(t);
+ nr_bits = BTF_INT_BITS(int_data);
+ nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+ if (BITS_PER_BYTE_MASKED(nr_bits) ||
+ BTF_INT_OFFSET(int_data) ||
+ (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+ nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+ return false;
+ }
+
+ return true;
+}
+
+__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+
+__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+
+__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ bool log_details,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ u8 kind = BTF_INFO_KIND(t->info);
+ struct btf *btf = env->btf;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ __btf_verifier_log(log, "[%u] %s %s%s",
+ env->log_type_id,
+ btf_kind_str[kind],
+ btf_name_by_offset(btf, t->name_off),
+ log_details ? " " : "");
+
+ if (log_details)
+ btf_type_ops(t)->log_details(env, t);
+
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
+#define btf_verifier_log_type(env, t, ...) \
+ __btf_verifier_log_type((env), (t), true, __VA_ARGS__)
+#define btf_verifier_log_basic(env, t, ...) \
+ __btf_verifier_log_type((env), (t), false, __VA_ARGS__)
+
+__printf(4, 5)
+static void btf_verifier_log_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ struct btf *btf = env->btf;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ /* The CHECK_META phase already did a btf dump.
+ *
+ * If member is logged again, it must hit an error in
+ * parsing this member. It is useful to print out which
+ * struct this member belongs to.
+ */
+ if (env->phase != CHECK_META)
+ btf_verifier_log_type(env, struct_type, NULL);
+
+ __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+ btf_name_by_offset(btf, member->name_off),
+ member->type, member->offset);
+
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
+static void btf_verifier_log_hdr(struct btf_verifier_env *env,
+ u32 btf_data_size)
+{
+ struct bpf_verifier_log *log = &env->log;
+ const struct btf *btf = env->btf;
+ const struct btf_header *hdr;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ hdr = &btf->hdr;
+ __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
+ __btf_verifier_log(log, "version: %u\n", hdr->version);
+ __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
+ __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len);
+ __btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
+ __btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
+ __btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
+ __btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
+ __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
+}
+
+static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
+{
+ struct btf *btf = env->btf;
+
+ /* < 2 because +1 for btf_void which is always in btf->types[0].
+ * btf_void is not accounted in btf->nr_types because btf_void
+ * does not come from the BTF file.
+ */
+ if (btf->types_size - btf->nr_types < 2) {
+ /* Expand 'types' array */
+
+ struct btf_type **new_types;
+ u32 expand_by, new_size;
+
+ if (btf->types_size == BTF_MAX_TYPE) {
+ btf_verifier_log(env, "Exceeded max num of types");
+ return -E2BIG;
+ }
+
+ expand_by = max_t(u32, btf->types_size >> 2, 16);
+ new_size = min_t(u32, BTF_MAX_TYPE,
+ btf->types_size + expand_by);
+
+ new_types = kvcalloc(new_size, sizeof(*new_types),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_types)
+ return -ENOMEM;
+
+ if (btf->nr_types == 0)
+ new_types[0] = &btf_void;
+ else
+ memcpy(new_types, btf->types,
+ sizeof(*btf->types) * (btf->nr_types + 1));
+
+ kvfree(btf->types);
+ btf->types = new_types;
+ btf->types_size = new_size;
+ }
+
+ btf->types[++(btf->nr_types)] = t;
+
+ return 0;
+}
+
+static int btf_alloc_id(struct btf *btf)
+{
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&btf_idr_lock);
+ id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ btf->id = id;
+ spin_unlock_bh(&btf_idr_lock);
+ idr_preload_end();
+
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void btf_free_id(struct btf *btf)
+{
+ unsigned long flags;
+
+ /*
+ * In map-in-map, calling map_delete_elem() on outer
+ * map will call bpf_map_put on the inner map.
+ * It will then eventually call btf_free_id()
+ * on the inner map. Some of the map_delete_elem()
+ * implementation may have irq disabled, so
+ * we need to use the _irqsave() version instead
+ * of the _bh() version.
+ */
+ spin_lock_irqsave(&btf_idr_lock, flags);
+ idr_remove(&btf_idr, btf->id);
+ spin_unlock_irqrestore(&btf_idr_lock, flags);
+}
+
+static void btf_free(struct btf *btf)
+{
+ kvfree(btf->types);
+ kvfree(btf->resolved_sizes);
+ kvfree(btf->resolved_ids);
+ kvfree(btf->data);
+ kfree(btf);
+}
+
+static void btf_free_rcu(struct rcu_head *rcu)
+{
+ struct btf *btf = container_of(rcu, struct btf, rcu);
+
+ btf_free(btf);
+}
+
+void btf_put(struct btf *btf)
+{
+ if (btf && refcount_dec_and_test(&btf->refcnt)) {
+ btf_free_id(btf);
+ call_rcu(&btf->rcu, btf_free_rcu);
+ }
+}
+
+static int env_resolve_init(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ u32 nr_types = btf->nr_types;
+ u32 *resolved_sizes = NULL;
+ u32 *resolved_ids = NULL;
+ u8 *visit_states = NULL;
+
+ /* +1 for btf_void */
+ resolved_sizes = kvcalloc(nr_types + 1, sizeof(*resolved_sizes),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!resolved_sizes)
+ goto nomem;
+
+ resolved_ids = kvcalloc(nr_types + 1, sizeof(*resolved_ids),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!resolved_ids)
+ goto nomem;
+
+ visit_states = kvcalloc(nr_types + 1, sizeof(*visit_states),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!visit_states)
+ goto nomem;
+
+ btf->resolved_sizes = resolved_sizes;
+ btf->resolved_ids = resolved_ids;
+ env->visit_states = visit_states;
+
+ return 0;
+
+nomem:
+ kvfree(resolved_sizes);
+ kvfree(resolved_ids);
+ kvfree(visit_states);
+ return -ENOMEM;
+}
+
+static void btf_verifier_env_free(struct btf_verifier_env *env)
+{
+ kvfree(env->visit_states);
+ kfree(env);
+}
+
+static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
+ const struct btf_type *next_type)
+{
+ switch (env->resolve_mode) {
+ case RESOLVE_TBD:
+ /* int, enum or void is a sink */
+ return !btf_type_needs_resolve(next_type);
+ case RESOLVE_PTR:
+ /* int, enum, void, struct or array is a sink for ptr */
+ return !btf_type_is_modifier(next_type) &&
+ !btf_type_is_ptr(next_type);
+ case RESOLVE_STRUCT_OR_ARRAY:
+ /* int, enum, void or ptr is a sink for struct and array */
+ return !btf_type_is_modifier(next_type) &&
+ !btf_type_is_array(next_type) &&
+ !btf_type_is_struct(next_type);
+ default:
+ BUG();
+ }
+}
+
+static bool env_type_is_resolved(const struct btf_verifier_env *env,
+ u32 type_id)
+{
+ return env->visit_states[type_id] == RESOLVED;
+}
+
+static int env_stack_push(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ struct resolve_vertex *v;
+
+ if (env->top_stack == MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+
+ if (env->visit_states[type_id] != NOT_VISITED)
+ return -EEXIST;
+
+ env->visit_states[type_id] = VISITED;
+
+ v = &env->stack[env->top_stack++];
+ v->t = t;
+ v->type_id = type_id;
+ v->next_member = 0;
+
+ if (env->resolve_mode == RESOLVE_TBD) {
+ if (btf_type_is_ptr(t))
+ env->resolve_mode = RESOLVE_PTR;
+ else if (btf_type_is_struct(t) || btf_type_is_array(t))
+ env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY;
+ }
+
+ return 0;
+}
+
+static void env_stack_set_next_member(struct btf_verifier_env *env,
+ u16 next_member)
+{
+ env->stack[env->top_stack - 1].next_member = next_member;
+}
+
+static void env_stack_pop_resolved(struct btf_verifier_env *env,
+ u32 resolved_type_id,
+ u32 resolved_size)
+{
+ u32 type_id = env->stack[--(env->top_stack)].type_id;
+ struct btf *btf = env->btf;
+
+ btf->resolved_sizes[type_id] = resolved_size;
+ btf->resolved_ids[type_id] = resolved_type_id;
+ env->visit_states[type_id] = RESOLVED;
+}
+
+static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
+{
+ return env->top_stack ? &env->stack[env->top_stack - 1] : NULL;
+}
+
+/* The input param "type_id" must point to a needs_resolve type */
+static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
+ u32 *type_id)
+{
+ *type_id = btf->resolved_ids[*type_id];
+ return btf_type_by_id(btf, *type_id);
+}
+
+const struct btf_type *btf_type_id_size(const struct btf *btf,
+ u32 *type_id, u32 *ret_size)
+{
+ const struct btf_type *size_type;
+ u32 size_type_id = *type_id;
+ u32 size = 0;
+
+ size_type = btf_type_by_id(btf, size_type_id);
+ if (btf_type_is_void_or_null(size_type))
+ return NULL;
+
+ if (btf_type_has_size(size_type)) {
+ size = size_type->size;
+ } else if (btf_type_is_array(size_type)) {
+ size = btf->resolved_sizes[size_type_id];
+ } else if (btf_type_is_ptr(size_type)) {
+ size = sizeof(void *);
+ } else {
+ if (WARN_ON_ONCE(!btf_type_is_modifier(size_type)))
+ return NULL;
+
+ size = btf->resolved_sizes[size_type_id];
+ size_type_id = btf->resolved_ids[size_type_id];
+ size_type = btf_type_by_id(btf, size_type_id);
+ if (btf_type_is_void(size_type))
+ return NULL;
+ }
+
+ *type_id = size_type_id;
+ if (ret_size)
+ *ret_size = size;
+
+ return size_type;
+}
+
+static int btf_df_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ btf_verifier_log_basic(env, struct_type,
+ "Unsupported check_member");
+ return -EINVAL;
+}
+
+static int btf_df_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ btf_verifier_log_basic(env, v->t, "Unsupported resolve");
+ return -EINVAL;
+}
+
+static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct seq_file *m)
+{
+ seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
+}
+
+static int btf_int_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 int_data = btf_type_int(member_type);
+ u32 struct_bits_off = member->offset;
+ u32 struct_size = struct_type->size;
+ u32 nr_copy_bits;
+ u32 bytes_offset;
+
+ if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "bits_offset exceeds U32_MAX");
+ return -EINVAL;
+ }
+
+ struct_bits_off += BTF_INT_OFFSET(int_data);
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ nr_copy_bits = BTF_INT_BITS(int_data) +
+ BITS_PER_BYTE_MASKED(struct_bits_off);
+
+ if (nr_copy_bits > BITS_PER_U64) {
+ btf_verifier_log_member(env, struct_type, member,
+ "nr_copy_bits exceeds 64");
+ return -EINVAL;
+ }
+
+ if (struct_size < bytes_offset ||
+ struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_int_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 int_data, nr_bits, meta_needed = sizeof(int_data);
+ u16 encoding;
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ int_data = btf_type_int(t);
+ if (int_data & ~BTF_INT_MASK) {
+ btf_verifier_log_basic(env, t, "Invalid int_data:%x",
+ int_data);
+ return -EINVAL;
+ }
+
+ nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
+
+ if (nr_bits > BITS_PER_U64) {
+ btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
+ BITS_PER_U64);
+ return -EINVAL;
+ }
+
+ if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) {
+ btf_verifier_log_type(env, t, "nr_bits exceeds type_size");
+ return -EINVAL;
+ }
+
+ /*
+ * Only one of the encoding bits is allowed and it
+ * should be sufficient for the pretty print purpose (i.e. decoding).
+ * Multiple bits can be allowed later if it is found
+ * to be insufficient.
+ */
+ encoding = BTF_INT_ENCODING(int_data);
+ if (encoding &&
+ encoding != BTF_INT_SIGNED &&
+ encoding != BTF_INT_CHAR &&
+ encoding != BTF_INT_BOOL) {
+ btf_verifier_log_type(env, t, "Unsupported encoding");
+ return -ENOTSUPP;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_int_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ int int_data = btf_type_int(t);
+
+ btf_verifier_log(env,
+ "size=%u bits_offset=%u nr_bits=%u encoding=%s",
+ t->size, BTF_INT_OFFSET(int_data),
+ BTF_INT_BITS(int_data),
+ btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
+}
+
+static void btf_int_bits_seq_show(const struct btf *btf,
+ const struct btf_type *t,
+ void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ u32 int_data = btf_type_int(t);
+ u16 nr_bits = BTF_INT_BITS(int_data);
+ u16 total_bits_offset;
+ u16 nr_copy_bytes;
+ u16 nr_copy_bits;
+ u8 nr_upper_bits;
+ union {
+ u64 u64_num;
+ u8 u8_nums[8];
+ } print_num;
+
+ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
+ data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
+ bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
+ nr_copy_bits = nr_bits + bits_offset;
+ nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
+
+ print_num.u64_num = 0;
+ memcpy(&print_num.u64_num, data, nr_copy_bytes);
+
+ /* Ditch the higher order bits */
+ nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits);
+ if (nr_upper_bits) {
+ /* We need to mask out some bits of the upper byte. */
+ u8 mask = (1 << nr_upper_bits) - 1;
+
+ print_num.u8_nums[nr_copy_bytes - 1] &= mask;
+ }
+
+ print_num.u64_num >>= bits_offset;
+
+ seq_printf(m, "0x%llx", print_num.u64_num);
+}
+
+static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ u32 int_data = btf_type_int(t);
+ u8 encoding = BTF_INT_ENCODING(int_data);
+ bool sign = encoding & BTF_INT_SIGNED;
+ u32 nr_bits = BTF_INT_BITS(int_data);
+
+ if (bits_offset || BTF_INT_OFFSET(int_data) ||
+ BITS_PER_BYTE_MASKED(nr_bits)) {
+ btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+ return;
+ }
+
+ switch (nr_bits) {
+ case 64:
+ if (sign)
+ seq_printf(m, "%lld", *(s64 *)data);
+ else
+ seq_printf(m, "%llu", *(u64 *)data);
+ break;
+ case 32:
+ if (sign)
+ seq_printf(m, "%d", *(s32 *)data);
+ else
+ seq_printf(m, "%u", *(u32 *)data);
+ break;
+ case 16:
+ if (sign)
+ seq_printf(m, "%d", *(s16 *)data);
+ else
+ seq_printf(m, "%u", *(u16 *)data);
+ break;
+ case 8:
+ if (sign)
+ seq_printf(m, "%d", *(s8 *)data);
+ else
+ seq_printf(m, "%u", *(u8 *)data);
+ break;
+ default:
+ btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+ }
+}
+
+static const struct btf_kind_operations int_ops = {
+ .check_meta = btf_int_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_int_check_member,
+ .log_details = btf_int_log,
+ .seq_show = btf_int_seq_show,
+};
+
+static int btf_modifier_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id = member->type;
+ struct btf_member resolved_member;
+ struct btf *btf = env->btf;
+
+ resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+ if (!resolved_type) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ resolved_member = *member;
+ resolved_member.type = resolved_type_id;
+
+ return btf_type_ops(resolved_type)->check_member(env, struct_type,
+ &resolved_member,
+ resolved_type);
+}
+
+static int btf_ptr_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_size, struct_bits_off, bytes_offset;
+
+ struct_size = struct_type->size;
+ struct_bits_off = member->offset;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ if (struct_size - bytes_offset < sizeof(void *)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_ref_type_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (!BTF_TYPE_ID_VALID(t->type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static int btf_modifier_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ const struct btf_type *next_type;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ u32 next_type_size = 0;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ /* "typedef void new_void", "const void"...etc */
+ if (btf_type_is_void(next_type))
+ goto resolved;
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ /* Figure out the resolved next_type_id with size.
+ * They will be stored in the current modifier's
+ * resolved_ids and resolved_sizes such that it can
+ * save us a few type-following when we use it later (e.g. in
+ * pretty print).
+ */
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+ !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+resolved:
+ env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+ return 0;
+}
+
+static int btf_ptr_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ u32 next_type_size = 0;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ /* "void *" */
+ if (btf_type_is_void(next_type))
+ goto resolved;
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ /* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
+ * the modifier may have stopped resolving when it was resolved
+ * to a ptr (last-resolved-ptr).
+ *
+ * We now need to continue from the last-resolved-ptr to
+ * ensure the last-resolved-ptr will not referring back to
+ * the currenct ptr (t).
+ */
+ if (btf_type_is_modifier(next_type)) {
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id;
+
+ resolved_type_id = next_type_id;
+ resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
+
+ if (btf_type_is_ptr(resolved_type) &&
+ !env_type_is_resolve_sink(env, resolved_type) &&
+ !env_type_is_resolved(env, resolved_type_id))
+ return env_stack_push(env, resolved_type,
+ resolved_type_id);
+ }
+
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+ !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+resolved:
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
+static void btf_modifier_seq_show(const struct btf *btf,
+ const struct btf_type *t,
+ u32 type_id, void *data,
+ u8 bits_offset, struct seq_file *m)
+{
+ t = btf_type_id_resolve(btf, &type_id);
+
+ btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+}
+
+static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ /* It is a hashed value */
+ seq_printf(m, "%p", *(void **)data);
+}
+
+static void btf_ref_type_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "type_id=%u", t->type);
+}
+
+static struct btf_kind_operations modifier_ops = {
+ .check_meta = btf_ref_type_check_meta,
+ .resolve = btf_modifier_resolve,
+ .check_member = btf_modifier_check_member,
+ .log_details = btf_ref_type_log,
+ .seq_show = btf_modifier_seq_show,
+};
+
+static struct btf_kind_operations ptr_ops = {
+ .check_meta = btf_ref_type_check_meta,
+ .resolve = btf_ptr_resolve,
+ .check_member = btf_ptr_check_member,
+ .log_details = btf_ref_type_log,
+ .seq_show = btf_ptr_seq_show,
+};
+
+static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (t->type) {
+ btf_verifier_log_type(env, t, "type != 0");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static struct btf_kind_operations fwd_ops = {
+ .check_meta = btf_fwd_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_df_check_member,
+ .log_details = btf_ref_type_log,
+ .seq_show = btf_df_seq_show,
+};
+
+static int btf_array_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+ u32 array_type_id, array_size;
+ struct btf *btf = env->btf;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ array_type_id = member->type;
+ btf_type_id_size(btf, &array_type_id, &array_size);
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < array_size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_array_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_array *array = btf_type_array(t);
+ u32 meta_needed = sizeof(*array);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (t->size) {
+ btf_verifier_log_type(env, t, "size != 0");
+ return -EINVAL;
+ }
+
+ /* Array elem type and index type cannot be in type void,
+ * so !array->type and !array->index_type are not allowed.
+ */
+ if (!array->type || !BTF_TYPE_ID_VALID(array->type)) {
+ btf_verifier_log_type(env, t, "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) {
+ btf_verifier_log_type(env, t, "Invalid index");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static int btf_array_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_array *array = btf_type_array(v->t);
+ const struct btf_type *elem_type, *index_type;
+ u32 elem_type_id, index_type_id;
+ struct btf *btf = env->btf;
+ u32 elem_size;
+
+ /* Check array->index_type */
+ index_type_id = array->index_type;
+ index_type = btf_type_by_id(btf, index_type_id);
+ if (btf_type_is_void_or_null(index_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid index");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, index_type) &&
+ !env_type_is_resolved(env, index_type_id))
+ return env_stack_push(env, index_type, index_type_id);
+
+ index_type = btf_type_id_size(btf, &index_type_id, NULL);
+ if (!index_type || !btf_type_is_int(index_type) ||
+ !btf_type_int_is_regular(index_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid index");
+ return -EINVAL;
+ }
+
+ /* Check array->type */
+ elem_type_id = array->type;
+ elem_type = btf_type_by_id(btf, elem_type_id);
+ if (btf_type_is_void_or_null(elem_type)) {
+ btf_verifier_log_type(env, v->t,
+ "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, elem_type) &&
+ !env_type_is_resolved(env, elem_type_id))
+ return env_stack_push(env, elem_type, elem_type_id);
+
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ if (!elem_type) {
+ btf_verifier_log_type(env, v->t, "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid array of int");
+ return -EINVAL;
+ }
+
+ if (array->nelems && elem_size > U32_MAX / array->nelems) {
+ btf_verifier_log_type(env, v->t,
+ "Array size overflows U32_MAX");
+ return -EINVAL;
+ }
+
+ env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems);
+
+ return 0;
+}
+
+static void btf_array_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_array *array = btf_type_array(t);
+
+ btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u",
+ array->type, array->index_type, array->nelems);
+}
+
+static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const struct btf_array *array = btf_type_array(t);
+ const struct btf_kind_operations *elem_ops;
+ const struct btf_type *elem_type;
+ u32 i, elem_size, elem_type_id;
+
+ elem_type_id = array->type;
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ elem_ops = btf_type_ops(elem_type);
+ seq_puts(m, "[");
+ for (i = 0; i < array->nelems; i++) {
+ if (i)
+ seq_puts(m, ",");
+
+ elem_ops->seq_show(btf, elem_type, elem_type_id, data,
+ bits_offset, m);
+ data += elem_size;
+ }
+ seq_puts(m, "]");
+}
+
+static struct btf_kind_operations array_ops = {
+ .check_meta = btf_array_check_meta,
+ .resolve = btf_array_resolve,
+ .check_member = btf_array_check_member,
+ .log_details = btf_array_log,
+ .seq_show = btf_array_seq_show,
+};
+
+static int btf_struct_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < member_type->size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_struct_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION;
+ const struct btf_member *member;
+ struct btf *btf = env->btf;
+ u32 struct_size = t->size;
+ u32 meta_needed;
+ u16 i;
+
+ meta_needed = btf_type_vlen(t) * sizeof(*member);
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for_each_member(i, t, member) {
+ if (!btf_name_offset_valid(btf, member->name_off)) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid member name_offset:%u",
+ member->name_off);
+ return -EINVAL;
+ }
+
+ /* A member cannot be in type void */
+ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (is_union && member->offset) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid member bits_offset");
+ return -EINVAL;
+ }
+
+ if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
+ btf_verifier_log_member(env, t, member,
+ "Memmber bits_offset exceeds its struct size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_member(env, t, member, NULL);
+ }
+
+ return meta_needed;
+}
+
+static int btf_struct_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_member *member;
+ int err;
+ u16 i;
+
+ /* Before continue resolving the next_member,
+ * ensure the last member is indeed resolved to a
+ * type with size info.
+ */
+ if (v->next_member) {
+ const struct btf_type *last_member_type;
+ const struct btf_member *last_member;
+ u16 last_member_type_id;
+
+ last_member = btf_type_member(v->t) + v->next_member - 1;
+ last_member_type_id = last_member->type;
+ if (WARN_ON_ONCE(!env_type_is_resolved(env,
+ last_member_type_id)))
+ return -EINVAL;
+
+ last_member_type = btf_type_by_id(env->btf,
+ last_member_type_id);
+ err = btf_type_ops(last_member_type)->check_member(env, v->t,
+ last_member,
+ last_member_type);
+ if (err)
+ return err;
+ }
+
+ for_each_member_from(i, v->next_member, v->t, member) {
+ u32 member_type_id = member->type;
+ const struct btf_type *member_type = btf_type_by_id(env->btf,
+ member_type_id);
+
+ if (btf_type_is_void_or_null(member_type)) {
+ btf_verifier_log_member(env, v->t, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, member_type) &&
+ !env_type_is_resolved(env, member_type_id)) {
+ env_stack_set_next_member(env, i + 1);
+ return env_stack_push(env, member_type, member_type_id);
+ }
+
+ err = btf_type_ops(member_type)->check_member(env, v->t,
+ member,
+ member_type);
+ if (err)
+ return err;
+ }
+
+ env_stack_pop_resolved(env, 0, 0);
+
+ return 0;
+}
+
+static void btf_struct_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ",";
+ const struct btf_member *member;
+ u32 i;
+
+ seq_puts(m, "{");
+ for_each_member(i, t, member) {
+ const struct btf_type *member_type = btf_type_by_id(btf,
+ member->type);
+ u32 member_offset = member->offset;
+ u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+ u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+ const struct btf_kind_operations *ops;
+
+ if (i)
+ seq_puts(m, seq);
+
+ ops = btf_type_ops(member_type);
+ ops->seq_show(btf, member_type, member->type,
+ data + bytes_offset, bits8_offset, m);
+ }
+ seq_puts(m, "}");
+}
+
+static struct btf_kind_operations struct_ops = {
+ .check_meta = btf_struct_check_meta,
+ .resolve = btf_struct_resolve,
+ .check_member = btf_struct_check_member,
+ .log_details = btf_struct_log,
+ .seq_show = btf_struct_seq_show,
+};
+
+static int btf_enum_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < sizeof(int)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_enum_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_enum *enums = btf_type_enum(t);
+ struct btf *btf = env->btf;
+ u16 i, nr_enums;
+ u32 meta_needed;
+
+ nr_enums = btf_type_vlen(t);
+ meta_needed = nr_enums * sizeof(*enums);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->size != sizeof(int)) {
+ btf_verifier_log_type(env, t, "Expected size:%zu",
+ sizeof(int));
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for (i = 0; i < nr_enums; i++) {
+ if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+ btf_verifier_log(env, "\tInvalid name_offset:%u",
+ enums[i].name_off);
+ return -EINVAL;
+ }
+
+ btf_verifier_log(env, "\t%s val=%d\n",
+ btf_name_by_offset(btf, enums[i].name_off),
+ enums[i].val);
+ }
+
+ return meta_needed;
+}
+
+static void btf_enum_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const struct btf_enum *enums = btf_type_enum(t);
+ u32 i, nr_enums = btf_type_vlen(t);
+ int v = *(int *)data;
+
+ for (i = 0; i < nr_enums; i++) {
+ if (v == enums[i].val) {
+ seq_printf(m, "%s",
+ btf_name_by_offset(btf, enums[i].name_off));
+ return;
+ }
+ }
+
+ seq_printf(m, "%d", v);
+}
+
+static struct btf_kind_operations enum_ops = {
+ .check_meta = btf_enum_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_enum_check_member,
+ .log_details = btf_enum_log,
+ .seq_show = btf_enum_seq_show,
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
+ [BTF_KIND_INT] = &int_ops,
+ [BTF_KIND_PTR] = &ptr_ops,
+ [BTF_KIND_ARRAY] = &array_ops,
+ [BTF_KIND_STRUCT] = &struct_ops,
+ [BTF_KIND_UNION] = &struct_ops,
+ [BTF_KIND_ENUM] = &enum_ops,
+ [BTF_KIND_FWD] = &fwd_ops,
+ [BTF_KIND_TYPEDEF] = &modifier_ops,
+ [BTF_KIND_VOLATILE] = &modifier_ops,
+ [BTF_KIND_CONST] = &modifier_ops,
+ [BTF_KIND_RESTRICT] = &modifier_ops,
+};
+
+static s32 btf_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 saved_meta_left = meta_left;
+ s32 var_meta_size;
+
+ if (meta_left < sizeof(*t)) {
+ btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu",
+ env->log_type_id, meta_left, sizeof(*t));
+ return -EINVAL;
+ }
+ meta_left -= sizeof(*t);
+
+ if (t->info & ~BTF_INFO_MASK) {
+ btf_verifier_log(env, "[%u] Invalid btf_info:%x",
+ env->log_type_id, t->info);
+ return -EINVAL;
+ }
+
+ if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
+ BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
+ btf_verifier_log(env, "[%u] Invalid kind:%u",
+ env->log_type_id, BTF_INFO_KIND(t->info));
+ return -EINVAL;
+ }
+
+ if (!btf_name_offset_valid(env->btf, t->name_off)) {
+ btf_verifier_log(env, "[%u] Invalid name_offset:%u",
+ env->log_type_id, t->name_off);
+ return -EINVAL;
+ }
+
+ var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left);
+ if (var_meta_size < 0)
+ return var_meta_size;
+
+ meta_left -= var_meta_size;
+
+ return saved_meta_left - meta_left;
+}
+
+static int btf_check_all_metas(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ struct btf_header *hdr;
+ void *cur, *end;
+
+ hdr = &btf->hdr;
+ cur = btf->nohdr_data + hdr->type_off;
+ end = btf->nohdr_data + hdr->type_len;
+
+ env->log_type_id = 1;
+ while (cur < end) {
+ struct btf_type *t = cur;
+ s32 meta_size;
+
+ meta_size = btf_check_meta(env, t, end - cur);
+ if (meta_size < 0)
+ return meta_size;
+
+ btf_add_type(env, t);
+ cur += meta_size;
+ env->log_type_id++;
+ }
+
+ return 0;
+}
+
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ const struct resolve_vertex *v;
+ int err = 0;
+
+ env->resolve_mode = RESOLVE_TBD;
+ env_stack_push(env, t, type_id);
+ while (!err && (v = env_stack_peak(env))) {
+ env->log_type_id = v->type_id;
+ err = btf_type_ops(v->t)->resolve(env, v);
+ }
+
+ env->log_type_id = type_id;
+ if (err == -E2BIG)
+ btf_verifier_log_type(env, t,
+ "Exceeded max resolving depth:%u",
+ MAX_RESOLVE_DEPTH);
+ else if (err == -EEXIST)
+ btf_verifier_log_type(env, t, "Loop detected");
+
+ return err;
+}
+
+static bool btf_resolve_valid(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 type_id)
+{
+ struct btf *btf = env->btf;
+
+ if (!env_type_is_resolved(env, type_id))
+ return false;
+
+ if (btf_type_is_struct(t))
+ return !btf->resolved_ids[type_id] &&
+ !btf->resolved_sizes[type_id];
+
+ if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) {
+ t = btf_type_id_resolve(btf, &type_id);
+ return t && !btf_type_is_modifier(t);
+ }
+
+ if (btf_type_is_array(t)) {
+ const struct btf_array *array = btf_type_array(t);
+ const struct btf_type *elem_type;
+ u32 elem_type_id = array->type;
+ u32 elem_size;
+
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ return elem_type && !btf_type_is_modifier(elem_type) &&
+ (array->nelems * elem_size ==
+ btf->resolved_sizes[type_id]);
+ }
+
+ return false;
+}
+
+static int btf_check_all_types(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ u32 type_id;
+ int err;
+
+ err = env_resolve_init(env);
+ if (err)
+ return err;
+
+ env->phase++;
+ for (type_id = 1; type_id <= btf->nr_types; type_id++) {
+ const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+ env->log_type_id = type_id;
+ if (btf_type_needs_resolve(t) &&
+ !env_type_is_resolved(env, type_id)) {
+ err = btf_resolve(env, t, type_id);
+ if (err)
+ return err;
+ }
+
+ if (btf_type_needs_resolve(t) &&
+ !btf_resolve_valid(env, t, type_id)) {
+ btf_verifier_log_type(env, t, "Invalid resolve state");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int btf_parse_type_sec(struct btf_verifier_env *env)
+{
+ const struct btf_header *hdr = &env->btf->hdr;
+ int err;
+
+ /* Type section must align to 4 bytes */
+ if (hdr->type_off & (sizeof(u32) - 1)) {
+ btf_verifier_log(env, "Unaligned type_off");
+ return -EINVAL;
+ }
+
+ if (!hdr->type_len) {
+ btf_verifier_log(env, "No type found");
+ return -EINVAL;
+ }
+
+ err = btf_check_all_metas(env);
+ if (err)
+ return err;
+
+ return btf_check_all_types(env);
+}
+
+static int btf_parse_str_sec(struct btf_verifier_env *env)
+{
+ const struct btf_header *hdr;
+ struct btf *btf = env->btf;
+ const char *start, *end;
+
+ hdr = &btf->hdr;
+ start = btf->nohdr_data + hdr->str_off;
+ end = start + hdr->str_len;
+
+ if (end != btf->data + btf->data_size) {
+ btf_verifier_log(env, "String section is not at the end");
+ return -EINVAL;
+ }
+
+ if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
+ start[0] || end[-1]) {
+ btf_verifier_log(env, "Invalid string section");
+ return -EINVAL;
+ }
+
+ btf->strings = start;
+
+ return 0;
+}
+
+static const size_t btf_sec_info_offset[] = {
+ offsetof(struct btf_header, type_off),
+ offsetof(struct btf_header, str_off),
+};
+
+static int btf_sec_info_cmp(const void *a, const void *b)
+{
+ const struct btf_sec_info *x = a;
+ const struct btf_sec_info *y = b;
+
+ return (int)(x->off - y->off) ? : (int)(x->len - y->len);
+}
+
+static int btf_check_sec_info(struct btf_verifier_env *env,
+ u32 btf_data_size)
+{
+ struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)];
+ u32 total, expected_total, i;
+ const struct btf_header *hdr;
+ const struct btf *btf;
+
+ btf = env->btf;
+ hdr = &btf->hdr;
+
+ /* Populate the secs from hdr */
+ for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++)
+ secs[i] = *(struct btf_sec_info *)((void *)hdr +
+ btf_sec_info_offset[i]);
+
+ sort(secs, ARRAY_SIZE(btf_sec_info_offset),
+ sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL);
+
+ /* Check for gaps and overlap among sections */
+ total = 0;
+ expected_total = btf_data_size - hdr->hdr_len;
+ for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) {
+ if (expected_total < secs[i].off) {
+ btf_verifier_log(env, "Invalid section offset");
+ return -EINVAL;
+ }
+ if (total < secs[i].off) {
+ /* gap */
+ btf_verifier_log(env, "Unsupported section found");
+ return -EINVAL;
+ }
+ if (total > secs[i].off) {
+ btf_verifier_log(env, "Section overlap found");
+ return -EINVAL;
+ }
+ if (expected_total - total < secs[i].len) {
+ btf_verifier_log(env,
+ "Total section length too long");
+ return -EINVAL;
+ }
+ total += secs[i].len;
+ }
+
+ /* There is data other than hdr and known sections */
+ if (expected_total != total) {
+ btf_verifier_log(env, "Unsupported section found");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data,
+ u32 btf_data_size)
+{
+ const struct btf_header *hdr;
+ u32 hdr_len, hdr_copy;
+ /*
+ * Minimal part of the "struct btf_header" that
+ * contains the hdr_len.
+ */
+ struct btf_min_header {
+ u16 magic;
+ u8 version;
+ u8 flags;
+ u32 hdr_len;
+ } __user *min_hdr;
+ struct btf *btf;
+ int err;
+
+ btf = env->btf;
+ min_hdr = btf_data;
+
+ if (btf_data_size < sizeof(*min_hdr)) {
+ btf_verifier_log(env, "hdr_len not found");
+ return -EINVAL;
+ }
+
+ if (get_user(hdr_len, &min_hdr->hdr_len))
+ return -EFAULT;
+
+ if (btf_data_size < hdr_len) {
+ btf_verifier_log(env, "btf_header not found");
+ return -EINVAL;
+ }
+
+ err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len);
+ if (err) {
+ if (err == -E2BIG)
+ btf_verifier_log(env, "Unsupported btf_header");
+ return err;
+ }
+
+ hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr));
+ if (copy_from_user(&btf->hdr, btf_data, hdr_copy))
+ return -EFAULT;
+
+ hdr = &btf->hdr;
+
+ btf_verifier_log_hdr(env, btf_data_size);
+
+ if (hdr->magic != BTF_MAGIC) {
+ btf_verifier_log(env, "Invalid magic");
+ return -EINVAL;
+ }
+
+ if (hdr->version != BTF_VERSION) {
+ btf_verifier_log(env, "Unsupported version");
+ return -ENOTSUPP;
+ }
+
+ if (hdr->flags) {
+ btf_verifier_log(env, "Unsupported flags");
+ return -ENOTSUPP;
+ }
+
+ if (btf_data_size == hdr->hdr_len) {
+ btf_verifier_log(env, "No data");
+ return -EINVAL;
+ }
+
+ err = btf_check_sec_info(env, btf_data_size);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
+ u32 log_level, char __user *log_ubuf, u32 log_size)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf = NULL;
+ u8 *data;
+ int err;
+
+ if (btf_data_size > BTF_MAX_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ if (log_level || log_ubuf || log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log->level = log_level;
+ log->ubuf = log_ubuf;
+ log->len_total = log_size;
+
+ /* log attributes have to be sane */
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+ !log->level || !log->ubuf) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
+ btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+ if (!btf) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ env->btf = btf;
+
+ err = btf_parse_hdr(env, btf_data, btf_data_size);
+ if (err)
+ goto errout;
+
+ data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!data) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ btf->data = data;
+ btf->data_size = btf_data_size;
+ btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
+ if (copy_from_user(data, btf_data, btf_data_size)) {
+ err = -EFAULT;
+ goto errout;
+ }
+
+ err = btf_parse_str_sec(env);
+ if (err)
+ goto errout;
+
+ err = btf_parse_type_sec(env);
+ if (err)
+ goto errout;
+
+ if (log->level && bpf_verifier_log_full(log)) {
+ err = -ENOSPC;
+ goto errout;
+ }
+
+ btf_verifier_env_free(env);
+ refcount_set(&btf->refcnt, 1);
+ return btf;
+
+errout:
+ btf_verifier_env_free(env);
+ if (btf)
+ btf_free(btf);
+ return ERR_PTR(err);
+}
+
+void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
+ struct seq_file *m)
+{
+ const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+ btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
+}
+
+static int btf_release(struct inode *inode, struct file *filp)
+{
+ btf_put(filp->private_data);
+ return 0;
+}
+
+const struct file_operations btf_fops = {
+ .release = btf_release,
+};
+
+static int __btf_new_fd(struct btf *btf)
+{
+ return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
+}
+
+int btf_new_fd(const union bpf_attr *attr)
+{
+ struct btf *btf;
+ int ret;
+
+ btf = btf_parse(u64_to_user_ptr(attr->btf),
+ attr->btf_size, attr->btf_log_level,
+ u64_to_user_ptr(attr->btf_log_buf),
+ attr->btf_log_size);
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ ret = btf_alloc_id(btf);
+ if (ret) {
+ btf_free(btf);
+ return ret;
+ }
+
+ /*
+ * The BTF ID is published to the userspace.
+ * All BTF free must go through call_rcu() from
+ * now on (i.e. free by calling btf_put()).
+ */
+
+ ret = __btf_new_fd(btf);
+ if (ret < 0)
+ btf_put(btf);
+
+ return ret;
+}
+
+struct btf *btf_get_by_fd(int fd)
+{
+ struct btf *btf;
+ struct fd f;
+
+ f = fdget(fd);
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &btf_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ btf = f.file->private_data;
+ refcount_inc(&btf->refcnt);
+ fdput(f);
+
+ return btf;
+}
+
+int btf_get_info_by_fd(const struct btf *btf,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_btf_info __user *uinfo;
+ struct bpf_btf_info info = {};
+ u32 info_copy, btf_copy;
+ void __user *ubtf;
+ u32 uinfo_len;
+
+ uinfo = u64_to_user_ptr(attr->info.info);
+ uinfo_len = attr->info.info_len;
+
+ info_copy = min_t(u32, uinfo_len, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_copy))
+ return -EFAULT;
+
+ info.id = btf->id;
+ ubtf = u64_to_user_ptr(info.btf);
+ btf_copy = min_t(u32, btf->data_size, info.btf_size);
+ if (copy_to_user(ubtf, btf->data, btf_copy))
+ return -EFAULT;
+ info.btf_size = btf->data_size;
+
+ if (copy_to_user(uinfo, &info, info_copy) ||
+ put_user(info_copy, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+int btf_get_fd_by_id(u32 id)
+{
+ struct btf *btf;
+ int fd;
+
+ rcu_read_lock();
+ btf = idr_find(&btf_idr, id);
+ if (!btf || !refcount_inc_not_zero(&btf->refcnt))
+ btf = ERR_PTR(-ENOENT);
+ rcu_read_unlock();
+
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ fd = __btf_new_fd(btf);
+ if (fd < 0)
+ btf_put(btf);
+
+ return fd;
+}
+
+u32 btf_id(const struct btf *btf)
+{
+ return btf->id;
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 43171a0bb02b..f7c00bd6f8e4 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
* @type: The type of program to be exectuted
+ * @t_ctx: Pointer to attach type specific context
*
* socket is expected to be of type INET or INET6.
*
@@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
*/
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct sockaddr *uaddr,
- enum bpf_attach_type type)
+ enum bpf_attach_type type,
+ void *t_ctx)
{
struct bpf_sock_addr_kern ctx = {
.sk = sk,
.uaddr = uaddr,
+ .t_ctx = t_ctx,
};
+ struct sockaddr_storage unspec;
struct cgroup *cgrp;
int ret;
@@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
return 0;
+ if (!ctx.uaddr) {
+ memset(&unspec, 0, sizeof(unspec));
+ ctx.uaddr = (struct sockaddr *)&unspec;
+ }
+
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 6ef6746a7871..a9e6c04d0f4a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
+#include <linux/perf_event.h>
#include <asm/unaligned.h>
@@ -349,6 +350,20 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
return prog_adj;
}
+void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
+{
+ int i;
+
+ for (i = 0; i < fp->aux->func_cnt; i++)
+ bpf_prog_kallsyms_del(fp->aux->func[i]);
+}
+
+void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
+{
+ bpf_prog_kallsyms_del_subprogs(fp);
+ bpf_prog_kallsyms_del(fp);
+}
+
#ifdef CONFIG_BPF_JIT
/* All BPF JIT sysctl knobs here. */
int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
@@ -583,6 +598,8 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_fill_ill_insns(hdr, size);
hdr->pages = size / PAGE_SIZE;
+ hdr->locked = 0;
+
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
@@ -683,23 +700,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
break;
- case BPF_LD | BPF_ABS | BPF_W:
- case BPF_LD | BPF_ABS | BPF_H:
- case BPF_LD | BPF_ABS | BPF_B:
- *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
- *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
- break;
-
- case BPF_LD | BPF_IND | BPF_W:
- case BPF_LD | BPF_IND | BPF_H:
- case BPF_LD | BPF_IND | BPF_B:
- *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
- *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
- *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
- break;
-
case BPF_LD | BPF_IMM | BPF_DW:
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
@@ -940,14 +940,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(LDX, MEM, W), \
INSN_3(LDX, MEM, DW), \
/* Immediate based. */ \
- INSN_3(LD, IMM, DW), \
- /* Misc (old cBPF carry-over). */ \
- INSN_3(LD, ABS, B), \
- INSN_3(LD, ABS, H), \
- INSN_3(LD, ABS, W), \
- INSN_3(LD, IND, B), \
- INSN_3(LD, IND, H), \
- INSN_3(LD, IND, W)
+ INSN_3(LD, IMM, DW)
bool bpf_opcode_in_insntable(u8 code)
{
@@ -957,6 +950,13 @@ bool bpf_opcode_in_insntable(u8 code)
[0 ... 255] = false,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
+ /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
+ [BPF_LD | BPF_ABS | BPF_B] = true,
+ [BPF_LD | BPF_ABS | BPF_H] = true,
+ [BPF_LD | BPF_ABS | BPF_W] = true,
+ [BPF_LD | BPF_IND | BPF_B] = true,
+ [BPF_LD | BPF_IND | BPF_H] = true,
+ [BPF_LD | BPF_IND | BPF_W] = true,
};
#undef BPF_INSN_3_TBL
#undef BPF_INSN_2_TBL
@@ -987,8 +987,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
u32 tail_call_cnt = 0;
- void *ptr;
- int off;
#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })
@@ -1315,67 +1313,6 @@ out:
atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
(DST + insn->off));
CONT;
- LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
- off = IMM;
-load_word:
- /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
- * appearing in the programs where ctx == skb
- * (see may_access_skb() in the verifier). All programs
- * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
- * bpf_convert_filter() saves it in BPF_R6, internal BPF
- * verifier will check that BPF_R6 == ctx.
- *
- * BPF_ABS and BPF_IND are wrappers of function calls,
- * so they scratch BPF_R1-BPF_R5 registers, preserve
- * BPF_R6-BPF_R9, and store return value into BPF_R0.
- *
- * Implicit input:
- * ctx == skb == BPF_R6 == CTX
- *
- * Explicit input:
- * SRC == any register
- * IMM == 32-bit immediate
- *
- * Output:
- * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
- */
-
- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = get_unaligned_be32(ptr);
- CONT;
- }
-
- return 0;
- LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
- off = IMM;
-load_half:
- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = get_unaligned_be16(ptr);
- CONT;
- }
-
- return 0;
- LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
- off = IMM;
-load_byte:
- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = *(u8 *)ptr;
- CONT;
- }
-
- return 0;
- LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
- off = IMM + SRC;
- goto load_word;
- LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
- off = IMM + SRC;
- goto load_half;
- LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
- off = IMM + SRC;
- goto load_byte;
default_label:
/* If we ever reach this, we have a bug somewhere. Die hard here
@@ -1513,6 +1450,33 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
return 0;
}
+static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
+{
+#ifdef CONFIG_ARCH_HAS_SET_MEMORY
+ int i, err;
+
+ for (i = 0; i < fp->aux->func_cnt; i++) {
+ err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
+ if (err)
+ return err;
+ }
+
+ return bpf_prog_check_pages_ro_single(fp);
+#endif
+ return 0;
+}
+
+static void bpf_prog_select_func(struct bpf_prog *fp)
+{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+
+ fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+#else
+ fp->bpf_func = __bpf_prog_ret0_warn;
+#endif
+}
+
/**
* bpf_prog_select_runtime - select exec runtime for BPF program
* @fp: bpf_prog populated with internal BPF program
@@ -1523,13 +1487,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
- u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+ /* In case of BPF to BPF calls, verifier did all the prep
+ * work with regards to JITing, etc.
+ */
+ if (fp->bpf_func)
+ goto finalize;
- fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
-#else
- fp->bpf_func = __bpf_prog_ret0_warn;
-#endif
+ bpf_prog_select_func(fp);
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
@@ -1550,6 +1514,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
if (*err)
return fp;
}
+
+finalize:
bpf_prog_lock_ro(fp);
/* The tail call compatibility check can only be done at
@@ -1558,7 +1524,17 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* all eBPF JITs might immediately support all features.
*/
*err = bpf_check_tail_call(fp);
-
+ if (*err)
+ return fp;
+
+ /* Checkpoint: at this point onwards any cBPF -> eBPF or
+ * native eBPF program is read-only. If we failed to change
+ * the page attributes (e.g. allocation failure from
+ * splitting large pages), then reject the whole program
+ * in order to guarantee not ending up with any W+X pages
+ * from BPF side in kernel.
+ */
+ *err = bpf_prog_check_pages_ro_locked(fp);
return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
@@ -1695,6 +1671,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
int new_prog_cnt, carry_prog_cnt = 0;
struct bpf_prog **existing_prog;
struct bpf_prog_array *array;
+ bool found_exclude = false;
int new_prog_idx = 0;
/* Figure out how many existing progs we need to carry over to
@@ -1703,14 +1680,20 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
if (old_array) {
existing_prog = old_array->progs;
for (; *existing_prog; existing_prog++) {
- if (*existing_prog != exclude_prog &&
- *existing_prog != &dummy_bpf_prog.prog)
+ if (*existing_prog == exclude_prog) {
+ found_exclude = true;
+ continue;
+ }
+ if (*existing_prog != &dummy_bpf_prog.prog)
carry_prog_cnt++;
if (*existing_prog == include_prog)
return -EEXIST;
}
}
+ if (exclude_prog && !found_exclude)
+ return -ENOENT;
+
/* How many progs (not NULL) will be in the new array? */
new_prog_cnt = carry_prog_cnt;
if (include_prog)
@@ -1772,6 +1755,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
aux = container_of(work, struct bpf_prog_aux, work);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_offload_destroy(aux->prog);
+#ifdef CONFIG_PERF_EVENTS
+ if (aux->prog->has_callchain_buf)
+ put_callchain_buffers();
+#endif
for (i = 0; i < aux->func_cnt; i++)
bpf_jit_free(aux->func[i]);
if (aux->func_cnt) {
@@ -1832,6 +1819,8 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
@@ -1844,6 +1833,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
{
return -ENOTSUPP;
}
+EXPORT_SYMBOL_GPL(bpf_event_output);
/* Always built-in helper functions. */
const struct bpf_func_proto bpf_tail_call_proto = {
@@ -1890,9 +1880,3 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
#include <linux/bpf_trace.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
-
-/* These are only used within the BPF_SYSCALL code */
-#ifdef CONFIG_BPF_SYSCALL
-EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
-#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a4bb0b34375a..e0918d180f08 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -19,6 +19,7 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
+#include <net/xdp.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
@@ -137,27 +138,6 @@ free_cmap:
return ERR_PTR(err);
}
-static void __cpu_map_queue_destructor(void *ptr)
-{
- /* The tear-down procedure should have made sure that queue is
- * empty. See __cpu_map_entry_replace() and work-queue
- * invoked cpu_map_kthread_stop(). Catch any broken behaviour
- * gracefully and warn once.
- */
- if (WARN_ON_ONCE(ptr))
- page_frag_free(ptr);
-}
-
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- if (atomic_dec_and_test(&rcpu->refcnt)) {
- /* The queue should be empty at this point */
- ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
- kfree(rcpu->queue);
- kfree(rcpu);
- }
-}
-
static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
{
atomic_inc(&rcpu->refcnt);
@@ -179,45 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work)
kthread_stop(rcpu->kthread);
}
-/* For now, xdp_pkt is a cpumap internal data structure, with info
- * carried between enqueue to dequeue. It is mapped into the top
- * headroom of the packet, to avoid allocating separate mem.
- */
-struct xdp_pkt {
- void *data;
- u16 len;
- u16 headroom;
- u16 metasize;
- struct net_device *dev_rx;
-};
-
-/* Convert xdp_buff to xdp_pkt */
-static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
-{
- struct xdp_pkt *xdp_pkt;
- int metasize;
- int headroom;
-
- /* Assure headroom is available for storing info */
- headroom = xdp->data - xdp->data_hard_start;
- metasize = xdp->data - xdp->data_meta;
- metasize = metasize > 0 ? metasize : 0;
- if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
- return NULL;
-
- /* Store info in top of packet */
- xdp_pkt = xdp->data_hard_start;
-
- xdp_pkt->data = xdp->data;
- xdp_pkt->len = xdp->data_end - xdp->data;
- xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
- xdp_pkt->metasize = metasize;
-
- return xdp_pkt;
-}
-
static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
- struct xdp_pkt *xdp_pkt)
+ struct xdp_frame *xdpf)
{
unsigned int frame_size;
void *pkt_data_start;
@@ -232,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* would be preferred to set frame_size to 2048 or 4096
* depending on the driver.
* frame_size = 2048;
- * frame_len = frame_size - sizeof(*xdp_pkt);
+ * frame_len = frame_size - sizeof(*xdp_frame);
*
* Instead, with info avail, skb_shared_info in placed after
* packet len. This, unfortunately fakes the truesize.
@@ -240,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* is not at a fixed memory location, with mixed length
* packets, which is bad for cache-line hotness.
*/
- frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+ frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+ pkt_data_start = xdpf->data - xdpf->headroom;
skb = build_skb(pkt_data_start, frame_size);
if (!skb)
return NULL;
- skb_reserve(skb, xdp_pkt->headroom);
- __skb_put(skb, xdp_pkt->len);
- if (xdp_pkt->metasize)
- skb_metadata_set(skb, xdp_pkt->metasize);
+ skb_reserve(skb, xdpf->headroom);
+ __skb_put(skb, xdpf->len);
+ if (xdpf->metasize)
+ skb_metadata_set(skb, xdpf->metasize);
/* Essential SKB info: protocol and skb->dev */
- skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+ skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
/* Optional SKB info, currently missing:
* - HW checksum info (skb->ip_summed)
@@ -265,6 +208,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
return skb;
}
+static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
+{
+ /* The tear-down procedure should have made sure that queue is
+ * empty. See __cpu_map_entry_replace() and work-queue
+ * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+ * gracefully and warn once.
+ */
+ struct xdp_frame *xdpf;
+
+ while ((xdpf = ptr_ring_consume(ring)))
+ if (WARN_ON_ONCE(xdpf))
+ xdp_return_frame(xdpf);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ if (atomic_dec_and_test(&rcpu->refcnt)) {
+ /* The queue should be empty at this point */
+ __cpu_map_ring_cleanup(rcpu->queue);
+ ptr_ring_cleanup(rcpu->queue, NULL);
+ kfree(rcpu->queue);
+ kfree(rcpu);
+ }
+}
+
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
@@ -278,7 +246,7 @@ static int cpu_map_kthread_run(void *data)
*/
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
unsigned int processed = 0, drops = 0, sched = 0;
- struct xdp_pkt *xdp_pkt;
+ struct xdp_frame *xdpf;
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -301,13 +269,13 @@ static int cpu_map_kthread_run(void *data)
* kthread CPU pinned. Lockless access to ptr_ring
* consume side valid as no-resize allowed of queue.
*/
- while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+ while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
struct sk_buff *skb;
int ret;
- skb = cpu_map_build_skb(rcpu, xdp_pkt);
+ skb = cpu_map_build_skb(rcpu, xdpf);
if (!skb) {
- page_frag_free(xdp_pkt);
+ xdp_return_frame(xdpf);
continue;
}
@@ -604,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
spin_lock(&q->producer_lock);
for (i = 0; i < bq->count; i++) {
- void *xdp_pkt = bq->q[i];
+ struct xdp_frame *xdpf = bq->q[i];
int err;
- err = __ptr_ring_produce(q, xdp_pkt);
+ err = __ptr_ring_produce(q, xdpf);
if (err) {
drops++;
- page_frag_free(xdp_pkt); /* Free xdp_pkt */
+ xdp_return_frame_rx_napi(xdpf);
}
processed++;
}
@@ -625,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
@@ -636,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
* driver to code invoking us to finished, due to driver
* (e.g. ixgbe) recycle tricks based on page-refcnt.
*
- * Thus, incoming xdp_pkt is always queued here (else we race
+ * Thus, incoming xdp_frame is always queued here (else we race
* with another CPU on page-refcnt and remaining driver code).
* Queue time is very short, as driver will invoke flush
* operation, when completing napi->poll call.
*/
- bq->q[bq->count++] = xdp_pkt;
+ bq->q[bq->count++] = xdpf;
return 0;
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
- struct xdp_pkt *xdp_pkt;
+ struct xdp_frame *xdpf;
- xdp_pkt = convert_to_xdp_pkt(xdp);
- if (unlikely(!xdp_pkt))
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
return -EOVERFLOW;
/* Info needed when constructing SKB on remote CPU */
- xdp_pkt->dev_rx = dev_rx;
+ xdpf->dev_rx = dev_rx;
- bq_enqueue(rcpu, xdp_pkt);
+ bq_enqueue(rcpu, xdpf);
return 0;
}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 565f9ece9115..642c97f6d1b8 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,15 +48,25 @@
* calls will fail at this point.
*/
#include <linux/bpf.h>
+#include <net/xdp.h>
#include <linux/filter.h>
+#include <trace/events/xdp.h>
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define DEV_MAP_BULK_SIZE 16
+struct xdp_bulk_queue {
+ struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+ struct net_device *dev_rx;
+ unsigned int count;
+};
+
struct bpf_dtab_netdev {
- struct net_device *dev;
+ struct net_device *dev; /* must be first member, due to tracepoint */
struct bpf_dtab *dtab;
unsigned int bit;
+ struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu;
};
@@ -206,6 +216,50 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
__set_bit(bit, bitmap);
}
+static int bq_xmit_all(struct bpf_dtab_netdev *obj,
+ struct xdp_bulk_queue *bq, u32 flags)
+{
+ struct net_device *dev = obj->dev;
+ int sent = 0, drops = 0, err = 0;
+ int i;
+
+ if (unlikely(!bq->count))
+ return 0;
+
+ for (i = 0; i < bq->count; i++) {
+ struct xdp_frame *xdpf = bq->q[i];
+
+ prefetch(xdpf);
+ }
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+ if (sent < 0) {
+ err = sent;
+ sent = 0;
+ goto error;
+ }
+ drops = bq->count - sent;
+out:
+ bq->count = 0;
+
+ trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+ sent, drops, bq->dev_rx, dev, err);
+ bq->dev_rx = NULL;
+ return 0;
+error:
+ /* If ndo_xdp_xmit fails with an errno, no frames have been
+ * xmit'ed and it's our responsibility to them free all.
+ */
+ for (i = 0; i < bq->count; i++) {
+ struct xdp_frame *xdpf = bq->q[i];
+
+ /* RX path under NAPI protection, can return frames faster */
+ xdp_return_frame_rx_napi(xdpf);
+ drops++;
+ }
+ goto out;
+}
+
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
@@ -221,7 +275,7 @@ void __dev_map_flush(struct bpf_map *map)
for_each_set_bit(bit, bitmap, map->max_entries) {
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
- struct net_device *netdev;
+ struct xdp_bulk_queue *bq;
/* This is possible if the dev entry is removed by user space
* between xdp redirect and flush op.
@@ -230,9 +284,9 @@ void __dev_map_flush(struct bpf_map *map)
continue;
__clear_bit(bit, bitmap);
- netdev = dev->dev;
- if (likely(netdev->netdev_ops->ndo_xdp_flush))
- netdev->netdev_ops->ndo_xdp_flush(netdev);
+
+ bq = this_cpu_ptr(dev->bulkq);
+ bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
}
}
@@ -240,37 +294,93 @@ void __dev_map_flush(struct bpf_map *map)
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
*/
-struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct bpf_dtab_netdev *dev;
+ struct bpf_dtab_netdev *obj;
if (key >= map->max_entries)
return NULL;
- dev = READ_ONCE(dtab->netdev_map[key]);
- return dev ? dev->dev : NULL;
+ obj = READ_ONCE(dtab->netdev_map[key]);
+ return obj;
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+ struct net_device *dev_rx)
+
+{
+ struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+
+ if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+ bq_xmit_all(obj, bq, 0);
+
+ /* Ingress dev_rx will be the same for all xdp_frame's in
+ * bulk_queue, because bq stored per-CPU and must be flushed
+ * from net_device drivers NAPI func end.
+ */
+ if (!bq->dev_rx)
+ bq->dev_rx = dev_rx;
+
+ bq->q[bq->count++] = xdpf;
+ return 0;
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct net_device *dev = dst->dev;
+ struct xdp_frame *xdpf;
+
+ if (!dev->netdev_ops->ndo_xdp_xmit)
+ return -EOPNOTSUPP;
+
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ return bq_enqueue(dst, xdpf, dev_rx);
+}
+
+int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
+ struct bpf_prog *xdp_prog)
+{
+ int err;
+
+ err = __xdp_generic_ok_fwd_dev(skb, dst->dev);
+ if (unlikely(err))
+ return err;
+ skb->dev = dst->dev;
+ generic_xdp_tx(skb, xdp_prog);
+
+ return 0;
}
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
- struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+ struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+ struct net_device *dev = obj ? obj->dev : NULL;
return dev ? &dev->ifindex : NULL;
}
static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
- if (dev->dev->netdev_ops->ndo_xdp_flush) {
- struct net_device *fl = dev->dev;
+ if (dev->dev->netdev_ops->ndo_xdp_xmit) {
+ struct xdp_bulk_queue *bq;
unsigned long *bitmap;
+
int cpu;
for_each_online_cpu(cpu) {
bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
__clear_bit(dev->bit, bitmap);
- fl->netdev_ops->ndo_xdp_flush(dev->dev);
+ bq = per_cpu_ptr(dev->bulkq, cpu);
+ bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
}
}
}
@@ -281,6 +391,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
dev_map_flush_old(dev);
+ free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -313,6 +424,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct net *net = current->nsproxy->net_ns;
+ gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev, *old_dev;
u32 i = *(u32 *)key;
u32 ifindex = *(u32 *)value;
@@ -327,13 +439,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
if (!ifindex) {
dev = NULL;
} else {
- dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
- map->numa_node);
+ dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
if (!dev)
return -ENOMEM;
+ dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+ sizeof(void *), gfp);
+ if (!dev->bulkq) {
+ kfree(dev);
+ return -ENOMEM;
+ }
+
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
+ free_percpu(dev->bulkq);
kfree(dev);
return -EINVAL;
}
@@ -405,6 +524,9 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
+ /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
+ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
+ offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
return 0;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b76828f23b49..3ca2198a6d22 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -503,7 +503,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
- *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
@@ -530,7 +532,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
const int ret = BPF_REG_0;
const int ref_reg = BPF_REG_1;
- *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
offsetof(struct htab_elem, lru_node) +
@@ -1369,7 +1373,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
- *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d24e238221e..73065e2d23c2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE,
};
+
+#ifdef CONFIG_CGROUPS
+BPF_CALL_0(bpf_get_current_cgroup_id)
+{
+ struct cgroup *cgrp = task_dfl_cgroup(current);
+
+ return cgrp->kn->id.id;
+}
+
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
+ .func = bpf_get_current_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+#endif
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index bf6da59ae0d0..76efe9a183f5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,8 +150,163 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return 0;
}
+struct map_iter {
+ void *key;
+ bool done;
+};
+
+static struct map_iter *map_iter(struct seq_file *m)
+{
+ return m->private;
+}
+
+static struct bpf_map *seq_file_to_map(struct seq_file *m)
+{
+ return file_inode(m->file)->i_private;
+}
+
+static void map_iter_free(struct map_iter *iter)
+{
+ if (iter) {
+ kfree(iter->key);
+ kfree(iter);
+ }
+}
+
+static struct map_iter *map_iter_alloc(struct bpf_map *map)
+{
+ struct map_iter *iter;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN);
+ if (!iter)
+ goto error;
+
+ iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!iter->key)
+ goto error;
+
+ return iter;
+
+error:
+ map_iter_free(iter);
+ return NULL;
+}
+
+static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct bpf_map *map = seq_file_to_map(m);
+ void *key = map_iter(m)->key;
+
+ if (map_iter(m)->done)
+ return NULL;
+
+ if (unlikely(v == SEQ_START_TOKEN))
+ goto done;
+
+ if (map->ops->map_get_next_key(map, key, key)) {
+ map_iter(m)->done = true;
+ return NULL;
+ }
+
+done:
+ ++(*pos);
+ return key;
+}
+
+static void *map_seq_start(struct seq_file *m, loff_t *pos)
+{
+ if (map_iter(m)->done)
+ return NULL;
+
+ return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
+}
+
+static void map_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static int map_seq_show(struct seq_file *m, void *v)
+{
+ struct bpf_map *map = seq_file_to_map(m);
+ void *key = map_iter(m)->key;
+
+ if (unlikely(v == SEQ_START_TOKEN)) {
+ seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
+ seq_puts(m, "# WARNING!! The output format will change\n");
+ } else {
+ map->ops->map_seq_show_elem(map, key, m);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations bpffs_map_seq_ops = {
+ .start = map_seq_start,
+ .next = map_seq_next,
+ .show = map_seq_show,
+ .stop = map_seq_stop,
+};
+
+static int bpffs_map_open(struct inode *inode, struct file *file)
+{
+ struct bpf_map *map = inode->i_private;
+ struct map_iter *iter;
+ struct seq_file *m;
+ int err;
+
+ iter = map_iter_alloc(map);
+ if (!iter)
+ return -ENOMEM;
+
+ err = seq_open(file, &bpffs_map_seq_ops);
+ if (err) {
+ map_iter_free(iter);
+ return err;
+ }
+
+ m = file->private_data;
+ m->private = iter;
+
+ return 0;
+}
+
+static int bpffs_map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+
+ map_iter_free(map_iter(m));
+
+ return seq_release(inode, file);
+}
+
+/* bpffs_map_fops should only implement the basic
+ * read operation for a BPF map. The purpose is to
+ * provide a simple user intuitive way to do
+ * "cat bpffs/pathto/a-pinned-map".
+ *
+ * Other operations (e.g. write, lookup...) should be realized by
+ * the userspace tools (e.g. bpftool) through the
+ * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update
+ * interface.
+ */
+static const struct file_operations bpffs_map_fops = {
+ .open = bpffs_map_open,
+ .read = seq_read,
+ .release = bpffs_map_release,
+};
+
+static int bpffs_obj_open(struct inode *inode, struct file *file)
+{
+ return -EIO;
+}
+
+static const struct file_operations bpffs_obj_fops = {
+ .open = bpffs_obj_open,
+};
+
static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
- const struct inode_operations *iops)
+ const struct inode_operations *iops,
+ const struct file_operations *fops)
{
struct inode *dir = dentry->d_parent->d_inode;
struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
@@ -159,6 +314,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
return PTR_ERR(inode);
inode->i_op = iops;
+ inode->i_fop = fops;
inode->i_private = raw;
bpf_dentry_finalize(dentry, inode, dir);
@@ -167,12 +323,16 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
{
- return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops,
+ &bpffs_obj_fops);
}
static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
{
- return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
+ struct bpf_map *map = arg;
+
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
+ map->btf ? &bpffs_map_fops : &bpffs_obj_fops);
}
static struct dentry *
@@ -279,13 +439,6 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
ret = bpf_obj_do_pin(pname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
- if ((trace_bpf_obj_pin_prog_enabled() ||
- trace_bpf_obj_pin_map_enabled()) && !ret) {
- if (type == BPF_TYPE_PROG)
- trace_bpf_obj_pin_prog(raw, ufd, pname);
- if (type == BPF_TYPE_MAP)
- trace_bpf_obj_pin_map(raw, ufd, pname);
- }
out:
putname(pname);
return ret;
@@ -352,15 +505,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
else
goto out;
- if (ret < 0) {
+ if (ret < 0)
bpf_any_put(raw, type);
- } else if (trace_bpf_obj_get_prog_enabled() ||
- trace_bpf_obj_get_map_enabled()) {
- if (type == BPF_TYPE_PROG)
- trace_bpf_obj_get_prog(raw, ret, pname);
- if (type == BPF_TYPE_MAP)
- trace_bpf_obj_get_map(raw, ret, pname);
- }
out:
putname(pname);
return ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b4b5b81e7251..1603492c9cc7 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -623,8 +623,9 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
if (!key || key->prefixlen > trie->max_prefixlen)
goto find_leftmost;
- node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
- GFP_ATOMIC | __GFP_NOWARN);
+ node_stack = kmalloc_array(trie->max_prefixlen,
+ sizeof(struct lpm_trie_node *),
+ GFP_ATOMIC | __GFP_NOWARN);
if (!node_stack)
return -ENOMEM;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index c9401075b58c..ac747d5cf7c6 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
*
* This software is licensed under the GNU General License Version 2,
* June 1991 as shown in the file COPYING in the top-level directory of this
@@ -474,8 +474,10 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
struct bpf_prog_offload *offload;
bool ret;
- if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map))
+ if (!bpf_prog_is_dev_bound(prog->aux))
return false;
+ if (!bpf_map_is_dev_bound(map))
+ return bpf_map_offload_neutral(map);
down_read(&bpf_devs_lock);
offload = prog->aux->offload;
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 95a84b2f10ce..52a91d816c0e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -48,14 +48,40 @@
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
-struct bpf_stab {
- struct bpf_map map;
- struct sock **sock_map;
+struct bpf_sock_progs {
struct bpf_prog *bpf_tx_msg;
struct bpf_prog *bpf_parse;
struct bpf_prog *bpf_verdict;
};
+struct bpf_stab {
+ struct bpf_map map;
+ struct sock **sock_map;
+ struct bpf_sock_progs progs;
+};
+
+struct bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct bucket *buckets;
+ atomic_t count;
+ u32 n_buckets;
+ u32 elem_size;
+ struct bpf_sock_progs progs;
+};
+
+struct htab_elem {
+ struct rcu_head rcu;
+ struct hlist_node hash_node;
+ u32 hash;
+ struct sock *sk;
+ char key[0];
+};
+
enum smap_psock_state {
SMAP_TX_RUNNING,
};
@@ -63,6 +89,8 @@ enum smap_psock_state {
struct smap_psock_map_entry {
struct list_head list;
struct sock **entry;
+ struct htab_elem *hash_link;
+ struct bpf_htab *htab;
};
struct smap_psock {
@@ -191,6 +219,12 @@ out:
rcu_read_unlock();
}
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+ atomic_dec(&htab->count);
+ kfree_rcu(l, rcu);
+}
+
static void bpf_tcp_close(struct sock *sk, long timeout)
{
void (*close_fun)(struct sock *sk, long timeout);
@@ -227,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
}
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- osk = cmpxchg(e->entry, sk, NULL);
- if (osk == sk) {
- list_del(&e->list);
- smap_release_sock(psock, sk);
+ if (e->entry) {
+ osk = cmpxchg(e->entry, sk, NULL);
+ if (osk == sk) {
+ list_del(&e->list);
+ smap_release_sock(psock, sk);
+ }
+ } else {
+ hlist_del_rcu(&e->hash_link->hash_node);
+ smap_release_sock(psock, e->hash_link->sk);
+ free_htab_elem(e->htab, e->hash_link);
}
}
write_unlock_bh(&sk->sk_callback_lock);
@@ -461,7 +501,7 @@ static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
{
return ((_rc == SK_PASS) ?
- (md->map ? __SK_REDIRECT : __SK_PASS) :
+ (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
__SK_DROP);
}
@@ -483,6 +523,7 @@ static unsigned int smap_do_tx_msg(struct sock *sk,
}
bpf_compute_data_pointers_sg(md);
+ md->sk = sk;
rc = (*prog->bpf_func)(md, prog->insnsi);
psock->apply_bytes = md->apply_bytes;
@@ -1092,7 +1133,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
* when we orphan the skb so that we don't have the possibility
* to reference a stale map.
*/
- TCP_SKB_CB(skb)->bpf.map = NULL;
+ TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
skb->sk = psock->sock;
bpf_compute_data_pointers(skb);
preempt_disable();
@@ -1102,7 +1143,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
/* Moving return codes from UAPI namespace into internal namespace */
return rc == SK_PASS ?
- (TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
+ (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
__SK_DROP;
}
@@ -1372,7 +1413,6 @@ static int smap_init_sock(struct smap_psock *psock,
}
static void smap_init_progs(struct smap_psock *psock,
- struct bpf_stab *stab,
struct bpf_prog *verdict,
struct bpf_prog *parse)
{
@@ -1450,14 +1490,13 @@ static void smap_gc_work(struct work_struct *w)
kfree(psock);
}
-static struct smap_psock *smap_init_psock(struct sock *sock,
- struct bpf_stab *stab)
+static struct smap_psock *smap_init_psock(struct sock *sock, int node)
{
struct smap_psock *psock;
psock = kzalloc_node(sizeof(struct smap_psock),
GFP_ATOMIC | __GFP_NOWARN,
- stab->map.numa_node);
+ node);
if (!psock)
return ERR_PTR(-ENOMEM);
@@ -1525,12 +1564,14 @@ free_stab:
return ERR_PTR(err);
}
-static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+static void smap_list_remove(struct smap_psock *psock,
+ struct sock **entry,
+ struct htab_elem *hash_link)
{
struct smap_psock_map_entry *e, *tmp;
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- if (e->entry == entry) {
+ if (e->entry == entry || e->hash_link == hash_link) {
list_del(&e->list);
break;
}
@@ -1568,7 +1609,7 @@ static void sock_map_free(struct bpf_map *map)
* to be null and queued for garbage collection.
*/
if (likely(psock)) {
- smap_list_remove(psock, &stab->sock_map[i]);
+ smap_list_remove(psock, &stab->sock_map[i], NULL);
smap_release_sock(psock, sock);
}
write_unlock_bh(&sock->sk_callback_lock);
@@ -1627,7 +1668,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
if (psock->bpf_parse)
smap_stop_sock(psock, sock);
- smap_list_remove(psock, &stab->sock_map[k]);
+ smap_list_remove(psock, &stab->sock_map[k], NULL);
smap_release_sock(psock, sock);
out:
write_unlock_bh(&sock->sk_callback_lock);
@@ -1662,40 +1703,26 @@ out:
* - sock_map must use READ_ONCE and (cmp)xchg operations
* - BPF verdict/parse programs must use READ_ONCE and xchg operations
*/
-static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
- struct bpf_map *map,
- void *key, u64 flags)
+
+static int __sock_map_ctx_update_elem(struct bpf_map *map,
+ struct bpf_sock_progs *progs,
+ struct sock *sock,
+ struct sock **map_link,
+ void *key)
{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- struct smap_psock_map_entry *e = NULL;
struct bpf_prog *verdict, *parse, *tx_msg;
- struct sock *osock, *sock;
+ struct smap_psock_map_entry *e = NULL;
struct smap_psock *psock;
- u32 i = *(u32 *)key;
bool new = false;
- int err;
-
- if (unlikely(flags > BPF_EXIST))
- return -EINVAL;
-
- if (unlikely(i >= stab->map.max_entries))
- return -E2BIG;
-
- sock = READ_ONCE(stab->sock_map[i]);
- if (flags == BPF_EXIST && !sock)
- return -ENOENT;
- else if (flags == BPF_NOEXIST && sock)
- return -EEXIST;
-
- sock = skops->sk;
+ int err = 0;
/* 1. If sock map has BPF programs those will be inherited by the
* sock being added. If the sock is already attached to BPF programs
* this results in an error.
*/
- verdict = READ_ONCE(stab->bpf_verdict);
- parse = READ_ONCE(stab->bpf_parse);
- tx_msg = READ_ONCE(stab->bpf_tx_msg);
+ verdict = READ_ONCE(progs->bpf_verdict);
+ parse = READ_ONCE(progs->bpf_parse);
+ tx_msg = READ_ONCE(progs->bpf_tx_msg);
if (parse && verdict) {
/* bpf prog refcnt may be zero if a concurrent attach operation
@@ -1748,7 +1775,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
goto out_progs;
}
} else {
- psock = smap_init_psock(sock, stab);
+ psock = smap_init_psock(sock, map->numa_node);
if (IS_ERR(psock)) {
err = PTR_ERR(psock);
goto out_progs;
@@ -1758,12 +1785,13 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
new = true;
}
- e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
- if (!e) {
- err = -ENOMEM;
- goto out_progs;
+ if (map_link) {
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e) {
+ err = -ENOMEM;
+ goto out_progs;
+ }
}
- e->entry = &stab->sock_map[i];
/* 3. At this point we have a reference to a valid psock that is
* running. Attach any BPF programs needed.
@@ -1780,7 +1808,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
err = smap_init_sock(psock, sock);
if (err)
goto out_free;
- smap_init_progs(psock, stab, verdict, parse);
+ smap_init_progs(psock, verdict, parse);
smap_start_sock(psock, sock);
}
@@ -1789,19 +1817,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
* it with. Because we can only have a single set of programs if
* old_sock has a strp we can stop it.
*/
- list_add_tail(&e->list, &psock->maps);
- write_unlock_bh(&sock->sk_callback_lock);
-
- osock = xchg(&stab->sock_map[i], sock);
- if (osock) {
- struct smap_psock *opsock = smap_psock_sk(osock);
-
- write_lock_bh(&osock->sk_callback_lock);
- smap_list_remove(opsock, &stab->sock_map[i]);
- smap_release_sock(opsock, osock);
- write_unlock_bh(&osock->sk_callback_lock);
+ if (map_link) {
+ e->entry = map_link;
+ list_add_tail(&e->list, &psock->maps);
}
- return 0;
+ write_unlock_bh(&sock->sk_callback_lock);
+ return err;
out_free:
smap_release_sock(psock, sock);
out_progs:
@@ -1816,23 +1837,73 @@ out_progs:
return err;
}
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+ struct bpf_map *map,
+ void *key, u64 flags)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_sock_progs *progs = &stab->progs;
+ struct sock *osock, *sock;
+ u32 i = *(u32 *)key;
+ int err;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ if (unlikely(i >= stab->map.max_entries))
+ return -E2BIG;
+
+ sock = READ_ONCE(stab->sock_map[i]);
+ if (flags == BPF_EXIST && !sock)
+ return -ENOENT;
+ else if (flags == BPF_NOEXIST && sock)
+ return -EEXIST;
+
+ sock = skops->sk;
+ err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i],
+ key);
+ if (err)
+ goto out;
+
+ osock = xchg(&stab->sock_map[i], sock);
+ if (osock) {
+ struct smap_psock *opsock = smap_psock_sk(osock);
+
+ write_lock_bh(&osock->sk_callback_lock);
+ smap_list_remove(opsock, &stab->sock_map[i], NULL);
+ smap_release_sock(opsock, osock);
+ write_unlock_bh(&osock->sk_callback_lock);
+ }
+out:
+ return err;
+}
+
+int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+{
+ struct bpf_sock_progs *progs;
struct bpf_prog *orig;
- if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
+ if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ progs = &stab->progs;
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ progs = &htab->progs;
+ } else {
return -EINVAL;
+ }
switch (type) {
case BPF_SK_MSG_VERDICT:
- orig = xchg(&stab->bpf_tx_msg, prog);
+ orig = xchg(&progs->bpf_tx_msg, prog);
break;
case BPF_SK_SKB_STREAM_PARSER:
- orig = xchg(&stab->bpf_parse, prog);
+ orig = xchg(&progs->bpf_parse, prog);
break;
case BPF_SK_SKB_STREAM_VERDICT:
- orig = xchg(&stab->bpf_verdict, prog);
+ orig = xchg(&progs->bpf_verdict, prog);
break;
default:
return -EOPNOTSUPP;
@@ -1880,21 +1951,421 @@ static int sock_map_update_elem(struct bpf_map *map,
static void sock_map_release(struct bpf_map *map)
{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_sock_progs *progs;
struct bpf_prog *orig;
- orig = xchg(&stab->bpf_parse, NULL);
+ if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ progs = &stab->progs;
+ } else {
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ progs = &htab->progs;
+ }
+
+ orig = xchg(&progs->bpf_parse, NULL);
if (orig)
bpf_prog_put(orig);
- orig = xchg(&stab->bpf_verdict, NULL);
+ orig = xchg(&progs->bpf_verdict, NULL);
if (orig)
bpf_prog_put(orig);
- orig = xchg(&stab->bpf_tx_msg, NULL);
+ orig = xchg(&progs->bpf_tx_msg, NULL);
if (orig)
bpf_prog_put(orig);
}
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int i, err;
+ u64 cost;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->value_size != 4 ||
+ attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ return ERR_PTR(-E2BIG);
+
+ err = bpf_tcp_ulp_register();
+ if (err && err != -EEXIST)
+ return ERR_PTR(err);
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&htab->map, attr);
+
+ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8);
+ err = -EINVAL;
+ if (htab->n_buckets == 0 ||
+ htab->n_buckets > U32_MAX / sizeof(struct bucket))
+ goto free_htab;
+
+ cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+ (u64) htab->elem_size * htab->map.max_entries;
+
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_htab;
+
+ htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ err = bpf_map_precharge_memlock(htab->map.pages);
+ if (err)
+ goto free_htab;
+
+ err = -ENOMEM;
+ htab->buckets = bpf_map_area_alloc(
+ htab->n_buckets * sizeof(struct bucket),
+ htab->map.numa_node);
+ if (!htab->buckets)
+ goto free_htab;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }
+
+ return &htab->map;
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ int i;
+
+ synchronize_rcu();
+
+ /* At this point no update, lookup or delete operations can happen.
+ * However, be aware we can still get a socket state event updates,
+ * and data ready callabacks that reference the psock from sk_user_data
+ * Also psock worker threads are still in-flight. So smap_release_sock
+ * will only free the psock after cancel_sync on the worker threads
+ * and a grace period expire to ensure psock is really safe to remove.
+ */
+ rcu_read_lock();
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ struct sock *sock = l->sk;
+ struct smap_psock *psock;
+
+ hlist_del_rcu(&l->hash_node);
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ /* This check handles a racing sock event that can get
+ * the sk_callback_lock before this case but after xchg
+ * causing the refcnt to hit zero and sock user data
+ * (psock) to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, NULL, l);
+ smap_release_sock(psock, sock);
+ }
+ write_unlock_bh(&sock->sk_callback_lock);
+ kfree(l);
+ }
+ }
+ rcu_read_unlock();
+ bpf_map_area_free(htab->buckets);
+ kfree(htab);
+}
+
+static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
+ void *key, u32 key_size, u32 hash,
+ struct sock *sk,
+ struct htab_elem *old_elem)
+{
+ struct htab_elem *l_new;
+
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+ if (!old_elem) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ }
+ l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(l_new->key, key, key_size);
+ l_new->sk = sk;
+ l_new->hash = hash;
+ return l_new;
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+ u32 hash, void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node) {
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+ }
+
+ return NULL;
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map,
+ void *key, void *next_key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l, *next_l;
+ struct hlist_head *h;
+ u32 hash, key_size;
+ int i = 0;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+ if (!key)
+ goto find_first_elem;
+ hash = htab_map_hash(key, key_size);
+ h = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(h, hash, key, key_size);
+ if (!l)
+ goto find_first_elem;
+ next_l = hlist_entry_safe(
+ rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+
+ /* no more elements in this hash list, go to the next bucket */
+ i = hash & (htab->n_buckets - 1);
+ i++;
+
+find_first_elem:
+ /* iterate over buckets */
+ for (; i < htab->n_buckets; i++) {
+ h = select_bucket(htab, i);
+
+ /* pick first element in the bucket */
+ next_l = hlist_entry_safe(
+ rcu_dereference_raw(hlist_first_rcu(h)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ /* if it's not empty, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+ }
+
+ /* iterated over all buckets and all elements */
+ return -ENOENT;
+}
+
+static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+ struct bpf_map *map,
+ void *key, u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_sock_progs *progs = &htab->progs;
+ struct htab_elem *l_new = NULL, *l_old;
+ struct smap_psock_map_entry *e = NULL;
+ struct hlist_head *head;
+ struct smap_psock *psock;
+ u32 key_size, hash;
+ struct sock *sock;
+ struct bucket *b;
+ int err;
+
+ sock = skops->sk;
+
+ if (sock->sk_type != SOCK_STREAM ||
+ sock->sk_protocol != IPPROTO_TCP)
+ return -EOPNOTSUPP;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e)
+ return -ENOMEM;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ key_size = map->key_size;
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+ if (err)
+ goto err;
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_bh(&b->lock);
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+ if (l_old && map_flags == BPF_NOEXIST) {
+ err = -EEXIST;
+ goto bucket_err;
+ }
+ if (!l_old && map_flags == BPF_EXIST) {
+ err = -ENOENT;
+ goto bucket_err;
+ }
+
+ l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
+ if (IS_ERR(l_new)) {
+ err = PTR_ERR(l_new);
+ goto bucket_err;
+ }
+
+ psock = smap_psock_sk(sock);
+ if (unlikely(!psock)) {
+ err = -EINVAL;
+ goto bucket_err;
+ }
+
+ e->hash_link = l_new;
+ e->htab = container_of(map, struct bpf_htab, map);
+ list_add_tail(&e->list, &psock->maps);
+
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ psock = smap_psock_sk(l_old->sk);
+
+ hlist_del_rcu(&l_old->hash_node);
+ smap_list_remove(psock, NULL, l_old);
+ smap_release_sock(psock, l_old->sk);
+ free_htab_elem(htab, l_old);
+ }
+ raw_spin_unlock_bh(&b->lock);
+ return 0;
+bucket_err:
+ raw_spin_unlock_bh(&b->lock);
+err:
+ kfree(e);
+ psock = smap_psock_sk(sock);
+ if (psock)
+ smap_release_sock(psock, sock);
+ return err;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_sock_ops_kern skops;
+ u32 fd = *(u32 *)value;
+ struct socket *socket;
+ int err;
+
+ socket = sockfd_lookup(fd, &err);
+ if (!socket)
+ return err;
+
+ skops.sk = socket->sk;
+ if (!skops.sk) {
+ fput(socket->file);
+ return -EINVAL;
+ }
+
+ err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+ fput(socket->file);
+ return err;
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct bucket *b;
+ struct htab_elem *l;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ key_size = map->key_size;
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ raw_spin_lock_bh(&b->lock);
+ l = lookup_elem_raw(head, hash, key, key_size);
+ if (l) {
+ struct sock *sock = l->sk;
+ struct smap_psock *psock;
+
+ hlist_del_rcu(&l->hash_node);
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ /* This check handles a racing sock event that can get the
+ * sk_callback_lock before this case but after xchg happens
+ * causing the refcnt to hit zero and sock user data (psock)
+ * to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, NULL, l);
+ smap_release_sock(psock, sock);
+ }
+ write_unlock_bh(&sock->sk_callback_lock);
+ free_htab_elem(htab, l);
+ ret = 0;
+ }
+ raw_spin_unlock_bh(&b->lock);
+ return ret;
+}
+
+struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 key_size, hash;
+ struct bucket *b;
+ struct sock *sk;
+
+ key_size = map->key_size;
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ raw_spin_lock_bh(&b->lock);
+ l = lookup_elem_raw(head, hash, key, key_size);
+ sk = l ? l->sk : NULL;
+ raw_spin_unlock_bh(&b->lock);
+ return sk;
+}
+
const struct bpf_map_ops sock_map_ops = {
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
@@ -1905,6 +2376,15 @@ const struct bpf_map_ops sock_map_ops = {
.map_release_uref = sock_map_release,
};
+const struct bpf_map_ops sock_hash_ops = {
+ .map_alloc = sock_hash_alloc,
+ .map_free = sock_hash_free,
+ .map_lookup_elem = sock_map_lookup,
+ .map_get_next_key = sock_hash_get_next_key,
+ .map_update_elem = sock_hash_update_elem,
+ .map_delete_elem = sock_hash_delete_elem,
+};
+
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
struct bpf_map *, map, void *, key, u64, flags)
{
@@ -1922,3 +2402,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
.arg3_type = ARG_PTR_TO_MAP_KEY,
.arg4_type = ARG_ANYTHING,
};
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+ .func = bpf_sock_hash_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb1234b67..b675a3f3d141 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,7 @@
#include <linux/perf_event.h>
#include <linux/elf.h>
#include <linux/pagemap.h>
+#include <linux/irq_work.h>
#include "percpu_freelist.h"
#define STACK_CREATE_FLAG_MASK \
@@ -32,6 +33,23 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
+/* irq_work to run up_read() for build_id lookup in nmi context */
+struct stack_map_irq_work {
+ struct irq_work irq_work;
+ struct rw_semaphore *sem;
+};
+
+static void do_up_read(struct irq_work *entry)
+{
+ struct stack_map_irq_work *work;
+
+ work = container_of(entry, struct stack_map_irq_work, irq_work);
+ up_read(work->sem);
+ work->sem = NULL;
+}
+
+static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
+
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -262,27 +280,31 @@ out:
return ret;
}
-static void stack_map_get_build_id_offset(struct bpf_map *map,
- struct stack_map_bucket *bucket,
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u64 *ips, u32 trace_nr, bool user)
{
int i;
struct vm_area_struct *vma;
- struct bpf_stack_build_id *id_offs;
-
- bucket->nr = trace_nr;
- id_offs = (struct bpf_stack_build_id *)bucket->data;
+ bool irq_work_busy = false;
+ struct stack_map_irq_work *work = NULL;
+
+ if (in_nmi()) {
+ work = this_cpu_ptr(&up_read_work);
+ if (work->irq_work.flags & IRQ_WORK_BUSY)
+ /* cannot queue more up_read, fallback */
+ irq_work_busy = true;
+ }
/*
- * We cannot do up_read() in nmi context, so build_id lookup is
- * only supported for non-nmi events. If at some point, it is
- * possible to run find_vma() without taking the semaphore, we
- * would like to allow build_id lookup in nmi context.
+ * We cannot do up_read() in nmi context. To do build_id lookup
+ * in nmi context, we need to run up_read() in irq_work. We use
+ * a percpu variable to do the irq_work. If the irq_work is
+ * already used by another lookup, we fall back to report ips.
*
* Same fallback is used for kernel stack (!user) on a stackmap
* with build_id.
*/
- if (!user || !current || !current->mm || in_nmi() ||
+ if (!user || !current || !current->mm || irq_work_busy ||
down_read_trylock(&current->mm->mmap_sem) == 0) {
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
@@ -304,7 +326,13 @@ static void stack_map_get_build_id_offset(struct bpf_map *map,
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
}
- up_read(&current->mm->mmap_sem);
+
+ if (!work) {
+ up_read(&current->mm->mmap_sem);
+ } else {
+ work->sem = &current->mm->mmap_sem;
+ irq_work_queue(&work->irq_work);
+ }
}
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
@@ -361,8 +389,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
pcpu_freelist_pop(&smap->freelist);
if (unlikely(!new_bucket))
return -ENOMEM;
- stack_map_get_build_id_offset(map, new_bucket, ips,
- trace_nr, user);
+ new_bucket->nr = trace_nr;
+ stack_map_get_build_id_offset(
+ (struct bpf_stack_build_id *)new_bucket->data,
+ ips, trace_nr, user);
trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
if (hash_matches && bucket->nr == trace_nr &&
memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
@@ -405,6 +435,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+ bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ bool user = flags & BPF_F_USER_STACK;
+ struct perf_callchain_entry *trace;
+ bool kernel = !user;
+ int err = -EINVAL;
+ u64 *ips;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_USER_BUILD_ID)))
+ goto clear;
+ if (kernel && user_build_id)
+ goto clear;
+
+ elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+ : sizeof(u64);
+ if (unlikely(size % elem_size))
+ goto clear;
+
+ num_elem = size / elem_size;
+ if (sysctl_perf_event_max_stack < num_elem)
+ init_nr = 0;
+ else
+ init_nr = sysctl_perf_event_max_stack - num_elem;
+ trace = get_perf_callchain(regs, init_nr, kernel, user,
+ sysctl_perf_event_max_stack, false, false);
+ if (unlikely(!trace))
+ goto err_fault;
+
+ trace_nr = trace->nr - init_nr;
+ if (trace_nr < skip)
+ goto err_fault;
+
+ trace_nr -= skip;
+ trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+ copy_len = trace_nr * elem_size;
+ ips = trace->ip + skip + init_nr;
+ if (user && user_build_id)
+ stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+ else
+ memcpy(buf, ips, copy_len);
+
+ if (size > copy_len)
+ memset(buf + copy_len, 0, size - copy_len);
+ return copy_len;
+
+err_fault:
+ err = -EFAULT;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+ .func = bpf_get_stack,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -511,3 +608,16 @@ const struct bpf_map_ops stack_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
};
+
+static int __init stack_map_init(void)
+{
+ int cpu;
+ struct stack_map_irq_work *work;
+
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&up_read_work, cpu);
+ init_irq_work(&work->irq_work, do_up_read);
+ }
+ return 0;
+}
+subsys_initcall(stack_map_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 016ef9025827..35dc466641f2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,13 +11,17 @@
*/
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
+#include <linux/bpf_lirc.h>
+#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
#include <linux/file.h>
+#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
@@ -26,6 +30,7 @@
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
+#include <linux/btf.h>
#include <linux/nospec.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
@@ -63,9 +68,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
* copy_from_user() call. However, this is not a concern since this function is
* meant to be a future-proofing of bits.
*/
-static int check_uarg_tail_zero(void __user *uaddr,
- size_t expected_size,
- size_t actual_size)
+int bpf_check_uarg_tail_zero(void __user *uaddr,
+ size_t expected_size,
+ size_t actual_size)
{
unsigned char __user *addr;
unsigned char __user *end;
@@ -273,6 +278,7 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
if (atomic_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map, do_idr_lock);
+ btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
schedule_work(&map->work);
}
@@ -282,6 +288,7 @@ void bpf_map_put(struct bpf_map *map)
{
__bpf_map_put(map, true);
}
+EXPORT_SYMBOL_GPL(bpf_map_put);
void bpf_map_put_with_uref(struct bpf_map *map)
{
@@ -320,13 +327,15 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"value_size:\t%u\n"
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
- "memlock:\t%llu\n",
+ "memlock:\t%llu\n"
+ "map_id:\t%u\n",
map->map_type,
map->key_size,
map->value_size,
map->max_entries,
map->map_flags,
- map->pages * 1ULL << PAGE_SHIFT);
+ map->pages * 1ULL << PAGE_SHIFT,
+ map->id);
if (owner_prog_type) {
seq_printf(m, "owner_prog_type:\t%u\n",
@@ -418,7 +427,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
return 0;
}
-#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
+#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -452,6 +461,33 @@ static int map_create(union bpf_attr *attr)
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
+ if (bpf_map_support_seq_show(map) &&
+ (attr->btf_key_type_id || attr->btf_value_type_id)) {
+ struct btf *btf;
+
+ if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
+ err = -EINVAL;
+ goto free_map_nouncharge;
+ }
+
+ btf = btf_get_by_fd(attr->btf_fd);
+ if (IS_ERR(btf)) {
+ err = PTR_ERR(btf);
+ goto free_map_nouncharge;
+ }
+
+ err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id,
+ attr->btf_value_type_id);
+ if (err) {
+ btf_put(btf);
+ goto free_map_nouncharge;
+ }
+
+ map->btf = btf;
+ map->btf_key_type_id = attr->btf_key_type_id;
+ map->btf_value_type_id = attr->btf_value_type_id;
+ }
+
err = security_bpf_map_alloc(map);
if (err)
goto free_map_nouncharge;
@@ -476,7 +512,6 @@ static int map_create(union bpf_attr *attr)
return err;
}
- trace_bpf_map_create(map, err);
return err;
free_map:
@@ -484,6 +519,7 @@ free_map:
free_map_sec:
security_bpf_map_free(map);
free_map_nouncharge:
+ btf_put(map->btf);
map->ops->map_free(map);
return err;
}
@@ -516,6 +552,7 @@ struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
atomic_inc(&map->usercnt);
return map;
}
+EXPORT_SYMBOL_GPL(bpf_map_inc);
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
@@ -635,7 +672,6 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_to_user(uvalue, value, value_size) != 0)
goto free_value;
- trace_bpf_map_lookup_elem(map, ufd, key, value);
err = 0;
free_value:
@@ -732,8 +768,6 @@ static int map_update_elem(union bpf_attr *attr)
__this_cpu_dec(bpf_prog_active);
preempt_enable();
out:
- if (!err)
- trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
kfree(value);
free_key:
@@ -786,8 +820,6 @@ static int map_delete_elem(union bpf_attr *attr)
__this_cpu_dec(bpf_prog_active);
preempt_enable();
out:
- if (!err)
- trace_bpf_map_delete_elem(map, ufd, key);
kfree(key);
err_put:
fdput(f);
@@ -851,7 +883,6 @@ out:
if (copy_to_user(unext_key, next_key, map->key_size) != 0)
goto free_next_key;
- trace_bpf_map_next_key(map, ufd, key, next_key);
err = 0;
free_next_key:
@@ -1003,15 +1034,9 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
- int i;
-
- trace_bpf_prog_put_rcu(prog);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
-
- for (i = 0; i < prog->aux->func_cnt; i++)
- bpf_prog_kallsyms_del(prog->aux->func[i]);
- bpf_prog_kallsyms_del(prog);
+ bpf_prog_kallsyms_del_all(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
@@ -1042,11 +1067,13 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
"prog_tag:\t%s\n"
- "memlock:\t%llu\n",
+ "memlock:\t%llu\n"
+ "prog_id:\t%u\n",
prog->type,
prog->jited,
prog_tag,
- prog->pages * 1ULL << PAGE_SHIFT);
+ prog->pages * 1ULL << PAGE_SHIFT,
+ prog->aux->id);
}
#endif
@@ -1172,11 +1199,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
bool attach_drv)
{
- struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);
-
- if (!IS_ERR(prog))
- trace_bpf_prog_get_type(prog);
- return prog;
+ return __bpf_prog_get(ufd, &type, attach_drv);
}
EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
@@ -1226,6 +1249,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
return 0;
default:
return -EINVAL;
@@ -1328,9 +1353,7 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- /* eBPF program is ready to be JITed */
- if (!prog->bpf_func)
- prog = bpf_prog_select_runtime(prog, &err);
+ prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;
@@ -1351,10 +1374,10 @@ static int bpf_prog_load(union bpf_attr *attr)
}
bpf_prog_kallsyms_add(prog);
- trace_bpf_prog_load(prog, err);
return err;
free_used_maps:
+ bpf_prog_kallsyms_del_subprogs(prog);
free_used_maps(prog->aux);
free_prog:
bpf_prog_uncharge_memlock(prog);
@@ -1543,6 +1566,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
break;
case BPF_CGROUP_SOCK_OPS:
@@ -1556,6 +1581,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+ case BPF_LIRC_MODE2:
+ return lirc_prog_attach(attr);
default:
return -EINVAL;
}
@@ -1613,6 +1640,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
break;
case BPF_CGROUP_SOCK_OPS:
@@ -1626,6 +1655,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+ case BPF_LIRC_MODE2:
+ return lirc_prog_detach(attr);
default:
return -EINVAL;
}
@@ -1670,9 +1701,13 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
break;
+ case BPF_LIRC_MODE2:
+ return lirc_prog_query(attr, uattr);
default:
return -EINVAL;
}
@@ -1879,7 +1914,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
u32 ulen;
int err;
- err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -1892,6 +1927,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.load_time = prog->aux->load_time;
info.created_by_uid = from_kuid_munged(current_user_ns(),
prog->aux->user->uid);
+ info.gpl_compatible = prog->gpl_compatible;
memcpy(info.tag, prog->tag, sizeof(prog->tag));
memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
@@ -1912,6 +1948,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
+ info.nr_jited_ksyms = 0;
goto done;
}
@@ -1948,18 +1985,93 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
* for offload.
*/
ulen = info.jited_prog_len;
- info.jited_prog_len = prog->jited_len;
+ if (prog->aux->func_cnt) {
+ u32 i;
+
+ info.jited_prog_len = 0;
+ for (i = 0; i < prog->aux->func_cnt; i++)
+ info.jited_prog_len += prog->aux->func[i]->jited_len;
+ } else {
+ info.jited_prog_len = prog->jited_len;
+ }
+
if (info.jited_prog_len && ulen) {
if (bpf_dump_raw_ok()) {
uinsns = u64_to_user_ptr(info.jited_prog_insns);
ulen = min_t(u32, info.jited_prog_len, ulen);
- if (copy_to_user(uinsns, prog->bpf_func, ulen))
- return -EFAULT;
+
+ /* for multi-function programs, copy the JITed
+ * instructions for all the functions
+ */
+ if (prog->aux->func_cnt) {
+ u32 len, free, i;
+ u8 *img;
+
+ free = ulen;
+ for (i = 0; i < prog->aux->func_cnt; i++) {
+ len = prog->aux->func[i]->jited_len;
+ len = min_t(u32, len, free);
+ img = (u8 *) prog->aux->func[i]->bpf_func;
+ if (copy_to_user(uinsns, img, len))
+ return -EFAULT;
+ uinsns += len;
+ free -= len;
+ if (!free)
+ break;
+ }
+ } else {
+ if (copy_to_user(uinsns, prog->bpf_func, ulen))
+ return -EFAULT;
+ }
} else {
info.jited_prog_insns = 0;
}
}
+ ulen = info.nr_jited_ksyms;
+ info.nr_jited_ksyms = prog->aux->func_cnt;
+ if (info.nr_jited_ksyms && ulen) {
+ if (bpf_dump_raw_ok()) {
+ u64 __user *user_ksyms;
+ ulong ksym_addr;
+ u32 i;
+
+ /* copy the address of the kernel symbol
+ * corresponding to each function
+ */
+ ulen = min_t(u32, info.nr_jited_ksyms, ulen);
+ user_ksyms = u64_to_user_ptr(info.jited_ksyms);
+ for (i = 0; i < ulen; i++) {
+ ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
+ ksym_addr &= PAGE_MASK;
+ if (put_user((u64) ksym_addr, &user_ksyms[i]))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_ksyms = 0;
+ }
+ }
+
+ ulen = info.nr_jited_func_lens;
+ info.nr_jited_func_lens = prog->aux->func_cnt;
+ if (info.nr_jited_func_lens && ulen) {
+ if (bpf_dump_raw_ok()) {
+ u32 __user *user_lens;
+ u32 func_len, i;
+
+ /* copy the JITed image lengths for each function */
+ ulen = min_t(u32, info.nr_jited_func_lens, ulen);
+ user_lens = u64_to_user_ptr(info.jited_func_lens);
+ for (i = 0; i < ulen; i++) {
+ func_len = prog->aux->func[i]->jited_len;
+ if (put_user(func_len, &user_lens[i]))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_func_lens = 0;
+ }
+ }
+
done:
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -1977,7 +2089,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
u32 info_len = attr->info.info_len;
int err;
- err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -1990,6 +2102,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.map_flags = map->map_flags;
memcpy(info.name, map->name, sizeof(map->name));
+ if (map->btf) {
+ info.btf_id = btf_id(map->btf);
+ info.btf_key_type_id = map->btf_key_type_id;
+ info.btf_value_type_id = map->btf_value_type_id;
+ }
+
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_info_fill(&info, map);
if (err)
@@ -2003,6 +2121,21 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
return 0;
}
+static int bpf_btf_get_info_by_fd(struct btf *btf,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+ if (err)
+ return err;
+
+ return btf_get_info_by_fd(btf, attr, uattr);
+}
+
#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -2025,6 +2158,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
else if (f.file->f_op == &bpf_map_fops)
err = bpf_map_get_info_by_fd(f.file->private_data, attr,
uattr);
+ else if (f.file->f_op == &btf_fops)
+ err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
else
err = -EINVAL;
@@ -2032,6 +2167,158 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
return err;
}
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+
+static int bpf_btf_load(const union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_BTF_LOAD))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btf_new_fd(attr);
+}
+
+#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
+
+static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btf_get_fd_by_id(attr->btf_id);
+}
+
+static int bpf_task_fd_query_copy(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ u32 prog_id, u32 fd_type,
+ const char *buf, u64 probe_offset,
+ u64 probe_addr)
+{
+ char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
+ u32 len = buf ? strlen(buf) : 0, input_len;
+ int err = 0;
+
+ if (put_user(len, &uattr->task_fd_query.buf_len))
+ return -EFAULT;
+ input_len = attr->task_fd_query.buf_len;
+ if (input_len && ubuf) {
+ if (!len) {
+ /* nothing to copy, just make ubuf NULL terminated */
+ char zero = '\0';
+
+ if (put_user(zero, ubuf))
+ return -EFAULT;
+ } else if (input_len >= len + 1) {
+ /* ubuf can hold the string with NULL terminator */
+ if (copy_to_user(ubuf, buf, len + 1))
+ return -EFAULT;
+ } else {
+ /* ubuf cannot hold the string with NULL terminator,
+ * do a partial copy with NULL terminator.
+ */
+ char zero = '\0';
+
+ err = -ENOSPC;
+ if (copy_to_user(ubuf, buf, input_len - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + input_len - 1))
+ return -EFAULT;
+ }
+ }
+
+ if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
+ put_user(fd_type, &uattr->task_fd_query.fd_type) ||
+ put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
+ put_user(probe_addr, &uattr->task_fd_query.probe_addr))
+ return -EFAULT;
+
+ return err;
+}
+
+#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
+
+static int bpf_task_fd_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ pid_t pid = attr->task_fd_query.pid;
+ u32 fd = attr->task_fd_query.fd;
+ const struct perf_event *event;
+ struct files_struct *files;
+ struct task_struct *task;
+ struct file *file;
+ int err;
+
+ if (CHECK_ATTR(BPF_TASK_FD_QUERY))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (attr->task_fd_query.flags != 0)
+ return -EINVAL;
+
+ task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ if (!task)
+ return -ENOENT;
+
+ files = get_files_struct(task);
+ put_task_struct(task);
+ if (!files)
+ return -ENOENT;
+
+ err = 0;
+ spin_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (!file)
+ err = -EBADF;
+ else
+ get_file(file);
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+
+ if (err)
+ goto out;
+
+ if (file->f_op == &bpf_raw_tp_fops) {
+ struct bpf_raw_tracepoint *raw_tp = file->private_data;
+ struct bpf_raw_event_map *btp = raw_tp->btp;
+
+ err = bpf_task_fd_query_copy(attr, uattr,
+ raw_tp->prog->aux->id,
+ BPF_FD_TYPE_RAW_TRACEPOINT,
+ btp->tp->name, 0, 0);
+ goto put_file;
+ }
+
+ event = perf_get_event(file);
+ if (!IS_ERR(event)) {
+ u64 probe_offset, probe_addr;
+ u32 prog_id, fd_type;
+ const char *buf;
+
+ err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
+ &buf, &probe_offset,
+ &probe_addr);
+ if (!err)
+ err = bpf_task_fd_query_copy(attr, uattr, prog_id,
+ fd_type, buf,
+ probe_offset,
+ probe_addr);
+ goto put_file;
+ }
+
+ err = -ENOTSUPP;
+put_file:
+ fput(file);
+out:
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -2040,7 +2327,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
return -EPERM;
- err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+ err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
size = min_t(u32, size, sizeof(attr));
@@ -2112,6 +2399,15 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_RAW_TRACEPOINT_OPEN:
err = bpf_raw_tracepoint_open(&attr);
break;
+ case BPF_BTF_LOAD:
+ err = bpf_btf_load(&attr);
+ break;
+ case BPF_BTF_GET_FD_BY_ID:
+ err = bpf_btf_get_fd_by_id(&attr);
+ break;
+ case BPF_TASK_FD_QUERY:
+ err = bpf_task_fd_query(&attr, uattr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 1f4bf68c12db..938d41211be7 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
return TNUM(a.value >> shift, a.mask >> shift);
}
+struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+{
+ /* if a.value is negative, arithmetic shifting by minimum shift
+ * will have larger negative offset compared to more shifting.
+ * If a.value is nonnegative, arithmetic shifting by minimum shift
+ * will have larger positive offset compare to more shifting.
+ */
+ return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+}
+
struct tnum tnum_add(struct tnum a, struct tnum b)
{
u64 sm, sv, sigma, chi, mu;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1904e814f282..9e2bf834f13a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -22,6 +22,7 @@
#include <linux/stringify.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/perf_event.h>
#include "disasm.h"
@@ -186,6 +187,8 @@ struct bpf_call_arg_meta {
bool pkt_access;
int regno;
int access_size;
+ s64 msize_smax_value;
+ u64 msize_umax_value;
};
static DEFINE_MUTEX(bpf_verifier_lock);
@@ -760,18 +763,19 @@ enum reg_arg_type {
static int cmp_subprogs(const void *a, const void *b)
{
- return *(int *)a - *(int *)b;
+ return ((struct bpf_subprog_info *)a)->start -
+ ((struct bpf_subprog_info *)b)->start;
}
static int find_subprog(struct bpf_verifier_env *env, int off)
{
- u32 *p;
+ struct bpf_subprog_info *p;
- p = bsearch(&off, env->subprog_starts, env->subprog_cnt,
- sizeof(env->subprog_starts[0]), cmp_subprogs);
+ p = bsearch(&off, env->subprog_info, env->subprog_cnt,
+ sizeof(env->subprog_info[0]), cmp_subprogs);
if (!p)
return -ENOENT;
- return p - env->subprog_starts;
+ return p - env->subprog_info;
}
@@ -791,18 +795,24 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
verbose(env, "too many subprograms\n");
return -E2BIG;
}
- env->subprog_starts[env->subprog_cnt++] = off;
- sort(env->subprog_starts, env->subprog_cnt,
- sizeof(env->subprog_starts[0]), cmp_subprogs, NULL);
+ env->subprog_info[env->subprog_cnt++].start = off;
+ sort(env->subprog_info, env->subprog_cnt,
+ sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
return 0;
}
static int check_subprogs(struct bpf_verifier_env *env)
{
int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
+ struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
+ /* Add entry function. */
+ ret = add_subprog(env, 0);
+ if (ret < 0)
+ return ret;
+
/* determine subprog starts. The end is one before the next starts */
for (i = 0; i < insn_cnt; i++) {
if (insn[i].code != (BPF_JMP | BPF_CALL))
@@ -822,16 +832,18 @@ static int check_subprogs(struct bpf_verifier_env *env)
return ret;
}
+ /* Add a fake 'exit' subprog which could simplify subprog iteration
+ * logic. 'subprog_cnt' should not be increased.
+ */
+ subprog[env->subprog_cnt].start = insn_cnt;
+
if (env->log.level > 1)
for (i = 0; i < env->subprog_cnt; i++)
- verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]);
+ verbose(env, "func#%d @%d\n", i, subprog[i].start);
/* now check that all jumps are within the same subprog */
- subprog_start = 0;
- if (env->subprog_cnt == cur_subprog)
- subprog_end = insn_cnt;
- else
- subprog_end = env->subprog_starts[cur_subprog++];
+ subprog_start = subprog[cur_subprog].start;
+ subprog_end = subprog[cur_subprog + 1].start;
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;
@@ -856,10 +868,9 @@ next:
return -EINVAL;
}
subprog_start = subprog_end;
- if (env->subprog_cnt == cur_subprog)
- subprog_end = insn_cnt;
- else
- subprog_end = env->subprog_starts[cur_subprog++];
+ cur_subprog++;
+ if (cur_subprog < env->subprog_cnt)
+ subprog_end = subprog[cur_subprog + 1].start;
}
}
return 0;
@@ -1298,6 +1309,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
switch (env->prog->type) {
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
/* dst_input() and dst_output() can't write for now */
if (t == BPF_WRITE)
return false;
@@ -1517,13 +1529,13 @@ static int update_stack_depth(struct bpf_verifier_env *env,
const struct bpf_func_state *func,
int off)
{
- u16 stack = env->subprog_stack_depth[func->subprogno];
+ u16 stack = env->subprog_info[func->subprogno].stack_depth;
if (stack >= -off)
return 0;
/* update known max for given subprogram */
- env->subprog_stack_depth[func->subprogno] = -off;
+ env->subprog_info[func->subprogno].stack_depth = -off;
return 0;
}
@@ -1535,9 +1547,9 @@ static int update_stack_depth(struct bpf_verifier_env *env,
*/
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
- int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
+ int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
+ struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
@@ -1545,17 +1557,14 @@ process_func:
/* round up to 32-bytes, since this is granularity
* of interpreter stack size
*/
- depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+ depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
if (depth > MAX_BPF_STACK) {
verbose(env, "combined stack size of %d calls is %d. Too large\n",
frame + 1, depth);
return -EACCES;
}
continue_func:
- if (env->subprog_cnt == subprog)
- subprog_end = insn_cnt;
- else
- subprog_end = env->subprog_starts[subprog];
+ subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
if (insn[i].code != (BPF_JMP | BPF_CALL))
continue;
@@ -1563,17 +1572,16 @@ continue_func:
continue;
/* remember insn and function to return to */
ret_insn[frame] = i + 1;
- ret_prog[frame] = subprog;
+ ret_prog[frame] = idx;
/* find the callee */
i = i + insn[i].imm + 1;
- subprog = find_subprog(env, i);
- if (subprog < 0) {
+ idx = find_subprog(env, i);
+ if (idx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
i);
return -EFAULT;
}
- subprog++;
frame++;
if (frame >= MAX_CALL_FRAMES) {
WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
@@ -1586,10 +1594,10 @@ continue_func:
*/
if (frame == 0)
return 0;
- depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+ depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
frame--;
i = ret_insn[frame];
- subprog = ret_prog[frame];
+ idx = ret_prog[frame];
goto continue_func;
}
@@ -1605,11 +1613,34 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
start);
return -EFAULT;
}
- subprog++;
- return env->subprog_stack_depth[subprog];
+ return env->subprog_info[subprog].stack_depth;
}
#endif
+static int check_ctx_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
+{
+ /* Access to ctx or passing it to a helper is only allowed in
+ * its original, unmodified form.
+ */
+
+ if (reg->off) {
+ verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n",
+ regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* truncate register to smaller size (in bytes)
* must be called with size < BPF_REG_SIZE
*/
@@ -1679,24 +1710,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
verbose(env, "R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
- /* ctx accesses must be at a fixed offset, so that we can
- * determine what type of data were returned.
- */
- if (reg->off) {
- verbose(env,
- "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
- regno, reg->off, off - reg->off);
- return -EACCES;
- }
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
- char tn_buf[48];
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env,
- "variable ctx access var_off=%s off=%d size=%d",
- tn_buf, off, size);
- return -EACCES;
- }
+ err = check_ctx_reg(env, reg, regno);
+ if (err < 0)
+ return err;
+
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
@@ -1961,7 +1979,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
- if (!type_is_pkt_pointer(type) &&
+ if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
@@ -1977,6 +1995,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
expected_type = PTR_TO_CTX;
if (type != expected_type)
goto err_type;
+ err = check_ctx_reg(env, reg, regno);
+ if (err < 0)
+ return err;
} else if (arg_type_is_mem_ptr(arg_type)) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
@@ -2013,14 +2034,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
- if (type_is_pkt_pointer(type))
- err = check_packet_access(env, regno, reg->off,
- meta->map_ptr->key_size,
- false);
- else
- err = check_stack_boundary(env, regno,
- meta->map_ptr->key_size,
- false, NULL);
+ err = check_helper_mem_access(env, regno,
+ meta->map_ptr->key_size, false,
+ NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
@@ -2030,17 +2046,18 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
- if (type_is_pkt_pointer(type))
- err = check_packet_access(env, regno, reg->off,
- meta->map_ptr->value_size,
- false);
- else
- err = check_stack_boundary(env, regno,
- meta->map_ptr->value_size,
- false, NULL);
+ err = check_helper_mem_access(env, regno,
+ meta->map_ptr->value_size, false,
+ NULL);
} else if (arg_type_is_mem_size(arg_type)) {
bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
+ /* remember the mem_size which may be used later
+ * to refine return values.
+ */
+ meta->msize_smax_value = reg->smax_value;
+ meta->msize_umax_value = reg->umax_value;
+
/* The register is SCALAR_VALUE; the access check
* happens using its boundaries.
*/
@@ -2118,8 +2135,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
- /* Restrict bpf side of cpumap, open when use-cases appear */
+ /* Restrict bpf side of cpumap and xskmap, open when use-cases
+ * appear.
+ */
case BPF_MAP_TYPE_CPUMAP:
+ case BPF_MAP_TYPE_XSKMAP:
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
@@ -2135,6 +2155,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_msg_redirect_map)
goto error;
break;
+ case BPF_MAP_TYPE_SOCKHASH:
+ if (func_id != BPF_FUNC_sk_redirect_hash &&
+ func_id != BPF_FUNC_sock_hash_update &&
+ func_id != BPF_FUNC_map_delete_elem &&
+ func_id != BPF_FUNC_msg_redirect_hash)
+ goto error;
+ break;
default:
break;
}
@@ -2144,7 +2171,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_tail_call:
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
goto error;
- if (env->subprog_cnt) {
+ if (env->subprog_cnt > 1) {
verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
return -EINVAL;
}
@@ -2166,16 +2193,20 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_FUNC_redirect_map:
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
- map->map_type != BPF_MAP_TYPE_CPUMAP)
+ map->map_type != BPF_MAP_TYPE_CPUMAP &&
+ map->map_type != BPF_MAP_TYPE_XSKMAP)
goto error;
break;
case BPF_FUNC_sk_redirect_map:
case BPF_FUNC_msg_redirect_map:
+ case BPF_FUNC_sock_map_update:
if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
goto error;
break;
- case BPF_FUNC_sock_map_update:
- if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ case BPF_FUNC_sk_redirect_hash:
+ case BPF_FUNC_msg_redirect_hash:
+ case BPF_FUNC_sock_hash_update:
+ if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
goto error;
break;
default:
@@ -2316,7 +2347,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* remember the callsite, it will be used by bpf_exit */
*insn_idx /* callsite */,
state->curframe + 1 /* frameno within this callchain */,
- subprog + 1 /* subprog number within this prog */);
+ subprog /* subprog number within this prog */);
/* copy r1 - r5 args that callee can access */
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
@@ -2380,6 +2411,23 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return 0;
}
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
+ int func_id,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+
+ if (ret_type != RET_INTEGER ||
+ (func_id != BPF_FUNC_get_stack &&
+ func_id != BPF_FUNC_probe_read_str))
+ return;
+
+ ret_reg->smax_value = meta->msize_smax_value;
+ ret_reg->umax_value = meta->msize_umax_value;
+ __reg_deduce_bounds(ret_reg);
+ __reg_bound_offset(ret_reg);
+}
+
static int
record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
int func_id, int insn_idx)
@@ -2387,8 +2435,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
if (func_id != BPF_FUNC_tail_call &&
- func_id != BPF_FUNC_map_lookup_elem)
+ func_id != BPF_FUNC_map_lookup_elem &&
+ func_id != BPF_FUNC_map_update_elem &&
+ func_id != BPF_FUNC_map_delete_elem)
return 0;
+
if (meta->map_ptr == NULL) {
verbose(env, "kernel subsystem misconfigured verifier\n");
return -EINVAL;
@@ -2428,7 +2479,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
/* eBPF programs must be GPL compatible to use GPL-ed functions */
if (!env->prog->gpl_compatible && fn->gpl_only) {
- verbose(env, "cannot call GPL only function from proprietary program\n");
+ verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n");
return -EINVAL;
}
@@ -2516,10 +2567,30 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return -EINVAL;
}
+ do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+
err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
return err;
+ if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+ const char *err_str;
+
+#ifdef CONFIG_PERF_EVENTS
+ err = get_callchain_buffers(sysctl_perf_event_max_stack);
+ err_str = "cannot get callchain buffer for func %s#%d\n";
+#else
+ err = -ENOTSUPP;
+ err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
+#endif
+ if (err) {
+ verbose(env, err_str, func_id_name(func_id), func_id);
+ return err;
+ }
+
+ env->prog->has_callchain_buf = true;
+ }
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
@@ -2964,10 +3035,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
dst_reg->umin_value <<= umin_val;
dst_reg->umax_value <<= umax_val;
}
- if (src_known)
- dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
- else
- dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
break;
@@ -2995,16 +3063,35 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
*/
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- if (src_known)
- dst_reg->var_off = tnum_rshift(dst_reg->var_off,
- umin_val);
- else
- dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
dst_reg->umin_value >>= umax_val;
dst_reg->umax_value >>= umin_val;
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
break;
+ case BPF_ARSH:
+ if (umax_val >= insn_bitness) {
+ /* Shifts greater than 31 or 63 are undefined.
+ * This includes shifts by a negative number.
+ */
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ break;
+ }
+
+ /* Upon reaching here, src_known is true and
+ * umax_val is equal to umin_val.
+ */
+ dst_reg->smin_value >>= umin_val;
+ dst_reg->smax_value >>= umin_val;
+ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+
+ /* blow away the dst_reg umin_value/umax_value and rely on
+ * dst_reg var_off to refine the result.
+ */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ __update_reg_bounds(dst_reg);
+ break;
default:
mark_reg_unknown(env, regs, insn->dst_reg);
break;
@@ -3888,7 +3975,12 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (env->subprog_cnt) {
+ if (!env->ops->gen_ld_abs) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ if (env->subprog_cnt > 1) {
/* when program has LD_ABS insn JITs and interpreter assume
* that r1 == ctx == skb which is not the case for callees
* that can have arbitrary arguments. It's problematic
@@ -4919,15 +5011,15 @@ process_bpf_exit:
verbose(env, "processed %d insns (limit %d), stack depth ",
insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
- for (i = 0; i < env->subprog_cnt + 1; i++) {
- u32 depth = env->subprog_stack_depth[i];
+ for (i = 0; i < env->subprog_cnt; i++) {
+ u32 depth = env->subprog_info[i].stack_depth;
verbose(env, "%d", depth);
- if (i + 1 < env->subprog_cnt + 1)
+ if (i + 1 < env->subprog_cnt)
verbose(env, "+");
}
verbose(env, "\n");
- env->prog->aux->stack_depth = env->subprog_stack_depth[0];
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return 0;
}
@@ -5051,7 +5143,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
- * and all maps are released in free_bpf_prog_info()
+ * and all maps are released in free_used_maps()
*/
map = bpf_map_inc(map, false);
if (IS_ERR(map)) {
@@ -5114,7 +5206,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
if (cnt == 1)
return 0;
- new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len);
+ new_data = vzalloc(array_size(prog_len,
+ sizeof(struct bpf_insn_aux_data)));
if (!new_data)
return -ENOMEM;
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
@@ -5133,10 +5226,11 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
if (len == 1)
return;
- for (i = 0; i < env->subprog_cnt; i++) {
- if (env->subprog_starts[i] < off)
+ /* NOTE: fake 'exit' subprog should be updated as well. */
+ for (i = 0; i <= env->subprog_cnt; i++) {
+ if (env->subprog_info[i].start < off)
continue;
- env->subprog_starts[i] += len - 1;
+ env->subprog_info[i].start += len - 1;
}
}
@@ -5210,7 +5304,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
}
- if (!ops->convert_ctx_access)
+ if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux))
return 0;
insn = env->prog->insnsi + delta;
@@ -5270,6 +5364,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
*/
is_narrower_load = size < ctx_field_size;
if (is_narrower_load) {
+ u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
u32 off = insn->off;
u8 size_code;
@@ -5284,7 +5379,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
else if (ctx_field_size == 8)
size_code = BPF_DW;
- insn->off = off & ~(ctx_field_size - 1);
+ insn->off = off & ~(size_default - 1);
insn->code = BPF_LDX | BPF_MEM | size_code;
}
@@ -5328,7 +5423,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
void *old_bpf_func;
int err = -ENOMEM;
- if (env->subprog_cnt == 0)
+ if (env->subprog_cnt <= 1)
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
@@ -5344,7 +5439,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
/* temporarily remember subprog id inside insn instead of
* aux_data, since next loop will split up all insns into funcs
*/
- insn->off = subprog + 1;
+ insn->off = subprog;
/* remember original imm in case JIT fails and fallback
* to interpreter will be needed
*/
@@ -5353,16 +5448,13 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn->imm = 1;
}
- func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL);
+ func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
if (!func)
return -ENOMEM;
- for (i = 0; i <= env->subprog_cnt; i++) {
+ for (i = 0; i < env->subprog_cnt; i++) {
subprog_start = subprog_end;
- if (env->subprog_cnt == i)
- subprog_end = prog->len;
- else
- subprog_end = env->subprog_starts[i];
+ subprog_end = env->subprog_info[i + 1].start;
len = subprog_end - subprog_start;
func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
@@ -5379,7 +5471,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* Long term would need debug info to populate names
*/
func[i]->aux->name[0] = 'F';
- func[i]->aux->stack_depth = env->subprog_stack_depth[i];
+ func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
@@ -5392,20 +5484,33 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* now populate all bpf_calls with correct addresses and
* run last pass of JIT
*/
- for (i = 0; i <= env->subprog_cnt; i++) {
+ for (i = 0; i < env->subprog_cnt; i++) {
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
continue;
subprog = insn->off;
- insn->off = 0;
insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
func[subprog]->bpf_func -
__bpf_call_base;
}
+
+ /* we use the aux data to keep a list of the start addresses
+ * of the JITed images for each function in the program
+ *
+ * for some architectures, such as powerpc64, the imm field
+ * might not be large enough to hold the offset of the start
+ * address of the callee's JITed image from __bpf_call_base
+ *
+ * in such cases, we can lookup the start address of a callee
+ * by using its subprog id, available from the off field of
+ * the call instruction, as an index for this list
+ */
+ func[i]->aux->func = func;
+ func[i]->aux->func_cnt = env->subprog_cnt;
}
- for (i = 0; i <= env->subprog_cnt; i++) {
+ for (i = 0; i < env->subprog_cnt; i++) {
old_bpf_func = func[i]->bpf_func;
tmp = bpf_int_jit_compile(func[i]);
if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
@@ -5419,7 +5524,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
/* finally lock prog and jit images for all functions and
* populate kallsysm
*/
- for (i = 0; i <= env->subprog_cnt; i++) {
+ for (i = 0; i < env->subprog_cnt; i++) {
bpf_prog_lock_ro(func[i]);
bpf_prog_kallsyms_add(func[i]);
}
@@ -5429,26 +5534,21 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* later look the same as if they were interpreted only.
*/
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- unsigned long addr;
-
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
continue;
insn->off = env->insn_aux_data[i].call_imm;
subprog = find_subprog(env, i + insn->off + 1);
- addr = (unsigned long)func[subprog + 1]->bpf_func;
- addr &= PAGE_MASK;
- insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
- addr - __bpf_call_base;
+ insn->imm = subprog;
}
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
prog->aux->func = func;
- prog->aux->func_cnt = env->subprog_cnt + 1;
+ prog->aux->func_cnt = env->subprog_cnt;
return 0;
out_free:
- for (i = 0; i <= env->subprog_cnt; i++)
+ for (i = 0; i < env->subprog_cnt; i++)
if (func[i])
bpf_jit_free(func[i]);
kfree(func);
@@ -5505,6 +5605,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
+ const struct bpf_map_ops *ops;
struct bpf_insn_aux_data *aux;
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
@@ -5552,6 +5653,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ if (BPF_CLASS(insn->code) == BPF_LD &&
+ (BPF_MODE(insn->code) == BPF_ABS ||
+ BPF_MODE(insn->code) == BPF_IND)) {
+ cnt = env->ops->gen_ld_abs(insn, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (insn->code != (BPF_JMP | BPF_CALL))
continue;
if (insn->src_reg == BPF_PSEUDO_CALL)
@@ -5615,35 +5735,61 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
}
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
- * handlers are currently limited to 64 bit only.
+ * and other inlining handlers are currently limited to 64 bit
+ * only.
*/
if (prog->jit_requested && BITS_PER_LONG == 64 &&
- insn->imm == BPF_FUNC_map_lookup_elem) {
+ (insn->imm == BPF_FUNC_map_lookup_elem ||
+ insn->imm == BPF_FUNC_map_update_elem ||
+ insn->imm == BPF_FUNC_map_delete_elem)) {
aux = &env->insn_aux_data[i + delta];
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
map_ptr = BPF_MAP_PTR(aux->map_state);
- if (!map_ptr->ops->map_gen_lookup)
- goto patch_call_imm;
+ ops = map_ptr->ops;
+ if (insn->imm == BPF_FUNC_map_lookup_elem &&
+ ops->map_gen_lookup) {
+ cnt = ops->map_gen_lookup(map_ptr, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
- cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
- }
+ new_prog = bpf_patch_insn_data(env, i + delta,
+ insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
- cnt);
- if (!new_prog)
- return -ENOMEM;
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
- delta += cnt - 1;
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
+ (int (*)(struct bpf_map *map, void *key))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_update_elem,
+ (int (*)(struct bpf_map *map, void *key, void *value,
+ u64 flags))NULL));
+ switch (insn->imm) {
+ case BPF_FUNC_map_lookup_elem:
+ insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
+ __bpf_call_base;
+ continue;
+ case BPF_FUNC_map_update_elem:
+ insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
+ __bpf_call_base;
+ continue;
+ case BPF_FUNC_map_delete_elem:
+ insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
+ __bpf_call_base;
+ continue;
+ }
- /* keep walking new program and skip insns we just inserted */
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- continue;
+ goto patch_call_imm;
}
if (insn->imm == BPF_FUNC_redirect_map) {
@@ -5725,8 +5871,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
return -ENOMEM;
log = &env->log;
- env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
- (*prog)->len);
+ env->insn_aux_data =
+ vzalloc(array_size(sizeof(struct bpf_insn_aux_data),
+ (*prog)->len));
ret = -ENOMEM;
if (!env->insn_aux_data)
goto err_free_env;
@@ -5755,16 +5902,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
+ ret = replace_map_fd_with_map_ptr(env);
+ if (ret < 0)
+ goto skip_full_check;
+
if (bpf_prog_is_dev_bound(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env);
if (ret)
- goto err_unlock;
+ goto skip_full_check;
}
- ret = replace_map_fd_with_map_ptr(env);
- if (ret < 0)
- goto skip_full_check;
-
env->explored_states = kcalloc(env->prog->len,
sizeof(struct bpf_verifier_state_list *),
GFP_USER);
@@ -5835,7 +5982,7 @@ skip_full_check:
err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
- * them now. Otherwise free_bpf_prog_info() will release them.
+ * them now. Otherwise free_used_maps() will release them.
*/
release_maps(env);
*prog = env->prog;
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
new file mode 100644
index 000000000000..b3c557476a8d
--- /dev/null
+++ b/kernel/bpf/xskmap.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+struct xsk_map {
+ struct bpf_map map;
+ struct xdp_sock **xsk_map;
+ struct list_head __percpu *flush_list;
+};
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+ int cpu, err = -EINVAL;
+ struct xsk_map *m;
+ u64 cost;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 ||
+ attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+ return ERR_PTR(-EINVAL);
+
+ m = kzalloc(sizeof(*m), GFP_USER);
+ if (!m)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&m->map, attr);
+
+ cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
+ cost += sizeof(struct list_head) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_m;
+
+ m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* Notice returns -EPERM on if map size is larger than memlock limit */
+ err = bpf_map_precharge_memlock(m->map.pages);
+ if (err)
+ goto free_m;
+
+ err = -ENOMEM;
+
+ m->flush_list = alloc_percpu(struct list_head);
+ if (!m->flush_list)
+ goto free_m;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
+
+ m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
+ sizeof(struct xdp_sock *),
+ m->map.numa_node);
+ if (!m->xsk_map)
+ goto free_percpu;
+ return &m->map;
+
+free_percpu:
+ free_percpu(m->flush_list);
+free_m:
+ kfree(m);
+ return ERR_PTR(err);
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ int i;
+
+ synchronize_net();
+
+ for (i = 0; i < map->max_entries; i++) {
+ struct xdp_sock *xs;
+
+ xs = m->xsk_map[i];
+ if (!xs)
+ continue;
+
+ sock_put((struct sock *)xs);
+ }
+
+ free_percpu(m->flush_list);
+ bpf_map_area_free(m->xsk_map);
+ kfree(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= m->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == m->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock *xs;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ xs = READ_ONCE(m->xsk_map[key]);
+ return xs;
+}
+
+int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+ struct xdp_sock *xs)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+ int err;
+
+ err = xsk_rcv(xs, xdp);
+ if (err)
+ return err;
+
+ if (!xs->flush_node.prev)
+ list_add(&xs->flush_node, flush_list);
+
+ return 0;
+}
+
+void __xsk_map_flush(struct bpf_map *map)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+ struct xdp_sock *xs, *tmp;
+
+ list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
+ xsk_flush(xs);
+ __list_del(xs->flush_node.prev, xs->flush_node.next);
+ xs->flush_node.prev = NULL;
+ }
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ u32 i = *(u32 *)key, fd = *(u32 *)value;
+ struct xdp_sock *xs, *old_xs;
+ struct socket *sock;
+ int err;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(i >= m->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+
+ sock = sockfd_lookup(fd, &err);
+ if (!sock)
+ return err;
+
+ if (sock->sk->sk_family != PF_XDP) {
+ sockfd_put(sock);
+ return -EOPNOTSUPP;
+ }
+
+ xs = (struct xdp_sock *)sock->sk;
+
+ if (!xsk_is_setup_for_bpf_map(xs)) {
+ sockfd_put(sock);
+ return -EOPNOTSUPP;
+ }
+
+ sock_hold(sock->sk);
+
+ old_xs = xchg(&m->xsk_map[i], xs);
+ if (old_xs) {
+ /* Make sure we've flushed everything. */
+ synchronize_net();
+ sock_put((struct sock *)old_xs);
+ }
+
+ sockfd_put(sock);
+ return 0;
+}
+
+static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock *old_xs;
+ int k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ old_xs = xchg(&m->xsk_map[k], NULL);
+ if (old_xs) {
+ /* Make sure we've flushed everything. */
+ synchronize_net();
+ sock_put((struct sock *)old_xs);
+ }
+
+ return 0;
+}
+
+const struct bpf_map_ops xsk_map_ops = {
+ .map_alloc = xsk_map_alloc,
+ .map_free = xsk_map_free,
+ .map_get_next_key = xsk_map_get_next_key,
+ .map_lookup_elem = xsk_map_lookup_elem,
+ .map_update_elem = xsk_map_update_elem,
+ .map_delete_elem = xsk_map_delete_elem,
+};
+
+
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 2be89a003185..bfcdae896122 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
+obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 0808a33d16d3..77ff1cd6a252 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -201,13 +201,12 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
int cgroup_task_count(const struct cgroup *cgrp);
/*
- * stat.c
+ * rstat.c
*/
-void cgroup_stat_flush(struct cgroup *cgrp);
-int cgroup_stat_init(struct cgroup *cgrp);
-void cgroup_stat_exit(struct cgroup *cgrp);
-void cgroup_stat_show_cputime(struct seq_file *seq);
-void cgroup_stat_boot(void);
+int cgroup_rstat_init(struct cgroup *cgrp);
+void cgroup_rstat_exit(struct cgroup *cgrp);
+void cgroup_rstat_boot(void);
+void cgroup_base_stat_cputime_show(struct seq_file *seq);
/*
* namespace.c
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index e06c97f3ed1a..8b4f0768efd6 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -195,9 +195,9 @@ struct cgroup_pidlist {
static void *pidlist_allocate(int count)
{
if (PIDLIST_TOO_LARGE(count))
- return vmalloc(count * sizeof(pid_t));
+ return vmalloc(array_size(count, sizeof(pid_t)));
else
- return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+ return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL);
}
static void pidlist_free(void *p)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 12883656e63e..077370bf8964 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -54,6 +54,7 @@
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
+#include <linux/sched/cputime.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -61,6 +62,8 @@
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
+/* let's not notify more than 100 times per second */
+#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
@@ -142,14 +145,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
-static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
/*
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
*/
-struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
+struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
@@ -1554,6 +1557,8 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
spin_lock_irq(&cgroup_file_kn_lock);
cfile->kn = NULL;
spin_unlock_irq(&cgroup_file_kn_lock);
+
+ del_timer_sync(&cfile->notify_timer);
}
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
@@ -1573,8 +1578,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
- list_for_each_entry(cfts, &css->ss->cfts, node)
+ if (!css->ss) {
+ if (cgroup_on_dfl(cgrp))
+ cfts = cgroup_base_files;
+ else
+ cfts = cgroup1_base_files;
+
cgroup_addrm_files(css, cgrp, cfts, false);
+ } else {
+ list_for_each_entry(cfts, &css->ss->cfts, node)
+ cgroup_addrm_files(css, cgrp, cfts, false);
+ }
}
/**
@@ -1598,14 +1612,16 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
else
cfts = cgroup1_base_files;
- return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- }
-
- list_for_each_entry(cfts, &css->ss->cfts, node) {
- ret = cgroup_addrm_files(css, cgrp, cfts, true);
- if (ret < 0) {
- failed_cfts = cfts;
- goto err;
+ ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
+ if (ret < 0)
+ return ret;
+ } else {
+ list_for_each_entry(cfts, &css->ss->cfts, node) {
+ ret = cgroup_addrm_files(css, cgrp, cfts, true);
+ if (ret < 0) {
+ failed_cfts = cfts;
+ goto err;
+ }
}
}
@@ -1782,13 +1798,6 @@ static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
- spin_lock_irq(&css_set_lock);
-
- if (use_task_css_set_links)
- goto out_unlock;
-
- use_task_css_set_links = true;
-
/*
* We need tasklist_lock because RCU is not safe against
* while_each_thread(). Besides, a forking task that has passed
@@ -1797,6 +1806,13 @@ static void cgroup_enable_task_cg_lists(void)
* tasklist if we walk through it with RCU.
*/
read_lock(&tasklist_lock);
+ spin_lock_irq(&css_set_lock);
+
+ if (use_task_css_set_links)
+ goto out_unlock;
+
+ use_task_css_set_links = true;
+
do_each_thread(g, p) {
WARN_ON_ONCE(!list_empty(&p->cg_list) ||
task_css_set(p) != &init_css_set);
@@ -1824,9 +1840,9 @@ static void cgroup_enable_task_cg_lists(void)
}
spin_unlock(&p->sighand->siglock);
} while_each_thread(g, p);
- read_unlock(&tasklist_lock);
out_unlock:
spin_unlock_irq(&css_set_lock);
+ read_unlock(&tasklist_lock);
}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1844,6 +1860,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
cgrp->dom_cgrp = cgrp;
cgrp->max_descendants = INT_MAX;
cgrp->max_depth = INT_MAX;
+ INIT_LIST_HEAD(&cgrp->rstat_css_list);
+ prev_cputime_init(&cgrp->prev_cputime);
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -3381,7 +3399,7 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
int ret = 0;
- cgroup_stat_show_cputime(seq);
+ cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
#endif
@@ -3521,6 +3539,12 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
return kernfs_setattr(kn, &iattr);
}
+static void cgroup_file_notify_timer(struct timer_list *timer)
+{
+ cgroup_file_notify(container_of(timer, struct cgroup_file,
+ notify_timer));
+}
+
static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
struct cftype *cft)
{
@@ -3547,6 +3571,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
if (cft->file_offset) {
struct cgroup_file *cfile = (void *)css + cft->file_offset;
+ timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
+
spin_lock_irq(&cgroup_file_kn_lock);
cfile->kn = kn;
spin_unlock_irq(&cgroup_file_kn_lock);
@@ -3796,8 +3822,17 @@ void cgroup_file_notify(struct cgroup_file *cfile)
unsigned long flags;
spin_lock_irqsave(&cgroup_file_kn_lock, flags);
- if (cfile->kn)
- kernfs_notify(cfile->kn);
+ if (cfile->kn) {
+ unsigned long last = cfile->notified_at;
+ unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
+
+ if (time_in_range(jiffies, last, next)) {
+ timer_reduce(&cfile->notify_timer, next);
+ } else {
+ kernfs_notify(cfile->kn);
+ cfile->notified_at = jiffies;
+ }
+ }
spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
@@ -4560,7 +4595,7 @@ static void css_free_rwork_fn(struct work_struct *work)
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
if (cgroup_on_dfl(cgrp))
- cgroup_stat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -4587,6 +4622,11 @@ static void css_release_work_fn(struct work_struct *work)
if (ss) {
/* css release path */
+ if (!list_empty(&css->rstat_css_node)) {
+ cgroup_rstat_flush(cgrp);
+ list_del_rcu(&css->rstat_css_node);
+ }
+
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
@@ -4597,7 +4637,7 @@ static void css_release_work_fn(struct work_struct *work)
trace_cgroup_release(cgrp);
if (cgroup_on_dfl(cgrp))
- cgroup_stat_flush(cgrp);
+ cgroup_rstat_flush(cgrp);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
tcgrp = cgroup_parent(tcgrp))
@@ -4648,6 +4688,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
+ INIT_LIST_HEAD(&css->rstat_css_node);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
@@ -4656,6 +4697,9 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css_get(css->parent);
}
+ if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
+ list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
+
BUG_ON(cgroup_css(cgrp, ss));
}
@@ -4757,6 +4801,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
+ list_del_rcu(&css->rstat_css_node);
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
return ERR_PTR(err);
@@ -4775,8 +4820,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(sizeof(*cgrp) +
- sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
+ GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -4785,7 +4830,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
goto out_free_cgrp;
if (cgroup_on_dfl(parent)) {
- ret = cgroup_stat_init(cgrp);
+ ret = cgroup_rstat_init(cgrp);
if (ret)
goto out_cancel_ref;
}
@@ -4850,7 +4895,7 @@ out_idr_free:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_stat_exit:
if (cgroup_on_dfl(parent))
- cgroup_stat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5090,10 +5135,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
for_each_css(css, ssid, cgrp)
kill_css(css);
- /*
- * Remove @cgrp directory along with the base files. @cgrp has an
- * extra ref on its kn.
- */
+ /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
+ css_clear_dir(&cgrp->self);
kernfs_remove(cgrp->kn);
if (parent && cgroup_is_threaded(cgrp))
@@ -5245,7 +5288,7 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
- cgroup_stat_boot();
+ cgroup_rstat_boot();
/*
* The latency of the synchronize_sched() is too high for cgroups,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b42037e6e81d..266f10cb7222 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -605,7 +605,7 @@ static inline int nr_cpusets(void)
* load balancing domains (sched domains) as specified by that partial
* partition.
*
- * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
+ * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt
* for a background explanation of this.
*
* Does not return errors, on the theory that the callers of this
@@ -683,7 +683,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
}
- csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
+ csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
csn = 0;
@@ -753,7 +753,8 @@ restart:
* The rest of the code, including the scheduler, can deal with
* dattr==NULL case. No need to abort if alloc fails.
*/
- dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
+ dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
+ GFP_KERNEL);
for (nslot = 0, i = 0; i < csn; i++) {
struct cpuset *a = csa[i];
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index defad3c5e7dc..d3bbb757ee49 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -362,35 +362,32 @@ EXPORT_SYMBOL(rdmacg_unregister_device);
static int parse_resource(char *c, int *intval)
{
substring_t argstr;
- const char **table = &rdmacg_resource_names[0];
char *name, *value = c;
size_t len;
- int ret, i = 0;
+ int ret, i;
name = strsep(&value, "=");
if (!name || !value)
return -EINVAL;
- len = strlen(value);
+ i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
+ if (i < 0)
+ return i;
- for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
- if (strcmp(table[i], name))
- continue;
+ len = strlen(value);
- argstr.from = value;
- argstr.to = value + len;
+ argstr.from = value;
+ argstr.to = value + len;
- ret = match_int(&argstr, intval);
- if (ret >= 0) {
- if (*intval < 0)
- break;
- return i;
- }
- if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
- *intval = S32_MAX;
- return i;
- }
- break;
+ ret = match_int(&argstr, intval);
+ if (ret >= 0) {
+ if (*intval < 0)
+ return -EINVAL;
+ return i;
+ }
+ if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+ *intval = S32_MAX;
+ return i;
}
return -EINVAL;
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
new file mode 100644
index 000000000000..d503d1a9007c
--- /dev/null
+++ b/kernel/cgroup/rstat.c
@@ -0,0 +1,416 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_SPINLOCK(cgroup_rstat_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
+
+static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
+
+static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
+{
+ return per_cpu_ptr(cgrp->rstat_cpu, cpu);
+}
+
+/**
+ * cgroup_rstat_updated - keep track of updated rstat_cpu
+ * @cgrp: target cgroup
+ * @cpu: cpu on which rstat_cpu was updated
+ *
+ * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
+ * rstat_cpu->updated_children list. See the comment on top of
+ * cgroup_rstat_cpu definition for details.
+ */
+void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+{
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+ struct cgroup *parent;
+ unsigned long flags;
+
+ /* nothing to do for root */
+ if (!cgroup_parent(cgrp))
+ return;
+
+ /*
+ * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we
+ * see NULL updated_next or they see our updated stat.
+ */
+ smp_mb();
+
+ /*
+ * Because @parent's updated_children is terminated with @parent
+ * instead of NULL, we can tell whether @cgrp is on the list by
+ * testing the next pointer for NULL.
+ */
+ if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
+ return;
+
+ raw_spin_lock_irqsave(cpu_lock, flags);
+
+ /* put @cgrp and all ancestors on the corresponding updated lists */
+ for (parent = cgroup_parent(cgrp); parent;
+ cgrp = parent, parent = cgroup_parent(cgrp)) {
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
+
+ /*
+ * Both additions and removals are bottom-up. If a cgroup
+ * is already in the tree, all ancestors are.
+ */
+ if (rstatc->updated_next)
+ break;
+
+ rstatc->updated_next = prstatc->updated_children;
+ prstatc->updated_children = cgrp;
+ }
+
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
+
+/**
+ * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
+ * @pos: current position
+ * @root: root of the tree to traversal
+ * @cpu: target cpu
+ *
+ * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts
+ * the traversal and %NULL return indicates the end. During traversal,
+ * each returned cgroup is unlinked from the tree. Must be called with the
+ * matching cgroup_rstat_cpu_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
+ struct cgroup *root, int cpu)
+{
+ struct cgroup_rstat_cpu *rstatc;
+ struct cgroup *parent;
+
+ if (pos == root)
+ return NULL;
+
+ /*
+ * We're gonna walk down to the first leaf and visit/remove it. We
+ * can pick whatever unvisited node as the starting point.
+ */
+ if (!pos)
+ pos = root;
+ else
+ pos = cgroup_parent(pos);
+
+ /* walk down to the first leaf */
+ while (true) {
+ rstatc = cgroup_rstat_cpu(pos, cpu);
+ if (rstatc->updated_children == pos)
+ break;
+ pos = rstatc->updated_children;
+ }
+
+ /*
+ * Unlink @pos from the tree. As the updated_children list is
+ * singly linked, we have to walk it to find the removal point.
+ * However, due to the way we traverse, @pos will be the first
+ * child in most cases. The only exception is @root.
+ */
+ parent = cgroup_parent(pos);
+ if (parent && rstatc->updated_next) {
+ struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
+ struct cgroup_rstat_cpu *nrstatc;
+ struct cgroup **nextp;
+
+ nextp = &prstatc->updated_children;
+ while (true) {
+ nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ if (*nextp == pos)
+ break;
+
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &nrstatc->updated_next;
+ }
+
+ *nextp = rstatc->updated_next;
+ rstatc->updated_next = NULL;
+
+ /*
+ * Paired with the one in cgroup_rstat_cpu_updated().
+ * Either they see NULL updated_next or we see their
+ * updated stat.
+ */
+ smp_mb();
+ }
+
+ return pos;
+}
+
+/* see cgroup_rstat_flush() */
+static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
+ __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
+{
+ int cpu;
+
+ lockdep_assert_held(&cgroup_rstat_lock);
+
+ for_each_possible_cpu(cpu) {
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
+ cpu);
+ struct cgroup *pos = NULL;
+
+ raw_spin_lock(cpu_lock);
+ while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
+ struct cgroup_subsys_state *css;
+
+ cgroup_base_stat_flush(pos, cpu);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(css, &pos->rstat_css_list,
+ rstat_css_node)
+ css->ss->css_rstat_flush(css, cpu);
+ rcu_read_unlock();
+ }
+ raw_spin_unlock(cpu_lock);
+
+ /* if @may_sleep, play nice and yield if necessary */
+ if (may_sleep && (need_resched() ||
+ spin_needbreak(&cgroup_rstat_lock))) {
+ spin_unlock_irq(&cgroup_rstat_lock);
+ if (!cond_resched())
+ cpu_relax();
+ spin_lock_irq(&cgroup_rstat_lock);
+ }
+ }
+}
+
+/**
+ * cgroup_rstat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards. After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ *
+ * This function may block.
+ */
+void cgroup_rstat_flush(struct cgroup *cgrp)
+{
+ might_sleep();
+
+ spin_lock_irq(&cgroup_rstat_lock);
+ cgroup_rstat_flush_locked(cgrp, true);
+ spin_unlock_irq(&cgroup_rstat_lock);
+}
+
+/**
+ * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
+ * @cgrp: target cgroup
+ *
+ * This function can be called from any context.
+ */
+void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cgroup_rstat_lock, flags);
+ cgroup_rstat_flush_locked(cgrp, false);
+ spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
+}
+
+/**
+ * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
+ * @cgrp: target cgroup
+ *
+ * Flush stats in @cgrp's subtree and prevent further flushes. Must be
+ * paired with cgroup_rstat_flush_release().
+ *
+ * This function may block.
+ */
+void cgroup_rstat_flush_hold(struct cgroup *cgrp)
+ __acquires(&cgroup_rstat_lock)
+{
+ might_sleep();
+ spin_lock_irq(&cgroup_rstat_lock);
+ cgroup_rstat_flush_locked(cgrp, true);
+}
+
+/**
+ * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
+ */
+void cgroup_rstat_flush_release(void)
+ __releases(&cgroup_rstat_lock)
+{
+ spin_unlock_irq(&cgroup_rstat_lock);
+}
+
+int cgroup_rstat_init(struct cgroup *cgrp)
+{
+ int cpu;
+
+ /* the root cgrp has rstat_cpu preallocated */
+ if (!cgrp->rstat_cpu) {
+ cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
+ if (!cgrp->rstat_cpu)
+ return -ENOMEM;
+ }
+
+ /* ->updated_children list is self terminated */
+ for_each_possible_cpu(cpu) {
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+
+ rstatc->updated_children = cgrp;
+ u64_stats_init(&rstatc->bsync);
+ }
+
+ return 0;
+}
+
+void cgroup_rstat_exit(struct cgroup *cgrp)
+{
+ int cpu;
+
+ cgroup_rstat_flush(cgrp);
+
+ /* sanity check */
+ for_each_possible_cpu(cpu) {
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+
+ if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
+ WARN_ON_ONCE(rstatc->updated_next))
+ return;
+ }
+
+ free_percpu(cgrp->rstat_cpu);
+ cgrp->rstat_cpu = NULL;
+}
+
+void __init cgroup_rstat_boot(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
+
+ BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
+}
+
+/*
+ * Functions for cgroup basic resource statistics implemented on top of
+ * rstat.
+ */
+static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat,
+ struct cgroup_base_stat *src_bstat)
+{
+ dst_bstat->cputime.utime += src_bstat->cputime.utime;
+ dst_bstat->cputime.stime += src_bstat->cputime.stime;
+ dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct task_cputime *last_cputime = &rstatc->last_bstat.cputime;
+ struct task_cputime cputime;
+ struct cgroup_base_stat delta;
+ unsigned seq;
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = __u64_stats_fetch_begin(&rstatc->bsync);
+ cputime = rstatc->bstat.cputime;
+ } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
+
+ /* calculate the delta to propgate */
+ delta.cputime.utime = cputime.utime - last_cputime->utime;
+ delta.cputime.stime = cputime.stime - last_cputime->stime;
+ delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+ last_cputime->sum_exec_runtime;
+ *last_cputime = cputime;
+
+ /* transfer the pending stat into delta */
+ cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat);
+ memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat));
+
+ /* propagate delta into the global stat and the parent's pending */
+ cgroup_base_stat_accumulate(&cgrp->bstat, &delta);
+ if (parent)
+ cgroup_base_stat_accumulate(&parent->pending_bstat, &delta);
+}
+
+static struct cgroup_rstat_cpu *
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
+{
+ struct cgroup_rstat_cpu *rstatc;
+
+ rstatc = get_cpu_ptr(cgrp->rstat_cpu);
+ u64_stats_update_begin(&rstatc->bsync);
+ return rstatc;
+}
+
+static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
+ struct cgroup_rstat_cpu *rstatc)
+{
+ u64_stats_update_end(&rstatc->bsync);
+ cgroup_rstat_updated(cgrp, smp_processor_id());
+ put_cpu_ptr(rstatc);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+ struct cgroup_rstat_cpu *rstatc;
+
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+ enum cpu_usage_stat index, u64 delta_exec)
+{
+ struct cgroup_rstat_cpu *rstatc;
+
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+
+ switch (index) {
+ case CPUTIME_USER:
+ case CPUTIME_NICE:
+ rstatc->bstat.cputime.utime += delta_exec;
+ break;
+ case CPUTIME_SYSTEM:
+ case CPUTIME_IRQ:
+ case CPUTIME_SOFTIRQ:
+ rstatc->bstat.cputime.stime += delta_exec;
+ break;
+ default:
+ break;
+ }
+
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+}
+
+void cgroup_base_stat_cputime_show(struct seq_file *seq)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ u64 usage, utime, stime;
+
+ if (!cgroup_parent(cgrp))
+ return;
+
+ cgroup_rstat_flush_hold(cgrp);
+ usage = cgrp->bstat.cputime.sum_exec_runtime;
+ cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime);
+ cgroup_rstat_flush_release();
+
+ do_div(usage, NSEC_PER_USEC);
+ do_div(utime, NSEC_PER_USEC);
+ do_div(stime, NSEC_PER_USEC);
+
+ seq_printf(seq, "usage_usec %llu\n"
+ "user_usec %llu\n"
+ "system_usec %llu\n",
+ usage, utime, stime);
+}
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
deleted file mode 100644
index 1e111dd455c4..000000000000
--- a/kernel/cgroup/stat.c
+++ /dev/null
@@ -1,338 +0,0 @@
-#include "cgroup-internal.h"
-
-#include <linux/sched/cputime.h>
-
-static DEFINE_MUTEX(cgroup_stat_mutex);
-static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
-
-static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
-{
- return per_cpu_ptr(cgrp->cpu_stat, cpu);
-}
-
-/**
- * cgroup_cpu_stat_updated - keep track of updated cpu_stat
- * @cgrp: target cgroup
- * @cpu: cpu on which cpu_stat was updated
- *
- * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
- * cpu_stat->updated_children list. See the comment on top of
- * cgroup_cpu_stat definition for details.
- */
-static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
-{
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
- struct cgroup *parent;
- unsigned long flags;
-
- /*
- * Speculative already-on-list test. This may race leading to
- * temporary inaccuracies, which is fine.
- *
- * Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @cgrp is on the list by
- * testing the next pointer for NULL.
- */
- if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
- return;
-
- raw_spin_lock_irqsave(cpu_lock, flags);
-
- /* put @cgrp and all ancestors on the corresponding updated lists */
- for (parent = cgroup_parent(cgrp); parent;
- cgrp = parent, parent = cgroup_parent(cgrp)) {
- struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
- struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
-
- /*
- * Both additions and removals are bottom-up. If a cgroup
- * is already in the tree, all ancestors are.
- */
- if (cstat->updated_next)
- break;
-
- cstat->updated_next = pcstat->updated_children;
- pcstat->updated_children = cgrp;
- }
-
- raw_spin_unlock_irqrestore(cpu_lock, flags);
-}
-
-/**
- * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
- * @pos: current position
- * @root: root of the tree to traversal
- * @cpu: target cpu
- *
- * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts
- * the traversal and %NULL return indicates the end. During traversal,
- * each returned cgroup is unlinked from the tree. Must be called with the
- * matching cgroup_cpu_stat_lock held.
- *
- * The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
- */
-static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
- struct cgroup *root, int cpu)
-{
- struct cgroup_cpu_stat *cstat;
- struct cgroup *parent;
-
- if (pos == root)
- return NULL;
-
- /*
- * We're gonna walk down to the first leaf and visit/remove it. We
- * can pick whatever unvisited node as the starting point.
- */
- if (!pos)
- pos = root;
- else
- pos = cgroup_parent(pos);
-
- /* walk down to the first leaf */
- while (true) {
- cstat = cgroup_cpu_stat(pos, cpu);
- if (cstat->updated_children == pos)
- break;
- pos = cstat->updated_children;
- }
-
- /*
- * Unlink @pos from the tree. As the updated_children list is
- * singly linked, we have to walk it to find the removal point.
- * However, due to the way we traverse, @pos will be the first
- * child in most cases. The only exception is @root.
- */
- parent = cgroup_parent(pos);
- if (parent && cstat->updated_next) {
- struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
- struct cgroup_cpu_stat *ncstat;
- struct cgroup **nextp;
-
- nextp = &pcstat->updated_children;
- while (true) {
- ncstat = cgroup_cpu_stat(*nextp, cpu);
- if (*nextp == pos)
- break;
-
- WARN_ON_ONCE(*nextp == parent);
- nextp = &ncstat->updated_next;
- }
-
- *nextp = cstat->updated_next;
- cstat->updated_next = NULL;
- }
-
- return pos;
-}
-
-static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
- struct cgroup_stat *src_stat)
-{
- dst_stat->cputime.utime += src_stat->cputime.utime;
- dst_stat->cputime.stime += src_stat->cputime.stime;
- dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
-}
-
-static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
-{
- struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
- struct task_cputime *last_cputime = &cstat->last_cputime;
- struct task_cputime cputime;
- struct cgroup_stat delta;
- unsigned seq;
-
- lockdep_assert_held(&cgroup_stat_mutex);
-
- /* fetch the current per-cpu values */
- do {
- seq = __u64_stats_fetch_begin(&cstat->sync);
- cputime = cstat->cputime;
- } while (__u64_stats_fetch_retry(&cstat->sync, seq));
-
- /* accumulate the deltas to propgate */
- delta.cputime.utime = cputime.utime - last_cputime->utime;
- delta.cputime.stime = cputime.stime - last_cputime->stime;
- delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
- last_cputime->sum_exec_runtime;
- *last_cputime = cputime;
-
- /* transfer the pending stat into delta */
- cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
- memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
-
- /* propagate delta into the global stat and the parent's pending */
- cgroup_stat_accumulate(&cgrp->stat, &delta);
- if (parent)
- cgroup_stat_accumulate(&parent->pending_stat, &delta);
-}
-
-/* see cgroup_stat_flush() */
-static void cgroup_stat_flush_locked(struct cgroup *cgrp)
-{
- int cpu;
-
- lockdep_assert_held(&cgroup_stat_mutex);
-
- for_each_possible_cpu(cpu) {
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
- struct cgroup *pos = NULL;
-
- raw_spin_lock_irq(cpu_lock);
- while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
- cgroup_cpu_stat_flush_one(pos, cpu);
- raw_spin_unlock_irq(cpu_lock);
- }
-}
-
-/**
- * cgroup_stat_flush - flush stats in @cgrp's subtree
- * @cgrp: target cgroup
- *
- * Collect all per-cpu stats in @cgrp's subtree into the global counters
- * and propagate them upwards. After this function returns, all cgroups in
- * the subtree have up-to-date ->stat.
- *
- * This also gets all cgroups in the subtree including @cgrp off the
- * ->updated_children lists.
- */
-void cgroup_stat_flush(struct cgroup *cgrp)
-{
- mutex_lock(&cgroup_stat_mutex);
- cgroup_stat_flush_locked(cgrp);
- mutex_unlock(&cgroup_stat_mutex);
-}
-
-static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
-{
- struct cgroup_cpu_stat *cstat;
-
- cstat = get_cpu_ptr(cgrp->cpu_stat);
- u64_stats_update_begin(&cstat->sync);
- return cstat;
-}
-
-static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
- struct cgroup_cpu_stat *cstat)
-{
- u64_stats_update_end(&cstat->sync);
- cgroup_cpu_stat_updated(cgrp, smp_processor_id());
- put_cpu_ptr(cstat);
-}
-
-void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
-{
- struct cgroup_cpu_stat *cstat;
-
- cstat = cgroup_cpu_stat_account_begin(cgrp);
- cstat->cputime.sum_exec_runtime += delta_exec;
- cgroup_cpu_stat_account_end(cgrp, cstat);
-}
-
-void __cgroup_account_cputime_field(struct cgroup *cgrp,
- enum cpu_usage_stat index, u64 delta_exec)
-{
- struct cgroup_cpu_stat *cstat;
-
- cstat = cgroup_cpu_stat_account_begin(cgrp);
-
- switch (index) {
- case CPUTIME_USER:
- case CPUTIME_NICE:
- cstat->cputime.utime += delta_exec;
- break;
- case CPUTIME_SYSTEM:
- case CPUTIME_IRQ:
- case CPUTIME_SOFTIRQ:
- cstat->cputime.stime += delta_exec;
- break;
- default:
- break;
- }
-
- cgroup_cpu_stat_account_end(cgrp, cstat);
-}
-
-void cgroup_stat_show_cputime(struct seq_file *seq)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
- u64 usage, utime, stime;
-
- if (!cgroup_parent(cgrp))
- return;
-
- mutex_lock(&cgroup_stat_mutex);
-
- cgroup_stat_flush_locked(cgrp);
-
- usage = cgrp->stat.cputime.sum_exec_runtime;
- cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
- &utime, &stime);
-
- mutex_unlock(&cgroup_stat_mutex);
-
- do_div(usage, NSEC_PER_USEC);
- do_div(utime, NSEC_PER_USEC);
- do_div(stime, NSEC_PER_USEC);
-
- seq_printf(seq, "usage_usec %llu\n"
- "user_usec %llu\n"
- "system_usec %llu\n",
- usage, utime, stime);
-}
-
-int cgroup_stat_init(struct cgroup *cgrp)
-{
- int cpu;
-
- /* the root cgrp has cpu_stat preallocated */
- if (!cgrp->cpu_stat) {
- cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
- if (!cgrp->cpu_stat)
- return -ENOMEM;
- }
-
- /* ->updated_children list is self terminated */
- for_each_possible_cpu(cpu) {
- struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
-
- cstat->updated_children = cgrp;
- u64_stats_init(&cstat->sync);
- }
-
- prev_cputime_init(&cgrp->stat.prev_cputime);
-
- return 0;
-}
-
-void cgroup_stat_exit(struct cgroup *cgrp)
-{
- int cpu;
-
- cgroup_stat_flush(cgrp);
-
- /* sanity check */
- for_each_possible_cpu(cpu) {
- struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
-
- if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
- WARN_ON_ONCE(cstat->updated_next))
- return;
- }
-
- free_percpu(cgrp->cpu_stat);
- cgrp->cpu_stat = NULL;
-}
-
-void __init cgroup_stat_boot(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
-
- BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
-}
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 946fb92418f7..81e9af7dcec2 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -12,7 +12,7 @@ CONFIG_BLK_DEV_DM=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=8192
-CONFIG_CC_STACKPROTECTOR_STRONG=y
+CONFIG_STACKPROTECTOR_STRONG=y
CONFIG_COMPACTION=y
CONFIG_CPU_SW_DOMAIN_PAN=y
CONFIG_DM_CRYPT=y
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 9bfdffc100da..7fa0c4ae6394 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -10,7 +10,3 @@ CONFIG_OPTIMIZE_INLINING=y
# CONFIG_SLAB is not set
# CONFIG_SLUB is not set
CONFIG_SLOB=y
-CONFIG_CC_STACKPROTECTOR_NONE=y
-# CONFIG_CC_STACKPROTECTOR_REGULAR is not set
-# CONFIG_CC_STACKPROTECTOR_STRONG is not set
-# CONFIG_CC_STACKPROTECTOR_AUTO is not set
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index f7674d676889..b66aced5e8c2 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -460,6 +460,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_hwpoison);
#endif
VMCOREINFO_NUMBER(PG_head_mask);
+#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e405677ee08d..2ddfce8f1e8f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -691,7 +691,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
}
if (!s->usable)
return KDB_NOTIMP;
- s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
+ s->command = kcalloc(s->count + 1, sizeof(*(s->command)), GFP_KDB);
if (!s->command) {
kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
cmdstr);
@@ -729,8 +729,8 @@ static int kdb_defcmd(int argc, const char **argv)
kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
- defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
- GFP_KDB);
+ defcmd_set = kmalloc_array(defcmd_set_count + 1, sizeof(*defcmd_set),
+ GFP_KDB);
if (!defcmd_set)
goto fail_defcmd;
memcpy(defcmd_set, save_defcmd_set,
@@ -2706,8 +2706,11 @@ int kdb_register_flags(char *cmd,
}
if (i >= kdb_max_commands) {
- kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
- kdb_command_extend) * sizeof(*new), GFP_KDB);
+ kdbtab_t *new = kmalloc_array(kdb_max_commands -
+ KDB_BASE_CMD_MAX +
+ kdb_command_extend,
+ sizeof(*new),
+ GFP_KDB);
if (!new) {
kdb_printf("Could not allocate new kdb_command "
"table\n");
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
new file mode 100644
index 000000000000..9bd54304446f
--- /dev/null
+++ b/kernel/dma/Kconfig
@@ -0,0 +1,50 @@
+
+config HAS_DMA
+ bool
+ depends on !NO_DMA
+ default y
+
+config NEED_SG_DMA_LENGTH
+ bool
+
+config NEED_DMA_MAP_STATE
+ bool
+
+config ARCH_DMA_ADDR_T_64BIT
+ def_bool 64BIT || PHYS_ADDR_T_64BIT
+
+config HAVE_GENERIC_DMA_COHERENT
+ bool
+
+config ARCH_HAS_SYNC_DMA_FOR_DEVICE
+ bool
+
+config ARCH_HAS_SYNC_DMA_FOR_CPU
+ bool
+ select NEED_DMA_MAP_STATE
+
+config DMA_DIRECT_OPS
+ bool
+ depends on HAS_DMA
+
+config DMA_NONCOHERENT_OPS
+ bool
+ depends on HAS_DMA
+ select DMA_DIRECT_OPS
+
+config DMA_NONCOHERENT_MMAP
+ bool
+ depends on DMA_NONCOHERENT_OPS
+
+config DMA_NONCOHERENT_CACHE_SYNC
+ bool
+ depends on DMA_NONCOHERENT_OPS
+
+config DMA_VIRT_OPS
+ bool
+ depends on HAS_DMA
+
+config SWIOTLB
+ bool
+ select DMA_DIRECT_OPS
+ select NEED_DMA_MAP_STATE
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
new file mode 100644
index 000000000000..6de44e4eb454
--- /dev/null
+++ b/kernel/dma/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_HAS_DMA) += mapping.o
+obj-$(CONFIG_DMA_CMA) += contiguous.o
+obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o
+obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o
+obj-$(CONFIG_DMA_NONCOHERENT_OPS) += noncoherent.o
+obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
+obj-$(CONFIG_DMA_API_DEBUG) += debug.o
+obj-$(CONFIG_SWIOTLB) += swiotlb.o
+
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
new file mode 100644
index 000000000000..597d40893862
--- /dev/null
+++ b/kernel/dma/coherent.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Coherent per-device memory handling.
+ * Borrowed from i386
+ */
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+struct dma_coherent_mem {
+ void *virt_base;
+ dma_addr_t device_base;
+ unsigned long pfn_base;
+ int size;
+ int flags;
+ unsigned long *bitmap;
+ spinlock_t spinlock;
+ bool use_dev_dma_pfn_offset;
+};
+
+static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init;
+
+static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev)
+{
+ if (dev && dev->dma_mem)
+ return dev->dma_mem;
+ return NULL;
+}
+
+static inline dma_addr_t dma_get_device_base(struct device *dev,
+ struct dma_coherent_mem * mem)
+{
+ if (mem->use_dev_dma_pfn_offset)
+ return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT;
+ else
+ return mem->device_base;
+}
+
+static int dma_init_coherent_memory(
+ phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags,
+ struct dma_coherent_mem **mem)
+{
+ struct dma_coherent_mem *dma_mem = NULL;
+ void __iomem *mem_base = NULL;
+ int pages = size >> PAGE_SHIFT;
+ int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
+ int ret;
+
+ if (!size) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mem_base = memremap(phys_addr, size, MEMREMAP_WC);
+ if (!mem_base) {
+ ret = -EINVAL;
+ goto out;
+ }
+ dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+ if (!dma_mem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dma_mem->bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dma_mem->virt_base = mem_base;
+ dma_mem->device_base = device_addr;
+ dma_mem->pfn_base = PFN_DOWN(phys_addr);
+ dma_mem->size = pages;
+ dma_mem->flags = flags;
+ spin_lock_init(&dma_mem->spinlock);
+
+ *mem = dma_mem;
+ return 0;
+
+out:
+ kfree(dma_mem);
+ if (mem_base)
+ memunmap(mem_base);
+ return ret;
+}
+
+static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
+{
+ if (!mem)
+ return;
+
+ memunmap(mem->virt_base);
+ kfree(mem->bitmap);
+ kfree(mem);
+}
+
+static int dma_assign_coherent_memory(struct device *dev,
+ struct dma_coherent_mem *mem)
+{
+ if (!dev)
+ return -ENODEV;
+
+ if (dev->dma_mem)
+ return -EBUSY;
+
+ dev->dma_mem = mem;
+ return 0;
+}
+
+int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
+ dma_addr_t device_addr, size_t size, int flags)
+{
+ struct dma_coherent_mem *mem;
+ int ret;
+
+ ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem);
+ if (ret)
+ return ret;
+
+ ret = dma_assign_coherent_memory(dev, mem);
+ if (ret)
+ dma_release_coherent_memory(mem);
+ return ret;
+}
+EXPORT_SYMBOL(dma_declare_coherent_memory);
+
+void dma_release_declared_memory(struct device *dev)
+{
+ struct dma_coherent_mem *mem = dev->dma_mem;
+
+ if (!mem)
+ return;
+ dma_release_coherent_memory(mem);
+ dev->dma_mem = NULL;
+}
+EXPORT_SYMBOL(dma_release_declared_memory);
+
+void *dma_mark_declared_memory_occupied(struct device *dev,
+ dma_addr_t device_addr, size_t size)
+{
+ struct dma_coherent_mem *mem = dev->dma_mem;
+ unsigned long flags;
+ int pos, err;
+
+ size += device_addr & ~PAGE_MASK;
+
+ if (!mem)
+ return ERR_PTR(-EINVAL);
+
+ spin_lock_irqsave(&mem->spinlock, flags);
+ pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem));
+ err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
+ spin_unlock_irqrestore(&mem->spinlock, flags);
+
+ if (err != 0)
+ return ERR_PTR(err);
+ return mem->virt_base + (pos << PAGE_SHIFT);
+}
+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+
+static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
+ ssize_t size, dma_addr_t *dma_handle)
+{
+ int order = get_order(size);
+ unsigned long flags;
+ int pageno;
+ void *ret;
+
+ spin_lock_irqsave(&mem->spinlock, flags);
+
+ if (unlikely(size > (mem->size << PAGE_SHIFT)))
+ goto err;
+
+ pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
+ if (unlikely(pageno < 0))
+ goto err;
+
+ /*
+ * Memory was found in the coherent area.
+ */
+ *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
+ ret = mem->virt_base + (pageno << PAGE_SHIFT);
+ spin_unlock_irqrestore(&mem->spinlock, flags);
+ memset(ret, 0, size);
+ return ret;
+err:
+ spin_unlock_irqrestore(&mem->spinlock, flags);
+ return NULL;
+}
+
+/**
+ * dma_alloc_from_dev_coherent() - allocate memory from device coherent pool
+ * @dev: device from which we allocate memory
+ * @size: size of requested memory area
+ * @dma_handle: This will be filled with the correct dma handle
+ * @ret: This pointer will be filled with the virtual address
+ * to allocated area.
+ *
+ * This function should be only called from per-arch dma_alloc_coherent()
+ * to support allocation from per-device coherent memory pools.
+ *
+ * Returns 0 if dma_alloc_coherent should continue with allocating from
+ * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
+ */
+int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle, void **ret)
+{
+ struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+
+ if (!mem)
+ return 0;
+
+ *ret = __dma_alloc_from_coherent(mem, size, dma_handle);
+ if (*ret)
+ return 1;
+
+ /*
+ * In the case where the allocation can not be satisfied from the
+ * per-device area, try to fall back to generic memory if the
+ * constraints allow it.
+ */
+ return mem->flags & DMA_MEMORY_EXCLUSIVE;
+}
+EXPORT_SYMBOL(dma_alloc_from_dev_coherent);
+
+void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
+{
+ if (!dma_coherent_default_memory)
+ return NULL;
+
+ return __dma_alloc_from_coherent(dma_coherent_default_memory, size,
+ dma_handle);
+}
+
+static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
+ int order, void *vaddr)
+{
+ if (mem && vaddr >= mem->virt_base && vaddr <
+ (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+ unsigned long flags;
+
+ spin_lock_irqsave(&mem->spinlock, flags);
+ bitmap_release_region(mem->bitmap, page, order);
+ spin_unlock_irqrestore(&mem->spinlock, flags);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * dma_release_from_dev_coherent() - free memory to device coherent memory pool
+ * @dev: device from which the memory was allocated
+ * @order: the order of pages allocated
+ * @vaddr: virtual address of allocated pages
+ *
+ * This checks whether the memory was allocated from the per-device
+ * coherent memory pool and if so, releases that memory.
+ *
+ * Returns 1 if we correctly released the memory, or 0 if the caller should
+ * proceed with releasing memory from generic pools.
+ */
+int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
+{
+ struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+
+ return __dma_release_from_coherent(mem, order, vaddr);
+}
+EXPORT_SYMBOL(dma_release_from_dev_coherent);
+
+int dma_release_from_global_coherent(int order, void *vaddr)
+{
+ if (!dma_coherent_default_memory)
+ return 0;
+
+ return __dma_release_from_coherent(dma_coherent_default_memory, order,
+ vaddr);
+}
+
+static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
+ struct vm_area_struct *vma, void *vaddr, size_t size, int *ret)
+{
+ if (mem && vaddr >= mem->virt_base && vaddr + size <=
+ (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ unsigned long off = vma->vm_pgoff;
+ int start = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+ int user_count = vma_pages(vma);
+ int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ *ret = -ENXIO;
+ if (off < count && user_count <= count - off) {
+ unsigned long pfn = mem->pfn_base + start + off;
+ *ret = remap_pfn_range(vma, vma->vm_start, pfn,
+ user_count << PAGE_SHIFT,
+ vma->vm_page_prot);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * dma_mmap_from_dev_coherent() - mmap memory from the device coherent pool
+ * @dev: device from which the memory was allocated
+ * @vma: vm_area for the userspace memory
+ * @vaddr: cpu address returned by dma_alloc_from_dev_coherent
+ * @size: size of the memory buffer allocated
+ * @ret: result from remap_pfn_range()
+ *
+ * This checks whether the memory was allocated from the per-device
+ * coherent memory pool and if so, maps that memory to the provided vma.
+ *
+ * Returns 1 if @vaddr belongs to the device coherent pool and the caller
+ * should return @ret, or 0 if they should proceed with mapping memory from
+ * generic areas.
+ */
+int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
+ void *vaddr, size_t size, int *ret)
+{
+ struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+
+ return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
+}
+EXPORT_SYMBOL(dma_mmap_from_dev_coherent);
+
+int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
+ size_t size, int *ret)
+{
+ if (!dma_coherent_default_memory)
+ return 0;
+
+ return __dma_mmap_from_coherent(dma_coherent_default_memory, vma,
+ vaddr, size, ret);
+}
+
+/*
+ * Support for reserved memory regions defined in device tree
+ */
+#ifdef CONFIG_OF_RESERVED_MEM
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+
+static struct reserved_mem *dma_reserved_default_memory __initdata;
+
+static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
+{
+ struct dma_coherent_mem *mem = rmem->priv;
+ int ret;
+
+ if (!mem) {
+ ret = dma_init_coherent_memory(rmem->base, rmem->base,
+ rmem->size,
+ DMA_MEMORY_EXCLUSIVE, &mem);
+ if (ret) {
+ pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n",
+ &rmem->base, (unsigned long)rmem->size / SZ_1M);
+ return ret;
+ }
+ }
+ mem->use_dev_dma_pfn_offset = true;
+ rmem->priv = mem;
+ dma_assign_coherent_memory(dev, mem);
+ return 0;
+}
+
+static void rmem_dma_device_release(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ if (dev)
+ dev->dma_mem = NULL;
+}
+
+static const struct reserved_mem_ops rmem_dma_ops = {
+ .device_init = rmem_dma_device_init,
+ .device_release = rmem_dma_device_release,
+};
+
+static int __init rmem_dma_setup(struct reserved_mem *rmem)
+{
+ unsigned long node = rmem->fdt_node;
+
+ if (of_get_flat_dt_prop(node, "reusable", NULL))
+ return -EINVAL;
+
+#ifdef CONFIG_ARM
+ if (!of_get_flat_dt_prop(node, "no-map", NULL)) {
+ pr_err("Reserved memory: regions without no-map are not yet supported\n");
+ return -EINVAL;
+ }
+
+ if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) {
+ WARN(dma_reserved_default_memory,
+ "Reserved memory: region for default DMA coherent area is redefined\n");
+ dma_reserved_default_memory = rmem;
+ }
+#endif
+
+ rmem->ops = &rmem_dma_ops;
+ pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n",
+ &rmem->base, (unsigned long)rmem->size / SZ_1M);
+ return 0;
+}
+
+static int __init dma_init_reserved_memory(void)
+{
+ const struct reserved_mem_ops *ops;
+ int ret;
+
+ if (!dma_reserved_default_memory)
+ return -ENOMEM;
+
+ ops = dma_reserved_default_memory->ops;
+
+ /*
+ * We rely on rmem_dma_device_init() does not propagate error of
+ * dma_assign_coherent_memory() for "NULL" device.
+ */
+ ret = ops->device_init(dma_reserved_default_memory, NULL);
+
+ if (!ret) {
+ dma_coherent_default_memory = dma_reserved_default_memory->priv;
+ pr_info("DMA: default coherent area is set\n");
+ }
+
+ return ret;
+}
+
+core_initcall(dma_init_reserved_memory);
+
+RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup);
+#endif
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
new file mode 100644
index 000000000000..d987dcd1bd56
--- /dev/null
+++ b/kernel/dma/contiguous.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski@samsung.com>
+ * Michal Nazarewicz <mina86@mina86.com>
+ */
+
+#define pr_fmt(fmt) "cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+# define DEBUG
+#endif
+#endif
+
+#include <asm/page.h>
+#include <asm/dma-contiguous.h>
+
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/sizes.h>
+#include <linux/dma-contiguous.h>
+#include <linux/cma.h>
+
+#ifdef CONFIG_CMA_SIZE_MBYTES
+#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
+#else
+#define CMA_SIZE_MBYTES 0
+#endif
+
+struct cma *dma_contiguous_default_area;
+
+/*
+ * Default global CMA area size can be defined in kernel's .config.
+ * This is useful mainly for distro maintainers to create a kernel
+ * that works correctly for most supported systems.
+ * The size can be set in bytes or as a percentage of the total memory
+ * in the system.
+ *
+ * Users, who want to set the size of global CMA area for their system
+ * should use cma= kernel parameter.
+ */
+static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
+static phys_addr_t size_cmdline = -1;
+static phys_addr_t base_cmdline;
+static phys_addr_t limit_cmdline;
+
+static int __init early_cma(char *p)
+{
+ pr_debug("%s(%s)\n", __func__, p);
+ size_cmdline = memparse(p, &p);
+ if (*p != '@')
+ return 0;
+ base_cmdline = memparse(p + 1, &p);
+ if (*p != '-') {
+ limit_cmdline = base_cmdline + size_cmdline;
+ return 0;
+ }
+ limit_cmdline = memparse(p + 1, &p);
+
+ return 0;
+}
+early_param("cma", early_cma);
+
+#ifdef CONFIG_CMA_SIZE_PERCENTAGE
+
+static phys_addr_t __init __maybe_unused cma_early_percent_memory(void)
+{
+ struct memblock_region *reg;
+ unsigned long total_pages = 0;
+
+ /*
+ * We cannot use memblock_phys_mem_size() here, because
+ * memblock_analyze() has not been called yet.
+ */
+ for_each_memblock(memory, reg)
+ total_pages += memblock_region_memory_end_pfn(reg) -
+ memblock_region_memory_base_pfn(reg);
+
+ return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
+}
+
+#else
+
+static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
+{
+ return 0;
+}
+
+#endif
+
+/**
+ * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
+ */
+void __init dma_contiguous_reserve(phys_addr_t limit)
+{
+ phys_addr_t selected_size = 0;
+ phys_addr_t selected_base = 0;
+ phys_addr_t selected_limit = limit;
+ bool fixed = false;
+
+ pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
+
+ if (size_cmdline != -1) {
+ selected_size = size_cmdline;
+ selected_base = base_cmdline;
+ selected_limit = min_not_zero(limit_cmdline, limit);
+ if (base_cmdline + size_cmdline == limit_cmdline)
+ fixed = true;
+ } else {
+#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
+ selected_size = size_bytes;
+#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
+ selected_size = cma_early_percent_memory();
+#elif defined(CONFIG_CMA_SIZE_SEL_MIN)
+ selected_size = min(size_bytes, cma_early_percent_memory());
+#elif defined(CONFIG_CMA_SIZE_SEL_MAX)
+ selected_size = max(size_bytes, cma_early_percent_memory());
+#endif
+ }
+
+ if (selected_size && !dma_contiguous_default_area) {
+ pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+ (unsigned long)selected_size / SZ_1M);
+
+ dma_contiguous_reserve_area(selected_size, selected_base,
+ selected_limit,
+ &dma_contiguous_default_area,
+ fixed);
+ }
+}
+
+/**
+ * dma_contiguous_reserve_area() - reserve custom contiguous area
+ * @size: Size of the reserved area (in bytes),
+ * @base: Base address of the reserved area optional, use 0 for any
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ * @res_cma: Pointer to store the created cma region.
+ * @fixed: hint about where to place the reserved area
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory. This function allows to create custom reserved areas for specific
+ * devices.
+ *
+ * If @fixed is true, reserve contiguous area at exactly @base. If false,
+ * reserve in range from @base to @limit.
+ */
+int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
+ phys_addr_t limit, struct cma **res_cma,
+ bool fixed)
+{
+ int ret;
+
+ ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed,
+ "reserved", res_cma);
+ if (ret)
+ return ret;
+
+ /* Architecture specific contiguous memory fixup. */
+ dma_contiguous_early_fixup(cma_get_base(*res_cma),
+ cma_get_size(*res_cma));
+
+ return 0;
+}
+
+/**
+ * dma_alloc_from_contiguous() - allocate pages from contiguous area
+ * @dev: Pointer to device for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ * @gfp_mask: GFP flags to use for this allocation.
+ *
+ * This function allocates memory buffer for specified device. It uses
+ * device specific contiguous memory area if available or the default
+ * global one. Requires architecture specific dev_get_cma_area() helper
+ * function.
+ */
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
+ unsigned int align, gfp_t gfp_mask)
+{
+ if (align > CONFIG_CMA_ALIGNMENT)
+ align = CONFIG_CMA_ALIGNMENT;
+
+ return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask);
+}
+
+/**
+ * dma_release_from_contiguous() - release allocated pages
+ * @dev: Pointer to device for which the pages were allocated.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_from_contiguous().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ return cma_release(dev_get_cma_area(dev), pages, count);
+}
+
+/*
+ * Support for reserved memory regions defined in device tree
+ */
+#ifdef CONFIG_OF_RESERVED_MEM
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev)
+{
+ dev_set_cma_area(dev, rmem->priv);
+ return 0;
+}
+
+static void rmem_cma_device_release(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ dev_set_cma_area(dev, NULL);
+}
+
+static const struct reserved_mem_ops rmem_cma_ops = {
+ .device_init = rmem_cma_device_init,
+ .device_release = rmem_cma_device_release,
+};
+
+static int __init rmem_cma_setup(struct reserved_mem *rmem)
+{
+ phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
+ phys_addr_t mask = align - 1;
+ unsigned long node = rmem->fdt_node;
+ struct cma *cma;
+ int err;
+
+ if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
+ of_get_flat_dt_prop(node, "no-map", NULL))
+ return -EINVAL;
+
+ if ((rmem->base & mask) || (rmem->size & mask)) {
+ pr_err("Reserved memory: incorrect alignment of CMA region\n");
+ return -EINVAL;
+ }
+
+ err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma);
+ if (err) {
+ pr_err("Reserved memory: unable to setup CMA region\n");
+ return err;
+ }
+ /* Architecture specific contiguous memory fixup. */
+ dma_contiguous_early_fixup(rmem->base, rmem->size);
+
+ if (of_get_flat_dt_prop(node, "linux,cma-default", NULL))
+ dma_contiguous_set_default(cma);
+
+ rmem->ops = &rmem_cma_ops;
+ rmem->priv = cma;
+
+ pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n",
+ &rmem->base, (unsigned long)rmem->size / SZ_1M);
+
+ return 0;
+}
+RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup);
+#endif
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
new file mode 100644
index 000000000000..c007d25bee09
--- /dev/null
+++ b/kernel/dma/debug.c
@@ -0,0 +1,1773 @@
+/*
+ * Copyright (C) 2008 Advanced Micro Devices, Inc.
+ *
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/sched/task_stack.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched/task.h>
+#include <linux/stacktrace.h>
+#include <linux/dma-debug.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <linux/device.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <asm/sections.h>
+
+#define HASH_SIZE 1024ULL
+#define HASH_FN_SHIFT 13
+#define HASH_FN_MASK (HASH_SIZE - 1)
+
+/* allow architectures to override this if absolutely required */
+#ifndef PREALLOC_DMA_DEBUG_ENTRIES
+#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
+#endif
+
+enum {
+ dma_debug_single,
+ dma_debug_page,
+ dma_debug_sg,
+ dma_debug_coherent,
+ dma_debug_resource,
+};
+
+enum map_err_types {
+ MAP_ERR_CHECK_NOT_APPLICABLE,
+ MAP_ERR_NOT_CHECKED,
+ MAP_ERR_CHECKED,
+};
+
+#define DMA_DEBUG_STACKTRACE_ENTRIES 5
+
+/**
+ * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping
+ * @list: node on pre-allocated free_entries list
+ * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent
+ * @type: single, page, sg, coherent
+ * @pfn: page frame of the start address
+ * @offset: offset of mapping relative to pfn
+ * @size: length of the mapping
+ * @direction: enum dma_data_direction
+ * @sg_call_ents: 'nents' from dma_map_sg
+ * @sg_mapped_ents: 'mapped_ents' from dma_map_sg
+ * @map_err_type: track whether dma_mapping_error() was checked
+ * @stacktrace: support backtraces when a violation is detected
+ */
+struct dma_debug_entry {
+ struct list_head list;
+ struct device *dev;
+ int type;
+ unsigned long pfn;
+ size_t offset;
+ u64 dev_addr;
+ u64 size;
+ int direction;
+ int sg_call_ents;
+ int sg_mapped_ents;
+ enum map_err_types map_err_type;
+#ifdef CONFIG_STACKTRACE
+ struct stack_trace stacktrace;
+ unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
+#endif
+};
+
+typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *);
+
+struct hash_bucket {
+ struct list_head list;
+ spinlock_t lock;
+} ____cacheline_aligned_in_smp;
+
+/* Hash list to save the allocated dma addresses */
+static struct hash_bucket dma_entry_hash[HASH_SIZE];
+/* List of pre-allocated dma_debug_entry's */
+static LIST_HEAD(free_entries);
+/* Lock for the list above */
+static DEFINE_SPINLOCK(free_entries_lock);
+
+/* Global disable flag - will be set in case of an error */
+static bool global_disable __read_mostly;
+
+/* Early initialization disable flag, set at the end of dma_debug_init */
+static bool dma_debug_initialized __read_mostly;
+
+static inline bool dma_debug_disabled(void)
+{
+ return global_disable || !dma_debug_initialized;
+}
+
+/* Global error count */
+static u32 error_count;
+
+/* Global error show enable*/
+static u32 show_all_errors __read_mostly;
+/* Number of errors to show */
+static u32 show_num_errors = 1;
+
+static u32 num_free_entries;
+static u32 min_free_entries;
+static u32 nr_total_entries;
+
+/* number of preallocated entries requested by kernel cmdline */
+static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
+
+/* debugfs dentry's for the stuff above */
+static struct dentry *dma_debug_dent __read_mostly;
+static struct dentry *global_disable_dent __read_mostly;
+static struct dentry *error_count_dent __read_mostly;
+static struct dentry *show_all_errors_dent __read_mostly;
+static struct dentry *show_num_errors_dent __read_mostly;
+static struct dentry *num_free_entries_dent __read_mostly;
+static struct dentry *min_free_entries_dent __read_mostly;
+static struct dentry *filter_dent __read_mostly;
+
+/* per-driver filter related state */
+
+#define NAME_MAX_LEN 64
+
+static char current_driver_name[NAME_MAX_LEN] __read_mostly;
+static struct device_driver *current_driver __read_mostly;
+
+static DEFINE_RWLOCK(driver_name_lock);
+
+static const char *const maperr2str[] = {
+ [MAP_ERR_CHECK_NOT_APPLICABLE] = "dma map error check not applicable",
+ [MAP_ERR_NOT_CHECKED] = "dma map error not checked",
+ [MAP_ERR_CHECKED] = "dma map error checked",
+};
+
+static const char *type2name[5] = { "single", "page",
+ "scather-gather", "coherent",
+ "resource" };
+
+static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
+ "DMA_FROM_DEVICE", "DMA_NONE" };
+
+/*
+ * The access to some variables in this macro is racy. We can't use atomic_t
+ * here because all these variables are exported to debugfs. Some of them even
+ * writeable. This is also the reason why a lock won't help much. But anyway,
+ * the races are no big deal. Here is why:
+ *
+ * error_count: the addition is racy, but the worst thing that can happen is
+ * that we don't count some errors
+ * show_num_errors: the subtraction is racy. Also no big deal because in
+ * worst case this will result in one warning more in the
+ * system log than the user configured. This variable is
+ * writeable via debugfs.
+ */
+static inline void dump_entry_trace(struct dma_debug_entry *entry)
+{
+#ifdef CONFIG_STACKTRACE
+ if (entry) {
+ pr_warning("Mapped at:\n");
+ print_stack_trace(&entry->stacktrace, 0);
+ }
+#endif
+}
+
+static bool driver_filter(struct device *dev)
+{
+ struct device_driver *drv;
+ unsigned long flags;
+ bool ret;
+
+ /* driver filter off */
+ if (likely(!current_driver_name[0]))
+ return true;
+
+ /* driver filter on and initialized */
+ if (current_driver && dev && dev->driver == current_driver)
+ return true;
+
+ /* driver filter on, but we can't filter on a NULL device... */
+ if (!dev)
+ return false;
+
+ if (current_driver || !current_driver_name[0])
+ return false;
+
+ /* driver filter on but not yet initialized */
+ drv = dev->driver;
+ if (!drv)
+ return false;
+
+ /* lock to protect against change of current_driver_name */
+ read_lock_irqsave(&driver_name_lock, flags);
+
+ ret = false;
+ if (drv->name &&
+ strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) {
+ current_driver = drv;
+ ret = true;
+ }
+
+ read_unlock_irqrestore(&driver_name_lock, flags);
+
+ return ret;
+}
+
+#define err_printk(dev, entry, format, arg...) do { \
+ error_count += 1; \
+ if (driver_filter(dev) && \
+ (show_all_errors || show_num_errors > 0)) { \
+ WARN(1, "%s %s: " format, \
+ dev ? dev_driver_string(dev) : "NULL", \
+ dev ? dev_name(dev) : "NULL", ## arg); \
+ dump_entry_trace(entry); \
+ } \
+ if (!show_all_errors && show_num_errors > 0) \
+ show_num_errors -= 1; \
+ } while (0);
+
+/*
+ * Hash related functions
+ *
+ * Every DMA-API request is saved into a struct dma_debug_entry. To
+ * have quick access to these structs they are stored into a hash.
+ */
+static int hash_fn(struct dma_debug_entry *entry)
+{
+ /*
+ * Hash function is based on the dma address.
+ * We use bits 20-27 here as the index into the hash
+ */
+ return (entry->dev_addr >> HASH_FN_SHIFT) & HASH_FN_MASK;
+}
+
+/*
+ * Request exclusive access to a hash bucket for a given dma_debug_entry.
+ */
+static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry,
+ unsigned long *flags)
+ __acquires(&dma_entry_hash[idx].lock)
+{
+ int idx = hash_fn(entry);
+ unsigned long __flags;
+
+ spin_lock_irqsave(&dma_entry_hash[idx].lock, __flags);
+ *flags = __flags;
+ return &dma_entry_hash[idx];
+}
+
+/*
+ * Give up exclusive access to the hash bucket
+ */
+static void put_hash_bucket(struct hash_bucket *bucket,
+ unsigned long *flags)
+ __releases(&bucket->lock)
+{
+ unsigned long __flags = *flags;
+
+ spin_unlock_irqrestore(&bucket->lock, __flags);
+}
+
+static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b)
+{
+ return ((a->dev_addr == b->dev_addr) &&
+ (a->dev == b->dev)) ? true : false;
+}
+
+static bool containing_match(struct dma_debug_entry *a,
+ struct dma_debug_entry *b)
+{
+ if (a->dev != b->dev)
+ return false;
+
+ if ((b->dev_addr <= a->dev_addr) &&
+ ((b->dev_addr + b->size) >= (a->dev_addr + a->size)))
+ return true;
+
+ return false;
+}
+
+/*
+ * Search a given entry in the hash bucket list
+ */
+static struct dma_debug_entry *__hash_bucket_find(struct hash_bucket *bucket,
+ struct dma_debug_entry *ref,
+ match_fn match)
+{
+ struct dma_debug_entry *entry, *ret = NULL;
+ int matches = 0, match_lvl, last_lvl = -1;
+
+ list_for_each_entry(entry, &bucket->list, list) {
+ if (!match(ref, entry))
+ continue;
+
+ /*
+ * Some drivers map the same physical address multiple
+ * times. Without a hardware IOMMU this results in the
+ * same device addresses being put into the dma-debug
+ * hash multiple times too. This can result in false
+ * positives being reported. Therefore we implement a
+ * best-fit algorithm here which returns the entry from
+ * the hash which fits best to the reference value
+ * instead of the first-fit.
+ */
+ matches += 1;
+ match_lvl = 0;
+ entry->size == ref->size ? ++match_lvl : 0;
+ entry->type == ref->type ? ++match_lvl : 0;
+ entry->direction == ref->direction ? ++match_lvl : 0;
+ entry->sg_call_ents == ref->sg_call_ents ? ++match_lvl : 0;
+
+ if (match_lvl == 4) {
+ /* perfect-fit - return the result */
+ return entry;
+ } else if (match_lvl > last_lvl) {
+ /*
+ * We found an entry that fits better then the
+ * previous one or it is the 1st match.
+ */
+ last_lvl = match_lvl;
+ ret = entry;
+ }
+ }
+
+ /*
+ * If we have multiple matches but no perfect-fit, just return
+ * NULL.
+ */
+ ret = (matches == 1) ? ret : NULL;
+
+ return ret;
+}
+
+static struct dma_debug_entry *bucket_find_exact(struct hash_bucket *bucket,
+ struct dma_debug_entry *ref)
+{
+ return __hash_bucket_find(bucket, ref, exact_match);
+}
+
+static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
+ struct dma_debug_entry *ref,
+ unsigned long *flags)
+{
+
+ unsigned int max_range = dma_get_max_seg_size(ref->dev);
+ struct dma_debug_entry *entry, index = *ref;
+ unsigned int range = 0;
+
+ while (range <= max_range) {
+ entry = __hash_bucket_find(*bucket, ref, containing_match);
+
+ if (entry)
+ return entry;
+
+ /*
+ * Nothing found, go back a hash bucket
+ */
+ put_hash_bucket(*bucket, flags);
+ range += (1 << HASH_FN_SHIFT);
+ index.dev_addr -= (1 << HASH_FN_SHIFT);
+ *bucket = get_hash_bucket(&index, flags);
+ }
+
+ return NULL;
+}
+
+/*
+ * Add an entry to a hash bucket
+ */
+static void hash_bucket_add(struct hash_bucket *bucket,
+ struct dma_debug_entry *entry)
+{
+ list_add_tail(&entry->list, &bucket->list);
+}
+
+/*
+ * Remove entry from a hash bucket list
+ */
+static void hash_bucket_del(struct dma_debug_entry *entry)
+{
+ list_del(&entry->list);
+}
+
+static unsigned long long phys_addr(struct dma_debug_entry *entry)
+{
+ if (entry->type == dma_debug_resource)
+ return __pfn_to_phys(entry->pfn) + entry->offset;
+
+ return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset;
+}
+
+/*
+ * Dump mapping entries for debugging purposes
+ */
+void debug_dma_dump_mappings(struct device *dev)
+{
+ int idx;
+
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bucket->lock, flags);
+
+ list_for_each_entry(entry, &bucket->list, list) {
+ if (!dev || dev == entry->dev) {
+ dev_info(entry->dev,
+ "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n",
+ type2name[entry->type], idx,
+ phys_addr(entry), entry->pfn,
+ entry->dev_addr, entry->size,
+ dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ }
+
+ spin_unlock_irqrestore(&bucket->lock, flags);
+ }
+}
+
+/*
+ * For each mapping (initial cacheline in the case of
+ * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a
+ * scatterlist, or the cacheline specified in dma_map_single) insert
+ * into this tree using the cacheline as the key. At
+ * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If
+ * the entry already exists at insertion time add a tag as a reference
+ * count for the overlapping mappings. For now, the overlap tracking
+ * just ensures that 'unmaps' balance 'maps' before marking the
+ * cacheline idle, but we should also be flagging overlaps as an API
+ * violation.
+ *
+ * Memory usage is mostly constrained by the maximum number of available
+ * dma-debug entries in that we need a free dma_debug_entry before
+ * inserting into the tree. In the case of dma_map_page and
+ * dma_alloc_coherent there is only one dma_debug_entry and one
+ * dma_active_cacheline entry to track per event. dma_map_sg(), on the
+ * other hand, consumes a single dma_debug_entry, but inserts 'nents'
+ * entries into the tree.
+ *
+ * At any time debug_dma_assert_idle() can be called to trigger a
+ * warning if any cachelines in the given page are in the active set.
+ */
+static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
+static DEFINE_SPINLOCK(radix_lock);
+#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
+#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT)
+#define CACHELINES_PER_PAGE (1 << CACHELINE_PER_PAGE_SHIFT)
+
+static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry)
+{
+ return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) +
+ (entry->offset >> L1_CACHE_SHIFT);
+}
+
+static int active_cacheline_read_overlap(phys_addr_t cln)
+{
+ int overlap = 0, i;
+
+ for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--)
+ if (radix_tree_tag_get(&dma_active_cacheline, cln, i))
+ overlap |= 1 << i;
+ return overlap;
+}
+
+static int active_cacheline_set_overlap(phys_addr_t cln, int overlap)
+{
+ int i;
+
+ if (overlap > ACTIVE_CACHELINE_MAX_OVERLAP || overlap < 0)
+ return overlap;
+
+ for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--)
+ if (overlap & 1 << i)
+ radix_tree_tag_set(&dma_active_cacheline, cln, i);
+ else
+ radix_tree_tag_clear(&dma_active_cacheline, cln, i);
+
+ return overlap;
+}
+
+static void active_cacheline_inc_overlap(phys_addr_t cln)
+{
+ int overlap = active_cacheline_read_overlap(cln);
+
+ overlap = active_cacheline_set_overlap(cln, ++overlap);
+
+ /* If we overflowed the overlap counter then we're potentially
+ * leaking dma-mappings. Otherwise, if maps and unmaps are
+ * balanced then this overflow may cause false negatives in
+ * debug_dma_assert_idle() as the cacheline may be marked idle
+ * prematurely.
+ */
+ WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
+ "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n",
+ ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
+}
+
+static int active_cacheline_dec_overlap(phys_addr_t cln)
+{
+ int overlap = active_cacheline_read_overlap(cln);
+
+ return active_cacheline_set_overlap(cln, --overlap);
+}
+
+static int active_cacheline_insert(struct dma_debug_entry *entry)
+{
+ phys_addr_t cln = to_cacheline_number(entry);
+ unsigned long flags;
+ int rc;
+
+ /* If the device is not writing memory then we don't have any
+ * concerns about the cpu consuming stale data. This mitigates
+ * legitimate usages of overlapping mappings.
+ */
+ if (entry->direction == DMA_TO_DEVICE)
+ return 0;
+
+ spin_lock_irqsave(&radix_lock, flags);
+ rc = radix_tree_insert(&dma_active_cacheline, cln, entry);
+ if (rc == -EEXIST)
+ active_cacheline_inc_overlap(cln);
+ spin_unlock_irqrestore(&radix_lock, flags);
+
+ return rc;
+}
+
+static void active_cacheline_remove(struct dma_debug_entry *entry)
+{
+ phys_addr_t cln = to_cacheline_number(entry);
+ unsigned long flags;
+
+ /* ...mirror the insert case */
+ if (entry->direction == DMA_TO_DEVICE)
+ return;
+
+ spin_lock_irqsave(&radix_lock, flags);
+ /* since we are counting overlaps the final put of the
+ * cacheline will occur when the overlap count is 0.
+ * active_cacheline_dec_overlap() returns -1 in that case
+ */
+ if (active_cacheline_dec_overlap(cln) < 0)
+ radix_tree_delete(&dma_active_cacheline, cln);
+ spin_unlock_irqrestore(&radix_lock, flags);
+}
+
+/**
+ * debug_dma_assert_idle() - assert that a page is not undergoing dma
+ * @page: page to lookup in the dma_active_cacheline tree
+ *
+ * Place a call to this routine in cases where the cpu touching the page
+ * before the dma completes (page is dma_unmapped) will lead to data
+ * corruption.
+ */
+void debug_dma_assert_idle(struct page *page)
+{
+ static struct dma_debug_entry *ents[CACHELINES_PER_PAGE];
+ struct dma_debug_entry *entry = NULL;
+ void **results = (void **) &ents;
+ unsigned int nents, i;
+ unsigned long flags;
+ phys_addr_t cln;
+
+ if (dma_debug_disabled())
+ return;
+
+ if (!page)
+ return;
+
+ cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT;
+ spin_lock_irqsave(&radix_lock, flags);
+ nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln,
+ CACHELINES_PER_PAGE);
+ for (i = 0; i < nents; i++) {
+ phys_addr_t ent_cln = to_cacheline_number(ents[i]);
+
+ if (ent_cln == cln) {
+ entry = ents[i];
+ break;
+ } else if (ent_cln >= cln + CACHELINES_PER_PAGE)
+ break;
+ }
+ spin_unlock_irqrestore(&radix_lock, flags);
+
+ if (!entry)
+ return;
+
+ cln = to_cacheline_number(entry);
+ err_printk(entry->dev, entry,
+ "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n",
+ &cln);
+}
+
+/*
+ * Wrapper function for adding an entry to the hash.
+ * This function takes care of locking itself.
+ */
+static void add_dma_entry(struct dma_debug_entry *entry)
+{
+ struct hash_bucket *bucket;
+ unsigned long flags;
+ int rc;
+
+ bucket = get_hash_bucket(entry, &flags);
+ hash_bucket_add(bucket, entry);
+ put_hash_bucket(bucket, &flags);
+
+ rc = active_cacheline_insert(entry);
+ if (rc == -ENOMEM) {
+ pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n");
+ global_disable = true;
+ }
+
+ /* TODO: report -EEXIST errors here as overlapping mappings are
+ * not supported by the DMA API
+ */
+}
+
+static struct dma_debug_entry *__dma_entry_alloc(void)
+{
+ struct dma_debug_entry *entry;
+
+ entry = list_entry(free_entries.next, struct dma_debug_entry, list);
+ list_del(&entry->list);
+ memset(entry, 0, sizeof(*entry));
+
+ num_free_entries -= 1;
+ if (num_free_entries < min_free_entries)
+ min_free_entries = num_free_entries;
+
+ return entry;
+}
+
+/* struct dma_entry allocator
+ *
+ * The next two functions implement the allocator for
+ * struct dma_debug_entries.
+ */
+static struct dma_debug_entry *dma_entry_alloc(void)
+{
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&free_entries_lock, flags);
+
+ if (list_empty(&free_entries)) {
+ global_disable = true;
+ spin_unlock_irqrestore(&free_entries_lock, flags);
+ pr_err("DMA-API: debugging out of memory - disabling\n");
+ return NULL;
+ }
+
+ entry = __dma_entry_alloc();
+
+ spin_unlock_irqrestore(&free_entries_lock, flags);
+
+#ifdef CONFIG_STACKTRACE
+ entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
+ entry->stacktrace.entries = entry->st_entries;
+ entry->stacktrace.skip = 2;
+ save_stack_trace(&entry->stacktrace);
+#endif
+
+ return entry;
+}
+
+static void dma_entry_free(struct dma_debug_entry *entry)
+{
+ unsigned long flags;
+
+ active_cacheline_remove(entry);
+
+ /*
+ * add to beginning of the list - this way the entries are
+ * more likely cache hot when they are reallocated.
+ */
+ spin_lock_irqsave(&free_entries_lock, flags);
+ list_add(&entry->list, &free_entries);
+ num_free_entries += 1;
+ spin_unlock_irqrestore(&free_entries_lock, flags);
+}
+
+int dma_debug_resize_entries(u32 num_entries)
+{
+ int i, delta, ret = 0;
+ unsigned long flags;
+ struct dma_debug_entry *entry;
+ LIST_HEAD(tmp);
+
+ spin_lock_irqsave(&free_entries_lock, flags);
+
+ if (nr_total_entries < num_entries) {
+ delta = num_entries - nr_total_entries;
+
+ spin_unlock_irqrestore(&free_entries_lock, flags);
+
+ for (i = 0; i < delta; i++) {
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ break;
+
+ list_add_tail(&entry->list, &tmp);
+ }
+
+ spin_lock_irqsave(&free_entries_lock, flags);
+
+ list_splice(&tmp, &free_entries);
+ nr_total_entries += i;
+ num_free_entries += i;
+ } else {
+ delta = nr_total_entries - num_entries;
+
+ for (i = 0; i < delta && !list_empty(&free_entries); i++) {
+ entry = __dma_entry_alloc();
+ kfree(entry);
+ }
+
+ nr_total_entries -= i;
+ }
+
+ if (nr_total_entries != num_entries)
+ ret = 1;
+
+ spin_unlock_irqrestore(&free_entries_lock, flags);
+
+ return ret;
+}
+
+/*
+ * DMA-API debugging init code
+ *
+ * The init code does two things:
+ * 1. Initialize core data structures
+ * 2. Preallocate a given number of dma_debug_entry structs
+ */
+
+static int prealloc_memory(u32 num_entries)
+{
+ struct dma_debug_entry *entry, *next_entry;
+ int i;
+
+ for (i = 0; i < num_entries; ++i) {
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto out_err;
+
+ list_add_tail(&entry->list, &free_entries);
+ }
+
+ num_free_entries = num_entries;
+ min_free_entries = num_entries;
+
+ pr_info("DMA-API: preallocated %d debug entries\n", num_entries);
+
+ return 0;
+
+out_err:
+
+ list_for_each_entry_safe(entry, next_entry, &free_entries, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+
+ return -ENOMEM;
+}
+
+static ssize_t filter_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[NAME_MAX_LEN + 1];
+ unsigned long flags;
+ int len;
+
+ if (!current_driver_name[0])
+ return 0;
+
+ /*
+ * We can't copy to userspace directly because current_driver_name can
+ * only be read under the driver_name_lock with irqs disabled. So
+ * create a temporary copy first.
+ */
+ read_lock_irqsave(&driver_name_lock, flags);
+ len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name);
+ read_unlock_irqrestore(&driver_name_lock, flags);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t filter_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char buf[NAME_MAX_LEN];
+ unsigned long flags;
+ size_t len;
+ int i;
+
+ /*
+ * We can't copy from userspace directly. Access to
+ * current_driver_name is protected with a write_lock with irqs
+ * disabled. Since copy_from_user can fault and may sleep we
+ * need to copy to temporary buffer first
+ */
+ len = min(count, (size_t)(NAME_MAX_LEN - 1));
+ if (copy_from_user(buf, userbuf, len))
+ return -EFAULT;
+
+ buf[len] = 0;
+
+ write_lock_irqsave(&driver_name_lock, flags);
+
+ /*
+ * Now handle the string we got from userspace very carefully.
+ * The rules are:
+ * - only use the first token we got
+ * - token delimiter is everything looking like a space
+ * character (' ', '\n', '\t' ...)
+ *
+ */
+ if (!isalnum(buf[0])) {
+ /*
+ * If the first character userspace gave us is not
+ * alphanumerical then assume the filter should be
+ * switched off.
+ */
+ if (current_driver_name[0])
+ pr_info("DMA-API: switching off dma-debug driver filter\n");
+ current_driver_name[0] = 0;
+ current_driver = NULL;
+ goto out_unlock;
+ }
+
+ /*
+ * Now parse out the first token and use it as the name for the
+ * driver to filter for.
+ */
+ for (i = 0; i < NAME_MAX_LEN - 1; ++i) {
+ current_driver_name[i] = buf[i];
+ if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0)
+ break;
+ }
+ current_driver_name[i] = 0;
+ current_driver = NULL;
+
+ pr_info("DMA-API: enable driver filter for driver [%s]\n",
+ current_driver_name);
+
+out_unlock:
+ write_unlock_irqrestore(&driver_name_lock, flags);
+
+ return count;
+}
+
+static const struct file_operations filter_fops = {
+ .read = filter_read,
+ .write = filter_write,
+ .llseek = default_llseek,
+};
+
+static int dma_debug_fs_init(void)
+{
+ dma_debug_dent = debugfs_create_dir("dma-api", NULL);
+ if (!dma_debug_dent) {
+ pr_err("DMA-API: can not create debugfs directory\n");
+ return -ENOMEM;
+ }
+
+ global_disable_dent = debugfs_create_bool("disabled", 0444,
+ dma_debug_dent,
+ &global_disable);
+ if (!global_disable_dent)
+ goto out_err;
+
+ error_count_dent = debugfs_create_u32("error_count", 0444,
+ dma_debug_dent, &error_count);
+ if (!error_count_dent)
+ goto out_err;
+
+ show_all_errors_dent = debugfs_create_u32("all_errors", 0644,
+ dma_debug_dent,
+ &show_all_errors);
+ if (!show_all_errors_dent)
+ goto out_err;
+
+ show_num_errors_dent = debugfs_create_u32("num_errors", 0644,
+ dma_debug_dent,
+ &show_num_errors);
+ if (!show_num_errors_dent)
+ goto out_err;
+
+ num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444,
+ dma_debug_dent,
+ &num_free_entries);
+ if (!num_free_entries_dent)
+ goto out_err;
+
+ min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444,
+ dma_debug_dent,
+ &min_free_entries);
+ if (!min_free_entries_dent)
+ goto out_err;
+
+ filter_dent = debugfs_create_file("driver_filter", 0644,
+ dma_debug_dent, NULL, &filter_fops);
+ if (!filter_dent)
+ goto out_err;
+
+ return 0;
+
+out_err:
+ debugfs_remove_recursive(dma_debug_dent);
+
+ return -ENOMEM;
+}
+
+static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry)
+{
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+ int count = 0, i;
+
+ for (i = 0; i < HASH_SIZE; ++i) {
+ spin_lock_irqsave(&dma_entry_hash[i].lock, flags);
+ list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
+ if (entry->dev == dev) {
+ count += 1;
+ *out_entry = entry;
+ }
+ }
+ spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags);
+ }
+
+ return count;
+}
+
+static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data)
+{
+ struct device *dev = data;
+ struct dma_debug_entry *uninitialized_var(entry);
+ int count;
+
+ if (dma_debug_disabled())
+ return 0;
+
+ switch (action) {
+ case BUS_NOTIFY_UNBOUND_DRIVER:
+ count = device_dma_allocations(dev, &entry);
+ if (count == 0)
+ break;
+ err_printk(dev, entry, "DMA-API: device driver has pending "
+ "DMA allocations while released from device "
+ "[count=%d]\n"
+ "One of leaked entries details: "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped with %s] [mapped as %s]\n",
+ count, entry->dev_addr, entry->size,
+ dir2name[entry->direction], type2name[entry->type]);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+void dma_debug_add_bus(struct bus_type *bus)
+{
+ struct notifier_block *nb;
+
+ if (dma_debug_disabled())
+ return;
+
+ nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
+ if (nb == NULL) {
+ pr_err("dma_debug_add_bus: out of memory\n");
+ return;
+ }
+
+ nb->notifier_call = dma_debug_device_change;
+
+ bus_register_notifier(bus, nb);
+}
+
+static int dma_debug_init(void)
+{
+ int i;
+
+ /* Do not use dma_debug_initialized here, since we really want to be
+ * called to set dma_debug_initialized
+ */
+ if (global_disable)
+ return 0;
+
+ for (i = 0; i < HASH_SIZE; ++i) {
+ INIT_LIST_HEAD(&dma_entry_hash[i].list);
+ spin_lock_init(&dma_entry_hash[i].lock);
+ }
+
+ if (dma_debug_fs_init() != 0) {
+ pr_err("DMA-API: error creating debugfs entries - disabling\n");
+ global_disable = true;
+
+ return 0;
+ }
+
+ if (prealloc_memory(nr_prealloc_entries) != 0) {
+ pr_err("DMA-API: debugging out of memory error - disabled\n");
+ global_disable = true;
+
+ return 0;
+ }
+
+ nr_total_entries = num_free_entries;
+
+ dma_debug_initialized = true;
+
+ pr_info("DMA-API: debugging enabled by kernel config\n");
+ return 0;
+}
+core_initcall(dma_debug_init);
+
+static __init int dma_debug_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (strncmp(str, "off", 3) == 0) {
+ pr_info("DMA-API: debugging disabled on kernel command line\n");
+ global_disable = true;
+ }
+
+ return 0;
+}
+
+static __init int dma_debug_entries_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!get_option(&str, &nr_prealloc_entries))
+ nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
+ return 0;
+}
+
+__setup("dma_debug=", dma_debug_cmdline);
+__setup("dma_debug_entries=", dma_debug_entries_cmdline);
+
+static void check_unmap(struct dma_debug_entry *ref)
+{
+ struct dma_debug_entry *entry;
+ struct hash_bucket *bucket;
+ unsigned long flags;
+
+ bucket = get_hash_bucket(ref, &flags);
+ entry = bucket_find_exact(bucket, ref);
+
+ if (!entry) {
+ /* must drop lock before calling dma_mapping_error */
+ put_hash_bucket(bucket, &flags);
+
+ if (dma_mapping_error(ref->dev, ref->dev_addr)) {
+ err_printk(ref->dev, NULL,
+ "DMA-API: device driver tries to free an "
+ "invalid DMA memory address\n");
+ } else {
+ err_printk(ref->dev, NULL,
+ "DMA-API: device driver tries to free DMA "
+ "memory it has not allocated [device "
+ "address=0x%016llx] [size=%llu bytes]\n",
+ ref->dev_addr, ref->size);
+ }
+ return;
+ }
+
+ if (ref->size != entry->size) {
+ err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ "DMA memory with different size "
+ "[device address=0x%016llx] [map size=%llu bytes] "
+ "[unmap size=%llu bytes]\n",
+ ref->dev_addr, entry->size, ref->size);
+ }
+
+ if (ref->type != entry->type) {
+ err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ "DMA memory with wrong function "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped as %s] [unmapped as %s]\n",
+ ref->dev_addr, ref->size,
+ type2name[entry->type], type2name[ref->type]);
+ } else if ((entry->type == dma_debug_coherent) &&
+ (phys_addr(ref) != phys_addr(entry))) {
+ err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ "DMA memory with different CPU address "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[cpu alloc address=0x%016llx] "
+ "[cpu free address=0x%016llx]",
+ ref->dev_addr, ref->size,
+ phys_addr(entry),
+ phys_addr(ref));
+ }
+
+ if (ref->sg_call_ents && ref->type == dma_debug_sg &&
+ ref->sg_call_ents != entry->sg_call_ents) {
+ err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ "DMA sg list with different entry count "
+ "[map count=%d] [unmap count=%d]\n",
+ entry->sg_call_ents, ref->sg_call_ents);
+ }
+
+ /*
+ * This may be no bug in reality - but most implementations of the
+ * DMA API don't handle this properly, so check for it here
+ */
+ if (ref->direction != entry->direction) {
+ err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ "DMA memory with different direction "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped with %s] [unmapped with %s]\n",
+ ref->dev_addr, ref->size,
+ dir2name[entry->direction],
+ dir2name[ref->direction]);
+ }
+
+ /*
+ * Drivers should use dma_mapping_error() to check the returned
+ * addresses of dma_map_single() and dma_map_page().
+ * If not, print this warning message. See Documentation/DMA-API.txt.
+ */
+ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
+ err_printk(ref->dev, entry,
+ "DMA-API: device driver failed to check map error"
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped as %s]",
+ ref->dev_addr, ref->size,
+ type2name[entry->type]);
+ }
+
+ hash_bucket_del(entry);
+ dma_entry_free(entry);
+
+ put_hash_bucket(bucket, &flags);
+}
+
+static void check_for_stack(struct device *dev,
+ struct page *page, size_t offset)
+{
+ void *addr;
+ struct vm_struct *stack_vm_area = task_stack_vm_area(current);
+
+ if (!stack_vm_area) {
+ /* Stack is direct-mapped. */
+ if (PageHighMem(page))
+ return;
+ addr = page_address(page) + offset;
+ if (object_is_on_stack(addr))
+ err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr);
+ } else {
+ /* Stack is vmalloced. */
+ int i;
+
+ for (i = 0; i < stack_vm_area->nr_pages; i++) {
+ if (page != stack_vm_area->pages[i])
+ continue;
+
+ addr = (u8 *)current->stack + i * PAGE_SIZE + offset;
+ err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr);
+ break;
+ }
+ }
+}
+
+static inline bool overlap(void *addr, unsigned long len, void *start, void *end)
+{
+ unsigned long a1 = (unsigned long)addr;
+ unsigned long b1 = a1 + len;
+ unsigned long a2 = (unsigned long)start;
+ unsigned long b2 = (unsigned long)end;
+
+ return !(b1 <= a2 || a1 >= b2);
+}
+
+static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len)
+{
+ if (overlap(addr, len, _stext, _etext) ||
+ overlap(addr, len, __start_rodata, __end_rodata))
+ err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
+}
+
+static void check_sync(struct device *dev,
+ struct dma_debug_entry *ref,
+ bool to_cpu)
+{
+ struct dma_debug_entry *entry;
+ struct hash_bucket *bucket;
+ unsigned long flags;
+
+ bucket = get_hash_bucket(ref, &flags);
+
+ entry = bucket_find_contain(&bucket, ref, &flags);
+
+ if (!entry) {
+ err_printk(dev, NULL, "DMA-API: device driver tries "
+ "to sync DMA memory it has not allocated "
+ "[device address=0x%016llx] [size=%llu bytes]\n",
+ (unsigned long long)ref->dev_addr, ref->size);
+ goto out;
+ }
+
+ if (ref->size > entry->size) {
+ err_printk(dev, entry, "DMA-API: device driver syncs"
+ " DMA memory outside allocated range "
+ "[device address=0x%016llx] "
+ "[allocation size=%llu bytes] "
+ "[sync offset+size=%llu]\n",
+ entry->dev_addr, entry->size,
+ ref->size);
+ }
+
+ if (entry->direction == DMA_BIDIRECTIONAL)
+ goto out;
+
+ if (ref->direction != entry->direction) {
+ err_printk(dev, entry, "DMA-API: device driver syncs "
+ "DMA memory with different direction "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped with %s] [synced with %s]\n",
+ (unsigned long long)ref->dev_addr, entry->size,
+ dir2name[entry->direction],
+ dir2name[ref->direction]);
+ }
+
+ if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) &&
+ !(ref->direction == DMA_TO_DEVICE))
+ err_printk(dev, entry, "DMA-API: device driver syncs "
+ "device read-only DMA memory for cpu "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped with %s] [synced with %s]\n",
+ (unsigned long long)ref->dev_addr, entry->size,
+ dir2name[entry->direction],
+ dir2name[ref->direction]);
+
+ if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) &&
+ !(ref->direction == DMA_FROM_DEVICE))
+ err_printk(dev, entry, "DMA-API: device driver syncs "
+ "device write-only DMA memory to device "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped with %s] [synced with %s]\n",
+ (unsigned long long)ref->dev_addr, entry->size,
+ dir2name[entry->direction],
+ dir2name[ref->direction]);
+
+ if (ref->sg_call_ents && ref->type == dma_debug_sg &&
+ ref->sg_call_ents != entry->sg_call_ents) {
+ err_printk(ref->dev, entry, "DMA-API: device driver syncs "
+ "DMA sg list with different entry count "
+ "[map count=%d] [sync count=%d]\n",
+ entry->sg_call_ents, ref->sg_call_ents);
+ }
+
+out:
+ put_hash_bucket(bucket, &flags);
+}
+
+static void check_sg_segment(struct device *dev, struct scatterlist *sg)
+{
+#ifdef CONFIG_DMA_API_DEBUG_SG
+ unsigned int max_seg = dma_get_max_seg_size(dev);
+ u64 start, end, boundary = dma_get_seg_boundary(dev);
+
+ /*
+ * Either the driver forgot to set dma_parms appropriately, or
+ * whoever generated the list forgot to check them.
+ */
+ if (sg->length > max_seg)
+ err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
+ sg->length, max_seg);
+ /*
+ * In some cases this could potentially be the DMA API
+ * implementation's fault, but it would usually imply that
+ * the scatterlist was built inappropriately to begin with.
+ */
+ start = sg_dma_address(sg);
+ end = start + sg_dma_len(sg) - 1;
+ if ((start ^ end) & ~boundary)
+ err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
+ start, end, boundary);
+#endif
+}
+
+void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
+ size_t size, int direction, dma_addr_t dma_addr,
+ bool map_single)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ if (dma_mapping_error(dev, dma_addr))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->dev = dev;
+ entry->type = dma_debug_page;
+ entry->pfn = page_to_pfn(page);
+ entry->offset = offset,
+ entry->dev_addr = dma_addr;
+ entry->size = size;
+ entry->direction = direction;
+ entry->map_err_type = MAP_ERR_NOT_CHECKED;
+
+ if (map_single)
+ entry->type = dma_debug_single;
+
+ check_for_stack(dev, page, offset);
+
+ if (!PageHighMem(page)) {
+ void *addr = page_address(page) + offset;
+
+ check_for_illegal_area(dev, addr, size);
+ }
+
+ add_dma_entry(entry);
+}
+EXPORT_SYMBOL(debug_dma_map_page);
+
+void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ struct dma_debug_entry ref;
+ struct dma_debug_entry *entry;
+ struct hash_bucket *bucket;
+ unsigned long flags;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ ref.dev = dev;
+ ref.dev_addr = dma_addr;
+ bucket = get_hash_bucket(&ref, &flags);
+
+ list_for_each_entry(entry, &bucket->list, list) {
+ if (!exact_match(&ref, entry))
+ continue;
+
+ /*
+ * The same physical address can be mapped multiple
+ * times. Without a hardware IOMMU this results in the
+ * same device addresses being put into the dma-debug
+ * hash multiple times too. This can result in false
+ * positives being reported. Therefore we implement a
+ * best-fit algorithm here which updates the first entry
+ * from the hash which fits the reference value and is
+ * not currently listed as being checked.
+ */
+ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
+ entry->map_err_type = MAP_ERR_CHECKED;
+ break;
+ }
+ }
+
+ put_hash_bucket(bucket, &flags);
+}
+EXPORT_SYMBOL(debug_dma_mapping_error);
+
+void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, int direction, bool map_single)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_page,
+ .dev = dev,
+ .dev_addr = addr,
+ .size = size,
+ .direction = direction,
+ };
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ if (map_single)
+ ref.type = dma_debug_single;
+
+ check_unmap(&ref);
+}
+EXPORT_SYMBOL(debug_dma_unmap_page);
+
+void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
+ int nents, int mapped_ents, int direction)
+{
+ struct dma_debug_entry *entry;
+ struct scatterlist *s;
+ int i;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ for_each_sg(sg, s, mapped_ents, i) {
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_sg;
+ entry->dev = dev;
+ entry->pfn = page_to_pfn(sg_page(s));
+ entry->offset = s->offset,
+ entry->size = sg_dma_len(s);
+ entry->dev_addr = sg_dma_address(s);
+ entry->direction = direction;
+ entry->sg_call_ents = nents;
+ entry->sg_mapped_ents = mapped_ents;
+
+ check_for_stack(dev, sg_page(s), s->offset);
+
+ if (!PageHighMem(sg_page(s))) {
+ check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
+ }
+
+ check_sg_segment(dev, s);
+
+ add_dma_entry(entry);
+ }
+}
+EXPORT_SYMBOL(debug_dma_map_sg);
+
+static int get_nr_mapped_entries(struct device *dev,
+ struct dma_debug_entry *ref)
+{
+ struct dma_debug_entry *entry;
+ struct hash_bucket *bucket;
+ unsigned long flags;
+ int mapped_ents;
+
+ bucket = get_hash_bucket(ref, &flags);
+ entry = bucket_find_exact(bucket, ref);
+ mapped_ents = 0;
+
+ if (entry)
+ mapped_ents = entry->sg_mapped_ents;
+ put_hash_bucket(bucket, &flags);
+
+ return mapped_ents;
+}
+
+void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+{
+ struct scatterlist *s;
+ int mapped_ents = 0, i;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ for_each_sg(sglist, s, nelems, i) {
+
+ struct dma_debug_entry ref = {
+ .type = dma_debug_sg,
+ .dev = dev,
+ .pfn = page_to_pfn(sg_page(s)),
+ .offset = s->offset,
+ .dev_addr = sg_dma_address(s),
+ .size = sg_dma_len(s),
+ .direction = dir,
+ .sg_call_ents = nelems,
+ };
+
+ if (mapped_ents && i >= mapped_ents)
+ break;
+
+ if (!i)
+ mapped_ents = get_nr_mapped_entries(dev, &ref);
+
+ check_unmap(&ref);
+ }
+}
+EXPORT_SYMBOL(debug_dma_unmap_sg);
+
+void debug_dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t dma_addr, void *virt)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ if (unlikely(virt == NULL))
+ return;
+
+ /* handle vmalloc and linear addresses */
+ if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_coherent;
+ entry->dev = dev;
+ entry->offset = offset_in_page(virt);
+ entry->size = size;
+ entry->dev_addr = dma_addr;
+ entry->direction = DMA_BIDIRECTIONAL;
+
+ if (is_vmalloc_addr(virt))
+ entry->pfn = vmalloc_to_pfn(virt);
+ else
+ entry->pfn = page_to_pfn(virt_to_page(virt));
+
+ add_dma_entry(entry);
+}
+EXPORT_SYMBOL(debug_dma_alloc_coherent);
+
+void debug_dma_free_coherent(struct device *dev, size_t size,
+ void *virt, dma_addr_t addr)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_coherent,
+ .dev = dev,
+ .offset = offset_in_page(virt),
+ .dev_addr = addr,
+ .size = size,
+ .direction = DMA_BIDIRECTIONAL,
+ };
+
+ /* handle vmalloc and linear addresses */
+ if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt))
+ return;
+
+ if (is_vmalloc_addr(virt))
+ ref.pfn = vmalloc_to_pfn(virt);
+ else
+ ref.pfn = page_to_pfn(virt_to_page(virt));
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ check_unmap(&ref);
+}
+EXPORT_SYMBOL(debug_dma_free_coherent);
+
+void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
+ int direction, dma_addr_t dma_addr)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_resource;
+ entry->dev = dev;
+ entry->pfn = PHYS_PFN(addr);
+ entry->offset = offset_in_page(addr);
+ entry->size = size;
+ entry->dev_addr = dma_addr;
+ entry->direction = direction;
+ entry->map_err_type = MAP_ERR_NOT_CHECKED;
+
+ add_dma_entry(entry);
+}
+EXPORT_SYMBOL(debug_dma_map_resource);
+
+void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
+ size_t size, int direction)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_resource,
+ .dev = dev,
+ .dev_addr = dma_addr,
+ .size = size,
+ .direction = direction,
+ };
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ check_unmap(&ref);
+}
+EXPORT_SYMBOL(debug_dma_unmap_resource);
+
+void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+ struct dma_debug_entry ref;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ ref.type = dma_debug_single;
+ ref.dev = dev;
+ ref.dev_addr = dma_handle;
+ ref.size = size;
+ ref.direction = direction;
+ ref.sg_call_ents = 0;
+
+ check_sync(dev, &ref, true);
+}
+EXPORT_SYMBOL(debug_dma_sync_single_for_cpu);
+
+void debug_dma_sync_single_for_device(struct device *dev,
+ dma_addr_t dma_handle, size_t size,
+ int direction)
+{
+ struct dma_debug_entry ref;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ ref.type = dma_debug_single;
+ ref.dev = dev;
+ ref.dev_addr = dma_handle;
+ ref.size = size;
+ ref.direction = direction;
+ ref.sg_call_ents = 0;
+
+ check_sync(dev, &ref, false);
+}
+EXPORT_SYMBOL(debug_dma_sync_single_for_device);
+
+void debug_dma_sync_single_range_for_cpu(struct device *dev,
+ dma_addr_t dma_handle,
+ unsigned long offset, size_t size,
+ int direction)
+{
+ struct dma_debug_entry ref;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ ref.type = dma_debug_single;
+ ref.dev = dev;
+ ref.dev_addr = dma_handle;
+ ref.size = offset + size;
+ ref.direction = direction;
+ ref.sg_call_ents = 0;
+
+ check_sync(dev, &ref, true);
+}
+EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu);
+
+void debug_dma_sync_single_range_for_device(struct device *dev,
+ dma_addr_t dma_handle,
+ unsigned long offset,
+ size_t size, int direction)
+{
+ struct dma_debug_entry ref;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ ref.type = dma_debug_single;
+ ref.dev = dev;
+ ref.dev_addr = dma_handle;
+ ref.size = offset + size;
+ ref.direction = direction;
+ ref.sg_call_ents = 0;
+
+ check_sync(dev, &ref, false);
+}
+EXPORT_SYMBOL(debug_dma_sync_single_range_for_device);
+
+void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int nelems, int direction)
+{
+ struct scatterlist *s;
+ int mapped_ents = 0, i;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ for_each_sg(sg, s, nelems, i) {
+
+ struct dma_debug_entry ref = {
+ .type = dma_debug_sg,
+ .dev = dev,
+ .pfn = page_to_pfn(sg_page(s)),
+ .offset = s->offset,
+ .dev_addr = sg_dma_address(s),
+ .size = sg_dma_len(s),
+ .direction = direction,
+ .sg_call_ents = nelems,
+ };
+
+ if (!i)
+ mapped_ents = get_nr_mapped_entries(dev, &ref);
+
+ if (i >= mapped_ents)
+ break;
+
+ check_sync(dev, &ref, true);
+ }
+}
+EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu);
+
+void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int nelems, int direction)
+{
+ struct scatterlist *s;
+ int mapped_ents = 0, i;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ for_each_sg(sg, s, nelems, i) {
+
+ struct dma_debug_entry ref = {
+ .type = dma_debug_sg,
+ .dev = dev,
+ .pfn = page_to_pfn(sg_page(s)),
+ .offset = s->offset,
+ .dev_addr = sg_dma_address(s),
+ .size = sg_dma_len(s),
+ .direction = direction,
+ .sg_call_ents = nelems,
+ };
+ if (!i)
+ mapped_ents = get_nr_mapped_entries(dev, &ref);
+
+ if (i >= mapped_ents)
+ break;
+
+ check_sync(dev, &ref, false);
+ }
+}
+EXPORT_SYMBOL(debug_dma_sync_sg_for_device);
+
+static int __init dma_debug_driver_setup(char *str)
+{
+ int i;
+
+ for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) {
+ current_driver_name[i] = *str;
+ if (*str == 0)
+ break;
+ }
+
+ if (current_driver_name[0])
+ pr_info("DMA-API: enable driver filter for driver [%s]\n",
+ current_driver_name);
+
+
+ return 1;
+}
+__setup("dma_debug_driver=", dma_debug_driver_setup);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
new file mode 100644
index 000000000000..8be8106270c2
--- /dev/null
+++ b/kernel/dma/direct.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DMA operations that map physical memory directly without using an IOMMU or
+ * flushing caches.
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-contiguous.h>
+#include <linux/pfn.h>
+#include <linux/set_memory.h>
+
+#define DIRECT_MAPPING_ERROR 0
+
+/*
+ * Most architectures use ZONE_DMA for the first 16 Megabytes, but
+ * some use it for entirely different regions:
+ */
+#ifndef ARCH_ZONE_DMA_BITS
+#define ARCH_ZONE_DMA_BITS 24
+#endif
+
+/*
+ * For AMD SEV all DMA must be to unencrypted addresses.
+ */
+static inline bool force_dma_unencrypted(void)
+{
+ return sev_active();
+}
+
+static bool
+check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
+ const char *caller)
+{
+ if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
+ if (!dev->dma_mask) {
+ dev_err(dev,
+ "%s: call on device without dma_mask\n",
+ caller);
+ return false;
+ }
+
+ if (*dev->dma_mask >= DMA_BIT_MASK(32)) {
+ dev_err(dev,
+ "%s: overflow %pad+%zu of device mask %llx\n",
+ caller, &dma_addr, size, *dev->dma_mask);
+ }
+ return false;
+ }
+ return true;
+}
+
+static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
+{
+ dma_addr_t addr = force_dma_unencrypted() ?
+ __phys_to_dma(dev, phys) : phys_to_dma(dev, phys);
+ return addr + size - 1 <= dev->coherent_dma_mask;
+}
+
+void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp, unsigned long attrs)
+{
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ int page_order = get_order(size);
+ struct page *page = NULL;
+ void *ret;
+
+ /* we always manually zero the memory once we are done: */
+ gfp &= ~__GFP_ZERO;
+
+ /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */
+ if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+ gfp |= GFP_DMA;
+ if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA))
+ gfp |= GFP_DMA32;
+
+again:
+ /* CMA can be used only in the context which permits sleeping */
+ if (gfpflags_allow_blocking(gfp)) {
+ page = dma_alloc_from_contiguous(dev, count, page_order, gfp);
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ dma_release_from_contiguous(dev, page, count);
+ page = NULL;
+ }
+ }
+ if (!page)
+ page = alloc_pages_node(dev_to_node(dev), gfp, page_order);
+
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ __free_pages(page, page_order);
+ page = NULL;
+
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+ dev->coherent_dma_mask < DMA_BIT_MASK(64) &&
+ !(gfp & (GFP_DMA32 | GFP_DMA))) {
+ gfp |= GFP_DMA32;
+ goto again;
+ }
+
+ if (IS_ENABLED(CONFIG_ZONE_DMA) &&
+ dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
+ !(gfp & GFP_DMA)) {
+ gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
+ goto again;
+ }
+ }
+
+ if (!page)
+ return NULL;
+ ret = page_address(page);
+ if (force_dma_unencrypted()) {
+ set_memory_decrypted((unsigned long)ret, 1 << page_order);
+ *dma_handle = __phys_to_dma(dev, page_to_phys(page));
+ } else {
+ *dma_handle = phys_to_dma(dev, page_to_phys(page));
+ }
+ memset(ret, 0, size);
+ return ret;
+}
+
+/*
+ * NOTE: this function must never look at the dma_addr argument, because we want
+ * to be able to use it as a helper for iommu implementations as well.
+ */
+void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_addr, unsigned long attrs)
+{
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned int page_order = get_order(size);
+
+ if (force_dma_unencrypted())
+ set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
+ if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count))
+ free_pages((unsigned long)cpu_addr, page_order);
+}
+
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset;
+
+ if (!check_addr(dev, dma_addr, size, __func__))
+ return DIRECT_MAPPING_ERROR;
+ return dma_addr;
+}
+
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ int i;
+ struct scatterlist *sg;
+
+ for_each_sg(sgl, sg, nents, i) {
+ BUG_ON(!sg_page(sg));
+
+ sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg));
+ if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__))
+ return 0;
+ sg_dma_len(sg) = sg->length;
+ }
+
+ return nents;
+}
+
+int dma_direct_supported(struct device *dev, u64 mask)
+{
+#ifdef CONFIG_ZONE_DMA
+ if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+ return 0;
+#else
+ /*
+ * Because 32-bit DMA masks are so common we expect every architecture
+ * to be able to satisfy them - either by not supporting more physical
+ * memory, or by providing a ZONE_DMA32. If neither is the case, the
+ * architecture needs to use an IOMMU instead of the direct mapping.
+ */
+ if (mask < DMA_BIT_MASK(32))
+ return 0;
+#endif
+ /*
+ * Various PCI/PCIe bridges have broken support for > 32bit DMA even
+ * if the device itself might support it.
+ */
+ if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32))
+ return 0;
+ return 1;
+}
+
+int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return dma_addr == DIRECT_MAPPING_ERROR;
+}
+
+const struct dma_map_ops dma_direct_ops = {
+ .alloc = dma_direct_alloc,
+ .free = dma_direct_free,
+ .map_page = dma_direct_map_page,
+ .map_sg = dma_direct_map_sg,
+ .dma_supported = dma_direct_supported,
+ .mapping_error = dma_direct_mapping_error,
+};
+EXPORT_SYMBOL(dma_direct_ops);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
new file mode 100644
index 000000000000..d2a92ddaac4d
--- /dev/null
+++ b/kernel/dma/mapping.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arch-independent dma-mapping routines
+ *
+ * Copyright (c) 2006 SUSE Linux Products GmbH
+ * Copyright (c) 2006 Tejun Heo <teheo@suse.de>
+ */
+
+#include <linux/acpi.h>
+#include <linux/dma-mapping.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/of_device.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*
+ * Managed DMA API
+ */
+struct dma_devres {
+ size_t size;
+ void *vaddr;
+ dma_addr_t dma_handle;
+ unsigned long attrs;
+};
+
+static void dmam_release(struct device *dev, void *res)
+{
+ struct dma_devres *this = res;
+
+ dma_free_attrs(dev, this->size, this->vaddr, this->dma_handle,
+ this->attrs);
+}
+
+static int dmam_match(struct device *dev, void *res, void *match_data)
+{
+ struct dma_devres *this = res, *match = match_data;
+
+ if (this->vaddr == match->vaddr) {
+ WARN_ON(this->size != match->size ||
+ this->dma_handle != match->dma_handle);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * dmam_alloc_coherent - Managed dma_alloc_coherent()
+ * @dev: Device to allocate coherent memory for
+ * @size: Size of allocation
+ * @dma_handle: Out argument for allocated DMA handle
+ * @gfp: Allocation flags
+ *
+ * Managed dma_alloc_coherent(). Memory allocated using this function
+ * will be automatically released on driver detach.
+ *
+ * RETURNS:
+ * Pointer to allocated memory on success, NULL on failure.
+ */
+void *dmam_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ struct dma_devres *dr;
+ void *vaddr;
+
+ dr = devres_alloc(dmam_release, sizeof(*dr), gfp);
+ if (!dr)
+ return NULL;
+
+ vaddr = dma_alloc_coherent(dev, size, dma_handle, gfp);
+ if (!vaddr) {
+ devres_free(dr);
+ return NULL;
+ }
+
+ dr->vaddr = vaddr;
+ dr->dma_handle = *dma_handle;
+ dr->size = size;
+
+ devres_add(dev, dr);
+
+ return vaddr;
+}
+EXPORT_SYMBOL(dmam_alloc_coherent);
+
+/**
+ * dmam_free_coherent - Managed dma_free_coherent()
+ * @dev: Device to free coherent memory for
+ * @size: Size of allocation
+ * @vaddr: Virtual address of the memory to free
+ * @dma_handle: DMA handle of the memory to free
+ *
+ * Managed dma_free_coherent().
+ */
+void dmam_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle)
+{
+ struct dma_devres match_data = { size, vaddr, dma_handle };
+
+ dma_free_coherent(dev, size, vaddr, dma_handle);
+ WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data));
+}
+EXPORT_SYMBOL(dmam_free_coherent);
+
+/**
+ * dmam_alloc_attrs - Managed dma_alloc_attrs()
+ * @dev: Device to allocate non_coherent memory for
+ * @size: Size of allocation
+ * @dma_handle: Out argument for allocated DMA handle
+ * @gfp: Allocation flags
+ * @attrs: Flags in the DMA_ATTR_* namespace.
+ *
+ * Managed dma_alloc_attrs(). Memory allocated using this function will be
+ * automatically released on driver detach.
+ *
+ * RETURNS:
+ * Pointer to allocated memory on success, NULL on failure.
+ */
+void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp, unsigned long attrs)
+{
+ struct dma_devres *dr;
+ void *vaddr;
+
+ dr = devres_alloc(dmam_release, sizeof(*dr), gfp);
+ if (!dr)
+ return NULL;
+
+ vaddr = dma_alloc_attrs(dev, size, dma_handle, gfp, attrs);
+ if (!vaddr) {
+ devres_free(dr);
+ return NULL;
+ }
+
+ dr->vaddr = vaddr;
+ dr->dma_handle = *dma_handle;
+ dr->size = size;
+ dr->attrs = attrs;
+
+ devres_add(dev, dr);
+
+ return vaddr;
+}
+EXPORT_SYMBOL(dmam_alloc_attrs);
+
+#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT
+
+static void dmam_coherent_decl_release(struct device *dev, void *res)
+{
+ dma_release_declared_memory(dev);
+}
+
+/**
+ * dmam_declare_coherent_memory - Managed dma_declare_coherent_memory()
+ * @dev: Device to declare coherent memory for
+ * @phys_addr: Physical address of coherent memory to be declared
+ * @device_addr: Device address of coherent memory to be declared
+ * @size: Size of coherent memory to be declared
+ * @flags: Flags
+ *
+ * Managed dma_declare_coherent_memory().
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
+ dma_addr_t device_addr, size_t size, int flags)
+{
+ void *res;
+ int rc;
+
+ res = devres_alloc(dmam_coherent_decl_release, 0, GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
+
+ rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size,
+ flags);
+ if (!rc)
+ devres_add(dev, res);
+ else
+ devres_free(res);
+
+ return rc;
+}
+EXPORT_SYMBOL(dmam_declare_coherent_memory);
+
+/**
+ * dmam_release_declared_memory - Managed dma_release_declared_memory().
+ * @dev: Device to release declared coherent memory for
+ *
+ * Managed dmam_release_declared_memory().
+ */
+void dmam_release_declared_memory(struct device *dev)
+{
+ WARN_ON(devres_destroy(dev, dmam_coherent_decl_release, NULL, NULL));
+}
+EXPORT_SYMBOL(dmam_release_declared_memory);
+
+#endif
+
+/*
+ * Create scatter-list for the already allocated DMA buffer.
+ */
+int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t handle, size_t size)
+{
+ struct page *page = virt_to_page(cpu_addr);
+ int ret;
+
+ ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+ if (unlikely(ret))
+ return ret;
+
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ return 0;
+}
+EXPORT_SYMBOL(dma_common_get_sgtable);
+
+/*
+ * Create userspace mapping for the DMA-coherent memory.
+ */
+int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size)
+{
+ int ret = -ENXIO;
+#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP
+ unsigned long user_count = vma_pages(vma);
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long off = vma->vm_pgoff;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
+ return ret;
+
+ if (off < count && user_count <= (count - off))
+ ret = remap_pfn_range(vma, vma->vm_start,
+ page_to_pfn(virt_to_page(cpu_addr)) + off,
+ user_count << PAGE_SHIFT,
+ vma->vm_page_prot);
+#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
+
+ return ret;
+}
+EXPORT_SYMBOL(dma_common_mmap);
+
+#ifdef CONFIG_MMU
+static struct vm_struct *__dma_common_pages_remap(struct page **pages,
+ size_t size, unsigned long vm_flags, pgprot_t prot,
+ const void *caller)
+{
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(size, vm_flags, caller);
+ if (!area)
+ return NULL;
+
+ if (map_vm_area(area, prot, pages)) {
+ vunmap(area->addr);
+ return NULL;
+ }
+
+ return area;
+}
+
+/*
+ * remaps an array of PAGE_SIZE pages into another vm_area
+ * Cannot be used in non-sleeping contexts
+ */
+void *dma_common_pages_remap(struct page **pages, size_t size,
+ unsigned long vm_flags, pgprot_t prot,
+ const void *caller)
+{
+ struct vm_struct *area;
+
+ area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+ if (!area)
+ return NULL;
+
+ area->pages = pages;
+
+ return area->addr;
+}
+
+/*
+ * remaps an allocated contiguous region into another vm_area.
+ * Cannot be used in non-sleeping contexts
+ */
+
+void *dma_common_contiguous_remap(struct page *page, size_t size,
+ unsigned long vm_flags,
+ pgprot_t prot, const void *caller)
+{
+ int i;
+ struct page **pages;
+ struct vm_struct *area;
+
+ pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+
+ for (i = 0; i < (size >> PAGE_SHIFT); i++)
+ pages[i] = nth_page(page, i);
+
+ area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+
+ kfree(pages);
+
+ if (!area)
+ return NULL;
+ return area->addr;
+}
+
+/*
+ * unmaps a range previously mapped by dma_common_*_remap
+ */
+void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
+{
+ struct vm_struct *area = find_vm_area(cpu_addr);
+
+ if (!area || (area->flags & vm_flags) != vm_flags) {
+ WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
+ return;
+ }
+
+ unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
+ vunmap(cpu_addr);
+}
+#endif
+
+/*
+ * enables DMA API use for a device
+ */
+int dma_configure(struct device *dev)
+{
+ if (dev->bus->dma_configure)
+ return dev->bus->dma_configure(dev);
+ return 0;
+}
+
+void dma_deconfigure(struct device *dev)
+{
+ of_dma_deconfigure(dev);
+ acpi_dma_deconfigure(dev);
+}
diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c
new file mode 100644
index 000000000000..79e9a757387f
--- /dev/null
+++ b/kernel/dma/noncoherent.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Christoph Hellwig.
+ *
+ * DMA operations that map physical memory directly without providing cache
+ * coherence.
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/scatterlist.h>
+
+static void dma_noncoherent_sync_single_for_device(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
+}
+
+static void dma_noncoherent_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i)
+ arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
+}
+
+static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t addr;
+
+ addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+ if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ arch_sync_dma_for_device(dev, page_to_phys(page) + offset,
+ size, dir);
+ return addr;
+}
+
+static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs);
+ if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir);
+ return nents;
+}
+
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+static void dma_noncoherent_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
+}
+
+static void dma_noncoherent_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i)
+ arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+}
+
+static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir);
+}
+#endif
+
+const struct dma_map_ops dma_noncoherent_ops = {
+ .alloc = arch_dma_alloc,
+ .free = arch_dma_free,
+ .mmap = arch_dma_mmap,
+ .sync_single_for_device = dma_noncoherent_sync_single_for_device,
+ .sync_sg_for_device = dma_noncoherent_sync_sg_for_device,
+ .map_page = dma_noncoherent_map_page,
+ .map_sg = dma_noncoherent_map_sg,
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+ .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu,
+ .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu,
+ .unmap_page = dma_noncoherent_unmap_page,
+ .unmap_sg = dma_noncoherent_unmap_sg,
+#endif
+ .dma_supported = dma_direct_supported,
+ .mapping_error = dma_direct_mapping_error,
+ .cache_sync = arch_dma_cache_sync,
+};
+EXPORT_SYMBOL(dma_noncoherent_ops);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
new file mode 100644
index 000000000000..04b68d9dffac
--- /dev/null
+++ b/kernel/dma/swiotlb.c
@@ -0,0 +1,1087 @@
+/*
+ * Dynamic DMA mapping support.
+ *
+ * This implementation is a fallback for platforms that do not support
+ * I/O TLBs (aka DMA address translation hardware).
+ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
+ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
+ * Copyright (C) 2000, 2003 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API.
+ * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid
+ * unnecessary i-cache flushing.
+ * 04/07/.. ak Better overflow handling. Assorted fixes.
+ * 05/09/10 linville Add support for syncing ranges, support syncing for
+ * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
+ * 08/12/11 beckyb Add highmem support
+ */
+
+#include <linux/cache.h>
+#include <linux/dma-direct.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/swiotlb.h>
+#include <linux/pfn.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/scatterlist.h>
+#include <linux/mem_encrypt.h>
+#include <linux/set_memory.h>
+
+#include <asm/io.h>
+#include <asm/dma.h>
+
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/iommu-helper.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/swiotlb.h>
+
+#define OFFSET(val,align) ((unsigned long) \
+ ( (val) & ( (align) - 1)))
+
+#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
+
+/*
+ * Minimum IO TLB size to bother booting with. Systems with mainly
+ * 64bit capable cards will only lightly use the swiotlb. If we can't
+ * allocate a contiguous 1MB, we're probably in trouble anyway.
+ */
+#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
+
+enum swiotlb_force swiotlb_force;
+
+/*
+ * Used to do a quick range check in swiotlb_tbl_unmap_single and
+ * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
+ * API.
+ */
+static phys_addr_t io_tlb_start, io_tlb_end;
+
+/*
+ * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
+ * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
+ */
+static unsigned long io_tlb_nslabs;
+
+/*
+ * When the IOMMU overflows we return a fallback buffer. This sets the size.
+ */
+static unsigned long io_tlb_overflow = 32*1024;
+
+static phys_addr_t io_tlb_overflow_buffer;
+
+/*
+ * This is a free list describing the number of free entries available from
+ * each index
+ */
+static unsigned int *io_tlb_list;
+static unsigned int io_tlb_index;
+
+/*
+ * Max segment that we can provide which (if pages are contingous) will
+ * not be bounced (unless SWIOTLB_FORCE is set).
+ */
+unsigned int max_segment;
+
+/*
+ * We need to save away the original address corresponding to a mapped entry
+ * for the sync operations.
+ */
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+static phys_addr_t *io_tlb_orig_addr;
+
+/*
+ * Protect the above data structures in the map and unmap calls
+ */
+static DEFINE_SPINLOCK(io_tlb_lock);
+
+static int late_alloc;
+
+static int __init
+setup_io_tlb_npages(char *str)
+{
+ if (isdigit(*str)) {
+ io_tlb_nslabs = simple_strtoul(str, &str, 0);
+ /* avoid tail segment of size < IO_TLB_SEGSIZE */
+ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+ }
+ if (*str == ',')
+ ++str;
+ if (!strcmp(str, "force")) {
+ swiotlb_force = SWIOTLB_FORCE;
+ } else if (!strcmp(str, "noforce")) {
+ swiotlb_force = SWIOTLB_NO_FORCE;
+ io_tlb_nslabs = 1;
+ }
+
+ return 0;
+}
+early_param("swiotlb", setup_io_tlb_npages);
+/* make io_tlb_overflow tunable too? */
+
+unsigned long swiotlb_nr_tbl(void)
+{
+ return io_tlb_nslabs;
+}
+EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
+
+unsigned int swiotlb_max_segment(void)
+{
+ return max_segment;
+}
+EXPORT_SYMBOL_GPL(swiotlb_max_segment);
+
+void swiotlb_set_max_segment(unsigned int val)
+{
+ if (swiotlb_force == SWIOTLB_FORCE)
+ max_segment = 1;
+ else
+ max_segment = rounddown(val, PAGE_SIZE);
+}
+
+/* default to 64MB */
+#define IO_TLB_DEFAULT_SIZE (64UL<<20)
+unsigned long swiotlb_size_or_default(void)
+{
+ unsigned long size;
+
+ size = io_tlb_nslabs << IO_TLB_SHIFT;
+
+ return size ? size : (IO_TLB_DEFAULT_SIZE);
+}
+
+static bool no_iotlb_memory;
+
+void swiotlb_print_info(void)
+{
+ unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+ unsigned char *vstart, *vend;
+
+ if (no_iotlb_memory) {
+ pr_warn("software IO TLB: No low mem\n");
+ return;
+ }
+
+ vstart = phys_to_virt(io_tlb_start);
+ vend = phys_to_virt(io_tlb_end);
+
+ printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n",
+ (unsigned long long)io_tlb_start,
+ (unsigned long long)io_tlb_end,
+ bytes >> 20, vstart, vend - 1);
+}
+
+/*
+ * Early SWIOTLB allocation may be too early to allow an architecture to
+ * perform the desired operations. This function allows the architecture to
+ * call SWIOTLB when the operations are possible. It needs to be called
+ * before the SWIOTLB memory is used.
+ */
+void __init swiotlb_update_mem_attributes(void)
+{
+ void *vaddr;
+ unsigned long bytes;
+
+ if (no_iotlb_memory || late_alloc)
+ return;
+
+ vaddr = phys_to_virt(io_tlb_start);
+ bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
+ set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
+ memset(vaddr, 0, bytes);
+
+ vaddr = phys_to_virt(io_tlb_overflow_buffer);
+ bytes = PAGE_ALIGN(io_tlb_overflow);
+ set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
+ memset(vaddr, 0, bytes);
+}
+
+int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+{
+ void *v_overflow_buffer;
+ unsigned long i, bytes;
+
+ bytes = nslabs << IO_TLB_SHIFT;
+
+ io_tlb_nslabs = nslabs;
+ io_tlb_start = __pa(tlb);
+ io_tlb_end = io_tlb_start + bytes;
+
+ /*
+ * Get the overflow emergency buffer
+ */
+ v_overflow_buffer = memblock_virt_alloc_low_nopanic(
+ PAGE_ALIGN(io_tlb_overflow),
+ PAGE_SIZE);
+ if (!v_overflow_buffer)
+ return -ENOMEM;
+
+ io_tlb_overflow_buffer = __pa(v_overflow_buffer);
+
+ /*
+ * Allocate and initialize the free list array. This array is used
+ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+ * between io_tlb_start and io_tlb_end.
+ */
+ io_tlb_list = memblock_virt_alloc(
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(int)),
+ PAGE_SIZE);
+ io_tlb_orig_addr = memblock_virt_alloc(
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)),
+ PAGE_SIZE);
+ for (i = 0; i < io_tlb_nslabs; i++) {
+ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ }
+ io_tlb_index = 0;
+
+ if (verbose)
+ swiotlb_print_info();
+
+ swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
+ return 0;
+}
+
+/*
+ * Statically reserve bounce buffer space and initialize bounce buffer data
+ * structures for the software IO TLB used to implement the DMA API.
+ */
+void __init
+swiotlb_init(int verbose)
+{
+ size_t default_size = IO_TLB_DEFAULT_SIZE;
+ unsigned char *vstart;
+ unsigned long bytes;
+
+ if (!io_tlb_nslabs) {
+ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+ }
+
+ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+
+ /* Get IO TLB memory from the low pages */
+ vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE);
+ if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
+ return;
+
+ if (io_tlb_start)
+ memblock_free_early(io_tlb_start,
+ PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+ pr_warn("Cannot allocate SWIOTLB buffer");
+ no_iotlb_memory = true;
+}
+
+/*
+ * Systems with larger DMA zones (those that don't support ISA) can
+ * initialize the swiotlb later using the slab allocator if needed.
+ * This should be just like above, but with some error catching.
+ */
+int
+swiotlb_late_init_with_default_size(size_t default_size)
+{
+ unsigned long bytes, req_nslabs = io_tlb_nslabs;
+ unsigned char *vstart = NULL;
+ unsigned int order;
+ int rc = 0;
+
+ if (!io_tlb_nslabs) {
+ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+ }
+
+ /*
+ * Get IO TLB memory from the low pages
+ */
+ order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
+ io_tlb_nslabs = SLABS_PER_PAGE << order;
+ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+
+ while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+ vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+ order);
+ if (vstart)
+ break;
+ order--;
+ }
+
+ if (!vstart) {
+ io_tlb_nslabs = req_nslabs;
+ return -ENOMEM;
+ }
+ if (order != get_order(bytes)) {
+ printk(KERN_WARNING "Warning: only able to allocate %ld MB "
+ "for software IO TLB\n", (PAGE_SIZE << order) >> 20);
+ io_tlb_nslabs = SLABS_PER_PAGE << order;
+ }
+ rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs);
+ if (rc)
+ free_pages((unsigned long)vstart, order);
+
+ return rc;
+}
+
+int
+swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
+{
+ unsigned long i, bytes;
+ unsigned char *v_overflow_buffer;
+
+ bytes = nslabs << IO_TLB_SHIFT;
+
+ io_tlb_nslabs = nslabs;
+ io_tlb_start = virt_to_phys(tlb);
+ io_tlb_end = io_tlb_start + bytes;
+
+ set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
+ memset(tlb, 0, bytes);
+
+ /*
+ * Get the overflow emergency buffer
+ */
+ v_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
+ get_order(io_tlb_overflow));
+ if (!v_overflow_buffer)
+ goto cleanup2;
+
+ set_memory_decrypted((unsigned long)v_overflow_buffer,
+ io_tlb_overflow >> PAGE_SHIFT);
+ memset(v_overflow_buffer, 0, io_tlb_overflow);
+ io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer);
+
+ /*
+ * Allocate and initialize the free list array. This array is used
+ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+ * between io_tlb_start and io_tlb_end.
+ */
+ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
+ get_order(io_tlb_nslabs * sizeof(int)));
+ if (!io_tlb_list)
+ goto cleanup3;
+
+ io_tlb_orig_addr = (phys_addr_t *)
+ __get_free_pages(GFP_KERNEL,
+ get_order(io_tlb_nslabs *
+ sizeof(phys_addr_t)));
+ if (!io_tlb_orig_addr)
+ goto cleanup4;
+
+ for (i = 0; i < io_tlb_nslabs; i++) {
+ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+ io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ }
+ io_tlb_index = 0;
+
+ swiotlb_print_info();
+
+ late_alloc = 1;
+
+ swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
+
+ return 0;
+
+cleanup4:
+ free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
+ sizeof(int)));
+ io_tlb_list = NULL;
+cleanup3:
+ free_pages((unsigned long)v_overflow_buffer,
+ get_order(io_tlb_overflow));
+ io_tlb_overflow_buffer = 0;
+cleanup2:
+ io_tlb_end = 0;
+ io_tlb_start = 0;
+ io_tlb_nslabs = 0;
+ max_segment = 0;
+ return -ENOMEM;
+}
+
+void __init swiotlb_exit(void)
+{
+ if (!io_tlb_orig_addr)
+ return;
+
+ if (late_alloc) {
+ free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer),
+ get_order(io_tlb_overflow));
+ free_pages((unsigned long)io_tlb_orig_addr,
+ get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
+ free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
+ sizeof(int)));
+ free_pages((unsigned long)phys_to_virt(io_tlb_start),
+ get_order(io_tlb_nslabs << IO_TLB_SHIFT));
+ } else {
+ memblock_free_late(io_tlb_overflow_buffer,
+ PAGE_ALIGN(io_tlb_overflow));
+ memblock_free_late(__pa(io_tlb_orig_addr),
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
+ memblock_free_late(__pa(io_tlb_list),
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
+ memblock_free_late(io_tlb_start,
+ PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+ }
+ io_tlb_nslabs = 0;
+ max_segment = 0;
+}
+
+int is_swiotlb_buffer(phys_addr_t paddr)
+{
+ return paddr >= io_tlb_start && paddr < io_tlb_end;
+}
+
+/*
+ * Bounce: copy the swiotlb buffer back to the original dma location
+ */
+static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ unsigned long pfn = PFN_DOWN(orig_addr);
+ unsigned char *vaddr = phys_to_virt(tlb_addr);
+
+ if (PageHighMem(pfn_to_page(pfn))) {
+ /* The buffer does not have a mapping. Map it in and copy */
+ unsigned int offset = orig_addr & ~PAGE_MASK;
+ char *buffer;
+ unsigned int sz = 0;
+ unsigned long flags;
+
+ while (size) {
+ sz = min_t(size_t, PAGE_SIZE - offset, size);
+
+ local_irq_save(flags);
+ buffer = kmap_atomic(pfn_to_page(pfn));
+ if (dir == DMA_TO_DEVICE)
+ memcpy(vaddr, buffer + offset, sz);
+ else
+ memcpy(buffer + offset, vaddr, sz);
+ kunmap_atomic(buffer);
+ local_irq_restore(flags);
+
+ size -= sz;
+ pfn++;
+ vaddr += sz;
+ offset = 0;
+ }
+ } else if (dir == DMA_TO_DEVICE) {
+ memcpy(vaddr, phys_to_virt(orig_addr), size);
+ } else {
+ memcpy(phys_to_virt(orig_addr), vaddr, size);
+ }
+}
+
+phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
+ dma_addr_t tbl_dma_addr,
+ phys_addr_t orig_addr, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ unsigned long flags;
+ phys_addr_t tlb_addr;
+ unsigned int nslots, stride, index, wrap;
+ int i;
+ unsigned long mask;
+ unsigned long offset_slots;
+ unsigned long max_slots;
+
+ if (no_iotlb_memory)
+ panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+
+ if (mem_encrypt_active())
+ pr_warn_once("%s is active and system is using DMA bounce buffers\n",
+ sme_active() ? "SME" : "SEV");
+
+ mask = dma_get_seg_boundary(hwdev);
+
+ tbl_dma_addr &= mask;
+
+ offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+
+ /*
+ * Carefully handle integer overflow which can occur when mask == ~0UL.
+ */
+ max_slots = mask + 1
+ ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
+ : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+
+ /*
+ * For mappings greater than or equal to a page, we limit the stride
+ * (and hence alignment) to a page size.
+ */
+ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ if (size >= PAGE_SIZE)
+ stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
+ else
+ stride = 1;
+
+ BUG_ON(!nslots);
+
+ /*
+ * Find suitable number of IO TLB entries size that will fit this
+ * request and allocate a buffer from that IO TLB pool.
+ */
+ spin_lock_irqsave(&io_tlb_lock, flags);
+ index = ALIGN(io_tlb_index, stride);
+ if (index >= io_tlb_nslabs)
+ index = 0;
+ wrap = index;
+
+ do {
+ while (iommu_is_span_boundary(index, nslots, offset_slots,
+ max_slots)) {
+ index += stride;
+ if (index >= io_tlb_nslabs)
+ index = 0;
+ if (index == wrap)
+ goto not_found;
+ }
+
+ /*
+ * If we find a slot that indicates we have 'nslots' number of
+ * contiguous buffers, we allocate the buffers from that slot
+ * and mark the entries as '0' indicating unavailable.
+ */
+ if (io_tlb_list[index] >= nslots) {
+ int count = 0;
+
+ for (i = index; i < (int) (index + nslots); i++)
+ io_tlb_list[i] = 0;
+ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
+ io_tlb_list[i] = ++count;
+ tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT);
+
+ /*
+ * Update the indices to avoid searching in the next
+ * round.
+ */
+ io_tlb_index = ((index + nslots) < io_tlb_nslabs
+ ? (index + nslots) : 0);
+
+ goto found;
+ }
+ index += stride;
+ if (index >= io_tlb_nslabs)
+ index = 0;
+ } while (index != wrap);
+
+not_found:
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+ if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
+ dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size);
+ return SWIOTLB_MAP_ERROR;
+found:
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+
+ /*
+ * Save away the mapping from the original address to the DMA address.
+ * This is needed when we sync the memory. Then we sync the buffer if
+ * needed.
+ */
+ for (i = 0; i < nslots; i++)
+ io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE);
+
+ return tlb_addr;
+}
+
+/*
+ * Allocates bounce buffer and returns its physical address.
+ */
+static phys_addr_t
+map_single(struct device *hwdev, phys_addr_t phys, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ dma_addr_t start_dma_addr;
+
+ if (swiotlb_force == SWIOTLB_NO_FORCE) {
+ dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n",
+ &phys);
+ return SWIOTLB_MAP_ERROR;
+ }
+
+ start_dma_addr = __phys_to_dma(hwdev, io_tlb_start);
+ return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size,
+ dir, attrs);
+}
+
+/*
+ * tlb_addr is the physical address of the bounce buffer to unmap.
+ */
+void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ unsigned long flags;
+ int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ phys_addr_t orig_addr = io_tlb_orig_addr[index];
+
+ /*
+ * First, sync the memory before unmapping the entry
+ */
+ if (orig_addr != INVALID_PHYS_ADDR &&
+ !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
+ swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE);
+
+ /*
+ * Return the buffer to the free list by setting the corresponding
+ * entries to indicate the number of contiguous entries available.
+ * While returning the entries to the free list, we merge the entries
+ * with slots below and above the pool being returned.
+ */
+ spin_lock_irqsave(&io_tlb_lock, flags);
+ {
+ count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
+ io_tlb_list[index + nslots] : 0);
+ /*
+ * Step 1: return the slots to the free list, merging the
+ * slots with superceeding slots
+ */
+ for (i = index + nslots - 1; i >= index; i--) {
+ io_tlb_list[i] = ++count;
+ io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ }
+ /*
+ * Step 2: merge the returned slots with the preceding slots,
+ * if available (non zero)
+ */
+ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
+ io_tlb_list[i] = ++count;
+ }
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+}
+
+void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ enum dma_sync_target target)
+{
+ int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ phys_addr_t orig_addr = io_tlb_orig_addr[index];
+
+ if (orig_addr == INVALID_PHYS_ADDR)
+ return;
+ orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1);
+
+ switch (target) {
+ case SYNC_FOR_CPU:
+ if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(orig_addr, tlb_addr,
+ size, DMA_FROM_DEVICE);
+ else
+ BUG_ON(dir != DMA_TO_DEVICE);
+ break;
+ case SYNC_FOR_DEVICE:
+ if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(orig_addr, tlb_addr,
+ size, DMA_TO_DEVICE);
+ else
+ BUG_ON(dir != DMA_FROM_DEVICE);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr,
+ size_t size)
+{
+ u64 mask = DMA_BIT_MASK(32);
+
+ if (dev && dev->coherent_dma_mask)
+ mask = dev->coherent_dma_mask;
+ return addr + size - 1 <= mask;
+}
+
+static void *
+swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ unsigned long attrs)
+{
+ phys_addr_t phys_addr;
+
+ if (swiotlb_force == SWIOTLB_NO_FORCE)
+ goto out_warn;
+
+ phys_addr = swiotlb_tbl_map_single(dev,
+ __phys_to_dma(dev, io_tlb_start),
+ 0, size, DMA_FROM_DEVICE, attrs);
+ if (phys_addr == SWIOTLB_MAP_ERROR)
+ goto out_warn;
+
+ *dma_handle = __phys_to_dma(dev, phys_addr);
+ if (!dma_coherent_ok(dev, *dma_handle, size))
+ goto out_unmap;
+
+ memset(phys_to_virt(phys_addr), 0, size);
+ return phys_to_virt(phys_addr);
+
+out_unmap:
+ dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
+ (unsigned long long)dev->coherent_dma_mask,
+ (unsigned long long)*dma_handle);
+
+ /*
+ * DMA_TO_DEVICE to avoid memcpy in unmap_single.
+ * DMA_ATTR_SKIP_CPU_SYNC is optional.
+ */
+ swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC);
+out_warn:
+ if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) {
+ dev_warn(dev,
+ "swiotlb: coherent allocation failed, size=%zu\n",
+ size);
+ dump_stack();
+ }
+ return NULL;
+}
+
+static bool swiotlb_free_buffer(struct device *dev, size_t size,
+ dma_addr_t dma_addr)
+{
+ phys_addr_t phys_addr = dma_to_phys(dev, dma_addr);
+
+ WARN_ON_ONCE(irqs_disabled());
+
+ if (!is_swiotlb_buffer(phys_addr))
+ return false;
+
+ /*
+ * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single.
+ * DMA_ATTR_SKIP_CPU_SYNC is optional.
+ */
+ swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ return true;
+}
+
+static void
+swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
+ int do_panic)
+{
+ if (swiotlb_force == SWIOTLB_NO_FORCE)
+ return;
+
+ /*
+ * Ran out of IOMMU space for this operation. This is very bad.
+ * Unfortunately the drivers cannot handle this operation properly.
+ * unless they check for dma_mapping_error (most don't)
+ * When the mapping is small enough return a static buffer to limit
+ * the damage, or panic when the transfer is too big.
+ */
+ dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n",
+ size);
+
+ if (size <= io_tlb_overflow || !do_panic)
+ return;
+
+ if (dir == DMA_BIDIRECTIONAL)
+ panic("DMA: Random memory could be DMA accessed\n");
+ if (dir == DMA_FROM_DEVICE)
+ panic("DMA: Random memory could be DMA written\n");
+ if (dir == DMA_TO_DEVICE)
+ panic("DMA: Random memory could be DMA read\n");
+}
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode. The
+ * physical address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed.
+ */
+dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ phys_addr_t map, phys = page_to_phys(page) + offset;
+ dma_addr_t dev_addr = phys_to_dma(dev, phys);
+
+ BUG_ON(dir == DMA_NONE);
+ /*
+ * If the address happens to be in the device's DMA window,
+ * we can safely return the device addr and not worry about bounce
+ * buffering it.
+ */
+ if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE)
+ return dev_addr;
+
+ trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
+
+ /* Oh well, have to allocate and map a bounce buffer. */
+ map = map_single(dev, phys, size, dir, attrs);
+ if (map == SWIOTLB_MAP_ERROR) {
+ swiotlb_full(dev, size, dir, 1);
+ return __phys_to_dma(dev, io_tlb_overflow_buffer);
+ }
+
+ dev_addr = __phys_to_dma(dev, map);
+
+ /* Ensure that the address returned is DMA'ble */
+ if (dma_capable(dev, dev_addr, size))
+ return dev_addr;
+
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
+
+ return __phys_to_dma(dev, io_tlb_overflow_buffer);
+}
+
+/*
+ * Unmap a single streaming mode DMA translation. The dma_addr and size must
+ * match what was provided for in a previous swiotlb_map_page call. All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
+
+ BUG_ON(dir == DMA_NONE);
+
+ if (is_swiotlb_buffer(paddr)) {
+ swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
+ return;
+ }
+
+ if (dir != DMA_FROM_DEVICE)
+ return;
+
+ /*
+ * phys_to_virt doesn't work with hihgmem page but we could
+ * call dma_mark_clean() with hihgmem page here. However, we
+ * are fine since dma_mark_clean() is null on POWERPC. We can
+ * make dma_mark_clean() take a physical address if necessary.
+ */
+ dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ unmap_single(hwdev, dev_addr, size, dir, attrs);
+}
+
+/*
+ * Make physical memory consistent for a single streaming mode DMA translation
+ * after a transfer.
+ *
+ * If you perform a swiotlb_map_page() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to teardown the dma mapping, you must
+ * call this function before doing so. At the next point you give the dma
+ * address back to the card, you must first perform a
+ * swiotlb_dma_sync_for_device, and then the device again owns the buffer
+ */
+static void
+swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ enum dma_sync_target target)
+{
+ phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
+
+ BUG_ON(dir == DMA_NONE);
+
+ if (is_swiotlb_buffer(paddr)) {
+ swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
+ return;
+ }
+
+ if (dir != DMA_FROM_DEVICE)
+ return;
+
+ dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void
+swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
+}
+
+void
+swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
+}
+
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the above swiotlb_map_page
+ * interface. Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length. They are obtained via
+ * sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ * DMA address/length pairs than there are SG table elements.
+ * (for example via virtual mapping capabilities)
+ * The routine returns the number of addr/length pairs actually
+ * used, at most nents.
+ *
+ * Device ownership issues as mentioned above for swiotlb_map_page are the
+ * same here.
+ */
+int
+swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i) {
+ phys_addr_t paddr = sg_phys(sg);
+ dma_addr_t dev_addr = phys_to_dma(hwdev, paddr);
+
+ if (swiotlb_force == SWIOTLB_FORCE ||
+ !dma_capable(hwdev, dev_addr, sg->length)) {
+ phys_addr_t map = map_single(hwdev, sg_phys(sg),
+ sg->length, dir, attrs);
+ if (map == SWIOTLB_MAP_ERROR) {
+ /* Don't panic here, we expect map_sg users
+ to do proper error handling. */
+ swiotlb_full(hwdev, sg->length, dir, 0);
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
+ attrs);
+ sg_dma_len(sgl) = 0;
+ return 0;
+ }
+ sg->dma_address = __phys_to_dma(hwdev, map);
+ } else
+ sg->dma_address = dev_addr;
+ sg_dma_len(sg) = sg->length;
+ }
+ return nelems;
+}
+
+/*
+ * Unmap a set of streaming mode DMA translations. Again, cpu read rules
+ * concerning calls here are the same as for swiotlb_unmap_page() above.
+ */
+void
+swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i)
+ unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir,
+ attrs);
+}
+
+/*
+ * Make physical memory consistent for a set of streaming mode DMA translations
+ * after a transfer.
+ *
+ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
+ * and usage.
+ */
+static void
+swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ enum dma_sync_target target)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nelems, i)
+ swiotlb_sync_single(hwdev, sg->dma_address,
+ sg_dma_len(sg), dir, target);
+}
+
+void
+swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
+}
+
+void
+swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
+}
+
+int
+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
+{
+ return (dma_addr == __phys_to_dma(hwdev, io_tlb_overflow_buffer));
+}
+
+/*
+ * Return whether the given device DMA address mask can be supported
+ * properly. For example, if your device can only drive the low 24-bits
+ * during bus mastering, then you would pass 0x00ffffff as the mask to
+ * this function.
+ */
+int
+swiotlb_dma_supported(struct device *hwdev, u64 mask)
+{
+ return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
+}
+
+void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp, unsigned long attrs)
+{
+ void *vaddr;
+
+ /* temporary workaround: */
+ if (gfp & __GFP_NOWARN)
+ attrs |= DMA_ATTR_NO_WARN;
+
+ /*
+ * Don't print a warning when the first allocation attempt fails.
+ * swiotlb_alloc_coherent() will print a warning when the DMA memory
+ * allocation ultimately failed.
+ */
+ gfp |= __GFP_NOWARN;
+
+ vaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs);
+ if (!vaddr)
+ vaddr = swiotlb_alloc_buffer(dev, size, dma_handle, attrs);
+ return vaddr;
+}
+
+void swiotlb_free(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr, unsigned long attrs)
+{
+ if (!swiotlb_free_buffer(dev, size, dma_addr))
+ dma_direct_free(dev, size, vaddr, dma_addr, attrs);
+}
+
+const struct dma_map_ops swiotlb_dma_ops = {
+ .mapping_error = swiotlb_dma_mapping_error,
+ .alloc = swiotlb_alloc,
+ .free = swiotlb_free,
+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = swiotlb_sync_single_for_device,
+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
+ .map_sg = swiotlb_map_sg_attrs,
+ .unmap_sg = swiotlb_unmap_sg_attrs,
+ .map_page = swiotlb_map_page,
+ .unmap_page = swiotlb_unmap_page,
+ .dma_supported = dma_direct_supported,
+};
diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c
new file mode 100644
index 000000000000..631ddec4b60a
--- /dev/null
+++ b/kernel/dma/virt.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DMA operations that map to virtual addresses without flushing memory.
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
+
+static void *dma_virt_alloc(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp,
+ unsigned long attrs)
+{
+ void *ret;
+
+ ret = (void *)__get_free_pages(gfp, get_order(size));
+ if (ret)
+ *dma_handle = (uintptr_t)ret;
+ return ret;
+}
+
+static void dma_virt_free(struct device *dev, size_t size,
+ void *cpu_addr, dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+ free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return (uintptr_t)(page_address(page) + offset);
+}
+
+static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ int i;
+ struct scatterlist *sg;
+
+ for_each_sg(sgl, sg, nents, i) {
+ BUG_ON(!sg_page(sg));
+ sg_dma_address(sg) = (uintptr_t)sg_virt(sg);
+ sg_dma_len(sg) = sg->length;
+ }
+
+ return nents;
+}
+
+const struct dma_map_ops dma_virt_ops = {
+ .alloc = dma_virt_alloc,
+ .free = dma_virt_free,
+ .map_page = dma_virt_map_page,
+ .map_sg = dma_virt_map_sg,
+};
+EXPORT_SYMBOL(dma_virt_ops);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 08f5e1b42b43..80cca2b30c4f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
return file;
}
+const struct perf_event *perf_get_event(struct file *file)
+{
+ if (file->f_op != &perf_fops)
+ return ERR_PTR(-EINVAL);
+
+ return file->private_data;
+}
+
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
if (!event)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 1d8ca9ea9979..045a37e9ddee 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -614,7 +614,8 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
}
}
- rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
+ rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
+ node);
if (!rb->aux_pages)
return -ENOMEM;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1725b902983f..ccc579a7d32e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1184,7 +1184,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (unlikely(!area))
goto out;
- area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
+ area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
+ GFP_KERNEL);
if (!area->bitmap)
goto free_area;
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 1d5632d8bbcc..5349c91c2298 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -258,7 +258,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
/* cut off if it is too long */
if (count > KSYM_NAME_LEN)
count = KSYM_NAME_LEN;
- buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL);
+ buf = kmalloc(count + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
diff --git a/kernel/fork.c b/kernel/fork.c
index a5d21c42acfc..9440d61b925c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -440,6 +440,14 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
continue;
}
charge = 0;
+ /*
+ * Don't duplicate many vmas if we've been oom-killed (for
+ * example)
+ */
+ if (fatal_signal_pending(current)) {
+ retval = -EINTR;
+ goto out;
+ }
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
@@ -811,7 +819,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
-#ifdef CONFIG_CC_STACKPROTECTOR
+#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
@@ -899,6 +907,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->pinned_vm = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ spin_lock_init(&mm->arg_lock);
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
@@ -1712,7 +1721,7 @@ static __latent_entropy struct task_struct *copy_process(
p->start_time = ktime_get_ns();
p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
- p->audit_context = NULL;
+ audit_set_context(p, NULL);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1899,6 +1908,8 @@ static __latent_entropy struct task_struct *copy_process(
*/
copy_seccomp(p);
+ rseq_fork(p, clone_flags);
+
/*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 1276aabaab55..1e3823fa799b 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -53,23 +53,16 @@ config GCOV_PROFILE_ALL
choice
prompt "Specify GCOV format"
depends on GCOV_KERNEL
- default GCOV_FORMAT_AUTODETECT
---help---
- The gcov format is usually determined by the GCC version, but there are
+ The gcov format is usually determined by the GCC version, and the
+ default is chosen according to your GCC version. However, there are
exceptions where format changes are integrated in lower-version GCCs.
- In such a case use this option to adjust the format used in the kernel
- accordingly.
-
- If unsure, choose "Autodetect".
-
-config GCOV_FORMAT_AUTODETECT
- bool "Autodetect"
- ---help---
- Select this option to use the format that corresponds to your GCC
- version.
+ In such a case, change this option to adjust the format used in the
+ kernel accordingly.
config GCOV_FORMAT_3_4
bool "GCC 3.4 format"
+ depends on CC_IS_GCC && GCC_VERSION < 40700
---help---
Select this option to use the format defined by GCC 3.4.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index c6c50e5c680e..ff06d64df397 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -4,5 +4,3 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
-obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \
- gcc_3_4.o, gcc_4_7.o)
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 751593ed7c0b..32b479468e4d 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -44,6 +44,7 @@ int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
static bool hung_task_show_lock;
+static bool hung_task_call_panic;
static struct task_struct *watchdog_task;
@@ -127,10 +128,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
touch_nmi_watchdog();
if (sysctl_hung_task_panic) {
- if (hung_task_show_lock)
- debug_show_all_locks();
- trigger_all_cpu_backtrace();
- panic("hung_task: blocked tasks");
+ hung_task_show_lock = true;
+ hung_task_call_panic = true;
}
}
@@ -193,6 +192,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
rcu_read_unlock();
if (hung_task_show_lock)
debug_show_all_locks();
+ if (hung_task_call_panic) {
+ trigger_all_cpu_backtrace();
+ panic("hung_task: blocked tasks");
+ }
}
static long hung_timeout_jiffies(unsigned long last_checked,
diff --git a/kernel/iomem.c b/kernel/iomem.c
new file mode 100644
index 000000000000..f7525e14ebc6
--- /dev/null
+++ b/kernel/iomem.c
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/device.h>
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#ifndef ioremap_cache
+/* temporary while we convert existing ioremap_cache users to memremap */
+__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
+{
+ return ioremap(offset, size);
+}
+#endif
+
+#ifndef arch_memremap_wb
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+{
+ return (__force void *)ioremap_cache(offset, size);
+}
+#endif
+
+#ifndef arch_memremap_can_ram_remap
+static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
+ unsigned long flags)
+{
+ return true;
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size,
+ unsigned long flags)
+{
+ unsigned long pfn = PHYS_PFN(offset);
+
+ /* In the simple case just return the existing linear address */
+ if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
+ arch_memremap_can_ram_remap(offset, size, flags))
+ return __va(offset);
+
+ return NULL; /* fallback to arch_memremap_wb */
+}
+
+/**
+ * memremap() - remap an iomem_resource as cacheable memory
+ * @offset: iomem resource start address
+ * @size: size of remap
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
+ * MEMREMAP_ENC, MEMREMAP_DEC
+ *
+ * memremap() is "ioremap" for cases where it is known that the resource
+ * being mapped does not have i/o side effects and the __iomem
+ * annotation is not applicable. In the case of multiple flags, the different
+ * mapping types will be attempted in the order listed below until one of
+ * them succeeds.
+ *
+ * MEMREMAP_WB - matches the default mapping for System RAM on
+ * the architecture. This is usually a read-allocate write-back cache.
+ * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
+ * memremap() will bypass establishing a new mapping and instead return
+ * a pointer into the direct map.
+ *
+ * MEMREMAP_WT - establish a mapping whereby writes either bypass the
+ * cache or are written through to memory and never exist in a
+ * cache-dirty state with respect to program visibility. Attempts to
+ * map System RAM with this mapping type will fail.
+ *
+ * MEMREMAP_WC - establish a writecombine mapping, whereby writes may
+ * be coalesced together (e.g. in the CPU's write buffers), but is otherwise
+ * uncached. Attempts to map System RAM with this mapping type will fail.
+ */
+void *memremap(resource_size_t offset, size_t size, unsigned long flags)
+{
+ int is_ram = region_intersects(offset, size,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+ void *addr = NULL;
+
+ if (!flags)
+ return NULL;
+
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
+ &offset, (unsigned long) size);
+ return NULL;
+ }
+
+ /* Try all mapping types requested until one returns non-NULL */
+ if (flags & MEMREMAP_WB) {
+ /*
+ * MEMREMAP_WB is special in that it can be satisifed
+ * from the direct map. Some archs depend on the
+ * capability of memremap() to autodetect cases where
+ * the requested range is potentially in System RAM.
+ */
+ if (is_ram == REGION_INTERSECTS)
+ addr = try_ram_remap(offset, size, flags);
+ if (!addr)
+ addr = arch_memremap_wb(offset, size);
+ }
+
+ /*
+ * If we don't have a mapping yet and other request flags are
+ * present then we will be attempting to establish a new virtual
+ * address mapping. Enforce that this mapping is not aliasing
+ * System RAM.
+ */
+ if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
+ WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
+ &offset, (unsigned long) size);
+ return NULL;
+ }
+
+ if (!addr && (flags & MEMREMAP_WT))
+ addr = ioremap_wt(offset, size);
+
+ if (!addr && (flags & MEMREMAP_WC))
+ addr = ioremap_wc(offset, size);
+
+ return addr;
+}
+EXPORT_SYMBOL(memremap);
+
+void memunmap(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ iounmap((void __iomem *) addr);
+}
+EXPORT_SYMBOL(memunmap);
+
+static void devm_memremap_release(struct device *dev, void *res)
+{
+ memunmap(*(void **)res);
+}
+
+static int devm_memremap_match(struct device *dev, void *res, void *match_data)
+{
+ return *(void **)res == match_data;
+}
+
+void *devm_memremap(struct device *dev, resource_size_t offset,
+ size_t size, unsigned long flags)
+{
+ void **ptr, *addr;
+
+ ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
+ dev_to_node(dev));
+ if (!ptr)
+ return ERR_PTR(-ENOMEM);
+
+ addr = memremap(offset, size, flags);
+ if (addr) {
+ *ptr = addr;
+ devres_add(dev, ptr);
+ } else {
+ devres_free(ptr);
+ return ERR_PTR(-ENXIO);
+ }
+
+ return addr;
+}
+EXPORT_SYMBOL(devm_memremap);
+
+void devm_memunmap(struct device *dev, void *addr)
+{
+ WARN_ON(devres_release(dev, devm_memremap_release,
+ devm_memremap_match, addr));
+}
+EXPORT_SYMBOL(devm_memunmap);
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 4dadeb3d6666..6f636136cccc 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -55,6 +55,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_SKIP_SET_WAKE),
BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE),
BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
+ BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
};
static void
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e3336d904f64..daeabd791d58 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -24,6 +24,7 @@
#ifdef CONFIG_IRQ_FORCED_THREADING
__read_mostly bool force_irqthreads;
+EXPORT_SYMBOL_GPL(force_irqthreads);
static int __init setup_forced_irqthreads(char *arg)
{
@@ -204,6 +205,39 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
return ret;
}
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline int irq_set_affinity_pending(struct irq_data *data,
+ const struct cpumask *dest)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+
+ irqd_set_move_pending(data);
+ irq_copy_pending(desc, dest);
+ return 0;
+}
+#else
+static inline int irq_set_affinity_pending(struct irq_data *data,
+ const struct cpumask *dest)
+{
+ return -EBUSY;
+}
+#endif
+
+static int irq_try_set_affinity(struct irq_data *data,
+ const struct cpumask *dest, bool force)
+{
+ int ret = irq_do_set_affinity(data, dest, force);
+
+ /*
+ * In case that the underlying vector management is busy and the
+ * architecture supports the generic pending mechanism then utilize
+ * this to avoid returning an error to user space.
+ */
+ if (ret == -EBUSY && !force)
+ ret = irq_set_affinity_pending(data, dest);
+ return ret;
+}
+
int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -214,8 +248,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- if (irq_can_move_pcntxt(data)) {
- ret = irq_do_set_affinity(data, mask, force);
+ if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
+ ret = irq_try_set_affinity(data, mask, force);
} else {
irqd_set_move_pending(data);
irq_copy_pending(desc, mask);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 86ae0eb80b53..def48589ea48 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -38,17 +38,18 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
- struct irq_chip *chip = desc->irq_data.chip;
+ struct irq_data *data = &desc->irq_data;
+ struct irq_chip *chip = data->chip;
- if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
+ if (likely(!irqd_is_setaffinity_pending(data)))
return;
- irqd_clr_move_pending(&desc->irq_data);
+ irqd_clr_move_pending(data);
/*
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
*/
- if (irqd_is_per_cpu(&desc->irq_data)) {
+ if (irqd_is_per_cpu(data)) {
WARN_ON(1);
return;
}
@@ -73,13 +74,24 @@ void irq_move_masked_irq(struct irq_data *idata)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
- irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
-
+ if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) {
+ int ret;
+
+ ret = irq_do_set_affinity(data, desc->pending_mask, false);
+ /*
+ * If the there is a cleanup pending in the underlying
+ * vector management, reschedule the move for the next
+ * interrupt. Leave desc->pending_mask intact.
+ */
+ if (ret == -EBUSY) {
+ irqd_set_move_pending(data);
+ return;
+ }
+ }
cpumask_clear(desc->pending_mask);
}
-void irq_move_irq(struct irq_data *idata)
+void __irq_move_irq(struct irq_data *idata)
{
bool masked;
@@ -90,9 +102,6 @@ void irq_move_irq(struct irq_data *idata)
*/
idata = irq_desc_get_irq_data(irq_data_to_desc(idata));
- if (likely(!irqd_is_setaffinity_pending(idata)))
- return;
-
if (unlikely(irqd_irq_disabled(idata)))
return;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 2c16f1ab5e10..3ebd09efe72a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -58,7 +58,7 @@ struct kcov {
static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
{
- enum kcov_mode mode;
+ unsigned int mode;
/*
* We are interested in code coverage as a function of a syscall inputs,
@@ -241,7 +241,8 @@ static void kcov_put(struct kcov *kcov)
void kcov_task_init(struct task_struct *t)
{
- t->kcov_mode = KCOV_MODE_DISABLED;
+ WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED);
+ barrier();
t->kcov_size = 0;
t->kcov_area = NULL;
t->kcov = NULL;
@@ -323,6 +324,21 @@ static int kcov_close(struct inode *inode, struct file *filep)
return 0;
}
+/*
+ * Fault in a lazily-faulted vmalloc area before it can be used by
+ * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the
+ * vmalloc fault handling path is instrumented.
+ */
+static void kcov_fault_in_area(struct kcov *kcov)
+{
+ unsigned long stride = PAGE_SIZE / sizeof(unsigned long);
+ unsigned long *area = kcov->area;
+ unsigned long offset;
+
+ for (offset = 0; offset < kcov->size; offset += stride)
+ READ_ONCE(area[offset]);
+}
+
static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
unsigned long arg)
{
@@ -371,6 +387,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
#endif
else
return -EINVAL;
+ kcov_fault_in_area(kcov);
/* Cache in task struct for performance. */
t->kcov_size = kcov->size;
t->kcov_area = kcov->area;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 20fef1a38602..23a83a4da38a 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -829,6 +829,8 @@ static int kimage_load_normal_segment(struct kimage *image,
else
buf += mchunk;
mbytes -= mchunk;
+
+ cond_resched();
}
out:
return result;
@@ -893,6 +895,8 @@ static int kimage_load_crash_segment(struct kimage *image,
else
buf += mchunk;
mbytes -= mchunk;
+
+ cond_resched();
}
out:
return result;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 75d8e7cf040e..c6a3b6851372 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -793,7 +793,7 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
* The section headers in kexec_purgatory are read-only. In order to
* have them modifiable make a temporary copy.
*/
- sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ sechdrs = vzalloc(array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum));
if (!sechdrs)
return -ENOMEM;
memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff,
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 6850ffd69125..8402b3349dca 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -913,7 +913,9 @@ static int __init lock_torture_init(void)
/* Initialize the statistics so that each run gets its own numbers. */
if (nwriters_stress) {
lock_is_write_held = 0;
- cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
+ cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress,
+ sizeof(*cxt.lwsa),
+ GFP_KERNEL);
if (cxt.lwsa == NULL) {
VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
firsterr = -ENOMEM;
@@ -942,7 +944,9 @@ static int __init lock_torture_init(void)
if (nreaders_stress) {
lock_is_read_held = 0;
- cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
+ cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
+ sizeof(*cxt.lrsa),
+ GFP_KERNEL);
if (cxt.lrsa == NULL) {
VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
firsterr = -ENOMEM;
@@ -985,7 +989,8 @@ static int __init lock_torture_init(void)
}
if (nwriters_stress) {
- writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
+ writer_tasks = kcalloc(cxt.nrealwriters_stress,
+ sizeof(writer_tasks[0]),
GFP_KERNEL);
if (writer_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
@@ -995,7 +1000,8 @@ static int __init lock_torture_init(void)
}
if (cxt.cur_ops->readlock) {
- reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]),
+ reader_tasks = kcalloc(cxt.nrealreaders_stress,
+ sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 895e6b76b25e..5857267a4af5 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -1,15 +1,5 @@
-/*
- * Copyright(c) 2015 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
#include <linux/radix-tree.h>
#include <linux/device.h>
#include <linux/types.h>
@@ -19,170 +9,8 @@
#include <linux/memory_hotplug.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/wait_bit.h>
-#ifndef ioremap_cache
-/* temporary while we convert existing ioremap_cache users to memremap */
-__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
-{
- return ioremap(offset, size);
-}
-#endif
-
-#ifndef arch_memremap_wb
-static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
-{
- return (__force void *)ioremap_cache(offset, size);
-}
-#endif
-
-#ifndef arch_memremap_can_ram_remap
-static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
- unsigned long flags)
-{
- return true;
-}
-#endif
-
-static void *try_ram_remap(resource_size_t offset, size_t size,
- unsigned long flags)
-{
- unsigned long pfn = PHYS_PFN(offset);
-
- /* In the simple case just return the existing linear address */
- if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
- arch_memremap_can_ram_remap(offset, size, flags))
- return __va(offset);
-
- return NULL; /* fallback to arch_memremap_wb */
-}
-
-/**
- * memremap() - remap an iomem_resource as cacheable memory
- * @offset: iomem resource start address
- * @size: size of remap
- * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
- * MEMREMAP_ENC, MEMREMAP_DEC
- *
- * memremap() is "ioremap" for cases where it is known that the resource
- * being mapped does not have i/o side effects and the __iomem
- * annotation is not applicable. In the case of multiple flags, the different
- * mapping types will be attempted in the order listed below until one of
- * them succeeds.
- *
- * MEMREMAP_WB - matches the default mapping for System RAM on
- * the architecture. This is usually a read-allocate write-back cache.
- * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
- * memremap() will bypass establishing a new mapping and instead return
- * a pointer into the direct map.
- *
- * MEMREMAP_WT - establish a mapping whereby writes either bypass the
- * cache or are written through to memory and never exist in a
- * cache-dirty state with respect to program visibility. Attempts to
- * map System RAM with this mapping type will fail.
- *
- * MEMREMAP_WC - establish a writecombine mapping, whereby writes may
- * be coalesced together (e.g. in the CPU's write buffers), but is otherwise
- * uncached. Attempts to map System RAM with this mapping type will fail.
- */
-void *memremap(resource_size_t offset, size_t size, unsigned long flags)
-{
- int is_ram = region_intersects(offset, size,
- IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
- void *addr = NULL;
-
- if (!flags)
- return NULL;
-
- if (is_ram == REGION_MIXED) {
- WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
- &offset, (unsigned long) size);
- return NULL;
- }
-
- /* Try all mapping types requested until one returns non-NULL */
- if (flags & MEMREMAP_WB) {
- /*
- * MEMREMAP_WB is special in that it can be satisifed
- * from the direct map. Some archs depend on the
- * capability of memremap() to autodetect cases where
- * the requested range is potentially in System RAM.
- */
- if (is_ram == REGION_INTERSECTS)
- addr = try_ram_remap(offset, size, flags);
- if (!addr)
- addr = arch_memremap_wb(offset, size);
- }
-
- /*
- * If we don't have a mapping yet and other request flags are
- * present then we will be attempting to establish a new virtual
- * address mapping. Enforce that this mapping is not aliasing
- * System RAM.
- */
- if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
- WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
- &offset, (unsigned long) size);
- return NULL;
- }
-
- if (!addr && (flags & MEMREMAP_WT))
- addr = ioremap_wt(offset, size);
-
- if (!addr && (flags & MEMREMAP_WC))
- addr = ioremap_wc(offset, size);
-
- return addr;
-}
-EXPORT_SYMBOL(memremap);
-
-void memunmap(void *addr)
-{
- if (is_vmalloc_addr(addr))
- iounmap((void __iomem *) addr);
-}
-EXPORT_SYMBOL(memunmap);
-
-static void devm_memremap_release(struct device *dev, void *res)
-{
- memunmap(*(void **)res);
-}
-
-static int devm_memremap_match(struct device *dev, void *res, void *match_data)
-{
- return *(void **)res == match_data;
-}
-
-void *devm_memremap(struct device *dev, resource_size_t offset,
- size_t size, unsigned long flags)
-{
- void **ptr, *addr;
-
- ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
- dev_to_node(dev));
- if (!ptr)
- return ERR_PTR(-ENOMEM);
-
- addr = memremap(offset, size, flags);
- if (addr) {
- *ptr = addr;
- devres_add(dev, ptr);
- } else {
- devres_free(ptr);
- return ERR_PTR(-ENXIO);
- }
-
- return addr;
-}
-EXPORT_SYMBOL(devm_memremap);
-
-void devm_memunmap(struct device *dev, void *addr)
-{
- WARN_ON(devres_release(dev, devm_memremap_release,
- devm_memremap_match, addr));
-}
-EXPORT_SYMBOL(devm_memunmap);
-
-#ifdef CONFIG_ZONE_DEVICE
static DEFINE_MUTEX(pgmap_lock);
static RADIX_TREE(pgmap_radix, GFP_KERNEL);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
@@ -473,10 +301,32 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
return pgmap;
}
-#endif /* CONFIG_ZONE_DEVICE */
+EXPORT_SYMBOL_GPL(get_dev_pagemap);
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
+EXPORT_SYMBOL_GPL(devmap_managed_key);
+static atomic_t devmap_enable;
+
+/*
+ * Toggle the static key for ->page_free() callbacks when dev_pagemap
+ * pages go idle.
+ */
+void dev_pagemap_get_ops(void)
+{
+ if (atomic_inc_return(&devmap_enable) == 1)
+ static_branch_enable(&devmap_managed_key);
+}
+EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
+
+void dev_pagemap_put_ops(void)
+{
+ if (atomic_dec_and_test(&devmap_enable))
+ static_branch_disable(&devmap_managed_key);
+}
+EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
-void put_zone_device_private_or_public_page(struct page *page)
+void __put_devmap_managed_page(struct page *page)
{
int count = page_ref_dec_return(page);
@@ -496,5 +346,5 @@ void put_zone_device_private_or_public_page(struct page *page)
} else if (!count)
__put_page(page);
}
-EXPORT_SYMBOL(put_zone_device_private_or_public_page);
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+EXPORT_SYMBOL_GPL(__put_devmap_managed_page);
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --git a/kernel/module.c b/kernel/module.c
index c9bea7f2b43e..f475f30eed8c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -274,9 +274,7 @@ static void module_assert_mutex_or_preempt(void)
}
static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
-#ifndef CONFIG_MODULE_SIG_FORCE
module_param(sig_enforce, bool_enable_only, 0644);
-#endif /* !CONFIG_MODULE_SIG_FORCE */
/*
* Export sig_enforce kernel cmdline parameter to allow other subsystems rely
@@ -1604,8 +1602,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info)
if (notes == 0)
return;
- notes_attrs = kzalloc(sizeof(*notes_attrs)
- + notes * sizeof(notes_attrs->attrs[0]),
+ notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
GFP_KERNEL);
if (notes_attrs == NULL)
return;
@@ -2786,7 +2783,7 @@ static int module_sig_check(struct load_info *info, int flags)
}
/* Not having a signature is only an error if we're strict. */
- if (err == -ENOKEY && !sig_enforce)
+ if (err == -ENOKEY && !is_module_sig_enforced())
err = 0;
return err;
diff --git a/kernel/panic.c b/kernel/panic.c
index 42e487488554..8b2e002d52eb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -623,7 +623,7 @@ static __init int register_warn_debugfs(void)
device_initcall(register_warn_debugfs);
#endif
-#ifdef CONFIG_CC_STACKPROTECTOR
+#ifdef CONFIG_STACKPROTECTOR
/*
* Called when gcc's -fstack-protector feature is used, and
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 5454cc639a8d..9c85c7822383 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -287,6 +287,8 @@ static int create_image(int platform_mode)
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
+
error = syscore_suspend();
if (error) {
pr_err("Some system devices failed to power down, aborting hibernation\n");
@@ -317,6 +319,7 @@ static int create_image(int platform_mode)
syscore_resume();
Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
@@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode)
goto Enable_cpus;
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
error = syscore_suspend();
if (error)
@@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode)
syscore_resume();
Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
@@ -563,6 +568,7 @@ int hibernation_platform_enter(void)
goto Enable_cpus;
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
syscore_suspend();
if (pm_wakeup_pending()) {
error = -EAGAIN;
@@ -575,6 +581,7 @@ int hibernation_platform_enter(void)
Power_up:
syscore_resume();
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 705c2366dafe..d9706da10930 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -455,8 +455,9 @@ struct kobject *power_kobj;
* state - control system sleep states.
*
* show() returns available sleep state labels, which may be "mem", "standby",
- * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
- * description of what they mean.
+ * "freeze" and "disk" (hibernation).
+ * See Documentation/admin-guide/pm/sleep-states.rst for a description of
+ * what they mean.
*
* store() accepts one of those strings, translates it into the proper
* enumerated value, and initiates a suspend transition.
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index fa39092b7aea..86d72ffb811b 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -184,7 +184,6 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
c->target_value = value;
}
-static inline int pm_qos_get_value(struct pm_qos_constraints *c);
static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
{
struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4c10be0f4843..87331565e505 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
+#include <linux/swait.h>
#include <linux/ftrace.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
@@ -57,10 +58,10 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_s2idle_ops *s2idle_ops;
-static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
+static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
enum s2idle_states __read_mostly s2idle_state;
-static DEFINE_SPINLOCK(s2idle_lock);
+static DEFINE_RAW_SPINLOCK(s2idle_lock);
void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
@@ -78,12 +79,12 @@ static void s2idle_enter(void)
{
trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
- spin_lock_irq(&s2idle_lock);
+ raw_spin_lock_irq(&s2idle_lock);
if (pm_wakeup_pending())
goto out;
s2idle_state = S2IDLE_STATE_ENTER;
- spin_unlock_irq(&s2idle_lock);
+ raw_spin_unlock_irq(&s2idle_lock);
get_online_cpus();
cpuidle_resume();
@@ -91,17 +92,17 @@ static void s2idle_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
- wait_event(s2idle_wait_head,
- s2idle_state == S2IDLE_STATE_WAKE);
+ swait_event(s2idle_wait_head,
+ s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
put_online_cpus();
- spin_lock_irq(&s2idle_lock);
+ raw_spin_lock_irq(&s2idle_lock);
out:
s2idle_state = S2IDLE_STATE_NONE;
- spin_unlock_irq(&s2idle_lock);
+ raw_spin_unlock_irq(&s2idle_lock);
trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false);
}
@@ -156,12 +157,12 @@ void s2idle_wake(void)
{
unsigned long flags;
- spin_lock_irqsave(&s2idle_lock, flags);
+ raw_spin_lock_irqsave(&s2idle_lock, flags);
if (s2idle_state > S2IDLE_STATE_NONE) {
s2idle_state = S2IDLE_STATE_WAKE;
- wake_up(&s2idle_wait_head);
+ swake_up(&s2idle_wait_head);
}
- spin_unlock_irqrestore(&s2idle_lock, flags);
+ raw_spin_unlock_irqrestore(&s2idle_lock, flags);
}
EXPORT_SYMBOL_GPL(s2idle_wake);
@@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
+ system_state = SYSTEM_SUSPEND;
+
error = syscore_suspend();
if (!error) {
*wakeup = pm_wakeup_pending();
@@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
syscore_resume();
}
+ system_state = SYSTEM_RUNNING;
+
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1efcb5b0c3ed..c2bcf97d24c8 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -698,7 +698,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
- data = vmalloc(sizeof(*data) * nr_threads);
+ data = vmalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
@@ -1183,14 +1183,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
+ page = vmalloc(array_size(LZO_MAX_RD_PAGES, sizeof(*page)));
if (!page) {
pr_err("Failed to allocate LZO page\n");
ret = -ENOMEM;
goto out_clean;
}
- data = vmalloc(sizeof(*data) * nr_threads);
+ data = vmalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 75c959de4b29..abd225550271 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -186,6 +186,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
res = PAGE_SIZE - pg_offp;
}
+ if (!data_of(data->handle)) {
+ res = -EINVAL;
+ goto unlock;
+ }
+
res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
buf, count);
if (res > 0)
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index dfba59be190b..4210152e56f0 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -188,6 +188,7 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
return ERR_PTR(-ENOMEM);
}
wl->ws.name = wl->name;
+ wl->ws.last_time = ktime_get();
wakeup_source_add(&wl->ws);
rb_link_node(&wl->node, parent, node);
rb_insert_color(&wl->node, &wakelocks_tree);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2f4af216bd6e..247808333ba4 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1908,6 +1908,7 @@ asmlinkage int vprintk_emit(int facility, int level,
preempt_enable();
}
+ wake_up_klogd();
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -2289,9 +2290,7 @@ void console_unlock(void)
{
static char ext_text[CONSOLE_EXT_LOG_MAX];
static char text[LOG_LINE_MAX + PREFIX_MAX];
- static u64 seen_seq;
unsigned long flags;
- bool wake_klogd = false;
bool do_cond_resched, retry;
if (console_suspended) {
@@ -2335,11 +2334,6 @@ again:
printk_safe_enter_irqsave(flags);
raw_spin_lock(&logbuf_lock);
- if (seen_seq != log_next_seq) {
- wake_klogd = true;
- seen_seq = log_next_seq;
- }
-
if (console_seq < log_first_seq) {
len = sprintf(text, "** %u printk messages dropped **\n",
(unsigned)(log_first_seq - console_seq));
@@ -2397,7 +2391,7 @@ skip:
if (console_lock_spinning_disable_and_check()) {
printk_safe_exit_irqrestore(flags);
- goto out;
+ return;
}
printk_safe_exit_irqrestore(flags);
@@ -2429,10 +2423,6 @@ skip:
if (retry && console_trylock())
goto again;
-
-out:
- if (wake_klogd)
- wake_up_klogd();
}
EXPORT_SYMBOL(console_unlock);
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3e3c2004bb23..d7d091309054 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -82,6 +82,7 @@ static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
{
int add;
size_t len;
+ va_list ap;
again:
len = atomic_read(&s->len);
@@ -100,7 +101,9 @@ again:
if (!len)
smp_rmb();
- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+ va_copy(ap, args);
+ add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
+ va_end(ap);
if (!add)
return 0;
@@ -278,7 +281,7 @@ void printk_safe_flush_on_panic(void)
* Make sure that we could access the main ring buffer.
* Do not risk a double release when more CPUs are up.
*/
- if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) {
+ if (raw_spin_is_locked(&logbuf_lock)) {
if (num_online_cpus() > 1)
return;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index e628fcfd1bde..42fcb7f05fac 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -831,8 +831,9 @@ rcu_torture_cbflood(void *arg)
cbflood_intra_holdoff > 0 &&
cur_ops->call &&
cur_ops->cb_barrier) {
- rhp = vmalloc(sizeof(*rhp) *
- cbflood_n_burst * cbflood_n_per_burst);
+ rhp = vmalloc(array3_size(cbflood_n_burst,
+ cbflood_n_per_burst,
+ sizeof(*rhp)));
err = !rhp;
}
if (err) {
diff --git a/kernel/relay.c b/kernel/relay.c
index c955b10c973c..04f248644e06 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -39,7 +39,7 @@ static void relay_file_mmap_close(struct vm_area_struct *vma)
/*
* fault() vm_op implementation for relay file mapping.
*/
-static int relay_buf_fault(struct vm_fault *vmf)
+static vm_fault_t relay_buf_fault(struct vm_fault *vmf)
{
struct page *page;
struct rchan_buf *buf = vmf->vma->vm_private_data;
@@ -169,7 +169,8 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan)
buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
- buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
+ buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *),
+ GFP_KERNEL);
if (!buf->padding)
goto free_buf;
diff --git a/kernel/resource.c b/kernel/resource.c
index b589dda910b3..30e1bc68503b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -415,6 +415,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
return __walk_iomem_res_desc(&res, desc, false, arg, func);
}
+EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
/*
* This function calls the @func callback against all memory ranges of type
diff --git a/kernel/rseq.c b/kernel/rseq.c
new file mode 100644
index 000000000000..ae306f90c514
--- /dev/null
+++ b/kernel/rseq.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Restartable sequences system call
+ *
+ * Copyright (C) 2015, Google, Inc.,
+ * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
+ * Copyright (C) 2015-2018, EfficiOS Inc.,
+ * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/rseq.h>
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rseq.h>
+
+#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
+
+/*
+ *
+ * Restartable sequences are a lightweight interface that allows
+ * user-level code to be executed atomically relative to scheduler
+ * preemption and signal delivery. Typically used for implementing
+ * per-cpu operations.
+ *
+ * It allows user-space to perform update operations on per-cpu data
+ * without requiring heavy-weight atomic operations.
+ *
+ * Detailed algorithm of rseq user-space assembly sequences:
+ *
+ * init(rseq_cs)
+ * cpu = TLS->rseq::cpu_id_start
+ * [1] TLS->rseq::rseq_cs = rseq_cs
+ * [start_ip] ----------------------------
+ * [2] if (cpu != TLS->rseq::cpu_id)
+ * goto abort_ip;
+ * [3] <last_instruction_in_cs>
+ * [post_commit_ip] ----------------------------
+ *
+ * The address of jump target abort_ip must be outside the critical
+ * region, i.e.:
+ *
+ * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip]
+ *
+ * Steps [2]-[3] (inclusive) need to be a sequence of instructions in
+ * userspace that can handle being interrupted between any of those
+ * instructions, and then resumed to the abort_ip.
+ *
+ * 1. Userspace stores the address of the struct rseq_cs assembly
+ * block descriptor into the rseq_cs field of the registered
+ * struct rseq TLS area. This update is performed through a single
+ * store within the inline assembly instruction sequence.
+ * [start_ip]
+ *
+ * 2. Userspace tests to check whether the current cpu_id field match
+ * the cpu number loaded before start_ip, branching to abort_ip
+ * in case of a mismatch.
+ *
+ * If the sequence is preempted or interrupted by a signal
+ * at or after start_ip and before post_commit_ip, then the kernel
+ * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
+ * ip to abort_ip before returning to user-space, so the preempted
+ * execution resumes at abort_ip.
+ *
+ * 3. Userspace critical section final instruction before
+ * post_commit_ip is the commit. The critical section is
+ * self-terminating.
+ * [post_commit_ip]
+ *
+ * 4. <success>
+ *
+ * On failure at [2], or if interrupted by preempt or signal delivery
+ * between [1] and [3]:
+ *
+ * [abort_ip]
+ * F1. <failure>
+ */
+
+static int rseq_update_cpu_id(struct task_struct *t)
+{
+ u32 cpu_id = raw_smp_processor_id();
+
+ if (__put_user(cpu_id, &t->rseq->cpu_id_start))
+ return -EFAULT;
+ if (__put_user(cpu_id, &t->rseq->cpu_id))
+ return -EFAULT;
+ trace_rseq_update(t);
+ return 0;
+}
+
+static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+{
+ u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+
+ /*
+ * Reset cpu_id_start to its initial state (0).
+ */
+ if (__put_user(cpu_id_start, &t->rseq->cpu_id_start))
+ return -EFAULT;
+ /*
+ * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
+ * in after unregistration can figure out that rseq needs to be
+ * registered again.
+ */
+ if (__put_user(cpu_id, &t->rseq->cpu_id))
+ return -EFAULT;
+ return 0;
+}
+
+static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
+{
+ struct rseq_cs __user *urseq_cs;
+ unsigned long ptr;
+ u32 __user *usig;
+ u32 sig;
+ int ret;
+
+ ret = __get_user(ptr, &t->rseq->rseq_cs);
+ if (ret)
+ return ret;
+ if (!ptr) {
+ memset(rseq_cs, 0, sizeof(*rseq_cs));
+ return 0;
+ }
+ urseq_cs = (struct rseq_cs __user *)ptr;
+ if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
+ return -EFAULT;
+ if (rseq_cs->version > 0)
+ return -EINVAL;
+
+ /* Ensure that abort_ip is not in the critical section. */
+ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
+ return -EINVAL;
+
+ usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32));
+ ret = get_user(sig, usig);
+ if (ret)
+ return ret;
+
+ if (current->rseq_sig != sig) {
+ printk_ratelimited(KERN_WARNING
+ "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
+ sig, current->rseq_sig, current->pid, usig);
+ return -EPERM;
+ }
+ return 0;
+}
+
+static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
+{
+ u32 flags, event_mask;
+ int ret;
+
+ /* Get thread flags. */
+ ret = __get_user(flags, &t->rseq->flags);
+ if (ret)
+ return ret;
+
+ /* Take critical section flags into account. */
+ flags |= cs_flags;
+
+ /*
+ * Restart on signal can only be inhibited when restart on
+ * preempt and restart on migrate are inhibited too. Otherwise,
+ * a preempted signal handler could fail to restart the prior
+ * execution context on sigreturn.
+ */
+ if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
+ (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
+ RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
+ return -EINVAL;
+
+ /*
+ * Load and clear event mask atomically with respect to
+ * scheduler preemption.
+ */
+ preempt_disable();
+ event_mask = t->rseq_event_mask;
+ t->rseq_event_mask = 0;
+ preempt_enable();
+
+ return !!(event_mask & ~flags);
+}
+
+static int clear_rseq_cs(struct task_struct *t)
+{
+ /*
+ * The rseq_cs field is set to NULL on preemption or signal
+ * delivery on top of rseq assembly block, as well as on top
+ * of code outside of the rseq assembly block. This performs
+ * a lazy clear of the rseq_cs field.
+ *
+ * Set rseq_cs to NULL with single-copy atomicity.
+ */
+ return __put_user(0UL, &t->rseq->rseq_cs);
+}
+
+/*
+ * Unsigned comparison will be true when ip >= start_ip, and when
+ * ip < start_ip + post_commit_offset.
+ */
+static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
+{
+ return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
+}
+
+static int rseq_ip_fixup(struct pt_regs *regs)
+{
+ unsigned long ip = instruction_pointer(regs);
+ struct task_struct *t = current;
+ struct rseq_cs rseq_cs;
+ int ret;
+
+ ret = rseq_get_rseq_cs(t, &rseq_cs);
+ if (ret)
+ return ret;
+
+ /*
+ * Handle potentially not being within a critical section.
+ * If not nested over a rseq critical section, restart is useless.
+ * Clear the rseq_cs pointer and return.
+ */
+ if (!in_rseq_cs(ip, &rseq_cs))
+ return clear_rseq_cs(t);
+ ret = rseq_need_restart(t, rseq_cs.flags);
+ if (ret <= 0)
+ return ret;
+ ret = clear_rseq_cs(t);
+ if (ret)
+ return ret;
+ trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
+ rseq_cs.abort_ip);
+ instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
+ return 0;
+}
+
+/*
+ * This resume handler must always be executed between any of:
+ * - preemption,
+ * - signal delivery,
+ * and return to user-space.
+ *
+ * This is how we can ensure that the entire rseq critical section,
+ * consisting of both the C part and the assembly instruction sequence,
+ * will issue the commit instruction only if executed atomically with
+ * respect to other threads scheduled on the same CPU, and with respect
+ * to signal handlers.
+ */
+void __rseq_handle_notify_resume(struct pt_regs *regs)
+{
+ struct task_struct *t = current;
+ int ret;
+
+ if (unlikely(t->flags & PF_EXITING))
+ return;
+ if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq))))
+ goto error;
+ ret = rseq_ip_fixup(regs);
+ if (unlikely(ret < 0))
+ goto error;
+ if (unlikely(rseq_update_cpu_id(t)))
+ goto error;
+ return;
+
+error:
+ force_sig(SIGSEGV, t);
+}
+
+#ifdef CONFIG_DEBUG_RSEQ
+
+/*
+ * Terminate the process if a syscall is issued within a restartable
+ * sequence.
+ */
+void rseq_syscall(struct pt_regs *regs)
+{
+ unsigned long ip = instruction_pointer(regs);
+ struct task_struct *t = current;
+ struct rseq_cs rseq_cs;
+
+ if (!t->rseq)
+ return;
+ if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) ||
+ rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
+ force_sig(SIGSEGV, t);
+}
+
+#endif
+
+/*
+ * sys_rseq - setup restartable sequences for caller thread.
+ */
+SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
+ int, flags, u32, sig)
+{
+ int ret;
+
+ if (flags & RSEQ_FLAG_UNREGISTER) {
+ /* Unregister rseq for current thread. */
+ if (current->rseq != rseq || !current->rseq)
+ return -EINVAL;
+ if (current->rseq_len != rseq_len)
+ return -EINVAL;
+ if (current->rseq_sig != sig)
+ return -EPERM;
+ ret = rseq_reset_rseq_cpu_id(current);
+ if (ret)
+ return ret;
+ current->rseq = NULL;
+ current->rseq_len = 0;
+ current->rseq_sig = 0;
+ return 0;
+ }
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (current->rseq) {
+ /*
+ * If rseq is already registered, check whether
+ * the provided address differs from the prior
+ * one.
+ */
+ if (current->rseq != rseq || current->rseq_len != rseq_len)
+ return -EINVAL;
+ if (current->rseq_sig != sig)
+ return -EPERM;
+ /* Already registered. */
+ return -EBUSY;
+ }
+
+ /*
+ * If there was no rseq previously registered,
+ * ensure the provided rseq is properly aligned and valid.
+ */
+ if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+ rseq_len != sizeof(*rseq))
+ return -EINVAL;
+ if (!access_ok(VERIFY_WRITE, rseq, rseq_len))
+ return -EFAULT;
+ current->rseq = rseq;
+ current->rseq_len = rseq_len;
+ current->rseq_sig = sig;
+ /*
+ * If rseq was previously inactive, and has just been
+ * registered, ensure the cpu_id_start and cpu_id fields
+ * are updated before returning to user-space.
+ */
+ rseq_set_notify_resume(current);
+
+ return 0;
+}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9866f86f304..78d8facba456 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10,6 +10,8 @@
#include <linux/kthread.h>
#include <linux/nospec.h>
+#include <linux/kcov.h>
+
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -1191,6 +1193,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p);
p->se.nr_migrations++;
+ rseq_migrate(p);
perf_event_task_migrate(p);
}
@@ -2632,8 +2635,10 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
+ kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
+ rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
prepare_task(next);
prepare_arch_switch(next);
@@ -2700,6 +2705,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
finish_task(prev);
finish_lock_switch(rq);
finish_arch_post_lock_switch();
+ kcov_finish_switch(current);
fire_sched_in_preempt_notifiers(current);
/*
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 28592b62b1d5..3cde46483f0a 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -51,7 +51,7 @@ struct sugov_cpu {
bool iowait_boost_pending;
unsigned int iowait_boost;
unsigned int iowait_boost_max;
- u64 last_update;
+ u64 last_update;
/* The fields below are only needed when sharing a policy: */
unsigned long util_cfs;
@@ -89,46 +89,52 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
* schedule the kthread.
*/
if (sg_policy->policy->fast_switch_enabled &&
- !cpufreq_can_do_remote_dvfs(sg_policy->policy))
+ !cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (sg_policy->work_in_progress)
- return false;
-
- if (unlikely(sg_policy->need_freq_update)) {
- sg_policy->need_freq_update = false;
- /*
- * This happens when limits change, so forget the previous
- * next_freq value and force an update.
- */
- sg_policy->next_freq = UINT_MAX;
+ if (unlikely(sg_policy->need_freq_update))
return true;
- }
delta_ns = time - sg_policy->last_freq_update_time;
return delta_ns >= sg_policy->freq_update_delay_ns;
}
-static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
+static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
{
- struct cpufreq_policy *policy = sg_policy->policy;
-
if (sg_policy->next_freq == next_freq)
- return;
+ return false;
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
- if (policy->fast_switch_enabled) {
- next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (!next_freq)
- return;
+ return true;
+}
- policy->cur = next_freq;
- trace_cpu_frequency(next_freq, smp_processor_id());
- } else {
+static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+
+ if (!sugov_update_next_freq(sg_policy, time, next_freq))
+ return;
+
+ next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+ if (!next_freq)
+ return;
+
+ policy->cur = next_freq;
+ trace_cpu_frequency(next_freq, smp_processor_id());
+}
+
+static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ if (!sugov_update_next_freq(sg_policy, time, next_freq))
+ return;
+
+ if (!sg_policy->work_in_progress) {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -165,8 +171,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
freq = (freq + (freq >> 2)) * util / max;
- if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
+
+ sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
@@ -200,43 +208,120 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
}
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
+/**
+ * sugov_iowait_reset() - Reset the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @set_iowait_boost: true if an IO boost has been requested
+ *
+ * The IO wait boost of a task is disabled after a tick since the last update
+ * of a CPU. If a new IO wait boost is requested after more then a tick, then
+ * we enable the boost starting from the minimum frequency, which improves
+ * energy efficiency by ignoring sporadic wakeups from IO.
+ */
+static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
+ bool set_iowait_boost)
{
- if (flags & SCHED_CPUFREQ_IOWAIT) {
- if (sg_cpu->iowait_boost_pending)
- return;
+ s64 delta_ns = time - sg_cpu->last_update;
- sg_cpu->iowait_boost_pending = true;
+ /* Reset boost only if a tick has elapsed since last request */
+ if (delta_ns <= TICK_NSEC)
+ return false;
- if (sg_cpu->iowait_boost) {
- sg_cpu->iowait_boost <<= 1;
- if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
- } else {
- sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
- }
- } else if (sg_cpu->iowait_boost) {
- s64 delta_ns = time - sg_cpu->last_update;
+ sg_cpu->iowait_boost = set_iowait_boost
+ ? sg_cpu->sg_policy->policy->min : 0;
+ sg_cpu->iowait_boost_pending = set_iowait_boost;
- /* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC) {
- sg_cpu->iowait_boost = 0;
- sg_cpu->iowait_boost_pending = false;
- }
+ return true;
+}
+
+/**
+ * sugov_iowait_boost() - Updates the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
+ *
+ * Each time a task wakes up after an IO operation, the CPU utilization can be
+ * boosted to a certain utilization which doubles at each "frequent and
+ * successive" wakeup from IO, ranging from the utilization of the minimum
+ * OPP to the utilization of the maximum OPP.
+ * To keep doubling, an IO boost has to be requested at least once per tick,
+ * otherwise we restart from the utilization of the minimum OPP.
+ */
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
+
+ /* Reset boost if the CPU appears to have been idle enough */
+ if (sg_cpu->iowait_boost &&
+ sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
+ return;
+
+ /* Boost only tasks waking up after IO */
+ if (!set_iowait_boost)
+ return;
+
+ /* Ensure boost doubles only one time at each request */
+ if (sg_cpu->iowait_boost_pending)
+ return;
+ sg_cpu->iowait_boost_pending = true;
+
+ /* Double the boost at each request */
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ return;
}
+
+ /* First wakeup after IO: start with minimum boost */
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
}
-static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
- unsigned long *max)
+/**
+ * sugov_iowait_apply() - Apply the IO boost to a CPU.
+ * @sg_cpu: the sugov data for the cpu to boost
+ * @time: the update time from the caller
+ * @util: the utilization to (eventually) boost
+ * @max: the maximum value the utilization can be boosted to
+ *
+ * A CPU running a task which woken up after an IO operation can have its
+ * utilization boosted to speed up the completion of those IO operations.
+ * The IO boost value is increased each time a task wakes up from IO, in
+ * sugov_iowait_apply(), and it's instead decreased by this function,
+ * each time an increase has not been requested (!iowait_boost_pending).
+ *
+ * A CPU which also appears to have been idle for at least one tick has also
+ * its IO boost utilization reset.
+ *
+ * This mechanism is designed to boost high frequently IO waiting tasks, while
+ * being more conservative on tasks which does sporadic IO operations.
+ */
+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long *util, unsigned long *max)
{
unsigned int boost_util, boost_max;
+ /* No boost currently required */
if (!sg_cpu->iowait_boost)
return;
+ /* Reset boost if the CPU appears to have been idle enough */
+ if (sugov_iowait_reset(sg_cpu, time, false))
+ return;
+
+ /*
+ * An IO waiting task has just woken up:
+ * allow to further double the boost value
+ */
if (sg_cpu->iowait_boost_pending) {
sg_cpu->iowait_boost_pending = false;
} else {
+ /*
+ * Otherwise: reduce the boost value and disable it when we
+ * reach the minimum.
+ */
sg_cpu->iowait_boost >>= 1;
if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
sg_cpu->iowait_boost = 0;
@@ -244,9 +329,12 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
}
}
+ /*
+ * Apply the current boost value: a CPU is boosted only if its current
+ * utilization is smaller then the current IO boost level.
+ */
boost_util = sg_cpu->iowait_boost;
boost_max = sg_cpu->iowait_boost_max;
-
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
@@ -285,7 +373,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int next_f;
bool busy;
- sugov_set_iowait_boost(sg_cpu, time, flags);
+ sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
ignore_dl_rate_limit(sg_cpu, sg_policy);
@@ -298,21 +386,31 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
sugov_get_util(sg_cpu);
max = sg_cpu->max;
util = sugov_aggregate_util(sg_cpu);
- sugov_iowait_boost(sg_cpu, &util, &max);
+ sugov_iowait_apply(sg_cpu, time, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
- if (busy && next_f < sg_policy->next_freq &&
- sg_policy->next_freq != UINT_MAX) {
+ if (busy && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
/* Reset cached freq as next_freq has changed */
sg_policy->cached_raw_freq = 0;
}
- sugov_update_commit(sg_policy, time, next_f);
+ /*
+ * This code runs under rq->lock for the target CPU, so it won't run
+ * concurrently on two different CPUs for the same target and it is not
+ * necessary to acquire the lock in the fast switch case.
+ */
+ if (sg_policy->policy->fast_switch_enabled) {
+ sugov_fast_switch(sg_policy, time, next_f);
+ } else {
+ raw_spin_lock(&sg_policy->update_lock);
+ sugov_deferred_update(sg_policy, time, next_f);
+ raw_spin_unlock(&sg_policy->update_lock);
+ }
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
@@ -325,28 +423,12 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
- s64 delta_ns;
sugov_get_util(j_sg_cpu);
-
- /*
- * If the CFS CPU utilization was last updated before the
- * previous frequency update and the time elapsed between the
- * last update of the CPU utilization and the last frequency
- * update is long enough, reset iowait_boost and util_cfs, as
- * they are now probably stale. However, still consider the
- * CPU contribution if it has some DEADLINE utilization
- * (util_dl).
- */
- delta_ns = time - j_sg_cpu->last_update;
- if (delta_ns > TICK_NSEC) {
- j_sg_cpu->iowait_boost = 0;
- j_sg_cpu->iowait_boost_pending = false;
- }
-
j_max = j_sg_cpu->max;
j_util = sugov_aggregate_util(j_sg_cpu);
- sugov_iowait_boost(j_sg_cpu, &j_util, &j_max);
+ sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
+
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
@@ -365,14 +447,18 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
raw_spin_lock(&sg_policy->update_lock);
- sugov_set_iowait_boost(sg_cpu, time, flags);
+ sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
ignore_dl_rate_limit(sg_cpu, sg_policy);
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
- sugov_update_commit(sg_policy, time, next_f);
+
+ if (sg_policy->policy->fast_switch_enabled)
+ sugov_fast_switch(sg_policy, time, next_f);
+ else
+ sugov_deferred_update(sg_policy, time, next_f);
}
raw_spin_unlock(&sg_policy->update_lock);
@@ -381,13 +467,27 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+ unsigned int freq;
+ unsigned long flags;
+
+ /*
+ * Hold sg_policy->update_lock shortly to handle the case where:
+ * incase sg_policy->next_freq is read here, and then updated by
+ * sugov_deferred_update() just before work_in_progress is set to false
+ * here, we may miss queueing the new update.
+ *
+ * Note: If a work was queued after the update_lock is released,
+ * sugov_work() will just be called again by kthread_work code; and the
+ * request will be proceed before the sugov thread sleeps.
+ */
+ raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
+ freq = sg_policy->next_freq;
+ sg_policy->work_in_progress = false;
+ raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
mutex_lock(&sg_policy->work_lock);
- __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
- CPUFREQ_RELATION_L);
+ __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
mutex_unlock(&sg_policy->work_lock);
-
- sg_policy->work_in_progress = false;
}
static void sugov_irq_work(struct irq_work *irq_work)
@@ -510,11 +610,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
sg_policy->thread = thread;
-
- /* Kthread is bound to all CPUs by default */
- if (!policy->dvfs_possible_from_any_cpu)
- kthread_bind_mask(thread, policy->related_cpus);
-
+ kthread_bind_mask(thread, policy->related_cpus);
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
@@ -657,7 +753,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
sg_policy->last_freq_update_time = 0;
- sg_policy->next_freq = UINT_MAX;
+ sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e497c05aab7f..1866e64792a7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10215,10 +10215,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
struct cfs_rq *cfs_rq;
int i;
- tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
+ tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
if (!tg->cfs_rq)
goto err;
- tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
+ tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
if (!tg->se)
goto err;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ef3c4e6f5345..47556b0c9a95 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -183,10 +183,10 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
struct sched_rt_entity *rt_se;
int i;
- tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
+ tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
if (!tg->rt_rq)
goto err;
- tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
+ tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
if (!tg->rt_se)
goto err;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 61a1125c1ae4..05a831427bc7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1750,7 +1750,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
int i;
cpumask_var_t *doms;
- doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+ doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
if (!doms)
return NULL;
for (i = 0; i < ndoms; i++) {
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e691d9a6c58d..fd023ac24e10 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -593,18 +593,15 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
}
/*
- * Force an audit message to be emitted when the action is RET_KILL_*,
- * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
- * allowed to be logged by the admin.
+ * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
+ * FILTER_FLAG_LOG bit was set. The admin has the ability to silence
+ * any action from being logged by removing the action name from the
+ * seccomp_actions_logged sysctl.
*/
- if (log)
- return __audit_seccomp(syscall, signr, action);
+ if (!log)
+ return;
- /*
- * Let the audit subsystem decide if the action should be audited based
- * on whether the current task itself is being audited.
- */
- return audit_seccomp(syscall, signr, action);
+ audit_seccomp(syscall, signr, action);
}
/*
@@ -1144,10 +1141,11 @@ static const struct seccomp_log_name seccomp_log_names[] = {
};
static bool seccomp_names_from_actions_logged(char *names, size_t size,
- u32 actions_logged)
+ u32 actions_logged,
+ const char *sep)
{
const struct seccomp_log_name *cur;
- bool append_space = false;
+ bool append_sep = false;
for (cur = seccomp_log_names; cur->name && size; cur++) {
ssize_t ret;
@@ -1155,15 +1153,15 @@ static bool seccomp_names_from_actions_logged(char *names, size_t size,
if (!(actions_logged & cur->log))
continue;
- if (append_space) {
- ret = strscpy(names, " ", size);
+ if (append_sep) {
+ ret = strscpy(names, sep, size);
if (ret < 0)
return false;
names += ret;
size -= ret;
} else
- append_space = true;
+ append_sep = true;
ret = strscpy(names, cur->name, size);
if (ret < 0)
@@ -1208,46 +1206,102 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
return true;
}
-static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ char names[sizeof(seccomp_actions_avail)];
+ struct ctl_table table;
+
+ memset(names, 0, sizeof(names));
+
+ if (!seccomp_names_from_actions_logged(names, sizeof(names),
+ seccomp_actions_logged, " "))
+ return -EINVAL;
+
+ table = *ro_table;
+ table.data = names;
+ table.maxlen = sizeof(names);
+ return proc_dostring(&table, 0, buffer, lenp, ppos);
+}
+
+static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+ size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
char names[sizeof(seccomp_actions_avail)];
struct ctl_table table;
int ret;
- if (write && !capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN))
return -EPERM;
memset(names, 0, sizeof(names));
- if (!write) {
- if (!seccomp_names_from_actions_logged(names, sizeof(names),
- seccomp_actions_logged))
- return -EINVAL;
- }
-
table = *ro_table;
table.data = names;
table.maxlen = sizeof(names);
- ret = proc_dostring(&table, write, buffer, lenp, ppos);
+ ret = proc_dostring(&table, 1, buffer, lenp, ppos);
if (ret)
return ret;
- if (write) {
- u32 actions_logged;
+ if (!seccomp_actions_logged_from_names(actions_logged, table.data))
+ return -EINVAL;
- if (!seccomp_actions_logged_from_names(&actions_logged,
- table.data))
- return -EINVAL;
+ if (*actions_logged & SECCOMP_LOG_ALLOW)
+ return -EINVAL;
- if (actions_logged & SECCOMP_LOG_ALLOW)
- return -EINVAL;
+ seccomp_actions_logged = *actions_logged;
+ return 0;
+}
- seccomp_actions_logged = actions_logged;
- }
+static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
+ int ret)
+{
+ char names[sizeof(seccomp_actions_avail)];
+ char old_names[sizeof(seccomp_actions_avail)];
+ const char *new = names;
+ const char *old = old_names;
- return 0;
+ if (!audit_enabled)
+ return;
+
+ memset(names, 0, sizeof(names));
+ memset(old_names, 0, sizeof(old_names));
+
+ if (ret)
+ new = "?";
+ else if (!actions_logged)
+ new = "(none)";
+ else if (!seccomp_names_from_actions_logged(names, sizeof(names),
+ actions_logged, ","))
+ new = "?";
+
+ if (!old_actions_logged)
+ old = "(none)";
+ else if (!seccomp_names_from_actions_logged(old_names,
+ sizeof(old_names),
+ old_actions_logged, ","))
+ old = "?";
+
+ return audit_seccomp_actions_logged(new, old, !ret);
+}
+
+static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ if (write) {
+ u32 actions_logged = 0;
+ u32 old_actions_logged = seccomp_actions_logged;
+
+ ret = write_actions_logged(ro_table, buffer, lenp, ppos,
+ &actions_logged);
+ audit_actions_logged(actions_logged, old_actions_logged, ret);
+ } else
+ ret = read_actions_logged(ro_table, buffer, lenp, ppos);
+
+ return ret;
}
static struct ctl_path seccomp_sysctl_path[] = {
diff --git a/kernel/signal.c b/kernel/signal.c
index 0f865d67415d..8d8a940422a8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1244,19 +1244,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
{
struct sighand_struct *sighand;
+ rcu_read_lock();
for (;;) {
- /*
- * Disable interrupts early to avoid deadlocks.
- * See rcu_read_unlock() comment header for details.
- */
- local_irq_save(*flags);
- rcu_read_lock();
sighand = rcu_dereference(tsk->sighand);
- if (unlikely(sighand == NULL)) {
- rcu_read_unlock();
- local_irq_restore(*flags);
+ if (unlikely(sighand == NULL))
break;
- }
+
/*
* This sighand can be already freed and even reused, but
* we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
@@ -1268,15 +1261,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
* __exit_signal(). In the latter case the next iteration
* must see ->sighand == NULL.
*/
- spin_lock(&sighand->siglock);
- if (likely(sighand == tsk->sighand)) {
- rcu_read_unlock();
+ spin_lock_irqsave(&sighand->siglock, *flags);
+ if (likely(sighand == tsk->sighand))
break;
- }
- spin_unlock(&sighand->siglock);
- rcu_read_unlock();
- local_irq_restore(*flags);
+ spin_unlock_irqrestore(&sighand->siglock, *flags);
}
+ rcu_read_unlock();
return sighand;
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index de2f57fddc04..900dcfee542c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -139,9 +139,13 @@ static void __local_bh_enable(unsigned int cnt)
{
lockdep_assert_irqs_disabled();
+ if (preempt_count() == cnt)
+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+
if (softirq_count() == (cnt & SOFTIRQ_MASK))
trace_softirqs_on(_RET_IP_);
- preempt_count_sub(cnt);
+
+ __preempt_count_sub(cnt);
}
/*
diff --git a/kernel/sys.c b/kernel/sys.c
index d1b2b8d934bb..38509dc1f77b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2018,7 +2018,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
return error;
}
- down_write(&mm->mmap_sem);
+ /*
+ * arg_lock protects concurent updates but we still need mmap_sem for
+ * read to exclude races with sys_brk.
+ */
+ down_read(&mm->mmap_sem);
/*
* We don't validate if these members are pointing to
@@ -2032,6 +2036,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
* to any problem in kernel itself
*/
+ spin_lock(&mm->arg_lock);
mm->start_code = prctl_map.start_code;
mm->end_code = prctl_map.end_code;
mm->start_data = prctl_map.start_data;
@@ -2043,6 +2048,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
mm->arg_end = prctl_map.arg_end;
mm->env_start = prctl_map.env_start;
mm->env_end = prctl_map.env_end;
+ spin_unlock(&mm->arg_lock);
/*
* Note this update of @saved_auxv is lockless thus
@@ -2055,7 +2061,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (prctl_map.auxv_size)
memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
- up_write(&mm->mmap_sem);
+ up_read(&mm->mmap_sem);
return 0;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 183169c2a75b..df556175be50 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -367,7 +367,7 @@ COND_SYSCALL(s390_pci_mmio_write);
COND_SYSCALL_COMPAT(s390_ipc);
/* powerpc */
-cond_syscall(ppc_rtas);
+COND_SYSCALL(rtas);
COND_SYSCALL(spu_run);
COND_SYSCALL(spu_create);
COND_SYSCALL(subpage_prot);
@@ -432,3 +432,6 @@ COND_SYSCALL(setresgid16);
COND_SYSCALL(setresuid16);
COND_SYSCALL(setreuid16);
COND_SYSCALL(setuid16);
+
+/* restartable sequence */
+COND_SYSCALL(rseq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6a78cf70761d..2d9837c0aff4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3047,7 +3047,8 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
+ tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len),
+ sizeof(unsigned long),
GFP_KERNEL);
if (!tmp_bitmap) {
kfree(kbuf);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 055a4a728c00..3e93c54bd3a1 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1659,7 +1659,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
switch(restart->nanosleep.type) {
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
case TT_COMPAT:
if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp))
return -EFAULT;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 5a6251ac6f7a..9cdf54b04ca8 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -604,7 +604,6 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
/*
* Disarm any old timer after extracting its expiry time.
*/
- lockdep_assert_irqs_disabled();
ret = 0;
old_incr = timer->it.cpu.incr;
@@ -1049,7 +1048,6 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
/*
* Now re-arm for the new expiry time.
*/
- lockdep_assert_irqs_disabled();
arm_timer(timer);
unlock:
unlock_task_sighand(p, &flags);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 78e598334007..b7005dd21ec1 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -491,6 +491,7 @@ void tick_freeze(void)
if (tick_freeze_depth == num_online_cpus()) {
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
+ system_state = SYSTEM_SUSPEND;
timekeeping_suspend();
} else {
tick_suspend_local();
@@ -514,6 +515,7 @@ void tick_unfreeze(void)
if (tick_freeze_depth == num_online_cpus()) {
timekeeping_resume();
+ system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
} else {
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 6fa99213fc72..2b41e8e2d31d 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -28,6 +28,7 @@
*/
#include <linux/export.h>
+#include <linux/kernel.h>
#include <linux/timex.h>
#include <linux/capability.h>
#include <linux/timekeeper_internal.h>
@@ -314,9 +315,10 @@ unsigned int jiffies_to_msecs(const unsigned long j)
return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
- return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
+ return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >>
+ HZ_TO_MSEC_SHR32;
# else
- return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
+ return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
# endif
#endif
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index c4f0f2e4126e..dcc0166d1997 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,22 +12,22 @@ config NOP_TRACER
config HAVE_FTRACE_NMI_ENTER
bool
help
- See Documentation/trace/ftrace-design.txt
+ See Documentation/trace/ftrace-design.rst
config HAVE_FUNCTION_TRACER
bool
help
- See Documentation/trace/ftrace-design.txt
+ See Documentation/trace/ftrace-design.rst
config HAVE_FUNCTION_GRAPH_TRACER
bool
help
- See Documentation/trace/ftrace-design.txt
+ See Documentation/trace/ftrace-design.rst
config HAVE_DYNAMIC_FTRACE
bool
help
- See Documentation/trace/ftrace-design.txt
+ See Documentation/trace/ftrace-design.rst
config HAVE_DYNAMIC_FTRACE_WITH_REGS
bool
@@ -35,12 +35,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
- See Documentation/trace/ftrace-design.txt
+ See Documentation/trace/ftrace-design.rst
config HAVE_SYSCALL_TRACEPOINTS
bool
help
- See Documentation/trace/ftrace-design.txt
+ See Documentation/trace/ftrace-design.rst
config HAVE_FENTRY
bool
@@ -110,11 +110,7 @@ config GENERIC_TRACER
#
config TRACING_SUPPORT
bool
- # PPC32 has no irqflags tracing support, but it can use most of the
- # tracers anyway, they were tested to build and work. Note that new
- # exceptions to this list aren't welcomed, better implement the
- # irqflags tracing for your architecture.
- depends on TRACE_IRQFLAGS_SUPPORT || PPC32
+ depends on TRACE_IRQFLAGS_SUPPORT
depends on STACKTRACE_SUPPORT
default y
@@ -452,7 +448,7 @@ config KPROBE_EVENTS
help
This allows the user to add tracing events (similar to tracepoints)
on the fly via the ftrace interface. See
- Documentation/trace/kprobetrace.txt for more details.
+ Documentation/trace/kprobetrace.rst for more details.
Those events can be inserted wherever kprobes can probe, and record
various register and memory values.
@@ -579,7 +575,7 @@ config MMIOTRACE
implementation and works via page faults. Tracing is disabled by
default and can be enabled at run-time.
- See Documentation/trace/mmiotrace.txt.
+ See Documentation/trace/mmiotrace.rst.
If you are not helping to develop drivers, say N.
config TRACING_MAP
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 56ba0f2a01db..0ae6829804bc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,12 +14,14 @@
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
+#include <linux/syscalls.h>
#include <linux/error-injection.h>
#include "trace_probe.h"
#include "trace.h"
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/**
* trace_call_bpf - invoke BPF program
@@ -474,8 +476,6 @@ BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct cgroup *cgrp;
- if (unlikely(in_interrupt()))
- return -EINVAL;
if (unlikely(idx >= array->map.max_entries))
return -E2BIG;
@@ -564,6 +564,10 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_probe_read_str:
return &bpf_probe_read_str_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+#endif
default:
return NULL;
}
@@ -577,6 +581,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto;
case BPF_FUNC_perf_event_read_value:
return &bpf_perf_event_read_value_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +670,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+ u64, flags)
+{
+ struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+ return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+ (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+ .func = bpf_get_stack_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -672,6 +697,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto_tp;
default:
return tracing_func_proto(func_id, prog);
}
@@ -734,6 +761,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto_tp;
case BPF_FUNC_perf_prog_read_value:
return &bpf_perf_prog_read_value_proto;
default:
@@ -744,7 +773,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
/*
* bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
* to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output and/or bpf_get_stack_id
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
*/
static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +816,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+ void *, buf, u32, size, u64, flags)
+{
+ struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+ perf_fetch_caller_regs(regs);
+ return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+ (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+ .func = bpf_get_stack_raw_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -795,6 +844,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_output_proto_raw_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_raw_tp;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto_raw_tp;
default:
return tracing_func_proto(func_id, prog);
}
@@ -833,8 +884,14 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
return false;
if (type != BPF_READ)
return false;
- if (off % size != 0)
- return false;
+ if (off % size != 0) {
+ if (sizeof(unsigned long) != 4)
+ return false;
+ if (size != 8)
+ return false;
+ if (off % size != 4)
+ return false;
+ }
switch (off) {
case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
@@ -959,6 +1016,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
old_array = event->tp_event->prog_array;
ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+ if (ret == -ENOENT)
+ goto unlock;
if (ret < 0) {
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
@@ -1117,3 +1176,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
mutex_unlock(&bpf_event_mutex);
return err;
}
+
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+ u32 *fd_type, const char **buf,
+ u64 *probe_offset, u64 *probe_addr)
+{
+ bool is_tracepoint, is_syscall_tp;
+ struct bpf_prog *prog;
+ int flags, err = 0;
+
+ prog = event->prog;
+ if (!prog)
+ return -ENOENT;
+
+ /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
+ if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
+ return -EOPNOTSUPP;
+
+ *prog_id = prog->aux->id;
+ flags = event->tp_event->flags;
+ is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+
+ if (is_tracepoint || is_syscall_tp) {
+ *buf = is_tracepoint ? event->tp_event->tp->name
+ : event->tp_event->name;
+ *fd_type = BPF_FD_TYPE_TRACEPOINT;
+ *probe_offset = 0x0;
+ *probe_addr = 0x0;
+ } else {
+ /* kprobe/uprobe */
+ err = -EOPNOTSUPP;
+#ifdef CONFIG_KPROBE_EVENTS
+ if (flags & TRACE_EVENT_FL_KPROBE)
+ err = bpf_get_kprobe_info(event, fd_type, buf,
+ probe_offset, probe_addr,
+ event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (flags & TRACE_EVENT_FL_UPROBE)
+ err = bpf_get_uprobe_info(event, fd_type, buf,
+ probe_offset,
+ event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+ }
+
+ return err;
+}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8d83bcf9ef69..efed9c1cfb7e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -728,7 +728,7 @@ static int ftrace_profile_init_cpu(int cpu)
*/
size = FTRACE_PROFILE_HASH_SIZE;
- stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+ stat->hash = kcalloc(size, sizeof(struct hlist_head), GFP_KERNEL);
if (!stat->hash)
return -ENOMEM;
@@ -6830,9 +6830,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
struct task_struct *g, *t;
for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
- ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
- * sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack_list[i] =
+ kmalloc_array(FTRACE_RETFUNC_DEPTH,
+ sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
if (!ret_stack_list[i]) {
start = 0;
end = i;
@@ -6904,9 +6905,9 @@ static int start_graph_tracing(void)
struct ftrace_ret_stack **ret_stack_list;
int ret, cpu;
- ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
- sizeof(struct ftrace_ret_stack *),
- GFP_KERNEL);
+ ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE,
+ sizeof(struct ftrace_ret_stack *),
+ GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
@@ -7088,9 +7089,10 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
ret_stack = per_cpu(idle_ret_stack, cpu);
if (!ret_stack) {
- ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
- * sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack =
+ kmalloc_array(FTRACE_RETFUNC_DEPTH,
+ sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
if (!ret_stack)
return;
per_cpu(idle_ret_stack, cpu) = ret_stack;
@@ -7109,9 +7111,9 @@ void ftrace_graph_init_task(struct task_struct *t)
if (ftrace_graph_active) {
struct ftrace_ret_stack *ret_stack;
- ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
- * sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH,
+ sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
if (!ret_stack)
return;
graph_init_task(t, ret_stack);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c9cb9767d49b..6a46af21765c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -809,7 +809,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
*
* You can see, it is legitimate for the previous pointer of
* the head (or any page) not to point back to itself. But only
- * temporarially.
+ * temporarily.
*/
#define RB_PAGE_NORMAL 0UL
@@ -906,7 +906,7 @@ static void rb_list_head_clear(struct list_head *list)
}
/*
- * rb_head_page_dactivate - clears head page ptr (for free list)
+ * rb_head_page_deactivate - clears head page ptr (for free list)
*/
static void
rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
@@ -1780,7 +1780,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
put_online_cpus();
} else {
- /* Make sure this CPU has been intitialized */
+ /* Make sure this CPU has been initialized */
if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
goto out;
@@ -2325,7 +2325,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
/*
* If we need to add a timestamp, then we
- * add it to the start of the resevered space.
+ * add it to the start of the reserved space.
*/
if (unlikely(info->add_timestamp)) {
bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
@@ -2681,7 +2681,7 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
* ring_buffer_nest_start - Allow to trace while nested
* @buffer: The ring buffer to modify
*
- * The ring buffer has a safty mechanism to prevent recursion.
+ * The ring buffer has a safety mechanism to prevent recursion.
* But there may be a case where a trace needs to be done while
* tracing something else. In this case, calling this function
* will allow this function to nest within a currently active
@@ -2699,7 +2699,7 @@ void ring_buffer_nest_start(struct ring_buffer *buffer)
preempt_disable_notrace();
cpu = raw_smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
- /* This is the shift value for the above recusive locking */
+ /* This is the shift value for the above recursive locking */
cpu_buffer->nest += NESTED_BITS;
}
@@ -2718,7 +2718,7 @@ void ring_buffer_nest_end(struct ring_buffer *buffer)
/* disabled by ring_buffer_nest_start() */
cpu = raw_smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
- /* This is the shift value for the above recusive locking */
+ /* This is the shift value for the above recursive locking */
cpu_buffer->nest -= NESTED_BITS;
preempt_enable_notrace();
}
@@ -2907,7 +2907,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
* @buffer: the ring buffer to reserve from
* @length: the length of the data to reserve (excluding event header)
*
- * Returns a reseverd event on the ring buffer to copy directly to.
+ * Returns a reserved event on the ring buffer to copy directly to.
* The user of this interface will need to get the body to write into
* and can use the ring_buffer_event_data() interface.
*
@@ -3009,7 +3009,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
* This function lets the user discard an event in the ring buffer
* and then that event will not be read later.
*
- * This function only works if it is called before the the item has been
+ * This function only works if it is called before the item has been
* committed. It will try to free the event from the ring buffer
* if another event has not been added behind it.
*
@@ -4127,7 +4127,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* through the buffer. Memory is allocated, buffer recording
* is disabled, and the iterator pointer is returned to the caller.
*
- * Disabling buffer recordng prevents the reading from being
+ * Disabling buffer recording prevents the reading from being
* corrupted. This is not a consuming read, so a producer is not
* expected.
*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bcd93031d042..a0079b4c7a49 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1360,8 +1360,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
void
update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct ring_buffer *buf;
-
if (tr->stop_count)
return;
@@ -1375,9 +1373,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
arch_spin_lock(&tr->max_lock);
- buf = tr->trace_buffer.buffer;
- tr->trace_buffer.buffer = tr->max_buffer.buffer;
- tr->max_buffer.buffer = buf;
+ swap(tr->trace_buffer.buffer, tr->max_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&tr->max_lock);
@@ -1751,12 +1747,13 @@ static inline void set_cmdline(int idx, const char *cmdline)
static int allocate_cmdlines_buffer(unsigned int val,
struct saved_cmdlines_buffer *s)
{
- s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid),
- GFP_KERNEL);
+ s->map_cmdline_to_pid = kmalloc_array(val,
+ sizeof(*s->map_cmdline_to_pid),
+ GFP_KERNEL);
if (!s->map_cmdline_to_pid)
return -ENOMEM;
- s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL);
+ s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL);
if (!s->saved_cmdlines) {
kfree(s->map_cmdline_to_pid);
return -ENOMEM;
@@ -4360,7 +4357,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_TGID) {
if (!tgid_map)
- tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map),
+ tgid_map = kcalloc(PID_MAX_DEFAULT + 1,
+ sizeof(*tgid_map),
GFP_KERNEL);
if (!tgid_map) {
tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
@@ -4395,8 +4393,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
{
char *cmp;
int neg = 0;
- int ret = -ENODEV;
- int i;
+ int ret;
size_t orig_len = strlen(option);
cmp = strstrip(option);
@@ -4408,16 +4405,12 @@ static int trace_set_options(struct trace_array *tr, char *option)
mutex_lock(&trace_types_lock);
- for (i = 0; trace_options[i]; i++) {
- if (strcmp(cmp, trace_options[i]) == 0) {
- ret = set_tracer_flag(tr, 1 << i, !neg);
- break;
- }
- }
-
+ ret = match_string(trace_options, -1, cmp);
/* If no option could be set, test the specific tracer options */
- if (!trace_options[i])
+ if (ret < 0)
ret = set_tracer_option(tr, cmp, neg);
+ else
+ ret = set_tracer_flag(tr, 1 << ret, !neg);
mutex_unlock(&trace_types_lock);
@@ -5068,7 +5061,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
* where the head holds the module and length of array, and the
* tail holds a pointer to the next list.
*/
- map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
+ map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL);
if (!map_array) {
pr_warn("Unable to allocate trace eval mapping\n");
return;
@@ -6074,6 +6067,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
{
struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
+ enum event_trigger_type tt = ETT_NONE;
struct ring_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
@@ -6122,6 +6116,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
written = cnt;
len = cnt;
+ if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
+ /* do not add \n before testing triggers, but add \0 */
+ entry->buf[cnt] = '\0';
+ tt = event_triggers_call(tr->trace_marker_file, entry, event);
+ }
+
if (entry->buf[cnt - 1] != '\n') {
entry->buf[cnt] = '\n';
entry->buf[cnt + 1] = '\0';
@@ -6130,6 +6130,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
__buffer_unlock_commit(buffer, event);
+ if (tt)
+ event_triggers_post_call(tr->trace_marker_file, tt);
+
if (written > 0)
*fpos += written;
@@ -7896,6 +7899,7 @@ static __init void create_trace_instances(struct dentry *d_tracer)
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
+ struct trace_event_file *file;
int cpu;
trace_create_file("available_tracers", 0444, d_tracer,
@@ -7928,6 +7932,12 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
+ file = __find_event_file(tr, "ftrace", "print");
+ if (file && file->dir)
+ trace_create_file("trigger", 0644, file->dir, file,
+ &event_trigger_fops);
+ tr->trace_marker_file = file;
+
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
@@ -8111,6 +8121,8 @@ static __init int tracer_init_tracefs(void)
if (IS_ERR(d_tracer))
return 0;
+ event_trace_init();
+
init_tracer_tracefs(&global_trace, d_tracer);
ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 507954b4e058..630c5a24b2b2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -259,6 +259,7 @@ struct trace_array {
struct trace_options *topts;
struct list_head systems;
struct list_head events;
+ struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
int ref;
#ifdef CONFIG_FUNCTION_TRACER
@@ -1334,7 +1335,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
if (tt)
- event_triggers_post_call(file, tt, entry, event);
+ event_triggers_post_call(file, tt);
}
/**
@@ -1367,7 +1368,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
irq_flags, pc, regs);
if (tt)
- event_triggers_post_call(file, tt, entry, event);
+ event_triggers_post_call(file, tt);
}
#define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -1451,9 +1452,13 @@ trace_find_event_field(struct trace_event_call *call, char *name);
extern void trace_event_enable_cmd_record(bool enable);
extern void trace_event_enable_tgid_record(bool enable);
+extern int event_trace_init(void);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
+extern struct trace_event_file *__find_event_file(struct trace_array *tr,
+ const char *system,
+ const char *event);
extern struct trace_event_file *find_event_file(struct trace_array *tr,
const char *system,
const char *event);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3a658bac10f..1d67464ed95e 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -230,7 +230,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
FILTER_OTHER
);
-FTRACE_ENTRY(print, print_entry,
+FTRACE_ENTRY_REG(print, print_entry,
TRACE_PRINT,
@@ -242,7 +242,9 @@ FTRACE_ENTRY(print, print_entry,
F_printk("%ps: %s",
(void *)__entry->ip, __entry->buf),
- FILTER_OTHER
+ FILTER_OTHER,
+
+ ftrace_event_register
);
FTRACE_ENTRY(raw_data, raw_data_entry,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05c7172c6667..14ff4ff3caab 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2007,16 +2007,18 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
return -1;
}
}
- trace_create_file("filter", 0644, file->dir, file,
- &ftrace_event_filter_fops);
/*
* Only event directories that can be enabled should have
- * triggers.
+ * triggers or filters.
*/
- if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
+ trace_create_file("filter", 0644, file->dir, file,
+ &ftrace_event_filter_fops);
+
trace_create_file("trigger", 0644, file->dir, file,
&event_trigger_fops);
+ }
#ifdef CONFIG_HIST_TRIGGERS
trace_create_file("hist", 0444, file->dir, file,
@@ -2473,8 +2475,9 @@ __trace_add_event_dirs(struct trace_array *tr)
}
}
+/* Returns any file that matches the system and event */
struct trace_event_file *
-find_event_file(struct trace_array *tr, const char *system, const char *event)
+__find_event_file(struct trace_array *tr, const char *system, const char *event)
{
struct trace_event_file *file;
struct trace_event_call *call;
@@ -2485,10 +2488,7 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
call = file->event_call;
name = trace_event_name(call);
- if (!name || !call->class || !call->class->reg)
- continue;
-
- if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+ if (!name || !call->class)
continue;
if (strcmp(event, name) == 0 &&
@@ -2498,6 +2498,20 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
return NULL;
}
+/* Returns valid trace event files that match system and event */
+struct trace_event_file *
+find_event_file(struct trace_array *tr, const char *system, const char *event)
+{
+ struct trace_event_file *file;
+
+ file = __find_event_file(tr, system, event);
+ if (!file || !file->event_call->class->reg ||
+ file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+ return NULL;
+
+ return file;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE
/* Avoid typos */
@@ -3132,7 +3146,7 @@ static __init int event_trace_enable_again(void)
early_initcall(event_trace_enable_again);
-static __init int event_trace_init(void)
+__init int event_trace_init(void)
{
struct trace_array *tr;
struct dentry *d_tracer;
@@ -3177,8 +3191,6 @@ void __init trace_event_init(void)
event_trace_enable();
}
-fs_initcall(event_trace_init);
-
#ifdef CONFIG_FTRACE_STARTUP_TEST
static DEFINE_SPINLOCK(test_spinlock);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7d306b74230f..0dceb77d1d42 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -78,7 +78,8 @@ static const char * ops[] = { OPS };
C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \
C(INVALID_FILTER, "Meaningless filter expression"), \
C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
- C(INVALID_VALUE, "Invalid value (did you forget quotes)?"),
+ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \
+ C(NO_FILTER, "No filter found"),
#undef C
#define C(a, b) FILT_ERR_##a
@@ -436,15 +437,15 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
nr_preds += 2; /* For TRUE and FALSE */
- op_stack = kmalloc(sizeof(*op_stack) * nr_parens, GFP_KERNEL);
+ op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL);
if (!op_stack)
return ERR_PTR(-ENOMEM);
- prog_stack = kmalloc(sizeof(*prog_stack) * nr_preds, GFP_KERNEL);
+ prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL);
if (!prog_stack) {
parse_error(pe, -ENOMEM, 0);
goto out_free;
}
- inverts = kmalloc(sizeof(*inverts) * nr_preds, GFP_KERNEL);
+ inverts = kmalloc_array(nr_preds, sizeof(*inverts), GFP_KERNEL);
if (!inverts) {
parse_error(pe, -ENOMEM, 0);
goto out_free;
@@ -550,6 +551,13 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
goto out_free;
}
+ if (!N) {
+ /* No program? */
+ ret = -EINVAL;
+ parse_error(pe, FILT_ERR_NO_FILTER, ptr - str);
+ goto out_free;
+ }
+
prog[N].pred = NULL; /* #13 */
prog[N].target = 1; /* TRUE */
prog[N+1].pred = NULL;
@@ -750,31 +758,32 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
*
* Note:
* - @str might not be NULL-terminated if it's of type DYN_STRING
- * or STATIC_STRING
+ * or STATIC_STRING, unless @len is zero.
*/
static int regex_match_full(char *str, struct regex *r, int len)
{
- if (strncmp(str, r->pattern, len) == 0)
- return 1;
- return 0;
+ /* len of zero means str is dynamic and ends with '\0' */
+ if (!len)
+ return strcmp(str, r->pattern) == 0;
+
+ return strncmp(str, r->pattern, len) == 0;
}
static int regex_match_front(char *str, struct regex *r, int len)
{
- if (len < r->len)
+ if (len && len < r->len)
return 0;
- if (strncmp(str, r->pattern, r->len) == 0)
- return 1;
- return 0;
+ return strncmp(str, r->pattern, r->len) == 0;
}
static int regex_match_middle(char *str, struct regex *r, int len)
{
- if (strnstr(str, r->pattern, len))
- return 1;
- return 0;
+ if (!len)
+ return strstr(str, r->pattern) != NULL;
+
+ return strnstr(str, r->pattern, len) != NULL;
}
static int regex_match_end(char *str, struct regex *r, int len)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index b9061ed59bbd..046c716a6536 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2865,7 +2865,7 @@ static struct trace_event_file *event_file(struct trace_array *tr,
{
struct trace_event_file *file;
- file = find_event_file(tr, system, event_name);
+ file = __find_event_file(tr, system, event_name);
if (!file)
return ERR_PTR(-EINVAL);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 8b5bdcf64871..d18249683682 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -97,7 +97,6 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
* event_triggers_post_call - Call 'post_triggers' for a trace event
* @file: The trace_event_file associated with the event
* @tt: enum event_trigger_type containing a set bit for each trigger to invoke
- * @rec: The trace entry for the event
*
* For each trigger associated with an event, invoke the trigger
* function registered with the associated trigger command, if the
@@ -108,8 +107,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
*/
void
event_triggers_post_call(struct trace_event_file *file,
- enum event_trigger_type tt,
- void *rec, struct ring_buffer_event *event)
+ enum event_trigger_type tt)
{
struct event_trigger_data *data;
@@ -117,7 +115,7 @@ event_triggers_post_call(struct trace_event_file *file,
if (data->paused)
continue;
if (data->cmd_ops->trigger_type & tt)
- data->ops->func(data, rec, event);
+ data->ops->func(data, NULL, NULL);
}
}
EXPORT_SYMBOL_GPL(event_triggers_post_call);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 548e62eb5c46..45630a76ed3a 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -14,6 +14,13 @@
#include "trace_output.h"
+/* Stub function for events with triggers */
+static int ftrace_event_register(struct trace_event_call *call,
+ enum trace_reg type, void *data)
+{
+ return 0;
+}
+
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ftrace
@@ -117,7 +124,7 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef __dynamic_array
#define __dynamic_array(type, item) \
- ret = trace_define_field(event_call, #type, #item, \
+ ret = trace_define_field(event_call, #type "[]", #item, \
offsetof(typeof(field), item), \
0, is_signed_type(type), filter_type);\
if (ret) \
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 02aed76e0978..daa81571b22a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
head, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
+
+int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
+ const char **symbol, u64 *probe_offset,
+ u64 *probe_addr, bool perf_type_tracepoint)
+{
+ const char *pevent = trace_event_name(event->tp_event);
+ const char *group = event->tp_event->class->system;
+ struct trace_kprobe *tk;
+
+ if (perf_type_tracepoint)
+ tk = find_trace_kprobe(pevent, group);
+ else
+ tk = event->tp_event->data;
+ if (!tk)
+ return -EINVAL;
+
+ *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
+ : BPF_FD_TYPE_KPROBE;
+ if (tk->symbol) {
+ *symbol = tk->symbol;
+ *probe_offset = tk->rp.kp.offset;
+ *probe_addr = 0;
+ } else {
+ *symbol = NULL;
+ *probe_offset = 0;
+ *probe_addr = (unsigned long)tk->rp.kp.addr;
+ }
+ return 0;
+}
#endif /* CONFIG_PERF_EVENTS */
/*
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index ac892878dbe6..bf89a51e740d 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
{
__uprobe_perf_func(tu, func, regs, ucb, dsize);
}
+
+int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
+ const char **filename, u64 *probe_offset,
+ bool perf_type_tracepoint)
+{
+ const char *pevent = trace_event_name(event->tp_event);
+ const char *group = event->tp_event->class->system;
+ struct trace_uprobe *tu;
+
+ if (perf_type_tracepoint)
+ tu = find_probe_event(pevent, group);
+ else
+ tu = event->tp_event->data;
+ if (!tu)
+ return -EINVAL;
+
+ *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
+ : BPF_FD_TYPE_UPROBE;
+ *filename = tu->filename;
+ *probe_offset = tu->offset;
+ return 0;
+}
#endif /* CONFIG_PERF_EVENTS */
static int
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 5cadb1b8b5fe..752d8042bad4 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -1075,7 +1075,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
struct tracing_map_sort_entry *sort_entry, **entries;
int i, n_entries, ret;
- entries = vmalloc(map->max_elts * sizeof(sort_entry));
+ entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts));
if (!entries)
return -ENOMEM;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1e37da2e0c25..6dc6356c3327 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -257,7 +257,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
}
/**
- * tracepoint_probe_register - Connect a probe to a tracepoint
+ * tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority
* @tp: tracepoint
* @probe: probe handler
* @data: tracepoint data
@@ -290,7 +290,6 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio);
* @tp: tracepoint
* @probe: probe handler
* @data: tracepoint data
- * @prio: priority of this function over other registered functions
*
* Returns 0 if ok, error value on error.
* Note: if @tp is within a module, the caller is responsible for
diff --git a/kernel/umh.c b/kernel/umh.c
index f76b3ff876cf..c449858946af 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -25,6 +25,8 @@
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
#include <trace/events/module.h>
@@ -97,9 +99,14 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
+ sub_info->pid = task_pid_nr(current);
+ if (sub_info->file)
+ retval = do_execve_file(sub_info->file,
+ sub_info->argv, sub_info->envp);
+ else
+ retval = do_execve(getname_kernel(sub_info->path),
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp);
out:
sub_info->retval = retval;
/*
@@ -393,6 +400,117 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
}
EXPORT_SYMBOL(call_usermodehelper_setup);
+struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *info), void *data)
+{
+ struct subprocess_info *sub_info;
+
+ sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
+ if (!sub_info)
+ return NULL;
+
+ INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
+ sub_info->path = "none";
+ sub_info->file = file;
+ sub_info->init = init;
+ sub_info->cleanup = cleanup;
+ sub_info->data = data;
+ return sub_info;
+}
+
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+{
+ struct umh_info *umh_info = info->data;
+ struct file *from_umh[2];
+ struct file *to_umh[2];
+ int err;
+
+ /* create pipe to send data to umh */
+ err = create_pipe_files(to_umh, 0);
+ if (err)
+ return err;
+ err = replace_fd(0, to_umh[0], 0);
+ fput(to_umh[0]);
+ if (err < 0) {
+ fput(to_umh[1]);
+ return err;
+ }
+
+ /* create pipe to receive data from umh */
+ err = create_pipe_files(from_umh, 0);
+ if (err) {
+ fput(to_umh[1]);
+ replace_fd(0, NULL, 0);
+ return err;
+ }
+ err = replace_fd(1, from_umh[1], 0);
+ fput(from_umh[1]);
+ if (err < 0) {
+ fput(to_umh[1]);
+ replace_fd(0, NULL, 0);
+ fput(from_umh[0]);
+ return err;
+ }
+
+ umh_info->pipe_to_umh = to_umh[1];
+ umh_info->pipe_from_umh = from_umh[0];
+ return 0;
+}
+
+static void umh_save_pid(struct subprocess_info *info)
+{
+ struct umh_info *umh_info = info->data;
+
+ umh_info->pid = info->pid;
+}
+
+/**
+ * fork_usermode_blob - fork a blob of bytes as a usermode process
+ * @data: a blob of bytes that can be do_execv-ed as a file
+ * @len: length of the blob
+ * @info: information about usermode process (shouldn't be NULL)
+ *
+ * Returns either negative error or zero which indicates success
+ * in executing a blob of bytes as a usermode process. In such
+ * case 'struct umh_info *info' is populated with two pipes
+ * and a pid of the process. The caller is responsible for health
+ * check of the user process, killing it via pid, and closing the
+ * pipes when user process is no longer needed.
+ */
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+{
+ struct subprocess_info *sub_info;
+ struct file *file;
+ ssize_t written;
+ loff_t pos = 0;
+ int err;
+
+ file = shmem_kernel_file_setup("", len, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ written = kernel_write(file, data, len, &pos);
+ if (written != len) {
+ err = written;
+ if (err >= 0)
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = -ENOMEM;
+ sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
+ umh_save_pid, info);
+ if (!sub_info)
+ goto out;
+
+ err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+out:
+ fput(file);
+ return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_blob);
+
/**
* call_usermodehelper_exec - start a usermode application
* @sub_info: information about the subprocessa
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 246d4d4ce5c7..c3d7583fcd21 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -764,8 +764,9 @@ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
struct uid_gid_extent *forward;
/* Allocate memory for 340 mappings. */
- forward = kmalloc(sizeof(struct uid_gid_extent) *
- UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL);
+ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS,
+ sizeof(struct uid_gid_extent),
+ GFP_KERNEL);
if (!forward)
return -ENOMEM;
@@ -1235,6 +1236,7 @@ bool current_in_userns(const struct user_namespace *target_ns)
{
return in_userns(target_ns, current_user_ns());
}
+EXPORT_SYMBOL(current_in_userns);
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca7959be8aaa..78b192071ef7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -66,7 +66,7 @@ enum {
* be executing on any CPU. The pool behaves as an unbound one.
*
* Note that DISASSOCIATED should be flipped only while holding
- * attach_mutex to avoid changing binding state while
+ * wq_pool_attach_mutex to avoid changing binding state while
* worker_attach_to_pool() is in progress.
*/
POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */
@@ -123,7 +123,7 @@ enum {
* cpu or grabbing pool->lock is enough for read access. If
* POOL_DISASSOCIATED is set, it's identical to L.
*
- * A: pool->attach_mutex protected.
+ * A: wq_pool_attach_mutex protected.
*
* PL: wq_pool_mutex protected.
*
@@ -166,7 +166,6 @@ struct worker_pool {
/* L: hash of busy workers */
struct worker *manager; /* L: purely informational */
- struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
@@ -297,6 +296,7 @@ static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
+static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
@@ -399,14 +399,14 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
* @worker: iteration cursor
* @pool: worker_pool to iterate workers of
*
- * This must be called with @pool->attach_mutex.
+ * This must be called with wq_pool_attach_mutex.
*
* The if/else clause exists only for the lockdep assertion and can be
* ignored.
*/
#define for_each_pool_worker(worker, pool) \
list_for_each_entry((worker), &(pool)->workers, node) \
- if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
+ if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
else
/**
@@ -1724,7 +1724,7 @@ static struct worker *alloc_worker(int node)
static void worker_attach_to_pool(struct worker *worker,
struct worker_pool *pool)
{
- mutex_lock(&pool->attach_mutex);
+ mutex_lock(&wq_pool_attach_mutex);
/*
* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
@@ -1733,37 +1733,40 @@ static void worker_attach_to_pool(struct worker *worker,
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
/*
- * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
- * stable across this function. See the comments above the
- * flag definition for details.
+ * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
+ * stable across this function. See the comments above the flag
+ * definition for details.
*/
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
list_add_tail(&worker->node, &pool->workers);
+ worker->pool = pool;
- mutex_unlock(&pool->attach_mutex);
+ mutex_unlock(&wq_pool_attach_mutex);
}
/**
* worker_detach_from_pool() - detach a worker from its pool
* @worker: worker which is attached to its pool
- * @pool: the pool @worker is attached to
*
* Undo the attaching which had been done in worker_attach_to_pool(). The
* caller worker shouldn't access to the pool after detached except it has
* other reference to the pool.
*/
-static void worker_detach_from_pool(struct worker *worker,
- struct worker_pool *pool)
+static void worker_detach_from_pool(struct worker *worker)
{
+ struct worker_pool *pool = worker->pool;
struct completion *detach_completion = NULL;
- mutex_lock(&pool->attach_mutex);
+ mutex_lock(&wq_pool_attach_mutex);
+
list_del(&worker->node);
+ worker->pool = NULL;
+
if (list_empty(&pool->workers))
detach_completion = pool->detach_completion;
- mutex_unlock(&pool->attach_mutex);
+ mutex_unlock(&wq_pool_attach_mutex);
/* clear leftover flags without pool->lock after it is detached */
worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
@@ -1799,7 +1802,6 @@ static struct worker *create_worker(struct worker_pool *pool)
if (!worker)
goto fail;
- worker->pool = pool;
worker->id = id;
if (pool->cpu >= 0)
@@ -2086,6 +2088,12 @@ __acquires(&pool->lock)
worker->current_pwq = pwq;
work_color = get_work_color(work);
+ /*
+ * Record wq name for cmdline and debug reporting, may get
+ * overridden through set_worker_desc().
+ */
+ strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
+
list_del_init(&work->entry);
/*
@@ -2181,7 +2189,6 @@ __acquires(&pool->lock)
worker->current_work = NULL;
worker->current_func = NULL;
worker->current_pwq = NULL;
- worker->desc_valid = false;
pwq_dec_nr_in_flight(pwq, work_color);
}
@@ -2206,6 +2213,16 @@ static void process_scheduled_works(struct worker *worker)
}
}
+static void set_pf_worker(bool val)
+{
+ mutex_lock(&wq_pool_attach_mutex);
+ if (val)
+ current->flags |= PF_WQ_WORKER;
+ else
+ current->flags &= ~PF_WQ_WORKER;
+ mutex_unlock(&wq_pool_attach_mutex);
+}
+
/**
* worker_thread - the worker thread function
* @__worker: self
@@ -2224,7 +2241,7 @@ static int worker_thread(void *__worker)
struct worker_pool *pool = worker->pool;
/* tell the scheduler that this is a workqueue worker */
- worker->task->flags |= PF_WQ_WORKER;
+ set_pf_worker(true);
woke_up:
spin_lock_irq(&pool->lock);
@@ -2232,11 +2249,11 @@ woke_up:
if (unlikely(worker->flags & WORKER_DIE)) {
spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
- worker->task->flags &= ~PF_WQ_WORKER;
+ set_pf_worker(false);
set_task_comm(worker->task, "kworker/dying");
ida_simple_remove(&pool->worker_ida, worker->id);
- worker_detach_from_pool(worker, pool);
+ worker_detach_from_pool(worker);
kfree(worker);
return 0;
}
@@ -2335,7 +2352,7 @@ static int rescuer_thread(void *__rescuer)
* Mark rescuer as worker too. As WORKER_PREP is never cleared, it
* doesn't participate in concurrency management.
*/
- rescuer->task->flags |= PF_WQ_WORKER;
+ set_pf_worker(true);
repeat:
set_current_state(TASK_IDLE);
@@ -2367,7 +2384,6 @@ repeat:
worker_attach_to_pool(rescuer, pool);
spin_lock_irq(&pool->lock);
- rescuer->pool = pool;
/*
* Slurp in all works issued via this workqueue and
@@ -2417,10 +2433,9 @@ repeat:
if (need_more_worker(pool))
wake_up_worker(pool);
- rescuer->pool = NULL;
spin_unlock_irq(&pool->lock);
- worker_detach_from_pool(rescuer, pool);
+ worker_detach_from_pool(rescuer);
spin_lock_irq(&wq_mayday_lock);
}
@@ -2429,7 +2444,7 @@ repeat:
if (should_stop) {
__set_current_state(TASK_RUNNING);
- rescuer->task->flags &= ~PF_WQ_WORKER;
+ set_pf_worker(false);
return 0;
}
@@ -3271,7 +3286,6 @@ static int init_worker_pool(struct worker_pool *pool)
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
- mutex_init(&pool->attach_mutex);
INIT_LIST_HEAD(&pool->workers);
ida_init(&pool->worker_ida);
@@ -3354,10 +3368,10 @@ static void put_unbound_pool(struct worker_pool *pool)
WARN_ON(pool->nr_workers || pool->nr_idle);
spin_unlock_irq(&pool->lock);
- mutex_lock(&pool->attach_mutex);
+ mutex_lock(&wq_pool_attach_mutex);
if (!list_empty(&pool->workers))
pool->detach_completion = &detach_completion;
- mutex_unlock(&pool->attach_mutex);
+ mutex_unlock(&wq_pool_attach_mutex);
if (pool->detach_completion)
wait_for_completion(pool->detach_completion);
@@ -3700,8 +3714,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
lockdep_assert_held(&wq_pool_mutex);
- ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
- GFP_KERNEL);
+ ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
@@ -4347,9 +4360,9 @@ void set_worker_desc(const char *fmt, ...)
va_start(args, fmt);
vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
va_end(args);
- worker->desc_valid = true;
}
}
+EXPORT_SYMBOL_GPL(set_worker_desc);
/**
* print_worker_info - print out worker information and description
@@ -4371,7 +4384,6 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
char desc[WORKER_DESC_LEN] = { };
struct pool_workqueue *pwq = NULL;
struct workqueue_struct *wq = NULL;
- bool desc_valid = false;
struct worker *worker;
if (!(task->flags & PF_WQ_WORKER))
@@ -4384,22 +4396,18 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
worker = kthread_probe_data(task);
/*
- * Carefully copy the associated workqueue's workfn and name. Keep
- * the original last '\0' in case the original contains garbage.
+ * Carefully copy the associated workqueue's workfn, name and desc.
+ * Keep the original last '\0' in case the original is garbage.
*/
probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
probe_kernel_read(name, wq->name, sizeof(name) - 1);
-
- /* copy worker description */
- probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
- if (desc_valid)
- probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
+ probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
if (fn || name[0] || desc[0]) {
printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
- if (desc[0])
+ if (strcmp(name, desc))
pr_cont(" (%s)", desc);
pr_cont("\n");
}
@@ -4579,6 +4587,47 @@ void show_workqueue_state(void)
rcu_read_unlock_sched();
}
+/* used to show worker information through /proc/PID/{comm,stat,status} */
+void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
+{
+ int off;
+
+ /* always show the actual comm */
+ off = strscpy(buf, task->comm, size);
+ if (off < 0)
+ return;
+
+ /* stabilize PF_WQ_WORKER and worker pool association */
+ mutex_lock(&wq_pool_attach_mutex);
+
+ if (task->flags & PF_WQ_WORKER) {
+ struct worker *worker = kthread_data(task);
+ struct worker_pool *pool = worker->pool;
+
+ if (pool) {
+ spin_lock_irq(&pool->lock);
+ /*
+ * ->desc tracks information (wq name or
+ * set_worker_desc()) for the latest execution. If
+ * current, prepend '+', otherwise '-'.
+ */
+ if (worker->desc[0] != '\0') {
+ if (worker->current_work)
+ scnprintf(buf + off, size - off, "+%s",
+ worker->desc);
+ else
+ scnprintf(buf + off, size - off, "-%s",
+ worker->desc);
+ }
+ spin_unlock_irq(&pool->lock);
+ }
+ }
+
+ mutex_unlock(&wq_pool_attach_mutex);
+}
+
+#ifdef CONFIG_SMP
+
/*
* CPU hotplug.
*
@@ -4600,7 +4649,7 @@ static void unbind_workers(int cpu)
struct worker *worker;
for_each_cpu_worker_pool(pool, cpu) {
- mutex_lock(&pool->attach_mutex);
+ mutex_lock(&wq_pool_attach_mutex);
spin_lock_irq(&pool->lock);
/*
@@ -4616,7 +4665,7 @@ static void unbind_workers(int cpu)
pool->flags |= POOL_DISASSOCIATED;
spin_unlock_irq(&pool->lock);
- mutex_unlock(&pool->attach_mutex);
+ mutex_unlock(&wq_pool_attach_mutex);
/*
* Call schedule() so that we cross rq->lock and thus can
@@ -4657,7 +4706,7 @@ static void rebind_workers(struct worker_pool *pool)
{
struct worker *worker;
- lockdep_assert_held(&pool->attach_mutex);
+ lockdep_assert_held(&wq_pool_attach_mutex);
/*
* Restore CPU affinity of all workers. As all idle workers should
@@ -4727,7 +4776,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
static cpumask_t cpumask;
struct worker *worker;
- lockdep_assert_held(&pool->attach_mutex);
+ lockdep_assert_held(&wq_pool_attach_mutex);
/* is @cpu allowed for @pool? */
if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4762,14 +4811,14 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_lock(&wq_pool_mutex);
for_each_pool(pool, pi) {
- mutex_lock(&pool->attach_mutex);
+ mutex_lock(&wq_pool_attach_mutex);
if (pool->cpu == cpu)
rebind_workers(pool);
else if (pool->cpu < 0)
restore_unbound_workers_cpumask(pool, cpu);
- mutex_unlock(&pool->attach_mutex);
+ mutex_unlock(&wq_pool_attach_mutex);
}
/* update NUMA affinity of unbound workqueues */
@@ -4799,8 +4848,6 @@ int workqueue_offline_cpu(unsigned int cpu)
return 0;
}
-#ifdef CONFIG_SMP
-
struct work_for_cpu {
struct work_struct work;
long (*fn)(void *);
@@ -5591,7 +5638,7 @@ static void __init wq_numa_init(void)
* available. Build one from cpu_to_node() which should have been
* fully initialized by now.
*/
- tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
+ tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL);
BUG_ON(!tbl);
for_each_node(node)
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index d390d1be3748..66fbb5a9e633 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -31,13 +31,12 @@ struct worker {
struct work_struct *current_work; /* L: work being processed */
work_func_t current_func; /* L: current_work's fn */
struct pool_workqueue *current_pwq; /* L: current_work's pwq */
- bool desc_valid; /* ->desc is valid */
struct list_head scheduled; /* L: scheduled works */
/* 64 bytes boundary on 64bit, 32 on 32bit */
struct task_struct *task; /* I: worker task */
- struct worker_pool *pool; /* I: the associated pool */
+ struct worker_pool *pool; /* A: the associated pool */
/* L: for rescuers */
struct list_head node; /* A: anchored at pool->workers */
/* A: runs through worker->node */