From 581da2cab557fa6e8a954c148c487eb7e0658979 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 19 Nov 2015 16:31:11 +0100 Subject: async: export current_is_async() Introduced by 84b233adcca3 ("workqueue: implement current_is_async()"). Cc: Tejun Heo Signed-off-by: Lukas Wunner Acked-by: Tejun Heo Signed-off-by: Daniel Vetter --- kernel/async.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 4c3773c0bf63..d2edd6efec56 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -326,3 +326,4 @@ bool current_is_async(void) return worker && worker->current_func == async_run_entry_fn; } +EXPORT_SYMBOL_GPL(current_is_async); -- cgit v1.3-14-g43fede From d6335d77a7622a88380f3f207cc1f727f878dd21 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 24 Dec 2015 11:09:39 -0500 Subject: security: Make inode argument of inode_getsecid non-const Make the inode argument of the inode_getsecid hook non-const so that we can use it to revalidate invalid security labels. Signed-off-by: Andreas Gruenbacher Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- include/linux/audit.h | 8 ++++---- include/linux/lsm_hooks.h | 2 +- include/linux/security.h | 4 ++-- kernel/audit.c | 2 +- kernel/audit.h | 2 +- kernel/auditsc.c | 6 +++--- security/security.c | 2 +- security/selinux/hooks.c | 2 +- security/smack/smack_lsm.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 20eba1eb0a3c..8a2d046e9f6b 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -137,7 +137,7 @@ extern void __audit_getname(struct filename *name); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); -extern void __audit_inode_child(const struct inode *parent, +extern void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type); extern void __audit_seccomp(unsigned long syscall, long signr, int code); @@ -202,7 +202,7 @@ static inline void audit_inode_parent_hidden(struct filename *name, __audit_inode(name, dentry, AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } -static inline void audit_inode_child(const struct inode *parent, +static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { if (unlikely(!audit_dummy_context())) @@ -359,7 +359,7 @@ static inline void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { } -static inline void __audit_inode_child(const struct inode *parent, +static inline void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } @@ -373,7 +373,7 @@ static inline void audit_file(struct file *file) static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { } -static inline void audit_inode_child(const struct inode *parent, +static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index bdd0a3a8a0e4..4c48227450e6 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1420,7 +1420,7 @@ union security_list_options { int flags); int (*inode_listsecurity)(struct inode *inode, char *buffer, size_t buffer_size); - void (*inode_getsecid)(const struct inode *inode, u32 *secid); + void (*inode_getsecid)(struct inode *inode, u32 *secid); int (*file_permission)(struct file *file, int mask); int (*file_alloc_security)(struct file *file); diff --git a/include/linux/security.h b/include/linux/security.h index 9ee61b264b23..e79149a06454 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -273,7 +273,7 @@ int security_inode_killpriv(struct dentry *dentry); int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc); int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags); int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size); -void security_inode_getsecid(const struct inode *inode, u32 *secid); +void security_inode_getsecid(struct inode *inode, u32 *secid); int security_file_permission(struct file *file, int mask); int security_file_alloc(struct file *file); void security_file_free(struct file *file); @@ -734,7 +734,7 @@ static inline int security_inode_listsecurity(struct inode *inode, char *buffer, return 0; } -static inline void security_inode_getsecid(const struct inode *inode, u32 *secid) +static inline void security_inode_getsecid(struct inode *inode, u32 *secid) { *secid = 0; } diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354a52..bc2ff61bc1d6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1722,7 +1722,7 @@ static inline int audit_copy_fcaps(struct audit_names *name, /* Copy inode data into an audit_names. */ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode) + struct inode *inode) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; diff --git a/kernel/audit.h b/kernel/audit.h index de6cbb7cf547..cbbe6bb6496e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -207,7 +207,7 @@ extern u32 audit_ever_enabled; extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode); + struct inode *inode); extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); extern void audit_log_name(struct audit_context *context, diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b86cc04959de..195ffaee50b9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { struct audit_context *context = current->audit_context; - const struct inode *inode = d_backing_inode(dentry); + struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; @@ -1848,12 +1848,12 @@ void __audit_file(const struct file *file) * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ -void __audit_inode_child(const struct inode *parent, +void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { struct audit_context *context = current->audit_context; - const struct inode *inode = d_backing_inode(dentry); + struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; diff --git a/security/security.c b/security/security.c index 73514c91d87f..c5beb7e90721 100644 --- a/security/security.c +++ b/security/security.c @@ -721,7 +721,7 @@ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer } EXPORT_SYMBOL(security_inode_listsecurity); -void security_inode_getsecid(const struct inode *inode, u32 *secid) +void security_inode_getsecid(struct inode *inode, u32 *secid) { call_void_hook(inode_getsecid, inode, secid); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 2e40c9c4e12c..19a8f1500a7e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3180,7 +3180,7 @@ static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t return len; } -static void selinux_inode_getsecid(const struct inode *inode, u32 *secid) +static void selinux_inode_getsecid(struct inode *inode, u32 *secid) { struct inode_security_struct *isec = inode->i_security; *secid = isec->sid; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index f0e694bccfd4..ac7436f1bc2b 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1538,7 +1538,7 @@ static int smack_inode_listsecurity(struct inode *inode, char *buffer, * @inode: inode to extract the info from * @secid: where result will be saved */ -static void smack_inode_getsecid(const struct inode *inode, u32 *secid) +static void smack_inode_getsecid(struct inode *inode, u32 *secid) { struct inode_smack *isp = inode->i_security; -- cgit v1.3-14-g43fede From c4b7a7755f91081e430bbd58fec77194b05f834b Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 13 Jan 2016 09:15:18 -0500 Subject: audit: don't needlessly reset valid wait time After auditd has recovered from an overflowed queue, the first process that doesn't use reserves to make it through the queue checks should reset the audit backlog wait time to the configured value. After that, there is no need to keep resetting it. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354a52..6d00bd1ff249 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1400,7 +1400,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - if (!reserve) + if (!reserve && !audit_backlog_wait_time) audit_backlog_wait_time = audit_backlog_wait_time_master; ab = audit_buffer_alloc(ctx, gfp_mask, type); -- cgit v1.3-14-g43fede From eb8baf6aa3ba1fcb1c1fd2cc57e31195a42689fd Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 13 Jan 2016 09:15:18 -0500 Subject: audit: remove audit_backlog_wait_overflow It seems much more obvious and readable to simply use "0". Signed-off-by: Paul Moore --- kernel/audit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6d00bd1ff249..07d60e4b2af8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -110,7 +110,6 @@ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; -static u32 audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ kuid_t audit_sig_uid = INVALID_UID; @@ -1395,7 +1394,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, skb_queue_len(&audit_skb_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = audit_backlog_wait_overflow; + audit_backlog_wait_time = 0; wake_up(&audit_backlog_wait); return NULL; } -- cgit v1.3-14-g43fede From f48a942926c58e4b2dfc3f21c58579d5435841ef Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 13 Jan 2016 09:15:19 -0500 Subject: audit: include auditd's threads in audit_log_start() wait exception Should auditd spawn threads, allow all members of its thread group to use the audit_backlog_limit reserves to bypass the queue limits too. Signed-off-by: Richard Guy Briggs [PM: minor upstream merge tweaks] Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 07d60e4b2af8..60c9c5adc5be 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1371,7 +1371,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; if (gfp_mask & __GFP_DIRECT_RECLAIM) { - if (audit_pid && audit_pid == current->pid) + if (audit_pid && audit_pid == current->tgid) gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; -- cgit v1.3-14-g43fede From 1194b994bec308433cc84ffdb92fd668713b8f93 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 13 Jan 2016 09:18:54 -0500 Subject: audit: wake up threads if queue switched from limited to unlimited If the audit_backlog_limit is changed from a limited value to an unlimited value (zero) while the queue was overflowed, wake up the audit_backlog_wait queue to allow those processes to continue. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 60c9c5adc5be..d7b675781934 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -523,7 +523,8 @@ static int kauditd_thread(void *dummy) skb = skb_dequeue(&audit_skb_queue); if (skb) { - if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit) + if (!audit_backlog_limit || + (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)) wake_up(&audit_backlog_wait); if (audit_pid) kauditd_send_skb(skb); -- cgit v1.3-14-g43fede From d865e573b8a4f30fbb74fa7666ca81e3132eb547 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 13 Jan 2016 09:18:55 -0500 Subject: audit: Delete unnecessary checks before two function calls The functions consume_skb() and kfree_skb() test whether their argument is NULL and then return immediately. Thus the tests around their calls are not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring [PM: tweak patch prefix] Signed-off-by: Paul Moore --- kernel/audit.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d7b675781934..d6dd95cc59e6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -508,8 +508,7 @@ static void flush_hold_queue(void) * if auditd just disappeared but we * dequeued an skb we need to drop ref */ - if (skb) - consume_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) @@ -1232,9 +1231,7 @@ static void audit_buffer_free(struct audit_buffer *ab) if (!ab) return; - if (ab->skb) - kfree_skb(ab->skb); - + kfree_skb(ab->skb); spin_lock_irqsave(&audit_freelist_lock, flags); if (audit_freelist_count > AUDIT_MAXFREE) kfree(ab); -- cgit v1.3-14-g43fede From 7717c6be699975f6733d278b13b7c4295d73caf6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 Jan 2016 15:48:54 -0500 Subject: tracing: Fix stacktrace skip depth in trace_buffer_unlock_commit_regs() While cleaning the stacktrace code I unintentially changed the skip depth of trace_buffer_unlock_commit_regs() from 0 to 6. kprobes uses this function, and with skipping 6 call backs, it can easily produce no stack. Here's how I tested it: # echo 'p:ext4_sync_fs ext4_sync_fs ' > /sys/kernel/debug/tracing/kprobe_events # echo 1 > /sys/kernel/debug/tracing/events/kprobes/enable # cat /sys/kernel/debug/trace sync-2394 [005] 502.457060: ext4_sync_fs: (ffffffff81317650) sync-2394 [005] 502.457063: kernel_stack: sync-2394 [005] 502.457086: ext4_sync_fs: (ffffffff81317650) sync-2394 [005] 502.457087: kernel_stack: sync-2394 [005] 502.457091: ext4_sync_fs: (ffffffff81317650) After putting back the skip stack to zero, we have: sync-2270 [000] 748.052693: ext4_sync_fs: (ffffffff81317650) sync-2270 [000] 748.052695: kernel_stack: => iterate_supers (ffffffff8126412e) => sys_sync (ffffffff8129c4b6) => entry_SYSCALL_64_fastpath (ffffffff8181f0b2) sync-2270 [000] 748.053017: ext4_sync_fs: (ffffffff81317650) sync-2270 [000] 748.053019: kernel_stack: => iterate_supers (ffffffff8126412e) => sys_sync (ffffffff8129c4b6) => entry_SYSCALL_64_fastpath (ffffffff8181f0b2) sync-2270 [000] 748.053381: ext4_sync_fs: (ffffffff81317650) sync-2270 [000] 748.053383: kernel_stack: => iterate_supers (ffffffff8126412e) => sys_sync (ffffffff8129c4b6) => entry_SYSCALL_64_fastpath (ffffffff8181f0b2) Cc: stable@vger.kernel.org # v4.4+ Fixes: 73dddbb57bb0 "tracing: Only create stacktrace option when STACKTRACE is configured" Reported-by: Brendan Gregg Tested-by: Brendan Gregg Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87fb9801bd9e..d9293402ee68 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1751,7 +1751,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 6, pc, regs); + ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); -- cgit v1.3-14-g43fede From 570540d50710ed192e98e2f7f74578c9486b6b05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jan 2016 14:07:25 +0100 Subject: genirq: Validate action before dereferencing it in handle_irq_event_percpu() commit 71f64340fc0e changed the handling of irq_desc->action from CPU 0 CPU 1 free_irq() lock(desc) lock(desc) handle_edge_irq() if (desc->action) { handle_irq_event() action = desc->action unlock(desc) desc->action = NULL handle_irq_event_percpu(desc, action) action->xxx to CPU 0 CPU 1 free_irq() lock(desc) lock(desc) handle_edge_irq() if (desc->action) { handle_irq_event() unlock(desc) desc->action = NULL handle_irq_event_percpu(desc, action) action = desc->action action->xxx So if free_irq manages to set the action to NULL between the unlock and before the readout, we happily dereference a null pointer. We could simply revert 71f64340fc0e, but we want to preserve the better code generation. A simple solution is to change the action loop from a do {} while to a while {} loop. This is safe because we either see a valid desc->action or NULL. If the action is about to be removed it is still valid as free_irq() is blocked on synchronize_irq(). CPU 0 CPU 1 free_irq() lock(desc) lock(desc) handle_edge_irq() handle_irq_event(desc) set(INPROGRESS) unlock(desc) handle_irq_event_percpu(desc) action = desc->action desc->action = NULL while (action) { action->xxx ... action = action->next; sychronize_irq() while(INPROGRESS); lock(desc) clr(INPROGRESS) free(action) That's basically the same mechanism as we have for shared interrupts. action->next can become NULL while handle_irq_event_percpu() runs. Either it sees the action or NULL. It does not matter, because action itself cannot go away before the interrupt in progress flag has been cleared. Fixes: commit 71f64340fc0e "genirq: Remove the second parameter from handle_irq_event_percpu()" Reported-by: zyjzyj2000@gmail.com Signed-off-by: Thomas Gleixner Cc: Huang Shijie Cc: Jiang Liu Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1601131224190.3575@nanos --- kernel/irq/handle.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a302cf9a2126..57bff7857e87 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -138,7 +138,8 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) unsigned int flags = 0, irq = desc->irq_data.irq; struct irqaction *action = desc->action; - do { + /* action might have become NULL since we dropped the lock */ + while (action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,7 +174,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) retval |= res; action = action->next; - } while (action); + } add_interrupt_randomness(irq, flags); -- cgit v1.3-14-g43fede From bf6f869f8c111ba265cd199c64b25c646987500a Mon Sep 17 00:00:00 2001 From: Liu Jiang Date: Tue, 12 Jan 2016 13:18:06 -0700 Subject: genirq/MSI: Relax msi_domain_alloc() to support parentless MSI irqdomains Previously msi_domain_alloc() assumed MSI irqdomains always had parent irqdomains, but that's not true for the new Intel VMD devices. Relax msi_domain_alloc() to support parentless MSI irqdomains. Signed-off-by: Jiang Liu Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Acked-by: Thomas Gleixner --- kernel/irq/msi.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6b0c0b74a2a1..5e15cb4b2f00 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -109,9 +109,11 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, if (irq_find_mapping(domain, hwirq) > 0) return -EEXIST; - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret < 0) - return ret; + if (domain->parent) { + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + } for (i = 0; i < nr_irqs; i++) { ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); -- cgit v1.3-14-g43fede From 64bce3e8bfb3cf2cbc638296390af3e02e747347 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Jan 2016 13:18:07 -0700 Subject: irqdomain: Export irq_domain_set_info() for module use Export irq_domain_set_info() for module use. It will be used by the Volume Management Device driver. [bhelgaas: changelog] Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Acked-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 22aa9612ef7c..ca05cc841f00 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1058,6 +1058,7 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, __irq_set_handler(virq, handler, 0, handler_name); irq_set_handler_data(virq, handler_data); } +EXPORT_SYMBOL(irq_domain_set_info); /** * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data -- cgit v1.3-14-g43fede From 46373a15f65fe862f31c19a484acdf551f2b442f Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 11 Jan 2016 17:40:31 +0100 Subject: time: nohz: Expose tick_nohz_enabled The cpuidle subsystem needs it. Signed-off-by: Jean Delvare Reviewed-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- include/linux/tick.h | 2 ++ kernel/time/tick-sched.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index e312219ff823..97fd4e543846 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -98,6 +98,7 @@ static inline void tick_broadcast_exit(void) } #ifdef CONFIG_NO_HZ_COMMON +extern int tick_nohz_enabled; extern int tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); @@ -106,6 +107,7 @@ extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ +#define tick_nohz_enabled (0) static inline int tick_nohz_tick_stopped(void) { return 0; } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7c7ec4515983..edfea95c39db 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -387,7 +387,7 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_enabled __read_mostly = 1; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode -- cgit v1.3-14-g43fede From d281ee6145183594788ab6d5b55f8d144e69eace Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:16 -0800 Subject: rmap: add argument to charge compound page We're going to allow mapping of individual 4k pages of THP compound page. It means we cannot rely on PageTransHuge() check to decide if map/unmap small page or THP. The patch adds new argument to rmap functions to indicate whether we want to operate on whole compound page or only the small page. [n-horiguchi@ah.jp.nec.com: fix mapcount mismatch in hugepage migration] Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 12 +++++++++--- kernel/events/uprobes.c | 4 ++-- mm/huge_memory.c | 16 ++++++++-------- mm/hugetlb.c | 4 ++-- mm/ksm.c | 4 ++-- mm/memory.c | 14 +++++++------- mm/migrate.c | 8 ++++---- mm/rmap.c | 48 +++++++++++++++++++++++++++++++----------------- mm/swapfile.c | 4 ++-- mm/userfaultfd.c | 2 +- 10 files changed, 68 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 29446aeef36e..038b6e704d9b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -161,16 +161,22 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *page_get_anon_vma(struct page *page); +/* bitflags for do_page_add_anon_rmap() */ +#define RMAP_EXCLUSIVE 0x01 +#define RMAP_COMPOUND 0x02 + /* * rmap interfaces called when adding or removing pte of page */ void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); -void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_add_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, bool); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, int); -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, bool); void page_add_file_rmap(struct page *); -void page_remove_rmap(struct page *); +void page_remove_rmap(struct page *, bool); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bb0669169716..060c7a0edfdf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -175,7 +175,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, goto unlock; get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr); + page_add_new_anon_rmap(kpage, vma, addr, false); mem_cgroup_commit_charge(kpage, memcg, false); lru_cache_add_active_or_unevictable(kpage, vma); @@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); pte_unmap_unlock(ptep, ptl); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 370d44a5e25b..b7669cfe9dc9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -797,7 +797,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr); + page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); @@ -1139,7 +1139,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vma, haddr); + page_add_new_anon_rmap(pages[i], vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false); lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); @@ -1151,7 +1151,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - page_remove_rmap(page); + page_remove_rmap(page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1271,7 +1271,7 @@ alloc: entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_huge_clear_flush_notify(vma, haddr, pmd); - page_add_new_anon_rmap(new_page, vma, haddr); + page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); @@ -1281,7 +1281,7 @@ alloc: put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); - page_remove_rmap(page); + page_remove_rmap(page, true); put_page(page); } ret |= VM_FAULT_WRITE; @@ -1508,7 +1508,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, put_huge_zero_page(); } else { struct page *page = pmd_page(orig_pmd); - page_remove_rmap(page); + page_remove_rmap(page, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON_PAGE(!PageHead(page), page); @@ -2371,7 +2371,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * superfluous. */ pte_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page); + page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -2682,7 +2682,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address, true); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cdf38252f82e..e924529f7b38 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3186,7 +3186,7 @@ again: set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page); + page_remove_rmap(page, true); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { address += sz; @@ -3415,7 +3415,7 @@ retry_avoidcopy: mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); - page_remove_rmap(old_page); + page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; diff --git a/mm/ksm.c b/mm/ksm.c index 643abe7a75de..b4f7b69efad0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -956,13 +956,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } get_page(kpage); - page_add_anon_rmap(kpage, vma, addr); + page_add_anon_rmap(kpage, vma, addr, false); flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); diff --git a/mm/memory.c b/mm/memory.c index f9360dde6967..f964d190ce83 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1118,7 +1118,7 @@ again: mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page); + page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(!__tlb_remove_page(tlb, page))) { @@ -2118,7 +2118,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); - page_add_new_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address, false); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2151,7 +2151,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page); + page_remove_rmap(old_page, false); } /* Free the old page.. */ @@ -2567,7 +2567,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; - exclusive = 1; + exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(orig_pte)) @@ -2577,7 +2577,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, do_page_add_anon_rmap(page, vma, address, exclusive); mem_cgroup_commit_charge(page, memcg, true); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address, false); mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2735,7 +2735,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address, false); mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); setpte: @@ -2824,7 +2824,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address, false); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page); diff --git a/mm/migrate.c b/mm/migrate.c index f7f345ddc9ae..3921f20f8de4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -167,7 +167,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_dup_rmap(new); } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, addr); + page_add_anon_rmap(new, vma, addr, false); else page_add_file_rmap(new); @@ -1815,7 +1815,7 @@ fail_putback: * guarantee the copy is visible before the pagetable update. */ flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start); + page_add_anon_rmap(new_page, vma, mmun_start, true); pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); @@ -1826,14 +1826,14 @@ fail_putback: flush_tlb_range(vma, mmun_start, mmun_end); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); - page_remove_rmap(new_page); + page_remove_rmap(new_page, true); goto fail_putback; } mlock_migrate_page(new_page, page); set_page_memcg(new_page, page_memcg(page)); set_page_memcg(page, NULL); - page_remove_rmap(page); + page_remove_rmap(page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/rmap.c b/mm/rmap.c index 622756c16ac8..c330f9aba63a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1133,6 +1133,7 @@ static void __page_check_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, @@ -1140,9 +1141,9 @@ static void __page_check_anon_rmap(struct page *page, * (but PageKsm is never downgraded to PageAnon). */ void page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool compound) { - do_page_add_anon_rmap(page, vma, address, 0); + do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0); } /* @@ -1151,21 +1152,24 @@ void page_add_anon_rmap(struct page *page, * Everybody else should continue to use page_add_anon_rmap above. */ void do_page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, int exclusive) + struct vm_area_struct *vma, unsigned long address, int flags) { int first = atomic_inc_and_test(&page->_mapcount); if (first) { + bool compound = flags & RMAP_COMPOUND; + int nr = compound ? hpage_nr_pages(page) : 1; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption * disabled. */ - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - hpage_nr_pages(page)); + } + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); } if (unlikely(PageKsm(page))) return; @@ -1173,7 +1177,8 @@ void do_page_add_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); /* address might be in next vma when migration races vma_adjust */ if (first) - __page_set_anon_rmap(page, vma, address, exclusive); + __page_set_anon_rmap(page, vma, address, + flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); } @@ -1183,21 +1188,25 @@ void do_page_add_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @compound: charge the page as compound or small page * * Same as page_add_anon_rmap but must only be called on *new* pages. * This means the inc-and-test can be bypassed. * Page does not have to be locked. */ void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool compound) { + int nr = compound ? hpage_nr_pages(page) : 1; + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - hpage_nr_pages(page)); + } + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1249,13 +1258,17 @@ out: /** * page_remove_rmap - take down pte mapping from a page - * @page: page to remove mapping from + * @page: page to remove mapping from + * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page) +void page_remove_rmap(struct page *page, bool compound) { + int nr = compound ? hpage_nr_pages(page) : 1; + if (!PageAnon(page)) { + VM_BUG_ON_PAGE(compound && !PageHuge(page), page); page_remove_file_rmap(page); return; } @@ -1273,11 +1286,12 @@ void page_remove_rmap(struct page *page) * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + } - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - -hpage_nr_pages(page)); + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1416,7 +1430,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } else dec_mm_counter(mm, mm_counter_file(page)); - page_remove_rmap(page); + page_remove_rmap(page, PageHuge(page)); page_cache_release(page); out_unmap: diff --git a/mm/swapfile.c b/mm/swapfile.c index e6b8591a3ed2..058e6f0162eb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1160,10 +1160,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr); + page_add_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, true); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr); + page_add_new_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, vma); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 77fee9325a57..ae21a1f309c2 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -76,7 +76,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_uncharge_unlock; inc_mm_counter(dst_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, dst_vma, dst_addr); + page_add_new_anon_rmap(page, dst_vma, dst_addr, false); mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_active_or_unevictable(page, dst_vma); -- cgit v1.3-14-g43fede From f627c2f53786b0445abca47f6aa84c96a1fffec2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:20 -0800 Subject: memcg: adjust to support new THP refcounting As with rmap, with new refcounting we cannot rely on PageTransHuge() to check if we need to charge size of huge page form the cgroup. We need to get information from caller to know whether it was mapped with PMD or PTE. We do uncharge when last reference on the page gone. At that point if we see PageTransHuge() it means we need to unchange whole huge page. The tricky part is partial unmap -- when we try to unmap part of huge page. We don't do a special handing of this situation, meaning we don't uncharge the part of huge page unless last user is gone or split_huge_page() is triggered. In case of cgroup memory pressure happens the partial unmapped page will be split through shrinker. This should be good enough. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 16 +++++++----- kernel/events/uprobes.c | 7 +++--- mm/filemap.c | 8 +++--- mm/huge_memory.c | 32 +++++++++++++----------- mm/memcontrol.c | 62 +++++++++++++++++----------------------------- mm/memory.c | 28 ++++++++++----------- mm/shmem.c | 21 +++++++++------- mm/swapfile.c | 9 ++++--- mm/userfaultfd.c | 6 ++--- 9 files changed, 92 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2292468f2a30..189f04d4d2ec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -280,10 +280,12 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg, bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp); + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare); -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); + bool lrucare, bool compound); +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, + bool compound); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); @@ -515,7 +517,8 @@ static inline bool mem_cgroup_low(struct mem_cgroup *root, static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, - struct mem_cgroup **memcgp) + struct mem_cgroup **memcgp, + bool compound) { *memcgp = NULL; return 0; @@ -523,12 +526,13 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, static inline void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) + bool lrucare, bool compound) { } static inline void mem_cgroup_cancel_charge(struct page *page, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg, + bool compound) { } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 060c7a0edfdf..0167679182c0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; - err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, + false); if (err) return err; @@ -176,7 +177,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr, false); - mem_cgroup_commit_charge(kpage, memcg, false); + mem_cgroup_commit_charge(kpage, memcg, false, false); lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { @@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg); + mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; diff --git a/mm/filemap.c b/mm/filemap.c index ae652ded700c..a729345ed6ec 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -618,7 +618,7 @@ static int __add_to_page_cache_locked(struct page *page, if (!huge) { error = mem_cgroup_try_charge(page, current->mm, - gfp_mask, &memcg); + gfp_mask, &memcg, false); if (error) return error; } @@ -626,7 +626,7 @@ static int __add_to_page_cache_locked(struct page *page, error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error) { if (!huge) - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); return error; } @@ -645,7 +645,7 @@ static int __add_to_page_cache_locked(struct page *page, __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); if (!huge) - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; err_insert: @@ -653,7 +653,7 @@ err_insert: /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); if (!huge) - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); return error; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b7669cfe9dc9..4211682f223b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -751,7 +751,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { + if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -759,7 +759,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); return VM_FAULT_OOM; } @@ -775,7 +775,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { spin_unlock(ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(mm, pgtable); } else { @@ -786,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, int ret; spin_unlock(ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(mm, pgtable); ret = handle_userfault(vma, address, flags, @@ -798,7 +798,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); @@ -1095,13 +1095,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, - &memcg))) { + &memcg, false))) { if (pages[i]) put_page(pages[i]); while (--i >= 0) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg); + mem_cgroup_cancel_charge(pages[i], memcg, + false); put_page(pages[i]); } kfree(pages); @@ -1140,7 +1141,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); page_add_new_anon_rmap(pages[i], vma, haddr, false); - mem_cgroup_commit_charge(pages[i], memcg, false); + mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); @@ -1168,7 +1169,7 @@ out_free_pages: for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg); + mem_cgroup_cancel_charge(pages[i], memcg, false); put_page(pages[i]); } kfree(pages); @@ -1234,7 +1235,8 @@ alloc: goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, + true))) { put_page(new_page); if (page) { split_huge_page(page); @@ -1263,7 +1265,7 @@ alloc: put_user_huge_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; } else { @@ -1272,7 +1274,7 @@ alloc: entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_huge_clear_flush_notify(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); - mem_cgroup_commit_charge(new_page, memcg, false); + mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); @@ -2583,7 +2585,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -2683,7 +2685,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); - mem_cgroup_commit_charge(new_page, memcg, false); + mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -2703,7 +2705,7 @@ out_nolock: trace_mm_collapse_huge_page(mm, isolated, result); return; out: - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, true); goto out_up_write; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 54eae4f19d80..311fd2b71bae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -647,7 +647,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - int nr_pages) + bool compound, int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is @@ -660,9 +660,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_pages); + } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) @@ -4513,30 +4515,24 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, * from old cgroup. */ static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, + bool compound, struct mem_cgroup *from, struct mem_cgroup *to) { unsigned long flags; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret; bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; + VM_BUG_ON(compound && !PageTransHuge(page)); /* * Prevent mem_cgroup_replace_page() from looking at * page->mem_cgroup of its source page while we change it. */ + ret = -EBUSY; if (!trylock_page(page)) goto out; @@ -4591,9 +4587,9 @@ static int mem_cgroup_move_account(struct page *page, ret = 0; local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); + mem_cgroup_charge_statistics(to, page, compound, nr_pages); memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); + mem_cgroup_charge_statistics(from, page, compound, -nr_pages); memcg_check_events(from, page); local_irq_enable(); out_unlock: @@ -4890,7 +4886,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, if (target_type == MC_TARGET_PAGE) { page = target.page; if (!isolate_lru_page(page)) { - if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, + if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; @@ -4919,7 +4915,8 @@ retry: page = target.page; if (isolate_lru_page(page)) goto put; - if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { + if (!mem_cgroup_move_account(page, false, + mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -5258,10 +5255,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) * with mem_cgroup_cancel_charge() in case page instantiation fails. */ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp) + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound) { struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret = 0; if (mem_cgroup_disabled()) @@ -5291,11 +5289,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, } } - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - if (!memcg) memcg = get_mem_cgroup_from_mm(mm); @@ -5324,9 +5317,9 @@ out: * Use mem_cgroup_cancel_charge() to cancel the transaction instead. */ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) + bool lrucare, bool compound) { - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; VM_BUG_ON_PAGE(!page->mapping, page); VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); @@ -5343,13 +5336,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, commit_charge(page, memcg, lrucare); - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); + mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); memcg_check_events(memcg, page); local_irq_enable(); @@ -5371,9 +5359,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * * Cancel a charge transaction started by mem_cgroup_try_charge(). */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, + bool compound) { - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; if (mem_cgroup_disabled()) return; @@ -5385,11 +5374,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) if (!memcg) return; - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - cancel_charge(memcg, nr_pages); } @@ -5750,7 +5734,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for udpating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, -1); + mem_cgroup_charge_statistics(memcg, page, false, -1); memcg_check_events(memcg, page); } diff --git a/mm/memory.c b/mm/memory.c index f964d190ce83..a021c295e88d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2087,7 +2087,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, cow_user_page(new_page, old_page, address, vma); } - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_new; __SetPageUptodate(new_page); @@ -2119,7 +2119,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, */ ptep_clear_flush_notify(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address, false); - mem_cgroup_commit_charge(new_page, memcg, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -2158,7 +2158,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, new_page = old_page; page_copied = 1; } else { - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, false); } if (new_page) @@ -2533,7 +2533,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page; } - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } @@ -2575,10 +2575,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, pte); if (page == swapcache) { do_page_add_anon_rmap(page, vma, address, exclusive); - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, address, false); - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2613,7 +2613,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); @@ -2707,7 +2707,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!page) goto oom; - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_page; /* @@ -2728,7 +2728,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(page_table, ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); return handle_userfault(vma, address, flags, VM_UFFD_MISSING); @@ -2736,7 +2736,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address, false); - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(mm, address, page_table, entry); @@ -2747,7 +2747,7 @@ unlock: pte_unmap_unlock(page_table, ptl); return 0; release: - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); goto unlock; oom_free_page: @@ -3000,7 +3000,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { page_cache_release(new_page); return VM_FAULT_OOM; } @@ -3029,7 +3029,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); - mem_cgroup_commit_charge(new_page, memcg, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); if (fault_page) { @@ -3044,7 +3044,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } return ret; uncharge_out: - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, false); page_cache_release(new_page); return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index d271932f9ef9..b98e1011858c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -810,7 +810,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); + error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, + false); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -833,9 +834,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) if (error) { if (error != -ENOMEM) error = 0; - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); } else - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); out: unlock_page(page); page_cache_release(page); @@ -1218,7 +1219,8 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, swp_to_radix_entry(swap)); @@ -1235,14 +1237,14 @@ repeat: * "repeat": reading a hole and writing should succeed. */ if (error) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); delete_from_swap_cache(page); } } if (error) goto failed; - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); spin_lock(&info->lock); info->swapped--; @@ -1281,7 +1283,8 @@ repeat: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + false); if (error) goto decused; error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); @@ -1291,10 +1294,10 @@ repeat: radix_tree_preload_end(); } if (error) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); goto decused; } - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_anon(page); spin_lock(&info->lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index 058e6f0162eb..efe26bb10adb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1142,14 +1142,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); ret = 0; goto out; } @@ -1161,10 +1162,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ae21a1f309c2..806b0c758c5b 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) goto out_release; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); @@ -77,7 +77,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, inc_mm_counter(dst_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -91,7 +91,7 @@ out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); out_release: page_cache_release(page); goto out; -- cgit v1.3-14-g43fede From 14d27abd1d12a64c89df1ce8c00ef1403226db5a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:00 -0800 Subject: futex, thp: remove special case for THP in get_futex_key With new THP refcounting, we don't need tricks to stabilize huge page. If we've got reference to tail page, it can't split under us. This patch effectively reverts a5b338f2b0b1 ("thp: update futex compound knowledge"). Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Tested-by: Artem Savkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 63 +++++++++++++--------------------------------------------- 1 file changed, 14 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8a310e240cda..eed92a8a4c49 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page, *page_head; + struct page *page; + struct address_space *mapping; int err, ro = 0; /* @@ -519,46 +520,9 @@ again: else err = 0; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - page_head = page; - if (unlikely(PageTail(page))) { - put_page(page); - /* serialize against __split_huge_page_splitting() */ - local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { - page_head = compound_head(page); - /* - * page_head is valid pointer but we must pin - * it before taking the PG_lock and/or - * PG_compound_lock. The moment we re-enable - * irqs __split_huge_page_splitting() can - * return and the head page can be freed from - * under us. We can't take the PG_lock and/or - * PG_compound_lock on a page that could be - * freed from under us. - */ - if (page != page_head) { - get_page(page_head); - put_page(page); - } - local_irq_enable(); - } else { - local_irq_enable(); - goto again; - } - } -#else - page_head = compound_head(page); - if (page != page_head) { - get_page(page_head); - put_page(page); - } -#endif - - lock_page(page_head); - + lock_page(page); /* - * If page_head->mapping is NULL, then it cannot be a PageAnon + * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast @@ -570,12 +534,13 @@ again: * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page_head->mapping. + * an unlikely race, but we do need to retry for page->mapping. */ - if (!page_head->mapping) { - int shmem_swizzled = PageSwapCache(page_head); - unlock_page(page_head); - put_page(page_head); + mapping = compound_head(page)->mapping; + if (!mapping) { + int shmem_swizzled = PageSwapCache(page); + unlock_page(page); + put_page(page); if (shmem_swizzled) goto again; return -EFAULT; @@ -588,7 +553,7 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page_head)) { + if (PageAnon(page)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. @@ -603,15 +568,15 @@ again: key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page_head->mapping->host; + key->shared.inode = mapping->host; key->shared.pgoff = basepage_index(page); } get_futex_key_refs(key); /* implies MB (B) */ out: - unlock_page(page_head); - put_page(page_head); + unlock_page(page); + put_page(page); return err; } -- cgit v1.3-14-g43fede From 34c0fd540e79fb49ef9ce864dae1058cca265780 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:14 -0800 Subject: mm, dax, pmem: introduce pfn_t For the purpose of communicating the optional presence of a 'struct page' for the pfn returned from ->direct_access(), introduce a type that encapsulates a page-frame-number plus flags. These flags contain the historical "page_link" encoding for a scatterlist entry, but can also denote "device memory". Where "device memory" is a set of pfns that are not part of the kernel's linear mapping by default, but are accessed via the same memory controller as ram. The motivation for this new type is large capacity persistent memory that needs struct page entries in the 'memmap' to support 3rd party DMA (i.e. O_DIRECT I/O with a persistent memory source/target). However, we also need it in support of maintaining a list of mapped inodes which need to be unmapped at driver teardown or freeze_bdev() time. Signed-off-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Hansen Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/sysdev/axonram.c | 9 +++--- drivers/block/brd.c | 7 +++-- drivers/nvdimm/pmem.c | 13 ++++++--- drivers/s390/block/dcssblk.c | 11 ++++--- fs/dax.c | 11 ++++--- include/linux/blkdev.h | 5 ++-- include/linux/pfn.h | 9 ++++++ include/linux/pfn_t.h | 67 +++++++++++++++++++++++++++++++++++++++++++ kernel/memremap.c | 7 +++++ 9 files changed, 116 insertions(+), 23 deletions(-) create mode 100644 include/linux/pfn_t.h (limited to 'kernel') diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index c713b349d967..0d112b94d91d 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -142,15 +143,13 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) */ static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void __pmem **kaddr, unsigned long *pfn) + void __pmem **kaddr, pfn_t *pfn) { struct axon_ram_bank *bank = device->bd_disk->private_data; loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; - void *addr = (void *)(bank->ph_addr + offset); - - *kaddr = (void __pmem *)addr; - *pfn = virt_to_phys(addr) >> PAGE_SHIFT; + *kaddr = (void __pmem __force *) bank->io_addr + offset; + *pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV); return bank->size - offset; } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index a5880f4ab40e..cb27190e9f39 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -19,6 +19,9 @@ #include #include #include +#ifdef CONFIG_BLK_DEV_RAM_DAX +#include +#endif #include @@ -378,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, #ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, unsigned long *pfn) + void __pmem **kaddr, pfn_t *pfn) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; @@ -389,7 +392,7 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector, if (!page) return -ENOSPC; *kaddr = (void __pmem *)page_address(page); - *pfn = page_to_pfn(page); + *pfn = page_to_pfn_t(page); return PAGE_SIZE; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index b493ff3fccb2..5def7f4ddbd2 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ struct pmem_device { phys_addr_t phys_addr; /* when non-zero this device is hosting a 'pfn' instance */ phys_addr_t data_offset; + unsigned long pfn_flags; void __pmem *virt_addr; size_t size; struct badblocks bb; @@ -135,13 +137,13 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, } static long pmem_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, unsigned long *pfn) + void __pmem **kaddr, pfn_t *pfn) { struct pmem_device *pmem = bdev->bd_disk->private_data; resource_size_t offset = sector * 512 + pmem->data_offset; *kaddr = pmem->virt_addr + offset; - *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; + *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); return pmem->size - offset; } @@ -174,9 +176,11 @@ static struct pmem_device *pmem_alloc(struct device *dev, return ERR_PTR(-EBUSY); } - if (pmem_should_map_pages(dev)) + pmem->pfn_flags = PFN_DEV; + if (pmem_should_map_pages(dev)) { pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res); - else + pmem->pfn_flags |= PFN_MAP; + } else pmem->virt_addr = (void __pmem *) devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); @@ -384,6 +388,7 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) pmem = dev_get_drvdata(dev); devm_memunmap(dev, (void __force *) pmem->virt_addr); pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res); + pmem->pfn_flags |= PFN_MAP; if (IS_ERR(pmem->virt_addr)) { rc = PTR_ERR(pmem->virt_addr); goto err; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 94a8f4ab57bc..ce7b70181740 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -30,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode); static blk_qc_t dcssblk_make_request(struct request_queue *q, struct bio *bio); static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, - void __pmem **kaddr, unsigned long *pfn); + void __pmem **kaddr, pfn_t *pfn); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -883,20 +884,18 @@ fail: static long dcssblk_direct_access (struct block_device *bdev, sector_t secnum, - void __pmem **kaddr, unsigned long *pfn) + void __pmem **kaddr, pfn_t *pfn) { struct dcssblk_dev_info *dev_info; unsigned long offset, dev_sz; - void *addr; dev_info = bdev->bd_disk->private_data; if (!dev_info) return -ENODEV; dev_sz = dev_info->end - dev_info->start; offset = secnum * 512; - addr = (void *) (dev_info->start + offset); - *pfn = virt_to_phys(addr) >> PAGE_SHIFT; - *kaddr = (void __pmem *) addr; + *kaddr = (void __pmem *) (dev_info->start + offset); + *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV); return dev_sz - offset; } diff --git a/fs/dax.c b/fs/dax.c index 3220da70ee20..6b13d6cd9a9a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -28,6 +28,7 @@ #include #include #include +#include #include static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) @@ -362,7 +363,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, } dax_unmap_atomic(bdev, &dax); - error = vm_insert_mixed(vma, vaddr, dax.pfn); + error = vm_insert_mixed(vma, vaddr, pfn_t_to_pfn(dax.pfn)); out: i_mmap_unlock_read(mapping); @@ -667,7 +668,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, result = VM_FAULT_SIGBUS; goto out; } - if ((length < PMD_SIZE) || (dax.pfn & PG_PMD_COLOUR)) { + if (length < PMD_SIZE + || (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)) { dax_unmap_atomic(bdev, &dax); goto fallback; } @@ -676,7 +678,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * TODO: teach vmf_insert_pfn_pmd() to support * 'pte_special' for pmds */ - if (pfn_valid(dax.pfn)) { + if (pfn_t_has_page(dax.pfn)) { dax_unmap_atomic(bdev, &dax); goto fallback; } @@ -690,7 +692,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } dax_unmap_atomic(bdev, &dax); - result |= vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); + result |= vmf_insert_pfn_pmd(vma, address, pmd, + pfn_t_to_pfn(dax.pfn), write); } out: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 88821fa26f19..bfb64d672e19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1628,7 +1629,7 @@ struct blk_dax_ctl { sector_t sector; void __pmem *addr; long size; - unsigned long pfn; + pfn_t pfn; }; struct block_device_operations { @@ -1638,7 +1639,7 @@ struct block_device_operations { int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); long (*direct_access)(struct block_device *, sector_t, void __pmem **, - unsigned long *pfn); + pfn_t *); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ diff --git a/include/linux/pfn.h b/include/linux/pfn.h index 97f3e88aead4..2d8e49711b63 100644 --- a/include/linux/pfn.h +++ b/include/linux/pfn.h @@ -3,6 +3,15 @@ #ifndef __ASSEMBLY__ #include + +/* + * pfn_t: encapsulates a page-frame number that is optionally backed + * by memmap (struct page). Whether a pfn_t has a 'struct page' + * backing is indicated by flags in the high bits of the value. + */ +typedef struct { + unsigned long val; +} pfn_t; #endif #define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK) diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h new file mode 100644 index 000000000000..c557a0e0b20c --- /dev/null +++ b/include/linux/pfn_t.h @@ -0,0 +1,67 @@ +#ifndef _LINUX_PFN_T_H_ +#define _LINUX_PFN_T_H_ +#include + +/* + * PFN_FLAGS_MASK - mask of all the possible valid pfn_t flags + * PFN_SG_CHAIN - pfn is a pointer to the next scatterlist entry + * PFN_SG_LAST - pfn references a page and is the last scatterlist entry + * PFN_DEV - pfn is not covered by system memmap by default + * PFN_MAP - pfn has a dynamic page mapping established by a device driver + */ +#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \ + << (BITS_PER_LONG - PAGE_SHIFT)) +#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1)) +#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2)) +#define PFN_DEV (1UL << (BITS_PER_LONG - 3)) +#define PFN_MAP (1UL << (BITS_PER_LONG - 4)) + +static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags) +{ + pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), }; + + return pfn_t; +} + +/* a default pfn to pfn_t conversion assumes that @pfn is pfn_valid() */ +static inline pfn_t pfn_to_pfn_t(unsigned long pfn) +{ + return __pfn_to_pfn_t(pfn, 0); +} + +extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags); + +static inline bool pfn_t_has_page(pfn_t pfn) +{ + return (pfn.val & PFN_MAP) == PFN_MAP || (pfn.val & PFN_DEV) == 0; +} + +static inline unsigned long pfn_t_to_pfn(pfn_t pfn) +{ + return pfn.val & ~PFN_FLAGS_MASK; +} + +static inline struct page *pfn_t_to_page(pfn_t pfn) +{ + if (pfn_t_has_page(pfn)) + return pfn_to_page(pfn_t_to_pfn(pfn)); + return NULL; +} + +static inline dma_addr_t pfn_t_to_phys(pfn_t pfn) +{ + return PFN_PHYS(pfn_t_to_pfn(pfn)); +} + +static inline void *pfn_t_to_virt(pfn_t pfn) +{ + if (pfn_t_has_page(pfn)) + return __va(pfn_t_to_phys(pfn)); + return NULL; +} + +static inline pfn_t page_to_pfn_t(struct page *page) +{ + return pfn_to_pfn_t(page_to_pfn(page)); +} +#endif /* _LINUX_PFN_T_H_ */ diff --git a/kernel/memremap.c b/kernel/memremap.c index 7658d32c5c78..449cb6a5d9a1 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -12,6 +12,7 @@ */ #include #include +#include #include #include #include @@ -147,6 +148,12 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); +pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +{ + return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); +} +EXPORT_SYMBOL(phys_to_pfn_t); + #ifdef CONFIG_ZONE_DEVICE struct page_map { struct resource res; -- cgit v1.3-14-g43fede From 9476df7d80dfc425b37bfecf1d89edf8ec81fcb6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:19 -0800 Subject: mm: introduce find_dev_pagemap() There are several scenarios where we need to retrieve and update metadata associated with a given devm_memremap_pages() mapping, and the only lookup key available is a pfn in the range: 1/ We want to augment vmemmap_populate() (called via arch_add_memory()) to allocate memmap storage from pre-allocated pages reserved by the device driver. At vmemmap_alloc_block_buf() time it grabs device pages rather than page allocator pages. This is in support of devm_memremap_pages() mappings where the memmap is too large to fit in main memory (i.e. large persistent memory devices). 2/ Taking a reference against the mapping when inserting device pages into the address_space radix of a given inode. This facilitates unmap_mapping_range() and truncate_inode_pages() operations when the driver is tearing down the mapping. 3/ get_user_pages() operations on ZONE_DEVICE memory require taking a reference against the mapping so that the driver teardown path can revoke and drain usage of device pages. Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/nvdimm/pmem.c | 2 +- include/linux/io.h | 15 --------- include/linux/memremap.h | 38 ++++++++++++++++++++++ kernel/memremap.c | 85 +++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 116 insertions(+), 24 deletions(-) create mode 100644 include/linux/memremap.h (limited to 'kernel') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5def7f4ddbd2..904629b97c4f 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -21,9 +21,9 @@ #include #include #include -#include #include #include +#include #include #include #include diff --git a/include/linux/io.h b/include/linux/io.h index de64c1e53612..fffd88d7f426 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -89,21 +89,6 @@ void devm_memunmap(struct device *dev, void *addr); void *__devm_memremap_pages(struct device *dev, struct resource *res); -#ifdef CONFIG_ZONE_DEVICE -void *devm_memremap_pages(struct device *dev, struct resource *res); -#else -static inline void *devm_memremap_pages(struct device *dev, struct resource *res) -{ - /* - * Fail attempts to call devm_memremap_pages() without - * ZONE_DEVICE support enabled, this requires callers to fall - * back to plain devm_memremap() based on config - */ - WARN_ON_ONCE(1); - return ERR_PTR(-ENXIO); -} -#endif - /* * Some systems do not have legacy ISA devices. * /dev/port is not a valid interface on these systems. diff --git a/include/linux/memremap.h b/include/linux/memremap.h new file mode 100644 index 000000000000..d90721c178bb --- /dev/null +++ b/include/linux/memremap.h @@ -0,0 +1,38 @@ +#ifndef _LINUX_MEMREMAP_H_ +#define _LINUX_MEMREMAP_H_ +#include + +struct resource; +struct device; +/** + * struct dev_pagemap - metadata for ZONE_DEVICE mappings + * @dev: host device of the mapping for debug + */ +struct dev_pagemap { + /* TODO: vmem_altmap and percpu_ref count */ + struct device *dev; +}; + +#ifdef CONFIG_ZONE_DEVICE +void *devm_memremap_pages(struct device *dev, struct resource *res); +struct dev_pagemap *find_dev_pagemap(resource_size_t phys); +#else +static inline void *devm_memremap_pages(struct device *dev, + struct resource *res) +{ + /* + * Fail attempts to call devm_memremap_pages() without + * ZONE_DEVICE support enabled, this requires callers to fall + * back to plain devm_memremap() based on config + */ + WARN_ON_ONCE(1); + return ERR_PTR(-ENXIO); +} + +static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) +{ + return NULL; +} +#endif + +#endif /* _LINUX_MEMREMAP_H_ */ diff --git a/kernel/memremap.c b/kernel/memremap.c index 449cb6a5d9a1..61cfbf4d3054 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -10,6 +10,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ +#include +#include #include #include #include @@ -155,22 +157,57 @@ pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) EXPORT_SYMBOL(phys_to_pfn_t); #ifdef CONFIG_ZONE_DEVICE +static DEFINE_MUTEX(pgmap_lock); +static RADIX_TREE(pgmap_radix, GFP_KERNEL); +#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) +#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) + struct page_map { struct resource res; + struct percpu_ref *ref; + struct dev_pagemap pgmap; }; -static void devm_memremap_pages_release(struct device *dev, void *res) +static void pgmap_radix_release(struct resource *res) +{ + resource_size_t key; + + mutex_lock(&pgmap_lock); + for (key = res->start; key <= res->end; key += SECTION_SIZE) + radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); + mutex_unlock(&pgmap_lock); +} + +static void devm_memremap_pages_release(struct device *dev, void *data) { - struct page_map *page_map = res; + struct page_map *page_map = data; + struct resource *res = &page_map->res; + resource_size_t align_start, align_size; + + pgmap_radix_release(res); /* pages are dead and unused, undo the arch mapping */ - arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + arch_remove_memory(align_start, align_size); +} + +/* assumes rcu_read_lock() held at entry */ +struct dev_pagemap *find_dev_pagemap(resource_size_t phys) +{ + struct page_map *page_map; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); + return page_map ? &page_map->pgmap : NULL; } void *devm_memremap_pages(struct device *dev, struct resource *res) { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); + resource_size_t key, align_start, align_size; struct page_map *page_map; int error, nid; @@ -190,18 +227,50 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) memcpy(&page_map->res, res, sizeof(*res)); + page_map->pgmap.dev = dev; + mutex_lock(&pgmap_lock); + error = 0; + for (key = res->start; key <= res->end; key += SECTION_SIZE) { + struct dev_pagemap *dup; + + rcu_read_lock(); + dup = find_dev_pagemap(key); + rcu_read_unlock(); + if (dup) { + dev_err(dev, "%s: %pr collides with mapping for %s\n", + __func__, res, dev_name(dup->dev)); + error = -EBUSY; + break; + } + error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, + page_map); + if (error) { + dev_err(dev, "%s: failed: %d\n", __func__, error); + break; + } + } + mutex_unlock(&pgmap_lock); + if (error) + goto err_radix; + nid = dev_to_node(dev); if (nid < 0) nid = numa_mem_id(); - error = arch_add_memory(nid, res->start, resource_size(res), true); - if (error) { - devres_free(page_map); - return ERR_PTR(error); - } + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + error = arch_add_memory(nid, align_start, align_size, true); + if (error) + goto err_add_memory; devres_add(dev, page_map); return __va(res->start); + + err_add_memory: + err_radix: + pgmap_radix_release(res); + devres_free(page_map); + return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); #endif /* CONFIG_ZONE_DEVICE */ -- cgit v1.3-14-g43fede From 4b94ffdc4163bae1ec73b6e977ffb7a7da3d06d3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:22 -0800 Subject: x86, mm: introduce vmem_altmap to augment vmemmap_populate() In support of providing struct page for large persistent memory capacities, use struct vmem_altmap to change the default policy for allocating memory for the memmap array. The default vmemmap_populate() allocates page table storage area from the page allocator. Given persistent memory capacities relative to DRAM it may not be feasible to store the memmap in 'System Memory'. Instead vmem_altmap represents pre-allocated "device pages" to satisfy vmemmap_alloc_block_buf() requests. Signed-off-by: Dan Williams Reported-by: kbuild test robot Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 33 ++++++++++++++---- drivers/nvdimm/pmem.c | 6 ++-- include/linux/memory_hotplug.h | 3 +- include/linux/memremap.h | 39 +++++++++++++++++++--- include/linux/mm.h | 9 ++++- kernel/memremap.c | 72 +++++++++++++++++++++++++++++++++++++-- mm/memory_hotplug.c | 67 ++++++++++++++++++++++++++----------- mm/page_alloc.c | 11 +++++- mm/sparse-vmemmap.c | 76 ++++++++++++++++++++++++++++++++++++++++-- mm/sparse.c | 8 +++-- 10 files changed, 282 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8829482d69ec..5488d21123bd 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -714,6 +715,12 @@ static void __meminit free_pagetable(struct page *page, int order) { unsigned long magic; unsigned int nr_pages = 1 << order; + struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page); + + if (altmap) { + vmem_altmap_free(altmap, nr_pages); + return; + } /* bootmem page has reserved flag */ if (PageReserved(page)) { @@ -1017,13 +1024,19 @@ int __ref arch_remove_memory(u64 start, u64 size) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; + struct page *page = pfn_to_page(start_pfn); + struct vmem_altmap *altmap; struct zone *zone; int ret; - zone = page_zone(pfn_to_page(start_pfn)); - kernel_physical_mapping_remove(start, start + size); + /* With altmap the first mapped page is offset from @start */ + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + page += vmem_altmap_offset(altmap); + zone = page_zone(page); ret = __remove_pages(zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + kernel_physical_mapping_remove(start, start + size); return ret; } @@ -1235,7 +1248,7 @@ static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; static int __meminit vmemmap_populate_hugepages(unsigned long start, - unsigned long end, int node) + unsigned long end, int node, struct vmem_altmap *altmap) { unsigned long addr; unsigned long next; @@ -1258,7 +1271,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, if (pmd_none(*pmd)) { void *p; - p = vmemmap_alloc_block_buf(PMD_SIZE, node); + p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); if (p) { pte_t entry; @@ -1279,7 +1292,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, addr_end = addr + PMD_SIZE; p_end = p + PMD_SIZE; continue; - } + } else if (altmap) + return -ENOMEM; /* no fallback */ } else if (pmd_large(*pmd)) { vmemmap_verify((pte_t *)pmd, node, addr, next); continue; @@ -1293,11 +1307,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) { + struct vmem_altmap *altmap = to_vmem_altmap(start); int err; if (cpu_has_pse) - err = vmemmap_populate_hugepages(start, end, node); - else + err = vmemmap_populate_hugepages(start, end, node, altmap); + else if (altmap) { + pr_err_once("%s: no cpu support for altmap allocations\n", + __func__); + err = -ENOMEM; + } else err = vmemmap_populate_basepages(start, end, node); if (!err) sync_global_pgds(start, end - 1, 0); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 904629b97c4f..be3f8547b702 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -178,7 +178,8 @@ static struct pmem_device *pmem_alloc(struct device *dev, pmem->pfn_flags = PFN_DEV; if (pmem_should_map_pages(dev)) { - pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res); + pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res, + NULL); pmem->pfn_flags |= PFN_MAP; } else pmem->virt_addr = (void __pmem *) devm_memremap(dev, @@ -387,7 +388,8 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) /* establish pfn range for lookup, and switch to direct map */ pmem = dev_get_drvdata(dev); devm_memunmap(dev, (void __force *) pmem->virt_addr); - pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res); + pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res, + NULL); pmem->pfn_flags |= PFN_MAP; if (IS_ERR(pmem->virt_addr)) { rc = PTR_ERR(pmem->virt_addr); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2ea574ff9714..43405992d027 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -275,7 +275,8 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); -extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); +extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index d90721c178bb..aa3e82a80d7b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -4,21 +4,53 @@ struct resource; struct device; + +/** + * struct vmem_altmap - pre-allocated storage for vmemmap_populate + * @base_pfn: base of the entire dev_pagemap mapping + * @reserve: pages mapped, but reserved for driver use (relative to @base) + * @free: free pages set aside in the mapping for memmap storage + * @align: pages reserved to meet allocation alignments + * @alloc: track pages consumed, private to vmemmap_populate() + */ +struct vmem_altmap { + const unsigned long base_pfn; + const unsigned long reserve; + unsigned long free; + unsigned long align; + unsigned long alloc; +}; + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); + +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE) +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start); +#else +static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + return NULL; +} +#endif + /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings + * @altmap: pre-allocated/reserved memory for vmemmap allocations * @dev: host device of the mapping for debug */ struct dev_pagemap { - /* TODO: vmem_altmap and percpu_ref count */ + struct vmem_altmap *altmap; + const struct resource *res; struct device *dev; }; #ifdef CONFIG_ZONE_DEVICE -void *devm_memremap_pages(struct device *dev, struct resource *res); +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct vmem_altmap *altmap); struct dev_pagemap *find_dev_pagemap(resource_size_t phys); #else static inline void *devm_memremap_pages(struct device *dev, - struct resource *res) + struct resource *res, struct vmem_altmap *altmap) { /* * Fail attempts to call devm_memremap_pages() without @@ -34,5 +66,4 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) return NULL; } #endif - #endif /* _LINUX_MEMREMAP_H_ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index d9fe12d45c21..8bb0907a3603 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2217,7 +2217,14 @@ pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); void *vmemmap_alloc_block(unsigned long size, int node); -void *vmemmap_alloc_block_buf(unsigned long size, int node); +struct vmem_altmap; +void *__vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap); +static inline void *vmemmap_alloc_block_buf(unsigned long size, int node) +{ + return __vmemmap_alloc_block_buf(size, node, NULL); +} + void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node); diff --git a/kernel/memremap.c b/kernel/memremap.c index 61cfbf4d3054..562f6471fe90 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -166,6 +166,7 @@ struct page_map { struct resource res; struct percpu_ref *ref; struct dev_pagemap pgmap; + struct vmem_altmap altmap; }; static void pgmap_radix_release(struct resource *res) @@ -183,6 +184,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) struct page_map *page_map = data; struct resource *res = &page_map->res; resource_size_t align_start, align_size; + struct dev_pagemap *pgmap = &page_map->pgmap; pgmap_radix_release(res); @@ -190,6 +192,8 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, + "%s: failed to free all reserved pages\n", __func__); } /* assumes rcu_read_lock() held at entry */ @@ -203,11 +207,23 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) return page_map ? &page_map->pgmap : NULL; } -void *devm_memremap_pages(struct device *dev, struct resource *res) +/** + * devm_memremap_pages - remap and provide memmap backing for the given resource + * @dev: hosting device for @res + * @res: "host memory" address range + * @altmap: optional descriptor for allocating the memmap from @res + * + * Note, the expectation is that @res is a host memory range that could + * feasibly be treated as a "System RAM" range, i.e. not a device mmio + * range, but this is not enforced. + */ +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct vmem_altmap *altmap) { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); resource_size_t key, align_start, align_size; + struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid; @@ -220,14 +236,27 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) if (is_ram == REGION_INTERSECTS) return __va(res->start); + if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { + dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", + __func__); + return ERR_PTR(-ENXIO); + } + page_map = devres_alloc_node(devm_memremap_pages_release, sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) return ERR_PTR(-ENOMEM); + pgmap = &page_map->pgmap; memcpy(&page_map->res, res, sizeof(*res)); - page_map->pgmap.dev = dev; + pgmap->dev = dev; + if (altmap) { + memcpy(&page_map->altmap, altmap, sizeof(*altmap)); + pgmap->altmap = &page_map->altmap; + } + pgmap->res = &page_map->res; + mutex_lock(&pgmap_lock); error = 0; for (key = res->start; key <= res->end; key += SECTION_SIZE) { @@ -273,4 +302,43 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + return altmap->reserve + altmap->free; +} + +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + /* + * 'memmap_start' is the virtual address for the first "struct + * page" in this range of the vmemmap array. In the case of + * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * pointer arithmetic, so we can perform this to_vmem_altmap() + * conversion without concern for the initialization state of + * the struct page fields. + */ + struct page *page = (struct page *) memmap_start; + struct dev_pagemap *pgmap; + + /* + * Uncoditionally retrieve a dev_pagemap associated with the + * given physical address, this is only for use in the + * arch_{add|remove}_memory() for setting up and tearing down + * the memmap. + */ + rcu_read_lock(); + pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); + rcu_read_unlock(); + + return pgmap ? pgmap->altmap : NULL; +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 92f95952692b..4af58a3a8ffa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -506,10 +507,25 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap; + /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); + altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); + if (altmap) { + /* + * Validate altmap is within bounds of the total request + */ + if (altmap->base_pfn != phys_start_pfn + || vmem_altmap_offset(altmap) > nr_pages) { + pr_warn_once("memory add fail, invalid altmap\n"); + return -EINVAL; + } + altmap->alloc = 0; + } + for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, zone, section_nr_to_pfn(i)); @@ -731,7 +747,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static int __remove_section(struct zone *zone, struct mem_section *ms) +static int __remove_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset) { unsigned long start_pfn; int scn_nr; @@ -748,7 +765,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) start_pfn = section_nr_to_pfn(scn_nr); __remove_zone(zone, start_pfn); - sparse_remove_one_section(zone, ms); + sparse_remove_one_section(zone, ms, map_offset); return 0; } @@ -767,9 +784,32 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { unsigned long i; - int sections_to_remove; - resource_size_t start, size; - int ret = 0; + unsigned long map_offset = 0; + int sections_to_remove, ret = 0; + + /* In the ZONE_DEVICE case device driver owns the memory region */ + if (is_dev_zone(zone)) { + struct page *page = pfn_to_page(phys_start_pfn); + struct vmem_altmap *altmap; + + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + map_offset = vmem_altmap_offset(altmap); + } else { + resource_size_t start, size; + + start = phys_start_pfn << PAGE_SHIFT; + size = nr_pages * PAGE_SIZE; + + ret = release_mem_region_adjustable(&iomem_resource, start, + size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } + } /* * We can only remove entire sections @@ -777,23 +817,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); - start = phys_start_pfn << PAGE_SHIFT; - size = nr_pages * PAGE_SIZE; - - /* in the ZONE_DEVICE case device driver owns the memory region */ - if (!is_dev_zone(zone)) - ret = release_mem_region_adjustable(&iomem_resource, start, size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } - sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - ret = __remove_section(zone, __pfn_to_section(pfn)); + + ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); + map_offset = 0; if (ret) break; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 25409714160e..7ca3fe6ef92d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -4485,8 +4486,9 @@ static inline unsigned long wait_table_bits(unsigned long size) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - pg_data_t *pgdat = NODE_DATA(nid); + struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); unsigned long end_pfn = start_pfn + size; + pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; struct zone *z; unsigned long nr_initialised = 0; @@ -4494,6 +4496,13 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; + /* + * Honor reservation requested by the driver for this ZONE_DEVICE + * memory + */ + if (altmap && start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + z = &pgdat->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 4cba9c2783a1..b60802b3e5ea 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) } /* need to make sure size is all the same during early stage */ -void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) +static void * __meminit alloc_block_buf(unsigned long size, int node) { void *ptr; @@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) return ptr; } +static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap) +{ + return altmap->base_pfn + altmap->reserve + altmap->alloc + + altmap->align; +} + +static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap) +{ + unsigned long allocated = altmap->alloc + altmap->align; + + if (altmap->free > allocated) + return altmap->free - allocated; + return 0; +} + +/** + * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation + * @altmap - reserved page pool for the allocation + * @nr_pfns - size (in pages) of the allocation + * + * Allocations are aligned to the size of the request + */ +static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + unsigned long pfn = vmem_altmap_next_pfn(altmap); + unsigned long nr_align; + + nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG); + nr_align = ALIGN(pfn, nr_align) - pfn; + + if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap)) + return ULONG_MAX; + altmap->alloc += nr_pfns; + altmap->align += nr_align; + return pfn + nr_align; +} + +static void * __meminit altmap_alloc_block_buf(unsigned long size, + struct vmem_altmap *altmap) +{ + unsigned long pfn, nr_pfns; + void *ptr; + + if (size & ~PAGE_MASK) { + pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n", + __func__, size); + return NULL; + } + + nr_pfns = size >> PAGE_SHIFT; + pfn = vmem_altmap_alloc(altmap, nr_pfns); + if (pfn < ULONG_MAX) + ptr = __va(__pfn_to_phys(pfn)); + else + ptr = NULL; + pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n", + __func__, pfn, altmap->alloc, altmap->align, nr_pfns); + + return ptr; +} + +/* need to make sure size is all the same during early stage */ +void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap) +{ + if (altmap) + return altmap_alloc_block_buf(size, altmap); + return alloc_block_buf(size, node); +} + void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { @@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { pte_t entry; - void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); + void *p = alloc_block_buf(PAGE_SIZE, node); if (!p) return NULL; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); diff --git a/mm/sparse.c b/mm/sparse.c index d1b48b691ac8..3717ceed4177 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) if (!memmap) return; - for (i = 0; i < PAGES_PER_SECTION; i++) { + for (i = 0; i < nr_pages; i++) { if (PageHWPoison(&memmap[i])) { atomic_long_sub(1, &num_poisoned_pages); ClearPageHWPoison(&memmap[i]); @@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) free_map_bootmem(memmap); } -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) +void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset) { struct page *memmap = NULL; unsigned long *usemap = NULL, flags; @@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) } pgdat_resize_unlock(pgdat, &flags); - clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); + clear_hwpoisoned_pages(memmap + map_offset, + PAGES_PER_SECTION - map_offset); free_section_usemap(memmap, usemap); } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.3-14-g43fede From 5c2c2587b13235bf8b5c9027589f22eff68bdf49 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:49 -0800 Subject: mm, dax, pmem: introduce {get|put}_dev_pagemap() for dax-gup get_dev_page() enables paths like get_user_pages() to pin a dynamically mapped pfn-range (devm_memremap_pages()) while the resulting struct page objects are in use. Unlike get_page() it may fail if the device is, or is in the process of being, disabled. While the initial lookup of the range may be an expensive list walk, the result is cached to speed up subsequent lookups which are likely to be in the same mapped range. devm_memremap_pages() now requires a reference counter to be specified at init time. For pmem this means moving request_queue allocation into pmem_alloc() so the existing queue usage counter can track "device pages". ZONE_DEVICE pages always have an elevated count and will never be on an lru reclaim list. That space in 'struct page' can be redirected for other uses, but for safety introduce a poison value that will always trip __list_add() to assert. This allows half of the struct list_head storage to be reclaimed with some assurance to back up the assumption that the page count never goes to zero and a list_add() is never attempted. Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Dave Hansen Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/nvdimm/pmem.c | 6 ++++-- include/linux/list.h | 11 ++++++++++ include/linux/memremap.h | 49 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/mm_types.h | 5 +++++ kernel/memremap.c | 53 ++++++++++++++++++++++++++++++++++++++++++++---- lib/list_debug.c | 9 ++++++++ 6 files changed, 125 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 328173d7e1ac..7edf31671dab 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -184,7 +184,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, pmem->pfn_flags = PFN_DEV; if (pmem_should_map_pages(dev)) { pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res, - NULL); + &q->q_usage_counter, NULL); pmem->pfn_flags |= PFN_MAP; } else pmem->virt_addr = (void __pmem *) devm_memremap(dev, @@ -365,6 +365,7 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) struct vmem_altmap *altmap; struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; + struct request_queue *q; phys_addr_t offset; int rc; struct vmem_altmap __altmap = { @@ -406,9 +407,10 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) /* establish pfn range for lookup, and switch to direct map */ pmem = dev_get_drvdata(dev); + q = pmem->pmem_queue; devm_memunmap(dev, (void __force *) pmem->virt_addr); pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res, - altmap); + &q->q_usage_counter, altmap); pmem->pfn_flags |= PFN_MAP; if (IS_ERR(pmem->virt_addr)) { rc = PTR_ERR(pmem->virt_addr); diff --git a/include/linux/list.h b/include/linux/list.h index 5356f4d661a7..30cf4200ab40 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -113,6 +113,17 @@ extern void __list_del_entry(struct list_head *entry); extern void list_del(struct list_head *entry); #endif +#ifdef CONFIG_DEBUG_LIST +/* + * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one + * of the pages it allocates is ever passed to list_add() + */ +extern void list_force_poison(struct list_head *entry); +#else +/* fallback to the less strict LIST_POISON* definitions */ +#define list_force_poison list_del +#endif + /** * list_replace - replace old entry by new one * @old : the element to be replaced diff --git a/include/linux/memremap.h b/include/linux/memremap.h index aa3e82a80d7b..bcaa634139a9 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -1,6 +1,8 @@ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ #include +#include +#include struct resource; struct device; @@ -36,21 +38,25 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations + * @res: physical address range covered by @ref + * @ref: reference count that pins the devm_memremap_pages() mapping * @dev: host device of the mapping for debug */ struct dev_pagemap { struct vmem_altmap *altmap; const struct resource *res; + struct percpu_ref *ref; struct device *dev; }; #ifdef CONFIG_ZONE_DEVICE void *devm_memremap_pages(struct device *dev, struct resource *res, - struct vmem_altmap *altmap); + struct percpu_ref *ref, struct vmem_altmap *altmap); struct dev_pagemap *find_dev_pagemap(resource_size_t phys); #else static inline void *devm_memremap_pages(struct device *dev, - struct resource *res, struct vmem_altmap *altmap) + struct resource *res, struct percpu_ref *ref, + struct vmem_altmap *altmap) { /* * Fail attempts to call devm_memremap_pages() without @@ -66,4 +72,43 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) return NULL; } #endif + +/** + * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn + * @pfn: page frame number to lookup page_map + * @pgmap: optional known pgmap that already has a reference + * + * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the + * same mapping. + */ +static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, + struct dev_pagemap *pgmap) +{ + const struct resource *res = pgmap ? pgmap->res : NULL; + resource_size_t phys = PFN_PHYS(pfn); + + /* + * In the cached case we're already holding a live reference so + * we can simply do a blind increment + */ + if (res && phys >= res->start && phys <= res->end) { + percpu_ref_get(pgmap->ref); + return pgmap; + } + + /* fall back to slow path lookup */ + rcu_read_lock(); + pgmap = find_dev_pagemap(phys); + if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + pgmap = NULL; + rcu_read_unlock(); + + return pgmap; +} + +static inline void put_dev_pagemap(struct dev_pagemap *pgmap) +{ + if (pgmap) + percpu_ref_put(pgmap->ref); +} #endif /* _LINUX_MEMREMAP_H_ */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2dd9c313a8c0..d3ebb9d21a53 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -116,6 +116,11 @@ struct page { * Can be used as a generic list * by the page owner. */ + struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an + * lru or handled by a slab + * allocator, this points to the + * hosting device page map. + */ struct { /* slub per cpu partial pages */ struct page *next; /* Next partial slab */ #ifdef CONFIG_64BIT diff --git a/kernel/memremap.c b/kernel/memremap.c index 562f6471fe90..3eb8944265d5 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -179,6 +179,29 @@ static void pgmap_radix_release(struct resource *res) mutex_unlock(&pgmap_lock); } +static unsigned long pfn_first(struct page_map *page_map) +{ + struct dev_pagemap *pgmap = &page_map->pgmap; + const struct resource *res = &page_map->res; + struct vmem_altmap *altmap = pgmap->altmap; + unsigned long pfn; + + pfn = res->start >> PAGE_SHIFT; + if (altmap) + pfn += vmem_altmap_offset(altmap); + return pfn; +} + +static unsigned long pfn_end(struct page_map *page_map) +{ + const struct resource *res = &page_map->res; + + return (res->start + resource_size(res)) >> PAGE_SHIFT; +} + +#define for_each_device_pfn(pfn, map) \ + for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) + static void devm_memremap_pages_release(struct device *dev, void *data) { struct page_map *page_map = data; @@ -186,6 +209,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data) resource_size_t align_start, align_size; struct dev_pagemap *pgmap = &page_map->pgmap; + if (percpu_ref_tryget_live(pgmap->ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(pgmap->ref); + } + pgmap_radix_release(res); /* pages are dead and unused, undo the arch mapping */ @@ -211,20 +239,26 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res * @res: "host memory" address range + * @ref: a live per-cpu reference count * @altmap: optional descriptor for allocating the memmap from @res * - * Note, the expectation is that @res is a host memory range that could - * feasibly be treated as a "System RAM" range, i.e. not a device mmio - * range, but this is not enforced. + * Notes: + * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time + * (or devm release event). + * + * 2/ @res is expected to be a host memory range that could feasibly be + * treated as a "System RAM" range, i.e. not a device mmio range, but + * this is not enforced. */ void *devm_memremap_pages(struct device *dev, struct resource *res, - struct vmem_altmap *altmap) + struct percpu_ref *ref, struct vmem_altmap *altmap) { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); resource_size_t key, align_start, align_size; struct dev_pagemap *pgmap; struct page_map *page_map; + unsigned long pfn; int error, nid; if (is_ram == REGION_MIXED) { @@ -242,6 +276,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return ERR_PTR(-ENXIO); } + if (!ref) + return ERR_PTR(-EINVAL); + page_map = devres_alloc_node(devm_memremap_pages_release, sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) @@ -255,6 +292,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, memcpy(&page_map->altmap, altmap, sizeof(*altmap)); pgmap->altmap = &page_map->altmap; } + pgmap->ref = ref; pgmap->res = &page_map->res; mutex_lock(&pgmap_lock); @@ -292,6 +330,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_add_memory; + for_each_device_pfn(pfn, page_map) { + struct page *page = pfn_to_page(pfn); + + /* ZONE_DEVICE pages must never appear on a slab lru */ + list_force_poison(&page->lru); + page->pgmap = pgmap; + } devres_add(dev, page_map); return __va(res->start); diff --git a/lib/list_debug.c b/lib/list_debug.c index 3859bf63561c..3345a089ef7b 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -12,6 +12,13 @@ #include #include +static struct list_head force_poison; +void list_force_poison(struct list_head *entry) +{ + entry->next = &force_poison; + entry->prev = &force_poison; +} + /* * Insert a new entry between two known consecutive entries. * @@ -23,6 +30,8 @@ void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { + WARN(new->next == &force_poison || new->prev == &force_poison, + "list_add attempted on force-poisoned entry\n"); WARN(next->prev != prev, "list_add corruption. next->prev should be " "prev (%p), but was %p. (next=%p).\n", -- cgit v1.3-14-g43fede From 3565fce3a6597e91b8dee3e8e36ebf70f8b7ef9b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:55 -0800 Subject: mm, x86: get_user_pages() for dax mappings A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver has established a devm_memremap_pages() mapping, i.e. when the pfn_t return from ->direct_access() has PFN_DEV and PFN_MAP set. Later, when encountering _PAGE_DEVMAP during a page table walk we lookup and pin a struct dev_pagemap instance to keep the result of pfn_to_page() valid until put_page(). Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Dave Hansen Cc: Mel Gorman Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 7 ++++ arch/x86/mm/gup.c | 57 ++++++++++++++++++++++++++++++-- include/linux/huge_mm.h | 10 +++++- include/linux/mm.h | 59 +++++++++++++++++++++++---------- kernel/memremap.c | 12 +++++++ mm/gup.c | 30 +++++++++++++++-- mm/huge_memory.c | 75 +++++++++++++++++++++++++++++++++--------- mm/swap.c | 1 + 8 files changed, 212 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6a0ad82c8d0f..0687c4748b8f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -479,6 +479,13 @@ static inline int pte_present(pte_t a) return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } +#ifdef __HAVE_ARCH_PTE_DEVMAP +static inline int pte_devmap(pte_t a) +{ + return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP; +} +#endif + #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index f8cb3e8ac250..6d5eb5900372 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -63,6 +64,16 @@ retry: #endif } +static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +{ + while ((*nr) - nr_start) { + struct page *page = pages[--(*nr)]; + + ClearPageReferenced(page); + put_page(page); + } +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -71,7 +82,9 @@ retry: static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { + struct dev_pagemap *pgmap = NULL; unsigned long mask; + int nr_start = *nr; pte_t *ptep; mask = _PAGE_PRESENT|_PAGE_USER; @@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, return 0; } - if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { + page = pte_page(pte); + if (pte_devmap(pte)) { + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + pte_unmap(ptep); + return 0; + } + } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; } VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); get_page(page); + put_dev_pagemap(pgmap); SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + int nr_start = *nr; + unsigned long pfn = pmd_pfn(pmd); + struct dev_pagemap *pgmap = NULL; + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + do { + struct page *page = pfn_to_page(pfn); + + pgmap = get_dev_pagemap(pfn, pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + SetPageReferenced(page); + pages[*nr] = page; + get_page(page); + put_dev_pagemap(pgmap); + (*nr)++; + pfn++; + } while (addr += PAGE_SIZE, addr != end); + return 1; +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, mask |= _PAGE_RW; if ((pmd_flags(pmd) & mask) != mask) return 0; + + VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); + if (pmd_devmap(pmd)) + return __gup_device_huge_pmd(pmd, addr, end, pages, nr); + /* hugepages are never "special" */ VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); refs = 0; head = pmd_page(pmd); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d39fa60bd6bf..cfe81e10bd54 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -38,7 +38,6 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, int prot_numa); int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, pfn_t pfn, bool write); - enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, @@ -55,6 +54,9 @@ enum transparent_hugepage_flag { #define HPAGE_PMD_NR (1< #include #include +#include #include #include #include @@ -465,17 +466,6 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } -static inline void get_page(struct page *page) -{ - page = compound_head(page); - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. - */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); -} - static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); @@ -494,13 +484,6 @@ static inline void init_page_count(struct page *page) void __put_page(struct page *page); -static inline void put_page(struct page *page) -{ - page = compound_head(page); - if (put_page_testzero(page)) - __put_page(page); -} - void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); @@ -682,17 +665,50 @@ static inline enum zone_type page_zonenum(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE +void get_zone_device_page(struct page *page); +void put_zone_device_page(struct page *page); static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } #else +static inline void get_zone_device_page(struct page *page) +{ +} +static inline void put_zone_device_page(struct page *page) +{ +} static inline bool is_zone_device_page(const struct page *page) { return false; } #endif +static inline void get_page(struct page *page) +{ + page = compound_head(page); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. + */ + VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); + atomic_inc(&page->_count); + + if (unlikely(is_zone_device_page(page))) + get_zone_device_page(page); +} + +static inline void put_page(struct page *page) +{ + page = compound_head(page); + + if (put_page_testzero(page)) + __put_page(page); + + if (unlikely(is_zone_device_page(page))) + put_zone_device_page(page); +} + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif @@ -1444,6 +1460,13 @@ static inline void sync_mm_rss(struct mm_struct *mm) } #endif +#ifndef __HAVE_ARCH_PTE_DEVMAP +static inline int pte_devmap(pte_t pte) +{ + return 0; +} +#endif + int vma_wants_writenotify(struct vm_area_struct *vma); extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, diff --git a/kernel/memremap.c b/kernel/memremap.c index 3eb8944265d5..e517a16cb426 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -169,6 +169,18 @@ struct page_map { struct vmem_altmap altmap; }; +void get_zone_device_page(struct page *page) +{ + percpu_ref_get(page->pgmap->ref); +} +EXPORT_SYMBOL(get_zone_device_page); + +void put_zone_device_page(struct page *page) +{ + put_dev_pagemap(page->pgmap); +} +EXPORT_SYMBOL(put_zone_device_page); + static void pgmap_radix_release(struct resource *res) { resource_size_t key; diff --git a/mm/gup.c b/mm/gup.c index e95b0cb6ed81..aa21c4b865a5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -98,7 +100,17 @@ retry: } page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { + if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { + /* + * Only return device mapping pages in the FOLL_GET case since + * they are only valid while holding the pgmap reference. + */ + pgmap = get_dev_pagemap(pte_pfn(pte), NULL); + if (pgmap) + page = pte_page(pte); + else + goto no_page; + } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); @@ -129,8 +141,15 @@ retry: goto retry; } - if (flags & FOLL_GET) + if (flags & FOLL_GET) { get_page(page); + + /* drop the pgmap reference now that we hold the page */ + if (pgmap) { + put_dev_pagemap(pgmap); + pgmap = NULL; + } + } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -237,6 +256,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, } if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); + if (pmd_devmap(*pmd)) { + ptl = pmd_lock(mm, pmd); + page = follow_devmap_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + if (page) + return page; + } if (likely(!pmd_trans_huge(*pmd))) return follow_page_pte(vma, address, pmd, flags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 82bed2bec3ed..b2db98136af9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -974,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_NOPAGE; } +static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + pmd_t _pmd; + + /* + * We should set the dirty bit only for FOLL_WRITE but for now + * the dirty bit in the pmd is meaningless. And if the dirty + * bit will become meaningful and we'll only set it with + * FOLL_WRITE, an atomic set_bit will be required on the pmd to + * set the young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); +} + +struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, int flags) +{ + unsigned long pfn = pmd_pfn(*pmd); + struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap; + struct page *page; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + return NULL; + + if (pmd_present(*pmd) && pmd_devmap(*pmd)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + */ + if (!(flags & FOLL_GET)) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + get_page(page); + put_dev_pagemap(pgmap); + + return page; +} + int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) @@ -1331,21 +1389,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page), page); - if (flags & FOLL_TOUCH) { - pmd_t _pmd; - /* - * We should set the dirty bit only for FOLL_WRITE but - * for now the dirty bit in the pmd is meaningless. - * And if the dirty bit will become meaningful and - * we'll only set it with FOLL_WRITE, an atomic - * set_bit will be required on the pmd to set the - * young bit, instead of the current set_pmd_at. - */ - _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, 1)) - update_mmu_cache_pmd(vma, addr, pmd); - } + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * We don't mlock() pte-mapped THPs. This way we can avoid diff --git a/mm/swap.c b/mm/swap.c index 674e2c93da4e..09fe5e97714a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include -- cgit v1.3-14-g43fede From 4a9e1cda274893eca7d178d7dc265503ccb9d87a Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 15 Jan 2016 16:57:04 -0800 Subject: mm: bring in additional flag for fixup_user_fault to signal unlock During Jason's work with postcopy migration support for s390 a problem regarding gmap faults was discovered. The gmap code will call fixup_user_fault which will end up always in handle_mm_fault. Till now we never cared about retries, but as the userfaultfd code kind of relies on it. this needs some fix. This patchset does not take care of the futex code. I will now look closer at this. This patch (of 2): With the introduction of userfaultfd, kvm on s390 needs fixup_user_fault to pass in FAULT_FLAG_ALLOW_RETRY and give feedback if during the faulting we ever unlocked mmap_sem. This patch brings in the logic to handle retries as well as it cleans up the current documentation. fixup_user_fault was not having the same semantics as filemap_fault. It never indicated if a retry happened and so a caller wasn't able to handle that case. So we now changed the behaviour to always retry a locked mmap_sem. Signed-off-by: Dominik Dingel Reviewed-by: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Christian Borntraeger Cc: "Jason J. Herne" Cc: David Rientjes Cc: Eric B Munson Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Heiko Carstens Cc: Dominik Dingel Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/pgtable.c | 8 +++++--- include/linux/mm.h | 5 +++-- kernel/futex.c | 2 +- mm/gup.c | 30 +++++++++++++++++++++++++----- 4 files changed, 34 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4e54492f463a..84bddda8d412 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -585,7 +585,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr, rc = vmaddr; goto out_up; } - if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) { + if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, NULL)) { rc = -EFAULT; goto out_up; } @@ -727,7 +727,8 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) break; } /* Get the page mapped */ - if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) { + if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, + NULL)) { rc = -EFAULT; break; } @@ -802,7 +803,8 @@ retry: if (!(pte_val(*ptep) & _PAGE_INVALID) && (pte_val(*ptep) & _PAGE_PROTECT)) { pte_unmap_unlock(ptep, ptl); - if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) { + if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE, + NULL)) { up_read(&mm->mmap_sem); return -EFAULT; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 792f2469c142..1d6ec55d8b25 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1194,7 +1194,8 @@ int invalidate_inode_page(struct page *page); extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags); + unsigned long address, unsigned int fault_flags, + bool *unlocked); #else static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1206,7 +1207,7 @@ static inline int handle_mm_fault(struct mm_struct *mm, } static inline int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, - unsigned int fault_flags) + unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ BUG(); diff --git a/kernel/futex.c b/kernel/futex.c index eed92a8a4c49..c6f514573b28 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -604,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) down_read(&mm->mmap_sem); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, - FAULT_FLAG_WRITE); + FAULT_FLAG_WRITE, NULL); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; diff --git a/mm/gup.c b/mm/gup.c index aa21c4b865a5..b64a36175884 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -618,6 +618,8 @@ EXPORT_SYMBOL(__get_user_pages); * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() + * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller + * does not allow retry * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() @@ -629,22 +631,28 @@ EXPORT_SYMBOL(__get_user_pages); * The main difference with get_user_pages() is that this function will * unconditionally call handle_mm_fault() which will in turn perform all the * necessary SW fixup of the dirty and young bits in the PTE, while - * handle_mm_fault() only guarantees to update these in the struct page. + * get_user_pages() only guarantees to update these in the struct page. * * This is important for some architectures where those bits also gate the * access permission to the page because they are maintained in software. On * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). + * This function will not return with an unlocked mmap_sem. So it has not the + * same semantics wrt the @mm->mmap_sem as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags) + unsigned long address, unsigned int fault_flags, + bool *unlocked) { struct vm_area_struct *vma; vm_flags_t vm_flags; - int ret; + int ret, major = 0; + + if (unlocked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; +retry: vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT; @@ -654,6 +662,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; ret = handle_mm_fault(mm, vma, address, fault_flags); + major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return -ENOMEM; @@ -663,8 +672,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; BUG(); } + + if (ret & VM_FAULT_RETRY) { + down_read(&mm->mmap_sem); + if (!(fault_flags & FAULT_FLAG_TRIED)) { + *unlocked = true; + fault_flags &= ~FAULT_FLAG_ALLOW_RETRY; + fault_flags |= FAULT_FLAG_TRIED; + goto retry; + } + } + if (tsk) { - if (ret & VM_FAULT_MAJOR) + if (major) tsk->maj_flt++; else tsk->min_flt++; -- cgit v1.3-14-g43fede From b493c34309cb4aebc44272897067ebf54cb07271 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Jan 2016 16:58:16 -0800 Subject: kernel/stop_machine.c: remove CONFIG_SMP dependencies stop_machine.o is only built if CONFIG_SMP=y, so this ifdef always evaluates to true. [akpm@linux-foundation.org: remove now-unneeded ifdef] Reported-by: Valentin Rothberg Cc: Chris Wilson Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index edb6de4f5908..a467e6c28a3b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -529,8 +529,6 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) - static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { @@ -628,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, mutex_unlock(&stop_cpus_mutex); return ret ?: done.ret; } - -#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -- cgit v1.3-14-g43fede From 81cc26f2bd11ba4421a17a2d5cebe4bba206c239 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 15 Jan 2016 16:58:21 -0800 Subject: printk: only unregister boot consoles when necessary Boot consoles are typically replaced by proper consoles during the boot process. This can be problematic if the boot console data is part of the init section that is reclaimed late during boot. If the proper console does not register before this point in time, the boot console will need to be removed (so that the freed memory is not accessed), leaving the system without output for some time. There are various reasons why the proper console may not register early enough, such as deferred probe or the driver being a loadable module. If that happens, there is some amount of time where no console messages are visible to the user, which in turn can mean that they won't see crashes or other potentially useful information. To avoid this situation, only remove the boot console when it resides in the init section. Code exists to replace the boot console by the proper console when it is registered, keeping a seamless transition between the boot and proper consoles. Signed-off-by: Thierry Reding Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2ce8826f1053..0c9f02506169 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -48,6 +48,7 @@ #include #include +#include #define CREATE_TRACE_POINTS #include @@ -2658,13 +2659,36 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +/* + * Some boot consoles access data that is in the init section and which will + * be discarded after the initcalls have been run. To make sure that no code + * will access this data, unregister the boot consoles in a late initcall. + * + * If for some reason, such as deferred probe or the driver being a loadable + * module, the real console hasn't registered yet at this point, there will + * be a brief interval in which no messages are logged to the console, which + * makes it difficult to diagnose problems that occur during this time. + * + * To mitigate this problem somewhat, only unregister consoles whose memory + * intersects with the init section. Note that code exists elsewhere to get + * rid of the boot console as soon as the proper console shows up, so there + * won't be side-effects from postponing the removal. + */ static int __init printk_late_init(void) { struct console *con; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { - unregister_console(con); + /* + * Make sure to unregister boot consoles whose data + * resides in the init section before the init section + * is discarded. Boot consoles whose data will stick + * around will automatically be unregistered when the + * proper console replaces them. + */ + if (init_section_intersects(con, sizeof(*con))) + unregister_console(con); } } hotcpu_notifier(console_cpu_notify, 0); -- cgit v1.3-14-g43fede From 8d91f8b15361dfb438ab6eb3b319e2ded43458ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 15 Jan 2016 16:58:24 -0800 Subject: printk: do cond_resched() between lines while outputting to consoles @console_may_schedule tracks whether console_sem was acquired through lock or trylock. If the former, we're inside a sleepable context and console_conditional_schedule() performs cond_resched(). This allows console drivers which use console_lock for synchronization to yield while performing time-consuming operations such as scrolling. However, the actual console outputting is performed while holding irq-safe logbuf_lock, so console_unlock() clears @console_may_schedule before starting outputting lines. Also, only a few drivers call console_conditional_schedule() to begin with. This means that when a lot of lines need to be output by console_unlock(), for example on a console registration, the task doing console_unlock() may not yield for a long time on a non-preemptible kernel. If this happens with a slow console devices, for example a serial console, the outputting task may occupy the cpu for a very long time. Long enough to trigger softlockup and/or RCU stall warnings, which in turn pile more messages, sometimes enough to trigger the next cycle of warnings incapacitating the system. Fix it by making console_unlock() insert cond_resched() between lines if @console_may_schedule. Signed-off-by: Tejun Heo Reported-by: Calvin Owens Acked-by: Jan Kara Cc: Dave Jones Cc: Kyle McMartin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/console.h | 1 + kernel/panic.c | 3 +-- kernel/printk/printk.c | 35 ++++++++++++++++++++++++++++++++++- 3 files changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/console.h b/include/linux/console.h index bd194343c346..ea731af2451e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -150,6 +150,7 @@ extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); +extern void console_flush_on_panic(void); extern struct tty_driver *console_device(int *); extern void console_stop(struct console *); extern void console_start(struct console *); diff --git a/kernel/panic.c b/kernel/panic.c index b333380c6bb2..d96469de72dc 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -180,8 +180,7 @@ void panic(const char *fmt, ...) * panic() is not being callled from OOPS. */ debug_locks_off(); - console_trylock(); - console_unlock(); + console_flush_on_panic(); if (!panic_blink) panic_blink = no_blink; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0c9f02506169..08934a395c1a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2234,13 +2234,24 @@ void console_unlock(void) static u64 seen_seq; unsigned long flags; bool wake_klogd = false; - bool retry; + bool do_cond_resched, retry; if (console_suspended) { up_console_sem(); return; } + /* + * Console drivers are called under logbuf_lock, so + * @console_may_schedule should be cleared before; however, we may + * end up dumping a lot of lines, for example, if called from + * console registration path, and should invoke cond_resched() + * between lines if allowable. Not doing so can cause a very long + * scheduling stall on a slow console leading to RCU stall and + * softlockup warnings which exacerbate the issue with more + * messages practically incapacitating the system. + */ + do_cond_resched = console_may_schedule; console_may_schedule = 0; /* flush buffered message fragment immediately to console */ @@ -2312,6 +2323,9 @@ skip: call_console_drivers(level, ext_text, ext_len, text, len); start_critical_timings(); local_irq_restore(flags); + + if (do_cond_resched) + cond_resched(); } console_locked = 0; @@ -2379,6 +2393,25 @@ void console_unblank(void) console_unlock(); } +/** + * console_flush_on_panic - flush console content on panic + * + * Immediately output all pending messages no matter what. + */ +void console_flush_on_panic(void) +{ + /* + * If someone else is holding the console lock, trylock will fail + * and may_schedule may be set. Ignore and proceed to unlock so + * that messages are flushed out. As this can be called from any + * context and we don't want to get preempted while flushing, + * ensure may_schedule is cleared. + */ + console_trylock(); + console_may_schedule = 0; + console_unlock(); +} + /* * Return the console tty driver structure and its associated index */ -- cgit v1.3-14-g43fede From 06b031de22d28ae76b2e5bfaf22c56a265a1e106 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 15 Jan 2016 16:59:23 -0800 Subject: printk: change recursion_bug type to bool `recursion_bug' is used as recursion_bug toggle, so make it `bool'. Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 08934a395c1a..e79439134978 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1661,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { - static int recursion_bug; + static bool recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len = 0; @@ -1697,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level, * it can be printed at the next appropriate moment: */ if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = 1; + recursion_bug = true; local_irq_restore(flags); return 0; } @@ -1712,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level, static const char recursion_msg[] = "BUG: recent printk recursion!"; - recursion_bug = 0; + recursion_bug = false; /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, NULL, 0, recursion_msg, -- cgit v1.3-14-g43fede From 203cbf77de59fc8f13502dcfd11350c6d4a5c95f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:46 +0000 Subject: hrtimer: Handle remaining time proper for TIME_LOW_RES If CONFIG_TIME_LOW_RES is enabled we add a jiffie to the relative timeout to prevent short sleeps, but we do not account for that in interfaces which retrieve the remaining time. Helge observed that timerfd can return a remaining time larger than the relative timeout. That's not expected and breaks userland test programs. Store the information that the timer was armed relative and provide functions to adjust the remaining time. To avoid bloating the hrtimer struct make state a u8, which as a bonus results in better code on x86 at least. Reported-and-tested-by: Helge Deller Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.273328486@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 34 +++++++++++++++++++++++++++--- kernel/time/hrtimer.c | 55 ++++++++++++++++++++++++++++++++---------------- kernel/time/timer_list.c | 2 +- 3 files changed, 69 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..2ead22dd74a0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -87,7 +87,8 @@ enum hrtimer_restart { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @start_pid: timer statistics field to store the pid of the task which + * @is_rel: Set if the timer was armed relative + * @start_pid: timer statistics field to store the pid of the task which * started the timer * @start_site: timer statistics field to store the site where the timer * was started @@ -101,7 +102,8 @@ struct hrtimer { ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; - unsigned long state; + u8 state; + u8 is_rel; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -321,6 +323,27 @@ static inline void clock_was_set_delayed(void) { } #endif +static inline ktime_t +__hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) +{ + ktime_t rem = ktime_sub(timer->node.expires, now); + + /* + * Adjust relative timers for the extra we added in + * hrtimer_start_range_ns() to prevent short timeouts. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) + rem.tv64 -= hrtimer_resolution; + return rem; +} + +static inline ktime_t +hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) +{ + return __hrtimer_expires_remaining_adjusted(timer, + timer->base->get_time()); +} + extern void clock_was_set(void); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); @@ -390,7 +413,12 @@ static inline void hrtimer_restart(struct hrtimer *timer) } /* Query timers: */ -extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); +extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); + +static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +{ + return __hrtimer_get_remaining(timer, false); +} extern u64 hrtimer_get_next_event(void); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..fa909f9fd559 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -897,10 +897,10 @@ static int enqueue_hrtimer(struct hrtimer *timer, */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) + u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - unsigned int state = timer->state; + u8 state = timer->state; timer->state = newstate; if (!(state & HRTIMER_STATE_ENQUEUED)) @@ -930,7 +930,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state = timer->state; + u8 state = timer->state; int reprogram; /* @@ -954,6 +954,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest return 0; } +static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ +#ifdef CONFIG_TIME_LOW_RES + /* + * CONFIG_TIME_LOW_RES indicates that the system has no way to return + * granular time values. For relative timers we add hrtimer_resolution + * (i.e. one jiffie) to prevent short timeouts. + */ + timer->is_rel = mode & HRTIMER_MODE_REL; + if (timer->is_rel) + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); +#endif + return tim; +} + /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU * @timer: the timer to be added @@ -974,19 +990,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Remove an active timer from the queue: */ remove_hrtimer(timer, base, true); - if (mode & HRTIMER_MODE_REL) { + if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); -#endif - } + + tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); @@ -1074,19 +1081,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read + * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); - rem = hrtimer_expires_remaining(timer); + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) + rem = hrtimer_expires_remaining_adjusted(timer); + else + rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); +EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** @@ -1219,6 +1230,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, timer_stats_account_hrtimer(timer); fn = timer->function; + /* + * Clear the 'is relative' flag for the TIME_LOW_RES case. If the + * timer is restarted with a period then it becomes an absolute + * timer. If its not restarted it does not matter. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES)) + timer->is_rel = false; + /* * Because we run timers from hardirq context, there is no chance * they get migrated to another cpu, therefore its safe to unlock diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index f75e35b60149..ba7d8b288bb3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); - SEQ_printf(m, ", S:%02lx", timer->state); + SEQ_printf(m, ", S:%02x", timer->state); #ifdef CONFIG_TIMER_STATS SEQ_printf(m, ", "); print_name_offset(m, timer->start_site); -- cgit v1.3-14-g43fede From 572c39172684c3711e4a03c9a7380067e2b0661c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:47 +0000 Subject: posix-timers: Handle relative timers with CONFIG_TIME_LOW_RES proper As Helge reported for timerfd we have the same issue in posix timers. We return remaining time larger than the programmed relative time to user space in case of CONFIG_TIME_LOW_RES=y. Use the proper function to adjust the extra time added in hrtimer_start_range_ns(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Helge Deller Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.450510905@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31d11ac9fa47..f2826c35e918 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - remaining = ktime_sub(hrtimer_get_expires(timer), now); + remaining = __hrtimer_expires_remaining_adjusted(timer, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining.tv64 <= 0) { /* -- cgit v1.3-14-g43fede From 51cbb5242a41700a3f250ecfb48dcfb7e4375ea4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:48 +0000 Subject: itimers: Handle relative timers with CONFIG_TIME_LOW_RES proper As Helge reported for timerfd we have the same issue in itimers. We return remaining time larger than the programmed relative time to user space in case of CONFIG_TIME_LOW_RES=y. Use the proper function to adjust the extra time added in hrtimer_start_range_ns(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Helge Deller Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.528222587@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/itimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8d262b467573..1d5c7204ddc9 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ */ static struct timeval itimer_get_remtime(struct hrtimer *timer) { - ktime_t rem = hrtimer_get_remaining(timer); + ktime_t rem = __hrtimer_get_remaining(timer, true); /* * Racy but safe: if the itimer expires after the above -- cgit v1.3-14-g43fede From 9c03ee147193645be4c186d3688232fa438c57c7 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Sat, 16 Jan 2016 00:31:23 +0530 Subject: sched: Fix crash in sched_init_numa() The following PowerPC commit: c118baf80256 ("arch/powerpc/mm/numa.c: do not allocate bootmem memory for non existing nodes") avoids allocating bootmem memory for non existent nodes. But when DEBUG_PER_CPU_MAPS=y is enabled, my powerNV system failed to boot because in sched_init_numa(), cpumask_or() operation was done on unallocated nodes. Fix that by making cpumask_or() operation only on existing nodes. [ Tested with and w/o DEBUG_PER_CPU_MAPS=y on x86 and PowerPC. ] Reported-by: Jan Stancek Tested-by: Jan Stancek Signed-off-by: Raghavendra K T Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1452884483-11676-1-git-send-email-raghavendra.kt@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44253adb3c36..474658b51fe6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6840,7 +6840,7 @@ static void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; - for (k = 0; k < nr_node_ids; k++) { + for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; -- cgit v1.3-14-g43fede From 51164251f5c35e6596130ef0de94ffe65fe441e0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 16 Jan 2016 00:54:53 +0100 Subject: sched / idle: Drop default_idle_call() fallback from call_cpuidle() After commit 9c4b2867ed7c (cpuidle: menu: Fix menu_select() for CPUIDLE_DRIVER_STATE_START == 0) it is clear that menu_select() cannot return negative values. Moreover, ladder_select_state() will never return a negative value too, so make find_deepest_state() return non-negative values too and drop the default_idle_call() fallback from call_cpuidle(). This eliminates one branch from the idle loop and makes the governors and find_deepest_state() handle the case when all states have been disabled from sysfs consistently. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar Tested-by: Sudeep Holla --- drivers/cpuidle/cpuidle.c | 6 +++--- kernel/sched/idle.c | 6 ------ 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 17a6dc0e2111..046423b0c5ca 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -79,9 +79,9 @@ static int find_deepest_state(struct cpuidle_driver *drv, bool freeze) { unsigned int latency_req = 0; - int i, ret = -ENXIO; + int i, ret = 0; - for (i = 0; i < drv->state_count; i++) { + for (i = 1; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; struct cpuidle_state_usage *su = &dev->states_usage[i]; @@ -243,7 +243,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * @drv: the cpuidle driver * @dev: the cpuidle device * - * Returns the index of the idle state. + * Returns the index of the idle state. The return value must not be negative. */ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..34852ee70d54 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -97,12 +97,6 @@ void default_idle_call(void) static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, int next_state) { - /* Fall back to the default arch idle method on errors. */ - if (next_state < 0) { - default_idle_call(); - return next_state; - } - /* * The idle task must be scheduled, it is pointless to go to idle, just * update no idle residency and return. -- cgit v1.3-14-g43fede From 759c01142a5d0f364a462346168a56de28a80f52 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 18 Jan 2016 16:36:09 +0100 Subject: pipe: limit the per-user amount of pages allocated in pipes On no-so-small systems, it is possible for a single process to cause an OOM condition by filling large pipes with data that are never read. A typical process filling 4000 pipes with 1 MB of data will use 4 GB of memory. On small systems it may be tricky to set the pipe max size to prevent this from happening. This patch makes it possible to enforce a per-user soft limit above which new pipes will be limited to a single page, effectively limiting them to 4 kB each, as well as a hard limit above which no new pipes may be created for this user. This has the effect of protecting the system against memory abuse without hurting other users, and still allowing pipes to work correctly though with less data at once. The limit are controlled by two new sysctls : pipe-user-pages-soft, and pipe-user-pages-hard. Both may be disabled by setting them to zero. The default soft limit allows the default number of FDs per process (1024) to create pipes of the default size (64kB), thus reaching a limit of 64MB before starting to create only smaller pipes. With 256 processes limited to 1024 FDs each, this results in 1024*64kB + (256*1024 - 1024) * 4kB = 1084 MB of memory allocated for a user. The hard limit is disabled by default to avoid breaking existing applications that make intensive use of pipes (eg: for splicing). Reported-by: socketpair@gmail.com Reported-by: Tetsuo Handa Mitigates: CVE-2013-4312 (Linux 2.0+) Suggested-by: Linus Torvalds Signed-off-by: Willy Tarreau Signed-off-by: Al Viro --- Documentation/sysctl/fs.txt | 23 ++++++++++++++++++++++ fs/pipe.c | 47 +++++++++++++++++++++++++++++++++++++++++++-- include/linux/pipe_fs_i.h | 4 ++++ include/linux/sched.h | 1 + kernel/sysctl.c | 14 ++++++++++++++ 5 files changed, 87 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 88152f214f48..302b5ed616a6 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/fs: - nr_open - overflowuid - overflowgid +- pipe-user-pages-hard +- pipe-user-pages-soft - protected_hardlinks - protected_symlinks - suid_dumpable @@ -159,6 +161,27 @@ The default is 65534. ============================================================== +pipe-user-pages-hard: + +Maximum total number of pages a non-privileged user may allocate for pipes. +Once this limit is reached, no new pipes may be allocated until usage goes +below the limit again. When set to 0, no limit is applied, which is the default +setting. + +============================================================== + +pipe-user-pages-soft: + +Maximum total number of pages a non-privileged user may allocate for pipes +before the pipe size gets limited to a single page. Once this limit is reached, +new pipes will be limited to a single page in size for this user in order to +limit total memory usage, and trying to increase them using fcntl() will be +denied until usage goes below the limit again. The default value allows to +allocate up to 1024 pipes at their default size. When set to 0, no limit is +applied. + +============================================================== + protected_hardlinks: A long-standing class of security issues is the hardlink-based diff --git a/fs/pipe.c b/fs/pipe.c index 42cf8ddf0e55..ab8dad3ccb6a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576; */ unsigned int pipe_min_size = PAGE_SIZE; +/* Maximum allocatable pages per user. Hard limit is unset by default, soft + * matches default values. + */ +unsigned long pipe_user_pages_hard; +unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } +static void account_pipe_buffers(struct pipe_inode_info *pipe, + unsigned long old, unsigned long new) +{ + atomic_long_add(new - old, &pipe->user->pipe_bufs); +} + +static bool too_many_pipe_buffers_soft(struct user_struct *user) +{ + return pipe_user_pages_soft && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; +} + +static bool too_many_pipe_buffers_hard(struct user_struct *user) +{ + return pipe_user_pages_hard && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; +} + struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); + + if (!too_many_pipe_buffers_hard(user)) { + if (too_many_pipe_buffers_soft(user)) + pipe_bufs = 1; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + } + if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = PIPE_DEF_BUFFERS; + pipe->buffers = pipe_bufs; + pipe->user = user; + account_pipe_buffers(pipe, 0, pipe_bufs); mutex_init(&pipe->mutex); return pipe; } + free_uid(user); kfree(pipe); } @@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; + account_pipe_buffers(pipe, pipe->buffers, 0); + free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) @@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } + account_pipe_buffers(pipe, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; @@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; + } else if ((too_many_pipe_buffers_hard(pipe->user) || + too_many_pipe_buffers_soft(pipe->user)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; } ret = pipe_set_size(pipe, nr_pages); break; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index eb8b8ac6df3c..24f5470d3944 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -42,6 +42,7 @@ struct pipe_buffer { * @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers + * @user: the user who created this pipe **/ struct pipe_inode_info { struct mutex mutex; @@ -57,6 +58,7 @@ struct pipe_inode_info { struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; + struct user_struct *user; }; /* @@ -123,6 +125,8 @@ void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); extern unsigned int pipe_max_size, pipe_min_size; +extern unsigned long pipe_user_pages_hard; +extern unsigned long pipe_user_pages_soft; int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); diff --git a/include/linux/sched.h b/include/linux/sched.h index 61aa9bbea871..1589ddc88e38 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -835,6 +835,7 @@ struct user_struct { #endif unsigned long locked_shm; /* How many pages of mlocked shm ? */ unsigned long unix_inflight; /* How many files in flight in unix sockets */ + atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ #ifdef CONFIG_KEYS struct key *uid_keyring; /* UID specific keyring */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c810f8afdb7f..f6fd236429bd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1757,6 +1757,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; -- cgit v1.3-14-g43fede From 7c3b00e06d731a28fc3d17ed02ba250642b15b81 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Jan 2016 14:59:55 -0800 Subject: ptrace: make wait_on_bit(JOBCTL_TRAPPING_BIT) in ptrace_attach() killable ptrace_attach() can hang waiting for STOPPED -> TRACED transition if the tracee gets frozen in between, change wait_on_bit() to use TASK_KILLABLE. This doesn't really solve the problem(s) and we probably need to fix the freezer. In particular, note that this means that pm freezer will fail if it races attach-to-stopped-task. And otoh perhaps we can just remove JOBCTL_TRAPPING_BIT altogether, it is not clear if we really need to hide this transition from debugger, WNOHANG after PTRACE_ATTACH can fail anyway if it races with SIGCONT. Signed-off-by: Oleg Nesterov Reported-by: Andrey Ryabinin Cc: Roland McGrath Acked-by: Tejun Heo Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b760bae64cf1..aa94aee9d4c9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -364,8 +364,14 @@ unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: if (!retval) { - wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, - TASK_UNINTERRUPTIBLE); + /* + * We do not bother to change retval or clear JOBCTL_TRAPPING + * if wait_on_bit() was interrupted by SIGKILL. The tracer will + * not return to user-mode, it will exit and clear this bit in + * __ptrace_unlink() if it wasn't already cleared by the tracee; + * and until then nobody can ptrace this task. + */ + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); proc_ptrace_connector(task, PTRACE_ATTACH); } -- cgit v1.3-14-g43fede From 570ac9337b5c13dbf46ca6758c376e2e13e8956f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Jan 2016 14:59:58 -0800 Subject: ptrace: task_stopped_code(ptrace => true) can't see TASK_STOPPED task task_stopped_code()->task_is_stopped_or_traced() doesn't look right, the traced task must never be TASK_STOPPED. We can not add WARN_ON(task_is_stopped(p)), but this is only because do_wait() can race with PTRACE_ATTACH from another thread. [akpm@linux-foundation.org: teeny cleanup] Signed-off-by: Oleg Nesterov Cc: Andrey Ryabinin Cc: Roland McGrath Acked-by: Tejun Heo Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 07110c6020a0..b0eea830303c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1120,8 +1120,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p) && - !(p->jobctl & JOBCTL_LISTENING)) + if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) -- cgit v1.3-14-g43fede From caaee6234d05a58c5b4d05e7bf766131b810a657 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jan 2016 15:00:04 -0800 Subject: ptrace: use fsuid, fsgid, effective creds for fs access checks By checking the effective credentials instead of the real UID / permitted capabilities, ensure that the calling process actually intended to use its credentials. To ensure that all ptrace checks use the correct caller credentials (e.g. in case out-of-tree code or newly added code omits the PTRACE_MODE_*CREDS flag), use two new flags and require one of them to be set. The problem was that when a privileged task had temporarily dropped its privileges, e.g. by calling setreuid(0, user_uid), with the intent to perform following syscalls with the credentials of a user, it still passed ptrace access checks that the user would not be able to pass. While an attacker should not be able to convince the privileged task to perform a ptrace() syscall, this is a problem because the ptrace access check is reused for things in procfs. In particular, the following somewhat interesting procfs entries only rely on ptrace access checks: /proc/$pid/stat - uses the check for determining whether pointers should be visible, useful for bypassing ASLR /proc/$pid/maps - also useful for bypassing ASLR /proc/$pid/cwd - useful for gaining access to restricted directories that contain files with lax permissions, e.g. in this scenario: lrwxrwxrwx root root /proc/13020/cwd -> /root/foobar drwx------ root root /root drwxr-xr-x root root /root/foobar -rw-r--r-- root root /root/foobar/secret Therefore, on a system where a root-owned mode 6755 binary changes its effective credentials as described and then dumps a user-specified file, this could be used by an attacker to reveal the memory layout of root's processes or reveal the contents of files he is not allowed to access (through /proc/$pid/cwd). [akpm@linux-foundation.org: fix warning] Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Casey Schaufler Cc: Oleg Nesterov Cc: Ingo Molnar Cc: James Morris Cc: "Serge E. Hallyn" Cc: Andy Shevchenko Cc: Andy Lutomirski Cc: Al Viro Cc: "Eric W. Biederman" Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- fs/proc/base.c | 21 +++++++++++---------- fs/proc/namespaces.c | 4 ++-- include/linux/ptrace.h | 24 +++++++++++++++++++++++- kernel/events/core.c | 2 +- kernel/futex.c | 2 +- kernel/futex_compat.c | 2 +- kernel/kcmp.c | 4 ++-- kernel/ptrace.c | 39 +++++++++++++++++++++++++++++++-------- mm/process_vm_access.c | 2 +- security/commoncap.c | 7 ++++++- 11 files changed, 80 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index d73291f5f0fc..b6c00ce0e29e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -395,7 +395,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); + permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); diff --git a/fs/proc/base.c b/fs/proc/base.c index 2cf5d7e37375..e665097c1da5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -403,7 +403,7 @@ static const struct file_operations proc_pid_cmdline_ops = { static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); + struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { @@ -430,7 +430,8 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) + && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); else seq_putc(m, '0'); @@ -444,7 +445,7 @@ static int lock_trace(struct task_struct *task) int err = mutex_lock_killable(&task->signal->cred_guard_mutex); if (err) return err; - if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { mutex_unlock(&task->signal->cred_guard_mutex); return -EPERM; } @@ -697,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ); + allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; @@ -732,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid, return true; if (in_group_p(pid->pid_gid)) return true; - return ptrace_may_access(task, PTRACE_MODE_READ); + return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } @@ -809,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) struct mm_struct *mm = ERR_PTR(-ESRCH); if (task) { - mm = mm_access(task, mode); + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (!IS_ERR_OR_NULL(mm)) { @@ -1860,7 +1861,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (!task) goto out_notask; - mm = mm_access(task, PTRACE_MODE_READ); + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) goto out; @@ -2013,7 +2014,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out; result = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; result = -ENOENT; @@ -2066,7 +2067,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) goto out; ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; @@ -2533,7 +2534,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh if (result) return result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 1dece8781f91..276f12431dbf 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -46,7 +46,7 @@ static const char *proc_ns_get_link(struct dentry *dentry, if (!task) return error; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { error = ns_get_path(&ns_path, task, ns_ops); if (!error) nd_jump_link(&ns_path); @@ -67,7 +67,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!task) return res; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 061265f92876..504c98a278d4 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -57,7 +57,29 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 -/* Returns true on success, false on denial. */ +#define PTRACE_MODE_FSCREDS 0x08 +#define PTRACE_MODE_REALCREDS 0x10 + +/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ +#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) +#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) + +/** + * ptrace_may_access - check whether the caller is permitted to access + * a target task. + * @task: target task + * @mode: selects type of access and caller credentials + * + * Returns true on success, false on denial. + * + * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must + * be set in @mode to specify whether the access was requested through + * a filesystem syscall (should use effective capabilities and fsuid + * of the caller) or through an explicit syscall such as + * process_vm_writev or ptrace (and should use the real credentials). + */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) diff --git a/kernel/events/core.c b/kernel/events/core.c index bf8244190d0f..c0957416b32e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3376,7 +3376,7 @@ find_lively_task_by_vpid(pid_t vpid) /* Reuse ptrace permission checks for now. */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto errout; return task; diff --git a/kernel/futex.c b/kernel/futex.c index c6f514573b28..0773f2b23b10 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2884,7 +2884,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 55c8c9349cfe..4ae3232e7a28 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 0aa69ea1d8fd..3a47fa998fe0 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, &task2->signal->cred_guard_mutex); if (ret) goto err; - if (!ptrace_may_access(task1, PTRACE_MODE_READ) || - !ptrace_may_access(task2, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || + !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aa94aee9d4c9..2341efe7fe02 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -219,6 +219,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; + int dumpable = 0; + kuid_t caller_uid; + kgid_t caller_gid; + + if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { + WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); + return -EPERM; + } /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -228,18 +236,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - int dumpable = 0; + /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { + caller_uid = cred->fsuid; + caller_gid = cred->fsgid; + } else { + /* + * Using the euid would make more sense here, but something + * in userland might rely on the old behavior, and this + * shouldn't be a security problem since + * PTRACE_MODE_REALCREDS implies that the caller explicitly + * used a syscall that requests access to another process + * (and not a filesystem syscall to procfs). + */ + caller_uid = cred->uid; + caller_gid = cred->gid; + } tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) + if (uid_eq(caller_uid, tcred->euid) && + uid_eq(caller_uid, tcred->suid) && + uid_eq(caller_uid, tcred->uid) && + gid_eq(caller_gid, tcred->egid) && + gid_eq(caller_gid, tcred->sgid) && + gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; @@ -306,7 +329,7 @@ static int ptrace_attach(struct task_struct *task, long request, goto out; task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); if (retval) goto unlock_creds; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e88d071648c2..5d453e58ddbf 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto free_proc_pages; } - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* diff --git a/security/commoncap.c b/security/commoncap.c index 1832cf701c3d..48071ed7c445 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -137,12 +137,17 @@ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; + const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); + if (mode & PTRACE_MODE_FSCREDS) + caller_caps = &cred->cap_effective; + else + caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && - cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) + cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; -- cgit v1.3-14-g43fede From c428fbdbf3e9515bfe686881ffdba862dbd8cb6f Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 20 Jan 2016 15:00:10 -0800 Subject: exit: remove unneeded declaration of exit_mm() Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b0eea830303c..10e088237fed 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,8 +59,6 @@ #include #include -static void exit_mm(struct task_struct *tsk); - static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; -- cgit v1.3-14-g43fede From c4c54dd1caf1393c529e7ea1f18b4342c796a49c Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:16 -0800 Subject: kernel/cpu.c: change type of cpu_possible_bits and friends Change cpu_possible_bits and friends (online, present, active) from being bitmaps that happen to have the right size to actually being struct cpumasks. Also rename them to __cpu_xyz_mask. This is mostly a small cleanup in preparation for exporting them and, eventually, eliminating the extra indirection through the cpu_xyz_mask variables. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 85ff5e26e23b..6a96b713cea7 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -759,71 +759,71 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly - = CPU_BITS_ALL; +static struct cpumask __cpu_possible_mask __read_mostly + = {CPU_BITS_ALL}; #else -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +static struct cpumask __cpu_possible_mask __read_mostly; #endif -const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); +const struct cpumask *const cpu_possible_mask = &__cpu_possible_mask; EXPORT_SYMBOL(cpu_possible_mask); -static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); +static struct cpumask __cpu_online_mask __read_mostly; +const struct cpumask *const cpu_online_mask = &__cpu_online_mask; EXPORT_SYMBOL(cpu_online_mask); -static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); +static struct cpumask __cpu_present_mask __read_mostly; +const struct cpumask *const cpu_present_mask = &__cpu_present_mask; EXPORT_SYMBOL(cpu_present_mask); -static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); +static struct cpumask __cpu_active_mask __read_mostly; +const struct cpumask *const cpu_active_mask = &__cpu_active_mask; EXPORT_SYMBOL(cpu_active_mask); void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) - cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); + cpumask_set_cpu(cpu, &__cpu_possible_mask); else - cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); + cpumask_clear_cpu(cpu, &__cpu_possible_mask); } void set_cpu_present(unsigned int cpu, bool present) { if (present) - cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); + cpumask_set_cpu(cpu, &__cpu_present_mask); else - cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); + cpumask_clear_cpu(cpu, &__cpu_present_mask); } void set_cpu_online(unsigned int cpu, bool online) { if (online) { - cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + cpumask_set_cpu(cpu, &__cpu_online_mask); + cpumask_set_cpu(cpu, &__cpu_active_mask); } else { - cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); + cpumask_clear_cpu(cpu, &__cpu_online_mask); } } void set_cpu_active(unsigned int cpu, bool active) { if (active) - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + cpumask_set_cpu(cpu, &__cpu_active_mask); else - cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); + cpumask_clear_cpu(cpu, &__cpu_active_mask); } void init_cpu_present(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_present_bits), src); + cpumask_copy(&__cpu_present_mask, src); } void init_cpu_possible(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_possible_bits), src); + cpumask_copy(&__cpu_possible_mask, src); } void init_cpu_online(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_online_bits), src); + cpumask_copy(&__cpu_online_mask, src); } -- cgit v1.3-14-g43fede From 4b804c85dc37db6c108832b28cd54673ff7ee037 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:19 -0800 Subject: kernel/cpu.c: export __cpu_*_mask Exporting the cpumasks __cpu_possible_mask and friends will allow us to remove the extra indirection through the cpu_*_mask variables. It will also allow the set_cpu_* functions to become static inlines, which will give a .text reduction. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 4 ++++ kernel/cpu.c | 14 +++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 59915ea5373c..d4545a1852f2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -89,6 +89,10 @@ extern const struct cpumask *const cpu_possible_mask; extern const struct cpumask *const cpu_online_mask; extern const struct cpumask *const cpu_present_mask; extern const struct cpumask *const cpu_active_mask; +extern struct cpumask __cpu_possible_mask; +extern struct cpumask __cpu_online_mask; +extern struct cpumask __cpu_present_mask; +extern struct cpumask __cpu_active_mask; #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(cpu_online_mask) diff --git a/kernel/cpu.c b/kernel/cpu.c index 6a96b713cea7..35d1d45be8e9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -759,23 +759,27 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE -static struct cpumask __cpu_possible_mask __read_mostly +struct cpumask __cpu_possible_mask __read_mostly = {CPU_BITS_ALL}; #else -static struct cpumask __cpu_possible_mask __read_mostly; +struct cpumask __cpu_possible_mask __read_mostly; #endif +EXPORT_SYMBOL(__cpu_possible_mask); const struct cpumask *const cpu_possible_mask = &__cpu_possible_mask; EXPORT_SYMBOL(cpu_possible_mask); -static struct cpumask __cpu_online_mask __read_mostly; +struct cpumask __cpu_online_mask __read_mostly; +EXPORT_SYMBOL(__cpu_online_mask); const struct cpumask *const cpu_online_mask = &__cpu_online_mask; EXPORT_SYMBOL(cpu_online_mask); -static struct cpumask __cpu_present_mask __read_mostly; +struct cpumask __cpu_present_mask __read_mostly; +EXPORT_SYMBOL(__cpu_present_mask); const struct cpumask *const cpu_present_mask = &__cpu_present_mask; EXPORT_SYMBOL(cpu_present_mask); -static struct cpumask __cpu_active_mask __read_mostly; +struct cpumask __cpu_active_mask __read_mostly; +EXPORT_SYMBOL(__cpu_active_mask); const struct cpumask *const cpu_active_mask = &__cpu_active_mask; EXPORT_SYMBOL(cpu_active_mask); -- cgit v1.3-14-g43fede From 5aec01b834fd6f8ca49d1aeede665b950d0c148e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:25 -0800 Subject: kernel/cpu.c: eliminate cpu_*_mask Replace the variables cpu_possible_mask, cpu_online_mask, cpu_present_mask and cpu_active_mask with macros expanding to expressions of the same type and value, eliminating some indirection. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 8 ++++---- kernel/cpu.c | 8 -------- 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d4545a1852f2..52ab539aefce 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -85,14 +85,14 @@ extern int nr_cpu_ids; * only one CPU. */ -extern const struct cpumask *const cpu_possible_mask; -extern const struct cpumask *const cpu_online_mask; -extern const struct cpumask *const cpu_present_mask; -extern const struct cpumask *const cpu_active_mask; extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; +#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) +#define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) +#define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) +#define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(cpu_online_mask) diff --git a/kernel/cpu.c b/kernel/cpu.c index 35d1d45be8e9..8734fc74fcbc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -765,23 +765,15 @@ struct cpumask __cpu_possible_mask __read_mostly struct cpumask __cpu_possible_mask __read_mostly; #endif EXPORT_SYMBOL(__cpu_possible_mask); -const struct cpumask *const cpu_possible_mask = &__cpu_possible_mask; -EXPORT_SYMBOL(cpu_possible_mask); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); -const struct cpumask *const cpu_online_mask = &__cpu_online_mask; -EXPORT_SYMBOL(cpu_online_mask); struct cpumask __cpu_present_mask __read_mostly; EXPORT_SYMBOL(__cpu_present_mask); -const struct cpumask *const cpu_present_mask = &__cpu_present_mask; -EXPORT_SYMBOL(cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); -const struct cpumask *const cpu_active_mask = &__cpu_active_mask; -EXPORT_SYMBOL(cpu_active_mask); void set_cpu_possible(unsigned int cpu, bool possible) { -- cgit v1.3-14-g43fede From 9425676a363c0976e3d43dda792dc4711a651d1d Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:28 -0800 Subject: kernel/cpu.c: make set_cpu_* static inlines Almost all callers of the set_cpu_* functions pass an explicit true or false. Making them static inline thus replaces the function calls with a simple set_bit/clear_bit, saving some .text. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 43 +++++++++++++++++++++++++++++++++++++++---- kernel/cpu.c | 34 ---------------------------------- 2 files changed, 39 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 52ab539aefce..fc14275ff34e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -720,14 +720,49 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ -void set_cpu_possible(unsigned int cpu, bool possible); -void set_cpu_present(unsigned int cpu, bool present); -void set_cpu_online(unsigned int cpu, bool online); -void set_cpu_active(unsigned int cpu, bool active); void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); +static inline void +set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) + cpumask_set_cpu(cpu, &__cpu_possible_mask); + else + cpumask_clear_cpu(cpu, &__cpu_possible_mask); +} + +static inline void +set_cpu_present(unsigned int cpu, bool present) +{ + if (present) + cpumask_set_cpu(cpu, &__cpu_present_mask); + else + cpumask_clear_cpu(cpu, &__cpu_present_mask); +} + +static inline void +set_cpu_online(unsigned int cpu, bool online) +{ + if (online) { + cpumask_set_cpu(cpu, &__cpu_online_mask); + cpumask_set_cpu(cpu, &__cpu_active_mask); + } else { + cpumask_clear_cpu(cpu, &__cpu_online_mask); + } +} + +static inline void +set_cpu_active(unsigned int cpu, bool active) +{ + if (active) + cpumask_set_cpu(cpu, &__cpu_active_mask); + else + cpumask_clear_cpu(cpu, &__cpu_active_mask); +} + + /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap diff --git a/kernel/cpu.c b/kernel/cpu.c index 8734fc74fcbc..5b9d39633ce9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -775,40 +775,6 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); -void set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, &__cpu_possible_mask); - else - cpumask_clear_cpu(cpu, &__cpu_possible_mask); -} - -void set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, &__cpu_present_mask); - else - cpumask_clear_cpu(cpu, &__cpu_present_mask); -} - -void set_cpu_online(unsigned int cpu, bool online) -{ - if (online) { - cpumask_set_cpu(cpu, &__cpu_online_mask); - cpumask_set_cpu(cpu, &__cpu_active_mask); - } else { - cpumask_clear_cpu(cpu, &__cpu_online_mask); - } -} - -void set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, &__cpu_active_mask); - else - cpumask_clear_cpu(cpu, &__cpu_active_mask); -} - void init_cpu_present(const struct cpumask *src) { cpumask_copy(&__cpu_present_mask, src); -- cgit v1.3-14-g43fede From cdf4b3fa03bab157d2d70d4de65bb7ae319b084f Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jan 2016 15:00:31 -0800 Subject: kexec: set KEXEC_TYPE_CRASH before sanity_check_segment_list() sanity_check_segment_list() checks KEXEC_TYPE_CRASH flag to ensure all the segments of the loaded crash kernel are within the kernel crash resource limits, so set the flag beforehand. Signed-off-by: Xunlei Pang Acked-by: Dave Young Cc: Eric Biederman Cc: Vivek Goyal Acked-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index d873b64fbddc..ee70aef5cd81 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -63,16 +63,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, if (ret) goto out_free_image; - ret = sanity_check_segment_list(image); - if (ret) - goto out_free_image; - - /* Enable the special crash kernel control page allocation policy. */ if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be -- cgit v1.3-14-g43fede From 2b24692b9235cb82b6f735b7a4c4137211ddf005 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 20 Jan 2016 15:00:34 -0800 Subject: kernel/kexec_core.c: use list_for_each_entry_safe in kimage_free_page_list Use list_for_each_entry_safe() instead of list_for_each_safe() to simplify the code. Signed-off-by: Geliang Tang Cc: Dave Young Cc: Vivek Goyal Acked-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c823f3001e12..8dc659144869 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -310,12 +310,9 @@ static void kimage_free_pages(struct page *page) void kimage_free_page_list(struct list_head *list) { - struct list_head *pos, *next; + struct page *page, *next; - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); + list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } -- cgit v1.3-14-g43fede From 978e30c9b46161c792ecdad0091fd017b21b8ca5 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jan 2016 15:00:36 -0800 Subject: kexec: move some memembers and definitions within the scope of CONFIG_KEXEC_FILE Move the stuff currently only used by the kexec file code within CONFIG_KEXEC_FILE (and CONFIG_KEXEC_VERIFY_SIG). Also move internal "struct kexec_sha_region" and "struct kexec_buf" into "kexec_internal.h". Signed-off-by: Xunlei Pang Cc: "Eric W. Biederman" Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 2 ++ include/linux/kexec.h | 62 +++++++++++++++----------------------- kernel/kexec_file.c | 2 ++ kernel/kexec_internal.h | 21 +++++++++++++ 4 files changed, 50 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 819ab3f9c9c7..ba7fbba9831b 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -385,6 +385,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } +#ifdef CONFIG_KEXEC_VERIFY_SIG int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, unsigned long kernel_len) { @@ -395,6 +396,7 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, return image->fops->verify_sig(kernel, kernel_len); } +#endif /* * Apply purgatory relocations. diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 7b68d2788a56..2cc643c6e870 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -109,11 +109,7 @@ struct compat_kexec_segment { }; #endif -struct kexec_sha_region { - unsigned long start; - unsigned long len; -}; - +#ifdef CONFIG_KEXEC_FILE struct purgatory_info { /* Pointer to elf header of read only purgatory */ Elf_Ehdr *ehdr; @@ -130,6 +126,28 @@ struct purgatory_info { unsigned long purgatory_load_addr; }; +typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); +typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); +typedef int (kexec_cleanup_t)(void *loader_data); + +#ifdef CONFIG_KEXEC_VERIFY_SIG +typedef int (kexec_verify_sig_t)(const char *kernel_buf, + unsigned long kernel_len); +#endif + +struct kexec_file_ops { + kexec_probe_t *probe; + kexec_load_t *load; + kexec_cleanup_t *cleanup; +#ifdef CONFIG_KEXEC_VERIFY_SIG + kexec_verify_sig_t *verify_sig; +#endif +}; +#endif + struct kimage { kimage_entry_t head; kimage_entry_t *entry; @@ -161,6 +179,7 @@ struct kimage { struct kimage_arch arch; #endif +#ifdef CONFIG_KEXEC_FILE /* Additional fields for file based kexec syscall */ void *kernel_buf; unsigned long kernel_buf_len; @@ -179,38 +198,7 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; -}; - -/* - * Keeps track of buffer parameters as provided by caller for requesting - * memory placement of buffer. - */ -struct kexec_buf { - struct kimage *image; - char *buffer; - unsigned long bufsz; - unsigned long mem; - unsigned long memsz; - unsigned long buf_align; - unsigned long buf_min; - unsigned long buf_max; - bool top_down; /* allocate from top of memory hole */ -}; - -typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); -typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, - unsigned long kernel_len, char *initrd, - unsigned long initrd_len, char *cmdline, - unsigned long cmdline_len); -typedef int (kexec_cleanup_t)(void *loader_data); -typedef int (kexec_verify_sig_t)(const char *kernel_buf, - unsigned long kernel_len); - -struct kexec_file_ops { - kexec_probe_t *probe; - kexec_load_t *load; - kexec_cleanup_t *cleanup; - kexec_verify_sig_t *verify_sig; +#endif }; /* kexec interface functions */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b70ada0028d2..007b791f676d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -109,11 +109,13 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return -EINVAL; } +#ifdef CONFIG_KEXEC_VERIFY_SIG int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { return -EKEYREJECTED; } +#endif /* Apply relocations of type RELA */ int __weak diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index e4392a698ad4..0a52315d9c62 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,6 +15,27 @@ int kimage_is_destination_range(struct kimage *image, extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. + */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long mem; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; + void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } -- cgit v1.3-14-g43fede From 41662f5cc55335807d39404371cfcbb1909304c4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Jan 2016 15:00:45 -0800 Subject: sysctl: enable strict writes SYSCTL_WRITES_WARN was added in commit f4aacea2f5d1 ("sysctl: allow for strict write position handling"), and released in v3.16 in August of 2014. Since then I can find only 1 instance of non-zero offset writing[1], and it was fixed immediately in CRIU[2]. As such, it appears safe to flip this to the strict state now. [1] https://www.google.com/search?q="when%20file%20position%20was%20not%200" [2] http://lists.openvz.org/pipermail/criu/2015-April/019819.html Signed-off-by: Kees Cook Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 15 +++++++-------- kernel/sysctl.c | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 73c6b1ef0e84..a93b414672a7 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -825,14 +825,13 @@ via the /proc/sys interface: Each write syscall must fully contain the sysctl value to be written, and multiple writes on the same sysctl file descriptor will rewrite the sysctl value, regardless of file position. - 0 - (default) Same behavior as above, but warn about processes that - perform writes to a sysctl file descriptor when the file position - is not 0. - 1 - Respect file position when writing sysctl strings. Multiple writes - will append to the sysctl value buffer. Anything past the max length - of the sysctl value buffer will be ignored. Writes to numeric sysctl - entries must always be at file position 0 and the value must be - fully contained in the buffer sent in the write syscall. + 0 - Same behavior as above, but warn about processes that perform writes + to a sysctl file descriptor when the file position is not 0. + 1 - (default) Respect file position when writing sysctl strings. Multiple + writes will append to the sysctl value buffer. Anything past the max + length of the sysctl value buffer will be ignored. Writes to numeric + sysctl entries must always be at file position 0 and the value must + be fully contained in the buffer sent in the write syscall. ============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c810f8afdb7f..91420362e0b3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,7 +173,7 @@ extern int no_unaligned_warning; #define SYSCTL_WRITES_WARN 0 #define SYSCTL_WRITES_STRICT 1 -static int sysctl_writes_strict = SYSCTL_WRITES_WARN; +static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.3-14-g43fede From 5c9cf8af2e77388f1da81c39237fb4f20c2f85d5 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 20 Jan 2016 15:00:48 -0800 Subject: kernel: printk: specify alignment for struct printk_log On architectures that have support for efficient unaligned access struct printk_log has 4-byte alignment. Specify alignment attribute in type declaration. The whole point of this patch is to fix deadlock which happening when UBSAN detects unaligned access in printk() thus UBSAN recursively calls printk() with logbuf_lock held by top printk() call. Signed-off-by: Andrey Ryabinin Cc: Peter Zijlstra Cc: Sasha Levin Cc: Randy Dunlap Cc: Rasmus Villemoes Cc: Jonathan Corbet Cc: Michal Marek Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Yury Gribov Cc: Dmitry Vyukov Cc: Konstantin Khlebnikov Cc: Kostya Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e79439134978..c963ba534a78 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -233,7 +233,11 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ -}; +} +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +__packed __aligned(4) +#endif +; /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken @@ -274,11 +278,7 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define LOG_ALIGN 4 -#else #define LOG_ALIGN __alignof__(struct printk_log) -#endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; -- cgit v1.3-14-g43fede From ddf1d398e517e660207e2c807f76a90df543a217 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Jan 2016 15:01:02 -0800 Subject: prctl: take mmap sem for writing to protect against others An unprivileged user can trigger an oops on a kernel with CONFIG_CHECKPOINT_RESTORE. proc_pid_cmdline_read takes mmap_sem for reading and obtains args + env start/end values. These get sanity checked as follows: BUG_ON(arg_start > arg_end); BUG_ON(env_start > env_end); These can be changed by prctl_set_mm. Turns out also takes the semaphore for reading, effectively rendering it useless. This results in: kernel BUG at fs/proc/base.c:240! invalid opcode: 0000 [#1] SMP Modules linked in: virtio_net CPU: 0 PID: 925 Comm: a.out Not tainted 4.4.0-rc8-next-20160105dupa+ #71 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff880077a68000 ti: ffff8800784d0000 task.ti: ffff8800784d0000 RIP: proc_pid_cmdline_read+0x520/0x530 RSP: 0018:ffff8800784d3db8 EFLAGS: 00010206 RAX: ffff880077c5b6b0 RBX: ffff8800784d3f18 RCX: 0000000000000000 RDX: 0000000000000002 RSI: 00007f78e8857000 RDI: 0000000000000246 RBP: ffff8800784d3e40 R08: 0000000000000008 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000050 R13: 00007f78e8857800 R14: ffff88006fcef000 R15: ffff880077c5b600 FS: 00007f78e884a740(0000) GS:ffff88007b200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f78e8361770 CR3: 00000000790a5000 CR4: 00000000000006f0 Call Trace: __vfs_read+0x37/0x100 vfs_read+0x82/0x130 SyS_read+0x58/0xd0 entry_SYSCALL_64_fastpath+0x12/0x76 Code: 4c 8b 7d a8 eb e9 48 8b 9d 78 ff ff ff 4c 8b 7d 90 48 8b 03 48 39 45 a8 0f 87 f0 fe ff ff e9 d1 fe ff ff 4c 8b 7d 90 eb c6 0f 0b <0f> 0b 0f 0b 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 RIP proc_pid_cmdline_read+0x520/0x530 ---[ end trace 97882617ae9c6818 ]--- Turns out there are instances where the code just reads aformentioned values without locking whatsoever - namely environ_read and get_cmdline. Interestingly these functions look quite resilient against bogus values, but I don't believe this should be relied upon. The first patch gets rid of the oops bug by grabbing mmap_sem for writing. The second patch is optional and puts locking around aformentioned consumers for safety. Consumers of other fields don't seem to benefit from similar treatment and are left untouched. This patch (of 2): The code was taking the semaphore for reading, which does not protect against readers nor concurrent modifications. The problem could cause a sanity checks to fail in procfs's cmdline reader, resulting in an OOPS. Note that some functions perform an unlocked read of various mm fields, but they seem to be fine despite possible modificaton. Signed-off-by: Mateusz Guzik Acked-by: Cyrill Gorcunov Cc: Alexey Dobriyan Cc: Jarod Wilson Cc: Jan Stancek Cc: Al Viro Cc: Anshuman Khandual Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 6af9212ab5aa..78947de6f969 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1853,11 +1853,13 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } - if (prctl_map.exe_fd != (u32)-1) + if (prctl_map.exe_fd != (u32)-1) { error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); - down_read(&mm->mmap_sem); - if (error) - goto out; + if (error) + return error; + } + + down_write(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -1894,10 +1896,8 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - error = 0; -out: - up_read(&mm->mmap_sem); - return error; + up_write(&mm->mmap_sem); + return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ @@ -1963,7 +1963,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, addr); prctl_map.start_code = mm->start_code; @@ -2056,7 +2056,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); return error; } -- cgit v1.3-14-g43fede From c994d6136738fd8b24a79f5ad8df40a6a79e2cf7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 09:20:23 +0100 Subject: perf: Add lockdep assertions Make various bugs easier to see. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index bf8244190d0f..c77b05d9a37d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1246,6 +1246,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; @@ -2342,8 +2344,10 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { - struct perf_event *event; int is_active = ctx->is_active; + struct perf_event *event; + + lockdep_assert_held(&ctx->lock); ctx->is_active &= ~event_type; if (likely(!ctx->nr_events)) @@ -2725,8 +2729,10 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type, struct task_struct *task) { - u64 now; int is_active = ctx->is_active; + u64 now; + + lockdep_assert_held(&ctx->lock); ctx->is_active |= event_type; if (likely(!ctx->nr_events)) -- cgit v1.3-14-g43fede From 7e41d17753e6e0da55d343997454dd4fbe8d28a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 09:21:40 +0100 Subject: perf: Fix cgroup event scheduling There appears to be a problem in __perf_event_task_sched_in() wrt cgroup event scheduling. The normal event scheduling order is: CPU pinned Task pinned CPU flexible Task flexible And since perf_cgroup_sched*() only schedules the cpu context, we must call this _before_ adding the task events. Note: double check what happens on the ctx switch optimization where the task ctx isn't scheduled. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c77b05d9a37d..9d1195af819c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2806,6 +2806,16 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; + /* + * If cgroup events exist on this CPU, then we need to check if we have + * to switch in PMU state; cgroup event are system-wide mode only. + * + * Since cgroup events are CPU events, we must schedule these in before + * we schedule in the task events. + */ + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) + perf_cgroup_sched_in(prev, task); + for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -2813,13 +2823,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, perf_event_context_sched_in(ctx, task); } - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch in PMU state. - * cgroup event are system-wide mode only - */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); -- cgit v1.3-14-g43fede From 70a0165752944e0be0b1de4a9020473079962c18 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 09:29:16 +0100 Subject: perf: Fix cgroup scheduling in perf_enable_on_exec() There is a comment that states that perf_event_context_sched_in() will also switch in the cgroup events, I cannot find it does so. Therefore all the resulting logic goes out the window too. Clean that up. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9d1195af819c..e7bda0ed8d40 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -579,13 +579,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* - * next is NULL when called from perf_event_enable_on_exec() - * that will systematically cause a cgroup_switch() - */ - if (next) - cgrp2 = perf_cgroup_from_task(next, NULL); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -611,8 +605,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* prev can never be NULL */ cgrp2 = perf_cgroup_from_task(prev, NULL); /* @@ -1450,11 +1442,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) { ctx->nr_cgroups--; + /* + * Because cgroup events are always per-cpu events, this will + * always be called from the right CPU. + */ cpuctx = __get_cpu_context(ctx); /* - * if there are no more cgroup events - * then cler cgrp to avoid stale pointer - * in update_cgrp_time_from_cpuctx() + * If there are no more cgroup events then clear cgrp to avoid + * stale pointer in update_cgrp_time_from_cpuctx(). */ if (!ctx->nr_cgroups) cpuctx->cgrp = NULL; @@ -3118,15 +3113,6 @@ static void perf_event_enable_on_exec(int ctxn) if (!ctx || !ctx->nr_events) goto out; - /* - * We must ctxsw out cgroup events to avoid conflict - * when invoking perf_task_event_sched_in() later on - * in this function. Otherwise we end up trying to - * ctxswin cgroup events which are already scheduled - * in. - */ - perf_cgroup_sched_out(current, NULL); - raw_spin_lock(&ctx->lock); task_ctx_sched_out(ctx); @@ -3144,9 +3130,6 @@ static void perf_event_enable_on_exec(int ctxn) raw_spin_unlock(&ctx->lock); - /* - * Also calls ctxswin for cgroup events, if any: - */ perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); -- cgit v1.3-14-g43fede From 5947f6576e2edee1189b00bf5b2a3f2c653caa6b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 09:43:38 +0100 Subject: perf: Remove stale comment The comment here is horribly out of date, remove it. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e7bda0ed8d40..751538ce19a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2130,13 +2130,6 @@ static int __perf_install_in_context(void *info) /* * Attach a performance event to a context - * - * First we add the event to the list with the hardware enable bit - * in event->hw_config cleared. - * - * If the event is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. */ static void perf_install_in_context(struct perf_event_context *ctx, -- cgit v1.3-14-g43fede From 3e349507d12de93b08b0aa814fc2aa0dee91c5ba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 10:01:18 +0100 Subject: perf: Fix perf_enable_on_exec() event scheduling There are two problems with the current perf_enable_on_exec() event scheduling: - the newly enabled events will be immediately scheduled irrespective of their ctx event list order. - there's a hole in the ctx->lock between scheduling the events out and putting them back on. Esp. the latter issue is a real problem because a hole in event scheduling leaves the thing in an observable inconsistent state, confusing things. Fix both issues by first doing the enable iteration and at the end, when there are newly enabled events, reschedule the ctx in one go. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 751538ce19a7..0679e73f5f63 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2036,7 +2036,8 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_event_context *ctx); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, @@ -2067,6 +2068,17 @@ static void ___perf_install_in_context(void *info) add_event_to_ctx(event, ctx); } +static void ctx_resched(struct perf_cpu_context *cpuctx, + struct perf_event_context *task_ctx) +{ + perf_pmu_disable(cpuctx->ctx.pmu); + if (task_ctx) + task_ctx_sched_out(cpuctx, task_ctx); + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + perf_event_sched_in(cpuctx, task_ctx, current); + perf_pmu_enable(cpuctx->ctx.pmu); +} + /* * Cross CPU call to install and enable a performance event * @@ -2087,7 +2099,7 @@ static int __perf_install_in_context(void *info) * If there was an active task_ctx schedule it out. */ if (task_ctx) - task_ctx_sched_out(task_ctx); + task_ctx_sched_out(cpuctx, task_ctx); /* * If the context we're installing events in is not the @@ -2629,10 +2641,9 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_event_context *ctx) +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - if (!cpuctx->task_ctx) return; @@ -3096,34 +3107,30 @@ static int event_enable_on_exec(struct perf_event *event, static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; - int ret; local_irq_save(flags); ctx = current->perf_event_ctxp[ctxn]; if (!ctx || !ctx->nr_events) goto out; - raw_spin_lock(&ctx->lock); - task_ctx_sched_out(ctx); - - list_for_each_entry(event, &ctx->event_list, event_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } + cpuctx = __get_cpu_context(ctx); + perf_ctx_lock(cpuctx, ctx); + list_for_each_entry(event, &ctx->event_list, event_entry) + enabled |= event_enable_on_exec(event, ctx); /* - * Unclone this context if we enabled any event. + * Unclone and reschedule this context if we enabled any event. */ - if (enabled) + if (enabled) { clone_ctx = unclone_ctx(ctx); + ctx_resched(cpuctx, ctx); + } + perf_ctx_unlock(cpuctx, ctx); - raw_spin_unlock(&ctx->lock); - - perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); @@ -8737,7 +8744,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * incremented the context's refcount before we do put_ctx below. */ raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(child_ctx); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); child->perf_event_ctxp[ctxn] = NULL; /* -- cgit v1.3-14-g43fede From 8833d0e286c12fd4456089a7a553faf4921e4b08 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 10:02:37 +0100 Subject: perf: Use task_ctx_sched_out() We have a function that does exactly what we want here, use it. This reduces the amount of cpuctx->task_ctx muckery. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0679e73f5f63..12f1d4a52da9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2545,8 +2545,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; + task_ctx_sched_out(cpuctx, ctx); raw_spin_unlock(&ctx->lock); } } -- cgit v1.3-14-g43fede From aee7dbc45f8aa976913de9b352fa6da816f1f3cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 10:45:11 +0100 Subject: perf: Simplify/fix perf_event_enable() event scheduling Like perf_enable_on_exec(), perf_event_enable() event scheduling has problems respecting the context hierarchy when trying to schedule events (for example, it will try and add a pinned event without first removing existing flexible events). So simplify it by using the new ctx_resched() call which will DTRT. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 12f1d4a52da9..079eb9fcaaa8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2188,7 +2188,7 @@ static int __perf_event_enable(void *info) struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; + struct perf_event_context *task_ctx = cpuctx->task_ctx; /* * There's a time window between 'ctx->is_active' check @@ -2202,7 +2202,8 @@ static int __perf_event_enable(void *info) if (!ctx->is_active) return -EINVAL; - raw_spin_lock(&ctx->lock); + perf_ctx_lock(cpuctx, task_ctx); + WARN_ON_ONCE(&cpuctx->ctx != ctx && task_ctx != ctx); update_context_time(ctx); if (event->state >= PERF_EVENT_STATE_INACTIVE) @@ -2228,32 +2229,10 @@ static int __perf_event_enable(void *info) if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) goto unlock; - if (!group_can_go_on(event, cpuctx, 1)) { - err = -EEXIST; - } else { - if (event == leader) - err = group_sched_in(event, cpuctx, ctx); - else - err = event_sched_in(event, cpuctx, ctx); - } - - if (err) { - /* - * If this event can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != event) { - group_sched_out(leader, cpuctx, ctx); - perf_mux_hrtimer_restart(cpuctx); - } - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } + ctx_resched(cpuctx, task_ctx); unlock: - raw_spin_unlock(&ctx->lock); + perf_ctx_unlock(cpuctx, task_ctx); return 0; } -- cgit v1.3-14-g43fede From 25432ae96a9889774a05bf5f0f6fd8dbcdec5e72 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 11:05:09 +0100 Subject: perf: Optimize perf_sched_events() usage It doesn't make sense to take up-to _4_ references on perf_sched_events() per event, avoid doing this. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 079eb9fcaaa8..935aefd16354 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3491,11 +3491,13 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) static void unaccount_event(struct perf_event *event) { + bool dec = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -3505,12 +3507,15 @@ static void unaccount_event(struct perf_event *event) if (event->attr.freq) atomic_dec(&nr_freq_events); if (event->attr.context_switch) { - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (has_branch_stack(event)) + dec = true; + + if (dec) static_key_slow_dec_deferred(&perf_sched_events); unaccount_event_cpu(event, event->cpu); @@ -7723,11 +7728,13 @@ static void account_event_cpu(struct perf_event *event, int cpu) static void account_event(struct perf_event *event) { + bool inc = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -7740,11 +7747,14 @@ static void account_event(struct perf_event *event) } if (event->attr.context_switch) { atomic_inc(&nr_switch_events); - static_key_slow_inc(&perf_sched_events.key); + inc = true; } if (has_branch_stack(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (is_cgroup_event(event)) + inc = true; + + if (inc) static_key_slow_inc(&perf_sched_events.key); account_event_cpu(event, event->cpu); -- cgit v1.3-14-g43fede From 63e30d3e52d4d85854ce6c761ffc6ab55209a630 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 11:39:10 +0100 Subject: perf: Make ctx->is_active and cpuctx->task_ctx consistent For no apparent reason and to great confusion the rules for ctx->is_active and cpuctx->task_ctx are different. This means that its not always possible to find all active (task) contexts. Fix this such that if ctx->is_active gets set, we also set (or verify) cpuctx->task_ctx. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 935aefd16354..89b47050a2e8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2329,6 +2329,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, lockdep_assert_held(&ctx->lock); ctx->is_active &= ~event_type; + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; + } + if (likely(!ctx->nr_events)) return; @@ -2629,7 +2635,6 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, return; ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; } /* @@ -2712,6 +2717,13 @@ ctx_sched_in(struct perf_event_context *ctx, lockdep_assert_held(&ctx->lock); ctx->is_active |= event_type; + if (ctx->task) { + if (!is_active) + cpuctx->task_ctx = ctx; + else + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + } + if (likely(!ctx->nr_events)) return; @@ -2756,12 +2768,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * cpu flexible, task flexible. */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - - if (ctx->nr_events) - cpuctx->task_ctx = ctx; - - perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); - + perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); } -- cgit v1.3-14-g43fede From 39a4364076921511e212bc42f94fbf062c989576 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jan 2016 12:46:35 +0100 Subject: perf: Fix task context scheduling There is a very nasty problem wrt disabling the perf task scheduling hooks. Currently we {set,clear} ctx->is_active on every __perf_event_task_sched_{in,out}, _however_ this means that if we disable these calls we'll have task contexts with ->is_active set that are not active and 'active' task contexts without ->is_active set. This can result in event_function_call() looping on the ctx->is_active condition basically indefinitely. Resolve this by changing things such that contexts without events do not set ->is_active like we used to. From this invariant it trivially follows that if there are no (task) events, every task ctx is inactive and disabling the context switch hooks is harmless. This leaves two places that need attention (and already had accumulated weird and wonderful hacks to work around, without recognising this actual problem). Namely: - perf_install_in_context() will need to deal with installing events in an inactive context, meaning it cannot rely on ctx-is_active for its IPIs. - perf_remove_from_context() will have to mark a context as inactive when it removes the last event. For specific detail, see the patch/comments. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Dmitry Vyukov Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 155 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 91 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 89b47050a2e8..c27e04655d86 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -126,6 +126,38 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } +/* + * On task ctx scheduling... + * + * When !ctx->nr_events a task context will not be scheduled. This means + * we can disable the scheduler hooks (for performance) without leaving + * pending task ctx state. + * + * This however results in two special cases: + * + * - removing the last event from a task ctx; this is relatively straight + * forward and is done in __perf_remove_from_context. + * + * - adding the first event to a task ctx; this is tricky because we cannot + * rely on ctx->is_active and therefore cannot use event_function_call(). + * See perf_install_in_context(). + * + * This is because we need a ctx->lock serialized variable (ctx->is_active) + * to reliably determine if a particular task/context is scheduled in. The + * task_curr() use in task_function_call() is racy in that a remote context + * switch is not a single atomic operation. + * + * As is, the situation is 'safe' because we set rq->curr before we do the + * actual context switch. This means that task_curr() will fail early, but + * we'll continue spinning on ctx->is_active until we've passed + * perf_event_task_sched_out(). + * + * Without this ctx->lock serialized variable we could have race where we find + * the task (and hence the context) would not be active while in fact they are. + * + * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. + */ + static void event_function_call(struct perf_event *event, int (*active)(void *), void (*inactive)(void *), @@ -1686,9 +1718,13 @@ static int __perf_remove_from_context(void *info) if (re->detach_group) perf_group_detach(event); list_del_event(event, ctx); - if (!ctx->nr_events && cpuctx->task_ctx == ctx) { + + if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; - cpuctx->task_ctx = NULL; + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + cpuctx->task_ctx = NULL; + } } raw_spin_unlock(&ctx->lock); @@ -2056,18 +2092,6 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } -static void ___perf_install_in_context(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - - /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. - */ - add_event_to_ctx(event, ctx); -} - static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx) { @@ -2086,55 +2110,27 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, */ static int __perf_install_in_context(void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; + struct perf_event_context *ctx = info; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; - struct task_struct *task = current; - - perf_ctx_lock(cpuctx, task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - /* - * If there was an active task_ctx schedule it out. - */ - if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx); + if (ctx->task) { + /* + * If we hit the 'wrong' task, we've since scheduled and + * everything should be sorted, nothing to do! + */ + if (ctx->task != current) + return 0; - /* - * If the context we're installing events in is not the - * active task_ctx, flip them. - */ - if (ctx->task && task_ctx != ctx) { - if (task_ctx) - raw_spin_unlock(&task_ctx->lock); - raw_spin_lock(&ctx->lock); + /* + * If task_ctx is set, it had better be to us. + */ + WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); task_ctx = ctx; } - if (task_ctx) { - cpuctx->task_ctx = task_ctx; - task = task_ctx->task; - } - - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - - update_context_time(ctx); - /* - * update cgrp time only if current cgrp - * matches event->cgrp. Must be done before - * calling add_event_to_ctx() - */ - update_cgrp_time_from_event(event); - - add_event_to_ctx(event, ctx); - - /* - * Schedule everything back in - */ - perf_event_sched_in(cpuctx, task_ctx, task); - - perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_lock(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx); perf_ctx_unlock(cpuctx, task_ctx); return 0; @@ -2148,14 +2144,38 @@ perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { + struct task_struct *task = NULL; + lockdep_assert_held(&ctx->mutex); event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; - event_function_call(event, __perf_install_in_context, - ___perf_install_in_context, event); + /* + * Installing events is tricky because we cannot rely on ctx->is_active + * to be set in case this is the nr_events 0 -> 1 transition. + * + * So what we do is we add the event to the list here, which will allow + * a future context switch to DTRT and then send a racy IPI. If the IPI + * fails to hit the right task, this means a context switch must have + * happened and that will have taken care of business. + */ + raw_spin_lock_irq(&ctx->lock); + update_context_time(ctx); + /* + * Update cgrp time only if current cgrp matches event->cgrp. + * Must be done before calling add_event_to_ctx(). + */ + update_cgrp_time_from_event(event); + add_event_to_ctx(event, ctx); + task = ctx->task; + raw_spin_unlock_irq(&ctx->lock); + + if (task) + task_function_call(task, __perf_install_in_context, ctx); + else + cpu_function_call(cpu, __perf_install_in_context, ctx); } /* @@ -2328,6 +2348,16 @@ static void ctx_sched_out(struct perf_event_context *ctx, lockdep_assert_held(&ctx->lock); + if (likely(!ctx->nr_events)) { + /* + * See __perf_remove_from_context(). + */ + WARN_ON_ONCE(ctx->is_active); + if (ctx->task) + WARN_ON_ONCE(cpuctx->task_ctx); + return; + } + ctx->is_active &= ~event_type; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); @@ -2335,9 +2365,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, cpuctx->task_ctx = NULL; } - if (likely(!ctx->nr_events)) - return; - update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx); if (!ctx->nr_active) @@ -2716,6 +2743,9 @@ ctx_sched_in(struct perf_event_context *ctx, lockdep_assert_held(&ctx->lock); + if (likely(!ctx->nr_events)) + return; + ctx->is_active |= event_type; if (ctx->task) { if (!is_active) @@ -2724,9 +2754,6 @@ ctx_sched_in(struct perf_event_context *ctx, WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - if (likely(!ctx->nr_events)) - return; - now = perf_clock(); ctx->timestamp = now; perf_cgroup_set_timestamp(task, ctx); -- cgit v1.3-14-g43fede From 32132a3d0d5d6f127388be3e3fd7759f798c2eb4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jan 2016 15:40:59 +0100 Subject: perf: Specialize perf_event_exit_task() The perf_remove_from_context() usage in __perf_event_exit_task() is different from the other usages in that this site has already detached and scheduled out the task context. This will stand in the way of stronger assertions checking the (task) context scheduling invariants. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c27e04655d86..66c9ad4f8707 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8726,7 +8726,13 @@ __perf_event_exit_task(struct perf_event *child_event, * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - perf_remove_from_context(child_event, !!child_event->parent); + raw_spin_lock_irq(&child_ctx->lock); + WARN_ON_ONCE(child_ctx->is_active); + + if (!!child_event->parent) + perf_group_detach(child_event); + list_del_event(child_event, child_ctx); + raw_spin_unlock_irq(&child_ctx->lock); /* * It can happen that the parent exits first, and has events @@ -8746,17 +8752,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event *child_event, *next; struct perf_event_context *child_ctx, *clone_ctx = NULL; - unsigned long flags; if (likely(!child->perf_event_ctxp[ctxn])) return; - local_irq_save(flags); + local_irq_disable(); + WARN_ON_ONCE(child != current); /* * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. + * and child must be current. */ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); @@ -8776,7 +8780,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ clone_ctx = unclone_ctx(child_ctx); update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); -- cgit v1.3-14-g43fede From fae3fde65138b6071b1b0e0b567d4058a8b6a88c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jan 2016 15:00:50 +0100 Subject: perf: Collapse and fix event_function_call() users There is one common bug left in all the event_function_call() users, between loading ctx->task and getting to the remote_function(), ctx->task can already have been changed. Therefore we need to double check and retry if ctx->task != current. Insert another trampoline specific to event_function_call() that checks for this and further validates state. This also allows getting rid of the active/inactive functions. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 365 +++++++++++++++++++----------------------- kernel/events/hw_breakpoint.c | 2 +- 3 files changed, 168 insertions(+), 201 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f9828a48f16a..6612732d8fd0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1044,7 +1044,7 @@ extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); -extern int __perf_event_disable(void *info); +extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/kernel/events/core.c b/kernel/events/core.c index 66c9ad4f8707..6620432491f6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -126,6 +126,28 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + /* * On task ctx scheduling... * @@ -158,21 +180,96 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ -static void event_function_call(struct perf_event *event, - int (*active)(void *), - void (*inactive)(void *), - void *data) +typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, + struct perf_event_context *, void *); + +struct event_function_struct { + struct perf_event *event; + event_f func; + void *data; +}; + +static int event_function(void *info) +{ + struct event_function_struct *efs = info; + struct perf_event *event = efs->event; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Since we do the IPI call without holding ctx->lock things can have + * changed, double check we hit the task we set out to hit. + * + * If ctx->task == current, we know things must remain valid because + * we have IRQs disabled so we cannot schedule. + */ + if (ctx->task) { + if (ctx->task != current) + return -EAGAIN; + + WARN_ON_ONCE(task_ctx != ctx); + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + perf_ctx_lock(cpuctx, task_ctx); + /* + * Now that we hold locks, double check state. Paranoia pays. + */ + if (task_ctx) { + WARN_ON_ONCE(task_ctx->task != current); + /* + * We only use event_function_call() on established contexts, + * and event_function() is only ever called when active (or + * rather, we'll have bailed in task_function_call() or the + * above ctx->task != current test), therefore we must have + * ctx->is_active here. + */ + WARN_ON_ONCE(!ctx->is_active); + /* + * And since we have ctx->is_active, cpuctx->task_ctx must + * match. + */ + WARN_ON_ONCE(cpuctx->task_ctx != task_ctx); + } + efs->func(event, cpuctx, ctx, efs->data); + perf_ctx_unlock(cpuctx, task_ctx); + + return 0; +} + +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + int ret = event_function(&efs); + WARN_ON_ONCE(ret); +} + +static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; if (!task) { - cpu_function_call(event->cpu, active, data); + cpu_function_call(event->cpu, event_function, &efs); return; } again: - if (!task_function_call(task, active, data)) + if (!task_function_call(task, event_function, &efs)) return; raw_spin_lock_irq(&ctx->lock); @@ -185,7 +282,7 @@ again: raw_spin_unlock_irq(&ctx->lock); goto again; } - inactive(data); + func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } @@ -400,28 +497,6 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -static void perf_ctx_lock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - raw_spin_lock(&cpuctx->ctx.lock); - if (ctx) - raw_spin_lock(&ctx->lock); -} - -static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); -} - #ifdef CONFIG_CGROUP_PERF static inline bool @@ -1684,38 +1759,22 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -struct remove_event { - struct perf_event *event; - bool detach_group; -}; - -static void ___perf_remove_from_context(void *info) -{ - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - - if (re->detach_group) - perf_group_detach(event); - list_del_event(event, ctx); -} - /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static int __perf_remove_from_context(void *info) +static void +__perf_remove_from_context(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + bool detach_group = (unsigned long)info; - raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); - if (re->detach_group) + if (detach_group) perf_group_detach(event); list_del_event(event, ctx); @@ -1726,17 +1785,11 @@ static int __perf_remove_from_context(void *info) cpuctx->task_ctx = NULL; } } - raw_spin_unlock(&ctx->lock); - - return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. - * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since @@ -1746,71 +1799,31 @@ static int __perf_remove_from_context(void *info) */ static void perf_remove_from_context(struct perf_event *event, bool detach_group) { - struct perf_event_context *ctx = event->ctx; - struct remove_event re = { - .event = event, - .detach_group = detach_group, - }; - - lockdep_assert_held(&ctx->mutex); + lockdep_assert_held(&event->ctx->mutex); event_function_call(event, __perf_remove_from_context, - ___perf_remove_from_context, &re); + (void *)(unsigned long)detach_group); } /* * Cross CPU call to disable a performance event */ -int __perf_event_disable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); - - return 0; -} - -void ___perf_event_disable(void *info) +static void __perf_event_disable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; + if (event->state < PERF_EVENT_STATE_INACTIVE) + return; - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; - } + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; } /* @@ -1837,8 +1850,12 @@ static void _perf_event_disable(struct perf_event *event) } raw_spin_unlock_irq(&ctx->lock); - event_function_call(event, __perf_event_disable, - ___perf_event_disable, event); + event_function_call(event, __perf_event_disable, NULL); +} + +void perf_event_disable_local(struct perf_event *event) +{ + event_function_local(event, __perf_event_disable, NULL); } /* @@ -2202,44 +2219,29 @@ static void __perf_event_mark_enabled(struct perf_event *event) /* * Cross CPU call to enable a performance event */ -static int __perf_event_enable(void *info) +static void __perf_event_enable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - struct perf_event_context *task_ctx = cpuctx->task_ctx; - - /* - * There's a time window between 'ctx->is_active' check - * in perf_event_enable function and this place having: - * - IRQs on - * - ctx->lock unlocked - * - * where the task could be killed and 'ctx' deactivated - * by perf_event_exit_task. - */ - if (!ctx->is_active) - return -EINVAL; - - perf_ctx_lock(cpuctx, task_ctx); - WARN_ON_ONCE(&cpuctx->ctx != ctx && task_ctx != ctx); - update_context_time(ctx); + struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; - - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); + return; + update_context_time(ctx); __perf_event_mark_enabled(event); + if (!ctx->is_active) + return; + if (!event_filter_match(event)) { - if (is_cgroup_event(event)) + if (is_cgroup_event(event)) { + perf_cgroup_set_timestamp(current, ctx); // XXX ? perf_cgroup_defer_enabled(event); - goto unlock; + } + return; } /* @@ -2247,19 +2249,13 @@ static int __perf_event_enable(void *info) * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; + return; - ctx_resched(cpuctx, task_ctx); + task_ctx = cpuctx->task_ctx; + if (ctx->task) + WARN_ON_ONCE(task_ctx != ctx); -unlock: - perf_ctx_unlock(cpuctx, task_ctx); - - return 0; -} - -void ___perf_event_enable(void *info) -{ - __perf_event_mark_enabled((struct perf_event *)info); + ctx_resched(cpuctx, task_ctx); } /* @@ -2292,8 +2288,7 @@ static void _perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; raw_spin_unlock_irq(&ctx->lock); - event_function_call(event, __perf_event_enable, - ___perf_event_enable, event); + event_function_call(event, __perf_event_enable, NULL); } /* @@ -4095,36 +4090,14 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -struct period_event { - struct perf_event *event; - u64 value; -}; - -static void ___perf_event_period(void *info) -{ - struct period_event *pe = info; - struct perf_event *event = pe->event; - u64 value = pe->value; - - if (event->attr.freq) { - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } - - local64_set(&event->hw.period_left, 0); -} - -static int __perf_event_period(void *info) +static void __perf_event_period(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct period_event *pe = info; - struct perf_event *event = pe->event; - struct perf_event_context *ctx = event->ctx; - u64 value = pe->value; + u64 value = *((u64 *)info); bool active; - raw_spin_lock(&ctx->lock); if (event->attr.freq) { event->attr.sample_freq = value; } else { @@ -4144,14 +4117,10 @@ static int __perf_event_period(void *info) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } - raw_spin_unlock(&ctx->lock); - - return 0; } static int perf_event_period(struct perf_event *event, u64 __user *arg) { - struct period_event pe = { .event = event, }; u64 value; if (!is_sampling_event(event)) @@ -4166,10 +4135,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; - pe.value = value; - - event_function_call(event, __perf_event_period, - ___perf_event_period, &pe); + event_function_call(event, __perf_event_period, &value); return 0; } @@ -4941,7 +4907,7 @@ static void perf_pending_event(struct irq_work *entry) if (event->pending_disable) { event->pending_disable = 0; - __perf_event_disable(event); + perf_event_disable_local(event); } if (event->pending_wakeup) { @@ -9239,13 +9205,14 @@ static void perf_event_init_cpu(int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *event; - rcu_read_lock(); - list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) - __perf_remove_from_context(&re); - rcu_read_unlock(); + raw_spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->event_list, event_entry) + __perf_remove_from_context(event, cpuctx, ctx, (void *)(unsigned long)true); + raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4ccc26..3f8cb1e14588 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) - __perf_event_disable(bp); + perf_event_disable_local(bp); else perf_event_disable(bp); -- cgit v1.3-14-g43fede From c97f473643a9d3e618c0f0426b926bc3a3e23944 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jan 2016 10:51:03 +0100 Subject: perf: Add more assertions Try to trigger warnings before races do damage. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6620432491f6..3eaf91b104e9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -263,6 +263,15 @@ static void event_function_call(struct perf_event *event, event_f func, void *da .data = data, }; + if (!event->parent) { + /* + * If this is a !child event, we must hold ctx::mutex to + * stabilize the the event->ctx relation. See + * perf_event_ctx_lock(). + */ + lockdep_assert_held(&ctx->mutex); + } + if (!task) { cpu_function_call(event->cpu, event_function, &efs); return; -- cgit v1.3-14-g43fede From 63b6da39bb38e8f1a1ef3180d32a39d6baf9da84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jan 2016 16:05:37 +0100 Subject: perf: Fix perf_event_exit_task() race There is a race against perf_event_exit_task() vs event_function_call(),find_get_context(),perf_install_in_context() (iow, everyone). Since there is no permanent marker on a context that its dead, it is quite possible that we access (and even modify) a context after its passed through perf_event_exit_task(). For instance, find_get_context() might find the context still installed, but by the time we get to perf_install_in_context() it might already have passed through perf_event_exit_task() and be considered dead, we will however still add the event to it. Solve this by marking a ctx dead by setting its ctx->task value to -1, it must be !0 so we still know its a (former) task context. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 151 +++++++++++++++++++++++++++++---------------------- 1 file changed, 85 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3eaf91b104e9..9de4d352ba8c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -148,6 +148,13 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, raw_spin_unlock(&cpuctx->ctx.lock); } +#define TASK_TOMBSTONE ((void *)-1L) + +static bool is_kernel_event(struct perf_event *event) +{ + return event->owner == TASK_TOMBSTONE; +} + /* * On task ctx scheduling... * @@ -196,31 +203,21 @@ static int event_function(void *info) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; + int ret = 0; WARN_ON_ONCE(!irqs_disabled()); + perf_ctx_lock(cpuctx, task_ctx); /* * Since we do the IPI call without holding ctx->lock things can have * changed, double check we hit the task we set out to hit. - * - * If ctx->task == current, we know things must remain valid because - * we have IRQs disabled so we cannot schedule. */ if (ctx->task) { - if (ctx->task != current) - return -EAGAIN; - - WARN_ON_ONCE(task_ctx != ctx); - } else { - WARN_ON_ONCE(&cpuctx->ctx != ctx); - } + if (ctx->task != current) { + ret = -EAGAIN; + goto unlock; + } - perf_ctx_lock(cpuctx, task_ctx); - /* - * Now that we hold locks, double check state. Paranoia pays. - */ - if (task_ctx) { - WARN_ON_ONCE(task_ctx->task != current); /* * We only use event_function_call() on established contexts, * and event_function() is only ever called when active (or @@ -233,12 +230,16 @@ static int event_function(void *info) * And since we have ctx->is_active, cpuctx->task_ctx must * match. */ - WARN_ON_ONCE(cpuctx->task_ctx != task_ctx); + WARN_ON_ONCE(task_ctx != ctx); + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); } + efs->func(event, cpuctx, ctx, efs->data); +unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } static void event_function_local(struct perf_event *event, event_f func, void *data) @@ -256,7 +257,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; + struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ struct event_function_struct efs = { .event = event, .func = func, @@ -278,30 +279,28 @@ static void event_function_call(struct perf_event *event, event_f func, void *da } again: + if (task == TASK_TOMBSTONE) + return; + if (!task_function_call(task, event_function, &efs)) return; raw_spin_lock_irq(&ctx->lock); - if (ctx->is_active) { - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - raw_spin_unlock_irq(&ctx->lock); - goto again; + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; + if (task != TASK_TOMBSTONE) { + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + func(event, NULL, ctx, data); } - func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } -#define EVENT_OWNER_KERNEL ((void *) -1) - -static bool is_kernel_event(struct perf_event *event) -{ - return event->owner == EVENT_OWNER_KERNEL; -} - #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ @@ -1025,7 +1024,7 @@ static void put_ctx(struct perf_event_context *ctx) if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - if (ctx->task) + if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } @@ -1186,6 +1185,7 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. + * * This has to cope with with the fact that until it is locked, * the context could get moved to another task. */ @@ -1226,10 +1226,13 @@ retry: goto retry; } - if (!atomic_inc_not_zero(&ctx->refcount)) { + if (ctx->task == TASK_TOMBSTONE || + !atomic_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } + + WARN_ON_ONCE(ctx->task != task); } rcu_read_unlock(); if (!ctx) @@ -2140,23 +2143,27 @@ static int __perf_install_in_context(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; + raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { + raw_spin_lock(&ctx->lock); /* * If we hit the 'wrong' task, we've since scheduled and * everything should be sorted, nothing to do! */ + task_ctx = ctx; if (ctx->task != current) - return 0; + goto unlock; /* * If task_ctx is set, it had better be to us. */ WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); - task_ctx = ctx; + } else if (task_ctx) { + raw_spin_lock(&task_ctx->lock); } - perf_ctx_lock(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx); +unlock: perf_ctx_unlock(cpuctx, task_ctx); return 0; @@ -2188,6 +2195,17 @@ perf_install_in_context(struct perf_event_context *ctx, * happened and that will have taken care of business. */ raw_spin_lock_irq(&ctx->lock); + task = ctx->task; + /* + * Worse, we cannot even rely on the ctx actually existing anymore. If + * between find_get_context() and perf_install_in_context() the task + * went through perf_event_exit_task() its dead and we should not be + * adding new events. + */ + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } update_context_time(ctx); /* * Update cgrp time only if current cgrp matches event->cgrp. @@ -2195,7 +2213,6 @@ perf_install_in_context(struct perf_event_context *ctx, */ update_cgrp_time_from_event(event); add_event_to_ctx(event, ctx); - task = ctx->task; raw_spin_unlock_irq(&ctx->lock); if (task) @@ -2538,17 +2555,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp[ctxn] = next_ctx; - next->perf_event_ctxp[ctxn] = ctx; - ctx->task = next; - next_ctx->task = task; + WRITE_ONCE(ctx->task, next); + WRITE_ONCE(next_ctx->task, task); swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * RCU_INIT_POINTER here is safe because we've not + * modified the ctx and the above modification of + * ctx->task and ctx->task_ctx_data are immaterial + * since those values are always verified under + * ctx->lock which we're now holding. + */ + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); @@ -8545,7 +8566,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } /* Mark owner so we could distinguish it from user events. */ - event->owner = EVENT_OWNER_KERNEL; + event->owner = TASK_TOMBSTONE; account_event(event); @@ -8725,28 +8746,26 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *next; struct perf_event_context *child_ctx, *clone_ctx = NULL; + struct perf_event *child_event, *next; + unsigned long flags; - if (likely(!child->perf_event_ctxp[ctxn])) + WARN_ON_ONCE(child != current); + + child_ctx = perf_lock_task_context(child, ctxn, &flags); + if (!child_ctx) return; - local_irq_disable(); - WARN_ON_ONCE(child != current); - /* - * We can't reschedule here because interrupts are disabled, - * and child must be current. - */ - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); /* - * Take the context lock here so that if find_get_context is - * reading child->perf_event_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. + * Now that the context is inactive, destroy the task <-> ctx relation + * and mark the context dead. */ - raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); - child->perf_event_ctxp[ctxn] = NULL; + RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + put_ctx(child_ctx); /* cannot be last */ + WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); + put_task_struct(current); /* cannot be last */ /* * If this context is a clone; unclone it so it can't get @@ -8755,7 +8774,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ clone_ctx = unclone_ctx(child_ctx); update_context_time(child_ctx); - raw_spin_unlock_irq(&child_ctx->lock); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); -- cgit v1.3-14-g43fede From 45c815f06b80031659c63d7b93e580015d6024dd Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 19 Jan 2016 17:14:29 +0200 Subject: perf: Synchronously free aux pages in case of allocation failure We are currently using asynchronous deallocation in the error path in AUX mmap code, which is unnecessary and also presents a problem for users that wish to probe for the biggest possible buffer size they can get: they'll get -EINVAL on all subsequent attemts to allocate a smaller buffer before the asynchronous deallocation callback frees up the pages from the previous unsuccessful attempt. Currently, gdb does that for allocating AUX buffers for Intel PT traces. More specifically, overwrite mode of AUX pmus that don't support hardware sg (some implementations of Intel PT, for instance) is limited to only one contiguous high order allocation for its buffer and there is no way of knowing its size without trying. This patch changes error path freeing to be synchronous as there won't be any contenders for the AUX pages at that point. Reported-by: Markus Metzger Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/1453216469-9509-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index adfdc0536117..1faad2cfdb9e 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) __free_page(page); } +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + if (rb->aux_nr_pages) { + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; + } +} + int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { @@ -547,30 +566,11 @@ out: if (!ret) rb->aux_pgoff = pgoff; else - rb_free_aux(rb); + __rb_free_aux(rb); return ret; } -static void __rb_free_aux(struct ring_buffer *rb) -{ - int pg; - - if (rb->aux_priv) { - rb->free_aux(rb->aux_priv); - rb->free_aux = NULL; - rb->aux_priv = NULL; - } - - if (rb->aux_nr_pages) { - for (pg = 0; pg < rb->aux_nr_pages; pg++) - rb_free_aux_page(rb, pg); - - kfree(rb->aux_pages); - rb->aux_nr_pages = 0; - } -} - void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) -- cgit v1.3-14-g43fede From 6f16886b7c050c934305b1f285c3458ff1b6e4e4 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 21 Jan 2016 11:19:29 +0000 Subject: cpuidle: fix fallback mechanism for suspend to idle in absence of enter_freeze Commit 51164251f5c3 "sched / idle: Drop default_idle_call() fallback from call_cpuidle()" made find_deepest_state() return non-negative value and check all the states with index > 0. Also as a result, find_deepest_state() returns 0 even when enter_freeze callbacks are not implemented and enter_freeze_proper() is called which ends up crashing the kernel. This patch updates the check for index > 0 in cpuidle_enter_freeze and cpuidle_idle_call(when idle_should_freeze is true) to restore the suspend-to-idle functionality in absence of enter_freeze callback. Fixes: 51164251f5c3 "sched / idle: Drop default_idle_call() fallback from call_cpuidle()" Signed-off-by: Sudeep Holla Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 2 +- kernel/sched/idle.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 046423b0c5ca..f996efc56605 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -153,7 +153,7 @@ int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) * be frozen safely. */ index = find_deepest_state(drv, dev, UINT_MAX, 0, true); - if (index >= 0) + if (index > 0) enter_freeze_proper(drv, dev, index); return index; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 34852ee70d54..30f7b6ad3920 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -162,7 +162,7 @@ static void cpuidle_idle_call(void) */ if (idle_should_freeze()) { entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state >= 0) { + if (entered_state > 0) { local_irq_enable(); goto exit_idle; } -- cgit v1.3-14-g43fede From dd4e17ab704269bce71402285f5e8b9ac24b1eff Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 21 Jan 2016 15:03:34 -0800 Subject: ntp: Fix ADJ_SETOFFSET being used w/ ADJ_NANO Recently, in commit 37cf4dc3370f I forgot to check if the timeval being passed was actually a timespec (as is signaled with ADJ_NANO). This resulted in that patch breaking ADJ_SETOFFSET users who set ADJ_NANO, by rejecting valid timespecs that were compared with valid timeval ranges. This patch addresses this by checking for the ADJ_NANO flag and using the timepsec check instead in that case. Reported-by: Harald Hoyer Reported-by: Kay Sievers Fixes: 37cf4dc3370f "time: Verify time values in adjtimex ADJ_SETOFFSET to avoid overflow" Signed-off-by: John Stultz Cc: Sasha Levin Cc: Richard Cochran Cc: Prarit Bhargava Cc: David Herrmann Link: http://lkml.kernel.org/r/1453417415-19110-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 36f2ca09aa5e..6df8927c58a5 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -685,8 +685,18 @@ int ntp_validate_timex(struct timex *txc) if (!capable(CAP_SYS_TIME)) return -EPERM; - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } } /* -- cgit v1.3-14-g43fede From 1dff76b92f69051e579bdc131e01500da9fa2a91 Mon Sep 17 00:00:00 2001 From: Gavin Guo Date: Wed, 20 Jan 2016 12:36:58 +0800 Subject: sched/numa: Fix use-after-free bug in the task_numa_compare The following message can be observed on the Ubuntu v3.13.0-65 with KASan backported: ================================================================== BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr ffff880dd393ecd8 Read of size 8 by task qemu-system-x86/3998900 ============================================================================= BUG kmalloc-128 (Tainted: G B ): kasan: bad access detected ----------------------------------------------------------------------------- INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890 __slab_alloc+0x4f8/0x560 __kmalloc+0x1eb/0x280 task_numa_fault+0xc1b/0xed0 do_numa_page+0x192/0x200 handle_mm_fault+0x808/0x1160 __do_page_fault+0x218/0x750 do_page_fault+0x1a/0x70 page_fault+0x28/0x30 SyS_poll+0x66/0x1a0 system_call_fastpath+0x1a/0x1f INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0 __slab_free+0x2ab/0x3f0 kfree+0x161/0x170 task_numa_free+0x1d2/0x200 finish_task_switch+0x1d2/0x210 __schedule+0x5d4/0xc60 schedule_preempt_disabled+0x40/0xc0 cpu_startup_entry+0x2da/0x340 start_secondary+0x28f/0x360 Call Trace: [] dump_stack+0x45/0x56 [] print_trailer+0xfd/0x170 [] object_err+0x36/0x40 [] kasan_report_error+0x1e9/0x3a0 [] kasan_report+0x40/0x50 [] ? task_numa_find_cpu+0x64c/0x890 [] __asan_load8+0x69/0xa0 [] ? find_next_bit+0xd8/0x120 [] task_numa_find_cpu+0x64c/0x890 [] task_numa_migrate+0x4ac/0x7b0 [] numa_migrate_preferred+0xb3/0xc0 [] task_numa_fault+0xb88/0xed0 [] do_numa_page+0x192/0x200 [] handle_mm_fault+0x808/0x1160 [] ? sched_clock_cpu+0x10d/0x160 [] ? native_load_tls+0x82/0xa0 [] __do_page_fault+0x218/0x750 [] ? hrtimer_try_to_cancel+0x76/0x160 [] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0 [] do_page_fault+0x1a/0x70 [] page_fault+0x28/0x30 [] ? do_sys_poll+0x1c4/0x6d0 [] ? enqueue_task_fair+0x4b6/0xaa0 [] ? sched_clock+0x9/0x10 [] ? resched_task+0x7a/0xc0 [] ? check_preempt_curr+0xb3/0x130 [] ? poll_select_copy_remaining+0x170/0x170 [] ? wake_up_state+0x10/0x20 [] ? drop_futex_key_refs.isra.14+0x1f/0x90 [] ? futex_requeue+0x3de/0xba0 [] ? do_futex+0xbe/0x8f0 [] ? read_tsc+0x9/0x20 [] ? ktime_get_ts+0x12d/0x170 [] ? timespec_add_safe+0x59/0xe0 [] SyS_poll+0x66/0x1a0 [] system_call_fastpath+0x1a/0x1f As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in task_numa_assign()") points out, the rcu_read_lock() cannot protect the task_struct from being freed in the finish_task_switch(). And the bug happens in the process of calculation of imp which requires the access of p->numa_faults being freed in the following path: do_exit() current->flags |= PF_EXITING; release_task() ~~delayed_put_task_struct()~~ schedule() ... ... rq->curr = next; context_switch() finish_task_switch() put_task_struct() __put_task_struct() task_numa_free() The fix here to get_task_struct() early before end of dst_rq->lock to protect the calculation process and also put_task_struct() in the corresponding point if finally the dst_rq->curr somehow cannot be assigned. Additional credit to Liang Chen who helped fix the error logic and add the put_task_struct() to the place it missed. Signed-off-by: Gavin Guo Signed-off-by: Peter Zijlstra (Intel) Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: jay.vosburgh@canonical.com Cc: liang.chen@canonical.com Link: http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin.guo@canonical.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1926606ece80..56b7d4b83947 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1220,8 +1220,6 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); - if (p) - get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1289,20 +1287,30 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; + bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task, and this ensures that ->curr - * wasn't reaped and thus get_task_struct() in task_numa_assign() - * is safe under RCU read lock. - * Note that rcu_read_lock() itself can't protect from the final - * put_task_struct() after the last schedule(). + * No need to move the exiting task or idle task. */ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + else { + /* + * The task_struct must be protected here to protect the + * p->numa_faults access in the task_weight since the + * numa_faults could already be freed in the following path: + * finish_task_switch() + * --> put_task_struct() + * --> __put_task_struct() + * --> task_numa_free() + */ + get_task_struct(cur); + } + raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1386,6 +1394,7 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; + put_task_struct(cur); cur = NULL; goto assign; } @@ -1411,9 +1420,16 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: + assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); + /* + * The dst_rq->curr isn't assigned. The protection for task_struct is + * finished. + */ + if (cur && !assigned) + put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, -- cgit v1.3-14-g43fede From 5955102c9984fa081b2d570cfac75c97eecf8f3b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Jan 2016 15:40:57 -0500 Subject: wrappers for ->i_mutex access parallel to mutex_{lock,unlock,trylock,is_locked,lock_nested}, inode_foo(inode) being mutex_foo(&inode->i_mutex). Please, use those for access to ->i_mutex; over the coming cycle ->i_mutex will become rwsem, with ->lookup() done with it held only shared. Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/file.c | 4 +- arch/powerpc/platforms/cell/spufs/inode.c | 12 ++-- arch/s390/hypfs/inode.c | 8 +-- block/ioctl.c | 4 +- drivers/base/devtmpfs.c | 12 ++-- drivers/block/aoe/aoecmd.c | 4 +- drivers/block/drbd/drbd_debugfs.c | 4 +- drivers/char/mem.c | 4 +- drivers/char/ps3flash.c | 4 +- drivers/infiniband/hw/qib/qib_fs.c | 12 ++-- drivers/mtd/ubi/cdev.c | 4 +- drivers/oprofile/oprofilefs.c | 16 +++--- drivers/staging/lustre/lustre/llite/dir.c | 4 +- drivers/staging/lustre/lustre/llite/file.c | 16 +++--- drivers/staging/lustre/lustre/llite/llite_lib.c | 4 +- drivers/staging/lustre/lustre/llite/llite_nfs.c | 4 +- drivers/staging/lustre/lustre/llite/lloop.c | 4 +- drivers/staging/lustre/lustre/llite/rw.c | 4 +- drivers/staging/lustre/lustre/llite/rw26.c | 4 +- drivers/staging/lustre/lustre/llite/vvp_io.c | 4 +- drivers/staging/lustre/lustre/llite/vvp_page.c | 10 ++-- drivers/staging/rdma/ipath/ipath_fs.c | 8 +-- drivers/usb/gadget/function/f_printer.c | 4 +- drivers/usb/gadget/legacy/inode.c | 4 +- drivers/usb/gadget/udc/atmel_usba_udc.c | 12 ++-- drivers/video/fbdev/core/fb_defio.c | 4 +- fs/9p/vfs_file.c | 8 +-- fs/affs/file.c | 8 +-- fs/afs/flock.c | 4 +- fs/afs/write.c | 4 +- fs/attr.c | 2 +- fs/binfmt_misc.c | 12 ++-- fs/block_dev.c | 20 +++---- fs/btrfs/file.c | 42 +++++++------- fs/btrfs/inode.c | 4 +- fs/btrfs/ioctl.c | 38 ++++++------- fs/btrfs/relocation.c | 4 +- fs/btrfs/scrub.c | 4 +- fs/btrfs/xattr.c | 2 +- fs/cachefiles/interface.c | 4 +- fs/cachefiles/namei.c | 40 ++++++------- fs/ceph/cache.c | 4 +- fs/ceph/caps.c | 4 +- fs/ceph/dir.c | 4 +- fs/ceph/export.c | 4 +- fs/ceph/file.c | 18 +++--- fs/cifs/cifsfs.c | 4 +- fs/cifs/file.c | 12 ++-- fs/coda/dir.c | 4 +- fs/coda/file.c | 8 +-- fs/configfs/dir.c | 58 +++++++++---------- fs/configfs/file.c | 8 +-- fs/configfs/inode.c | 4 +- fs/dax.c | 6 +- fs/dcache.c | 4 +- fs/debugfs/inode.c | 22 ++++---- fs/devpts/inode.c | 12 ++-- fs/direct-io.c | 8 +-- fs/ecryptfs/inode.c | 32 +++++------ fs/ecryptfs/mmap.c | 4 +- fs/efivarfs/file.c | 4 +- fs/efivarfs/super.c | 4 +- fs/exec.c | 4 +- fs/exofs/file.c | 4 +- fs/exportfs/expfs.c | 12 ++-- fs/ext2/ioctl.c | 12 ++-- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 20 +++---- fs/ext4/file.c | 18 +++--- fs/ext4/inode.c | 12 ++-- fs/ext4/ioctl.c | 20 +++---- fs/ext4/namei.c | 4 +- fs/ext4/super.c | 4 +- fs/f2fs/data.c | 4 +- fs/f2fs/file.c | 20 +++---- fs/fat/dir.c | 4 +- fs/fat/file.c | 12 ++-- fs/fuse/dir.c | 10 ++-- fs/fuse/file.c | 36 ++++++------ fs/gfs2/file.c | 4 +- fs/gfs2/inode.c | 4 +- fs/gfs2/quota.c | 8 +-- fs/hfs/dir.c | 4 +- fs/hfs/inode.c | 8 +-- fs/hfsplus/dir.c | 4 +- fs/hfsplus/inode.c | 8 +-- fs/hfsplus/ioctl.c | 4 +- fs/hostfs/hostfs_kern.c | 4 +- fs/hpfs/dir.c | 6 +- fs/hugetlbfs/inode.c | 12 ++-- fs/inode.c | 8 +-- fs/ioctl.c | 4 +- fs/jffs2/file.c | 4 +- fs/jfs/file.c | 6 +- fs/jfs/ioctl.c | 6 +- fs/jfs/super.c | 6 +- fs/kernfs/dir.c | 4 +- fs/libfs.c | 10 ++-- fs/locks.c | 6 +- fs/logfs/file.c | 8 +-- fs/namei.c | 74 ++++++++++++------------- fs/namespace.c | 10 ++-- fs/ncpfs/dir.c | 8 +-- fs/ncpfs/file.c | 4 +- fs/nfs/dir.c | 8 +-- fs/nfs/direct.c | 12 ++-- fs/nfs/file.c | 4 +- fs/nfs/inode.c | 8 +-- fs/nfs/nfs42proc.c | 8 +-- fs/nfs/nfs4file.c | 24 ++++---- fs/nfsd/nfs4proc.c | 4 +- fs/nfsd/nfs4recover.c | 12 ++-- fs/nfsd/nfsfh.h | 4 +- fs/nfsd/vfs.c | 4 +- fs/nilfs2/inode.c | 4 +- fs/nilfs2/ioctl.c | 4 +- fs/ntfs/dir.c | 4 +- fs/ntfs/file.c | 8 +-- fs/ntfs/quota.c | 6 +- fs/ntfs/super.c | 12 ++-- fs/ocfs2/alloc.c | 32 +++++------ fs/ocfs2/aops.c | 4 +- fs/ocfs2/dir.c | 4 +- fs/ocfs2/file.c | 12 ++-- fs/ocfs2/inode.c | 12 ++-- fs/ocfs2/ioctl.c | 12 ++-- fs/ocfs2/journal.c | 8 +-- fs/ocfs2/localalloc.c | 16 +++--- fs/ocfs2/move_extents.c | 16 +++--- fs/ocfs2/namei.c | 28 +++++----- fs/ocfs2/quota_global.c | 4 +- fs/ocfs2/refcounttree.c | 12 ++-- fs/ocfs2/resize.c | 8 +-- fs/ocfs2/suballoc.c | 12 ++-- fs/ocfs2/xattr.c | 14 ++--- fs/open.c | 12 ++-- fs/overlayfs/copy_up.c | 4 +- fs/overlayfs/dir.c | 12 ++-- fs/overlayfs/inode.c | 4 +- fs/overlayfs/readdir.c | 20 +++---- fs/overlayfs/super.c | 14 ++--- fs/proc/kcore.c | 4 +- fs/proc/self.c | 4 +- fs/proc/thread_self.c | 4 +- fs/pstore/inode.c | 6 +- fs/quota/dquot.c | 20 +++---- fs/read_write.c | 4 +- fs/readdir.c | 2 +- fs/reiserfs/dir.c | 4 +- fs/reiserfs/file.c | 4 +- fs/reiserfs/ioctl.c | 2 +- fs/reiserfs/xattr.c | 64 ++++++++++----------- fs/tracefs/inode.c | 34 ++++++------ fs/ubifs/dir.c | 18 +++--- fs/ubifs/file.c | 4 +- fs/ubifs/xattr.c | 4 +- fs/udf/file.c | 10 ++-- fs/udf/inode.c | 2 +- fs/utimes.c | 4 +- fs/xattr.c | 8 +-- fs/xfs/xfs_file.c | 6 +- fs/xfs/xfs_pnfs.c | 4 +- include/linux/fs.h | 29 +++++++++- ipc/mqueue.c | 8 +-- kernel/audit_fsnotify.c | 2 +- kernel/audit_watch.c | 2 +- kernel/events/core.c | 4 +- kernel/relay.c | 4 +- kernel/sched/core.c | 4 +- mm/filemap.c | 4 +- mm/shmem.c | 12 ++-- mm/swapfile.c | 12 ++-- net/sunrpc/cache.c | 10 ++-- net/sunrpc/rpc_pipe.c | 60 ++++++++++---------- security/inode.c | 10 ++-- security/integrity/ima/ima_main.c | 8 +-- security/selinux/selinuxfs.c | 4 +- 177 files changed, 908 insertions(+), 883 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 5038fd578e65..2936a0044c04 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1799,9 +1799,9 @@ static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int data struct inode *inode = file_inode(file); int err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (!err) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = spufs_mfc_flush(file, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return err; } diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index ad4840f86be1..dfa863876778 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -163,7 +163,7 @@ static void spufs_prune_dir(struct dentry *dir) { struct dentry *dentry, *tmp; - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) { spin_lock(&dentry->d_lock); if (simple_positive(dentry)) { @@ -180,7 +180,7 @@ static void spufs_prune_dir(struct dentry *dir) } } shrink_dcache_parent(dir); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); } /* Caller must hold parent->i_mutex */ @@ -225,9 +225,9 @@ static int spufs_dir_close(struct inode *inode, struct file *file) parent = d_inode(dir->d_parent); ctx = SPUFS_I(d_inode(dir))->i_ctx; - mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(parent, I_MUTEX_PARENT); ret = spufs_rmdir(parent, dir); - mutex_unlock(&parent->i_mutex); + inode_unlock(parent); WARN_ON(ret); return dcache_dir_close(inode, file); @@ -270,7 +270,7 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - mutex_lock(&inode->i_mutex); + inode_lock(inode); dget(dentry); inc_nlink(dir); @@ -291,7 +291,7 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags, if (ret) spufs_rmdir(dir, dentry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index b2e5902bd8f4..0f3da2cb2bd6 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -67,7 +67,7 @@ static void hypfs_remove(struct dentry *dentry) struct dentry *parent; parent = dentry->d_parent; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(d_inode(parent), dentry); @@ -76,7 +76,7 @@ static void hypfs_remove(struct dentry *dentry) } d_delete(dentry); dput(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); } static void hypfs_delete_tree(struct dentry *root) @@ -331,7 +331,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, struct dentry *dentry; struct inode *inode; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) { dentry = ERR_PTR(-ENOMEM); @@ -359,7 +359,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, d_instantiate(dentry, inode); dget(dentry); fail: - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return dentry; } diff --git a/block/ioctl.c b/block/ioctl.c index 2c84683aada5..77f5d17779d6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -455,12 +455,12 @@ static int blkdev_daxset(struct block_device *bdev, unsigned long argp) if (arg && !blkdev_dax_capable(bdev)) return -ENOTTY; - mutex_lock(&bdev->bd_inode->i_mutex); + inode_lock(bdev->bd_inode); if (bdev->bd_map_count == 0) inode_set_flags(bdev->bd_inode, arg, S_DAX); else rc = -EBUSY; - mutex_unlock(&bdev->bd_inode->i_mutex); + inode_unlock(bdev->bd_inode); return rc; } #else diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 68f03141e432..44a74cf1372c 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -215,9 +215,9 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, newattrs.ia_uid = uid; newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); notify_change(dentry, &newattrs, NULL); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; @@ -244,7 +244,7 @@ static int dev_rmdir(const char *name) err = -ENOENT; } dput(dentry); - mutex_unlock(&d_inode(parent.dentry)->i_mutex); + inode_unlock(d_inode(parent.dentry)); path_put(&parent); return err; } @@ -321,9 +321,9 @@ static int handle_remove(const char *nodename, struct device *dev) newattrs.ia_mode = stat.mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); notify_change(dentry, &newattrs, NULL); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); err = vfs_unlink(d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; @@ -332,7 +332,7 @@ static int handle_remove(const char *nodename, struct device *dev) err = -ENOENT; } dput(dentry); - mutex_unlock(&d_inode(parent.dentry)->i_mutex); + inode_unlock(d_inode(parent.dentry)); path_put(&parent); if (deleted && strchr(nodename, '/')) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index ad80c85e0857..d048d2009e89 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -964,9 +964,9 @@ aoecmd_sleepwork(struct work_struct *work) ssize = get_capacity(d->gd); bd = bdget_disk(d->gd, 0); if (bd) { - mutex_lock(&bd->bd_inode->i_mutex); + inode_lock(bd->bd_inode); i_size_write(bd->bd_inode, (loff_t)ssize<<9); - mutex_unlock(&bd->bd_inode->i_mutex); + inode_unlock(bd->bd_inode); bdput(bd); } spin_lock_irq(&d->lock); diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 96a0107a72ea..4de95bbff486 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -434,12 +434,12 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo if (!parent || d_really_is_negative(parent)) goto out; /* serialize with d_delete() */ - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); /* Make sure the object is still alive */ if (simple_positive(file->f_path.dentry) && kref_get_unless_zero(kref)) ret = 0; - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!ret) { ret = single_open(file, show, data); if (ret) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 6b1721f978c2..4f6f94c43412 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -689,7 +689,7 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) { loff_t ret; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); switch (orig) { case SEEK_CUR: offset += file->f_pos; @@ -706,7 +706,7 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) default: ret = -EINVAL; } - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return ret; } diff --git a/drivers/char/ps3flash.c b/drivers/char/ps3flash.c index 0b311fa277ef..b526dc15c271 100644 --- a/drivers/char/ps3flash.c +++ b/drivers/char/ps3flash.c @@ -290,9 +290,9 @@ static int ps3flash_fsync(struct file *file, loff_t start, loff_t end, int datas { struct inode *inode = file_inode(file); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = ps3flash_writeback(ps3flash_dev); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index 13ef22bd9459..fcdf37913a26 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -89,14 +89,14 @@ static int create_file(const char *name, umode_t mode, { int error; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(*dentry)) error = qibfs_mknod(d_inode(parent), *dentry, mode, fops, data); else error = PTR_ERR(*dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return error; } @@ -481,7 +481,7 @@ static int remove_device_files(struct super_block *sb, int ret, i; root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); snprintf(unit, sizeof(unit), "%u", dd->unit); dir = lookup_one_len(unit, root, strlen(unit)); @@ -491,7 +491,7 @@ static int remove_device_files(struct super_block *sb, goto bail; } - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); remove_file(dir, "counters"); remove_file(dir, "counter_names"); remove_file(dir, "portcounter_names"); @@ -506,13 +506,13 @@ static int remove_device_files(struct super_block *sb, } } remove_file(dir, "flash"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = simple_rmdir(d_inode(root), dir); d_delete(dir); dput(dir); bail: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); return ret; } diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 54e056d3be02..ee2b74d1d1b5 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -174,9 +174,9 @@ static int vol_cdev_fsync(struct file *file, loff_t start, loff_t end, struct ubi_device *ubi = desc->vol->ubi; struct inode *inode = file_inode(file); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = ubi_sync(ubi->ubi_num); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index dd92c5edf219..b48ac6300c79 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -138,22 +138,22 @@ static int __oprofilefs_create_file(struct dentry *root, char const *name, struct dentry *dentry; struct inode *inode; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, name); if (!dentry) { - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return -ENOMEM; } inode = oprofilefs_get_inode(root->d_sb, S_IFREG | perm); if (!inode) { dput(dentry); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return -ENOMEM; } inode->i_fop = fops; inode->i_private = priv; d_add(dentry, inode); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return 0; } @@ -215,22 +215,22 @@ struct dentry *oprofilefs_mkdir(struct dentry *parent, char const *name) struct dentry *dentry; struct inode *inode; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = d_alloc_name(parent, name); if (!dentry) { - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return NULL; } inode = oprofilefs_get_inode(parent->d_sb, S_IFDIR | 0755); if (!inode) { dput(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return NULL; } inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; d_add(dentry, inode); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return dentry; } diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c index 7b355319079c..8982f7d1b374 100644 --- a/drivers/staging/lustre/lustre/llite/dir.c +++ b/drivers/staging/lustre/lustre/llite/dir.c @@ -1858,7 +1858,7 @@ static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) int api32 = ll_need_32bit_api(sbi); loff_t ret = -EINVAL; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (origin) { case SEEK_SET: break; @@ -1896,7 +1896,7 @@ static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) goto out; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index c92d58b770ec..39e2ffd5f97f 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -2082,17 +2082,17 @@ putgl: /* update time if requested */ rc = 0; if (llss->ia2.ia_valid != 0) { - mutex_lock(&llss->inode1->i_mutex); + inode_lock(llss->inode1); rc = ll_setattr(file1->f_path.dentry, &llss->ia2); - mutex_unlock(&llss->inode1->i_mutex); + inode_unlock(llss->inode1); } if (llss->ia1.ia_valid != 0) { int rc1; - mutex_lock(&llss->inode2->i_mutex); + inode_lock(llss->inode2); rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1); - mutex_unlock(&llss->inode2->i_mutex); + inode_unlock(llss->inode2); if (rc == 0) rc = rc1; } @@ -2179,13 +2179,13 @@ static int ll_hsm_import(struct inode *inode, struct file *file, ATTR_MTIME | ATTR_MTIME_SET | ATTR_ATIME | ATTR_ATIME_SET; - mutex_lock(&inode->i_mutex); + inode_lock(inode); rc = ll_setattr_raw(file->f_path.dentry, attr, true); if (rc == -ENODATA) rc = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(attr); free_hss: @@ -2609,7 +2609,7 @@ int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); rc = filemap_write_and_wait_range(inode->i_mapping, start, end); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* catch async errors that were recorded back when async writeback * failed for pages in this mapping. */ @@ -2641,7 +2641,7 @@ int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) fd->fd_write_failed = false; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c index 1db93af62bad..b2fc5b3786ee 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -1277,7 +1277,7 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) return -ENOMEM; if (!S_ISDIR(inode->i_mode)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); memcpy(&op_data->op_attr, attr, sizeof(*attr)); @@ -1358,7 +1358,7 @@ out: ll_finish_md_op_data(op_data); if (!S_ISDIR(inode->i_mode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) inode_dio_wait(inode); } diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c index e578a1130ad1..18aab25f9cd9 100644 --- a/drivers/staging/lustre/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -245,9 +245,9 @@ static int ll_get_name(struct dentry *dentry, char *name, goto out; } - mutex_lock(&dir->i_mutex); + inode_lock(dir); rc = ll_dir_read(dir, &lgd.ctx); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); if (!rc && !lgd.lgd_found) rc = -ENOENT; out: diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c index 420d39123877..871924b3f2e7 100644 --- a/drivers/staging/lustre/lustre/llite/lloop.c +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@ -257,9 +257,9 @@ static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) * be asked to write less pages once, this purely depends on * implementation. Anyway, we should be careful to avoid deadlocking. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); bytes = ll_direct_rw_pages(env, io, rw, inode, pvec); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); cl_io_fini(env, io); return (bytes == pvec->ldp_size) ? 0 : (int)bytes; } diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c index 95cdb0c58b04..f355474967d6 100644 --- a/drivers/staging/lustre/lustre/llite/rw.c +++ b/drivers/staging/lustre/lustre/llite/rw.c @@ -115,8 +115,8 @@ static struct ll_cl_context *ll_cl_init(struct file *file, struct inode *inode = vmpage->mapping->host; loff_t pos; - if (mutex_trylock(&inode->i_mutex)) { - mutex_unlock(&(inode)->i_mutex); + if (inode_trylock(inode)) { + inode_unlock((inode)); /* this is too bad. Someone is trying to write the * page w/o holding inode mutex. This means we can diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c index 39fa13b74cbd..711fda93a58d 100644 --- a/drivers/staging/lustre/lustre/llite/rw26.c +++ b/drivers/staging/lustre/lustre/llite/rw26.c @@ -403,7 +403,7 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter, * 1. Need inode mutex to operate transient pages. */ if (iov_iter_rw(iter) == READ) - mutex_lock(&inode->i_mutex); + inode_lock(inode); LASSERT(obj->cob_transient_pages == 0); while (iov_iter_count(iter)) { @@ -454,7 +454,7 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter, out: LASSERT(obj->cob_transient_pages == 0); if (iov_iter_rw(iter) == READ) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (tot_bytes > 0) { if (iov_iter_rw(iter) == WRITE) { diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index f68e972886ca..0920ac6b3003 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -439,7 +439,7 @@ static int vvp_io_setattr_start(const struct lu_env *env, struct inode *inode = ccc_object_inode(io->ci_obj); int result = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (cl_io_is_trunc(io)) result = vvp_io_setattr_trunc(env, ios, inode, io->u.ci_setattr.sa_attr.lvb_size); @@ -459,7 +459,7 @@ static void vvp_io_setattr_end(const struct lu_env *env, * because osc has already notified to destroy osc_extents. */ vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } static void vvp_io_setattr_fini(const struct lu_env *env, diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c index 99c0d7aee921..a133475a7c74 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_page.c +++ b/drivers/staging/lustre/lustre/llite/vvp_page.c @@ -428,7 +428,7 @@ static void vvp_transient_page_verify(const struct cl_page *page) { struct inode *inode = ccc_object_inode(page->cp_obj); - LASSERT(!mutex_trylock(&inode->i_mutex)); + LASSERT(!inode_trylock(inode)); } static int vvp_transient_page_own(const struct lu_env *env, @@ -480,9 +480,9 @@ static int vvp_transient_page_is_vmlocked(const struct lu_env *env, struct inode *inode = ccc_object_inode(slice->cpl_obj); int locked; - locked = !mutex_trylock(&inode->i_mutex); + locked = !inode_trylock(inode); if (!locked) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return locked ? -EBUSY : -ENODATA; } @@ -502,7 +502,7 @@ static void vvp_transient_page_fini(const struct lu_env *env, struct ccc_object *clobj = cl2ccc(clp->cp_obj); vvp_page_fini_common(cp); - LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + LASSERT(!inode_trylock(clobj->cob_inode)); clobj->cob_transient_pages--; } @@ -548,7 +548,7 @@ int vvp_page_init(const struct lu_env *env, struct cl_object *obj, } else { struct ccc_object *clobj = cl2ccc(obj); - LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + LASSERT(!inode_trylock(clobj->cob_inode)); cl_page_slice_add(page, &cpg->cpg_cl, obj, &vvp_transient_page_ops); clobj->cob_transient_pages++; diff --git a/drivers/staging/rdma/ipath/ipath_fs.c b/drivers/staging/rdma/ipath/ipath_fs.c index 796af6867007..476fcdf05acb 100644 --- a/drivers/staging/rdma/ipath/ipath_fs.c +++ b/drivers/staging/rdma/ipath/ipath_fs.c @@ -82,14 +82,14 @@ static int create_file(const char *name, umode_t mode, { int error; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(*dentry)) error = ipathfs_mknod(d_inode(parent), *dentry, mode, fops, data); else error = PTR_ERR(*dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return error; } @@ -295,7 +295,7 @@ static int remove_device_files(struct super_block *sb, int ret; root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); dir = lookup_one_len(unit, root, strlen(unit)); @@ -311,7 +311,7 @@ static int remove_device_files(struct super_block *sb, ret = simple_rmdir(d_inode(root), dir); bail: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); return ret; } diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 0fbfb2b2aa08..26ccad5d8680 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -673,7 +673,7 @@ printer_fsync(struct file *fd, loff_t start, loff_t end, int datasync) unsigned long flags; int tx_list_empty; - mutex_lock(&inode->i_mutex); + inode_lock(inode); spin_lock_irqsave(&dev->lock, flags); tx_list_empty = (likely(list_empty(&dev->tx_reqs))); spin_unlock_irqrestore(&dev->lock, flags); @@ -683,7 +683,7 @@ printer_fsync(struct file *fd, loff_t start, loff_t end, int datasync) wait_event_interruptible(dev->tx_flush_wait, (likely(list_empty(&dev->tx_reqs_active)))); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 365afd7e14f8..7e179f81d05c 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -1521,10 +1521,10 @@ static void destroy_ep_files (struct dev_data *dev) spin_unlock_irq (&dev->lock); /* break link to dcache */ - mutex_lock (&parent->i_mutex); + inode_lock(parent); d_delete (dentry); dput (dentry); - mutex_unlock (&parent->i_mutex); + inode_unlock(parent); spin_lock_irq (&dev->lock); } diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index f92f5aff0dd5..8755b2c2aada 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -91,7 +91,7 @@ static ssize_t queue_dbg_read(struct file *file, char __user *buf, if (!access_ok(VERIFY_WRITE, buf, nbytes)) return -EFAULT; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); list_for_each_entry_safe(req, tmp_req, queue, queue) { len = snprintf(tmpbuf, sizeof(tmpbuf), "%8p %08x %c%c%c %5d %c%c%c\n", @@ -118,7 +118,7 @@ static ssize_t queue_dbg_read(struct file *file, char __user *buf, nbytes -= len; buf += len; } - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return actual; } @@ -143,7 +143,7 @@ static int regs_dbg_open(struct inode *inode, struct file *file) u32 *data; int ret = -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); udc = inode->i_private; data = kmalloc(inode->i_size, GFP_KERNEL); if (!data) @@ -158,7 +158,7 @@ static int regs_dbg_open(struct inode *inode, struct file *file) ret = 0; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -169,11 +169,11 @@ static ssize_t regs_dbg_read(struct file *file, char __user *buf, struct inode *inode = file_inode(file); int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = simple_read_from_buffer(buf, nbytes, ppos, file->private_data, file_inode(file)->i_size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 3fc63c208d08..57721c73177f 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -78,13 +78,13 @@ int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasy if (!info->fbdefio) return 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Kill off the delayed work */ cancel_delayed_work_sync(&info->deferred_work); /* Run it immediately */ schedule_delayed_work(&info->deferred_work, 0); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 7bf835f85bc8..eadc894faea2 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -449,14 +449,14 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); retval = p9_client_wstat(fid, &wstat); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } @@ -472,13 +472,13 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; retval = p9_client_fsync(fid, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/affs/file.c b/fs/affs/file.c index 659c579c4588..0548c53f41d5 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -33,11 +33,11 @@ affs_file_release(struct inode *inode, struct file *filp) inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (inode->i_size != AFFS_I(inode)->mmu_private) affs_truncate(inode); affs_free_prealloc(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -958,12 +958,12 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = write_inode_now(inode, 0); err = sync_blockdev(inode->i_sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } const struct file_operations affs_file_operations = { diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 4baf1d2b39e4..d91a9c9cfbd0 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -483,7 +483,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) fl->fl_type = F_UNLCK; - mutex_lock(&vnode->vfs_inode.i_mutex); + inode_lock(&vnode->vfs_inode); /* check local lock records first */ ret = 0; @@ -505,7 +505,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) } error: - mutex_unlock(&vnode->vfs_inode.i_mutex); + inode_unlock(&vnode->vfs_inode); _leave(" = %d [%hd]", ret, fl->fl_type); return ret; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 0714abcd7f32..dfef94f70667 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -693,7 +693,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* use a writeback record as a marker in the queue - when this reaches * the front of the queue, all the outstanding writes are either @@ -735,7 +735,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) afs_put_writeback(wb); _leave(" = %d", ret); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/attr.c b/fs/attr.c index 6530ced19697..25b24d0f6c88 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -195,7 +195,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de struct timespec now; unsigned int ia_valid = attr->ia_valid; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 78f005f37847..3a3ced779fc7 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 3: /* Delete this handler. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); kill_node(e); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: @@ -675,7 +675,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); if (err) { @@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, case 3: /* Delete all handlers. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: diff --git a/fs/block_dev.c b/fs/block_dev.c index ba762ea07f67..2c3aeab17e20 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -346,9 +346,9 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) struct inode *bd_inode = bdev_file_inode(file); loff_t retval; - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); return retval; } @@ -1142,9 +1142,9 @@ void bd_set_size(struct block_device *bdev, loff_t size) { unsigned bsize = bdev_logical_block_size(bdev); - mutex_lock(&bdev->bd_inode->i_mutex); + inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); - mutex_unlock(&bdev->bd_inode->i_mutex); + inode_unlock(bdev->bd_inode); while (bsize < PAGE_CACHE_SIZE) { if (size & bsize) break; @@ -1741,9 +1741,9 @@ static void blkdev_vm_open(struct vm_area_struct *vma) struct inode *bd_inode = bdev_file_inode(vma->vm_file); struct block_device *bdev = I_BDEV(bd_inode); - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); bdev->bd_map_count++; - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); } static void blkdev_vm_close(struct vm_area_struct *vma) @@ -1751,9 +1751,9 @@ static void blkdev_vm_close(struct vm_area_struct *vma) struct inode *bd_inode = bdev_file_inode(vma->vm_file); struct block_device *bdev = I_BDEV(bd_inode); - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); bdev->bd_map_count--; - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); } static const struct vm_operations_struct blkdev_dax_vm_ops = { @@ -1777,7 +1777,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) struct block_device *bdev = I_BDEV(bd_inode); file_accessed(file); - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); bdev->bd_map_count++; if (IS_DAX(bd_inode)) { vma->vm_ops = &blkdev_dax_vm_ops; @@ -1785,7 +1785,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) } else { vma->vm_ops = &blkdev_default_vm_ops; } - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); return 0; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9f5cc1e8e126..098bb8f690c9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1762,17 +1762,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t pos; size_t count; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = generic_write_checks(iocb, from); if (err <= 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -1783,7 +1783,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * to stop this write operation to ensure FS consistency. */ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EROFS; goto out; } @@ -1804,7 +1804,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -1820,7 +1820,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, iocb->ki_pos = pos + num_written; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * We also have to set last_sub_trans to the current log transid, @@ -1909,7 +1909,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -1961,7 +1961,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = start_ordered_ops(inode, start, end); } if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } atomic_inc(&root->log_batch); @@ -2007,7 +2007,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -2031,7 +2031,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } trans->sync = true; @@ -2054,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If any of the ordered extents had an error, just return it to user @@ -2303,7 +2303,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); ret = find_first_non_hole(inode, &offset, &len); if (ret < 0) @@ -2343,7 +2343,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2419,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = btrfs_wait_ordered_range(inode, lockstart, lockend - lockstart + 1); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2574,7 +2574,7 @@ out_only_mutex: ret = btrfs_end_transaction(trans, root); } } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret && !err) err = ret; return err; @@ -2658,7 +2658,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = inode_newsize_ok(inode, alloc_end); if (ret) goto out; @@ -2816,7 +2816,7 @@ out: * So this is completely used as cleanup. */ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_start, alloc_end - alloc_start); @@ -2892,7 +2892,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: case SEEK_CUR: @@ -2901,20 +2901,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: if (offset >= i_size_read(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } ret = find_desired_extent(inode, &offset, whence); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1b79dc9b12e4..e28f3d4691af 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8447,7 +8447,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); relock = true; } ret = btrfs_delalloc_reserve_space(inode, offset, count); @@ -8504,7 +8504,7 @@ out: if (wakeup) inode_dio_end(inode); if (relock) - mutex_lock(&inode->i_mutex); + inode_lock(inode); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9028737ee9b5..952172ca7e45 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -240,7 +240,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ip_oldflags = ip->flags; i_oldflags = inode->i_flags; @@ -358,7 +358,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); return ret; } @@ -881,7 +881,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return error; } @@ -1393,18 +1393,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += cluster; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (newer_than) { if (newer_off == (u64)-1) @@ -1465,9 +1465,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, out_ra: if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (!file) kfree(ra); @@ -2430,7 +2430,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Don't allow to delete a subvolume with send in progress. This is @@ -2543,7 +2543,7 @@ out_up_write: spin_unlock(&dest->root_item_lock); } out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) { d_invalidate(dentry); btrfs_invalidate_inodes(dest); @@ -2559,7 +2559,7 @@ out_unlock_inode: out_dput: dput(dentry); out_unlock_dir: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); out_drop_write: mnt_drop_write_file(file); out: @@ -2857,8 +2857,8 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - mutex_unlock(&inode1->i_mutex); - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode1); + inode_unlock(inode2); } static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) @@ -2866,8 +2866,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) if (inode1 < inode2) swap(inode1, inode2); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode1, I_MUTEX_PARENT); + inode_lock_nested(inode2, I_MUTEX_CHILD); } static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, @@ -3026,7 +3026,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, return 0; if (same_inode) { - mutex_lock(&src->i_mutex); + inode_lock(src); ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) @@ -3101,7 +3101,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, btrfs_cmp_data_free(&cmp); out_unlock: if (same_inode) - mutex_unlock(&src->i_mutex); + inode_unlock(src); else btrfs_double_inode_unlock(src, dst); @@ -3749,7 +3749,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, if (!same_inode) { btrfs_double_inode_lock(src, inode); } else { - mutex_lock(&src->i_mutex); + inode_lock(src); } /* determine range to clone */ @@ -3820,7 +3820,7 @@ out_unlock: if (!same_inode) btrfs_double_inode_unlock(src, inode); else - mutex_unlock(&src->i_mutex); + inode_unlock(src); return ret; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ef6d8fc85853..fd1c4d982463 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3030,7 +3030,7 @@ int prealloc_file_extent_cluster(struct inode *inode, int ret = 0; BUG_ON(cluster->start != cluster->boundary[0]); - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = btrfs_check_data_free_space(inode, cluster->start, cluster->end + 1 - cluster->start); @@ -3057,7 +3057,7 @@ int prealloc_file_extent_cluster(struct inode *inode, btrfs_free_reserved_data_space(inode, cluster->start, cluster->end + 1 - cluster->start); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b1a68530e911..92bf5ee732fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -4279,7 +4279,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, return PTR_ERR(inode); /* Avoid truncate/dio/punch hole.. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode_dio_wait(inode); physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; @@ -4358,7 +4358,7 @@ next_page: } ret = COPY_COMPLETE; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); return ret; } diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fd953c361a43..6c68d6356197 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(mutex_is_locked(&inode->i_mutex)); + ASSERT(inode_is_locked(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, name_len, 0); if (!di) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index afa023dded5b..675a3332d72f 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -446,7 +446,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) return 0; cachefiles_begin_secure(cache, &saved_cred); - mutex_lock(&d_inode(object->backer)->i_mutex); + inode_lock(d_inode(object->backer)); /* if there's an extension to a partial page at the end of the backing * file, we need to discard the partial page so that we pick up new @@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) ret = notify_change(object->backer, &newattrs, NULL); truncate_failed: - mutex_unlock(&d_inode(object->backer)->i_mutex); + inode_unlock(d_inode(object->backer)); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) { diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index c4b893453e0e..1c2334c163dd 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -295,7 +295,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, cachefiles_mark_object_buried(cache, rep, why); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); @@ -306,7 +306,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -423,13 +423,13 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } else { /* we need to check that our parent is _still_ our parent - it @@ -442,7 +442,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore * it */ - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } } @@ -501,7 +501,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); @@ -585,7 +585,7 @@ lookup_again: /* process the next component */ if (key) { _debug("advance"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = next; next = NULL; @@ -623,7 +623,7 @@ lookup_again: /* note that we're now using this object */ ret = cachefiles_mark_object_active(cache, object); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = NULL; @@ -705,7 +705,7 @@ lookup_error: cachefiles_io_error(cache, "Lookup failed"); next = NULL; error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(next); error_out2: dput(dir); @@ -729,7 +729,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); @@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, d_backing_inode(subdir)->i_ino); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); /* we need to make sure the subdir is a directory */ ASSERT(d_backing_inode(subdir)); @@ -800,19 +800,19 @@ check_error: return ERR_PTR(ret); mkdir_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } @@ -837,7 +837,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir, filename); /* look up the victim */ - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); @@ -852,7 +852,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, * at the netfs's request whilst the cull was in progress */ if (d_is_negative(victim)) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); _leave(" = -ENOENT [absent]"); return ERR_PTR(-ENOENT); @@ -881,13 +881,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, object_in_use: read_unlock(&cache->active_lock); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = -EBUSY [in use]"); return ERR_PTR(-EBUSY); lookup_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); if (ret == -ENOENT) { /* file or dir now absent - probably retired by netfs */ @@ -947,7 +947,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return 0; error_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); error: dput(victim); if (ret == -ENOENT) { @@ -982,7 +982,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = 0"); return 0; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a4766ded1ba7..7680e2626815 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -197,7 +197,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, return; /* Avoid multiple racing open requests */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ci->fscache) goto done; @@ -207,7 +207,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci, true); fscache_check_consistency(ci->fscache); done: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c69e1253b47b..cdbf8cf3d52c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2030,7 +2030,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (datasync) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -2046,7 +2046,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 9314b4ea2375..fd11fb231a2e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -507,7 +507,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = -EINVAL; switch (whence) { case SEEK_CUR: @@ -542,7 +542,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index fe02ae7f056a..3b3172357326 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -215,7 +215,7 @@ static int ceph_get_name(struct dentry *parent, char *name, if (IS_ERR(req)) return PTR_ERR(req); - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); req->r_inode = d_inode(child); ihold(d_inode(child)); @@ -224,7 +224,7 @@ static int ceph_get_name(struct dentry *parent, char *name, req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!err) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3c68e6aee2f0..10c5ae79696e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1014,7 +1014,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1070,7 +1070,7 @@ retry_snap: (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { struct ceph_snap_context *snapc; struct iov_iter data; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1097,7 +1097,7 @@ retry_snap: "got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)count); - mutex_lock(&inode->i_mutex); + inode_lock(inode); goto retry_snap; } if (written > 0) @@ -1117,7 +1117,7 @@ retry_snap: iocb->ki_pos = pos + written; if (inode->i_size > old_size) ceph_fscache_update_objectsize(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (written >= 0) { @@ -1147,7 +1147,7 @@ retry_snap: goto out_unlocked; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; @@ -1162,7 +1162,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); @@ -1207,7 +1207,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -1363,7 +1363,7 @@ static long ceph_fallocate(struct file *file, int mode, if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ceph_snap(inode) != CEPH_NOSNAP) { ret = -EROFS; @@ -1418,7 +1418,7 @@ static long ceph_fallocate(struct file *file, int mode, ceph_put_cap_refs(ci, got); unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ceph_free_cap_flush(prealloc_cf); return ret; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c4c1169814b2..e24ca79b140c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -640,9 +640,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) while (*s && *s != sep) s++; - mutex_lock(&dir->i_mutex); + inode_lock(dir); child = lookup_one_len(p, dentry, s - p); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a2752b79e72..ff882aeaccc6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2267,7 +2267,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2292,7 +2292,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2309,7 +2309,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2326,7 +2326,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2672,7 +2672,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); - mutex_lock(&inode->i_mutex); + inode_lock(inode); rc = generic_write_checks(iocb, from); if (rc <= 0) @@ -2685,7 +2685,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) else rc = -EACCES; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (rc > 0) { ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index fda9f4311212..42e731b8c80a 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -427,13 +427,13 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) if (host_file->f_op->iterate) { struct inode *host_inode = file_inode(host_file); - mutex_lock(&host_inode->i_mutex); + inode_lock(host_inode); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { ret = host_file->f_op->iterate(host_file, ctx); file_accessed(host_file); } - mutex_unlock(&host_inode->i_mutex); + inode_unlock(host_inode); return ret; } /* Venus: we must read Venus dirents from a file */ diff --git a/fs/coda/file.c b/fs/coda/file.c index 1da3805f3ddc..f47c7483863b 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -71,12 +71,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) host_file = cfi->cfi_container; file_start_write(host_file); - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); file_end_write(host_file); return ret; } @@ -203,7 +203,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end); if (err) return err; - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); @@ -212,7 +212,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) err = vfs_fsync(host_file, datasync); if (!err && !datasync) err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); return err; } diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index cab612b2ae76..f419519ec41f 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -640,13 +640,13 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; - mutex_lock(&d_inode(child)->i_mutex); + inode_lock(d_inode(child)); configfs_detach_group(sd->s_element); d_inode(child)->i_flags |= S_DEAD; dont_mount(child); - mutex_unlock(&d_inode(child)->i_mutex); + inode_unlock(d_inode(child)); d_delete(child); dput(child); @@ -834,11 +834,11 @@ static int configfs_attach_item(struct config_item *parent_item, * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. */ - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); configfs_remove_dir(item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); } } @@ -874,7 +874,7 @@ static int configfs_attach_group(struct config_item *parent_item, * We must also lock the inode to remove it safely in case of * error, as rmdir() would. */ - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); ret = populate_groups(to_config_group(item)); if (ret) { @@ -883,7 +883,7 @@ static int configfs_attach_group(struct config_item *parent_item, dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (ret) d_delete(dentry); } @@ -1135,7 +1135,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, * subsystem is really registered, and so we need to lock out * configfs_[un]register_subsystem(). */ - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item); if (!subsys_sd) { @@ -1147,7 +1147,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, ret = configfs_do_depend_item(subsys_sd->s_dentry, target); out_unlock_fs: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); /* * If we succeeded, the fs is pinned via other methods. If not, @@ -1230,7 +1230,7 @@ int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys, * additional locking to prevent other subsystem from being * unregistered */ - mutex_lock(&d_inode(root->cg_item.ci_dentry)->i_mutex); + inode_lock(d_inode(root->cg_item.ci_dentry)); /* * As we are trying to depend item from other subsystem @@ -1254,7 +1254,7 @@ out_root_unlock: * We were called from subsystem other than our target so we * took some locks so now it's time to release them */ - mutex_unlock(&d_inode(root->cg_item.ci_dentry)->i_mutex); + inode_unlock(d_inode(root->cg_item.ci_dentry)); return ret; } @@ -1561,7 +1561,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) down_write(&configfs_rename_sem); parent = item->parent->dentry; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { @@ -1577,7 +1577,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); up_write(&configfs_rename_sem); return error; @@ -1590,7 +1590,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) struct configfs_dirent * parent_sd = dentry->d_fsdata; int err; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached @@ -1603,7 +1603,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) else err = 0; } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return err; } @@ -1613,11 +1613,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); spin_unlock(&configfs_dirent_lock); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); release_configfs_dirent(cursor); @@ -1698,7 +1698,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -1706,7 +1706,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -1732,7 +1732,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&configfs_dirent_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } @@ -1767,14 +1767,14 @@ int configfs_register_group(struct config_group *parent_group, parent = parent_group->cg_item.ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); ret = create_default_group(parent_group, group); if (!ret) { spin_lock(&configfs_dirent_lock); configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return ret; } EXPORT_SYMBOL(configfs_register_group); @@ -1791,7 +1791,7 @@ void configfs_unregister_group(struct config_group *group) struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); configfs_detach_prep(dentry, NULL); spin_unlock(&configfs_dirent_lock); @@ -1800,7 +1800,7 @@ void configfs_unregister_group(struct config_group *group) d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); d_delete(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); dput(dentry); @@ -1872,7 +1872,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = root->d_fsdata; link_group(to_config_group(sd->s_element), group); - mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); err = -ENOMEM; dentry = d_alloc_name(root, group->cg_item.ci_name); @@ -1892,7 +1892,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) } } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); if (err) { unlink_group(group); @@ -1913,9 +1913,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) return; } - mutex_lock_nested(&d_inode(root)->i_mutex, + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { @@ -1926,11 +1926,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(dentry); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 3687187c8ea5..33b7ee34eda5 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -540,10 +540,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, CONFIGFS_ITEM_ATTR); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return error; } @@ -562,10 +562,10 @@ int configfs_create_bin_file(struct config_item *item, umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, CONFIGFS_ITEM_BIN_ATTR); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 0cc810e9dccc..cee087d8f7e0 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -255,7 +255,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) /* no inode means this hasn't been made visible yet */ return; - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) break; } } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); } diff --git a/fs/dax.c b/fs/dax.c index 7af879759064..55aa273145a8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -248,10 +248,10 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, pos, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -263,7 +263,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, retval = dax_io(inode, iter, pos, end, get_block, &bh); if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); diff --git a/fs/dcache.c b/fs/dcache.c index b4539e84e577..92d5140de851 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2462,7 +2462,7 @@ EXPORT_SYMBOL(d_rehash); */ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) { - BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); + BUG_ON(!inode_is_locked(dentry->d_parent->d_inode)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ spin_lock(&dentry->d_lock); @@ -2738,7 +2738,7 @@ static int __d_unalias(struct inode *inode, if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; - if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + if (!inode_trylock(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b7fcc0de0b2f..bece948b363d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -265,7 +265,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { dput(dentry); @@ -273,7 +273,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -282,7 +282,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return NULL; @@ -290,7 +290,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); return dentry; } @@ -560,9 +560,9 @@ void debugfs_remove(struct dentry *dentry) if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); ret = __debugfs_remove(dentry, parent); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -594,7 +594,7 @@ void debugfs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); loop: /* * The parent->d_subdirs is protected by the d_lock. Outside that @@ -609,7 +609,7 @@ void debugfs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); parent = child; goto down; } @@ -630,10 +630,10 @@ void debugfs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); child = parent; parent = parent->d_parent; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (child != dentry) /* go up */ @@ -641,7 +641,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c35ffdc12bba..1f107fd51328 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -255,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb) if (!uid_valid(root_uid) || !gid_valid(root_gid)) return -EINVAL; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); /* If we have already created ptmx node, return */ if (fsi->ptmx_dentry) { @@ -292,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return rc; } @@ -615,7 +615,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, sprintf(s, "%d", index); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, s); if (dentry) { @@ -626,7 +626,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, inode = ERR_PTR(-ENOMEM); } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return inode; } @@ -671,7 +671,7 @@ void devpts_pty_kill(struct inode *inode) BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_find_alias(inode); @@ -680,7 +680,7 @@ void devpts_pty_kill(struct inode *inode) dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 602e8441bc0f..1b2f7ffc8b84 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1157,12 +1157,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, iocb->ki_filp->f_mapping; /* will be released by direct_io_worker */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, offset, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); goto out; } @@ -1173,7 +1173,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->i_size = i_size_read(inode); if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { if (dio->flags & DIO_LOCKING) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); retval = 0; goto out; @@ -1295,7 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * of protecting us from looking up uninitialized blocks. */ if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); + inode_unlock(dio->inode); /* * The only time we want to leave bios in flight is when a successful diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 040aa879d634..4e685ac1024d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry) struct dentry *dir; dir = dget_parent(dentry); - mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); return dir; } static void unlock_dir(struct dentry *dir) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); } @@ -397,11 +397,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + inode_lock(d_inode(lower_dir_dentry)); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -426,11 +426,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out; } - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + inode_lock(d_inode(lower_dir_dentry)); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -869,9 +869,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); } return rc; } @@ -970,9 +970,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) lower_ia.ia_valid &= ~ATTR_MODE; - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; @@ -1048,10 +1048,10 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1075,9 +1075,9 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1092,9 +1092,9 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index caba848ac763..c6ced4cbf0cf 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -436,7 +436,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) rc = -ENOMEM; goto out; } - mutex_lock(&lower_inode->i_mutex); + inode_lock(lower_inode); size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, PAGE_CACHE_SIZE); if (size < 0) @@ -444,7 +444,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); - mutex_unlock(&lower_inode->i_mutex); + inode_unlock(lower_inode); if (rc) printk(KERN_ERR "Error whilst attempting to write inode size " "to lower file xattr; rc = [%d]\n", rc); diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 90001da9abfd..c424e4813ec8 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -50,9 +50,9 @@ static ssize_t efivarfs_file_write(struct file *file, d_delete(file->f_path.dentry); dput(file->f_path.dentry); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } bytes = count; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 86a2121828c3..b8a564f29107 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -160,10 +160,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, efivar_entry_size(entry, &size); efivar_entry_add(entry, &efivarfs_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); d_add(dentry, inode); return 0; diff --git a/fs/exec.c b/fs/exec.c index 828ec5f07de0..dcd4ac7d3f1e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1307,13 +1307,13 @@ static void bprm_fill_uid(struct linux_binprm *bprm) return; /* Be careful if suid/sgid is set */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; uid = inode->i_uid; gid = inode->i_gid; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!kuid_has_mapping(bprm->cred->user_ns, uid) || diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 906de66e8e7e..28645f0640f7 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -52,9 +52,9 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_inode_metadata(filp->f_mapping->host, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 714cd37a6ba3..c46f1a190b8d 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -124,10 +124,10 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, int err; parent = ERR_PTR(-EACCES); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("%s: get_parent of %ld failed, err %d\n", @@ -143,9 +143,9 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); tmp = lookup_one_len(nbuf, parent, strlen(nbuf)); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); goto out_err; @@ -503,10 +503,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, */ err = exportfs_get_name(mnt, target_dir, nbuf, result); if (!err) { - mutex_lock(&target_dir->d_inode->i_mutex); + inode_lock(target_dir->d_inode); nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); - mutex_unlock(&target_dir->d_inode->i_mutex); + inode_unlock(target_dir->d_inode); if (!IS_ERR(nresult)) { if (nresult->d_inode) { dput(result); diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 5d46c09863f0..b386af2e45f4 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -51,10 +51,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext2_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -68,7 +68,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) */ if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -80,7 +80,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setflags_out: @@ -102,10 +102,10 @@ setflags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_ctime = CURRENT_TIME_SEC; inode->i_generation = generation; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setversion_out: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1c127213363a..0662b285dc8a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2896,7 +2896,7 @@ do { \ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = newsize; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b52fea3b7219..0ffabaf90aa5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4799,7 +4799,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, else max_blocks -= lblk; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Indirect files do not support unwritten extnets @@ -4902,7 +4902,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, out_dio: ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -4973,7 +4973,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * We only support preallocation for extent-based files only @@ -5006,7 +5006,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) EXT4_I(inode)->i_sync_tid); } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } @@ -5492,7 +5492,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation @@ -5587,7 +5587,7 @@ out_mmap: up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5638,7 +5638,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; @@ -5757,7 +5757,7 @@ out_mmap: up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5792,8 +5792,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); - BUG_ON(!mutex_is_locked(&inode1->i_mutex)); - BUG_ON(!mutex_is_locked(&inode2->i_mutex)); + BUG_ON(!inode_is_locked(inode1)); + BUG_ON(!inode_is_locked(inode2)); *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 749b222e6498..8eb87e3e2752 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -113,7 +113,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_unwritten_wait(inode); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto out; @@ -169,7 +169,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; @@ -186,7 +186,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ret; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (aio_mutex) mutex_unlock(aio_mutex); return ret; @@ -561,11 +561,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) int blkbits; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -613,7 +613,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) dataoff = (loff_t)last << blkbits; } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (dataoff > isize) return -ENXIO; @@ -634,11 +634,11 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) int blkbits; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -689,7 +689,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) break; } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (holeoff > isize) holeoff = isize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d964195ea0e2..83bc8bfb3bea 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3231,7 +3231,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, overwrite = *((int *)iocb->private); if (overwrite) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * We could direct write to holes and fallocate. @@ -3331,7 +3331,7 @@ retake_lock: inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) - mutex_lock(&inode->i_mutex); + inode_lock(inode); return ret; } @@ -3653,7 +3653,7 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, handle_t *handle; loff_t size = i_size_read(inode); - WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(!inode_is_locked(inode)); if (offset > size || offset + len < size) return 0; @@ -3707,7 +3707,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3809,7 +3809,7 @@ out_dio: up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -3879,7 +3879,7 @@ void ext4_truncate(struct inode *inode) * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) - WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2b0cb84255eb..0f6c36922c24 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -330,7 +330,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) return err; err = -EPERM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) goto out_unlock; @@ -381,7 +381,7 @@ out_dirty: out_stop: ext4_journal_stop(handle); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(filp); return err; } @@ -464,9 +464,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext4_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = ext4_ioctl_setflags(inode, flags); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(filp); return err; } @@ -497,7 +497,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -512,7 +512,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext4_journal_stop(handle); unlock_out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); setversion_out: mnt_drop_write_file(filp); return err; @@ -658,9 +658,9 @@ group_add_out: * ext4_ext_swap_inode_data before we switch the * inode format to prevent read. */ - mutex_lock(&(inode->i_mutex)); + inode_lock((inode)); err = ext4_ext_migrate(inode); - mutex_unlock(&(inode->i_mutex)); + inode_unlock((inode)); mnt_drop_write_file(filp); return err; } @@ -876,11 +876,11 @@ encryption_policy_out: flags = ext4_xflags_to_iflags(fa.fsx_xflags); flags = ext4_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); err = ext4_ioctl_setflags(inode, flags); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(filp); if (err) return err; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 854f75de4599..06574dd77614 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2753,7 +2753,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* * Exit early if inode already is on orphan list. This is a big speedup * since we don't have to contend on the global s_orphan_lock. @@ -2835,7 +2835,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 00c98fab6333..3ed01ec011d7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2286,10 +2286,10 @@ static void ext4_orphan_cleanup(struct super_block *sb, __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); - mutex_lock(&inode->i_mutex); + inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); nr_truncates++; } else { if (test_opt(sb, DEBUG)) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ac9e7c6aac74..5c06db17e41f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -794,7 +794,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (start >= isize) @@ -860,7 +860,7 @@ out: if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 18ddb1e5182a..ea272be62677 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -333,7 +333,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -388,10 +388,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -1219,7 +1219,7 @@ static long f2fs_fallocate(struct file *file, int mode, FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) @@ -1243,7 +1243,7 @@ static long f2fs_fallocate(struct file *file, int mode, } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; @@ -1307,13 +1307,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = f2fs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto out; } @@ -1322,7 +1322,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; @@ -1667,7 +1667,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, f2fs_balance_fs(sbi, true); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* writeback all dirty pages in the range */ err = filemap_write_and_wait_range(inode->i_mapping, range->start, @@ -1778,7 +1778,7 @@ do_map: clear_out: clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_CACHE_SHIFT; return err; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 7def96caec5f..d0b95c95079b 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, buf.dirent = dirent; buf.result = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); buf.ctx.pos = file->f_pos; ret = -ENOENT; if (!IS_DEADDIR(inode)) { @@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, short_only, both ? &buf : NULL); file->f_pos = buf.ctx.pos; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret >= 0) ret = buf.result; return ret; diff --git a/fs/fat/file.c b/fs/fat/file.c index 43d3475da83a..f70185668832 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -24,9 +24,9 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; - mutex_lock(&inode->i_mutex); + inode_lock(inode); attr = fat_make_attrs(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return put_user(attr, user_attr); } @@ -47,7 +47,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) err = mnt_want_write_file(file); if (err) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -109,7 +109,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); out: return err; @@ -246,7 +246,7 @@ static long fat_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_KEEP_SIZE) { ondisksize = inode->i_blocks << 9; if ((offset + len) <= ondisksize) @@ -272,7 +272,7 @@ static long fat_fallocate(struct file *file, int mode, } error: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 712601f299b8..4b855b65d457 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -944,7 +944,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!parent) return -ENOENT; - mutex_lock(&parent->i_mutex); + inode_lock(parent); if (!S_ISDIR(parent->i_mode)) goto unlock; @@ -962,7 +962,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { - mutex_lock(&d_inode(entry)->i_mutex); + inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; @@ -983,7 +983,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, clear_nlink(d_inode(entry)); err = 0; badentry: - mutex_unlock(&d_inode(entry)->i_mutex); + inode_unlock(d_inode(entry)); if (!err) d_delete(entry); } else { @@ -992,7 +992,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, dput(entry); unlock: - mutex_unlock(&parent->i_mutex); + inode_unlock(parent); iput(parent); return err; } @@ -1504,7 +1504,7 @@ void fuse_set_nowrite(struct inode *inode) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!mutex_is_locked(&inode->i_mutex)); + BUG_ON(!inode_is_locked(inode)); spin_lock(&fc->lock); BUG_ON(fi->writectr < 0); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aa03aab6a24f..b03d253ece15 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -207,7 +207,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) return err; if (lock_inode) - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = fuse_do_open(fc, get_node_id(inode), file, isdir); @@ -215,7 +215,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) fuse_finish_open(inode, file); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -413,9 +413,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); @@ -450,7 +450,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Start writeback against all dirty pages of the inode, then @@ -486,7 +486,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, err = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -1160,7 +1160,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return generic_file_write_iter(iocb, from); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1210,7 +1210,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } out: current->backing_dev_info = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return written ? written : err; } @@ -1322,10 +1322,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); if (!write) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } while (count) { @@ -1413,14 +1413,14 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) return -EIO; /* Don't allow parallel writes to the same file */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_invalidate_attr(inode); if (res > 0) fuse_write_update_size(inode, iocb->ki_pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -2287,17 +2287,17 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) retval = generic_file_llseek(file, offset, whence); break; case SEEK_END: - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fuse_update_attributes(inode, NULL, file, NULL); if (!retval) retval = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); break; case SEEK_HOLE: case SEEK_DATA: - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fuse_lseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); break; default: retval = -EINVAL; @@ -2944,7 +2944,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return -EOPNOTSUPP; if (lock_inode) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { loff_t endbyte = offset + length - 1; err = filemap_write_and_wait_range(inode->i_mapping, @@ -2990,7 +2990,7 @@ out: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7412863cda1e..c9384f932975 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -914,7 +914,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -946,7 +946,7 @@ out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3e94400d587c..352f958769e1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -2067,7 +2067,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) @@ -2094,7 +2094,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, gfs2_glock_dq_uninit(&gh); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index be6d9c450b22..a39891344259 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -888,7 +888,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); for (qx = 0; qx < num_qd; qx++) { error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); @@ -953,7 +953,7 @@ out_alloc: out: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); kfree(ghs); gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH); return error; @@ -1674,7 +1674,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (error) goto out_put; - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); if (error) goto out_unlockput; @@ -1739,7 +1739,7 @@ out_i: out_q: gfs2_glock_dq_uninit(&q_gh); out_unlockput: - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); out_put: qd_put(qd); return error; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 70788e03820a..e9f2b855f831 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -173,9 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index b99ebddb10cb..6686bf39a5b5 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -570,13 +570,13 @@ static int hfs_file_release(struct inode *inode, struct file *file) if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfs_file_truncate(inode); //if (inode->i_flags & S_DEAD) { // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); // hfs_delete_inode(inode); //} - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* sync the inode to buffers */ ret = write_inode_now(inode, 0); @@ -668,7 +668,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = sync_blockdev(sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d0f39dcbb58e..a4e867e08947 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -284,9 +284,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file) { struct hfsplus_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 19b33f8151f1..1a6394cdb54e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -229,14 +229,14 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb)->hidden_dir, NULL); hfsplus_delete_inode(inode); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -286,7 +286,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Sync inode metadata into the catalog and extent trees. @@ -327,7 +327,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 0624ce4e0702..32a49e292b6a 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -93,7 +93,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) goto out_drop_write; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || inode->i_flags & (S_IMMUTABLE|S_APPEND)) { @@ -126,7 +126,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_drop_write: mnt_drop_write_file(file); out: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cfaa18c7a337..d1abbee281d1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -378,9 +378,9 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = fsync_file(HOSTFS_I(inode)->fd, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index dc540bfcee1d..e57a53c13d86 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,7 +33,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; - mutex_lock(&i->i_mutex); + inode_lock(i); hpfs_lock(s); /*pr_info("dir lseek\n");*/ @@ -48,12 +48,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) ok: filp->f_pos = new_off; hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return new_off; fail: /*pr_warn("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return -ESPIPE; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8bbf7f3e2a27..e1f465a389d5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma_len = (loff_t)(vma->vm_end - vma->vm_start); - mutex_lock(&inode->i_mutex); + inode_lock(inode); file_accessed(file); ret = -ENOMEM; @@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -538,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) hole_end >> PAGE_SHIFT); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -572,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, start = offset >> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); @@ -659,7 +659,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, i_size_write(inode, offset + len); inode->i_ctime = CURRENT_TIME; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/fs/inode.c b/fs/inode.c index e491e54d2430..bb8685220292 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -966,9 +966,9 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) swap(inode1, inode2); if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_lock(&inode1->i_mutex); + inode_lock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); + inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); @@ -980,9 +980,9 @@ EXPORT_SYMBOL(lock_two_nondirectories); void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_unlock(&inode1->i_mutex); + inode_unlock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode2); } EXPORT_SYMBOL(unlock_two_nondirectories); diff --git a/fs/ioctl.c b/fs/ioctl.c index 29466c380958..116a333e9c77 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -434,9 +434,9 @@ int generic_block_fiemap(struct inode *inode, u64 len, get_block_t *get_block) { int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(generic_block_fiemap); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index f509f62e12f6..c5ac5944bc1b 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -39,10 +39,10 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Trigger GC to flush any pending writes for this inode */ jffs2_flush_wbuf_gc(c, inode->i_ino); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 0e026a7bdcd4..4ce7735dd042 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -38,17 +38,17 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } rc |= jfs_commit_inode(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc ? -EIO : 0; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 8db8b7d61e40..8653cac7e12e 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -96,7 +96,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } /* Lock against other parallel changes of flags */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); jfs_get_inode_flags(jfs_inode); oldflags = jfs_inode->mode2; @@ -109,7 +109,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ((flags ^ oldflags) & (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EPERM; goto setflags_out; } @@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) jfs_inode->mode2 = flags; jfs_set_inode_flags(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 900925b5eb8c..4f5d85ba8e23 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -792,7 +792,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - mutex_lock(&inode->i_mutex); + inode_lock(inode); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -824,7 +824,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, } out: if (len == towrite) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } if (inode->i_size < off+len-towrite) @@ -832,7 +832,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return len - towrite; } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 821973853340..996b7742c90b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1511,9 +1511,9 @@ static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, struct inode *inode = file_inode(file); loff_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/libfs.c b/fs/libfs.c index 01491299f348..0ca80b2af420 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close); loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -97,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -124,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&dentry->d_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } EXPORT_SYMBOL(dcache_dir_lseek); @@ -941,7 +941,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; @@ -953,7 +953,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, ret = err; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(__generic_file_fsync); diff --git a/fs/locks.c b/fs/locks.c index af1ed74a657f..7c5f91be9b65 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1650,12 +1650,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * bother, maybe that's a sign this just isn't a good file to * hand out a delegation on. */ - if (is_deleg && !mutex_trylock(&inode->i_mutex)) + if (is_deleg && !inode_trylock(inode)) return -EAGAIN; if (is_deleg && arg == F_WRLCK) { /* Write delegations are not currently supported: */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(1); return -EINVAL; } @@ -1732,7 +1732,7 @@ out: spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); if (is_deleg) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error && !my_fl) *flp = NULL; return error; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 1a6f0167b16a..61eaeb1b6cac 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -204,12 +204,12 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = li->li_flags; flags &= LOGFS_FL_USER_MODIFIABLE; flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; li->li_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME; mark_inode_dirty_sync(inode); @@ -230,11 +230,11 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); logfs_get_wblocks(sb, NULL, WF_LOCK); logfs_write_anchor(sb); logfs_put_wblocks(sb, NULL, WF_LOCK); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/namei.c b/fs/namei.c index bceefd5588a2..f624d132e01e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1629,9 +1629,9 @@ static int lookup_slow(struct nameidata *nd, struct path *path) parent = nd->path.dentry; BUG_ON(nd->inode != parent->d_inode); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = __lookup_hash(&nd->last, parent, nd->flags); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(dentry)) return PTR_ERR(dentry); path->mnt = nd->path.mnt; @@ -2229,10 +2229,10 @@ struct dentry *kern_path_locked(const char *name, struct path *path) putname(filename); return ERR_PTR(-EINVAL); } - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = __lookup_hash(&last, path->dentry, 0); if (IS_ERR(d)) { - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); } putname(filename); @@ -2282,7 +2282,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) unsigned int c; int err; - WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); this.name = name; this.len = len; @@ -2380,9 +2380,9 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (ret) return ret; - mutex_lock(&base->d_inode->i_mutex); + inode_lock(base->d_inode); ret = __lookup_hash(&this, base, 0); - mutex_unlock(&base->d_inode->i_mutex); + inode_unlock(base->d_inode); return ret; } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2463,7 +2463,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) goto done; } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = d_lookup(dir, &nd->last); if (!dentry) { /* @@ -2473,16 +2473,16 @@ mountpoint_last(struct nameidata *nd, struct path *path) */ dentry = d_alloc(dir, &nd->last); if (!dentry) { - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return -ENOMEM; } dentry = lookup_real(dir->d_inode, dentry, nd->flags); if (IS_ERR(dentry)) { - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return PTR_ERR(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); done: if (d_is_negative(dentry)) { @@ -2672,7 +2672,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) struct dentry *p; if (p1 == p2) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } @@ -2680,29 +2680,29 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) p = d_ancestor(p2, p1); if (p) { - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); return p; } p = d_ancestor(p1, p2); if (p) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); return p; } - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { - mutex_unlock(&p1->d_inode->i_mutex); + inode_unlock(p1->d_inode); if (p1 != p2) { - mutex_unlock(&p2->d_inode->i_mutex); + inode_unlock(p2->d_inode); mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } @@ -3141,9 +3141,9 @@ retry_lookup: * dropping this one anyway. */ } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write, opened); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (error <= 0) { if (error) @@ -3489,7 +3489,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * Do the final lookup. */ lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path->dentry, lookup_flags); if (IS_ERR(dentry)) goto unlock; @@ -3518,7 +3518,7 @@ fail: dput(dentry); dentry = ERR_PTR(error); unlock: - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: @@ -3538,7 +3538,7 @@ EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } @@ -3735,7 +3735,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) return -EPERM; dget(dentry); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry)) @@ -3755,7 +3755,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) detach_mounts(dentry); out: - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete(dentry); @@ -3794,7 +3794,7 @@ retry: if (error) goto exit1; - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -3810,7 +3810,7 @@ retry: exit3: dput(dentry); exit2: - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit1: path_put(&path); @@ -3856,7 +3856,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&target->i_mutex); + inode_lock(target); if (is_local_mountpoint(dentry)) error = -EBUSY; else { @@ -3873,7 +3873,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate } } out: - mutex_unlock(&target->i_mutex); + inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { @@ -3916,7 +3916,7 @@ retry: if (error) goto exit1; retry_deleg: - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -3934,7 +3934,7 @@ retry_deleg: exit2: dput(dentry); } - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; @@ -4086,7 +4086,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; @@ -4103,7 +4103,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; @@ -4303,7 +4303,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); else if (target) - mutex_lock(&target->i_mutex); + inode_lock(target); error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) @@ -4356,7 +4356,7 @@ out: if (!is_dir || (flags & RENAME_EXCHANGE)) unlock_two_nondirectories(source, target); else if (target) - mutex_unlock(&target->i_mutex); + inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, old_name, is_dir, diff --git a/fs/namespace.c b/fs/namespace.c index a830e1463704..4fb1691b4355 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1961,9 +1961,9 @@ static struct mountpoint *lock_mount(struct path *path) struct vfsmount *mnt; struct dentry *dentry = path->dentry; retry: - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (unlikely(cant_mount(dentry))) { - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ERR_PTR(-ENOENT); } namespace_lock(); @@ -1974,13 +1974,13 @@ retry: mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return mp; } return mp; } namespace_unlock(); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); path->mnt = mnt; dentry = path->dentry = dget(mnt->mnt_root); @@ -1992,7 +1992,7 @@ static void unlock_mount(struct mountpoint *where) struct dentry *dentry = where->m_dentry; put_mountpoint(where); namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index f0e3e9e747dd..26c2de2de13f 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -369,7 +369,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) if (!res) { struct inode *inode = d_inode(dentry); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { ncp_new_dentry(dentry); val=1; @@ -377,7 +377,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) ncp_dbg(2, "found, but dirEntNum changed\n"); ncp_update_inode2(inode, &finfo); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } finished: @@ -639,9 +639,9 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, } else { struct inode *inode = d_inode(newdent); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode, I_MUTEX_CHILD); ncp_update_inode2(inode, entry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (ctl.idx >= NCP_DIRCACHE_SIZE) { diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 011324ce9df2..dd38ca1f2ecb 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -224,10 +224,10 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos = pos; if (pos > i_size_read(inode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (pos > i_size_read(inode)) i_size_write(inode, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } ncp_dbg(1, "exit %pD2\n", file); outrel: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c82a21228a34..9cce67043f92 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -940,7 +940,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -957,7 +957,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->duped = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -972,9 +972,9 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); - mutex_lock(&inode->i_mutex); + inode_lock(inode); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7ab7ec9f4eed..7a0cfd3266e5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -580,7 +580,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, if (!count) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) goto out_unlock; @@ -608,7 +608,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -622,7 +622,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return result; } @@ -1005,7 +1005,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) @@ -1045,7 +1045,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_CACHE_SHIFT, end); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -1066,7 +1066,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4ef8f5addcad..748bb813b8ec 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -278,9 +278,9 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8e24d886d2c5..86faecf8f328 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -661,9 +661,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs_sync_inode(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err) goto out; } @@ -1178,9 +1178,9 @@ static int __nfs_revalidate_mapping(struct inode *inode, spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); if (may_lock) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_invalidate_mapping(inode, mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } else ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6e8174930a48..bd25dc7077f7 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -101,13 +101,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -123,7 +123,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return -EOPNOTSUPP; nfs_wb_all(inode); - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) @@ -131,7 +131,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 26f9a23e2b25..57ca1c8039c1 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -141,11 +141,11 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by @@ -219,13 +219,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ if (same_inode) { - mutex_lock(&src_inode->i_mutex); + inode_lock(src_inode); } else if (dst_inode < src_inode) { - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dst_inode, I_MUTEX_PARENT); + inode_lock_nested(src_inode, I_MUTEX_CHILD); } else { - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(src_inode, I_MUTEX_PARENT); + inode_lock_nested(dst_inode, I_MUTEX_CHILD); } /* flush all pending writes on both src and dst so that server @@ -246,13 +246,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, out_unlock: if (same_inode) { - mutex_unlock(&src_inode->i_mutex); + inode_unlock(src_inode); } else if (dst_inode < src_inode) { - mutex_unlock(&src_inode->i_mutex); - mutex_unlock(&dst_inode->i_mutex); + inode_unlock(src_inode); + inode_unlock(dst_inode); } else { - mutex_unlock(&dst_inode->i_mutex); - mutex_unlock(&src_inode->i_mutex); + inode_unlock(dst_inode); + inode_unlock(src_inode); } out: return ret; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 819ad812c71b..4cba7865f496 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -55,10 +55,10 @@ nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u struct inode *inode = d_inode(resfh->fh_dentry); int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = security_inode_setsecctx(resfh->fh_dentry, label->data, label->len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (status) /* diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 79f0307a5ec8..dc8ebecf5618 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -192,7 +192,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dir = nn->rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { @@ -213,7 +213,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) out_put: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (status == 0) { if (nn->in_grace) { crp = nfs4_client_to_reclaim(dname, nn); @@ -286,7 +286,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { @@ -302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) list_del(&entry->list); kfree(entry); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); nfs4_reset_creds(original_cred); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { @@ -322,7 +322,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); @@ -335,7 +335,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) out: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return status; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 0770bcb543c8..f84fe6bf9aee 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -288,7 +288,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) } inode = d_inode(dentry); - mutex_lock_nested(&inode->i_mutex, subclass); + inode_lock_nested(inode, subclass); fill_pre_wcc(fhp); fhp->fh_locked = true; } @@ -307,7 +307,7 @@ fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { fill_post_wcc(fhp); - mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); + inode_unlock(d_inode(fhp->fh_dentry)); fhp->fh_locked = false; } } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6739077f17fe..5d2a57e4c03a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -493,9 +493,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); host_error = security_inode_setsecctx(dentry, label->data, label->len); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return nfserrno(host_error); } #else diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 10b22527a617..21a1e2e0d92f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1003,7 +1003,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); @@ -1113,6 +1113,6 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index aba43811d6ef..e8fe24882b5b 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -158,7 +158,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, flags = nilfs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = NILFS_I(inode)->i_flags; @@ -186,7 +186,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, nilfs_mark_inode_dirty(inode); ret = nilfs_transaction_commit(inode->i_sb); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 9e38dafa3bc7..b2eff5816adc 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1509,7 +1509,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(!S_ISDIR(vi->i_mode)); /* If the bitmap attribute inode is in memory sync it, too. */ @@ -1532,7 +1532,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 9d383e5eff0e..bed4d427dfae 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1944,14 +1944,14 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written = 0; ssize_t err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); /* We can write back this queue in page reclaim. */ current->backing_dev_info = inode_to_bdi(vi); err = ntfs_prepare_file_for_write(iocb, from); if (iov_iter_count(from) && !err) written = ntfs_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); if (likely(written > 0)) { err = generic_write_sync(file, iocb->ki_pos, written); if (err < 0) @@ -1996,7 +1996,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) @@ -2015,7 +2015,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c index d80e3315cab0..9793e68ba1dd 100644 --- a/fs/ntfs/quota.c +++ b/fs/ntfs/quota.c @@ -48,7 +48,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_error(vol->sb, "Quota inodes are not open."); return false; } - mutex_lock(&vol->quota_q_ino->i_mutex); + inode_lock(vol->quota_q_ino); ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); if (!ictx) { ntfs_error(vol->sb, "Failed to get index context."); @@ -98,7 +98,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_index_entry_mark_dirty(ictx); set_done: ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); /* * We set the flag so we do not try to mark the quotas out of date * again on remount. @@ -110,7 +110,7 @@ done: err_out: if (ictx) ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); return false; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 2f77f8dfb861..1b38abdaa3ed 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1284,10 +1284,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol) * Find the inode number for the hibernation file by looking up the * filename hiberfil.sys in the root directory. */ - mutex_lock(&vol->root_ino->i_mutex); + inode_lock(vol->root_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, &name); - mutex_unlock(&vol->root_ino->i_mutex); + inode_unlock(vol->root_ino); if (IS_ERR_MREF(mref)) { ret = MREF_ERR(mref); /* If the file does not exist, Windows is not hibernated. */ @@ -1377,10 +1377,10 @@ static bool load_and_init_quota(ntfs_volume *vol) * Find the inode number for the quota file by looking up the filename * $Quota in the extended system files directory $Extend. */ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, quotas are disabled and have @@ -1460,10 +1460,10 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol) * Find the inode number for the transaction log file by looking up the * filename $UsnJrnl in the extended system files directory $Extend. */ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, transaction logging is disabled, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a3ded88718c9..d002579c6f2b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5719,7 +5719,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto bail; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5776,7 +5776,7 @@ int ocfs2_remove_btree_range(struct inode *inode, out_commit: ocfs2_commit_trans(osb, handle); out: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -5832,7 +5832,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); @@ -5980,7 +5980,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); di = (struct ocfs2_dinode *) tl_bh->b_data; @@ -6008,7 +6008,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - mutex_lock(&data_alloc_inode->i_mutex); + inode_lock(data_alloc_inode); status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); if (status < 0) { @@ -6035,7 +6035,7 @@ out_unlock: ocfs2_inode_unlock(data_alloc_inode, 1); out_mutex: - mutex_unlock(&data_alloc_inode->i_mutex); + inode_unlock(data_alloc_inode); iput(data_alloc_inode); out: @@ -6047,9 +6047,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); status = __ocfs2_flush_truncate_log(osb); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6208,7 +6208,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, (unsigned long long)le64_to_cpu(tl_copy->i_blkno), num_recs); - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); for(i = 0; i < num_recs; i++) { if (ocfs2_truncate_log_needs_flush(osb)) { status = __ocfs2_flush_truncate_log(osb); @@ -6239,7 +6239,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, } bail_up: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6346,7 +6346,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { @@ -6395,7 +6395,7 @@ out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); out: while(head) { @@ -6439,7 +6439,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, handle_t *handle; int ret = 0; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); while (head) { if (ocfs2_truncate_log_needs_flush(osb)) { @@ -6471,7 +6471,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); while (head) { /* Premature exit may have left some dangling items. */ @@ -7355,7 +7355,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); if (ret < 0) { @@ -7422,7 +7422,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: return ret; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7f604727f487..794fd1587f34 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2046,9 +2046,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, int ret = 0; unsigned int truncated_clusters; - mutex_lock(&osb->osb_tl_inode->i_mutex); + inode_lock(osb->osb_tl_inode); truncated_clusters = osb->truncated_clusters; - mutex_unlock(&osb->osb_tl_inode->i_mutex); + inode_unlock(osb->osb_tl_inode); /* * Check whether we can succeed in allocating if we free diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ffecf89c8c1c..e1adf285fc31 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, mlog_errno(ret); goto out; } - mutex_lock(&dx_alloc_inode->i_mutex); + inode_lock(dx_alloc_inode); ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); if (ret) { @@ -4410,7 +4410,7 @@ out_unlock: ocfs2_inode_unlock(dx_alloc_inode, 1); out_mutex: - mutex_unlock(&dx_alloc_inode->i_mutex); + inode_unlock(dx_alloc_inode); brelse(dx_alloc_bh); out: iput(dx_alloc_inode); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d63127932509..7cb38fdca229 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1872,7 +1872,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes on other nodes @@ -1991,7 +1991,7 @@ out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -2299,7 +2299,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); relock: /* @@ -2435,7 +2435,7 @@ out: ocfs2_rw_unlock(inode, rw_level); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (written) ret = written; @@ -2547,7 +2547,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_SET: @@ -2585,7 +2585,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret) return ret; return offset; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 97a563bab9a8..36294446d960 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -630,10 +630,10 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); mlog_errno(status); goto bail; @@ -680,7 +680,7 @@ bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode_alloc_inode, 1); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); @@ -751,10 +751,10 @@ static int ocfs2_wipe_inode(struct inode *inode, /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); mlog_errno(status); goto bail; @@ -803,7 +803,7 @@ bail_unlock_dir: return status; ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 16b0bb482ea7..4506ec5ec2ea 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, unsigned oldflags; int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { @@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, bail_unlock: ocfs2_inode_unlock(inode, 1); bail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); brelse(bh); @@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) - mutex_lock(&inode_alloc->i_mutex); + inode_lock(inode_alloc); if (o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); @@ -317,7 +317,7 @@ bail: ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) - mutex_unlock(&inode_alloc->i_mutex); + inode_unlock(inode_alloc); brelse(bh); @@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = ocfs2_inode_lock(gb_inode, &bh, 0); @@ -604,7 +604,7 @@ bail: ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); iput(gb_inode); brelse(bh); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3772a2dbb980..61b833b721d8 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2088,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, return status; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); if (status < 0) { mlog_errno(status); @@ -2106,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, out_cluster: ocfs2_inode_unlock(orphan_dir_inode, 0); out: - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); return status; } @@ -2196,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, oi->ip_next_orphan = NULL; if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2235,7 +2235,7 @@ unlock_inode: unlock_rw: ocfs2_rw_unlock(inode, 1); unlock_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* clear dio flag in ocfs2_inode_info */ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index e9c99e35f5ea..7d62c43a2c3e 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -414,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -468,7 +468,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -506,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, goto bail; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_read_inode_block_full(inode, &alloc_bh, OCFS2_BH_IGNORE_CACHE); @@ -539,7 +539,7 @@ bail: brelse(alloc_bh); if (inode) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); } @@ -571,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -601,7 +601,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); brelse(main_bm_bh); @@ -643,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - mutex_lock(&local_alloc_inode->i_mutex); + inode_lock(local_alloc_inode); /* * We must double check state and allocator bits because @@ -709,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, status = 0; bail: if (status < 0 && local_alloc_inode) { - mutex_unlock(&local_alloc_inode->i_mutex); + inode_unlock(local_alloc_inode); iput(local_alloc_inode); } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 124471d26a73..e3d05d9901a3 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; */ - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -338,7 +338,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock_mutex: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); @@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out; } - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { @@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out_unlock_gb_mutex; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -708,11 +708,11 @@ out_commit: brelse(gd_bh); out_unlock_tl_inode: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ocfs2_inode_unlock(gb_inode, 1); out_unlock_gb_mutex: - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); @@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes from other nodes @@ -969,7 +969,7 @@ out_inode_unlock: out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ab42c38031b1..6b3e87189a64 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1045,7 +1045,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -1664,7 +1664,7 @@ bail: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2121,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, return ret; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (ret < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(ret); @@ -2226,7 +2226,7 @@ out: if (ret) { ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); } @@ -2495,7 +2495,7 @@ out: ocfs2_free_alloc_context(inode_ac); /* Unroll orphan dir locking */ - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); ocfs2_inode_unlock(orphan_dir, 1); iput(orphan_dir); } @@ -2602,7 +2602,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2689,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); ocfs2_free_dir_lookup_result(&orphan_insert); @@ -2721,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, goto bail; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(status); goto bail; @@ -2770,7 +2770,7 @@ bail_commit: bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); iput(orphan_dir_inode); @@ -2834,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { mlog_errno(status); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); goto leave; } @@ -2901,7 +2901,7 @@ out_commit: ocfs2_commit_trans(osb, handle); orphan_unlock: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); leave: diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index fde9ef18cff3..9c9dd30bc945 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) WARN_ON(bh != oinfo->dqi_gqi_bh); spin_unlock(&dq_data_lock); if (ex) { - mutex_lock(&oinfo->dqi_gqinode->i_mutex); + inode_lock(oinfo->dqi_gqinode); down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } else { down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); @@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) { if (ex) { up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); - mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + inode_unlock(oinfo->dqi_gqinode); } else { up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 252119860e6c..3eff031aaf26 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); if (ret) { @@ -867,7 +867,7 @@ out_unlock: } out_mutex: if (alloc_inode) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); } out: @@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out; } - mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(new_inode, I_MUTEX_CHILD); ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, OI_LS_REFLINK_TARGET); if (ret) { @@ -4231,7 +4231,7 @@ inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); out_unlock: - mutex_unlock(&new_inode->i_mutex); + inode_unlock(new_inode); out: if (!ret) { ret = filemap_fdatawait(inode->i_mapping); @@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, return error; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = dquot_initialize(dir); if (!error) error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_create(dir, new_dentry); return error; diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 79b8021302b3..576b9a04873f 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -301,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -375,7 +375,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -486,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -590,7 +590,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index fc6d25f6d444..2f19aeec5482 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) if (ac->ac_which != OCFS2_AC_USE_LOCAL) ocfs2_inode_unlock(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); ac->ac_inode = NULL; @@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, return -EINVAL; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); status = ocfs2_inode_lock(alloc_inode, &bh, 1); if (status < 0) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); mlog_errno(status); @@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); @@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); ocfs2_inode_unlock(inode_alloc_inode, 0); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); brelse(alloc_bh); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f0e241ffd94f..7d3d979f57d9 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2524,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, mlog_errno(ret); goto out; } - mutex_lock(&xb_alloc_inode->i_mutex); + inode_lock(xb_alloc_inode); ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); if (ret < 0) { @@ -2549,7 +2549,7 @@ out_unlock: ocfs2_inode_unlock(xb_alloc_inode, 1); brelse(xb_alloc_bh); out_mutex: - mutex_unlock(&xb_alloc_inode->i_mutex); + inode_unlock(xb_alloc_inode); iput(xb_alloc_inode); out: brelse(blk_bh); @@ -3619,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode, } } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); if (ret < 0) { - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); mlog_errno(ret); goto cleanup; } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, &xbs, &ctxt, ref_meta, &credits); @@ -5460,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, return ret; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5504,7 +5504,7 @@ out_commit: out: ocfs2_schedule_truncate_log_flush(osb, 1); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (meta_ac) ocfs2_free_alloc_context(meta_ac); diff --git a/fs/open.c b/fs/open.c index b25b1542c530..55bdc75e2172 100644 --- a/fs/open.c +++ b/fs/open.c @@ -58,10 +58,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(dentry, &newattrs, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ret; } @@ -510,7 +510,7 @@ static int chmod_common(struct path *path, umode_t mode) if (error) return error; retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; @@ -518,7 +518,7 @@ retry_deleg: newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(path->dentry, &newattrs, &delegated_inode); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) @@ -593,11 +593,11 @@ retry_deleg: if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chown(path, uid, gid); if (!error) error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index eff6319d5037..d894e7cd9a86 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -248,9 +248,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (err) goto out_cleanup; - mutex_lock(&newdentry->d_inode->i_mutex); + inode_lock(newdentry->d_inode); err = ovl_set_attr(newdentry, stat); - mutex_unlock(&newdentry->d_inode->i_mutex); + inode_unlock(newdentry->d_inode); if (err) goto out_cleanup; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 692ceda3bc21..ed95272d57a6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -167,7 +167,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; - mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); err = PTR_ERR(newdentry); @@ -185,7 +185,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, out_dput: dput(newdentry); out_unlock: - mutex_unlock(&udir->i_mutex); + inode_unlock(udir); return err; } @@ -258,9 +258,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - mutex_lock(&opaquedir->d_inode->i_mutex); + inode_lock(opaquedir->d_inode); err = ovl_set_attr(opaquedir, &stat); - mutex_unlock(&opaquedir->d_inode->i_mutex); + inode_unlock(opaquedir->d_inode); if (err) goto out_cleanup; @@ -599,7 +599,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *upper = ovl_dentry_upper(dentry); int err; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); err = -ESTALE; if (upper->d_parent == upperdir) { /* Don't let d_delete() think it can reset d_inode */ @@ -619,7 +619,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) * now. */ d_drop(dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return err; } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index bf996e574f3d..49e204560655 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -63,9 +63,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) { upperdentry = ovl_dentry_upper(dentry); - mutex_lock(&upperdentry->d_inode->i_mutex); + inode_lock(upperdentry->d_inode); err = notify_change(upperdentry, attr, NULL); - mutex_unlock(&upperdentry->d_inode->i_mutex); + inode_unlock(upperdentry->d_inode); } ovl_drop_write(dentry); out: diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index adcb1398c481..fdaf28f75e12 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -228,7 +228,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) dput(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); } revert_creds(old_cred); put_cred(override_cred); @@ -399,7 +399,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) loff_t res; struct ovl_dir_file *od = file->private_data; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); if (!file->f_pos) ovl_dir_reset(file); @@ -429,7 +429,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) res = offset; } out_unlock: - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return res; } @@ -454,10 +454,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); smp_mb__before_spinlock(); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return PTR_ERR(realfile); } od->upperfile = realfile; @@ -467,7 +467,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, fput(realfile); realfile = od->upperfile; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } } @@ -479,9 +479,9 @@ static int ovl_dir_release(struct inode *inode, struct file *file) struct ovl_dir_file *od = file->private_data; if (od->cache) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ovl_cache_put(od, file->f_path.dentry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } fput(od->realfile); if (od->upperfile) @@ -557,7 +557,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; - mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; @@ -575,5 +575,5 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) ovl_cleanup(upper->d_inode, dentry); dput(dentry); } - mutex_unlock(&upper->d_inode->i_mutex); + inode_unlock(upper->d_inode); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d250604f985a..8d826bd56b26 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -229,7 +229,7 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); WARN_ON(oe->__upperdentry); BUG_ON(!upperdentry->d_inode); /* @@ -244,7 +244,7 @@ void ovl_dentry_version_inc(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); oe->version++; } @@ -252,7 +252,7 @@ u64 ovl_dentry_version_get(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); return oe->version; } @@ -375,9 +375,9 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, { struct dentry *dentry; - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = lookup_one_len(name->name, dir, name->len); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -744,7 +744,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt, if (err) return ERR_PTR(err); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); retry: work = lookup_one_len(OVL_WORKDIR_NAME, dentry, strlen(OVL_WORKDIR_NAME)); @@ -770,7 +770,7 @@ retry: goto out_dput; } out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); mnt_drop_write(mnt); return work; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 92e6726f6e37..a939f5ed7f89 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -552,9 +552,9 @@ static int open_kcore(struct inode *inode, struct file *filp) if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, proc_root_kcore->size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index 67e8db442cf0..b6a8d3529fea 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -50,7 +50,7 @@ int proc_setup_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode_pseudo(s); @@ -69,7 +69,7 @@ int proc_setup_self(struct super_block *s) } else { self = ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(self)) { pr_err("proc_fill_super: can't allocate /proc/self\n"); return PTR_ERR(self); diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 9eacd59e0360..e58a31e8fb2a 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -52,7 +52,7 @@ int proc_setup_thread_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *thread_self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); if (thread_self) { struct inode *inode = new_inode_pseudo(s); @@ -71,7 +71,7 @@ int proc_setup_thread_self(struct super_block *s) } else { thread_self = ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(thread_self)) { pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); return PTR_ERR(thread_self); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d8c439d813ce..dc645b66cd79 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -377,7 +377,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, break; } - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, name); if (!dentry) @@ -397,12 +397,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return 0; fail_lockedalloc: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); kfree(private); fail_alloc: iput(inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index fbd70af98820..3c3b81bb6dfe 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -682,9 +682,9 @@ int dquot_quota_sync(struct super_block *sb, int type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - mutex_lock(&dqopt->files[cnt]->i_mutex); + inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); - mutex_unlock(&dqopt->files[cnt]->i_mutex); + inode_unlock(dqopt->files[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2162,12 +2162,12 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_loaded(sb, cnt)) { - mutex_lock(&toputinode[cnt]->i_mutex); + inode_lock(toputinode[cnt]); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, 0); - mutex_unlock(&toputinode[cnt]->i_mutex); + inode_unlock(toputinode[cnt]); mark_inode_dirty_sync(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2258,11 +2258,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more * references can be added @@ -2305,12 +2305,12 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: @@ -2430,9 +2430,9 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - mutex_lock(&d_inode(sb->s_root)->i_mutex); + inode_lock(d_inode(sb->s_root)); dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); - mutex_unlock(&d_inode(sb->s_root)->i_mutex); + inode_unlock(d_inode(sb->s_root)); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/read_write.c b/fs/read_write.c index 06b07d5a08fe..fa05985f700e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -238,7 +238,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); @@ -283,7 +283,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) retval = offset; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); diff --git a/fs/readdir.c b/fs/readdir.c index ced679179cac..e69ef3b79787 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -44,7 +44,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) fsnotify_access(file); file_accessed(file); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return res; } diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 4a024e2ceb9f..3abd4004184b 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -38,11 +38,11 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); reiserfs_write_lock(inode->i_sb); err = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err < 0) return err; return 0; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 96a1bcf33db4..9424a4ba93a9 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -158,7 +158,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); BUG_ON(!S_ISREG(inode->i_mode)); err = sync_mapping_buffers(inode->i_mapping); reiserfs_write_lock(inode->i_sb); @@ -166,7 +166,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (barrier_done < 0) return barrier_done; return (err < 0) ? -EIO : 0; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 6ec8a30a0911..036a1fc0a8c3 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -224,7 +224,7 @@ out_unlock: page_cache_release(page); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); reiserfs_write_unlock(inode->i_sb); return retval; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e5ddb4e5ea94..57e0b2310532 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -64,14 +64,14 @@ #ifdef CONFIG_REISERFS_FS_XATTR static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->create(dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->mkdir(dir, dentry, mode); } @@ -85,11 +85,11 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -100,13 +100,13 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) d_inode(dentry)->i_flags |= S_DEAD; - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -123,7 +123,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) if (d_really_is_negative(privroot)) return ERR_PTR(-ENODATA); - mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) @@ -139,7 +139,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) } } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); return xaroot; } @@ -156,7 +156,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR); xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { @@ -170,7 +170,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) } } - mutex_unlock(&d_inode(xaroot)->i_mutex); + inode_unlock(d_inode(xaroot)); dput(xaroot); return xadir; } @@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, container_of(ctx, struct reiserfs_dentry_buf, ctx); struct dentry *dentry; - WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir))); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; @@ -254,7 +254,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, goto out_dir; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); buf.xadir = dir; while (1) { @@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, break; buf.count = 0; } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); cleanup_dentry_buf(&buf); @@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (!err) { int jerror; - mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, + inode_lock_nested(d_inode(dir->d_parent), I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&d_inode(dir->d_parent)->i_mutex); + inode_unlock(d_inode(dir->d_parent)); err = jerror ?: err; } } @@ -384,7 +384,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (IS_ERR(xadir)) return ERR_CAST(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); xafile = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(xafile)) { err = PTR_ERR(xafile); @@ -404,7 +404,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (err) dput(xafile); out: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); if (err) return ERR_PTR(err); @@ -469,7 +469,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) if (IS_ERR(xadir)) return PTR_ERR(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); dentry = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -483,7 +483,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) dput(dentry); out_dput: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); return err; } @@ -580,11 +580,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); err = reiserfs_setattr(dentry, &newattrs); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); } else update_ctime(inode); out_unlock: @@ -888,9 +888,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) goto out; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (!err) err = buf.pos; @@ -905,7 +905,7 @@ static int create_privroot(struct dentry *dentry) int err; struct inode *inode = d_inode(dentry->d_parent); - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); err = xattr_mkdir(inode, dentry, 0700); if (err || d_really_is_negative(dentry)) { @@ -995,7 +995,7 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&d_inode(s->s_root)->i_mutex); + inode_lock(d_inode(s->s_root)); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { @@ -1005,7 +1005,7 @@ int reiserfs_lookup_privroot(struct super_block *s) d_inode(dentry)->i_flags |= S_PRIVATE; } else err = PTR_ERR(dentry); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); return err; } @@ -1025,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) goto error; if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { - mutex_lock(&d_inode(s->s_root)->i_mutex); + inode_lock(d_inode(s->s_root)); err = create_privroot(REISERFS_SB(s)->priv_root); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); } if (d_really_is_positive(privroot)) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&d_inode(privroot)->i_mutex); + inode_lock(d_inode(privroot)); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; @@ -1043,7 +1043,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) else err = PTR_ERR(dentry); } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); } error: diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index c66f2423e1f5..4a0e48f92104 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -84,9 +84,9 @@ static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umo * the files within the tracefs system. It is up to the individual * mkdir routine to handle races. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = tracefs_ops.mkdir(name); - mutex_lock(&inode->i_mutex); + inode_lock(inode); kfree(name); @@ -109,13 +109,13 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) * This time we need to unlock not only the parent (inode) but * also the directory that is being deleted. */ - mutex_unlock(&inode->i_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(inode); + inode_unlock(dentry->d_inode); ret = tracefs_ops.rmdir(name); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock_nested(inode, I_MUTEX_PARENT); + inode_lock(dentry->d_inode); kfree(name); @@ -334,7 +334,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = tracefs_mount->mnt_root; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); @@ -342,7 +342,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); dput(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; @@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); return dentry; } @@ -544,9 +544,9 @@ void tracefs_remove(struct dentry *dentry) if (!parent || !parent->d_inode) return; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); ret = __tracefs_remove(dentry, parent); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (!ret) simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -572,7 +572,7 @@ void tracefs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); loop: /* * The parent->d_subdirs is protected by the d_lock. Outside that @@ -587,7 +587,7 @@ void tracefs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); parent = child; goto down; } @@ -608,10 +608,10 @@ void tracefs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); child = parent; parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); if (child != dentry) /* go up */ @@ -619,7 +619,7 @@ void tracefs_remove_recursive(struct dentry *dentry) if (!__tracefs_remove(child, parent)) simple_release_fs(&tracefs_mount, &tracefs_mount_count); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); } /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e49bd2808bf3..795992a8321e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -515,8 +515,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) @@ -572,8 +572,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -661,8 +661,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry, inode->i_ino, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = check_dir_empty(c, d_inode(dentry)); if (err) return err; @@ -996,10 +996,10 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu", old_dentry, old_inode->i_ino, old_dir->i_ino, new_dentry, new_dir->i_ino); - ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); - ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + ubifs_assert(inode_is_locked(old_dir)); + ubifs_assert(inode_is_locked(new_dir)); if (unlink) - ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + ubifs_assert(inode_is_locked(new_inode)); if (unlink && is_dir) { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index eff62801acbf..065c88f8e4b8 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1317,7 +1317,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Synchronize the inode unless this is a 'datasync()' call. */ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { @@ -1332,7 +1332,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ err = ubifs_sync_wbufs_by_inode(c, inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e53292d0c21b..c7f4d434d098 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -313,7 +313,7 @@ static int setxattr(struct inode *host, const char *name, const void *value, union ubifs_key key; int err, type; - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; @@ -550,7 +550,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) dbg_gen("xattr '%s', ino %lu ('%pd')", name, host->i_ino, dentry); - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); err = check_namespace(&nm); if (err < 0) diff --git a/fs/udf/file.c b/fs/udf/file.c index bddf3d071dae..1af98963d860 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct udf_inode_info *iinfo = UDF_I(inode); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = generic_write_checks(iocb, from); if (retval <= 0) @@ -136,7 +136,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) (udf_file_entry_alloc_offset(inode) + end)) { err = udf_expand_file_adinicb(inode); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); udf_debug("udf_expand_adinicb: err=%d\n", err); return err; } @@ -149,7 +149,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) retval = __generic_file_write_iter(iocb, from); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval > 0) { mark_inode_dirty(inode); @@ -223,12 +223,12 @@ static int udf_release_file(struct inode *inode, struct file *filp) * Grab i_mutex to avoid races with writes changing i_size * while we are running. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 87dc16d15572..166d3ed32c39 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -262,7 +262,7 @@ int udf_expand_file_adinicb(struct inode *inode) .nr_to_write = 1, }; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; diff --git a/fs/utimes.c b/fs/utimes.c index aa138d64560a..85c40f4f373d 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -103,9 +103,9 @@ static int utimes_common(struct path *path, struct timespec *times) } } retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/xattr.c b/fs/xattr.c index d5dd6c8b82a7..07d0e47f6a7f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -129,7 +129,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_setxattr(dentry, name, value, size, flags); if (error) goto out; @@ -137,7 +137,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, error = __vfs_setxattr_noperm(dentry, name, value, size, flags); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); @@ -277,7 +277,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_removexattr(dentry, name); if (error) goto out; @@ -290,7 +290,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ebe9b8290a70..bb2b8f354041 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -55,7 +55,7 @@ xfs_rw_ilock( int type) { if (type & XFS_IOLOCK_EXCL) - mutex_lock(&VFS_I(ip)->i_mutex); + inode_lock(VFS_I(ip)); xfs_ilock(ip, type); } @@ -66,7 +66,7 @@ xfs_rw_iunlock( { xfs_iunlock(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } static inline void @@ -76,7 +76,7 @@ xfs_rw_ilock_demote( { xfs_ilock_demote(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } /* diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index dc6221942b85..ade236e90bb3 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -42,11 +42,11 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; if (with_imutex) - mutex_lock(&inode->i_mutex); + inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/include/linux/fs.h b/include/linux/fs.h index eb73d74ed992..2df6c033c3f5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -714,6 +714,31 @@ enum inode_i_mutex_lock_class I_MUTEX_PARENT2, }; +static inline void inode_lock(struct inode *inode) +{ + mutex_lock(&inode->i_mutex); +} + +static inline void inode_unlock(struct inode *inode) +{ + mutex_unlock(&inode->i_mutex); +} + +static inline int inode_trylock(struct inode *inode) +{ + return mutex_trylock(&inode->i_mutex); +} + +static inline int inode_is_locked(struct inode *inode) +{ + return mutex_is_locked(&inode->i_mutex); +} + +static inline void inode_lock_nested(struct inode *inode, unsigned subclass) +{ + mutex_lock_nested(&inode->i_mutex, subclass); +} + void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); @@ -3047,8 +3072,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) } static inline bool dir_relax(struct inode *inode) { - mutex_unlock(&inode->i_mutex); - mutex_lock(&inode->i_mutex); + inode_unlock(inode); + inode_lock(inode); return !IS_DEADDIR(inode); } diff --git a/ipc/mqueue.c b/ipc/mqueue.c index f4617cf07069..781c1399c6a3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -795,7 +795,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, ro = mnt_want_write(mnt); /* we'll drop it in any case */ error = 0; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); path.dentry = lookup_one_len(name->name, root, strlen(name->name)); if (IS_ERR(path.dentry)) { error = PTR_ERR(path.dentry); @@ -841,7 +841,7 @@ out_putfd: put_unused_fd(fd); fd = error; } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); if (!ro) mnt_drop_write(mnt); out_putname: @@ -866,7 +866,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = mnt_want_write(mnt); if (err) goto out_name; - mutex_lock_nested(&d_inode(mnt->mnt_root)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT); dentry = lookup_one_len(name->name, mnt->mnt_root, strlen(name->name)); if (IS_ERR(dentry)) { @@ -884,7 +884,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) dput(dentry); out_unlock: - mutex_unlock(&d_inode(mnt->mnt_root)->i_mutex); + inode_unlock(d_inode(mnt->mnt_root)); if (inode) iput(inode); mnt_drop_write(mnt); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 27c6046c2c3d..f84f8d06e1f6 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa if (IS_ERR(dentry)) return (void *)dentry; /* returning an error */ inode = path.dentry->d_inode; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..9f194aad0adc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex); + inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d_backing_inode(d)->i_sb->s_dev; diff --git a/kernel/events/core.c b/kernel/events/core.c index c0957416b32e..06ae52e99ac2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4872,9 +4872,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval < 0) return retval; diff --git a/kernel/relay.c b/kernel/relay.c index 0b4570cfacae..074994bcfa9b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&file_inode(filp)->i_mutex); + inode_lock(file_inode(filp)); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&file_inode(filp)->i_mutex); + inode_unlock(file_inode(filp)); return desc->written; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44253adb3c36..63d3a24e081a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -222,9 +222,9 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); - mutex_lock(&inode->i_mutex); + inode_lock(inode); i = sched_feat_set(cmp); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (i == __SCHED_FEAT_NR) return -EINVAL; diff --git a/mm/filemap.c b/mm/filemap.c index 847ee43c2806..30ab120b33db 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2684,11 +2684,11 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; ssize_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; diff --git a/mm/shmem.c b/mm/shmem.c index fa2ceb2d2655..38c5e72c7008 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1902,7 +1902,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We're holding i_mutex so we can access i_size directly */ if (offset < 0) @@ -1926,7 +1926,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -2091,7 +2091,7 @@ int shmem_add_seals(struct file *file, unsigned int seals) if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (info->seals & F_SEAL_SEAL) { error = -EPERM; @@ -2114,7 +2114,7 @@ int shmem_add_seals(struct file *file, unsigned int seals) error = 0; unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(shmem_add_seals); @@ -2164,7 +2164,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; @@ -2277,7 +2277,7 @@ undone: inode->i_private = NULL; spin_unlock(&inode->i_lock); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/mm/swapfile.c b/mm/swapfile.c index c43f654a7b64..d2c37365e2d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1956,9 +1956,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } filp_close(swap_file, NULL); @@ -2183,7 +2183,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (IS_SWAPFILE(inode)) return -EBUSY; } else @@ -2416,7 +2416,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mapping = swap_file->f_mapping; inode = mapping->host; - /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */ error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; @@ -2561,7 +2561,7 @@ bad_swap: vfree(cluster_info); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode = NULL; } filp_close(swap_file, NULL); @@ -2574,7 +2574,7 @@ out: if (name) putname(name); if (inode && S_ISREG(inode->i_mode)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 5e4f815c2b34..2b32fd602669 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -771,7 +771,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (count == 0) return 0; - mutex_lock(&inode->i_mutex); /* protect against multiple concurrent + inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: spin_lock(&queue_lock); @@ -784,7 +784,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (rp->q.list.next == &cd->queue) { spin_unlock(&queue_lock); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } @@ -838,7 +838,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (err == -EAGAIN) goto again; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err ? err : count; } @@ -909,9 +909,9 @@ static ssize_t cache_write(struct file *filp, const char __user *buf, if (!cd->cache_parse) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = cache_downcall(mapping, buf, count, cd); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return ret; } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 14f45bf0410c..31789ef3e614 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -172,7 +172,7 @@ rpc_close_pipes(struct inode *inode) int need_release; LIST_HEAD(free_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); spin_lock(&pipe->lock); need_release = pipe->nreaders != 0 || pipe->nwriters != 0; pipe->nreaders = 0; @@ -188,7 +188,7 @@ rpc_close_pipes(struct inode *inode) cancel_delayed_work_sync(&pipe->queue_timeout); rpc_inode_setowner(inode, NULL); RPC_I(inode)->pipe = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } static struct inode * @@ -221,7 +221,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) int first_open; int res = -ENXIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -237,7 +237,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) pipe->nwriters++; res = 0; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -248,7 +248,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) struct rpc_pipe_msg *msg; int last_close; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -278,7 +278,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (last_close && pipe->ops->release_pipe) pipe->ops->release_pipe(inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } @@ -290,7 +290,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) struct rpc_pipe_msg *msg; int res = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { res = -EPIPE; @@ -322,7 +322,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) pipe->ops->destroy_msg(msg); } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -332,11 +332,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of struct inode *inode = file_inode(filp); int res; - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = -EPIPE; if (RPC_I(inode)->pipe != NULL) res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -349,12 +349,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) poll_wait(filp, &rpci->waitq, wait); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (rpci->pipe == NULL) mask |= POLLERR | POLLHUP; else if (filp->private_data || !list_empty(&rpci->pipe->pipe)) mask |= POLLIN | POLLRDNORM; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return mask; } @@ -367,10 +367,10 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -EPIPE; } spin_lock(&pipe->lock); @@ -381,7 +381,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) len += msg->len - msg->copied; } spin_unlock(&pipe->lock); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return put_user(len, (int __user *)arg); default: return -EINVAL; @@ -617,9 +617,9 @@ int rpc_rmdir(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -701,9 +701,9 @@ static void rpc_depopulate(struct dentry *parent, { struct inode *dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dir, I_MUTEX_CHILD); __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); } static int rpc_populate(struct dentry *parent, @@ -715,7 +715,7 @@ static int rpc_populate(struct dentry *parent, struct dentry *dentry; int i, err; - mutex_lock(&dir->i_mutex); + inode_lock(dir); for (i = start; i < eof; i++) { dentry = __rpc_lookup_create_exclusive(parent, files[i].name); err = PTR_ERR(dentry); @@ -739,11 +739,11 @@ static int rpc_populate(struct dentry *parent, if (err != 0) goto out_bad; } - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return 0; out_bad: __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); printk(KERN_WARNING "%s: %s failed to populate directory %pd\n", __FILE__, __func__, parent); return err; @@ -757,7 +757,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, struct inode *dir = d_inode(parent); int error; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -770,7 +770,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, goto err_rmdir; } out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; err_rmdir: __rpc_rmdir(dir, dentry); @@ -788,11 +788,11 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); if (depopulate != NULL) depopulate(dentry); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -828,7 +828,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (pipe->ops->downcall == NULL) umode &= ~S_IWUGO; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -837,7 +837,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (err) goto out_err; out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; out_err: dentry = ERR_PTR(err); @@ -865,9 +865,9 @@ rpc_unlink(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmpipe(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } diff --git a/security/inode.c b/security/inode.c index 16622aef9bde..28414b0207ce 100644 --- a/security/inode.c +++ b/security/inode.c @@ -99,7 +99,7 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode, dir = d_inode(parent); - mutex_lock(&dir->i_mutex); + inode_lock(dir); dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) goto out; @@ -129,14 +129,14 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode, } d_instantiate(dentry, inode); dget(dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; out1: dput(dentry); dentry = ERR_PTR(error); out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); simple_release_fs(&mount, &mount_count); return dentry; } @@ -195,7 +195,7 @@ void securityfs_remove(struct dentry *dentry) if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(d_inode(parent), dentry); @@ -203,7 +203,7 @@ void securityfs_remove(struct dentry *dentry) simple_unlink(d_inode(parent), dentry); dput(dentry); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); simple_release_fs(&mount, &mount_count); } EXPORT_SYMBOL_GPL(securityfs_remove); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index c21f09bf8b99..9d96551d0196 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -121,7 +121,7 @@ static void ima_check_last_writer(struct integrity_iint_cache *iint, if (!(mode & FMODE_WRITE)) return; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (atomic_read(&inode->i_writecount) == 1) { if ((iint->version != inode->i_version) || (iint->flags & IMA_NEW_FILE)) { @@ -130,7 +130,7 @@ static void ima_check_last_writer(struct integrity_iint_cache *iint, ima_update_xattr(iint, file); } } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } /** @@ -186,7 +186,7 @@ static int process_measurement(struct file *file, int mask, int function, if (action & IMA_FILE_APPRAISE) function = FILE_CHECK; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (action) { iint = integrity_inode_get(inode); @@ -250,7 +250,7 @@ out_free: if (pathbuf) __putname(pathbuf); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if ((rc && must_appraise) && (ima_appraise & IMA_APPRAISE_ENFORCE)) return -EACCES; return 0; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 732c1c77dccd..1b1fd27de632 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -380,9 +380,9 @@ static int sel_open_policy(struct inode *inode, struct file *filp) goto err; if (i_size_read(inode) != security_policydb_len()) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, security_policydb_len()); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } rc = security_read_policy(&plm->data, &plm->len); -- cgit v1.3-14-g43fede From b4abf91047cf054f203dcfac97e1038388826937 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jan 2016 11:25:38 +0100 Subject: rtmutex: Make wait_lock irq safe Sasha reported a lockdep splat about a potential deadlock between RCU boosting rtmutex and the posix timer it_lock. CPU0 CPU1 rtmutex_lock(&rcu->rt_mutex) spin_lock(&rcu->rt_mutex.wait_lock) local_irq_disable() spin_lock(&timer->it_lock) spin_lock(&rcu->mutex.wait_lock) --> Interrupt spin_lock(&timer->it_lock) This is caused by the following code sequence on CPU1 rcu_read_lock() x = lookup(); if (x) spin_lock_irqsave(&x->it_lock); rcu_read_unlock(); return x; We could fix that in the posix timer code by keeping rcu read locked across the spinlocked and irq disabled section, but the above sequence is common and there is no reason not to support it. Taking rt_mutex.wait_lock irq safe prevents the deadlock. Reported-by: Sasha Levin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney --- kernel/futex.c | 18 +++---- kernel/locking/rtmutex.c | 135 +++++++++++++++++++++++++---------------------- 2 files changed, 81 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0773f2b23b10..5d6ce6413ef1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1191,7 +1191,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (pi_state->owner != current) return -EINVAL; - raw_spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -1217,22 +1217,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else if (curval != uval) ret = -EINVAL; if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); - raw_spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - raw_spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); @@ -2127,11 +2127,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * we returned due to timeout or signal without taking the * rt_mutex. Too late. */ - raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); if (!owner) owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8251e75dd9c0..3e746607abe5 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); clear_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* * If a new waiter comes in between the unlock and the cmpxchg * we have two situations: @@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; } #endif @@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int ret = 0, depth = 0; struct rt_mutex *lock; bool detect_deadlock; - unsigned long flags; bool requeue = true; detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); @@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * [1] Task cannot go away as we did a get_task() before ! */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irq(&task->pi_lock); /* * [2] Get the waiter on which @task is blocked on. @@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * operations. */ if (!raw_spin_trylock(&lock->wait_lock)) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); cpu_relax(); goto retry; } @@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * No requeue[7] here. Just release @task [8] */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * If there is no owner of the lock, end of chain. */ if (!rt_mutex_owner(lock)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* * No requeue [11] here. We just do deadlock detection. @@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* If owner is not blocked, end of chain. */ if (!next_lock) @@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) wake_up_process(rt_mutex_top_waiter(lock)->task); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { @@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop the locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* * Make the actual exit decisions [12], based on the stored @@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); out_put_task: put_task_struct(task); @@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Try to take an rt-mutex * - * Must be called with lock->wait_lock held. + * Must be called with lock->wait_lock held and interrupts disabled * * @lock: The lock to be acquired. * @task: The task which wants to acquire the lock @@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { - unsigned long flags; - /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * case, but conditionals are more expensive than a redundant * store. */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); task->pi_blocked_on = NULL; /* * Finish the lock acquisition. @task is the new owner. If @@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); takeit: /* We got the lock. */ @@ -883,7 +882,7 @@ takeit: * * Prepare waiter and propagate pi chain * - * This must be called with lock->wait_lock held. + * This must be called with lock->wait_lock held and interrupts disabled */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *top_waiter = waiter; struct rt_mutex *next_lock; int chain_walk = 0, res; - unsigned long flags; /* * Early deadlock detection. We really don't want the task to @@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (owner == task) return -EDEADLK; - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); if (!owner) return 0; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); @@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Even if full deadlock detection is on, if the owner is not * blocked itself, we can avoid finding this out in the chain @@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); return res; } @@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * Remove the top waiter from the current tasks pi waiter tree and * queue it up. * - * Called with lock->wait_lock held. + * Called with lock->wait_lock held and interrupts disabled. */ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); wake_q_add(wake_q, waiter->task); } @@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, /* * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held and + * Must be called with lock->wait_lock held and interrupts disabled. I must * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, @@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock, bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); /* * Only update priority if the waiter was the highest priority @@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock, if (!owner || !is_top_waiter) return; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); rt_mutex_dequeue_pi(owner, waiter); @@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Don't walk the chain, if the owner task is not blocked @@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, next_lock, NULL, current); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); } /* @@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task) * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * - * lock->wait_lock must be held by the caller. + * Must be called with lock->wait_lock held and interrupts disabled */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, @@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); schedule(); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); } @@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; + unsigned long flags; int ret = 0; debug_rt_mutex_init_waiter(&waiter); RB_CLEAR_NODE(&waiter.pi_tree_entry); RB_CLEAR_NODE(&waiter.tree_entry); - raw_spin_lock(&lock->wait_lock); + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } @@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret; /* @@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) return 0; /* - * The mutex has currently no owner. Lock the wait lock and - * try to acquire the lock. + * The mutex has currently no owner. Lock the wait lock and try to + * acquire the lock. We use irqsave here to support early boot calls. */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) { - raw_spin_lock(&lock->wait_lock); + unsigned long flags; + + /* irqsave required to support early boot calls */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ - if (unlock_rt_mutex_safe(lock) == true) + if (unlock_rt_mutex_safe(lock, flags) == true) return false; /* Relock the rtmutex and try again */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); } /* @@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ mark_wakeup_next_waiter(wake_q, lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* check PI boosting */ return true; @@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 1; } @@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return ret; } -- cgit v1.3-14-g43fede From 530cbe100ef7587aa5b5ac3a4b670cda4d50e598 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Jan 2016 13:52:25 +0000 Subject: irqdomain: Allow domain lookup with DOMAIN_BUS_WIRED token Let's take the (outlandish) example of an interrupt controller capable of handling both wired interrupts and PCI MSIs. With the current code, the PCI MSI domain is going to be tagged with DOMAIN_BUS_PCI_MSI, and the wired domain with DOMAIN_BUS_ANY. Things get hairy when we start looking up the domain for a wired interrupt (typically when creating it based on some firmware information - DT or ACPI). In irq_create_fwspec_mapping(), we perform the lookup using DOMAIN_BUS_ANY, which is actually used as a wildcard. This gives us one chance out of two to end up with the wrong domain, and we try to configure a wired interrupt with the MSI domain. Everything grinds to a halt pretty quickly. What we really need to do is to start looking for a domain that would uniquely identify a wired interrupt domain, and only use DOMAIN_BUS_ANY as a fallback. In order to solve this, let's introduce a new DOMAIN_BUS_WIRED token, which is going to be used exactly as described above. Of course, this depends on the irqchip to setup the domain bus_token, and nobody had to implement this so far. Only so far. Signed-off-by: Marc Zyngier Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Frank Rowand Cc: Grant Likely Cc: Thomas Petazzoni Cc: Jiang Liu Link: http://lkml.kernel.org/r/1453816347-32720-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 1 + kernel/irq/irqdomain.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f64622ad02c1..04579d9fbce4 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -70,6 +70,7 @@ struct irq_fwspec { */ enum irq_domain_bus_token { DOMAIN_BUS_ANY = 0, + DOMAIN_BUS_WIRED, DOMAIN_BUS_PCI_MSI, DOMAIN_BUS_PLATFORM_MSI, DOMAIN_BUS_NEXUS, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8cf95de1ab3f..d75179735a28 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -575,10 +575,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) unsigned int type = IRQ_TYPE_NONE; int virq; - if (fwspec->fwnode) - domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); - else + if (fwspec->fwnode) { + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_WIRED); + if (!domain) + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_ANY); + } else { domain = irq_default_domain; + } if (!domain) { pr_warn("no irq domain found for %s !\n", -- cgit v1.3-14-g43fede From 7809998ab1af22602a8463845108edc49dfb9ef0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Jan 2016 16:41:49 +0100 Subject: tick/sched: Hide unused oneshot timer code A couple of functions in kernel/time/tick-sched.c are only relevant for oneshot timer mode, i.e. when hires-timers or nohz mode are enabled. If both are disabled, we get gcc warnings about them: kernel/time/tick-sched.c:98:16: warning: 'tick_init_jiffy_update' defined but not used [-Wunused-function] static ktime_t tick_init_jiffy_update(void) ^ kernel/time/tick-sched.c:112:13: warning: 'tick_sched_do_timer' defined but not used [-Wunused-function] static void tick_sched_do_timer(ktime_t now) ^ kernel/time/tick-sched.c:134:13: warning: 'tick_sched_handle' defined but not used [-Wunused-function] static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) ^ This encloses the whole set of functions in an appropriate ifdef to avoid the warning and to make it clearer when they are used. Signed-off-by: Arnd Bergmann Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1453736525-1959191-1-git-send-email-arnd@arndb.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7ea28ed3109d..cbe5d8dcf15a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -36,16 +36,17 @@ */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); -/* - * The time, when the last jiffy update happened. Protected by jiffies_lock. - */ -static ktime_t last_jiffies_update; - struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +/* + * The time, when the last jiffy update happened. Protected by jiffies_lock. + */ +static ktime_t last_jiffies_update; + /* * Must be called with interrupts disabled ! */ @@ -151,6 +152,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } +#endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; -- cgit v1.3-14-g43fede From 1ca8ec532fc2d986f1f4a319857bb18e0c9739b4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 27 Jan 2016 19:26:07 +0800 Subject: tick/nohz: Set the correct expiry when switching to nohz/lowres mode commit 0ff53d096422 sets the next tick interrupt to the last jiffies update, i.e. in the past, because the forward operation is invoked before the set operation. There is no resulting damage (yet), but we get an extra pointless tick interrupt. Revert the order so we get the next tick interrupt in the future. Fixes: commit 0ff53d096422 "tick: sched: Force tick interrupt and get rid of softirq magic" Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1453893967-3458-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cbe5d8dcf15a..de2d9fef6ea6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -995,9 +995,9 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); - tick_program_event(next, 1); + hrtimer_forward_now(&ts->sched_timer, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } -- cgit v1.3-14-g43fede From 103502a35cfce0710909da874f092cb44823ca03 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 26 Dec 2015 06:00:48 +0100 Subject: seccomp: always propagate NO_NEW_PRIVS on tsync Before this patch, a process with some permissive seccomp filter that was applied by root without NO_NEW_PRIVS was able to add more filters to itself without setting NO_NEW_PRIVS by setting the new filter from a throwaway thread with NO_NEW_PRIVS. Signed-off-by: Jann Horn Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- kernel/seccomp.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d4024f..15a1795bbba1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ - if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { - /* - * Don't let an unprivileged task work around - * the no_new_privs restriction by creating - * a thread that sets it up, enters seccomp, - * then dies. - */ - if (task_no_new_privs(caller)) - task_set_no_new_privs(thread); - + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); - } } } -- cgit v1.3-14-g43fede From 993e9fe1e70aedbb6e9670211f1ad56beea8373d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Jan 2016 16:48:28 +0100 Subject: PM: APM_EMULATION does not depend on PM The APM emulation code does multiple things, and some of them depend on PM_SLEEP, while the battery management does not. However, selecting the symbol like SHARPSL_PM does causes a Kconfig warning: warning: (SHARPSL_PM && PMAC_APM_EMU) selects APM_EMULATION which has unmet direct dependencies (PM && SYS_SUPPORTS_APM_EMULATION) From all I can tell, this is completely harmless, and we can simply allow APM_EMULATION to be enabled here, even if PM is not. Signed-off-by: Arnd Bergmann Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 02e8dfaa1ce2..68d3ebc12601 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -235,7 +235,7 @@ config PM_TRACE_RTC config APM_EMULATION tristate "Advanced Power Management Emulation" - depends on PM && SYS_SUPPORTS_APM_EMULATION + depends on SYS_SUPPORTS_APM_EMULATION help APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with -- cgit v1.3-14-g43fede From 78cd2c748f459739ff864dd9308c0f6caf7f6e41 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jan 2016 14:08:45 +0100 Subject: perf: Fix orphan hole We should set event->owner before we install the event, otherwise there is a hole where the target task can fork() and we'll not inherit the event because it thinks the event is orphaned. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9de4d352ba8c..6759f2a332d7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8489,6 +8489,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__header_size(event); perf_event__id_header_size(event); + event->owner = current; + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8498,8 +8500,6 @@ SYSCALL_DEFINE5(perf_event_open, put_online_cpus(); - event->owner = current; - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); -- cgit v1.3-14-g43fede From 6a3351b612b72c558910c88a43e2ef6d7d68bc97 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jan 2016 14:09:54 +0100 Subject: perf: Fix race in perf_event_exit_task_context() There is a race between perf_event_exit_task_context() and orphans_remove_work() which results in a use-after-free. We mark ctx->task with TASK_TOMBSTONE to indicate a context is 'dead', under ctx->lock. After which point event_function_call() on any event of that context will NOP A concurrent orphans_remove_work() will only hold ctx->mutex for the list iteration and not serialize against this. Therefore its possible that orphans_remove_work()'s perf_remove_from_context() call will fail, but we'll continue to free the event, with the result of free'd memory still being on lists and everything. Once perf_event_exit_task_context() gets around to acquiring ctx->mutex it too will iterate the event list, encounter the already free'd event and proceed to free it _again_. This fails with the WARN in free_event(). Plug the race by having perf_event_exit_task_context() hold ctx::mutex over the whole tear-down, thereby 'naturally' serializing against all other sites, including the orphan work. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: alexander.shishkin@linux.intel.com Cc: dsahern@gmail.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/20160125130954.GY6357@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6759f2a332d7..1d243fadfd12 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8748,14 +8748,40 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; - unsigned long flags; WARN_ON_ONCE(child != current); - child_ctx = perf_lock_task_context(child, ctxn, &flags); + child_ctx = perf_pin_task_context(child, ctxn); if (!child_ctx) return; + /* + * In order to reduce the amount of tricky in ctx tear-down, we hold + * ctx::mutex over the entire thing. This serializes against almost + * everything that wants to access the ctx. + * + * The exception is sys_perf_event_open() / + * perf_event_create_kernel_count() which does find_get_context() + * without ctx::mutex (it cannot because of the move_group double mutex + * lock thing). See the comments in perf_install_in_context(). + * + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * put_event() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock(&child_ctx->mutex); + + /* + * In a single ctx::lock section, de-schedule the events and detach the + * context from the task such that we cannot ever get it scheduled back + * in. + */ + raw_spin_lock_irq(&child_ctx->lock); task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); /* @@ -8767,14 +8793,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the events from it. - */ clone_ctx = unclone_ctx(child_ctx); - update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -8786,18 +8806,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ perf_event_task(child, child_ctx, 0); - /* - * We can recurse on the same lock type through: - * - * __perf_event_exit_task() - * sync_child_event() - * put_event() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock(&child_ctx->mutex); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) __perf_event_exit_task(child_event, child_ctx, child); -- cgit v1.3-14-g43fede From 828b6f0e26170938d617e99a17177453be4d77a3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Jan 2016 21:59:04 +0100 Subject: perf: Fix NULL deref Dan reported: 1229 if (ctx->task == TASK_TOMBSTONE || 1230 !atomic_inc_not_zero(&ctx->refcount)) { 1231 raw_spin_unlock(&ctx->lock); 1232 ctx = NULL; ^^^^^^^^^^ ctx is NULL. 1233 } 1234 1235 WARN_ON_ONCE(ctx->task != task); ^^^^^^^^^^^^^^^^^ The patch adds a NULL dereference. Reported-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 63b6da39bb38 ("perf: Fix perf_event_exit_task() race") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1d243fadfd12..fe97f95f204e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1230,9 +1230,9 @@ retry: !atomic_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; + } else { + WARN_ON_ONCE(ctx->task != task); } - - WARN_ON_ONCE(ctx->task != task); } rcu_read_unlock(); if (!ctx) -- cgit v1.3-14-g43fede From e03e7ee34fdd1c3ef494949a75cb8c61c7265fa9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Jan 2016 20:59:49 -0800 Subject: perf/bpf: Convert perf_event_array to use struct file Robustify refcounting. Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160126045947.GA40151@ast-mbp.thefacebook.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++-- kernel/bpf/arraymap.c | 21 +++++++++++---------- kernel/events/core.c | 21 ++++++++++----------- kernel/trace/bpf_trace.c | 14 ++++++++++---- 4 files changed, 33 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6612732d8fd0..4f90434b8d64 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -729,7 +729,7 @@ extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); -extern struct perf_event *perf_event_get(unsigned int fd); +extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); @@ -1070,7 +1070,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } -static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b0799bced518..89ebbc4d1164 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) { struct perf_event *event; const struct perf_event_attr *attr; + struct file *file; - event = perf_event_get(fd); - if (IS_ERR(event)) - return event; + file = perf_event_get(fd); + if (IS_ERR(file)) + return file; + + event = file->private_data; attr = perf_event_attrs(event); if (IS_ERR(attr)) @@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) goto err; if (attr->type == PERF_TYPE_RAW) - return event; + return file; if (attr->type == PERF_TYPE_HARDWARE) - return event; + return file; if (attr->type == PERF_TYPE_SOFTWARE && attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return event; + return file; err: - perf_event_release_kernel(event); + fput(file); return ERR_PTR(-EINVAL); } static void perf_event_fd_array_put_ptr(void *ptr) { - struct perf_event *event = ptr; - - perf_event_release_kernel(event); + fput((struct file *)ptr); } static const struct bpf_map_ops perf_event_array_ops = { diff --git a/kernel/events/core.c b/kernel/events/core.c index fe97f95f204e..eb44730afea5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8916,21 +8916,20 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } -struct perf_event *perf_event_get(unsigned int fd) +struct file *perf_event_get(unsigned int fd) { - int err; - struct fd f; - struct perf_event *event; + struct file *file; - err = perf_fget_light(fd, &f); - if (err) - return ERR_PTR(err); + file = fget_raw(fd); + if (!file) + return ERR_PTR(-EBADF); - event = f.file->private_data; - atomic_long_inc(&event->refcount); - fdput(f); + if (file->f_op != &perf_fops) { + fput(file); + return ERR_PTR(-EBADF); + } - return event; + return file; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 45dd798bcd37..326a75e884db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct perf_event *event; + struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (!event) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct perf_event *event; + struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (unlikely(!event)) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; -- cgit v1.3-14-g43fede From a0733e695b83a9c31f779e41dcaec8ef924716b5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 12:14:40 +0100 Subject: perf: Remove __free_event() There is but a single caller, remove the function - we already have _free_event(), the extra indirection is nonsensical.. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index eb44730afea5..024adf0e34eb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3590,7 +3590,7 @@ static void unaccount_event(struct perf_event *event) * 3) two matching events on the same context. * * The former two cases are handled in the allocation path (perf_event_alloc(), - * __free_event()), the latter -- before the first perf_install_in_context(). + * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { @@ -3665,29 +3665,6 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void __free_event(struct perf_event *event) -{ - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } - - call_rcu(&event->rcu_head, free_event_rcu); -} - static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3709,7 +3686,25 @@ static void _free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } + + perf_event_free_bpf_prog(event); + + if (event->destroy) + event->destroy(event); + + if (event->ctx) + put_ctx(event->ctx); + + if (event->pmu) { + exclusive_event_destroy(event); + module_put(event->pmu->module); + } + + call_rcu(&event->rcu_head, free_event_rcu); } /* -- cgit v1.3-14-g43fede From 07c4a776135ea6d808ea19aabeb51de6b8648402 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 12:15:37 +0100 Subject: perf: Update locking order Update the locking order to note that ctx::lock nests inside of child_mutex, as per: perf_ioctl(): ctx::mutex -> perf_event_for_each(): event::child_mutex -> _perf_event_enable(): ctx::lock Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 024adf0e34eb..d345964c2bd6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1086,8 +1086,8 @@ static void put_ctx(struct perf_event_context *ctx) * Lock order: * task_struct::perf_event_mutex * perf_event_context::mutex - * perf_event_context::lock * perf_event::child_mutex; + * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem */ -- cgit v1.3-14-g43fede From 6e801e016917989ab8a7ddfc4229a15a5621622a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 12:17:08 +0100 Subject: perf: Fix STATE_EXIT usage We should never attempt to enable a STATE_EXIT event. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d345964c2bd6..d84374fa44e5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2253,7 +2253,8 @@ static void __perf_event_enable(struct perf_event *event, struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; - if (event->state >= PERF_EVENT_STATE_INACTIVE) + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state <= PERF_EVENT_STATE_ERROR) return; update_context_time(ctx); @@ -2298,7 +2299,8 @@ static void _perf_event_enable(struct perf_event *event) struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) { + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state < PERF_EVENT_STATE_ERROR) { raw_spin_unlock_irq(&ctx->lock); return; } -- cgit v1.3-14-g43fede From f47c02c0c8403963fbb8c3484e285727305d0f73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 12:30:14 +0100 Subject: perf: Robustify event->owner usage and SMP ordering Use smp_store_release() to clear event->owner and lockless_dereference() to observe it. Further use READ_ONCE() for all lockless reads. This changes perf_remove_from_owner() to leave event->owner cleared. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d84374fa44e5..5f055de90c6d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -152,7 +152,7 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, static bool is_kernel_event(struct perf_event *event) { - return event->owner == TASK_TOMBSTONE; + return READ_ONCE(event->owner) == TASK_TOMBSTONE; } /* @@ -1651,7 +1651,7 @@ out: */ static bool is_orphaned_event(struct perf_event *event) { - return event && !is_kernel_event(event) && !event->owner; + return event && !is_kernel_event(event) && !READ_ONCE(event->owner); } /* @@ -3733,14 +3733,13 @@ static void perf_remove_from_owner(struct perf_event *event) struct task_struct *owner; rcu_read_lock(); - owner = ACCESS_ONCE(event->owner); /* - * Matches the smp_wmb() in perf_event_exit_task(). If we observe - * !owner it means the list deletion is complete and we can indeed - * free this event, otherwise we need to serialize on + * Matches the smp_store_release() in perf_event_exit_task(). If we + * observe !owner it means the list deletion is complete and we can + * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - smp_read_barrier_depends(); + owner = lockless_dereference(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -3768,8 +3767,10 @@ static void perf_remove_from_owner(struct perf_event *event) * ensured they're done, and we can proceed with freeing the * event. */ - if (event->owner) + if (event->owner) { list_del_init(&event->owner_entry); + smp_store_release(&event->owner, NULL); + } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } @@ -8829,8 +8830,7 @@ void perf_event_exit_task(struct task_struct *child) * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ - smp_wmb(); - event->owner = NULL; + smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); -- cgit v1.3-14-g43fede From 8ba289b8d4e4dbd1f971fbf0d2085e4776a4ba25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 13:06:56 +0100 Subject: perf: Clean up sync_child_event() sync_child_event() has outgrown its purpose, it does far too much. Bring it back to its named purpose. Rename __perf_event_exit_task() to perf_event_exit_event() to better reflect what it does and move the event->state assignment under the ctx->lock, like state changes ought to be. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 81 +++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5f055de90c6d..8c3d95195f05 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1041,9 +1041,8 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] - * __perf_event_exit_task() - * sync_child_event() - * put_event() [ parent, 1 ] + * perf_event_exit_event() + * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() @@ -1846,7 +1845,8 @@ static void __perf_event_disable(struct perf_event *event, * remains valid. This condition is satisifed when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that - * goes to exit will block in sync_child_event. + * goes to exit will block in perf_event_exit_event(). + * * When called from perf_pending_event it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. @@ -4086,7 +4086,7 @@ static void _perf_event_reset(struct perf_event *event) /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block - * in sync_child_event if it goes to exit, thus satisfying the + * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, @@ -8681,33 +8681,15 @@ static void sync_child_event(struct perf_event *child_event, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); - - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Make sure user/parent get notified, that we just - * lost one event. - */ - perf_event_wakeup(parent_event); - - /* - * Release the parent event, if this was the last - * reference to it. - */ - put_event(parent_event); } static void -__perf_event_exit_task(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) { + struct perf_event *parent_event = child_event->parent; + /* * Do not destroy the 'original' grouping; because of the context * switch optimization the original events could've ended up in a @@ -8723,23 +8705,39 @@ __perf_event_exit_task(struct perf_event *child_event, raw_spin_lock_irq(&child_ctx->lock); WARN_ON_ONCE(child_ctx->is_active); - if (!!child_event->parent) + if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); + child_event->state = PERF_EVENT_STATE_EXIT; raw_spin_unlock_irq(&child_ctx->lock); /* - * It can happen that the parent exits first, and has events - * that are still around due to the child reference. These - * events need to be zapped. + * Parent events are governed by their filedesc, retain them. */ - if (child_event->parent) { - sync_child_event(child_event, child); - free_event(child_event); - } else { - child_event->state = PERF_EVENT_STATE_EXIT; + if (!parent_event) { perf_event_wakeup(child_event); + return; } + /* + * Child events can be cleaned up. + */ + + sync_child_event(child_event, child); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Kick perf_poll() for is_event_hup(). + */ + perf_event_wakeup(parent_event); + free_event(child_event); + put_event(parent_event); } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) @@ -8765,10 +8763,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * * We can recurse on the same lock type through: * - * __perf_event_exit_task() - * sync_child_event() - * put_event() - * mutex_lock(&ctx->mutex) + * perf_event_exit_event() + * put_event() + * mutex_lock(&ctx->mutex) * * But since its the parent context it won't be the same instance. */ @@ -8805,7 +8802,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) perf_event_task(child, child_ctx, 0); list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - __perf_event_exit_task(child_event, child_ctx, child); + perf_event_exit_event(child_event, child_ctx, child); mutex_unlock(&child_ctx->mutex); -- cgit v1.3-14-g43fede From 45a0e07abf4933490a2d2f81b1a31fe267bd3561 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 13:09:48 +0100 Subject: perf: Add flags argument to perf_remove_from_context() In preparation to adding more options, convert the boolean argument into a flags word. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8c3d95195f05..4291a4d27664 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1770,6 +1770,8 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +#define DETACH_GROUP 0x01UL + /* * Cross CPU call to remove a performance event * @@ -1782,10 +1784,10 @@ __perf_remove_from_context(struct perf_event *event, struct perf_event_context *ctx, void *info) { - bool detach_group = (unsigned long)info; + unsigned long flags = (unsigned long)info; event_sched_out(event, cpuctx, ctx); - if (detach_group) + if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); @@ -1808,12 +1810,11 @@ __perf_remove_from_context(struct perf_event *event, * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event, bool detach_group) +static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { lockdep_assert_held(&event->ctx->mutex); - event_function_call(event, __perf_remove_from_context, - (void *)(unsigned long)detach_group); + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* @@ -3800,7 +3801,7 @@ static void put_event(struct perf_event *event) */ ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, true); + perf_remove_from_context(event, DETACH_GROUP); perf_event_ctx_unlock(event, ctx); _free_event(event); @@ -3840,7 +3841,7 @@ static void orphans_remove_work(struct work_struct *work) if (!is_orphaned_child(event)) continue; - perf_remove_from_context(event, true); + perf_remove_from_context(event, DETACH_GROUP); mutex_lock(&parent_event->child_mutex); list_del_init(&event->child_list); @@ -8430,11 +8431,11 @@ SYSCALL_DEFINE5(perf_event_open, * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - perf_remove_from_context(group_leader, false); + perf_remove_from_context(group_leader, 0); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling, false); + perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -8614,7 +8615,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event, false); + perf_remove_from_context(event, 0); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -9240,7 +9241,7 @@ static void __perf_event_exit_context(void *__info) raw_spin_lock(&ctx->lock); list_for_each_entry(event, &ctx->event_list, event_entry) - __perf_remove_from_context(event, cpuctx, ctx, (void *)(unsigned long)true); + __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } -- cgit v1.3-14-g43fede From 60beda849343494b2a598b927630bbe293c1cc6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 14:55:02 +0100 Subject: perf: Untangle 'owner' confusion There are two concepts of owner wrt an event and they are conflated: - event::owner / event::owner_list, used by prctl(.option = PR_TASK_PERF_EVENTS_{EN,DIS}ABLE). - the 'owner' of the event object, typically the file descriptor. Currently these two concepts are conflated, which gives trouble with scm_rights passing of file descriptors. Passing the event and then closing the creating task would render the event 'orphan' and would have it cleared out. Unlikely what is expectd. This patch untangles these two concepts by using PERF_EVENT_STATE_EXIT to denote the second type. Reported-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4291a4d27664..e549cf2accdd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1650,7 +1650,7 @@ out: */ static bool is_orphaned_event(struct perf_event *event) { - return event && !is_kernel_event(event) && !READ_ONCE(event->owner); + return event && event->state == PERF_EVENT_STATE_EXIT; } /* @@ -1771,6 +1771,7 @@ group_sched_out(struct perf_event *group_event, } #define DETACH_GROUP 0x01UL +#define DETACH_STATE 0x02UL /* * Cross CPU call to remove a performance event @@ -1790,6 +1791,8 @@ __perf_remove_from_context(struct perf_event *event, if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); + if (flags & DETACH_STATE) + event->state = PERF_EVENT_STATE_EXIT; if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; @@ -3801,9 +3804,16 @@ static void put_event(struct perf_event *event) */ ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP); + perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); perf_event_ctx_unlock(event, ctx); + /* + * At this point we must have event->state == PERF_EVENT_STATE_EXIT, + * either from the above perf_remove_from_context() or through + * perf_event_exit_event(). + */ + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); + _free_event(event); } -- cgit v1.3-14-g43fede From c6e5b73242d2d9172ea880483bc4ba7ffca0cfb2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jan 2016 16:07:41 +0200 Subject: perf: Synchronously clean up child events The orphan cleanup workqueue doesn't always catch orphans, for example, if they never schedule after they are orphaned. IOW, the event leak is still very real. It also wouldn't work for kernel counters. Doing it synchonously is a little hairy due to lock inversion issues, but is made to work. Patch based on work by Alexander Shishkin. Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 - kernel/events/core.c | 174 ++++++++++++++++++++++----------------------- 2 files changed, 84 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f90434b8d64..b35a61a481fa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -634,9 +634,6 @@ struct perf_event_context { int nr_cgroups; /* cgroup evts */ void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; - - struct delayed_work orphans_remove; - bool orphans_remove_sched; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index e549cf2accdd..98c862aff8fa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -49,8 +49,6 @@ #include -static struct workqueue_struct *perf_wq; - typedef int (*remote_function_f)(void *); struct remote_function_call { @@ -1645,45 +1643,11 @@ out: perf_event__header_size(tmp); } -/* - * User event without the task. - */ static bool is_orphaned_event(struct perf_event *event) { - return event && event->state == PERF_EVENT_STATE_EXIT; -} - -/* - * Event has a parent but parent's task finished and it's - * alive only because of children holding refference. - */ -static bool is_orphaned_child(struct perf_event *event) -{ - return is_orphaned_event(event->parent); -} - -static void orphans_remove_work(struct work_struct *work); - -static void schedule_orphans_remove(struct perf_event_context *ctx) -{ - if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) - return; - - if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { - get_ctx(ctx); - ctx->orphans_remove_sched = true; - } -} - -static int __init perf_workqueue_init(void) -{ - perf_wq = create_singlethread_workqueue("perf"); - WARN(!perf_wq, "failed to create perf workqueue\n"); - return perf_wq ? 0 : -1; + return event->state == PERF_EVENT_STATE_EXIT; } -core_initcall(perf_workqueue_init); - static inline int pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; @@ -1744,9 +1708,6 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - perf_pmu_enable(event->pmu); } @@ -1984,9 +1945,6 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - out: perf_pmu_enable(event->pmu); @@ -3369,7 +3327,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3782,11 +3739,22 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { - struct perf_event_context *ctx; - if (!atomic_long_dec_and_test(&event->refcount)) return; + _free_event(event); +} + +/* + * Kill an event dead; while event:refcount will preserve the event + * object, it will not preserve its functionality. Once the last 'user' + * gives up the object, we'll destroy the thing. + */ +int perf_event_release_kernel(struct perf_event *event) +{ + struct perf_event_context *ctx; + struct perf_event *child, *tmp; + if (!is_kernel_event(event)) perf_remove_from_owner(event); @@ -3811,14 +3779,70 @@ static void put_event(struct perf_event *event) * At this point we must have event->state == PERF_EVENT_STATE_EXIT, * either from the above perf_remove_from_context() or through * perf_event_exit_event(). + * + * Therefore, anybody acquiring event->child_mutex after the below + * loop _must_ also see this, most importantly inherit_event() which + * will avoid placing more children on the list. + * + * Thus this guarantees that we will in fact observe and kill _ALL_ + * child events. */ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); - _free_event(event); -} +again: + mutex_lock(&event->child_mutex); + list_for_each_entry(child, &event->child_list, child_list) { -int perf_event_release_kernel(struct perf_event *event) -{ + /* + * Cannot change, child events are not migrated, see the + * comment with perf_event_ctx_lock_nested(). + */ + ctx = lockless_dereference(child->ctx); + /* + * Since child_mutex nests inside ctx::mutex, we must jump + * through hoops. We start by grabbing a reference on the ctx. + * + * Since the event cannot get freed while we hold the + * child_mutex, the context must also exist and have a !0 + * reference count. + */ + get_ctx(ctx); + + /* + * Now that we have a ctx ref, we can drop child_mutex, and + * acquire ctx::mutex without fear of it going away. Then we + * can re-acquire child_mutex. + */ + mutex_unlock(&event->child_mutex); + mutex_lock(&ctx->mutex); + mutex_lock(&event->child_mutex); + + /* + * Now that we hold ctx::mutex and child_mutex, revalidate our + * state, if child is still the first entry, it didn't get freed + * and we can continue doing so. + */ + tmp = list_first_entry_or_null(&event->child_list, + struct perf_event, child_list); + if (tmp == child) { + perf_remove_from_context(child, DETACH_GROUP); + list_del(&child->child_list); + free_event(child); + /* + * This matches the refcount bump in inherit_event(); + * this can't be the last reference. + */ + put_event(event); + } + + mutex_unlock(&event->child_mutex); + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + mutex_unlock(&event->child_mutex); + + /* Must be the last reference */ put_event(event); return 0; } @@ -3829,46 +3853,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); */ static int perf_release(struct inode *inode, struct file *file) { - put_event(file->private_data); + perf_event_release_kernel(file->private_data); return 0; } -/* - * Remove all orphanes events from the context. - */ -static void orphans_remove_work(struct work_struct *work) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = container_of(work, struct perf_event_context, - orphans_remove.work); - - mutex_lock(&ctx->mutex); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { - struct perf_event *parent_event = event->parent; - - if (!is_orphaned_child(event)) - continue; - - perf_remove_from_context(event, DETACH_GROUP); - - mutex_lock(&parent_event->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent_event->child_mutex); - - free_event(event); - put_event(parent_event); - } - - raw_spin_lock_irq(&ctx->lock); - ctx->orphans_remove_sched = false; - raw_spin_unlock_irq(&ctx->lock); - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -8719,7 +8707,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; + child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ raw_spin_unlock_irq(&child_ctx->lock); /* @@ -8977,8 +8965,16 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + /* + * is_orphaned_event() and list_add_tail(&parent_event->child_list) + * must be under the same lock in order to serialize against + * perf_event_release_kernel(), such that either we must observe + * is_orphaned_event() or they will observe us on the child_list. + */ + mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { + mutex_unlock(&parent_event->child_mutex); free_event(child_event); return NULL; } @@ -9026,8 +9022,6 @@ inherit_event(struct perf_event *parent_event, /* * Link this into the parent event's child list */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); -- cgit v1.3-14-g43fede From 5fa7c8ec57f70a7b5c6fe269fa9c51b9e465989c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 15:25:15 +0100 Subject: perf: Remove/simplify lockdep annotation Now that the perf_event_ctx_lock_nested() call has moved from put_event() into perf_event_release_kernel() the first reason is no longer valid as that can no longer happen. The second reason seems to have been invalidated when Al Viro made fput() unconditionally async in the following commit: 4a9d4b024a31 ("switch fput to task_work_add") such that munmap()->fput()->release()->perf_release() would no longer happen. Therefore, remove the annotation. This should increase the efficiency of lockdep coverage of perf locking. Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 98c862aff8fa..f1e53e8d4ae2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3758,19 +3758,7 @@ int perf_event_release_kernel(struct perf_event *event) if (!is_kernel_event(event)) perf_remove_from_owner(event); - /* - * There are two ways this annotation is useful: - * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. - * - * 2) there is a lock-inversion with mmap_sem through - * perf_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. - */ - ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); + ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); perf_event_ctx_unlock(event, ctx); @@ -8759,14 +8747,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * perf_event_create_kernel_count() which does find_get_context() * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). - * - * We can recurse on the same lock type through: - * - * perf_event_exit_event() - * put_event() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. */ mutex_lock(&child_ctx->mutex); -- cgit v1.3-14-g43fede From eb7d78c9e7f6418932bd5fbee45eb46d5ab05002 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 29 Jan 2016 21:48:34 -0800 Subject: devm_memremap_pages: fix vmem_altmap lifetime + alignment handling to_vmem_altmap() needs to return valid results until arch_remove_memory() completes. It also needs to be valid for any pfn in a section regardless of whether that pfn maps to data. This escape was a result of a bug in the unit test. The signature of this bug is that free_pagetable() fails to retrieve a vmem_altmap and goes off into the weeds: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] get_pfnblock_flags_mask+0x49/0x60 [..] Call Trace: [] free_hot_cold_page+0x97/0x1d0 [] __free_pages+0x2a/0x40 [] free_pagetable+0x8c/0xd4 [] remove_pagetable+0x37a/0x808 [] vmemmap_free+0x10/0x20 Fixes: 4b94ffdc4163 ("x86, mm: introduce vmem_altmap to augment vmemmap_populate()") Cc: Andrew Morton Reported-by: Jeff Moyer Signed-off-by: Dan Williams --- kernel/memremap.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index e517a16cb426..cbc3e97e2bb4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -183,7 +183,11 @@ EXPORT_SYMBOL(put_zone_device_page); static void pgmap_radix_release(struct resource *res) { - resource_size_t key; + resource_size_t key, align_start, align_size, align_end; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; mutex_lock(&pgmap_lock); for (key = res->start; key <= res->end; key += SECTION_SIZE) @@ -226,12 +230,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data) percpu_ref_put(pgmap->ref); } - pgmap_radix_release(res); - /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -267,7 +270,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); - resource_size_t key, align_start, align_size; + resource_size_t key, align_start, align_size, align_end; struct dev_pagemap *pgmap; struct page_map *page_map; unsigned long pfn; @@ -309,7 +312,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; - for (key = res->start; key <= res->end; key += SECTION_SIZE) { + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += SECTION_SIZE) { struct dev_pagemap *dup; rcu_read_lock(); @@ -336,8 +342,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; -- cgit v1.3-14-g43fede From 840d6fe7425ffb6a62d53b2759e01ae6daf90e4e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 30 Jan 2016 10:04:17 +0800 Subject: pid: Fix spelling in comments Accidentally discovered this typo when I studied this module. Signed-off-by: Zhen Lei Cc: Hanjun Guo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tianhong Ding Cc: Xinwei Hu Cc: Zefan Li Link: http://lkml.kernel.org/r/1454119457-11272-1-git-send-email-thunder.leizhen@huawei.com Signed-off-by: Ingo Molnar --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80d44..e793d091f680 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -588,7 +588,7 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - /* Veryify no one has done anything silly */ + /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); /* bump default and minimum pid_max based on number of cpus */ -- cgit v1.3-14-g43fede From 76e9f0ee52b0be5761e29847e0ef01f23f24f1df Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Jan 2016 09:43:28 -0800 Subject: phys_to_pfn_t: use phys_addr_t A dma_addr_t is potentially smaller than a phys_addr_t on some archs. Don't truncate the address when doing the pfn conversion. Cc: Ross Zwisler Reported-by: Matthew Wilcox [willy: fix pfn_t_to_phys as well] Signed-off-by: Dan Williams --- include/linux/pfn_t.h | 4 ++-- kernel/memremap.c | 2 +- tools/testing/nvdimm/test/iomap.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 0703b5360d31..37448ab5fb5c 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -29,7 +29,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn) return __pfn_to_pfn_t(pfn, 0); } -extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags); +extern pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags); static inline bool pfn_t_has_page(pfn_t pfn) { @@ -48,7 +48,7 @@ static inline struct page *pfn_t_to_page(pfn_t pfn) return NULL; } -static inline dma_addr_t pfn_t_to_phys(pfn_t pfn) +static inline phys_addr_t pfn_t_to_phys(pfn_t pfn) { return PFN_PHYS(pfn_t_to_pfn(pfn)); } diff --git a/kernel/memremap.c b/kernel/memremap.c index cbc3e97e2bb4..70ee3775de24 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -150,7 +150,7 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags) { return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); } diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 7ec7df9e7fc7..0c1a7e65bb81 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -113,7 +113,7 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res, } EXPORT_SYMBOL(__wrap_devm_memremap_pages); -pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags) { struct nfit_test_resource *nfit_res = get_nfit_res(addr); -- cgit v1.3-14-g43fede