From b2be84df99ebc93599c69e931a3c4a5105abfabc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:34:15 -0500 Subject: kprobes: Jump optimization sysctl interface Add /proc/sys/debug/kprobes-optimization sysctl which enables and disables kprobes jump optimization on the fly for debugging. Changes in v7: - Remove ctl_name = CTL_UNNUMBERED for upstream compatibility. Changes in v6: - Update comments and coding style. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133415.6725.8274.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b2448468..40d791d616b5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -1449,6 +1450,17 @@ static struct ctl_table debug_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#endif +#if defined(CONFIG_OPTPROBES) + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit v1.3-8-gc7d7 From 4b17764737bb4ee3364b8bfa2059f51ebc19ccd6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 1 Mar 2010 00:02:23 -0800 Subject: sparc: Support show_unhandled_signals. Just faults right now, will add other traps later. Signed-off-by: David S. Miller --- arch/sparc/mm/fault_32.c | 106 +++++++++++++++++++++++++++-------------------- arch/sparc/mm/fault_64.c | 34 +++++++++++++-- kernel/sysctl.c | 2 +- 3 files changed, 92 insertions(+), 50 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index a3413acb8f12..ec8f22c681ef 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -35,6 +35,8 @@ extern int prom_node_root; +int show_unhandled_signals = 1; + /* At boot time we determine these two values necessary for setting * up the segment maps and page table entries (pte's). */ @@ -149,6 +151,45 @@ asmlinkage int lookup_fault(unsigned long pc, unsigned long ret_pc, return 0; } +static inline void +show_signal_msg(struct pt_regs *regs, int sig, int code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, sig)) + return; + + if (!printk_ratelimit()) + return; + + printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->pc, (void *)regs->u_regs[UREG_I7], + (void *)regs->u_regs[UREG_FP], code); + + print_vma_addr(KERN_CONT " in ", regs->pc); + + printk(KERN_CONT "\n"); +} + +static void __do_fault_siginfo(int code, int sig, struct pt_regs *regs, + unsigned long addr) +{ + siginfo_t info; + + info.si_signo = sig; + info.si_code = code; + info.si_errno = 0; + info.si_addr = (void __user *) addr; + info.si_trapno = 0; + + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, sig, info.si_code, + addr, current); + + force_sig_info (sig, &info, current); +} + extern unsigned long safe_compute_effective_address(struct pt_regs *, unsigned int); @@ -168,6 +209,14 @@ static unsigned long compute_si_addr(struct pt_regs *regs, int text_fault) return safe_compute_effective_address(regs, insn); } +static noinline void do_fault_siginfo(int code, int sig, struct pt_regs *regs, + int text_fault) +{ + unsigned long addr = compute_si_addr(regs, text_fault); + + __do_fault_siginfo(code, sig, regs, addr); +} + asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, unsigned long address) { @@ -176,9 +225,8 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, struct mm_struct *mm = tsk->mm; unsigned int fixup; unsigned long g2; - siginfo_t info; int from_user = !(regs->psr & PSR_PS); - int fault; + int fault, code; if(text_fault) address = regs->pc; @@ -195,7 +243,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, if (!ARCH_SUN4C && address >= TASK_SIZE) goto vmalloc_fault; - info.si_code = SEGV_MAPERR; + code = SEGV_MAPERR; /* * If we're in an interrupt or have no user @@ -229,7 +277,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * we can handle it.. */ good_area: - info.si_code = SEGV_ACCERR; + code = SEGV_ACCERR; if(write) { if(!(vma->vm_flags & VM_WRITE)) goto bad_area; @@ -273,18 +321,8 @@ bad_area: bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ - if(from_user) { -#if 0 - printk("Fault whee %s [%d]: segfaults at %08lx pc=%08lx\n", - tsk->comm, tsk->pid, address, regs->pc); -#endif - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code set above to make clear whether - this was a SEGV_MAPERR or SEGV_ACCERR fault. */ - info.si_addr = (void __user *)compute_si_addr(regs, text_fault); - info.si_trapno = 0; - force_sig_info (SIGSEGV, &info, tsk); + if (from_user) { + do_fault_siginfo(code, SIGSEGV, regs, text_fault); return; } @@ -335,12 +373,7 @@ out_of_memory: do_sigbus: up_read(&mm->mmap_sem); - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *) compute_si_addr(regs, text_fault); - info.si_trapno = 0; - force_sig_info (SIGBUS, &info, tsk); + do_fault_siginfo(BUS_ADRERR, SIGBUS, regs, text_fault); if (!from_user) goto no_context; @@ -466,14 +499,10 @@ static void force_user_fault(unsigned long address, int write) struct vm_area_struct *vma; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - siginfo_t info; + int code; - info.si_code = SEGV_MAPERR; + code = SEGV_MAPERR; -#if 0 - printk("wf\n", - tsk->pid, write, address); -#endif down_read(&mm->mmap_sem); vma = find_vma(mm, address); if(!vma) @@ -485,7 +514,7 @@ static void force_user_fault(unsigned long address, int write) if(expand_stack(vma, address)) goto bad_area; good_area: - info.si_code = SEGV_ACCERR; + code = SEGV_ACCERR; if(write) { if(!(vma->vm_flags & VM_WRITE)) goto bad_area; @@ -502,27 +531,12 @@ good_area: return; bad_area: up_read(&mm->mmap_sem); -#if 0 - printk("Window whee %s [%d]: segfaults at %08lx\n", - tsk->comm, tsk->pid, address); -#endif - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code set above to make clear whether - this was a SEGV_MAPERR or SEGV_ACCERR fault. */ - info.si_addr = (void __user *) address; - info.si_trapno = 0; - force_sig_info (SIGSEGV, &info, tsk); + __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address); return; do_sigbus: up_read(&mm->mmap_sem); - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *) address; - info.si_trapno = 0; - force_sig_info (SIGBUS, &info, tsk); + __do_fault_siginfo(BUS_ADRERR, SIGBUS, tsk->thread.kregs, address); } void window_overflow_fault(void) diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index b9d4ff02b8fc..f92ce56a8b22 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -32,6 +32,8 @@ #include #include +int show_unhandled_signals = 1; + static inline __kprobes int notify_page_fault(struct pt_regs *regs) { int ret = 0; @@ -128,22 +130,48 @@ outret: return insn; } +static inline void +show_signal_msg(struct pt_regs *regs, int sig, int code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, sig)) + return; + + if (!printk_ratelimit()) + return; + + printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->tpc, (void *)regs->u_regs[UREG_I7], + (void *)regs->u_regs[UREG_FP], code); + + print_vma_addr(KERN_CONT " in ", regs->tpc); + + printk(KERN_CONT "\n"); +} + extern unsigned long compute_effective_address(struct pt_regs *, unsigned int, unsigned int); static void do_fault_siginfo(int code, int sig, struct pt_regs *regs, unsigned int insn, int fault_code) { + unsigned long addr; siginfo_t info; info.si_code = code; info.si_signo = sig; info.si_errno = 0; if (fault_code & FAULT_CODE_ITLB) - info.si_addr = (void __user *) regs->tpc; + addr = regs->tpc; else - info.si_addr = (void __user *) - compute_effective_address(regs, insn, 0); + addr = compute_effective_address(regs, insn, 0); + info.si_addr = (void __user *) addr; info.si_trapno = 0; + + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, sig, code, addr, current); + force_sig_info(sig, &info, current); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b2448468..33e7a38b6eb9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1441,7 +1441,7 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) +#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) { .procname = "exception-trace", .data = &show_unhandled_signals, -- cgit v1.3-8-gc7d7 From eb5572fed55f4c2b7dbc42582bc82dcb47632380 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:23:59 -0800 Subject: sysctl extern cleanup: C_A_D Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move C_A_D extern variable declaration to linux/reboot.h Signed-off-by: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 1 + kernel/sysctl.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 988e55fe649b..3005d5a7fce5 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -64,6 +64,7 @@ extern void kernel_restart(char *cmd); extern void kernel_halt(void); extern void kernel_power_off(void); +extern int C_A_D; /* for sysctl */ void ctrl_alt_del(void); #define POWEROFF_CMD_PATH_LEN 256 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0ef19c614f6d..72c3b1e80d7b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -65,7 +65,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int C_A_D; extern int print_fatal_signals; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; -- cgit v1.3-8-gc7d7 From d33ed52d57e794eba55cea3f5eab3c8f80b6cb5a Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:23:59 -0800 Subject: sysctl extern cleanup: signal Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move print_fatal_signals extern declaration to linux/signal.h Signed-off-by: Dave Young Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 2 ++ kernel/sysctl.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/signal.h b/include/linux/signal.h index ab9272cc270c..fcd2b14b1932 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -7,6 +7,8 @@ #ifdef __KERNEL__ #include +/* for sysctl */ +extern int print_fatal_signals; /* * Real Time signals may be queued. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 72c3b1e80d7b..a8fd10a9a501 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -65,7 +66,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int print_fatal_signals; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; -- cgit v1.3-8-gc7d7 From e5ab67726f33b50f40db0ccf271ceb3c658554d5 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:05 -0800 Subject: sysctl extern cleanup: rcu Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move rcutorture_runnable extern declaration to linux/rcupdate.h Signed-off-by: Dave Young Acked-by: Josh Triplett Reviewed-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 4 ++++ kernel/sysctl.c | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c84373626336..a005cac5e302 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -41,6 +41,10 @@ #include #include +#ifdef CONFIG_RCU_TORTURE_TEST +extern int rcutorture_runnable; /* for sysctl */ +#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ + /** * struct rcu_head - callback structure for use with RCU * @next: next update requests in a list diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a8fd10a9a501..f18aaa7b0d65 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -87,9 +87,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif -#ifdef CONFIG_RCU_TORTURE_TEST -extern int rcutorture_runnable; -#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ #ifdef CONFIG_BLOCK extern int blk_iopoll_enabled; #endif -- cgit v1.3-8-gc7d7 From 5ed109103d73b0bafc92e860cead56725231384d Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:06 -0800 Subject: sysctl extern cleanup: module Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move modprobe_path extern declaration to linux/kmod.h Move modules_disabled extern declaration to linux/module.h Signed-off-by: Dave Young Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 1 + include/linux/module.h | 1 + kernel/sysctl.c | 4 ---- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 384ca8bbf1ac..facb27fe7de0 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -27,6 +27,7 @@ #define KMOD_PATH_LEN 256 #ifdef CONFIG_MODULES +extern char modprobe_path[]; /* for sysctl */ /* modprobe exit status on success, -ve on error. Return value * usually useless though. */ extern int __request_module(bool wait, const char *name, ...) \ diff --git a/include/linux/module.h b/include/linux/module.h index dd618eb026aa..5e869ffd34aa 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -175,6 +175,7 @@ struct notifier_block; #ifdef CONFIG_MODULES +extern int modules_disabled; /* for sysctl */ /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f18aaa7b0d65..44e9492368fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -116,10 +116,6 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_MODULES -extern char modprobe_path[]; -extern int modules_disabled; -#endif #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; #endif -- cgit v1.3-8-gc7d7 From 15485a4682d1d3bfee2aa78b4b1a5d36f5746b64 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:07 -0800 Subject: sysctl extern cleanup: sg Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move sg_big_buff extern declaration to scsi/sg.h Signed-off-by: Dave Young Acked-by: Doug Gilbert Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/scsi/sg.h | 3 +++ kernel/sysctl.c | 7 +++---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/scsi/sg.h b/include/scsi/sg.h index 934ae389671d..a9f3c6fc3f57 100644 --- a/include/scsi/sg.h +++ b/include/scsi/sg.h @@ -70,6 +70,9 @@ Major new features in SG 3.x driver (cf SG 2.x drivers) (for the lk 2.2 series). */ +#ifdef __KERNEL__ +extern int sg_big_buff; /* for sysctl */ +#endif /* New interface introduced in the 3.x SG drivers follows */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 44e9492368fd..5290c437f151 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,9 @@ #include #include #endif +#ifdef CONFIG_CHR_DEV_SG +#include +#endif #if defined(CONFIG_SYSCTL) @@ -116,10 +119,6 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_CHR_DEV_SG -extern int sg_big_buff; -#endif - #ifdef CONFIG_SPARC #include #endif -- cgit v1.3-8-gc7d7 From c55b7c3e82d0ad58f35a0785faaaf2f70b9b6cd3 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:08 -0800 Subject: sysctl extern cleanup: acct Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move acct_parm extern declaration to linux/acct.h Signed-off-by: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/acct.h | 1 + kernel/sysctl.c | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/acct.h b/include/linux/acct.h index 93f46096ad4c..3e4737fa6cce 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -121,6 +121,7 @@ struct vfsmount; struct super_block; struct pacct_struct; struct pid_namespace; +extern int acct_parm[]; /* for sysctl */ extern void acct_auto_close_mnt(struct vfsmount *m); extern void acct_auto_close(struct super_block *sb); extern void acct_collect(long exitcode, int group_dead); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5290c437f151..7635bb15f5af 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,9 @@ #include #include #endif +#ifdef CONFIG_BSD_PROCESS_ACCT +#include +#endif #ifdef CONFIG_CHR_DEV_SG #include #endif @@ -140,10 +143,6 @@ extern int sysctl_userprocess_debug; extern int spin_retry; #endif -#ifdef CONFIG_BSD_PROCESS_ACCT -extern int acct_parm[]; -#endif - #ifdef CONFIG_IA64 extern int no_unaligned_warning; extern int unaligned_dump_stack; -- cgit v1.3-8-gc7d7 From 4f0e056fdebc15d3f4724ebc7bbf323158add1d7 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:09 -0800 Subject: sysctl extern cleanup: rtmutex Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move max_lock_depth extern declaration to linux/rtmutex.h Signed-off-by: Dave Young Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rtmutex.h | 2 ++ kernel/sysctl.c | 7 +++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 281d8fd775e8..8d522ffeda33 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -16,6 +16,8 @@ #include #include +extern int max_lock_depth; /* for sysctl */ + /** * The rt_mutex structure * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7635bb15f5af..622029ba5103 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -64,6 +64,9 @@ #ifdef CONFIG_BSD_PROCESS_ACCT #include #endif +#ifdef CONFIG_RT_MUTEXES +#include +#endif #ifdef CONFIG_CHR_DEV_SG #include #endif @@ -150,10 +153,6 @@ extern int unaligned_dump_stack; extern struct ratelimit_state printk_ratelimit_state; -#ifdef CONFIG_RT_MUTEXES -extern int max_lock_depth; -#endif - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.3-8-gc7d7 From 2edf5e49800846a2b2b6461d99cdae18067c440f Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:10 -0800 Subject: sysctl extern cleanup: lockdep Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move lockdep extern declarations to linux/lockdep.h Signed-off-by: Dave Young Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 4 ++++ kernel/sysctl.c | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 10206a87da19..a03977a96d7e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -12,6 +12,10 @@ struct task_struct; struct lockdep_map; +/* for sysctl */ +extern int prove_locking; +extern int lock_stat; + #ifdef CONFIG_LOCKDEP #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 622029ba5103..8686b0f5fc12 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,6 +67,9 @@ #ifdef CONFIG_RT_MUTEXES #include #endif +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) +#include +#endif #ifdef CONFIG_CHR_DEV_SG #include #endif @@ -191,9 +194,6 @@ extern struct ctl_table epoll_table[]; int sysctl_legacy_va_layout; #endif -extern int prove_locking; -extern int lock_stat; - /* The default sysctl tables: */ static struct ctl_table root_table[] = { -- cgit v1.3-8-gc7d7 From 97f5f0cd8cd0a05449cbb77d1e6f02e026875802 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 21 Mar 2010 22:31:26 -0700 Subject: Input: implement SysRq as a separate input handler Instead of keeping SysRq support inside of legacy keyboard driver split it out into a separate input handler (filter). This stops most SysRq input events from leaking into evdev clients (some events, such as first SysRq scancode - not keycode - event, are still leaked into both legacy keyboard and evdev). [martinez.javier@gmail.com: fix compile error when CONFIG_MAGIC_SYSRQ is not defined] Signed-off-by: Dmitry Torokhov --- drivers/char/keyboard.c | 40 +------- drivers/char/sysrq.c | 243 ++++++++++++++++++++++++++++++++++++++++++------ include/linux/sysrq.h | 23 ++--- kernel/sysctl.c | 23 ++++- 4 files changed, 246 insertions(+), 83 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index ada25bb8941e..50f6c01f44ec 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -82,8 +81,7 @@ void compute_shiftstate(void); typedef void (k_handler_fn)(struct vc_data *vc, unsigned char value, char up_flag); static k_handler_fn K_HANDLERS; -k_handler_fn *k_handler[16] = { K_HANDLERS }; -EXPORT_SYMBOL_GPL(k_handler); +static k_handler_fn *k_handler[16] = { K_HANDLERS }; #define FN_HANDLERS\ fn_null, fn_enter, fn_show_ptregs, fn_show_mem,\ @@ -147,22 +145,6 @@ static struct ledptr { unsigned char valid:1; } ledptrs[3]; -/* Simple translation table for the SysRq keys */ - -#ifdef CONFIG_MAGIC_SYSRQ -unsigned char kbd_sysrq_xlate[KEY_MAX + 1] = - "\000\0331234567890-=\177\t" /* 0x00 - 0x0f */ - "qwertyuiop[]\r\000as" /* 0x10 - 0x1f */ - "dfghjkl;'`\000\\zxcv" /* 0x20 - 0x2f */ - "bnm,./\000*\000 \000\201\202\203\204\205" /* 0x30 - 0x3f */ - "\206\207\210\211\212\000\000789-456+1" /* 0x40 - 0x4f */ - "230\177\000\000\213\214\000\000\000\000\000\000\000\000\000\000" /* 0x50 - 0x5f */ - "\r\000/"; /* 0x60 - 0x6f */ -static int sysrq_down; -static int sysrq_alt_use; -#endif -static int sysrq_alt; - /* * Notifier list for console keyboard events */ @@ -1108,7 +1090,8 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode, * pressing PrtSc/SysRq alone, but simply 0x54 * when pressing Alt+PrtSc/SysRq. */ - if (sysrq_alt) { + if (test_bit(KEY_LEFTALT, key_down) || + test_bit(KEY_RIGHTALT, key_down)) { put_queue(vc, 0x54 | up_flag); } else { put_queue(vc, 0xe0); @@ -1176,8 +1159,6 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) kbd = kbd_table + vc->vc_num; - if (keycode == KEY_LEFTALT || keycode == KEY_RIGHTALT) - sysrq_alt = down ? keycode : 0; #ifdef CONFIG_SPARC if (keycode == KEY_STOP) sparc_l1_a_state = down; @@ -1190,21 +1171,6 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) if (keycode < BTN_MISC && printk_ratelimit()) printk(KERN_WARNING "keyboard.c: can't emulate rawmode for keycode %d\n", keycode); -#ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */ - if (keycode == KEY_SYSRQ && (sysrq_down || (down == 1 && sysrq_alt))) { - if (!sysrq_down) { - sysrq_down = down; - sysrq_alt_use = sysrq_alt; - } - return; - } - if (sysrq_down && !down && keycode == sysrq_alt_use) - sysrq_down = 0; - if (sysrq_down && down && !rep) { - handle_sysrq(kbd_sysrq_xlate[keycode], tty); - return; - } -#endif #ifdef CONFIG_SPARC if (keycode == KEY_A && sparc_l1_a_state) { sparc_l1_a_state = 0; diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 59de2525d303..193f9c214946 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -1,7 +1,4 @@ -/* -*- linux-c -*- - * - * $Id: sysrq.c,v 1.15 1998/08/23 14:56:41 mj Exp $ - * +/* * Linux Magic System Request Key Hacks * * (c) 1997 Martin Mares @@ -10,8 +7,13 @@ * (c) 2000 Crutcher Dunnavant * overhauled to use key registration * based upon discusions in irc://irc.openprojects.net/#kernelnewbies + * + * Copyright (c) 2010 Dmitry Torokhov + * Input handler conversion */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -39,33 +41,34 @@ #include #include #include +#include #include #include /* Whether we react on sysrq keys or just ignore them */ -int __read_mostly __sysrq_enabled = 1; - -static int __read_mostly sysrq_always_enabled; +static int __read_mostly sysrq_enabled = 1; +static bool __read_mostly sysrq_always_enabled; -int sysrq_on(void) +static bool sysrq_on(void) { - return __sysrq_enabled || sysrq_always_enabled; + return sysrq_enabled || sysrq_always_enabled; } /* * A value of 1 means 'all', other nonzero values are an op mask: */ -static inline int sysrq_on_mask(int mask) +static bool sysrq_on_mask(int mask) { - return sysrq_always_enabled || __sysrq_enabled == 1 || - (__sysrq_enabled & mask); + return sysrq_always_enabled || + sysrq_enabled == 1 || + (sysrq_enabled & mask); } static int __init sysrq_always_enabled_setup(char *str) { - sysrq_always_enabled = 1; - printk(KERN_INFO "debug: sysrq always enabled.\n"); + sysrq_always_enabled = true; + pr_info("sysrq always enabled.\n"); return 1; } @@ -76,6 +79,7 @@ __setup("sysrq_always_enabled", sysrq_always_enabled_setup); static void sysrq_handle_loglevel(int key, struct tty_struct *tty) { int i; + i = key - '0'; console_loglevel = 7; printk("Loglevel set to %d\n", i); @@ -101,7 +105,7 @@ static struct sysrq_key_op sysrq_SAK_op = { .enable_mask = SYSRQ_ENABLE_KEYBOARD, }; #else -#define sysrq_SAK_op (*(struct sysrq_key_op *)0) +#define sysrq_SAK_op (*(struct sysrq_key_op *)NULL) #endif #ifdef CONFIG_VT @@ -119,7 +123,7 @@ static struct sysrq_key_op sysrq_unraw_op = { .enable_mask = SYSRQ_ENABLE_KEYBOARD, }; #else -#define sysrq_unraw_op (*(struct sysrq_key_op *)0) +#define sysrq_unraw_op (*(struct sysrq_key_op *)NULL) #endif /* CONFIG_VT */ static void sysrq_handle_crash(int key, struct tty_struct *tty) @@ -195,7 +199,7 @@ static struct sysrq_key_op sysrq_showlocks_op = { .action_msg = "Show Locks Held", }; #else -#define sysrq_showlocks_op (*(struct sysrq_key_op *)0) +#define sysrq_showlocks_op (*(struct sysrq_key_op *)NULL) #endif #ifdef CONFIG_SMP @@ -298,7 +302,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { .enable_mask = SYSRQ_ENABLE_DUMP, }; #else -#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)0) +#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)NULL) #endif static void sysrq_handle_showmem(int key, struct tty_struct *tty) @@ -477,6 +481,7 @@ struct sysrq_key_op *__sysrq_get_key_op(int key) i = sysrq_key_table_key2index(key); if (i != -1) op_p = sysrq_key_table[i]; + return op_p; } @@ -488,11 +493,7 @@ static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p) sysrq_key_table[i] = op_p; } -/* - * This is the non-locking version of handle_sysrq. It must/can only be called - * by sysrq key handlers, as they are inside of the lock - */ -void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) +static void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) { struct sysrq_key_op *op_p; int orig_log_level; @@ -544,10 +545,6 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) spin_unlock_irqrestore(&sysrq_key_table_lock, flags); } -/* - * This function is called by the keyboard handler when SysRq is pressed - * and any other keycode arrives. - */ void handle_sysrq(int key, struct tty_struct *tty) { if (sysrq_on()) @@ -555,10 +552,177 @@ void handle_sysrq(int key, struct tty_struct *tty) } EXPORT_SYMBOL(handle_sysrq); +#ifdef CONFIG_INPUT + +/* Simple translation table for the SysRq keys */ +static const unsigned char sysrq_xlate[KEY_MAX + 1] = + "\000\0331234567890-=\177\t" /* 0x00 - 0x0f */ + "qwertyuiop[]\r\000as" /* 0x10 - 0x1f */ + "dfghjkl;'`\000\\zxcv" /* 0x20 - 0x2f */ + "bnm,./\000*\000 \000\201\202\203\204\205" /* 0x30 - 0x3f */ + "\206\207\210\211\212\000\000789-456+1" /* 0x40 - 0x4f */ + "230\177\000\000\213\214\000\000\000\000\000\000\000\000\000\000" /* 0x50 - 0x5f */ + "\r\000/"; /* 0x60 - 0x6f */ + +static bool sysrq_down; +static int sysrq_alt_use; +static int sysrq_alt; + +static bool sysrq_filter(struct input_handle *handle, unsigned int type, + unsigned int code, int value) +{ + if (type != EV_KEY) + goto out; + + switch (code) { + + case KEY_LEFTALT: + case KEY_RIGHTALT: + if (value) + sysrq_alt = code; + else if (sysrq_down && code == sysrq_alt_use) + sysrq_down = false; + break; + + case KEY_SYSRQ: + if (value == 1 && sysrq_alt) { + sysrq_down = true; + sysrq_alt_use = sysrq_alt; + } + break; + + default: + if (sysrq_down && value && value != 2) + __handle_sysrq(sysrq_xlate[code], NULL, 1); + break; + } + +out: + return sysrq_down; +} + +static int sysrq_connect(struct input_handler *handler, + struct input_dev *dev, + const struct input_device_id *id) +{ + struct input_handle *handle; + int error; + + sysrq_down = false; + sysrq_alt = 0; + + handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->dev = dev; + handle->handler = handler; + handle->name = "sysrq"; + + error = input_register_handle(handle); + if (error) { + pr_err("Failed to register input sysrq handler, error %d\n", + error); + goto err_free; + } + + error = input_open_device(handle); + if (error) { + pr_err("Failed to open input device, error %d\n", error); + goto err_unregister; + } + + return 0; + + err_unregister: + input_unregister_handle(handle); + err_free: + kfree(handle); + return error; +} + +static void sysrq_disconnect(struct input_handle *handle) +{ + input_close_device(handle); + input_unregister_handle(handle); + kfree(handle); +} + +/* + * We are matching on KEY_LEFTALT insteard of KEY_SYSRQ because not all + * keyboards have SysRq ikey predefined and so user may add it to keymap + * later, but we expect all such keyboards to have left alt. + */ +static const struct input_device_id sysrq_ids[] = { + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | + INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { BIT_MASK(KEY_LEFTALT) }, + }, + { }, +}; + +static struct input_handler sysrq_handler = { + .filter = sysrq_filter, + .connect = sysrq_connect, + .disconnect = sysrq_disconnect, + .name = "sysrq", + .id_table = sysrq_ids, +}; + +static bool sysrq_handler_registered; + +static inline void sysrq_register_handler(void) +{ + int error; + + error = input_register_handler(&sysrq_handler); + if (error) + pr_err("Failed to register input handler, error %d", error); + else + sysrq_handler_registered = true; +} + +static inline void sysrq_unregister_handler(void) +{ + if (sysrq_handler_registered) { + input_unregister_handler(&sysrq_handler); + sysrq_handler_registered = false; + } +} + +#else + +static inline void sysrq_register_handler(void) +{ +} + +static inline void sysrq_unregister_handler(void) +{ +} + +#endif /* CONFIG_INPUT */ + +int sysrq_toggle_support(int enable_mask) +{ + bool was_enabled = sysrq_on(); + + sysrq_enabled = enable_mask; + + if (was_enabled != sysrq_on()) { + if (sysrq_on()) + sysrq_register_handler(); + else + sysrq_unregister_handler(); + } + + return 0; +} + static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p, struct sysrq_key_op *remove_op_p) { - int retval; unsigned long flags; @@ -599,6 +763,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, return -EFAULT; __handle_sysrq(c, NULL, 0); } + return count; } @@ -606,10 +771,28 @@ static const struct file_operations proc_sysrq_trigger_operations = { .write = write_sysrq_trigger, }; +static void sysrq_init_procfs(void) +{ + if (!proc_create("sysrq-trigger", S_IWUSR, NULL, + &proc_sysrq_trigger_operations)) + pr_err("Failed to register proc interface\n"); +} + +#else + +static inline void sysrq_init_procfs(void) +{ +} + +#endif /* CONFIG_PROC_FS */ + static int __init sysrq_init(void) { - proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); + sysrq_init_procfs(); + + if (sysrq_on()) + sysrq_register_handler(); + return 0; } module_init(sysrq_init); -#endif diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 99adcdc0d3ca..4496322e28dd 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -39,41 +39,34 @@ struct sysrq_key_op { #ifdef CONFIG_MAGIC_SYSRQ -extern int sysrq_on(void); - -/* - * Do not use this one directly: - */ -extern int __sysrq_enabled; - /* Generic SysRq interface -- you may call it from any device driver, supplying * ASCII code of the key, pointer to registers and kbd/tty structs (if they * are available -- else NULL's). */ void handle_sysrq(int key, struct tty_struct *tty); -void __handle_sysrq(int key, struct tty_struct *tty, int check_mask); int register_sysrq_key(int key, struct sysrq_key_op *op); int unregister_sysrq_key(int key, struct sysrq_key_op *op); struct sysrq_key_op *__sysrq_get_key_op(int key); +int sysrq_toggle_support(int enable_mask); + #else -static inline int sysrq_on(void) +static inline void handle_sysrq(int key, struct tty_struct *tty) { - return 0; } -static inline int __reterr(void) + +static inline int register_sysrq_key(int key, struct sysrq_key_op *op) { return -EINVAL; } -static inline void handle_sysrq(int key, struct tty_struct *tty) + +static inline int unregister_sysrq_key(int key, struct sysrq_key_op *op) { + return -EINVAL; } -#define register_sysrq_key(ig,nore) __reterr() -#define unregister_sysrq_key(ig,nore) __reterr() - #endif #endif /* _LINUX_SYSRQ_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f5fc12..ce724a0dd0bb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -163,6 +163,27 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_MAGIC_SYSRQ +static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ + +static int sysrq_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int error; + + error = proc_dointvec(table, write, buffer, lenp, ppos); + if (error) + return error; + + if (write) + sysrq_toggle_support(__sysrq_enabled); + + return 0; +} + +#endif + static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { @@ -567,7 +588,7 @@ static struct ctl_table kern_table[] = { .data = &__sysrq_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysrq_sysctl_handler, }, #endif #ifdef CONFIG_PROC_SYSCTL -- cgit v1.3-8-gc7d7 From 00b7c3395aec3df43de5bd02a3c5a099ca51169f Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Wed, 5 May 2010 00:26:45 +0000 Subject: sysctl: refactor integer handling proc code (Based on Octavian's work, and I modified a lot.) As we are about to add another integer handling proc function a little bit of cleanup is in order: add a few helper functions to improve code readability and decrease code duplication. In the process a bug is also fixed: if the user specifies a number with more then 20 digits it will be interpreted as two integers (e.g. 10000...13 will be interpreted as 100.... and 13). Behavior for EFAULT handling was changed as well. Previous to this patch, when an EFAULT error occurred in the middle of a write operation, although some of the elements were set, that was not acknowledged to the user (by shorting the write and returning the number of bytes accepted). EFAULT is now treated just like any other errors by acknowledging the amount of bytes accepted. Signed-off-by: Octavian Purdila Signed-off-by: WANG Cong Cc: Eric W. Biederman Signed-off-by: David S. Miller --- kernel/sysctl.c | 390 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 229 insertions(+), 161 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f5fc12..4a976208de29 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2040,8 +2040,122 @@ int proc_dostring(struct ctl_table *table, int write, buffer, lenp, ppos); } +static size_t proc_skip_spaces(char **buf) +{ + size_t ret; + char *tmp = skip_spaces(*buf); + ret = tmp - *buf; + *buf = tmp; + return ret; +} + +#define TMPBUFLEN 22 +/** + * proc_get_long - reads an ASCII formated integer from a user buffer + * + * @buf - a kernel buffer + * @size - size of the kernel buffer + * @val - this is where the number will be stored + * @neg - set to %TRUE if number is negative + * @perm_tr - a vector which contains the allowed trailers + * @perm_tr_len - size of the perm_tr vector + * @tr - pointer to store the trailer character + * + * In case of success 0 is returned and buf and size are updated with + * the amount of bytes read. If tr is non NULL and a trailing + * character exist (size is non zero after returning from this + * function) tr is updated with the trailing character. + */ +static int proc_get_long(char **buf, size_t *size, + unsigned long *val, bool *neg, + const char *perm_tr, unsigned perm_tr_len, char *tr) +{ + int len; + char *p, tmp[TMPBUFLEN]; + + if (!*size) + return -EINVAL; + + len = *size; + if (len > TMPBUFLEN - 1) + len = TMPBUFLEN - 1; + + memcpy(tmp, *buf, len); + + tmp[len] = 0; + p = tmp; + if (*p == '-' && *size > 1) { + *neg = true; + p++; + } else + *neg = false; + if (!isdigit(*p)) + return -EINVAL; + + *val = simple_strtoul(p, &p, 0); + + len = p - tmp; + + /* We don't know if the next char is whitespace thus we may accept + * invalid integers (e.g. 1234...a) or two integers instead of one + * (e.g. 123...1). So lets not allow such large numbers. */ + if (len == TMPBUFLEN - 1) + return -EINVAL; + + if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) + return -EINVAL; -static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, + if (tr && (len < *size)) + *tr = *p; + + *buf += len; + *size -= len; + + return 0; +} + +/** + * proc_put_long - coverts an integer to a decimal ASCII formated string + * + * @buf - the user buffer + * @size - the size of the user buffer + * @val - the integer to be converted + * @neg - sign of the number, %TRUE for negative + * + * In case of success 0 is returned and buf and size are updated with + * the amount of bytes read. + */ +static int proc_put_long(void __user **buf, size_t *size, unsigned long val, + bool neg) +{ + int len; + char tmp[TMPBUFLEN], *p = tmp; + + sprintf(p, "%s%lu", neg ? "-" : "", val); + len = strlen(tmp); + if (len > *size) + len = *size; + if (copy_to_user(*buf, tmp, len)) + return -EFAULT; + *size -= len; + *buf += len; + return 0; +} +#undef TMPBUFLEN + +static int proc_put_char(void __user **buf, size_t *size, char c) +{ + if (*size) { + char __user **buffer = (char __user **)buf; + if (put_user(c, *buffer)) + return -EFAULT; + (*size)--, (*buffer)++; + *buf = *buffer; + } + return 0; +} + +static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2050,33 +2164,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } else { int val = *valp; if (val < 0) { - *negp = -1; + *negp = true; *lvalp = (unsigned long)-val; } else { - *negp = 0; + *negp = false; *lvalp = (unsigned long)val; } } return 0; } +static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; + static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { -#define TMPBUFLEN 21 - int *i, vleft, first = 1, neg; - unsigned long lval; - size_t left, len; - - char buf[TMPBUFLEN], *p; - char __user *s = buffer; + int *i, vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; - if (!tbl_data || !table->maxlen || !*lenp || - (*ppos && !write)) { + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } @@ -2088,89 +2200,69 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!conv) conv = do_proc_dointvec_conv; + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + for (; left && vleft--; i++, first=0) { - if (write) { - while (left) { - char c; - if (get_user(c, s)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - s++; - } - if (!left) - break; - neg = 0; - len = left; - if (len > sizeof(buf) - 1) - len = sizeof(buf) - 1; - if (copy_from_user(buf, s, len)) - return -EFAULT; - buf[len] = 0; - p = buf; - if (*p == '-' && left > 1) { - neg = 1; - p++; - } - if (*p < '0' || *p > '9') - break; + unsigned long lval; + bool neg; - lval = simple_strtoul(p, &p, 0); + if (write) { + left -= proc_skip_spaces(&kbuf); - len = p-buf; - if ((len < left) && *p && !isspace(*p)) + err = proc_get_long(&kbuf, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) break; - s += len; - left -= len; - - if (conv(&neg, &lval, i, 1, data)) + if (conv(&neg, &lval, i, 1, data)) { + err = -EINVAL; break; + } } else { - p = buf; + if (conv(&neg, &lval, i, 0, data)) { + err = -EINVAL; + break; + } if (!first) - *p++ = '\t'; - - if (conv(&neg, &lval, i, 0, data)) + err = proc_put_char(&buffer, &left, '\t'); + if (err) + break; + err = proc_put_long(&buffer, &left, lval, neg); + if (err) break; - - sprintf(p, "%s%lu", neg ? "-" : "", lval); - len = strlen(buf); - if (len > left) - len = left; - if(copy_to_user(s, buf, len)) - return -EFAULT; - left -= len; - s += len; } } - if (!write && !first && left) { - if(put_user('\n', s)) - return -EFAULT; - left--, s++; - } + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err) + left -= proc_skip_spaces(&kbuf); +free: if (write) { - while (left) { - char c; - if (get_user(c, s++)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - } + free_page(page); + if (first) + return err ? : -EINVAL; } - if (write && first) - return -EINVAL; *lenp -= left; *ppos += *lenp; - return 0; -#undef TMPBUFLEN + return err; } static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { @@ -2238,8 +2330,8 @@ struct do_proc_dointvec_minmax_conv_param { int *max; }; -static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, - int *valp, +static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, int write, void *data) { struct do_proc_dointvec_minmax_conv_param *param = data; @@ -2252,10 +2344,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, } else { int val = *valp; if (val < 0) { - *negp = -1; + *negp = true; *lvalp = (unsigned long)-val; } else { - *negp = 0; + *negp = false; *lvalp = (unsigned long)val; } } @@ -2295,102 +2387,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int unsigned long convmul, unsigned long convdiv) { -#define TMPBUFLEN 21 - unsigned long *i, *min, *max, val; - int vleft, first=1, neg; - size_t len, left; - char buf[TMPBUFLEN], *p; - char __user *s = buffer; - - if (!data || !table->maxlen || !*lenp || - (*ppos && !write)) { + unsigned long *i, *min, *max; + int vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; + + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - + i = (unsigned long *) data; min = (unsigned long *) table->extra1; max = (unsigned long *) table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; - + + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + for (; left && vleft--; i++, min++, max++, first=0) { + unsigned long val; + if (write) { - while (left) { - char c; - if (get_user(c, s)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - s++; - } - if (!left) - break; - neg = 0; - len = left; - if (len > TMPBUFLEN-1) - len = TMPBUFLEN-1; - if (copy_from_user(buf, s, len)) - return -EFAULT; - buf[len] = 0; - p = buf; - if (*p == '-' && left > 1) { - neg = 1; - p++; - } - if (*p < '0' || *p > '9') - break; - val = simple_strtoul(p, &p, 0) * convmul / convdiv ; - len = p-buf; - if ((len < left) && *p && !isspace(*p)) + bool neg; + + left -= proc_skip_spaces(&kbuf); + + err = proc_get_long(&kbuf, &left, &val, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) break; if (neg) - val = -val; - s += len; - left -= len; - - if(neg) continue; if ((min && val < *min) || (max && val > *max)) continue; *i = val; } else { - p = buf; + val = convdiv * (*i) / convmul; if (!first) - *p++ = '\t'; - sprintf(p, "%lu", convdiv * (*i) / convmul); - len = strlen(buf); - if (len > left) - len = left; - if(copy_to_user(s, buf, len)) - return -EFAULT; - left -= len; - s += len; + err = proc_put_char(&buffer, &left, '\t'); + err = proc_put_long(&buffer, &left, val, false); + if (err) + break; } } - if (!write && !first && left) { - if(put_user('\n', s)) - return -EFAULT; - left--, s++; - } + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err) + left -= proc_skip_spaces(&kbuf); +free: if (write) { - while (left) { - char c; - if (get_user(c, s++)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - } + free_page(page); + if (first) + return err ? : -EINVAL; } - if (write && first) - return -EINVAL; *lenp -= left; *ppos += *lenp; - return 0; -#undef TMPBUFLEN + return err; } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, @@ -2451,7 +2519,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, } -static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2463,10 +2531,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = lval / HZ; @@ -2474,7 +2542,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2486,10 +2554,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_clock_t(lval); @@ -2497,7 +2565,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2507,10 +2575,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_msecs(lval); -- cgit v1.3-8-gc7d7 From 9f977fb7ae9ddf565b4800854212fb9a1ed6c2ea Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 5 May 2010 00:26:55 +0000 Subject: sysctl: add proc_do_large_bitmap The new function can be used to read/write large bitmaps via /proc. A comma separated range format is used for compact output and input (e.g. 1,3-4,10-10). Writing into the file will first reset the bitmap then update it based on the given input. Signed-off-by: Octavian Purdila Signed-off-by: WANG Cong Cc: Eric W. Biederman Signed-off-by: David S. Miller --- include/linux/sysctl.h | 2 + kernel/sysctl.c | 161 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index f66014c90c9f..7bb5cb64f3b8 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -980,6 +980,8 @@ extern int proc_doulongvec_minmax(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void __user *, size_t *, loff_t *); +extern int proc_do_large_bitmap(struct ctl_table *, int, + void __user *, size_t *, loff_t *); /* * Register a set of sysctl names by calling register_sysctl_table diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4a976208de29..bcfb79e94ec7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2049,6 +2049,16 @@ static size_t proc_skip_spaces(char **buf) return ret; } +static void proc_skip_char(char **buf, size_t *size, const char v) +{ + while (*size) { + if (**buf != v) + break; + (*size)--; + (*buf)++; + } +} + #define TMPBUFLEN 22 /** * proc_get_long - reads an ASCII formated integer from a user buffer @@ -2675,6 +2685,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, return 0; } +/** + * proc_do_large_bitmap - read/write from/to a large bitmap + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * The bitmap is stored at table->data and the bitmap length (in bits) + * in table->maxlen. + * + * We use a range comma separated format (e.g. 1,3-4,10-10) so that + * large bitmaps may be represented in a compact manner. Writing into + * the file will clear the bitmap then update it with the given input. + * + * Returns 0 on success. + */ +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + bool first = 1; + size_t left = *lenp; + unsigned long bitmap_len = table->maxlen; + unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *tmp_bitmap = NULL; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + + if (!bitmap_len || !left || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + unsigned long page = 0; + char *kbuf; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + free_page(page); + return -EFAULT; + } + kbuf[left] = 0; + + tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), + GFP_KERNEL); + if (!tmp_bitmap) { + free_page(page); + return -ENOMEM; + } + proc_skip_char(&kbuf, &left, '\n'); + while (!err && left) { + unsigned long val_a, val_b; + bool neg; + + err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + sizeof(tr_a), &c); + if (err) + break; + if (val_a >= bitmap_len || neg) { + err = -EINVAL; + break; + } + + val_b = val_a; + if (left) { + kbuf++; + left--; + } + + if (c == '-') { + err = proc_get_long(&kbuf, &left, &val_b, + &neg, tr_b, sizeof(tr_b), + &c); + if (err) + break; + if (val_b >= bitmap_len || neg || + val_a > val_b) { + err = -EINVAL; + break; + } + if (left) { + kbuf++; + left--; + } + } + + while (val_a <= val_b) + set_bit(val_a++, tmp_bitmap); + + first = 0; + proc_skip_char(&kbuf, &left, '\n'); + } + free_page(page); + } else { + unsigned long bit_a, bit_b = 0; + + while (left) { + bit_a = find_next_bit(bitmap, bitmap_len, bit_b); + if (bit_a >= bitmap_len) + break; + bit_b = find_next_zero_bit(bitmap, bitmap_len, + bit_a + 1) - 1; + + if (!first) { + err = proc_put_char(&buffer, &left, ','); + if (err) + break; + } + err = proc_put_long(&buffer, &left, bit_a, false); + if (err) + break; + if (bit_a != bit_b) { + err = proc_put_char(&buffer, &left, '-'); + if (err) + break; + err = proc_put_long(&buffer, &left, bit_b, false); + if (err) + break; + } + + first = 0; bit_b++; + } + if (!err) + err = proc_put_char(&buffer, &left, '\n'); + } + + if (!err) { + if (write) { + if (*ppos) + bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); + else + memcpy(bitmap, tmp_bitmap, + BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); + } + kfree(tmp_bitmap); + *lenp -= left; + *ppos += *lenp; + return 0; + } else { + kfree(tmp_bitmap); + return err; + } +} + #else /* CONFIG_PROC_FS */ int proc_dostring(struct ctl_table *table, int write, -- cgit v1.3-8-gc7d7 From ab3c68ee5fd329ba48094d3417fd60e30ea14a87 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 17 May 2010 10:00:21 +0200 Subject: [S390] debug: enable exception-trace debug facility The exception-trace facility on x86 and other architectures prints traces to dmesg whenever a user space application crashes. s390 has such a feature since ages however it is called userprocess_debug and is enabled differently. This patch makes sure that whenever one of the two procfs files /proc/sys/kernel/userprocess_debug /proc/sys/debug/exception-trace is modified the contents of the second one changes as well. That way we keep backwards compatibilty but also support the same interface like other architectures do. Besides that the output of the traces is improved since it will now also contain the corresponding filename of the vma (when available) where the process caused a fault or trap. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 7 ------- arch/s390/kernel/traps.c | 31 +++++++++++++------------------ arch/s390/mm/fault.c | 32 +++++++++++++++++--------------- kernel/sysctl.c | 5 +++-- 4 files changed, 33 insertions(+), 42 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 0d8cd9bbe101..79d0ca086820 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -444,13 +444,6 @@ config FORCE_MAX_ZONEORDER int default "9" -config PROCESS_DEBUG - bool "Show crashed user process info" - help - Say Y to print all process fault locations to the console. This is - a debugging option; you probably do not want to set it unless you - are an S390 port maintainer. - config PFAULT bool "Pseudo page fault support" help diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index e605f070610c..5d8f0f3d0250 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -46,13 +46,7 @@ pgm_check_handler_t *pgm_check_table[128]; -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_PROCESS_DEBUG -int sysctl_userprocess_debug = 1; -#else -int sysctl_userprocess_debug = 0; -#endif -#endif +int show_unhandled_signals; extern pgm_check_handler_t do_protection_exception; extern pgm_check_handler_t do_dat_exception; @@ -315,18 +309,19 @@ void die(const char * str, struct pt_regs * regs, long err) do_exit(SIGSEGV); } -static void inline -report_user_fault(long interruption_code, struct pt_regs *regs) +static void inline report_user_fault(struct pt_regs *regs, long int_code, + int signr) { -#if defined(CONFIG_SYSCTL) - if (!sysctl_userprocess_debug) + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; -#endif -#if defined(CONFIG_SYSCTL) || defined(CONFIG_PROCESS_DEBUG) - printk("User process fault: interruption code 0x%lX\n", - interruption_code); + if (!unhandled_signal(current, signr)) + return; + if (!printk_ratelimit()) + return; + printk("User process fault: interruption code 0x%lX ", int_code); + print_vma_addr("in ", regs->psw.addr & PSW_ADDR_INSN); + printk("\n"); show_regs(regs); -#endif } int is_valid_bugaddr(unsigned long addr) @@ -354,7 +349,7 @@ static void __kprobes inline do_trap(long interruption_code, int signr, tsk->thread.trap_no = interruption_code & 0xffff; force_sig_info(signr, info, tsk); - report_user_fault(interruption_code, regs); + report_user_fault(regs, interruption_code, signr); } else { const struct exception_table_entry *fixup; fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); @@ -390,7 +385,7 @@ static void default_trap_handler(struct pt_regs * regs, long interruption_code) { if (regs->psw.mask & PSW_MASK_PSTATE) { local_irq_enable(); - report_user_fault(interruption_code, regs); + report_user_fault(regs, interruption_code, SIGSEGV); do_exit(SIGSEGV); } else die("Unknown program exception", regs, interruption_code); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 3040d7c78fe0..2505b2ea0ef1 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -48,10 +48,6 @@ #define __PF_RES_FIELD 0x8000000000000000ULL #endif /* CONFIG_64BIT */ -#ifdef CONFIG_SYSCTL -extern int sysctl_userprocess_debug; -#endif - #define VM_FAULT_BADCONTEXT 0x010000 #define VM_FAULT_BADMAP 0x020000 #define VM_FAULT_BADACCESS 0x040000 @@ -120,6 +116,22 @@ static inline int user_space_fault(unsigned long trans_exc_code) return trans_exc_code != 3; } +static inline void report_user_fault(struct pt_regs *regs, long int_code, + int signr, unsigned long address) +{ + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) + return; + if (!unhandled_signal(current, signr)) + return; + if (!printk_ratelimit()) + return; + printk("User process fault: interruption code 0x%lX ", int_code); + print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN); + printk("\n"); + printk("failing address: %lX\n", address); + show_regs(regs); +} + /* * Send SIGSEGV to task. This is an external routine * to keep the stack usage of do_page_fault small. @@ -133,17 +145,7 @@ static noinline void do_sigsegv(struct pt_regs *regs, long int_code, address = trans_exc_code & __FAIL_ADDR_MASK; current->thread.prot_addr = address; current->thread.trap_no = int_code; -#if defined(CONFIG_SYSCTL) || defined(CONFIG_PROCESS_DEBUG) -#if defined(CONFIG_SYSCTL) - if (sysctl_userprocess_debug) -#endif - { - printk("User process fault: interruption code 0x%lX\n", - int_code); - printk("failing address: %lX\n", address); - show_regs(regs); - } -#endif + report_user_fault(regs, int_code, SIGSEGV, address); si.si_signo = SIGSEGV; si.si_code = si_code; si.si_addr = (void __user *) address; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f5fc12..90f536d84643 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "userprocess_debug", - .data = &sysctl_userprocess_debug, + .data = &show_unhandled_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -1431,7 +1431,8 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) +#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ + defined(CONFIG_S390) { .procname = "exception-trace", .data = &show_unhandled_signals, -- cgit v1.3-8-gc7d7 From b492e95be0ae672922f4734acf3f5d35c30be948 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 May 2010 21:03:16 +0200 Subject: pipe: set lower and upper limit on max pages in the pipe page array We need at least two to guarantee proper POSIX behaviour, so never allow a smaller limit than that. Also expose a /proc/sys/fs/pipe-max-pages sysctl file that allows root to define a sane upper limit. Make it default to 16 times the default size, which is 16 pages. Signed-off-by: Jens Axboe --- fs/pipe.c | 15 +++++++++++++++ include/linux/pipe_fs_i.h | 2 ++ kernel/sysctl.c | 9 +++++++++ 3 files changed, 26 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/fs/pipe.c b/fs/pipe.c index 054b8a6a2c7a..d79872eba09a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -19,10 +19,17 @@ #include #include #include +#include #include #include +/* + * The max size that a non-root user is allowed to grow the pipe. Can + * be set by root in /proc/sys/fs/pipe-max-pages + */ +unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -1162,6 +1169,14 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case F_SETPIPE_SZ: + if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) + return -EINVAL; + /* + * The pipe needs to be at least 2 pages large to + * guarantee POSIX behaviour. + */ + if (arg < 2) + return -EINVAL; ret = pipe_set_size(pipe, arg); break; case F_GETPIPE_SZ: diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 65f4282fcbaf..16de3933c45e 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -139,6 +139,8 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); +extern unsigned int pipe_max_pages; + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f5fc12..c649d1c5fe09 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -1423,6 +1424,14 @@ static struct ctl_table fs_table[] = { .child = binfmt_misc_table, }, #endif + { + .procname = "pipe-max-pages", + .data = &pipe_max_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &two, + }, /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.3-8-gc7d7 From 0fc377bd648d1935ea34665239e3f0a274b71698 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 21 May 2010 11:29:53 -0700 Subject: sysctl: fix kernel-doc notation and typos Fix kernel-doc warnings, kernel-doc special characters, and typos in recent kernel/sysctl.c additions. Signed-off-by: Randy Dunlap Cc: Amerigo Wang Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b12583047757..30acc6c87b1b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2083,20 +2083,20 @@ static void proc_skip_char(char **buf, size_t *size, const char v) #define TMPBUFLEN 22 /** - * proc_get_long - reads an ASCII formated integer from a user buffer + * proc_get_long - reads an ASCII formatted integer from a user buffer * - * @buf - a kernel buffer - * @size - size of the kernel buffer - * @val - this is where the number will be stored - * @neg - set to %TRUE if number is negative - * @perm_tr - a vector which contains the allowed trailers - * @perm_tr_len - size of the perm_tr vector - * @tr - pointer to store the trailer character + * @buf: a kernel buffer + * @size: size of the kernel buffer + * @val: this is where the number will be stored + * @neg: set to %TRUE if number is negative + * @perm_tr: a vector which contains the allowed trailers + * @perm_tr_len: size of the perm_tr vector + * @tr: pointer to store the trailer character * - * In case of success 0 is returned and buf and size are updated with - * the amount of bytes read. If tr is non NULL and a trailing - * character exist (size is non zero after returning from this - * function) tr is updated with the trailing character. + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes read. If @tr is non-NULL and a trailing + * character exists (size is non-zero after returning from this + * function), @tr is updated with the trailing character. */ static int proc_get_long(char **buf, size_t *size, unsigned long *val, bool *neg, @@ -2147,15 +2147,15 @@ static int proc_get_long(char **buf, size_t *size, } /** - * proc_put_long - coverts an integer to a decimal ASCII formated string + * proc_put_long - converts an integer to a decimal ASCII formatted string * - * @buf - the user buffer - * @size - the size of the user buffer - * @val - the integer to be converted - * @neg - sign of the number, %TRUE for negative + * @buf: the user buffer + * @size: the size of the user buffer + * @val: the integer to be converted + * @neg: sign of the number, %TRUE for negative * - * In case of success 0 is returned and buf and size are updated with - * the amount of bytes read. + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes written. */ static int proc_put_long(void __user **buf, size_t *size, unsigned long val, bool neg) -- cgit v1.3-8-gc7d7 From 76ab0f530e4a01d4dc20cdc1d5e87753c579dc18 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:28 -0700 Subject: mm: compaction: add /proc trigger for memory compaction Add a proc file /proc/sys/vm/compact_memory. When an arbitrary value is written to the file, all zones are compacted. The expected user of such a trigger is a job scheduler that prepares the system before the target application runs. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 10 ++++++++ include/linux/compaction.h | 6 +++++ kernel/sysctl.c | 10 ++++++++ mm/compaction.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 88 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6c7d18c53f84..56dd29b97a91 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -19,6 +19,7 @@ files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: - block_dump +- compact_memory - dirty_background_bytes - dirty_background_ratio - dirty_bytes @@ -64,6 +65,15 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt. ============================================================== +compact_memory + +Available only when CONFIG_COMPACTION is set. When 1 is written to the file, +all zones are compacted such that free memory is available in contiguous +blocks where possible. This can be important for example in the allocation of +huge pages although processes will also directly compact memory as required. + +============================================================== + dirty_background_bytes Contains the amount of dirty memory at which the pdflush background writeback diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 465ca88615ed..572388880ba8 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -6,4 +6,10 @@ #define COMPACT_PARTIAL 1 #define COMPACT_COMPLETE 2 +#ifdef CONFIG_COMPACTION +extern int sysctl_compact_memory; +extern int sysctl_compaction_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos); +#endif /* CONFIG_COMPACTION */ + #endif /* _LINUX_COMPACTION_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4c93486b45d1..284f330d6a01 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1121,6 +1122,15 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, }, +#ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", + .data = &sysctl_compact_memory, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, +#endif /* CONFIG_COMPACTION */ { .procname = "min_free_kbytes", .data = &min_free_kbytes, diff --git a/mm/compaction.c b/mm/compaction.c index be1ff3f7552b..77854fbc0f56 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "internal.h" /* @@ -391,3 +392,64 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } + +/* Compact all zones within a node */ +static int compact_node(int nid) +{ + int zoneid; + pg_data_t *pgdat; + struct zone *zone; + + if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) + return -EINVAL; + pgdat = NODE_DATA(nid); + + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct compact_control cc = { + .nr_freepages = 0, + .nr_migratepages = 0, + }; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + return 0; +} + +/* Compact all nodes in the system */ +static int compact_nodes(void) +{ + int nid; + + for_each_online_node(nid) + compact_node(nid); + + return COMPACT_COMPLETE; +} + +/* The written value is actually unused, all memory is compacted */ +int sysctl_compact_memory; + +/* This is the entry point for compacting all nodes via /proc/sys/vm */ +int sysctl_compaction_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + if (write) + return compact_nodes(); + + return 0; +} -- cgit v1.3-8-gc7d7 From 5e7719058079a1423ccce56148b0aaa56b2df821 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:31 -0700 Subject: mm: compaction: add a tunable that decides when memory should be compacted and when it should be reclaimed The kernel applies some heuristics when deciding if memory should be compacted or reclaimed to satisfy a high-order allocation. One of these is based on the fragmentation. If the index is below 500, memory will not be compacted. This choice is arbitrary and not based on data. To help optimise the system and set a sensible default for this value, this patch adds a sysctl extfrag_threshold. The kernel will only compact memory if the fragmentation index is above the extfrag_threshold. [randy.dunlap@oracle.com: Fix build errors when proc fs is not configured] Signed-off-by: Mel Gorman Signed-off-by: Randy Dunlap Cc: Rik van Riel Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 15 +++++++++++++++ include/linux/compaction.h | 3 +++ kernel/sysctl.c | 15 +++++++++++++++ mm/compaction.c | 12 +++++++++++- 4 files changed, 44 insertions(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 56dd29b97a91..5fdbb612aeb8 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -27,6 +27,7 @@ Currently, these files are in /proc/sys/vm: - dirty_ratio - dirty_writeback_centisecs - drop_caches +- extfrag_threshold - hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode @@ -149,6 +150,20 @@ user should run `sync' first. ============================================================== +extfrag_threshold + +This parameter affects whether the kernel will compact memory or direct +reclaim to satisfy a high-order allocation. /proc/extfrag_index shows what +the fragmentation index for each order is in each zone in the system. Values +tending towards 0 imply allocations would fail due to lack of memory, +values towards 1000 imply failures are due to fragmentation and -1 implies +that the allocation will succeed as long as watermarks are met. + +The kernel will not compact memory in a zone if the +fragmentation index is <= extfrag_threshold. The default value is 500. + +============================================================== + hugepages_treat_as_movable This parameter is only useful when kernelcore= is specified at boot time to diff --git a/include/linux/compaction.h b/include/linux/compaction.h index eed40ec4280b..3719325c6091 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -15,6 +15,9 @@ extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); +extern int sysctl_extfrag_threshold; +extern int sysctl_extfrag_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos); extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 284f330d6a01..84ff5e75c084 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -263,6 +263,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */ static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif +#ifdef CONFIG_COMPACTION +static int min_extfrag_threshold; +static int max_extfrag_threshold = 1000; +#endif + static struct ctl_table kern_table[] = { { .procname = "sched_child_runs_first", @@ -1130,6 +1135,16 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = sysctl_compaction_handler, }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_extfrag_handler, + .extra1 = &min_extfrag_threshold, + .extra2 = &max_extfrag_threshold, + }, + #endif /* CONFIG_COMPACTION */ { .procname = "min_free_kbytes", diff --git a/mm/compaction.c b/mm/compaction.c index 9583e193dc47..94cce51b0b35 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -433,6 +433,8 @@ static unsigned long compact_zone_order(struct zone *zone, return compact_zone(zone, &cc); } +int sysctl_extfrag_threshold = 500; + /** * try_to_compact_pages - Direct compact to satisfy a high-order allocation * @zonelist: The zonelist used for the current allocation @@ -491,7 +493,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, * Only compact if a failure would be due to fragmentation. */ fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= 500) + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) continue; if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { @@ -572,6 +574,14 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, return 0; } +int sysctl_extfrag_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, buffer, length, ppos); + + return 0; +} + #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) ssize_t sysfs_compact_node(struct sys_device *dev, struct sysdev_attribute *attr, -- cgit v1.3-8-gc7d7 From 563b04671017ea00ba563ebeebdc36bce79b1b60 Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Tue, 25 May 2010 16:10:14 -0700 Subject: proc_dointvec: write a single value The commit 00b7c3395aec3df43de5bd02a3c5a099ca51169f "sysctl: refactor integer handling proc code" modified the behaviour of writing to /proc. Before the commit, write("1\n") to /proc/sys/kernel/printk succeeded. But now it returns EINVAL. This commit supports writing a single value to a multi-valued entry. Signed-off-by: J. R. Okajima Reviewed-and-tested-by: WANG Cong Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b12583047757..f948f20f09cb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2253,6 +2253,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (write) { left -= proc_skip_spaces(&kbuf); + if (!left) + break; err = proc_get_long(&kbuf, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); @@ -2279,7 +2281,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); - if (write && !err) + if (write && !err && left) left -= proc_skip_spaces(&kbuf); free: if (write) { -- cgit v1.3-8-gc7d7 From ff9da691c0498ff81fdd014e7a0731dab2337dac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jun 2010 14:54:39 +0200 Subject: pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface This changes the interface to be based on bytes instead. The API matches that of F_SETPIPE_SZ in that it rounds up the passed in size so that the resulting page array is a power-of-2 in size. The proc file is renamed to /proc/sys/fs/pipe-max-size to reflect this change. Signed-off-by: Jens Axboe --- fs/pipe.c | 54 ++++++++++++++++++++++++++++++++++++----------- include/linux/pipe_fs_i.h | 4 +++- kernel/sysctl.c | 8 +++---- 3 files changed, 49 insertions(+), 17 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/fs/pipe.c b/fs/pipe.c index f98fae3e36b0..69c4c7c13ea9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -26,9 +26,14 @@ /* * The max size that a non-root user is allowed to grow the pipe. Can - * be set by root in /proc/sys/fs/pipe-max-pages + * be set by root in /proc/sys/fs/pipe-max-size */ -unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; +unsigned int pipe_max_size = 1048576; + +/* + * Minimum pipe size, as required by POSIX + */ +unsigned int pipe_min_size = PAGE_SIZE; /* * We use a start+len construction, which provides full use of the @@ -1156,6 +1161,35 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) return nr_pages * PAGE_SIZE; } +/* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* + * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * will return an error. + */ +int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + if (ret < 0 || !write) + return ret; + + pipe_max_size = round_pipe_size(pipe_max_size); + return ret; +} + long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; @@ -1169,23 +1203,19 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case F_SETPIPE_SZ: { - unsigned long nr_pages; + unsigned int size, nr_pages; - /* - * Currently the array must be a power-of-2 size, so adjust - * upwards if needed. - */ - nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT; - nr_pages = roundup_pow_of_two(nr_pages); + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; - if (!capable(CAP_SYS_RESOURCE) && nr_pages > pipe_max_pages) { + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; - } else if (nr_pages < 1) { + } else if (nr_pages < PAGE_SIZE) { ret = -EINVAL; goto out; } - ret = pipe_set_size(pipe, arg); + ret = pipe_set_size(pipe, nr_pages); break; } case F_GETPIPE_SZ: diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 16de3933c45e..445796945ac9 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -139,7 +139,9 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); -extern unsigned int pipe_max_pages; +extern unsigned int pipe_max_size, pipe_min_size; +int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 997080f00e0b..d24f761f4876 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = { }, #endif { - .procname = "pipe-max-pages", - .data = &pipe_max_pages, + .procname = "pipe-max-size", + .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &two, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, }, /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.3-8-gc7d7