From e45c9effed903ba3fdbd6ef0498ee8989c35af0a Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Wed, 27 Oct 2010 15:33:30 -0700 Subject: isofs: work-around for Rock Ridge+Joliet CDs with empty ISO root directory If a CD has both Rock Ridge and Joliet extensions and the ISO root directory is empty, no files are visible. Disable Rock Ridge extensions in this case and use Joliet root directory instead. Signed-off-by: Ondrej Zary Cc: Al Viro Cc: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/inode.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 60c2b944d762..79cf7f616bbe 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -543,6 +543,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) return vol_desc_start; } +/* + * Check if root directory is empty (has less than 3 files). + * + * Used to detect broken CDs where ISO root directory is empty but Joliet root + * directory is OK. If such CD has Rock Ridge extensions, they will be disabled + * (and Joliet used instead) or else no files would be visible. + */ +static bool rootdir_empty(struct super_block *sb, unsigned long block) +{ + int offset = 0, files = 0, de_len; + struct iso_directory_record *de; + struct buffer_head *bh; + + bh = sb_bread(sb, block); + if (!bh) + return true; + while (files < 3) { + de = (struct iso_directory_record *) (bh->b_data + offset); + de_len = *(unsigned char *) de; + if (de_len == 0) + break; + files++; + offset += de_len; + } + brelse(bh); + return files < 3; +} + /* * Initialize the superblock and read the root inode. * @@ -842,6 +870,18 @@ root_found: if (IS_ERR(inode)) goto out_no_root; + /* + * Fix for broken CDs with Rock Ridge and empty ISO root directory but + * correct Joliet root directory. 
+ */ + if (sbi->s_rock == 1 && joliet_level && + rootdir_empty(s, sbi->s_firstdatazone)) { + printk(KERN_NOTICE + "ISOFS: primary root directory is empty. " + "Disabling Rock Ridge and switching to Joliet."); + sbi->s_rock = 0; + } + /* * If this disk has both Rock Ridge and Joliet on it, then we * want to use Rock Ridge by default. This can be overridden -- cgit v1.2.3-59-g8ed1b From 9b1bf12d5d51bca178dea21b04a0805e29d60cf1 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 27 Oct 2010 15:34:08 -0700 Subject: signals: move cred_guard_mutex from task_struct to signal_struct Oleg Nesterov pointed out we have to prevent multiple-threads-inside-exec itself and we can reuse ->cred_guard_mutex for it. Yes, concurrent execve() has no worth. Let's move ->cred_guard_mutex from task_struct to signal_struct. It naturally prevent multiple-threads-inside-exec. Signed-off-by: KOSAKI Motohiro Reviewed-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 10 +++++----- fs/proc/base.c | 8 ++++---- include/linux/init_task.h | 4 ++-- include/linux/sched.h | 7 ++++--- include/linux/tracehook.h | 2 +- kernel/cred.c | 4 +--- kernel/fork.c | 2 ++ kernel/ptrace.c | 4 ++-- 8 files changed, 21 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 3aa75b8888a1..9722909c4d88 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1083,14 +1083,14 @@ EXPORT_SYMBOL(setup_new_exec); */ int prepare_bprm_creds(struct linux_binprm *bprm) { - if (mutex_lock_interruptible(&current->cred_guard_mutex)) + if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; - mutex_unlock(&current->cred_guard_mutex); + mutex_unlock(&current->signal->cred_guard_mutex); return -ENOMEM; } @@ -1098,7 +1098,7 @@ void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { -
mutex_unlock(&current->cred_guard_mutex); + mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } kfree(bprm); @@ -1119,13 +1119,13 @@ void install_exec_creds(struct linux_binprm *bprm) * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); - mutex_unlock(&current->cred_guard_mutex); + mutex_unlock(&current->signal->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); /* * determine how safe it is to execute the proposed program - * - the caller must hold current->cred_guard_mutex to protect against + * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH */ int check_unsafe_exec(struct linux_binprm *bprm) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9b094c1c8465..f3d02ca461ec 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -226,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task) { struct mm_struct *mm; - if (mutex_lock_killable(&task->cred_guard_mutex)) + if (mutex_lock_killable(&task->signal->cred_guard_mutex)) return NULL; mm = get_task_mm(task); @@ -235,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task) mmput(mm); mm = NULL; } - mutex_unlock(&task->cred_guard_mutex); + mutex_unlock(&task->signal->cred_guard_mutex); return mm; } @@ -2354,14 +2354,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, goto out_free; /* Guard against adverse ptrace interaction */ - length = mutex_lock_interruptible(&task->cred_guard_mutex); + length = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (length < 0) goto out_free; length = security_setprocattr(task, (char*)file->f_path.dentry->d_name.name, (void*)page, count); - mutex_unlock(&task->cred_guard_mutex); + mutex_unlock(&task->signal->cred_guard_mutex); out_free: free_page((unsigned long) page); out: diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 2fea6c8ef6ba..1f8c06ce0fa6 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@
-29,6 +29,8 @@ extern struct fs_struct init_fs; .running = 0, \ .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ }, \ + .cred_guard_mutex = \ + __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ } extern struct nsproxy init_nsproxy; @@ -145,8 +147,6 @@ extern struct cred init_cred; .group_leader = &tsk, \ RCU_INIT_POINTER(.real_cred, &init_cred), \ RCU_INIT_POINTER(.cred, &init_cred), \ - .cred_guard_mutex = \ - __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ff5c8519abd..be7adb7588e5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -626,6 +626,10 @@ struct signal_struct { int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ + + struct mutex cred_guard_mutex; /* guard against foreign influences on + * credential calculations + * (notably. ptrace) */ }; /* Context switch must be unlocked if interrupts are to be enabled */ @@ -1305,9 +1309,6 @@ struct task_struct { * credentials (COW) */ const struct cred __rcu *cred; /* effective (overridable) subjective task * credentials (COW) */ - struct mutex cred_guard_mutex; /* guard against foreign influences on - * credential calculations - * (notably. ptrace) */ struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */ char comm[TASK_COMM_LEN]; /* executable name excluding path diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 10db0102a890..3a2e66d88a32 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -150,7 +150,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) * * Return %LSM_UNSAFE_* bits applied to an exec because of tracing. * - * @task->cred_guard_mutex is held by the caller through the do_execve(). + * @task->signal->cred_guard_mutex is held by the caller through the do_execve(). 
*/ static inline int tracehook_unsafe_exec(struct task_struct *task) { diff --git a/kernel/cred.c b/kernel/cred.c index 9a3e22641fe7..6a1aa004e376 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() - * - The caller must hold current->cred_guard_mutex + * - The caller must hold ->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { @@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - mutex_init(&p->cred_guard_mutex); - if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && diff --git a/kernel/fork.c b/kernel/fork.c index e87aaaaf5131..3b159c5991b7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -908,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + mutex_init(&sig->cred_guard_mutex); + return 0; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ea7ce0215cd1..99bbaa3e5b0d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task) * under ptrace. 
*/ retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(&task->cred_guard_mutex)) + if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) goto out; task_lock(task); @@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task) unlock_tasklist: write_unlock_irq(&tasklist_lock); unlock_creds: - mutex_unlock(&task->cred_guard_mutex); + mutex_unlock(&task->signal->cred_guard_mutex); out: return retval; } -- cgit v1.2.3-59-g8ed1b From 1b0d300bd0f047e2edaf9d4b6784189e6c67c3d1 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Wed, 27 Oct 2010 15:34:08 -0700 Subject: core_pattern: fix truncation by core_pattern handler with long parameters We met a parameter truncated issue, consider following: > echo "|/root/core_pattern_pipe_test %p /usr/libexec/blah-blah-blah \ %s %c %p %u %g 11 12345678901234567890123456789012345678 %t" > \ /proc/sys/kernel/core_pattern This is okay because the strings is less than CORENAME_MAX_SIZE. "cat /proc/sys/kernel/core_pattern" shows the whole string. but after we run core_pattern_pipe_test in man page, we found last parameter was truncated like below: argc[10]=<12807486> The root cause is core_pattern allows % specifiers, which need to be replaced during parse time, but the replace may expand the strings to larger than CORENAME_MAX_SIZE. So if the last parameter is % specifiers, the replace code is using snprintf(out_ptr, out_end - out_ptr, ...), this will write out of corename array. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Xiaotian Feng Cc: Alexander Viro Cc: Oleg Nesterov Cc: KOSAKI Motohiro Reviewed-by: Neil Horman Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 155 ++++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 95 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 9722909c4d88..ca01d2d0a6d4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -66,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core"; unsigned int core_pipe_limit; int suid_dumpable = 0; +struct core_name { + char *corename; + int used, size; +}; +static atomic_t call_count = ATOMIC_INIT(1); + /* The maximal length of core_pattern is also specified in sysctl.c */ static LIST_HEAD(formats); @@ -1459,127 +1465,148 @@ void set_binfmt(struct linux_binfmt *new) EXPORT_SYMBOL(set_binfmt); +static int expand_corename(struct core_name *cn) +{ + char *old_corename = cn->corename; + + cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); + cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); + + if (!cn->corename) { + kfree(old_corename); + return -ENOMEM; + } + + return 0; +} + +static int cn_printf(struct core_name *cn, const char *fmt, ...) +{ + char *cur; + int need; + int ret; + va_list arg; + + va_start(arg, fmt); + need = vsnprintf(NULL, 0, fmt, arg); + va_end(arg); + + if (likely(need < cn->size - cn->used - 1)) + goto out_printf; + + ret = expand_corename(cn); + if (ret) + goto expand_fail; + +out_printf: + cur = cn->corename + cn->used; + va_start(arg, fmt); + vsnprintf(cur, need + 1, fmt, arg); + va_end(arg); + cn->used += need; + return 0; + +expand_fail: + return ret; +} + /* format_corename will inspect the pattern parameter, and output a * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 
*/ -static int format_corename(char *corename, long signr) +static int format_corename(struct core_name *cn, long signr) { const struct cred *cred = current_cred(); const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); - char *out_ptr = corename; - char *const out_end = corename + CORENAME_MAX_SIZE; - int rc; int pid_in_pattern = 0; + int err = 0; + + cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); + cn->corename = kmalloc(cn->size, GFP_KERNEL); + cn->used = 0; + + if (!cn->corename) + return -ENOMEM; /* Repeat as long as we have more pattern to process and more output space */ while (*pat_ptr) { if (*pat_ptr != '%') { - if (out_ptr == out_end) + if (*pat_ptr == 0) goto out; - *out_ptr++ = *pat_ptr++; + err = cn_printf(cn, "%c", *pat_ptr++); } else { switch (*++pat_ptr) { + /* single % at the end, drop that */ case 0: goto out; /* Double percent, output one percent */ case '%': - if (out_ptr == out_end) - goto out; - *out_ptr++ = '%'; + err = cn_printf(cn, "%c", '%'); break; /* pid */ case 'p': pid_in_pattern = 1; - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", task_tgid_vnr(current)); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%d", + task_tgid_vnr(current)); break; /* uid */ case 'u': - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", cred->uid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%d", cred->uid); break; /* gid */ case 'g': - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", cred->gid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%d", cred->gid); break; /* signal that caused the coredump */ case 's': - rc = snprintf(out_ptr, out_end - out_ptr, - "%ld", signr); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%ld", signr); break; /* UNIX time of coredump */ case 't': { struct timeval tv; do_gettimeofday(&tv); - rc = snprintf(out_ptr, out_end - out_ptr, - "%lu", tv.tv_sec); - if (rc > 
out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%lu", tv.tv_sec); break; } /* hostname */ case 'h': down_read(&uts_sem); - rc = snprintf(out_ptr, out_end - out_ptr, - "%s", utsname()->nodename); + err = cn_printf(cn, "%s", + utsname()->nodename); up_read(&uts_sem); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; break; /* executable */ case 'e': - rc = snprintf(out_ptr, out_end - out_ptr, - "%s", current->comm); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%s", current->comm); break; /* core limit size */ case 'c': - rc = snprintf(out_ptr, out_end - out_ptr, - "%lu", rlimit(RLIMIT_CORE)); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, "%lu", + rlimit(RLIMIT_CORE)); break; default: break; } ++pat_ptr; } + + if (err) + return err; } + /* Backward compatibility with core_uses_pid: * * If core_pattern does not include a %p (as is the default) * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. 
*/ if (!ispipe && !pid_in_pattern && core_uses_pid) { - rc = snprintf(out_ptr, out_end - out_ptr, - ".%d", task_tgid_vnr(current)); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; + err = cn_printf(cn, ".%d", task_tgid_vnr(current)); + if (err) + return err; } out: - *out_ptr = 0; return ispipe; } @@ -1856,7 +1883,7 @@ static int umh_pipe_setup(struct subprocess_info *info) void do_coredump(long signr, int exit_code, struct pt_regs *regs) { struct core_state core_state; - char corename[CORENAME_MAX_SIZE + 1]; + struct core_name cn; struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; const struct cred *old_cred; @@ -1911,7 +1938,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) */ clear_thread_flag(TIF_SIGPENDING); - ispipe = format_corename(corename, signr); + ispipe = format_corename(&cn, signr); + + if (ispipe == -ENOMEM) { + printk(KERN_WARNING "format_corename failed\n"); + printk(KERN_WARNING "Aborting core\n"); + goto fail_corename; + } if (ispipe) { int dump_count; @@ -1948,7 +1981,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) goto fail_dropcount; } - helper_argv = argv_split(GFP_KERNEL, corename+1, NULL); + helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); if (!helper_argv) { printk(KERN_WARNING "%s failed to allocate memory\n", __func__); @@ -1961,7 +1994,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) argv_free(helper_argv); if (retval) { printk(KERN_INFO "Core dump to %s pipe failed\n", - corename); + cn.corename); goto close_fail; } } else { @@ -1970,7 +2003,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) if (cprm.limit < binfmt->min_coredump) goto fail_unlock; - cprm.file = filp_open(corename, + cprm.file = filp_open(cn.corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); if (IS_ERR(cprm.file)) @@ -2012,6 +2045,8 @@ fail_dropcount: if (ispipe) atomic_dec(&core_dump_count); fail_unlock: + kfree(cn.corename); 
+fail_corename: coredump_finish(mm); revert_creds(old_cred); fail_creds: -- cgit v1.2.3-59-g8ed1b From 895021552d6ffe8a4d076cb5c4b1e700c33e96e1 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 27 Oct 2010 15:34:09 -0700 Subject: coredump: default CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y The userland ELF tools have been coping with partial-segments core files for a few years now. Multiple distro builds are now setting this option. It behooves everyone who ever deals with core files to have more info dumped in there, especially as more and more people's compilers are producing build IDs. Make it the default. Anyone using older tools confused by these core files can configure this option off, or just change /proc/PID/coredump_filter after boot. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig.binfmt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index bb4cc5b8abc8..79e2ca7973b7 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC config CORE_DUMP_DEFAULT_ELF_HEADERS bool "Write ELF core dumps with partial segments" - default n + default y depends on BINFMT_ELF && ELF_CORE help ELF core dump files describe each memory mapping of the crashed @@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS inherited. See Documentation/filesystems/proc.txt for details. This config option changes the default setting of coredump_filter - seen at boot time. If unsure, say N. + seen at boot time. If unsure, say Y. config BINFMT_FLAT bool "Kernel support for flat binaries" -- cgit v1.2.3-59-g8ed1b From b40d4f84becd69275451baee7f0801c85eb58437 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 27 Oct 2010 15:34:10 -0700 Subject: /proc/pid/smaps: export amount of anonymous memory in a mapping Export the number of anonymous pages in a mapping via smaps. 
Even the private pages in a mapping backed by a file, would be marked as anonymous, when they are modified. Export this information to user-space via smaps. Exporting this count will help gdb to make a better decision on which areas need to be dumped in its coredump; and should be useful to others studying the memory usage of a process. Signed-off-by: Nikanth Karthikesan Acked-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 13 ++++++++++--- fs/proc/task_mmu.c | 6 ++++++ 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a563b74c7aef..976de6e19dd8 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -370,6 +370,7 @@ Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 892 kB +Anonymous: 0 kB Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB @@ -378,9 +379,15 @@ The first of these lines shows the same information as is displayed for the mapping in /proc/PID/maps. The remaining lines show the size of the mapping (size), the amount of the mapping that is currently resident in RAM (RSS), the process' proportional share of this mapping (PSS), the number of clean and -dirty shared pages in the mapping, and the number of clean and dirty private -pages in the mapping. The "Referenced" indicates the amount of memory -currently marked as referenced or accessed. +dirty private pages in the mapping. Note that even a page which is part of a +MAP_SHARED mapping, but has only a single pte mapped, i.e. is currently used +by only one process, is accounted as private and not as shared. "Referenced" +indicates the amount of memory currently marked as referenced or accessed. +"Anonymous" shows the amount of memory that does not belong to any file. 
Even +a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE +and a page is modified, the file page is replaced by a private anonymous copy. +"Swap" shows how much would-be-anonymous memory is also used, but out on +swap. This file is only present if the CONFIG_MMU kernel configuration option is enabled. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 871e25ed0069..da6b01d70f01 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -327,6 +327,7 @@ struct mem_size_stats { unsigned long private_clean; unsigned long private_dirty; unsigned long referenced; + unsigned long anonymous; unsigned long swap; u64 pss; }; @@ -357,6 +358,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!page) continue; + if (PageAnon(page)) + mss->anonymous += PAGE_SIZE; + mss->resident += PAGE_SIZE; /* Accumulate the size in pages that have been accessed. */ if (pte_young(ptent) || PageReferenced(page)) @@ -410,6 +414,7 @@ static int show_smap(struct seq_file *m, void *v) "Private_Clean: %8lu kB\n" "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" + "Anonymous: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" "MMUPageSize: %8lu kB\n", @@ -421,6 +426,7 @@ static int show_smap(struct seq_file *m, void *v) mss.private_clean >> 10, mss.private_dirty >> 10, mss.referenced >> 10, + mss.anonymous >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, vma_mmu_pagesize(vma) >> 10); -- cgit v1.2.3-59-g8ed1b From 19cd56c48da58bebc3a638e036bcab69469acd27 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 27 Oct 2010 15:34:12 -0700 Subject: procfs: fix /proc/softirqs formatting The length of the BLOCK_IPOLL string is making i's value be printed too far to the right. This patch fixes this and makes the output a bit neater. 
Currently: CPU0 HI: 0 TIMER: 599792 NET_TX: 2 NET_RX: 6 BLOCK: 80807 BLOCK_IOPOLL: 0 TASKLET: 20012 SCHED: 0 HRTIMER: 63 RCU: 619279 With patch: CPU0 HI: 0 TIMER: 585582 NET_TX: 2 NET_RX: 6 BLOCK: 80320 BLOCK_IOPOLL: 0 TASKLET: 19287 SCHED: 0 HRTIMER: 62 RCU: 604441 Signed-off-by: Davidlohr Bueso Acked-by: Keika Kobayashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/softirqs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 1807c2419f17..37994737c983 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -10,13 +10,13 @@ static int show_softirqs(struct seq_file *p, void *v) { int i, j; - seq_printf(p, " "); + seq_printf(p, " "); for_each_possible_cpu(i) seq_printf(p, "CPU%-8d", i); seq_printf(p, "\n"); for (i = 0; i < NR_SOFTIRQS; i++) { - seq_printf(p, "%8s:", softirq_to_name[i]); + seq_printf(p, "%12s:", softirq_to_name[i]); for_each_possible_cpu(j) seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); seq_printf(p, "\n"); -- cgit v1.2.3-59-g8ed1b From f2c66cd8eeddedb440f33bc0f5cec1ed7ae376cb Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 27 Oct 2010 15:34:13 -0700 Subject: /proc/stat: scalability of irq num per cpu /proc/stat shows the total number of all interrupts to each cpu. But when the number of IRQs are very large, it take very long time and 'cat /proc/stat' takes more than 10 secs. This is because sum of all irq events are counted when /proc/stat is read. This patch adds "sum of all irq" counter percpu and reduce read costs. The cost of reading /proc/stat is important because it's used by major applications as 'top', 'ps', 'w', etc.... 
A test on a mechin (4096cpu, 256 nodes, 4592 irqs) shows %time cat /proc/stat > /dev/null Before Patch: 12.627 sec After Patch: 2.459 sec Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Jack Steiner Acked-by: Jack Steiner Cc: Yinghai Lu Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 4 +--- include/linux/kernel_stat.h | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index bf31b03fc275..b80c620565bf 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -52,9 +52,7 @@ static int show_stat(struct seq_file *p, void *v) guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); guest_nice = cputime64_add(guest_nice, kstat_cpu(i).cpustat.guest_nice); - for_each_irq_nr(j) { - sum += kstat_irqs_cpu(j, i); - } + sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index c059044bc6dc..8b9b89085530 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -33,6 +33,7 @@ struct kernel_stat { #ifndef CONFIG_GENERIC_HARDIRQS unsigned int irqs[NR_IRQS]; #endif + unsigned long irqs_sum; unsigned int softirqs[NR_SOFTIRQS]; }; @@ -54,6 +55,7 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) { kstat_this_cpu.irqs[irq]++; + kstat_this_cpu.irqs_sum++; } static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) @@ -65,8 +67,9 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) -#define kstat_incr_irqs_this_cpu(irqno, DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]++) +#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\ + ((DESC)->kstat_irqs[smp_processor_id()]++);\ + kstat_this_cpu.irqs_sum++; } while (0) 
#endif @@ -94,6 +97,13 @@ static inline unsigned int kstat_irqs(unsigned int irq) return sum; } +/* + * Number of interrupts per cpu, since bootup + */ +static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) +{ + return kstat_cpu(cpu).irqs_sum; +} /* * Lock/unlock the current runqueue - to extract task statistics: -- cgit v1.2.3-59-g8ed1b From 478735e38887077ac77a9756121b6ce0cb956e2f Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 27 Oct 2010 15:34:15 -0700 Subject: /proc/stat: fix scalability of irq sum of all cpu In /proc/stat, the number of per-IRQ event is shown by making a sum each irq's events on all cpus. But we can make use of kstat_irqs(). kstat_irqs() do the same calculation, If !CONFIG_GENERIC_HARDIRQ, it's not a big cost. (Both of the number of cpus and irqs are small.) If a system is very big and CONFIG_GENERIC_HARDIRQ, it does for_each_irq() for_each_cpu() - look up a radix tree - read desc->irq_stat[cpu] This seems not efficient. This patch adds kstat_irqs() for CONFIG_GENRIC_HARDIRQ and change the calculation as for_each_irq() look up radix tree for_each_cpu() - read desc->irq_stat[cpu] This reduces cost. 
A test on (4096cpusp, 256 nodes, 4592 irqs) host (by Jack Steiner) %time cat /proc/stat > /dev/null Before Patch: 2.459 sec After Patch : .561 sec [akpm@linux-foundation.org: unexport kstat_irqs, coding-style tweaks] [akpm@linux-foundation.org: fix unused variable 'per_irq_sum'] Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Jack Steiner Acked-by: Jack Steiner Cc: Yinghai Lu Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 10 ++-------- include/linux/kernel_stat.h | 4 ++++ kernel/irq/irqdesc.c | 15 +++++++++++++++ 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index b80c620565bf..e15a19c93bae 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v) u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; - unsigned int per_irq_sum; user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; @@ -108,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "intr %llu", (unsigned long long)sum); /* sum again ? it could be updated? 
*/ - for_each_irq_nr(j) { - per_irq_sum = 0; - for_each_possible_cpu(i) - per_irq_sum += kstat_irqs_cpu(j, i); - - seq_printf(p, " %u", per_irq_sum); - } + for_each_irq_nr(j) + seq_printf(p, " %u", kstat_irqs(j)); seq_printf(p, "\nctxt %llu\n" diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 8b9b89085530..ad54c846911b 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -86,6 +86,7 @@ static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) /* * Number of interrupts per specific IRQ source, since bootup */ +#ifndef CONFIG_GENERIC_HARDIRQS static inline unsigned int kstat_irqs(unsigned int irq) { unsigned int sum = 0; @@ -96,6 +97,9 @@ static inline unsigned int kstat_irqs(unsigned int irq) return sum; } +#else +extern unsigned int kstat_irqs(unsigned int irq); +#endif /* * Number of interrupts per cpu, since bootup diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9d917ff72675..9988d03797f5 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) struct irq_desc *desc = irq_to_desc(irq); return desc ? desc->kstat_irqs[cpu] : 0; } + +#ifdef CONFIG_GENERIC_HARDIRQS +unsigned int kstat_irqs(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + int cpu; + int sum = 0; + + if (!desc) + return 0; + for_each_possible_cpu(cpu) + sum += desc->kstat_irqs[cpu]; + return sum; +} +#endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.2.3-59-g8ed1b From 98391cf4dcf893e9e74e1c14189851dbc9c5ad0d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 27 Oct 2010 15:34:16 -0700 Subject: exec: don't turn PF_KTHREAD off when a target command was not found Presently do_execve() turns PF_KTHREAD off before search_binary_handler(). This has a theoretical risk of PF_KTHREAD getting lost. We don't have to turn PF_KTHREAD off in the ENOEXEC case.
This patch moves this flag modification to after the finding of the executable file. This is only a theoretical issue because kthreads do not call do_execve() directly. But fixing would be better. Signed-off-by: KOSAKI Motohiro Acked-by: Roland McGrath Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index ca01d2d0a6d4..99d33a1371e9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1009,7 +1009,7 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ - current->flags &= ~PF_RANDOMIZE; + current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD); flush_thread(); current->personality &= ~bprm->per_clear; @@ -1412,7 +1412,6 @@ int do_execve(const char * filename, if (retval < 0) goto out; - current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); if (retval < 0) goto out; -- cgit v1.2.3-59-g8ed1b From 0be8557bcd34887d5a42c01c5659cab5ecf99f13 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 27 Oct 2010 15:34:46 -0700 Subject: fuse: use release_pages() Replace iterated page_cache_release() with release_pages(), which is faster and shorter. Needs release_pages() to be exported to modules.
Suggested-by: Andrew Morton Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 7 +------ mm/swap.c | 1 + 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b98664275f02..6e07696308dc 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1334,12 +1334,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - int i; - - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; - page_cache_release(page); - } + release_pages(req->pages, req->num_pages, 0); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/mm/swap.c b/mm/swap.c index 3ce7bc373a52..3f4854205b16 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -378,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold) pagevec_free(&pages_to_free); } +EXPORT_SYMBOL(release_pages); /* * The pages which we're about to release may be in the deferred lru-addition -- cgit v1.2.3-59-g8ed1b From 231f3d393f63f6e3b505afa179999bba491d0f08 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 27 Oct 2010 15:34:53 -0700 Subject: select: rename estimate_accuracy() to select_estimate_accuracy() Make it a subsystem-specific identifier because we wish to make it non-static in the next patch ("epoll: make epoll_wait() use the hrtimer range feature").
Cc: Shawn Bohrer Cc: Al Viro Cc: Davide Libenzi Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index 500a669f7790..5f023f911202 100644 --- a/fs/select.c +++ b/fs/select.c @@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -static long estimate_accuracy(struct timespec *tv) +static long select_estimate_accuracy(struct timespec *tv) { unsigned long ret; struct timespec now; @@ -417,7 +417,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) } if (end_time && !timed_out) - slack = estimate_accuracy(end_time); + slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { @@ -769,7 +769,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, } if (end_time && !timed_out) - slack = estimate_accuracy(end_time); + slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; -- cgit v1.2.3-59-g8ed1b From 95aac7b1cd224f568fb83937044cd303ff11b029 Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Wed, 27 Oct 2010 15:34:54 -0700 Subject: epoll: make epoll_wait() use the hrtimer range feature This makes epoll use hrtimers for the timeout value which prevents epoll_wait() from timing out up to a millisecond early. This mirrors the behavior of select() and poll().
Signed-off-by: Shawn Bohrer Cc: Al Viro Acked-by: Davide Libenzi Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 35 +++++++++++++++++++---------------- fs/select.c | 2 +- include/linux/poll.h | 2 ++ 3 files changed, 22 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 256bb7bb102a..8cf07242067d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -77,9 +77,6 @@ /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 -/* Maximum msec timeout value storeable in a long int */ -#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) - #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) #define EP_UNACTIVE_PTR ((void *) -1L) @@ -1117,18 +1114,22 @@ static int ep_send_events(struct eventpoll *ep, static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout) { - int res, eavail; + int res, eavail, timed_out = 0; unsigned long flags; - long jtimeout; + long slack; wait_queue_t wait; - - /* - * Calculate the timeout by checking for the "infinite" value (-1) - * and the overflow condition. The passed timeout is in milliseconds, - * that why (t * HZ) / 1000. - */ - jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ? - MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000; + struct timespec end_time; + ktime_t expires, *to = NULL; + + if (timeout > 0) { + ktime_get_ts(&end_time); + timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC); + slack = select_estimate_accuracy(&end_time); + to = &expires; + *to = timespec_to_ktime(end_time); + } else if (timeout == 0) { + timed_out = 1; + } retry: spin_lock_irqsave(&ep->lock, flags); @@ -1150,7 +1151,7 @@ retry: * to TASK_INTERRUPTIBLE before doing the checks. 
*/ set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&ep->rdllist) || !jtimeout) + if (!list_empty(&ep->rdllist) || timed_out) break; if (signal_pending(current)) { res = -EINTR; @@ -1158,7 +1159,9 @@ retry: } spin_unlock_irqrestore(&ep->lock, flags); - jtimeout = schedule_timeout(jtimeout); + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + timed_out = 1; + spin_lock_irqsave(&ep->lock, flags); } __remove_wait_queue(&ep->wq, &wait); @@ -1176,7 +1179,7 @@ retry: * more luck. */ if (!res && eavail && - !(res = ep_send_events(ep, events, maxevents)) && jtimeout) + !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto retry; return res; diff --git a/fs/select.c b/fs/select.c index 5f023f911202..b7b10aa30861 100644 --- a/fs/select.c +++ b/fs/select.c @@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -static long select_estimate_accuracy(struct timespec *tv) +long select_estimate_accuracy(struct timespec *tv) { unsigned long ret; struct timespec now; diff --git a/include/linux/poll.h b/include/linux/poll.h index 600cc1fde64d..56e76af78102 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -73,6 +73,8 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); +extern long select_estimate_accuracy(struct timespec *tv); + static inline int poll_schedule(struct poll_wqueues *pwq, int state) { -- cgit v1.2.3-59-g8ed1b