From 6047a007d0f6b7395cd158f3bdda34ab39a48821 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 14 Jan 2009 12:22:25 +0200 Subject: SLUB: Use ->objsize from struct kmem_cache_cpu in slab_free() There's no reason to use ->objsize from struct kmem_cache in slab_free() for the SLAB_DEBUG_OBJECTS case. All it does is generate extra cache pressure as we try very hard not to touch struct kmem_cache in the fast-path. Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6392ae5cc6b1..f21e25ad453b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1724,7 +1724,7 @@ static __always_inline void slab_free(struct kmem_cache *s, c = get_cpu_slab(s, smp_processor_id()); debug_check_no_locks_freed(object, c->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + debug_check_no_obj_freed(object, c->objsize); if (likely(page == c->page && c->node >= 0)) { object[c->offset] = c->freelist; c->freelist = object; -- cgit v1.2.3-59-g8ed1b From 6e9ed0cc4b963fde66ab47d9fb19147631e44555 Mon Sep 17 00:00:00 2001 From: Américo Wang Date: Mon, 19 Jan 2009 02:00:38 +0800 Subject: slob: clean up the code - Use NULL instead of plain 0; - Rename slob_page() to is_slob_page(); - Define slob_page() to convert void* to struct slob_page*; - Rename slob_new_page() to slob_new_pages(); - Define slob_free_pages() accordingly. Compile tests only. Signed-off-by: WANG Cong Signed-off-by: Matt Mackall Cc: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slob.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed8..c9cd31d27e69 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -126,9 +126,9 @@ static LIST_HEAD(free_slob_medium); static LIST_HEAD(free_slob_large); /* - * slob_page: True for all slob pages (false for bigblock pages) + * is_slob_page: True for all slob pages (false for bigblock pages) */ -static inline int slob_page(struct slob_page *sp) +static inline int is_slob_page(struct slob_page *sp) { return PageSlobPage((struct page *)sp); } @@ -143,6 +143,11 @@ static inline void clear_slob_page(struct slob_page *sp) __ClearPageSlobPage((struct page *)sp); } +static inline struct slob_page *slob_page(const void *addr) +{ + return (struct slob_page *)virt_to_page(addr); +} + /* * slob_page_free: true for pages on free_slob_pages list. */ @@ -230,7 +235,7 @@ static int slob_last(slob_t *s) return !((unsigned long)slob_next(s) & ~PAGE_MASK); } -static void *slob_new_page(gfp_t gfp, int order, int node) +static void *slob_new_pages(gfp_t gfp, int order, int node) { void *page; @@ -247,12 +252,17 @@ static void *slob_new_page(gfp_t gfp, int order, int node) return page_address(page); } +static void slob_free_pages(void *b, int order) +{ + free_pages((unsigned long)b, order); +} + /* * Allocate a slob block within a given slob_page sp. */ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) { - slob_t *prev, *cur, *aligned = 0; + slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { @@ -349,10 +359,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) /* Not enough space: must allocate a new page */ if (!b) { - b = slob_new_page(gfp & ~__GFP_ZERO, 0, node); + b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); if (!b) - return 0; - sp = (struct slob_page *)virt_to_page(b); + return NULL; + sp = slob_page(b); set_slob_page(sp); spin_lock_irqsave(&slob_lock, flags); @@ -384,7 +394,7 @@ static void slob_free(void *block, int size) return; BUG_ON(!size); - sp = (struct slob_page *)virt_to_page(block); + sp = slob_page(block); units = SLOB_UNITS(size); spin_lock_irqsave(&slob_lock, flags); @@ -476,7 +486,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) } else { void *ret; - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); if (ret) { struct page *page; page = virt_to_page(ret); @@ -494,8 +504,8 @@ void kfree(const void *block) if (unlikely(ZERO_OR_NULL_PTR(block))) return; - sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) { + sp = slob_page(block); + if (is_slob_page(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); @@ -513,8 +523,8 @@ size_t ksize(const void *block) if (unlikely(block == ZERO_SIZE_PTR)) return 0; - sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) { + sp = slob_page(block); + if (is_slob_page(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; @@ -572,7 +582,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) b = slob_alloc(c->size, flags, c->align, node); else - b = slob_new_page(flags, get_order(c->size), node); + b = slob_new_pages(flags, get_order(c->size), node); if (c->ctor) c->ctor(b); @@ -586,7 +596,7 @@ static void __kmem_cache_free(void *b, int size) if (size < PAGE_SIZE) slob_free(b, size); else - free_pages((unsigned long)b, get_order(size)); + slob_free_pages(b, get_order(size)); } static void kmem_rcu_free(struct rcu_head *head) -- cgit v1.2.3-59-g8ed1b From 6146f0d5e47ca4047ffded0fb79b6c25359b386c Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 4 Feb 2009 09:06:57 -0500 Subject: integrity: IMA hooks This patch replaces the generic integrity hooks, for which IMA registered itself, with IMA integrity hooks in the appropriate places directly in the fs directory. Signed-off-by: Mimi Zohar Acked-by: Serge Hallyn Signed-off-by: James Morris --- Documentation/kernel-parameters.txt | 1 + fs/exec.c | 10 +++++++++ fs/file_table.c | 2 ++ fs/inode.c | 24 ++++++++++++++------ fs/namei.c | 8 +++++++ include/linux/ima.h | 44 +++++++++++++++++++++++++++++++++++++ mm/mmap.c | 4 ++++ 7 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 include/linux/ima.h (limited to 'mm') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a2d8805c03d5..7c67b94d1823 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -44,6 +44,7 @@ parameter is applicable: FB The frame buffer device is enabled. HW Appropriate hardware is enabled. IA-64 IA-64 architecture is enabled. + IMA Integrity measurement architecture is enabled. IOSCHED More than one I/O scheduler is enabled. IP_PNP IP DHCP, BOOTP, or RARP is enabled. ISAPNP ISA PnP code is enabled. diff --git a/fs/exec.c b/fs/exec.c index 02d2e120542d..9c789a525cc4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -128,6 +129,9 @@ asmlinkage long sys_uselib(const char __user * library) goto exit; error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); + if (error) + goto exit; + error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; @@ -681,6 +685,9 @@ struct file *open_exec(const char *name) goto out_path_put; err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); + if (err) + goto out_path_put; + err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; @@ -1207,6 +1214,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) } #endif retval = security_bprm_check(bprm); + if (retval) + return retval; + retval = ima_bprm_check(bprm); if (retval) return retval; diff --git a/fs/file_table.c b/fs/file_table.c index 0fbcacc3ea75..55895ccc08c6 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -276,6 +277,7 @@ void __fput(struct file *file) if (file->f_op && file->f_op->release) file->f_op->release(inode, file); security_file_free(file); + ima_file_free(file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); fops_put(file->f_op); diff --git a/fs/inode.c b/fs/inode.c index 098a2443196f..ed22b14f2202 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -144,13 +145,13 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_cdev = NULL; inode->i_rdev = 0; inode->dirtied_when = 0; - if (security_inode_alloc(inode)) { - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, (inode)); - return NULL; - } + + if (security_inode_alloc(inode)) + goto out_free_inode; + + /* allocate and initialize an i_integrity */ + if (ima_inode_alloc(inode)) + goto out_free_security; spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); @@ -186,6 +187,15 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_mapping = mapping; return inode; + +out_free_security: + security_inode_free(inode); +out_free_inode: + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; } EXPORT_SYMBOL(inode_init_always); diff --git a/fs/namei.c b/fs/namei.c index af3783fff1de..734f2b5591bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -860,6 +861,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) err = exec_permission_lite(inode); if (err == -EAGAIN) err = vfs_permission(nd, MAY_EXEC); + if (!err) + err = ima_path_check(&nd->path, MAY_EXEC); if (err) break; @@ -1525,6 +1528,11 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) error = vfs_permission(nd, acc_mode); if (error) return error; + + error = ima_path_check(&nd->path, + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + if (error) + return error; /* * An append-only file must be opened in append mode for writing. */ diff --git a/include/linux/ima.h b/include/linux/ima.h new file mode 100644 index 000000000000..4ed1e4d962e2 --- /dev/null +++ b/include/linux/ima.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2008 IBM Corporation + * Author: Mimi Zohar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2 of the License. + */ + +#include + +#ifndef _LINUX_IMA_H +#define _LINUX_IMA_H + +static inline int ima_bprm_check(struct linux_binprm *bprm) +{ + return 0; +} + +static inline int ima_inode_alloc(struct inode *inode) +{ + return 0; +} + +static inline void ima_inode_free(struct inode *inode) +{ + return; +} + +static inline int ima_path_check(struct path *path, int mask) +{ + return 0; +} + +static inline void ima_file_free(struct file *file) +{ + return; +} + +static inline int ima_file_mmap(struct file *file, unsigned long prot) +{ + return 0; +} +#endif /* _LINUX_IMA_H */ diff --git a/mm/mmap.c b/mm/mmap.c index d4855a682ab6..c3647f3b0621 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1048,6 +1049,9 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } error = security_file_mmap(file, reqprot, prot, flags, addr, 0); + if (error) + return error; + error = ima_file_mmap(file, prot); if (error) return error; -- cgit v1.2.3-59-g8ed1b From 1df9f0a73178718969ae47d813b8e7aab2cf073c Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 4 Feb 2009 09:07:02 -0500 Subject: Integrity: IMA file free imbalance The number of calls to ima_path_check()/ima_file_free() should be balanced. An extra call to fput(), indicates the file could have been accessed without first being measured. Although f_count is incremented/decremented in places other than fget/fput, like fget_light/fput_light and get_file, the current task must already hold a file refcnt. The call to __fput() is delayed until the refcnt becomes 0, resulting in ima_file_free() flagging any changes. - add hook to increment opencount for IPC shared memory(SYSV), shmat files, and /dev/zero - moved NULL iint test in opencount_get() Signed-off-by: Mimi Zohar Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/ima.h | 6 ++++++ ipc/shm.c | 3 +++ mm/shmem.c | 2 ++ security/integrity/ima/ima.h | 2 ++ security/integrity/ima/ima_iint.c | 17 ++++++++++++++++ security/integrity/ima/ima_main.c | 42 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 72 insertions(+) (limited to 'mm') diff --git a/include/linux/ima.h b/include/linux/ima.h index dcc3664feee8..6db30a328d98 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -19,6 +19,7 @@ extern void ima_inode_free(struct inode *inode); extern int ima_path_check(struct path *path, int mask); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); +extern void ima_shm_check(struct file *file); #else static inline int ima_bprm_check(struct linux_binprm *bprm) @@ -50,5 +51,10 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) { return 0; } + +static inline void ima_shm_check(struct file *file) +{ + return; +} #endif /* CONFIG_IMA_H */ #endif /* _LINUX_IMA_H */ diff --git a/ipc/shm.c b/ipc/shm.c index 38a055758a9b..d39bd7637b1c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -381,6 +382,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; + ima_shm_check(file); id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (id < 0) { @@ -888,6 +890,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations); if (!file) goto out_free; + ima_shm_check(file); file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; diff --git a/mm/shmem.c b/mm/shmem.c index f1b0d4871f3a..dd5588f5d939 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -2600,6 +2601,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (IS_ERR(file)) return PTR_ERR(file); + ima_shm_check(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 42706b554921..e3c16a21a38e 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -97,6 +97,7 @@ static inline unsigned long ima_hash_key(u8 *digest) /* iint cache flags */ #define IMA_MEASURED 1 +#define IMA_IINT_DUMP_STACK 512 /* integrity data associated with an inode */ struct ima_iint_cache { @@ -106,6 +107,7 @@ struct ima_iint_cache { struct mutex mutex; /* protects: version, flags, digest */ long readcount; /* measured files readcount */ long writecount; /* measured files writecount */ + long opencount; /* opens reference count */ struct kref refcount; /* ima_iint_cache reference count */ struct rcu_head rcu; }; diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c index 750db3c993a7..1f035e8d29c7 100644 --- a/security/integrity/ima/ima_iint.c +++ b/security/integrity/ima/ima_iint.c @@ -126,6 +126,7 @@ struct ima_iint_cache *ima_iint_find_insert_get(struct inode *inode) return iint; } +EXPORT_SYMBOL_GPL(ima_iint_find_insert_get); /* iint_free - called when the iint refcount goes to zero */ void iint_free(struct kref *kref) @@ -134,6 +135,21 @@ void iint_free(struct kref *kref) refcount); iint->version = 0; iint->flags = 0UL; + if (iint->readcount != 0) { + printk(KERN_INFO "%s: readcount: %ld\n", __FUNCTION__, + iint->readcount); + iint->readcount = 0; + } + if (iint->writecount != 0) { + printk(KERN_INFO "%s: writecount: %ld\n", __FUNCTION__, + iint->writecount); + iint->writecount = 0; + } + if (iint->opencount != 0) { + printk(KERN_INFO "%s: opencount: %ld\n", __FUNCTION__, + iint->opencount); + iint->opencount = 0; + } kref_set(&iint->refcount, 1); kmem_cache_free(iint_cache, iint); } @@ -174,6 +190,7 @@ static void init_once(void *foo) mutex_init(&iint->mutex); iint->readcount = 0; iint->writecount = 0; + iint->opencount = 0; kref_set(&iint->refcount, 1); } diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 871e356e8d6c..f4e7266f5aee 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -66,6 +66,19 @@ void ima_file_free(struct file *file) return; mutex_lock(&iint->mutex); + if (iint->opencount <= 0) { + printk(KERN_INFO + "%s: %s open/free imbalance (r:%ld w:%ld o:%ld f:%ld)\n", + __FUNCTION__, file->f_dentry->d_name.name, + iint->readcount, iint->writecount, + iint->opencount, atomic_long_read(&file->f_count)); + if (!(iint->flags & IMA_IINT_DUMP_STACK)) { + dump_stack(); + iint->flags |= IMA_IINT_DUMP_STACK; + } + } + iint->opencount--; + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) iint->readcount--; @@ -119,6 +132,7 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, pr_info("%s dentry_open failed\n", filename); return rc; } + iint->opencount++; iint->readcount++; rc = ima_collect_measurement(iint, file); @@ -159,6 +173,7 @@ int ima_path_check(struct path *path, int mask) return 0; mutex_lock(&iint->mutex); + iint->opencount++; if ((mask & MAY_WRITE) || (mask == 0)) iint->writecount++; else if (mask & (MAY_READ | MAY_EXEC)) @@ -219,6 +234,21 @@ out: return rc; } +static void opencount_get(struct file *file) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ima_iint_cache *iint; + + if (!ima_initialized || !S_ISREG(inode->i_mode)) + return; + iint = ima_iint_find_insert_get(inode); + if (!iint) + return; + mutex_lock(&iint->mutex); + iint->opencount++; + mutex_unlock(&iint->mutex); +} + /** * ima_file_mmap - based on policy, collect/store measurement. * @file: pointer to the file to be measured (May be NULL) @@ -242,6 +272,18 @@ int ima_file_mmap(struct file *file, unsigned long prot) return 0; } +/* + * ima_shm_check - IPC shm and shmat create/fput a file + * + * Maintain the opencount for these files to prevent unnecessary + * imbalance messages. + */ +void ima_shm_check(struct file *file) +{ + opencount_get(file); + return; +} + /** * ima_bprm_check - based on policy, collect/store measurement. * @bprm: contains the linux_binprm structure -- cgit v1.2.3-59-g8ed1b From ed850a52af971528b048812c4215cef298af0d3b Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 10 Feb 2009 23:01:19 -0500 Subject: integrity: shmem zero fix Based on comments from Mike Frysinger and Randy Dunlap: (http://lkml.org/lkml/2009/2/9/262) - moved ima.h include before CONFIG_SHMEM test to fix compiler error on Blackfin: mm/shmem.c: In function 'shmem_zero_setup': mm/shmem.c:2670: error: implicit declaration of function 'ima_shm_check' - added 'struct linux_binprm' in ima.h to fix compiler warning on Blackfin: In file included from mm/shmem.c:32: include/linux/ima.h:25: warning: 'struct linux_binprm' declared inside parameter list include/linux/ima.h:25: warning: its scope is only this definition or declaration, which is probably not what you want - moved fs.h include within _LINUX_IMA_H definition Signed-off-by: Mimi Zohar Signed-off-by: Mike Frysinger Signed-off-by: James Morris --- include/linux/ima.h | 5 +++-- mm/shmem.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/ima.h b/include/linux/ima.h index 6db30a328d98..0e2aa45cb0ce 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -7,11 +7,12 @@ * the Free Software Foundation, version 2 of the License. */ -#include - #ifndef _LINUX_IMA_H #define _LINUX_IMA_H +#include +struct linux_binprm; + #ifdef CONFIG_IMA extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_inode_alloc(struct inode *inode); diff --git a/mm/shmem.c b/mm/shmem.c index 75199888a6bd..8135fac294ee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,6 +28,7 @@ #include #include #include +#include static struct vfsmount *shm_mnt; @@ -59,7 +60,6 @@ static struct vfsmount *shm_mnt; #include #include #include -#include #include #include -- cgit v1.2.3-59-g8ed1b From ffadd4d0feb5376c82dc3a4104731b7ce2794edc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 17 Feb 2009 12:05:07 -0500 Subject: SLUB: Introduce and use SLUB_MAX_SIZE and SLUB_PAGE_SHIFT constants As a preparational patch to bump up page allocator pass-through threshold, introduce two new constants SLUB_MAX_SIZE and SLUB_PAGE_SHIFT and convert mm/slub.c to use them. Reported-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 19 ++++++++++++++++--- mm/slub.c | 16 ++++++++-------- 2 files changed, 24 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..986e09dcfd8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -120,11 +120,24 @@ struct kmem_cache { #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) +/* + * Maximum kmalloc object size handled by SLUB. Larger object allocations + * are passed through to the page allocator. The page allocator "fastpath" + * is relatively slow so we need this value sufficiently high so that + * performance critical objects are allocated through the SLUB fastpath. + * + * This should be dropped to PAGE_SIZE / 2 once the page allocator + * "fastpath" becomes competitive with the slab allocator fastpaths. + */ +#define SLUB_MAX_SIZE (PAGE_SIZE) + +#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 1) + /* * We keep the general caches in an array of slab caches that are used for * 2^x bytes of allocations. */ -extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; +extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT]; /* * Sorry that the following has to be that ugly but some versions of GCC @@ -212,7 +225,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { - if (size > PAGE_SIZE) + if (size > SLUB_MAX_SIZE) return kmalloc_large(size, flags); if (!(flags & SLUB_DMA)) { @@ -234,7 +247,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && - size <= PAGE_SIZE && !(flags & SLUB_DMA)) { + size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); if (!s) diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..5a5e7f5bf799 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2475,7 +2475,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2537,7 +2537,7 @@ panic: } #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2658,7 +2658,7 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2686,7 +2686,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, flags, node); s = get_slab(size, flags); @@ -2985,7 +2985,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -3022,7 +3022,7 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); @@ -3222,7 +3222,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3238,7 +3238,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); -- cgit v1.2.3-59-g8ed1b From e8120ff1ffc51102ead1f4c98a3fd5d26fefc722 Mon Sep 17 00:00:00 2001 From: Zhang Yanmin Date: Thu, 12 Feb 2009 18:00:17 +0200 Subject: SLUB: Fix default slab order for big object sizes The default order of kmalloc-8192 on 2*4 stoakley is an issue of calculate_order. slab_size order name ------------------------------------------------- 4096 3 sgpool-128 8192 2 kmalloc-8192 16384 3 kmalloc-16384 kmalloc-8192's default order is smaller than sgpool-128's. On 4*4 tigerton machine, a similiar issue appears on another kmem_cache. Function calculate_order uses 'min_objects /= 2;' to shrink. Plus size calculation/checking in slab_order, sometimes above issue appear. Below patch against 2.6.29-rc2 fixes it. I checked the default orders of all kmem_cache and they don't become smaller than before. So the patch wouldn't hurt performance. Signed-off-by Zhang Yanmin Signed-off-by: Pekka Enberg --- mm/slub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5a5e7f5bf799..c01a7a3001d2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1844,6 +1844,7 @@ static inline int calculate_order(int size) int order; int min_objects; int fraction; + int max_objects; /* * Attempt to find best configuration for a slab. This @@ -1856,6 +1857,9 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); + max_objects = (PAGE_SIZE << slub_max_order)/size; + min_objects = min(min_objects, max_objects); + while (min_objects > 1) { fraction = 16; while (fraction >= 4) { @@ -1865,7 +1869,7 @@ static inline int calculate_order(int size) return order; fraction /= 2; } - min_objects /= 2; + min_objects --; } /* -- cgit v1.2.3-59-g8ed1b From 3b89d7d881a1dbb4da158f7eb5d6b3ceefc72810 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 22 Feb 2009 17:40:07 -0800 Subject: slub: move min_partial to struct kmem_cache Although it allows for better cacheline use, it is unnecessary to save a copy of the cache's min_partial value in each kmem_cache_node. Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 2 +- mm/slub.c | 29 ++++++++++++++++------------- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..f20a89e4d52c 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -46,7 +46,6 @@ struct kmem_cache_cpu { struct kmem_cache_node { spinlock_t list_lock; /* Protect partial list and nr_partial */ unsigned long nr_partial; - unsigned long min_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG atomic_long_t nr_slabs; @@ -89,6 +88,7 @@ struct kmem_cache { void (*ctor)(void *); int inuse; /* Offset to metadata */ int align; /* Alignment */ + unsigned long min_partial; const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SLUB_DEBUG diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..4fff385b17a3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1335,7 +1335,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) n = get_node(s, zone_to_nid(zone)); if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > n->min_partial) { + n->nr_partial > s->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1387,7 +1387,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < n->min_partial) { + if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs @@ -1928,17 +1928,6 @@ static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; - - /* - * The larger the object size is, the more pages we want on the partial - * list to avoid pounding the page allocator excessively. - */ - n->min_partial = ilog2(s->size); - if (n->min_partial < MIN_PARTIAL) - n->min_partial = MIN_PARTIAL; - else if (n->min_partial > MAX_PARTIAL) - n->min_partial = MAX_PARTIAL; - spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG @@ -2181,6 +2170,15 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } #endif +static void calculate_min_partial(struct kmem_cache *s, unsigned long min) +{ + if (min < MIN_PARTIAL) + min = MIN_PARTIAL; + else if (min > MAX_PARTIAL) + min = MAX_PARTIAL; + s->min_partial = min; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -2319,6 +2317,11 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (!calculate_sizes(s, -1)) goto error; + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. + */ + calculate_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; -- cgit v1.2.3-59-g8ed1b From 73d342b169db700b5a6ad626fe4b86911efec8db Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 22 Feb 2009 17:40:09 -0800 Subject: slub: add min_partial sysfs tunable Now that a cache's min_partial has been moved to struct kmem_cache, it's possible to easily tune it from userspace by adding a sysfs attribute. It may not be desirable to keep a large number of partial slabs around if a cache is used infrequently and memory, especially when constrained by a cgroup, is scarce. It's better to allow userspace to set the minimum policy per cache instead of relying explicitly on kmem_cache_shrink(). The memory savings from simply moving min_partial from struct kmem_cache_node to struct kmem_cache is obviously not significant (unless maybe you're from SGI or something), at the largest it's # allocated caches * (MAX_NUMNODES - 1) * sizeof(unsigned long) The true savings occurs when userspace reduces the number of partial slabs that would otherwise be wasted, especially on machines with a large number of nodes (ia64 with CONFIG_NODES_SHIFT at 10 for default?). As well as the kernel estimates ideal values for n->min_partial and ensures it's within a sane range, userspace has no other input other than writing to /sys/kernel/slab/cache/shrink. There simply isn't any better heuristic to add when calculating the partial values for a better estimate that works for all possible caches. And since it's currently a static value, the user really has no way of reclaiming that wasted space, which can be significant when constrained by a cgroup (either cpusets or, later, memory controller slab limits) without shrinking it entirely. This also allows the user to specify that increased fragmentation and more partial slabs are actually desired to avoid the cost of allocating new slabs at runtime for specific caches. There's also no reason why this should be a per-struct kmem_cache_node value in the first place. You could argue that a machine would have such node size asymmetries that it should be specified on a per-node basis, but we know nobody is doing that right now since it's a purely static value at the moment and there's no convenient way to tune that via slub's sysfs interface. Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 4fff385b17a3..a3e2d552ff46 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3838,6 +3838,26 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR(order); +static ssize_t min_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%lu\n", s->min_partial); +} + +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long min; + int err; + + err = strict_strtoul(buf, 10, &min); + if (err) + return err; + + calculate_min_partial(s, min); + return length; +} +SLAB_ATTR(min_partial); + static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (s->ctor) { @@ -4153,6 +4173,7 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &min_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &total_objects_attr.attr, -- cgit v1.2.3-59-g8ed1b From c0bdb232b23b51c23e551041510ad6bea5ce5a92 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 25 Feb 2009 09:16:35 +0200 Subject: slub: rename calculate_min_partial() to set_min_partial() As suggested by Christoph Lameter, rename calculate_min_partial() to set_min_partial() as the function doesn't really do any calculations. Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a3e2d552ff46..77268d18e78d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2170,7 +2170,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } #endif -static void calculate_min_partial(struct kmem_cache *s, unsigned long min) +static void set_min_partial(struct kmem_cache *s, unsigned long min) { if (min < MIN_PARTIAL) min = MIN_PARTIAL; @@ -2321,7 +2321,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ - calculate_min_partial(s, ilog2(s->size)); + set_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -3853,7 +3853,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, if (err) return err; - calculate_min_partial(s, min); + set_min_partial(s, min); return length; } SLAB_ATTR(min_partial); -- cgit v1.2.3-59-g8ed1b From 1a00df4a2cc001dd9f45890e690548c24b2fa2d9 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 7 Mar 2009 00:36:21 +0900 Subject: slub: use get_track() Use get_track() in set_track() Signed-off-by: Akinobu Mita Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Pekka Enberg --- mm/slub.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f21e25ad453b..e150b5c0424f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -374,14 +374,8 @@ static struct track *get_track(struct kmem_cache *s, void *object, static void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { - struct track *p; - - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + struct track *p = get_track(s, object, alloc); - p += alloc; if (addr) { p->addr = addr; p->cpu = smp_processor_id(); -- cgit v1.2.3-59-g8ed1b From 6fb8f424393025674fde7869b59f485d1e352182 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 16 Mar 2009 21:00:28 +1100 Subject: slob: fix lockup in slob_free() Don't hold SLOB lock when freeing the page. Reduces lock hold width. See the following thread for discussion of the bug: http://marc.info/?l=linux-kernel&m=123709983214143&w=2 Reported-by: Ingo Molnar Acked-by: Matt Mackall Signed-off-by: Nick Piggin Signed-off-by: Pekka Enberg --- mm/slob.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed8..f901653707a4 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -393,10 +393,11 @@ static void slob_free(void *block, int size) /* Go directly to page allocator. Do not pass slob allocator */ if (slob_page_free(sp)) clear_slob_page_free(sp); + spin_unlock_irqrestore(&slob_lock, flags); clear_slob_page(sp); free_slob_page(sp); free_page((unsigned long)b); - goto out; + return; } if (!slob_page_free(sp)) { -- cgit v1.2.3-59-g8ed1b From 26160158d3d3df548f4ee046cc6147fe048cfa9c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Mar 2009 09:35:06 +0100 Subject: Move the default_backing_dev_info out of readahead.c and into backing-dev.c It really makes no sense to have it in readahead.c, so move it where it belongs. Signed-off-by: Jens Axboe --- mm/backing-dev.c | 26 +++++++++++++++++++++++++- mm/readahead.c | 25 ------------------------- 2 files changed, 25 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8e8587444132..be68c956a660 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -2,11 +2,24 @@ #include #include #include +#include #include #include #include #include +void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ +} +EXPORT_SYMBOL(default_unplug_io_fn); + +struct backing_dev_info default_backing_dev_info = { + .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, + .state = 0, + .capabilities = BDI_CAP_MAP_COPY, + .unplug_io_fn = default_unplug_io_fn, +}; +EXPORT_SYMBOL_GPL(default_backing_dev_info); static struct class *bdi_class; @@ -166,9 +179,20 @@ static __init int bdi_class_init(void) bdi_debug_init(); return 0; } - postcore_initcall(bdi_class_init); +static int __init default_bdi_init(void) +{ + int err; + + err = bdi_init(&default_backing_dev_info); + if (!err) + bdi_register(&default_backing_dev_info, NULL, "default"); + + return err; +} +subsys_initcall(default_bdi_init); + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { diff --git a/mm/readahead.c b/mm/readahead.c index bec83c15a78f..9ce303d4b810 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,19 +17,6 @@ #include #include -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} -EXPORT_SYMBOL(default_unplug_io_fn); - -struct backing_dev_info default_backing_dev_info = { - .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, - .unplug_io_fn = default_unplug_io_fn, -}; -EXPORT_SYMBOL_GPL(default_backing_dev_info); - /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -233,18 +220,6 @@ unsigned long max_sane_readahead(unsigned long nr) + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); } -static int __init readahead_init(void) -{ - int err; - - err = bdi_init(&default_backing_dev_info); - if (!err) - bdi_register(&default_backing_dev_info, NULL, "default"); - - return err; -} -subsys_initcall(readahead_init); - /* * Submit IO for the read-ahead request in file_ra_state. */ -- cgit v1.2.3-59-g8ed1b From 1b5e62b42b55c509eea04c3c0f25e42c8b35b564 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 23 Mar 2009 08:57:38 +0800 Subject: writeback: double the dirty thresholds Enlarge default dirty ratios from 5/10 to 10/20. This fixes [Bug #12809] iozone regression with 2.6.29-rc6. The iozone benchmarks are performed on a 1200M file, with 8GB ram. iozone -i 0 -i 1 -i 2 -i 3 -i 4 -r 4k -s 64k -s 512m -s 1200m -b tmp.xls iozone -B -r 4k -s 64k -s 512m -s 1200m -b tmp.xls The performance regression is triggered by commit 1cf6e7d83bf3(mm: task dirty accounting fix), which makes more correct/thorough dirty accounting. The default 5/10 dirty ratios were picked (a) with the old dirty logic and (b) largely at random and (c) designed to be aggressive. In particular, that (a) means that having fixed some of the dirty accounting, maybe the real bug is now that it was always too aggressive, just hidden by an accounting issue. The enlarged 10/20 dirty ratios are just about enough to fix the regression. [ We will have to look at how this affects the old fsync() latency issue, but that probably will need independent work. - Linus ] Cc: Nick Piggin Cc: Peter Zijlstra Reported-by: "Lin, Ming M" Tested-by: "Lin, Ming M" Signed-off-by: Wu Fengguang Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 74dc57c74349..40ca7cdb653e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -66,7 +66,7 @@ static inline long sync_writeback_pages(void) /* * Start background writeback (via pdflush) at this percentage */ -int dirty_background_ratio = 5; +int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of @@ -83,7 +83,7 @@ int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 10; +int vm_dirty_ratio = 20; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of -- cgit v1.2.3-59-g8ed1b