From 53e872681fed6a43047e71bf927f77d06f467988 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Dec 2012 13:29:52 -0500 Subject: ext4: fix deadlock in journal_unmap_buffer() We cannot wait for transaction commit in journal_unmap_buffer() because we hold page lock which ranks below transaction start. We solve the issue by bailing out of journal_unmap_buffer() and jbd2_journal_invalidatepage() with -EBUSY. Caller is then responsible for waiting for transaction commit to finish and try invalidation again. Since the issue can happen only for page stradding i_size, it is simple enough to manually call jbd2_journal_invalidatepage() for such page from ext4_setattr(), check the return value and wait if necessary. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1be23d9fdacb..e30b66346942 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1098,7 +1098,7 @@ void jbd2_journal_set_triggers(struct buffer_head *, extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); extern void journal_sync_buffer (struct buffer_head *); -extern void jbd2_journal_invalidatepage(journal_t *, +extern int jbd2_journal_invalidatepage(journal_t *, struct page *, unsigned long); extern int jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); extern int jbd2_journal_stop(handle_t *); -- cgit v1.2.3-59-g8ed1b From 08b60f8438879a84246d7debded31c9cb7aea6e4 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Mon, 24 Dec 2012 11:14:58 -0700 Subject: namei.h: include errno.h This solves: In file included from fs/ext3/symlink.c:20:0: include/linux/namei.h: In function 'retry_estale': include/linux/namei.h:114:19: error: 'ESTALE' undeclared (first use in this function) Signed-off-by: Stephen Warren Signed-off-by: Al Viro --- include/linux/namei.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index e998c030061d..5a5ff57ceed4 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -2,6 +2,7 @@ #define _LINUX_NAMEI_H #include +#include #include #include -- cgit v1.2.3-59-g8ed1b From 812089e01b9f65f90fc8fc670d8cce72a0e01fbb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 1 Dec 2012 12:37:20 -0800 Subject: PCI: Reduce Ricoh 0xe822 SD card reader base clock frequency to 50MHz Otherwise it fails like this on cards like the Transcend 16GB SDHC card: mmc0: new SDHC card at address b368 mmcblk0: mmc0:b368 SDC 15.0 GiB mmcblk0: error -110 sending status command, retrying mmcblk0: error -84 transferring data, sector 0, nr 8, cmd response 0x900, card status 0xb0 Tested on my Lenovo x200 laptop. [bhelgaas: changelog] Signed-off-by: Andy Lutomirski Signed-off-by: Bjorn Helgaas Acked-by: Chris Ball CC: Manoj Iyer CC: stable@vger.kernel.org --- drivers/pci/quirks.c | 7 +++++-- include/linux/pci_ids.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 8f7a6344e79e..0369fb6fc1da 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2725,7 +2725,7 @@ static void ricoh_mmc_fixup_r5c832(struct pci_dev *dev) if (PCI_FUNC(dev->devfn)) return; /* - * RICOH 0xe823 SD/MMC card reader fails to recognize + * RICOH 0xe822 and 0xe823 SD/MMC card readers fail to recognize * certain types of SD/MMC cards. Lowering the SD base * clock frequency from 200Mhz to 50Mhz fixes this issue. * @@ -2736,7 +2736,8 @@ static void ricoh_mmc_fixup_r5c832(struct pci_dev *dev) * 0xf9 - Key register for 0x150 * 0xfc - key register for 0xe1 */ - if (dev->device == PCI_DEVICE_ID_RICOH_R5CE823) { + if (dev->device == PCI_DEVICE_ID_RICOH_R5CE822 || + dev->device == PCI_DEVICE_ID_RICOH_R5CE823) { pci_write_config_byte(dev, 0xf9, 0xfc); pci_write_config_byte(dev, 0x150, 0x10); pci_write_config_byte(dev, 0xf9, 0x00); @@ -2763,6 +2764,8 @@ static void ricoh_mmc_fixup_r5c832(struct pci_dev *dev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5C832, ricoh_mmc_fixup_r5c832); DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5C832, ricoh_mmc_fixup_r5c832); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5CE822, ricoh_mmc_fixup_r5c832); +DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5CE822, ricoh_mmc_fixup_r5c832); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5CE823, ricoh_mmc_fixup_r5c832); DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5CE823, ricoh_mmc_fixup_r5c832); #endif /*CONFIG_MMC_RICOH_MMC*/ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 0f8447376ddb..0eb65796bcb9 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1568,6 +1568,7 @@ #define PCI_DEVICE_ID_RICOH_RL5C476 0x0476 #define PCI_DEVICE_ID_RICOH_RL5C478 0x0478 #define PCI_DEVICE_ID_RICOH_R5C822 0x0822 +#define PCI_DEVICE_ID_RICOH_R5CE822 0xe822 #define PCI_DEVICE_ID_RICOH_R5CE823 0xe823 #define PCI_DEVICE_ID_RICOH_R5C832 0x0832 #define PCI_DEVICE_ID_RICOH_R5C843 0x0843 -- cgit v1.2.3-59-g8ed1b From a7a88b23737095e6c18a20c5d4eef9e25ec5b829 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Jan 2013 02:04:23 -0800 Subject: mempolicy: remove arg from mpol_parse_str, mpol_to_str Remove the unused argument (formerly no_context) from mpol_parse_str() and from mpol_to_str(). Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- include/linux/mempolicy.h | 11 ++++------- mm/mempolicy.c | 6 ++---- mm/shmem.c | 4 ++-- 4 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 448455b7fd91..ca5ce7f9f800 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1278,7 +1278,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.mm = mm; pol = get_vma_policy(task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 9adc270de7ef..92bc9988a180 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -165,11 +165,10 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, #ifdef CONFIG_TMPFS -extern int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context); +extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif -extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, - int no_context); +extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ static inline int vma_migratable(struct vm_area_struct *vma) @@ -296,15 +295,13 @@ static inline void check_highest_zone(int k) } #ifdef CONFIG_TMPFS -static inline int mpol_parse_str(char *str, struct mempolicy **mpol, - int no_context) +static inline int mpol_parse_str(char *str, struct mempolicy **mpol) { return 1; /* error */ } #endif -static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, - int no_context) +static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { return 0; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02c914cca53d..1cb200af3828 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2612,14 +2612,13 @@ static const char * const policy_modes[] = * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. - * @unused: redundant argument, to be removed later. * * Format of input: * [=][:] * * On success, returns 0, else 1 */ -int mpol_parse_str(char *str, struct mempolicy **mpol, int unused) +int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode; @@ -2747,13 +2746,12 @@ out: * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted - * @unused: redundant argument, to be removed later. * * Convert a mempolicy into a string. * Returns the number of characters in buffer (if positive) * or an error (negative) */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int unused) +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; int l; diff --git a/mm/shmem.c b/mm/shmem.c index 5c90d84c2b02..5dd56f6efdbd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ - mpol_to_str(buffer, sizeof(buffer), mpol, 1); + mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } @@ -2463,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (!gid_valid(sbinfo->gid)) goto bad_val; } else if (!strcmp(this_char,"mpol")) { - if (mpol_parse_str(value, &sbinfo->mpol, 1)) + if (mpol_parse_str(value, &sbinfo->mpol)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", -- cgit v1.2.3-59-g8ed1b From 42288fe366c4f1ce7522bc9f27d0bc2a81c55264 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 21 Dec 2012 23:10:25 +0000 Subject: mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro Tested-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 +- mm/mempolicy.c | 68 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 92bc9988a180..0d7df39a5885 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -123,7 +123,7 @@ struct sp_node { struct shared_policy { struct rb_root root; - struct mutex mutex; + spinlock_t lock; }; void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1cb200af3828..e2df1c1fb41f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2132,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) */ /* lookup first element intersecting start-end */ -/* Caller holds sp->mutex */ +/* Caller holds sp->lock */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2196,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - mutex_lock(&sp->mutex); + spin_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - mutex_unlock(&sp->mutex); + spin_unlock(&sp->lock); return pol; } @@ -2328,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) sp_free(n); } +static void sp_node_init(struct sp_node *node, unsigned long start, + unsigned long end, struct mempolicy *pol) +{ + node->start = start; + node->end = end; + node->policy = pol; +} + static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { @@ -2344,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, return NULL; } newpol->flags |= MPOL_F_SHARED; - - n->start = start; - n->end = end; - n->policy = newpol; + sp_node_init(n, start, end, newpol); return n; } @@ -2357,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { struct sp_node *n; + struct sp_node *n_new = NULL; + struct mempolicy *mpol_new = NULL; int ret = 0; - mutex_lock(&sp->mutex); +restart: + spin_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2372,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, } else { /* Old policy spanning whole new range. */ if (n->end > end) { - struct sp_node *new2; - new2 = sp_alloc(end, n->end, n->policy); - if (!new2) { - ret = -ENOMEM; - goto out; - } + if (!n_new) + goto alloc_new; + + *mpol_new = *n->policy; + atomic_set(&mpol_new->refcnt, 1); + sp_node_init(n_new, n->end, end, mpol_new); + sp_insert(sp, n_new); n->end = start; - sp_insert(sp, new2); + n_new = NULL; + mpol_new = NULL; break; } else n->end = start; @@ -2390,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, } if (new) sp_insert(sp, new); -out: - mutex_unlock(&sp->mutex); + spin_unlock(&sp->lock); + ret = 0; + +err_out: + if (mpol_new) + mpol_put(mpol_new); + if (n_new) + kmem_cache_free(sn_cache, n_new); + return ret; + +alloc_new: + spin_unlock(&sp->lock); + ret = -ENOMEM; + n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); + if (!n_new) + goto err_out; + mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!mpol_new) + goto err_out; + goto restart; } /** @@ -2410,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - mutex_init(&sp->mutex); + spin_lock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; @@ -2476,14 +2504,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - mutex_lock(&p->mutex); + spin_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } - mutex_unlock(&p->mutex); + spin_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3-59-g8ed1b From 3d33fcc11bdd11b6949cf5c406726a094395dc4f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 2 Jan 2013 15:12:55 +0000 Subject: UAPI: Remove empty Kbuild files Empty files can get deleted by the patch program, so remove empty Kbuild files and their links from the parent Kbuilds. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/Kbuild | 3 --- include/linux/Kbuild | 5 ----- include/linux/hdlc/Kbuild | 0 include/linux/hsi/Kbuild | 0 include/linux/raid/Kbuild | 0 include/linux/usb/Kbuild | 0 include/rdma/Kbuild | 0 include/sound/Kbuild | 0 8 files changed, 8 deletions(-) delete mode 100644 include/linux/Kbuild delete mode 100644 include/linux/hdlc/Kbuild delete mode 100644 include/linux/hsi/Kbuild delete mode 100644 include/linux/raid/Kbuild delete mode 100644 include/linux/usb/Kbuild delete mode 100644 include/rdma/Kbuild delete mode 100644 include/sound/Kbuild (limited to 'include/linux') diff --git a/include/Kbuild b/include/Kbuild index 83256b64166a..1dfd33e8d43b 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -1,8 +1,5 @@ # Top-level Makefile calls into asm-$(ARCH) # List only non-arch directories below -header-y += linux/ -header-y += sound/ -header-y += rdma/ header-y += video/ header-y += scsi/ diff --git a/include/linux/Kbuild b/include/linux/Kbuild deleted file mode 100644 index 7fe2dae251e5..000000000000 --- a/include/linux/Kbuild +++ /dev/null @@ -1,5 +0,0 @@ -header-y += dvb/ -header-y += hdlc/ -header-y += hsi/ -header-y += raid/ -header-y += usb/ diff --git a/include/linux/hdlc/Kbuild b/include/linux/hdlc/Kbuild deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/include/linux/hsi/Kbuild b/include/linux/hsi/Kbuild deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/include/linux/raid/Kbuild b/include/linux/raid/Kbuild deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/include/linux/usb/Kbuild b/include/linux/usb/Kbuild deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/include/rdma/Kbuild b/include/rdma/Kbuild deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/include/sound/Kbuild b/include/sound/Kbuild deleted file mode 100644 index e69de29bb2d1..000000000000 -- cgit v1.2.3-59-g8ed1b From f568f6ca811fe681ecfd11c4ce78b6aa488020c0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 21 Dec 2012 15:02:05 -0800 Subject: pstore: remove __dev* attributes. CONFIG_HOTPLUG is going away as an option. As a result, the __dev* markings need to be removed. This change removes the use of __devinit from the pstore filesystem. Based on patches originally written by Bill Pemberton, but redone by me in order to handle some of the coding style issues better, by hand. Cc: Bill Pemberton Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck Signed-off-by: Greg Kroah-Hartman --- fs/pstore/ram.c | 14 ++++++-------- fs/pstore/ram_core.c | 9 ++++----- include/linux/pstore_ram.h | 5 ++--- 3 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index f883e7e74305..7003e5266f25 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -291,9 +291,8 @@ static void ramoops_free_przs(struct ramoops_context *cxt) kfree(cxt->przs); } -static int __devinit ramoops_init_przs(struct device *dev, - struct ramoops_context *cxt, - phys_addr_t *paddr, size_t dump_mem_sz) +static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, + phys_addr_t *paddr, size_t dump_mem_sz) { int err = -ENOMEM; int i; @@ -336,10 +335,9 @@ fail_prz: return err; } -static int __devinit ramoops_init_prz(struct device *dev, - struct ramoops_context *cxt, - struct persistent_ram_zone **prz, - phys_addr_t *paddr, size_t sz, u32 sig) +static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, + struct persistent_ram_zone **prz, + phys_addr_t *paddr, size_t sz, u32 sig) { if (!sz) return 0; @@ -367,7 +365,7 @@ static int __devinit ramoops_init_prz(struct device *dev, return 0; } -static int __devinit ramoops_probe(struct platform_device *pdev) +static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ramoops_platform_data *pdata = pdev->dev.platform_data; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index eecd2a8a84dd..0306303be372 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -390,8 +390,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, return 0; } -static int __devinit persistent_ram_post_init(struct persistent_ram_zone *prz, - u32 sig, int ecc_size) +static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, + int ecc_size) { int ret; @@ -443,9 +443,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz) kfree(prz); } -struct persistent_ram_zone * __devinit persistent_ram_new(phys_addr_t start, - size_t size, u32 sig, - int ecc_size) +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, + u32 sig, int ecc_size) { struct persistent_ram_zone *prz; int ret = -ENOMEM; diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 098d2a838296..cb6ab5feab67 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -46,9 +46,8 @@ struct persistent_ram_zone { size_t old_log_size; }; -struct persistent_ram_zone * __devinit persistent_ram_new(phys_addr_t start, - size_t size, u32 sig, - int ecc_size); +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, + u32 sig, int ecc_size); void persistent_ram_free(struct persistent_ram_zone *prz); void persistent_ram_zap(struct persistent_ram_zone *prz); -- cgit v1.2.3-59-g8ed1b From 0f58a01ddd5e8177255705ba15e64c3b74d67993 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 21 Dec 2012 15:12:59 -0800 Subject: Drivers: bcma: remove __dev* attributes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CONFIG_HOTPLUG is going away as an option. As a result, the __dev* markings need to be removed. This change removes the use of __devinit, __devexit_p, and __devexit from these drivers. Based on patches originally written by Bill Pemberton, but redone by me in order to handle some of the coding style issues better, by hand. Cc: Bill Pemberton Cc: "Rafał Miłecki" Signed-off-by: Greg Kroah-Hartman --- drivers/bcma/bcma_private.h | 6 +++--- drivers/bcma/driver_gmac_cmn.c | 2 +- drivers/bcma/driver_pci.c | 4 ++-- drivers/bcma/driver_pci_host.c | 13 ++++++------- drivers/bcma/host_pci.c | 8 ++++---- drivers/bcma/main.c | 2 +- include/linux/bcma/bcma_driver_gmac_cmn.h | 2 +- include/linux/bcma/bcma_driver_pci.h | 2 +- 8 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h index 4a2d72ec6d43..19e3fbfd5757 100644 --- a/drivers/bcma/bcma_private.h +++ b/drivers/bcma/bcma_private.h @@ -22,7 +22,7 @@ struct bcma_bus; /* main.c */ -int __devinit bcma_bus_register(struct bcma_bus *bus); +int bcma_bus_register(struct bcma_bus *bus); void bcma_bus_unregister(struct bcma_bus *bus); int __init bcma_bus_early_register(struct bcma_bus *bus, struct bcma_device *core_cc, @@ -87,8 +87,8 @@ u32 bcma_pcie_read(struct bcma_drv_pci *pc, u32 address); extern int bcma_chipco_watchdog_register(struct bcma_drv_cc *cc); #ifdef CONFIG_BCMA_DRIVER_PCI_HOSTMODE -bool __devinit bcma_core_pci_is_in_hostmode(struct bcma_drv_pci *pc); -void __devinit bcma_core_pci_hostmode_init(struct bcma_drv_pci *pc); +bool bcma_core_pci_is_in_hostmode(struct bcma_drv_pci *pc); +void bcma_core_pci_hostmode_init(struct bcma_drv_pci *pc); #endif /* CONFIG_BCMA_DRIVER_PCI_HOSTMODE */ #ifdef CONFIG_BCMA_DRIVER_GPIO diff --git a/drivers/bcma/driver_gmac_cmn.c b/drivers/bcma/driver_gmac_cmn.c index 834225f65e8f..dcb137926d31 100644 --- a/drivers/bcma/driver_gmac_cmn.c +++ b/drivers/bcma/driver_gmac_cmn.c @@ -8,7 +8,7 @@ #include "bcma_private.h" #include -void __devinit bcma_core_gmac_cmn_init(struct bcma_drv_gmac_cmn *gc) +void bcma_core_gmac_cmn_init(struct bcma_drv_gmac_cmn *gc) { mutex_init(&gc->phy_mutex); } diff --git a/drivers/bcma/driver_pci.c b/drivers/bcma/driver_pci.c index c39ee6d45850..cf7a476a519f 100644 --- a/drivers/bcma/driver_pci.c +++ b/drivers/bcma/driver_pci.c @@ -207,14 +207,14 @@ static void bcma_core_pci_config_fixup(struct bcma_drv_pci *pc) * Init. **************************************************/ -static void __devinit bcma_core_pci_clientmode_init(struct bcma_drv_pci *pc) +static void bcma_core_pci_clientmode_init(struct bcma_drv_pci *pc) { bcma_core_pci_fixcfg(pc); bcma_pcicore_serdes_workaround(pc); bcma_core_pci_config_fixup(pc); } -void __devinit bcma_core_pci_init(struct bcma_drv_pci *pc) +void bcma_core_pci_init(struct bcma_drv_pci *pc) { if (pc->setup_done) return; diff --git a/drivers/bcma/driver_pci_host.c b/drivers/bcma/driver_pci_host.c index e6b5c89469dc..af0c9fabee54 100644 --- a/drivers/bcma/driver_pci_host.c +++ b/drivers/bcma/driver_pci_host.c @@ -24,7 +24,7 @@ #define BCMA_PCI_SLOT_MAX 16 #define PCI_CONFIG_SPACE_SIZE 256 -bool __devinit bcma_core_pci_is_in_hostmode(struct bcma_drv_pci *pc) +bool bcma_core_pci_is_in_hostmode(struct bcma_drv_pci *pc) { struct bcma_bus *bus = pc->core->bus; u16 chipid_top; @@ -264,10 +264,9 @@ static int bcma_core_pci_hostmode_write_config(struct pci_bus *bus, } /* return cap_offset if requested capability exists in the PCI config space */ -static u8 __devinit bcma_find_pci_capability(struct bcma_drv_pci *pc, - unsigned int dev, - unsigned int func, u8 req_cap_id, - unsigned char *buf, u32 *buflen) +static u8 bcma_find_pci_capability(struct bcma_drv_pci *pc, unsigned int dev, + unsigned int func, u8 req_cap_id, + unsigned char *buf, u32 *buflen) { u8 cap_id; u8 cap_ptr = 0; @@ -334,7 +333,7 @@ static u8 __devinit bcma_find_pci_capability(struct bcma_drv_pci *pc, * Retry Status (CRS) Completion Status to software then * enable the feature. */ -static void __devinit bcma_core_pci_enable_crs(struct bcma_drv_pci *pc) +static void bcma_core_pci_enable_crs(struct bcma_drv_pci *pc) { struct bcma_bus *bus = pc->core->bus; u8 cap_ptr, root_ctrl, root_cap, dev; @@ -381,7 +380,7 @@ static void __devinit bcma_core_pci_enable_crs(struct bcma_drv_pci *pc) } } -void __devinit bcma_core_pci_hostmode_init(struct bcma_drv_pci *pc) +void bcma_core_pci_hostmode_init(struct bcma_drv_pci *pc) { struct bcma_bus *bus = pc->core->bus; struct bcma_drv_pci_host *pc_host; diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c index 98fdc3e014e7..fbf2759e7e4e 100644 --- a/drivers/bcma/host_pci.c +++ b/drivers/bcma/host_pci.c @@ -155,8 +155,8 @@ static const struct bcma_host_ops bcma_host_pci_ops = { .awrite32 = bcma_host_pci_awrite32, }; -static int __devinit bcma_host_pci_probe(struct pci_dev *dev, - const struct pci_device_id *id) +static int bcma_host_pci_probe(struct pci_dev *dev, + const struct pci_device_id *id) { struct bcma_bus *bus; int err = -ENOMEM; @@ -226,7 +226,7 @@ err_kfree_bus: return err; } -static void __devexit bcma_host_pci_remove(struct pci_dev *dev) +static void bcma_host_pci_remove(struct pci_dev *dev) { struct bcma_bus *bus = pci_get_drvdata(dev); @@ -284,7 +284,7 @@ static struct pci_driver bcma_pci_bridge_driver = { .name = "bcma-pci-bridge", .id_table = bcma_pci_bridge_tbl, .probe = bcma_host_pci_probe, - .remove = __devexit_p(bcma_host_pci_remove), + .remove = bcma_host_pci_remove, .driver.pm = BCMA_PM_OPS, }; diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index 53ba20ca17e0..4a92f647b58b 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -192,7 +192,7 @@ static void bcma_unregister_cores(struct bcma_bus *bus) platform_device_unregister(bus->drv_cc.watchdog); } -int __devinit bcma_bus_register(struct bcma_bus *bus) +int bcma_bus_register(struct bcma_bus *bus) { int err; struct bcma_device *core; diff --git a/include/linux/bcma/bcma_driver_gmac_cmn.h b/include/linux/bcma/bcma_driver_gmac_cmn.h index def894b83b0d..4dd1f33e36a2 100644 --- a/include/linux/bcma/bcma_driver_gmac_cmn.h +++ b/include/linux/bcma/bcma_driver_gmac_cmn.h @@ -92,7 +92,7 @@ struct bcma_drv_gmac_cmn { #define gmac_cmn_write32(gc, offset, val) bcma_write32((gc)->core, offset, val) #ifdef CONFIG_BCMA_DRIVER_GMAC_CMN -extern void __devinit bcma_core_gmac_cmn_init(struct bcma_drv_gmac_cmn *gc); +extern void bcma_core_gmac_cmn_init(struct bcma_drv_gmac_cmn *gc); #else static inline void bcma_core_gmac_cmn_init(struct bcma_drv_gmac_cmn *gc) { } #endif diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 41da581e1612..c48d98d27b77 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -214,7 +214,7 @@ struct bcma_drv_pci { #define pcicore_write16(pc, offset, val) bcma_write16((pc)->core, offset, val) #define pcicore_write32(pc, offset, val) bcma_write32((pc)->core, offset, val) -extern void __devinit bcma_core_pci_init(struct bcma_drv_pci *pc); +extern void bcma_core_pci_init(struct bcma_drv_pci *pc); extern int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core, bool enable); extern void bcma_core_pci_extend_L1timer(struct bcma_drv_pci *pc, bool extend); -- cgit v1.2.3-59-g8ed1b From e389623a68622e3c9be440ab522fac1aa1ca3454 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 21 Dec 2012 15:15:49 -0800 Subject: include: remove __dev* attributes. CONFIG_HOTPLUG is going away as an option. As a result, the __dev* markings need to be removed. This change removes the use of __devinit from some include files that were previously missed. Based on patches originally written by Bill Pemberton, but redone by me in order to handle some of the coding style issues better, by hand. Cc: Bill Pemberton Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/parport.h | 4 ++-- include/linux/ata_platform.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/parport.h b/include/asm-generic/parport.h index 40528cb977e8..2c9f9d4336ca 100644 --- a/include/asm-generic/parport.h +++ b/include/asm-generic/parport.h @@ -10,8 +10,8 @@ * to devices on the PCI bus. */ -static int __devinit parport_pc_find_isa_ports(int autoirq, int autodma); -static int __devinit parport_pc_find_nonpci_ports(int autoirq, int autodma) +static int parport_pc_find_isa_ports(int autoirq, int autodma); +static int parport_pc_find_nonpci_ports(int autoirq, int autodma) { #ifdef CONFIG_ISA return parport_pc_find_isa_ports(autoirq, autodma); diff --git a/include/linux/ata_platform.h b/include/linux/ata_platform.h index fe9989636b62..b9fde17f767c 100644 --- a/include/linux/ata_platform.h +++ b/include/linux/ata_platform.h @@ -15,12 +15,12 @@ struct pata_platform_info { unsigned int irq_flags; }; -extern int __devinit __pata_platform_probe(struct device *dev, - struct resource *io_res, - struct resource *ctl_res, - struct resource *irq_res, - unsigned int ioport_shift, - int __pio_mask); +extern int __pata_platform_probe(struct device *dev, + struct resource *io_res, + struct resource *ctl_res, + struct resource *irq_res, + unsigned int ioport_shift, + int __pio_mask); /* * Marvell SATA private data -- cgit v1.2.3-59-g8ed1b From 03f595668017f1a1fb971c02fc37140bc6e7bb1c Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 4 Jan 2013 15:34:50 -0800 Subject: ipc: add sysctl to specify desired next object id Add 3 new variables and sysctls to tune them (by one "next_id" variable for messages, semaphores and shared memory respectively). This variable can be used to set desired id for next allocated IPC object. By default it's equal to -1 and old behaviour is preserved. If this variable is non-negative, then desired idr will be extracted from it and used as a start value to search for free IDR slot. Notes: 1) this patch doesn't guarantee that the new object will have desired id. So it's up to user space how to handle new object with wrong id. 2) After a sucessful id allocation attempt, "next_id" will be set back to -1 (if it was non-negative). [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Stanislav Kinsbursky Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Al Viro Cc: KOSAKI Motohiro Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 19 +++++++++++++++++++ include/linux/ipc_namespace.h | 1 + ipc/ipc_sysctl.c | 32 ++++++++++++++++++++++++++++++++ ipc/util.c | 16 ++++++++++++---- ipc/util.h | 1 + 5 files changed, 65 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 2907ba6c3607..51b953a1b149 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -38,6 +38,7 @@ show up in /proc/sys/kernel: - l2cr [ PPC only ] - modprobe ==> Documentation/debugging-modules.txt - modules_disabled +- msg_next_id [ sysv ipc ] - msgmax - msgmnb - msgmni @@ -62,7 +63,9 @@ show up in /proc/sys/kernel: - rtsig-max - rtsig-nr - sem +- sem_next_id [ sysv ipc ] - sg-big-buff [ generic SCSI device (sg) ] +- shm_next_id [ sysv ipc ] - shm_rmid_forced - shmall - shmmax [ sysv ipc ] @@ -320,6 +323,22 @@ to false. ============================================================== +msg_next_id, sem_next_id, and shm_next_id: + +These three toggles allows to specify desired id for next allocated IPC +object: message, semaphore or shared memory respectively. + +By default they are equal to -1, which means generic allocation logic. +Possible values to set are in range {0..INT_MAX}. + +Notes: +1) kernel doesn't guarantee, that new object will have desired id. So, +it's up to userspace, how to handle an object with "wrong" id. +2) Toggle with non-default value will be set back to -1 by kernel after +successful IPC object allocation. + +============================================================== + nmi_watchdog: Enables/Disables the NMI watchdog on x86 systems. When the value is diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index fe771978e877..ae221a7b5092 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -24,6 +24,7 @@ struct ipc_ids { unsigned short seq_max; struct rw_semaphore rw_mutex; struct idr ipcs_idr; + int next_id; }; struct ipc_namespace { diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 00fba2bab87d..130dfece27ac 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -158,6 +158,9 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, static int zero; static int one = 1; +#ifdef CONFIG_CHECKPOINT_RESTORE +static int int_max = INT_MAX; +#endif static struct ctl_table ipc_kern_table[] = { { @@ -227,6 +230,35 @@ static struct ctl_table ipc_kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_CHECKPOINT_RESTORE + { + .procname = "sem_next_id", + .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, + .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), + .mode = 0644, + .proc_handler = proc_ipc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &int_max, + }, + { + .procname = "msg_next_id", + .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, + .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), + .mode = 0644, + .proc_handler = proc_ipc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &int_max, + }, + { + .procname = "shm_next_id", + .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, + .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), + .mode = 0644, + .proc_handler = proc_ipc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &int_max, + }, +#endif {} }; diff --git a/ipc/util.c b/ipc/util.c index 72fd0785ac94..74e1d9c7a98a 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -122,6 +122,7 @@ void ipc_init_ids(struct ipc_ids *ids) ids->in_use = 0; ids->seq = 0; + ids->next_id = -1; { int seq_limit = INT_MAX/SEQ_MULTIPLIER; if (seq_limit > USHRT_MAX) @@ -252,6 +253,7 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) kuid_t euid; kgid_t egid; int id, err; + int next_id = ids->next_id; if (size > IPCMNI) size = IPCMNI; @@ -264,7 +266,8 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) rcu_read_lock(); spin_lock(&new->lock); - err = idr_get_new(&ids->ipcs_idr, new, &id); + err = idr_get_new_above(&ids->ipcs_idr, new, + (next_id < 0) ? 0 : ipcid_to_idx(next_id), &id); if (err) { spin_unlock(&new->lock); rcu_read_unlock(); @@ -277,9 +280,14 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) new->cuid = new->uid = euid; new->gid = new->cgid = egid; - new->seq = ids->seq++; - if(ids->seq > ids->seq_max) - ids->seq = 0; + if (next_id < 0) { + new->seq = ids->seq++; + if (ids->seq > ids->seq_max) + ids->seq = 0; + } else { + new->seq = ipcid_to_seqx(next_id); + ids->next_id = -1; + } new->id = ipc_buildid(id, new->seq); return id; diff --git a/ipc/util.h b/ipc/util.h index c8fe2f7631e9..a61e0ca2bffd 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -92,6 +92,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, #define IPC_SHM_IDS 2 #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) +#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) /* must be called with ids->rw_mutex acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); -- cgit v1.2.3-59-g8ed1b From f9dd87f4738c7555aca2cdf8cb2b2326cafb0cad Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 4 Jan 2013 15:34:52 -0800 Subject: ipc: message queue receive cleanup Move all message related manipulation into one function msg_fill(). Actually, two functions because of the compat one. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Stanislav Kinsbursky Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Al Viro Cc: KOSAKI Motohiro Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/msg.h | 5 +++-- ipc/compat.c | 45 +++++++++++++++++++-------------------------- ipc/msg.c | 44 +++++++++++++++++++++++--------------------- 3 files changed, 45 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msg.h b/include/linux/msg.h index 7a4b9e97d29a..fc5743a554e6 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -34,7 +34,8 @@ struct msg_queue { /* Helper routines for sys_msgsnd and sys_msgrcv */ extern long do_msgsnd(int msqid, long mtype, void __user *mtext, size_t msgsz, int msgflg); -extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext, - size_t msgsz, long msgtyp, int msgflg); +extern long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, + int msgflg, + long (*msg_fill)(void __user *, struct msg_msg *, size_t)); #endif /* _LINUX_MSG_H */ diff --git a/ipc/compat.c b/ipc/compat.c index ad9518eb26e0..eb3ea16d2d1d 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -306,6 +306,20 @@ static long do_compat_semctl(int first, int second, int third, u32 pad) return err; } +long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) +{ + struct compat_msgbuf __user *msgp = dest; + size_t msgsz; + + if (put_user(msg->m_type, &msgp->mtype)) + return -EFAULT; + + msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; + if (store_msg(msgp->mtext, msg, msgsz)) + return -EFAULT; + return msgsz; +} + #ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC long compat_sys_semctl(int first, int second, int third, void __user *uptr) { @@ -337,10 +351,6 @@ long compat_sys_msgsnd(int first, int second, int third, void __user *uptr) long compat_sys_msgrcv(int first, int second, int msgtyp, int third, int version, void __user *uptr) { - struct compat_msgbuf __user *up; - long type; - int err; - if (first < 0) return -EINVAL; if (second < 0) @@ -348,23 +358,14 @@ long compat_sys_msgrcv(int first, int second, int msgtyp, int third, if (!version) { struct compat_ipc_kludge ipck; - err = -EINVAL; if (!uptr) - goto out; - err = -EFAULT; + return -EINVAL; if (copy_from_user (&ipck, uptr, sizeof(ipck))) - goto out; + return -EFAULT; uptr = compat_ptr(ipck.msgp); msgtyp = ipck.msgtyp; } - up = uptr; - err = do_msgrcv(first, &type, up->mtext, second, msgtyp, third); - if (err < 0) - goto out; - if (put_user(type, &up->mtype)) - err = -EFAULT; -out: - return err; + return do_msgrcv(first, uptr, second, msgtyp, third, compat_do_msg_fill); } #else long compat_sys_semctl(int semid, int semnum, int cmd, int arg) @@ -385,16 +386,8 @@ long compat_sys_msgsnd(int msqid, struct compat_msgbuf __user *msgp, long compat_sys_msgrcv(int msqid, struct compat_msgbuf __user *msgp, compat_ssize_t msgsz, long msgtyp, int msgflg) { - long err, mtype; - - err = do_msgrcv(msqid, &mtype, msgp->mtext, (ssize_t)msgsz, msgtyp, msgflg); - if (err < 0) - goto out; - - if (put_user(mtype, &msgp->mtype)) - err = -EFAULT; - out: - return err; + return do_msgrcv(msqid, msgp, (ssize_t)msgsz, msgtyp, msgflg, + compat_do_msg_fill); } #endif diff --git a/ipc/msg.c b/ipc/msg.c index 2f272fa76595..cefc24f46e3e 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -755,15 +755,30 @@ static inline int convert_mode(long *msgtyp, int msgflg) return SEARCH_EQUAL; } -long do_msgrcv(int msqid, long *pmtype, void __user *mtext, - size_t msgsz, long msgtyp, int msgflg) +static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) +{ + struct msgbuf __user *msgp = dest; + size_t msgsz; + + if (put_user(msg->m_type, &msgp->mtype)) + return -EFAULT; + + msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; + if (store_msg(msgp->mtext, msg, msgsz)) + return -EFAULT; + return msgsz; +} + +long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, + int msgflg, + long (*msg_handler)(void __user *, struct msg_msg *, size_t)) { struct msg_queue *msq; struct msg_msg *msg; int mode; struct ipc_namespace *ns; - if (msqid < 0 || (long) msgsz < 0) + if (msqid < 0 || (long) bufsz < 0) return -EINVAL; mode = convert_mode(&msgtyp, msgflg); ns = current->nsproxy->ipc_ns; @@ -804,7 +819,7 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext, * Found a suitable message. * Unlink it from the queue. */ - if ((msgsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) { + if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) { msg = ERR_PTR(-E2BIG); goto out_unlock; } @@ -831,7 +846,7 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext, if (msgflg & MSG_NOERROR) msr_d.r_maxsize = INT_MAX; else - msr_d.r_maxsize = msgsz; + msr_d.r_maxsize = bufsz; msr_d.r_msg = ERR_PTR(-EAGAIN); current->state = TASK_INTERRUPTIBLE; msg_unlock(msq); @@ -894,29 +909,16 @@ out_unlock: if (IS_ERR(msg)) return PTR_ERR(msg); - msgsz = (msgsz > msg->m_ts) ? msg->m_ts : msgsz; - *pmtype = msg->m_type; - if (store_msg(mtext, msg, msgsz)) - msgsz = -EFAULT; - + bufsz = msg_handler(buf, msg, bufsz); free_msg(msg); - return msgsz; + return bufsz; } SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, long, msgtyp, int, msgflg) { - long err, mtype; - - err = do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg); - if (err < 0) - goto out; - - if (put_user(mtype, &msgp->mtype)) - err = -EFAULT; -out: - return err; + return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); } #ifdef CONFIG_PROC_FS -- cgit v1.2.3-59-g8ed1b From 3a665531a3b7c2ad2c87903b24646be6916340e4 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 4 Jan 2013 15:34:56 -0800 Subject: selftests: IPC message queue copy feature test This test can be used to check wheither kernel supports IPC message queue copy and restore features (required by CRIU project). Signed-off-by: Stanislav Kinsbursky Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Al Viro Cc: KOSAKI Motohiro Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/msg.h | 3 +- ipc/compat.c | 3 +- tools/testing/selftests/ipc/Makefile | 25 ++++ tools/testing/selftests/ipc/msgque.c | 246 +++++++++++++++++++++++++++++++++++ 4 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/ipc/Makefile create mode 100644 tools/testing/selftests/ipc/msgque.c (limited to 'include/linux') diff --git a/include/linux/msg.h b/include/linux/msg.h index fc5743a554e6..391af8d11cce 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -36,6 +36,7 @@ extern long do_msgsnd(int msqid, long mtype, void __user *mtext, size_t msgsz, int msgflg); extern long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, - long (*msg_fill)(void __user *, struct msg_msg *, size_t)); + long (*msg_fill)(void __user *, struct msg_msg *, + size_t)); #endif /* _LINUX_MSG_H */ diff --git a/ipc/compat.c b/ipc/compat.c index eb3ea16d2d1d..2547f29dcd1b 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -365,7 +365,8 @@ long compat_sys_msgrcv(int first, int second, int msgtyp, int third, uptr = compat_ptr(ipck.msgp); msgtyp = ipck.msgtyp; } - return do_msgrcv(first, uptr, second, msgtyp, third, compat_do_msg_fill); + return do_msgrcv(first, uptr, second, msgtyp, third, + compat_do_msg_fill); } #else long compat_sys_semctl(int semid, int semnum, int cmd, int arg) diff --git a/tools/testing/selftests/ipc/Makefile b/tools/testing/selftests/ipc/Makefile new file mode 100644 index 000000000000..5386fd7c43ae --- /dev/null +++ b/tools/testing/selftests/ipc/Makefile @@ -0,0 +1,25 @@ +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) +ifeq ($(ARCH),i386) + ARCH := X86 + CFLAGS := -DCONFIG_X86_32 -D__i386__ +endif +ifeq ($(ARCH),x86_64) + ARCH := X86 + CFLAGS := -DCONFIG_X86_64 -D__x86_64__ +endif + +CFLAGS += -I../../../../usr/include/ + +all: +ifeq ($(ARCH),X86) + gcc $(CFLAGS) msgque.c -o msgque_test +else + echo "Not an x86 target, can't build msgque selftest" +endif + +run_tests: all + ./msgque_test + +clean: + rm -fr ./msgque_test diff --git a/tools/testing/selftests/ipc/msgque.c b/tools/testing/selftests/ipc/msgque.c new file mode 100644 index 000000000000..d66418237d21 --- /dev/null +++ b/tools/testing/selftests/ipc/msgque.c @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include +#include + +#define MAX_MSG_SIZE 32 + +struct msg1 { + int msize; + long mtype; + char mtext[MAX_MSG_SIZE]; +}; + +#define TEST_STRING "Test sysv5 msg" +#define MSG_TYPE 1 + +#define ANOTHER_TEST_STRING "Yet another test sysv5 msg" +#define ANOTHER_MSG_TYPE 26538 + +struct msgque_data { + key_t key; + int msq_id; + int qbytes; + int qnum; + int mode; + struct msg1 *messages; +}; + +int restore_queue(struct msgque_data *msgque) +{ + int fd, ret, id, i; + char buf[32]; + + fd = open("/proc/sys/kernel/msg_next_id", O_WRONLY); + if (fd == -1) { + printf("Failed to open /proc/sys/kernel/msg_next_id\n"); + return -errno; + } + sprintf(buf, "%d", msgque->msq_id); + + ret = write(fd, buf, strlen(buf)); + if (ret != strlen(buf)) { + printf("Failed to write to /proc/sys/kernel/msg_next_id\n"); + return -errno; + } + + id = msgget(msgque->key, msgque->mode | IPC_CREAT | IPC_EXCL); + if (id == -1) { + printf("Failed to create queue\n"); + return -errno; + } + + if (id != msgque->msq_id) { + printf("Restored queue has wrong id (%d instead of %d)\n", + id, msgque->msq_id); + ret = -EFAULT; + goto destroy; + } + + for (i = 0; i < msgque->qnum; i++) { + if (msgsnd(msgque->msq_id, &msgque->messages[i].mtype, + msgque->messages[i].msize, IPC_NOWAIT) != 0) { + printf("msgsnd failed (%m)\n"); + ret = -errno; + goto destroy; + }; + } + return 0; + +destroy: + if (msgctl(id, IPC_RMID, 0)) + printf("Failed to destroy queue: %d\n", -errno); + return ret; +} + +int check_and_destroy_queue(struct msgque_data *msgque) +{ + struct msg1 message; + int cnt = 0, ret; + + while (1) { + ret = msgrcv(msgque->msq_id, &message.mtype, MAX_MSG_SIZE, + 0, IPC_NOWAIT); + if (ret < 0) { + if (errno == ENOMSG) + break; + printf("Failed to read IPC message: %m\n"); + ret = -errno; + goto err; + } + if (ret != msgque->messages[cnt].msize) { + printf("Wrong message size: %d (expected %d)\n", ret, + msgque->messages[cnt].msize); + ret = -EINVAL; + goto err; + } + if (message.mtype != msgque->messages[cnt].mtype) { + printf("Wrong message type\n"); + ret = -EINVAL; + goto err; + } + if (memcmp(message.mtext, msgque->messages[cnt].mtext, ret)) { + printf("Wrong message content\n"); + ret = -EINVAL; + goto err; + } + cnt++; + } + + if (cnt != msgque->qnum) { + printf("Wrong message number\n"); + ret = -EINVAL; + goto err; + } + + ret = 0; +err: + if (msgctl(msgque->msq_id, IPC_RMID, 0)) { + printf("Failed to destroy queue: %d\n", -errno); + return -errno; + } + return ret; +} + +int dump_queue(struct msgque_data *msgque) +{ + struct msqid64_ds ds; + int kern_id; + int i, ret; + + for (kern_id = 0; kern_id < 256; kern_id++) { + ret = msgctl(kern_id, MSG_STAT, &ds); + if (ret < 0) { + if (errno == -EINVAL) + continue; + printf("Failed to get stats for IPC queue with id %d\n", + kern_id); + return -errno; + } + + if (ret == msgque->msq_id) + break; + } + + msgque->messages = malloc(sizeof(struct msg1) * ds.msg_qnum); + if (msgque->messages == NULL) { + printf("Failed to get stats for IPC queue\n"); + return -ENOMEM; + } + + msgque->qnum = ds.msg_qnum; + msgque->mode = ds.msg_perm.mode; + msgque->qbytes = ds.msg_qbytes; + + for (i = 0; i < msgque->qnum; i++) { + ret = msgrcv(msgque->msq_id, &msgque->messages[i].mtype, + MAX_MSG_SIZE, i, IPC_NOWAIT | MSG_COPY); + if (ret < 0) { + printf("Failed to copy IPC message: %m (%d)\n", errno); + return -errno; + } + msgque->messages[i].msize = ret; + } + return 0; +} + +int fill_msgque(struct msgque_data *msgque) +{ + struct msg1 msgbuf; + + msgbuf.mtype = MSG_TYPE; + memcpy(msgbuf.mtext, TEST_STRING, sizeof(TEST_STRING)); + if (msgsnd(msgque->msq_id, &msgbuf.mtype, sizeof(TEST_STRING), + IPC_NOWAIT) != 0) { + printf("First message send failed (%m)\n"); + return -errno; + }; + + msgbuf.mtype = ANOTHER_MSG_TYPE; + memcpy(msgbuf.mtext, ANOTHER_TEST_STRING, sizeof(ANOTHER_TEST_STRING)); + if (msgsnd(msgque->msq_id, &msgbuf.mtype, sizeof(ANOTHER_TEST_STRING), + IPC_NOWAIT) != 0) { + printf("Second message send failed (%m)\n"); + return -errno; + }; + return 0; +} + +int main(int argc, char **argv) +{ + int msg, pid, err; + struct msgque_data msgque; + + msgque.key = ftok(argv[0], 822155650); + if (msgque.key == -1) { + printf("Can't make key\n"); + return -errno; + } + + msgque.msq_id = msgget(msgque.key, IPC_CREAT | IPC_EXCL | 0666); + if (msgque.msq_id == -1) { + printf("Can't create queue\n"); + goto err_out; + } + + err = fill_msgque(&msgque); + if (err) { + printf("Failed to fill queue\n"); + goto err_destroy; + } + + err = dump_queue(&msgque); + if (err) { + printf("Failed to dump queue\n"); + goto err_destroy; + } + + err = check_and_destroy_queue(&msgque); + if (err) { + printf("Failed to check and destroy queue\n"); + goto err_out; + } + + err = restore_queue(&msgque); + if (err) { + printf("Failed to restore queue\n"); + goto err_destroy; + } + + err = check_and_destroy_queue(&msgque); + if (err) { + printf("Failed to test queue\n"); + goto err_out; + } + return 0; + +err_destroy: + if (msgctl(msgque.msq_id, IPC_RMID, 0)) { + printf("Failed to destroy queue: %d\n", -errno); + return -errno; + } +err_out: + return err; +} -- cgit v1.2.3-59-g8ed1b From a458431e176ddb27e8ef8b98c2a681b217337393 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 4 Jan 2013 15:35:08 -0800 Subject: mm: fix zone_watermark_ok_safe() accounting of isolated pages Commit 702d1a6e0766 ("memory-hotplug: fix kswapd looping forever problem") added an isolated pageblocks counter (nr_pageblock_isolate in struct zone) and used it to adjust free pages counter in zone_watermark_ok_safe() to prevent kswapd looping forever problem. Then later, commit 2139cbe627b8 ("cma: fix counting of isolated pages") fixed accounting of isolated pages in global free pages counter. It made the previous zone_watermark_ok_safe() fix unnecessary and potentially harmful (cause now isolated pages may be accounted twice making free pages counter incorrect). This patch removes the special isolated pageblocks counter altogether which fixes zone_watermark_ok_safe() free pages check. Reported-by: Tomasz Stanislawski Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Aaditya Kumar Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 -------- mm/page_alloc.c | 27 --------------------------- mm/page_isolation.c | 26 ++------------------------ 3 files changed, 2 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4bec5be82cab..73b64a38b984 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -503,14 +503,6 @@ struct zone { * rarely used fields: */ const char *name; -#ifdef CONFIG_MEMORY_ISOLATION - /* - * the number of MIGRATE_ISOLATE *pageblock*. - * We need this for free page counting. Look at zone_watermark_ok_safe. - * It's protected by zone->lock - */ - int nr_pageblock_isolate; -#endif } ____cacheline_internodealigned_in_smp; typedef enum { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ba5e37127fc..bc6cc0e913bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -221,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -/* - * NOTE: - * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. - * Instead, use {un}set_pageblock_isolate. - */ void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -1655,20 +1650,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -#ifdef CONFIG_MEMORY_ISOLATION -static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) -{ - if (unlikely(zone->nr_pageblock_isolate)) - return zone->nr_pageblock_isolate * pageblock_nr_pages; - return 0; -} -#else -static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) -{ - return 0; -} -#endif - bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags) { @@ -1684,14 +1665,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - /* - * If the zone has MIGRATE_ISOLATE type free pages, we should consider - * it. nr_zone_isolate_freepages is never accurate so kswapd might not - * sleep although it could do so. But this is more desirable for memory - * hotplug than sleeping which can cause a livelock in the direct - * reclaim path. - */ - free_pages -= nr_zone_isolate_freepages(z); return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, free_pages); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9d2264ea4606..383bdbb98b04 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -8,28 +8,6 @@ #include #include "internal.h" -/* called while holding zone->lock */ -static void set_pageblock_isolate(struct page *page) -{ - if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) - return; - - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - page_zone(page)->nr_pageblock_isolate++; -} - -/* called while holding zone->lock */ -static void restore_pageblock_isolate(struct page *page, int migratetype) -{ - struct zone *zone = page_zone(page); - if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) - return; - - BUG_ON(zone->nr_pageblock_isolate <= 0); - set_pageblock_migratetype(page, migratetype); - zone->nr_pageblock_isolate--; -} - int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) { struct zone *zone; @@ -80,7 +58,7 @@ out: unsigned long nr_pages; int migratetype = get_pageblock_migratetype(page); - set_pageblock_isolate(page); + set_pageblock_migratetype(page, MIGRATE_ISOLATE); nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages, migratetype); @@ -103,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) goto out; nr_pages = move_freepages_block(zone, page, migratetype); __mod_zone_freepage_state(zone, nr_pages, migratetype); - restore_pageblock_isolate(page, migratetype); + set_pageblock_migratetype(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } -- cgit v1.2.3-59-g8ed1b From 08c097fc3bb283299a6915a6a3795edab85979b1 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Wed, 9 Jan 2013 14:16:30 +0000 Subject: cred: Remove tgcred pointer from struct cred Commit 3a50597de863 ("KEYS: Make the session and process keyrings per-thread") removed the definition of the thread_group_cred structure, but left a now unused pointer in struct cred. Signed-off-by: Marc Dionne Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/linux/cred.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index abb2cd50f6b2..04421e825365 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -128,7 +128,6 @@ struct cred { struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ - struct thread_group_cred *tgcred; /* thread-group shared credentials */ #endif #ifdef CONFIG_SECURITY void *security; /* subjective LSM security */ -- cgit v1.2.3-59-g8ed1b From 54b956b903607f8f8878754dd4352da6a54a1da2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 10 Jan 2013 10:57:01 -0800 Subject: Remove __dev* markings from init.h Now that all in-kernel users of __dev* are gone, let's remove them from init.h to keep them from popping up again and again. Thanks to Bill Pemberton for doing all of the hard work to make removal of this possible. Cc: Bill Pemberton Cc: Stephen Rothwell Signed-off-by: Greg Kroah-Hartman --- include/linux/init.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index a799273714ac..10ed4f436458 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -93,14 +93,6 @@ #define __exit __section(.exit.text) __exitused __cold notrace -/* Used for HOTPLUG, but that is always enabled now, so just make them noops */ -#define __devinit -#define __devinitdata -#define __devinitconst -#define __devexit -#define __devexitdata -#define __devexitconst - /* Used for HOTPLUG_CPU */ #define __cpuinit __section(.cpuinit.text) __cold notrace #define __cpuinitdata __section(.cpuinit.data) @@ -337,18 +329,6 @@ void __init parse_early_options(char *cmdline); #define __INITRODATA_OR_MODULE __INITRODATA #endif /*CONFIG_MODULES*/ -/* Functions marked as __devexit may be discarded at kernel link time, depending - on config options. Newer versions of binutils detect references from - retained sections to discarded sections and flag an error. Pointers to - __devexit functions must use __devexit_p(function_name), the wrapper will - insert either the function_name or NULL, depending on the config options. - */ -#if defined(MODULE) || defined(CONFIG_HOTPLUG) -#define __devexit_p(x) x -#else -#define __devexit_p(x) NULL -#endif - #ifdef MODULE #define __exit_p(x) x #else -- cgit v1.2.3-59-g8ed1b From 896f97ea95c1d29c0520ee0766b66b7f64cb967c Mon Sep 17 00:00:00 2001 From: David Decotigny Date: Fri, 11 Jan 2013 14:31:36 -0800 Subject: lib: cpu_rmap: avoid flushing all workqueues In some cases, free_irq_cpu_rmap() is called while holding a lock (eg rtnl). This can lead to deadlocks, because it invokes flush_scheduled_work() which ends up waiting for whole system workqueue to flush, but some pending works might try to acquire the lock we are already holding. This commit uses reference-counting to replace irq_run_affinity_notifiers(). It also removes irq_run_affinity_notifiers() altogether. [akpm@linux-foundation.org: eliminate free_cpu_rmap, rename cpu_rmap_reclaim() to cpu_rmap_release(), propagate kref_put() retval from cpu_rmap_put()] Signed-off-by: David Decotigny Reviewed-by: Ben Hutchings Acked-by: Eric Dumazet Reviewed-by: Josh Triplett Cc: "David S. Miller" Cc: Or Gerlitz Acked-by: Amir Vadai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu_rmap.h | 13 ++++-------- include/linux/interrupt.h | 5 ----- lib/cpu_rmap.c | 54 ++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 53 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h index ac3bbb5b9502..1739510d8994 100644 --- a/include/linux/cpu_rmap.h +++ b/include/linux/cpu_rmap.h @@ -13,9 +13,11 @@ #include #include #include +#include /** * struct cpu_rmap - CPU affinity reverse-map + * @refcount: kref for object * @size: Number of objects to be reverse-mapped * @used: Number of objects added * @obj: Pointer to array of object pointers @@ -23,6 +25,7 @@ * based on affinity masks */ struct cpu_rmap { + struct kref refcount; u16 size, used; void **obj; struct { @@ -33,15 +36,7 @@ struct cpu_rmap { #define CPU_RMAP_DIST_INF 0xffff extern struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags); - -/** - * free_cpu_rmap - free CPU affinity reverse-map - * @rmap: Reverse-map allocated with alloc_cpu_rmap(), or %NULL - */ -static inline void free_cpu_rmap(struct cpu_rmap *rmap) -{ - kfree(rmap); -} +extern int cpu_rmap_put(struct cpu_rmap *rmap); extern int cpu_rmap_add(struct cpu_rmap *rmap, void *obj); extern int cpu_rmap_update(struct cpu_rmap *rmap, u16 index, diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5e4e6170f43a..5fa5afeeb759 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -268,11 +268,6 @@ struct irq_affinity_notify { extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); -static inline void irq_run_affinity_notifiers(void) -{ - flush_scheduled_work(); -} - #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c index 145dec5267c9..5fbed5caba6e 100644 --- a/lib/cpu_rmap.c +++ b/lib/cpu_rmap.c @@ -45,6 +45,7 @@ struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags) if (!rmap) return NULL; + kref_init(&rmap->refcount); rmap->obj = (void **)((char *)rmap + obj_offset); /* Initially assign CPUs to objects on a rota, since we have @@ -63,6 +64,35 @@ struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags) } EXPORT_SYMBOL(alloc_cpu_rmap); +/** + * cpu_rmap_release - internal reclaiming helper called from kref_put + * @ref: kref to struct cpu_rmap + */ +static void cpu_rmap_release(struct kref *ref) +{ + struct cpu_rmap *rmap = container_of(ref, struct cpu_rmap, refcount); + kfree(rmap); +} + +/** + * cpu_rmap_get - internal helper to get new ref on a cpu_rmap + * @rmap: reverse-map allocated with alloc_cpu_rmap() + */ +static inline void cpu_rmap_get(struct cpu_rmap *rmap) +{ + kref_get(&rmap->refcount); +} + +/** + * cpu_rmap_put - release ref on a cpu_rmap + * @rmap: reverse-map allocated with alloc_cpu_rmap() + */ +int cpu_rmap_put(struct cpu_rmap *rmap) +{ + return kref_put(&rmap->refcount, cpu_rmap_release); +} +EXPORT_SYMBOL(cpu_rmap_put); + /* Reevaluate nearest object for given CPU, comparing with the given * neighbours at the given distance. */ @@ -197,8 +227,7 @@ struct irq_glue { * free_irq_cpu_rmap - free a CPU affinity reverse-map used for IRQs * @rmap: Reverse-map allocated with alloc_irq_cpu_map(), or %NULL * - * Must be called in process context, before freeing the IRQs, and - * without holding any locks required by global workqueue items. + * Must be called in process context, before freeing the IRQs. */ void free_irq_cpu_rmap(struct cpu_rmap *rmap) { @@ -212,12 +241,18 @@ void free_irq_cpu_rmap(struct cpu_rmap *rmap) glue = rmap->obj[index]; irq_set_affinity_notifier(glue->notify.irq, NULL); } - irq_run_affinity_notifiers(); - kfree(rmap); + cpu_rmap_put(rmap); } EXPORT_SYMBOL(free_irq_cpu_rmap); +/** + * irq_cpu_rmap_notify - callback for IRQ subsystem when IRQ affinity updated + * @notify: struct irq_affinity_notify passed by irq/manage.c + * @mask: cpu mask for new SMP affinity + * + * This is executed in workqueue context. + */ static void irq_cpu_rmap_notify(struct irq_affinity_notify *notify, const cpumask_t *mask) { @@ -230,10 +265,16 @@ irq_cpu_rmap_notify(struct irq_affinity_notify *notify, const cpumask_t *mask) pr_warning("irq_cpu_rmap_notify: update failed: %d\n", rc); } +/** + * irq_cpu_rmap_release - reclaiming callback for IRQ subsystem + * @ref: kref to struct irq_affinity_notify passed by irq/manage.c + */ static void irq_cpu_rmap_release(struct kref *ref) { struct irq_glue *glue = container_of(ref, struct irq_glue, notify.kref); + + cpu_rmap_put(glue->rmap); kfree(glue); } @@ -258,10 +299,13 @@ int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq) glue->notify.notify = irq_cpu_rmap_notify; glue->notify.release = irq_cpu_rmap_release; glue->rmap = rmap; + cpu_rmap_get(rmap); glue->index = cpu_rmap_add(rmap, glue); rc = irq_set_affinity_notifier(irq, &glue->notify); - if (rc) + if (rc) { + cpu_rmap_put(glue->rmap); kfree(glue); + } return rc; } EXPORT_SYMBOL(irq_cpu_rmap_add); -- cgit v1.2.3-59-g8ed1b From 1b963c81b14509e330e0fe3218b645ece2738dc5 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 11 Jan 2013 14:31:56 -0800 Subject: lockdep, rwsem: provide down_write_nest_lock() down_write_nest_lock() provides a means to annotate locking scenario where an outer lock is guaranteed to serialize the order nested locks are being acquired. This is analogoue to already existing mutex_lock_nest_lock() and spin_lock_nest_lock(). Signed-off-by: Jiri Kosina Cc: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Mel Gorman Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 3 +++ include/linux/rwsem.h | 9 +++++++++ kernel/rwsem.c | 10 ++++++++++ 3 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 00e46376e28f..2bca44b0893c 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -524,14 +524,17 @@ static inline void print_irqtrace_events(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCK_ALLOC # ifdef CONFIG_PROVE_LOCKING # define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) +# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) # define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, NULL, i) # else # define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) +# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) # define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, NULL, i) # endif # define rwsem_release(l, n, i) lock_release(l, n, i) #else # define rwsem_acquire(l, s, t, i) do { } while (0) +# define rwsem_acquire_nest(l, s, t, n, i) do { } while (0) # define rwsem_acquire_read(l, s, t, i) do { } while (0) # define rwsem_release(l, n, i) do { } while (0) #endif diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 54bd7cd7ecbd..413cc11e414a 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -125,8 +125,17 @@ extern void downgrade_write(struct rw_semaphore *sem); */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); + +# define down_write_nest_lock(sem, nest_lock) \ +do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ +} while (0); + #else # define down_read_nested(sem, subclass) down_read(sem) +# define down_write_nest_lock(sem, nest_lock) down_read(sem) # define down_write_nested(sem, subclass) down_write(sem) #endif diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 6850f53e02d8..b3c6c3fcd847 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); +void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) +{ + might_sleep(); + rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(_down_write_nest_lock); + void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); -- cgit v1.2.3-59-g8ed1b From 7b9205bd775afc4439ed86d617f9042ee9e76a71 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Jan 2013 14:32:05 -0800 Subject: audit: create explicit AUDIT_SECCOMP event type The seccomp path was using AUDIT_ANOM_ABEND from when seccomp mode 1 could only kill a process. While we still want to make sure an audit record is forced on a kill, this should use a separate record type since seccomp mode 2 introduces other behaviors. In the case of "handled" behaviors (process wasn't killed), only emit a record if the process is under inspection. This change also fixes userspace examination of seccomp audit events, since it was considered malformed due to missing fields of the AUDIT_ANOM_ABEND event type. Signed-off-by: Kees Cook Cc: Al Viro Cc: Eric Paris Cc: Jeff Layton Cc: "Eric W. Biederman" Cc: Julien Tinnes Acked-by: Will Drewry Acked-by: Steve Grubb Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/audit.h | 3 ++- include/uapi/linux/audit.h | 1 + kernel/auditsc.c | 14 +++++++++++--- 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index bce729afbcf9..9d5104d7aba9 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -157,7 +157,8 @@ void audit_core_dumps(long signr); static inline void audit_seccomp(unsigned long syscall, long signr, int code) { - if (unlikely(!audit_dummy_context())) + /* Force a record to be reported if a signal was delivered. */ + if (signr || unlikely(!audit_dummy_context())) __audit_seccomp(syscall, signr, code); } diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 76352ac45f24..09a2d94ab113 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -106,6 +106,7 @@ #define AUDIT_MMAP 1323 /* Record showing descriptor and flags in mmap */ #define AUDIT_NETFILTER_PKT 1324 /* Packets traversing netfilter chains */ #define AUDIT_NETFILTER_CFG 1325 /* Netfilter chain modifications */ +#define AUDIT_SECCOMP 1326 /* Secure Computing event */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e37e6a12c5e3..3e46d1dec613 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2675,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } -static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; @@ -2693,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); +} + +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + audit_log_task(ab); audit_log_format(ab, " reason="); audit_log_string(ab, reason); audit_log_format(ab, " sig=%ld", signr); @@ -2723,8 +2728,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "seccomp", signr); + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP); + if (unlikely(!ab)) + return; + audit_log_task(ab); + audit_log_format(ab, " sig=%ld", signr); audit_log_format(ab, " syscall=%ld", syscall); audit_log_format(ab, " compat=%d", is_compat_task()); audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); -- cgit v1.2.3-59-g8ed1b From c0a3a20b6c4b5229ef5d26fd9b1c4b1957632aa7 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 11 Jan 2013 14:32:13 -0800 Subject: linux/audit.h: move ptrace.h include to kernel header While the kernel internals want pt_regs (and so it includes linux/ptrace.h), the user version of audit.h does not need it. So move the include out of the uapi version. This avoids issues where people want the audit defines and userland ptrace api. Including both the kernel ptrace and the userland ptrace headers can easily lead to failure. Signed-off-by: Mike Frysinger Cc: Eric Paris Cc: Al Viro Reviewed-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/audit.h | 1 + include/uapi/linux/audit.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 9d5104d7aba9..5a6d718adf34 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -24,6 +24,7 @@ #define _LINUX_AUDIT_H_ #include +#include #include struct audit_sig_info { diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 09a2d94ab113..9f096f1c0907 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -26,7 +26,6 @@ #include #include -#include /* The netlink messages for the audit system is divided into blocks: * 1000 - 1099 are for commanding the audit system -- cgit v1.2.3-59-g8ed1b From 8fb74b9fb2b182d54beee592350d9ea1f325917a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 11 Jan 2013 14:32:16 -0800 Subject: mm: compaction: partially revert capture of suitable high-order page Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting for POLLIN on a local TCP socket. It was easier to trigger if there was disk IO and dirty pages at the same time and he bisected it to commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available"). The intention of that patch was to improve high-order allocations under memory pressure after changes made to reclaim in 3.6 drastically hurt THP allocations but the approach was flawed. For Eric, the problem was that page->pfmemalloc was not being cleared for captured pages leading to a poor interaction with swap-over-NFS support causing the packets to be dropped. However, I identified a few more problems with the patch including the fact that it can increase contention on zone->lock in some cases which could result in async direct compaction being aborted early. In retrospect the capture patch took the wrong approach. What it should have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it was allocating for THP and avoided races that way. While the patch was showing to improve allocation success rates at the time, the benefit is marginal given the relative complexity and it should be revisited from scratch in the context of the other reclaim-related changes that have taken place since the patch was first written and tested. This patch partially reverts commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available"). Reported-and-tested-by: Eric Wong Tested-by: Eric Dumazet Cc: Signed-off-by: Mel Gorman Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 +- include/linux/mm.h | 1 - mm/compaction.c | 92 +++++++--------------------------------------- mm/internal.h | 1 - mm/page_alloc.c | 35 ++++-------------- 5 files changed, 23 insertions(+), 110 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 6ecb6dc2f303..cc7bddeaf553 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - bool sync, bool *contended, struct page **page); + bool sync, bool *contended); extern int compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -75,7 +75,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended, struct page **page) + bool sync, bool *contended) { return COMPACT_CONTINUE; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 63204078f72b..66e2f7c61e5c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -455,7 +455,6 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); int split_free_page(struct page *page); -int capture_free_page(struct page *page, int alloc_order, int migratetype); /* * Compound pages have a destructor function. Provide a diff --git a/mm/compaction.c b/mm/compaction.c index f8f5c111b7d7..c62bd063d766 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { + unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) @@ -850,22 +851,16 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - if (cc->page) { - /* Was a suitable page captured? */ - if (*cc->page) + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; - } else { - unsigned int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[cc->order]; - /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[cc->migratetype])) - return COMPACT_PARTIAL; - - /* Job done if allocation would set block type */ - if (cc->order >= pageblock_order && area->nr_free) - return COMPACT_PARTIAL; - } } return COMPACT_CONTINUE; @@ -921,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_CONTINUE; } -static void compact_capture_page(struct compact_control *cc) -{ - unsigned long flags; - int mtype, mtype_low, mtype_high; - - if (!cc->page || *cc->page) - return; - - /* - * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP - * regardless of the migratetype of the freelist is is captured from. - * This is fine because the order for a high-order MIGRATE_MOVABLE - * allocation is typically at least a pageblock size and overall - * fragmentation is not impaired. Other allocation types must - * capture pages from their own migratelist because otherwise they - * could pollute other pageblocks like MIGRATE_MOVABLE with - * difficult to move pages and making fragmentation worse overall. - */ - if (cc->migratetype == MIGRATE_MOVABLE) { - mtype_low = 0; - mtype_high = MIGRATE_PCPTYPES; - } else { - mtype_low = cc->migratetype; - mtype_high = cc->migratetype + 1; - } - - /* Speculatively examine the free lists without zone lock */ - for (mtype = mtype_low; mtype < mtype_high; mtype++) { - int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct page *page; - struct free_area *area; - area = &(cc->zone->free_area[order]); - if (list_empty(&area->free_list[mtype])) - continue; - - /* Take the lock and attempt capture of the page */ - if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) - return; - if (!list_empty(&area->free_list[mtype])) { - page = list_entry(area->free_list[mtype].next, - struct page, lru); - if (capture_free_page(page, cc->order, mtype)) { - spin_unlock_irqrestore(&cc->zone->lock, - flags); - *cc->page = page; - return; - } - } - spin_unlock_irqrestore(&cc->zone->lock, flags); - } - } -} - static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; @@ -1054,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } - - /* Capture a page now if it is a suitable size */ - compact_capture_page(cc); } out: @@ -1069,8 +1007,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended, - struct page **page) + bool sync, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1080,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1110,7 +1046,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended, struct page **page) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1136,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, int status; status = compact_zone_order(zone, order, gfp_mask, sync, - contended, page); + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -1192,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order) struct compact_control cc = { .order = order, .sync = false, - .page = NULL, }; return __compact_pgdat(pgdat, &cc); @@ -1203,7 +1138,6 @@ static int compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, - .page = NULL, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index d597f94cc205..9ba21100ebf3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -135,7 +135,6 @@ struct compact_control { int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; bool contended; /* True if a lock was contended */ - struct page **page; /* Page captured of requested size */ }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c957805a7f0e..df2022ff0c8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1384,14 +1384,8 @@ void split_page(struct page *page, unsigned int order) set_page_refcounted(page + i); } -/* - * Similar to the split_page family of functions except that the page - * required at the given order and being isolated now to prevent races - * with parallel allocators - */ -int capture_free_page(struct page *page, int alloc_order, int migratetype) +static int __isolate_free_page(struct page *page, unsigned int order) { - unsigned int order; unsigned long watermark; struct zone *zone; int mt; @@ -1399,7 +1393,6 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) BUG_ON(!PageBuddy(page)); zone = page_zone(page); - order = page_order(page); mt = get_pageblock_migratetype(page); if (mt != MIGRATE_ISOLATE) { @@ -1408,7 +1401,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; - __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); + __mod_zone_freepage_state(zone, -(1UL << order), mt); } /* Remove page from free list */ @@ -1416,11 +1409,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) zone->free_area[order].nr_free--; rmv_page_order(page); - if (alloc_order != order) - expand(zone, page, alloc_order, order, - &zone->free_area[order], migratetype); - - /* Set the pageblock if the captured page is at least a pageblock */ + /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -1431,7 +1420,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) } } - return 1UL << alloc_order; + return 1UL << order; } /* @@ -1449,10 +1438,9 @@ int split_free_page(struct page *page) unsigned int order; int nr_pages; - BUG_ON(!PageBuddy(page)); order = page_order(page); - nr_pages = capture_free_page(page, order, 0); + nr_pages = __isolate_free_page(page, order); if (!nr_pages) return 0; @@ -2136,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page = NULL; - if (!order) return NULL; @@ -2149,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration, - contended_compaction, &page); + contended_compaction); current->flags &= ~PF_MEMALLOC; - /* If compaction captured a page, prep and use it */ - if (page) { - prep_new_page(page, order, gfp_mask); - goto got_page; - } - if (*did_some_progress != COMPACT_SKIPPED) { + struct page *page; + /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); put_cpu(); @@ -2168,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, migratetype); if (page) { -got_page: preferred_zone->compact_blockskip_flush = false; preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; -- cgit v1.2.3-59-g8ed1b From 3cb7a56344ca45ee56d71c5f8fe9f922306bff1f Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 11 Jan 2013 14:32:20 -0800 Subject: lib/rbtree.c: avoid the use of non-static __always_inline lib/rbtree.c declared __rb_erase_color() as __always_inline void, and then exported it with EXPORT_SYMBOL. This was because __rb_erase_color() must be exported for augmented rbtree users, but it must also be inlined into rb_erase() so that the dummy callback can get optimized out of that call site. (Actually with a modern compiler, none of the dummy callback functions should even be generated as separate text functions). The above usage is legal C, but it was unusual enough for some compilers to warn about it. This change makes things more explicit, with a static __always_inline ____rb_erase_color function for use in rb_erase(), and a separate non-inline __rb_erase_color function for use in rb_erase_augmented call sites. Signed-off-by: Michel Lespinasse Reported-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 14 +++++++++++--- lib/rbtree.c | 20 +++++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 2ac60c9cf644..fea49b5da12a 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -123,9 +123,9 @@ __rb_change_child(struct rb_node *old, struct rb_node *new, extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); -static __always_inline void -rb_erase_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) +static __always_inline struct rb_node * +__rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right, *tmp = node->rb_left; struct rb_node *parent, *rebalance; @@ -217,6 +217,14 @@ rb_erase_augmented(struct rb_node *node, struct rb_root *root, } augment->propagate(tmp, NULL); + return rebalance; +} + +static __always_inline void +rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } diff --git a/lib/rbtree.c b/lib/rbtree.c index 4f56a11d67fa..c0e31fe2fabf 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -194,8 +194,12 @@ __rb_insert(struct rb_node *node, struct rb_root *root, } } -__always_inline void -__rb_erase_color(struct rb_node *parent, struct rb_root *root, +/* + * Inline version for rb_erase() use - we want to be able to inline + * and eliminate the dummy_rotate callback there + */ +static __always_inline void +____rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; @@ -355,6 +359,13 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, } } } + +/* Non-inline version for rb_erase_augmented() use */ +void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + ____rb_erase_color(parent, root, augment_rotate); +} EXPORT_SYMBOL(__rb_erase_color); /* @@ -380,7 +391,10 @@ EXPORT_SYMBOL(rb_insert_color); void rb_erase(struct rb_node *node, struct rb_root *root) { - rb_erase_augmented(node, root, &dummy_callbacks); + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, root, dummy_rotate); } EXPORT_SYMBOL(rb_erase); -- cgit v1.2.3-59-g8ed1b From d07d7507bfb4e23735c9b83e397c43e1e8a173e8 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 10 Jan 2013 23:19:10 +0000 Subject: net, wireless: overwrite default_ethtool_ops Since: commit 2c60db037034d27f8c636403355d52872da92f81 Author: Eric Dumazet Date: Sun Sep 16 09:17:26 2012 +0000 net: provide a default dev->ethtool_ops wireless core does not correctly assign ethtool_ops. After alloc_netdev*() call, some cfg80211 drivers provide they own ethtool_ops, but some do not. For them, wireless core provide generic cfg80211_ethtool_ops, which is assigned in NETDEV_REGISTER notify call: if (!dev->ethtool_ops) dev->ethtool_ops = &cfg80211_ethtool_ops; But after Eric's commit, dev->ethtool_ops is no longer NULL (on cfg80211 drivers without custom ethtool_ops), but points to &default_ethtool_ops. In order to fix the problem, provide function which will overwrite default_ethtool_ops and use it by wireless core. Signed-off-by: Stanislaw Gruszka Acked-by: Johannes Berg Acked-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 8 ++++++++ net/wireless/core.c | 3 +-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c599e4782d45..9ef07d0868b6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -60,6 +60,9 @@ struct wireless_dev; #define SET_ETHTOOL_OPS(netdev,ops) \ ( (netdev)->ethtool_ops = (ops) ) +extern void netdev_set_default_ethtool_ops(struct net_device *dev, + const struct ethtool_ops *ops); + /* hardware address assignment types */ #define NET_ADDR_PERM 0 /* address is permanent (default) */ #define NET_ADDR_RANDOM 1 /* address is generated randomly */ diff --git a/net/core/dev.c b/net/core/dev.c index 515473ee52cb..f64e439b4a00 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6121,6 +6121,14 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) static const struct ethtool_ops default_ethtool_ops; +void netdev_set_default_ethtool_ops(struct net_device *dev, + const struct ethtool_ops *ops) +{ + if (dev->ethtool_ops == &default_ethtool_ops) + dev->ethtool_ops = ops; +} +EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); + /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for diff --git a/net/wireless/core.c b/net/wireless/core.c index 14d990400354..b677eab55b68 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -866,8 +866,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, /* allow mac80211 to determine the timeout */ wdev->ps_timeout = -1; - if (!dev->ethtool_ops) - dev->ethtool_ops = &cfg80211_ethtool_ops; + netdev_set_default_ethtool_ops(dev, &cfg80211_ethtool_ops); if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || -- cgit v1.2.3-59-g8ed1b From 5dbbaf2de89613d19a9286d4db0a535ca2735d26 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 14 Jan 2013 07:12:19 +0000 Subject: tun: fix LSM/SELinux labeling of tun/tap devices This patch corrects some problems with LSM/SELinux that were introduced with the multiqueue patchset. The problem stems from the fact that the multiqueue work changed the relationship between the tun device and its associated socket; before the socket persisted for the life of the device, however after the multiqueue changes the socket only persisted for the life of the userspace connection (fd open). For non-persistent devices this is not an issue, but for persistent devices this can cause the tun device to lose its SELinux label. We correct this problem by adding an opaque LSM security blob to the tun device struct which allows us to have the LSM security state, e.g. SELinux labeling information, persist for the lifetime of the tun device. In the process we tweak the LSM hooks to work with this new approach to TUN device/socket labeling and introduce a new LSM hook, security_tun_dev_attach_queue(), to approve requests to attach to a TUN queue via TUNSETQUEUE. The SELinux code has been adjusted to match the new LSM hooks, the other LSMs do not make use of the LSM TUN controls. This patch makes use of the recently added "tun_socket:attach_queue" permission to restrict access to the TUNSETQUEUE operation. On older SELinux policies which do not define the "tun_socket:attach_queue" permission the access control decision for TUNSETQUEUE will be handled according to the SELinux policy's unknown permission setting. Signed-off-by: Paul Moore Acked-by: Eric Paris Tested-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 23 +++++++++++---- include/linux/security.h | 59 ++++++++++++++++++++++++++++++--------- security/capability.c | 24 ++++++++++++++-- security/security.c | 28 +++++++++++++++---- security/selinux/hooks.c | 50 +++++++++++++++++++++++++-------- security/selinux/include/objsec.h | 4 +++ 6 files changed, 151 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index af372d0957fe..c81680dc10eb 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -185,6 +185,7 @@ struct tun_struct { unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; + void *security; }; static inline u32 tun_hashfn(u32 rxhash) @@ -490,6 +491,10 @@ static int tun_attach(struct tun_struct *tun, struct file *file) struct tun_file *tfile = file->private_data; int err; + err = security_tun_dev_attach(tfile->socket.sk, tun->security); + if (err < 0) + goto out; + err = -EINVAL; if (rtnl_dereference(tfile->tun)) goto out; @@ -1373,6 +1378,7 @@ static void tun_free_netdev(struct net_device *dev) BUG_ON(!(list_empty(&tun->disabled))); tun_flow_uninit(tun); + security_tun_dev_free_security(tun->security); free_netdev(dev); } @@ -1562,7 +1568,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (tun_not_capable(tun)) return -EPERM; - err = security_tun_dev_attach(tfile->socket.sk); + err = security_tun_dev_open(tun->security); if (err < 0) return err; @@ -1619,7 +1625,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) spin_lock_init(&tun->lock); - security_tun_dev_post_create(&tfile->sk); + err = security_tun_dev_alloc_security(&tun->security); + if (err < 0) + goto err_free_dev; tun_net_init(dev); @@ -1789,10 +1797,14 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) if (ifr->ifr_flags & IFF_ATTACH_QUEUE) { tun = tfile->detached; - if (!tun) + if (!tun) { ret = -EINVAL; - else - ret = tun_attach(tun, file); + goto unlock; + } + ret = security_tun_dev_attach_queue(tun->security); + if (ret < 0) + goto unlock; + ret = tun_attach(tun, file); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & TUN_TAP_MQ)) @@ -1802,6 +1814,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) } else ret = -EINVAL; +unlock: rtnl_unlock(); return ret; } diff --git a/include/linux/security.h b/include/linux/security.h index 0f6afc657f77..eee7478cda70 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -989,17 +989,29 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * tells the LSM to decrement the number of secmark labeling rules loaded * @req_classify_flow: * Sets the flow's sid to the openreq sid. + * @tun_dev_alloc_security: + * This hook allows a module to allocate a security structure for a TUN + * device. + * @security pointer to a security structure pointer. + * Returns a zero on success, negative values on failure. + * @tun_dev_free_security: + * This hook allows a module to free the security structure for a TUN + * device. + * @security pointer to the TUN device's security structure * @tun_dev_create: * Check permissions prior to creating a new TUN device. - * @tun_dev_post_create: - * This hook allows a module to update or allocate a per-socket security - * structure. - * @sk contains the newly created sock structure. + * @tun_dev_attach_queue: + * Check permissions prior to attaching to a TUN device queue. + * @security pointer to the TUN device's security structure. * @tun_dev_attach: - * Check permissions prior to attaching to a persistent TUN device. This - * hook can also be used by the module to update any security state + * This hook can be used by the module to update any security state * associated with the TUN device's sock structure. * @sk contains the existing sock structure. + * @security pointer to the TUN device's security structure. + * @tun_dev_open: + * This hook can be used by the module to update any security state + * associated with the TUN device's security structure. + * @security pointer to the TUN devices's security structure. * * Security hooks for XFRM operations. * @@ -1620,9 +1632,12 @@ struct security_operations { void (*secmark_refcount_inc) (void); void (*secmark_refcount_dec) (void); void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl); - int (*tun_dev_create)(void); - void (*tun_dev_post_create)(struct sock *sk); - int (*tun_dev_attach)(struct sock *sk); + int (*tun_dev_alloc_security) (void **security); + void (*tun_dev_free_security) (void *security); + int (*tun_dev_create) (void); + int (*tun_dev_attach_queue) (void *security); + int (*tun_dev_attach) (struct sock *sk, void *security); + int (*tun_dev_open) (void *security); #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_NETWORK_XFRM @@ -2566,9 +2581,12 @@ void security_inet_conn_established(struct sock *sk, int security_secmark_relabel_packet(u32 secid); void security_secmark_refcount_inc(void); void security_secmark_refcount_dec(void); +int security_tun_dev_alloc_security(void **security); +void security_tun_dev_free_security(void *security); int security_tun_dev_create(void); -void security_tun_dev_post_create(struct sock *sk); -int security_tun_dev_attach(struct sock *sk); +int security_tun_dev_attach_queue(void *security); +int security_tun_dev_attach(struct sock *sk, void *security); +int security_tun_dev_open(void *security); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct sock *sock, @@ -2733,16 +2751,31 @@ static inline void security_secmark_refcount_dec(void) { } +static inline int security_tun_dev_alloc_security(void **security) +{ + return 0; +} + +static inline void security_tun_dev_free_security(void *security) +{ +} + static inline int security_tun_dev_create(void) { return 0; } -static inline void security_tun_dev_post_create(struct sock *sk) +static inline int security_tun_dev_attach_queue(void *security) +{ + return 0; +} + +static inline int security_tun_dev_attach(struct sock *sk, void *security) { + return 0; } -static inline int security_tun_dev_attach(struct sock *sk) +static inline int security_tun_dev_open(void *security) { return 0; } diff --git a/security/capability.c b/security/capability.c index 0fe5a026aef8..579775088967 100644 --- a/security/capability.c +++ b/security/capability.c @@ -709,16 +709,31 @@ static void cap_req_classify_flow(const struct request_sock *req, { } +static int cap_tun_dev_alloc_security(void **security) +{ + return 0; +} + +static void cap_tun_dev_free_security(void *security) +{ +} + static int cap_tun_dev_create(void) { return 0; } -static void cap_tun_dev_post_create(struct sock *sk) +static int cap_tun_dev_attach_queue(void *security) +{ + return 0; +} + +static int cap_tun_dev_attach(struct sock *sk, void *security) { + return 0; } -static int cap_tun_dev_attach(struct sock *sk) +static int cap_tun_dev_open(void *security) { return 0; } @@ -1050,8 +1065,11 @@ void __init security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, secmark_refcount_inc); set_to_cap_if_null(ops, secmark_refcount_dec); set_to_cap_if_null(ops, req_classify_flow); + set_to_cap_if_null(ops, tun_dev_alloc_security); + set_to_cap_if_null(ops, tun_dev_free_security); set_to_cap_if_null(ops, tun_dev_create); - set_to_cap_if_null(ops, tun_dev_post_create); + set_to_cap_if_null(ops, tun_dev_open); + set_to_cap_if_null(ops, tun_dev_attach_queue); set_to_cap_if_null(ops, tun_dev_attach); #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_NETWORK_XFRM diff --git a/security/security.c b/security/security.c index daa97f4ac9d1..7b88c6aeaed4 100644 --- a/security/security.c +++ b/security/security.c @@ -1254,24 +1254,42 @@ void security_secmark_refcount_dec(void) } EXPORT_SYMBOL(security_secmark_refcount_dec); +int security_tun_dev_alloc_security(void **security) +{ + return security_ops->tun_dev_alloc_security(security); +} +EXPORT_SYMBOL(security_tun_dev_alloc_security); + +void security_tun_dev_free_security(void *security) +{ + security_ops->tun_dev_free_security(security); +} +EXPORT_SYMBOL(security_tun_dev_free_security); + int security_tun_dev_create(void) { return security_ops->tun_dev_create(); } EXPORT_SYMBOL(security_tun_dev_create); -void security_tun_dev_post_create(struct sock *sk) +int security_tun_dev_attach_queue(void *security) { - return security_ops->tun_dev_post_create(sk); + return security_ops->tun_dev_attach_queue(security); } -EXPORT_SYMBOL(security_tun_dev_post_create); +EXPORT_SYMBOL(security_tun_dev_attach_queue); -int security_tun_dev_attach(struct sock *sk) +int security_tun_dev_attach(struct sock *sk, void *security) { - return security_ops->tun_dev_attach(sk); + return security_ops->tun_dev_attach(sk, security); } EXPORT_SYMBOL(security_tun_dev_attach); +int security_tun_dev_open(void *security) +{ + return security_ops->tun_dev_open(security); +} +EXPORT_SYMBOL(security_tun_dev_open); + #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_NETWORK_XFRM diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 61a53367d029..ef26e9611ffb 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4399,6 +4399,24 @@ static void selinux_req_classify_flow(const struct request_sock *req, fl->flowi_secid = req->secid; } +static int selinux_tun_dev_alloc_security(void **security) +{ + struct tun_security_struct *tunsec; + + tunsec = kzalloc(sizeof(*tunsec), GFP_KERNEL); + if (!tunsec) + return -ENOMEM; + tunsec->sid = current_sid(); + + *security = tunsec; + return 0; +} + +static void selinux_tun_dev_free_security(void *security) +{ + kfree(security); +} + static int selinux_tun_dev_create(void) { u32 sid = current_sid(); @@ -4414,8 +4432,17 @@ static int selinux_tun_dev_create(void) NULL); } -static void selinux_tun_dev_post_create(struct sock *sk) +static int selinux_tun_dev_attach_queue(void *security) { + struct tun_security_struct *tunsec = security; + + return avc_has_perm(current_sid(), tunsec->sid, SECCLASS_TUN_SOCKET, + TUN_SOCKET__ATTACH_QUEUE, NULL); +} + +static int selinux_tun_dev_attach(struct sock *sk, void *security) +{ + struct tun_security_struct *tunsec = security; struct sk_security_struct *sksec = sk->sk_security; /* we don't currently perform any NetLabel based labeling here and it @@ -4425,20 +4452,19 @@ static void selinux_tun_dev_post_create(struct sock *sk) * cause confusion to the TUN user that had no idea network labeling * protocols were being used */ - /* see the comments in selinux_tun_dev_create() about why we don't use - * the sockcreate SID here */ - - sksec->sid = current_sid(); + sksec->sid = tunsec->sid; sksec->sclass = SECCLASS_TUN_SOCKET; + + return 0; } -static int selinux_tun_dev_attach(struct sock *sk) +static int selinux_tun_dev_open(void *security) { - struct sk_security_struct *sksec = sk->sk_security; + struct tun_security_struct *tunsec = security; u32 sid = current_sid(); int err; - err = avc_has_perm(sid, sksec->sid, SECCLASS_TUN_SOCKET, + err = avc_has_perm(sid, tunsec->sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__RELABELFROM, NULL); if (err) return err; @@ -4446,8 +4472,7 @@ static int selinux_tun_dev_attach(struct sock *sk) TUN_SOCKET__RELABELTO, NULL); if (err) return err; - - sksec->sid = sid; + tunsec->sid = sid; return 0; } @@ -5642,9 +5667,12 @@ static struct security_operations selinux_ops = { .secmark_refcount_inc = selinux_secmark_refcount_inc, .secmark_refcount_dec = selinux_secmark_refcount_dec, .req_classify_flow = selinux_req_classify_flow, + .tun_dev_alloc_security = selinux_tun_dev_alloc_security, + .tun_dev_free_security = selinux_tun_dev_free_security, .tun_dev_create = selinux_tun_dev_create, - .tun_dev_post_create = selinux_tun_dev_post_create, + .tun_dev_attach_queue = selinux_tun_dev_attach_queue, .tun_dev_attach = selinux_tun_dev_attach, + .tun_dev_open = selinux_tun_dev_open, #ifdef CONFIG_SECURITY_NETWORK_XFRM .xfrm_policy_alloc_security = selinux_xfrm_policy_alloc, diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 26c7eee1c309..aa47bcabb5f6 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -110,6 +110,10 @@ struct sk_security_struct { u16 sclass; /* sock security class */ }; +struct tun_security_struct { + u32 sid; /* SID for the tun device sockets */ +}; + struct key_security_struct { u32 sid; /* SID of key */ }; -- cgit v1.2.3-59-g8ed1b