diff options
-rw-r--r-- | MAINTAINERS | 6 | ||||
-rw-r--r-- | block/partitions/efi.c | 7 | ||||
-rw-r--r-- | drivers/gpio/gpio-lynxpoint.c | 5 | ||||
-rw-r--r-- | drivers/gpio/gpiolib.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-snap-persistent.c | 18 | ||||
-rw-r--r-- | fs/buffer.c | 14 | ||||
-rw-r--r-- | fs/ext3/namei.c | 5 | ||||
-rw-r--r-- | fs/ext4/namei.c | 5 | ||||
-rw-r--r-- | fs/proc/inode.c | 10 | ||||
-rw-r--r-- | fs/proc/task_mmu.c | 4 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 50 | ||||
-rw-r--r-- | include/linux/sched.h | 7 | ||||
-rw-r--r-- | ipc/sem.c | 42 | ||||
-rw-r--r-- | ipc/util.c | 27 | ||||
-rw-r--r-- | lib/percpu-refcount.c | 3 | ||||
-rw-r--r-- | mm/filemap.c | 11 | ||||
-rw-r--r-- | mm/huge_memory.c | 10 | ||||
-rw-r--r-- | mm/hugetlb.c | 17 | ||||
-rw-r--r-- | mm/memcontrol.c | 143 | ||||
-rw-r--r-- | mm/memory.c | 20 | ||||
-rw-r--r-- | mm/migrate.c | 2 | ||||
-rw-r--r-- | mm/mprotect.c | 7 | ||||
-rw-r--r-- | mm/mremap.c | 5 | ||||
-rw-r--r-- | mm/oom_kill.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 10 | ||||
-rw-r--r-- | mm/swapfile.c | 4 | ||||
-rw-r--r-- | mm/vmscan.c | 1 | ||||
-rw-r--r-- | mm/zswap.c | 4 | ||||
-rw-r--r-- | tools/testing/selftests/timers/posix_timers.c | 2 |
29 files changed, 242 insertions, 205 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 72b1e5c2378a..a7c34ef3509d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3624,6 +3624,12 @@ L: linux-scsi@vger.kernel.org S: Odd Fixes (e.g., new signatures) F: drivers/scsi/fdomain.* +GCOV BASED KERNEL PROFILING +M: Peter Oberparleiter <oberpar@linux.vnet.ibm.com> +S: Maintained +F: kernel/gcov/ +F: Documentation/gcov.txt + GDT SCSI DISK ARRAY CONTROLLER DRIVER M: Achim Leubner <achim_leubner@adaptec.com> L: linux-scsi@vger.kernel.org diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 1eb09ee5311b..a8287b49d062 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -222,11 +222,16 @@ check_hybrid: * the disk size. * * Hybrid MBRs do not necessarily comply with this. + * + * Consider a bad value here to be a warning to support dd'ing + * an image from a smaller disk to a larger disk. */ if (ret == GPT_MBR_PROTECTIVE) { sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) - ret = 0; + pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", + sz, min_t(uint32_t, + total_sectors - 1, 0xFFFFFFFF)); } done: return ret; diff --git a/drivers/gpio/gpio-lynxpoint.c b/drivers/gpio/gpio-lynxpoint.c index 2d9ca6055e5e..41b5913ddabe 100644 --- a/drivers/gpio/gpio-lynxpoint.c +++ b/drivers/gpio/gpio-lynxpoint.c @@ -248,14 +248,15 @@ static void lp_gpio_irq_handler(unsigned irq, struct irq_desc *desc) struct lp_gpio *lg = irq_data_get_irq_handler_data(data); struct irq_chip *chip = irq_data_get_irq_chip(data); u32 base, pin, mask; - unsigned long reg, pending; + unsigned long reg, ena, pending; unsigned virq; /* check from GPIO controller which pin triggered the interrupt */ for (base = 0; base < lg->chip.ngpio; base += 32) { reg = lp_gpio_reg(&lg->chip, base, LP_INT_STAT); + ena = lp_gpio_reg(&lg->chip, base, LP_INT_ENABLE); - while ((pending = inl(reg))) { + while ((pending = (inl(reg) & inl(ena)))) { pin = __ffs(pending); mask = BIT(pin); /* Clear before handling so we don't lose an edge */ diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 86ef3461ec06..0dee0e0c247a 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -136,7 +136,7 @@ static struct gpio_desc *gpio_to_desc(unsigned gpio) */ static int desc_to_gpio(const struct gpio_desc *desc) { - return desc->chip->base + gpio_chip_hwgpio(desc); + return desc - &gpio_desc[0]; } @@ -1398,7 +1398,7 @@ static int gpiod_request(struct gpio_desc *desc, const char *label) int status = -EPROBE_DEFER; unsigned long flags; - if (!desc || !desc->chip) { + if (!desc) { pr_warn("%s: invalid GPIO\n", __func__); return -EINVAL; } @@ -1406,6 +1406,8 @@ static int gpiod_request(struct gpio_desc *desc, const char *label) spin_lock_irqsave(&gpio_lock, flags); chip = desc->chip; + if (chip == NULL) + goto done; if (!try_module_get(chip->owner)) goto done; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 4caa8e6d59d7..2d2b1b7588d7 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -269,6 +269,14 @@ static chunk_t area_location(struct pstore *ps, chunk_t area) return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area); } +static void skip_metadata(struct pstore *ps) +{ + uint32_t stride = ps->exceptions_per_area + 1; + chunk_t next_free = ps->next_free; + if (sector_div(next_free, stride) == NUM_SNAPSHOT_HDR_CHUNKS) + ps->next_free++; +} + /* * Read or write a metadata area. Remembering to skip the first * chunk which holds the header. @@ -502,6 +510,8 @@ static int read_exceptions(struct pstore *ps, ps->current_area--; + skip_metadata(ps); + return 0; } @@ -616,8 +626,6 @@ static int persistent_prepare_exception(struct dm_exception_store *store, struct dm_exception *e) { struct pstore *ps = get_info(store); - uint32_t stride; - chunk_t next_free; sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); /* Is there enough room ? */ @@ -630,10 +638,8 @@ static int persistent_prepare_exception(struct dm_exception_store *store, * Move onto the next free pending, making sure to take * into account the location of the metadata chunks. */ - stride = (ps->exceptions_per_area + 1); - next_free = ++ps->next_free; - if (sector_div(next_free, stride) == 1) - ps->next_free++; + ps->next_free++; + skip_metadata(ps); atomic_inc(&ps->pending_count); return 0; diff --git a/fs/buffer.c b/fs/buffer.c index 4d7433534f5c..6024877335ca 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block, struct buffer_head *bh; sector_t end_block; int ret = 0; /* Will call free_more_memory() */ + gfp_t gfp_mask; - page = find_or_create_page(inode->i_mapping, index, - (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); + gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; + gfp_mask |= __GFP_MOVABLE; + /* + * XXX: __getblk_slow() can not really deal with failure and + * will endlessly loop on improvised global reclaim. Prefer + * looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp_mask |= __GFP_NOFAIL; + + page = find_or_create_page(inode->i_mapping, index, gfp_mask); if (!page) return ret; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 1194b1f0f839..f8cde46de9cd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1783,7 +1783,7 @@ retry: d_tmpfile(dentry, inode); err = ext3_orphan_add(handle, inode); if (err) - goto err_drop_inode; + goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } @@ -1791,10 +1791,9 @@ retry: if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; -err_drop_inode: +err_unlock_inode: ext3_journal_stop(handle); unlock_new_inode(inode); - iput(inode); return err; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1bec5a5c1e45..5a0408d7b114 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2319,7 +2319,7 @@ retry: d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); if (err) - goto err_drop_inode; + goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } @@ -2328,10 +2328,9 @@ retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; -err_drop_inode: +err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); - iput(inode); return err; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9f8ef9b7674d..8eaa1ba793fc 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -288,10 +288,14 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct proc_dir_entry *pde = PDE(file_inode(file)); - int rv = -EIO; - unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long rv = -EIO; + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; if (use_pde(pde)) { - get_unmapped_area = pde->proc_fops->get_unmapped_area; +#ifdef CONFIG_MMU + get_unmapped_area = current->mm->get_unmapped_area; +#endif + if (pde->proc_fops->get_unmapped_area) + get_unmapped_area = pde->proc_fops->get_unmapped_area; if (get_unmapped_area) rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); unuse_pde(pde); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7366e9d63cee..390bdab01c3c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -941,6 +941,8 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, frame = pte_pfn(pte); flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) @@ -960,7 +962,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (page && !PageAnon(page)) flags |= PM_FILE; - if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) + if ((vma->vm_flags & VM_SOFTDIRTY)) flags2 |= __PM_SOFT_DIRTY; *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ecc82b37c4cc..b3e7a667e03c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -137,47 +137,24 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); -/** - * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task - * @new: true to enable, false to disable - * - * Toggle whether a failed memcg charge should invoke the OOM killer - * or just return -ENOMEM. Returns the previous toggle state. - * - * NOTE: Any path that enables the OOM killer before charging must - * call mem_cgroup_oom_synchronize() afterward to finalize the - * OOM handling and clean up. - */ -static inline bool mem_cgroup_toggle_oom(bool new) +static inline void mem_cgroup_oom_enable(void) { - bool old; - - old = current->memcg_oom.may_oom; - current->memcg_oom.may_oom = new; - - return old; + WARN_ON(current->memcg_oom.may_oom); + current->memcg_oom.may_oom = 1; } -static inline void mem_cgroup_enable_oom(void) +static inline void mem_cgroup_oom_disable(void) { - bool old = mem_cgroup_toggle_oom(true); - - WARN_ON(old == true); -} - -static inline void mem_cgroup_disable_oom(void) -{ - bool old = mem_cgroup_toggle_oom(false); - - WARN_ON(old == false); + WARN_ON(!current->memcg_oom.may_oom); + current->memcg_oom.may_oom = 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { - return p->memcg_oom.in_memcg_oom; + return p->memcg_oom.memcg; } -bool mem_cgroup_oom_synchronize(void); +bool mem_cgroup_oom_synchronize(bool wait); #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; @@ -402,16 +379,11 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, { } -static inline bool mem_cgroup_toggle_oom(bool new) -{ - return false; -} - -static inline void mem_cgroup_enable_oom(void) +static inline void mem_cgroup_oom_enable(void) { } -static inline void mem_cgroup_disable_oom(void) +static inline void mem_cgroup_oom_disable(void) { } @@ -420,7 +392,7 @@ static inline bool task_in_memcg_oom(struct task_struct *p) return false; } -static inline bool mem_cgroup_oom_synchronize(void) +static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 6682da36b293..e27baeeda3f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1394,11 +1394,10 @@ struct task_struct { } memcg_batch; unsigned int memcg_kmem_skip_account; struct memcg_oom_info { + struct mem_cgroup *memcg; + gfp_t gfp_mask; + int order; unsigned int may_oom:1; - unsigned int in_memcg_oom:1; - unsigned int oom_locked:1; - int wakeups; - struct mem_cgroup *wait_on_memcg; } memcg_oom; #endif #ifdef CONFIG_UPROBES diff --git a/ipc/sem.c b/ipc/sem.c index 8c4f59b0204a..db9d241af133 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1282,6 +1282,12 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, sem_lock(sma, NULL, -1); + if (sma->sem_perm.deleted) { + sem_unlock(sma, -1); + rcu_read_unlock(); + return -EIDRM; + } + curr = &sma->sem_base[semnum]; ipc_assert_locked_object(&sma->sem_perm); @@ -1336,12 +1342,14 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int i; sem_lock(sma, NULL, -1); + if (sma->sem_perm.deleted) { + err = -EIDRM; + goto out_unlock; + } if(nsems > SEMMSL_FAST) { if (!ipc_rcu_getref(sma)) { - sem_unlock(sma, -1); - rcu_read_unlock(); err = -EIDRM; - goto out_free; + goto out_unlock; } sem_unlock(sma, -1); rcu_read_unlock(); @@ -1354,10 +1362,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, rcu_read_lock(); sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { - sem_unlock(sma, -1); - rcu_read_unlock(); err = -EIDRM; - goto out_free; + goto out_unlock; } } for (i = 0; i < sma->sem_nsems; i++) @@ -1375,8 +1381,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, struct sem_undo *un; if (!ipc_rcu_getref(sma)) { - rcu_read_unlock(); - return -EIDRM; + err = -EIDRM; + goto out_rcu_wakeup; } rcu_read_unlock(); @@ -1404,10 +1410,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, rcu_read_lock(); sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { - sem_unlock(sma, -1); - rcu_read_unlock(); err = -EIDRM; - goto out_free; + goto out_unlock; } for (i = 0; i < nsems; i++) @@ -1431,6 +1435,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, goto out_rcu_wakeup; sem_lock(sma, NULL, -1); + if (sma->sem_perm.deleted) { + err = -EIDRM; + goto out_unlock; + } curr = &sma->sem_base[semnum]; switch (cmd) { @@ -1836,6 +1844,10 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, if (error) goto out_rcu_wakeup; + error = -EIDRM; + locknum = sem_lock(sma, sops, nsops); + if (sma->sem_perm.deleted) + goto out_unlock_free; /* * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID @@ -1843,8 +1855,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, * This case can be detected checking un->semid. The existence of * "un" itself is guaranteed by rcu. */ - error = -EIDRM; - locknum = sem_lock(sma, sops, nsops); if (un && un->semid == -1) goto out_unlock_free; @@ -2057,6 +2067,12 @@ void exit_sem(struct task_struct *tsk) } sem_lock(sma, NULL, -1); + /* exit_sem raced with IPC_RMID, nothing to do */ + if (sma->sem_perm.deleted) { + sem_unlock(sma, -1); + rcu_read_unlock(); + continue; + } un = __lookup_undo(ulp, semid); if (un == NULL) { /* exit_sem raced with IPC_RMID+semget() that created diff --git a/ipc/util.c b/ipc/util.c index fdb8ae740775..7684f41bce76 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -17,12 +17,27 @@ * Pavel Emelianov <xemul@openvz.org> * * General sysv ipc locking scheme: - * when doing ipc id lookups, take the ids->rwsem - * rcu_read_lock() - * obtain the ipc object (kern_ipc_perm) - * perform security, capabilities, auditing and permission checks, etc. - * acquire the ipc lock (kern_ipc_perm.lock) throught ipc_lock_object() - * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands) + * rcu_read_lock() + * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr + * tree. + * - perform initial checks (capabilities, auditing and permission, + * etc). + * - perform read-only operations, such as STAT, INFO commands. + * acquire the ipc lock (kern_ipc_perm.lock) through + * ipc_lock_object() + * - perform data updates, such as SET, RMID commands and + * mechanism-specific operations (semop/semtimedop, + * msgsnd/msgrcv, shmat/shmdt). + * drop the ipc lock, through ipc_unlock_object(). + * rcu_read_unlock() + * + * The ids->rwsem must be taken when: + * - creating, removing and iterating the existing entries in ipc + * identifier sets. + * - iterating through files under /proc/sysvipc/ + * + * Note that sems have a special fast path that avoids kern_ipc_perm.lock - + * see sem_lock(). */ #include <linux/mm.h> diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 7deeb6297a48..1a53d497a8c5 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -53,6 +53,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) ref->release = release; return 0; } +EXPORT_SYMBOL_GPL(percpu_ref_init); /** * percpu_ref_cancel_init - cancel percpu_ref_init() @@ -84,6 +85,7 @@ void percpu_ref_cancel_init(struct percpu_ref *ref) free_percpu(ref->pcpu_count); } } +EXPORT_SYMBOL_GPL(percpu_ref_cancel_init); static void percpu_ref_kill_rcu(struct rcu_head *rcu) { @@ -156,3 +158,4 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } +EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); diff --git a/mm/filemap.c b/mm/filemap.c index 1e6aec4a2d2e..ae4846ff4849 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; - bool memcg_oom; pgoff_t size; int ret = 0; @@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* - * Do we have something in the page cache already? Either - * way, try readahead, but disable the memcg OOM killer for it - * as readahead is optional and no errors are propagated up - * the fault stack. The OOM killer is enabled while trying to - * instantiate the faulting page individually below. + * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { @@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - memcg_oom = mem_cgroup_toggle_oom(false); do_async_mmap_readahead(vma, ra, file, page, offset); - mem_cgroup_toggle_oom(memcg_oom); } else if (!page) { /* No page in the page cache at all */ - memcg_oom = mem_cgroup_toggle_oom(false); do_sync_mmap_readahead(vma, ra, file, offset); - mem_cgroup_toggle_oom(memcg_oom); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7489884682d8..610e3df2768a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2697,6 +2697,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; +again: mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); if (unlikely(!pmd_trans_huge(*pmd))) { @@ -2719,7 +2720,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, split_huge_page(page); put_page(page); - BUG_ON(pmd_trans_huge(*pmd)); + + /* + * We don't always have down_write of mmap_sem here: a racing + * do_huge_pmd_wp_page() might have copied-on-write to another + * huge page before our split_huge_page() got the anon_vma lock. + */ + if (unlikely(pmd_trans_huge(*pmd))) + goto again; } void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b49579c7f2a5..0b7656e804d1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -653,6 +653,7 @@ static void free_huge_page(struct page *page) BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); restore_reserve = PagePrivate(page); + ClearPagePrivate(page); spin_lock(&hugetlb_lock); hugetlb_cgroup_uncharge_page(hstate_index(h), @@ -695,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); __SetPageHead(page); + __ClearPageReserved(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + /* + * For gigantic hugepages allocated through bootmem at + * boot, it's safer to be consistent with the not-gigantic + * hugepages and clear the PG_reserved bit from all tail pages + * too. Otherwse drivers using get_user_pages() to access tail + * pages may get the reference counting wrong if they see + * PG_reserved set on a tail page (despite the head page not + * having PG_reserved set). Enforcing this consistency between + * head and tail pages allows drivers to optimize away a check + * on the head page when they need know if put_page() is needed + * after get_user_pages(). + */ + __ClearPageReserved(p); set_page_count(p, 0); p->first_page = page; } @@ -1329,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void) #else page = virt_to_page(m); #endif - __ClearPageReserved(page); WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); + WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); /* * If we had gigantic hugepages allocated at boot time, we need diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1c52ddbc839b..34d3ca9572d6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -866,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, unsigned long val = 0; int cpu; + get_online_cpus(); for_each_online_cpu(cpu) val += per_cpu(memcg->stat->events[idx], cpu); #ifdef CONFIG_HOTPLUG_CPU @@ -873,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, val += memcg->nocpu_base.events[idx]; spin_unlock(&memcg->pcp_counter_lock); #endif + put_online_cpus(); return val; } @@ -2159,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) memcg_wakeup_oom(memcg); } -/* - * try to call OOM killer - */ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - bool locked; - int wakeups; - if (!current->memcg_oom.may_oom) return; - - current->memcg_oom.in_memcg_oom = 1; - /* - * As with any blocking lock, a contender needs to start - * listening for wakeups before attempting the trylock, - * otherwise it can miss the wakeup from the unlock and sleep - * indefinitely. This is just open-coded because our locking - * is so particular to memcg hierarchies. + * We are in the middle of the charge context here, so we + * don't want to block when potentially sitting on a callstack + * that holds all kinds of filesystem and mm locks. + * + * Also, the caller may handle a failed allocation gracefully + * (like optional page cache readahead) and so an OOM killer + * invocation might not even be necessary. + * + * That's why we don't do anything here except remember the + * OOM context and then deal with it at the end of the page + * fault when the stack is unwound, the locks are released, + * and when we know whether the fault was overall successful. */ - wakeups = atomic_read(&memcg->oom_wakeups); - mem_cgroup_mark_under_oom(memcg); - - locked = mem_cgroup_oom_trylock(memcg); - - if (locked) - mem_cgroup_oom_notify(memcg); - - if (locked && !memcg->oom_kill_disable) { - mem_cgroup_unmark_under_oom(memcg); - mem_cgroup_out_of_memory(memcg, mask, order); - mem_cgroup_oom_unlock(memcg); - /* - * There is no guarantee that an OOM-lock contender - * sees the wakeups triggered by the OOM kill - * uncharges. Wake any sleepers explicitely. - */ - memcg_oom_recover(memcg); - } else { - /* - * A system call can just return -ENOMEM, but if this - * is a page fault and somebody else is handling the - * OOM already, we need to sleep on the OOM waitqueue - * for this memcg until the situation is resolved. - * Which can take some time because it might be - * handled by a userspace task. - * - * However, this is the charge context, which means - * that we may sit on a large call stack and hold - * various filesystem locks, the mmap_sem etc. and we - * don't want the OOM handler to deadlock on them - * while we sit here and wait. Store the current OOM - * context in the task_struct, then return -ENOMEM. - * At the end of the page fault handler, with the - * stack unwound, pagefault_out_of_memory() will check - * back with us by calling - * mem_cgroup_oom_synchronize(), possibly putting the - * task to sleep. - */ - current->memcg_oom.oom_locked = locked; - current->memcg_oom.wakeups = wakeups; - css_get(&memcg->css); - current->memcg_oom.wait_on_memcg = memcg; - } + css_get(&memcg->css); + current->memcg_oom.memcg = memcg; + current->memcg_oom.gfp_mask = mask; + current->memcg_oom.order = order; } /** * mem_cgroup_oom_synchronize - complete memcg OOM handling + * @handle: actually kill/wait or just clean up the OOM state * - * This has to be called at the end of a page fault if the the memcg - * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. + * This has to be called at the end of a page fault if the memcg OOM + * handler was enabled. * - * Memcg supports userspace OOM handling, so failed allocations must + * Memcg supports userspace OOM handling where failed allocations must * sleep on a waitqueue until the userspace task resolves the * situation. Sleeping directly in the charge context with all kinds * of locks held is not a good idea, instead we remember an OOM state * in the task and mem_cgroup_oom_synchronize() has to be called at - * the end of the page fault to put the task to sleep and clean up the - * OOM state. + * the end of the page fault to complete the OOM handling. * * Returns %true if an ongoing memcg OOM situation was detected and - * finalized, %false otherwise. + * completed, %false otherwise. */ -bool mem_cgroup_oom_synchronize(void) +bool mem_cgroup_oom_synchronize(bool handle) { + struct mem_cgroup *memcg = current->memcg_oom.memcg; struct oom_wait_info owait; - struct mem_cgroup *memcg; + bool locked; /* OOM is global, do not handle */ - if (!current->memcg_oom.in_memcg_oom) - return false; - - /* - * We invoked the OOM killer but there is a chance that a kill - * did not free up any charges. Everybody else might already - * be sleeping, so restart the fault and keep the rampage - * going until some charges are released. - */ - memcg = current->memcg_oom.wait_on_memcg; if (!memcg) - goto out; + return false; - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) - goto out_memcg; + if (!handle) + goto cleanup; owait.memcg = memcg; owait.wait.flags = 0; @@ -2271,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void) INIT_LIST_HEAD(&owait.wait.task_list); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - /* Only sleep if we didn't miss any wakeups since OOM */ - if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) + mem_cgroup_mark_under_oom(memcg); + + locked = mem_cgroup_oom_trylock(memcg); + + if (locked) + mem_cgroup_oom_notify(memcg); + + if (locked && !memcg->oom_kill_disable) { + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, + current->memcg_oom.order); + } else { schedule(); - finish_wait(&memcg_oom_waitq, &owait.wait); -out_memcg: - mem_cgroup_unmark_under_oom(memcg); - if (current->memcg_oom.oom_locked) { + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + } + + if (locked) { mem_cgroup_oom_unlock(memcg); /* * There is no guarantee that an OOM-lock contender @@ -2286,10 +2249,9 @@ out_memcg: */ memcg_oom_recover(memcg); } +cleanup: + current->memcg_oom.memcg = NULL; css_put(&memcg->css); - current->memcg_oom.wait_on_memcg = NULL; -out: - current->memcg_oom.in_memcg_oom = 0; return true; } @@ -2703,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, || fatal_signal_pending(current))) goto bypass; + if (unlikely(task_in_memcg_oom(current))) + goto bypass; + /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the @@ -2801,6 +2766,8 @@ done: return 0; nomem: *ptr = NULL; + if (gfp_mask & __GFP_NOFAIL) + return 0; return -ENOMEM; bypass: *ptr = root_mem_cgroup; diff --git a/mm/memory.c b/mm/memory.c index ca0003947115..1311f26497e6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -837,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ make_migration_entry_read(&entry); pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } } @@ -3863,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) - mem_cgroup_enable_oom(); + mem_cgroup_oom_enable(); ret = __handle_mm_fault(mm, vma, address, flags); - if (flags & FAULT_FLAG_USER) - mem_cgroup_disable_oom(); - - if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) - mem_cgroup_oom_synchronize(); + if (flags & FAULT_FLAG_USER) { + mem_cgroup_oom_disable(); + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. + */ + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + mem_cgroup_oom_synchronize(false); + } return ret; } diff --git a/mm/migrate.c b/mm/migrate.c index a26bccd44ccb..7a7325ee1d08 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (pte_swp_soft_dirty(*ptep)) + pte = pte_mksoft_dirty(pte); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4d6b43..a3af058f68e4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -94,13 +94,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { + pte_t newpte; /* * A protection check is difficult so * just be safe and disable write */ make_migration_entry_read(&entry); - set_pte_at(mm, addr, pte, - swp_entry_to_pte(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + set_pte_at(mm, addr, pte, newpte); } pages++; } diff --git a/mm/mremap.c b/mm/mremap.c index 91b13d6a16d4..0843feb66f3d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,7 +25,6 @@ #include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> -#include <asm/pgalloc.h> #include "internal.h" @@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; pmd = pmd_alloc(mm, pud, addr); - if (!pmd) { - pud_free(mm, pud); + if (!pmd) return NULL; - } VM_BUG_ON(pmd_trans_huge(*pmd)); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 314e9d274381..6738c47f1f72 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -680,7 +680,7 @@ void pagefault_out_of_memory(void) { struct zonelist *zonelist; - if (mem_cgroup_oom_synchronize()) + if (mem_cgroup_oom_synchronize(true)) return; zonelist = node_zonelist(first_online_node, GFP_KERNEL); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f5236f804aa6..63807583d8e8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty, return 1; } -static long bdi_max_pause(struct backing_dev_info *bdi, - unsigned long bdi_dirty) +static unsigned long bdi_max_pause(struct backing_dev_info *bdi, + unsigned long bdi_dirty) { - long bw = bdi->avg_write_bandwidth; - long t; + unsigned long bw = bdi->avg_write_bandwidth; + unsigned long t; /* * Limit pause time for small memory systems. If sleeping for too long @@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi, t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; - return min_t(long, t, MAX_PAUSE); + return min_t(unsigned long, t, MAX_PAUSE); } static long bdi_min_pause(struct backing_dev_info *bdi, diff --git a/mm/swapfile.c b/mm/swapfile.c index 3963fc24fcc1..de7c904e52e5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct filename *pathname; int i, type, prev; int err; + unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } swap_file = p->swap_file; + old_block_size = p->old_block_size; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; @@ -1938,7 +1940,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); - set_blocksize(bdev, p->old_block_size); + set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); diff --git a/mm/vmscan.c b/mm/vmscan.c index 53f2f82f83ae..eea668d9cff6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -211,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); } EXPORT_SYMBOL(unregister_shrinker); diff --git a/mm/zswap.c b/mm/zswap.c index 841e35f1db22..d93510c6aa2d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -804,6 +804,10 @@ static void zswap_frontswap_invalidate_area(unsigned type) } tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); + + zbud_destroy_pool(tree->pool); + kfree(tree); + zswap_trees[type] = NULL; } static struct zbud_ops zswap_zbud_ops = { diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 4fa655d68a81..41bd85559d4b 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -151,7 +151,7 @@ static int check_timer_create(int which) fflush(stdout); done = 0; - timer_create(which, NULL, &id); + err = timer_create(which, NULL, &id); if (err < 0) { perror("Can't create timer\n"); return -1; |