diff options
author | 2017-08-02 17:11:45 +0200 | |
---|---|---|
committer | 2017-08-02 17:11:45 +0200 | |
commit | 5ef26e966d3fd105ad9a7e8e8f6d12c7fbd4c03d (patch) | |
tree | dd5c2ce3daab2e398ab8c0fb852587b647131568 /mm | |
parent | ALSA: hda - Fix speaker output from VAIO VPCL14M1R (diff) | |
parent | Merge remote-tracking branches 'asoc/fix/rt5663', 'asoc/fix/rt5665', 'asoc/fix/samsung', 'asoc/fix/sgtl5000' and 'asoc/fix/sh' into asoc-linus (diff) | |
download | wireguard-linux-5ef26e966d3fd105ad9a7e8e8f6d12c7fbd4c03d.tar.xz wireguard-linux-5ef26e966d3fd105ad9a7e8e8f6d12c7fbd4c03d.zip |
Merge tag 'asoc-fix-v4.13-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus
ASoC: Fixes for v4.13
Quite a few fixes here that have been sent since the merge window, the
biggest one is the fix from Tony for some confusion with the device
property API which was causing issues with the of-graph card. This is
fixed with some changes in the graph API itself as it seemed very likely
to be error prone.
Diffstat (limited to '')
65 files changed, 3527 insertions, 2016 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index beb7a455915d..48b1af447fa7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP bool config ARCH_DISCARD_MEMBLOCK @@ -149,32 +149,6 @@ config NO_BOOTMEM config MEMORY_ISOLATION bool -config MOVABLE_NODE - bool "Enable to assign a node which has only movable memory" - depends on HAVE_MEMBLOCK - depends on NO_BOOTMEM - depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG - depends on NUMA - default n - help - Allow a node to have only movable memory. Pages used by the kernel, - such as direct mapping pages cannot be migrated. So the corresponding - memory device cannot be hotplugged. This option allows the following - two things: - - When the system is booting, node full of hotpluggable memory can - be arranged to have only movable memory so that the whole node can - be hot-removed. (need movable_node boot option specified). - - After the system is up, the option allows users to online all the - memory of a node as movable memory so that the whole node can be - hot-removed. - - Users who don't use the memory hotplug feature are fine with this - option on since they don't specify movable_node boot option or they - don't online memory as movable. - - Say Y here if you want to hotplug a whole node. - Say N here if you want kernel to use memory on all nodes evenly. - # # Only be set on architectures that have completely implemented memory hotplug # feature. If you are not sure, don't touch it. @@ -187,7 +161,6 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on COMPILE_TEST || !KASAN config MEMORY_HOTPLUG_SPARSE def_bool y @@ -446,6 +419,18 @@ choice benefit. endchoice +config ARCH_WANTS_THP_SWAP + def_bool n + +config THP_SWAP + def_bool y + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP + help + Swap transparent huge pages in one piece, without splitting. + XXX: For now this only does clustered swap space allocation. + + For selection by architectures with reasonable THP sizes. + config TRANSPARENT_HUGE_PAGECACHE def_bool y depends on TRANSPARENT_HUGEPAGE @@ -683,12 +668,16 @@ config IDLE_PAGE_TRACKING See Documentation/vm/idle_page_tracking.txt for more details. +# arch_add_memory() comprehends device memory +config ARCH_HAS_ZONE_DEVICE + bool + config ZONE_DEVICE bool "Device memory (pmem, etc...) hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP - depends on X86_64 #arch_add_memory() comprehends device memory + depends on ARCH_HAS_ZONE_DEVICE help Device memory hotplug support allows for establishing pmem, @@ -706,3 +695,11 @@ config ARCH_USES_HIGH_VMA_FLAGS bool config ARCH_HAS_PKEYS bool + +config PERCPU_STATS + bool "Collect percpu memory statistics" + default n + help + This feature collects and exposes statistics via debugfs. The + information includes global and per chunk statistics, which can + be used to help understand percpu memory usage. diff --git a/mm/Makefile b/mm/Makefile index 026f6a828a50..411bd24d4a7c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -103,3 +103,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o +obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index da91df50ba31..9075aa54e955 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -24,7 +24,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) { unsigned long flags; struct page *page = alloc_page(balloon_mapping_gfp_mask() | - __GFP_NOMEMALLOC | __GFP_NORETRY); + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO); if (!page) return NULL; diff --git a/mm/cleancache.c b/mm/cleancache.c index ba5d8f3e6d68..f7b9fdc79d97 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -130,7 +130,7 @@ void __cleancache_init_shared_fs(struct super_block *sb) int pool_id = CLEANCACHE_NO_BACKEND_SHARED; if (cleancache_ops) { - pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE); if (pool_id < 0) pool_id = CLEANCACHE_NO_POOL; } @@ -59,7 +59,7 @@ const char *cma_get_name(const struct cma *cma) } static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, - int align_order) + unsigned int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -67,17 +67,14 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, } /* - * Find a PFN aligned to the specified order and return an offset represented in - * order_per_bits. + * Find the offset of the base PFN from the specified align_order. + * The value returned is represented in order_per_bits. */ static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, - int align_order) + unsigned int align_order) { - if (align_order <= cma->order_per_bit) - return 0; - - return (ALIGN(cma->base_pfn, (1UL << align_order)) - - cma->base_pfn) >> cma->order_per_bit; + return (cma->base_pfn & ((1UL << align_order) - 1)) + >> cma->order_per_bit; } static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, @@ -127,7 +124,7 @@ static int __init cma_activate_area(struct cma *cma) * to be in the same zone. */ if (page_zone(pfn_to_page(pfn)) != zone) - goto err; + goto not_in_zone; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); @@ -141,7 +138,8 @@ static int __init cma_activate_area(struct cma *cma) return 0; -err: +not_in_zone: + pr_err("CMA area %s could not be activated\n", cma->name); kfree(cma->bitmap); cma->count = 0; return -EINVAL; diff --git a/mm/compaction.c b/mm/compaction.c index 613c59e928cb..fb548e4c7bd4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -236,10 +236,9 @@ static void __reset_isolation_suitable(struct zone *zone) cond_resched(); - if (!pfn_valid(pfn)) + page = pfn_to_online_page(pfn); + if (!page) continue; - - page = pfn_to_page(pfn); if (zone != page_zone(page)) continue; diff --git a/mm/filemap.c b/mm/filemap.c index 6f1be573a5e6..a49702445ce0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -239,14 +239,16 @@ void __delete_from_page_cache(struct page *page, void *shadow) /* Leave page->index set: truncation lookup relies upon it */ /* hugetlb pages do not participate in page cache accounting. */ - if (!PageHuge(page)) - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + if (PageHuge(page)) + return; + + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else { - VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); + VM_BUG_ON_PAGE(PageTransHuge(page), page); } /* @@ -309,6 +311,16 @@ int filemap_check_errors(struct address_space *mapping) } EXPORT_SYMBOL(filemap_check_errors); +static int filemap_check_and_keep_errors(struct address_space *mapping) +{ + /* Check for outstanding write errors */ + if (test_bit(AS_EIO, &mapping->flags)) + return -EIO; + if (test_bit(AS_ENOSPC, &mapping->flags)) + return -ENOSPC; + return 0; +} + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -376,17 +388,48 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); -static int __filemap_fdatawait_range(struct address_space *mapping, +/** + * filemap_range_has_page - check if a page exists in range. + * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. + */ +bool filemap_range_has_page(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + pgoff_t index = start_byte >> PAGE_SHIFT; + pgoff_t end = end_byte >> PAGE_SHIFT; + struct pagevec pvec; + bool ret; + + if (end_byte < start_byte) + return false; + + if (mapping->nrpages == 0) + return false; + + pagevec_init(&pvec, 0); + if (!pagevec_lookup(&pvec, mapping, index, 1)) + return false; + ret = (pvec.pages[0]->index <= end); + pagevec_release(&pvec); + return ret; +} +EXPORT_SYMBOL(filemap_range_has_page); + +static void __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; struct pagevec pvec; int nr_pages; - int ret = 0; if (end_byte < start_byte) - goto out; + return; pagevec_init(&pvec, 0); while ((index <= end) && @@ -403,14 +446,11 @@ static int __filemap_fdatawait_range(struct address_space *mapping, continue; wait_on_page_writeback(page); - if (TestClearPageError(page)) - ret = -EIO; + ClearPageError(page); } pagevec_release(&pvec); cond_resched(); } -out: - return ret; } /** @@ -430,14 +470,8 @@ out: int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - int ret, ret2; - - ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); - ret2 = filemap_check_errors(mapping); - if (!ret) - ret = ret2; - - return ret; + __filemap_fdatawait_range(mapping, start_byte, end_byte); + return filemap_check_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range); @@ -453,15 +487,17 @@ EXPORT_SYMBOL(filemap_fdatawait_range); * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) */ -void filemap_fdatawait_keep_errors(struct address_space *mapping) +int filemap_fdatawait_keep_errors(struct address_space *mapping) { loff_t i_size = i_size_read(mapping->host); if (i_size == 0) - return; + return 0; __filemap_fdatawait_range(mapping, 0, i_size - 1); + return filemap_check_and_keep_errors(mapping); } +EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /** * filemap_fdatawait - wait for all under-writeback pages to complete @@ -503,6 +539,9 @@ int filemap_write_and_wait(struct address_space *mapping) int err2 = filemap_fdatawait(mapping); if (!err) err = err2; + } else { + /* Clear any previously stored errors */ + filemap_check_errors(mapping); } } else { err = filemap_check_errors(mapping); @@ -537,6 +576,9 @@ int filemap_write_and_wait_range(struct address_space *mapping, lstart, lend); if (!err) err = err2; + } else { + /* Clear any previously stored errors */ + filemap_check_errors(mapping); } } else { err = filemap_check_errors(mapping); @@ -545,6 +587,90 @@ int filemap_write_and_wait_range(struct address_space *mapping, } EXPORT_SYMBOL(filemap_write_and_wait_range); +void __filemap_set_wb_err(struct address_space *mapping, int err) +{ + errseq_t eseq = __errseq_set(&mapping->wb_err, err); + + trace_filemap_set_wb_err(mapping, eseq); +} +EXPORT_SYMBOL(__filemap_set_wb_err); + +/** + * file_check_and_advance_wb_err - report wb error (if any) that was previously + * and advance wb_err to current one + * @file: struct file on which the error is being reported + * + * When userland calls fsync (or something like nfsd does the equivalent), we + * want to report any writeback errors that occurred since the last fsync (or + * since the file was opened if there haven't been any). + * + * Grab the wb_err from the mapping. If it matches what we have in the file, + * then just quickly return 0. The file is all caught up. + * + * If it doesn't match, then take the mapping value, set the "seen" flag in + * it and try to swap it into place. If it works, or another task beat us + * to it with the new value, then update the f_wb_err and return the error + * portion. The error at this point must be reported via proper channels + * (a'la fsync, or NFS COMMIT operation, etc.). + * + * While we handle mapping->wb_err with atomic operations, the f_wb_err + * value is protected by the f_lock since we must ensure that it reflects + * the latest value swapped in for this file descriptor. + */ +int file_check_and_advance_wb_err(struct file *file) +{ + int err = 0; + errseq_t old = READ_ONCE(file->f_wb_err); + struct address_space *mapping = file->f_mapping; + + /* Locklessly handle the common case where nothing has changed */ + if (errseq_check(&mapping->wb_err, old)) { + /* Something changed, must use slow path */ + spin_lock(&file->f_lock); + old = file->f_wb_err; + err = errseq_check_and_advance(&mapping->wb_err, + &file->f_wb_err); + trace_file_check_and_advance_wb_err(file, old); + spin_unlock(&file->f_lock); + } + return err; +} +EXPORT_SYMBOL(file_check_and_advance_wb_err); + +/** + * file_write_and_wait_range - write out & wait on a file range + * @file: file pointing to address_space with pages + * @lstart: offset in bytes where the range starts + * @lend: offset in bytes where the range ends (inclusive) + * + * Write out and wait upon file offsets lstart->lend, inclusive. + * + * Note that @lend is inclusive (describes the last byte to be written) so + * that this function can be used to write to the very end-of-file (end = -1). + * + * After writing out and waiting on the data, we check and advance the + * f_wb_err cursor to the latest value, and return any errors detected there. + */ +int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) +{ + int err = 0, err2; + struct address_space *mapping = file->f_mapping; + + if ((!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional)) { + err = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + /* See comment of filemap_write_and_wait() */ + if (err != -EIO) + __filemap_fdatawait_range(mapping, lstart, lend); + } + err2 = file_check_and_advance_wb_err(file); + if (!err) + err = err2; + return err; +} +EXPORT_SYMBOL(file_write_and_wait_range); + /** * replace_page_cache_page - replace a pagecache page with a new one * @old: page to be replaced @@ -768,10 +894,10 @@ struct wait_page_key { struct wait_page_queue { struct page *page; int bit_nr; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_page_key *key = arg; struct wait_page_queue *wait_page @@ -834,7 +960,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, bool lock) { struct wait_page_queue wait_page; - wait_queue_t *wait = &wait_page.wait; + wait_queue_entry_t *wait = &wait_page.wait; int ret = 0; init_wait(wait); @@ -845,9 +971,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, for (;;) { spin_lock_irq(&q->lock); - if (likely(list_empty(&wait->task_list))) { + if (likely(list_empty(&wait->entry))) { if (lock) - __add_wait_queue_tail_exclusive(q, wait); + __add_wait_queue_entry_tail_exclusive(q, wait); else __add_wait_queue(q, wait); SetPageWaiters(page); @@ -907,7 +1033,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) * * Add an arbitrary @waiter to the wait queue for the nominated @page. */ -void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) { wait_queue_head_t *q = page_waitqueue(page); unsigned long flags; @@ -2038,10 +2164,17 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t size; size = i_size_read(inode); - retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1); - if (retval < 0) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) + return -EAGAIN; + } else { + retval = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (retval < 0) + goto out; + } file_accessed(file); @@ -2226,7 +2359,7 @@ int filemap_fault(struct vm_fault *vmf) /* No page in the page cache at all */ do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); @@ -2642,6 +2775,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) pos = iocb->ki_pos; + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + if (limit != RLIM_INFINITY) { if (iocb->ki_pos >= limit) { send_sig(SIGXFSZ, current, 0); @@ -2710,9 +2846,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_SHIFT; - written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); - if (written) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* If there are pages to writeback, return */ + if (filemap_range_has_page(inode->i_mapping, pos, + pos + iov_iter_count(from))) + return -EAGAIN; + } else { + written = filemap_write_and_wait_range(mapping, pos, + pos + write_len - 1); + if (written) + goto out; + } /* * After a write we want buffered reads to be sure to go to disk to get @@ -208,72 +208,28 @@ no_page: return no_page_table(vma, flags); } -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in <linux/mm.h> - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). - */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +static struct page *follow_pmd_mask(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + unsigned int flags, unsigned int *page_mask) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; pmd_t *pmd; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - return page; - } - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return no_page_table(vma, flags); - p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d)) - return no_page_table(vma, flags); - BUILD_BUG_ON(p4d_huge(*p4d)); - if (unlikely(p4d_bad(*p4d))) - return no_page_table(vma, flags); - pud = pud_offset(p4d, address); - if (pud_none(*pud)) + pmd = pmd_offset(pudp, address); + if (pmd_none(*pmd)) return no_page_table(vma, flags); - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pud(mm, address, pud, flags); + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; return no_page_table(vma, flags); } - if (pud_devmap(*pud)) { - ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags); - spin_unlock(ptl); - if (page) - return page; - } - if (unlikely(pud_bad(*pud))) - return no_page_table(vma, flags); - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return no_page_table(vma, flags); - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags); + if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pmd_val(*pmd)), flags, + PMD_SHIFT); if (page) return page; return no_page_table(vma, flags); @@ -319,13 +275,131 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return ret ? ERR_PTR(ret) : follow_page_pte(vma, address, pmd, flags); } - page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; return page; } + +static struct page *follow_pud_mask(struct vm_area_struct *vma, + unsigned long address, p4d_t *p4dp, + unsigned int flags, unsigned int *page_mask) +{ + pud_t *pud; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + pud = pud_offset(p4dp, address); + if (pud_none(*pud)) + return no_page_table(vma, flags); + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pud(mm, address, pud, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (is_hugepd(__hugepd(pud_val(*pud)))) { + page = follow_huge_pd(vma, address, + __hugepd(pud_val(*pud)), flags, + PUD_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + if (pud_devmap(*pud)) { + ptl = pud_lock(mm, pud); + page = follow_devmap_pud(vma, address, pud, flags); + spin_unlock(ptl); + if (page) + return page; + } + if (unlikely(pud_bad(*pud))) + return no_page_table(vma, flags); + + return follow_pmd_mask(vma, address, pud, flags, page_mask); +} + + +static struct page *follow_p4d_mask(struct vm_area_struct *vma, + unsigned long address, pgd_t *pgdp, + unsigned int flags, unsigned int *page_mask) +{ + p4d_t *p4d; + struct page *page; + + p4d = p4d_offset(pgdp, address); + if (p4d_none(*p4d)) + return no_page_table(vma, flags); + BUILD_BUG_ON(p4d_huge(*p4d)); + if (unlikely(p4d_bad(*p4d))) + return no_page_table(vma, flags); + + if (is_hugepd(__hugepd(p4d_val(*p4d)))) { + page = follow_huge_pd(vma, address, + __hugepd(p4d_val(*p4d)), flags, + P4D_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + return follow_pud_mask(vma, address, p4d, flags, page_mask); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in <linux/mm.h> + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + /* make this handle hugepd */ + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + return page; + } + + pgd = pgd_offset(mm, address); + + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return no_page_table(vma, flags); + + if (pgd_huge(*pgd)) { + page = follow_huge_pgd(mm, address, pgd, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (is_hugepd(__hugepd(pgd_val(*pgd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pgd_val(*pgd)), flags, + PGDIR_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + + return follow_p4d_mask(vma, address, pgd, flags, page_mask); +} + static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) @@ -1146,7 +1220,7 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ /* - * Generic RCU Fast GUP + * Generic Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -1167,8 +1241,8 @@ struct page *get_dump_page(unsigned long addr) * Before activating this code, please be aware that the following assumptions * are currently made: * - * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free - * pages containing page tables. + * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to + * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. * @@ -1178,7 +1252,7 @@ struct page *get_dump_page(unsigned long addr) * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_GENERIC_RCU_GUP +#ifdef CONFIG_HAVE_GENERIC_GUP #ifndef gup_get_pte /* @@ -1349,16 +1423,15 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return __gup_device_huge_pmd(orig, addr, end, pages, nr); refs = 0; - head = pmd_page(orig); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pmd_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1388,16 +1461,15 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return __gup_device_huge_pud(orig, addr, end, pages, nr); refs = 0; - head = pud_page(orig); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pud_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1426,16 +1498,15 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, BUILD_BUG_ON(pgd_devmap(orig)); refs = 0; - head = pgd_page(orig); - page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); + page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pgd_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1668,4 +1739,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, return ret; } -#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ +#endif /* CONFIG_HAVE_GENERIC_GUP */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 88c6167f194d..86975dec0ba1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1575,8 +1575,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, get_page(page); spin_unlock(ptl); split_huge_page(page); - put_page(page); unlock_page(page); + put_page(page); goto out_unlocked; } @@ -2203,7 +2203,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * atomic_set() here would be safe on all archs (and not only on x86), * it's safer to use atomic_inc()/atomic_add(). */ - if (PageAnon(head)) { + if (PageAnon(head) && !PageSwapCache(head)) { page_ref_inc(page_tail); } else { /* Additional pin to radix tree */ @@ -2214,6 +2214,7 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->flags |= (head->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | + (1L << PG_swapcache) | (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | @@ -2276,7 +2277,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - page_ref_inc(head); + /* Additional pin to radix tree of swap cache */ + if (PageSwapCache(head)) + page_ref_add(head, 2); + else + page_ref_inc(head); } else { /* Additional pin to radix tree */ page_ref_add(head, 2); @@ -2385,6 +2390,21 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount) return ret; } +/* Racy check whether the huge page can be split */ +bool can_split_huge_page(struct page *page, int *pextra_pins) +{ + int extra_pins; + + /* Additional pins from radix tree */ + if (PageAnon(page)) + extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; + else + extra_pins = HPAGE_PMD_NR; + if (pextra_pins) + *pextra_pins = extra_pins; + return total_mapcount(page) == page_count(page) - extra_pins - 1; +} + /* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. @@ -2432,7 +2452,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } - extra_pins = 0; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -2444,8 +2463,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - /* Addidional pins from radix tree */ - extra_pins = HPAGE_PMD_NR; anon_vma = NULL; i_mmap_lock_read(mapping); } @@ -2454,7 +2471,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * Racy check if we can split the page, before freeze_page() will * split PMDs */ - if (total_mapcount(head) != page_count(head) - extra_pins - 1) { + if (!can_split_huge_page(head, &extra_pins)) { ret = -EBUSY; goto out_unlock; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3eedb187e549..bc48ee783dd9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -20,9 +20,9 @@ #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/rmap.h> +#include <linux/string_helpers.h> #include <linux/swap.h> #include <linux/swapops.h> -#include <linux/page-isolation.h> #include <linux/jhash.h> #include <asm/page.h> @@ -867,12 +867,12 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; list_for_each_entry(page, &h->hugepage_freelists[nid], lru) - if (!is_migrate_isolate_page(page)) + if (!PageHWPoison(page)) break; /* * if 'non-isolated free hugepage' not found on the list, @@ -887,6 +887,42 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, + nodemask_t *nmask) +{ + unsigned int cpuset_mems_cookie; + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; + int node = -1; + + zonelist = node_zonelist(nid, gfp_mask); + +retry_cpuset: + cpuset_mems_cookie = read_mems_allowed_begin(); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { + struct page *page; + + if (!cpuset_zone_allowed(zone, gfp_mask)) + continue; + /* + * no need to ask again on the same node. Pool is node rather than + * zone aware + */ + if (zone_to_nid(zone) == node) + continue; + node = zone_to_nid(zone); + + page = dequeue_huge_page_node_exact(h, node); + if (page) + return page; + } + if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + + return NULL; +} + /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { @@ -901,13 +937,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page = NULL; + struct page *page; struct mempolicy *mpol; + gfp_t gfp_mask; nodemask_t *nodemask; - struct zonelist *zonelist; - struct zone *zone; - struct zoneref *z; - unsigned int cpuset_mems_cookie; + int nid; /* * A child process with MAP_PRIVATE mappings created by their parent @@ -922,31 +956,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) goto err; -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = huge_zonelist(vma, address, - htlb_alloc_mask(h), &mpol, &nodemask); - - for_each_zone_zonelist_nodemask(zone, z, zonelist, - MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) { - page = dequeue_huge_page_node(h, zone_to_nid(zone)); - if (page) { - if (avoid_reserve) - break; - if (!vma_has_reserves(vma, chg)) - break; - - SetPagePrivate(page); - h->resv_huge_pages--; - break; - } - } + gfp_mask = htlb_alloc_mask(h); + nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { + SetPagePrivate(page); + h->resv_huge_pages--; } mpol_cond_put(mpol); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; return page; err: @@ -1024,9 +1042,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ - ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ - defined(CONFIG_CMA)) +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { @@ -1158,8 +1174,7 @@ static int alloc_fresh_gigantic_page(struct hstate *h, return 0; } -static inline bool gigantic_page_supported(void) { return true; } -#else +#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static inline bool gigantic_page_supported(void) { return false; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, @@ -1369,7 +1384,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) page = __alloc_pages_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_REPEAT|__GFP_NOWARN, + __GFP_RETRY_MAYFAIL|__GFP_NOWARN, huge_page_order(h)); if (page) { prep_new_huge_page(h, page, nid); @@ -1444,7 +1459,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, * number of free hugepages would be reduced below the number of reserved * hugepages. */ -static int dissolve_free_huge_page(struct page *page) +int dissolve_free_huge_page(struct page *page) { int rc = 0; @@ -1457,6 +1472,14 @@ static int dissolve_free_huge_page(struct page *page) rc = -EBUSY; goto out; } + /* + * Move PageHWPoison flag from head page to the raw error page, + * which makes any subpages rather than the error page reusable. + */ + if (PageHWPoison(head) && page != head) { + SetPageHWPoison(page); + ClearPageHWPoison(head); + } list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; @@ -1497,82 +1520,19 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) return rc; } -/* - * There are 3 ways this can get called: - * 1. With vma+addr: we use the VMA's memory policy - * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge - * page from any node, and let the buddy allocator itself figure - * it out. - * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page - * strictly from 'nid' - */ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long addr, int nid) + gfp_t gfp_mask, int nid, nodemask_t *nmask) { int order = huge_page_order(h); - gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; - unsigned int cpuset_mems_cookie; - /* - * We need a VMA to get a memory policy. If we do not - * have one, we use the 'nid' argument. - * - * The mempolicy stuff below has some non-inlined bits - * and calls ->vm_ops. That makes it hard to optimize at - * compile-time, even when NUMA is off and it does - * nothing. This helps the compiler optimize it out. - */ - if (!IS_ENABLED(CONFIG_NUMA) || !vma) { - /* - * If a specific node is requested, make sure to - * get memory from there, but only when a node - * is explicitly specified. - */ - if (nid != NUMA_NO_NODE) - gfp |= __GFP_THISNODE; - /* - * Make sure to call something that can handle - * nid=NUMA_NO_NODE - */ - return alloc_pages_node(nid, gfp, order); - } - - /* - * OK, so we have a VMA. Fetch the mempolicy and try to - * allocate a huge page with it. We will only reach this - * when CONFIG_NUMA=y. - */ - do { - struct page *page; - struct mempolicy *mpol; - struct zonelist *zl; - nodemask_t *nodemask; - - cpuset_mems_cookie = read_mems_allowed_begin(); - zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); - mpol_cond_put(mpol); - page = __alloc_pages_nodemask(gfp, order, zl, nodemask); - if (page) - return page; - } while (read_mems_allowed_retry(cpuset_mems_cookie)); - - return NULL; + gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + return __alloc_pages_nodemask(gfp_mask, order, nid, nmask); } -/* - * There are two ways to allocate a huge page: - * 1. When you have a VMA and an address (like a fault) - * 2. When you have no VMA (like when setting /proc/.../nr_hugepages) - * - * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in - * this case which signifies that the allocation should be done with - * respect for the VMA's memory policy. - * - * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This - * implies that memory policies will not be taken in to account. - */ -static struct page *__alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long addr, int nid) +static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { struct page *page; unsigned int r_nid; @@ -1581,15 +1541,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, return NULL; /* - * Make sure that anyone specifying 'nid' is not also specifying a VMA. - * This makes sure the caller is picking _one_ of the modes with which - * we can call this function, not both. - */ - if (vma || (addr != -1)) { - VM_WARN_ON_ONCE(addr == -1); - VM_WARN_ON_ONCE(nid != NUMA_NO_NODE); - } - /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed * overcommit @@ -1622,7 +1573,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, } spin_unlock(&hugetlb_lock); - page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid); + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); spin_lock(&hugetlb_lock); if (page) { @@ -1647,26 +1598,23 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, } /* - * Allocate a huge page from 'nid'. Note, 'nid' may be - * NUMA_NO_NODE, which means that it may be allocated - * anywhere. - */ -static -struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) -{ - unsigned long addr = -1; - - return __alloc_buddy_huge_page(h, NULL, addr, nid); -} - -/* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE); + struct page *page; + struct mempolicy *mpol; + gfp_t gfp_mask = htlb_alloc_mask(h); + int nid; + nodemask_t *nodemask; + + nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); + page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask); + mpol_cond_put(mpol); + + return page; } /* @@ -1676,19 +1624,46 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { + gfp_t gfp_mask = htlb_alloc_mask(h); struct page *page = NULL; + if (nid != NUMA_NO_NODE) + gfp_mask |= __GFP_THISNODE; + spin_lock(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) - page = dequeue_huge_page_node(h, nid); + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL); spin_unlock(&hugetlb_lock); if (!page) - page = __alloc_buddy_huge_page_no_mpol(h, nid); + page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL); return page; } + +struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, + nodemask_t *nmask) +{ + gfp_t gfp_mask = htlb_alloc_mask(h); + + spin_lock(&hugetlb_lock); + if (h->free_huge_pages - h->resv_huge_pages > 0) { + struct page *page; + + page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); + if (page) { + spin_unlock(&hugetlb_lock); + return page; + } + } + spin_unlock(&hugetlb_lock); + + /* No reservations, try to overcommit */ + + return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask); +} + /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. @@ -1714,12 +1689,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h), + NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; break; } list_add(&page->lru, &surplus_list); + cond_resched(); } allocated += i; @@ -2188,8 +2165,16 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) } else if (!alloc_fresh_huge_page(h, &node_states[N_MEMORY])) break; + cond_resched(); + } + if (i < h->max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", + h->max_huge_pages, buf, i); + h->max_huge_pages = i; } - h->max_huge_pages = i; } static void __init hugetlb_init_hstates(void) @@ -2207,26 +2192,16 @@ static void __init hugetlb_init_hstates(void) VM_BUG_ON(minimum_order == UINT_MAX); } -static char * __init memfmt(char *buf, unsigned long n) -{ - if (n >= (1UL << 30)) - sprintf(buf, "%lu GB", n >> 30); - else if (n >= (1UL << 20)) - sprintf(buf, "%lu MB", n >> 20); - else - sprintf(buf, "%lu KB", n >> 10); - return buf; -} - static void __init report_hugepages(void) { struct hstate *h; for_each_hstate(h) { char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", - memfmt(buf, huge_page_size(h)), - h->free_huge_pages); + buf, h->free_huge_pages); } } @@ -2785,6 +2760,11 @@ static int __init hugetlb_init(void) return 0; if (!size_to_hstate(default_hstate_size)) { + if (default_hstate_size != 0) { + pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n", + default_hstate_size, HPAGE_SIZE); + } + default_hstate_size = HPAGE_SIZE; if (!size_to_hstate(default_hstate_size)) hugetlb_add_hstate(HUGETLB_PAGE_ORDER); @@ -3185,17 +3165,17 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } -static int is_hugetlb_entry_migration(pte_t pte) +bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) - return 0; + return false; swp = pte_to_swp_entry(pte); if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; + return true; else - return 0; + return false; } static int is_hugetlb_entry_hwpoisoned(pte_t pte) @@ -3233,7 +3213,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr); + src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; dst_pte = huge_pte_alloc(dst, addr, sz); @@ -3263,9 +3243,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, */ make_migration_entry_read(&swp_entry); entry = swp_entry_to_pte(swp_entry); - set_huge_pte_at(src, addr, src_pte, entry); + set_huge_swap_pte_at(src, addr, src_pte, + entry, sz); } - set_huge_pte_at(dst, addr, dst_pte, entry); + set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { if (cow) { huge_ptep_set_wrprotect(src, addr, src_pte); @@ -3317,7 +3298,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; for (; address < end; address += sz) { - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, sz); if (!ptep) continue; @@ -3338,7 +3319,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { - huge_pte_clear(mm, address, ptep); + huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } @@ -3535,7 +3516,8 @@ retry_avoidcopy: unmap_ref_private(mm, vma, old_page, address); BUG_ON(huge_pte_none(pte)); spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -3574,7 +3556,8 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -3861,7 +3844,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, address &= huge_page_mask(h); - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -4118,7 +4101,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * * Note that page table lock is not held when pte is null. */ - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), + huge_page_size(h)); if (pte) ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); @@ -4257,7 +4241,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); @@ -4279,7 +4263,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, make_migration_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_huge_pte_at(mm, address, ptep, newpte); + set_huge_swap_pte_at(mm, address, ptep, + newpte, huge_page_size(h)); pages++; } spin_unlock(ptl); @@ -4521,7 +4506,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr); + spte = huge_pte_offset(svma->vm_mm, saddr, + vma_mmu_pagesize(svma)); if (spte) { get_page(virt_to_page(spte)); break; @@ -4617,7 +4603,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; @@ -4653,6 +4640,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, } struct page * __weak +follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, int flags, int pdshift) +{ + WARN(1, "hugepd follow called with no support for hugepage directory format\n"); + return NULL; +} + +struct page * __weak follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int flags) { @@ -4699,39 +4694,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); } -#ifdef CONFIG_MEMORY_FAILURE - -/* - * This function is called from memory failure code. - */ -int dequeue_hwpoisoned_huge_page(struct page *hpage) +struct page * __weak +follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) { - struct hstate *h = page_hstate(hpage); - int nid = page_to_nid(hpage); - int ret = -EBUSY; + if (flags & FOLL_GET) + return NULL; - spin_lock(&hugetlb_lock); - /* - * Just checking !page_huge_active is not enough, because that could be - * an isolated/hwpoisoned hugepage (which have >0 refcount). - */ - if (!page_huge_active(hpage) && !page_count(hpage)) { - /* - * Hwpoisoned hugepage isn't linked to activelist or freelist, - * but dangling hpage->lru can trigger list-debug warnings - * (this happens when we call unpoison_memory() on it), - * so let it point to itself with list_del_init(). - */ - list_del_init(&hpage->lru); - set_page_refcounted(hpage); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - ret = 0; - } - spin_unlock(&hugetlb_lock); - return ret; + return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); } -#endif bool isolate_huge_page(struct page *page, struct list_head *list) { diff --git a/mm/internal.h b/mm/internal.h index 0e4f558412fb..24d88f084705 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -23,7 +23,7 @@ * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ - __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ + __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_ATOMIC) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index c81549d5c833..ca11bc4ce205 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -134,97 +134,33 @@ static __always_inline bool memory_is_poisoned_1(unsigned long addr) return false; } -static __always_inline bool memory_is_poisoned_2(unsigned long addr) +static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, + unsigned long size) { - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(*shadow_addr)) { - if (memory_is_poisoned_1(addr + 1)) - return true; - - /* - * If single shadow byte covers 2-byte access, we don't - * need to do anything more. Otherwise, test the first - * shadow byte. - */ - if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) - return false; - - return unlikely(*(u8 *)shadow_addr); - } - - return false; -} - -static __always_inline bool memory_is_poisoned_4(unsigned long addr) -{ - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); - if (unlikely(*shadow_addr)) { - if (memory_is_poisoned_1(addr + 3)) - return true; - - /* - * If single shadow byte covers 4-byte access, we don't - * need to do anything more. Otherwise, test the first - * shadow byte. - */ - if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) - return false; - - return unlikely(*(u8 *)shadow_addr); - } - - return false; -} - -static __always_inline bool memory_is_poisoned_8(unsigned long addr) -{ - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(*shadow_addr)) { - if (memory_is_poisoned_1(addr + 7)) - return true; - - /* - * If single shadow byte covers 8-byte access, we don't - * need to do anything more. Otherwise, test the first - * shadow byte. - */ - if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) - return false; - - return unlikely(*(u8 *)shadow_addr); - } + /* + * Access crosses 8(shadow size)-byte boundary. Such access maps + * into 2 shadow bytes, so we need to check them both. + */ + if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1)) + return *shadow_addr || memory_is_poisoned_1(addr + size - 1); - return false; + return memory_is_poisoned_1(addr + size - 1); } static __always_inline bool memory_is_poisoned_16(unsigned long addr) { - u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(*shadow_addr)) { - u16 shadow_first_bytes = *(u16 *)shadow_addr; - - if (unlikely(shadow_first_bytes)) - return true; - - /* - * If two shadow bytes covers 16-byte access, we don't - * need to do anything more. Otherwise, test the last - * shadow byte. - */ - if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) - return false; + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - return memory_is_poisoned_1(addr + 15); - } + /* Unaligned 16-bytes access maps into 3 shadow bytes. */ + if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) + return *shadow_addr || memory_is_poisoned_1(addr + 15); - return false; + return *shadow_addr; } -static __always_inline unsigned long bytes_is_zero(const u8 *start, +static __always_inline unsigned long bytes_is_nonzero(const u8 *start, size_t size) { while (size) { @@ -237,7 +173,7 @@ static __always_inline unsigned long bytes_is_zero(const u8 *start, return 0; } -static __always_inline unsigned long memory_is_zero(const void *start, +static __always_inline unsigned long memory_is_nonzero(const void *start, const void *end) { unsigned int words; @@ -245,11 +181,11 @@ static __always_inline unsigned long memory_is_zero(const void *start, unsigned int prefix = (unsigned long)start % 8; if (end - start <= 16) - return bytes_is_zero(start, end - start); + return bytes_is_nonzero(start, end - start); if (prefix) { prefix = 8 - prefix; - ret = bytes_is_zero(start, prefix); + ret = bytes_is_nonzero(start, prefix); if (unlikely(ret)) return ret; start += prefix; @@ -258,12 +194,12 @@ static __always_inline unsigned long memory_is_zero(const void *start, words = (end - start) / 8; while (words) { if (unlikely(*(u64 *)start)) - return bytes_is_zero(start, 8); + return bytes_is_nonzero(start, 8); start += 8; words--; } - return bytes_is_zero(start, (end - start) % 8); + return bytes_is_nonzero(start, (end - start) % 8); } static __always_inline bool memory_is_poisoned_n(unsigned long addr, @@ -271,7 +207,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr, { unsigned long ret; - ret = memory_is_zero(kasan_mem_to_shadow((void *)addr), + ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), kasan_mem_to_shadow((void *)addr + size - 1) + 1); if (unlikely(ret)) { @@ -292,11 +228,9 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) case 1: return memory_is_poisoned_1(addr); case 2: - return memory_is_poisoned_2(addr); case 4: - return memory_is_poisoned_4(addr); case 8: - return memory_is_poisoned_8(addr); + return memory_is_poisoned_2_4_8(addr, size); case 16: return memory_is_poisoned_16(addr); default: @@ -803,17 +737,47 @@ void __asan_unpoison_stack_memory(const void *addr, size_t size) EXPORT_SYMBOL(__asan_unpoison_stack_memory); #ifdef CONFIG_MEMORY_HOTPLUG -static int kasan_mem_notifier(struct notifier_block *nb, +static int __meminit kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { - return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK; + struct memory_notify *mem_data = data; + unsigned long nr_shadow_pages, start_kaddr, shadow_start; + unsigned long shadow_end, shadow_size; + + nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT; + start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn); + shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr); + shadow_size = nr_shadow_pages << PAGE_SHIFT; + shadow_end = shadow_start + shadow_size; + + if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) || + WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT))) + return NOTIFY_BAD; + + switch (action) { + case MEM_GOING_ONLINE: { + void *ret; + + ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start, + shadow_end, GFP_KERNEL, + PAGE_KERNEL, VM_NO_GUARD, + pfn_to_nid(mem_data->start_pfn), + __builtin_return_address(0)); + if (!ret) + return NOTIFY_BAD; + + kmemleak_ignore(ret); + return NOTIFY_OK; + } + case MEM_OFFLINE: + vfree((void *)shadow_start); + } + + return NOTIFY_OK; } static int __init kasan_memhotplug_init(void) { - pr_info("WARNING: KASAN doesn't support memory hot-add\n"); - pr_info("Memory hot-add will be disabled\n"); - hotplug_memory_notifier(kasan_mem_notifier, 0); return 0; diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index b96a5f773d88..554e4c0f23a2 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -118,6 +118,18 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); + if (IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) { + pud_t *pud; + pmd_t *pmd; + + p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); + pud = pud_offset(p4d, addr); + pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_zero_pte)); + continue; + } if (p4d_none(*p4d)) { p4d_populate(&init_mm, p4d, diff --git a/mm/kasan/report.c b/mm/kasan/report.c index beee0e980e2d..04bb1d3eb9ec 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -107,7 +107,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) return bug_type; } -const char *get_wild_bug_type(struct kasan_access_info *info) +static const char *get_wild_bug_type(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index df4ebdb2b10a..c01f177a1120 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -816,7 +816,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) static bool hugepage_vma_check(struct vm_area_struct *vma) { if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) + (vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; if (shmem_file(vma->vm_file)) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 20036d4f9f13..7780cd83a495 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -150,7 +150,7 @@ struct kmemleak_scan_area { */ struct kmemleak_object { spinlock_t lock; - unsigned long flags; /* object status flags */ + unsigned int flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; struct rb_node rb_node; @@ -159,6 +159,8 @@ struct kmemleak_object { atomic_t use_count; unsigned long pointer; size_t size; + /* pass surplus references to this pointer */ + unsigned long excess_ref; /* minimum number of a pointers found before it is considered leak */ int min_count; /* the total number of pointers found pointing to this object */ @@ -253,7 +255,8 @@ enum { KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, - KMEMLEAK_NO_SCAN + KMEMLEAK_NO_SCAN, + KMEMLEAK_SET_EXCESS_REF }; /* @@ -262,9 +265,12 @@ enum { */ struct early_log { int op_type; /* kmemleak operation type */ - const void *ptr; /* allocated/freed memory block */ - size_t size; /* memory block size */ int min_count; /* minimum reference count */ + const void *ptr; /* allocated/freed memory block */ + union { + size_t size; /* memory block size */ + unsigned long excess_ref; /* surplus reference passing */ + }; unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -393,7 +399,7 @@ static void dump_object_info(struct kmemleak_object *object) object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); - pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); @@ -562,6 +568,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object->flags = OBJECT_ALLOCATED; object->pointer = ptr; object->size = size; + object->excess_ref = 0; object->min_count = min_count; object->count = 0; /* white color initially */ object->jiffies = jiffies; @@ -795,6 +802,30 @@ out: } /* + * Any surplus references (object already gray) to 'ptr' are passed to + * 'excess_ref'. This is used in the vmalloc() case where a pointer to + * vm_struct may be used as an alternative reference to the vmalloc'ed object + * (see free_thread_stack()). + */ +static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n", + ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->excess_ref = excess_ref; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* * Set the OBJECT_NO_SCAN flag for the object corresponding to the give * pointer. Such object will not be scanned by kmemleak but references to it * are searched. @@ -908,7 +939,7 @@ static void early_alloc_percpu(struct early_log *log) * @gfp: kmalloc() flags used for kmemleak internal memory allocations * * This function is called from the kernel allocators when a new object - * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). + * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.). */ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) @@ -952,6 +983,36 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); /** + * kmemleak_vmalloc - register a newly vmalloc'ed object + * @area: pointer to vm_struct + * @size: size of the object + * @gfp: __vmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the vmalloc() kernel allocator when a new + * object (memory block) is allocated. + */ +void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp) +{ + pr_debug("%s(0x%p, %zu)\n", __func__, area, size); + + /* + * A min_count = 2 is needed because vm_struct contains a reference to + * the virtual address of the vmalloc'ed block. + */ + if (kmemleak_enabled) { + create_object((unsigned long)area->addr, size, 2, gfp); + object_set_excess_ref((unsigned long)area, + (unsigned long)area->addr); + } else if (kmemleak_early_log) { + log_early(KMEMLEAK_ALLOC, area->addr, size, 2); + /* reusing early_log.size for storing area->addr */ + log_early(KMEMLEAK_SET_EXCESS_REF, + area, (unsigned long)area->addr, 0); + } +} +EXPORT_SYMBOL_GPL(kmemleak_vmalloc); + +/** * kmemleak_free - unregister a previously registered object * @ptr: pointer to beginning of the object * @@ -1188,6 +1249,30 @@ static bool update_checksum(struct kmemleak_object *object) } /* + * Update an object's references. object->lock must be held by the caller. + */ +static void update_refs(struct kmemleak_object *object) +{ + if (!color_white(object)) { + /* non-orphan, ignored or new */ + return; + } + + /* + * Increase the object's reference count (number of pointers to the + * memory block). If this count reaches the required minimum, the + * object's color will become gray and it will be added to the + * gray_list. + */ + object->count++; + if (color_gray(object)) { + /* put_object() called when removing from gray_list */ + WARN_ON(!get_object(object)); + list_add_tail(&object->gray_list, &gray_list); + } +} + +/* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occurred. */ @@ -1224,6 +1309,7 @@ static void scan_block(void *_start, void *_end, for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; unsigned long pointer; + unsigned long excess_ref; if (scan_should_stop()) break; @@ -1259,25 +1345,27 @@ static void scan_block(void *_start, void *_end, * enclosed by scan_mutex. */ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - if (!color_white(object)) { - /* non-orphan, ignored or new */ - spin_unlock(&object->lock); - continue; - } - - /* - * Increase the object's reference count (number of pointers - * to the memory block). If this count reaches the required - * minimum, the object's color will become gray and it will be - * added to the gray_list. - */ - object->count++; + /* only pass surplus references (object already gray) */ if (color_gray(object)) { - /* put_object() called when removing from gray_list */ - WARN_ON(!get_object(object)); - list_add_tail(&object->gray_list, &gray_list); + excess_ref = object->excess_ref; + /* no need for update_refs() if object already gray */ + } else { + excess_ref = 0; + update_refs(object); } spin_unlock(&object->lock); + + if (excess_ref) { + object = lookup_object(excess_ref, 0); + if (!object) + continue; + if (object == scanned) + /* circular reference, ignore */ + continue; + spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + update_refs(object); + spin_unlock(&object->lock); + } } read_unlock_irqrestore(&kmemleak_lock, flags); } @@ -1980,6 +2068,10 @@ void __init kmemleak_init(void) case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); break; + case KMEMLEAK_SET_EXCESS_REF: + object_set_excess_ref((unsigned long)log->ptr, + log->excess_ref); + break; default: kmemleak_warn("Unknown early log operation: %d\n", log->op_type); @@ -128,9 +128,12 @@ struct ksm_scan { * struct stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list + * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) + * @chain_prune_time: time of the last full garbage collection + * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { @@ -138,11 +141,24 @@ struct stable_node { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; - struct list_head list; + struct { + struct hlist_node hlist_dup; + struct list_head list; + }; }; }; struct hlist_head hlist; - unsigned long kpfn; + union { + unsigned long kpfn; + unsigned long chain_prune_time; + }; + /* + * STABLE_NODE_CHAIN can be any negative number in + * rmap_hlist_len negative range, but better not -1 to be able + * to reliably detect underflows. + */ +#define STABLE_NODE_CHAIN -1024 + int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif @@ -192,6 +208,7 @@ static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); +#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -219,6 +236,18 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; +/* The number of stable_node chains */ +static unsigned long ksm_stable_node_chains; + +/* The number of stable_node dups linked to the stable_node chains */ +static unsigned long ksm_stable_node_dups; + +/* Delay in pruning stale stable_node_dups in the stable_node_chains */ +static int ksm_stable_node_chains_prune_millisecs = 2000; + +/* Maximum number of page slots sharing a stable node */ +static int ksm_max_page_sharing = 256; + /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -287,6 +316,45 @@ static void __init ksm_slab_free(void) mm_slot_cache = NULL; } +static __always_inline bool is_stable_node_chain(struct stable_node *chain) +{ + return chain->rmap_hlist_len == STABLE_NODE_CHAIN; +} + +static __always_inline bool is_stable_node_dup(struct stable_node *dup) +{ + return dup->head == STABLE_NODE_DUP_HEAD; +} + +static inline void stable_node_chain_add_dup(struct stable_node *dup, + struct stable_node *chain) +{ + VM_BUG_ON(is_stable_node_dup(dup)); + dup->head = STABLE_NODE_DUP_HEAD; + VM_BUG_ON(!is_stable_node_chain(chain)); + hlist_add_head(&dup->hlist_dup, &chain->hlist); + ksm_stable_node_dups++; +} + +static inline void __stable_node_dup_del(struct stable_node *dup) +{ + VM_BUG_ON(!is_stable_node_dup(dup)); + hlist_del(&dup->hlist_dup); + ksm_stable_node_dups--; +} + +static inline void stable_node_dup_del(struct stable_node *dup) +{ + VM_BUG_ON(is_stable_node_chain(dup)); + if (is_stable_node_dup(dup)) + __stable_node_dup_del(dup); + else + rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); +#ifdef CONFIG_DEBUG_VM + dup->head = NULL; +#endif +} + static inline struct rmap_item *alloc_rmap_item(void) { struct rmap_item *rmap_item; @@ -317,6 +385,8 @@ static inline struct stable_node *alloc_stable_node(void) static inline void free_stable_node(struct stable_node *stable_node) { + VM_BUG_ON(stable_node->rmap_hlist_len && + !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } @@ -498,25 +568,82 @@ static inline int get_kpfn_nid(unsigned long kpfn) return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } +static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, + struct rb_root *root) +{ + struct stable_node *chain = alloc_stable_node(); + VM_BUG_ON(is_stable_node_chain(dup)); + if (likely(chain)) { + INIT_HLIST_HEAD(&chain->hlist); + chain->chain_prune_time = jiffies; + chain->rmap_hlist_len = STABLE_NODE_CHAIN; +#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) + chain->nid = -1; /* debug */ +#endif + ksm_stable_node_chains++; + + /* + * Put the stable node chain in the first dimension of + * the stable tree and at the same time remove the old + * stable node. + */ + rb_replace_node(&dup->node, &chain->node, root); + + /* + * Move the old stable node to the second dimension + * queued in the hlist_dup. The invariant is that all + * dup stable_nodes in the chain->hlist point to pages + * that are wrprotected and have the exact same + * content. + */ + stable_node_chain_add_dup(dup, chain); + } + return chain; +} + +static inline void free_stable_node_chain(struct stable_node *chain, + struct rb_root *root) +{ + rb_erase(&chain->node, root); + free_stable_node(chain); + ksm_stable_node_chains--; +} + static void remove_node_from_stable_tree(struct stable_node *stable_node) { struct rmap_item *rmap_item; + /* check it's not STABLE_NODE_CHAIN or negative */ + BUG_ON(stable_node->rmap_hlist_len < 0); + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) ksm_pages_sharing--; else ksm_pages_shared--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } + /* + * We need the second aligned pointer of the migrate_nodes + * list_head to stay clear from the rb_parent_color union + * (aligned and different than any node) and also different + * from &migrate_nodes. This will verify that future list.h changes + * don't break STABLE_NODE_DUP_HEAD. + */ +#if GCC_VERSION >= 40903 /* only recent gcc can handle it */ + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); +#endif + if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else - rb_erase(&stable_node->node, - root_stable_tree + NUMA(stable_node->nid)); + stable_node_dup_del(stable_node); free_stable_node(stable_node); } @@ -635,6 +762,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ksm_pages_sharing--; else ksm_pages_shared--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; @@ -743,6 +872,31 @@ static int remove_stable_node(struct stable_node *stable_node) return err; } +static int remove_stable_node_chain(struct stable_node *stable_node, + struct rb_root *root) +{ + struct stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + if (remove_stable_node(stable_node)) + return true; + else + return false; + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + if (remove_stable_node(dup)) + return true; + } + BUG_ON(!hlist_empty(&stable_node->hlist)); + free_stable_node_chain(stable_node, root); + return false; +} + static int remove_all_stable_nodes(void) { struct stable_node *stable_node, *next; @@ -753,7 +907,8 @@ static int remove_all_stable_nodes(void) while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct stable_node, node); - if (remove_stable_node(stable_node)) { + if (remove_stable_node_chain(stable_node, + root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } @@ -1138,6 +1293,214 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, return err ? NULL : page; } +static __always_inline +bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) +{ + VM_BUG_ON(stable_node->rmap_hlist_len < 0); + /* + * Check that at least one mapping still exists, otherwise + * there's no much point to merge and share with this + * stable_node, as the underlying tree_page of the other + * sharer is going to be freed soon. + */ + return stable_node->rmap_hlist_len && + stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; +} + +static __always_inline +bool is_page_sharing_candidate(struct stable_node *stable_node) +{ + return __is_page_sharing_candidate(stable_node, 0); +} + +struct page *stable_node_dup(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; + struct hlist_node *hlist_safe; + struct page *_tree_page, *tree_page = NULL; + int nr = 0; + int found_rmap_hlist_len; + + if (!prune_stale_stable_nodes || + time_before(jiffies, stable_node->chain_prune_time + + msecs_to_jiffies( + ksm_stable_node_chains_prune_millisecs))) + prune_stale_stable_nodes = false; + else + stable_node->chain_prune_time = jiffies; + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + cond_resched(); + /* + * We must walk all stable_node_dup to prune the stale + * stable nodes during lookup. + * + * get_ksm_page can drop the nodes from the + * stable_node->hlist if they point to freed pages + * (that's why we do a _safe walk). The "dup" + * stable_node parameter itself will be freed from + * under us if it returns NULL. + */ + _tree_page = get_ksm_page(dup, false); + if (!_tree_page) + continue; + nr += 1; + if (is_page_sharing_candidate(dup)) { + if (!found || + dup->rmap_hlist_len > found_rmap_hlist_len) { + if (found) + put_page(tree_page); + found = dup; + found_rmap_hlist_len = found->rmap_hlist_len; + tree_page = _tree_page; + + /* skip put_page for found dup */ + if (!prune_stale_stable_nodes) + break; + continue; + } + } + put_page(_tree_page); + } + + if (found) { + /* + * nr is counting all dups in the chain only if + * prune_stale_stable_nodes is true, otherwise we may + * break the loop at nr == 1 even if there are + * multiple entries. + */ + if (prune_stale_stable_nodes && nr == 1) { + /* + * If there's not just one entry it would + * corrupt memory, better BUG_ON. In KSM + * context with no lock held it's not even + * fatal. + */ + BUG_ON(stable_node->hlist.first->next); + + /* + * There's just one entry and it is below the + * deduplication limit so drop the chain. + */ + rb_replace_node(&stable_node->node, &found->node, + root); + free_stable_node(stable_node); + ksm_stable_node_chains--; + ksm_stable_node_dups--; + /* + * NOTE: the caller depends on the stable_node + * to be equal to stable_node_dup if the chain + * was collapsed. + */ + *_stable_node = found; + /* + * Just for robustneess as stable_node is + * otherwise left as a stable pointer, the + * compiler shall optimize it away at build + * time. + */ + stable_node = NULL; + } else if (stable_node->hlist.first != &found->hlist_dup && + __is_page_sharing_candidate(found, 1)) { + /* + * If the found stable_node dup can accept one + * more future merge (in addition to the one + * that is underway) and is not at the head of + * the chain, put it there so next search will + * be quicker in the !prune_stale_stable_nodes + * case. + * + * NOTE: it would be inaccurate to use nr > 1 + * instead of checking the hlist.first pointer + * directly, because in the + * prune_stale_stable_nodes case "nr" isn't + * the position of the found dup in the chain, + * but the total number of dups in the chain. + */ + hlist_del(&found->hlist_dup); + hlist_add_head(&found->hlist_dup, + &stable_node->hlist); + } + } + + *_stable_node_dup = found; + return tree_page; +} + +static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, + struct rb_root *root) +{ + if (!is_stable_node_chain(stable_node)) + return stable_node; + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return NULL; + } + return hlist_entry(stable_node->hlist.first, + typeof(*stable_node), hlist_dup); +} + +/* + * Like for get_ksm_page, this function can free the *_stable_node and + * *_stable_node_dup if the returned tree_page is NULL. + * + * It can also free and overwrite *_stable_node with the found + * stable_node_dup if the chain is collapsed (in which case + * *_stable_node will be equal to *_stable_node_dup like if the chain + * never existed). It's up to the caller to verify tree_page is not + * NULL before dereferencing *_stable_node or *_stable_node_dup. + * + * *_stable_node_dup is really a second output parameter of this + * function and will be overwritten in all cases, the caller doesn't + * need to initialize it. + */ +static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + struct stable_node *stable_node = *_stable_node; + if (!is_stable_node_chain(stable_node)) { + if (is_page_sharing_candidate(stable_node)) { + *_stable_node_dup = stable_node; + return get_ksm_page(stable_node, false); + } + /* + * _stable_node_dup set to NULL means the stable_node + * reached the ksm_max_page_sharing limit. + */ + *_stable_node_dup = NULL; + return NULL; + } + return stable_node_dup(_stable_node_dup, _stable_node, root, + prune_stale_stable_nodes); +} + +static __always_inline struct page *chain_prune(struct stable_node **s_n_d, + struct stable_node **s_n, + struct rb_root *root) +{ + return __stable_node_chain(s_n_d, s_n, root, true); +} + +static __always_inline struct page *chain(struct stable_node **s_n_d, + struct stable_node *s_n, + struct rb_root *root) +{ + struct stable_node *old_stable_node = s_n; + struct page *tree_page; + + tree_page = __stable_node_chain(s_n_d, &s_n, root, false); + /* not pruning dups so s_n cannot have changed */ + VM_BUG_ON(s_n != old_stable_node); + return tree_page; +} + /* * stable_tree_search - search for page inside the stable tree * @@ -1153,7 +1516,7 @@ static struct page *stable_tree_search(struct page *page) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node; + struct stable_node *stable_node, *stable_node_dup, *stable_node_any; struct stable_node *page_node; page_node = page_stable_node(page); @@ -1175,7 +1538,44 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node, false); + stable_node_any = NULL; + tree_page = chain_prune(&stable_node_dup, &stable_node, root); + /* + * NOTE: stable_node may have been freed by + * chain_prune() if the returned stable_node_dup is + * not NULL. stable_node_dup may have been inserted in + * the rbtree instead as a regular stable_node (in + * order to collapse the stable_node chain if a single + * stable_node dup was found in it). In such case the + * stable_node is overwritten by the calleee to point + * to the stable_node_dup that was collapsed in the + * stable rbtree and stable_node will be equal to + * stable_node_dup like if the chain never existed. + */ + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. + */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * wrprotected at all times. Any will work + * fine to continue the walk. + */ + tree_page = get_ksm_page(stable_node_any, false); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { /* * If we walked over a stale stable_node, @@ -1198,6 +1598,34 @@ again: else if (ret > 0) new = &parent->rb_right; else { + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + /* + * Test if the migrated page should be merged + * into a stable node dup. If the mapcount is + * 1 we can migrate it with another KSM page + * without adding it to the chain. + */ + if (page_mapcount(page) > 1) + goto chain_append; + } + + if (!stable_node_dup) { + /* + * If the stable_node is a chain and + * we got a payload match in memcmp + * but we cannot merge the scanned + * page in any of the existing + * stable_node dups because they're + * all full, we need to wait the + * scanned page to find itself a match + * in the unstable tree to create a + * brand new KSM page to add later to + * the dups of this stable_node. + */ + return NULL; + } + /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page @@ -1205,23 +1633,21 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node, true); - if (tree_page) { - unlock_page(tree_page); - if (get_kpfn_nid(stable_node->kpfn) != - NUMA(stable_node->nid)) { - put_page(tree_page); - goto replace; - } - return tree_page; - } - /* - * There is now a place for page_node, but the tree may - * have been rebalanced, so re-evaluate parent and new. - */ - if (page_node) + tree_page = get_ksm_page(stable_node_dup, true); + if (unlikely(!tree_page)) + /* + * The tree may have been rebalanced, + * so re-evaluate parent and new. + */ goto again; - return NULL; + unlock_page(tree_page); + + if (get_kpfn_nid(stable_node_dup->kpfn) != + NUMA(stable_node_dup->nid)) { + put_page(tree_page); + goto replace; + } + return tree_page; } } @@ -1232,22 +1658,95 @@ again: DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); - get_page(page); - return page; +out: + if (is_page_sharing_candidate(page_node)) { + get_page(page); + return page; + } else + return NULL; replace: - if (page_node) { - list_del(&page_node->list); - DO_NUMA(page_node->nid = nid); - rb_replace_node(&stable_node->node, &page_node->node, root); - get_page(page); + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. + */ + if (stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); + /* there is no chain */ + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_replace_node(&stable_node_dup->node, + &page_node->node, + root); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + rb_erase(&stable_node_dup->node, root); + page = NULL; + } } else { - rb_erase(&stable_node->node, root); - page = NULL; + VM_BUG_ON(!is_stable_node_chain(stable_node)); + __stable_node_dup_del(stable_node_dup); + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + page = NULL; + } } - stable_node->head = &migrate_nodes; - list_add(&stable_node->list, stable_node->head); + stable_node_dup->head = &migrate_nodes; + list_add(&stable_node_dup->list, stable_node_dup->head); return page; + +chain_append: + /* stable_node_dup could be null if it reached the limit */ + if (!stable_node_dup) + stable_node_dup = stable_node_any; + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. + */ + if (stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(stable_node_dup, + root); + if (!stable_node) + return NULL; + } + /* + * Add this stable_node dup that was + * migrated to the stable_node chain + * of the current nid for this page + * content. + */ + VM_BUG_ON(!is_stable_node_chain(stable_node)); + VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + goto out; } /* @@ -1264,7 +1763,8 @@ static struct stable_node *stable_tree_insert(struct page *kpage) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node; + struct stable_node *stable_node, *stable_node_dup, *stable_node_any; + bool need_chain = false; kpfn = page_to_pfn(kpage); nid = get_kpfn_nid(kpfn); @@ -1279,7 +1779,32 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node, false); + stable_node_any = NULL; + tree_page = chain(&stable_node_dup, stable_node, root); + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. + */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * wrprotected at all times. Any will work + * fine to continue the walk. + */ + tree_page = get_ksm_page(stable_node_any, false); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { /* * If we walked over a stale stable_node, @@ -1302,27 +1827,37 @@ again: else if (ret > 0) new = &parent->rb_right; else { - /* - * It is not a bug that stable_tree_search() didn't - * find this node: because at that time our page was - * not yet write-protected, so may have changed since. - */ - return NULL; + need_chain = true; + break; } } - stable_node = alloc_stable_node(); - if (!stable_node) + stable_node_dup = alloc_stable_node(); + if (!stable_node_dup) return NULL; - INIT_HLIST_HEAD(&stable_node->hlist); - stable_node->kpfn = kpfn; - set_page_stable_node(kpage, stable_node); - DO_NUMA(stable_node->nid = nid); - rb_link_node(&stable_node->node, parent, new); - rb_insert_color(&stable_node->node, root); + INIT_HLIST_HEAD(&stable_node_dup->hlist); + stable_node_dup->kpfn = kpfn; + set_page_stable_node(kpage, stable_node_dup); + stable_node_dup->rmap_hlist_len = 0; + DO_NUMA(stable_node_dup->nid = nid); + if (!need_chain) { + rb_link_node(&stable_node_dup->node, parent, new); + rb_insert_color(&stable_node_dup->node, root); + } else { + if (!is_stable_node_chain(stable_node)) { + struct stable_node *orig = stable_node; + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(orig, root); + if (!stable_node) { + free_stable_node(stable_node_dup); + return NULL; + } + } + stable_node_chain_add_dup(stable_node_dup, stable_node); + } - return stable_node; + return stable_node_dup; } /* @@ -1412,8 +1947,27 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, * the same ksm page. */ static void stable_tree_append(struct rmap_item *rmap_item, - struct stable_node *stable_node) + struct stable_node *stable_node, + bool max_page_sharing_bypass) { + /* + * rmap won't find this mapping if we don't insert the + * rmap_item in the right stable_node + * duplicate. page_migration could break later if rmap breaks, + * so we can as well crash here. We really need to check for + * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check + * for other negative values as an undeflow if detected here + * for the first time (and not when decreasing rmap_hlist_len) + * would be sign of memory corruption in the stable_node. + */ + BUG_ON(stable_node->rmap_hlist_len < 0); + + stable_node->rmap_hlist_len++; + if (!max_page_sharing_bypass) + /* possibly non fatal but unexpected overflow, only warn */ + WARN_ON_ONCE(stable_node->rmap_hlist_len > + ksm_max_page_sharing); + rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); @@ -1441,19 +1995,26 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) struct page *kpage; unsigned int checksum; int err; + bool max_page_sharing_bypass = false; stable_node = page_stable_node(page); if (stable_node) { if (stable_node->head != &migrate_nodes && - get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { - rb_erase(&stable_node->node, - root_stable_tree + NUMA(stable_node->nid)); + get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != + NUMA(stable_node->nid)) { + stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; + /* + * If it's a KSM fork, allow it to go over the sharing limit + * without warnings. + */ + if (!is_page_sharing_candidate(stable_node)) + max_page_sharing_bypass = true; } /* We first start with searching the page inside the stable tree */ @@ -1473,7 +2034,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) * add its rmap_item to the stable tree. */ lock_page(kpage); - stable_tree_append(rmap_item, page_stable_node(kpage)); + stable_tree_append(rmap_item, page_stable_node(kpage), + max_page_sharing_bypass); unlock_page(kpage); } put_page(kpage); @@ -1523,8 +2085,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) lock_page(kpage); stable_node = stable_tree_insert(kpage); if (stable_node) { - stable_tree_append(tree_rmap_item, stable_node); - stable_tree_append(rmap_item, stable_node); + stable_tree_append(tree_rmap_item, stable_node, + false); + stable_tree_append(rmap_item, stable_node, + false); } unlock_page(kpage); @@ -2028,6 +2592,48 @@ static void wait_while_offlining(void) } } +static bool stable_node_dup_remove_range(struct stable_node *stable_node, + unsigned long start_pfn, + unsigned long end_pfn) +{ + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) { + /* + * Don't get_ksm_page, page has already gone: + * which is why we keep kpfn instead of page* + */ + remove_node_from_stable_tree(stable_node); + return true; + } + return false; +} + +static bool stable_node_chain_remove_range(struct stable_node *stable_node, + unsigned long start_pfn, + unsigned long end_pfn, + struct rb_root *root) +{ + struct stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + return stable_node_dup_remove_range(stable_node, start_pfn, + end_pfn); + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + stable_node_dup_remove_range(dup, start_pfn, end_pfn); + } + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return true; /* notify caller that tree was rebalanced */ + } else + return false; +} + static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { @@ -2039,15 +2645,12 @@ static void ksm_check_stable_tree(unsigned long start_pfn, node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct stable_node, node); - if (stable_node->kpfn >= start_pfn && - stable_node->kpfn < end_pfn) { - /* - * Don't get_ksm_page, page has already gone: - * which is why we keep kpfn instead of page* - */ - remove_node_from_stable_tree(stable_node); + if (stable_node_chain_remove_range(stable_node, + start_pfn, end_pfn, + root_stable_tree + + nid)) node = rb_first(root_stable_tree + nid); - } else + else node = rb_next(node); cond_resched(); } @@ -2293,6 +2896,47 @@ static ssize_t use_zero_pages_store(struct kobject *kobj, } KSM_ATTR(use_zero_pages); +static ssize_t max_page_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_max_page_sharing); +} + +static ssize_t max_page_sharing_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + int knob; + + err = kstrtoint(buf, 10, &knob); + if (err) + return err; + /* + * When a KSM page is created it is shared by 2 mappings. This + * being a signed comparison, it implicitly verifies it's not + * negative. + */ + if (knob < 2) + return -EINVAL; + + if (READ_ONCE(ksm_max_page_sharing) == knob) + return count; + + mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); + if (ksm_max_page_sharing != knob) { + if (ksm_pages_shared || remove_all_stable_nodes()) + err = -EBUSY; + else + ksm_max_page_sharing = knob; + } + mutex_unlock(&ksm_thread_mutex); + + return err ? err : count; +} +KSM_ATTR(max_page_sharing); + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2331,6 +2975,46 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t stable_node_dups_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_stable_node_dups); +} +KSM_ATTR_RO(stable_node_dups); + +static ssize_t stable_node_chains_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_stable_node_chains); +} +KSM_ATTR_RO(stable_node_chains); + +static ssize_t +stable_node_chains_prune_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); +} + +static ssize_t +stable_node_chains_prune_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + ksm_stable_node_chains_prune_millisecs = msecs; + + return count; +} +KSM_ATTR(stable_node_chains_prune_millisecs); + static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2350,6 +3034,10 @@ static struct attribute *ksm_attrs[] = { #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif + &max_page_sharing_attr.attr, + &stable_node_chains_attr.attr, + &stable_node_dups_attr.attr, + &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, NULL, }; diff --git a/mm/list_lru.c b/mm/list_lru.c index 234676e31edd..7a40fa2be858 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -117,6 +117,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) l = list_lru_from_kmem(nlru, item); list_add_tail(item, &l->list); l->nr_items++; + nlru->nr_items++; spin_unlock(&nlru->lock); return true; } @@ -136,6 +137,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) l = list_lru_from_kmem(nlru, item); list_del_init(item); l->nr_items--; + nlru->nr_items--; spin_unlock(&nlru->lock); return true; } @@ -183,15 +185,10 @@ EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) { - long count = 0; - int memcg_idx; + struct list_lru_node *nlru; - count += __list_lru_count_one(lru, nid, -1); - if (list_lru_memcg_aware(lru)) { - for_each_memcg_cache_index(memcg_idx) - count += __list_lru_count_one(lru, nid, memcg_idx); - } - return count; + nlru = &lru->node[nid]; + return nlru->nr_items; } EXPORT_SYMBOL_GPL(list_lru_count_node); @@ -226,6 +223,7 @@ restart: assert_spin_locked(&nlru->lock); case LRU_REMOVED: isolated++; + nlru->nr_items--; /* * If the lru lock has been dropped, our list * traversal is now invalid and so we have to diff --git a/mm/madvise.c b/mm/madvise.c index 25b78ee4fc2c..9976852f1e1c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -205,7 +205,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, continue; page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, - vma, index); + vma, index, false); if (page) put_page(page); } @@ -246,7 +246,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, } swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, - NULL, 0); + NULL, 0, false); if (page) put_page(page); } @@ -451,9 +451,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) - return -EINVAL; - /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) return -EINVAL; @@ -477,14 +474,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, return 0; } -static long madvise_free(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) -{ - *prev = vma; - return madvise_free_single_vma(vma, start, end); -} - /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about @@ -504,9 +493,17 @@ static long madvise_free(struct vm_area_struct *vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ -static long madvise_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +static long madvise_dontneed_single_vma(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + zap_page_range(vma, start, end - start); + return 0; +} + +static long madvise_dontneed_free(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + int behavior) { *prev = vma; if (!can_madv_dontneed_vma(vma)) @@ -526,7 +523,8 @@ static long madvise_dontneed(struct vm_area_struct *vma, * is also < vma->vm_end. If start < * vma->vm_start it means an hole materialized * in the user address space within the - * virtual range passed to MADV_DONTNEED. + * virtual range passed to MADV_DONTNEED + * or MADV_FREE. */ return -ENOMEM; } @@ -537,7 +535,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, * Don't fail if end > vma->vm_end. If the old * vma was splitted while the mmap_sem was * released the effect of the concurrent - * operation may not cause MADV_DONTNEED to + * operation may not cause madvise() to * have an undefined result. There may be an * adjacent next vma that we'll walk * next. userfaultfd_remove() will generate an @@ -549,8 +547,13 @@ static long madvise_dontneed(struct vm_area_struct *vma, } VM_WARN_ON(start >= end); } - zap_page_range(vma, start, end - start); - return 0; + + if (behavior == MADV_DONTNEED) + return madvise_dontneed_single_vma(vma, start, end); + else if (behavior == MADV_FREE) + return madvise_free_single_vma(vma, start, end); + else + return -EINVAL; } /* @@ -656,9 +659,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_FREE: - return madvise_free(vma, prev, start, end); case MADV_DONTNEED: - return madvise_dontneed(vma, prev, start, end); + return madvise_dontneed_free(vma, prev, start, end, behavior); default: return madvise_behavior(vma, prev, start, end, behavior); } diff --git a/mm/memblock.c b/mm/memblock.c index 7b8a5db76a2f..2cb25fe4452c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -54,9 +54,6 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; -#ifdef CONFIG_MOVABLE_NODE -bool movable_node_enabled __initdata_memblock = false; -#endif static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94172089f52f..3df3c04d73ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -170,7 +170,7 @@ struct mem_cgroup_event { */ poll_table pt; wait_queue_head_t *wqh; - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct remove; }; @@ -631,7 +631,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, val = __this_cpu_read(memcg->stat->nr_page_events); next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ - if ((long)next - (long)val < 0) { + if ((long)(next - val) < 0) { switch (target) { case MEM_CGROUP_TARGET_THRESH: next = val + THRESHOLDS_EVENTS_TARGET; @@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { struct mem_cgroup *memcg; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int memcg_oom_wake_function(wait_queue_t *wait, +static int memcg_oom_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; @@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle) owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; - INIT_LIST_HEAD(&owait.wait.task_list); + INIT_LIST_HEAD(&owait.wait.entry); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); mem_cgroup_mark_under_oom(memcg); @@ -2376,10 +2376,9 @@ void mem_cgroup_split_huge_fixup(struct page *head) #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) + int nr_entries) { - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEMCG_SWAP], val); + this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries); } /** @@ -2405,8 +2404,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, false); - mem_cgroup_swap_statistics(to, true); + mem_cgroup_swap_statistics(from, -1); + mem_cgroup_swap_statistics(to, 1); return 0; } return -EINVAL; @@ -3574,6 +3573,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); + seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; } @@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int memcg_event_wake(wait_queue_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct mem_cgroup_event *event = @@ -4122,6 +4122,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; + pn->lruvec_stat = alloc_percpu(struct lruvec_stat); + if (!pn->lruvec_stat) { + kfree(pn); + return 1; + } + lruvec_init(&pn->lruvec); pn->usage_in_excess = 0; pn->on_tree = false; @@ -4133,7 +4139,10 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->nodeinfo[node]); + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + + free_percpu(pn->lruvec_stat); + kfree(pn); } static void __mem_cgroup_free(struct mem_cgroup *memcg) @@ -5165,6 +5174,7 @@ static int memory_events_show(struct seq_file *m, void *v) seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); + seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; } @@ -5197,8 +5207,8 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "kernel_stack %llu\n", (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + - stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(stat[NR_SLAB_RECLAIMABLE] + + stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); @@ -5222,15 +5232,25 @@ static int memory_stat_show(struct seq_file *m, void *v) } seq_printf(m, "slab_reclaimable %llu\n", - (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + + events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + + events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "workingset_refault %lu\n", stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", @@ -5297,38 +5317,52 @@ struct cgroup_subsys memory_cgrp_subsys = { /** * mem_cgroup_low - check if memory consumption is below the normal range - * @root: the highest ancestor to consider + * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * * Returns %true if memory consumption of @memcg, and that of all - * configurable ancestors up to @root, is below the normal range. + * ancestors up to (but not including) @root, is below the normal range. + * + * @root is exclusive; it is never low when looked at directly and isn't + * checked when traversing the hierarchy. + * + * Excluding @root enables using memory.low to prioritize memory usage + * between cgroups within a subtree of the hierarchy that is limited by + * memory.high or memory.max. + * + * For example, given cgroup A with children B and C: + * + * A + * / \ + * B C + * + * and + * + * 1. A/memory.current > A/memory.high + * 2. A/B/memory.current < A/B/memory.low + * 3. A/C/memory.current >= A/C/memory.low + * + * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we + * should reclaim from 'C' until 'A' is no longer high or until we can + * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by + * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered + * low and we will reclaim indiscriminately from both 'B' and 'C'. */ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return false; - /* - * The toplevel group doesn't have a configurable range, so - * it's never low when looked at directly, and it is not - * considered an ancestor when assessing the hierarchy. - */ - - if (memcg == root_mem_cgroup) - return false; - - if (page_counter_read(&memcg->memory) >= memcg->low) + if (!root) + root = root_mem_cgroup; + if (memcg == root) return false; - while (memcg != root) { - memcg = parent_mem_cgroup(memcg); - - if (memcg == root_mem_cgroup) - break; - + for (; memcg != root; memcg = parent_mem_cgroup(memcg)) { if (page_counter_read(&memcg->memory) >= memcg->low) return false; } + return true; } @@ -5445,7 +5479,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry); + mem_cgroup_uncharge_swap(entry, nr_pages); } } @@ -5873,9 +5907,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * ancestor for the swap instead and transfer the memory+swap charge. */ swap_memcg = mem_cgroup_id_get_online(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, true); + mem_cgroup_swap_statistics(swap_memcg, 1); page->mem_cgroup = NULL; @@ -5902,19 +5936,20 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) css_put(&memcg->css); } -/* - * mem_cgroup_try_charge_swap - try charging a swap entry +/** + * mem_cgroup_try_charge_swap - try charging swap space for a page * @page: page being added to swap * @entry: swap entry to charge * - * Try to charge @entry to the memcg that @page belongs to. + * Try to charge @page's memcg for the swap space at @entry. * * Returns 0 on success, -ENOMEM on failure. */ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + unsigned int nr_pages = hpage_nr_pages(page); struct page_counter *counter; + struct mem_cgroup *memcg; unsigned short oldid; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) @@ -5929,25 +5964,27 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && - !page_counter_try_charge(&memcg->swap, 1, &counter)) { + !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { mem_cgroup_id_put(memcg); return -ENOMEM; } - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* Get references for the tail pages, too */ + if (nr_pages > 1) + mem_cgroup_id_get_many(memcg, nr_pages - 1); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(memcg, nr_pages); return 0; } /** - * mem_cgroup_uncharge_swap - uncharge a swap entry + * mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge - * - * Drop the swap charge associated with @entry. + * @nr_pages: the amount of swap space to uncharge */ -void mem_cgroup_uncharge_swap(swp_entry_t entry) +void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { struct mem_cgroup *memcg; unsigned short id; @@ -5955,18 +5992,18 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) if (!do_swap_account) return; - id = swap_cgroup_record(entry, 0); + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, 1); + page_counter_uncharge(&memcg->swap, nr_pages); else - page_counter_uncharge(&memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, nr_pages); } - mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_id_put(memcg); + mem_cgroup_swap_statistics(memcg, -nr_pages); + mem_cgroup_id_put_many(memcg, nr_pages); } rcu_read_unlock(); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ecc183fd94f3..1cd3b3569af8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -49,7 +49,6 @@ #include <linux/swap.h> #include <linux/backing-dev.h> #include <linux/migrate.h> -#include <linux/page-isolation.h> #include <linux/suspend.h> #include <linux/slab.h> #include <linux/swapops.h> @@ -555,6 +554,39 @@ static int delete_from_lru_cache(struct page *p) return -EIO; } +static int truncate_error_page(struct page *p, unsigned long pfn, + struct address_space *mapping) +{ + int ret = MF_FAILED; + + if (mapping->a_ops->error_remove_page) { + int err = mapping->a_ops->error_remove_page(mapping, p); + + if (err != 0) { + pr_info("Memory failure: %#lx: Failed to punch page: %d\n", + pfn, err); + } else if (page_has_private(p) && + !try_to_release_page(p, GFP_NOIO)) { + pr_info("Memory failure: %#lx: failed to release buffers\n", + pfn); + } else { + ret = MF_RECOVERED; + } + } else { + /* + * If the file system doesn't support it just invalidate + * This fails on dirty or anything with private pages + */ + if (invalidate_inode_page(p)) + ret = MF_RECOVERED; + else + pr_info("Memory failure: %#lx: Failed to invalidate\n", + pfn); + } + + return ret; +} + /* * Error hit kernel page. * Do nothing, try to be lucky and not touch this instead. For a few cases we @@ -579,8 +611,6 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { - int err; - int ret = MF_FAILED; struct address_space *mapping; delete_from_lru_cache(p); @@ -612,30 +642,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. */ - if (mapping->a_ops->error_remove_page) { - err = mapping->a_ops->error_remove_page(mapping, p); - if (err != 0) { - pr_info("Memory failure: %#lx: Failed to punch page: %d\n", - pfn, err); - } else if (page_has_private(p) && - !try_to_release_page(p, GFP_NOIO)) { - pr_info("Memory failure: %#lx: failed to release buffers\n", - pfn); - } else { - ret = MF_RECOVERED; - } - } else { - /* - * If the file system doesn't support it just invalidate - * This fails on dirty or anything with private pages - */ - if (invalidate_inode_page(p)) - ret = MF_RECOVERED; - else - pr_info("Memory failure: %#lx: Failed to invalidate\n", - pfn); - } - return ret; + return truncate_error_page(p, pfn, mapping); } /* @@ -684,7 +691,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) * the first EIO, but we're not worse than other parts * of the kernel. */ - mapping_set_error(mapping, EIO); + mapping_set_error(mapping, -EIO); } return me_pagecache_clean(p, pfn); @@ -741,24 +748,29 @@ static int me_huge_page(struct page *p, unsigned long pfn) { int res = 0; struct page *hpage = compound_head(p); + struct address_space *mapping; if (!PageHuge(hpage)) return MF_DELAYED; - /* - * We can safely recover from error on free or reserved (i.e. - * not in-use) hugepage by dequeuing it from freelist. - * To check whether a hugepage is in-use or not, we can't use - * page->lru because it can be used in other hugepage operations, - * such as __unmap_hugepage_range() and gather_surplus_pages(). - * So instead we use page_mapping() and PageAnon(). - */ - if (!(page_mapping(hpage) || PageAnon(hpage))) { - res = dequeue_hwpoisoned_huge_page(hpage); - if (!res) - return MF_RECOVERED; + mapping = page_mapping(hpage); + if (mapping) { + res = truncate_error_page(hpage, pfn, mapping); + } else { + unlock_page(hpage); + /* + * migration entry prevents later access on error anonymous + * hugepage, so we can free and dissolve it into buddy to + * save healthy subpages. + */ + if (PageAnon(hpage)) + put_page(hpage); + dissolve_free_huge_page(p); + res = MF_RECOVERED; + lock_page(hpage); } - return MF_DELAYED; + + return res; } /* @@ -857,7 +869,7 @@ static int page_action(struct page_state *ps, struct page *p, count = page_count(p) - 1; if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; - if (count != 0) { + if (count > 0) { pr_err("Memory failure: %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); result = MF_FAILED; @@ -1010,20 +1022,84 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, return unmap_success; } -static void set_page_hwpoison_huge_page(struct page *hpage) +static int identify_page_state(unsigned long pfn, struct page *p, + unsigned long page_flags) { - int i; - int nr_pages = 1 << compound_order(hpage); - for (i = 0; i < nr_pages; i++) - SetPageHWPoison(hpage + i); + struct page_state *ps; + + /* + * The first check uses the current page flags which may not have any + * relevant information. The second check with the saved page flags is + * carried out only if the first check can't determine the page status. + */ + for (ps = error_states;; ps++) + if ((p->flags & ps->mask) == ps->res) + break; + + page_flags |= (p->flags & (1UL << PG_dirty)); + + if (!ps->mask) + for (ps = error_states;; ps++) + if ((page_flags & ps->mask) == ps->res) + break; + return page_action(ps, p, pfn); } -static void clear_page_hwpoison_huge_page(struct page *hpage) +static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags) { - int i; - int nr_pages = 1 << compound_order(hpage); - for (i = 0; i < nr_pages; i++) - ClearPageHWPoison(hpage + i); + struct page *p = pfn_to_page(pfn); + struct page *head = compound_head(p); + int res; + unsigned long page_flags; + + if (TestSetPageHWPoison(head)) { + pr_err("Memory failure: %#lx: already hardware poisoned\n", + pfn); + return 0; + } + + num_poisoned_pages_inc(); + + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { + /* + * Check "filter hit" and "race with other subpage." + */ + lock_page(head); + if (PageHWPoison(head)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != head && TestSetPageHWPoison(head))) { + num_poisoned_pages_dec(); + unlock_page(head); + return 0; + } + } + unlock_page(head); + dissolve_free_huge_page(p); + action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED); + return 0; + } + + lock_page(head); + page_flags = head->flags; + + if (!PageHWPoison(head)) { + pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); + num_poisoned_pages_dec(); + unlock_page(head); + put_hwpoison_page(head); + return 0; + } + + if (!hwpoison_user_mappings(p, pfn, trapno, flags, &head)) { + action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); + res = -EBUSY; + goto out; + } + + res = identify_page_state(pfn, p, page_flags); +out: + unlock_page(head); + return res; } /** @@ -1046,12 +1122,10 @@ static void clear_page_hwpoison_huge_page(struct page *hpage) */ int memory_failure(unsigned long pfn, int trapno, int flags) { - struct page_state *ps; struct page *p; struct page *hpage; struct page *orig_head; int res; - unsigned int nr_pages; unsigned long page_flags; if (!sysctl_memory_failure_recovery) @@ -1064,34 +1138,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); - orig_head = hpage = compound_head(p); + if (PageHuge(p)) + return memory_failure_hugetlb(pfn, trapno, flags); if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); return 0; } - /* - * Currently errors on hugetlbfs pages are measured in hugepage units, - * so nr_pages should be 1 << compound_order. OTOH when errors are on - * transparent hugepages, they are supposed to be split and error - * measurement is done in normal page units. So nr_pages should be one - * in this case. - */ - if (PageHuge(p)) - nr_pages = 1 << compound_order(hpage); - else /* normal page or thp */ - nr_pages = 1; - num_poisoned_pages_add(nr_pages); + orig_head = hpage = compound_head(p); + num_poisoned_pages_inc(); /* * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: * prep_new_page() will be the gate keeper. - * 2) it's a free hugepage, which is also safe: - * an affected hugepage will be dequeued from hugepage freelist, - * so there's no concern about reusing it ever after. - * 3) it's part of a non-compound high order page. + * 2) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. @@ -1102,32 +1164,13 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (is_free_buddy_page(p)) { action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); return 0; - } else if (PageHuge(hpage)) { - /* - * Check "filter hit" and "race with other subpage." - */ - lock_page(hpage); - if (PageHWPoison(hpage)) { - if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != hpage && TestSetPageHWPoison(hpage))) { - num_poisoned_pages_sub(nr_pages); - unlock_page(hpage); - return 0; - } - } - set_page_hwpoison_huge_page(hpage); - res = dequeue_hwpoisoned_huge_page(hpage); - action_result(pfn, MF_MSG_FREE_HUGE, - res ? MF_IGNORED : MF_DELAYED); - unlock_page(hpage); - return res; } else { action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); return -EBUSY; } } - if (!PageHuge(p) && PageTransHuge(hpage)) { + if (PageTransHuge(hpage)) { lock_page(p); if (!PageAnon(p) || unlikely(split_huge_page(p))) { unlock_page(p); @@ -1138,7 +1181,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) pr_err("Memory failure: %#lx: thp split failed\n", pfn); if (TestClearPageHWPoison(p)) - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); put_hwpoison_page(p); return -EBUSY; } @@ -1165,7 +1208,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - lock_page(hpage); + lock_page(p); /* * The page could have changed compound pages during the locking. @@ -1194,42 +1237,23 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (!PageHWPoison(p)) { pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_sub(nr_pages); - unlock_page(hpage); - put_hwpoison_page(hpage); + num_poisoned_pages_dec(); + unlock_page(p); + put_hwpoison_page(p); return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - num_poisoned_pages_sub(nr_pages); - unlock_page(hpage); - put_hwpoison_page(hpage); + num_poisoned_pages_dec(); + unlock_page(p); + put_hwpoison_page(p); return 0; } - if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) + if (!PageTransTail(p) && !PageLRU(p)) goto identify_page_state; /* - * For error on the tail page, we should set PG_hwpoison - * on the head page to show that the hugepage is hwpoisoned - */ - if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { - action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); - unlock_page(hpage); - put_hwpoison_page(hpage); - return 0; - } - /* - * Set PG_hwpoison on all pages in an error hugepage, - * because containment is done in hugepage unit for now. - * Since we have done TestSetPageHWPoison() for the head page with - * page lock held, we can safely set PG_hwpoison bits on tail pages. - */ - if (PageHuge(p)) - set_page_hwpoison_huge_page(hpage); - - /* * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ @@ -1258,25 +1282,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } identify_page_state: - res = -EBUSY; - /* - * The first check uses the current page flags which may not have any - * relevant information. The second check with the saved page flagss is - * carried out only if the first check can't determine the page status. - */ - for (ps = error_states;; ps++) - if ((p->flags & ps->mask) == ps->res) - break; - - page_flags |= (p->flags & (1UL << PG_dirty)); - - if (!ps->mask) - for (ps = error_states;; ps++) - if ((page_flags & ps->mask) == ps->res) - break; - res = page_action(ps, p, pfn); + res = identify_page_state(pfn, p, page_flags); out: - unlock_page(hpage); + unlock_page(p); return res; } EXPORT_SYMBOL_GPL(memory_failure); @@ -1398,7 +1406,6 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; - unsigned int nr_pages; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1443,20 +1450,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); - if (!get_hwpoison_page(p)) { - /* - * Since HWPoisoned hugepage should have non-zero refcount, - * race between memory failure and unpoison seems to happen. - * In such case unpoison fails and memory failure runs - * to the end. - */ - if (PageHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n", - pfn, &unpoison_rs); - return 0; - } if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", @@ -1474,10 +1468,8 @@ int unpoison_memory(unsigned long pfn) if (TestClearPageHWPoison(page)) { unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", pfn, &unpoison_rs); - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); freeit = 1; - if (PageHuge(page)) - clear_page_hwpoison_huge_page(page); } unlock_page(page); @@ -1492,11 +1484,8 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - nid); - else - return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0); + + return new_page_nodemask(p, nid, &node_states[N_MEMORY]); } /* @@ -1603,15 +1592,8 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { - /* overcommit hugetlb page will be freed to buddy */ - if (PageHuge(page)) { - set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - SetPageHWPoison(page); - num_poisoned_pages_inc(); - } + if (PageHuge(page)) + dissolve_free_huge_page(page); } return ret; } @@ -1727,15 +1709,12 @@ static int soft_offline_in_use_page(struct page *page, int flags) static void soft_offline_free_page(struct page *page) { - if (PageHuge(page)) { - struct page *hpage = compound_head(page); + struct page *head = compound_head(page); - set_page_hwpoison_huge_page(hpage); - if (!dequeue_hwpoisoned_huge_page(hpage)) - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - if (!TestSetPageHWPoison(page)) - num_poisoned_pages_inc(); + if (!TestSetPageHWPoison(head)) { + num_poisoned_pages_inc(); + if (PageHuge(head)) + dissolve_free_huge_page(page); } } diff --git a/mm/memory.c b/mm/memory.c index bb11c474857e..0e517be91a89 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2719,7 +2719,7 @@ int do_swap_page(struct vm_fault *vmf) /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -3262,14 +3262,14 @@ static int fault_around_bytes_set(void *data, u64 val) fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, +DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { void *ret; - ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, + ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, &fault_around_bytes_fops); if (!ret) pr_warn("Failed to create fault_around_bytes in debugfs"); @@ -3591,7 +3591,7 @@ out: return 0; } -static int create_huge_pmd(struct vm_fault *vmf) +static inline int create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); @@ -3837,7 +3837,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT); + count_memcg_event_mm(vma->vm_mm, PGFAULT); /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); @@ -4014,8 +4014,6 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; ptep = pte_offset_map_lock(mm, pmd, address, ptlp); - if (!ptep) - goto out; if (!pte_present(*ptep)) goto unlock; *ptepp = ptep; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b63d7d1239df..8dccc317aac2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -52,32 +52,19 @@ static void generic_online_page(struct page *page); static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); -/* The same as the cpu_hotplug lock, but for memory hotplug. */ -static struct { - struct task_struct *active_writer; - struct mutex lock; /* Synchronizes accesses to refcount, */ - /* - * Also blocks the new readers during - * an ongoing mem hotplug operation. - */ - int refcount; +DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} mem_hotplug = { - .active_writer = NULL, - .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), - .refcount = 0, -#ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = {.name = "mem_hotplug.lock" }, -#endif -}; +void get_online_mems(void) +{ + percpu_down_read(&mem_hotplug_lock); +} -/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ -#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) -#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) -#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +void put_online_mems(void) +{ + percpu_up_read(&mem_hotplug_lock); +} + +bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE bool memhp_auto_online; @@ -97,60 +84,16 @@ static int __init setup_memhp_default_state(char *str) } __setup("memhp_default_state=", setup_memhp_default_state); -void get_online_mems(void) -{ - might_sleep(); - if (mem_hotplug.active_writer == current) - return; - memhp_lock_acquire_read(); - mutex_lock(&mem_hotplug.lock); - mem_hotplug.refcount++; - mutex_unlock(&mem_hotplug.lock); - -} - -void put_online_mems(void) -{ - if (mem_hotplug.active_writer == current) - return; - mutex_lock(&mem_hotplug.lock); - - if (WARN_ON(!mem_hotplug.refcount)) - mem_hotplug.refcount++; /* try to fix things up */ - - if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) - wake_up_process(mem_hotplug.active_writer); - mutex_unlock(&mem_hotplug.lock); - memhp_lock_release(); - -} - -/* Serializes write accesses to mem_hotplug.active_writer. */ -static DEFINE_MUTEX(memory_add_remove_lock); - void mem_hotplug_begin(void) { - mutex_lock(&memory_add_remove_lock); - - mem_hotplug.active_writer = current; - - memhp_lock_acquire(); - for (;;) { - mutex_lock(&mem_hotplug.lock); - if (likely(!mem_hotplug.refcount)) - break; - __set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&mem_hotplug.lock); - schedule(); - } + cpus_read_lock(); + percpu_down_write(&mem_hotplug_lock); } void mem_hotplug_done(void) { - mem_hotplug.active_writer = NULL; - mutex_unlock(&mem_hotplug.lock); - memhp_lock_release(); - mutex_unlock(&memory_add_remove_lock); + percpu_up_write(&mem_hotplug_lock); + cpus_read_unlock(); } /* add this memory to iomem resource */ @@ -300,229 +243,38 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long old_zone_end_pfn; - - zone_span_writelock(zone); - - old_zone_end_pfn = zone_end_pfn(zone); - if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) - zone->zone_start_pfn = start_pfn; - - zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - - zone->zone_start_pfn; - - zone_span_writeunlock(zone); -} - -static void resize_zone(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - zone_span_writelock(zone); - - if (end_pfn - start_pfn) { - zone->zone_start_pfn = start_pfn; - zone->spanned_pages = end_pfn - start_pfn; - } else { - /* - * make it consist as free_area_init_core(), - * if spanned_pages = 0, then keep start_pfn = 0 - */ - zone->zone_start_pfn = 0; - zone->spanned_pages = 0; - } - - zone_span_writeunlock(zone); -} - -static void fix_zone_id(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - enum zone_type zid = zone_idx(zone); - int nid = zone->zone_pgdat->node_id; - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) - set_page_links(pfn_to_page(pfn), zid, nid, pfn); -} - -/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or - * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ -static int __ref ensure_zone_is_initialized(struct zone *zone, - unsigned long start_pfn, unsigned long num_pages) -{ - if (!zone_is_initialized(zone)) - return init_currently_empty_zone(zone, start_pfn, num_pages); - - return 0; -} - -static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, - unsigned long start_pfn, unsigned long end_pfn) -{ - int ret; - unsigned long flags; - unsigned long z1_start_pfn; - - ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); - if (ret) - return ret; - - pgdat_resize_lock(z1->zone_pgdat, &flags); - - /* can't move pfns which are higher than @z2 */ - if (end_pfn > zone_end_pfn(z2)) - goto out_fail; - /* the move out part must be at the left most of @z2 */ - if (start_pfn > z2->zone_start_pfn) - goto out_fail; - /* must included/overlap */ - if (end_pfn <= z2->zone_start_pfn) - goto out_fail; - - /* use start_pfn for z1's start_pfn if z1 is empty */ - if (!zone_is_empty(z1)) - z1_start_pfn = z1->zone_start_pfn; - else - z1_start_pfn = start_pfn; - - resize_zone(z1, z1_start_pfn, end_pfn); - resize_zone(z2, end_pfn, zone_end_pfn(z2)); - - pgdat_resize_unlock(z1->zone_pgdat, &flags); - - fix_zone_id(z1, start_pfn, end_pfn); - - return 0; -out_fail: - pgdat_resize_unlock(z1->zone_pgdat, &flags); - return -1; -} - -static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, - unsigned long start_pfn, unsigned long end_pfn) +static int __meminit __add_section(int nid, unsigned long phys_start_pfn, + bool want_memblock) { int ret; - unsigned long flags; - unsigned long z2_end_pfn; - - ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); - if (ret) - return ret; - - pgdat_resize_lock(z1->zone_pgdat, &flags); - - /* can't move pfns which are lower than @z1 */ - if (z1->zone_start_pfn > start_pfn) - goto out_fail; - /* the move out part mast at the right most of @z1 */ - if (zone_end_pfn(z1) > end_pfn) - goto out_fail; - /* must included/overlap */ - if (start_pfn >= zone_end_pfn(z1)) - goto out_fail; - - /* use end_pfn for z2's end_pfn if z2 is empty */ - if (!zone_is_empty(z2)) - z2_end_pfn = zone_end_pfn(z2); - else - z2_end_pfn = end_pfn; - - resize_zone(z1, z1->zone_start_pfn, start_pfn); - resize_zone(z2, start_pfn, z2_end_pfn); - - pgdat_resize_unlock(z1->zone_pgdat, &flags); - - fix_zone_id(z2, start_pfn, end_pfn); - - return 0; -out_fail: - pgdat_resize_unlock(z1->zone_pgdat, &flags); - return -1; -} - -static struct zone * __meminit move_pfn_range(int zone_shift, - unsigned long start_pfn, unsigned long end_pfn) -{ - struct zone *zone = page_zone(pfn_to_page(start_pfn)); - int ret = 0; - - if (zone_shift < 0) - ret = move_pfn_range_left(zone + zone_shift, zone, - start_pfn, end_pfn); - else if (zone_shift) - ret = move_pfn_range_right(zone, zone + zone_shift, - start_pfn, end_pfn); - - if (ret) - return NULL; - - return zone + zone_shift; -} - -static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); - - if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) - pgdat->node_start_pfn = start_pfn; - - pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - - pgdat->node_start_pfn; -} + int i; -static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; - int nid = pgdat->node_id; - int zone_type; - unsigned long flags, pfn; - int ret; + if (pfn_valid(phys_start_pfn)) + return -EEXIST; - zone_type = zone - pgdat->node_zones; - ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); - if (ret) + ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); + if (ret < 0) return ret; - pgdat_resize_lock(zone->zone_pgdat, &flags); - grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); - grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, - phys_start_pfn + nr_pages); - pgdat_resize_unlock(zone->zone_pgdat, &flags); - memmap_init_zone(nr_pages, nid, zone_type, - phys_start_pfn, MEMMAP_HOTPLUG); - - /* online_page_range is called later and expects pages reserved */ - for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { + /* + * Make all the pages reserved so that nobody will stumble over half + * initialized state. + * FIXME: We also have to associate it with a node because pfn_to_node + * relies on having page with the proper node. + */ + for (i = 0; i < PAGES_PER_SECTION; i++) { + unsigned long pfn = phys_start_pfn + i; + struct page *page; if (!pfn_valid(pfn)) continue; - SetPageReserved(pfn_to_page(pfn)); + page = pfn_to_page(pfn); + set_page_node(page, nid); + SetPageReserved(page); } - return 0; -} - -static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn) -{ - int ret; - - if (pfn_valid(phys_start_pfn)) - return -EEXIST; - - ret = sparse_add_one_section(zone, phys_start_pfn); - - if (ret < 0) - return ret; - ret = __add_zone(zone, phys_start_pfn); - - if (ret < 0) - return ret; + if (!want_memblock) + return 0; return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } @@ -533,16 +285,14 @@ static int __meminit __add_section(int nid, struct zone *zone, * call this function after deciding the zone to which to * add the new pages. */ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) +int __ref __add_pages(int nid, unsigned long phys_start_pfn, + unsigned long nr_pages, bool want_memblock) { unsigned long i; int err = 0; int start_sec, end_sec; struct vmem_altmap *altmap; - clear_zone_contiguous(zone); - /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -562,7 +312,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i)); + err = __add_section(nid, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision @@ -575,7 +325,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } vmemmap_populate_print_last(); out: - set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -772,11 +521,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; - int zone_type; unsigned long flags; - zone_type = zone - pgdat->node_zones; - pgdat_resize_lock(zone->zone_pgdat, &flags); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); @@ -939,33 +685,20 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; struct page *page; + if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); (*online_page_callback)(page); onlined_pages++; } + + online_mem_sections(start_pfn, start_pfn + nr_pages); + *(unsigned long *)arg = onlined_pages; return 0; } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have - * normal memory. - */ -static bool can_online_high_movable(struct zone *zone) -{ - return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure every online node has NORMAL memory */ -static bool can_online_high_movable(struct zone *zone) -{ - return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); -} -#endif /* CONFIG_MOVABLE_NODE */ - /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -1040,39 +773,144 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift) +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) { - struct zone *zone = page_zone(pfn_to_page(pfn)); - enum zone_type idx = zone_idx(zone); - int i; + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); - *zone_shift = 0; + /* + * TODO there shouldn't be any inherent reason to have ZONE_NORMAL + * physically before ZONE_MOVABLE. All we need is they do not + * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE + * though so let's stick with it for simplicity for now. + * TODO make sure we do not overlap with ZONE_DEVICE + */ + if (online_type == MMOP_ONLINE_KERNEL) { + if (zone_is_empty(movable_zone)) + return true; + return movable_zone->zone_start_pfn >= pfn + nr_pages; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + return zone_end_pfn(default_zone) <= pfn; + } - if (idx < target) { - /* pages must be at end of current zone */ - if (pfn + nr_pages != zone_end_pfn(zone)) - return false; + /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ + return online_type == MMOP_ONLINE_KEEP; +} + +static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = zone_end_pfn(zone); + + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; +} - /* no zones in use between current zone and target */ - for (i = idx + 1; i < target; i++) - if (zone_is_initialized(zone - idx + i)) - return false; +static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = pgdat_end_pfn(pgdat); + + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; +} + +void __ref move_pfn_range_to_zone(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nid = pgdat->node_id; + unsigned long flags; + + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); + + clear_zone_contiguous(zone); + + /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ + pgdat_resize_lock(pgdat, &flags); + zone_span_writelock(zone); + resize_zone_range(zone, start_pfn, nr_pages); + zone_span_writeunlock(zone); + resize_pgdat_range(pgdat, start_pfn, nr_pages); + pgdat_resize_unlock(pgdat, &flags); + + /* + * TODO now we have a visible range of pages which are not associated + * with their zone properly. Not nice but set_pfnblock_flags_mask + * expects the zone spans the pfn range. All the pages in the range + * are reserved so nobody should be touching them so we should be safe + */ + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); + + set_zone_contiguous(zone); +} + +/* + * Returns a default kernel memory zone for the given pfn range. + * If no kernel zone covers this pfn range it will automatically go + * to the ZONE_NORMAL. + */ +struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, + unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + int zid; + + for (zid = 0; zid <= ZONE_NORMAL; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (zone_intersects(zone, start_pfn, nr_pages)) + return zone; } - if (target < idx) { - /* pages must be at beginning of current zone */ - if (pfn != zone->zone_start_pfn) - return false; + return &pgdat->node_zones[ZONE_NORMAL]; +} + +static inline bool movable_pfn_range(int nid, struct zone *default_zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + if (!allow_online_pfn_range(nid, start_pfn, nr_pages, + MMOP_ONLINE_KERNEL)) + return true; + + if (!movable_node_is_enabled()) + return false; - /* no zones in use between current zone and target */ - for (i = target + 1; i < idx; i++) - if (zone_is_initialized(zone - idx + i)) - return false; + return !zone_intersects(default_zone, start_pfn, nr_pages); +} + +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. + */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); + + if (online_type == MMOP_ONLINE_KEEP) { + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + /* + * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use + * movable zone if that is not possible (e.g. we are within + * or past the existing movable zone). movable_node overrides + * this default and defaults to movable zone + */ + if (movable_pfn_range(nid, zone, start_pfn, nr_pages)) + zone = movable_zone; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + zone = &pgdat->node_zones[ZONE_MOVABLE]; } - *zone_shift = target - idx; - return true; + move_pfn_range_to_zone(zone, start_pfn, nr_pages); + return zone; } /* Must be protected by mem_hotplug_begin() */ @@ -1085,38 +923,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; - int zone_shift = 0; - - /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); - if ((zone_idx(zone) > ZONE_NORMAL || - online_type == MMOP_ONLINE_MOVABLE) && - !can_online_high_movable(zone)) + nid = pfn_to_nid(pfn); + if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL) { - if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) - return -EINVAL; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) - return -EINVAL; - } - - zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); - if (!zone) - return -EINVAL; + /* associate pfn range with the zone */ + zone = move_pfn_range(online_type, nid, pfn, nr_pages); arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = zone_to_nid(zone); - ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); if (ret) @@ -1311,39 +1129,6 @@ static int check_hotplug_memory_range(u64 start, u64 size) return 0; } -/* - * If movable zone has already been setup, newly added memory should be check. - * If its address is higher than movable zone, it should be added as movable. - * Without this check, movable zone may overlap with other zone. - */ -static int should_add_memory_movable(int nid, u64 start, u64 size) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; - - if (zone_is_empty(movable_zone)) - return 0; - - if (movable_zone->zone_start_pfn <= start_pfn) - return 1; - - return 0; -} - -int zone_for_memory(int nid, u64 start, u64 size, int zone_default, - bool for_device) -{ -#ifdef CONFIG_ZONE_DEVICE - if (for_device) - return ZONE_DEVICE; -#endif - if (should_add_memory_movable(nid, start, size)) - return ZONE_MOVABLE; - - return zone_default; -} - static int online_memory_block(struct memory_block *mem, void *arg) { return device_online(&mem->dev); @@ -1389,7 +1174,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, false); + ret = arch_add_memory(nid, start, size, true); if (ret < 0) goto error; @@ -1398,7 +1183,22 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) node_set_online(nid); if (new_node) { - ret = register_one_node(nid); + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + ret = __register_one_node(nid); + if (ret) + goto register_fail; + + /* + * link memory sections under this node. This is already + * done when creatig memory section in register_new_memory + * but that depends to have the node registered so offline + * nodes have to go through register_node. + * TODO clean up this mess. + */ + ret = link_mem_sections(nid, start_pfn, nr_pages); +register_fail: /* * If sysfs file of new node can't create, cpu on the node * can't be hot-added. There is no rollback way now. @@ -1419,7 +1219,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) error: /* rollback pgdat allocation and others */ - if (new_pgdat) + if (new_pgdat && pgdat) rollback_node_hotadd(nid, pgdat); memblock_remove(start, size); @@ -1571,34 +1371,19 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) static struct page *new_node_page(struct page *page, unsigned long private, int **result) { - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; - struct page *new_page = NULL; /* - * TODO: allocate a destination hugepage from a nearest neighbor node, - * accordance with memory policy of the user process if possible. For - * now as a simple work-around, we use the next node for destination. + * try to allocate from a different node but reuse this node if there + * are no other online nodes to be used (e.g. we are offlining a part + * of the only existing node) */ - if (PageHuge(page)) - return alloc_huge_page_node(page_hstate(compound_head(page)), - next_node_in(nid, nmask)); - node_clear(nid, nmask); + if (nodes_empty(nmask)) + node_set(nid, nmask); - if (PageHighMem(page) - || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) - gfp_mask |= __GFP_HIGHMEM; - - if (!nodes_empty(nmask)) - new_page = __alloc_pages_nodemask(gfp_mask, 0, - node_zonelist(nid, gfp_mask), &nmask); - if (!new_page) - new_page = __alloc_pages(gfp_mask, 0, - node_zonelist(nid, gfp_mask)); - - return new_page; + return new_page_nodemask(page, nid, &nmask); } #define NR_OFFLINE_AT_ONCE_PAGES (256) @@ -1725,47 +1510,12 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have - * normal memory. - */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ - return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure the node has NORMAL memory if it is still online */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long present_pages = 0; - enum zone_type zt; - - for (zt = 0; zt <= ZONE_NORMAL; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - - if (present_pages > nr_pages) - return true; - - present_pages = 0; - for (; zt <= ZONE_MOVABLE; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - - /* - * we can't offline the last normal memory until all - * higher memory is offlined. - */ - return present_pages == 0; -} -#endif /* CONFIG_MOVABLE_NODE */ - static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_MOVABLE_NODE +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; #else - pr_warn("movable_node option not supported\n"); + pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); #endif return 0; } @@ -1887,9 +1637,6 @@ static int __ref __offline_pages(unsigned long start_pfn, node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; - if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) - return -EINVAL; - /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); @@ -1919,7 +1666,7 @@ repeat: goto failed_removal; ret = 0; if (drain) { - lru_add_drain_all(); + lru_add_drain_all_cpuslocked(); cond_resched(); drain_all_pages(zone); } @@ -1940,7 +1687,7 @@ repeat: } } /* drain all zone's lru pagevec, this is asynchronous... */ - lru_add_drain_all(); + lru_add_drain_all_cpuslocked(); yield(); /* drain pcp pages, this is synchronous. */ drain_all_pages(zone); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 37d0b334bfe9..d911fa5cb2a7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - /* - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) @@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } -/* - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ -static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; @@ -325,41 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - /* - * if step == 1, we use ->w.cpuset_mems_allowed to cache the - * result - */ - if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { - nodes_remap(tmp, pol->v.nodes, - pol->w.cpuset_mems_allowed, *nodes); - pol->w.cpuset_mems_allowed = step ? tmp : *nodes; - } else if (step == MPOL_REBIND_STEP2) { - tmp = pol->w.cpuset_mems_allowed; - pol->w.cpuset_mems_allowed = *nodes; - } else - BUG(); + nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = tmp; } if (nodes_empty(tmp)) tmp = *nodes; - if (step == MPOL_REBIND_STEP1) - nodes_or(pol->v.nodes, pol->v.nodes, tmp); - else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) - pol->v.nodes = tmp; - else - BUG(); - - if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node_in(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = numa_node_id(); - } + pol->v.nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes, - enum mpol_rebind_step step) + const nodemask_t *nodes) { nodemask_t tmp; @@ -385,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol, /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes + * Per-vma policies are protected by mmap_sem. Allocations using per-task + * policies are protected by task->mems_allowed_seq to prevent a premature + * OOM/allocation failure due to parallel nodemask modification. */ -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, - enum mpol_rebind_step step) +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && + if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) - return; - - if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) - BUG(); - - if (step == MPOL_REBIND_STEP1) - pol->flags |= MPOL_F_REBINDING; - else if (step == MPOL_REBIND_STEP2) - pol->flags &= ~MPOL_F_REBINDING; - else if (step >= MPOL_REBIND_NSTEP) - BUG(); - - mpol_ops[pol->mode].rebind(pol, newmask, step); + mpol_ops[pol->mode].rebind(pol, newmask); } /* @@ -430,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { - mpol_rebind_policy(tsk->mempolicy, new, step); + mpol_rebind_policy(tsk->mempolicy, new); } /* @@ -448,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); + mpol_rebind_policy(vma->vm_policy, new); up_write(&mm->mmap_sem); } @@ -812,9 +743,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE && - nodes_weight(new->v.nodes)) - current->il_next = first_node(new->v.nodes); + if (new && new->mode == MPOL_INTERLEAVE) + current->il_prev = MAX_NUMNODES-1; task_unlock(current); mpol_put(old); ret = 0; @@ -916,7 +846,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { - *policy = current->il_next; + *policy = next_node_in(current->il_prev, pol->v.nodes); } else { err = -EINVAL; goto out; @@ -1148,7 +1078,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) /* * if !vma, alloc_page_vma() will use task or system default policy */ - return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL, + vma, address); } #else @@ -1676,9 +1607,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) return NULL; } -/* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, - int nd) +/* Return the node id preferred by the given mempolicy, or the given id */ +static int policy_node(gfp_t gfp, struct mempolicy *policy, + int nd) { if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) nd = policy->v.preferred_node; @@ -1691,20 +1622,19 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } - return node_zonelist(nd, gfp); + return nd; } /* Do dynamic interleaving for a process */ static unsigned interleave_nodes(struct mempolicy *policy) { - unsigned nid, next; + unsigned next; struct task_struct *me = current; - nid = me->il_next; - next = next_node_in(nid, policy->v.nodes); + next = next_node_in(me->il_prev, policy->v.nodes); if (next < MAX_NUMNODES) - me->il_next = next; - return nid; + me->il_prev = next; + return next; } /* @@ -1799,38 +1729,37 @@ static inline unsigned interleave_nid(struct mempolicy *pol, #ifdef CONFIG_HUGETLBFS /* - * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * - * Returns a zonelist suitable for a huge page allocation and a pointer + * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * * Must be protected by read_mems_allowed_begin() */ -struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags, struct mempolicy **mpol, - nodemask_t **nodemask) +int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask) { - struct zonelist *zl; + int nid; *mpol = get_vma_policy(vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { - zl = node_zonelist(interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))), gfp_flags); + nid = interleave_nid(*mpol, vma, addr, + huge_page_shift(hstate_vma(vma))); } else { - zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); + nid = policy_node(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } - return zl; + return nid; } /* @@ -1932,12 +1861,10 @@ out: static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) { - struct zonelist *zl; struct page *page; - zl = node_zonelist(nid, gfp); - page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) + page = __alloc_pages(gfp, order, nid); + if (page && page_to_nid(page) == nid) inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1971,13 +1898,10 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, { struct mempolicy *pol; struct page *page; - unsigned int cpuset_mems_cookie; - struct zonelist *zl; + int preferred_nid; nodemask_t *nmask; -retry_cpuset: pol = get_vma_policy(vma, addr); - cpuset_mems_cookie = read_mems_allowed_begin(); if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; @@ -2015,12 +1939,10 @@ retry_cpuset: } nmask = policy_nodemask(gfp, pol); - zl = policy_zonelist(gfp, pol, node); - page = __alloc_pages_nodemask(gfp, order, zl, nmask); + preferred_nid = policy_node(gfp, pol, node); + page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; return page; } @@ -2038,23 +1960,15 @@ out: * Allocate a page from the kernel page pool. When not in * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. - * - * Don't call cpuset_update_task_memory_state() unless - * 1) it's ok to take cpuset_sem (can WAIT), and - * 2) allocating for current task (not interrupt). */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; - unsigned int cpuset_mems_cookie; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); - /* * No reference counting needed for current->mempolicy * nor system default_policy @@ -2063,12 +1977,9 @@ retry_cpuset: page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, numa_node_id()), + policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - return page; } EXPORT_SYMBOL(alloc_pages_current); @@ -2112,10 +2023,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - if (new->flags & MPOL_F_REBINDING) - mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); - else - mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); + mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; diff --git a/mm/mempool.c b/mm/mempool.c index 47a659dedd44..1c0294858527 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); diff --git a/mm/migrate.c b/mm/migrate.c index 89a0a1707f4c..627671551873 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -227,25 +227,26 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); + flush_dcache_page(new); #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, vma, new, 0); - } -#endif - flush_dcache_page(new); - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - - if (PageHuge(new)) { + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); - } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, pvmw.address, false); - else - page_add_file_rmap(new, false); + } else +#endif + { + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + if (PageAnon(new)) + page_add_anon_rmap(new, vma, pvmw.address, false); + else + page_add_file_rmap(new, false); + } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); @@ -1251,6 +1252,8 @@ put_anon: out: if (rc != -EAGAIN) putback_active_hugepage(hpage); + if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage)) + num_poisoned_pages_inc(); /* * If migration was not successful and there's a freeing callback, use @@ -1913,7 +1916,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int page_lru = page_is_file_cache(page); unsigned long mmun_start = address & HPAGE_PMD_MASK; unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; - pmd_t orig_entry; /* * Rate-limit the amount of data that is being migrated to a node. @@ -1956,8 +1958,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Recheck the target PMD */ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { -fail_putback: + if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1979,7 +1980,6 @@ fail_putback: goto out_unlock; } - orig_entry = *pmd; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1996,15 +1996,7 @@ fail_putback: set_pmd_at(mm, mmun_start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); - if (page_count(page) != 2) { - set_pmd_at(mm, mmun_start, pmd, orig_entry); - flush_pmd_tlb_range(vma, mmun_start, mmun_end); - mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); - update_mmu_cache_pmd(vma, address, &entry); - page_remove_rmap(new_page, true); - goto fail_putback; - } - + page_ref_unfreeze(page, 2); mlock_migrate_page(new_page, page); page_remove_rmap(page, true); set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); diff --git a/mm/mmap.c b/mm/mmap.c index a5e3dcd75e79..f19efcf75418 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -94,7 +94,7 @@ static void unmap_region(struct mm_struct *mm, * w: (no) no * x: (yes) yes */ -pgprot_t protection_map[16] = { +pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 }; @@ -2177,7 +2177,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; - struct rlimit *rlim = current->signal->rlim; unsigned long new_start; /* address space limit tests */ @@ -2185,7 +2184,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, return -ENOMEM; /* Stack limit test */ - if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) + if (size > rlimit(RLIMIT_STACK)) return -ENOMEM; /* mlock limit tests */ @@ -2193,7 +2192,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long locked; unsigned long limit; locked = mm->locked_vm + grow; - limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); + limit = rlimit(RLIMIT_MEMLOCK); limit >>= PAGE_SHIFT; if (locked > limit && !capable(CAP_IPC_LOCK)) return -ENOMEM; @@ -2232,7 +2231,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Guard against exceeding limits of the address space. */ address &= PAGE_MASK; - if (address >= TASK_SIZE) + if (address >= (TASK_SIZE & PAGE_MASK)) return -ENOMEM; address += PAGE_SIZE; @@ -2244,7 +2243,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) gap_addr = TASK_SIZE; next = vma->vm_next; - if (next && next->vm_start < gap_addr) { + if (next && next->vm_start < gap_addr && + (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2315,7 +2315,6 @@ int expand_downwards(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; - unsigned long gap_addr; int error; address &= PAGE_MASK; @@ -2324,14 +2323,12 @@ int expand_downwards(struct vm_area_struct *vma, return error; /* Enforce stack_guard_gap */ - gap_addr = address - stack_guard_gap; - if (gap_addr > address) - return -ENOMEM; prev = vma->vm_prev; - if (prev && prev->vm_end > gap_addr) { - if (!(prev->vm_flags & VM_GROWSDOWN)) + /* Check that both stack segments have the same anon_vma? */ + if (prev && !(prev->vm_flags & VM_GROWSDOWN) && + (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { + if (address - prev->vm_end < stack_guard_gap) return -ENOMEM; - /* Check that both stack segments have the same anon_vma? */ } /* We must make sure the anon_vma is allocated. */ @@ -3186,8 +3183,12 @@ static int special_mapping_mremap(struct vm_area_struct *new_vma) { struct vm_special_mapping *sm = new_vma->vm_private_data; + if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) + return -EFAULT; + if (sm->mremap) return sm->mremap(sm, new_vma); + return 0; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 8edd0d576254..1a8c9ca83e48 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,8 +58,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * reading. */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (!pte) - return 0; /* Get target node for single threaded private VMAs */ if (prot_numa && !(vma->vm_flags & VM_SHARED) && diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 487dad610731..36454d0f96ee 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -118,7 +118,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start, unsigned long end_pfn = min_t(unsigned long, PFN_DOWN(end), max_low_pfn); - if (start_pfn > end_pfn) + if (start_pfn >= end_pfn) return 0; __free_pages_memory(start_pfn, end_pfn); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 04c9143a8625..9e8b4f030c1c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -490,6 +490,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) if (!down_read_trylock(&mm->mmap_sem)) { ret = false; + trace_skip_task_reaping(tsk->pid); goto unlock_oom; } @@ -500,9 +501,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) */ if (!mmget_not_zero(mm)) { up_read(&mm->mmap_sem); + trace_skip_task_reaping(tsk->pid); goto unlock_oom; } + trace_start_task_reaping(tsk->pid); + /* * Tell all users of get_user/copy_from_user etc... that the content * is no longer stable. No barriers really needed because unmapping @@ -544,6 +548,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * put the oom_reaper out of the way. */ mmput_async(mm); + trace_finish_task_reaping(tsk->pid); unlock_oom: mutex_unlock(&oom_lock); return ret; @@ -615,6 +620,7 @@ static void wake_oom_reaper(struct task_struct *tsk) tsk->oom_reaper_list = oom_reaper_list; oom_reaper_list = tsk; spin_unlock(&oom_reaper_lock); + trace_wake_reaper(tsk->pid); wake_up(&oom_reaper_wait); } @@ -666,6 +672,7 @@ static void mark_oom_victim(struct task_struct *tsk) */ __thaw_task(tsk); atomic_inc(&oom_victims); + trace_mark_victim(tsk->pid); } /** @@ -876,6 +883,11 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; mmgrab(mm); + + /* Raise event before sending signal: task reaper must see this */ + count_vm_event(OOM_KILL); + count_memcg_event_mm(mm, OOM_KILL); + /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 143c1c25d680..96e93b214d31 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -601,7 +601,7 @@ static inline void __wb_writeout_inc(struct bdi_writeback *wb) { struct wb_domain *cgdom; - __inc_wb_stat(wb, WB_WRITTEN); + inc_wb_stat(wb, WB_WRITTEN); wb_domain_writeout_inc(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac); @@ -2366,15 +2366,15 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) } /** - * write_one_page - write out a single page and optionally wait on I/O + * write_one_page - write out a single page and wait on I/O * @page: the page to write - * @wait: if true, wait on writeout * * The page must be locked by the caller and will be unlocked upon return. * - * write_one_page() returns a negative error code if I/O failed. + * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this + * function returns. */ -int write_one_page(struct page *page, int wait) +int write_one_page(struct page *page) { struct address_space *mapping = page->mapping; int ret = 0; @@ -2385,21 +2385,20 @@ int write_one_page(struct page *page, int wait) BUG_ON(!PageLocked(page)); - if (wait) - wait_on_page_writeback(page); + wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { get_page(page); ret = mapping->a_ops->writepage(page, &wbc); - if (ret == 0 && wait) { + if (ret == 0) wait_on_page_writeback(page); - if (PageError(page)) - ret = -EIO; - } put_page(page); } else { unlock_page(page); } + + if (!ret) + ret = filemap_check_errors(mapping); return ret; } EXPORT_SYMBOL(write_one_page); @@ -2433,12 +2432,11 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) inode_attach_wb(inode, page); wb = inode_to_wb(inode); - inc_memcg_page_state(page, NR_FILE_DIRTY); - __inc_node_page_state(page, NR_FILE_DIRTY); + __inc_lruvec_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); - __inc_wb_stat(wb, WB_RECLAIMABLE); - __inc_wb_stat(wb, WB_DIRTIED); + inc_wb_stat(wb, WB_RECLAIMABLE); + inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2455,8 +2453,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - dec_memcg_page_state(page, NR_FILE_DIRTY); - dec_node_page_state(page, NR_FILE_DIRTY); + dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_SIZE); @@ -2712,8 +2709,7 @@ int clear_page_dirty_for_io(struct page *page) */ wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - dec_memcg_page_state(page, NR_FILE_DIRTY); - dec_node_page_state(page, NR_FILE_DIRTY); + dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; @@ -2745,7 +2741,7 @@ int test_clear_page_writeback(struct page *page) if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); - __dec_wb_stat(wb, WB_WRITEBACK); + dec_wb_stat(wb, WB_WRITEBACK); __wb_writeout_inc(wb); } } @@ -2759,8 +2755,7 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - dec_memcg_page_state(page, NR_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK); + dec_lruvec_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } @@ -2791,7 +2786,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) - __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); /* * We can come through here when swapping anonymous @@ -2814,8 +2809,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - inc_memcg_page_state(page, NR_WRITEBACK); - inc_node_page_state(page, NR_WRITEBACK); + inc_lruvec_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } unlock_page_memcg(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2302f250d6b1..6d30e914afb6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -113,9 +113,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = { { [0] = 1UL } }, #endif -#ifdef CONFIG_MOVABLE_NODE [N_MEMORY] = { { [0] = 1UL } }, -#endif [N_CPU] = { { [0] = 1UL } }, #endif /* NUMA */ }; @@ -511,7 +509,7 @@ static int page_is_consistent(struct zone *zone, struct page *page) /* * Temporary debugging check for pages not lying within a given zone. */ -static int bad_range(struct zone *zone, struct page *page) +static int __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) return 1; @@ -521,7 +519,7 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } #else -static inline int bad_range(struct zone *zone, struct page *page) +static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) { return 0; } @@ -1297,8 +1295,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn) #endif #ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { int nid; @@ -1320,8 +1319,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { return true; } -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { return true; } @@ -1365,7 +1365,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) return NULL; - start_page = pfn_to_page(start_pfn); + start_page = pfn_to_online_page(start_pfn); + if (!start_page) + return NULL; if (page_zone(start_page) != zone) return NULL; @@ -2204,19 +2206,26 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * list of requested migratetype, possibly along with other pages from the same * block, depending on fragmentation avoidance heuristics. Returns true if * fallback was found so that __rmqueue_smallest() can grab it. + * + * The use of signed ints for order and current_order is a deliberate + * deviation from the rest of this file, to make the for loop + * condition simpler. */ static inline bool -__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { struct free_area *area; - unsigned int current_order; + int current_order; struct page *page; int fallback_mt; bool can_steal; - /* Find the largest possible block of pages in the other list */ - for (current_order = MAX_ORDER-1; - current_order >= order && current_order <= MAX_ORDER-1; + /* + * Find the largest available free page in the other list. This roughly + * approximates finding the pageblock with the most free pages, which + * would be too costly to do exactly. + */ + for (current_order = MAX_ORDER - 1; current_order >= order; --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2224,19 +2233,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) if (fallback_mt == -1) continue; - page = list_first_entry(&area->free_list[fallback_mt], - struct page, lru); + /* + * We cannot steal all free pages from the pageblock and the + * requested migratetype is movable. In that case it's better to + * steal and split the smallest available page instead of the + * largest available page, because even if the next movable + * allocation falls back into a different pageblock than this + * one, it won't cause permanent fragmentation. + */ + if (!can_steal && start_migratetype == MIGRATE_MOVABLE + && current_order > order) + goto find_smallest; - steal_suitable_fallback(zone, page, start_migratetype, - can_steal); + goto do_steal; + } - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, fallback_mt); + return false; - return true; +find_smallest: + for (current_order = order; current_order < MAX_ORDER; + current_order++) { + area = &(zone->free_area[current_order]); + fallback_mt = find_suitable_fallback(area, current_order, + start_migratetype, false, &can_steal); + if (fallback_mt != -1) + break; } - return false; + /* + * This should not happen - we already found a suitable fallback + * when looking for the largest page. + */ + VM_BUG_ON(current_order == MAX_ORDER); + +do_steal: + page = list_first_entry(&area->free_list[fallback_mt], + struct page, lru); + + steal_suitable_fallback(zone, page, start_migratetype, can_steal); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + + return true; + } /* @@ -3244,6 +3284,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; + /* + * We have already exhausted all our reclaim opportunities without any + * success so it is time to admit defeat. We will skip the OOM killer + * because it is very likely that the caller has a more reasonable + * fallback than shooting a random task. + */ + if (gfp_mask & __GFP_RETRY_MAYFAIL) + goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; @@ -3373,7 +3421,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, } /* - * !costly requests are much more important than __GFP_REPEAT + * !costly requests are much more important than __GFP_RETRY_MAYFAIL * costly ones because they are de facto nofail and invoke OOM * killer to move on while costly can fail and users are ready * to cope with that. 1/4 retries is rather arbitrary but we @@ -3673,6 +3721,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, return false; } +static inline bool +check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) +{ + /* + * It's possible that cpuset's mems_allowed and the nodemask from + * mempolicy don't intersect. This should be normally dealt with by + * policy_nodemask(), but it's possible to race with cpuset update in + * such a way the check therein was true, and then it became false + * before we got our cpuset_mems_cookie here. + * This assumes that for all allocations, ac->nodemask can come only + * from MPOL_BIND mempolicy (whose documented semantics is to be ignored + * when it does not intersect with the cpuset restrictions) or the + * caller can deal with a violated nodemask. + */ + if (cpusets_enabled() && ac->nodemask && + !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { + ac->nodemask = NULL; + return true; + } + + /* + * When updating a task's mems_allowed or mempolicy nodemask, it is + * possible to race with parallel threads in such a way that our + * allocation can fail while the mask is being updated. If we are about + * to fail, check if the cpuset changed during allocation and if so, + * retry. + */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + return true; + + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -3847,9 +3928,9 @@ retry: /* * Do not retry costly high order allocations unless they are - * __GFP_REPEAT + * __GFP_RETRY_MAYFAIL */ - if (costly_order && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -3868,11 +3949,9 @@ retry: &compaction_retries)) goto retry; - /* - * It's possible we raced with cpuset update so the OOM would be - * premature (see below the nopage: label for full explanation). - */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + + /* Deal with possible cpuset update races before we start OOM killing */ + if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset; /* Reclaim has failed us, start killing things */ @@ -3893,14 +3972,8 @@ retry: } nopage: - /* - * When updating a task's mems_allowed or mempolicy nodemask, it is - * possible to race with parallel threads in such a way that our - * allocation can fail while the mask is being updated. If we are about - * to fail, check if the cpuset changed during allocation and if so, - * retry. - */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + /* Deal with possible cpuset update races before we fail */ + if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset; /* @@ -3951,12 +4024,12 @@ got_pg: } static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask, + int preferred_nid, nodemask_t *nodemask, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { ac->high_zoneidx = gfp_zone(gfp_mask); - ac->zonelist = zonelist; + ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; ac->migratetype = gfpflags_to_migratetype(gfp_mask); @@ -4001,8 +4074,8 @@ static inline void finalise_ac(gfp_t gfp_mask, * This is the 'heart' of the zoned buddy allocator. */ struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4010,7 +4083,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct alloc_context ac = { }; gfp_mask &= gfp_allowed_mask; - if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) + if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; finalise_ac(gfp_mask, order, &ac); @@ -4614,8 +4687,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " slab_reclaimable:%lukB" - " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" " bounce:%lukB" @@ -4637,8 +4708,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), - K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), @@ -5124,6 +5193,7 @@ static void build_zonelists(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void setup_zone_pageset(struct zone *zone); /* @@ -5216,7 +5286,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) #endif /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, pgdat, NULL); + stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); @@ -5528,7 +5598,7 @@ static __meminit void zone_pcp_init(struct zone *zone) zone_batchsize(zone)); } -int __meminit init_currently_empty_zone(struct zone *zone, +void __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size) { @@ -5546,8 +5616,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_init_free_lists(zone); zone->initialized = 1; - - return 0; } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6005,7 +6073,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - int ret; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6027,6 +6094,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); + pgdat->per_cpu_nodestats = &boot_nodestats; + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; @@ -6087,8 +6156,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, size); - BUG_ON(ret); + init_currently_empty_zone(zone, zone_start_pfn, size); memmap_init(size, nid, j, zone_start_pfn); } } @@ -7182,6 +7250,21 @@ static unsigned long __init arch_reserved_kernel_pages(void) #endif /* + * Adaptive scale is meant to reduce sizes of hash tables on large memory + * machines. As memory size is increased the scale is also increased but at + * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory + * quadruples the scale is increased by one, which means the size of hash table + * only doubles, instead of quadrupling as well. + * Because 32-bit systems cannot have large physical memory, where this scaling + * makes sense, it is disabled on such platforms. + */ +#if __BITS_PER_LONG > 32 +#define ADAPT_SCALE_BASE (64ul << 30) +#define ADAPT_SCALE_SHIFT 2 +#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) +#endif + +/* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 * quantity of entries @@ -7200,6 +7283,7 @@ void *__init alloc_large_system_hash(const char *tablename, unsigned long long max = high_limit; unsigned long log2qty, size; void *table = NULL; + gfp_t gfp_flags; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -7211,6 +7295,16 @@ void *__init alloc_large_system_hash(const char *tablename, if (PAGE_SHIFT < 20) numentries = round_up(numentries, (1<<20)/PAGE_SIZE); +#if __BITS_PER_LONG > 32 + if (!high_limit) { + unsigned long adapt; + + for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; + adapt <<= ADAPT_SCALE_SHIFT) + scale++; + } +#endif + /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) numentries >>= (scale - PAGE_SHIFT); @@ -7244,12 +7338,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); + /* + * memblock allocator returns zeroed memory already, so HASH_ZERO is + * currently not used when HASH_EARLY is specified. + */ + gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; if (flags & HASH_EARLY) table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) - table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags, PAGE_KERNEL); else { /* * If bucketsize is not a power-of-two, we may free @@ -7257,8 +7356,8 @@ void *__init alloc_large_system_hash(const char *tablename, * alloc_pages_exact() automatically does */ if (get_order(size) < MAX_ORDER) { - table = alloc_pages_exact(size, GFP_ATOMIC); - kmemleak_alloc(table, size, 1, GFP_ATOMIC); + table = alloc_pages_exact(size, gfp_flags); + kmemleak_alloc(table, size, 1, gfp_flags); } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -7660,6 +7759,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) break; if (pfn == end_pfn) return; + offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); pfn = start_pfn; diff --git a/mm/page_io.c b/mm/page_io.c index 23f6d0d3470f..b6c4ac388209 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); /* * We failed to write the page out to swap-space. @@ -117,8 +117,9 @@ static void swap_slot_free_notify(struct page *page) static void end_swap_bio_read(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; + struct task_struct *waiter = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); ClearPageUptodate(page); pr_alert("Read-error on swap-device (%u:%u:%llu)\n", @@ -132,7 +133,9 @@ static void end_swap_bio_read(struct bio *bio) swap_slot_free_notify(page); out: unlock_page(page); + WRITE_ONCE(bio->bi_private, NULL); bio_put(bio); + wake_up_process(waiter); } int generic_swapfile_activate(struct swap_info_struct *sis, @@ -329,11 +332,13 @@ out: return ret; } -int swap_readpage(struct page *page) +int swap_readpage(struct page *page, bool do_poll) { struct bio *bio; int ret = 0; struct swap_info_struct *sis = page_swap_info(page); + blk_qc_t qc; + struct block_device *bdev; VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -372,9 +377,23 @@ int swap_readpage(struct page *page) ret = -ENOMEM; goto out; } + bdev = bio->bi_bdev; + bio->bi_private = current; bio_set_op_attrs(bio, REQ_OP_READ, 0); count_vm_event(PSWPIN); - submit_bio(bio); + bio_get(bio); + qc = submit_bio(bio); + while (do_poll) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio->bi_private)) + break; + + if (!blk_mq_poll(bdev_get_queue(bdev), qc)) + break; + } + __set_current_state(TASK_RUNNING); + bio_put(bio); + out: return ret; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5092e4ef00c8..757410d9f758 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -8,6 +8,7 @@ #include <linux/memory.h> #include <linux/hugetlb.h> #include <linux/page_owner.h> +#include <linux/migrate.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -138,12 +139,18 @@ static inline struct page * __first_valid_page(unsigned long pfn, unsigned long nr_pages) { int i; - for (i = 0; i < nr_pages; i++) - if (pfn_valid_within(pfn + i)) - break; - if (unlikely(i == nr_pages)) - return NULL; - return pfn_to_page(pfn + i); + + for (i = 0; i < nr_pages; i++) { + struct page *page; + + if (!pfn_valid_within(pfn + i)) + continue; + page = pfn_to_online_page(pfn + i); + if (!page) + continue; + return page; + } + return NULL; } /* @@ -184,8 +191,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, undo: for (pfn = start_pfn; pfn < undo_pfn; - pfn += pageblock_nr_pages) - unset_migratetype_isolate(pfn_to_page(pfn), migratetype); + pfn += pageblock_nr_pages) { + struct page *page = pfn_to_online_page(pfn); + if (!page) + continue; + unset_migratetype_isolate(page, migratetype); + } return -EBUSY; } @@ -284,20 +295,5 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, struct page *alloc_migrate_target(struct page *page, unsigned long private, int **resultp) { - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; - - /* - * TODO: allocate a destination hugepage from a nearest neighbor node, - * accordance with memory policy of the user process if possible. For - * now as a simple work-around, we use the next node for destination. - */ - if (PageHuge(page)) - return alloc_huge_page_node(page_hstate(compound_head(page)), - next_node_in(page_to_nid(page), - node_online_map)); - - if (PageHighMem(page)) - gfp_mask |= __GFP_HIGHMEM; - - return alloc_page(gfp_mask); + return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]); } diff --git a/mm/page_owner.c b/mm/page_owner.c index 60634dc53a88..0fd9dcf2c5dc 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -281,7 +281,11 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; if (PageBuddy(page)) { - pfn += (1UL << page_order(page)) - 1; + unsigned long freepage_order; + + freepage_order = page_order_unsafe(page); + if (freepage_order < MAX_ORDER) + pfn += (1UL << freepage_order) - 1; continue; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index de9c40d7304a..8ec6ba230bb9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -116,7 +116,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address); + pvmw->pte = huge_pte_offset(mm, pvmw->address, + PAGE_SIZE << compound_order(page)); if (!pvmw->pte) return false; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 60f7856e508f..1a4197965415 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -180,12 +180,13 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); + unsigned long sz = huge_page_size(h); pte_t *pte; int err = 0; do { next = hugetlb_entry_end(h, addr, end); - pte = huge_pte_offset(walk->mm, addr & hmask); + pte = huge_pte_offset(walk->mm, addr & hmask, sz); if (pte && walk->hugetlb_entry) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); if (err) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h new file mode 100644 index 000000000000..cd2442e13d8f --- /dev/null +++ b/mm/percpu-internal.h @@ -0,0 +1,166 @@ +#ifndef _MM_PERCPU_INTERNAL_H +#define _MM_PERCPU_INTERNAL_H + +#include <linux/types.h> +#include <linux/percpu.h> + +struct pcpu_chunk { +#ifdef CONFIG_PERCPU_STATS + int nr_alloc; /* # of allocations */ + size_t max_alloc_size; /* largest allocation size */ +#endif + + struct list_head list; /* linked to pcpu_slot lists */ + int free_size; /* free bytes in the chunk */ + int contig_hint; /* max contiguous size hint */ + void *base_addr; /* base address of this chunk */ + + int map_used; /* # of map entries used before the sentry */ + int map_alloc; /* # of map entries allocated */ + int *map; /* allocation map */ + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ + + void *data; /* chunk data */ + int first_free; /* no free below this */ + bool immutable; /* no [de]population allowed */ + bool has_reserved; /* Indicates if chunk has reserved space + at the beginning. Reserved chunk will + contain reservation for static chunk. + Dynamic chunk will contain reservation + for static and reserved chunks. */ + int nr_populated; /* # of populated pages */ + unsigned long populated[]; /* populated bitmap */ +}; + +extern spinlock_t pcpu_lock; + +extern struct list_head *pcpu_slot; +extern int pcpu_nr_slots; + +extern struct pcpu_chunk *pcpu_first_chunk; +extern struct pcpu_chunk *pcpu_reserved_chunk; + +#ifdef CONFIG_PERCPU_STATS + +#include <linux/spinlock.h> + +struct percpu_stats { + u64 nr_alloc; /* lifetime # of allocations */ + u64 nr_dealloc; /* lifetime # of deallocations */ + u64 nr_cur_alloc; /* current # of allocations */ + u64 nr_max_alloc; /* max # of live allocations */ + u32 nr_chunks; /* current # of live chunks */ + u32 nr_max_chunks; /* max # of live chunks */ + size_t min_alloc_size; /* min allocaiton size */ + size_t max_alloc_size; /* max allocation size */ +}; + +extern struct percpu_stats pcpu_stats; +extern struct pcpu_alloc_info pcpu_stats_ai; + +/* + * For debug purposes. We don't care about the flexible array. + */ +static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) +{ + memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); + + /* initialize min_alloc_size to unit_size */ + pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; +} + +/* + * pcpu_stats_area_alloc - increment area allocation stats + * @chunk: the location of the area being allocated + * @size: size of area to allocate in bytes + * + * CONTEXT: + * pcpu_lock. + */ +static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) +{ + lockdep_assert_held(&pcpu_lock); + + pcpu_stats.nr_alloc++; + pcpu_stats.nr_cur_alloc++; + pcpu_stats.nr_max_alloc = + max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); + pcpu_stats.min_alloc_size = + min(pcpu_stats.min_alloc_size, size); + pcpu_stats.max_alloc_size = + max(pcpu_stats.max_alloc_size, size); + + chunk->nr_alloc++; + chunk->max_alloc_size = max(chunk->max_alloc_size, size); +} + +/* + * pcpu_stats_area_dealloc - decrement allocation stats + * @chunk: the location of the area being deallocated + * + * CONTEXT: + * pcpu_lock. + */ +static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) +{ + lockdep_assert_held(&pcpu_lock); + + pcpu_stats.nr_dealloc++; + pcpu_stats.nr_cur_alloc--; + + chunk->nr_alloc--; +} + +/* + * pcpu_stats_chunk_alloc - increment chunk stats + */ +static inline void pcpu_stats_chunk_alloc(void) +{ + unsigned long flags; + spin_lock_irqsave(&pcpu_lock, flags); + + pcpu_stats.nr_chunks++; + pcpu_stats.nr_max_chunks = + max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); + + spin_unlock_irqrestore(&pcpu_lock, flags); +} + +/* + * pcpu_stats_chunk_dealloc - decrement chunk stats + */ +static inline void pcpu_stats_chunk_dealloc(void) +{ + unsigned long flags; + spin_lock_irqsave(&pcpu_lock, flags); + + pcpu_stats.nr_chunks--; + + spin_unlock_irqrestore(&pcpu_lock, flags); +} + +#else + +static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) +{ +} + +static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) +{ +} + +static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) +{ +} + +static inline void pcpu_stats_chunk_alloc(void) +{ +} + +static inline void pcpu_stats_chunk_dealloc(void) +{ +} + +#endif /* !CONFIG_PERCPU_STATS */ + +#endif diff --git a/mm/percpu-km.c b/mm/percpu-km.c index d66911ff42d9..eb58aa4c0997 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -72,6 +72,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void) pcpu_chunk_populated(chunk, 0, nr_pages); spin_unlock_irq(&pcpu_lock); + pcpu_stats_chunk_alloc(); + trace_percpu_create_chunk(chunk->base_addr); + return chunk; } @@ -79,7 +82,13 @@ static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) { const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; - if (chunk && chunk->data) + if (!chunk) + return; + + pcpu_stats_chunk_dealloc(); + trace_percpu_destroy_chunk(chunk->base_addr); + + if (chunk->data) __free_pages(chunk->data, order_base_2(nr_pages)); pcpu_free_chunk(chunk); } diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c new file mode 100644 index 000000000000..03524a56eeff --- /dev/null +++ b/mm/percpu-stats.c @@ -0,0 +1,222 @@ +/* + * mm/percpu-debug.c + * + * Copyright (C) 2017 Facebook Inc. + * Copyright (C) 2017 Dennis Zhou <dennisz@fb.com> + * + * This file is released under the GPLv2. + * + * Prints statistics about the percpu allocator and backing chunks. + */ +#include <linux/debugfs.h> +#include <linux/list.h> +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/sort.h> +#include <linux/vmalloc.h> + +#include "percpu-internal.h" + +#define P(X, Y) \ + seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y) + +struct percpu_stats pcpu_stats; +struct pcpu_alloc_info pcpu_stats_ai; + +static int cmpint(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +/* + * Iterates over all chunks to find the max # of map entries used. + */ +static int find_max_map_used(void) +{ + struct pcpu_chunk *chunk; + int slot, max_map_used; + + max_map_used = 0; + for (slot = 0; slot < pcpu_nr_slots; slot++) + list_for_each_entry(chunk, &pcpu_slot[slot], list) + max_map_used = max(max_map_used, chunk->map_used); + + return max_map_used; +} + +/* + * Prints out chunk state. Fragmentation is considered between + * the beginning of the chunk to the last allocation. + */ +static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, + void *buffer) +{ + int i, s_index, last_alloc, alloc_sign, as_len; + int *alloc_sizes, *p; + /* statistics */ + int sum_frag = 0, max_frag = 0; + int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; + + alloc_sizes = buffer; + s_index = chunk->has_reserved ? 1 : 0; + + /* find last allocation */ + last_alloc = -1; + for (i = chunk->map_used - 1; i >= s_index; i--) { + if (chunk->map[i] & 1) { + last_alloc = i; + break; + } + } + + /* if the chunk is not empty - ignoring reserve */ + if (last_alloc >= s_index) { + as_len = last_alloc + 1 - s_index; + + /* + * Iterate through chunk map computing size info. + * The first bit is overloaded to be a used flag. + * negative = free space, positive = allocated + */ + for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) { + alloc_sign = (*p & 1) ? 1 : -1; + alloc_sizes[i] = alloc_sign * + ((p[1] & ~1) - (p[0] & ~1)); + } + + sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL); + + /* Iterate through the unallocated fragements. */ + for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) { + sum_frag -= *p; + max_frag = max(max_frag, -1 * (*p)); + } + + cur_min_alloc = alloc_sizes[i]; + cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2]; + cur_max_alloc = alloc_sizes[as_len - 1]; + } + + P("nr_alloc", chunk->nr_alloc); + P("max_alloc_size", chunk->max_alloc_size); + P("free_size", chunk->free_size); + P("contig_hint", chunk->contig_hint); + P("sum_frag", sum_frag); + P("max_frag", max_frag); + P("cur_min_alloc", cur_min_alloc); + P("cur_med_alloc", cur_med_alloc); + P("cur_max_alloc", cur_max_alloc); + seq_putc(m, '\n'); +} + +static int percpu_stats_show(struct seq_file *m, void *v) +{ + struct pcpu_chunk *chunk; + int slot, max_map_used; + void *buffer; + +alloc_buffer: + spin_lock_irq(&pcpu_lock); + max_map_used = find_max_map_used(); + spin_unlock_irq(&pcpu_lock); + + buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0])); + if (!buffer) + return -ENOMEM; + + spin_lock_irq(&pcpu_lock); + + /* if the buffer allocated earlier is too small */ + if (max_map_used < find_max_map_used()) { + spin_unlock_irq(&pcpu_lock); + vfree(buffer); + goto alloc_buffer; + } + +#define PL(X) \ + seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X) + + seq_printf(m, + "Percpu Memory Statistics\n" + "Allocation Info:\n" + "----------------------------------------\n"); + PL(unit_size); + PL(static_size); + PL(reserved_size); + PL(dyn_size); + PL(atom_size); + PL(alloc_size); + seq_putc(m, '\n'); + +#undef PL + +#define PU(X) \ + seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X) + + seq_printf(m, + "Global Stats:\n" + "----------------------------------------\n"); + PU(nr_alloc); + PU(nr_dealloc); + PU(nr_cur_alloc); + PU(nr_max_alloc); + PU(nr_chunks); + PU(nr_max_chunks); + PU(min_alloc_size); + PU(max_alloc_size); + seq_putc(m, '\n'); + +#undef PU + + seq_printf(m, + "Per Chunk Stats:\n" + "----------------------------------------\n"); + + if (pcpu_reserved_chunk) { + seq_puts(m, "Chunk: <- Reserved Chunk\n"); + chunk_map_stats(m, pcpu_reserved_chunk, buffer); + } + + for (slot = 0; slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + if (chunk == pcpu_first_chunk) { + seq_puts(m, "Chunk: <- First Chunk\n"); + chunk_map_stats(m, chunk, buffer); + + + } else { + seq_puts(m, "Chunk:\n"); + chunk_map_stats(m, chunk, buffer); + } + + } + } + + spin_unlock_irq(&pcpu_lock); + + vfree(buffer); + + return 0; +} + +static int percpu_stats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, percpu_stats_show, NULL); +} + +static const struct file_operations percpu_stats_fops = { + .open = percpu_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init init_percpu_stats_debugfs(void) +{ + debugfs_create_file("percpu_stats", 0444, NULL, NULL, + &percpu_stats_fops); + + return 0; +} + +late_initcall(init_percpu_stats_debugfs); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 9ac639499bd1..15dab691ea70 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -343,12 +343,22 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = vms; chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; + + pcpu_stats_chunk_alloc(); + trace_percpu_create_chunk(chunk->base_addr); + return chunk; } static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) { - if (chunk && chunk->data) + if (!chunk) + return; + + pcpu_stats_chunk_dealloc(); + trace_percpu_destroy_chunk(chunk->base_addr); + + if (chunk->data) pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); pcpu_free_chunk(chunk); } diff --git a/mm/percpu.c b/mm/percpu.c index e0aa8ae7bde7..bd4130a69bbc 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -76,6 +76,11 @@ #include <asm/tlbflush.h> #include <asm/io.h> +#define CREATE_TRACE_POINTS +#include <trace/events/percpu.h> + +#include "percpu-internal.h" + #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 @@ -103,53 +108,35 @@ #define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) #endif /* CONFIG_SMP */ -struct pcpu_chunk { - struct list_head list; /* linked to pcpu_slot lists */ - int free_size; /* free bytes in the chunk */ - int contig_hint; /* max contiguous size hint */ - void *base_addr; /* base address of this chunk */ - - int map_used; /* # of map entries used before the sentry */ - int map_alloc; /* # of map entries allocated */ - int *map; /* allocation map */ - struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ - - void *data; /* chunk data */ - int first_free; /* no free below this */ - bool immutable; /* no [de]population allowed */ - int nr_populated; /* # of populated pages */ - unsigned long populated[]; /* populated bitmap */ -}; - -static int pcpu_unit_pages __read_mostly; -static int pcpu_unit_size __read_mostly; -static int pcpu_nr_units __read_mostly; -static int pcpu_atom_size __read_mostly; -static int pcpu_nr_slots __read_mostly; -static size_t pcpu_chunk_struct_size __read_mostly; +static int pcpu_unit_pages __ro_after_init; +static int pcpu_unit_size __ro_after_init; +static int pcpu_nr_units __ro_after_init; +static int pcpu_atom_size __ro_after_init; +int pcpu_nr_slots __ro_after_init; +static size_t pcpu_chunk_struct_size __ro_after_init; /* cpus with the lowest and highest unit addresses */ -static unsigned int pcpu_low_unit_cpu __read_mostly; -static unsigned int pcpu_high_unit_cpu __read_mostly; +static unsigned int pcpu_low_unit_cpu __ro_after_init; +static unsigned int pcpu_high_unit_cpu __ro_after_init; /* the address of the first chunk which starts with the kernel static area */ -void *pcpu_base_addr __read_mostly; +void *pcpu_base_addr __ro_after_init; EXPORT_SYMBOL_GPL(pcpu_base_addr); -static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ -const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ +static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ /* group information, used for vm allocation */ -static int pcpu_nr_groups __read_mostly; -static const unsigned long *pcpu_group_offsets __read_mostly; -static const size_t *pcpu_group_sizes __read_mostly; +static int pcpu_nr_groups __ro_after_init; +static const unsigned long *pcpu_group_offsets __ro_after_init; +static const size_t *pcpu_group_sizes __ro_after_init; /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different * ways and thus often doesn't live in the vmalloc area. */ -static struct pcpu_chunk *pcpu_first_chunk; +struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first @@ -158,13 +145,13 @@ static struct pcpu_chunk *pcpu_first_chunk; * area doesn't exist, the following variables contain NULL and 0 * respectively. */ -static struct pcpu_chunk *pcpu_reserved_chunk; -static int pcpu_reserved_chunk_limit; +struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; +static int pcpu_reserved_chunk_limit __ro_after_init; -static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ +DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ -static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */ /* chunks which need their map areas extended, protected by pcpu_lock */ static LIST_HEAD(pcpu_map_extend_chunks); @@ -672,6 +659,9 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, int to_free = 0; int *p; + lockdep_assert_held(&pcpu_lock); + pcpu_stats_area_dealloc(chunk); + freeme |= 1; /* we are searching for <given offset, in use> pair */ i = 0; @@ -735,6 +725,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map[0] = 0; chunk->map[1] = pcpu_unit_size | 1; chunk->map_used = 1; + chunk->has_reserved = false; INIT_LIST_HEAD(&chunk->list); INIT_LIST_HEAD(&chunk->map_extend_list); @@ -965,8 +956,10 @@ restart: * tasks to create chunks simultaneously. Serialize and create iff * there's still no empty chunk after grabbing the mutex. */ - if (is_atomic) + if (is_atomic) { + err = "atomic alloc failed, no space left"; goto fail; + } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(); @@ -984,6 +977,7 @@ restart: goto restart; area_found: + pcpu_stats_area_alloc(chunk, size); spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ @@ -1026,11 +1020,17 @@ area_found: ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size, gfp); + + trace_percpu_alloc_percpu(reserved, is_atomic, size, align, + chunk->base_addr, off, ptr); + return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: + trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); + if (!is_atomic && warn_limit) { pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); @@ -1280,6 +1280,8 @@ void free_percpu(void __percpu *ptr) } } + trace_percpu_free_percpu(chunk->base_addr, off, ptr); + spin_unlock_irqrestore(&pcpu_lock, flags); } EXPORT_SYMBOL_GPL(free_percpu); @@ -1656,6 +1658,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); + pcpu_stats_save_ai(ai); + /* * Allocate chunk slots. The additional last slot is for * empty chunks. @@ -1699,6 +1703,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (schunk->free_size) schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size; schunk->map[schunk->map_used] |= 1; + schunk->has_reserved = true; /* init dynamic chunk if necessary */ if (dyn_size) { @@ -1717,6 +1722,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, dchunk->map[1] = pcpu_reserved_chunk_limit; dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; dchunk->map_used = 2; + dchunk->has_reserved = true; } /* link the first chunk in */ @@ -1725,6 +1731,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_count_occupied_pages(pcpu_first_chunk, 1); pcpu_chunk_relocate(pcpu_first_chunk, -1); + pcpu_stats_chunk_alloc(); + trace_percpu_create_chunk(base_addr); + /* we're done */ pcpu_base_addr = base_addr; return 0; diff --git a/mm/rmap.c b/mm/rmap.c index d405f0e0ee96..ced14f1af6dc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; - int cpu; if (!tlb_ubc->flush_required) return; - cpu = get_cpu(); - - if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - local_flush_tlb(); - trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); - } - - if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL); - cpumask_clear(&tlb_ubc->cpumask); + arch_tlbbatch_flush(&tlb_ubc->arch); tlb_ubc->flush_required = false; tlb_ubc->writable = false; - put_cpu(); } /* Flush iff there are potentially writable TLB entries that can race with IO */ @@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; - cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; /* @@ -1157,8 +1145,7 @@ void page_add_file_rmap(struct page *page, bool compound) if (!atomic_inc_and_test(&page->_mapcount)) goto out; } - __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mod_memcg_page_state(page, NR_FILE_MAPPED, nr); + __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1193,12 +1180,11 @@ static void page_remove_file_rmap(struct page *page, bool compound) } /* - * We use the irq-unsafe __{inc|mod}_zone_page_state because + * We use the irq-unsafe __{inc|mod}_lruvec_page_state because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); + __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1379,15 +1365,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (PageHuge(page)) { int nr = 1 << compound_order(page); hugetlb_count_sub(nr, mm); + set_huge_swap_pte_at(mm, address, + pvmw.pte, pteval, + vma_mmu_pagesize(vma)); } else { dec_mm_counter(mm, mm_counter(page)); + set_pte_at(mm, address, pvmw.pte, pteval); } - pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); - set_pte_at(mm, address, pvmw.pte, pteval); } else if (pte_unused(pteval)) { /* * The guest indicated that the page content is of no diff --git a/mm/shmem.c b/mm/shmem.c index e67d6ba4e98e..b0aa6075d164 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -75,6 +75,7 @@ static struct vfsmount *shm_mnt; #include <uapi/linux/memfd.h> #include <linux/userfaultfd_k.h> #include <linux/rmap.h> +#include <linux/uuid.h> #include <linux/uaccess.h> #include <asm/pgtable.h> @@ -1290,7 +1291,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) SetPageUptodate(page); } - swap = get_swap_page(); + swap = get_swap_page(page); if (!swap.val) goto redirty; @@ -1326,7 +1327,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) mutex_unlock(&shmem_swaplist_mutex); free_swap: - swapcache_free(swap); + put_swap_page(page, swap); redirty: set_page_dirty(page); if (wbc->for_reclaim) @@ -1645,8 +1646,7 @@ repeat: if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(charge_mm, - PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); @@ -1902,10 +1902,10 @@ unlock: * entry unconditionally - even if something else had already woken the * target. */ -static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); - list_del_init(&wait->task_list); + list_del_init(&wait->entry); return ret; } @@ -1977,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf) } sgp = SGP_CACHE; - if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - else if (vma->vm_flags & VM_NOHUGEPAGE) + + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) sgp = SGP_NOHUGE; + else if (vma->vm_flags & VM_HUGEPAGE) + sgp = SGP_HUGE; error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); @@ -2840,7 +2842,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); - WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list)); + WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); spin_unlock(&inode->i_lock); error = 0; goto out; @@ -3761,6 +3763,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif + uuid_gen(&sb->s_uuid); inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (!inode) diff --git a/mm/slab.c b/mm/slab.c index 2a31ee3c5814..04dec48c3ed7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1425,11 +1425,9 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - add_zone_page_state(page_zone(page), - NR_SLAB_RECLAIMABLE, nr_pages); + mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages); else - add_zone_page_state(page_zone(page), - NR_SLAB_UNRECLAIMABLE, nr_pages); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ @@ -1459,11 +1457,9 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - sub_zone_page_state(page_zone(page), - NR_SLAB_RECLAIMABLE, nr_freed); + mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); else - sub_zone_page_state(page_zone(page), - NR_SLAB_UNRECLAIMABLE, nr_freed); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed); BUG_ON(!PageSlab(page)); __ClearPageSlabPfmemalloc(page); @@ -2040,17 +2036,13 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ - if (size & (BYTES_PER_WORD - 1)) { - size += (BYTES_PER_WORD - 1); - size &= ~(BYTES_PER_WORD - 1); - } + size = ALIGN(size, BYTES_PER_WORD); if (flags & SLAB_RED_ZONE) { ralign = REDZONE_ALIGN; /* If redzoning, ensure that the second redzone is suitably * aligned, by adjusting the object size accordingly. */ - size += REDZONE_ALIGN - 1; - size &= ~(REDZONE_ALIGN - 1); + size = ALIGN(size, REDZONE_ALIGN); } /* 3) caller mandated alignment */ diff --git a/mm/slab.h b/mm/slab.h index 9cfcf099709c..6885e1192ec5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -274,22 +274,11 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - int ret; - if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - - ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); - if (ret) - return ret; - - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, - 1 << order); - return 0; + return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); } static __always_inline void memcg_uncharge_slab(struct page *page, int order, @@ -297,11 +286,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, { if (!memcg_kmem_enabled()) return; - - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, - -(1 << order)); memcg_kmem_uncharge(page, order); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 01a0fe2eb332..904a83be82de 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -47,13 +47,12 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, /* * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.) */ -static int slab_nomerge; +static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); static int __init setup_slab_nomerge(char *str) { - slab_nomerge = 1; + slab_nomerge = true; return 1; } diff --git a/mm/slub.c b/mm/slub.c index 8addc535bcdc..1d3f9835f4ea 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1615,7 +1615,7 @@ out: if (!page) return NULL; - mod_zone_page_state(page_zone(page), + mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << oo_order(oo)); @@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) kmemcheck_free_shadow(page, compound_order(page)); - mod_zone_page_state(page_zone(page), + mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); @@ -1829,7 +1829,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, stat(s, CPU_PARTIAL_NODE); } if (!kmem_cache_has_cpu_partial(s) - || available > s->cpu_partial / 2) + || available > slub_cpu_partial(s) / 2) break; } @@ -1993,7 +1993,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct page *page, - void *freelist) + void *freelist, struct kmem_cache_cpu *c) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -2132,6 +2132,9 @@ redo: discard_slab(s, page); stat(s, FREE_SLAB); } + + c->page = NULL; + c->freelist = NULL; } /* @@ -2266,11 +2269,9 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c->page, c->freelist); + deactivate_slab(s, c->page, c->freelist, c); c->tid = next_tid(c->tid); - c->page = NULL; - c->freelist = NULL; } /* @@ -2302,7 +2303,7 @@ static bool has_cpu_slab(int cpu, void *info) struct kmem_cache *s = info; struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - return c->page || c->partial; + return c->page || slub_percpu_partial(c); } static void flush_all(struct kmem_cache *s) @@ -2521,9 +2522,7 @@ redo: if (unlikely(!node_match(page, searchnode))) { stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, c->freelist, c); goto new_slab; } } @@ -2534,9 +2533,7 @@ redo: * information when the page leaves the per-cpu allocator */ if (unlikely(!pfmemalloc_match(page, gfpflags))) { - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, c->freelist, c); goto new_slab; } @@ -2568,11 +2565,10 @@ load_freelist: new_slab: - if (c->partial) { - page = c->page = c->partial; - c->partial = page->next; + if (slub_percpu_partial(c)) { + page = c->page = slub_percpu_partial(c); + slub_set_percpu_partial(c, page); stat(s, CPU_PARTIAL_ALLOC); - c->freelist = NULL; goto redo; } @@ -2592,9 +2588,7 @@ new_slab: !alloc_debug_processing(s, page, freelist, addr)) goto new_slab; /* Slab failed checks. Next slab needed */ - deactivate_slab(s, page, get_freepointer(s, freelist)); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, get_freepointer(s, freelist), c); return freelist; } @@ -3410,6 +3404,39 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) s->min_partial = min; } +static void set_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch + * 50% to keep some capacity around for frees. + */ + if (!kmem_cache_has_cpu_partial(s)) + s->cpu_partial = 0; + else if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; +#endif +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -3568,33 +3595,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) */ set_min_partial(s, ilog2(s->size) / 2); - /* - * cpu_partial determined the maximum number of objects kept in the - * per cpu partial lists of a processor. - * - * Per cpu partial lists mainly contain slabs that just have one - * object freed. If they are used for allocation then they can be - * filled up again with minimal effort. The slab will never hit the - * per node partial lists and therefore no locking will be required. - * - * This setting also determines - * - * A) The number of objects from per cpu partial slabs dumped to the - * per node list when we reach the limit. - * B) The number of objects in cpu partial slabs to extract from the - * per node list when we run out of per cpu objects. We only fetch - * 50% to keep some capacity around for frees. - */ - if (!kmem_cache_has_cpu_partial(s)) - s->cpu_partial = 0; - else if (s->size >= PAGE_SIZE) - s->cpu_partial = 2; - else if (s->size >= 1024) - s->cpu_partial = 6; - else if (s->size >= 256) - s->cpu_partial = 13; - else - s->cpu_partial = 30; + set_cpu_partial(s); #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -3981,7 +3982,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) * Disable empty slabs caching. Used to avoid pinning offline * memory cgroups by kmem pages that can be freed. */ - s->cpu_partial = 0; + slub_set_cpu_partial(s, 0); s->min_partial = 0; /* @@ -4760,7 +4761,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[node] += x; - page = READ_ONCE(c->partial); + page = slub_percpu_partial_read_once(c); if (page) { node = page_to_nid(page); if (flags & SO_TOTAL) @@ -4921,7 +4922,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->cpu_partial); + return sprintf(buf, "%u\n", slub_cpu_partial(s)); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -4936,7 +4937,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, if (objects && !kmem_cache_has_cpu_partial(s)) return -EINVAL; - s->cpu_partial = objects; + slub_set_cpu_partial(s, objects); flush_all(s); return length; } @@ -4988,7 +4989,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) int len; for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + struct page *page; + + page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (page) { pages += page->pages; @@ -5000,7 +5003,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) #ifdef CONFIG_SMP for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + struct page *page; + + page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (page && len < PAGE_SIZE - 20) len += sprintf(buf + len, " C%d=%d(%d)", cpu, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a56c3989f773..c50b1a14d55e 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -56,11 +56,11 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) if (node_state(node, N_HIGH_MEMORY)) page = alloc_pages_node( - node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, get_order(size)); else page = alloc_pages( - GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, get_order(size)); if (page) return page_address(page); diff --git a/mm/sparse.c b/mm/sparse.c index 6903c8fc3085..7b4be3fd5cac 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -168,6 +168,44 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, } } +/* + * There are a number of times that we loop over NR_MEM_SECTIONS, + * looking for section_present() on each. But, when we have very + * large physical address spaces, NR_MEM_SECTIONS can also be + * very large which makes the loops quite long. + * + * Keeping track of this gives us an easy way to break out of + * those loops early. + */ +int __highest_present_section_nr; +static void section_mark_present(struct mem_section *ms) +{ + int section_nr = __section_nr(ms); + + if (section_nr > __highest_present_section_nr) + __highest_present_section_nr = section_nr; + + ms->section_mem_map |= SECTION_MARKED_PRESENT; +} + +static inline int next_present_section_nr(int section_nr) +{ + do { + section_nr++; + if (present_section_nr(section_nr)) + return section_nr; + } while ((section_nr < NR_MEM_SECTIONS) && + (section_nr <= __highest_present_section_nr)); + + return -1; +} +#define for_each_present_section_nr(start, section_nr) \ + for (section_nr = next_present_section_nr(start-1); \ + ((section_nr >= 0) && \ + (section_nr < NR_MEM_SECTIONS) && \ + (section_nr <= __highest_present_section_nr)); \ + section_nr = next_present_section_nr(section_nr)) + /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -183,9 +221,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) set_section_nid(section, nid); ms = __nr_to_section(section); - if (!ms->section_mem_map) + if (!ms->section_mem_map) { ms->section_mem_map = sparse_encode_early_nid(nid) | - SECTION_MARKED_PRESENT; + SECTION_IS_ONLINE; + section_mark_present(ms); + } } } @@ -476,23 +516,19 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) int nodeid_begin = 0; unsigned long pnum_begin = 0; - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + for_each_present_section_nr(0, pnum) { struct mem_section *ms; - if (!present_section_nr(pnum)) - continue; ms = __nr_to_section(pnum); nodeid_begin = sparse_early_nid(ms); pnum_begin = pnum; break; } map_count = 1; - for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + for_each_present_section_nr(pnum_begin + 1, pnum) { struct mem_section *ms; int nodeid; - if (!present_section_nr(pnum)) - continue; ms = __nr_to_section(pnum); nodeid = sparse_early_nid(ms); if (nodeid == nodeid_begin) { @@ -561,10 +597,7 @@ void __init sparse_init(void) (void *)map_map); #endif - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - if (!present_section_nr(pnum)) - continue; - + for_each_present_section_nr(0, pnum) { usemap = usemap_map[pnum]; if (!usemap) continue; @@ -590,6 +623,48 @@ void __init sparse_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG + +/* Mark all memory sections within the pfn range as online */ +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + + /* onlining code should never touch invalid ranges */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map |= SECTION_IS_ONLINE; + } +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* Mark all memory sections within the pfn range as online */ +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + + /* + * TODO this needs some double checking. Offlining code makes + * sure to check pfn_valid but those checks might be just bogus + */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map &= ~SECTION_IS_ONLINE; + } +} +#endif + #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { @@ -686,10 +761,9 @@ static void free_map_bootmem(struct page *memmap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct pglist_data *pgdat = zone->zone_pgdat; struct mem_section *ms; struct page *memmap; unsigned long *usemap; @@ -722,7 +796,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); - ms->section_mem_map |= SECTION_MARKED_PRESENT; + section_mark_present(ms); ret = sparse_init_one_section(ms, section_nr, memmap, usemap); diff --git a/mm/swap.c b/mm/swap.c index 98d08b4579fa..60b1d2a75852 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -591,6 +591,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + count_memcg_page_event(page, PGLAZYFREE); update_page_reclaim_stat(lruvec, 1, 0); } } @@ -687,7 +688,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -void lru_add_drain_all(void) +void lru_add_drain_all_cpuslocked(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; @@ -701,7 +702,6 @@ void lru_add_drain_all(void) return; mutex_lock(&lock); - get_online_cpus(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { @@ -721,10 +721,16 @@ void lru_add_drain_all(void) for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); - put_online_cpus(); mutex_unlock(&lock); } +void lru_add_drain_all(void) +{ + get_online_cpus(); + lru_add_drain_all_cpuslocked(); + put_online_cpus(); +} + /** * release_pages - batched put_page() * @pages: array of pages to release diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 3405b4ee1757..fcd2740f4ed7 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -61,21 +61,27 @@ not_enough_page: return -ENOMEM; } +static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, + pgoff_t offset) +{ + struct page *mappage; + struct swap_cgroup *sc; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} + static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, struct swap_cgroup_ctrl **ctrlp) { pgoff_t offset = swp_offset(ent); struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; if (ctrlp) *ctrlp = ctrl; - - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; + return __lookup_swap_cgroup(ctrl, offset); } /** @@ -108,25 +114,39 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, } /** - * swap_cgroup_record - record mem_cgroup for this swp_entry. - * @ent: swap entry to be recorded into + * swap_cgroup_record - record mem_cgroup for a set of swap entries + * @ent: the first swap entry to be recorded into * @id: mem_cgroup to be recorded + * @nr_ents: number of swap entries to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) */ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents) { struct swap_cgroup_ctrl *ctrl; struct swap_cgroup *sc; unsigned short old; unsigned long flags; + pgoff_t offset = swp_offset(ent); + pgoff_t end = offset + nr_ents; sc = lookup_swap_cgroup(ent, &ctrl); spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; - sc->id = id; + for (;;) { + VM_BUG_ON(sc->id != old); + sc->id = id; + offset++; + if (offset == end) + break; + if (offset % SC_PER_PAGE) + sc++; + else + sc = __lookup_swap_cgroup(ctrl, offset); + } spin_unlock_irqrestore(&ctrl->lock, flags); return old; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 58f6c78f1dad..13a174006b91 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -263,7 +263,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) - cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots); + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false, + cache->slots); return cache->nr; } @@ -272,11 +273,11 @@ int free_swap_slot(swp_entry_t entry) { struct swap_slots_cache *cache; - cache = &get_cpu_var(swp_slots); + cache = raw_cpu_ptr(&swp_slots); if (use_swap_slot_cache && cache->slots_ret) { spin_lock_irq(&cache->free_lock); /* Swap slots cache may be deactivated before acquiring lock */ - if (!use_swap_slot_cache) { + if (!use_swap_slot_cache || !cache->slots_ret) { spin_unlock_irq(&cache->free_lock); goto direct_free; } @@ -296,16 +297,23 @@ int free_swap_slot(swp_entry_t entry) direct_free: swapcache_free_entries(&entry, 1); } - put_cpu_var(swp_slots); return 0; } -swp_entry_t get_swap_page(void) +swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry, *pentry; struct swap_slots_cache *cache; + entry.val = 0; + + if (PageTransHuge(page)) { + if (IS_ENABLED(CONFIG_THP_SWAP)) + get_swap_pages(1, true, &entry); + return entry; + } + /* * Preemption is allowed here, because we may sleep * in refill_swap_slots_cache(). But it is safe, because @@ -317,7 +325,6 @@ swp_entry_t get_swap_page(void) */ cache = raw_cpu_ptr(&swp_slots); - entry.val = 0; if (check_cache_active()) { mutex_lock(&cache->alloc_lock); if (cache->slots) { @@ -337,7 +344,7 @@ repeat: return entry; } - get_swap_pages(1, &entry); + get_swap_pages(1, false, &entry); return entry; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 539b8885e3d1..b68c93014f50 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -19,6 +19,7 @@ #include <linux/migrate.h> #include <linux/vmalloc.h> #include <linux/swap_slots.h> +#include <linux/huge_mm.h> #include <asm/pgtable.h> @@ -38,6 +39,7 @@ struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) +#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) static struct { unsigned long add_total; @@ -90,39 +92,46 @@ void show_swap_cache_info(void) */ int __add_to_swap_cache(struct page *page, swp_entry_t entry) { - int error; + int error, i, nr = hpage_nr_pages(page); struct address_space *address_space; + pgoff_t idx = swp_offset(entry); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - get_page(page); + page_ref_add(page, nr); SetPageSwapCache(page); - set_page_private(page, entry.val); address_space = swap_address_space(entry); spin_lock_irq(&address_space->tree_lock); - error = radix_tree_insert(&address_space->page_tree, - swp_offset(entry), page); - if (likely(!error)) { - address_space->nrpages++; - __inc_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); + for (i = 0; i < nr; i++) { + set_page_private(page + i, entry.val + i); + error = radix_tree_insert(&address_space->page_tree, + idx + i, page + i); + if (unlikely(error)) + break; } - spin_unlock_irq(&address_space->tree_lock); - - if (unlikely(error)) { + if (likely(!error)) { + address_space->nrpages += nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + ADD_CACHE_INFO(add_total, nr); + } else { /* * Only the context which have set SWAP_HAS_CACHE flag * would call add_to_swap_cache(). * So add_to_swap_cache() doesn't returns -EEXIST. */ VM_BUG_ON(error == -EEXIST); - set_page_private(page, 0UL); + set_page_private(page + i, 0UL); + while (i--) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0UL); + } ClearPageSwapCache(page); - put_page(page); + page_ref_sub(page, nr); } + spin_unlock_irq(&address_space->tree_lock); return error; } @@ -132,7 +141,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_maybe_preload(gfp_mask); + error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -146,8 +155,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t entry; struct address_space *address_space; + int i, nr = hpage_nr_pages(page); + swp_entry_t entry; + pgoff_t idx; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); @@ -155,12 +166,15 @@ void __delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - radix_tree_delete(&address_space->page_tree, swp_offset(entry)); - set_page_private(page, 0); + idx = swp_offset(entry); + for (i = 0; i < nr; i++) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0); + } ClearPageSwapCache(page); - address_space->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(del_total); + address_space->nrpages -= nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + ADD_CACHE_INFO(del_total, nr); } /** @@ -170,7 +184,7 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page *page, struct list_head *list) +int add_to_swap(struct page *page) { swp_entry_t entry; int err; @@ -178,20 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); - entry = get_swap_page(); + entry = get_swap_page(page); if (!entry.val) return 0; - if (mem_cgroup_try_charge_swap(page, entry)) { - swapcache_free(entry); - return 0; - } - - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page_to_list(page, list))) { - swapcache_free(entry); - return 0; - } + if (mem_cgroup_try_charge_swap(page, entry)) + goto fail; /* * Radix-tree node allocations from PF_MEMALLOC contexts could @@ -206,17 +212,19 @@ int add_to_swap(struct page *page, struct list_head *list) */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - - if (!err) { - return 1; - } else { /* -ENOMEM radix-tree allocation failure */ + /* -ENOMEM radix-tree allocation failure */ + if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry); - return 0; - } + goto fail; + + return 1; + +fail: + put_swap_page(page, entry); + return 0; } /* @@ -237,8 +245,8 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - swapcache_free(entry); - put_page(page); + put_swap_page(page, entry); + page_ref_sub(page, hpage_nr_pages(page)); } /* @@ -295,7 +303,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), swp_offset(entry)); - if (page) { + if (page && likely(!PageTransCompound(page))) { INC_CACHE_INFO(find_success); if (TestClearPageReadahead(page)) atomic_inc(&swapin_readahead_hits); @@ -389,7 +397,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry); + put_swap_page(new_page, entry); } while (err != -ENOMEM); if (new_page) @@ -404,14 +412,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * the swap entry is no longer in use. */ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr) + struct vm_area_struct *vma, unsigned long addr, bool do_poll) { bool page_was_allocated; struct page *retpage = __read_swap_cache_async(entry, gfp_mask, vma, addr, &page_was_allocated); if (page_was_allocated) - swap_readpage(retpage); + swap_readpage(retpage, do_poll); return retpage; } @@ -488,11 +496,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long start_offset, end_offset; unsigned long mask; struct blk_plug plug; + bool do_poll = true; mask = swapin_nr_pages(offset) - 1; if (!mask) goto skip; + do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -503,10 +513,10 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = read_swap_cache_async(swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr); + gfp_mask, vma, addr, false); if (!page) continue; - if (offset != entry_offset) + if (offset != entry_offset && likely(!PageTransCompound(page))) SetPageReadahead(page); put_page(page); } @@ -514,7 +524,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, lru_add_drain(); /* Push any new pages onto the LRU now */ skip: - return read_swap_cache_async(entry, gfp_mask, vma, addr); + return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll); } int init_swap_address_space(unsigned int type, unsigned long nr_pages) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f6cba1b6632..6ba4aab2db0b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -37,6 +37,7 @@ #include <linux/swapfile.h> #include <linux/export.h> #include <linux/swap_slots.h> +#include <linux/sort.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -199,7 +200,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER HPAGE_PMD_NR +#else #define SWAPFILE_CLUSTER 256 +#endif #define LATENCY_LIMIT 256 static inline void cluster_set_flag(struct swap_cluster_info *info, @@ -374,6 +379,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, schedule_work(&si->discard_work); } +static void __free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info; + + cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); + cluster_list_add_tail(&si->free_clusters, ci, idx); +} + /* * Doing discard actually. After a cluster discard is finished, the cluster * will be added to free cluster list. caller should hold si->lock. @@ -394,10 +407,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) spin_lock(&si->lock); ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); - cluster_set_flag(ci, CLUSTER_FLAG_FREE); - unlock_cluster(ci); - cluster_list_add_tail(&si->free_clusters, info, idx); - ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); + __free_cluster(si, idx); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); unlock_cluster(ci); @@ -415,6 +425,34 @@ static void swap_discard_work(struct work_struct *work) spin_unlock(&si->lock); } +static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info; + + VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); + cluster_list_del_first(&si->free_clusters, ci); + cluster_set_count_flag(ci + idx, 0, 0); +} + +static void free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info + idx; + + VM_BUG_ON(cluster_count(ci) != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed + * after discard. + */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { + swap_cluster_schedule_discard(si, idx); + return; + } + + __free_cluster(si, idx); +} + /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased. @@ -426,11 +464,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (!cluster_info) return; - if (cluster_is_free(&cluster_info[idx])) { - VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx); - cluster_list_del_first(&p->free_clusters, cluster_info); - cluster_set_count_flag(&cluster_info[idx], 0, 0); - } + if (cluster_is_free(&cluster_info[idx])) + alloc_cluster(p, idx); VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], @@ -454,21 +489,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p, cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) - 1); - if (cluster_count(&cluster_info[idx]) == 0) { - /* - * If the swap is discardable, prepare discard the cluster - * instead of free it immediately. The cluster will be freed - * after discard. - */ - if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == - (SWP_WRITEOK | SWP_PAGE_DISCARD)) { - swap_cluster_schedule_discard(p, idx); - return; - } - - cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - cluster_list_add_tail(&p->free_clusters, cluster_info, idx); - } + if (cluster_count(&cluster_info[idx]) == 0) + free_cluster(p, idx); } /* @@ -558,6 +580,60 @@ new_cluster: return found_free; } +static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries) +{ + unsigned int end = offset + nr_entries - 1; + + if (offset == si->lowest_bit) + si->lowest_bit += nr_entries; + if (end == si->highest_bit) + si->highest_bit -= nr_entries; + si->inuse_pages += nr_entries; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } +} + +static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries) +{ + unsigned long end = offset + nr_entries - 1; + void (*swap_slot_free_notify)(struct block_device *, unsigned long); + + if (offset < si->lowest_bit) + si->lowest_bit = offset; + if (end > si->highest_bit) { + bool was_full = !si->highest_bit; + + si->highest_bit = end; + if (was_full && (si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&si->avail_list)); + if (plist_node_empty(&si->avail_list)) + plist_add(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } + atomic_long_add(nr_entries, &nr_swap_pages); + si->inuse_pages -= nr_entries; + if (si->flags & SWP_BLKDEV) + swap_slot_free_notify = + si->bdev->bd_disk->fops->swap_slot_free_notify; + else + swap_slot_free_notify = NULL; + while (offset <= end) { + frontswap_invalidate_page(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; + } +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -676,18 +752,7 @@ checks: inc_cluster_info_page(si, si->cluster_info, offset); unlock_cluster(ci); - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - si->inuse_pages++; - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; - spin_lock(&swap_avail_lock); - plist_del(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); - } + swap_range_alloc(si, offset, 1); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); @@ -766,6 +831,52 @@ no_page: return n_ret; } +#ifdef CONFIG_THP_SWAP +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ + unsigned long idx; + struct swap_cluster_info *ci; + unsigned long offset, i; + unsigned char *map; + + if (cluster_list_empty(&si->free_clusters)) + return 0; + + idx = cluster_list_first(&si->free_clusters); + offset = idx * SWAPFILE_CLUSTER; + ci = lock_cluster(si, offset); + alloc_cluster(si, idx); + cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) + map[i] = SWAP_HAS_CACHE; + unlock_cluster(ci); + swap_range_alloc(si, offset, SWAPFILE_CLUSTER); + *slot = swp_entry(si->type, offset); + + return 1; +} + +static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + unsigned long offset = idx * SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + cluster_set_count_flag(ci, 0, 0); + free_cluster(si, idx); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); +} +#else +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ + VM_WARN_ON_ONCE(1); + return 0; +} +#endif /* CONFIG_THP_SWAP */ + static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) { @@ -781,13 +892,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) { + unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; - avail_pgs = atomic_long_read(&nr_swap_pages); + /* Only single cluster request supported */ + WARN_ON_ONCE(n_goal > 1 && cluster); + + avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages; if (avail_pgs <= 0) goto noswap; @@ -797,7 +912,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) if (n_goal > avail_pgs) n_goal = avail_pgs; - atomic_long_sub(n_goal, &nr_swap_pages); + atomic_long_sub(n_goal * nr_pages, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -823,10 +938,13 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries); + if (cluster) + n_ret = swap_alloc_cluster(si, swp_entries); + else + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries); spin_unlock(&si->lock); - if (n_ret) + if (n_ret || cluster) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); @@ -852,7 +970,8 @@ nextsi: check_out: if (n_ret < n_goal) - atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); + atomic_long_add((long)(n_goal - n_ret) * nr_pages, + &nr_swap_pages); noswap: return n_ret; } @@ -1008,32 +1127,8 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) dec_cluster_info_page(p, p->cluster_info, offset); unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry); - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) { - bool was_full = !p->highest_bit; - - p->highest_bit = offset; - if (was_full && (p->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&p->avail_list)); - if (plist_node_empty(&p->avail_list)) - plist_add(&p->avail_list, - &swap_avail_head); - spin_unlock(&swap_avail_lock); - } - } - atomic_long_inc(&nr_swap_pages); - p->inuse_pages--; - frontswap_invalidate_page(p->type, offset); - if (p->flags & SWP_BLKDEV) { - struct gendisk *disk = p->bdev->bd_disk; - - if (disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, - offset); - } + mem_cgroup_uncharge_swap(entry, 1); + swap_range_free(p, offset, 1); } /* @@ -1054,7 +1149,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void swapcache_free(swp_entry_t entry) +static void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; @@ -1065,6 +1160,52 @@ void swapcache_free(swp_entry_t entry) } } +#ifdef CONFIG_THP_SWAP +static void swapcache_free_cluster(swp_entry_t entry) +{ + unsigned long offset = swp_offset(entry); + unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned char *map; + unsigned int i; + + si = swap_info_get(entry); + if (!si) + return; + + ci = lock_cluster(si, offset); + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + VM_BUG_ON(map[i] != SWAP_HAS_CACHE); + map[i] = 0; + } + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); +} +#else +static inline void swapcache_free_cluster(swp_entry_t entry) +{ +} +#endif /* CONFIG_THP_SWAP */ + +void put_swap_page(struct page *page, swp_entry_t entry) +{ + if (!PageTransHuge(page)) + swapcache_free(entry); + else + swapcache_free_cluster(entry); +} + +static int swp_entry_cmp(const void *ent1, const void *ent2) +{ + const swp_entry_t *e1 = ent1, *e2 = ent2; + + return (int)swp_type(*e1) - (int)swp_type(*e2); +} + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; @@ -1075,6 +1216,14 @@ void swapcache_free_entries(swp_entry_t *entries, int n) prev = NULL; p = NULL; + + /* + * Sort swap entries by swap device, so each lock is only taken once. + * nr_swapfiles isn't absolutely correct, but the overhead of sort() is + * so low that it isn't necessary to optimize further. + */ + if (nr_swapfiles > 1) + sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) @@ -1719,7 +1868,7 @@ int try_to_unuse(unsigned int type, bool frontswap, swap_map = &si->swap_map[i]; entry = swp_entry(type, i); page = read_swap_cache_async(entry, - GFP_HIGHUSER_MOVABLE, NULL, 0); + GFP_HIGHUSER_MOVABLE, NULL, 0, false); if (!page) { /* * Either swap_duplicate() failed because entry diff --git a/mm/truncate.c b/mm/truncate.c index 6479ed2afc53..2330223841fb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -530,9 +530,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } else if (PageTransHuge(page)) { index += HPAGE_PMD_NR - 1; i += HPAGE_PMD_NR - 1; - /* 'end' is in the middle of THP */ - if (index == round_down(end, HPAGE_PMD_NR)) + /* + * 'end' is in the middle of THP. Don't + * invalidate the page as the part outside of + * 'end' could be still useful. + */ + if (index > end) { + unlock_page(page); continue; + } } ret = invalidate_inode_page(page); diff --git a/mm/util.c b/mm/util.c index 26be6407abd7..7b07ec852e01 100644 --- a/mm/util.c +++ b/mm/util.c @@ -83,6 +83,8 @@ EXPORT_SYMBOL(kstrdup_const); * @s: the string to duplicate * @max: read at most @max chars from @s * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Note: Use kmemdup_nul() instead if the size is known exactly. */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { @@ -121,6 +123,28 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) EXPORT_SYMBOL(kmemdup); /** + * kmemdup_nul - Create a NUL-terminated string from unterminated data + * @s: The data to stringify + * @len: The size of the data + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) +{ + char *buf; + + if (!s) + return NULL; + + buf = kmalloc_track_caller(len + 1, gfp); + if (buf) { + memcpy(buf, s, len); + buf[len] = '\0'; + } + return buf; +} +EXPORT_SYMBOL(kmemdup_nul); + +/** * memdup_user - duplicate memory region from user space * * @src: source address in user space @@ -339,9 +363,9 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT - * is supported only for large (>32kB) allocations, and it should be used only if - * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks. + * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. * * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. */ @@ -366,13 +390,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (size > PAGE_SIZE) { kmalloc_flags |= __GFP_NOWARN; - /* - * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly - * requests because there is no other way to tell the allocator - * that we want to fail rather than retry endlessly. - */ - if (!(kmalloc_flags & __GFP_REPEAT) || - (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ecc97f74ab18..8698c1c86c4d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -325,6 +325,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ +#define VM_LAZY_FREE 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); @@ -1497,6 +1498,7 @@ struct vm_struct *remove_vm_area(const void *addr) spin_lock(&vmap_area_lock); va->vm = NULL; va->flags &= ~VM_VM_AREA; + va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); @@ -1770,12 +1772,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */ clear_vm_uninitialized_flag(area); - /* - * A ref_count = 2 is needed because vm_struct allocated in - * __get_vm_area_node() contains a reference to the virtual address of - * the vmalloc'ed block. - */ - kmemleak_alloc(addr, real_size, 2, gfp_mask); + kmemleak_vmalloc(area, size, gfp_mask); return addr; @@ -1798,7 +1795,7 @@ fail: * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. * - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported * * Any use of gfp flags outside of GFP_KERNEL should be consulted @@ -2709,8 +2706,14 @@ static int s_show(struct seq_file *m, void *p) * s_show can encounter race with remove_vm_area, !VM_VM_AREA on * behalf of vmap area is being tear down or vm_map_ram allocation. */ - if (!(va->flags & VM_VM_AREA)) + if (!(va->flags & VM_VM_AREA)) { + seq_printf(m, "0x%pK-0x%pK %7ld %s\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start, + va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram"); + return 0; + } v = va->vm; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index ce0618bfa8d0..85350ce2d25d 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -93,12 +93,25 @@ enum vmpressure_levels { VMPRESSURE_NUM_LEVELS, }; +enum vmpressure_modes { + VMPRESSURE_NO_PASSTHROUGH = 0, + VMPRESSURE_HIERARCHY, + VMPRESSURE_LOCAL, + VMPRESSURE_NUM_MODES, +}; + static const char * const vmpressure_str_levels[] = { [VMPRESSURE_LOW] = "low", [VMPRESSURE_MEDIUM] = "medium", [VMPRESSURE_CRITICAL] = "critical", }; +static const char * const vmpressure_str_modes[] = { + [VMPRESSURE_NO_PASSTHROUGH] = "default", + [VMPRESSURE_HIERARCHY] = "hierarchy", + [VMPRESSURE_LOCAL] = "local", +}; + static enum vmpressure_levels vmpressure_level(unsigned long pressure) { if (pressure >= vmpressure_level_critical) @@ -141,27 +154,31 @@ out: struct vmpressure_event { struct eventfd_ctx *efd; enum vmpressure_levels level; + enum vmpressure_modes mode; struct list_head node; }; static bool vmpressure_event(struct vmpressure *vmpr, - enum vmpressure_levels level) + const enum vmpressure_levels level, + bool ancestor, bool signalled) { struct vmpressure_event *ev; - bool signalled = false; + bool ret = false; mutex_lock(&vmpr->events_lock); - list_for_each_entry(ev, &vmpr->events, node) { - if (level >= ev->level) { - eventfd_signal(ev->efd, 1); - signalled = true; - } + if (ancestor && ev->mode == VMPRESSURE_LOCAL) + continue; + if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH) + continue; + if (level < ev->level) + continue; + eventfd_signal(ev->efd, 1); + ret = true; } - mutex_unlock(&vmpr->events_lock); - return signalled; + return ret; } static void vmpressure_work_fn(struct work_struct *work) @@ -170,6 +187,8 @@ static void vmpressure_work_fn(struct work_struct *work) unsigned long scanned; unsigned long reclaimed; enum vmpressure_levels level; + bool ancestor = false; + bool signalled = false; spin_lock(&vmpr->sr_lock); /* @@ -194,12 +213,9 @@ static void vmpressure_work_fn(struct work_struct *work) level = vmpressure_calc_level(scanned, reclaimed); do { - if (vmpressure_event(vmpr, level)) - break; - /* - * If not handled, propagate the event upward into the - * hierarchy. - */ + if (vmpressure_event(vmpr, level, ancestor, signalled)) + signalled = true; + ancestor = true; } while ((vmpr = vmpressure_parent(vmpr))); } @@ -326,17 +342,40 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) vmpressure(gfp, memcg, true, vmpressure_win, 0); } +static enum vmpressure_levels str_to_level(const char *arg) +{ + enum vmpressure_levels level; + + for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) + if (!strcmp(vmpressure_str_levels[level], arg)) + return level; + return -1; +} + +static enum vmpressure_modes str_to_mode(const char *arg) +{ + enum vmpressure_modes mode; + + for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++) + if (!strcmp(vmpressure_str_modes[mode], arg)) + return mode; + return -1; +} + +#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) + /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd * @memcg: memcg that is interested in vmpressure notifications * @eventfd: eventfd context to link notifications with - * @args: event arguments (used to set up a pressure level threshold) + * @args: event arguments (pressure level threshold, optional mode) * * This function associates eventfd context with the vmpressure * infrastructure, so that the notifications will be delivered to the - * @eventfd. The @args parameter is a string that denotes pressure level - * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or - * "critical"). + * @eventfd. The @args parameter is a comma-delimited string that denotes a + * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium", + * or "critical") and an optional mode (one of vmpressure_str_modes, i.e. + * "hierarchy" or "local"). * * To be used as memcg event method. */ @@ -345,28 +384,53 @@ int vmpressure_register_event(struct mem_cgroup *memcg, { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; - int level; + enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; + enum vmpressure_levels level = -1; + char *spec, *spec_orig; + char *token; + int ret = 0; + + spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL); + if (!spec) { + ret = -ENOMEM; + goto out; + } + strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN); - for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) { - if (!strcmp(vmpressure_str_levels[level], args)) - break; + /* Find required level */ + token = strsep(&spec, ","); + level = str_to_level(token); + if (level == -1) { + ret = -EINVAL; + goto out; } - if (level >= VMPRESSURE_NUM_LEVELS) - return -EINVAL; + /* Find optional mode */ + token = strsep(&spec, ","); + if (token) { + mode = str_to_mode(token); + if (mode == -1) { + ret = -EINVAL; + goto out; + } + } ev = kzalloc(sizeof(*ev), GFP_KERNEL); - if (!ev) - return -ENOMEM; + if (!ev) { + ret = -ENOMEM; + goto out; + } ev->efd = eventfd; ev->level = level; + ev->mode = mode; mutex_lock(&vmpr->events_lock); list_add(&ev->node, &vmpr->events); mutex_unlock(&vmpr->events_lock); - - return 0; +out: + kfree(spec_orig); + return ret; } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ad39bbc79e6..a1af041930a6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -708,7 +708,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - swapcache_free(swap); + put_swap_page(page, swap); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -1125,8 +1125,36 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (!add_to_swap(page, page_list)) + if (PageTransHuge(page)) { + /* cannot split THP, skip it */ + if (!can_split_huge_page(page, NULL)) + goto activate_locked; + /* + * Split pages without a PMD map right + * away. Chances are some or all of the + * tail pages can be freed without IO. + */ + if (!compound_mapcount(page) && + split_huge_page_to_list(page, page_list)) + goto activate_locked; + } + if (!add_to_swap(page)) { + if (!PageTransHuge(page)) + goto activate_locked; + /* Split THP and swap individual base pages */ + if (split_huge_page_to_list(page, page_list)) + goto activate_locked; + if (!add_to_swap(page)) + goto activate_locked; + } + + /* XXX: We don't support THP writes */ + if (PageTransHuge(page) && + split_huge_page_to_list(page, page_list)) { + delete_from_swap_cache(page); goto activate_locked; + } + may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1266,6 +1294,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } count_vm_event(PGLAZYFREED); + count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -1295,6 +1324,7 @@ activate_locked: if (!PageMlocked(page)) { SetPageActive(page); pgactivate++; + count_memcg_page_event(page, PGACTIVATE); } keep_locked: unlock_page(page); @@ -1734,11 +1764,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); - else + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, + nr_scanned); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_DIRECT, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, + nr_scanned); } spin_unlock_irq(&pgdat->lru_lock); @@ -1750,11 +1785,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); - else + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, + nr_reclaimed); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, + nr_reclaimed); } putback_inactive_pages(lruvec, &page_list); @@ -1899,8 +1939,11 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, } } - if (!is_active_lru(lru)) + if (!is_active_lru(lru)) { __count_vm_events(PGDEACTIVATE, nr_moved); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_moved); + } return nr_moved; } @@ -1938,6 +1981,7 @@ static void shrink_active_list(unsigned long nr_to_scan, reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -2184,8 +2228,17 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, } if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { - scan_balance = SCAN_ANON; - goto out; + /* + * Force SCAN_ANON if there are enough inactive + * anonymous pages on the LRU in eligible zones. + * Otherwise, the small LRU gets thrashed. + */ + if (!inactive_list_is_low(lruvec, false, memcg, sc, false) && + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) + >> sc->priority) { + scan_balance = SCAN_ANON; + goto out; + } } } @@ -2453,18 +2506,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return false; /* Consider stopping depending on scan and reclaim activity */ - if (sc->gfp_mask & __GFP_REPEAT) { + if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { /* - * For __GFP_REPEAT allocations, stop reclaiming if the + * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the * full LRU list has been scanned and we are still failing * to reclaim pages. This full LRU scan is potentially - * expensive but a __GFP_REPEAT caller really wants to succeed + * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed */ if (!nr_reclaimed && !nr_scanned) return false; } else { /* - * For non-__GFP_REPEAT allocations which can presumably + * For non-__GFP_RETRY_MAYFAIL allocations which can presumably * fail without consequence, stop if we failed to reclaim * any pages from the last SWAP_CLUSTER_MAX number of * pages that were scanned. This will return to the @@ -2967,7 +3020,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -2982,12 +3035,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * 1 is returned so that the page allocator does not OOM kill at this * point. */ - if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) + if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, - gfp_mask, + sc.gfp_mask, sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3652,7 +3705,7 @@ int kswapd_run(int nid) pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ - BUG_ON(system_state == SYSTEM_BOOTING); + BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; @@ -3774,17 +3827,16 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - int classzone_idx = gfp_zone(gfp_mask); unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, - .reclaim_idx = classzone_idx, + .reclaim_idx = gfp_zone(gfp_mask), }; cond_resched(); @@ -3795,7 +3847,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(gfp_mask); + lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3831,7 +3883,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && - sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 76f73670200a..9a4441bbeef2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -928,8 +928,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_slab_reclaimable", - "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", "nr_bounce", @@ -952,6 +950,8 @@ const char * const vmstat_text[] = { "nr_inactive_file", "nr_active_file", "nr_unevictable", + "nr_slab_reclaimable", + "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", "workingset_refault", @@ -1018,6 +1018,7 @@ const char * const vmstat_text[] = { "drop_pagecache", "drop_slab", + "oom_kill", #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", @@ -1129,7 +1130,7 @@ static void frag_stop(struct seq_file *m, void *arg) * If @assert_populated is true, only use callback for zones that are populated. */ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, - bool assert_populated, + bool assert_populated, bool nolock, void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { struct zone *zone; @@ -1140,9 +1141,11 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, if (assert_populated && !populated_zone(zone)) continue; - spin_lock_irqsave(&zone->lock, flags); + if (!nolock) + spin_lock_irqsave(&zone->lock, flags); print(m, pgdat, zone); - spin_unlock_irqrestore(&zone->lock, flags); + if (!nolock) + spin_unlock_irqrestore(&zone->lock, flags); } } #endif @@ -1165,7 +1168,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, static int frag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, true, frag_show_print); + walk_zones_in_node(m, pgdat, true, false, frag_show_print); return 0; } @@ -1206,7 +1209,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print); + walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print); return 0; } @@ -1223,11 +1226,10 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { struct page *page; - if (!pfn_valid(pfn)) + page = pfn_to_online_page(pfn); + if (!page) continue; - page = pfn_to_page(pfn); - /* Watch for unexpected holes punched in the memmap */ if (!memmap_valid_within(pfn, page, zone)) continue; @@ -1258,7 +1260,8 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print); + walk_zones_in_node(m, pgdat, true, false, + pagetypeinfo_showblockcount_print); return 0; } @@ -1284,7 +1287,8 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); + walk_zones_in_node(m, pgdat, true, true, + pagetypeinfo_showmixedcount_print); #endif /* CONFIG_PAGE_OWNER */ } @@ -1322,7 +1326,7 @@ static int fragmentation_open(struct inode *inode, struct file *file) return seq_open(file, &fragmentation_op); } -static const struct file_operations fragmentation_file_operations = { +static const struct file_operations buddyinfo_file_operations = { .open = fragmentation_open, .read = seq_read, .llseek = seq_lseek, @@ -1341,7 +1345,7 @@ static int pagetypeinfo_open(struct inode *inode, struct file *file) return seq_open(file, &pagetypeinfo_op); } -static const struct file_operations pagetypeinfo_file_ops = { +static const struct file_operations pagetypeinfo_file_operations = { .open = pagetypeinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -1446,7 +1450,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, static int zoneinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); + walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print); return 0; } @@ -1463,7 +1467,7 @@ static int zoneinfo_open(struct inode *inode, struct file *file) return seq_open(file, &zoneinfo_op); } -static const struct file_operations proc_zoneinfo_file_operations = { +static const struct file_operations zoneinfo_file_operations = { .open = zoneinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -1552,7 +1556,7 @@ static int vmstat_open(struct inode *inode, struct file *file) return seq_open(file, &vmstat_op); } -static const struct file_operations proc_vmstat_file_operations = { +static const struct file_operations vmstat_file_operations = { .open = vmstat_open, .read = seq_read, .llseek = seq_lseek, @@ -1785,10 +1789,10 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif #ifdef CONFIG_PROC_FS - proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); - proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); - proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); - proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); + proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations); + proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations); + proc_create("vmstat", 0444, NULL, &vmstat_file_operations); + proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations); #endif } @@ -1852,7 +1856,7 @@ static int unusable_show(struct seq_file *m, void *arg) if (!node_state(pgdat->node_id, N_MEMORY)) return 0; - walk_zones_in_node(m, pgdat, true, unusable_show_print); + walk_zones_in_node(m, pgdat, true, false, unusable_show_print); return 0; } @@ -1904,7 +1908,7 @@ static int extfrag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, true, extfrag_show_print); + walk_zones_in_node(m, pgdat, true, false, extfrag_show_print); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index b8c9ab678479..7119cd745ace 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -288,12 +288,10 @@ bool workingset_refault(void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; - inc_node_state(pgdat, WORKINGSET_REFAULT); - inc_memcg_state(memcg, WORKINGSET_REFAULT); + inc_lruvec_state(lruvec, WORKINGSET_REFAULT); if (refault_distance <= active_file) { - inc_node_state(pgdat, WORKINGSET_ACTIVATE); - inc_memcg_state(memcg, WORKINGSET_ACTIVATE); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); rcu_read_unlock(); return true; } @@ -474,8 +472,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; - inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d41edd28298b..013eea76685e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -116,6 +116,11 @@ #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) +#define FULLNESS_BITS 2 +#define CLASS_BITS 8 +#define ISOLATED_BITS 3 +#define MAGIC_VAL_BITS 8 + #define MAX(a, b) ((a) >= (b) ? (a) : (b)) /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ @@ -137,6 +142,8 @@ * (reason above) */ #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) +#define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \ + ZS_SIZE_CLASS_DELTA) + 1) enum fullness_group { ZS_EMPTY, @@ -169,11 +176,6 @@ static struct vfsmount *zsmalloc_mnt; #endif /* - * number of size_classes - */ -static int zs_size_classes; - -/* * We assign a page to ZS_ALMOST_EMPTY fullness group when: * n <= N / f, where * n = number of allocated objects @@ -244,7 +246,7 @@ struct link_free { struct zs_pool { const char *name; - struct size_class **size_class; + struct size_class *size_class[ZS_SIZE_CLASSES]; struct kmem_cache *handle_cachep; struct kmem_cache *zspage_cachep; @@ -268,11 +270,6 @@ struct zs_pool { #endif }; -#define FULLNESS_BITS 2 -#define CLASS_BITS 8 -#define ISOLATED_BITS 3 -#define MAGIC_VAL_BITS 8 - struct zspage { struct { unsigned int fullness:FULLNESS_BITS; @@ -469,7 +466,7 @@ static bool is_zspage_isolated(struct zspage *zspage) return zspage->isolated; } -static int is_first_page(struct page *page) +static __maybe_unused int is_first_page(struct page *page) { return PagePrivate(page); } @@ -551,7 +548,7 @@ static int get_size_class_index(int size) idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, ZS_SIZE_CLASS_DELTA); - return min(zs_size_classes - 1, idx); + return min_t(int, ZS_SIZE_CLASSES - 1, idx); } static inline void zs_stat_inc(struct size_class *class, @@ -610,7 +607,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) "obj_allocated", "obj_used", "pages_used", "pages_per_zspage", "freeable"); - for (i = 0; i < zs_size_classes; i++) { + for (i = 0; i < ZS_SIZE_CLASSES; i++) { class = pool->size_class[i]; if (class->index != i) @@ -1294,17 +1291,6 @@ static int zs_cpu_dead(unsigned int cpu) return 0; } -static void __init init_zs_size_classes(void) -{ - int nr; - - nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1; - if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA) - nr += 1; - - zs_size_classes = nr; -} - static bool can_merge(struct size_class *prev, int pages_per_zspage, int objs_per_zspage) { @@ -2145,7 +2131,7 @@ static void async_free_zspage(struct work_struct *work) struct zs_pool *pool = container_of(work, struct zs_pool, free_work); - for (i = 0; i < zs_size_classes; i++) { + for (i = 0; i < ZS_SIZE_CLASSES; i++) { class = pool->size_class[i]; if (class->index != i) continue; @@ -2263,7 +2249,7 @@ unsigned long zs_compact(struct zs_pool *pool) int i; struct size_class *class; - for (i = zs_size_classes - 1; i >= 0; i--) { + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; if (!class) continue; @@ -2309,7 +2295,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, struct zs_pool *pool = container_of(shrinker, struct zs_pool, shrinker); - for (i = zs_size_classes - 1; i >= 0; i--) { + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; if (!class) continue; @@ -2361,12 +2347,6 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); - pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), - GFP_KERNEL); - if (!pool->size_class) { - kfree(pool); - return NULL; - } pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) @@ -2379,7 +2359,7 @@ struct zs_pool *zs_create_pool(const char *name) * Iterate reversely, because, size of size_class that we want to use * for merging should be larger or equal to current size. */ - for (i = zs_size_classes - 1; i >= 0; i--) { + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { int size; int pages_per_zspage; int objs_per_zspage; @@ -2453,7 +2433,7 @@ void zs_destroy_pool(struct zs_pool *pool) zs_unregister_migration(pool); zs_pool_stat_destroy(pool); - for (i = 0; i < zs_size_classes; i++) { + for (i = 0; i < ZS_SIZE_CLASSES; i++) { int fg; struct size_class *class = pool->size_class[i]; @@ -2492,8 +2472,6 @@ static int __init zs_init(void) if (ret) goto hp_setup_fail; - init_zs_size_classes(); - #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); #endif diff --git a/mm/zswap.c b/mm/zswap.c index eedc27894b10..d39581a076c3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -371,10 +371,9 @@ static int zswap_dstmem_prepare(unsigned int cpu) u8 *dst; dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) { - pr_err("can't allocate compressor buffer\n"); + if (!dst) return -ENOMEM; - } + per_cpu(zswap_dstmem, cpu) = dst; return 0; } @@ -515,10 +514,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) } pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) { - pr_err("pool alloc failed\n"); + if (!pool) return NULL; - } /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); @@ -1158,7 +1155,7 @@ static void zswap_frontswap_init(unsigned type) { struct zswap_tree *tree; - tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); + tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return; |