From ee8f248d266ec6966c0ce6b7dec24de43dcc1b58 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Mon, 25 Jul 2011 17:11:50 -0700 Subject: hugetlb: add phys addr to struct huge_bootmem_page This is needed on HIGHMEM systems - we don't always have a virtual address so store the physical address and map it in as needed. [akpm@linux-foundation.org: cleanup] Signed-off-by: Becky Bruce Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bfcf153bc829..c6d342d313c7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1105,8 +1105,16 @@ static void __init gather_bootmem_prealloc(void) struct huge_bootmem_page *m; list_for_each_entry(m, &huge_boot_pages, list) { - struct page *page = virt_to_page(m); struct hstate *h = m->hstate; + struct page *page; + +#ifdef CONFIG_HIGHMEM + page = pfn_to_page(m->phys >> PAGE_SHIFT); + free_bootmem_late((unsigned long)m, + sizeof(struct huge_bootmem_page)); +#else + page = virt_to_page(m); +#endif __ClearPageReserved(page); WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); -- cgit v1.2.3-59-g8ed1b From 33dd4e0ec91138c3d80e790c08a3db47426c81f2 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 25 Jul 2011 17:11:51 -0700 Subject: mm: make some struct page's const These uses are read-only and in a subsequent patch I have a const struct page in my hand... 
[akpm@linux-foundation.org: fix warnings in lowmem_page_address()] Signed-off-by: Ian Campbell Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++++++------ mm/sparse.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a45ad22a170..6321e840e21d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -637,7 +637,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) -static inline enum zone_type page_zonenum(struct page *page) +static inline enum zone_type page_zonenum(const struct page *page) { return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } @@ -665,15 +665,15 @@ static inline int zone_to_nid(struct zone *zone) } #ifdef NODE_NOT_IN_PAGE_FLAGS -extern int page_to_nid(struct page *page); +extern int page_to_nid(const struct page *page); #else -static inline int page_to_nid(struct page *page) +static inline int page_to_nid(const struct page *page) { return (page->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif -static inline struct zone *page_zone(struct page *page) +static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } @@ -718,9 +718,9 @@ static inline void set_page_links(struct page *page, enum zone_type zone, */ #include -static __always_inline void *lowmem_page_address(struct page *page) +static __always_inline void *lowmem_page_address(const struct page *page) { - return __va(PFN_PHYS(page_to_pfn(page))); + return __va(PFN_PHYS(page_to_pfn((struct page *)page))); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) diff --git a/mm/sparse.c b/mm/sparse.c index aa64b12831a2..858e1dff9b2a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -40,7 +40,7 @@ static 
u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; #endif -int page_to_nid(struct page *page) +int page_to_nid(const struct page *page) { return section_to_node_table[page_to_section(page)]; } -- cgit v1.2.3-59-g8ed1b From ccb6108f5b0b541d3eb332c3a73e645c0f84278e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jul 2011 17:11:57 -0700 Subject: mm/backing-dev.c: reset bdi min_ratio in bdi_unregister() Vito said: : The system has many usb disks coming and going day to day, with their : respective bdi's having min_ratio set to 1 when inserted. It works for : some time until eventually min_ratio can no longer be set, even when the : active set of bdi's seen in /sys/class/bdi/*/min_ratio doesn't add up to : anywhere near 100. : : This then leads to an unrelated starvation problem caused by write-heavy : fuse mounts being used atop the usb disks, a problem the min_ratio setting : at the underlying devices bdi effectively prevents. Fix this leakage by resetting the bdi min_ratio when unregistering the BDI. 
Signed-off-by: Peter Zijlstra Reported-by: Vito Caputo Cc: Wu Fengguang Cc: Miklos Szeredi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f032e6e1e09a..e56fe35cef05 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -606,6 +606,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); del_timer_sync(&bdi->wb.wakeup_timer); -- cgit v1.2.3-59-g8ed1b From 9d0ad8ca43ce8023bb834a409c2258bd7197fb05 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Mon, 25 Jul 2011 17:12:05 -0700 Subject: mm: extend memory hotplug API to allow memory hotplug in virtual machines This patch contains online_page_callback and appropriate functions for registering/unregistering online page callbacks. It allows to do some machine specific tasks during online page stage which is required to implement memory hotplug in virtual machines. Currently this patch is required by latest memory hotplug support for Xen balloon driver patch which will be posted soon. Additionally, original online_page() function was split into following functions doing "atomic" operations: - __online_page_set_limits() - set new limits for memory management code, - __online_page_increment_counters() - increment totalram_pages and totalhigh_pages, - __online_page_free() - free page to allocator. It was done to: - not duplicate existing code, - ease hotplug code development by usage of well defined interface, - avoid stupid bugs which are unavoidable when the same code (by design) is developed in many places. 
[akpm@linux-foundation.org: use explicit indirect-call syntax] Signed-off-by: Daniel Kiper Reviewed-by: Konrad Rzeszutek Wilk Cc: Ian Campbell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 11 +++++-- mm/memory_hotplug.c | 68 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8122018d3000..0b8e2a742600 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -68,12 +68,19 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -/* need some defines for these for archs that don't support it */ -extern void online_page(struct page *page); /* VM interface that may be used by firmware interface */ extern int online_pages(unsigned long, unsigned long); extern void __offline_isolated_pages(unsigned long, unsigned long); +typedef void (*online_page_callback_t)(struct page *page); + +extern int set_online_page_callback(online_page_callback_t callback); +extern int restore_online_page_callback(online_page_callback_t callback); + +extern void __online_page_set_limits(struct page *page); +extern void __online_page_increment_counters(struct page *page); +extern void __online_page_free(struct page *page); + #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_pageblock_removable_nolock(struct page *page); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c46887b5a11e..6e7d8b21dbfa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -34,6 +34,17 @@ #include "internal.h" +/* + * online_page_callback contains pointer to current page onlining function. 
+ * Initially it is generic_online_page(). If it is required it could be + * changed by calling set_online_page_callback() for callback registration + * and restore_online_page_callback() for generic callback restore. + */ + +static void generic_online_page(struct page *page); + +static online_page_callback_t online_page_callback = generic_online_page; + DEFINE_MUTEX(mem_hotplug_mutex); void lock_memory_hotplug(void) @@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } EXPORT_SYMBOL_GPL(__remove_pages); -void online_page(struct page *page) +int set_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + lock_memory_hotplug(); + + if (online_page_callback == generic_online_page) { + online_page_callback = callback; + rc = 0; + } + + unlock_memory_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(set_online_page_callback); + +int restore_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + lock_memory_hotplug(); + + if (online_page_callback == callback) { + online_page_callback = generic_online_page; + rc = 0; + } + + unlock_memory_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(restore_online_page_callback); + +void __online_page_set_limits(struct page *page) { unsigned long pfn = page_to_pfn(page); - totalram_pages++; if (pfn >= num_physpages) num_physpages = pfn + 1; +} +EXPORT_SYMBOL_GPL(__online_page_set_limits); + +void __online_page_increment_counters(struct page *page) +{ + totalram_pages++; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) totalhigh_pages++; #endif +} +EXPORT_SYMBOL_GPL(__online_page_increment_counters); +void __online_page_free(struct page *page) +{ ClearPageReserved(page); init_page_count(page); __free_page(page); } +EXPORT_SYMBOL_GPL(__online_page_free); + +static void generic_online_page(struct page *page) +{ + __online_page_set_limits(page); + __online_page_increment_counters(page); + __online_page_free(page); +} static int online_pages_range(unsigned 
long start_pfn, unsigned long nr_pages, void *arg) @@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); - online_page(page); + (*online_page_callback)(page); onlined_pages++; } *(unsigned long *)arg = onlined_pages; -- cgit v1.2.3-59-g8ed1b From e21c7ffd6f7493aa01bccd17ebc13dbdfecce880 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:06 -0700 Subject: mm: swap-token: fix dead link http://www.cs.wm.edu/~sjiang/token.pdf is now dead. Replace it with an alive alternative. Signed-off-by: KOSAKI Motohiro Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/thrash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/thrash.c b/mm/thrash.c index fabf2d0f5169..d416d403e140 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -6,7 +6,7 @@ * Released under the GPL, see the file COPYING for details. * * Simple token based thrashing protection, using the algorithm - * described in: http://www.cs.wm.edu/~sjiang/token.pdf + * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html * * Sep 2006, Ashwin Chaugule * Improved algorithm to pass token: -- cgit v1.2.3-59-g8ed1b From 53bb01f593d50188c8d638f89db96f9b6b042bcd Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:07 -0700 Subject: mm: swap-token: makes global variables to function local global_faults and last_aging are only used in grab_swap_token(). Move them into grab_swap_token(). 
Signed-off-by: KOSAKI Motohiro Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/thrash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/thrash.c b/mm/thrash.c index d416d403e140..42ffb0179271 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -30,8 +30,6 @@ static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; struct mem_cgroup *swap_token_memcg; -static unsigned int global_faults; -static unsigned int last_aging; #ifdef CONFIG_CGROUP_MEM_RES_CTLR static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) @@ -55,6 +53,8 @@ void grab_swap_token(struct mm_struct *mm) { int current_interval; unsigned int old_prio = mm->token_priority; + static unsigned int global_faults; + static unsigned int last_aging; global_faults++; -- cgit v1.2.3-59-g8ed1b From 45ebb840257b060ec54416aebffd9747e210962c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:08 -0700 Subject: mm: swap-token: add a comment for priority aging Document some swap token aging design decisions. Signed-off-by: KOSAKI Motohiro Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/thrash.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/thrash.c b/mm/thrash.c index 42ffb0179271..e53f7d02c17c 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -67,6 +67,17 @@ void grab_swap_token(struct mm_struct *mm) if (!swap_token_mm) goto replace_token; + /* + * Usually, we don't need priority aging because long interval faults + * makes priority decrease quickly. But there is one exception. If the + * token owner task is sleeping, it never make long interval faults. + * Thus, we need a priority aging mechanism instead. The requirements + * of priority aging are + * 1) An aging interval is reasonable enough long. Too short aging + * interval makes quick swap token lost and decrease performance. 
+ * 2) The swap token owner task have to get priority aging even if + * it's under sleep. + */ if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { swap_token_mm->token_priority /= 2; last_aging = global_faults; -- cgit v1.2.3-59-g8ed1b From 4b6ddbf7ed4ef2f40e0a27418146eedaa68953c6 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:09 -0700 Subject: pagewalk: fix walk_page_range() don't check find_vma() result properly The doc of find_vma() says, /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { (snip) Thus, caller should confirm whether the returned vma matches a desired one. Signed-off-by: KOSAKI Motohiro Cc: Naoya Horiguchi Cc: Hiroyuki Kamezawa Cc: Andrea Arcangeli Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3450d533611..606bbb4125d0 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -176,7 +176,7 @@ int walk_page_range(unsigned long addr, unsigned long end, * we can't handled it in the same manner as non-huge pages. */ vma = find_vma(walk->mm, addr); - if (vma && is_vm_hugetlb_page(vma)) { + if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) { if (vma->vm_end < next) next = vma->vm_end; /* -- cgit v1.2.3-59-g8ed1b From 6c6d5280431544e4036886ea74e3334a98bc5f96 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:09 -0700 Subject: pagewalk: don't look up vma if walk->hugetlb_entry is unused Currently, walk_page_range() calls find_vma() every page table for walk iteration. but it's completely unnecessary if walk->hugetlb_entry is unused. And we don't have to assume find_vma() is a lightweight operation. So this patch checks the walk->hugetlb_entry and avoids the find_vma() call if possible. This patch also makes some cleanups. 
1) remove ugly uninitialized_var() and 2) #ifdef in function body. Signed-off-by: KOSAKI Motohiro Cc: Naoya Horiguchi Cc: Hiroyuki Kamezawa Cc: Andrea Arcangeli Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 606bbb4125d0..ee4ff87c58c1 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, return 0; } -#endif + +static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) +{ + struct vm_area_struct *vma; + + /* We don't need vma lookup at all. */ + if (!walk->hugetlb_entry) + return NULL; + + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + vma = find_vma(walk->mm, addr); + if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) + return vma; + + return NULL; +} + +#else /* CONFIG_HUGETLB_PAGE */ +static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) +{ + return NULL; +} + +static int walk_hugetlb_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + return 0; +} + +#endif /* CONFIG_HUGETLB_PAGE */ + + /** * walk_page_range - walk a memory map's page tables with a callback @@ -165,18 +197,17 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *uninitialized_var(vma); + struct vm_area_struct *vma; next = pgd_addr_end(addr, end); -#ifdef CONFIG_HUGETLB_PAGE /* * handle hugetlb vma individually because pagetable walk for * the hugetlb page is dependent on the architecture and * we can't handled it in the same manner as non-huge pages. 
*/ - vma = find_vma(walk->mm, addr); - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) { + vma = hugetlb_vma(addr, walk); + if (vma) { if (vma->vm_end < next) next = vma->vm_end; /* @@ -189,7 +220,7 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, next); continue; } -#endif + if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); -- cgit v1.2.3-59-g8ed1b From c27fe4c8942d3ca715986f79cc26f44608d7d9fb Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:10 -0700 Subject: pagewalk: add locking-rule comments Originally, walk_hugetlb_range() didn't require a caller take any lock. But commit d33b9f45bd ("mm: hugetlb: fix hugepage memory leak in walk_page_range") changed its rule. Because it added find_vma() call in walk_hugetlb_range(). Any locking-rule change commit should write a doc too. [akpm@linux-foundation.org: clarify comment] Signed-off-by: KOSAKI Motohiro Cc: Naoya Horiguchi Cc: Hiroyuki Kamezawa Cc: Andrea Arcangeli Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/pagewalk.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6321e840e21d..3c5505a3bc49 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -911,6 +911,8 @@ unsigned long unmap_vmas(struct mmu_gather *tlb, * @pte_entry: if set, called for each non-empty PTE (4th-level) entry * @pte_hole: if set, called for each hole at all levels * @hugetlb_entry: if set, called for each hugetlb entry + * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry + * is used. 
* * (see walk_page_range for more details) */ diff --git a/mm/pagewalk.c b/mm/pagewalk.c index ee4ff87c58c1..f7929406e776 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -181,6 +181,9 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, * * If any callback returns a non-zero value, the walk is aborted and * the return value is propagated back to the caller. Otherwise 0 is returned. + * + * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry + * is !NULL. */ int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk) -- cgit v1.2.3-59-g8ed1b From dd78553b5e7a0b34c0b60478d04ee16d8d8f4fa7 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 25 Jul 2011 17:12:11 -0700 Subject: pagewalk: fix code comment for THP Commit bae9c19bf1 ("thp: split_huge_page_mm/vma") changed locking behavior of walk_page_range(). Thus this patch changes the comment too. Signed-off-by: KOSAKI Motohiro Cc: Naoya Horiguchi Cc: Hiroyuki Kamezawa Cc: Andrea Arcangeli Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index f7929406e776..2f5cf10ff660 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -176,7 +176,8 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, * associated range, and a copy of the original mm_walk for access to * the ->private or ->mm fields. * - * No locks are taken, but the bottom level iterator will map PTE + * Usually no locks are taken, but splitting transparent huge page may + * take page table lock. And the bottom level iterator will map PTE * directories from highmem if necessary. 
* * If any callback returns a non-zero value, the walk is aborted and -- cgit v1.2.3-59-g8ed1b From 00a66d2974485d7d95d61d5772142b2a2231ed2a Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 25 Jul 2011 17:12:12 -0700 Subject: mm: remove the leftovers of noswapaccount In commit a2c8990aed5ab ("memsw: remove noswapaccount kernel parameter"), Michal forgot to remove some left pieces of noswapaccount in the tree, this patch removes them all. Signed-off-by: WANG Cong Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 16 ---------------- init/Kconfig | 4 ++-- mm/page_cgroup.c | 2 +- 3 files changed, 3 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index d59e71df5c5c..f9d240dfac06 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -518,22 +518,6 @@ Files: net/netfilter/xt_connlimit.c ---------------------------- -What: noswapaccount kernel command line parameter -When: 2.6.40 -Why: The original implementation of memsw feature enabled by - CONFIG_CGROUP_MEM_RES_CTLR_SWAP could be disabled by the noswapaccount - kernel parameter (introduced in 2.6.29-rc1). Later on, this decision - turned out to be not ideal because we cannot have the feature compiled - in and disabled by default and let only interested to enable it - (e.g. general distribution kernels might need it). Therefore we have - added swapaccount[=0|1] parameter (introduced in 2.6.37) which provides - the both possibilities. If we remove noswapaccount we will have - less command line parameters with the same functionality and we - can also cleanup the parameter handling a bit (). 
-Who: Michal Hocko - ----------------------------- - What: ipt_addrtype match include file When: 2012 Why: superseded by xt_addrtype diff --git a/init/Kconfig b/init/Kconfig index e20aa3112240..d62778390e55 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -673,7 +673,7 @@ config CGROUP_MEM_RES_CTLR_SWAP be careful about enabling this. When memory resource controller is disabled by boot option, this will be automatically disabled and there will be no overhead from this. Even when you set this config=y, - if boot option "noswapaccount" is set, swap will not be accounted. + if boot option "swapaccount=0" is set, swap will not be accounted. Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. config CGROUP_MEM_RES_CTLR_SWAP_ENABLED @@ -688,7 +688,7 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED parameter should have this option unselected. For those who want to have the feature enabled by default should select this option (if, for some reason, they need to disable it - then noswapaccount does the trick). + then swapaccount=0 does the trick). 
config CGROUP_PERF bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 53bffc6c293e..9cb1c44ffc37 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -537,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) nomem: printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); printk(KERN_INFO - "swap_cgroup can be disabled by noswapaccount boot option\n"); + "swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } -- cgit v1.2.3-59-g8ed1b From 1bb36fbd4d58ec3fab4dab5ed39a2af492c263ea Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Mon, 25 Jul 2011 17:12:13 -0700 Subject: mm/page_cgroup.c: simplify code by using SECTION_ALIGN_UP() and SECTION_ALIGN_DOWN() macros Commit a539f3533b78e3 ("mm: add SECTION_ALIGN_UP() and SECTION_ALIGN_DOWN() macro") introduced the SECTION_ALIGN_UP() and SECTION_ALIGN_DOWN() macros. Use those macros to increase code readability. Signed-off-by: Daniel Kiper Acked-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 9cb1c44ffc37..39d216d535ea 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -225,8 +225,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn, unsigned long start, end, pfn; int fail = 0; - start = start_pfn & ~(PAGES_PER_SECTION - 1); - end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); if (nid == -1) { /* @@ -258,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn, { unsigned long start, end, pfn; - start = start_pfn & ~(PAGES_PER_SECTION - 1); - end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + start = SECTION_ALIGN_DOWN(start_pfn); + end = 
SECTION_ALIGN_UP(start_pfn + nr_pages); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_cgroup(pfn); -- cgit v1.2.3-59-g8ed1b From d788e80a8c83ecdbdd55b6e985cced9cfe3a815b Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Mon, 25 Jul 2011 17:12:14 -0700 Subject: mm/huge_memory.c: minor lock simplification in __khugepaged_exit The lock is released first thing in all three branches. Simplify this by unconditionally releasing lock and remove else clause which was only there to be sure lock was released. Signed-off-by: Chris Wright Reviewed-by: Michal Hocko Cc: Andrea Arcangeli Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 81532f297fd2..e2d1587be269 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1596,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm) list_del(&mm_slot->mm_node); free = 1; } + spin_unlock(&khugepaged_mm_lock); if (free) { - spin_unlock(&khugepaged_mm_lock); clear_bit(MMF_VM_HUGEPAGE, &mm->flags); free_mm_slot(mm_slot); mmdrop(mm); } else if (mm_slot) { - spin_unlock(&khugepaged_mm_lock); /* * This is required to serialize against * khugepaged_test_exit() (which is guaranteed to run @@ -1614,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm) */ down_write(&mm->mmap_sem); up_write(&mm->mmap_sem); - } else - spin_unlock(&khugepaged_mm_lock); + } } static void release_pte_page(struct page *page) -- cgit v1.2.3-59-g8ed1b From 32f84528fbb5177275193a3311be8756f0cbd62c Mon Sep 17 00:00:00 2001 From: Chris Forbes Date: Mon, 25 Jul 2011 17:12:14 -0700 Subject: mm: hugetlb: fix coding style issues Fix coding style issues flagged by checkpatch.pl Signed-off-by: Chris Forbes Acked-by: Eric B Munson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 31 +++++++++++++++---------------- 1 file changed, 15 
insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c6d342d313c7..dae27ba3be2c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include @@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock); * must either hold the mmap_sem for write, or the mmap_sem for read and * the hugetlb_instantiation mutex: * - * down_write(&mm->mmap_sem); + * down_write(&mm->mmap_sem); * or - * down_read(&mm->mmap_sem); - * mutex_lock(&hugetlb_instantiation_mutex); + * down_read(&mm->mmap_sem); + * mutex_lock(&hugetlb_instantiation_mutex); */ struct file_region { struct list_head link; @@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page) h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++) { - page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | - 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1<< PG_writeback); + page[i].flags &= ~(1 << PG_locked | 1 << PG_error | + 1 << PG_referenced | 1 << PG_dirty | + 1 << PG_active | 1 << PG_reserved | + 1 << PG_private | 1 << PG_writeback); } set_compound_page_dtor(page, NULL); set_page_refcounted(page); @@ -591,7 +592,6 @@ int PageHuge(struct page *page) return dtor == free_huge_page; } - EXPORT_SYMBOL_GPL(PageHuge); static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) @@ -2132,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, pte_t entry; entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { + if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); - } } @@ -2189,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte) if (huge_pte_none(pte) || pte_present(pte)) return 0; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && 
is_migration_entry(swp)) { + if (non_swap_entry(swp) && is_migration_entry(swp)) return 1; - } else + else return 0; } @@ -2202,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) if (huge_pte_none(pte) || pte_present(pte)) return 0; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) return 1; - } else + else return 0; } @@ -2567,7 +2566,7 @@ retry: * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON | + ret = VM_FAULT_HWPOISON | VM_FAULT_SET_HINDEX(h - hstates); goto backout_unlocked; } @@ -2635,7 +2634,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, migration_entry_wait(mm, (pmd_t *)ptep, address); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON_LARGE | + return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(h - hstates); } -- cgit v1.2.3-59-g8ed1b From 6ac47520063b230641a64062b8a229201cd0a3a8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Jul 2011 17:12:16 -0700 Subject: mm/memory.c: remove ZAP_BLOCK_SIZE ZAP_BLOCK_SIZE became unused in the preemptible-mmu_gather work ("mm: Remove i_mmap_lock lockbreak"). So zap it. 
Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 9b8a01d941cb..a58bbebb3070 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, return addr; } -#ifdef CONFIG_PREEMPT -# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) -#else -/* No preempt: go for improved straight-line efficiency */ -# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) -#endif - /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather @@ -1310,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, * * Unmap all pages in the vma list. * - * We aim to not hold locks for too long (for scheduling latency reasons). - * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to - * return the ending mmu_gather to the caller. - * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. -- cgit v1.2.3-59-g8ed1b From 11239836c04b50ba8453ec58ca7a7bd716ef02c1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 25 Jul 2011 17:12:17 -0700 Subject: oom: remove references to old badness() function The badness() function in the oom killer was renamed to oom_badness() in a63d83f427fb ("oom: badness heuristic rewrite") since it is a globally exported function for clarity. The prototype for the old function still existed in linux/oom.h, so remove it. There are no existing users. Also fixes documentation and comment references to badness() and adjusts them accordingly. 
Signed-off-by: David Rientjes Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/obsolete/proc-pid-oom_adj | 2 +- Documentation/feature-removal-schedule.txt | 2 +- include/linux/oom.h | 4 ---- mm/oom_kill.c | 2 +- 4 files changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/Documentation/ABI/obsolete/proc-pid-oom_adj b/Documentation/ABI/obsolete/proc-pid-oom_adj index cf63f264ce0f..9a3cb88ade47 100644 --- a/Documentation/ABI/obsolete/proc-pid-oom_adj +++ b/Documentation/ABI/obsolete/proc-pid-oom_adj @@ -14,7 +14,7 @@ Why: /proc//oom_adj allows userspace to influence the oom killer's A much more powerful interface, /proc//oom_score_adj, was introduced with the oom killer rewrite that allows users to increase or - decrease the badness() score linearly. This interface will replace + decrease the badness score linearly. This interface will replace /proc//oom_adj. A warning will be emitted to the kernel log if an application uses this diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index f9d240dfac06..d093e550dbeb 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -184,7 +184,7 @@ Why: /proc//oom_adj allows userspace to influence the oom killer's A much more powerful interface, /proc//oom_score_adj, was introduced with the oom killer rewrite that allows users to increase or - decrease the badness() score linearly. This interface will replace + decrease the badness score linearly. This interface will replace /proc//oom_adj. 
A warning will be emitted to the kernel log if an application uses this diff --git a/include/linux/oom.h b/include/linux/oom.h index 4952fb874ad3..13b7b02e599a 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -64,10 +64,6 @@ static inline void oom_killer_enable(void) oom_killer_disabled = false; } -/* The badness from the OOM killer */ -extern unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, - const nodemask_t *nodemask, unsigned long uptime); - extern struct task_struct *find_lock_task_mm(struct task_struct *p); /* sysctls */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b0be989d4365..eafff89b3dd6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -487,7 +487,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If any of p's children has a different mm and is eligible for kill, - * the one with the highest badness() score is sacrificed for its + * the one with the highest oom_badness() score is sacrificed for its * parent. This attempts to lose the minimal amount of work done while * still freeing memory. */ -- cgit v1.2.3-59-g8ed1b From c9d8c3d0896bfa5b57531ecc41a85ffbc6d87dbe Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Jul 2011 17:12:18 -0700 Subject: mm/memblock.c: avoid abuse of RED_INACTIVE RED_INACTIVE is a slab thing, and reusing it for memblock was inappropriate, because memblock is dealing with phys_addr_t's which have a Kconfigurable sizeof(). Create a new poison type for this application. 
Fixes the sparse warning warning: cast truncates bits from constant value (9f911029d74e35b becomes 9d74e35b) Reported-by: H Hartley Sweeten Tested-by: H Hartley Sweeten Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 6 ++++++ mm/memblock.c | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/poison.h b/include/linux/poison.h index 2110a81c5e2a..79159de0e341 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -40,6 +40,12 @@ #define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */ #define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */ +#ifdef CONFIG_PHYS_ADDR_T_64BIT +#define MEMBLOCK_INACTIVE 0x3a84fb0144c9e71bULL +#else +#define MEMBLOCK_INACTIVE 0x44c9e71bUL +#endif + #define SLUB_RED_INACTIVE 0xbb #define SLUB_RED_ACTIVE 0xcc diff --git a/mm/memblock.c b/mm/memblock.c index a0562d1a6ad4..ccbf97339592 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -758,9 +758,9 @@ void __init memblock_analyze(void) /* Check marker in the unused last array entry */ WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base - != (phys_addr_t)RED_INACTIVE); + != MEMBLOCK_INACTIVE); WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base - != (phys_addr_t)RED_INACTIVE); + != MEMBLOCK_INACTIVE); memblock.memory_size = 0; @@ -786,8 +786,8 @@ void __init memblock_init(void) memblock.reserved.max = INIT_MEMBLOCK_REGIONS; /* Write a marker in the unused last array entry */ - memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; - memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; + memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; /* Create a dummy zero size MEMBLOCK which will get coalesced away later. * This simplifies the memblock_add() code below... 
-- cgit v1.2.3-59-g8ed1b From c15bef3099c346f2124367bff46954b59e13c3ee Mon Sep 17 00:00:00 2001 From: Dmitry Fink Date: Mon, 25 Jul 2011 17:12:19 -0700 Subject: mmap: fix and tidy up overcommit page arithmetic - shmem pages are not immediately available, but they are not potentially available either, even if we swap them out, they will just relocate from memory into swap, total amount of immediate and potentially available memory is not going to be affected, so we shouldn't count them as potentially free in the first place. - nr_free_pages() is not an expensive operation anymore, there is no need to split the decision making in two halves and repeat code. Signed-off-by: Dmitry Fink Reviewed-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 34 +++++++++++++--------------------- mm/nommu.c | 34 +++++++++++++--------------------- 2 files changed, 26 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index d49736ff8a8d..a65efd4db3e1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - unsigned long n; + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. 
+ */ + free -= global_page_state(NR_SHMEM); - free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* @@ -135,35 +143,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free += global_page_state(NR_SLAB_RECLAIMABLE); - /* - * Leave the last 3% for root - */ - if (!cap_sys_admin) - free -= free / 32; - - if (free > pages) - return 0; - - /* - * nr_free_pages() is very expensive on large systems, - * only call if we're about to fail. - */ - n = nr_free_pages(); - /* * Leave reserved pages. The pages are not for anonymous pages. */ - if (n <= totalreserve_pages) + if (free <= totalreserve_pages) goto error; else - n -= totalreserve_pages; + free -= totalreserve_pages; /* * Leave the last 3% for root */ if (!cap_sys_admin) - n -= n / 32; - free += n; + free -= free / 32; if (free > pages) return 0; diff --git a/mm/nommu.c b/mm/nommu.c index 5c5c2d4b1807..4358032566e9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1884,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - unsigned long n; + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); - free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* @@ -1897,35 +1905,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free += global_page_state(NR_SLAB_RECLAIMABLE); - /* - * Leave the last 3% for root - */ - if (!cap_sys_admin) - free -= free / 32; - - if (free > pages) - return 0; - - /* - * nr_free_pages() is very expensive on large systems, - * only call if we're about to fail. - */ - n = nr_free_pages(); - /* * Leave reserved pages. 
The pages are not for anonymous pages. */ - if (n <= totalreserve_pages) + if (free <= totalreserve_pages) goto error; else - n -= totalreserve_pages; + free -= totalreserve_pages; /* * Leave the last 3% for root */ if (!cap_sys_admin) - n -= n / 32; - free += n; + free -= free / 32; if (free > pages) return 0; -- cgit v1.2.3-59-g8ed1b From 5e5358e7cf48aa079b8761a7d806ad536023745c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:23 -0700 Subject: mm: cleanup descriptions of filler arg The often-NULL data arg to read_cache_page() and read_mapping_page() functions is misdescribed as "destination for read data": no, it's the first arg to the filler function, often struct file * to ->readpage(). Satisfy checkpatch.pl on those filler prototypes, and tidy up the declarations in linux/pagemap.h. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 +++++------- mm/filemap.c | 12 ++++++------ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8e38d4c140ff..cfaaa6949b8b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -255,26 +255,24 @@ static inline struct page *grab_cache_page(struct address_space *mapping, extern struct page * grab_cache_page_nowait(struct address_space *mapping, pgoff_t index); extern struct page * read_cache_page_async(struct address_space *mapping, - pgoff_t index, filler_t *filler, - void *data); + pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page(struct address_space *mapping, - pgoff_t index, filler_t *filler, - void *data); + pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); static inline struct page 
*read_mapping_page_async( - struct address_space *mapping, - pgoff_t index, void *data) + struct address_space *mapping, + pgoff_t index, void *data) { filler_t *filler = (filler_t *)mapping->a_ops->readpage; return read_cache_page_async(mapping, index, filler, data); } static inline struct page *read_mapping_page(struct address_space *mapping, - pgoff_t index, void *data) + pgoff_t index, void *data) { filler_t *filler = (filler_t *)mapping->a_ops->readpage; return read_cache_page(mapping, index, filler, data); diff --git a/mm/filemap.c b/mm/filemap.c index f820e600f1ad..2780be4bd493 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1792,7 +1792,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data, gfp_t gfp) { @@ -1823,7 +1823,7 @@ repeat: static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data, gfp_t gfp) @@ -1863,7 +1863,7 @@ out: * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Same as read_cache_page, but don't wait for page to become unlocked * after submitting it to the filler. 
@@ -1875,7 +1875,7 @@ out: */ struct page *read_cache_page_async(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -1923,7 +1923,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Read into the page cache. If a page already exists, and PageUptodate() is * not set, try to fill the page then wait for it to become unlocked. @@ -1932,7 +1932,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); -- cgit v1.2.3-59-g8ed1b From 8a549bea51138be2126a2cc6aabe8f17ef66b79b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:24 -0700 Subject: mm: tidy vmtruncate_range and related functions Use consistent variable names in truncate_pagecache(), truncate_setsize(), vmtruncate() and vmtruncate_range(). unmap_mapping_range() and vmtruncate_range() have mismatched interfaces: don't change either, but make the vmtruncates more precise about what they expect unmap_mapping_range() to do. vmtruncate_range() is currently called only with page-aligned start and end+1: can handle unaligned start, but unaligned end+1 would hit BUG_ON in truncate_inode_pages_range() (lacks partial clearing of the end page). 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 003c6c685fc8..c924764e2ce5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -531,8 +531,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode - * @old: old file offset - * @new: new file offset + * @oldsize: old file size + * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache * is called. @@ -544,9 +544,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ -void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) { struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(newsize, PAGE_SIZE); /* * unmap_mapping_range is called twice, first simply for @@ -557,9 +558,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) * truncate_inode_pages finishes, hence the second * unmap_mapping_range call must be made for correctness. 
*/ - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, new); - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + unmap_mapping_range(mapping, holebegin, 0, 1); + truncate_inode_pages(mapping, newsize); + unmap_mapping_range(mapping, holebegin, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); @@ -589,29 +590,31 @@ EXPORT_SYMBOL(truncate_setsize); /** * vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used - * @offset: file offset to start truncating + * @newsize: file offset to start truncating * * This function is deprecated and truncate_setsize or truncate_pagecache * should be used instead, together with filesystem specific block truncation. */ -int vmtruncate(struct inode *inode, loff_t offset) +int vmtruncate(struct inode *inode, loff_t newsize) { int error; - error = inode_newsize_ok(inode, offset); + error = inode_newsize_ok(inode, newsize); if (error) return error; - truncate_setsize(inode, offset); + truncate_setsize(inode, newsize); if (inode->i_op->truncate) inode->i_op->truncate(inode); return 0; } EXPORT_SYMBOL(vmtruncate); -int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) { struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(lstart, PAGE_SIZE); + loff_t holelen = 1 + lend - holebegin; /* * If the underlying filesystem is not going to provide @@ -623,10 +626,10 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) mutex_lock(&inode->i_mutex); inode_dio_wait(inode); - unmap_mapping_range(mapping, offset, (end - offset), 1); - inode->i_op->truncate_range(inode, offset, end); + unmap_mapping_range(mapping, holebegin, holelen, 1); + inode->i_op->truncate_range(inode, lstart, lend); /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(mapping, offset, (end - offset), 1); + unmap_mapping_range(mapping, holebegin, holelen, 
1); mutex_unlock(&inode->i_mutex); return 0; -- cgit v1.2.3-59-g8ed1b From b85e0effd3dcbf9118b896232f59526ab1a39a74 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:25 -0700 Subject: mm: consistent truncate and invalidate loops Make the pagevec_lookup loops in truncate_inode_pages_range(), invalidate_mapping_pages() and invalidate_inode_pages2_range() more consistent with each other. They were relying upon page->index of an unlocked page, but apologizing for it: accept it, embrace it, add comments and WARN_ONs, and simplify the index handling. invalidate_inode_pages2_range() had special handling for a wrapped page->index + 1 = 0 case; but MAX_LFS_FILESIZE doesn't let us anywhere near there, and a corrupt page->index in the radix_tree could cause more trouble than that would catch. Remove that wrapped handling. invalidate_inode_pages2_range() uses min() to limit the pagevec_lookup when near the end of the range: copy that into the other two, although it's less useful than you might think (it limits the use of the buffer, rather than the indices looked up). 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 ++ mm/truncate.c | 110 +++++++++++++++++++++++++--------------------------------- 2 files changed, 49 insertions(+), 63 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 2780be4bd493..10a171113273 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -128,6 +128,7 @@ void __delete_from_page_cache(struct page *page) radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; + /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) @@ -483,6 +484,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); diff --git a/mm/truncate.c b/mm/truncate.c index c924764e2ce5..dc459014f777 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page) * The first pass will remove most pages, so the search cost of the second pass * is low. * - * When looking at page->index outside the page lock we need to be careful to - * copy it into a local to avoid races (it could change at any time). - * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. 
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - pgoff_t end; const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; - pgoff_t next; + pgoff_t index; + pgoff_t end; int i; cleancache_flush_inode(mapping); @@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping, end = (lend >> PAGE_CACHE_SHIFT); pagevec_init(&pvec, 0); - next = start; - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - if (page_index > end) { - next = page_index; + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; - } - if (page_index > next) - next = page_index; - next++; if (!trylock_page(page)) continue; + WARN_ON(page->index != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } if (partial) { @@ -264,13 +259,14 @@ void truncate_inode_pages_range(struct address_space *mapping, } } - next = start; + index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - if (next == start) + if (!pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + if (index == start) break; - next = start; + index = start; continue; } if (pvec.pages[0]->index > end) { @@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping, for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = 
pvec.pages[i]; - if (page->index > end) + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; + lock_page(page); + WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); - if (page->index > next) - next = page->index; - next++; unlock_page(page); } pagevec_release(&pvec); mem_cgroup_uncharge_end(); + index++; } cleancache_flush_inode(mapping); } @@ -333,35 +331,26 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next = start; + pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; pagevec_init(&pvec, 0); - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t index; - int lock_failed; - - lock_failed = !trylock_page(page); - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. 
- */ + /* We rely upon deletion not changing page->index */ index = page->index; - if (index > next) - next = index; - next++; - if (lock_failed) - continue; + if (index > end) + break; + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); ret = invalidate_inode_page(page); unlock_page(page); /* @@ -371,12 +360,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!ret) deactivate_page(page); count += ret; - if (next > end) - break; } pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } return count; } @@ -442,37 +430,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next; + pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; - int wrapped = 0; cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); - next = start; - while (next <= end && !wrapped && - pagevec_lookup(&pvec, mapping, next, - min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index; + + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) + break; lock_page(page); + WARN_ON(page->index != index); if (page->mapping != mapping) { unlock_page(page); continue; } - page_index = page->index; - next = page_index + 1; - if (next == 0) - wrapped = 1; - if (page_index > end) { - unlock_page(page); - break; - } wait_on_page_writeback(page); if (page_mapped(page)) { if (!did_range_unmap) { @@ -480,9 +463,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Zap the rest of the file in one hit. 
*/ unmap_mapping_range(mapping, - (loff_t)page_index< Date: Mon, 25 Jul 2011 17:12:25 -0700 Subject: mm: pincer in truncate_inode_pages_range truncate_inode_pages_range()'s final loop has a nice pincer property, bringing start and end together, squeezing out the last pages. But the range handling missed out on that, just sliding up the range, perhaps letting pages come in behind it. Add one more test to give it the same pincer effect. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index dc459014f777..232eb2736a79 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -269,7 +269,7 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; continue; } - if (pvec.pages[0]->index > end) { + if (index == start && pvec.pages[0]->index > end) { pagevec_release(&pvec); break; } -- cgit v1.2.3-59-g8ed1b From d515afe88a32e567c550e3db914f3e378f86453a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:26 -0700 Subject: tmpfs: no need to use i_lock 2.6.36's 7e496299d4d2 ("tmpfs: make tmpfs scalable with percpu_counter for used blocks") to make tmpfs scalable with percpu_counter used inode->i_lock in place of sbinfo->stat_lock around i_blocks updates; but that was adverse to scalability, and unnecessary, since info->lock is already held there in the fast paths. Remove those uses of i_lock, and add info->lock in the three error paths where it's then needed across shmem_free_blocks(). It's not actually needed across shmem_unacct_blocks(), but they're so often paired that it looks wrong to split them apart. 
Signed-off-by: Hugh Dickins Acked-by: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index fcedf5464eb7..c1db11cf220d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -241,9 +241,7 @@ static void shmem_free_blocks(struct inode *inode, long pages) struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { percpu_counter_add(&sbinfo->used_blocks, -pages); - spin_lock(&inode->i_lock); inode->i_blocks -= pages*BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); } } @@ -432,9 +430,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long sbinfo->max_blocks - 1) >= 0) return ERR_PTR(-ENOSPC); percpu_counter_inc(&sbinfo->used_blocks); - spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); } spin_unlock(&info->lock); @@ -1421,9 +1417,7 @@ repeat: shmem_acct_block(info->flags)) goto nospace; percpu_counter_inc(&sbinfo->used_blocks); - spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); } else if (shmem_acct_block(info->flags)) goto nospace; @@ -1434,8 +1428,10 @@ repeat: spin_unlock(&info->lock); filepage = shmem_alloc_page(gfp, info, idx); if (!filepage) { + spin_lock(&info->lock); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); + spin_unlock(&info->lock); error = -ENOMEM; goto failed; } @@ -1449,8 +1445,10 @@ repeat: current->mm, GFP_KERNEL); if (error) { page_cache_release(filepage); + spin_lock(&info->lock); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); + spin_unlock(&info->lock); filepage = NULL; goto failed; } @@ -1480,10 +1478,10 @@ repeat: * be done automatically. 
*/ if (ret) { - spin_unlock(&info->lock); - page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); + spin_unlock(&info->lock); + page_cache_release(filepage); filepage = NULL; if (error) goto failed; -- cgit v1.2.3-59-g8ed1b From 1d65f86db14806cf7b1218c7b4ecb8b4db5af27d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 25 Jul 2011 17:12:27 -0700 Subject: mm: preallocate page before lock_page() at filemap COW Currently we are keeping faulted page locked throughout whole __do_fault call (except for page_mkwrite code path) after calling file system's fault code. If we do early COW, we allocate a new page which has to be charged for a memcg (mem_cgroup_newpage_charge). This function, however, might block for unbounded amount of time if memcg oom killer is disabled or fork-bomb is running because the only way out of the OOM situation is either an external event or OOM-situation fix. In the end we are keeping the faulted page locked and blocking other processes from faulting it in which is not good at all because we are basically punishing potentially an unrelated process for OOM condition in a different group (I have seen stuck system because of ld-2.11.1.so being locked). We can do test easily. % cgcreate -g memory:A % cgset -r memory.limit_in_bytes=64M A % cgset -r memory.memsw.limit_in_bytes=64M A % cd kernel_dir; cgexec -g memory:A make -j Then, the whole system will be live-locked until you kill 'make -j' by hand (or push reboot...) This is because some important pages in a shared library are locked. Considering again, the new page is not necessary to be allocated with lock_page() held. And usual page allocation may dive into long memory reclaim loop with holding lock_page() and can cause very long latency. There are 3 ways. 1. do allocation/charge before lock_page() Pros. - simple and can handle page allocation in the same manner. This will reduce holding time of lock_page() in general. Cons.
- we do page allocation even if ->fault() returns error. 2. do charge after unlock_page(). Even if charge fails, it's just OOM. Pros. - no impact to non-memcg path. Cons. - implementation requires special care of LRU and we need to modify page_add_new_anon_rmap()... 3. do unlock->charge->lock again method. Pros. - no impact to non-memcg path. Cons. - This may kill LOCK_PAGE_RETRY optimization. We need to release lock and get it again... This patch moves "charge" and memory allocation for COW page before lock_page(). Then, we can avoid scanning LRU with holding a lock on a page and latency under lock_page() will be reduced. Then, the above livelock disappears. [akpm@linux-foundation.org: fix code layout] Signed-off-by: KAMEZAWA Hiroyuki Reported-by: Lutz Vieweg Original-idea-by: Michal Hocko Cc: Michal Hocko Cc: Ying Han Cc: Johannes Weiner Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 56 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index a58bbebb3070..3c9f3aa8332e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3093,14 +3093,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *page_table; spinlock_t *ptl; struct page *page; + struct page *cow_page; pte_t entry; int anon = 0; - int charged = 0; struct page *dirty_page = NULL; struct vm_fault vmf; int ret; int page_mkwrite = 0; + /* + * If we do COW later, allocate page befor taking lock_page() + * on the file cache page. This will reduce lock holding time.
+ */ + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + + cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!cow_page) + return VM_FAULT_OOM; + + if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { + page_cache_release(cow_page); + return VM_FAULT_OOM; + } + } else + cow_page = NULL; + vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; vmf.flags = flags; @@ -3109,12 +3129,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; + goto uncharge_out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - return VM_FAULT_HWPOISON; + ret = VM_FAULT_HWPOISON; + goto uncharge_out; } /* @@ -3132,23 +3153,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, page = vmf.page; if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { + page = cow_page; anon = 1; - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; - goto out; - } - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); - if (!page) { - ret = VM_FAULT_OOM; - goto out; - } - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { - ret = VM_FAULT_OOM; - page_cache_release(page); - goto out; - } - charged = 1; copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -3217,8 +3223,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, page_table); } else { - if (charged) - mem_cgroup_uncharge_page(page); + if (cow_page) + mem_cgroup_uncharge_page(cow_page); if (anon) page_cache_release(page); else @@ -3227,7 +3233,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, 
pte_unmap_unlock(page_table, ptl); -out: if (dirty_page) { struct address_space *mapping = page->mapping; @@ -3257,6 +3262,13 @@ out: unwritable_page: page_cache_release(page); return ret; +uncharge_out: + /* fs's fault handler get error */ + if (cow_page) { + mem_cgroup_uncharge_page(cow_page); + page_cache_release(cow_page); + } + return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.2.3-59-g8ed1b From cd38b115d5ad79b0100ac6daa103c4fe2c50a913 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jul 2011 17:12:29 -0700 Subject: mm: page allocator: initialise ZLC for first zone eligible for zone_reclaim There have been a small number of complaints about significant stalls while copying large amounts of data on NUMA machines reported on a distribution bugzilla. In these cases, zone_reclaim was enabled by default due to large NUMA distances. In general, the complaints have not been about the workload itself unless it was a file server (in which case the recommendation was disable zone_reclaim). The stalls are mostly due to significant amounts of time spent scanning the preferred zone for pages to free. After a failure, it might fallback to another node (as zonelists are often node-ordered rather than zone-ordered) but stall quickly again when the next allocation attempt occurs. In bad cases, each page allocated results in a full scan of the preferred zone. Patch 1 checks the preferred zone for recent allocation failure which is particularly important if zone_reclaim has failed recently. This avoids rescanning the zone in the near future and instead falling back to another node. This may hurt node locality in some cases but a failure to zone_reclaim is more expensive than a remote access. Patch 2 clears the zlc information after direct reclaim. Otherwise, zone_reclaim can mark zones full, direct reclaim can reclaim enough pages but the zone is still not considered for allocation. 
This was tested on a 24-thread 2-node x86_64 machine. The tests were focused on large amounts of IO. All tests were bound to the CPUs on node-0 to avoid disturbances due to processes being scheduled on different nodes. The kernels tested are 3.0-rc6-vanilla Vanilla 3.0-rc6 zlcfirst Patch 1 applied zlcreconsider Patches 1+2 applied FS-Mark ./fs_mark -d /tmp/fsmark-10813 -D 100 -N 5000 -n 208 -L 35 -t 24 -S0 -s 524288 fsmark-3.0-rc6 3.0-rc6 3.0-rc6 vanilla zlcfirs zlcreconsider Files/s min 54.90 ( 0.00%) 49.80 (-10.24%) 49.10 (-11.81%) Files/s mean 100.11 ( 0.00%) 135.17 (25.94%) 146.93 (31.87%) Files/s stddev 57.51 ( 0.00%) 138.97 (58.62%) 158.69 (63.76%) Files/s max 361.10 ( 0.00%) 834.40 (56.72%) 802.40 (55.00%) Overhead min 76704.00 ( 0.00%) 76501.00 ( 0.27%) 77784.00 (-1.39%) Overhead mean 1485356.51 ( 0.00%) 1035797.83 (43.40%) 1594680.26 (-6.86%) Overhead stddev 1848122.53 ( 0.00%) 881489.88 (109.66%) 1772354.90 ( 4.27%) Overhead max 7989060.00 ( 0.00%) 3369118.00 (137.13%) 10135324.00 (-21.18%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 501.49 493.91 499.93 Total Elapsed Time (seconds) 2451.57 2257.48 2215.92 MMTests Statistics: vmstat Page Ins 46268 63840 66008 Page Outs 90821596 90671128 88043732 Swap Ins 0 0 0 Swap Outs 0 0 0 Direct pages scanned 13091697 8966863 8971790 Kswapd pages scanned 0 1830011 1831116 Kswapd pages reclaimed 0 1829068 1829930 Direct pages reclaimed 13037777 8956828 8648314 Kswapd efficiency 100% 99% 99% Kswapd velocity 0.000 810.643 826.346 Direct efficiency 99% 99% 96% Direct velocity 5340.128 3972.068 4048.788 Percentage direct scans 100% 83% 83% Page writes by reclaim 0 3 0 Slabs scanned 796672 720640 720256 Direct inode steals 7422667 7160012 7088638 Kswapd inode steals 0 1736840 2021238 Test completes far faster with a large increase in the number of files created per second. Standard deviation is high as a small number of iterations were much higher than the mean. 
The number of pages scanned by zone_reclaim is reduced and kswapd is used for more work. LARGE DD 3.0-rc6 3.0-rc6 3.0-rc6 vanilla zlcfirst zlcreconsider download tar 59 ( 0.00%) 59 ( 0.00%) 55 ( 7.27%) dd source files 527 ( 0.00%) 296 (78.04%) 320 (64.69%) delete source 36 ( 0.00%) 19 (89.47%) 20 (80.00%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 125.03 118.98 122.01 Total Elapsed Time (seconds) 624.56 375.02 398.06 MMTests Statistics: vmstat Page Ins 3594216 439368 407032 Page Outs 23380832 23380488 23377444 Swap Ins 0 0 0 Swap Outs 0 436 287 Direct pages scanned 17482342 69315973 82864918 Kswapd pages scanned 0 519123 575425 Kswapd pages reclaimed 0 466501 522487 Direct pages reclaimed 5858054 2732949 2712547 Kswapd efficiency 100% 89% 90% Kswapd velocity 0.000 1384.254 1445.574 Direct efficiency 33% 3% 3% Direct velocity 27991.453 184832.737 208171.929 Percentage direct scans 100% 99% 99% Page writes by reclaim 0 5082 13917 Slabs scanned 17280 29952 35328 Direct inode steals 115257 1431122 332201 Kswapd inode steals 0 0 979532 This test downloads a large tarfile and copies it with dd a number of times - similar to the most recent bug report I've dealt with. Time to completion is reduced. The number of pages scanned directly is still disturbingly high with a low efficiency but this is likely due to the number of dirty pages encountered. The figures could probably be improved with more work around how kswapd is used and how dirty pages are handled but that is separate work and this result is significant on its own. 
Streaming Mapped Writer MMTests Statistics: duration User/Sys Time Running Test (seconds) 124.47 111.67 112.64 Total Elapsed Time (seconds) 2138.14 1816.30 1867.56 MMTests Statistics: vmstat Page Ins 90760 89124 89516 Page Outs 121028340 120199524 120736696 Swap Ins 0 86 55 Swap Outs 0 0 0 Direct pages scanned 114989363 96461439 96330619 Kswapd pages scanned 56430948 56965763 57075875 Kswapd pages reclaimed 27743219 27752044 27766606 Direct pages reclaimed 49777 46884 36655 Kswapd efficiency 49% 48% 48% Kswapd velocity 26392.541 31363.631 30561.736 Direct efficiency 0% 0% 0% Direct velocity 53780.091 53108.759 51581.004 Percentage direct scans 67% 62% 62% Page writes by reclaim 385 122 1513 Slabs scanned 43008 39040 42112 Direct inode steals 0 10 8 Kswapd inode steals 733 534 477 This test just creates a large file mapping and writes to it linearly. Time to completion is again reduced. The gains are mostly down to two things. In many cases, there is less scanning as zone_reclaim simply gives up faster due to recent failures. The second reason is that memory is used more efficiently. Instead of scanning the preferred zone every time, the allocator falls back to another zone and uses it instead improving overall memory utilisation. This patch: initialise ZLC for first zone eligible for zone_reclaim. The zonelist cache (ZLC) is used among other things to record if zone_reclaim() failed for a particular zone recently. The intention is to avoid a high cost scanning extremely long zonelists or scanning within the zone uselessly. Currently the zonelist cache is setup only after the first zone has been considered and zone_reclaim() has been called. The objective was to avoid a costly setup but zone_reclaim is itself quite expensive. If it is failing regularly such as the first eligible zone having mostly mapped pages, the cost in scanning and allocation stalls is far higher than the ZLC initialisation step. 
This patch initialises ZLC before the first eligible zone calls zone_reclaim(). Once initialised, it is checked whether the zone failed zone_reclaim recently. If it has, the zone is skipped. As the first zone is now being checked, additional care has to be taken about zones marked full. A zone can be marked "full" because it should not have enough unmapped pages for zone_reclaim but this is excessive as direct reclaim or kswapd may succeed where zone_reclaim fails. Only mark zones "full" after zone_reclaim fails if it failed to reclaim enough pages after scanning. Signed-off-by: Mel Gorman Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9119faae6e6a..830a465958de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1664,7 +1664,7 @@ zonelist_scan: continue; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) - goto try_next_zone; + continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { @@ -1676,17 +1676,36 @@ zonelist_scan: classzone_idx, alloc_flags)) goto try_this_zone; + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { + /* + * we do zlc_setup if there are multiple nodes + * and before considering the first zone allowed + * by the cpuset. + */ + allowednodes = zlc_setup(zonelist, alloc_flags); + zlc_active = 1; + did_zlc_setup = 1; + } + if (zone_reclaim_mode == 0) goto this_zone_full; + /* + * As we may have just activated ZLC, check if the first + * eligible zone has failed zone_reclaim recently. 
+ */ + if (NUMA_BUILD && zlc_active && + !zlc_zone_worth_trying(zonelist, z, allowednodes)) + continue; + ret = zone_reclaim(zone, gfp_mask, order); switch (ret) { case ZONE_RECLAIM_NOSCAN: /* did not scan */ - goto try_next_zone; + continue; case ZONE_RECLAIM_FULL: /* scanned but unreclaimable */ - goto this_zone_full; + continue; default: /* did we reclaim enough */ if (!zone_watermark_ok(zone, order, mark, @@ -1703,16 +1722,6 @@ try_this_zone: this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); -try_next_zone: - if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { - /* - * we do zlc_setup after the first zone is tried but only - * if there are multiple nodes make it worthwhile - */ - allowednodes = zlc_setup(zonelist, alloc_flags); - zlc_active = 1; - did_zlc_setup = 1; - } } if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { -- cgit v1.2.3-59-g8ed1b From 76d3fbf8fbf6cc78ceb63549e0e0c5bc8a88f838 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jul 2011 17:12:30 -0700 Subject: mm: page allocator: reconsider zones for allocation after direct reclaim With zone_reclaim_mode enabled, it's possible for zones to be considered full in the zonelist_cache so they are skipped in the future. If the process enters direct reclaim, the ZLC may still consider zones to be full even after reclaiming pages. Reconsider all zones for allocation if direct reclaim returns successfully. 
Signed-off-by: Mel Gorman Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 830a465958de..094472377d81 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1616,6 +1616,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) set_bit(i, zlc->fullzones); } +/* + * clear all zones full, called after direct reclaim makes progress so that + * a zone that was recently full is not skipped over for up to a second + */ +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return; + + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); +} + #else /* CONFIG_NUMA */ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) @@ -1632,6 +1647,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { } + +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ +} #endif /* CONFIG_NUMA */ /* @@ -1963,6 +1982,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (unlikely(!(*did_some_progress))) return NULL; + /* After successful reclaim, reconsider all zones for allocation */ + if (NUMA_BUILD) + zlc_clear_zones_full(zonelist); + retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, -- cgit v1.2.3-59-g8ed1b From 72c4783210f77fd743f0a316858d33f27db51e7c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 25 Jul 2011 17:12:31 -0700 Subject: mm: remove useless rcu lock-unlock from mapping_tagged() radix_tree_tagged() is lockless - it reads from a member of the radix-tree root node. It does not require any protection.
Signed-off-by: Konstantin Khlebnikov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 31f698862420..919b45eb57ad 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1405,10 +1405,6 @@ EXPORT_SYMBOL(test_set_page_writeback); */ int mapping_tagged(struct address_space *mapping, int tag) { - int ret; - rcu_read_lock(); - ret = radix_tree_tagged(&mapping->page_tree, tag); - rcu_read_unlock(); - return ret; + return radix_tree_tagged(&mapping->page_tree, tag); } EXPORT_SYMBOL(mapping_tagged); -- cgit v1.2.3-59-g8ed1b From 2efaca927f5cd7ecd0f1554b8f9b6a9a2c329c03 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 25 Jul 2011 17:12:32 -0700 Subject: mm/futex: fix futex writes on archs with SW tracking of dirty & young I haven't reproduced it myself but the fail scenario is that on such machines (notably ARM and some embedded powerpc), if you manage to hit that futex path on a writable page whose dirty bit has gone from the PTE, you'll livelock inside the kernel from what I can tell. It will go in a loop of trying the atomic access, failing, trying gup to "fix it up", getting success from gup, go back to the atomic access, failing again because dirty wasn't fixed etc... So I think you essentially hang in the kernel. The scenario is probably rare'ish because affected architectures are embedded and tend to not swap much (if at all) so we probably rarely hit the case where dirty is missing or young is missing, but I think Shan has a piece of SW that can reliably reproduce it using a shared writable mapping & fork or something like that. On archs that use SW tracking of dirty & young, a page without dirty is effectively mapped read-only and a page without young is inaccessible in the PTE.
Additionally, some architectures might lazily flush the TLB when relaxing write protection (by doing only a local flush), and expect a fault to invalidate the stale entry if it's still present on another processor. The futex code assumes that if the "in_atomic()" access -EFAULT's, it can "fix it up" by causing get_user_pages() which would then be equivalent to taking the fault. However that isn't the case. get_user_pages() will not call handle_mm_fault() in the case where the PTE seems to have the right permissions, regardless of the dirty and young state. It will eventually update those bits ... in the struct page, but not in the PTE. Additionally, it will not handle the lazy TLB flushing that can be required by some architectures in the fault case. Basically, gup is the wrong interface for the job. The patch provides a more appropriate one which boils down to just calling handle_mm_fault() since what we are trying to do is simulate a real page fault. The futex code currently attempts to write to user memory within a pagefault disabled section, and if that fails, tries to fix it up using get_user_pages(). This doesn't work on archs where the dirty and young bits are maintained by software, since they will gate access permission in the TLB, and will not be updated by gup(). In addition, there's an expectation on some archs that a spurious write fault triggers a local TLB flush, and that is missing from the picture as well. I decided that adding those "features" to gup() would be too much for this already too complex function, and instead added a new simpler fixup_user_fault() which is essentially a wrapper around handle_mm_fault() which the futex code can call. 
[akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix some nits Darren saw, fiddle comment layout] Signed-off-by: Benjamin Herrenschmidt Reported-by: Shan Hai Tested-by: Shan Hai Cc: David Laight Acked-by: Peter Zijlstra Cc: Darren Hart Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ kernel/futex.c | 4 ++-- mm/memory.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3cccd053850f..3172a1c0f08e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -988,6 +988,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); diff --git a/kernel/futex.c b/kernel/futex.c index 3fbc76cbb9aa..0a308970c24a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr) int ret; down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, (unsigned long)uaddr, - 1, 1, 0, NULL, NULL); + ret = fixup_user_fault(current, mm, (unsigned long)uaddr, + FAULT_FLAG_WRITE); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; diff --git a/mm/memory.c b/mm/memory.c index 3c9f3aa8332e..a56e3ba816b2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1805,7 +1805,63 @@ next_page: } EXPORT_SYMBOL(__get_user_pages); -/** +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. 
+ * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * get_user_pages() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mmap_sem held for read. + */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* + * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded.
-- cgit v1.2.3-59-g8ed1b From 708e3508c2a2204cc276dcdb543009a441bfe91b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:32 -0700 Subject: tmpfs: clone shmem_file_splice_read() Copy __generic_file_splice_read() and generic_file_splice_read() from fs/splice.c to shmem_file_splice_read() in mm/shmem.c. Make page_cache_pipe_buf_ops and spd_release_page() accessible to it. Signed-off-by: Hugh Dickins Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/splice.c | 4 +- include/linux/splice.h | 2 + mm/shmem.c | 218 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 221 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/fs/splice.c b/fs/splice.c index aa866d309695..fa2defa8afcf 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -132,7 +132,7 @@ error: return err; } -static const struct pipe_buf_operations page_cache_pipe_buf_ops = { +const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -264,7 +264,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, return ret; } -static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) +void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { page_cache_release(spd->pages[i]); } diff --git a/include/linux/splice.h b/include/linux/splice.h index 997c3b4c212b..26e5b613deda 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -88,5 +88,7 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); extern void splice_shrink_spd(struct pipe_inode_info *, struct splice_pipe_desc *); +extern void spd_release_page(struct splice_pipe_desc *, unsigned int); +extern const struct pipe_buf_operations page_cache_pipe_buf_ops; #endif diff --git a/mm/shmem.c b/mm/shmem.c index c1db11cf220d..d176e488f04d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ 
-51,6 +51,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include #include @@ -1844,6 +1845,221 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, return retval; } +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct address_space *mapping = in->f_mapping; + unsigned int loff, nr_pages, req_pages; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *page; + pgoff_t index, end_index; + loff_t isize, left; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + isize = i_size_read(in->f_mapping->host); + if (unlikely(*ppos >= isize)) + return 0; + + left = isize - *ppos; + if (unlikely(left < len)) + len = left; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nr_pages = min(req_pages, pipe->buffers); + + /* + * Lookup the (hopefully) full range of pages we need. + */ + spd.nr_pages = find_get_pages_contig(mapping, index, + nr_pages, spd.pages); + index += spd.nr_pages; + + /* + * If find_get_pages_contig() returned fewer pages than we needed, + * readahead/allocate the rest and fill in the holes. + */ + if (spd.nr_pages < nr_pages) + page_cache_sync_readahead(mapping, &in->f_ra, in, + index, req_pages - spd.nr_pages); + + error = 0; + while (spd.nr_pages < nr_pages) { + /* + * Page could be there, find_get_pages_contig() breaks on + * the first hole. + */ + page = find_get_page(mapping, index); + if (!page) { + /* + * page didn't exist, allocate one. 
+ */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL); + if (unlikely(error)) { + page_cache_release(page); + if (error == -EEXIST) + continue; + break; + } + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); + } + + spd.pages[spd.nr_pages++] = page; + index++; + } + + /* + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. + */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = spd.pages[page_nr]; + + if (PageReadahead(page)) + page_cache_async_readahead(mapping, &in->f_ra, in, + page, index, req_pages - page_nr); + + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + lock_page(page); + + /* + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. + */ + if (!page->mapping) { + unlock_page(page); + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); + if (unlikely(error)) { + /* + * We really should re-lookup the page here, + * but it complicates things a lot. 
Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ + if (error == AOP_TRUNCATED_PAGE) + error = 0; + + break; + } + } +fill_it: + /* + * i_size must be checked after PageUptodate. + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + unsigned int plen; + + /* + * max good bytes in this page + */ + plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (plen <= loff) + break; + + /* + * force quit after adding this page + */ + this_len = min(this_len, plen - loff); + len = this_len; + } + + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; + len -= this_len; + loff = 0; + spd.nr_pages++; + index++; + } + + /* + * Release any pages at the end, if we quit early. 'page_nr' is how far + * we got, 'nr_pages' is how many pages are in the map. 
+ */ + while (page_nr < nr_pages) + page_cache_release(spd.pages[page_nr++]); + in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(pipe, &spd); + + if (error > 0) { + *ppos += error; + file_accessed(in); + } + return error; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -2699,7 +2915,7 @@ static const struct file_operations shmem_file_operations = { .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = shmem_file_splice_read, .splice_write = generic_file_splice_write, #endif }; -- cgit v1.2.3-59-g8ed1b From 71f0e07a605fad1fb6b288e4dc1dd8dfa78f4872 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:33 -0700 Subject: tmpfs: refine shmem_file_splice_read Tidy up shmem_file_splice_read(): Remove readahead: okay, we could implement shmem readahead on swap, but have never done so before, swap being the slow exceptional path. Use shmem_getpage() instead of find_or_create_page() plus ->readpage(). Remove several comments: sorry, I found them more distracting than helpful, and this will not be the reference version of splice_read(). 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 138 +++++++++---------------------------------------------------- 1 file changed, 19 insertions(+), 119 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index d176e488f04d..f96614526d1c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1850,6 +1850,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; + struct inode *inode = mapping->host; unsigned int loff, nr_pages, req_pages; struct page *pages[PIPE_DEF_BUFFERS]; struct partial_page partial[PIPE_DEF_BUFFERS]; @@ -1865,7 +1866,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; - isize = i_size_read(in->f_mapping->host); + isize = i_size_read(inode); if (unlikely(*ppos >= isize)) return 0; @@ -1881,153 +1882,57 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; nr_pages = min(req_pages, pipe->buffers); - /* - * Lookup the (hopefully) full range of pages we need. - */ spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); index += spd.nr_pages; - - /* - * If find_get_pages_contig() returned fewer pages than we needed, - * readahead/allocate the rest and fill in the holes. - */ - if (spd.nr_pages < nr_pages) - page_cache_sync_readahead(mapping, &in->f_ra, in, - index, req_pages - spd.nr_pages); - error = 0; - while (spd.nr_pages < nr_pages) { - /* - * Page could be there, find_get_pages_contig() breaks on - * the first hole. - */ - page = find_get_page(mapping, index); - if (!page) { - /* - * page didn't exist, allocate one. 
- */ - page = page_cache_alloc_cold(mapping); - if (!page) - break; - - error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL); - if (unlikely(error)) { - page_cache_release(page); - if (error == -EEXIST) - continue; - break; - } - /* - * add_to_page_cache() locks the page, unlock it - * to avoid convoluting the logic below even more. - */ - unlock_page(page); - } + while (spd.nr_pages < nr_pages) { + page = NULL; + error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); + if (error) + break; + unlock_page(page); spd.pages[spd.nr_pages++] = page; index++; } - /* - * Now loop over the map and see if we need to start IO on any - * pages, fill in the partial map, etc. - */ index = *ppos >> PAGE_CACHE_SHIFT; nr_pages = spd.nr_pages; spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { unsigned int this_len; if (!len) break; - /* - * this_len is the max we'll use from this page - */ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); page = spd.pages[page_nr]; - if (PageReadahead(page)) - page_cache_async_readahead(mapping, &in->f_ra, in, - page, index, req_pages - page_nr); - - /* - * If the page isn't uptodate, we may need to start io on it - */ - if (!PageUptodate(page)) { - lock_page(page); - - /* - * Page was truncated, or invalidated by the - * filesystem. Redo the find/create, but this time the - * page is kept locked, so there's no chance of another - * race with truncate/invalidate. 
- */ - if (!page->mapping) { - unlock_page(page); - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - - if (!page) { - error = -ENOMEM; - break; - } - page_cache_release(spd.pages[page_nr]); - spd.pages[page_nr] = page; - } - /* - * page was already under io and is now done, great - */ - if (PageUptodate(page)) { - unlock_page(page); - goto fill_it; - } - - /* - * need to read in the page - */ - error = mapping->a_ops->readpage(in, page); - if (unlikely(error)) { - /* - * We really should re-lookup the page here, - * but it complicates things a lot. Instead - * lets just do what we already stored, and - * we'll get it the next time we are called. - */ - if (error == AOP_TRUNCATED_PAGE) - error = 0; - + if (!PageUptodate(page) || page->mapping != mapping) { + page = NULL; + error = shmem_getpage(inode, index, &page, + SGP_CACHE, NULL); + if (error) break; - } + unlock_page(page); + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; } -fill_it: - /* - * i_size must be checked after PageUptodate. - */ - isize = i_size_read(mapping->host); + + isize = i_size_read(inode); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; if (unlikely(!isize || index > end_index)) break; - /* - * if this is the last page, see if we need to shrink - * the length and stop - */ if (end_index == index) { unsigned int plen; - /* - * max good bytes in this page - */ plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; if (plen <= loff) break; - /* - * force quit after adding this page - */ this_len = min(this_len, plen - loff); len = this_len; } @@ -2040,13 +1945,8 @@ fill_it: index++; } - /* - * Release any pages at the end, if we quit early. 'page_nr' is how far - * we got, 'nr_pages' is how many pages are in the map. 
- */ while (page_nr < nr_pages) page_cache_release(spd.pages[page_nr++]); - in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); -- cgit v1.2.3-59-g8ed1b From 68da9f055755ee2609a1686722e6d6a7980019ee Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:34 -0700 Subject: tmpfs: pass gfp to shmem_getpage_gfp Make shmem_getpage() a wrapper, passing mapping_gfp_mask() down to shmem_getpage_gfp(), which in turn passes gfp down to shmem_swp_alloc(). Change shmem_read_mapping_page_gfp() to use shmem_getpage_gfp() in the CONFIG_SHMEM case; but leave tiny !SHMEM using read_cache_page_gfp(). Add a BUG_ON() in case anyone happens to call this on a non-shmem mapping; though we might later want to let that case route to read_cache_page_gfp(). It annoys me to have these two almost-redundant args, gfp and fault_type: I can't find a better way; but initialize fault_type only in shmem_fault(). Note that before, read_cache_page_gfp() was allocating i915_gem's pages with __GFP_NORETRY as intended; but the corresponding swap vector pages got allocated without it, leaving a small possibility of OOM. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 67 +++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index f96614526d1c..f6c94ba87808 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -127,8 +127,15 @@ static unsigned long shmem_default_max_inodes(void) } #endif -static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp, int *type); +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); + +static inline int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, int *fault_type) +{ + return shmem_getpage_gfp(inode, index, pagep, sgp, + mapping_gfp_mask(inode->i_mapping), fault_type); +} static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) { @@ -404,10 +411,12 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns * @info: info structure for the inode * @index: index of the page to find * @sgp: check and recheck i_size? skip allocation? + * @gfp: gfp mask to use for any page allocation * * If the entry does not exist, allocate it. 
*/ -static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) +static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, + unsigned long index, enum sgp_type sgp, gfp_t gfp) { struct inode *inode = &info->vfs_inode; struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); @@ -435,7 +444,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long } spin_unlock(&info->lock); - page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); + page = shmem_dir_alloc(gfp); spin_lock(&info->lock); if (!page) { @@ -1225,14 +1234,14 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) #endif /* - * shmem_getpage - either get the page from swap or allocate a new one + * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache */ -static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp, int *type) +static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1242,15 +1251,11 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct page *prealloc_page = NULL; swp_entry_t *entry; swp_entry_t swap; - gfp_t gfp; int error; if (idx >= SHMEM_MAX_INDEX) return -EFBIG; - if (type) - *type = 0; - /* * Normally, filepage is NULL on entry, and either found * uptodate immediately, or allocated and zeroed, or read @@ -1264,13 +1269,12 @@ repeat: filepage = find_lock_page(mapping, idx); if (filepage && PageUptodate(filepage)) goto done; - gfp = mapping_gfp_mask(mapping); if (!filepage) { /* * Try to preload while we can wait, to not 
make a habit of * draining atomic reserves; but don't latch on to this cpu. */ - error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); if (error) goto failed; radix_tree_preload_end(); @@ -1290,7 +1294,7 @@ repeat: spin_lock(&info->lock); shmem_recalc_inode(inode); - entry = shmem_swp_alloc(info, idx, sgp); + entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) { spin_unlock(&info->lock); error = PTR_ERR(entry); @@ -1305,12 +1309,12 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); /* here we actually do the io */ - if (type) - *type |= VM_FAULT_MAJOR; + if (fault_type) + *fault_type |= VM_FAULT_MAJOR; swappage = shmem_swapin(swap, gfp, info, idx); if (!swappage) { spin_lock(&info->lock); - entry = shmem_swp_alloc(info, idx, sgp); + entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) error = PTR_ERR(entry); else { @@ -1461,7 +1465,7 @@ repeat: SetPageSwapBacked(filepage); } - entry = shmem_swp_alloc(info, idx, sgp); + entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) error = PTR_ERR(entry); else { @@ -1539,7 +1543,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; int error; - int ret; + int ret = VM_FAULT_LOCKED; if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; @@ -1547,11 +1551,12 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? 
VM_FAULT_OOM : VM_FAULT_SIGBUS); + if (ret & VM_FAULT_MAJOR) { count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); } - return ret | VM_FAULT_LOCKED; + return ret; } #ifdef CONFIG_NUMA @@ -3162,13 +3167,29 @@ int shmem_zero_setup(struct vm_area_struct *vma) * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * - * Provide a stub for those callers to start using now, then later - * flesh it out to call shmem_getpage() with additional gfp mask, when - * shmem_file_splice_read() is added and shmem_readpage() is removed. + * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in + * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. */ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { +#ifdef CONFIG_SHMEM + struct inode *inode = mapping->host; + struct page *page = NULL; + int error; + + BUG_ON(mapping->a_ops != &shmem_aops); + error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); + if (error) + page = ERR_PTR(error); + else + unlock_page(page); + return page; +#else + /* + * The tiny !SHMEM case uses ramfs without swap + */ return read_cache_page_gfp(mapping, index, gfp); +#endif } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); -- cgit v1.2.3-59-g8ed1b From 9276aad6c898dbcc31d095f2934dedd5cbb2e93e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:34 -0700 Subject: tmpfs: remove_shmem_readpage Remove that pernicious shmem_readpage() at last: the things we needed it for (splice, loop, sendfile, i915 GEM) are now fully taken care of by shmem_file_splice_read() and shmem_read_mapping_page_gfp(). This removal clears the way for a simpler shmem_getpage_gfp(), since page is never passed in; but leave most of that cleanup until after. 
sys_readahead() and sys_fadvise(POSIX_FADV_WILLNEED) will now EINVAL, instead of unexpectedly trying to read ahead on tmpfs: if that proves to be an issue for someone, then we can either arrange for them to return success instead, or try to implement async readahead on tmpfs. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 40 ++++++---------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index f6c94ba87808..ff6713a2579e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1246,7 +1246,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; - struct page *filepage = *pagep; + struct page *filepage; struct page *swappage; struct page *prealloc_page = NULL; swp_entry_t *entry; @@ -1255,18 +1255,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, if (idx >= SHMEM_MAX_INDEX) return -EFBIG; - - /* - * Normally, filepage is NULL on entry, and either found - * uptodate immediately, or allocated and zeroed, or read - * in under swappage, which is then assigned to filepage. - * But shmem_readpage (required for splice) passes in a locked - * filepage, which may be found not uptodate by other callers - * too, and may need to be copied from the swappage read in. - */ repeat: - if (!filepage) - filepage = find_lock_page(mapping, idx); + filepage = find_lock_page(mapping, idx); if (filepage && PageUptodate(filepage)) goto done; if (!filepage) { @@ -1513,8 +1503,7 @@ nospace: * Perhaps the page was brought in from swap between find_lock_page * and taking info->lock? We allow for that at add_to_page_cache_lru, * but must also avoid reporting a spurious ENOSPC while working on a - * full tmpfs. 
(When filepage has been passed in to shmem_getpage, it - * is already in page cache, which prevents this race from occurring.) + * full tmpfs. */ if (!filepage) { struct page *page = find_get_page(mapping, idx); @@ -1527,7 +1516,7 @@ nospace: spin_unlock(&info->lock); error = -ENOSPC; failed: - if (*pagep != filepage) { + if (filepage) { unlock_page(filepage); page_cache_release(filepage); } @@ -1673,19 +1662,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_symlink_inline_operations; -/* - * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; - * but providing them allows a tmpfs file to be used for splice, sendfile, and - * below the loop driver, in the generic fashion that many filesystems support. - */ -static int shmem_readpage(struct file *file, struct page *page) -{ - struct inode *inode = page->mapping->host; - int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); - unlock_page(page); - return error; -} - static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -1693,7 +1669,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = NULL; return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } @@ -1893,7 +1868,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, error = 0; while (spd.nr_pages < nr_pages) { - page = NULL; error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); if (error) break; @@ -1916,7 +1890,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, page = spd.pages[page_nr]; if (!PageUptodate(page) || page->mapping != mapping) { - page = NULL; error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); if (error) @@ -2125,7 +2098,7 @@ static 
int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s int error; int len; struct inode *inode; - struct page *page = NULL; + struct page *page; char *kaddr; struct shmem_inode_info *info; @@ -2803,7 +2776,6 @@ static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, #ifdef CONFIG_TMPFS - .readpage = shmem_readpage, .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif @@ -3175,7 +3147,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; - struct page *page = NULL; + struct page *page; int error; BUG_ON(mapping->a_ops != &shmem_aops); -- cgit v1.2.3-59-g8ed1b From e83c32e8f92724a06a22a3b42f3afc07db93e131 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:35 -0700 Subject: tmpfs: simplify prealloc_page The prealloc_page handling in shmem_getpage_gfp() is unnecessarily complicated: first simplify that before going on to filepage/swappage. That's right, don't report ENOMEM when the preallocation fails: we may or may not need the page. But simply report ENOMEM once we find we do need it, instead of dropping lock, repeating allocation, unwinding on failure etc. And leave the out label on the fast path, don't goto. Fix something that looks like a bug but turns out not to be: set PageSwapBacked on prealloc_page before its mem_cgroup_cache_charge(), as the removed case was doing. That's important before adding to LRU (determines which LRU the page goes on), and does affect which path it takes through memcontrol.c, but in the end MEM_CGROUP_CHARGE_TYPE_SHMEM is handled no differently from CACHE.
Signed-off-by: Hugh Dickins Acked-by: Shaohua Li Cc: "Zhang, Yanmin" Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 60 ++++++++++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index ff6713a2579e..8f8534f35476 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1269,9 +1269,9 @@ repeat: goto failed; radix_tree_preload_end(); if (sgp != SGP_READ && !prealloc_page) { - /* We don't care if this fails */ prealloc_page = shmem_alloc_page(gfp, info, idx); if (prealloc_page) { + SetPageSwapBacked(prealloc_page); if (mem_cgroup_cache_charge(prealloc_page, current->mm, GFP_KERNEL)) { page_cache_release(prealloc_page); @@ -1403,7 +1403,8 @@ repeat: goto repeat; } spin_unlock(&info->lock); - } else { + + } else if (prealloc_page) { shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { @@ -1419,41 +1420,8 @@ repeat: if (!filepage) { int ret; - if (!prealloc_page) { - spin_unlock(&info->lock); - filepage = shmem_alloc_page(gfp, info, idx); - if (!filepage) { - spin_lock(&info->lock); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - spin_unlock(&info->lock); - error = -ENOMEM; - goto failed; - } - SetPageSwapBacked(filepage); - - /* - * Precharge page while we can wait, compensate - * after - */ - error = mem_cgroup_cache_charge(filepage, - current->mm, GFP_KERNEL); - if (error) { - page_cache_release(filepage); - spin_lock(&info->lock); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - spin_unlock(&info->lock); - filepage = NULL; - goto failed; - } - - spin_lock(&info->lock); - } else { - filepage = prealloc_page; - prealloc_page = NULL; - SetPageSwapBacked(filepage); - } + filepage = prealloc_page; + prealloc_page = NULL; entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) @@ -1492,11 +1460,20 @@ repeat: SetPageUptodate(filepage); if (sgp == SGP_DIRTY) 
set_page_dirty(filepage); + } else { + spin_unlock(&info->lock); + error = -ENOMEM; + goto out; } done: *pagep = filepage; error = 0; - goto out; +out: + if (prealloc_page) { + mem_cgroup_uncharge_cache_page(prealloc_page); + page_cache_release(prealloc_page); + } + return error; nospace: /* @@ -1520,12 +1497,7 @@ failed: unlock_page(filepage); page_cache_release(filepage); } -out: - if (prealloc_page) { - mem_cgroup_uncharge_cache_page(prealloc_page); - page_cache_release(prealloc_page); - } - return error; + goto out; } static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -- cgit v1.2.3-59-g8ed1b From 27ab700626f048407e9466d389a43c7d3aa45967 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:36 -0700 Subject: tmpfs: simplify filepage/swappage We can now simplify shmem_getpage_gfp(): there is no longer a dilemma of filepage passed in via shmem_readpage(), then swappage found, which must then be copied over to it. Although at first it's tempting to replace the **pagep arg by returning struct page *, that makes a mess of IS_ERR_OR_NULL(page)s in all the callers, so leave as is. Insert BUG_ON(!PageUptodate) when we find and lock page: some of the complication came from uninitialized pages inserted into filecache prior to readpage; but now we're in control, and only release pagelock on filecache once it's uptodate (if an error occurs in reading back from swap, the page remains in swapcache, never moved to filecache). 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 237 ++++++++++++++++++++++++++++--------------------------------- 1 file changed, 108 insertions(+), 129 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 8f8534f35476..bf6e9c11d859 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1246,41 +1246,47 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; - struct page *filepage; - struct page *swappage; + struct page *page; struct page *prealloc_page = NULL; swp_entry_t *entry; swp_entry_t swap; int error; + int ret; if (idx >= SHMEM_MAX_INDEX) return -EFBIG; repeat: - filepage = find_lock_page(mapping, idx); - if (filepage && PageUptodate(filepage)) - goto done; - if (!filepage) { + page = find_lock_page(mapping, idx); + if (page) { /* - * Try to preload while we can wait, to not make a habit of - * draining atomic reserves; but don't latch on to this cpu. + * Once we can get the page lock, it must be uptodate: + * if there were an error in reading back from swap, + * the page would not be inserted into the filecache. */ - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); - if (error) - goto failed; - radix_tree_preload_end(); - if (sgp != SGP_READ && !prealloc_page) { - prealloc_page = shmem_alloc_page(gfp, info, idx); - if (prealloc_page) { - SetPageSwapBacked(prealloc_page); - if (mem_cgroup_cache_charge(prealloc_page, - current->mm, GFP_KERNEL)) { - page_cache_release(prealloc_page); - prealloc_page = NULL; - } + BUG_ON(!PageUptodate(page)); + goto done; + } + + /* + * Try to preload while we can wait, to not make a habit of + * draining atomic reserves; but don't latch on to this cpu. 
+ */ + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + if (error) + goto out; + radix_tree_preload_end(); + + if (sgp != SGP_READ && !prealloc_page) { + prealloc_page = shmem_alloc_page(gfp, info, idx); + if (prealloc_page) { + SetPageSwapBacked(prealloc_page); + if (mem_cgroup_cache_charge(prealloc_page, + current->mm, GFP_KERNEL)) { + page_cache_release(prealloc_page); + prealloc_page = NULL; } } } - error = 0; spin_lock(&info->lock); shmem_recalc_inode(inode); @@ -1288,21 +1294,21 @@ repeat: if (IS_ERR(entry)) { spin_unlock(&info->lock); error = PTR_ERR(entry); - goto failed; + goto out; } swap = *entry; if (swap.val) { /* Look it up and read it in.. */ - swappage = lookup_swap_cache(swap); - if (!swappage) { + page = lookup_swap_cache(swap); + if (!page) { shmem_swp_unmap(entry); spin_unlock(&info->lock); /* here we actually do the io */ if (fault_type) *fault_type |= VM_FAULT_MAJOR; - swappage = shmem_swapin(swap, gfp, info, idx); - if (!swappage) { + page = shmem_swapin(swap, gfp, info, idx); + if (!page) { spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) @@ -1314,62 +1320,42 @@ repeat: } spin_unlock(&info->lock); if (error) - goto failed; + goto out; goto repeat; } - wait_on_page_locked(swappage); - page_cache_release(swappage); + wait_on_page_locked(page); + page_cache_release(page); goto repeat; } /* We have to do this with page locked to prevent races */ - if (!trylock_page(swappage)) { + if (!trylock_page(page)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); - wait_on_page_locked(swappage); - page_cache_release(swappage); + wait_on_page_locked(page); + page_cache_release(page); goto repeat; } - if (PageWriteback(swappage)) { + if (PageWriteback(page)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); - wait_on_page_writeback(swappage); - unlock_page(swappage); - page_cache_release(swappage); + wait_on_page_writeback(page); + unlock_page(page); + page_cache_release(page); goto repeat; } - if 
(!PageUptodate(swappage)) { + if (!PageUptodate(page)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); - unlock_page(swappage); - page_cache_release(swappage); + unlock_page(page); + page_cache_release(page); error = -EIO; - goto failed; + goto out; } - if (filepage) { - shmem_swp_set(info, entry, 0); - shmem_swp_unmap(entry); - delete_from_swap_cache(swappage); - spin_unlock(&info->lock); - copy_highpage(filepage, swappage); - unlock_page(swappage); - page_cache_release(swappage); - flush_dcache_page(filepage); - SetPageUptodate(filepage); - set_page_dirty(filepage); - swap_free(swap); - } else if (!(error = add_to_page_cache_locked(swappage, mapping, - idx, GFP_NOWAIT))) { - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, entry, 0); - shmem_swp_unmap(entry); - delete_from_swap_cache(swappage); - spin_unlock(&info->lock); - filepage = swappage; - set_page_dirty(filepage); - swap_free(swap); - } else { + error = add_to_page_cache_locked(page, mapping, + idx, GFP_NOWAIT); + if (error) { shmem_swp_unmap(entry); spin_unlock(&info->lock); if (error == -ENOMEM) { @@ -1378,28 +1364,33 @@ repeat: * call memcg's OOM if needed. 
*/ error = mem_cgroup_shmem_charge_fallback( - swappage, - current->mm, - gfp); + page, current->mm, gfp); if (error) { - unlock_page(swappage); - page_cache_release(swappage); - goto failed; + unlock_page(page); + page_cache_release(page); + goto out; } } - unlock_page(swappage); - page_cache_release(swappage); + unlock_page(page); + page_cache_release(page); goto repeat; } - } else if (sgp == SGP_READ && !filepage) { + + info->flags |= SHMEM_PAGEIN; + shmem_swp_set(info, entry, 0); shmem_swp_unmap(entry); - filepage = find_get_page(mapping, idx); - if (filepage && - (!PageUptodate(filepage) || !trylock_page(filepage))) { + delete_from_swap_cache(page); + spin_unlock(&info->lock); + set_page_dirty(page); + swap_free(swap); + + } else if (sgp == SGP_READ) { + shmem_swp_unmap(entry); + page = find_get_page(mapping, idx); + if (page && !trylock_page(page)) { spin_unlock(&info->lock); - wait_on_page_locked(filepage); - page_cache_release(filepage); - filepage = NULL; + wait_on_page_locked(page); + page_cache_release(page); goto repeat; } spin_unlock(&info->lock); @@ -1417,56 +1408,52 @@ repeat: } else if (shmem_acct_block(info->flags)) goto nospace; - if (!filepage) { - int ret; + page = prealloc_page; + prealloc_page = NULL; - filepage = prealloc_page; - prealloc_page = NULL; - - entry = shmem_swp_alloc(info, idx, sgp, gfp); - if (IS_ERR(entry)) - error = PTR_ERR(entry); - else { - swap = *entry; - shmem_swp_unmap(entry); - } - ret = error || swap.val; - if (ret) - mem_cgroup_uncharge_cache_page(filepage); - else - ret = add_to_page_cache_lru(filepage, mapping, + entry = shmem_swp_alloc(info, idx, sgp, gfp); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + else { + swap = *entry; + shmem_swp_unmap(entry); + } + ret = error || swap.val; + if (ret) + mem_cgroup_uncharge_cache_page(page); + else + ret = add_to_page_cache_lru(page, mapping, idx, GFP_NOWAIT); - /* - * At add_to_page_cache_lru() failure, uncharge will - * be done automatically. 
- */ - if (ret) { - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - spin_unlock(&info->lock); - page_cache_release(filepage); - filepage = NULL; - if (error) - goto failed; - goto repeat; - } - info->flags |= SHMEM_PAGEIN; + /* + * At add_to_page_cache_lru() failure, + * uncharge will be done automatically. + */ + if (ret) { + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + spin_unlock(&info->lock); + page_cache_release(page); + if (error) + goto out; + goto repeat; } + info->flags |= SHMEM_PAGEIN; info->alloced++; spin_unlock(&info->lock); - clear_highpage(filepage); - flush_dcache_page(filepage); - SetPageUptodate(filepage); + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); if (sgp == SGP_DIRTY) - set_page_dirty(filepage); + set_page_dirty(page); + } else { spin_unlock(&info->lock); error = -ENOMEM; goto out; } done: - *pagep = filepage; + *pagep = page; error = 0; out: if (prealloc_page) { @@ -1482,21 +1469,13 @@ nospace: * but must also avoid reporting a spurious ENOSPC while working on a * full tmpfs. */ - if (!filepage) { - struct page *page = find_get_page(mapping, idx); - if (page) { - spin_unlock(&info->lock); - page_cache_release(page); - goto repeat; - } - } + page = find_get_page(mapping, idx); spin_unlock(&info->lock); - error = -ENOSPC; -failed: - if (filepage) { - unlock_page(filepage); - page_cache_release(filepage); + if (page) { + page_cache_release(page); + goto repeat; } + error = -ENOSPC; goto out; } -- cgit v1.2.3-59-g8ed1b From 48f170fb7d7db8789ccc23e051af61f62af5f685 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 25 Jul 2011 17:12:37 -0700 Subject: tmpfs: simplify unuse and writepage shmem_unuse_inode() and shmem_writepage() contain a little code to cope with pages inserted independently into the filecache, probably by a filesystem stacked on top of tmpfs, then fed to its ->readpage() or ->writepage(). 
Unionfs was indeed experimenting with working in that way three years ago, but I find no current examples: nowadays the stacking filesystems use vfs interfaces to the lower filesystem. It's now illegal: remove most of that code, adding some WARN_ON_ONCEs. Signed-off-by: Hugh Dickins Cc: Erez Zadok Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 50 ++++++++++++++++---------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index bf6e9c11d859..7533574109da 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -972,20 +972,7 @@ found: error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); /* which does mem_cgroup_uncharge_cache_page on error */ - if (error == -EEXIST) { - struct page *filepage = find_get_page(mapping, idx); - error = 1; - if (filepage) { - /* - * There might be a more uptodate page coming down - * from a stacked writepage: forget our swappage if so. - */ - if (PageUptodate(filepage)) - error = 0; - page_cache_release(filepage); - } - } - if (!error) { + if (error != -ENOMEM) { delete_from_swap_cache(page); set_page_dirty(page); info->flags |= SHMEM_PAGEIN; @@ -1072,16 +1059,17 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * shmem_backing_dev_info's capabilities prevent regular writeback or * sync from ever calling shmem_writepage; but a stacking filesystem - * may use the ->writepage of its underlying filesystem, in which case + * might use ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for the writeback threads or sync. However, in those cases, - * we do still want to check if there's a redundant swappage to be - * discarded. + * and not for the writeback threads or sync. */ - if (wbc->for_reclaim) - swap = get_swap_page(); - else - swap.val = 0; + if (!wbc->for_reclaim) { + WARN_ON_ONCE(1); /* Still happens? 
Tell us about it! */ goto redirty; } + swap = get_swap_page(); + if (!swap.val) + goto redirty; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, @@ -1092,15 +1080,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * we've taken the spinlock, because shmem_unuse_inode() will * prune a !swapped inode from the swaplist under both locks. */ - if (swap.val) { - mutex_lock(&shmem_swaplist_mutex); - if (list_empty(&info->swaplist)) - list_add_tail(&info->swaplist, &shmem_swaplist); - } + mutex_lock(&shmem_swaplist_mutex); + if (list_empty(&info->swaplist)) + list_add_tail(&info->swaplist, &shmem_swaplist); spin_lock(&info->lock); - if (swap.val) - mutex_unlock(&shmem_swaplist_mutex); + mutex_unlock(&shmem_swaplist_mutex); if (index >= info->next_index) { BUG_ON(!(info->flags & SHMEM_TRUNCATE)); @@ -1108,16 +1093,13 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } entry = shmem_swp_entry(info, index, NULL); if (entry->val) { - /* - * The more uptodate page coming down from a stacked - * writepage should replace our old swappage. - */ + WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ free_swap_and_cache(*entry); shmem_swp_set(info, entry, 0); } shmem_recalc_inode(inode); - if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { delete_from_page_cache(page); shmem_swp_set(info, entry, swap.val); shmem_swp_unmap(entry); -- cgit v1.2.3-59-g8ed1b From 99b12e3d882bc7ebdfe0de381dff3b16d21c38f7 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 25 Jul 2011 17:12:37 -0700 Subject: writeback: account NR_WRITTEN at IO completion time NR_WRITTEN is currently accounted at block IO enqueue time, which does not match the common understanding of when a page has been written. This moves NR_WRITTEN accounting to the IO completion time and makes it more consistent with BDI_WRITTEN, which is used for bandwidth estimation.
Signed-off-by: Wu Fengguang Cc: Michael Rubin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 919b45eb57ad..d8767b381b9c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1141,7 +1141,6 @@ EXPORT_SYMBOL(account_page_dirtied); void account_page_writeback(struct page *page) { inc_zone_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_WRITTEN); } EXPORT_SYMBOL(account_page_writeback); @@ -1358,8 +1357,10 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } - if (ret) + if (ret) { dec_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); + } return ret; } -- cgit v1.2.3-59-g8ed1b From ae891a1b93bf62e9aaa116a7a71312375047fc9f Mon Sep 17 00:00:00 2001 From: Maxin B John Date: Mon, 25 Jul 2011 17:12:59 -0700 Subject: devres: fix possible use after free devres uses the pointer value as key after it's freed, which is safe but triggers spurious use-after-free warnings on some static analysis tools. Rearrange code to avoid such warnings. Signed-off-by: Maxin B. 
John Reviewed-by: Rolf Eike Beer Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/devres.c | 2 +- lib/devres.c | 2 +- mm/dmapool.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1ef4ffcdfa55..bd8e788d71e0 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) { struct irq_devres match_data = { irq, dev_id }; - free_irq(irq, dev_id); WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, &match_data)); + free_irq(irq, dev_id); } EXPORT_SYMBOL(devm_free_irq); diff --git a/lib/devres.c b/lib/devres.c index 6efddf53b90c..7c0e953a7486 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -79,9 +79,9 @@ EXPORT_SYMBOL(devm_ioremap_nocache); */ void devm_iounmap(struct device *dev, void __iomem *addr) { - iounmap(addr); WARN_ON(devres_destroy(dev, devm_ioremap_release, devm_ioremap_match, (void *)addr)); + iounmap(addr); } EXPORT_SYMBOL(devm_iounmap); diff --git a/mm/dmapool.c b/mm/dmapool.c index 03bf3bb4519a..fbb58e346888 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool) { struct device *dev = pool->dev; - dma_pool_destroy(pool); WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); + dma_pool_destroy(pool); } EXPORT_SYMBOL(dmam_pool_destroy); -- cgit v1.2.3-59-g8ed1b