From 0e9052eb98a9986ec0669d030604f7a68f6df638 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:57 +0100 Subject: page-types: add standard GPL license header Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/page-types.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 7a7d9bab32ef..66e9358e2144 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -1,11 +1,22 @@ /* * page-types: Tool for querying page flags * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should find a copy of v2 of the GNU General Public License somewhere on + * your Linux system; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * * Copyright (C) 2009 Intel corporation * * Authors: Wu Fengguang - * - * Released under the General Public License (GPL). */ #define _LARGEFILE64_SOURCE -- cgit v1.2.3-59-g8ed1b From 847ce401df392b0704369fd3f75df614ac1414b4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:58 +0100 Subject: HWPOISON: Add unpoisoning support The unpoisoning interface is useful for stress testing tools to reclaim poisoned pages (to prevent OOM) There is no hardware level unpoisioning, so this cannot be used for real memory errors, only for software injected errors. Note that it may leak pages silently - those who have been removed from LRU cache, but not isolated from page cache/swap cache at hwpoison time. Especially the stress test of dirty swap cache pages shall reboot system before exhausting memory. AK: Fix comments, add documentation, add printks, rename symbol Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 16 ++++++++-- include/linux/mm.h | 1 + include/linux/page-flags.h | 2 +- mm/hwpoison-inject.c | 36 +++++++++++++++++++---- mm/memory-failure.c | 68 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 114 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 3ffadf8da61f..f047e75acb23 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -98,10 +98,22 @@ madvise(MADV_POISON, ....) hwpoison-inject module through debugfs - /sys/debug/hwpoison/corrupt-pfn -Inject hwpoison fault at PFN echoed into this file +/sys/debug/hwpoison/ +corrupt-pfn + +Inject hwpoison fault at PFN echoed into this file. + +unpoison-pfn + +Software-unpoison page at PFN echoed into this file. This +way a page can be reused again. +This only works for Linux injected failures, not for real +memory failures. + +Note these injection interfaces are not stable and might change between +kernel versions Architecture specific MCE injector diff --git a/include/linux/mm.h b/include/linux/mm.h index 135e19198cd3..8cdb941fc7b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1336,6 +1336,7 @@ enum mf_flags { }; extern void memory_failure(unsigned long pfn, int trapno); extern int __memory_failure(unsigned long pfn, int trapno, int flags); +extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 49e907bd067f..f9df6308af95 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -275,7 +275,7 @@ PAGEFLAG_FALSE(Uncached) #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison) -TESTSETFLAG(HWPoison, hwpoison) +TESTSCFLAG(HWPoison, hwpoison) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e1d85137f086..6e35e563bf50 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -4,7 +4,7 @@ #include #include -static struct dentry *hwpoison_dir, *corrupt_pfn; +static struct dentry *hwpoison_dir; static int hwpoison_inject(void *data, u64 val) { @@ -14,7 +14,16 @@ static int hwpoison_inject(void *data, u64 val) return __memory_failure(val, 18, 0); } +static int hwpoison_unpoison(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return unpoison_memory(val); +} + DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); +DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { @@ -24,16 +33,31 @@ static void pfn_inject_exit(void) static int pfn_inject_init(void) { + struct dentry *dentry; + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); if (hwpoison_dir == NULL) return -ENOMEM; - corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + + /* + * Note that the below poison/unpoison interfaces do not involve + * hardware status change, hence do not require hardware support. + * They are mainly for testing hwpoison in software level. + */ + dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, NULL, &hwpoison_fops); - if (corrupt_pfn == NULL) { - pfn_inject_exit(); - return -ENOMEM; - } + if (!dentry) + goto fail; + + dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, + NULL, &unpoison_fops); + if (!dentry) + goto fail; + return 0; +fail: + pfn_inject_exit(); + return -ENOMEM; } module_init(pfn_inject_init); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5055b940df5f..ed6e91c87a54 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -838,6 +838,16 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * and in many cases impossible, so we just avoid it here. */ lock_page_nosync(p); + + /* + * unpoison always clear PG_hwpoison inside page lock + */ + if (!PageHWPoison(p)) { + action_result(pfn, "unpoisoned", IGNORED); + res = 0; + goto out; + } + wait_on_page_writeback(p); /* @@ -893,3 +903,61 @@ void memory_failure(unsigned long pfn, int trapno) { __memory_failure(pfn, trapno, 0); } + +/** + * unpoison_memory - Unpoison a previously poisoned page + * @pfn: Page number of the to be unpoisoned page + * + * Software-unpoison a page that has been poisoned by + * memory_failure() earlier. + * + * This is only done on the software-level, so it only works + * for linux injected failures, not real hardware failures + * + * Returns 0 for success, otherwise -errno. + */ +int unpoison_memory(unsigned long pfn) +{ + struct page *page; + struct page *p; + int freeit = 0; + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + page = compound_head(p); + + if (!PageHWPoison(p)) { + pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); + return 0; + } + + if (!get_page_unless_zero(page)) { + if (TestClearPageHWPoison(p)) + atomic_long_dec(&mce_bad_pages); + pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); + return 0; + } + + lock_page_nosync(page); + /* + * This test is racy because PG_hwpoison is set outside of page lock. + * That's acceptable because that won't trigger kernel panic. Instead, + * the PG_hwpoison page will be caught and isolated on the entrance to + * the free buddy page pool. + */ + if (TestClearPageHWPoison(p)) { + pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); + atomic_long_dec(&mce_bad_pages); + freeit = 1; + } + unlock_page(page); + + put_page(page); + if (freeit) + put_page(page); + + return 0; +} +EXPORT_SYMBOL(unpoison_memory); -- cgit v1.2.3-59-g8ed1b From 7c116f2b0dbac4a1dd051c7a5e8cef37701cafd4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: HWPOISON: add fs/device filters Filesystem data/metadata present the most tricky-to-isolate pages. It requires careful code review and stress testing to get them right. The fs/device filter helps to target the stress tests to some specific filesystem pages. The filter condition is block device's major/minor numbers: - corrupt-filter-dev-major - corrupt-filter-dev-minor When specified (non -1), only page cache pages that belong to that device will be poisoned. The filters are checked reliably on the locked and refcounted page. Haicheng: clear PG_hwpoison and drop bad page count if filter not OK AK: Add documentation CC: Haicheng Li CC: Nick Piggin Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 7 ++++++ mm/hwpoison-inject.c | 11 ++++++++++ mm/internal.h | 3 +++ mm/memory-failure.c | 51 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+) (limited to 'Documentation') diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index f047e75acb23..fdf580464324 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -115,6 +115,13 @@ memory failures. Note these injection interfaces are not stable and might change between kernel versions +corrupt-filter-dev-major +corrupt-filter-dev-minor + +Only handle memory failures to pages associated with the file system defined +by block device major/minor. -1U is the wildcard value. +This should be only used for testing with artificial injection. + Architecture specific MCE injector x86 has mce-inject, mce-test diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 6e35e563bf50..ac692a9b766c 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -3,6 +3,7 @@ #include #include #include +#include "internal.h" static struct dentry *hwpoison_dir; @@ -54,6 +55,16 @@ static int pfn_inject_init(void) if (!dentry) goto fail; + dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, + hwpoison_dir, &hwpoison_filter_dev_major); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, + hwpoison_dir, &hwpoison_filter_dev_minor); + if (!dentry) + goto fail; + return 0; fail: pfn_inject_exit(); diff --git a/mm/internal.h b/mm/internal.h index 49b2ff776b78..814da335f050 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -250,3 +250,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, #define ZONE_RECLAIM_SOME 0 #define ZONE_RECLAIM_SUCCESS 1 #endif + +extern u32 hwpoison_filter_dev_major; +extern u32 hwpoison_filter_dev_minor; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index edeaf2319e74..82ac73436d0e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -48,6 +48,50 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +u32 hwpoison_filter_dev_major = ~0U; +u32 hwpoison_filter_dev_minor = ~0U; +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); + +static int hwpoison_filter_dev(struct page *p) +{ + struct address_space *mapping; + dev_t dev; + + if (hwpoison_filter_dev_major == ~0U && + hwpoison_filter_dev_minor == ~0U) + return 0; + + /* + * page_mapping() does not accept slab page + */ + if (PageSlab(p)) + return -EINVAL; + + mapping = page_mapping(p); + if (mapping == NULL || mapping->host == NULL) + return -EINVAL; + + dev = mapping->host->i_sb->s_dev; + if (hwpoison_filter_dev_major != ~0U && + hwpoison_filter_dev_major != MAJOR(dev)) + return -EINVAL; + if (hwpoison_filter_dev_minor != ~0U && + hwpoison_filter_dev_minor != MINOR(dev)) + return -EINVAL; + + return 0; +} + +int hwpoison_filter(struct page *p) +{ + if (hwpoison_filter_dev(p)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(hwpoison_filter); + /* * Send all the processes who have the page mapped an ``action optional'' * signal. @@ -843,6 +887,13 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) res = 0; goto out; } + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(p)) + atomic_long_dec(&mce_bad_pages); + unlock_page(p); + put_page(p); + return 0; + } wait_on_page_writeback(p); -- cgit v1.2.3-59-g8ed1b From 31d3d3484f9bd263925ecaa341500ac2df3a5d9b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: HWPOISON: limit hwpoison injector to known page types __memory_failure()'s workflow is set PG_hwpoison //... unset PG_hwpoison if didn't pass hwpoison filter That could kill unrelated process if it happens to page fault on the page with the (temporary) PG_hwpoison. The race should be big enough to appear in stress tests. Fix it by grabbing the page and checking filter at inject time. This also avoids the very noisy "Injecting memory failure..." messages. - we don't touch madvise() based injection, because the filters are generally not necessary for it. - if we want to apply the filters to h/w aided injection, we'd better to rearrange the logic in __memory_failure() instead of this patch. AK: fix documentation, use drain all, cleanups CC: Haicheng Li Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 3 ++- mm/hwpoison-inject.c | 41 +++++++++++++++++++++++++++++++++++++++-- mm/internal.h | 2 ++ 3 files changed, 43 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index fdf580464324..4ef7bb30d15c 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -103,7 +103,8 @@ hwpoison-inject module through debugfs corrupt-pfn -Inject hwpoison fault at PFN echoed into this file. +Inject hwpoison fault at PFN echoed into this file. This does +some early filtering to avoid corrupted unintended pages in test suites. unpoison-pfn diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index ac692a9b766c..2b6b3200fa65 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -3,16 +3,53 @@ #include #include #include +#include +#include #include "internal.h" static struct dentry *hwpoison_dir; static int hwpoison_inject(void *data, u64 val) { + unsigned long pfn = val; + struct page *p; + int err; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); - return __memory_failure(val, 18, 0); + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + /* + * This implies unable to support free buddy pages. + */ + if (!get_page_unless_zero(p)) + return 0; + + if (!PageLRU(p)) + shake_page(p); + /* + * This implies unable to support non-LRU pages. + */ + if (!PageLRU(p)) + return 0; + + /* + * do a racy check with elevated page count, to make sure PG_hwpoison + * will only be set for the targeted owner (or on a free page). + * We temporarily take page lock for try_get_mem_cgroup_from_page(). + * __memory_failure() will redo the check reliably inside page lock. + */ + lock_page(p); + err = hwpoison_filter(p); + unlock_page(p); + if (err) + return 0; + + printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); + return __memory_failure(pfn, 18, MF_COUNT_INCREASED); } static int hwpoison_unpoison(void *data, u64 val) diff --git a/mm/internal.h b/mm/internal.h index 814da335f050..04bbce8b8ba6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -251,5 +251,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, #define ZONE_RECLAIM_SUCCESS 1 #endif +extern int hwpoison_filter(struct page *p); + extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; -- cgit v1.2.3-59-g8ed1b From 478c5ffc0b50527bd2390f2daa46cc16276b8413 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: HWPOISON: add page flags filter When specified, only poison pages if ((page_flags & mask) == value). - corrupt-filter-flags-mask - corrupt-filter-flags-value This allows stress testing of many kinds of pages. Strictly speaking, the buddy pages requires taking zone lock, to avoid setting PG_hwpoison on a "was buddy but now allocated to someone" page. However we can just do nothing because we set PG_locked in the beginning, this prevents the page allocator from allocating it to someone. (It will BUG() on the unexpected PG_locked, which is fine for hwpoison testing.) [AK: Add select PROC_PAGE_MONITOR to satisfy dependency] CC: Nick Piggin Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 10 ++++++++++ mm/Kconfig | 1 + mm/hwpoison-inject.c | 10 ++++++++++ mm/internal.h | 2 ++ mm/memory-failure.c | 20 ++++++++++++++++++++ 5 files changed, 43 insertions(+) (limited to 'Documentation') diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 4ef7bb30d15c..f454d3cd4d60 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -123,6 +123,16 @@ Only handle memory failures to pages associated with the file system defined by block device major/minor. -1U is the wildcard value. This should be only used for testing with artificial injection. + +corrupt-filter-flags-mask +corrupt-filter-flags-value + +When specified, only poison pages if ((page_flags & mask) == value). +This allows stress testing of many kinds of pages. The page_flags +are the same as in /proc/kpageflags. The flag bits are defined in +include/linux/kernel-page-flags.h and documented in +Documentation/vm/pagemap.txt + Architecture specific MCE injector x86 has mce-inject, mce-test diff --git a/mm/Kconfig b/mm/Kconfig index 2310984591ed..8cea7fde06e1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -253,6 +253,7 @@ config MEMORY_FAILURE config HWPOISON_INJECT tristate "Poison pages injector" depends on MEMORY_FAILURE && DEBUG_KERNEL + select PROC_PAGE_MONITOR config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 2b6b3200fa65..c4dfd89f654a 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -102,6 +102,16 @@ static int pfn_inject_init(void) if (!dentry) goto fail; + dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, + hwpoison_dir, &hwpoison_filter_flags_mask); + if (!dentry) + goto fail; + + dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, + hwpoison_dir, &hwpoison_filter_flags_value); + if (!dentry) + goto fail; + return 0; fail: pfn_inject_exit(); diff --git a/mm/internal.h b/mm/internal.h index 04bbce8b8ba6..b2027c73119b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -255,3 +255,5 @@ extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; +extern u64 hwpoison_filter_flags_mask; +extern u64 hwpoison_filter_flags_value; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 82ac73436d0e..22d2b2028e54 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -50,8 +51,12 @@ atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); u32 hwpoison_filter_dev_major = ~0U; u32 hwpoison_filter_dev_minor = ~0U; +u64 hwpoison_filter_flags_mask; +u64 hwpoison_filter_flags_value; EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); static int hwpoison_filter_dev(struct page *p) { @@ -83,11 +88,26 @@ static int hwpoison_filter_dev(struct page *p) return 0; } +static int hwpoison_filter_flags(struct page *p) +{ + if (!hwpoison_filter_flags_mask) + return 0; + + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == + hwpoison_filter_flags_value) + return 0; + else + return -EINVAL; +} + int hwpoison_filter(struct page *p) { if (hwpoison_filter_dev(p)) return -EINVAL; + if (hwpoison_filter_flags(p)) + return -EINVAL; + return 0; } EXPORT_SYMBOL_GPL(hwpoison_filter); -- cgit v1.2.3-59-g8ed1b From 4fd466eb46a6a917c317a87fb94bfc7252a0f7ed Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:19:59 +0100 Subject: HWPOISON: add memory cgroup filter The hwpoison test suite need to inject hwpoison to a collection of selected task pages, and must not touch pages not owned by them and thus kill important system processes such as init. (But it's OK to mis-hwpoison free/unowned pages as well as shared clean pages. Mis-hwpoison of shared dirty pages will kill all tasks, so the test suite will target all or non of such tasks in the first place.) The memory cgroup serves this purpose well. We can put the target processes under the control of a memory cgroup, and tell the hwpoison injection code to only kill pages associated with some active memory cgroup. The prerequisite for doing hwpoison stress tests with mem_cgroup is, the mem_cgroup code tracks task pages _accurately_ (unless page is locked). Which we believe is/should be true. The benefits are simplification of hwpoison injector code. Also the mem_cgroup code will automatically be tested by hwpoison test cases. The alternative interfaces pin-pfn/unpin-pfn can also delegate the (process and page flags) filtering functions reliably to user space. However prototype implementation shows that this scheme adds more complexity than we wanted. Example test case: mkdir /cgroup/hwpoison usemem -m 100 -s 1000 & echo `jobs -p` > /cgroup/hwpoison/tasks memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ') echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg page-types -p `pidof init` --hwpoison # shall do nothing page-types -p `pidof usemem` --hwpoison # poison its pages [AK: Fix documentation] [Add fix for problem noticed by Li Zefan ; dentry in the css could be NULL] CC: KOSAKI Motohiro CC: Hugh Dickins CC: Daisuke Nishimura CC: Balbir Singh CC: KAMEZAWA Hiroyuki CC: Li Zefan CC: Paul Menage CC: Nick Piggin CC: Andi Kleen Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 16 +++++++++++++++ mm/hwpoison-inject.c | 7 +++++++ mm/internal.h | 1 + mm/memory-failure.c | 46 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+) (limited to 'Documentation') diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index f454d3cd4d60..989e5afe740f 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -123,6 +123,22 @@ Only handle memory failures to pages associated with the file system defined by block device major/minor. -1U is the wildcard value. This should be only used for testing with artificial injection. +corrupt-filter-memcg + +Limit injection to pages owned by memgroup. Specified by inode number +of the memcg. + +Example: + mkdir /cgroup/hwpoison + + usemem -m 100 -s 1000 & + echo `jobs -p` > /cgroup/hwpoison/tasks + + memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ') + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg + + page-types -p `pidof init` --hwpoison # shall do nothing + page-types -p `pidof usemem` --hwpoison # poison its pages corrupt-filter-flags-mask corrupt-filter-flags-value diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c4dfd89f654a..c838735ac31d 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -112,6 +112,13 @@ static int pfn_inject_init(void) if (!dentry) goto fail; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP + dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, + hwpoison_dir, &hwpoison_filter_memcg); + if (!dentry) + goto fail; +#endif + return 0; fail: pfn_inject_exit(); diff --git a/mm/internal.h b/mm/internal.h index b2027c73119b..5a6761bea6a6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -257,3 +257,4 @@ extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; +extern u64 hwpoison_filter_memcg; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 22d2b2028e54..117ef1598469 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -100,6 +100,49 @@ static int hwpoison_filter_flags(struct page *p) return -EINVAL; } +/* + * This allows stress tests to limit test scope to a collection of tasks + * by putting them under some memcg. This prevents killing unrelated/important + * processes such as /sbin/init. Note that the target task may share clean + * pages with init (eg. libc text), which is harmless. If the target task + * share _dirty_ pages with another task B, the test scheme must make sure B + * is also included in the memcg. At last, due to race conditions this filter + * can only guarantee that the page either belongs to the memcg tasks, or is + * a freed page. + */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +u64 hwpoison_filter_memcg; +EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); +static int hwpoison_filter_task(struct page *p) +{ + struct mem_cgroup *mem; + struct cgroup_subsys_state *css; + unsigned long ino; + + if (!hwpoison_filter_memcg) + return 0; + + mem = try_get_mem_cgroup_from_page(p); + if (!mem) + return -EINVAL; + + css = mem_cgroup_css(mem); + /* root_mem_cgroup has NULL dentries */ + if (!css->cgroup->dentry) + return -EINVAL; + + ino = css->cgroup->dentry->d_inode->i_ino; + css_put(css); + + if (ino != hwpoison_filter_memcg) + return -EINVAL; + + return 0; +} +#else +static int hwpoison_filter_task(struct page *p) { return 0; } +#endif + int hwpoison_filter(struct page *p) { if (hwpoison_filter_dev(p)) @@ -108,6 +151,9 @@ int hwpoison_filter(struct page *p) if (hwpoison_filter_flags(p)) return -EINVAL; + if (hwpoison_filter_task(p)) + return -EINVAL; + return 0; } EXPORT_SYMBOL_GPL(hwpoison_filter); -- cgit v1.2.3-59-g8ed1b From fe194d3e100dea323d7b2de96d3b44d0c067ba7a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: Use correct name for MADV_HWPOISON in documentation Signed-off-by: Andi Kleen --- Documentation/vm/hwpoison.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 989e5afe740f..12f9ba20ccb7 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -92,7 +92,7 @@ PR_MCE_KILL_GET Testing: -madvise(MADV_POISON, ....) +madvise(MADV_HWPOISON, ....) (as root) Poison a page in the process for testing -- cgit v1.2.3-59-g8ed1b From facb6011f3993947283fa15d039dacb4ad140230 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Dec 2009 12:20:00 +0100 Subject: HWPOISON: Add soft page offline support This is a simpler, gentler variant of memory_failure() for soft page offlining controlled from user space. It doesn't kill anything, just tries to invalidate and if that doesn't work migrate the page away. This is useful for predictive failure analysis, where a page has a high rate of corrected errors, but hasn't gone bad yet. Instead it can be offlined early and avoided. The offlining is controlled from sysfs, including a new generic entry point for hard page offlining for symmetry too. We use the page isolate facility to prevent re-allocation race. Normally this is only used by memory hotplug. To avoid races with memory allocation I am using lock_system_sleep(). This avoids the situation where memory hotplug is about to isolate a page range and then hwpoison undoes that work. This is a big hammer currently, but the simplest solution currently. When the page is not free or LRU we try to free pages from slab and other caches. The slab freeing is currently quite dumb and does not try to focus on the specific slab cache which might own the page. This could be potentially improved later. Thanks to Fengguang Wu and Haicheng Li for some fixes. [Added fix from Andrew Morton to adapt to new migrate_pages prototype] Signed-off-by: Andi Kleen --- .../ABI/testing/sysfs-memory-page-offline | 44 +++++ drivers/base/memory.c | 61 +++++++ include/linux/mm.h | 3 +- mm/hwpoison-inject.c | 2 +- mm/memory-failure.c | 194 ++++++++++++++++++++- 5 files changed, 297 insertions(+), 7 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-memory-page-offline (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-memory-page-offline b/Documentation/ABI/testing/sysfs-memory-page-offline new file mode 100644 index 000000000000..e14703f12fdf --- /dev/null +++ b/Documentation/ABI/testing/sysfs-memory-page-offline @@ -0,0 +1,44 @@ +What: /sys/devices/system/memory/soft_offline_page +Date: Sep 2009 +KernelVersion: 2.6.33 +Contact: andi@firstfloor.org +Description: + Soft-offline the memory page containing the physical address + written into this file. Input is a hex number specifying the + physical address of the page. The kernel will then attempt + to soft-offline it, by moving the contents elsewhere or + dropping it if possible. The kernel will then be placed + on the bad page list and never be reused. + + The offlining is done in kernel specific granuality. + Normally it's the base page size of the kernel, but + this might change. + + The page must be still accessible, not poisoned. The + kernel will never kill anything for this, but rather + fail the offline. Return value is the size of the + number, or a error when the offlining failed. Reading + the file is not allowed. + +What: /sys/devices/system/memory/hard_offline_page +Date: Sep 2009 +KernelVersion: 2.6.33 +Contact: andi@firstfloor.org +Description: + Hard-offline the memory page containing the physical + address written into this file. Input is a hex number + specifying the physical address of the page. The + kernel will then attempt to hard-offline the page, by + trying to drop the page or killing any owner or + triggering IO errors if needed. Note this may kill + any processes owning the page. The kernel will avoid + to access this page assuming it's poisoned by the + hardware. + + The offlining is done in kernel specific granuality. + Normally it's the base page size of the kernel, but + this might change. + + Return value is the size of the number, or a error when + the offlining failed. + Reading the file is not allowed. diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 989429cfed88..c4c8f2e1dd15 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -341,6 +341,64 @@ static inline int memory_probe_init(void) } #endif +#ifdef CONFIG_MEMORY_FAILURE +/* + * Support for offlining pages of memory + */ + +/* Soft offline a page */ +static ssize_t +store_soft_offline_page(struct class *class, const char *buf, size_t count) +{ + int ret; + u64 pfn; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (strict_strtoull(buf, 0, &pfn) < 0) + return -EINVAL; + pfn >>= PAGE_SHIFT; + if (!pfn_valid(pfn)) + return -ENXIO; + ret = soft_offline_page(pfn_to_page(pfn), 0); + return ret == 0 ? count : ret; +} + +/* Forcibly offline a page, including killing processes. */ +static ssize_t +store_hard_offline_page(struct class *class, const char *buf, size_t count) +{ + int ret; + u64 pfn; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (strict_strtoull(buf, 0, &pfn) < 0) + return -EINVAL; + pfn >>= PAGE_SHIFT; + ret = __memory_failure(pfn, 0, 0); + return ret ? ret : count; +} + +static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page); +static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page); + +static __init int memory_fail_init(void) +{ + int err; + + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_soft_offline_page.attr); + if (!err) + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_hard_offline_page.attr); + return err; +} +#else +static inline int memory_fail_init(void) +{ + return 0; +} +#endif + /* * Note that phys_device is optional. It is here to allow for * differentiation between which *physical* devices each @@ -471,6 +529,9 @@ int __init memory_dev_init(void) } err = memory_probe_init(); + if (!ret) + ret = err; + err = memory_fail_init(); if (!ret) ret = err; err = block_size_init(); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8cdb941fc7b5..849b4a61bd8f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1339,8 +1339,9 @@ extern int __memory_failure(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p); +extern void shake_page(struct page *p, int access); extern atomic_long_t mce_bad_pages; +extern int soft_offline_page(struct page *page, int flags); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c597f46ac18a..a77fe3f9e211 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -29,7 +29,7 @@ static int hwpoison_inject(void *data, u64 val) return 0; if (!PageLRU(p)) - shake_page(p); + shake_page(p, 0); /* * This implies unable to support non-LRU pages. */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b5c3b6bd511f..bcce28755832 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -41,6 +41,9 @@ #include #include #include +#include +#include +#include #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -201,7 +204,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, * When a unknown page type is encountered drain as many buffers as possible * in the hope to turn the page into a LRU or free page, which we can handle. */ -void shake_page(struct page *p) +void shake_page(struct page *p, int access) { if (!PageSlab(p)) { lru_add_drain_all(); @@ -211,11 +214,19 @@ void shake_page(struct page *p) if (PageLRU(p) || is_free_buddy_page(p)) return; } + /* - * Could call shrink_slab here (which would also - * shrink other caches). Unfortunately that might - * also access the corrupted page, which could be fatal. + * Only all shrink_slab here (which would also + * shrink other caches) if access is not potentially fatal. */ + if (access) { + int nr; + do { + nr = shrink_slab(1000, GFP_KERNEL, 1000); + if (page_count(p) == 0) + break; + } while (nr > 10); + } } EXPORT_SYMBOL_GPL(shake_page); @@ -949,7 +960,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * walked by the page reclaim code, however that's not a big loss. */ if (!PageLRU(p)) - shake_page(p); + shake_page(p, 0); if (!PageLRU(p)) { /* * shake_page could have turned it free. @@ -1099,3 +1110,176 @@ int unpoison_memory(unsigned long pfn) return 0; } EXPORT_SYMBOL(unpoison_memory); + +static struct page *new_page(struct page *p, unsigned long private, int **x) +{ + return alloc_pages(GFP_HIGHUSER_MOVABLE, 0); +} + +/* + * Safely get reference count of an arbitrary page. + * Returns 0 for a free page, -EIO for a zero refcount page + * that is not free, and 1 for any other page type. + * For 1 the page is returned with increased page count, otherwise not. + */ +static int get_any_page(struct page *p, unsigned long pfn, int flags) +{ + int ret; + + if (flags & MF_COUNT_INCREASED) + return 1; + + /* + * The lock_system_sleep prevents a race with memory hotplug, + * because the isolation assumes there's only a single user. + * This is a big hammer, a better would be nicer. + */ + lock_system_sleep(); + + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free. + */ + set_migratetype_isolate(p); + if (!get_page_unless_zero(compound_head(p))) { + if (is_free_buddy_page(p)) { + pr_debug("get_any_page: %#lx free buddy page\n", pfn); + /* Set hwpoison bit while page is still isolated */ + SetPageHWPoison(p); + ret = 0; + } else { + pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", + pfn, p->flags); + ret = -EIO; + } + } else { + /* Not a free page */ + ret = 1; + } + unset_migratetype_isolate(p); + unlock_system_sleep(); + return ret; +} + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. + */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + /* + * Page cache page we can handle? + */ + if (!PageLRU(page)) { + /* + * Try to free it. + */ + put_page(page); + shake_page(page, 1); + + /* + * Did it turn free? + */ + ret = get_any_page(page, pfn, 0); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + } + if (!PageLRU(page)) { + pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", + pfn, page->flags); + return -EIO; + } + + lock_page(page); + wait_on_page_writeback(page); + + /* + * Synchronized using the page lock with memory_failure() + */ + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + pr_debug("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + + /* + * Try to invalidate first. This should work for + * non dirty unmapped page cache pages. + */ + ret = invalidate_inode_page(page); + unlock_page(page); + + /* + * Drop count because page migration doesn't like raised + * counts. The page could get re-allocated, but if it becomes + * LRU the isolation will just fail. + * RED-PEN would be better to keep it isolated here, but we + * would need to fix isolation locking first. + */ + put_page(page); + if (ret == 1) { + ret = 0; + pr_debug("soft_offline: %#lx: invalidated\n", pfn); + goto done; + } + + /* + * Simple invalidation didn't work. + * Try to migrate to a new page instead. migrate.c + * handles a large number of cases for us. + */ + ret = isolate_lru_page(page); + if (!ret) { + LIST_HEAD(pagelist); + + list_add(&page->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + if (ret) { + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + } + } else { + pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pfn, ret, page_count(page), page->flags); + } + if (ret) + return ret; + +done: + atomic_long_add(1, &mce_bad_pages); + SetPageHWPoison(page); + /* keep elevated page count for bad page */ + return ret; +} -- cgit v1.2.3-59-g8ed1b