From ad5fa913991e9e0f122b021e882b0d50051fbdbc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:06 +0200 Subject: HWPOISON: Add new SIGBUS error codes for hardware poison signals Add new SIGBUS codes for reporting machine checks as signals. When the hardware detects an uncorrected ECC error it can trigger these signals. This is needed for telling KVM's qemu about machine checks that happen to guests, so that it can inject them, but might be also useful for other programs. I find it useful in my test programs. This patch merely defines the new types. - Define two new si_codes for SIGBUS. BUS_MCEERR_AO and BUS_MCEERR_AR * BUS_MCEERR_AO is for "Action Optional" machine checks, which means that some corruption has been detected in the background, but nothing has been consumed so far. The program can ignore those if it wants (but most programs would already get killed) * BUS_MCEERR_AR is for "Action Required" machine checks. This happens when corrupted data is consumed or the application ran into an area which has been known to be corrupted earlier. These require immediate action and cannot just returned to. Most programs would kill themselves. - They report the address of the corruption in the user address space in si_addr. - Define a new si_addr_lsb field that reports the extent of the corruption to user space. That's currently always a (small) page. The user application cannot tell where in this page the corruption happened. AK: I plan to write a man page update before anyone asks. Signed-off-by: Andi Kleen --- include/asm-generic/siginfo.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/asm-generic') diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h index c840719a8c59..942d30b5aab1 100644 --- a/include/asm-generic/siginfo.h +++ b/include/asm-generic/siginfo.h @@ -82,6 +82,7 @@ typedef struct siginfo { #ifdef __ARCH_SI_TRAPNO int _trapno; /* TRAP # which caused the signal */ #endif + short _addr_lsb; /* LSB of the reported address */ } _sigfault; /* SIGPOLL */ @@ -112,6 +113,7 @@ typedef struct siginfo { #ifdef __ARCH_SI_TRAPNO #define si_trapno _sifields._sigfault._trapno #endif +#define si_addr_lsb _sifields._sigfault._addr_lsb #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd @@ -192,7 +194,11 @@ typedef struct siginfo { #define BUS_ADRALN (__SI_FAULT|1) /* invalid address alignment */ #define BUS_ADRERR (__SI_FAULT|2) /* non-existant physical address */ #define BUS_OBJERR (__SI_FAULT|3) /* object specific hardware error */ -#define NSIGBUS 3 +/* hardware memory error consumed on a machine check: action required */ +#define BUS_MCEERR_AR (__SI_FAULT|4) +/* hardware memory error detected in process but not consumed: action optional*/ +#define BUS_MCEERR_AO (__SI_FAULT|5) +#define NSIGBUS 5 /* * SIGTRAP si_codes -- cgit v1.2.3-59-g8ed1b From 9893e49d64a4874ea67849ee2cfbf3f3d6817573 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:17 +0200 Subject: HWPOISON: Add madvise() based injector for hardware poisoned pages v4 Impact: optional, useful for debugging Add a new madvice sub command to inject poison for some pages in a process' address space. This is useful for testing the poison page handling. This patch can allow root to tie up large amounts of memory. I got feedback from container developers and they didn't see any problem. v2: Use write flag for get_user_pages to make sure to always get a fresh page v3: Don't request write mapping (Fengguang Wu) v4: Move MADV_* number to avoid conflict with KSM (Hugh Dickins) Signed-off-by: Andi Kleen --- include/asm-generic/mman-common.h | 1 + mm/madvise.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/asm-generic') diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index 3b69ad34189a..c325d1ef42ab 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h @@ -34,6 +34,7 @@ #define MADV_REMOVE 9 /* remove these pages & resources */ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ +#define MADV_HWPOISON 100 /* poison a page for testing */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/madvise.c b/mm/madvise.c index 76eb4193acdd..8dbd38b8e4a4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -207,6 +207,32 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. + */ +static int madvise_hwpoison(unsigned long start, unsigned long end) +{ + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for (; start < end; start += PAGE_SIZE) { + struct page *p; + int ret = get_user_pages(current, current->mm, start, 1, + 0, 0, &p, NULL); + if (ret != 1) + return ret; + printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + page_to_pfn(p), start); + /* Ignore return value for now */ + __memory_failure(page_to_pfn(p), 0, 1); + put_page(p); + } + return ret; +} +#endif + static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) @@ -307,6 +333,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON) + return madvise_hwpoison(start, start+len_in); +#endif if (!madvise_behavior_valid(behavior)) return error; -- cgit v1.2.3-59-g8ed1b