From 41a25e7e67b8be33d7598ff7968b9a8b405b6567 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 21 Sep 2009 17:01:24 -0700 Subject: hugetlb: clean up and update huge pages documentation Attempt to clarify huge page administration and usage, and updates the doucmentation to mention the balancing of huge pages across nodes when allocating and freeing. Signed-off-by: Lee Schermerhorn Cc: Mel Gorman Cc: Nishanth Aravamudan Cc: David Rientjes Cc: Adam Litke Cc: Andy Whitcroft Cc: Eric Whitney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hugetlbpage.txt | 133 +++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 46 deletions(-) (limited to 'Documentation/vm') diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index ea8714fcc3ad..3a167be78c2f 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -18,13 +18,13 @@ First the Linux kernel needs to be built with the CONFIG_HUGETLBFS automatically when CONFIG_HUGETLBFS is selected) configuration options. -The kernel built with hugepage support should show the number of configured -hugepages in the system by running the "cat /proc/meminfo" command. +The kernel built with huge page support should show the number of configured +huge pages in the system by running the "cat /proc/meminfo" command. /proc/meminfo also provides information about the total number of hugetlb pages configured in the kernel. It also displays information about the number of free hugetlb pages at any time. It also displays information about -the configured hugepage size - this is needed for generating the proper +the configured huge page size - this is needed for generating the proper alignment and size of the arguments to the above system calls. The output of "cat /proc/meminfo" will have lines like: @@ -37,25 +37,27 @@ HugePages_Surp: yyy Hugepagesize: zzz kB where: -HugePages_Total is the size of the pool of hugepages. -HugePages_Free is the number of hugepages in the pool that are not yet -allocated. -HugePages_Rsvd is short for "reserved," and is the number of hugepages -for which a commitment to allocate from the pool has been made, but no -allocation has yet been made. It's vaguely analogous to overcommit. -HugePages_Surp is short for "surplus," and is the number of hugepages in -the pool above the value in /proc/sys/vm/nr_hugepages. The maximum -number of surplus hugepages is controlled by -/proc/sys/vm/nr_overcommit_hugepages. +HugePages_Total is the size of the pool of huge pages. +HugePages_Free is the number of huge pages in the pool that are not yet + allocated. +HugePages_Rsvd is short for "reserved," and is the number of huge pages for + which a commitment to allocate from the pool has been made, + but no allocation has yet been made. Reserved huge pages + guarantee that an application will be able to allocate a + huge page from the pool of huge pages at fault time. +HugePages_Surp is short for "surplus," and is the number of huge pages in + the pool above the value in /proc/sys/vm/nr_hugepages. The + maximum number of surplus huge pages is controlled by + /proc/sys/vm/nr_overcommit_hugepages. /proc/filesystems should also show a filesystem of type "hugetlbfs" configured in the kernel. /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb pages in the kernel. Super user can dynamically request more (or free some -pre-configured) hugepages. +pre-configured) huge pages. The allocation (or deallocation) of hugetlb pages is possible only if there are -enough physically contiguous free pages in system (freeing of hugepages is +enough physically contiguous free pages in system (freeing of huge pages is possible only if there are enough hugetlb pages free that can be transferred back to regular memory pool). @@ -67,43 +69,82 @@ use either the mmap system call or shared memory system calls to start using the huge pages. It is required that the system administrator preallocate enough memory for huge page purposes. -Use the following command to dynamically allocate/deallocate hugepages: +The administrator can preallocate huge pages on the kernel boot command line by +specifying the "hugepages=N" parameter, where 'N' = the number of huge pages +requested. This is the most reliable method for preallocating huge pages as +memory has not yet become fragmented. + +Some platforms support multiple huge page sizes. To preallocate huge pages +of a specific size, one must preceed the huge pages boot command parameters +with a huge page size selection parameter "hugepagesz=". must +be specified in bytes with optional scale suffix [kKmMgG]. The default huge +page size may be selected with the "default_hugepagesz=" boot parameter. + +/proc/sys/vm/nr_hugepages indicates the current number of configured [default +size] hugetlb pages in the kernel. Super user can dynamically request more +(or free some pre-configured) huge pages. + +Use the following command to dynamically allocate/deallocate default sized +huge pages: echo 20 > /proc/sys/vm/nr_hugepages -This command will try to configure 20 hugepages in the system. The success -or failure of allocation depends on the amount of physically contiguous -memory that is preset in system at this time. System administrators may want -to put this command in one of the local rc init files. This will enable the -kernel to request huge pages early in the boot process (when the possibility -of getting physical contiguous pages is still very high). In either -case, administrators will want to verify the number of hugepages actually -allocated by checking the sysctl or meminfo. - -/proc/sys/vm/nr_overcommit_hugepages indicates how large the pool of -hugepages can grow, if more hugepages than /proc/sys/vm/nr_hugepages are -requested by applications. echo'ing any non-zero value into this file -indicates that the hugetlb subsystem is allowed to try to obtain -hugepages from the buddy allocator, if the normal pool is exhausted. As -these surplus hugepages go out of use, they are freed back to the buddy +This command will try to configure 20 default sized huge pages in the system. +On a NUMA platform, the kernel will attempt to distribute the huge page pool +over the all on-line nodes. These huge pages, allocated when nr_hugepages +is increased, are called "persistent huge pages". + +The success or failure of huge page allocation depends on the amount of +physically contiguous memory that is preset in system at the time of the +allocation attempt. If the kernel is unable to allocate huge pages from +some nodes in a NUMA system, it will attempt to make up the difference by +allocating extra pages on other nodes with sufficient available contiguous +memory, if any. + +System administrators may want to put this command in one of the local rc init +files. This will enable the kernel to request huge pages early in the boot +process when the possibility of getting physical contiguous pages is still +very high. Administrators can verify the number of huge pages actually +allocated by checking the sysctl or meminfo. To check the per node +distribution of huge pages in a NUMA system, use: + + cat /sys/devices/system/node/node*/meminfo | fgrep Huge + +/proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of +huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are +requested by applications. Writing any non-zero value into this file +indicates that the hugetlb subsystem is allowed to try to obtain "surplus" +huge pages from the buddy allocator, when the normal pool is exhausted. As +these surplus huge pages go out of use, they are freed back to the buddy allocator. +When increasing the huge page pool size via nr_hugepages, any surplus +pages will first be promoted to persistent huge pages. Then, additional +huge pages will be allocated, if necessary and if possible, to fulfill +the new huge page pool size. + +The administrator may shrink the pool of preallocated huge pages for +the default huge page size by setting the nr_hugepages sysctl to a +smaller value. The kernel will attempt to balance the freeing of huge pages +across all on-line nodes. Any free huge pages on the selected nodes will +be freed back to the buddy allocator. + Caveat: Shrinking the pool via nr_hugepages such that it becomes less -than the number of hugepages in use will convert the balance to surplus +than the number of huge pages in use will convert the balance to surplus huge pages even if it would exceed the overcommit value. As long as this condition holds, however, no more surplus huge pages will be allowed on the system until one of the two sysctls are increased sufficiently, or the surplus huge pages go out of use and are freed. -With support for multiple hugepage pools at run-time available, much of -the hugepage userspace interface has been duplicated in sysfs. The above -information applies to the default hugepage size (which will be -controlled by the proc interfaces for backwards compatibility). The root -hugepage control directory is +With support for multiple huge page pools at run-time available, much of +the huge page userspace interface has been duplicated in sysfs. The above +information applies to the default huge page size which will be +controlled by the /proc interfaces for backwards compatibility. The root +huge page control directory in sysfs is: /sys/kernel/mm/hugepages -For each hugepage size supported by the running kernel, a subdirectory +For each huge page size supported by the running kernel, a subdirectory will exist, of the form hugepages-${size}kB @@ -116,9 +157,9 @@ Inside each of these directories, the same set of files will exist: resv_hugepages surplus_hugepages -which function as described above for the default hugepage-sized case. +which function as described above for the default huge page-sized case. -If the user applications are going to request hugepages using mmap system +If the user applications are going to request huge pages using mmap system call, then it is required that system administrator mount a file system of type hugetlbfs: @@ -127,7 +168,7 @@ type hugetlbfs: none /mnt/huge This command mounts a (pseudo) filesystem of type hugetlbfs on the directory -/mnt/huge. Any files created on /mnt/huge uses hugepages. The uid and gid +/mnt/huge. Any files created on /mnt/huge uses huge pages. The uid and gid options sets the owner and group of the root of the file system. By default the uid and gid of the current process are taken. The mode option sets the mode of root of file system to value & 0777. This value is given in octal. @@ -156,14 +197,14 @@ mount of filesystem will be required for using mmap calls. ******************************************************************* /* - * Example of using hugepage memory in a user application using Sys V shared + * Example of using huge page memory in a user application using Sys V shared * memory system calls. In this example the app is requesting 256MB of * memory that is backed by huge pages. The application uses the flag * SHM_HUGETLB in the shmget system call to inform the kernel that it is - * requesting hugepages. + * requesting huge pages. * * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * hugepages. That means the addresses starting with 0x800000... will need + * huge pages. That means the addresses starting with 0x800000... will need * to be specified. Specifying a fixed address is not required on ppc64, * i386 or x86_64. * @@ -252,14 +293,14 @@ int main(void) ******************************************************************* /* - * Example of using hugepage memory in a user application using the mmap + * Example of using huge page memory in a user application using the mmap * system call. Before running this application, make sure that the * administrator has mounted the hugetlbfs filesystem (on some directory * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this * example, the app is requesting memory of size 256MB that is backed by * huge pages. * - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages. * That means the addresses starting with 0x800000... will need to be * specified. Specifying a fixed address is not required on ppc64, i386 * or x86_64. -- cgit v1.2.3-59-g8ed1b From 7701c9c0f54feb682d0cefa2ae1f4a1e00e0ba09 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Sep 2009 17:02:24 -0700 Subject: ksm: add some documentation Add Documentation/vm/ksm.txt: how to use the Kernel Samepage Merging feature Signed-off-by: Hugh Dickins Cc: Michael Kerrisk Cc: Randy Dunlap Acked-by: Izik Eidus Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/00-INDEX | 2 ++ Documentation/vm/ksm.txt | 89 +++++++++++++++++++++++++++++++++++++++++++++++ mm/Kconfig | 1 + 3 files changed, 92 insertions(+) create mode 100644 Documentation/vm/ksm.txt (limited to 'Documentation/vm') diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 2f77ced35df7..f80a44944874 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX @@ -6,6 +6,8 @@ balance - various information on memory balancing. hugetlbpage.txt - a brief summary of hugetlbpage support in the Linux kernel. +ksm.txt + - how to use the Kernel Samepage Merging feature. locking - info on how locking and synchronization is done in the Linux vm code. numa diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt new file mode 100644 index 000000000000..72a22f65960e --- /dev/null +++ b/Documentation/vm/ksm.txt @@ -0,0 +1,89 @@ +How to use the Kernel Samepage Merging feature +---------------------------------------------- + +KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y, +added to the Linux kernel in 2.6.32. See mm/ksm.c for its implementation, +and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/ + +The KSM daemon ksmd periodically scans those areas of user memory which +have been registered with it, looking for pages of identical content which +can be replaced by a single write-protected page (which is automatically +copied if a process later wants to update its content). + +KSM was originally developed for use with KVM (where it was known as +Kernel Shared Memory), to fit more virtual machines into physical memory, +by sharing the data common between them. But it can be useful to any +application which generates many instances of the same data. + +KSM only merges anonymous (private) pages, never pagecache (file) pages. +KSM's merged pages are at present locked into kernel memory for as long +as they are shared: so cannot be swapped out like the user pages they +replace (but swapping KSM pages should follow soon in a later release). + +KSM only operates on those areas of address space which an application +has advised to be likely candidates for merging, by using the madvise(2) +system call: int madvise(addr, length, MADV_MERGEABLE). + +The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel +that advice and restore unshared pages: whereupon KSM unmerges whatever +it merged in that range. Note: this unmerging call may suddenly require +more memory than is available - possibly failing with EAGAIN, but more +probably arousing the Out-Of-Memory killer. + +If KSM is not configured into the running kernel, madvise MADV_MERGEABLE +and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was +built with CONFIG_KSM=y, those calls will normally succeed: even if the +the KSM daemon is not currently running, MADV_MERGEABLE still registers +the range for whenever the KSM daemon is started; even if the range +cannot contain any pages which KSM could actually merge; even if +MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. + +Like other madvise calls, they are intended for use on mapped areas of +the user address space: they will report ENOMEM if the specified range +includes unmapped gaps (though working on the intervening mapped areas), +and might fail with EAGAIN if not enough memory for internal structures. + +Applications should be considerate in their use of MADV_MERGEABLE, +restricting its use to areas likely to benefit. KSM's scans may use +a lot of processing power, and its kernel-resident pages are a limited +resource. Some installations will disable KSM for these reasons. + +The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, +readable by all but writable only by root: + +max_kernel_pages - set to maximum number of kernel pages that KSM may use + e.g. "echo 2000 > /sys/kernel/mm/ksm/max_kernel_pages" + Value 0 imposes no limit on the kernel pages KSM may use; + but note that any process using MADV_MERGEABLE can cause + KSM to allocate these pages, unswappable until it exits. + Default: 2000 (chosen for demonstration purposes) + +pages_to_scan - how many present pages to scan before ksmd goes to sleep + e.g. "echo 200 > /sys/kernel/mm/ksm/pages_to_scan" + Default: 200 (chosen for demonstration purposes) + +sleep_millisecs - how many milliseconds ksmd should sleep before next scan + e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" + Default: 20 (chosen for demonstration purposes) + +run - set 0 to stop ksmd from running but keep merged pages, + set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", + set 2 to stop ksmd and unmerge all pages currently merged, + but leave mergeable areas registered for next run + Default: 1 (for immediate use by apps which register) + +The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: + +pages_shared - how many shared unswappable kernel pages KSM is using +pages_sharing - how many more sites are sharing them i.e. how much saved +pages_unshared - how many pages unique but repeatedly checked for merging +pages_volatile - how many pages changing too fast to be placed in a tree +full_scans - how many times all mergeable areas have been scanned + +A high ratio of pages_sharing to pages_shared indicates good sharing, but +a high ratio of pages_unshared to pages_sharing indicates wasted effort. +pages_volatile embraces several different kinds of activity, but a high +proportion there would also indicate poor use of madvise MADV_MERGEABLE. + +Izik Eidus, +Hugh Dickins, 30 July 2009 diff --git a/mm/Kconfig b/mm/Kconfig index c0b6afa178a1..71eb0b4cce8d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -224,6 +224,7 @@ config KSM the many instances by a single resident page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. + See Documentation/vm/ksm.txt for more information. config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" -- cgit v1.2.3-59-g8ed1b From 94bf5ceac095c7d4cb5e4d40fa7e2dd81d722b75 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Mon, 21 Sep 2009 17:03:48 -0700 Subject: hugetlb: add MAP_HUGETLB example Add an example of how to use the MAP_HUGETLB flag to the vm documentation directory and a reference to the example in hugetlbpage.txt. Signed-off-by: Eric B Munson Acked-by: David Rientjes Cc: Mel Gorman Cc: Adam Litke Cc: David Gibson Cc: Lee Schermerhorn Cc: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/00-INDEX | 2 ++ Documentation/vm/hugetlbpage.txt | 14 ++++---- Documentation/vm/map_hugetlb.c | 77 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 6 deletions(-) create mode 100644 Documentation/vm/map_hugetlb.c (limited to 'Documentation/vm') diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index f80a44944874..e57d6a9dd32b 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX @@ -22,3 +22,5 @@ slabinfo.c - source code for a tool to get reports about slabs. slub.txt - a short users guide for SLUB. +map_hugetlb.c + - an example program that uses the MAP_HUGETLB mmap flag. diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 3a167be78c2f..82a7bd1800b2 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -187,12 +187,14 @@ Regular chown, chgrp, and chmod commands (with right permissions) could be used to change the file attributes on hugetlbfs. Also, it is important to note that no such mount command is required if the -applications are going to use only shmat/shmget system calls. Users who -wish to use hugetlb page via shared memory segment should be a member of -a supplementary group and system admin needs to configure that gid into -/proc/sys/vm/hugetlb_shm_group. It is possible for same or different -applications to use any combination of mmaps and shm* calls, though the -mount of filesystem will be required for using mmap calls. +applications are going to use only shmat/shmget system calls or mmap with +MAP_HUGETLB. Users who wish to use hugetlb page via shared memory segment +should be a member of a supplementary group and system admin needs to +configure that gid into /proc/sys/vm/hugetlb_shm_group. It is possible for +same or different applications to use any combination of mmaps and shm* +calls, though the mount of filesystem will be required for using mmap calls +without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see +map_hugetlb.c. ******************************************************************* diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c new file mode 100644 index 000000000000..e2bdae37f499 --- /dev/null +++ b/Documentation/vm/map_hugetlb.c @@ -0,0 +1,77 @@ +/* + * Example of using hugepage memory in a user application using the mmap + * system call with MAP_HUGETLB flag. Before running this program make + * sure the administrator has allocated enough default sized huge pages + * to cover the 256 MB allocation. + * + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. + */ +#include +#include +#include +#include +#include + +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40 +#endif + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +void write_bytes(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +void read_bytes(char *addr) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < LENGTH; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + break; + } +} + +int main(void) +{ + void *addr; + + addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + printf("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr); + read_bytes(addr); + + munmap(addr, LENGTH); + + return 0; +} -- cgit v1.2.3-59-g8ed1b From b7ed698cc9d556306a4088c238e2ea9311ea2cb3 Mon Sep 17 00:00:00 2001 From: Ladinu Chandrasinghe Date: Tue, 22 Sep 2009 16:43:42 -0700 Subject: Documentation/: fix warnings from -Wmissing-prototypes in HOSTCFLAGS Fix up -Wmissing-prototypes in compileable userspace code, mainly under Documentation/. Signed-off-by: Ladinu Chandrasinghe Signed-off-by: Trevor Keith Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/getdelays.c | 12 ++--- Documentation/auxdisplay/cfag12864b-example.c | 22 ++++----- Documentation/ia64/aliasing-test.c | 8 ++-- Documentation/pcmcia/crc32hash.c | 2 +- Documentation/spi/spidev_test.c | 4 +- Documentation/video4linux/v4lgrab.c | 2 +- Documentation/vm/page-types.c | 52 ++++++++++---------- Documentation/vm/slabinfo.c | 68 +++++++++++++-------------- Documentation/watchdog/src/watchdog-test.c | 2 +- firmware/ihex2fw.c | 2 +- scripts/genksyms/genksyms.c | 6 +-- 11 files changed, 90 insertions(+), 90 deletions(-) (limited to 'Documentation/vm') diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index aa73e72fd793..6e25c2659e0a 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c @@ -116,7 +116,7 @@ error: } -int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, +static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, __u8 genl_cmd, __u16 nla_type, void *nla_data, int nla_len) { @@ -160,7 +160,7 @@ int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, * Probe the controller in genetlink to find the family id * for the TASKSTATS family */ -int get_family_id(int sd) +static int get_family_id(int sd) { struct { struct nlmsghdr n; @@ -190,7 +190,7 @@ int get_family_id(int sd) return id; } -void print_delayacct(struct taskstats *t) +static void print_delayacct(struct taskstats *t) { printf("\n\nCPU %15s%15s%15s%15s\n" " %15llu%15llu%15llu%15llu\n" @@ -216,7 +216,7 @@ void print_delayacct(struct taskstats *t) (unsigned long long)t->freepages_delay_total); } -void task_context_switch_counts(struct taskstats *t) +static void task_context_switch_counts(struct taskstats *t) { printf("\n\nTask %15s%15s\n" " %15llu%15llu\n", @@ -224,7 +224,7 @@ void task_context_switch_counts(struct taskstats *t) (unsigned long long)t->nvcsw, (unsigned long long)t->nivcsw); } -void print_cgroupstats(struct cgroupstats *c) +static void print_cgroupstats(struct cgroupstats *c) { printf("sleeping %llu, blocked %llu, running %llu, stopped %llu, " "uninterruptible %llu\n", (unsigned long long)c->nr_sleeping, @@ -235,7 +235,7 @@ void print_cgroupstats(struct cgroupstats *c) } -void print_ioacct(struct taskstats *t) +static void print_ioacct(struct taskstats *t) { printf("%s: read=%llu, write=%llu, cancelled_write=%llu\n", t->ac_comm, diff --git a/Documentation/auxdisplay/cfag12864b-example.c b/Documentation/auxdisplay/cfag12864b-example.c index 2caeea5e4993..1d2c010bae12 100644 --- a/Documentation/auxdisplay/cfag12864b-example.c +++ b/Documentation/auxdisplay/cfag12864b-example.c @@ -62,7 +62,7 @@ unsigned char cfag12864b_buffer[CFAG12864B_SIZE]; * Unable to open: return = -1 * Unable to mmap: return = -2 */ -int cfag12864b_init(char *path) +static int cfag12864b_init(char *path) { cfag12864b_fd = open(path, O_RDWR); if (cfag12864b_fd == -1) @@ -81,7 +81,7 @@ int cfag12864b_init(char *path) /* * exit a cfag12864b framebuffer device */ -void cfag12864b_exit(void) +static void cfag12864b_exit(void) { munmap(cfag12864b_mem, CFAG12864B_SIZE); close(cfag12864b_fd); @@ -90,7 +90,7 @@ void cfag12864b_exit(void) /* * set (x, y) pixel */ -void cfag12864b_set(unsigned char x, unsigned char y) +static void cfag12864b_set(unsigned char x, unsigned char y) { if (CFAG12864B_CHECK(x, y)) cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] |= @@ -100,7 +100,7 @@ void cfag12864b_set(unsigned char x, unsigned char y) /* * unset (x, y) pixel */ -void cfag12864b_unset(unsigned char x, unsigned char y) +static void cfag12864b_unset(unsigned char x, unsigned char y) { if (CFAG12864B_CHECK(x, y)) cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &= @@ -113,7 +113,7 @@ void cfag12864b_unset(unsigned char x, unsigned char y) * Pixel off: return = 0 * Pixel on: return = 1 */ -unsigned char cfag12864b_isset(unsigned char x, unsigned char y) +static unsigned char cfag12864b_isset(unsigned char x, unsigned char y) { if (CFAG12864B_CHECK(x, y)) if (cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] & @@ -126,7 +126,7 @@ unsigned char cfag12864b_isset(unsigned char x, unsigned char y) /* * not (x, y) pixel */ -void cfag12864b_not(unsigned char x, unsigned char y) +static void cfag12864b_not(unsigned char x, unsigned char y) { if (cfag12864b_isset(x, y)) cfag12864b_unset(x, y); @@ -137,7 +137,7 @@ void cfag12864b_not(unsigned char x, unsigned char y) /* * fill (set all pixels) */ -void cfag12864b_fill(void) +static void cfag12864b_fill(void) { unsigned short i; @@ -148,7 +148,7 @@ void cfag12864b_fill(void) /* * clear (unset all pixels) */ -void cfag12864b_clear(void) +static void cfag12864b_clear(void) { unsigned short i; @@ -162,7 +162,7 @@ void cfag12864b_clear(void) * Pixel off: src[i] = 0 * Pixel on: src[i] > 0 */ -void cfag12864b_format(unsigned char * matrix) +static void cfag12864b_format(unsigned char * matrix) { unsigned char i, j, n; @@ -182,7 +182,7 @@ void cfag12864b_format(unsigned char * matrix) /* * blit buffer to lcd */ -void cfag12864b_blit(void) +static void cfag12864b_blit(void) { memcpy(cfag12864b_mem, cfag12864b_buffer, CFAG12864B_SIZE); } @@ -198,7 +198,7 @@ void cfag12864b_blit(void) #define EXAMPLES 6 -void example(unsigned char n) +static void example(unsigned char n) { unsigned short i, j; unsigned char matrix[CFAG12864B_WIDTH * CFAG12864B_HEIGHT]; diff --git a/Documentation/ia64/aliasing-test.c b/Documentation/ia64/aliasing-test.c index d23610fb2ff9..3dfb76ca6931 100644 --- a/Documentation/ia64/aliasing-test.c +++ b/Documentation/ia64/aliasing-test.c @@ -24,7 +24,7 @@ int sum; -int map_mem(char *path, off_t offset, size_t length, int touch) +static int map_mem(char *path, off_t offset, size_t length, int touch) { int fd, rc; void *addr; @@ -62,7 +62,7 @@ int map_mem(char *path, off_t offset, size_t length, int touch) return 0; } -int scan_tree(char *path, char *file, off_t offset, size_t length, int touch) +static int scan_tree(char *path, char *file, off_t offset, size_t length, int touch) { struct dirent **namelist; char *name, *path2; @@ -119,7 +119,7 @@ skip: char buf[1024]; -int read_rom(char *path) +static int read_rom(char *path) { int fd, rc; size_t size = 0; @@ -146,7 +146,7 @@ int read_rom(char *path) return size; } -int scan_rom(char *path, char *file) +static int scan_rom(char *path, char *file) { struct dirent **namelist; char *name, *path2; diff --git a/Documentation/pcmcia/crc32hash.c b/Documentation/pcmcia/crc32hash.c index 4210e5abab8a..44f8beea7260 100644 --- a/Documentation/pcmcia/crc32hash.c +++ b/Documentation/pcmcia/crc32hash.c @@ -8,7 +8,7 @@ $ ./crc32hash "Dual Speed" #include #include -unsigned int crc32(unsigned char const *p, unsigned int len) +static unsigned int crc32(unsigned char const *p, unsigned int len) { int i; unsigned int crc = 0; diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c index c1a5aad3c75a..10abd3773e49 100644 --- a/Documentation/spi/spidev_test.c +++ b/Documentation/spi/spidev_test.c @@ -69,7 +69,7 @@ static void transfer(int fd) puts(""); } -void print_usage(const char *prog) +static void print_usage(const char *prog) { printf("Usage: %s [-DsbdlHOLC3]\n", prog); puts(" -D --device device to use (default /dev/spidev1.1)\n" @@ -85,7 +85,7 @@ void print_usage(const char *prog) exit(1); } -void parse_opts(int argc, char *argv[]) +static void parse_opts(int argc, char *argv[]) { while (1) { static const struct option lopts[] = { diff --git a/Documentation/video4linux/v4lgrab.c b/Documentation/video4linux/v4lgrab.c index 05769cff1009..c8ded175796e 100644 --- a/Documentation/video4linux/v4lgrab.c +++ b/Documentation/video4linux/v4lgrab.c @@ -89,7 +89,7 @@ } \ } -int get_brightness_adj(unsigned char *image, long size, int *brightness) { +static int get_brightness_adj(unsigned char *image, long size, int *brightness) { long i, tot = 0; for (i=0;i> 20; } -void fatal(const char *x, ...) +static void fatal(const char *x, ...) { va_list ap; @@ -178,7 +178,7 @@ void fatal(const char *x, ...) * page flag names */ -char *page_flag_name(uint64_t flags) +static char *page_flag_name(uint64_t flags) { static char buf[65]; int present; @@ -197,7 +197,7 @@ char *page_flag_name(uint64_t flags) return buf; } -char *page_flag_longname(uint64_t flags) +static char *page_flag_longname(uint64_t flags) { static char buf[1024]; int i, n; @@ -221,7 +221,7 @@ char *page_flag_longname(uint64_t flags) * page list and summary */ -void show_page_range(unsigned long offset, uint64_t flags) +static void show_page_range(unsigned long offset, uint64_t flags) { static uint64_t flags0; static unsigned long index; @@ -241,12 +241,12 @@ void show_page_range(unsigned long offset, uint64_t flags) count = 1; } -void show_page(unsigned long offset, uint64_t flags) +static void show_page(unsigned long offset, uint64_t flags) { printf("%lu\t%s\n", offset, page_flag_name(flags)); } -void show_summary(void) +static void show_summary(void) { int i; @@ -272,7 +272,7 @@ void show_summary(void) * page flag filters */ -int bit_mask_ok(uint64_t flags) +static int bit_mask_ok(uint64_t flags) { int i; @@ -289,7 +289,7 @@ int bit_mask_ok(uint64_t flags) return 1; } -uint64_t expand_overloaded_flags(uint64_t flags) +static uint64_t expand_overloaded_flags(uint64_t flags) { /* SLOB/SLUB overload several page flags */ if (flags & BIT(SLAB)) { @@ -308,7 +308,7 @@ uint64_t expand_overloaded_flags(uint64_t flags) return flags; } -uint64_t well_known_flags(uint64_t flags) +static uint64_t well_known_flags(uint64_t flags) { /* hide flags intended only for kernel hacker */ flags &= ~KPF_HACKERS_BITS; @@ -325,7 +325,7 @@ uint64_t well_known_flags(uint64_t flags) * page frame walker */ -int hash_slot(uint64_t flags) +static int hash_slot(uint64_t flags) { int k = HASH_KEY(flags); int i; @@ -352,7 +352,7 @@ int hash_slot(uint64_t flags) exit(EXIT_FAILURE); } -void add_page(unsigned long offset, uint64_t flags) +static void add_page(unsigned long offset, uint64_t flags) { flags = expand_overloaded_flags(flags); @@ -371,7 +371,7 @@ void add_page(unsigned long offset, uint64_t flags) total_pages++; } -void walk_pfn(unsigned long index, unsigned long count) +static void walk_pfn(unsigned long index, unsigned long count) { unsigned long batch; unsigned long n; @@ -404,7 +404,7 @@ void walk_pfn(unsigned long index, unsigned long count) } } -void walk_addr_ranges(void) +static void walk_addr_ranges(void) { int i; @@ -428,7 +428,7 @@ void walk_addr_ranges(void) * user interface */ -const char *page_flag_type(uint64_t flag) +static const char *page_flag_type(uint64_t flag) { if (flag & KPF_HACKERS_BITS) return "(r)"; @@ -437,7 +437,7 @@ const char *page_flag_type(uint64_t flag) return " "; } -void usage(void) +static void usage(void) { int i, j; @@ -482,7 +482,7 @@ void usage(void) "(r) raw mode bits (o) overloaded bits\n"); } -unsigned long long parse_number(const char *str) +static unsigned long long parse_number(const char *str) { unsigned long long n; @@ -494,16 +494,16 @@ unsigned long long parse_number(const char *str) return n; } -void parse_pid(const char *str) +static void parse_pid(const char *str) { opt_pid = parse_number(str); } -void parse_file(const char *name) +static void parse_file(const char *name) { } -void add_addr_range(unsigned long offset, unsigned long size) +static void add_addr_range(unsigned long offset, unsigned long size) { if (nr_addr_ranges >= MAX_ADDR_RANGES) fatal("too much addr ranges\n"); @@ -513,7 +513,7 @@ void add_addr_range(unsigned long offset, unsigned long size) nr_addr_ranges++; } -void parse_addr_range(const char *optarg) +static void parse_addr_range(const char *optarg) { unsigned long offset; unsigned long size; @@ -547,7 +547,7 @@ void parse_addr_range(const char *optarg) add_addr_range(offset, size); } -void add_bits_filter(uint64_t mask, uint64_t bits) +static void add_bits_filter(uint64_t mask, uint64_t bits) { if (nr_bit_filters >= MAX_BIT_FILTERS) fatal("too much bit filters\n"); @@ -557,7 +557,7 @@ void add_bits_filter(uint64_t mask, uint64_t bits) nr_bit_filters++; } -uint64_t parse_flag_name(const char *str, int len) +static uint64_t parse_flag_name(const char *str, int len) { int i; @@ -577,7 +577,7 @@ uint64_t parse_flag_name(const char *str, int len) return parse_number(str); } -uint64_t parse_flag_names(const char *str, int all) +static uint64_t parse_flag_names(const char *str, int all) { const char *p = str; uint64_t flags = 0; @@ -596,7 +596,7 @@ uint64_t parse_flag_names(const char *str, int all) return flags; } -void parse_bits_mask(const char *optarg) +static void parse_bits_mask(const char *optarg) { uint64_t mask; uint64_t bits; @@ -621,7 +621,7 @@ void parse_bits_mask(const char *optarg) } -struct option opts[] = { +static struct option opts[] = { { "raw" , 0, NULL, 'r' }, { "pid" , 1, NULL, 'p' }, { "file" , 1, NULL, 'f' }, diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index df3227605d59..92e729f4b676 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c @@ -87,7 +87,7 @@ int page_size; regex_t pattern; -void fatal(const char *x, ...) +static void fatal(const char *x, ...) { va_list ap; @@ -97,7 +97,7 @@ void fatal(const char *x, ...) exit(EXIT_FAILURE); } -void usage(void) +static void usage(void) { printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n" "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" @@ -131,7 +131,7 @@ void usage(void) ); } -unsigned long read_obj(const char *name) +static unsigned long read_obj(const char *name) { FILE *f = fopen(name, "r"); @@ -151,7 +151,7 @@ unsigned long read_obj(const char *name) /* * Get the contents of an attribute */ -unsigned long get_obj(const char *name) +static unsigned long get_obj(const char *name) { if (!read_obj(name)) return 0; @@ -159,7 +159,7 @@ unsigned long get_obj(const char *name) return atol(buffer); } -unsigned long get_obj_and_str(const char *name, char **x) +static unsigned long get_obj_and_str(const char *name, char **x) { unsigned long result = 0; char *p; @@ -178,7 +178,7 @@ unsigned long get_obj_and_str(const char *name, char **x) return result; } -void set_obj(struct slabinfo *s, const char *name, int n) +static void set_obj(struct slabinfo *s, const char *name, int n) { char x[100]; FILE *f; @@ -192,7 +192,7 @@ void set_obj(struct slabinfo *s, const char *name, int n) fclose(f); } -unsigned long read_slab_obj(struct slabinfo *s, const char *name) +static unsigned long read_slab_obj(struct slabinfo *s, const char *name) { char x[100]; FILE *f; @@ -215,7 +215,7 @@ unsigned long read_slab_obj(struct slabinfo *s, const char *name) /* * Put a size string together */ -int store_size(char *buffer, unsigned long value) +static int store_size(char *buffer, unsigned long value) { unsigned long divisor = 1; char trailer = 0; @@ -247,7 +247,7 @@ int store_size(char *buffer, unsigned long value) return n; } -void decode_numa_list(int *numa, char *t) +static void decode_numa_list(int *numa, char *t) { int node; int nr; @@ -272,7 +272,7 @@ void decode_numa_list(int *numa, char *t) } } -void slab_validate(struct slabinfo *s) +static void slab_validate(struct slabinfo *s) { if (strcmp(s->name, "*") == 0) return; @@ -280,7 +280,7 @@ void slab_validate(struct slabinfo *s) set_obj(s, "validate", 1); } -void slab_shrink(struct slabinfo *s) +static void slab_shrink(struct slabinfo *s) { if (strcmp(s->name, "*") == 0) return; @@ -290,7 +290,7 @@ void slab_shrink(struct slabinfo *s) int line = 0; -void first_line(void) +static void first_line(void) { if (show_activity) printf("Name Objects Alloc Free %%Fast Fallb O\n"); @@ -302,7 +302,7 @@ void first_line(void) /* * Find the shortest alias of a slab */ -struct aliasinfo *find_one_alias(struct slabinfo *find) +static struct aliasinfo *find_one_alias(struct slabinfo *find) { struct aliasinfo *a; struct aliasinfo *best = NULL; @@ -318,18 +318,18 @@ struct aliasinfo *find_one_alias(struct slabinfo *find) return best; } -unsigned long slab_size(struct slabinfo *s) +static unsigned long slab_size(struct slabinfo *s) { return s->slabs * (page_size << s->order); } -unsigned long slab_activity(struct slabinfo *s) +static unsigned long slab_activity(struct slabinfo *s) { return s->alloc_fastpath + s->free_fastpath + s->alloc_slowpath + s->free_slowpath; } -void slab_numa(struct slabinfo *s, int mode) +static void slab_numa(struct slabinfo *s, int mode) { int node; @@ -374,7 +374,7 @@ void slab_numa(struct slabinfo *s, int mode) line++; } -void show_tracking(struct slabinfo *s) +static void show_tracking(struct slabinfo *s) { printf("\n%s: Kernel object allocation\n", s->name); printf("-----------------------------------------------------------------------\n"); @@ -392,7 +392,7 @@ void show_tracking(struct slabinfo *s) } -void ops(struct slabinfo *s) +static void ops(struct slabinfo *s) { if (strcmp(s->name, "*") == 0) return; @@ -405,14 +405,14 @@ void ops(struct slabinfo *s) printf("\n%s has no kmem_cache operations\n", s->name); } -const char *onoff(int x) +static const char *onoff(int x) { if (x) return "On "; return "Off"; } -void slab_stats(struct slabinfo *s) +static void slab_stats(struct slabinfo *s) { unsigned long total_alloc; unsigned long total_free; @@ -477,7 +477,7 @@ void slab_stats(struct slabinfo *s) s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total); } -void report(struct slabinfo *s) +static void report(struct slabinfo *s) { if (strcmp(s->name, "*") == 0) return; @@ -518,7 +518,7 @@ void report(struct slabinfo *s) slab_stats(s); } -void slabcache(struct slabinfo *s) +static void slabcache(struct slabinfo *s) { char size_str[20]; char dist_str[40]; @@ -593,7 +593,7 @@ void slabcache(struct slabinfo *s) /* * Analyze debug options. Return false if something is amiss. */ -int debug_opt_scan(char *opt) +static int debug_opt_scan(char *opt) { if (!opt || !opt[0] || strcmp(opt, "-") == 0) return 1; @@ -642,7 +642,7 @@ int debug_opt_scan(char *opt) return 1; } -int slab_empty(struct slabinfo *s) +static int slab_empty(struct slabinfo *s) { if (s->objects > 0) return 0; @@ -657,7 +657,7 @@ int slab_empty(struct slabinfo *s) return 1; } -void slab_debug(struct slabinfo *s) +static void slab_debug(struct slabinfo *s) { if (strcmp(s->name, "*") == 0) return; @@ -717,7 +717,7 @@ void slab_debug(struct slabinfo *s) set_obj(s, "trace", 1); } -void totals(void) +static void totals(void) { struct slabinfo *s; @@ -976,7 +976,7 @@ void totals(void) b1, b2, b3); } -void sort_slabs(void) +static void sort_slabs(void) { struct slabinfo *s1,*s2; @@ -1005,7 +1005,7 @@ void sort_slabs(void) } } -void sort_aliases(void) +static void sort_aliases(void) { struct aliasinfo *a1,*a2; @@ -1030,7 +1030,7 @@ void sort_aliases(void) } } -void link_slabs(void) +static void link_slabs(void) { struct aliasinfo *a; struct slabinfo *s; @@ -1048,7 +1048,7 @@ void link_slabs(void) } } -void alias(void) +static void alias(void) { struct aliasinfo *a; char *active = NULL; @@ -1079,7 +1079,7 @@ void alias(void) } -void rename_slabs(void) +static void rename_slabs(void) { struct slabinfo *s; struct aliasinfo *a; @@ -1102,12 +1102,12 @@ void rename_slabs(void) } } -int slab_mismatch(char *slab) +static int slab_mismatch(char *slab) { return regexec(&pattern, slab, 0, NULL, 0); } -void read_slab_dir(void) +static void read_slab_dir(void) { DIR *dir; struct dirent *de; @@ -1209,7 +1209,7 @@ void read_slab_dir(void) fatal("Too many aliases\n"); } -void output_slabs(void) +static void output_slabs(void) { struct slabinfo *slab; diff --git a/Documentation/watchdog/src/watchdog-test.c b/Documentation/watchdog/src/watchdog-test.c index 65f6c19cb865..a750532ffcf8 100644 --- a/Documentation/watchdog/src/watchdog-test.c +++ b/Documentation/watchdog/src/watchdog-test.c @@ -18,7 +18,7 @@ int fd; * the PC Watchdog card to reset its internal timer so it doesn't trigger * a computer reset. */ -void keep_alive(void) +static void keep_alive(void) { int dummy; diff --git a/firmware/ihex2fw.c b/firmware/ihex2fw.c index 8f7fdaa9e010..5a03ba8c8364 100644 --- a/firmware/ihex2fw.c +++ b/firmware/ihex2fw.c @@ -56,7 +56,7 @@ static int output_records(int outfd); static int sort_records = 0; static int wide_records = 0; -int usage(void) +static int usage(void) { fprintf(stderr, "ihex2fw: Convert ihex files into binary " "representation for use by Linux kernel\n"); diff --git a/scripts/genksyms/genksyms.c b/scripts/genksyms/genksyms.c index 3a8297b5184c..af6b8363a2d5 100644 --- a/scripts/genksyms/genksyms.c +++ b/scripts/genksyms/genksyms.c @@ -176,7 +176,7 @@ static int is_unknown_symbol(struct symbol *sym) strcmp(defn->string, "{") == 0); } -struct symbol *__add_symbol(const char *name, enum symbol_type type, +static struct symbol *__add_symbol(const char *name, enum symbol_type type, struct string_list *defn, int is_extern, int is_reference) { @@ -265,7 +265,7 @@ struct symbol *add_symbol(const char *name, enum symbol_type type, return __add_symbol(name, type, defn, is_extern, 0); } -struct symbol *add_reference_symbol(const char *name, enum symbol_type type, +static struct symbol *add_reference_symbol(const char *name, enum symbol_type type, struct string_list *defn, int is_extern) { return __add_symbol(name, type, defn, is_extern, 1); @@ -313,7 +313,7 @@ static int equal_list(struct string_list *a, struct string_list *b) #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) -struct string_list *read_node(FILE *f) +static struct string_list *read_node(FILE *f) { char buffer[256]; struct string_list node = { -- cgit v1.2.3-59-g8ed1b From 25d9e2d15286281ec834b829a4aaf8969011f1cd Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Fri, 21 Aug 2009 02:35:05 +1000 Subject: truncate: new helpers Introduce new truncate helpers truncate_pagecache and inode_newsize_ok. vmtruncate is also consolidated from mm/memory.c and mm/nommu.c and into mm/truncate.c. Reviewed-by: Christoph Hellwig Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- Documentation/vm/locking | 2 +- fs/attr.c | 46 ++++++++++++++++++++++++++++++++-- include/linux/fs.h | 3 ++- include/linux/mm.h | 5 ++-- mm/filemap.c | 2 +- mm/memory.c | 62 +++------------------------------------------- mm/mremap.c | 4 +-- mm/nommu.c | 40 ------------------------------ mm/truncate.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 120 insertions(+), 108 deletions(-) (limited to 'Documentation/vm') diff --git a/Documentation/vm/locking b/Documentation/vm/locking index f366fa956179..25fadb448760 100644 --- a/Documentation/vm/locking +++ b/Documentation/vm/locking @@ -80,7 +80,7 @@ Note: PTL can also be used to guarantee that no new clones using the mm start up ... this is a loose form of stability on mm_users. For example, it is used in copy_mm to protect against a racing tlb_gather_mmu single address space optimization, so that the zap_page_range (from -vmtruncate) does not lose sending ipi's to cloned threads that might +truncate) does not lose sending ipi's to cloned threads that might be spawned underneath it and go to user mode to drag in pte's into tlbs. swap_lock diff --git a/fs/attr.c b/fs/attr.c index 9fe1b1bd30a8..96d394bdaddf 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,7 +18,7 @@ /* Taken over from the old code... */ /* POSIX UID/GID verification for setting inode attributes. */ -int inode_change_ok(struct inode *inode, struct iattr *attr) +int inode_change_ok(const struct inode *inode, struct iattr *attr) { int retval = -EPERM; unsigned int ia_valid = attr->ia_valid; @@ -60,9 +60,51 @@ fine: error: return retval; } - EXPORT_SYMBOL(inode_change_ok); +/** + * inode_newsize_ok - may this inode be truncated to a given size + * @inode: the inode to be truncated + * @offset: the new size to assign to the inode + * @Returns: 0 on success, -ve errno on failure + * + * inode_newsize_ok will check filesystem limits and ulimits to check that the + * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ + * when necessary. Caller must not proceed with inode size change if failure is + * returned. @inode must be a file (not directory), with appropriate + * permissions to allow truncate (inode_newsize_ok does NOT check these + * conditions). + * + * inode_newsize_ok must be called with i_mutex held. + */ +int inode_newsize_ok(const struct inode *inode, loff_t offset) +{ + if (inode->i_size < offset) { + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + } else { + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + } + + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} +EXPORT_SYMBOL(inode_newsize_ok); + int inode_setattr(struct inode * inode, struct iattr * attr) { unsigned int ia_valid = attr->ia_valid; diff --git a/include/linux/fs.h b/include/linux/fs.h index 502d96ef345d..2b08b5ce09b6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2382,7 +2382,8 @@ extern int buffer_migrate_page(struct address_space *, #define buffer_migrate_page NULL #endif -extern int inode_change_ok(struct inode *, struct iattr *); +extern int inode_change_ok(const struct inode *, struct iattr *); +extern int inode_newsize_ok(const struct inode *, loff_t offset); extern int __must_check inode_setattr(struct inode *, struct iattr *); extern void file_update_time(struct file *file); diff --git a/include/linux/mm.h b/include/linux/mm.h index b6eae5e3144b..8347e938fb2f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -791,8 +791,9 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, unmap_mapping_range(mapping, holebegin, holelen, 0); } -extern int vmtruncate(struct inode * inode, loff_t offset); -extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); +extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); +extern int vmtruncate(struct inode *inode, loff_t offset); +extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end); #ifdef CONFIG_MMU extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/mm/filemap.c b/mm/filemap.c index bcc7372aebbc..33349adb227a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -58,7 +58,7 @@ /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_lock (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock diff --git a/mm/memory.c b/mm/memory.c index b1443ac07c00..ebcd3decac89 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -297,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr = vma->vm_start; /* - * Hide vma from rmap and vmtruncate before freeing pgtables + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables */ anon_vma_unlink(vma); unlink_file_vma(vma); @@ -2407,7 +2408,7 @@ restart: * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE - * boundary. Note that this is different from vmtruncate(), which + * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded @@ -2458,63 +2459,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/** - * vmtruncate - unmap mappings "freed" by truncate() syscall - * @inode: inode of the file used - * @offset: file offset to start truncating - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode * inode, loff_t offset) -{ - if (inode->i_size < offset) { - unsigned long limit; - - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - } else { - struct address_space *mapping = inode->i_mapping; - - /* - * truncation of in-use swapfiles is disallowed - it would - * cause subsequent swapout to scribble on the now-freed - * blocks. - */ - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - i_size_write(inode, offset); - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - } - - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; - -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; -} -EXPORT_SYMBOL(vmtruncate); - int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) { struct address_space *mapping = inode->i_mapping; diff --git a/mm/mremap.c b/mm/mremap.c index 20a07dba6be0..97bff2547719 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -86,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock vmtruncate out, - * since it might clean the dst vma before the src vma, + * moving file-based ptes, we must lock truncate_pagecache + * out, since it might clean the dst vma before the src vma, * and we propagate stale pages into the dst afterward. */ mapping = vma->vm_file->f_mapping; diff --git a/mm/nommu.c b/mm/nommu.c index 8d484241d034..56a446f05971 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -82,46 +82,6 @@ DECLARE_RWSEM(nommu_region_sem); struct vm_operations_struct generic_file_vm_ops = { }; -/* - * Handle all mappings that got truncated by a "truncate()" - * system call. - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode *inode, loff_t offset) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long limit; - - if (inode->i_size < offset) - goto do_expand; - i_size_write(inode, offset); - - truncate_inode_pages(mapping, offset); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; - i_size_write(inode, offset); - -out_truncate: - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out: - return -EFBIG; -} - -EXPORT_SYMBOL(vmtruncate); - /* * Return the total memory allocated for this pointer, not * just what the caller asked for. diff --git a/mm/truncate.c b/mm/truncate.c index ccc3ecf7cb98..5900afca0fa9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -465,3 +465,67 @@ int invalidate_inode_pages2(struct address_space *mapping) return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); + +/** + * truncate_pagecache - unmap and remove pagecache that has been truncated + * @inode: inode + * @old: old file offset + * @new: new file offset + * + * inode's new i_size must already be written before truncate_pagecache + * is called. + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +{ + if (new < old) { + struct address_space *mapping = inode->i_mapping; + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, new); + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + } +} +EXPORT_SYMBOL(truncate_pagecache); + +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode *inode, loff_t offset) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, offset); + if (error) + return error; + oldsize = inode->i_size; + i_size_write(inode, offset); + truncate_pagecache(inode, oldsize, offset); + if (inode->i_op->truncate) + inode->i_op->truncate(inode); + + return error; +} +EXPORT_SYMBOL(vmtruncate); -- cgit v1.2.3-59-g8ed1b From ba36c440ba9486b155c9254ce5e50f5f20eb1fcb Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 23 Sep 2009 15:56:15 -0700 Subject: Documentation/vm/.gitignore: add page-types Signed-off-by: Josh Triplett Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation/vm') diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore index 33e8a023df02..09b164a5700f 100644 --- a/Documentation/vm/.gitignore +++ b/Documentation/vm/.gitignore @@ -1 +1,2 @@ +page-types slabinfo -- cgit v1.2.3-59-g8ed1b From 0b4b2ad5307c76c7105d6e7c724b1c14b8daf482 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 23 Sep 2009 15:56:16 -0700 Subject: page-types: add feature for walking process address space Introduce "-p|--pid " for walking the process address space. The default action is to walk raw memory PFNs. Both the virtual address and physical address of each present pages will be listed: # ./tools/vm/page-types -lp $$ | head -3 voffset offset len flags 400 11bebe 1 __RU_lA____M______________________ 402 11bebc 1 __RU_lA____M______________________ Note that voffset/offset/len are now showed as hex numbers. [akpm@linux-foundation.org: coding-style fixes] Cc: Andi Kleen Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page-types.c | 200 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 180 insertions(+), 20 deletions(-) (limited to 'Documentation/vm') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 3eda8ea00852..fa1a30d9e9d5 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -5,6 +5,7 @@ * Copyright (C) 2009 Wu Fengguang */ +#define _LARGEFILE64_SOURCE #include #include #include @@ -13,11 +14,32 @@ #include #include #include +#include #include #include #include +/* + * pagemap kernel ABI bits + */ + +#define PM_ENTRY_BYTES sizeof(uint64_t) +#define PM_STATUS_BITS 3 +#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) +#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) +#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) +#define PM_PSHIFT_BITS 6 +#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) +#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) +#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) +#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) + +#define PM_PRESENT PM_STATUS(4LL) +#define PM_SWAP PM_STATUS(2LL) + + /* * kernel page flags */ @@ -126,6 +148,14 @@ static int nr_addr_ranges; static unsigned long opt_offset[MAX_ADDR_RANGES]; static unsigned long opt_size[MAX_ADDR_RANGES]; +#define MAX_VMAS 10240 +static int nr_vmas; +static unsigned long pg_start[MAX_VMAS]; +static unsigned long pg_end[MAX_VMAS]; +static unsigned long voffset; + +static int pagemap_fd; + #define MAX_BIT_FILTERS 64 static int nr_bit_filters; static uint64_t opt_mask[MAX_BIT_FILTERS]; @@ -135,7 +165,6 @@ static int page_size; #define PAGES_BATCH (64 << 10) /* 64k pages */ static int kpageflags_fd; -static uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH]; #define HASH_SHIFT 13 #define HASH_SIZE (1 << HASH_SHIFT) @@ -158,6 +187,11 @@ static uint64_t page_flags[HASH_SIZE]; type __min2 = (y); \ __min1 < __min2 ? __min1 : __min2; }) +#define max_t(type, x, y) ({ \ + type __max1 = (x); \ + type __max2 = (y); \ + __max1 > __max2 ? __max1 : __max2; }) + static unsigned long pages2mb(unsigned long pages) { return (pages * page_size) >> 20; @@ -224,26 +258,34 @@ static char *page_flag_longname(uint64_t flags) static void show_page_range(unsigned long offset, uint64_t flags) { static uint64_t flags0; + static unsigned long voff; static unsigned long index; static unsigned long count; - if (flags == flags0 && offset == index + count) { + if (flags == flags0 && offset == index + count && + (!opt_pid || voffset == voff + count)) { count++; return; } - if (count) - printf("%lu\t%lu\t%s\n", + if (count) { + if (opt_pid) + printf("%lx\t", voff); + printf("%lx\t%lx\t%s\n", index, count, page_flag_name(flags0)); + } flags0 = flags; index = offset; + voff = voffset; count = 1; } static void show_page(unsigned long offset, uint64_t flags) { - printf("%lu\t%s\n", offset, page_flag_name(flags)); + if (opt_pid) + printf("%lx\t", voffset); + printf("%lx\t%s\n", offset, page_flag_name(flags)); } static void show_summary(void) @@ -383,6 +425,8 @@ static void walk_pfn(unsigned long index, unsigned long count) lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET); while (count) { + uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH]; + batch = min_t(unsigned long, count, PAGES_BATCH); n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES); if (n == 0) @@ -404,6 +448,81 @@ static void walk_pfn(unsigned long index, unsigned long count) } } + +#define PAGEMAP_BATCH 4096 +static unsigned long task_pfn(unsigned long pgoff) +{ + static uint64_t buf[PAGEMAP_BATCH]; + static unsigned long start; + static long count; + uint64_t pfn; + + if (pgoff < start || pgoff >= start + count) { + if (lseek64(pagemap_fd, + (uint64_t)pgoff * PM_ENTRY_BYTES, + SEEK_SET) < 0) { + perror("pagemap seek"); + exit(EXIT_FAILURE); + } + count = read(pagemap_fd, buf, sizeof(buf)); + if (count == 0) + return 0; + if (count < 0) { + perror("pagemap read"); + exit(EXIT_FAILURE); + } + if (count % PM_ENTRY_BYTES) { + fatal("pagemap read not aligned.\n"); + exit(EXIT_FAILURE); + } + count /= PM_ENTRY_BYTES; + start = pgoff; + } + + pfn = buf[pgoff - start]; + if (pfn & PM_PRESENT) + pfn = PM_PFRAME(pfn); + else + pfn = 0; + + return pfn; +} + +static void walk_task(unsigned long index, unsigned long count) +{ + int i = 0; + const unsigned long end = index + count; + + while (index < end) { + + while (pg_end[i] <= index) + if (++i >= nr_vmas) + return; + if (pg_start[i] >= end) + return; + + voffset = max_t(unsigned long, pg_start[i], index); + index = min_t(unsigned long, pg_end[i], end); + + assert(voffset < index); + for (; voffset < index; voffset++) { + unsigned long pfn = task_pfn(voffset); + if (pfn) + walk_pfn(pfn, 1); + } + } +} + +static void add_addr_range(unsigned long offset, unsigned long size) +{ + if (nr_addr_ranges >= MAX_ADDR_RANGES) + fatal("too many addr ranges\n"); + + opt_offset[nr_addr_ranges] = offset; + opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset); + nr_addr_ranges++; +} + static void walk_addr_ranges(void) { int i; @@ -415,10 +534,13 @@ static void walk_addr_ranges(void) } if (!nr_addr_ranges) - walk_pfn(0, ULONG_MAX); + add_addr_range(0, ULONG_MAX); for (i = 0; i < nr_addr_ranges; i++) - walk_pfn(opt_offset[i], opt_size[i]); + if (!opt_pid) + walk_pfn(opt_offset[i], opt_size[i]); + else + walk_task(opt_offset[i], opt_size[i]); close(kpageflags_fd); } @@ -446,8 +568,8 @@ static void usage(void) " -r|--raw Raw mode, for kernel developers\n" " -a|--addr addr-spec Walk a range of pages\n" " -b|--bits bits-spec Walk pages with specified bits\n" -#if 0 /* planned features */ " -p|--pid pid Walk process address space\n" +#if 0 /* planned features */ " -f|--file filename Walk file address space\n" #endif " -l|--list Show page details in ranges\n" @@ -459,7 +581,7 @@ static void usage(void) " N+M pages range from N to N+M-1\n" " N,M pages range from N to M-1\n" " N, pages range from N to end\n" -" ,M pages range from 0 to M\n" +" ,M pages range from 0 to M-1\n" "bits-spec:\n" " bit1,bit2 (flags & (bit1|bit2)) != 0\n" " bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" @@ -496,21 +618,57 @@ static unsigned long long parse_number(const char *str) static void parse_pid(const char *str) { + FILE *file; + char buf[5000]; + opt_pid = parse_number(str); -} -static void parse_file(const char *name) -{ + sprintf(buf, "/proc/%d/pagemap", opt_pid); + pagemap_fd = open(buf, O_RDONLY); + if (pagemap_fd < 0) { + perror(buf); + exit(EXIT_FAILURE); + } + + sprintf(buf, "/proc/%d/maps", opt_pid); + file = fopen(buf, "r"); + if (!file) { + perror(buf); + exit(EXIT_FAILURE); + } + + while (fgets(buf, sizeof(buf), file) != NULL) { + unsigned long vm_start; + unsigned long vm_end; + unsigned long long pgoff; + int major, minor; + char r, w, x, s; + unsigned long ino; + int n; + + n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu", + &vm_start, + &vm_end, + &r, &w, &x, &s, + &pgoff, + &major, &minor, + &ino); + if (n < 10) { + fprintf(stderr, "unexpected line: %s\n", buf); + continue; + } + pg_start[nr_vmas] = vm_start / page_size; + pg_end[nr_vmas] = vm_end / page_size; + if (++nr_vmas >= MAX_VMAS) { + fprintf(stderr, "too many VMAs\n"); + break; + } + } + fclose(file); } -static void add_addr_range(unsigned long offset, unsigned long size) +static void parse_file(const char *name) { - if (nr_addr_ranges >= MAX_ADDR_RANGES) - fatal("too much addr ranges\n"); - - opt_offset[nr_addr_ranges] = offset; - opt_size[nr_addr_ranges] = size; - nr_addr_ranges++; } static void parse_addr_range(const char *optarg) @@ -676,8 +834,10 @@ int main(int argc, char *argv[]) } } + if (opt_list && opt_pid) + printf("voffset\t"); if (opt_list == 1) - printf("offset\tcount\tflags\n"); + printf("offset\tlen\tflags\n"); if (opt_list == 2) printf("offset\tflags\n"); -- cgit v1.2.3-59-g8ed1b From c73602ad31cdcf7e6651f43d12f65b5b9b825b6f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 7 Oct 2009 16:32:22 -0700 Subject: ksm: more on default values Adjust the max_kernel_pages default to a quarter of totalram_pages, instead of nr_free_buffer_pages() / 4: the KSM pages themselves come from highmem, and even on a 16GB PAE machine, 4GB of KSM pages would only be pinning 32MB of lowmem with their rmap_items, so no need for the more obscure calculation (nor for its own special init function). There is no way for the user to switch KSM on if CONFIG_SYSFS is not enabled, so in that case default run to KSM_RUN_MERGE. Update KSM Documentation and Kconfig to reflect the new defaults. Signed-off-by: Hugh Dickins Cc: Izik Eidus Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/ksm.txt | 13 +++++++------ mm/Kconfig | 4 +++- mm/ksm.c | 10 ++++------ 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'Documentation/vm') diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index 72a22f65960e..262d8e6793a3 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -52,15 +52,15 @@ The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, readable by all but writable only by root: max_kernel_pages - set to maximum number of kernel pages that KSM may use - e.g. "echo 2000 > /sys/kernel/mm/ksm/max_kernel_pages" + e.g. "echo 100000 > /sys/kernel/mm/ksm/max_kernel_pages" Value 0 imposes no limit on the kernel pages KSM may use; but note that any process using MADV_MERGEABLE can cause KSM to allocate these pages, unswappable until it exits. - Default: 2000 (chosen for demonstration purposes) + Default: quarter of memory (chosen to not pin too much) pages_to_scan - how many present pages to scan before ksmd goes to sleep - e.g. "echo 200 > /sys/kernel/mm/ksm/pages_to_scan" - Default: 200 (chosen for demonstration purposes) + e.g. "echo 100 > /sys/kernel/mm/ksm/pages_to_scan" + Default: 100 (chosen for demonstration purposes) sleep_millisecs - how many milliseconds ksmd should sleep before next scan e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" @@ -70,7 +70,8 @@ run - set 0 to stop ksmd from running but keep merged pages, set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", set 2 to stop ksmd and unmerge all pages currently merged, but leave mergeable areas registered for next run - Default: 1 (for immediate use by apps which register) + Default: 0 (must be changed to 1 to activate KSM, + except if CONFIG_SYSFS is disabled) The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: @@ -86,4 +87,4 @@ pages_volatile embraces several different kinds of activity, but a high proportion there would also indicate poor use of madvise MADV_MERGEABLE. Izik Eidus, -Hugh Dickins, 30 July 2009 +Hugh Dickins, 24 Sept 2009 diff --git a/mm/Kconfig b/mm/Kconfig index edd300aca173..57963c6063d1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -224,7 +224,9 @@ config KSM the many instances by a single resident page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. - See Documentation/vm/ksm.txt for more information. + See Documentation/vm/ksm.txt for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" diff --git a/mm/ksm.c b/mm/ksm.c index f7edac356f46..bef1af4f77e3 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -184,11 +184,6 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock); sizeof(struct __struct), __alignof__(struct __struct),\ (__flags), NULL) -static void __init ksm_init_max_kernel_pages(void) -{ - ksm_max_kernel_pages = nr_free_buffer_pages() / 4; -} - static int __init ksm_slab_init(void) { rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); @@ -1673,7 +1668,7 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; - ksm_init_max_kernel_pages(); + ksm_max_kernel_pages = totalram_pages / 4; err = ksm_slab_init(); if (err) @@ -1697,6 +1692,9 @@ static int __init ksm_init(void) kthread_stop(ksm_thread); goto out_free2; } +#else + ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ + #endif /* CONFIG_SYSFS */ return 0; -- cgit v1.2.3-59-g8ed1b From 253fb02d62571e5455eedc9e39b9d660e86a40f0 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:27 -0700 Subject: pagemap: export KPF_HWPOISON This flag indicates a hardware detected memory corruption on the page. Any future access of the page data may bring down the machine. Signed-off-by: Wu Fengguang Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page-types.c | 2 ++ Documentation/vm/pagemap.txt | 4 ++++ fs/proc/page.c | 5 +++++ 3 files changed, 11 insertions(+) (limited to 'Documentation/vm') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index fa1a30d9e9d5..87f57228f56e 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -69,6 +69,7 @@ #define KPF_COMPOUND_TAIL 16 #define KPF_HUGE 17 #define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 #define KPF_NOPAGE 20 /* [32-] kernel hacking assistances */ @@ -116,6 +117,7 @@ static char *page_flag_names[] = { [KPF_COMPOUND_TAIL] = "T:compound_tail", [KPF_HUGE] = "G:huge", [KPF_UNEVICTABLE] = "u:unevictable", + [KPF_HWPOISON] = "X:hwpoison", [KPF_NOPAGE] = "n:nopage", [KPF_RESERVED] = "r:reserved", diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 600a304a828c..2fdd84a19109 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -57,6 +57,7 @@ There are three components to pagemap: 16. COMPOUND_TAIL 16. HUGE 18. UNEVICTABLE + 19. HWPOISON 20. NOPAGE Short descriptions to the page flags: @@ -86,6 +87,9 @@ Short descriptions to the page flags: 17. HUGE this is an integral part of a HugeTLB page +19. HWPOISON + hardware detected memory corruption on this page: don't touch the data! + 20. NOPAGE no page frame exists at the requested address diff --git a/fs/proc/page.c b/fs/proc/page.c index 2281c2cbfe2b..5033ce0d254b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -94,6 +94,7 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_COMPOUND_TAIL 16 #define KPF_HUGE 17 #define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 #define KPF_NOPAGE 20 #define KPF_KSM 21 @@ -180,6 +181,10 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif -- cgit v1.2.3-59-g8ed1b From a1bbb5ec39042fb762e7f4bcc634da0d87834193 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:28 -0700 Subject: pagemap: document KPF_KSM and show it in page-types It indicates to the system admin that processes mapping such pages may be eating less physical memory than the reported numbers by legacy tools. Signed-off-by: Wu Fengguang Cc: Hugh Dickins Cc: Izik Eidus Acked-by: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page-types.c | 2 ++ Documentation/vm/pagemap.txt | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'Documentation/vm') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 87f57228f56e..9899fa239eed 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -71,6 +71,7 @@ #define KPF_UNEVICTABLE 18 #define KPF_HWPOISON 19 #define KPF_NOPAGE 20 +#define KPF_KSM 21 /* [32-] kernel hacking assistances */ #define KPF_RESERVED 32 @@ -119,6 +120,7 @@ static char *page_flag_names[] = { [KPF_UNEVICTABLE] = "u:unevictable", [KPF_HWPOISON] = "X:hwpoison", [KPF_NOPAGE] = "n:nopage", + [KPF_KSM] = "x:ksm", [KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked", diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 2fdd84a19109..df09b9650a81 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -59,6 +59,7 @@ There are three components to pagemap: 18. UNEVICTABLE 19. HWPOISON 20. NOPAGE + 21. KSM Short descriptions to the page flags: @@ -93,6 +94,9 @@ Short descriptions to the page flags: 20. NOPAGE no page frame exists at the requested address +21. KSM + identical memory pages dynamically shared between one or more processes + [IO related page flags] 1. ERROR IO error occurred 3. UPTODATE page has up-to-date data -- cgit v1.2.3-59-g8ed1b From 0c57effe27eb6544eb44d5fac563b7334e3bc771 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:28 -0700 Subject: page-types: add GPL note Signed-off-by: Wu Fengguang Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page-types.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'Documentation/vm') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 9899fa239eed..e46fb5b2107d 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -2,7 +2,10 @@ * page-types: Tool for querying page flags * * Copyright (C) 2009 Intel corporation - * Copyright (C) 2009 Wu Fengguang + * + * Authors: Wu Fengguang + * + * Released under the General Public License (GPL). */ #define _LARGEFILE64_SOURCE -- cgit v1.2.3-59-g8ed1b From 31bbf66eaaaf50ba79e50ab7d3c89531b31c0614 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:29 -0700 Subject: page-types: introduce checked_open() This helps merge duplicate code (now and future) and outstand the main logic. Signed-off-by: Wu Fengguang Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page-types.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'Documentation/vm') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index e46fb5b2107d..6bdcc0632c24 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -214,6 +214,18 @@ static void fatal(const char *x, ...) exit(EXIT_FAILURE); } +int checked_open(const char *pathname, int flags) +{ + int fd = open(pathname, flags); + + if (fd < 0) { + perror(pathname); + exit(EXIT_FAILURE); + } + + return fd; +} + /* * page flag names @@ -534,11 +546,7 @@ static void walk_addr_ranges(void) { int i; - kpageflags_fd = open(PROC_KPAGEFLAGS, O_RDONLY); - if (kpageflags_fd < 0) { - perror(PROC_KPAGEFLAGS); - exit(EXIT_FAILURE); - } + kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); if (!nr_addr_ranges) add_addr_range(0, ULONG_MAX); @@ -631,11 +639,7 @@ static void parse_pid(const char *str) opt_pid = parse_number(str); sprintf(buf, "/proc/%d/pagemap", opt_pid); - pagemap_fd = open(buf, O_RDONLY); - if (pagemap_fd < 0) { - perror(buf); - exit(EXIT_FAILURE); - } + pagemap_fd = checked_open(buf, O_RDONLY); sprintf(buf, "/proc/%d/maps", opt_pid); file = fopen(buf, "r"); -- cgit v1.2.3-59-g8ed1b From 4a1b6726feee3132905996ae4cd1c7dc2104e721 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:30 -0700 Subject: page-types: make standalone pagemap/kpageflags read routines Refactor the code to be more modular and easier to reuse. Signed-off-by: Wu Fengguang Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page-types.c | 154 ++++++++++++++++++++++++------------------ 1 file changed, 89 insertions(+), 65 deletions(-) (limited to 'Documentation/vm') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 6bdcc0632c24..18e891a329e0 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -161,8 +161,6 @@ static unsigned long pg_start[MAX_VMAS]; static unsigned long pg_end[MAX_VMAS]; static unsigned long voffset; -static int pagemap_fd; - #define MAX_BIT_FILTERS 64 static int nr_bit_filters; static uint64_t opt_mask[MAX_BIT_FILTERS]; @@ -170,7 +168,7 @@ static uint64_t opt_bits[MAX_BIT_FILTERS]; static int page_size; -#define PAGES_BATCH (64 << 10) /* 64k pages */ +static int pagemap_fd; static int kpageflags_fd; #define HASH_SHIFT 13 @@ -226,6 +224,62 @@ int checked_open(const char *pathname, int flags) return fd; } +/* + * pagemap/kpageflags routines + */ + +static unsigned long do_u64_read(int fd, char *name, + uint64_t *buf, + unsigned long index, + unsigned long count) +{ + long bytes; + + if (index > ULONG_MAX / 8) + fatal("index overflow: %lu\n", index); + + if (lseek(fd, index * 8, SEEK_SET) < 0) { + perror(name); + exit(EXIT_FAILURE); + } + + bytes = read(fd, buf, count * 8); + if (bytes < 0) { + perror(name); + exit(EXIT_FAILURE); + } + if (bytes % 8) + fatal("partial read: %lu bytes\n", bytes); + + return bytes / 8; +} + +static unsigned long kpageflags_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages); +} + +static unsigned long pagemap_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages); +} + +static unsigned long pagemap_pfn(uint64_t val) +{ + unsigned long pfn; + + if (val & PM_PRESENT) + pfn = PM_PFRAME(val); + else + pfn = 0; + + return pfn; +} + /* * page flag names @@ -432,79 +486,53 @@ static void add_page(unsigned long offset, uint64_t flags) total_pages++; } +#define KPAGEFLAGS_BATCH (64 << 10) /* 64k pages */ static void walk_pfn(unsigned long index, unsigned long count) { + uint64_t buf[KPAGEFLAGS_BATCH]; unsigned long batch; - unsigned long n; + unsigned long pages; unsigned long i; - if (index > ULONG_MAX / KPF_BYTES) - fatal("index overflow: %lu\n", index); - - lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET); - while (count) { - uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH]; - - batch = min_t(unsigned long, count, PAGES_BATCH); - n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES); - if (n == 0) + batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH); + pages = kpageflags_read(buf, index, batch); + if (pages == 0) break; - if (n < 0) { - perror(PROC_KPAGEFLAGS); - exit(EXIT_FAILURE); - } - - if (n % KPF_BYTES != 0) - fatal("partial read: %lu bytes\n", n); - n = n / KPF_BYTES; - for (i = 0; i < n; i++) - add_page(index + i, kpageflags_buf[i]); + for (i = 0; i < pages; i++) + add_page(index + i, buf[i]); - index += batch; - count -= batch; + index += pages; + count -= pages; } } - -#define PAGEMAP_BATCH 4096 -static unsigned long task_pfn(unsigned long pgoff) +#define PAGEMAP_BATCH (64 << 10) +static void walk_vma(unsigned long index, unsigned long count) { - static uint64_t buf[PAGEMAP_BATCH]; - static unsigned long start; - static long count; - uint64_t pfn; + uint64_t buf[PAGEMAP_BATCH]; + unsigned long batch; + unsigned long pages; + unsigned long pfn; + unsigned long i; - if (pgoff < start || pgoff >= start + count) { - if (lseek64(pagemap_fd, - (uint64_t)pgoff * PM_ENTRY_BYTES, - SEEK_SET) < 0) { - perror("pagemap seek"); - exit(EXIT_FAILURE); - } - count = read(pagemap_fd, buf, sizeof(buf)); - if (count == 0) - return 0; - if (count < 0) { - perror("pagemap read"); - exit(EXIT_FAILURE); - } - if (count % PM_ENTRY_BYTES) { - fatal("pagemap read not aligned.\n"); - exit(EXIT_FAILURE); - } - count /= PM_ENTRY_BYTES; - start = pgoff; - } + while (count) { + batch = min_t(unsigned long, count, PAGEMAP_BATCH); + pages = pagemap_read(buf, index, batch); + if (pages == 0) + break; - pfn = buf[pgoff - start]; - if (pfn & PM_PRESENT) - pfn = PM_PFRAME(pfn); - else - pfn = 0; + for (i = 0; i < pages; i++) { + pfn = pagemap_pfn(buf[i]); + voffset = index + i; + if (pfn) + walk_pfn(pfn, 1); + } - return pfn; + index += pages; + count -= pages; + } } static void walk_task(unsigned long index, unsigned long count) @@ -524,11 +552,7 @@ static void walk_task(unsigned long index, unsigned long count) index = min_t(unsigned long, pg_end[i], end); assert(voffset < index); - for (; voffset < index; voffset++) { - unsigned long pfn = task_pfn(voffset); - if (pfn) - walk_pfn(pfn, 1); - } + walk_vma(voffset, index - voffset); } } -- cgit v1.2.3-59-g8ed1b From e577ebde9fb161bdc87511763c75dfad291059e2 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:30 -0700 Subject: page-types: make voffset local variables Signed-off-by: Wu Fengguang Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page-types.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'Documentation/vm') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 18e891a329e0..56f33b63d160 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -159,7 +159,6 @@ static unsigned long opt_size[MAX_ADDR_RANGES]; static int nr_vmas; static unsigned long pg_start[MAX_VMAS]; static unsigned long pg_end[MAX_VMAS]; -static unsigned long voffset; #define MAX_BIT_FILTERS 64 static int nr_bit_filters; @@ -328,7 +327,8 @@ static char *page_flag_longname(uint64_t flags) * page list and summary */ -static void show_page_range(unsigned long offset, uint64_t flags) +static void show_page_range(unsigned long voffset, + unsigned long offset, uint64_t flags) { static uint64_t flags0; static unsigned long voff; @@ -354,7 +354,8 @@ static void show_page_range(unsigned long offset, uint64_t flags) count = 1; } -static void show_page(unsigned long offset, uint64_t flags) +static void show_page(unsigned long voffset, + unsigned long offset, uint64_t flags) { if (opt_pid) printf("%lx\t", voffset); @@ -435,7 +436,6 @@ static uint64_t well_known_flags(uint64_t flags) return flags; } - /* * page frame walker */ @@ -467,7 +467,8 @@ static int hash_slot(uint64_t flags) exit(EXIT_FAILURE); } -static void add_page(unsigned long offset, uint64_t flags) +static void add_page(unsigned long voffset, + unsigned long offset, uint64_t flags) { flags = expand_overloaded_flags(flags); @@ -478,16 +479,18 @@ static void add_page(unsigned long offset, uint64_t flags) return; if (opt_list == 1) - show_page_range(offset, flags); + show_page_range(voffset, offset, flags); else if (opt_list == 2) - show_page(offset, flags); + show_page(voffset, offset, flags); nr_pages[hash_slot(flags)]++; total_pages++; } #define KPAGEFLAGS_BATCH (64 << 10) /* 64k pages */ -static void walk_pfn(unsigned long index, unsigned long count) +static void walk_pfn(unsigned long voffset, + unsigned long index, + unsigned long count) { uint64_t buf[KPAGEFLAGS_BATCH]; unsigned long batch; @@ -501,7 +504,7 @@ static void walk_pfn(unsigned long index, unsigned long count) break; for (i = 0; i < pages; i++) - add_page(index + i, buf[i]); + add_page(voffset + i, index + i, buf[i]); index += pages; count -= pages; @@ -525,9 +528,8 @@ static void walk_vma(unsigned long index, unsigned long count) for (i = 0; i < pages; i++) { pfn = pagemap_pfn(buf[i]); - voffset = index + i; if (pfn) - walk_pfn(pfn, 1); + walk_pfn(index + i, pfn, 1); } index += pages; @@ -537,8 +539,9 @@ static void walk_vma(unsigned long index, unsigned long count) static void walk_task(unsigned long index, unsigned long count) { - int i = 0; const unsigned long end = index + count; + unsigned long start; + int i = 0; while (index < end) { @@ -548,11 +551,11 @@ static void walk_task(unsigned long index, unsigned long count) if (pg_start[i] >= end) return; - voffset = max_t(unsigned long, pg_start[i], index); - index = min_t(unsigned long, pg_end[i], end); + start = max_t(unsigned long, pg_start[i], index); + index = min_t(unsigned long, pg_end[i], end); - assert(voffset < index); - walk_vma(voffset, index - voffset); + assert(start < index); + walk_vma(start, index - start); } } @@ -577,7 +580,7 @@ static void walk_addr_ranges(void) for (i = 0; i < nr_addr_ranges; i++) if (!opt_pid) - walk_pfn(opt_offset[i], opt_size[i]); + walk_pfn(0, opt_offset[i], opt_size[i]); else walk_task(opt_offset[i], opt_size[i]); @@ -879,7 +882,7 @@ int main(int argc, char *argv[]) walk_addr_ranges(); if (opt_list == 1) - show_page_range(0, 0); /* drain the buffer */ + show_page_range(0, 0, 0); /* drain the buffer */ if (opt_no_summary) return 0; -- cgit v1.2.3-59-g8ed1b From 48640d69f5f06fe951a4de065a7f374cb9ee6040 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:31 -0700 Subject: page-types: introduce kpageflags_flags() Signed-off-by: Wu Fengguang Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page-types.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'Documentation/vm') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 56f33b63d160..11b9a03f7b10 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -436,6 +436,16 @@ static uint64_t well_known_flags(uint64_t flags) return flags; } +static uint64_t kpageflags_flags(uint64_t flags) +{ + flags = expand_overloaded_flags(flags); + + if (!opt_raw) + flags = well_known_flags(flags); + + return flags; +} + /* * page frame walker */ @@ -470,10 +480,7 @@ static int hash_slot(uint64_t flags) static void add_page(unsigned long voffset, unsigned long offset, uint64_t flags) { - flags = expand_overloaded_flags(flags); - - if (!opt_raw) - flags = well_known_flags(flags); + flags = kpageflags_flags(flags); if (!bit_mask_ok(flags)) return; -- cgit v1.2.3-59-g8ed1b From a54fed9f70a2765f4476e1ce3d691a2f31df258f Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:32 -0700 Subject: page-types: add hwpoison/unpoison feature For hwpoison stress testing. The debugfs mount point is assumed to be /debug/. Signed-off-by: Wu Fengguang Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page-types.c | 73 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) (limited to 'Documentation/vm') diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 11b9a03f7b10..3ec4f2a22585 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -170,6 +170,13 @@ static int page_size; static int pagemap_fd; static int kpageflags_fd; +static int opt_hwpoison; +static int opt_unpoison; + +static char *hwpoison_debug_fs = "/debug/hwpoison"; +static int hwpoison_inject_fd; +static int hwpoison_forget_fd; + #define HASH_SHIFT 13 #define HASH_SIZE (1 << HASH_SHIFT) #define HASH_MASK (HASH_SIZE - 1) @@ -446,6 +453,53 @@ static uint64_t kpageflags_flags(uint64_t flags) return flags; } +/* + * page actions + */ + +static void prepare_hwpoison_fd(void) +{ + char buf[100]; + + if (opt_hwpoison && !hwpoison_inject_fd) { + sprintf(buf, "%s/corrupt-pfn", hwpoison_debug_fs); + hwpoison_inject_fd = checked_open(buf, O_WRONLY); + } + + if (opt_unpoison && !hwpoison_forget_fd) { + sprintf(buf, "%s/renew-pfn", hwpoison_debug_fs); + hwpoison_forget_fd = checked_open(buf, O_WRONLY); + } +} + +static int hwpoison_page(unsigned long offset) +{ + char buf[100]; + int len; + + len = sprintf(buf, "0x%lx\n", offset); + len = write(hwpoison_inject_fd, buf, len); + if (len < 0) { + perror("hwpoison inject"); + return len; + } + return 0; +} + +static int unpoison_page(unsigned long offset) +{ + char buf[100]; + int len; + + len = sprintf(buf, "0x%lx\n", offset); + len = write(hwpoison_forget_fd, buf, len); + if (len < 0) { + perror("hwpoison forget"); + return len; + } + return 0; +} + /* * page frame walker */ @@ -485,6 +539,11 @@ static void add_page(unsigned long voffset, if (!bit_mask_ok(flags)) return; + if (opt_hwpoison) + hwpoison_page(offset); + if (opt_unpoison) + unpoison_page(offset); + if (opt_list == 1) show_page_range(voffset, offset, flags); else if (opt_list == 2) @@ -624,6 +683,8 @@ static void usage(void) " -l|--list Show page details in ranges\n" " -L|--list-each Show page details one by one\n" " -N|--no-summary Don't show summay info\n" +" -X|--hwpoison hwpoison pages\n" +" -x|--unpoison unpoison pages\n" " -h|--help Show this usage message\n" "addr-spec:\n" " N one page at offset N (unit: pages)\n" @@ -833,6 +894,8 @@ static struct option opts[] = { { "list" , 0, NULL, 'l' }, { "list-each" , 0, NULL, 'L' }, { "no-summary", 0, NULL, 'N' }, + { "hwpoison" , 0, NULL, 'X' }, + { "unpoison" , 0, NULL, 'x' }, { "help" , 0, NULL, 'h' }, { NULL , 0, NULL, 0 } }; @@ -844,7 +907,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); while ((c = getopt_long(argc, argv, - "rp:f:a:b:lLNh", opts, NULL)) != -1) { + "rp:f:a:b:lLNXxh", opts, NULL)) != -1) { switch (c) { case 'r': opt_raw = 1; @@ -870,6 +933,14 @@ int main(int argc, char *argv[]) case 'N': opt_no_summary = 1; break; + case 'X': + opt_hwpoison = 1; + prepare_hwpoison_fd(); + break; + case 'x': + opt_unpoison = 1; + prepare_hwpoison_fd(); + break; case 'h': usage(); exit(0); -- cgit v1.2.3-59-g8ed1b