aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/vm
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/vm')
-rw-r--r--Documentation/vm/Makefile2
-rw-r--r--Documentation/vm/slabinfo.c1364
-rw-r--r--Documentation/vm/transhuge.txt298
3 files changed, 299 insertions, 1365 deletions
diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile
index 9dcff328b964..3fa4d0668864 100644
--- a/Documentation/vm/Makefile
+++ b/Documentation/vm/Makefile
@@ -2,7 +2,7 @@
obj- := dummy.o
# List of programs to build
-hostprogs-y := slabinfo page-types hugepage-mmap hugepage-shm map_hugetlb
+hostprogs-y := page-types hugepage-mmap hugepage-shm map_hugetlb
# Tell kbuild to always build the programs
always := $(hostprogs-y)
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c
deleted file mode 100644
index 92e729f4b676..000000000000
--- a/Documentation/vm/slabinfo.c
+++ /dev/null
@@ -1,1364 +0,0 @@
-/*
- * Slabinfo: Tool to get reports about slabs
- *
- * (C) 2007 sgi, Christoph Lameter
- *
- * Compile by:
- *
- * gcc -o slabinfo slabinfo.c
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <strings.h>
-#include <string.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <getopt.h>
-#include <regex.h>
-#include <errno.h>
-
-#define MAX_SLABS 500
-#define MAX_ALIASES 500
-#define MAX_NODES 1024
-
-struct slabinfo {
- char *name;
- int alias;
- int refs;
- int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu;
- int hwcache_align, object_size, objs_per_slab;
- int sanity_checks, slab_size, store_user, trace;
- int order, poison, reclaim_account, red_zone;
- unsigned long partial, objects, slabs, objects_partial, objects_total;
- unsigned long alloc_fastpath, alloc_slowpath;
- unsigned long free_fastpath, free_slowpath;
- unsigned long free_frozen, free_add_partial, free_remove_partial;
- unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill;
- unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
- unsigned long deactivate_to_head, deactivate_to_tail;
- unsigned long deactivate_remote_frees, order_fallback;
- int numa[MAX_NODES];
- int numa_partial[MAX_NODES];
-} slabinfo[MAX_SLABS];
-
-struct aliasinfo {
- char *name;
- char *ref;
- struct slabinfo *slab;
-} aliasinfo[MAX_ALIASES];
-
-int slabs = 0;
-int actual_slabs = 0;
-int aliases = 0;
-int alias_targets = 0;
-int highest_node = 0;
-
-char buffer[4096];
-
-int show_empty = 0;
-int show_report = 0;
-int show_alias = 0;
-int show_slab = 0;
-int skip_zero = 1;
-int show_numa = 0;
-int show_track = 0;
-int show_first_alias = 0;
-int validate = 0;
-int shrink = 0;
-int show_inverted = 0;
-int show_single_ref = 0;
-int show_totals = 0;
-int sort_size = 0;
-int sort_active = 0;
-int set_debug = 0;
-int show_ops = 0;
-int show_activity = 0;
-
-/* Debug options */
-int sanity = 0;
-int redzone = 0;
-int poison = 0;
-int tracking = 0;
-int tracing = 0;
-
-int page_size;
-
-regex_t pattern;
-
-static void fatal(const char *x, ...)
-{
- va_list ap;
-
- va_start(ap, x);
- vfprintf(stderr, x, ap);
- va_end(ap);
- exit(EXIT_FAILURE);
-}
-
-static void usage(void)
-{
- printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n"
- "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
- "-a|--aliases Show aliases\n"
- "-A|--activity Most active slabs first\n"
- "-d<options>|--debug=<options> Set/Clear Debug options\n"
- "-D|--display-active Switch line format to activity\n"
- "-e|--empty Show empty slabs\n"
- "-f|--first-alias Show first alias\n"
- "-h|--help Show usage information\n"
- "-i|--inverted Inverted list\n"
- "-l|--slabs Show slabs\n"
- "-n|--numa Show NUMA information\n"
- "-o|--ops Show kmem_cache_ops\n"
- "-s|--shrink Shrink slabs\n"
- "-r|--report Detailed report on single slabs\n"
- "-S|--Size Sort by size\n"
- "-t|--tracking Show alloc/free information\n"
- "-T|--Totals Show summary information\n"
- "-v|--validate Validate slabs\n"
- "-z|--zero Include empty slabs\n"
- "-1|--1ref Single reference\n"
- "\nValid debug options (FZPUT may be combined)\n"
- "a / A Switch on all debug options (=FZUP)\n"
- "- Switch off all debug options\n"
- "f / F Sanity Checks (SLAB_DEBUG_FREE)\n"
- "z / Z Redzoning\n"
- "p / P Poisoning\n"
- "u / U Tracking\n"
- "t / T Tracing\n"
- );
-}
-
-static unsigned long read_obj(const char *name)
-{
- FILE *f = fopen(name, "r");
-
- if (!f)
- buffer[0] = 0;
- else {
- if (!fgets(buffer, sizeof(buffer), f))
- buffer[0] = 0;
- fclose(f);
- if (buffer[strlen(buffer)] == '\n')
- buffer[strlen(buffer)] = 0;
- }
- return strlen(buffer);
-}
-
-
-/*
- * Get the contents of an attribute
- */
-static unsigned long get_obj(const char *name)
-{
- if (!read_obj(name))
- return 0;
-
- return atol(buffer);
-}
-
-static unsigned long get_obj_and_str(const char *name, char **x)
-{
- unsigned long result = 0;
- char *p;
-
- *x = NULL;
-
- if (!read_obj(name)) {
- x = NULL;
- return 0;
- }
- result = strtoul(buffer, &p, 10);
- while (*p == ' ')
- p++;
- if (*p)
- *x = strdup(p);
- return result;
-}
-
-static void set_obj(struct slabinfo *s, const char *name, int n)
-{
- char x[100];
- FILE *f;
-
- snprintf(x, 100, "%s/%s", s->name, name);
- f = fopen(x, "w");
- if (!f)
- fatal("Cannot write to %s\n", x);
-
- fprintf(f, "%d\n", n);
- fclose(f);
-}
-
-static unsigned long read_slab_obj(struct slabinfo *s, const char *name)
-{
- char x[100];
- FILE *f;
- size_t l;
-
- snprintf(x, 100, "%s/%s", s->name, name);
- f = fopen(x, "r");
- if (!f) {
- buffer[0] = 0;
- l = 0;
- } else {
- l = fread(buffer, 1, sizeof(buffer), f);
- buffer[l] = 0;
- fclose(f);
- }
- return l;
-}
-
-
-/*
- * Put a size string together
- */
-static int store_size(char *buffer, unsigned long value)
-{
- unsigned long divisor = 1;
- char trailer = 0;
- int n;
-
- if (value > 1000000000UL) {
- divisor = 100000000UL;
- trailer = 'G';
- } else if (value > 1000000UL) {
- divisor = 100000UL;
- trailer = 'M';
- } else if (value > 1000UL) {
- divisor = 100;
- trailer = 'K';
- }
-
- value /= divisor;
- n = sprintf(buffer, "%ld",value);
- if (trailer) {
- buffer[n] = trailer;
- n++;
- buffer[n] = 0;
- }
- if (divisor != 1) {
- memmove(buffer + n - 2, buffer + n - 3, 4);
- buffer[n-2] = '.';
- n++;
- }
- return n;
-}
-
-static void decode_numa_list(int *numa, char *t)
-{
- int node;
- int nr;
-
- memset(numa, 0, MAX_NODES * sizeof(int));
-
- if (!t)
- return;
-
- while (*t == 'N') {
- t++;
- node = strtoul(t, &t, 10);
- if (*t == '=') {
- t++;
- nr = strtoul(t, &t, 10);
- numa[node] = nr;
- if (node > highest_node)
- highest_node = node;
- }
- while (*t == ' ')
- t++;
- }
-}
-
-static void slab_validate(struct slabinfo *s)
-{
- if (strcmp(s->name, "*") == 0)
- return;
-
- set_obj(s, "validate", 1);
-}
-
-static void slab_shrink(struct slabinfo *s)
-{
- if (strcmp(s->name, "*") == 0)
- return;
-
- set_obj(s, "shrink", 1);
-}
-
-int line = 0;
-
-static void first_line(void)
-{
- if (show_activity)
- printf("Name Objects Alloc Free %%Fast Fallb O\n");
- else
- printf("Name Objects Objsize Space "
- "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
-}
-
-/*
- * Find the shortest alias of a slab
- */
-static struct aliasinfo *find_one_alias(struct slabinfo *find)
-{
- struct aliasinfo *a;
- struct aliasinfo *best = NULL;
-
- for(a = aliasinfo;a < aliasinfo + aliases; a++) {
- if (a->slab == find &&
- (!best || strlen(best->name) < strlen(a->name))) {
- best = a;
- if (strncmp(a->name,"kmall", 5) == 0)
- return best;
- }
- }
- return best;
-}
-
-static unsigned long slab_size(struct slabinfo *s)
-{
- return s->slabs * (page_size << s->order);
-}
-
-static unsigned long slab_activity(struct slabinfo *s)
-{
- return s->alloc_fastpath + s->free_fastpath +
- s->alloc_slowpath + s->free_slowpath;
-}
-
-static void slab_numa(struct slabinfo *s, int mode)
-{
- int node;
-
- if (strcmp(s->name, "*") == 0)
- return;
-
- if (!highest_node) {
- printf("\n%s: No NUMA information available.\n", s->name);
- return;
- }
-
- if (skip_zero && !s->slabs)
- return;
-
- if (!line) {
- printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
- for(node = 0; node <= highest_node; node++)
- printf(" %4d", node);
- printf("\n----------------------");
- for(node = 0; node <= highest_node; node++)
- printf("-----");
- printf("\n");
- }
- printf("%-21s ", mode ? "All slabs" : s->name);
- for(node = 0; node <= highest_node; node++) {
- char b[20];
-
- store_size(b, s->numa[node]);
- printf(" %4s", b);
- }
- printf("\n");
- if (mode) {
- printf("%-21s ", "Partial slabs");
- for(node = 0; node <= highest_node; node++) {
- char b[20];
-
- store_size(b, s->numa_partial[node]);
- printf(" %4s", b);
- }
- printf("\n");
- }
- line++;
-}
-
-static void show_tracking(struct slabinfo *s)
-{
- printf("\n%s: Kernel object allocation\n", s->name);
- printf("-----------------------------------------------------------------------\n");
- if (read_slab_obj(s, "alloc_calls"))
- printf(buffer);
- else
- printf("No Data\n");
-
- printf("\n%s: Kernel object freeing\n", s->name);
- printf("------------------------------------------------------------------------\n");
- if (read_slab_obj(s, "free_calls"))
- printf(buffer);
- else
- printf("No Data\n");
-
-}
-
-static void ops(struct slabinfo *s)
-{
- if (strcmp(s->name, "*") == 0)
- return;
-
- if (read_slab_obj(s, "ops")) {
- printf("\n%s: kmem_cache operations\n", s->name);
- printf("--------------------------------------------\n");
- printf(buffer);
- } else
- printf("\n%s has no kmem_cache operations\n", s->name);
-}
-
-static const char *onoff(int x)
-{
- if (x)
- return "On ";
- return "Off";
-}
-
-static void slab_stats(struct slabinfo *s)
-{
- unsigned long total_alloc;
- unsigned long total_free;
- unsigned long total;
-
- if (!s->alloc_slab)
- return;
-
- total_alloc = s->alloc_fastpath + s->alloc_slowpath;
- total_free = s->free_fastpath + s->free_slowpath;
-
- if (!total_alloc)
- return;
-
- printf("\n");
- printf("Slab Perf Counter Alloc Free %%Al %%Fr\n");
- printf("--------------------------------------------------\n");
- printf("Fastpath %8lu %8lu %3lu %3lu\n",
- s->alloc_fastpath, s->free_fastpath,
- s->alloc_fastpath * 100 / total_alloc,
- s->free_fastpath * 100 / total_free);
- printf("Slowpath %8lu %8lu %3lu %3lu\n",
- total_alloc - s->alloc_fastpath, s->free_slowpath,
- (total_alloc - s->alloc_fastpath) * 100 / total_alloc,
- s->free_slowpath * 100 / total_free);
- printf("Page Alloc %8lu %8lu %3lu %3lu\n",
- s->alloc_slab, s->free_slab,
- s->alloc_slab * 100 / total_alloc,
- s->free_slab * 100 / total_free);
- printf("Add partial %8lu %8lu %3lu %3lu\n",
- s->deactivate_to_head + s->deactivate_to_tail,
- s->free_add_partial,
- (s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc,
- s->free_add_partial * 100 / total_free);
- printf("Remove partial %8lu %8lu %3lu %3lu\n",
- s->alloc_from_partial, s->free_remove_partial,
- s->alloc_from_partial * 100 / total_alloc,
- s->free_remove_partial * 100 / total_free);
-
- printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n",
- s->deactivate_remote_frees, s->free_frozen,
- s->deactivate_remote_frees * 100 / total_alloc,
- s->free_frozen * 100 / total_free);
-
- printf("Total %8lu %8lu\n\n", total_alloc, total_free);
-
- if (s->cpuslab_flush)
- printf("Flushes %8lu\n", s->cpuslab_flush);
-
- if (s->alloc_refill)
- printf("Refill %8lu\n", s->alloc_refill);
-
- total = s->deactivate_full + s->deactivate_empty +
- s->deactivate_to_head + s->deactivate_to_tail;
-
- if (total)
- printf("Deactivate Full=%lu(%lu%%) Empty=%lu(%lu%%) "
- "ToHead=%lu(%lu%%) ToTail=%lu(%lu%%)\n",
- s->deactivate_full, (s->deactivate_full * 100) / total,
- s->deactivate_empty, (s->deactivate_empty * 100) / total,
- s->deactivate_to_head, (s->deactivate_to_head * 100) / total,
- s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total);
-}
-
-static void report(struct slabinfo *s)
-{
- if (strcmp(s->name, "*") == 0)
- return;
-
- printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %lu\n",
- s->name, s->aliases, s->order, s->objects);
- if (s->hwcache_align)
- printf("** Hardware cacheline aligned\n");
- if (s->cache_dma)
- printf("** Memory is allocated in a special DMA zone\n");
- if (s->destroy_by_rcu)
- printf("** Slabs are destroyed via RCU\n");
- if (s->reclaim_account)
- printf("** Reclaim accounting active\n");
-
- printf("\nSizes (bytes) Slabs Debug Memory\n");
- printf("------------------------------------------------------------------------\n");
- printf("Object : %7d Total : %7ld Sanity Checks : %s Total: %7ld\n",
- s->object_size, s->slabs, onoff(s->sanity_checks),
- s->slabs * (page_size << s->order));
- printf("SlabObj: %7d Full : %7ld Redzoning : %s Used : %7ld\n",
- s->slab_size, s->slabs - s->partial - s->cpu_slabs,
- onoff(s->red_zone), s->objects * s->object_size);
- printf("SlabSiz: %7d Partial: %7ld Poisoning : %s Loss : %7ld\n",
- page_size << s->order, s->partial, onoff(s->poison),
- s->slabs * (page_size << s->order) - s->objects * s->object_size);
- printf("Loss : %7d CpuSlab: %7d Tracking : %s Lalig: %7ld\n",
- s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user),
- (s->slab_size - s->object_size) * s->objects);
- printf("Align : %7d Objects: %7d Tracing : %s Lpadd: %7ld\n",
- s->align, s->objs_per_slab, onoff(s->trace),
- ((page_size << s->order) - s->objs_per_slab * s->slab_size) *
- s->slabs);
-
- ops(s);
- show_tracking(s);
- slab_numa(s, 1);
- slab_stats(s);
-}
-
-static void slabcache(struct slabinfo *s)
-{
- char size_str[20];
- char dist_str[40];
- char flags[20];
- char *p = flags;
-
- if (strcmp(s->name, "*") == 0)
- return;
-
- if (actual_slabs == 1) {
- report(s);
- return;
- }
-
- if (skip_zero && !show_empty && !s->slabs)
- return;
-
- if (show_empty && s->slabs)
- return;
-
- store_size(size_str, slab_size(s));
- snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
- s->partial, s->cpu_slabs);
-
- if (!line++)
- first_line();
-
- if (s->aliases)
- *p++ = '*';
- if (s->cache_dma)
- *p++ = 'd';
- if (s->hwcache_align)
- *p++ = 'A';
- if (s->poison)
- *p++ = 'P';
- if (s->reclaim_account)
- *p++ = 'a';
- if (s->red_zone)
- *p++ = 'Z';
- if (s->sanity_checks)
- *p++ = 'F';
- if (s->store_user)
- *p++ = 'U';
- if (s->trace)
- *p++ = 'T';
-
- *p = 0;
- if (show_activity) {
- unsigned long total_alloc;
- unsigned long total_free;
-
- total_alloc = s->alloc_fastpath + s->alloc_slowpath;
- total_free = s->free_fastpath + s->free_slowpath;
-
- printf("%-21s %8ld %10ld %10ld %3ld %3ld %5ld %1d\n",
- s->name, s->objects,
- total_alloc, total_free,
- total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0,
- total_free ? (s->free_fastpath * 100 / total_free) : 0,
- s->order_fallback, s->order);
- }
- else
- printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
- s->name, s->objects, s->object_size, size_str, dist_str,
- s->objs_per_slab, s->order,
- s->slabs ? (s->partial * 100) / s->slabs : 100,
- s->slabs ? (s->objects * s->object_size * 100) /
- (s->slabs * (page_size << s->order)) : 100,
- flags);
-}
-
-/*
- * Analyze debug options. Return false if something is amiss.
- */
-static int debug_opt_scan(char *opt)
-{
- if (!opt || !opt[0] || strcmp(opt, "-") == 0)
- return 1;
-
- if (strcasecmp(opt, "a") == 0) {
- sanity = 1;
- poison = 1;
- redzone = 1;
- tracking = 1;
- return 1;
- }
-
- for ( ; *opt; opt++)
- switch (*opt) {
- case 'F' : case 'f':
- if (sanity)
- return 0;
- sanity = 1;
- break;
- case 'P' : case 'p':
- if (poison)
- return 0;
- poison = 1;
- break;
-
- case 'Z' : case 'z':
- if (redzone)
- return 0;
- redzone = 1;
- break;
-
- case 'U' : case 'u':
- if (tracking)
- return 0;
- tracking = 1;
- break;
-
- case 'T' : case 't':
- if (tracing)
- return 0;
- tracing = 1;
- break;
- default:
- return 0;
- }
- return 1;
-}
-
-static int slab_empty(struct slabinfo *s)
-{
- if (s->objects > 0)
- return 0;
-
- /*
- * We may still have slabs even if there are no objects. Shrinking will
- * remove them.
- */
- if (s->slabs != 0)
- set_obj(s, "shrink", 1);
-
- return 1;
-}
-
-static void slab_debug(struct slabinfo *s)
-{
- if (strcmp(s->name, "*") == 0)
- return;
-
- if (sanity && !s->sanity_checks) {
- set_obj(s, "sanity", 1);
- }
- if (!sanity && s->sanity_checks) {
- if (slab_empty(s))
- set_obj(s, "sanity", 0);
- else
- fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name);
- }
- if (redzone && !s->red_zone) {
- if (slab_empty(s))
- set_obj(s, "red_zone", 1);
- else
- fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name);
- }
- if (!redzone && s->red_zone) {
- if (slab_empty(s))
- set_obj(s, "red_zone", 0);
- else
- fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name);
- }
- if (poison && !s->poison) {
- if (slab_empty(s))
- set_obj(s, "poison", 1);
- else
- fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name);
- }
- if (!poison && s->poison) {
- if (slab_empty(s))
- set_obj(s, "poison", 0);
- else
- fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name);
- }
- if (tracking && !s->store_user) {
- if (slab_empty(s))
- set_obj(s, "store_user", 1);
- else
- fprintf(stderr, "%s not empty cannot enable tracking\n", s->name);
- }
- if (!tracking && s->store_user) {
- if (slab_empty(s))
- set_obj(s, "store_user", 0);
- else
- fprintf(stderr, "%s not empty cannot disable tracking\n", s->name);
- }
- if (tracing && !s->trace) {
- if (slabs == 1)
- set_obj(s, "trace", 1);
- else
- fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name);
- }
- if (!tracing && s->trace)
- set_obj(s, "trace", 1);
-}
-
-static void totals(void)
-{
- struct slabinfo *s;
-
- int used_slabs = 0;
- char b1[20], b2[20], b3[20], b4[20];
- unsigned long long max = 1ULL << 63;
-
- /* Object size */
- unsigned long long min_objsize = max, max_objsize = 0, avg_objsize;
-
- /* Number of partial slabs in a slabcache */
- unsigned long long min_partial = max, max_partial = 0,
- avg_partial, total_partial = 0;
-
- /* Number of slabs in a slab cache */
- unsigned long long min_slabs = max, max_slabs = 0,
- avg_slabs, total_slabs = 0;
-
- /* Size of the whole slab */
- unsigned long long min_size = max, max_size = 0,
- avg_size, total_size = 0;
-
- /* Bytes used for object storage in a slab */
- unsigned long long min_used = max, max_used = 0,
- avg_used, total_used = 0;
-
- /* Waste: Bytes used for alignment and padding */
- unsigned long long min_waste = max, max_waste = 0,
- avg_waste, total_waste = 0;
- /* Number of objects in a slab */
- unsigned long long min_objects = max, max_objects = 0,
- avg_objects, total_objects = 0;
- /* Waste per object */
- unsigned long long min_objwaste = max,
- max_objwaste = 0, avg_objwaste,
- total_objwaste = 0;
-
- /* Memory per object */
- unsigned long long min_memobj = max,
- max_memobj = 0, avg_memobj,
- total_objsize = 0;
-
- /* Percentage of partial slabs per slab */
- unsigned long min_ppart = 100, max_ppart = 0,
- avg_ppart, total_ppart = 0;
-
- /* Number of objects in partial slabs */
- unsigned long min_partobj = max, max_partobj = 0,
- avg_partobj, total_partobj = 0;
-
- /* Percentage of partial objects of all objects in a slab */
- unsigned long min_ppartobj = 100, max_ppartobj = 0,
- avg_ppartobj, total_ppartobj = 0;
-
-
- for (s = slabinfo; s < slabinfo + slabs; s++) {
- unsigned long long size;
- unsigned long used;
- unsigned long long wasted;
- unsigned long long objwaste;
- unsigned long percentage_partial_slabs;
- unsigned long percentage_partial_objs;
-
- if (!s->slabs || !s->objects)
- continue;
-
- used_slabs++;
-
- size = slab_size(s);
- used = s->objects * s->object_size;
- wasted = size - used;
- objwaste = s->slab_size - s->object_size;
-
- percentage_partial_slabs = s->partial * 100 / s->slabs;
- if (percentage_partial_slabs > 100)
- percentage_partial_slabs = 100;
-
- percentage_partial_objs = s->objects_partial * 100
- / s->objects;
-
- if (percentage_partial_objs > 100)
- percentage_partial_objs = 100;
-
- if (s->object_size < min_objsize)
- min_objsize = s->object_size;
- if (s->partial < min_partial)
- min_partial = s->partial;
- if (s->slabs < min_slabs)
- min_slabs = s->slabs;
- if (size < min_size)
- min_size = size;
- if (wasted < min_waste)
- min_waste = wasted;
- if (objwaste < min_objwaste)
- min_objwaste = objwaste;
- if (s->objects < min_objects)
- min_objects = s->objects;
- if (used < min_used)
- min_used = used;
- if (s->objects_partial < min_partobj)
- min_partobj = s->objects_partial;
- if (percentage_partial_slabs < min_ppart)
- min_ppart = percentage_partial_slabs;
- if (percentage_partial_objs < min_ppartobj)
- min_ppartobj = percentage_partial_objs;
- if (s->slab_size < min_memobj)
- min_memobj = s->slab_size;
-
- if (s->object_size > max_objsize)
- max_objsize = s->object_size;
- if (s->partial > max_partial)
- max_partial = s->partial;
- if (s->slabs > max_slabs)
- max_slabs = s->slabs;
- if (size > max_size)
- max_size = size;
- if (wasted > max_waste)
- max_waste = wasted;
- if (objwaste > max_objwaste)
- max_objwaste = objwaste;
- if (s->objects > max_objects)
- max_objects = s->objects;
- if (used > max_used)
- max_used = used;
- if (s->objects_partial > max_partobj)
- max_partobj = s->objects_partial;
- if (percentage_partial_slabs > max_ppart)
- max_ppart = percentage_partial_slabs;
- if (percentage_partial_objs > max_ppartobj)
- max_ppartobj = percentage_partial_objs;
- if (s->slab_size > max_memobj)
- max_memobj = s->slab_size;
-
- total_partial += s->partial;
- total_slabs += s->slabs;
- total_size += size;
- total_waste += wasted;
-
- total_objects += s->objects;
- total_used += used;
- total_partobj += s->objects_partial;
- total_ppart += percentage_partial_slabs;
- total_ppartobj += percentage_partial_objs;
-
- total_objwaste += s->objects * objwaste;
- total_objsize += s->objects * s->slab_size;
- }
-
- if (!total_objects) {
- printf("No objects\n");
- return;
- }
- if (!used_slabs) {
- printf("No slabs\n");
- return;
- }
-
- /* Per slab averages */
- avg_partial = total_partial / used_slabs;
- avg_slabs = total_slabs / used_slabs;
- avg_size = total_size / used_slabs;
- avg_waste = total_waste / used_slabs;
-
- avg_objects = total_objects / used_slabs;
- avg_used = total_used / used_slabs;
- avg_partobj = total_partobj / used_slabs;
- avg_ppart = total_ppart / used_slabs;
- avg_ppartobj = total_ppartobj / used_slabs;
-
- /* Per object object sizes */
- avg_objsize = total_used / total_objects;
- avg_objwaste = total_objwaste / total_objects;
- avg_partobj = total_partobj * 100 / total_objects;
- avg_memobj = total_objsize / total_objects;
-
- printf("Slabcache Totals\n");
- printf("----------------\n");
- printf("Slabcaches : %3d Aliases : %3d->%-3d Active: %3d\n",
- slabs, aliases, alias_targets, used_slabs);
-
- store_size(b1, total_size);store_size(b2, total_waste);
- store_size(b3, total_waste * 100 / total_used);
- printf("Memory used: %6s # Loss : %6s MRatio:%6s%%\n", b1, b2, b3);
-
- store_size(b1, total_objects);store_size(b2, total_partobj);
- store_size(b3, total_partobj * 100 / total_objects);
- printf("# Objects : %6s # PartObj: %6s ORatio:%6s%%\n", b1, b2, b3);
-
- printf("\n");
- printf("Per Cache Average Min Max Total\n");
- printf("---------------------------------------------------------\n");
-
- store_size(b1, avg_objects);store_size(b2, min_objects);
- store_size(b3, max_objects);store_size(b4, total_objects);
- printf("#Objects %10s %10s %10s %10s\n",
- b1, b2, b3, b4);
-
- store_size(b1, avg_slabs);store_size(b2, min_slabs);
- store_size(b3, max_slabs);store_size(b4, total_slabs);
- printf("#Slabs %10s %10s %10s %10s\n",
- b1, b2, b3, b4);
-
- store_size(b1, avg_partial);store_size(b2, min_partial);
- store_size(b3, max_partial);store_size(b4, total_partial);
- printf("#PartSlab %10s %10s %10s %10s\n",
- b1, b2, b3, b4);
- store_size(b1, avg_ppart);store_size(b2, min_ppart);
- store_size(b3, max_ppart);
- store_size(b4, total_partial * 100 / total_slabs);
- printf("%%PartSlab%10s%% %10s%% %10s%% %10s%%\n",
- b1, b2, b3, b4);
-
- store_size(b1, avg_partobj);store_size(b2, min_partobj);
- store_size(b3, max_partobj);
- store_size(b4, total_partobj);
- printf("PartObjs %10s %10s %10s %10s\n",
- b1, b2, b3, b4);
-
- store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj);
- store_size(b3, max_ppartobj);
- store_size(b4, total_partobj * 100 / total_objects);
- printf("%% PartObj%10s%% %10s%% %10s%% %10s%%\n",
- b1, b2, b3, b4);
-
- store_size(b1, avg_size);store_size(b2, min_size);
- store_size(b3, max_size);store_size(b4, total_size);
- printf("Memory %10s %10s %10s %10s\n",
- b1, b2, b3, b4);
-
- store_size(b1, avg_used);store_size(b2, min_used);
- store_size(b3, max_used);store_size(b4, total_used);
- printf("Used %10s %10s %10s %10s\n",
- b1, b2, b3, b4);
-
- store_size(b1, avg_waste);store_size(b2, min_waste);
- store_size(b3, max_waste);store_size(b4, total_waste);
- printf("Loss %10s %10s %10s %10s\n",
- b1, b2, b3, b4);
-
- printf("\n");
- printf("Per Object Average Min Max\n");
- printf("---------------------------------------------\n");
-
- store_size(b1, avg_memobj);store_size(b2, min_memobj);
- store_size(b3, max_memobj);
- printf("Memory %10s %10s %10s\n",
- b1, b2, b3);
- store_size(b1, avg_objsize);store_size(b2, min_objsize);
- store_size(b3, max_objsize);
- printf("User %10s %10s %10s\n",
- b1, b2, b3);
-
- store_size(b1, avg_objwaste);store_size(b2, min_objwaste);
- store_size(b3, max_objwaste);
- printf("Loss %10s %10s %10s\n",
- b1, b2, b3);
-}
-
-static void sort_slabs(void)
-{
- struct slabinfo *s1,*s2;
-
- for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) {
- for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) {
- int result;
-
- if (sort_size)
- result = slab_size(s1) < slab_size(s2);
- else if (sort_active)
- result = slab_activity(s1) < slab_activity(s2);
- else
- result = strcasecmp(s1->name, s2->name);
-
- if (show_inverted)
- result = -result;
-
- if (result > 0) {
- struct slabinfo t;
-
- memcpy(&t, s1, sizeof(struct slabinfo));
- memcpy(s1, s2, sizeof(struct slabinfo));
- memcpy(s2, &t, sizeof(struct slabinfo));
- }
- }
- }
-}
-
-static void sort_aliases(void)
-{
- struct aliasinfo *a1,*a2;
-
- for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) {
- for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) {
- char *n1, *n2;
-
- n1 = a1->name;
- n2 = a2->name;
- if (show_alias && !show_inverted) {
- n1 = a1->ref;
- n2 = a2->ref;
- }
- if (strcasecmp(n1, n2) > 0) {
- struct aliasinfo t;
-
- memcpy(&t, a1, sizeof(struct aliasinfo));
- memcpy(a1, a2, sizeof(struct aliasinfo));
- memcpy(a2, &t, sizeof(struct aliasinfo));
- }
- }
- }
-}
-
-static void link_slabs(void)
-{
- struct aliasinfo *a;
- struct slabinfo *s;
-
- for (a = aliasinfo; a < aliasinfo + aliases; a++) {
-
- for (s = slabinfo; s < slabinfo + slabs; s++)
- if (strcmp(a->ref, s->name) == 0) {
- a->slab = s;
- s->refs++;
- break;
- }
- if (s == slabinfo + slabs)
- fatal("Unresolved alias %s\n", a->ref);
- }
-}
-
-static void alias(void)
-{
- struct aliasinfo *a;
- char *active = NULL;
-
- sort_aliases();
- link_slabs();
-
- for(a = aliasinfo; a < aliasinfo + aliases; a++) {
-
- if (!show_single_ref && a->slab->refs == 1)
- continue;
-
- if (!show_inverted) {
- if (active) {
- if (strcmp(a->slab->name, active) == 0) {
- printf(" %s", a->name);
- continue;
- }
- }
- printf("\n%-12s <- %s", a->slab->name, a->name);
- active = a->slab->name;
- }
- else
- printf("%-20s -> %s\n", a->name, a->slab->name);
- }
- if (active)
- printf("\n");
-}
-
-
-static void rename_slabs(void)
-{
- struct slabinfo *s;
- struct aliasinfo *a;
-
- for (s = slabinfo; s < slabinfo + slabs; s++) {
- if (*s->name != ':')
- continue;
-
- if (s->refs > 1 && !show_first_alias)
- continue;
-
- a = find_one_alias(s);
-
- if (a)
- s->name = a->name;
- else {
- s->name = "*";
- actual_slabs--;
- }
- }
-}
-
-static int slab_mismatch(char *slab)
-{
- return regexec(&pattern, slab, 0, NULL, 0);
-}
-
-static void read_slab_dir(void)
-{
- DIR *dir;
- struct dirent *de;
- struct slabinfo *slab = slabinfo;
- struct aliasinfo *alias = aliasinfo;
- char *p;
- char *t;
- int count;
-
- if (chdir("/sys/kernel/slab") && chdir("/sys/slab"))
- fatal("SYSFS support for SLUB not active\n");
-
- dir = opendir(".");
- while ((de = readdir(dir))) {
- if (de->d_name[0] == '.' ||
- (de->d_name[0] != ':' && slab_mismatch(de->d_name)))
- continue;
- switch (de->d_type) {
- case DT_LNK:
- alias->name = strdup(de->d_name);
- count = readlink(de->d_name, buffer, sizeof(buffer));
-
- if (count < 0)
- fatal("Cannot read symlink %s\n", de->d_name);
-
- buffer[count] = 0;
- p = buffer + count;
- while (p > buffer && p[-1] != '/')
- p--;
- alias->ref = strdup(p);
- alias++;
- break;
- case DT_DIR:
- if (chdir(de->d_name))
- fatal("Unable to access slab %s\n", slab->name);
- slab->name = strdup(de->d_name);
- slab->alias = 0;
- slab->refs = 0;
- slab->aliases = get_obj("aliases");
- slab->align = get_obj("align");
- slab->cache_dma = get_obj("cache_dma");
- slab->cpu_slabs = get_obj("cpu_slabs");
- slab->destroy_by_rcu = get_obj("destroy_by_rcu");
- slab->hwcache_align = get_obj("hwcache_align");
- slab->object_size = get_obj("object_size");
- slab->objects = get_obj("objects");
- slab->objects_partial = get_obj("objects_partial");
- slab->objects_total = get_obj("objects_total");
- slab->objs_per_slab = get_obj("objs_per_slab");
- slab->order = get_obj("order");
- slab->partial = get_obj("partial");
- slab->partial = get_obj_and_str("partial", &t);
- decode_numa_list(slab->numa_partial, t);
- free(t);
- slab->poison = get_obj("poison");
- slab->reclaim_account = get_obj("reclaim_account");
- slab->red_zone = get_obj("red_zone");
- slab->sanity_checks = get_obj("sanity_checks");
- slab->slab_size = get_obj("slab_size");
- slab->slabs = get_obj_and_str("slabs", &t);
- decode_numa_list(slab->numa, t);
- free(t);
- slab->store_user = get_obj("store_user");
- slab->trace = get_obj("trace");
- slab->alloc_fastpath = get_obj("alloc_fastpath");
- slab->alloc_slowpath = get_obj("alloc_slowpath");
- slab->free_fastpath = get_obj("free_fastpath");
- slab->free_slowpath = get_obj("free_slowpath");
- slab->free_frozen= get_obj("free_frozen");
- slab->free_add_partial = get_obj("free_add_partial");
- slab->free_remove_partial = get_obj("free_remove_partial");
- slab->alloc_from_partial = get_obj("alloc_from_partial");
- slab->alloc_slab = get_obj("alloc_slab");
- slab->alloc_refill = get_obj("alloc_refill");
- slab->free_slab = get_obj("free_slab");
- slab->cpuslab_flush = get_obj("cpuslab_flush");
- slab->deactivate_full = get_obj("deactivate_full");
- slab->deactivate_empty = get_obj("deactivate_empty");
- slab->deactivate_to_head = get_obj("deactivate_to_head");
- slab->deactivate_to_tail = get_obj("deactivate_to_tail");
- slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
- slab->order_fallback = get_obj("order_fallback");
- chdir("..");
- if (slab->name[0] == ':')
- alias_targets++;
- slab++;
- break;
- default :
- fatal("Unknown file type %lx\n", de->d_type);
- }
- }
- closedir(dir);
- slabs = slab - slabinfo;
- actual_slabs = slabs;
- aliases = alias - aliasinfo;
- if (slabs > MAX_SLABS)
- fatal("Too many slabs\n");
- if (aliases > MAX_ALIASES)
- fatal("Too many aliases\n");
-}
-
-static void output_slabs(void)
-{
- struct slabinfo *slab;
-
- for (slab = slabinfo; slab < slabinfo + slabs; slab++) {
-
- if (slab->alias)
- continue;
-
-
- if (show_numa)
- slab_numa(slab, 0);
- else if (show_track)
- show_tracking(slab);
- else if (validate)
- slab_validate(slab);
- else if (shrink)
- slab_shrink(slab);
- else if (set_debug)
- slab_debug(slab);
- else if (show_ops)
- ops(slab);
- else if (show_slab)
- slabcache(slab);
- else if (show_report)
- report(slab);
- }
-}
-
-struct option opts[] = {
- { "aliases", 0, NULL, 'a' },
- { "activity", 0, NULL, 'A' },
- { "debug", 2, NULL, 'd' },
- { "display-activity", 0, NULL, 'D' },
- { "empty", 0, NULL, 'e' },
- { "first-alias", 0, NULL, 'f' },
- { "help", 0, NULL, 'h' },
- { "inverted", 0, NULL, 'i'},
- { "numa", 0, NULL, 'n' },
- { "ops", 0, NULL, 'o' },
- { "report", 0, NULL, 'r' },
- { "shrink", 0, NULL, 's' },
- { "slabs", 0, NULL, 'l' },
- { "track", 0, NULL, 't'},
- { "validate", 0, NULL, 'v' },
- { "zero", 0, NULL, 'z' },
- { "1ref", 0, NULL, '1'},
- { NULL, 0, NULL, 0 }
-};
-
-int main(int argc, char *argv[])
-{
- int c;
- int err;
- char *pattern_source;
-
- page_size = getpagesize();
-
- while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS",
- opts, NULL)) != -1)
- switch (c) {
- case '1':
- show_single_ref = 1;
- break;
- case 'a':
- show_alias = 1;
- break;
- case 'A':
- sort_active = 1;
- break;
- case 'd':
- set_debug = 1;
- if (!debug_opt_scan(optarg))
- fatal("Invalid debug option '%s'\n", optarg);
- break;
- case 'D':
- show_activity = 1;
- break;
- case 'e':
- show_empty = 1;
- break;
- case 'f':
- show_first_alias = 1;
- break;
- case 'h':
- usage();
- return 0;
- case 'i':
- show_inverted = 1;
- break;
- case 'n':
- show_numa = 1;
- break;
- case 'o':
- show_ops = 1;
- break;
- case 'r':
- show_report = 1;
- break;
- case 's':
- shrink = 1;
- break;
- case 'l':
- show_slab = 1;
- break;
- case 't':
- show_track = 1;
- break;
- case 'v':
- validate = 1;
- break;
- case 'z':
- skip_zero = 0;
- break;
- case 'T':
- show_totals = 1;
- break;
- case 'S':
- sort_size = 1;
- break;
-
- default:
- fatal("%s: Invalid option '%c'\n", argv[0], optopt);
-
- }
-
- if (!show_slab && !show_alias && !show_track && !show_report
- && !validate && !shrink && !set_debug && !show_ops)
- show_slab = 1;
-
- if (argc > optind)
- pattern_source = argv[optind];
- else
- pattern_source = ".*";
-
- err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB);
- if (err)
- fatal("%s: Invalid pattern '%s' code %d\n",
- argv[0], pattern_source, err);
- read_slab_dir();
- if (show_alias)
- alias();
- else
- if (show_totals)
- totals();
- else {
- link_slabs();
- rename_slabs();
- sort_slabs();
- output_slabs();
- }
- return 0;
-}
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
new file mode 100644
index 000000000000..0924aaca3302
--- /dev/null
+++ b/Documentation/vm/transhuge.txt
@@ -0,0 +1,298 @@
+= Transparent Hugepage Support =
+
+== Objective ==
+
+Performance critical computing applications dealing with large memory
+working sets are already running on top of libhugetlbfs and in turn
+hugetlbfs. Transparent Hugepage Support is an alternative means of
+using huge pages for the backing of virtual memory with huge pages
+that supports the automatic promotion and demotion of page sizes and
+without the shortcomings of hugetlbfs.
+
+Currently it only works for anonymous memory mappings but in the
+future it can expand over the pagecache layer starting with tmpfs.
+
+The reason applications are running faster is because of two
+factors. The first factor is almost completely irrelevant and it's not
+of significant interest because it'll also have the downside of
+requiring larger clear-page copy-page in page faults which is a
+potentially negative effect. The first factor consists in taking a
+single page fault for each 2M virtual region touched by userland (so
+reducing the enter/exit kernel frequency by a 512 times factor). This
+only matters the first time the memory is accessed for the lifetime of
+a memory mapping. The second long lasting and much more important
+factor will affect all subsequent accesses to the memory for the whole
+runtime of the application. The second factor consist of two
+components: 1) the TLB miss will run faster (especially with
+virtualization using nested pagetables but almost always also on bare
+metal without virtualization) and 2) a single TLB entry will be
+mapping a much larger amount of virtual memory in turn reducing the
+number of TLB misses. With virtualization and nested pagetables the
+TLB can be mapped of larger size only if both KVM and the Linux guest
+are using hugepages but a significant speedup already happens if only
+one of the two is using hugepages just because of the fact the TLB
+miss is going to run faster.
+
+== Design ==
+
+- "graceful fallback": mm components which don't have transparent
+ hugepage knowledge fall back to breaking a transparent hugepage and
+ working on the regular pages and their respective regular pmd/pte
+ mappings
+
+- if a hugepage allocation fails because of memory fragmentation,
+ regular pages should be gracefully allocated instead and mixed in
+ the same vma without any failure or significant delay and without
+ userland noticing
+
+- if some task quits and more hugepages become available (either
+ immediately in the buddy or through the VM), guest physical memory
+ backed by regular pages should be relocated on hugepages
+ automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+ whenever possible (the only possible reservation here is kernelcore=
+ to avoid unmovable pages to fragment all the memory but such a tweak
+ is not specific to transparent hugepage support and it's a generic
+ feature that applies to all dynamic high order allocations in the
+ kernel)
+
+- this initial support only offers the feature in the anonymous memory
+ regions but it'd be ideal to move it to tmpfs and the pagecache
+ later
+
+Transparent Hugepage Support maximizes the usefulness of free memory
+if compared to the reservation approach of hugetlbfs by allowing all
+unused memory to be used as cache or other movable (or even unmovable
+entities). It doesn't require reservation to prevent hugepage
+allocation failures to be noticeable from userland. It allows paging
+and all other advanced VM features to be available on the
+hugepages. It requires no modifications for applications to take
+advantage of it.
+
+Applications however can be further optimized to take advantage of
+this feature, like for example they've been optimized before to avoid
+a flood of mmap system calls for every malloc(4k). Optimizing userland
+is by far not mandatory and khugepaged already can take care of long
+lived page allocations even for hugepage unaware applications that
+deals with large amounts of memory.
+
+In certain cases when hugepages are enabled system wide, application
+may end up allocating more memory resources. An application may mmap a
+large region but only touch 1 byte of it, in that case a 2M page might
+be allocated instead of a 4k page for no good. This is why it's
+possible to disable hugepages system-wide and to only have them inside
+MADV_HUGEPAGE madvise regions.
+
+Embedded systems should enable hugepages only inside madvise regions
+to eliminate any risk of wasting any precious byte of memory and to
+only run faster.
+
+Applications that gets a lot of benefit from hugepages and that don't
+risk to lose memory by using hugepages, should use
+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
+
+== sysfs ==
+
+Transparent Hugepage Support can be entirely disabled (mostly for
+debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to
+avoid the risk of consuming more memory resources) or enabled system
+wide. This can be achieved with one of:
+
+echo always >/sys/kernel/mm/transparent_hugepage/enabled
+echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
+echo never >/sys/kernel/mm/transparent_hugepage/enabled
+
+It's also possible to limit defrag efforts in the VM to generate
+hugepages in case they're not immediately free to madvise regions or
+to never try to defrag memory and simply fallback to regular pages
+unless hugepages are immediately available. Clearly if we spend CPU
+time to defrag memory, we would expect to gain even more by the fact
+we use hugepages later instead of regular pages. This isn't always
+guaranteed, but it may be more likely in case the allocation is for a
+MADV_HUGEPAGE region.
+
+echo always >/sys/kernel/mm/transparent_hugepage/defrag
+echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
+echo never >/sys/kernel/mm/transparent_hugepage/defrag
+
+khugepaged will be automatically started when
+transparent_hugepage/enabled is set to "always" or "madvise, and it'll
+be automatically shutdown if it's set to "never".
+
+khugepaged runs usually at low frequency so while one may not want to
+invoke defrag algorithms synchronously during the page faults, it
+should be worth invoking defrag at least in khugepaged. However it's
+also possible to disable defrag in khugepaged:
+
+echo yes >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+echo no >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+
+You can also control how many pages khugepaged should scan at each
+pass:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
+
+and how many milliseconds to wait in khugepaged between each pass (you
+can set this to 0 to run khugepaged at 100% utilization of one core):
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
+
+and how many milliseconds to wait in khugepaged if there's an hugepage
+allocation failure to throttle the next allocation attempt.
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
+
+The khugepaged progress can be seen in the number of pages collapsed:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
+
+for each pass:
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
+
+== Boot parameter ==
+
+You can change the sysfs boot time defaults of Transparent Hugepage
+Support by passing the parameter "transparent_hugepage=always" or
+"transparent_hugepage=madvise" or "transparent_hugepage=never"
+(without "") to the kernel command line.
+
+== Need of application restart ==
+
+The transparent_hugepage/enabled values only affect future
+behavior. So to make them effective you need to restart any
+application that could have been using hugepages. This also applies to
+the regions registered in khugepaged.
+
+== get_user_pages and follow_page ==
+
+get_user_pages and follow_page if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most gup users will only care about the actual physical
+address of the page and its temporary pinning to release after the I/O
+is complete, so they won't ever notice the fact the page is huge. But
+if any driver is going to mangle over the page structure of the tail
+page (like for checking page->mapping or other bits that are relevant
+for the head page and not the tail page), it should be updated to jump
+to check head page instead (while serializing properly against
+split_huge_page() to avoid the head and tail pages to disappear from
+under it, see the futex code to see an example of that, hugetlbfs also
+needed special handling in futex code for similar reasons).
+
+NOTE: these aren't new constraints to the GUP API, and they match the
+same constrains that applies to hugetlbfs too, so any driver capable
+of handling GUP on hugetlbfs will also work fine on transparent
+hugepage backed mappings.
+
+In case you can't handle compound pages if they're returned by
+follow_page, the FOLL_SPLIT bit can be specified as parameter to
+follow_page, so that it will split the hugepages before returning
+them. Migration for example passes FOLL_SPLIT as parameter to
+follow_page because it's not hugepage aware and in fact it can't work
+at all on hugetlbfs (but it instead works fine on transparent
+hugepages thanks to FOLL_SPLIT). migration simply can't deal with
+hugepages being returned (as it's not only checking the pfn of the
+page and pinning it during the copy but it pretends to migrate the
+memory in regular page sizes and with regular pte/pmd mappings).
+
+== Optimizing the applications ==
+
+To be guaranteed that the kernel will map a 2M page immediately in any
+memory region, the mmap region has to be hugepage naturally
+aligned. posix_memalign() can provide that guarantee.
+
+== Hugetlbfs ==
+
+You can use hugetlbfs on a kernel that has transparent hugepage
+support enabled just fine as always. No difference can be noted in
+hugetlbfs other than there will be less overall fragmentation. All
+usual features belonging to hugetlbfs are preserved and
+unaffected. libhugetlbfs will also work fine as usual.
+
+== Graceful fallback ==
+
+Code walking pagetables but unware about huge pmds can simply call
+split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_page_pmd where
+missing after pmd_offset returns the pmd. Thanks to the graceful
+fallback design, with a one liner change, you can avoid to write
+hundred if not thousand of lines of complex code to make your code
+hugepage aware.
+
+If you're not walking pagetables but you run into a physical hugepage
+but you can't handle it natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swapout the hugepage for example.
+
+Example to make mremap.c transparent hugepage aware with a one liner
+change:
+
+diff --git a/mm/mremap.c b/mm/mremap.c
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+ return NULL;
+
+ pmd = pmd_offset(pud, addr);
++ split_huge_page_pmd(mm, pmd);
+ if (pmd_none_or_clear_bad(pmd))
+ return NULL;
+
+== Locking in hugepage aware code ==
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_page_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is to call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_sem in read (or write) mode to be sure an huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_sem in write mode in addition to the anon_vma lock). If
+pmd_trans_huge returns false, you just fallback in the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+mm->page_table_lock and re-run pmd_trans_huge. Taking the
+page_table_lock will prevent the huge pmd to be converted into a
+regular pmd from under you (split_huge_page can run in parallel to the
+pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page_table_lock and fallback to the old code as
+before. Otherwise you should run pmd_trans_splitting on the pmd. In
+case pmd_trans_splitting returns true, it means split_huge_page is
+already in the middle of splitting the page. So if pmd_trans_splitting
+returns true it's enough to drop the page_table_lock and call
+wait_split_huge_page and then fallback the old code paths. You are
+guaranteed by the time wait_split_huge_page returns, the pmd isn't
+huge anymore. If pmd_trans_splitting returns false, you can proceed to
+process the huge pmd and the hugepage natively. Once finished you can
+drop the page_table_lock.
+
+== compound_lock, get_user_pages and put_page ==
+
+split_huge_page internally has to distribute the refcounts in the head
+page to the tail pages before clearing all PG_head/tail bits from the
+page structures. It can do that easily for refcounts taken by huge pmd
+mappings. But the GUI API as created by hugetlbfs (that returns head
+and tail pages if running get_user_pages on an address backed by any
+hugepage), requires the refcount to be accounted on the tail pages and
+not only in the head pages, if we want to be able to run
+split_huge_page while there are gup pins established on any tail
+page. Failure to be able to run split_huge_page if there's any gup pin
+on any tail page, would mean having to split all hugepages upfront in
+get_user_pages which is unacceptable as too many gup users are
+performance critical and they must work natively on hugepages like
+they work natively on hugetlbfs already (hugetlbfs is simpler because
+hugetlbfs pages cannot be splitted so there wouldn't be requirement of
+accounting the pins on the tail pages for hugetlbfs). If we wouldn't
+account the gup refcounts on the tail pages during gup, we won't know
+anymore which tail page is pinned by gup and which is not while we run
+split_huge_page. But we still have to add the gup pin to the head page
+too, to know when we can free the compound page in case it's never
+splitted during its lifetime. That requires changing not just
+get_page, but put_page as well so that when put_page runs on a tail
+page (and only on a tail page) it will find its respective head page,
+and then it will decrease the head page refcount in addition to the
+tail page refcount. To obtain a head page reliably and to decrease its
+refcount without race conditions, put_page has to serialize against
+__split_huge_page_refcount using a special per-page lock called
+compound_lock.