aboutsummaryrefslogtreecommitdiffstats
path: root/mm/swapfile.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-07-12 11:40:28 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-07-12 11:40:28 -0700
commitef8f3d48afd6a17a0dae8c277c2f539c2f19fd16 (patch)
tree5c28f9f287a552ad1a655b9e29e5330966652e89 /mm/swapfile.c
parentMerge tag 'tag-chrome-platform-for-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux (diff)
parentmm/oom_kill.c: remove redundant OOM score normalization in select_bad_process() (diff)
downloadlinux-dev-ef8f3d48afd6a17a0dae8c277c2f539c2f19fd16.tar.xz
linux-dev-ef8f3d48afd6a17a0dae8c277c2f539c2f19fd16.zip
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton: "Am experimenting with splitting MM up into identifiable subsystems perhaps with a view to gitifying it in complex ways. Also with more verbose "incoming" emails. Most of MM is here and a few other trees. Subsystems affected by this patch series: - hotfixes - iommu - scripts - arch/sh - ocfs2 - mm:slab-generic - mm:slub - mm:kmemleak - mm:kasan - mm:cleanups - mm:debug - mm:pagecache - mm:swap - mm:memcg - mm:gup - mm:pagemap - mm:infrastructure - mm:vmalloc - mm:initialization - mm:pagealloc - mm:vmscan - mm:tools - mm:proc - mm:ras - mm:oom-kill hotfixes: mm: vmscan: scan anonymous pages on file refaults mm/nvdimm: add is_ioremap_addr and use that to check ioremap address mm/memcontrol: fix wrong statistics in memory.stat mm/z3fold.c: lock z3fold page before __SetPageMovable() nilfs2: do not use unexported cpu_to_le32()/le32_to_cpu() in uapi header MAINTAINERS: nilfs2: update email address iommu: include/linux/dmar.h: replace single-char identifiers in macros scripts: scripts/decode_stacktrace: match basepath using shell prefix operator, not regex scripts/decode_stacktrace: look for modules with .ko.debug extension scripts/spelling.txt: drop "sepc" from the misspelling list scripts/spelling.txt: add spelling fix for prohibited scripts/decode_stacktrace: Accept dash/underscore in modules scripts/spelling.txt: add more spellings to spelling.txt arch/sh: arch/sh/configs/sdk7786_defconfig: remove CONFIG_LOGFS sh: config: remove left-over BACKLIGHT_LCD_SUPPORT sh: prevent warnings when using iounmap ocfs2: fs: ocfs: fix spelling mistake "hearbeating" -> "heartbeat" ocfs2/dlm: use struct_size() helper ocfs2: add last unlock times in locking_state ocfs2: add locking filter debugfs file ocfs2: add first lock wait time in locking_state ocfs: no need to check return value of debugfs_create functions fs/ocfs2/dlmglue.c: unneeded variable: "status" ocfs2: use kmemdup rather than duplicating its implementation mm:slab-generic: Patch series "mm/slab: Improved sanity checking": mm/slab: validate cache membership under freelist hardening mm/slab: sanity-check page type when looking up cache lkdtm/heap: add tests for freelist hardening mm:slub: mm/slub.c: avoid double string traverse in kmem_cache_flags() slub: don't panic for memcg kmem cache creation failure mm:kmemleak: mm/kmemleak.c: fix check for softirq context mm/kmemleak.c: change error at _write when kmemleak is disabled docs: kmemleak: add more documentation details mm:kasan: mm/kasan: print frame description for stack bugs Patch series "Bitops instrumentation for KASAN", v5: lib/test_kasan: add bitops tests x86: use static_cpu_has in uaccess region to avoid instrumentation asm-generic, x86: add bitops instrumentation for KASAN Patch series "mm/kasan: Add object validation in ksize()", v3: mm/kasan: introduce __kasan_check_{read,write} mm/kasan: change kasan_check_{read,write} to return boolean lib/test_kasan: Add test for double-kzfree detection mm/slab: refactor common ksize KASAN logic into slab_common.c mm/kasan: add object validation in ksize() mm:cleanups: include/linux/pfn_t.h: remove pfn_t_to_virt() Patch series "remove ARCH_SELECT_MEMORY_MODEL where it has no effect": arm: remove ARCH_SELECT_MEMORY_MODEL s390: remove ARCH_SELECT_MEMORY_MODEL sparc: remove ARCH_SELECT_MEMORY_MODEL mm/gup.c: make follow_page_mask() static mm/memory.c: trivial clean up in insert_page() mm: make !CONFIG_HUGE_PAGE wrappers into static inlines include/linux/mm_types.h: ifdef struct vm_area_struct::swap_readahead_info mm: remove the account_page_dirtied export mm/page_isolation.c: change the prototype of undo_isolate_page_range() include/linux/vmpressure.h: use spinlock_t instead of struct spinlock mm: remove the exporting of totalram_pages include/linux/pagemap.h: document trylock_page() return value mm:debug: mm/failslab.c: by default, do not fail allocations with direct reclaim only Patch series "debug_pagealloc improvements": mm, debug_pagelloc: use static keys to enable debugging mm, page_alloc: more extensive free page checking with debug_pagealloc mm, debug_pagealloc: use a page type instead of page_ext flag mm:pagecache: Patch series "fix filler_t callback type mismatches", v2: mm/filemap.c: fix an overly long line in read_cache_page mm/filemap: don't cast ->readpage to filler_t for do_read_cache_page jffs2: pass the correct prototype to read_cache_page 9p: pass the correct prototype to read_cache_page mm/filemap.c: correct the comment about VM_FAULT_RETRY mm:swap: mm, swap: fix race between swapoff and some swap operations mm/swap_state.c: simplify total_swapcache_pages() with get_swap_device() mm, swap: use rbtree for swap_extent mm/mincore.c: fix race between swapoff and mincore mm:memcg: memcg, oom: no oom-kill for __GFP_RETRY_MAYFAIL memcg, fsnotify: no oom-kill for remote memcg charging mm, memcg: introduce memory.events.local mm: memcontrol: dump memory.stat during cgroup OOM Patch series "mm: reparent slab memory on cgroup removal", v7: mm: memcg/slab: postpone kmem_cache memcg pointer initialization to memcg_link_cache() mm: memcg/slab: rename slab delayed deactivation functions and fields mm: memcg/slab: generalize postponed non-root kmem_cache deactivation mm: memcg/slab: introduce __memcg_kmem_uncharge_memcg() mm: memcg/slab: unify SLAB and SLUB page accounting mm: memcg/slab: don't check the dying flag on kmem_cache creation mm: memcg/slab: synchronize access to kmem_cache dying flag using a spinlock mm: memcg/slab: rework non-root kmem_cache lifecycle management mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages mm: memcg/slab: reparent memcg kmem_caches on cgroup removal mm, memcg: add a memcg_slabinfo debugfs file mm:gup: Patch series "switch the remaining architectures to use generic GUP", v4: mm: use untagged_addr() for get_user_pages_fast addresses mm: simplify gup_fast_permitted mm: lift the x86_32 PAE version of gup_get_pte to common code MIPS: use the generic get_user_pages_fast code sh: add the missing pud_page definition sh: use the generic get_user_pages_fast code sparc64: add the missing pgd_page definition sparc64: define untagged_addr() sparc64: use the generic get_user_pages_fast code mm: rename CONFIG_HAVE_GENERIC_GUP to CONFIG_HAVE_FAST_GUP mm: reorder code blocks in gup.c mm: consolidate the get_user_pages* implementations mm: validate get_user_pages_fast flags mm: move the powerpc hugepd code to mm/gup.c mm: switch gup_hugepte to use try_get_compound_head mm: mark the page referenced in gup_hugepte mm/gup: speed up check_and_migrate_cma_pages() on huge page mm/gup.c: remove some BUG_ONs from get_gate_page() mm/gup.c: mark undo_dev_pagemap as __maybe_unused mm:pagemap: asm-generic, x86: introduce generic pte_{alloc,free}_one[_kernel] alpha: switch to generic version of pte allocation arm: switch to generic version of pte allocation arm64: switch to generic version of pte allocation csky: switch to generic version of pte allocation m68k: sun3: switch to generic version of pte allocation mips: switch to generic version of pte allocation nds32: switch to generic version of pte allocation nios2: switch to generic version of pte allocation parisc: switch to generic version of pte allocation riscv: switch to generic version of pte allocation um: switch to generic version of pte allocation unicore32: switch to generic version of pte allocation mm/pgtable: drop pgtable_t variable from pte_fn_t functions mm/memory.c: fail when offset == num in first check of __vm_map_pages() mm:infrastructure: mm/mmu_notifier: use hlist_add_head_rcu() mm:vmalloc: Patch series "Some cleanups for the KVA/vmalloc", v5: mm/vmalloc.c: remove "node" argument mm/vmalloc.c: preload a CPU with one object for split purpose mm/vmalloc.c: get rid of one single unlink_va() when merge mm/vmalloc.c: switch to WARN_ON() and move it under unlink_va() mm/vmalloc.c: spelling> s/informaion/information/ mm:initialization: mm/large system hash: use vmalloc for size > MAX_ORDER when !hashdist mm/large system hash: clear hashdist when only one node with memory is booted mm:pagealloc: arm64: move jump_label_init() before parse_early_param() Patch series "add init_on_alloc/init_on_free boot options", v10: mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options mm: init: report memory auto-initialization features at boot time mm:vmscan: mm: vmscan: remove double slab pressure by inc'ing sc->nr_scanned mm: vmscan: correct some vmscan counters for THP swapout mm:tools: tools/vm/slabinfo: order command line options tools/vm/slabinfo: add partial slab listing to -X tools/vm/slabinfo: add option to sort by partial slabs tools/vm/slabinfo: add sorting info to help menu mm:proc: proc: use down_read_killable mmap_sem for /proc/pid/maps proc: use down_read_killable mmap_sem for /proc/pid/smaps_rollup proc: use down_read_killable mmap_sem for /proc/pid/pagemap proc: use down_read_killable mmap_sem for /proc/pid/clear_refs proc: use down_read_killable mmap_sem for /proc/pid/map_files mm: use down_read_killable for locking mmap_sem in access_remote_vm mm: smaps: split PSS into components mm: vmalloc: show number of vmalloc pages in /proc/meminfo mm:ras: mm/memory-failure.c: clarify error message mm:oom-kill: mm: memcontrol: use CSS_TASK_ITER_PROCS at mem_cgroup_scan_tasks() mm, oom: refactor dump_tasks for memcg OOMs mm, oom: remove redundant task_in_mem_cgroup() check oom: decouple mems_allowed from oom_unkillable_task mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process()" * akpm: (147 commits) mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process() oom: decouple mems_allowed from oom_unkillable_task mm, oom: remove redundant task_in_mem_cgroup() check mm, oom: refactor dump_tasks for memcg OOMs mm: memcontrol: use CSS_TASK_ITER_PROCS at mem_cgroup_scan_tasks() mm/memory-failure.c: clarify error message mm: vmalloc: show number of vmalloc pages in /proc/meminfo mm: smaps: split PSS into components mm: use down_read_killable for locking mmap_sem in access_remote_vm proc: use down_read_killable mmap_sem for /proc/pid/map_files proc: use down_read_killable mmap_sem for /proc/pid/clear_refs proc: use down_read_killable mmap_sem for /proc/pid/pagemap proc: use down_read_killable mmap_sem for /proc/pid/smaps_rollup proc: use down_read_killable mmap_sem for /proc/pid/maps tools/vm/slabinfo: add sorting info to help menu tools/vm/slabinfo: add option to sort by partial slabs tools/vm/slabinfo: add partial slab listing to -X tools/vm/slabinfo: order command line options mm: vmscan: correct some vmscan counters for THP swapout mm: vmscan: remove double slab pressure by inc'ing sc->nr_scanned ...
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c291
1 files changed, 197 insertions, 94 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 596ac98051c5..0789a762ce2f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
return ret;
}
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+ struct rb_node *rb = rb_first(&sis->swap_extent_root);
+ return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+ struct rb_node *rb = rb_next(&se->rb_node);
+ return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
+}
+
/*
* swapon tell device that all the old swap contents can be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si)
int err = 0;
/* Do not discard the swap header page! */
- se = &si->first_swap_extent;
+ se = first_se(si);
start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
@@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si)
cond_resched();
}
- list_for_each_entry(se, &si->first_swap_extent.list, list) {
+ for (se = next_se(se); se; se = next_se(se)) {
start_block = se->start_block << (PAGE_SHIFT - 9);
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
@@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si)
return err; /* That will often be -EOPNOTSUPP */
}
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+ struct swap_extent *se;
+ struct rb_node *rb;
+
+ rb = sis->swap_extent_root.rb_node;
+ while (rb) {
+ se = rb_entry(rb, struct swap_extent, rb_node);
+ if (offset < se->start_page)
+ rb = rb->rb_left;
+ else if (offset >= se->start_page + se->nr_pages)
+ rb = rb->rb_right;
+ else
+ return se;
+ }
+ /* It *must* be present */
+ BUG();
+}
+
/*
* swap allocation tell device that a cluster of swap can now be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -196,32 +228,25 @@ static int discard_swap(struct swap_info_struct *si)
static void discard_swap_cluster(struct swap_info_struct *si,
pgoff_t start_page, pgoff_t nr_pages)
{
- struct swap_extent *se = si->curr_swap_extent;
- int found_extent = 0;
+ struct swap_extent *se = offset_to_swap_extent(si, start_page);
while (nr_pages) {
- if (se->start_page <= start_page &&
- start_page < se->start_page + se->nr_pages) {
- pgoff_t offset = start_page - se->start_page;
- sector_t start_block = se->start_block + offset;
- sector_t nr_blocks = se->nr_pages - offset;
-
- if (nr_blocks > nr_pages)
- nr_blocks = nr_pages;
- start_page += nr_blocks;
- nr_pages -= nr_blocks;
-
- if (!found_extent++)
- si->curr_swap_extent = se;
-
- start_block <<= PAGE_SHIFT - 9;
- nr_blocks <<= PAGE_SHIFT - 9;
- if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, 0))
- break;
- }
+ pgoff_t offset = start_page - se->start_page;
+ sector_t start_block = se->start_block + offset;
+ sector_t nr_blocks = se->nr_pages - offset;
+
+ if (nr_blocks > nr_pages)
+ nr_blocks = nr_pages;
+ start_page += nr_blocks;
+ nr_pages -= nr_blocks;
+
+ start_block <<= PAGE_SHIFT - 9;
+ nr_blocks <<= PAGE_SHIFT - 9;
+ if (blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_NOIO, 0))
+ break;
- se = list_next_entry(se, list);
+ se = next_se(se);
}
}
@@ -1079,12 +1104,11 @@ fail:
static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
- unsigned long offset, type;
+ unsigned long offset;
if (!entry.val)
goto out;
- type = swp_type(entry);
- p = swap_type_to_swap_info(type);
+ p = swp_swap_info(entry);
if (!p)
goto bad_nofile;
if (!(p->flags & SWP_USED))
@@ -1187,6 +1211,69 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
return usage;
}
+/*
+ * Check whether swap entry is valid in the swap device. If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * The entirety of the RCU read critical section must come before the
+ * return from or after the call to synchronize_rcu() in
+ * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
+ * true, the si->map, si->cluster_info, etc. must be valid in the
+ * critical section.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
+ * in put_swap_device() if there isn't any other way to prevent
+ * swapoff, such as page lock, page table lock, etc. The caller must
+ * be prepared for that. For example, the following situation is
+ * possible.
+ *
+ * CPU1 CPU2
+ * do_swap_page()
+ * ... swapoff+swapon
+ * __read_swap_cache_async()
+ * swapcache_prepare()
+ * __swap_duplicate()
+ * // check swap_map
+ * // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map need to be checked before
+ * changing partly because the specified swap entry may be for another
+ * swap device which has been swapoff. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ unsigned long offset;
+
+ if (!entry.val)
+ goto out;
+ si = swp_swap_info(entry);
+ if (!si)
+ goto bad_nofile;
+
+ rcu_read_lock();
+ if (!(si->flags & SWP_VALID))
+ goto unlock_out;
+ offset = swp_offset(entry);
+ if (offset >= si->max)
+ goto unlock_out;
+
+ return si;
+bad_nofile:
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+ return NULL;
+unlock_out:
+ rcu_read_unlock();
+ return NULL;
+}
+
static unsigned char __swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -1358,11 +1445,18 @@ int page_swapcount(struct page *page)
return count;
}
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
{
+ struct swap_info_struct *si;
pgoff_t offset = swp_offset(entry);
+ int count = 0;
- return swap_count(si->swap_map[offset]);
+ si = get_swap_device(entry);
+ if (si) {
+ count = swap_count(si->swap_map[offset]);
+ put_swap_device(si);
+ }
+ return count;
}
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1387,9 +1481,11 @@ int __swp_swapcount(swp_entry_t entry)
int count = 0;
struct swap_info_struct *si;
- si = __swap_info_get(entry);
- if (si)
+ si = get_swap_device(entry);
+ if (si) {
count = swap_swapcount(si, entry);
+ put_swap_device(si);
+ }
return count;
}
@@ -1684,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
return type;
}
if (bdev == sis->bdev) {
- struct swap_extent *se = &sis->first_swap_extent;
+ struct swap_extent *se = first_se(sis);
if (se->start_block == offset) {
if (bdev_p)
@@ -2161,7 +2257,6 @@ static void drain_mmlist(void)
static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{
struct swap_info_struct *sis;
- struct swap_extent *start_se;
struct swap_extent *se;
pgoff_t offset;
@@ -2169,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
*bdev = sis->bdev;
offset = swp_offset(entry);
- start_se = sis->curr_swap_extent;
- se = start_se;
-
- for ( ; ; ) {
- if (se->start_page <= offset &&
- offset < (se->start_page + se->nr_pages)) {
- return se->start_block + (offset - se->start_page);
- }
- se = list_next_entry(se, list);
- sis->curr_swap_extent = se;
- BUG_ON(se == start_se); /* It *must* be present */
- }
+ se = offset_to_swap_extent(sis, offset);
+ return se->start_block + (offset - se->start_page);
}
/*
@@ -2198,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev)
*/
static void destroy_swap_extents(struct swap_info_struct *sis)
{
- while (!list_empty(&sis->first_swap_extent.list)) {
- struct swap_extent *se;
+ while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
+ struct rb_node *rb = sis->swap_extent_root.rb_node;
+ struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
- se = list_first_entry(&sis->first_swap_extent.list,
- struct swap_extent, list);
- list_del(&se->list);
+ rb_erase(rb, &sis->swap_extent_root);
kfree(se);
}
@@ -2219,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
/*
* Add a block range (and the corresponding page range) into this swapdev's
- * extent list. The extent list is kept sorted in page order.
+ * extent tree.
*
* This function rather assumes that it is called in ascending page order.
*/
@@ -2227,20 +2311,21 @@ int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block)
{
+ struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
struct swap_extent *se;
struct swap_extent *new_se;
- struct list_head *lh;
-
- if (start_page == 0) {
- se = &sis->first_swap_extent;
- sis->curr_swap_extent = se;
- se->start_page = 0;
- se->nr_pages = nr_pages;
- se->start_block = start_block;
- return 1;
- } else {
- lh = sis->first_swap_extent.list.prev; /* Highest extent */
- se = list_entry(lh, struct swap_extent, list);
+
+ /*
+ * place the new node at the right most since the
+ * function is called in ascending page order.
+ */
+ while (*link) {
+ parent = *link;
+ link = &parent->rb_right;
+ }
+
+ if (parent) {
+ se = rb_entry(parent, struct swap_extent, rb_node);
BUG_ON(se->start_page + se->nr_pages != start_page);
if (se->start_block + se->nr_pages == start_block) {
/* Merge it */
@@ -2249,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
}
}
- /*
- * No merge. Insert a new extent, preserving ordering.
- */
+ /* No merge, insert a new extent. */
new_se = kmalloc(sizeof(*se), GFP_KERNEL);
if (new_se == NULL)
return -ENOMEM;
@@ -2259,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
new_se->nr_pages = nr_pages;
new_se->start_block = start_block;
- list_add_tail(&new_se->list, &sis->first_swap_extent.list);
+ rb_link_node(&new_se->rb_node, parent, link);
+ rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
return 1;
}
EXPORT_SYMBOL_GPL(add_swap_extent);
@@ -2335,9 +2419,9 @@ static int swap_node(struct swap_info_struct *p)
return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
{
int i;
@@ -2362,7 +2446,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
}
p->swap_map = swap_map;
p->cluster_info = cluster_info;
- p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+ p->flags |= SWP_WRITEOK | SWP_VALID;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2389,7 +2477,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, prio, swap_map, cluster_info);
+ setup_swap_info(p, prio, swap_map, cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * Guarantee swap_map, cluster_info, etc. fields are valid
+ * between get/put_swap_device() if SWP_VALID bit is set
+ */
+ synchronize_rcu();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2398,7 +2496,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2501,6 +2600,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * wait for swap operations protected by get/put_swap_device()
+ * to complete
+ */
+ synchronize_rcu();
+
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -2749,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void)
* would be relying on p->type to remain valid.
*/
}
- INIT_LIST_HEAD(&p->first_swap_extent.list);
+ p->swap_extent_root = RB_ROOT;
plist_node_init(&p->list, 0);
for_each_node(i)
plist_node_init(&p->avail_lists[i], 0);
@@ -3265,17 +3375,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unsigned char has_cache;
int err = -EINVAL;
- if (non_swap_entry(entry))
- goto out;
-
- p = swp_swap_info(entry);
+ p = get_swap_device(entry);
if (!p)
- goto bad_file;
-
- offset = swp_offset(entry);
- if (unlikely(offset >= p->max))
goto out;
+ offset = swp_offset(entry);
ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -3321,11 +3425,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
out:
+ if (p)
+ put_swap_device(p);
return err;
-
-bad_file:
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
- goto out;
}
/*
@@ -3417,6 +3519,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
struct page *list_page;
pgoff_t offset;
unsigned char count;
+ int ret = 0;
/*
* When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3424,15 +3527,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
page = alloc_page(gfp_mask | __GFP_HIGHMEM);
- si = swap_info_get(entry);
+ si = get_swap_device(entry);
if (!si) {
/*
* An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap entry has been freed,
- * perhaps even the whole swap_map cleared for swapoff.
+ * __swap_duplicate(): the swap device may be swapoff
*/
goto outer;
}
+ spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3450,9 +3553,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
- unlock_cluster(ci);
- spin_unlock(&si->lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
/*
@@ -3504,10 +3606,11 @@ out_unlock_cont:
out:
unlock_cluster(ci);
spin_unlock(&si->lock);
+ put_swap_device(si);
outer:
if (page)
__free_page(page);
- return 0;
+ return ret;
}
/*