Diffstat (limited to 'mm/memory-failure.c')
 mm/memory-failure.c | 161 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 101 insertions(+), 60 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 54879c339024..f64ebb6226cb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -39,6 +39,7 @@
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
+#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -57,6 +58,7 @@
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -306,6 +308,7 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
struct vm_area_struct *vma)
{
unsigned long address = vma_address(page, vma);
+ unsigned long ret = 0;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -329,11 +332,10 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
if (pmd_devmap(*pmd))
return PMD_SHIFT;
pte = pte_offset_map(pmd, address);
- if (!pte_present(*pte))
- return 0;
- if (pte_devmap(*pte))
- return PAGE_SHIFT;
- return 0;
+ if (pte_present(*pte) && pte_devmap(*pte))
+ ret = PAGE_SHIFT;
+ pte_unmap(pte);
+ return ret;
}
/*
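The rework in the hunk above is a fix, not just style: pte_offset_map() can take a kmap on CONFIG_HIGHPTE kernels, and the old early returns skipped pte_unmap(), leaking that mapping. A minimal sketch of the single-exit pattern the patch adopts (the function name is illustrative, not from the patch):

static unsigned long devmap_pte_shift_sketch(pmd_t *pmd, unsigned long address)
{
	pte_t *pte = pte_offset_map(pmd, address);
	unsigned long ret = 0;

	/* Record the result instead of returning early... */
	if (pte_present(*pte) && pte_devmap(*pte))
		ret = PAGE_SHIFT;
	/* ...so the mapping is dropped on every path out. */
	pte_unmap(pte);
	return ret;
}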
@@ -673,7 +675,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
#define hwpoison_hugetlb_range NULL
#endif
-static struct mm_walk_ops hwp_walk_ops = {
+static const struct mm_walk_ops hwp_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
};
@@ -762,7 +764,7 @@ static int delete_from_lru_cache(struct page *p)
* Poisoned page might never drop its ref count to 0 so we have
* to uncharge it manually from its memcg.
*/
- mem_cgroup_uncharge(p);
+ mem_cgroup_uncharge(page_folio(p));
/*
* drop the page count elevated by isolate_lru_page()
@@ -806,12 +808,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
return ret;
}
+struct page_state {
+ unsigned long mask;
+ unsigned long res;
+ enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
+ int (*action)(struct page_state *ps, struct page *p);
+};
+
+/*
+ * Return true if page is still referenced by others, otherwise return
+ * false.
+ *
+ * extra_pins is true when one extra refcount is expected.
+ */
+static bool has_extra_refcount(struct page_state *ps, struct page *p,
+ bool extra_pins)
+{
+ int count = page_count(p) - 1;
+
+ if (extra_pins)
+ count -= 1;
+
+ if (count > 0) {
+ pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+ page_to_pfn(p), action_page_types[ps->type], count);
+ return true;
+ }
+
+ return false;
+}
+
/*
* Error hit kernel page.
* Do nothing, try to be lucky and not touch this instead. For a few cases we
* could be more sophisticated.
*/
-static int me_kernel(struct page *p, unsigned long pfn)
+static int me_kernel(struct page_state *ps, struct page *p)
{
unlock_page(p);
return MF_IGNORED;
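A note on the accounting in has_extra_refcount() above: the memory-failure path itself holds one reference (hence page_count(p) - 1), and extra_pins subtracts one more when a handler legitimately leaves a pin behind. A hedged sketch of the calling convention each ->action() handler follows (me_example() is a hypothetical handler, not from the patch):

static int me_example(struct page_state *ps, struct page *p)
{
	int ret = MF_RECOVERED;
	/* Set true only if this handler deliberately keeps a reference. */
	bool extra_pins = false;

	/* ->action() callbacks must unlock the page themselves. */
	unlock_page(p);
	if (has_extra_refcount(ps, p, extra_pins))
		ret = MF_FAILED;	/* unexpected users still hold the page */
	return ret;
}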
@@ -820,9 +854,9 @@ static int me_kernel(struct page *p, unsigned long pfn)
/*
* Page in unknown state. Do nothing.
*/
-static int me_unknown(struct page *p, unsigned long pfn)
+static int me_unknown(struct page_state *ps, struct page *p)
{
- pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
unlock_page(p);
return MF_FAILED;
}
@@ -830,10 +864,11 @@ static int me_unknown(struct page *p, unsigned long pfn)
/*
* Clean (or cleaned) page cache page.
*/
-static int me_pagecache_clean(struct page *p, unsigned long pfn)
+static int me_pagecache_clean(struct page_state *ps, struct page *p)
{
int ret;
struct address_space *mapping;
+ bool extra_pins;
delete_from_lru_cache(p);
@@ -863,13 +898,23 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
}
/*
+ * The shmem page is kept in the page cache instead of being truncated,
+ * so it is expected to have an extra refcount after error handling.
+ */
+ extra_pins = shmem_mapping(mapping);
+
+ /*
* Truncation is a bit tricky. Enable it per file system for now.
*
* Open: to take i_rwsem or not for this? Right now we don't.
*/
- ret = truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, page_to_pfn(p), mapping);
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
out:
unlock_page(p);
+
return ret;
}
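The shmem case above is worth spelling out: the poisoned page stays in the page cache, with the error surfaced later at read or fault time, instead of being truncated, so the page cache keeps one legitimate reference. Restated as a commented sketch using the names from the hunk:

	/* Sketch: shmem keeps the poisoned page in the page cache, so one
	 * page-cache reference is expected to survive error handling.
	 */
	bool extra_pins = shmem_mapping(mapping);

	ret = truncate_error_page(p, page_to_pfn(p), mapping);
	if (has_extra_refcount(ps, p, extra_pins))
		ret = MF_FAILED;	/* refs beyond the expected pin = failure */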
@@ -878,7 +923,7 @@ out:
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
-static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+static int me_pagecache_dirty(struct page_state *ps, struct page *p)
{
struct address_space *mapping = page_mapping(p);
@@ -922,7 +967,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
mapping_set_error(mapping, -EIO);
}
- return me_pagecache_clean(p, pfn);
+ return me_pagecache_clean(ps, p);
}
/*
@@ -944,9 +989,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* Clean swap cache pages can be directly isolated. A later page fault will
* bring in the known good data from disk.
*/
-static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+static int me_swapcache_dirty(struct page_state *ps, struct page *p)
{
int ret;
+ bool extra_pins = false;
ClearPageDirty(p);
/* Trigger EIO in shmem: */
@@ -954,10 +1000,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
unlock_page(p);
+
+ if (ret == MF_DELAYED)
+ extra_pins = true;
+
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
return ret;
}
-static int me_swapcache_clean(struct page *p, unsigned long pfn)
+static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
int ret;
@@ -965,6 +1018,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
unlock_page(p);
+
+ if (has_extra_refcount(ps, p, false))
+ ret = MF_FAILED;
+
return ret;
}
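The extra_pins handling in me_swapcache_dirty() above replaces the special case that previously lived in page_action(): when delete_from_lru_cache() succeeds, the result is MF_DELAYED and the page still sits in the swap cache, which holds the one expected reference. Restated with comments:

	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
	unlock_page(p);

	/* MF_DELAYED: page remains in the swap cache, which pins it once. */
	extra_pins = (ret == MF_DELAYED);
	if (has_extra_refcount(ps, p, extra_pins))
		ret = MF_FAILED;	/* any further reference is unaccounted for */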
@@ -974,7 +1031,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
*/
-static int me_huge_page(struct page *p, unsigned long pfn)
+static int me_huge_page(struct page_state *ps, struct page *p)
{
int res;
struct page *hpage = compound_head(p);
@@ -985,7 +1042,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
- res = truncate_error_page(hpage, pfn, mapping);
+ res = truncate_error_page(hpage, page_to_pfn(p), mapping);
unlock_page(hpage);
} else {
res = MF_FAILED;
@@ -1003,6 +1060,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
}
}
+ if (has_extra_refcount(ps, p, false))
+ res = MF_FAILED;
+
return res;
}
@@ -1028,14 +1088,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
-static struct page_state {
- unsigned long mask;
- unsigned long res;
- enum mf_action_page_type type;
-
- /* Callback ->action() has to unlock the relevant page inside it. */
- int (*action)(struct page *p, unsigned long pfn);
-} error_states[] = {
+static struct page_state error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
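For context, error_states[] is consumed by a first-match scan, which is why the catch-all me_unknown entry terminates the table. A sketch of the existing lookup logic (lookup_state() is a made-up name; the kernel does this inline in identify_page_state()):

static struct page_state *lookup_state(unsigned long page_flags)
{
	struct page_state *ps;

	/* First entry whose masked bits equal 'res' wins; the terminal
	 * catch-all entry (mask == 0, me_unknown) matches everything.
	 */
	for (ps = error_states; (page_flags & ps->mask) != ps->res; ps++)
		;
	return ps;
}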
@@ -1095,19 +1148,10 @@ static int page_action(struct page_state *ps, struct page *p,
unsigned long pfn)
{
int result;
- int count;
/* page p should be unlocked after returning from ps->action(). */
- result = ps->action(p, pfn);
+ result = ps->action(ps, p);
- count = page_count(p) - 1;
- if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
- count--;
- if (count > 0) {
- pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
- pfn, action_page_types[ps->type], count);
- result = MF_FAILED;
- }
action_result(pfn, ps->type, result);
/* Could do more checks here if page looks ok */
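After this change page_action() becomes a thin dispatcher: each ->action() callback applies its own leftover-reference policy through has_extra_refcount(). A sketch of the resulting flow (the MF_RECOVERED/MF_DELAYED return mapping matches the existing function; the sketch name is made up):

static int page_action_sketch(struct page_state *ps, struct page *p,
			      unsigned long pfn)
{
	int result;

	/* ps->action() unlocks p and applies its own refcount policy. */
	result = ps->action(ps, p);
	action_result(pfn, ps->type, result);

	/* Only a recovered or delayed page counts as handled. */
	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}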
@@ -1126,7 +1170,7 @@ static int page_action(struct page_state *ps, struct page *p,
*/
static inline bool HWPoisonHandlable(struct page *page)
{
- return PageLRU(page) || __PageMovable(page);
+ return PageLRU(page) || __PageMovable(page) || is_free_buddy_page(page);
}
static int __get_hwpoison_page(struct page *page)
@@ -1147,20 +1191,6 @@ static int __get_hwpoison_page(struct page *page)
if (!HWPoisonHandlable(head))
return -EBUSY;
- if (PageTransHuge(head)) {
- /*
- * Non anonymous thp exists only in allocation/free time. We
- * can't handle such a case correctly, so let's give it up.
- * This should be better than triggering BUG_ON when kernel
- * tries to touch the "partially handled" page.
- */
- if (!PageAnon(head)) {
- pr_err("Memory failure: %#lx: non anonymous thp\n",
- page_to_pfn(page));
- return 0;
- }
- }
-
if (get_page_unless_zero(head)) {
if (head == compound_head(page))
return 1;
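The head == compound_head(page) recheck above closes a THP-split race: the head pointer is sampled before the reference is taken, so a concurrent split can leave the new reference pinning the wrong page. A hedged sketch of the pattern (function name is illustrative):

static int grab_head_sketch(struct page *page)
{
	struct page *head = compound_head(page);	/* sampled first */

	if (get_page_unless_zero(head)) {
		/*
		 * A THP split may have raced in between: 'head' might no
		 * longer be this page's head, so the reference we took
		 * pins the wrong page and must be given back.
		 */
		if (head == compound_head(page))
			return 1;	/* still intact: ref is valid */
		put_page(head);		/* lost the race: undo the grab */
	}
	return 0;			/* no reference taken */
}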
@@ -1414,14 +1444,11 @@ static int identify_page_state(unsigned long pfn, struct page *p,
static int try_to_split_thp_page(struct page *page, const char *msg)
{
lock_page(page);
- if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ if (unlikely(split_huge_page(page))) {
unsigned long pfn = page_to_pfn(page);
unlock_page(page);
- if (!PageAnon(page))
- pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
- else
- pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ pr_info("%s: %#lx: thp split failed\n", msg, pfn);
put_page(page);
return -EBUSY;
}
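With the PageAnon() restriction dropped, shmem and file-backed THPs now go through the same split path as anonymous ones. A usage sketch of the contract, under the assumption that split_huge_page() is called with the page locked and returns 0 on success (which matches its documented behavior):

static int split_thp_sketch(struct page *page)
{
	lock_page(page);			/* split requires the lock */
	if (unlikely(split_huge_page(page))) {	/* non-zero: split failed */
		unlock_page(page);
		put_page(page);
		return -EBUSY;			/* caller backs off */
	}
	unlock_page(page);			/* 'page' is now a base page */
	return 0;
}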
@@ -1708,6 +1735,20 @@ try_again:
}
if (PageTransHuge(hpage)) {
+ /*
+ * The flag must be set after the refcount is bumped
+ * otherwise it may race with THP split.
+ * And the flag can't be set in get_hwpoison_page() since
+ * it is called by soft offline too and it is just called
+ * for !MF_COUNT_INCREASED. So here seems to be the best
+ * place.
+ *
+ * Don't need to care about the above error handling paths for
+ * get_hwpoison_page() since they handle either free page
+ * or unhandlable page. The refcount is bumped iff the
+ * page is a valid handlable page.
+ */
+ SetPageHasHWPoisoned(hpage);
if (try_to_split_thp_page(p, "Memory Failure") < 0) {
action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
res = -EBUSY;
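The flag set here is consumed on the fault side. A hedged sketch of the consumer, assumed from the companion page-fault change rather than shown in this diff: a fault that would install a huge PMD over a THP with a poisoned subpage falls back to PTE mappings, so only the single bad base page becomes inaccessible.

	/* Sketch of the assumed fault-side check (not part of this diff): */
	if (unlikely(PageHasHWPoisoned(compound_head(page))))
		return VM_FAULT_FALLBACK;	/* map with PTEs, skip the PMD */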
@@ -2109,14 +2150,14 @@ static int __soft_offline_page(struct page *page)
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
- pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
- pfn, msg_page[huge], ret, page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n",
+ pfn, msg_page[huge], ret, &page->flags);
if (ret > 0)
ret = -EBUSY;
}
} else {
- pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
- pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
+ pfn, msg_page[huge], page_count(page), &page->flags);
ret = -EBUSY;
}
return ret;
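The printk changes above drop the raw hex flags because %pGp already decodes page->flags symbolically, so printing both was redundant. For reference, the specifier takes a pointer to the flags word:

	/* %pGp prints e.g. "uptodate|lru|swapbacked" from page->flags. */
	pr_info("page flags: %pGp\n", &page->flags);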