 mm/compaction.c | 912 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 626 insertions(+), 286 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 672d3c78c6ab..c51f7f545afe 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -26,6 +26,11 @@
#include "internal.h"
#ifdef CONFIG_COMPACTION
+/*
+ * Fragmentation score check interval for proactive compaction purposes.
+ */
+#define HPAGE_FRAG_CHECK_INTERVAL_MSEC (500)
+
static inline void count_compact_event(enum vm_event_item item)
{
count_vm_event(item);
@@ -47,8 +52,19 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order))
-#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
-#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
+
+/*
+ * Page order with respect to which proactive compaction
+ * calculates external fragmentation, which is used as
+ * the "fragmentation score" of a node/zone.
+ */
+#if defined CONFIG_TRANSPARENT_HUGEPAGE
+#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER
+#elif defined CONFIG_HUGETLBFS
+#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER
+#else
+#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT)
+#endif
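For illustration, the fallback branch above resolves the same way the THP case does on common configurations. A minimal userspace sketch, assuming x86_64 values for PAGE_SHIFT and PMD_SHIFT (order 9, i.e. a 2 MiB huge page):

#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12	/* 4 KiB base pages (assumed) */
#define EXAMPLE_PMD_SHIFT  21	/* 2 MiB PMD mappings (assumed) */

int main(void)
{
	int order = EXAMPLE_PMD_SHIFT - EXAMPLE_PAGE_SHIFT; /* fallback branch */

	printf("order %d => %lu KiB huge page\n", order,
	       (1UL << order) * (1UL << EXAMPLE_PAGE_SHIFT) / 1024);
	return 0;	/* prints: order 9 => 2048 KiB huge page */
}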
static unsigned long release_freepages(struct list_head *freelist)
{
@@ -92,42 +108,38 @@ static void split_map_pages(struct list_head *list)
}
#ifdef CONFIG_COMPACTION
-
-int PageMovable(struct page *page)
+bool PageMovable(struct page *page)
{
- struct address_space *mapping;
+ const struct movable_operations *mops;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!__PageMovable(page))
- return 0;
+ return false;
- mapping = page_mapping(page);
- if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
- return 1;
+ mops = page_movable_ops(page);
+ if (mops)
+ return true;
- return 0;
+ return false;
}
EXPORT_SYMBOL(PageMovable);
-void __SetPageMovable(struct page *page, struct address_space *mapping)
+void __SetPageMovable(struct page *page, const struct movable_operations *mops)
{
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
- page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
+ VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page);
+ page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE);
}
EXPORT_SYMBOL(__SetPageMovable);
void __ClearPageMovable(struct page *page)
{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageMovable(page), page);
/*
- * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
- * flag so that VM can catch up released page by driver after isolation.
- * With it, VM migration doesn't try to put it back.
+ * This page still has the type of a movable page, but it's
+ * actually not movable any more.
*/
- page->mapping = (void *)((unsigned long)page->mapping &
- PAGE_MAPPING_MOVABLE);
+ page->mapping = (void *)PAGE_MAPPING_MOVABLE;
}
EXPORT_SYMBOL(__ClearPageMovable);
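The conversion above swaps the old address_space-based registration for movable_operations, but keeps the same pointer-tagging trick: the operations pointer is stored in page->mapping with PAGE_MAPPING_MOVABLE set in its low bits. A minimal userspace model of the encode/decode, assuming the kernel's tag value of 0x2 (names are illustrative; decode mirrors what page_movable_ops() does):

#include <assert.h>
#include <stdint.h>

#define PAGE_MAPPING_MOVABLE 0x2UL	/* assumed tag value */

struct movable_operations { int (*isolate_page)(void *, int); };

static void *encode_mops(const struct movable_operations *mops)
{
	/* pointer must be at least 4-byte aligned so the tag bits are free */
	assert(((uintptr_t)mops & PAGE_MAPPING_MOVABLE) == 0);
	return (void *)((uintptr_t)mops | PAGE_MAPPING_MOVABLE);
}

static const struct movable_operations *decode_mops(void *mapping)
{
	return (const struct movable_operations *)
		((uintptr_t)mapping - PAGE_MAPPING_MOVABLE);
}

int main(void)
{
	static const struct movable_operations ops = { 0 };
	void *mapping = encode_mops(&ops);

	assert(decode_mops(mapping) == &ops);
	return 0;
}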
@@ -136,10 +148,10 @@ EXPORT_SYMBOL(__ClearPageMovable);
/*
* Compaction is deferred when compaction fails to result in a page
- * allocation success. 1 << compact_defer_limit compactions are skipped up
+ * allocation success. 1 << compact_defer_shift compactions are skipped up
* to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
*/
-void defer_compaction(struct zone *zone, int order)
+static void defer_compaction(struct zone *zone, int order)
{
zone->compact_considered = 0;
zone->compact_defer_shift++;
@@ -154,7 +166,7 @@ void defer_compaction(struct zone *zone, int order)
}
/* Returns true if compaction should be skipped this time */
-bool compaction_deferred(struct zone *zone, int order)
+static bool compaction_deferred(struct zone *zone, int order)
{
unsigned long defer_limit = 1UL << zone->compact_defer_shift;
@@ -162,11 +174,10 @@ bool compaction_deferred(struct zone *zone, int order)
return false;
/* Avoid possible overflow */
- if (++zone->compact_considered > defer_limit)
+ if (++zone->compact_considered >= defer_limit) {
zone->compact_considered = defer_limit;
-
- if (zone->compact_considered >= defer_limit)
return false;
+ }
trace_mm_compaction_deferred(zone, order);
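Net effect of the defer logic above: each failure doubles the number of skipped attempts until the cap is hit. A small userspace model, assuming COMPACT_MAX_DEFER_SHIFT == 6 as in include/linux/mmzone.h:

#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6	/* assumed, per include/linux/mmzone.h */

int main(void)
{
	unsigned int defer_shift = 0;

	for (int failure = 1; failure <= 8; failure++) {
		if (defer_shift < COMPACT_MAX_DEFER_SHIFT)
			defer_shift++;	/* as defer_compaction() does */
		printf("failure %d: skip next %lu attempts\n",
		       failure, 1UL << defer_shift);
	}
	return 0;	/* skips grow 2, 4, 8, 16, 32, 64, 64, 64 */
}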
@@ -192,7 +203,7 @@ void compaction_defer_reset(struct zone *zone, int order,
}
/* Returns true if restarting compaction after many failures */
-bool compaction_restarting(struct zone *zone, int order)
+static bool compaction_restarting(struct zone *zone, int order)
{
if (order < zone->compact_order_failed)
return false;
@@ -220,7 +231,7 @@ static void reset_cached_positions(struct zone *zone)
}
/*
- * Compound pages of >= pageblock_order should consistenly be skipped until
+ * Compound pages of >= pageblock_order should consistently be skipped until
* released. It is always pointless to compact pages of such order (if they are
* migratable), and the pageblocks they occupy cannot contain any free pages.
*/
@@ -290,20 +301,17 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
* is necessary for the block to be a migration source/target.
*/
do {
- if (pfn_valid_within(pfn)) {
- if (check_source && PageLRU(page)) {
- clear_pageblock_skip(page);
- return true;
- }
+ if (check_source && PageLRU(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
- if (check_target && PageBuddy(page)) {
- clear_pageblock_skip(page);
- return true;
- }
+ if (check_target && PageBuddy(page)) {
+ clear_pageblock_skip(page);
+ return true;
}
page += (1 << PAGE_ALLOC_COSTLY_ORDER);
- pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
} while (page <= end_page);
return false;
@@ -394,7 +402,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page,
if (cc->ignore_skip_hint)
return false;
- if (!IS_ALIGNED(pfn, pageblock_nr_pages))
+ if (!pageblock_aligned(pfn))
return false;
skip = get_pageblock_skip(page);
@@ -481,6 +489,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page,
*/
static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
struct compact_control *cc)
+ __acquires(lock)
{
/* Track if the lock is contended in async mode */
if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
@@ -499,15 +508,12 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
* very heavily contended. The lock should be periodically unlocked to avoid
* having disabled IRQs for a long time, even when there is nobody waiting on
* the lock. It might also be that allowing the IRQs will result in
- * need_resched() becoming true. If scheduling is needed, async compaction
- * aborts. Sync compaction schedules.
+ * need_resched() becoming true. If scheduling is needed, compaction schedules.
* Either compaction type will also abort if a fatal signal is pending.
* In either case if the lock was locked, it is dropped and not regained.
*
- * Returns true if compaction should abort due to fatal signal pending, or
- * async compaction due to need_resched()
- * Returns false when compaction can continue (sync compaction might have
- * scheduled)
+ * Returns true if compaction should abort due to fatal signal pending.
+ * Returns false when compaction can continue.
*/
static bool compact_unlock_should_abort(spinlock_t *lock,
unsigned long flags, bool *locked, struct compact_control *cc)
@@ -560,16 +566,14 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort if fatal signal
- * pending or async compaction detects need_resched()
+ * pending.
*/
- if (!(blockpfn % SWAP_CLUSTER_MAX)
+ if (!(blockpfn % COMPACT_CLUSTER_MAX)
&& compact_unlock_should_abort(&cc->zone->lock, flags,
&locked, cc))
break;
nr_scanned++;
- if (!pfn_valid_within(blockpfn))
- goto isolate_fail;
/*
* For compound pages such as THP and hugetlbfs, we can save
@@ -590,13 +594,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (!PageBuddy(page))
goto isolate_fail;
- /*
- * If we already hold the lock, we can skip some rechecking.
- * Note that if we hold the lock now, checked_pageblock was
- * already set in some previous iteration (or strict is true),
- * so it is correct to skip the suitable migration target
- * recheck as well.
- */
+ /* If we already hold the lock, we can skip some rechecking. */
if (!locked) {
locked = compact_lock_irqsave(&cc->zone->lock,
&flags, cc);
@@ -607,12 +605,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
}
/* Found a free page, will break it into order-0 pages */
- order = page_order(page);
+ order = buddy_order(page);
isolated = __isolate_free_page(page, order);
if (!isolated)
break;
set_page_private(page, order);
+ nr_scanned += isolated - 1;
total_isolated += isolated;
cc->nr_freepages += isolated;
list_add_tail(&page->lru, freelist);
@@ -748,6 +747,8 @@ isolate_freepages_range(struct compact_control *cc,
/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(pg_data_t *pgdat)
{
+ bool too_many;
+
unsigned long active, inactive, isolated;
inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
@@ -757,7 +758,11 @@ static bool too_many_isolated(pg_data_t *pgdat)
isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
node_page_state(pgdat, NR_ISOLATED_ANON);
- return isolated > (inactive + active) / 2;
+ too_many = isolated > (inactive + active) / 2;
+ if (!too_many)
+ wake_throttle_isolated(pgdat);
+
+ return too_many;
}
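The predicate itself is a plain ratio test; a sketch with made-up numbers:

#include <stdbool.h>
#include <stdio.h>

/* Model of too_many_isolated(): throttle once more than half of the
 * (inactive + active) file/anon pages sit on the isolated counters. */
static bool too_many_isolated_model(unsigned long inactive,
				    unsigned long active,
				    unsigned long isolated)
{
	return isolated > (inactive + active) / 2;
}

int main(void)
{
	printf("%d\n", too_many_isolated_model(600, 200, 300)); /* 0: proceed */
	printf("%d\n", too_many_isolated_model(600, 200, 500)); /* 1: throttle */
	return 0;
}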
/**
@@ -766,32 +771,35 @@ static bool too_many_isolated(pg_data_t *pgdat)
* @cc: Compaction control structure.
* @low_pfn: The first PFN to isolate
* @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
- * @isolate_mode: Isolation mode to be used.
+ * @mode: Isolation mode to be used.
*
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). The range is expected to be within same pageblock.
- * Returns zero if there is a fatal signal pending, otherwise PFN of the
- * first page that was not scanned (which may be both less, equal to or more
- * than end_pfn).
+ * Returns errno, like -EAGAIN or -EINTR in case of e.g. a pending signal or
+ * congestion, -ENOMEM in case we could not allocate a page, or 0.
+ * cc->migrate_pfn will contain the next pfn to scan.
*
* The pages are isolated on cc->migratepages list (not required to be empty),
- * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
- * is neither read nor updated.
+ * and cc->nr_migratepages is updated accordingly.
*/
-static unsigned long
+static int
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
- unsigned long end_pfn, isolate_mode_t isolate_mode)
+ unsigned long end_pfn, isolate_mode_t mode)
{
pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
- bool locked = false;
+ struct lruvec *locked = NULL;
struct page *page = NULL, *valid_page = NULL;
+ struct address_space *mapping;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
bool skip_updated = false;
+ int ret = 0;
+
+ cc->migrate_pfn = low_pfn;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -799,14 +807,18 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* delay for some time until fewer pages are isolated
*/
while (unlikely(too_many_isolated(pgdat))) {
+ /* stop isolation if there are still pages not migrated */
+ if (cc->nr_migratepages)
+ return -EAGAIN;
+
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
- return 0;
+ return -EAGAIN;
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
if (fatal_signal_pending(current))
- return 0;
+ return -EINTR;
}
cond_resched();
@@ -846,15 +858,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* contention, to give chance to IRQs. Abort completely if
* a fatal signal is pending.
*/
- if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(&pgdat->lru_lock,
- flags, &locked, cc)) {
- low_pfn = 0;
- goto fatal_pending;
+ if (!(low_pfn % COMPACT_CLUSTER_MAX)) {
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+
+ if (fatal_signal_pending(current)) {
+ cc->contended = true;
+ ret = -EINTR;
+
+ goto fatal_pending;
+ }
+
+ cond_resched();
}
- if (!pfn_valid_within(low_pfn))
- goto isolate_fail;
nr_scanned++;
page = pfn_to_page(low_pfn);
@@ -865,14 +884,47 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* COMPACT_CLUSTER_MAX at a time so the second call must
* not falsely conclude that the block should be skipped.
*/
- if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
- if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
+ if (!valid_page && pageblock_aligned(low_pfn)) {
+ if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
+ page = NULL;
goto isolate_abort;
}
valid_page = page;
}
+ if (PageHuge(page) && cc->alloc_contig) {
+ ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+
+ /*
+ * Fail isolation in case isolate_or_dissolve_huge_page()
+ * reports an error. In case of -ENOMEM, abort right away.
+ */
+ if (ret < 0) {
+ /* Do not report -EBUSY down the chain */
+ if (ret == -EBUSY)
+ ret = 0;
+ low_pfn += compound_nr(page) - 1;
+ goto isolate_fail;
+ }
+
+ if (PageHuge(page)) {
+ /*
+ * Hugepage was successfully isolated and placed
+ * on the cc->migratepages list.
+ */
+ low_pfn += compound_nr(page) - 1;
+ goto isolate_success_no_list;
+ }
+
+ /*
+ * Ok, the hugepage was dissolved. Now these pages are
+ * Buddy and cannot be re-allocated because they are
+ * isolated. Fall-through as the check below handles
+ * Buddy pages.
+ */
+ }
+
/*
* Skip if free. We read page order here without zone lock
* which is generally unsafe, but the race window is small and
@@ -880,7 +932,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* potential isolation targets.
*/
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
/*
* Without lock, we cannot be sure that what we got is
@@ -894,12 +946,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/*
* Regardless of being on LRU, compound pages such as THP and
- * hugetlbfs are not to be compacted. We can potentially save
- * a lot of iterations if we skip them at once. The check is
- * racy, but we can consider only valid values and the only
- * danger is skipping too much.
+ * hugetlbfs are not to be compacted unless we are attempting
+ * an allocation much larger than the huge page size (e.g. CMA).
+ * We can potentially save a lot of iterations if we skip them
+ * at once. The check is racy, but we can consider only valid
+ * values and the only danger is skipping too much.
*/
- if (PageCompound(page)) {
+ if (PageCompound(page) && !cc->alloc_contig) {
const unsigned int order = compound_order(page);
if (likely(order < MAX_ORDER))
@@ -920,12 +973,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock,
- flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
- if (!isolate_movable_page(page, isolate_mode))
+ if (!isolate_movable_page(page, mode))
goto isolate_success;
}
@@ -937,21 +989,80 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
- if (!page_mapping(page) &&
- page_count(page) > page_mapcount(page))
+ mapping = page_mapping(page);
+ if (!mapping && page_count(page) > page_mapcount(page))
goto isolate_fail;
/*
* Only allow to migrate anonymous pages in GFP_NOFS context
* because those do not depend on fs locks.
*/
- if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
+ if (!(cc->gfp_mask & __GFP_FS) && mapping)
+ goto isolate_fail;
+
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ if (unlikely(!get_page_unless_zero(page)))
goto isolate_fail;
+ /* Only take pages on LRU: a check now makes later tests safe */
+ if (!PageLRU(page))
+ goto isolate_fail_put;
+
+ /* Compaction might skip unevictable pages but CMA takes them */
+ if (!(mode & ISOLATE_UNEVICTABLE) && PageUnevictable(page))
+ goto isolate_fail_put;
+
+ /*
+ * To minimise LRU disruption, the caller can indicate with
+ * ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages
+ * it will be able to migrate without blocking - clean pages
+ * for the most part. PageWriteback would require blocking.
+ */
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && PageWriteback(page))
+ goto isolate_fail_put;
+
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && PageDirty(page)) {
+ bool migrate_dirty;
+
+ /*
+ * Only pages without mappings or that have a
+ * ->migrate_folio callback are possible to migrate
+ * without blocking. However, we can be racing with
+ * truncation so it's necessary to lock the page
+ * to stabilise the mapping as truncation holds
+ * the page lock until after the page is removed
+ * from the page cache.
+ */
+ if (!trylock_page(page))
+ goto isolate_fail_put;
+
+ mapping = page_mapping(page);
+ migrate_dirty = !mapping ||
+ mapping->a_ops->migrate_folio;
+ unlock_page(page);
+ if (!migrate_dirty)
+ goto isolate_fail_put;
+ }
+
+ /* Try isolate the page */
+ if (!TestClearPageLRU(page))
+ goto isolate_fail_put;
+
+ lruvec = folio_lruvec(page_folio(page));
+
/* If we already hold the lock, we can skip some rechecking */
- if (!locked) {
- locked = compact_lock_irqsave(&pgdat->lru_lock,
- &flags, cc);
+ if (lruvec != locked) {
+ if (locked)
+ unlock_page_lruvec_irqrestore(locked, flags);
+
+ compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+ locked = lruvec;
+
+ lruvec_memcg_debug(lruvec, page_folio(page));
/* Try get exclusive access under lock */
if (!skip_updated) {
@@ -960,38 +1071,34 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
goto isolate_abort;
}
- /* Recheck PageLRU and PageCompound under lock */
- if (!PageLRU(page))
- goto isolate_fail;
-
/*
* Page become compound since the non-locked check,
* and it's on LRU. It can only be a THP so the order
* is safe to read and it's 0 for tail pages.
*/
- if (unlikely(PageCompound(page))) {
+ if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
low_pfn += compound_nr(page) - 1;
- goto isolate_fail;
+ SetPageLRU(page);
+ goto isolate_fail_put;
}
}
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- /* Try isolate the page */
- if (__isolate_lru_page(page, isolate_mode) != 0)
- goto isolate_fail;
-
- VM_BUG_ON_PAGE(PageCompound(page), page);
+ /* The whole page is taken off the LRU; skip the tail pages. */
+ if (PageCompound(page))
+ low_pfn += compound_nr(page) - 1;
/* Successfully isolated */
- del_page_from_lru_list(page, lruvec, page_lru(page));
- inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_cache(page));
+ del_page_from_lru_list(page, lruvec);
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
+ thp_nr_pages(page));
isolate_success:
list_add(&page->lru, &cc->migratepages);
- cc->nr_migratepages++;
- nr_isolated++;
+isolate_success_no_list:
+ cc->nr_migratepages += compound_nr(page);
+ nr_isolated += compound_nr(page);
+ nr_scanned += compound_nr(page) - 1;
/*
* Avoid isolating too much unless this block is being
@@ -999,15 +1106,24 @@ isolate_success:
* or a lock is contended. For contention, isolate quickly to
* potentially remove one source of contention.
*/
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
+ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
!cc->rescan && !cc->contended) {
++low_pfn;
break;
}
continue;
+
+isolate_fail_put:
+ /* Avoid potential deadlock in freeing page under lru_lock */
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+ put_page(page);
+
isolate_fail:
- if (!skip_on_failure)
+ if (!skip_on_failure && ret != -ENOMEM)
continue;
/*
@@ -1017,8 +1133,8 @@ isolate_fail:
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
@@ -1033,6 +1149,9 @@ isolate_fail:
*/
next_skip_pfn += 1UL << cc->order;
}
+
+ if (ret == -ENOMEM)
+ break;
}
/*
@@ -1042,9 +1161,15 @@ isolate_fail:
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
+ page = NULL;
+
isolate_abort:
if (locked)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(locked, flags);
+ if (page) {
+ SetPageLRU(page);
+ put_page(page);
+ }
/*
* Updated the cached scanner pfn once the pageblock has been scanned
@@ -1068,7 +1193,9 @@ fatal_pending:
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
- return low_pfn;
+ cc->migrate_pfn = low_pfn;
+
+ return ret;
}
/**
@@ -1077,15 +1204,15 @@ fatal_pending:
* @start_pfn: The first PFN to start isolating.
* @end_pfn: The one-past-last PFN.
*
- * Returns zero if isolation fails fatally due to e.g. pending signal.
- * Otherwise, function returns one-past-the-last PFN of isolated page
- * (which may be greater than end_pfn if end fell in a middle of a THP page).
+ * Returns -EAGAIN when contended, -EINTR in case of a signal pending, -ENOMEM
+ * in case we could not allocate a page, or 0.
*/
-unsigned long
+int
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long pfn, block_start_pfn, block_end_pfn;
+ int ret = 0;
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
@@ -1104,17 +1231,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
block_end_pfn, cc->zone))
continue;
- pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
- ISOLATE_UNEVICTABLE);
+ ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
+ ISOLATE_UNEVICTABLE);
- if (!pfn)
+ if (ret)
break;
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
break;
}
- return pfn;
+ return ret;
}
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
@@ -1150,7 +1277,7 @@ static bool suitable_migration_target(struct compact_control *cc,
* the only small danger is that we skip a potentially suitable
* pageblock, so it's not worth to check order for valid range.
*/
- if (page_order_unsafe(page) >= pageblock_order)
+ if (buddy_order_unsafe(page) >= pageblock_order)
return false;
}
@@ -1195,8 +1322,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
if (!list_is_last(freelist, &freepage->lru)) {
list_cut_before(&sublist, freelist, &freepage->lru);
- if (!list_empty(&sublist))
- list_splice_tail(&sublist, freelist);
+ list_splice_tail(&sublist, freelist);
}
}
@@ -1213,8 +1339,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
if (!list_is_first(freelist, &freepage->lru)) {
list_cut_position(&sublist, freelist, &freepage->lru);
- if (!list_empty(&sublist))
- list_splice_tail(&sublist, freelist);
+ list_splice_tail(&sublist, freelist);
}
}
@@ -1222,7 +1347,7 @@ static void
fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
{
unsigned long start_pfn, end_pfn;
- struct page *page = pfn_to_page(pfn);
+ struct page *page;
/* Do not search around if there are enough pages already */
if (cc->nr_freepages >= cc->nr_migratepages)
@@ -1233,8 +1358,12 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long
return;
/* Pageblock boundaries */
- start_pfn = pageblock_start_pfn(pfn);
- end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
+ start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn);
+ end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone));
+
+ page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone);
+ if (!page)
+ return;
/* Scan before */
if (start_pfn != pfn) {
@@ -1274,9 +1403,9 @@ static int next_search_order(struct compact_control *cc, int order)
static unsigned long
fast_isolate_freepages(struct compact_control *cc)
{
- unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
+ unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
unsigned int nr_scanned = 0;
- unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
+ unsigned long low_pfn, min_pfn, highest = 0;
unsigned long nr_isolated = 0;
unsigned long distance;
struct page *page = NULL;
@@ -1321,6 +1450,7 @@ fast_isolate_freepages(struct compact_control *cc)
struct page *freepage;
unsigned long flags;
unsigned int order_scanned = 0;
+ unsigned long high_pfn = 0;
if (!area->nr_free)
continue;
@@ -1335,7 +1465,8 @@ fast_isolate_freepages(struct compact_control *cc)
pfn = page_to_pfn(freepage);
if (pfn >= highest)
- highest = pageblock_start_pfn(pfn);
+ highest = max(pageblock_start_pfn(pfn),
+ cc->zone->zone_start_pfn);
if (pfn >= low_pfn) {
cc->fast_search_fail = 0;
@@ -1371,6 +1502,7 @@ fast_isolate_freepages(struct compact_control *cc)
if (__isolate_free_page(page, order)) {
set_page_private(page, order);
nr_isolated = 1 << order;
+ nr_scanned += nr_isolated - 1;
cc->nr_freepages += nr_isolated;
list_add_tail(&page->lru, &cc->freepages);
count_compact_events(COMPACTISOLATED, nr_isolated);
@@ -1384,11 +1516,11 @@ fast_isolate_freepages(struct compact_control *cc)
spin_unlock_irqrestore(&cc->zone->lock, flags);
/*
- * Smaller scan on next order so the total scan ig related
+ * Smaller scan on next order so the total scan is related
* to freelist_scan_limit.
*/
if (order_scanned >= limit)
- limit = min(1U, limit >> 1);
+ limit = max(1U, limit >> 1);
}
if (!page) {
@@ -1396,15 +1528,18 @@ fast_isolate_freepages(struct compact_control *cc)
if (scan_start) {
/*
* Use the highest PFN found above min. If one was
- * not found, be pessemistic for direct compaction
+ * not found, be pessimistic for direct compaction
* and use the min mark.
*/
- if (highest) {
+ if (highest >= min_pfn) {
page = pfn_to_page(highest);
cc->free_pfn = highest;
} else {
if (cc->direct_compaction && pfn_valid(min_pfn)) {
- page = pfn_to_page(min_pfn);
+ page = pageblock_pfn_to_page(min_pfn,
+ min(pageblock_end_pfn(min_pfn),
+ zone_end_pfn(cc->zone)),
+ cc->zone);
cc->free_pfn = min_pfn;
}
}
@@ -1441,7 +1576,7 @@ static void isolate_freepages(struct compact_control *cc)
unsigned int stride;
/* Try a small search of the free lists for a candidate */
- isolate_start_pfn = fast_isolate_freepages(cc);
+ fast_isolate_freepages(cc);
if (cc->nr_freepages)
goto splitmap;
@@ -1452,7 +1587,7 @@ static void isolate_freepages(struct compact_control *cc)
* this pfn aligned down to the pageblock boundary, because we do
* block_start_pfn -= pageblock_nr_pages in the for loop.
* For ending point, take care when isolating in last pageblock of a
- * a zone which ends in the middle of a pageblock.
+ * zone which ends in the middle of a pageblock.
* The low boundary is the end of the pageblock the migration scanner
* is using.
*/
@@ -1478,7 +1613,7 @@ static void isolate_freepages(struct compact_control *cc)
* This can iterate a massively long zone without finding any
* suitable migration targets, so periodically check resched.
*/
- if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ if (!(block_start_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
cond_resched();
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
@@ -1590,7 +1725,7 @@ typedef enum {
* Allow userspace to control policy on scanning the unevictable LRU for
* compactable pages.
*/
-int sysctl_compact_unevictable_allowed __read_mostly = 1;
+int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -1629,6 +1764,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
unsigned long pfn = cc->migrate_pfn;
unsigned long high_pfn;
int order;
+ bool found_block = false;
/* Skip hints are relied on to avoid repeats on the fast search */
if (cc->ignore_skip_hint)
@@ -1671,7 +1807,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
for (order = cc->order - 1;
- order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
+ order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
order--) {
struct free_area *area = &cc->zone->free_area[order];
struct list_head *freelist;
@@ -1686,7 +1822,11 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
list_for_each_entry(freepage, freelist, lru) {
unsigned long free_pfn;
- nr_scanned++;
+ if (nr_scanned++ >= limit) {
+ move_freelist_tail(freelist, freepage);
+ break;
+ }
+
free_pfn = page_to_pfn(freepage);
if (free_pfn < high_pfn) {
/*
@@ -1695,26 +1835,18 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
* the list assumes an entry is deleted, not
* reordered.
*/
- if (get_pageblock_skip(freepage)) {
- if (list_is_last(freelist, &freepage->lru))
- break;
-
+ if (get_pageblock_skip(freepage))
continue;
- }
/* Reorder so that a future search skips recent pages */
move_freelist_tail(freelist, freepage);
update_fast_start_pfn(cc, free_pfn);
pfn = pageblock_start_pfn(free_pfn);
+ if (pfn < cc->zone->zone_start_pfn)
+ pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
- set_pageblock_skip(freepage);
- break;
- }
-
- if (nr_scanned >= limit) {
- cc->fast_search_fail++;
- move_freelist_tail(freelist, freepage);
+ found_block = true;
break;
}
}
@@ -1727,9 +1859,10 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
* If fast scanning failed then use a cached entry for a page block
* that had free pages as the basis for starting a linear scan.
*/
- if (pfn == cc->migrate_pfn)
+ if (!found_block) {
+ cc->fast_search_fail++;
pfn = reinit_migrate_pfn(cc);
-
+ }
return pfn;
}
@@ -1775,7 +1908,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
*/
for (; block_end_pfn <= cc->free_pfn;
fast_find_block = false,
- low_pfn = block_end_pfn,
+ cc->migrate_pfn = low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
@@ -1784,7 +1917,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
* many pageblocks unsuitable, so periodically check if we
* need to schedule.
*/
- if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ if (!(low_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
cond_resched();
page = pageblock_pfn_to_page(block_start_pfn,
@@ -1799,17 +1932,17 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
* before making it "skip" so other compaction instances do
* not scan the same block.
*/
- if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
+ if (pageblock_aligned(low_pfn) &&
!fast_find_block && !isolation_suitable(cc, page))
continue;
/*
- * For async compaction, also only scan in MOVABLE blocks
- * without huge pages. Async compaction is optimistic to see
- * if the minimum amount of work satisfies the allocation.
- * The cached PFN is updated as it's possible that all
- * remaining blocks between source and target are unsuitable
- * and the compaction scanners fail to meet.
+ * For async direct compaction, only scan the pageblocks of the
+ * same migratetype without huge pages. Async direct compaction
+ * is optimistic to see if the minimum amount of work satisfies
+ * the allocation. The cached PFN is updated as it's possible
+ * that all remaining blocks between source and target are
+ * unsuitable and the compaction scanners fail to meet.
*/
if (!suitable_migration_source(cc, page)) {
update_cached_migrate(cc, block_end_pfn);
@@ -1817,10 +1950,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
}
/* Perform the isolation */
- low_pfn = isolate_migratepages_block(cc, low_pfn,
- block_end_pfn, isolate_mode);
-
- if (!low_pfn)
+ if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
+ isolate_mode))
return ISOLATE_ABORT;
/*
@@ -1831,9 +1962,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
break;
}
- /* Record where migration scanner will be restarted. */
- cc->migrate_pfn = low_pfn;
-
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
@@ -1846,6 +1974,96 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
+/*
+ * Determine whether kswapd is (or recently was!) running on this node.
+ *
+ * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
+ * zero it.
+ */
+static bool kswapd_is_running(pg_data_t *pgdat)
+{
+ bool running;
+
+ pgdat_kswapd_lock(pgdat);
+ running = pgdat->kswapd && task_is_running(pgdat->kswapd);
+ pgdat_kswapd_unlock(pgdat);
+
+ return running;
+}
+
+/*
+ * A zone's fragmentation score is the external fragmentation with respect
+ * to the COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
+ */
+static unsigned int fragmentation_score_zone(struct zone *zone)
+{
+ return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+}
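extfrag_for_order() (mm/vmstat.c, at the time of this patch) computes roughly (free_pages - (free_blocks_suitable << order)) * 100 / free_pages. A worked example with illustrative numbers:

#include <stdio.h>

int main(void)
{
	unsigned long free_pages = 10240;
	unsigned long free_blocks_suitable = 4;	/* order-9 blocks available */
	unsigned int order = 9;			/* COMPACTION_HPAGE_ORDER, assumed */

	unsigned long score = (free_pages - (free_blocks_suitable << order))
			       * 100 / free_pages;

	printf("fragmentation score: %lu\n", score);	/* prints 80 */
	return 0;
}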
+
+/*
+ * A weighted zone's fragmentation score is the external fragmentation
+ * with respect to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It
+ * returns a value in the range [0, 100].
+ *
+ * The scaling factor ensures that proactive compaction focuses on larger
+ * zones like ZONE_NORMAL, rather than smaller, specialized zones like
+ * ZONE_DMA32. For smaller zones, the score value remains close to zero,
+ * and thus never exceeds the high threshold for proactive compaction.
+ */
+static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
+{
+ unsigned long score;
+
+ score = zone->present_pages * fragmentation_score_zone(zone);
+ return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
+}
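A quick numeric check of the weighting, with illustrative zone sizes:

#include <stdio.h>

int main(void)
{
	/* Illustrative sizes: ZONE_NORMAL holds most of the node's memory. */
	unsigned long normal_present = 3UL << 20;	/* ~12 GiB of 4 KiB pages */
	unsigned long dma32_present  = 1UL << 18;	/* ~1 GiB */
	unsigned long node_present   = normal_present + dma32_present;

	unsigned int raw_score = 80;	/* same raw extfrag in both zones */

	printf("normal: %lu\n", normal_present * raw_score / (node_present + 1));
	printf("dma32:  %lu\n", dma32_present * raw_score / (node_present + 1));
	/* normal ~73, dma32 ~6: small zones barely move the node score */
	return 0;
}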
+
+/*
+ * The per-node proactive (background) compaction process is started by its
+ * corresponding kcompactd thread when the node's fragmentation score
+ * exceeds the high threshold. The compaction process remains active till
+ * the node's score falls below the low threshold, or one of the back-off
+ * conditions is met.
+ */
+static unsigned int fragmentation_score_node(pg_data_t *pgdat)
+{
+ unsigned int score = 0;
+ int zoneid;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone;
+
+ zone = &pgdat->node_zones[zoneid];
+ score += fragmentation_score_zone_weighted(zone);
+ }
+
+ return score;
+}
+
+static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+{
+ unsigned int wmark_low;
+
+ /*
+ * Cap the low watermark to avoid excessive compaction
+ * activity in case a user sets the proactiveness tunable
+ * close to 100 (maximum).
+ */
+ wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
+ return low ? wmark_low : min(wmark_low + 10, 100U);
+}
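With the default proactiveness of 20 (set later in this patch), the marks come out to 80/90; a quick check of the formula, including the cap:

#include <stdio.h>

static unsigned int wmark(unsigned int proactiveness, int low)
{
	unsigned int wmark_low = 100U - proactiveness;

	if (wmark_low < 5U)
		wmark_low = 5U;		/* cap, as in the patch */
	if (low)
		return wmark_low;
	return wmark_low + 10 > 100U ? 100U : wmark_low + 10;
}

int main(void)
{
	printf("proactiveness 20: low %u, high %u\n", wmark(20, 1), wmark(20, 0));
	printf("proactiveness 98: low %u, high %u\n", wmark(98, 1), wmark(98, 0));
	return 0;	/* 20 -> 80/90; 98 -> 5/15 (the cap at work) */
}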
+
+static bool should_proactive_compact_node(pg_data_t *pgdat)
+{
+ int wmark_high;
+
+ if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
+ return false;
+
+ wmark_high = fragmentation_score_wmark(pgdat, false);
+ return fragmentation_score_node(pgdat) > wmark_high;
+}
+
static enum compact_result __compact_finished(struct compact_control *cc)
{
unsigned int order;
@@ -1872,6 +2090,25 @@ static enum compact_result __compact_finished(struct compact_control *cc)
return COMPACT_PARTIAL_SKIPPED;
}
+ if (cc->proactive_compaction) {
+ int score, wmark_low;
+ pg_data_t *pgdat;
+
+ pgdat = cc->zone->zone_pgdat;
+ if (kswapd_is_running(pgdat))
+ return COMPACT_PARTIAL_SKIPPED;
+
+ score = fragmentation_score_zone(cc->zone);
+ wmark_low = fragmentation_score_wmark(pgdat, true);
+
+ if (score > wmark_low)
+ ret = COMPACT_CONTINUE;
+ else
+ ret = COMPACT_SUCCESS;
+
+ goto out;
+ }
+
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
@@ -1881,7 +2118,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* migration source is unmovable/reclaimable but it's not worth
* special casing.
*/
- if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ if (!pageblock_aligned(cc->migrate_pfn))
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
@@ -1905,31 +2142,19 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* other migratetype buddy lists.
*/
if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1) {
-
- /* movable pages are OK in any pageblock */
- if (migratetype == MIGRATE_MOVABLE)
- return COMPACT_SUCCESS;
-
+ true, &can_steal) != -1)
/*
- * We are stealing for a non-movable allocation. Make
- * sure we finish compacting the current pageblock
- * first so it is as free as possible and we won't
- * have to steal another one soon. This only applies
- * to sync compaction, as async compaction operates
- * on pageblocks of the same migratetype.
+ * Movable pages are OK in any pageblock. If we are
+ * stealing for a non-movable allocation, make sure
+ * we finish compacting the current pageblock first
+ * (which is assured by the above migrate_pfn align
+ * check) so it is as free as possible and we won't
+ * have to steal another one soon.
*/
- if (cc->mode == MIGRATE_ASYNC ||
- IS_ALIGNED(cc->migrate_pfn,
- pageblock_nr_pages)) {
- return COMPACT_SUCCESS;
- }
-
- ret = COMPACT_CONTINUE;
- break;
- }
+ return COMPACT_SUCCESS;
}
+out:
if (cc->contended || fatal_signal_pending(current))
ret = COMPACT_CONTENDED;
@@ -1948,16 +2173,9 @@ static enum compact_result compact_finished(struct compact_control *cc)
return ret;
}
-/*
- * compaction_suitable: Is this suitable to run compaction on this zone now?
- * Returns
- * COMPACT_SKIPPED - If there are too few free pages for compaction
- * COMPACT_SUCCESS - If the allocation would succeed without compaction
- * COMPACT_CONTINUE - If compaction should run now
- */
static enum compact_result __compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
- int classzone_idx,
+ int highest_zoneidx,
unsigned long wmark_target)
{
unsigned long watermark;
@@ -1970,7 +2188,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* If watermarks for high-order allocation are already met, there
* should be no need for compaction at all.
*/
- if (zone_watermark_ok(zone, order, watermark, classzone_idx,
+ if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
alloc_flags))
return COMPACT_SUCCESS;
@@ -1980,9 +2198,9 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* watermark and alloc_flags have to match, or be more pessimistic than
* the check in __isolate_free_page(). We don't use the direct
* compactor's alloc_flags, as they are not relevant for freepage
- * isolation. We however do use the direct compactor's classzone_idx to
- * skip over zones where lowmem reserves would prevent allocation even
- * if compaction succeeds.
+ * isolation. We however do use the direct compactor's highest_zoneidx
+ * to skip over zones where lowmem reserves would prevent allocation
+ * even if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
* ALLOC_CMA is used, as pages in CMA pageblocks are considered
@@ -1991,21 +2209,28 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
- if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
+ if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
ALLOC_CMA, wmark_target))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
}
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ * COMPACT_SKIPPED - If there are too few free pages for compaction
+ * COMPACT_SUCCESS - If the allocation would succeed without compaction
+ * COMPACT_CONTINUE - If compaction should run now
+ */
enum compact_result compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
- int classzone_idx)
+ int highest_zoneidx)
{
enum compact_result ret;
int fragindex;
- ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
+ ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2046,8 +2271,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
* Make sure at least one zone would pass __compaction_suitable if we continue
* retrying the reclaim.
*/
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
enum compact_result compact_result;
@@ -2060,8 +2285,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
compact_result = __compaction_suitable(zone, order, alloc_flags,
- ac_classzone_idx(ac), available);
- if (compact_result != COMPACT_SKIPPED)
+ ac->highest_zoneidx, available);
+ if (compact_result == COMPACT_CONTINUE)
return true;
}
@@ -2077,6 +2302,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
+ unsigned int nr_succeeded = 0;
/*
* These counters track activities during zone compaction. Initialize
@@ -2089,9 +2315,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
INIT_LIST_HEAD(&cc->freepages);
INIT_LIST_HEAD(&cc->migratepages);
- cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
+ cc->migratetype = gfp_migratetype(cc->gfp_mask);
ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
- cc->classzone_idx);
+ cc->highest_zoneidx);
/* Compaction is likely to fail */
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
return ret;
@@ -2146,14 +2372,14 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
update_cached = !sync &&
cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
- trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
- cc->free_pfn, end_pfn, sync);
+ trace_mm_compaction_begin(cc, start_pfn, end_pfn, sync);
- migrate_prep_local();
+ /* lru_add_drain_all() could be expensive, as it involves other CPUs */
+ lru_add_drain();
while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
int err;
- unsigned long start_pfn = cc->migrate_pfn;
+ unsigned long iteration_start_pfn = cc->migrate_pfn;
/*
* Avoid multiple rescans which can happen if a page cannot be
@@ -2165,7 +2391,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
*/
cc->rescan = false;
if (pageblock_start_pfn(last_migrated_pfn) ==
- pageblock_start_pfn(start_pfn)) {
+ pageblock_start_pfn(iteration_start_pfn)) {
cc->rescan = true;
}
@@ -2174,7 +2400,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
- last_migrated_pfn = 0;
goto out;
case ISOLATE_NONE:
if (update_cached) {
@@ -2190,16 +2415,14 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
goto check_drain;
case ISOLATE_SUCCESS:
update_cached = false;
- last_migrated_pfn = start_pfn;
- ;
+ last_migrated_pfn = iteration_start_pfn;
}
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
- MR_COMPACTION);
+ MR_COMPACTION, &nr_succeeded);
- trace_mm_compaction_migratepages(cc->nr_migratepages, err,
- &cc->migratepages);
+ trace_mm_compaction_migratepages(cc, nr_succeeded);
/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;
@@ -2235,15 +2458,11 @@ check_drain:
* would succeed.
*/
if (cc->order > 0 && last_migrated_pfn) {
- int cpu;
unsigned long current_block_start =
block_start_pfn(cc->migrate_pfn, cc->order);
if (last_migrated_pfn < current_block_start) {
- cpu = get_cpu();
- lru_add_drain_cpu(cpu);
- drain_local_pages(cc->zone);
- put_cpu();
+ lru_add_drain_cpu_zone(cc->zone);
/* No more flushing until we migrate again */
last_migrated_pfn = 0;
}
@@ -2279,15 +2498,14 @@ out:
count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
- trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
- cc->free_pfn, end_pfn, sync, ret);
+ trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);
return ret;
}
static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum compact_priority prio,
- unsigned int alloc_flags, int classzone_idx,
+ unsigned int alloc_flags, int highest_zoneidx,
struct page **capture)
{
enum compact_result ret;
@@ -2299,7 +2517,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.mode = (prio == COMPACT_PRIO_ASYNC) ?
MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
.alloc_flags = alloc_flags,
- .classzone_idx = classzone_idx,
+ .highest_zoneidx = highest_zoneidx,
.direct_compaction = true,
.whole_zone = (prio == MIN_COMPACT_PRIORITY),
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
@@ -2310,16 +2528,34 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.page = NULL,
};
- if (capture)
- current->capture_control = &capc;
+ /*
+ * Make sure the structs are really initialized before we expose the
+ * capture control, in case we are interrupted and the interrupt handler
+ * frees a page.
+ */
+ barrier();
+ WRITE_ONCE(current->capture_control, &capc);
ret = compact_zone(&cc, &capc);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
- *capture = capc.page;
- current->capture_control = NULL;
+ /*
+ * Make sure we hide capture control first before we read the captured
+ * page pointer, otherwise an interrupt could free and capture a page
+ * and we would leak it.
+ */
+ WRITE_ONCE(current->capture_control, NULL);
+ *capture = READ_ONCE(capc.page);
+ /*
+ * Technically, it is also possible that compaction is skipped but
+ * the page is still captured out of luck (an IRQ came and freed the page).
+ * Returning COMPACT_SUCCESS in such cases helps to properly account
+ * the COMPACT[STALL|FAIL] events when compaction is skipped.
+ */
+ if (*capture)
+ ret = COMPACT_SUCCESS;
return ret;
}
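The consumer of capture_control is the page allocator's free path (compaction_capture() in mm/page_alloc.c), which can run from IRQ context on this CPU; the model below is a loose, illustrative sketch of why capc must be fully initialized before it is published (all names here are made up):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct compact_control_model { int order; };
struct capture_control_model {
	struct compact_control_model *cc;
	void *page;
};

/* The consumer dereferences capc->cc as soon as capc is visible, which is
 * why the barrier() above must order the struct stores before the publish. */
static bool capture_model(struct capture_control_model *capc, void *page,
			  int order)
{
	if (!capc || order != capc->cc->order)
		return false;		/* not what the compactor asked for */
	capc->page = page;		/* hand the page over instead of freeing */
	return true;
}

int main(void)
{
	struct compact_control_model cc = { .order = 4 };
	struct capture_control_model capc = { .cc = &cc, .page = NULL };
	int dummy_page;

	assert(!capture_model(NULL, &dummy_page, 4));	/* not published yet */
	assert(capture_model(&capc, &dummy_page, 4));	/* order matches */
	assert(capc.page == &dummy_page);
	return 0;
}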
@@ -2333,6 +2569,7 @@ int sysctl_extfrag_threshold = 500;
* @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation
* @prio: Determines how hard direct compaction should try to succeed
+ * @capture: Pointer to free page created by compaction will be stored here
*
* This is the main entry point for direct page compaction.
*/
@@ -2340,7 +2577,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, struct page **capture)
{
- int may_perform_io = gfp_mask & __GFP_IO;
+ int may_perform_io = (__force int)(gfp_mask & __GFP_IO);
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
@@ -2355,8 +2592,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
/* Compact each zone in the list */
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
enum compact_result status;
if (prio > MIN_COMPACT_PRIORITY
@@ -2366,7 +2603,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
}
status = compact_zone_order(zone, order, gfp_mask, prio,
- alloc_flags, ac_classzone_idx(ac), capture);
+ alloc_flags, ac->highest_zoneidx, capture);
rc = max(status, rc);
/* The allocation should succeed, stop compacting */
@@ -2404,6 +2641,41 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
return rc;
}
+/*
+ * Compact all zones within a node till each zone's fragmentation score
+ * reaches within proactive compaction thresholds (as determined by the
+ * proactiveness tunable).
+ *
+ * It is possible that the function returns before reaching score targets
+ * due to various back-off conditions, such as contention on per-node or
+ * per-zone locks.
+ */
+static void proactive_compact_node(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = -1,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+ .whole_zone = true,
+ .gfp_mask = GFP_KERNEL,
+ .proactive_compaction = true,
+ };
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ cc.zone = zone;
+
+ compact_zone(&cc, NULL);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+}
/* Compact all zones within a node */
static void compact_node(int nid)
@@ -2447,15 +2719,43 @@ static void compact_nodes(void)
compact_node(nid);
}
-/* The written value is actually unused, all memory is compacted */
-int sysctl_compact_memory;
+/*
+ * Tunable for proactive compaction. It determines how
+ * aggressively the kernel should compact memory in the
+ * background. It takes values in the range [0, 100].
+ */
+unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+
+int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ int rc, nid;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write && sysctl_compaction_proactiveness) {
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ if (pgdat->proactive_compact_trigger)
+ continue;
+
+ pgdat->proactive_compact_trigger = true;
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+ }
+ }
+
+ return 0;
+}
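The tunable is exposed as /proc/sys/vm/compaction_proactiveness (vm.compaction_proactiveness via sysctl); a minimal userspace snippet to raise it:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/compaction_proactiveness", "w");

	if (!f) {
		perror("compaction_proactiveness");
		return 1;
	}
	fprintf(f, "30\n");	/* any nonzero write also kicks kcompactd once */
	fclose(f);
	return 0;
}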
/*
* This is the entry point for compacting all nodes via
* /proc/sys/vm/compact_memory
*/
int sysctl_compaction_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
if (write)
compact_nodes();
@@ -2464,9 +2764,9 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
-static ssize_t sysfs_compact_node(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t compact_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
int nid = dev->id;
@@ -2479,7 +2779,7 @@ static ssize_t sysfs_compact_node(struct device *dev,
return count;
}
-static DEVICE_ATTR(compact, 0200, NULL, sysfs_compact_node);
+static DEVICE_ATTR_WO(compact);
int compaction_register_node(struct node *node)
{
@@ -2494,23 +2794,24 @@ void compaction_unregister_node(struct node *node)
static inline bool kcompactd_work_requested(pg_data_t *pgdat)
{
- return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+ return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
+ pgdat->proactive_compact_trigger;
}
static bool kcompactd_node_suitable(pg_data_t *pgdat)
{
int zoneid;
struct zone *zone;
- enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+ enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
- for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
+ for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
zone = &pgdat->node_zones[zoneid];
if (!populated_zone(zone))
continue;
if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
- classzone_idx) == COMPACT_CONTINUE)
+ highest_zoneidx) == COMPACT_CONTINUE)
return true;
}
@@ -2528,16 +2829,16 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
.search_order = pgdat->kcompactd_max_order,
- .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .highest_zoneidx = pgdat->kcompactd_highest_zoneidx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
.gfp_mask = GFP_KERNEL,
};
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
- cc.classzone_idx);
+ cc.highest_zoneidx);
count_compact_event(KCOMPACTD_WAKE);
- for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
+ for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) {
int status;
zone = &pgdat->node_zones[zoneid];
@@ -2586,16 +2887,16 @@ static void kcompactd_do_work(pg_data_t *pgdat)
/*
* Regardless of success, we are done until woken up next. But remember
- * the requested order/classzone_idx in case it was higher/tighter than
- * our current ones
+ * the requested order/highest_zoneidx in case it was higher/tighter
+ * than our current ones
*/
if (pgdat->kcompactd_max_order <= cc.order)
pgdat->kcompactd_max_order = 0;
- if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
- pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+ if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx)
+ pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
}
-void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
{
if (!order)
return;
@@ -2603,8 +2904,8 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kcompactd_max_order < order)
pgdat->kcompactd_max_order = order;
- if (pgdat->kcompactd_classzone_idx > classzone_idx)
- pgdat->kcompactd_classzone_idx = classzone_idx;
+ if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx)
+ pgdat->kcompactd_highest_zoneidx = highest_zoneidx;
/*
* Pairs with implicit barrier in wait_event_freezable()
@@ -2617,7 +2918,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
return;
trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
- classzone_idx);
+ highest_zoneidx);
wake_up_interruptible(&pgdat->kcompactd_wait);
}
@@ -2627,8 +2928,10 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
*/
static int kcompactd(void *p)
{
- pg_data_t *pgdat = (pg_data_t*)p;
+ pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
+ long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
+ long timeout = default_timeout;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -2638,18 +2941,57 @@ static int kcompactd(void *p)
set_freezable();
pgdat->kcompactd_max_order = 0;
- pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+ pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
while (!kthread_should_stop()) {
unsigned long pflags;
+ /*
+ * Avoid the unnecessary wakeup for proactive compaction
+ * when it is disabled.
+ */
+ if (!sysctl_compaction_proactiveness)
+ timeout = MAX_SCHEDULE_TIMEOUT;
trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
- wait_event_freezable(pgdat->kcompactd_wait,
- kcompactd_work_requested(pgdat));
+ if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat), timeout) &&
+ !pgdat->proactive_compact_trigger) {
+
+ psi_memstall_enter(&pflags);
+ kcompactd_do_work(pgdat);
+ psi_memstall_leave(&pflags);
+ /*
+ * Reset the timeout value. The defer timeout from
+ * proactive compaction is lost here but that is fine
+ * as, when the condition of the zone has changed substantially,
+ * then carrying on with the previous defer interval is
+ * not useful.
+ */
+ timeout = default_timeout;
+ continue;
+ }
+
+ /*
+ * Start the proactive work with default timeout. Based
+ * on the fragmentation score, this timeout is updated.
+ */
+ timeout = default_timeout;
+ if (should_proactive_compact_node(pgdat)) {
+ unsigned int prev_score, score;
- psi_memstall_enter(&pflags);
- kcompactd_do_work(pgdat);
- psi_memstall_leave(&pflags);
+ prev_score = fragmentation_score_node(pgdat);
+ proactive_compact_node(pgdat);
+ score = fragmentation_score_node(pgdat);
+ /*
+ * Defer proactive compaction if the fragmentation
+ * score did not go down i.e. no progress made.
+ */
+ if (unlikely(score >= prev_score))
+ timeout =
+ default_timeout << COMPACT_MAX_DEFER_SHIFT;
+ }
+ if (unlikely(pgdat->proactive_compact_trigger))
+ pgdat->proactive_compact_trigger = false;
}
return 0;
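Resulting timing, as a quick check: the default wakeup interval is 500 ms, and a no-progress proactive run backs off by COMPACT_MAX_DEFER_SHIFT doublings (assumed 6, per include/linux/mmzone.h):

#include <stdio.h>

#define HPAGE_FRAG_CHECK_INTERVAL_MSEC 500
#define COMPACT_MAX_DEFER_SHIFT 6	/* assumed, per include/linux/mmzone.h */

int main(void)
{
	long deferred = (long)HPAGE_FRAG_CHECK_INTERVAL_MSEC
			<< COMPACT_MAX_DEFER_SHIFT;

	printf("deferred interval: %ld ms (%.0f s)\n",
	       deferred, deferred / 1000.0);	/* 32000 ms = 32 s */
	return 0;
}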
@@ -2659,26 +3001,23 @@ static int kcompactd(void *p)
* This kcompactd start function will be called by init and node-hot-add.
* On node-hot-add, kcompactd will be moved to proper cpus if cpus are hot-added.
*/
-int kcompactd_run(int nid)
+void kcompactd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- int ret = 0;
if (pgdat->kcompactd)
- return 0;
+ return;
pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
if (IS_ERR(pgdat->kcompactd)) {
pr_err("Failed to start kcompactd on node %d\n", nid);
- ret = PTR_ERR(pgdat->kcompactd);
pgdat->kcompactd = NULL;
}
- return ret;
}
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
*/
void kcompactd_stop(int nid)
{
@@ -2708,7 +3047,8 @@ static int kcompactd_cpu_online(unsigned int cpu)
if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
/* One of our CPUs online: restore mask */
- set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ if (pgdat->kcompactd)
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
}
return 0;
}