aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorHuang Ying <ying.huang@intel.com>2023-10-16 13:29:56 +0800
committerAndrew Morton <akpm@linux-foundation.org>2023-10-25 16:47:10 -0700
commit362d37a106dd3f6431b2fdd91d9208b0d023b50d (patch)
treef51d1db4a765d1594fd34b62a96ca872d8e639da
parentcacheinfo: calculate size of per-CPU data cache slice (diff)
downloadwireguard-linux-362d37a106dd3f6431b2fdd91d9208b0d023b50d.tar.xz
wireguard-linux-362d37a106dd3f6431b2fdd91d9208b0d023b50d.zip
mm, pcp: reduce lock contention for draining high-order pages
In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when PCP is mostly used for high-order pages freeing to improve the cache-hot pages reusing between page allocating and freeing CPUs. On system with small per-CPU data cache slice, pages shouldn't be cached before draining to guarantee cache-hot. But on a system with large per-CPU data cache slice, some pages can be cached before draining to reduce zone lock contention. So, in this patch, instead of draining without any caching, "pcp->batch" pages will be cached in PCP before draining if the size of the per-CPU data cache slice is more than "3 * batch". In theory, if the size of per-CPU data cache slice is more than "2 * batch", we can reuse cache-hot pages between CPUs. But considering the other usage of cache (code, other data accessing, etc.), "3 * batch" is used. Note: "3 * batch" is chosen to make sure the optimization works on recent x86_64 server CPUs. If you want to increase it, please check whether it breaks the optimization. On a 2-socket Intel server with 128 logical CPU, with the patch, the network bandwidth of the UNIX (AF_UNIX) test case of lmbench test suite with 16-pair processes increase 70.5%. The cycles% of the spinlock contention (mostly for zone lock) decreases from 46.1% to 21.3%. The number of PCP draining for high order pages freeing (free_high) decreases 89.9%. The cache miss rate keeps 0.2%. Link: https://lkml.kernel.org/r/20231016053002.756205-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" <ying.huang@intel.com> Acked-by: Mel Gorman <mgorman@techsingularity.net> Cc: Sudeep Holla <sudeep.holla@arm.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: David Hildenbrand <david@redhat.com> Cc: Johannes Weiner <jweiner@redhat.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Christoph Lameter <cl@linux.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-rw-r--r--drivers/base/cacheinfo.c2
-rw-r--r--include/linux/gfp.h1
-rw-r--r--include/linux/mmzone.h6
-rw-r--r--mm/page_alloc.c38
4 files changed, 46 insertions, 1 deletions
diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index 585c66fce9d9..f1e79263fe61 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -950,6 +950,7 @@ static int cacheinfo_cpu_online(unsigned int cpu)
if (rc)
goto err;
update_per_cpu_data_slice_size(true, cpu);
+ setup_pcp_cacheinfo();
return 0;
err:
free_cache_attributes(cpu);
@@ -963,6 +964,7 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu)
free_cache_attributes(cpu);
update_per_cpu_data_slice_size(false, cpu);
+ setup_pcp_cacheinfo();
return 0;
}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 665f06675c83..665edc11fb9f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -325,6 +325,7 @@ void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
void page_alloc_init_late(void);
+void setup_pcp_cacheinfo(void);
/*
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index de313f1c15f9..efe72b3f7872 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -680,8 +680,14 @@ enum zone_watermarks {
* PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the
* previous page freeing. To avoid to drain PCP for an accident
* high-order page freeing.
+ *
+ * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before
+ * draining PCP for consecutive high-order pages freeing without
+ * allocation if data cache slice of CPU is large enough. To reduce
+ * zone lock contention and keep cache-hot pages reusing.
*/
#define PCPF_PREV_FREE_HIGH_ORDER BIT(0)
+#define PCPF_FREE_HIGH_BATCH BIT(1)
struct per_cpu_pages {
spinlock_t lock; /* Protects lists field */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index de547ef9a9ad..b76b1de48a30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -52,6 +52,7 @@
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
+#include <linux/cacheinfo.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
@@ -2385,7 +2386,9 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
*/
if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
free_high = (pcp->free_factor &&
- (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER));
+ (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
+ (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
+ pcp->count >= READ_ONCE(pcp->batch)));
pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
@@ -5418,6 +5421,39 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
mutex_unlock(&pcp_batch_high_lock);
}
+static void zone_pcp_update_cacheinfo(struct zone *zone)
+{
+ int cpu;
+ struct per_cpu_pages *pcp;
+ struct cpu_cacheinfo *cci;
+
+ for_each_online_cpu(cpu) {
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ cci = get_cpu_cacheinfo(cpu);
+ /*
+ * If data cache slice of CPU is large enough, "pcp->batch"
+ * pages can be preserved in PCP before draining PCP for
+ * consecutive high-order pages freeing without allocation.
+ * This can reduce zone lock contention without hurting
+ * cache-hot pages sharing.
+ */
+ spin_lock(&pcp->lock);
+ if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+ pcp->flags |= PCPF_FREE_HIGH_BATCH;
+ else
+ pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+ spin_unlock(&pcp->lock);
+ }
+}
+
+void setup_pcp_cacheinfo(void)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ zone_pcp_update_cacheinfo(zone);
+}
+
/*
* Allocate per cpu pagesets and initialize them.
* Before this call only boot pagesets were available.