Diffstat (limited to 'arch/x86/kernel/cpu/resctrl/monitor.c')
-rw-r--r--  arch/x86/kernel/cpu/resctrl/monitor.c  346
1 file changed, 253 insertions(+), 93 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 773124b0e18a..efe0c30d3a12 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -16,8 +16,12 @@
*/
#include <linux/module.h>
+#include <linux/sizes.h>
#include <linux/slab.h>
+
#include <asm/cpu_device_id.h>
+#include <asm/resctrl.h>
+
#include "internal.h"
struct rmid_entry {
@@ -37,8 +41,8 @@ static LIST_HEAD(rmid_free_lru);
* @rmid_limbo_count count of currently unused but (potentially)
* dirty RMIDs.
* This counts RMIDs that no one is currently using but that
- * may have a occupancy value > intel_cqm_threshold. User can change
- * the threshold occupancy value.
+ * may have an occupancy value > resctrl_rmid_realloc_threshold. User can
+ * change the threshold occupancy value.
*/
static unsigned int rmid_limbo_count;
@@ -59,10 +63,78 @@ bool rdt_mon_capable;
unsigned int rdt_mon_features;
/*
- * This is the threshold cache occupancy at which we will consider an
+ * This is the threshold cache occupancy in bytes at which we will consider an
* RMID available for re-allocation.
*/
-unsigned int resctrl_cqm_threshold;
+unsigned int resctrl_rmid_realloc_threshold;
+
+/*
+ * This is the maximum value for the reallocation threshold, in bytes.
+ */
+unsigned int resctrl_rmid_realloc_limit;
+
+#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+
+/*
+ * The correction factor table is documented in Documentation/x86/resctrl.rst.
+ * If rmid > rmid threshold, MBM total and local values should be multiplied
+ * by the correction factor.
+ *
+ * The original table is modified for better code:
+ *
+ * 1. The threshold 0 is changed to rmid count - 1 so don't do correction
+ * for the case.
+ * 2. MBM total and local correction table indexed by core counter which is
+ * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27.
+ * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
+ * to calculate corrected value by shifting:
+ * corrected_value = (original_value * correction_factor) >> 20
+ */
+static const struct mbm_correction_factor_table {
+ u32 rmidthreshold;
+ u64 cf;
+} mbm_cf_table[] __initconst = {
+ {7, CF(1.000000)},
+ {15, CF(1.000000)},
+ {15, CF(0.969650)},
+ {31, CF(1.000000)},
+ {31, CF(1.066667)},
+ {31, CF(0.969650)},
+ {47, CF(1.142857)},
+ {63, CF(1.000000)},
+ {63, CF(1.185115)},
+ {63, CF(1.066553)},
+ {79, CF(1.454545)},
+ {95, CF(1.000000)},
+ {95, CF(1.230769)},
+ {95, CF(1.142857)},
+ {95, CF(1.066667)},
+ {127, CF(1.000000)},
+ {127, CF(1.254863)},
+ {127, CF(1.185255)},
+ {151, CF(1.000000)},
+ {127, CF(1.066667)},
+ {167, CF(1.000000)},
+ {159, CF(1.454334)},
+ {183, CF(1.000000)},
+ {127, CF(0.969744)},
+ {191, CF(1.280246)},
+ {191, CF(1.230921)},
+ {215, CF(1.000000)},
+ {191, CF(1.143118)},
+};
+
+static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+static u64 mbm_cf __read_mostly;
+
+static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
+{
+ /* Correct MBM value. */
+ if (rmid > mbm_cf_rmidthreshold)
+ val = (val * mbm_cf) >> 20;
+
+ return val;
+}
static inline struct rmid_entry *__rmid_entry(u32 rmid)
{
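
As an illustration of the fixed-point scheme above (not part of the patch): CF() encodes a
floating-point correction factor as an integer scaled by 2^20 at build time, so
get_corrected_mbm_count() only needs a multiply and a shift at run time. A minimal
user-space sketch, with an arbitrary counter value:

    #include <stdint.h>
    #include <stdio.h>

    /* Same fixed-point encoding as the kernel's CF() macro: factor * 2^20. */
    #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))

    static uint64_t correct(uint64_t val, uint64_t cf)
    {
            /* corrected_value = (original_value * correction_factor) >> 20 */
            return (val * cf) >> 20;
    }

    int main(void)
    {
            uint64_t chunks = 1000000;      /* arbitrary example counter value */
            uint64_t cf = CF(1.185115);     /* one entry from the table above */

            printf("cf=%llu corrected=%llu\n", (unsigned long long)cf,
                   (unsigned long long)correct(chunks, cf));
            return 0;
    }
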
@@ -74,9 +146,54 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid)
return entry;
}
-static u64 __rmid_read(u32 rmid, u32 eventid)
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
+ u32 rmid,
+ enum resctrl_event_id eventid)
+{
+ switch (eventid) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ return NULL;
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return &hw_dom->arch_mbm_total[rmid];
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return &hw_dom->arch_mbm_local[rmid];
+ }
+
+ /* Never expect to get here */
+ WARN_ON_ONCE(1);
+
+ return NULL;
+}
+
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
+ u32 rmid, enum resctrl_event_id eventid)
+{
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct arch_mbm_state *am;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am)
+ memset(am, 0, sizeof(*am));
+}
+
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
- u64 val;
+ u64 shift = 64 - width, chunks;
+
+ chunks = (cur_msr << shift) - (prev_msr << shift);
+ return chunks >> shift;
+}
+
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
+ u32 rmid, enum resctrl_event_id eventid, u64 *val)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct arch_mbm_state *am;
+ u64 msr_val, chunks;
+
+ if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
+ return -EINVAL;
/*
* As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
@@ -87,16 +204,26 @@ static u64 __rmid_read(u32 rmid, u32 eventid)
* are error bits.
*/
wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
- rdmsrl(MSR_IA32_QM_CTR, val);
+ rdmsrl(MSR_IA32_QM_CTR, msr_val);
- return val;
-}
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
-static bool rmid_dirty(struct rmid_entry *entry)
-{
- u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
+ hw_res->mbm_width);
+ chunks = get_corrected_mbm_count(rmid, am->chunks);
+ am->prev_msr = msr_val;
+ } else {
+ chunks = msr_val;
+ }
- return val >= resctrl_cqm_threshold;
+ *val = chunks * hw_res->mon_scale;
+
+ return 0;
}
/*
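
Outside the patch itself, the counter handling above can be seen in isolation: the shift in
mbm_overflow_count() makes the subtraction wrap at the hardware counter width rather than
at 64 bits, and resctrl_arch_rmid_read() folds each raw sample into a free-running chunk
count that is only scaled to bytes at the end. A hedged user-space sketch of that
accumulation pattern (the width and scale values here are made up):

    #include <stdint.h>
    #include <stdio.h>

    struct arch_mbm_state {
            uint64_t chunks;        /* running total in hardware chunks */
            uint64_t prev_msr;      /* last raw counter value seen */
    };

    static uint64_t overflow_count(uint64_t prev, uint64_t cur, unsigned int width)
    {
            uint64_t shift = 64 - width;

            /* Subtract in the top 'width' bits so a counter wrap is handled for free. */
            return ((cur << shift) - (prev << shift)) >> shift;
    }

    /* Fold one raw sample into the state and return the running total in bytes. */
    static uint64_t rmid_read(struct arch_mbm_state *am, uint64_t msr_val,
                              unsigned int width, unsigned int mon_scale)
    {
            am->chunks += overflow_count(am->prev_msr, msr_val, width);
            am->prev_msr = msr_val;

            return am->chunks * mon_scale;
    }

    int main(void)
    {
            struct arch_mbm_state am = { 0 };
            unsigned int width = 24, scale = 64;    /* illustrative values only */

            rmid_read(&am, 0xfffff0, width, scale); /* counter close to the top */
            rmid_read(&am, 0x000010, width, scale); /* counter wrapped past zero */

            /* 0x1000010 chunks: the wrap added 0x20, not a huge bogus delta. */
            printf("chunks=%llu bytes=%llu\n", (unsigned long long)am.chunks,
                   (unsigned long long)(am.chunks * scale));
            return 0;
    }
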
@@ -107,11 +234,11 @@ static bool rmid_dirty(struct rmid_entry *entry)
*/
void __check_limbo(struct rdt_domain *d, bool force_free)
{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
struct rmid_entry *entry;
- struct rdt_resource *r;
u32 crmid = 1, nrmid;
-
- r = &rdt_resources_all[RDT_RESOURCE_L3];
+ bool rmid_dirty;
+ u64 val = 0;
/*
* Skip RMID 0 and start from RMID 1 and check all the RMIDs that
@@ -125,7 +252,15 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
break;
entry = __rmid_entry(nrmid);
- if (force_free || !rmid_dirty(entry)) {
+
+ if (resctrl_arch_rmid_read(r, d, entry->rmid,
+ QOS_L3_OCCUP_EVENT_ID, &val)) {
+ rmid_dirty = true;
+ } else {
+ rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+ }
+
+ if (force_free || !rmid_dirty) {
clear_bit(entry->rmid, d->rmid_busy_llc);
if (!--entry->busy) {
rmid_limbo_count--;
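
To spell out the new limbo policy added above (illustrative only, not kernel API): a failed
resctrl_arch_rmid_read() is treated as "still dirty", so an RMID is only recycled once its
occupancy is known to be below resctrl_rmid_realloc_threshold, or when force_free asks for
it unconditionally.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * err models the return value of resctrl_arch_rmid_read(), val the occupancy
     * in bytes and threshold resctrl_rmid_realloc_threshold.
     */
    static bool limbo_rmid_reusable(int err, uint64_t val, uint64_t threshold,
                                    bool force_free)
    {
            bool rmid_dirty = err ? true : (val >= threshold);

            return force_free || !rmid_dirty;
    }

    int main(void)
    {
            printf("%d\n", limbo_rmid_reusable(0, 4096, 655360, false));    /* 1 */
            printf("%d\n", limbo_rmid_reusable(-5, 0, 655360, false));      /* 0 */
            return 0;
    }
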
@@ -164,19 +299,19 @@ int alloc_rmid(void)
static void add_rmid_to_limbo(struct rmid_entry *entry)
{
- struct rdt_resource *r;
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
struct rdt_domain *d;
- int cpu;
- u64 val;
-
- r = &rdt_resources_all[RDT_RESOURCE_L3];
+ int cpu, err;
+ u64 val = 0;
entry->busy = 0;
cpu = get_cpu();
list_for_each_entry(d, &r->domains, list) {
if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
- val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
- if (val <= resctrl_cqm_threshold)
+ err = resctrl_arch_rmid_read(r, d, entry->rmid,
+ QOS_L3_OCCUP_EVENT_ID,
+ &val);
+ if (err || val <= resctrl_rmid_realloc_threshold)
continue;
}
@@ -214,24 +349,18 @@ void free_rmid(u32 rmid)
list_add_tail(&entry->list, &rmid_free_lru);
}
-static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
-{
- u64 shift = 64 - MBM_CNTR_WIDTH, chunks;
-
- chunks = (cur_msr << shift) - (prev_msr << shift);
- return chunks >>= shift;
-}
-
static int __mon_event_count(u32 rmid, struct rmid_read *rr)
{
struct mbm_state *m;
- u64 chunks, tval;
+ u64 tval = 0;
+
+ if (rr->first)
+ resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
+
+ rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval);
+ if (rr->err)
+ return rr->err;
- tval = __rmid_read(rmid, rr->evtid);
- if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
- rr->val = tval;
- return -EINVAL;
- }
switch (rr->evtid) {
case QOS_L3_OCCUP_EVENT_ID:
rr->val += tval;
@@ -244,50 +373,48 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
break;
default:
/*
- * Code would never reach here because
- * an invalid event id would fail the __rmid_read.
+ * Code would never reach here because an invalid
+ * event id would fail in resctrl_arch_rmid_read().
*/
return -EINVAL;
}
if (rr->first) {
memset(m, 0, sizeof(struct mbm_state));
- m->prev_bw_msr = m->prev_msr = tval;
return 0;
}
- chunks = mbm_overflow_count(m->prev_msr, tval);
- m->chunks += chunks;
- m->prev_msr = tval;
+ rr->val += tval;
- rr->val += m->chunks;
return 0;
}
/*
+ * mbm_bw_count() - Update bw count from values previously read by
+ * __mon_event_count().
+ * @rmid: The rmid used to identify the cached mbm_state.
+ * @rr: The struct rmid_read populated by __mon_event_count().
+ *
* Supporting function to calculate the memory bandwidth
- * and delta bandwidth in MBps.
+ * and delta bandwidth in MBps. The chunks value previously read by
+ * __mon_event_count() is compared with the chunks value from the previous
+ * invocation. This must be called once per second to maintain values in MBps.
*/
static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
struct mbm_state *m = &rr->d->mbm_local[rmid];
- u64 tval, cur_bw, chunks;
+ u64 cur_bw, bytes, cur_bytes;
- tval = __rmid_read(rmid, rr->evtid);
- if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
- return;
+ cur_bytes = rr->val;
+ bytes = cur_bytes - m->prev_bw_bytes;
+ m->prev_bw_bytes = cur_bytes;
- chunks = mbm_overflow_count(m->prev_bw_msr, tval);
- m->chunks_bw += chunks;
- m->chunks = m->chunks_bw;
- cur_bw = (chunks * r->mon_scale) >> 20;
+ cur_bw = bytes / SZ_1M;
if (m->delta_comp)
m->delta_bw = abs(cur_bw - m->prev_bw);
m->delta_comp = false;
m->prev_bw = cur_bw;
- m->prev_bw_msr = tval;
}
/*
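
With the byte count coming straight from __mon_event_count(), the bandwidth math in
mbm_bw_count() is just a per-second byte delta divided by SZ_1M plus the delta_bw term
consumed by the MBA software controller. A stand-alone sketch of that arithmetic (the
delta_comp gating is dropped for brevity and the sample values are invented):

    #include <stdint.h>
    #include <stdio.h>

    #define SZ_1M (1024 * 1024)

    struct mbm_state {
            uint64_t prev_bw_bytes; /* byte count seen at the previous sample */
            uint64_t prev_bw;       /* bandwidth over the previous second, MBps */
            uint64_t delta_bw;      /* |change| between the last two samples */
    };

    /* Called once per second with the cumulative byte count for one RMID. */
    static void bw_count(struct mbm_state *m, uint64_t cur_bytes)
    {
            uint64_t bytes = cur_bytes - m->prev_bw_bytes;
            uint64_t cur_bw = bytes / SZ_1M;

            m->prev_bw_bytes = cur_bytes;
            m->delta_bw = (cur_bw > m->prev_bw) ? cur_bw - m->prev_bw
                                                : m->prev_bw - cur_bw;
            m->prev_bw = cur_bw;
    }

    int main(void)
    {
            struct mbm_state m = { 0 };

            bw_count(&m, 100ULL * SZ_1M);   /* second 1: 100 MBps */
            bw_count(&m, 350ULL * SZ_1M);   /* second 2: 250 MBps, delta 150 */

            printf("bw=%llu MBps delta=%llu MBps\n",
                   (unsigned long long)m.prev_bw,
                   (unsigned long long)m.delta_bw);
            return 0;
    }
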
@@ -299,23 +426,33 @@ void mon_event_count(void *info)
struct rdtgroup *rdtgrp, *entry;
struct rmid_read *rr = info;
struct list_head *head;
+ int ret;
rdtgrp = rr->rgrp;
- if (__mon_event_count(rdtgrp->mon.rmid, rr))
- return;
+ ret = __mon_event_count(rdtgrp->mon.rmid, rr);
/*
- * For Ctrl groups read data from child monitor groups.
+ * For Ctrl groups read data from child monitor groups and
+ * add them together. Count events which are read successfully.
+ * Discard the rmid_read's reporting errors.
*/
head = &rdtgrp->mon.crdtgrp_list;
if (rdtgrp->type == RDTCTRL_GROUP) {
list_for_each_entry(entry, head, mon.crdtgrp_list) {
- if (__mon_event_count(entry->mon.rmid, rr))
- return;
+ if (__mon_event_count(entry->mon.rmid, rr) == 0)
+ ret = 0;
}
}
+
+ /*
+ * __mon_event_count() calls for newly created monitor groups may
+ * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
+ * Discard error if any of the monitor event reads succeeded.
+ */
+ if (ret == 0)
+ rr->err = 0;
}
/*
@@ -325,7 +462,7 @@ void mon_event_count(void *info)
* adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
* that:
*
- * current bandwdith(cur_bw) < user specified bandwidth(user_bw)
+ * current bandwidth(cur_bw) < user specified bandwidth(user_bw)
*
* This uses the MBM counters to measure the bandwidth and MBA throttle
* MSRs to control the bandwidth for a particular rdtgrp. It builds on the
@@ -335,7 +472,7 @@ void mon_event_count(void *info)
* timer. Having 1s interval makes the calculation of bandwidth simpler.
*
* Although MBA's goal is to restrict the bandwidth to a maximum, there may
- * be a need to increase the bandwidth to avoid uncecessarily restricting
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
* the L2 <-> L3 traffic.
*
* Since MBA controls the L2 external bandwidth where as MBM measures the
@@ -352,7 +489,7 @@ void mon_event_count(void *info)
*/
static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
{
- u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+ u32 closid, rmid, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
u32 cur_bw, delta_bw, user_bw;
struct rdt_resource *r_mba;
@@ -363,7 +500,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
if (!is_mbm_local_enabled())
return;
- r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+ r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+
closid = rgrp->closid;
rmid = rgrp->mon.rmid;
pmbm_data = &dom_mbm->mbm_local[rmid];
@@ -377,7 +515,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
cur_bw = pmbm_data->prev_bw;
user_bw = dom_mba->mbps_val[closid];
delta_bw = pmbm_data->delta_bw;
- cur_msr_val = dom_mba->ctrl_val[closid];
+
+ /* MBA resource doesn't support CDP */
+ cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
/*
* For Ctrl groups read data from child monitor groups.
@@ -412,13 +552,11 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
return;
}
- cur_msr = r_mba->msr_base + closid;
- wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
- dom_mba->ctrl_val[closid] = new_msr_val;
+ resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
/*
* Delta values are updated dynamically package wise for each
- * rdtgrp everytime the throttle MSR changes value.
+ * rdtgrp every time the throttle MSR changes value.
*
* This is because (1)the increase in bandwidth is not perfectly
* linear and only "approximately" linear even when the hardware
@@ -433,11 +571,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
}
}
-static void mbm_update(struct rdt_domain *d, int rmid)
+static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
{
struct rmid_read rr;
rr.first = false;
+ rr.r = r;
rr.d = d;
/*
@@ -446,19 +585,20 @@ static void mbm_update(struct rdt_domain *d, int rmid)
*/
if (is_mbm_total_enabled()) {
rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
+ rr.val = 0;
__mon_event_count(rmid, &rr);
}
if (is_mbm_local_enabled()) {
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+ rr.val = 0;
+ __mon_event_count(rmid, &rr);
/*
* Call the MBA software controller only for the
* control groups and when user has enabled
* the software controller explicitly.
*/
- if (!is_mba_sc(NULL))
- __mon_event_count(rmid, &rr);
- else
+ if (is_mba_sc(NULL))
mbm_bw_count(rmid, &rr);
}
}
@@ -476,20 +616,14 @@ void cqm_handle_limbo(struct work_struct *work)
mutex_lock(&rdtgroup_mutex);
- r = &rdt_resources_all[RDT_RESOURCE_L3];
- d = get_domain_from_cpu(cpu, r);
-
- if (!d) {
- pr_warn_once("Failure to get domain for limbo worker\n");
- goto out_unlock;
- }
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ d = container_of(work, struct rdt_domain, cqm_limbo.work);
__check_limbo(d, false);
if (has_busy_rmid(r, d))
schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
-out_unlock:
mutex_unlock(&rdtgroup_mutex);
}
@@ -510,6 +644,7 @@ void mbm_handle_overflow(struct work_struct *work)
struct rdtgroup *prgrp, *crgrp;
int cpu = smp_processor_id();
struct list_head *head;
+ struct rdt_resource *r;
struct rdt_domain *d;
mutex_lock(&rdtgroup_mutex);
@@ -517,16 +652,15 @@ void mbm_handle_overflow(struct work_struct *work)
if (!static_branch_likely(&rdt_mon_enable_key))
goto out_unlock;
- d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
- if (!d)
- goto out_unlock;
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ d = container_of(work, struct rdt_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mbm_update(d, prgrp->mon.rmid);
+ mbm_update(r, d, prgrp->mon.rmid);
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list)
- mbm_update(d, crgrp->mon.rmid);
+ mbm_update(r, d, crgrp->mon.rmid);
if (is_mba_sc(NULL))
update_mba_bw(prgrp, d);
@@ -614,11 +748,20 @@ static void l3_mon_evt_init(struct rdt_resource *r)
int rdt_get_mon_l3_config(struct rdt_resource *r)
{
- unsigned int cl_size = boot_cpu_data.x86_cache_size;
+ unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ unsigned int threshold;
int ret;
- r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
+ resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+ hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
+
+ if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
+ hw_res->mbm_width += mbm_offset;
+ else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
+ pr_warn("Ignoring impossible MBM counter offset\n");
/*
* A reasonable upper limit on the max threshold is the number
@@ -627,10 +770,14 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
*
* For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
*/
- resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid;
+ threshold = resctrl_rmid_realloc_limit / r->num_rmid;
- /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
- resctrl_cqm_threshold /= r->mon_scale;
+ /*
+ * Because num_rmid may not be a power of two, round the value
+ * to the nearest multiple of hw_res->mon_scale so it matches a
+ * value the hardware will measure. mon_scale may not be a power of 2.
+ */
+ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
ret = dom_data_init(r);
if (ret)
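
The new default threshold is computed in bytes and then snapped to a multiple of the
hardware counter granularity by resctrl_arch_round_mon_val(). A stand-alone sketch of that
kind of rounding, using the 35MB/56-RMID figures from the comment above plus an invented
scale value (round-to-nearest here; the kernel helper's exact rounding lives in
resctrl_arch_round_mon_val()):

    #include <stdint.h>
    #include <stdio.h>

    /* Round 'val' to the nearest multiple of the counter scale (bytes per chunk). */
    static uint32_t round_mon_val(uint32_t val, uint32_t mon_scale)
    {
            uint32_t chunks = (val + mon_scale / 2) / mon_scale;

            return chunks * mon_scale;
    }

    int main(void)
    {
            uint32_t llc_bytes = 35 * 1024 * 1024;  /* 35MB LLC, as in the comment */
            uint32_t num_rmid = 56;                 /* RMID count from the comment */
            uint32_t mon_scale = 61440;             /* invented x86_cache_occ_scale */
            uint32_t threshold = llc_bytes / num_rmid;

            printf("raw=%u rounded=%u\n", threshold,
                   round_mon_val(threshold, mon_scale));
            return 0;
    }
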
@@ -639,7 +786,20 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
l3_mon_evt_init(r);
r->mon_capable = true;
- r->mon_enabled = true;
return 0;
}
+
+void __init intel_rdt_mbm_apply_quirk(void)
+{
+ int cf_index;
+
+ cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
+ if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
+ pr_info("No MBM correction factor available\n");
+ return;
+ }
+
+ mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
+ mbm_cf = mbm_cf_table[cf_index].cf;
+}
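
For reference, the quirk picks its row of mbm_cf_table[] purely from the RMID count:
cf_index = (x86_cache_max_rmid + 1) / 8 - 1, and the quirk is skipped when that index
falls outside the table. A quick illustration of the index math with made-up RMID counts:

    #include <stdio.h>

    #define MBM_CF_TABLE_SIZE 28    /* number of entries in mbm_cf_table[] above */

    static int mbm_cf_index(unsigned int max_rmid)
    {
            int cf_index = (max_rmid + 1) / 8 - 1;

            /* Out-of-range CPUs get no correction, as in the kernel quirk. */
            return cf_index < MBM_CF_TABLE_SIZE ? cf_index : -1;
    }

    int main(void)
    {
            /* max_rmid 175 -> index 21 -> the {159, CF(1.454334)} row */
            printf("%d\n", mbm_cf_index(175));
            /* too many RMIDs for the table -> -1, quirk not applied */
            printf("%d\n", mbm_cf_index(255));
            return 0;
    }
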