// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the cqm.c based on perf but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel (R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt)	"resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so no correction
 *    is done for that case.
 * 2. The MBM total and local correction table is indexed by a core count
 *    equal to (x86_cache_max_rmid + 1) / 8 - 1, ranging from 0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate the corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}
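
/*
 * Worked example (hypothetical values, for illustration only): if the quirk
 * selects CF(1.142857) == 1198372, a raw count of 1000000 chunks becomes
 * (1000000 * 1198372) >> 20 == 1142856, i.e. the multiply-and-shift
 * approximates val * 1.142857 using only integer arithmetic.
 */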

/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

	if (snc_nodes_per_l3_cache == 1)
		return lrmid;

	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
}
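
/*
 * Worked example (hypothetical values, for illustration only): on an SNC-2
 * system with r->num_rmid == 128, logical RMID 5 used by a task on SNC node 1
 * maps to physical RMID 5 + (1 % 2) * 128 == 133, while the same logical RMID
 * used on node 0 keeps physical RMID 5.
 */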

static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
	rdmsrq(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}
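
/*
 * Note on the MSR layout (following the comment above): wrmsr() writes
 * @eventid into the low 32 bits and @prmid into the high 32 bits of
 * IA32_QM_EVTSEL, so the event code lands in bits 7:0 and the RMID in
 * bits 41:32, provided the RMID value fits within that field.
 */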

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	switch (eventid) {
	case QOS_L3_OCCUP_EVENT_ID:
		return NULL;
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &hw_dom->arch_mbm_total[rmid];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &hw_dom->arch_mbm_local[rmid];
	default:
		/* Never expect to get here */
		WARN_ON_ONCE(1);
		return NULL;
	}
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u32 prmid;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		prmid = logical_rmid_to_physical_rmid(cpu, rmid);
		/* Record any initial, non-zero count value. */
		__rmid_read_phys(prmid, eventid, &am->prev_msr);
	}
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);

	if (resctrl_arch_is_mbm_total_enabled())
		memset(hw_dom->arch_mbm_total, 0,
		       sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);

	if (resctrl_arch_is_mbm_local_enabled())
		memset(hw_dom->arch_mbm_local, 0,
		       sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}
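
/*
 * Worked example (hypothetical counter width, for illustration only): with
 * width == 24 the shift is 40, so a previous reading of 0xfffff0 followed by
 * a post-wraparound reading of 0x000010 yields
 * ((0x10 << 40) - (0xfffff0 << 40)) >> 40 == 0x20 chunks; the shifts discard
 * the bits above the counter width so the wrap is accounted for correctly.
 */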

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u64 msr_val, chunks;
	u32 prmid;
	int ret;

	resctrl_arch_rmid_read_context_check();

	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
	ret = __rmid_read_phys(prmid, eventid, &msr_val);
	if (ret)
		return ret;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	*val = chunks * hw_res->mon_scale;

	return 0;
}
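
/*
 * Worked example (hypothetical values, for illustration only): with
 * hw_res->mon_scale == 61440, an accumulated MBM reading of 100 chunks is
 * reported to the filesystem code as 100 * 61440 == 6144000 bytes.
 */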

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	if (snc_nodes_per_l3_cache > 1)
		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
	{}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
	const cpumask_t *node0_cpumask;
	int cpus_per_node, cpus_per_l3;
	int ret;

	if (!x86_match_cpu(snc_cpu_ids) || !ci)
		return 1;

	cpus_read_lock();
	if (num_online_cpus() != num_present_cpus())
		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
	cpus_read_unlock();

	node0_cpumask = cpumask_of_node(cpu_to_node(0));

	cpus_per_node = cpumask_weight(node0_cpumask);
	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

	if (!cpus_per_node || !cpus_per_l3)
		return 1;

	ret = cpus_per_l3 / cpus_per_node;

	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
	switch (ret) {
	case 1:
		break;
	case 2 ... 4:
	case 6:
		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
		break;
	default:
		pr_warn("Ignore improbable SNC node count %d\n", ret);
		ret = 1;
		break;
	}

	return ret;
}
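
/*
 * Worked example (hypothetical topology, for illustration only): on a system
 * where 96 CPUs share the L3 cache but NUMA node 0 contains only 48 CPUs,
 * cpus_per_l3 / cpus_per_node == 2, so SNC-2 is reported and the L3
 * monitoring scope is narrowed to RESCTRL_L3_NODE.
 */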

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;

	snc_nodes_per_l3_cache = snc_get_config();

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
	r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	threshold = resctrl_rmid_realloc_limit / r->num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	if (rdt_cpu_has(X86_FEATURE_BMEC)) {
		u32 eax, ebx, ecx, edx;

		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
	}

	r->mon_capable = true;

	return 0;
}
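
/*
 * Worked example (hypothetical CPUID values, for illustration only): with
 * x86_cache_max_rmid == 255 and x86_cache_occ_scale == 65536 on an SNC-2
 * system, each SNC node gets r->num_rmid == (255 + 1) / 2 == 128 RMIDs and
 * hw_res->mon_scale == 65536 / 2 == 32768.
 */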

void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}
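
/*
 * Worked example (hypothetical value, for illustration only): a CPU reporting
 * x86_cache_max_rmid == 191 gives cf_index == (191 + 1) / 8 - 1 == 23, which
 * selects {127, CF(0.969744)}: counts for RMIDs above 127 are scaled down by
 * ~3% in get_corrected_mbm_count().
 */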