2019-05-29 07:18:02 -07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2016-10-28 15:04:47 -07:00
|
|
|
/*
|
|
|
|
* Resource Director Technology(RDT)
|
|
|
|
* - Cache Allocation code.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2016 Intel Corporation
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Fenghua Yu <fenghua.yu@intel.com>
|
|
|
|
* Tony Luck <tony.luck@intel.com>
|
|
|
|
*
|
|
|
|
* More information about RDT can be found in the Intel (R) x86 Architecture
|
|
|
|
* Software Developer Manual June 2016, volume 3, section 17.17.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2018-12-10 13:21:54 -08:00
|
|
|
#include <linux/cpu.h>
|
2016-10-28 15:04:47 -07:00
|
|
|
#include <linux/kernfs.h>
|
|
|
|
#include <linux/seq_file.h>
|
|
|
|
#include <linux/slab.h>
|
x86/resctrl: Queue mon_event_read() instead of sending an IPI
Intel is blessed with an abundance of monitors, one per RMID, that can be
read from any CPU in the domain. MPAMs monitors reside in the MMIO MSC,
the number implemented is up to the manufacturer. This means when there are
fewer monitors than needed, they need to be allocated and freed.
MPAM's CSU monitors are used to back the 'llc_occupancy' monitor file. The
CSU counter is allowed to return 'not ready' for a small number of
micro-seconds after programming. To allow one CSU hardware monitor to be
used for multiple control or monitor groups, the CPU accessing the
monitor needs to be able to block when configuring and reading the
counter.
Worse, the domain may be broken up into slices, and the MMIO accesses
for each slice may need performing from different CPUs.
These two details mean MPAMs monitor code needs to be able to sleep, and
IPI another CPU in the domain to read from a resource that has been sliced.
mon_event_read() already invokes mon_event_count() via IPI, which means
this isn't possible. On systems using nohz-full, some CPUs need to be
interrupted to run kernel work as they otherwise stay in user-space
running realtime workloads. Interrupting these CPUs should be avoided,
and scheduling work on them may never complete.
Change mon_event_read() to pick a housekeeping CPU, (one that is not using
nohz_full) and schedule mon_event_count() and wait. If all the CPUs
in a domain are using nohz-full, then an IPI is used as the fallback.
This function is only used in response to a user-space filesystem request
(not the timing sensitive overflow code).
This allows MPAM to hide the slice behaviour from resctrl, and to keep
the monitor-allocation in monitor.c. When the IPI fallback is used on
machines where MPAM needs to make an access on multiple CPUs, the counter
read will always fail.
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Peter Newman <peternewman@google.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Babu Moger <babu.moger@amd.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com> # arm64
Link: https://lore.kernel.org/r/20240213184438.16675-14-james.morse@arm.com
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2024-02-13 18:44:27 +00:00
|
|
|
#include <linux/tick.h>
|
|
|
|
|
2018-11-21 20:28:25 +00:00
|
|
|
#include "internal.h"
|
2016-10-28 15:04:47 -07:00
|
|
|
|
2017-04-07 17:33:57 -07:00
|
|
|
/*
|
|
|
|
* Check whether MBA bandwidth percentage value is correct. The value is
|
|
|
|
* checked against the minimum and max bandwidth values specified by the
|
|
|
|
* hardware. The allocated bandwidth percentage is rounded to the next
|
|
|
|
* control step available on the hardware.
|
|
|
|
*/
|
|
|
|
static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
|
|
|
|
{
|
|
|
|
unsigned long bw;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only linear delay values is supported for current Intel SKUs.
|
|
|
|
*/
|
2020-07-08 16:39:26 +00:00
|
|
|
if (!r->membw.delay_linear && r->membw.arch_needs_linear) {
|
2017-09-25 16:39:34 -07:00
|
|
|
rdt_last_cmd_puts("No support for non-linear MB domains\n");
|
2017-04-07 17:33:57 -07:00
|
|
|
return false;
|
2017-09-25 16:39:34 -07:00
|
|
|
}
|
2017-04-07 17:33:57 -07:00
|
|
|
|
|
|
|
ret = kstrtoul(buf, 10, &bw);
|
2017-09-25 16:39:34 -07:00
|
|
|
if (ret) {
|
|
|
|
rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf);
|
2017-04-07 17:33:57 -07:00
|
|
|
return false;
|
2017-09-25 16:39:34 -07:00
|
|
|
}
|
2017-04-07 17:33:57 -07:00
|
|
|
|
2018-04-20 15:36:19 -07:00
|
|
|
if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
|
|
|
|
!is_mba_sc(r)) {
|
2017-09-25 16:39:34 -07:00
|
|
|
rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
|
|
|
|
r->membw.min_bw, r->default_ctrl);
|
2017-04-07 17:33:57 -07:00
|
|
|
return false;
|
2017-09-25 16:39:34 -07:00
|
|
|
}
|
2017-04-07 17:33:57 -07:00
|
|
|
|
|
|
|
*data = roundup(bw, (unsigned long)r->membw.bw_gran);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-07-28 17:06:22 +00:00
|
|
|
int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
|
2020-07-08 16:39:27 +00:00
|
|
|
struct rdt_domain *d)
|
2017-04-07 17:33:57 -07:00
|
|
|
{
|
2021-07-28 17:06:27 +00:00
|
|
|
struct resctrl_staged_config *cfg;
|
2022-09-02 15:48:17 +00:00
|
|
|
u32 closid = data->rdtgrp->closid;
|
2021-07-28 17:06:22 +00:00
|
|
|
struct rdt_resource *r = s->res;
|
2018-09-15 14:58:19 -07:00
|
|
|
unsigned long bw_val;
|
2017-04-07 17:33:57 -07:00
|
|
|
|
2021-07-28 17:06:27 +00:00
|
|
|
cfg = &d->staged_config[s->conf_type];
|
2021-07-28 17:06:26 +00:00
|
|
|
if (cfg->have_new_ctrl) {
|
2018-11-21 20:28:43 +00:00
|
|
|
rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
|
2017-04-07 17:33:57 -07:00
|
|
|
return -EINVAL;
|
2017-09-25 16:39:34 -07:00
|
|
|
}
|
2017-04-07 17:33:57 -07:00
|
|
|
|
2018-09-15 14:58:19 -07:00
|
|
|
if (!bw_validate(data->buf, &bw_val, r))
|
2017-04-07 17:33:57 -07:00
|
|
|
return -EINVAL;
|
2022-09-02 15:48:17 +00:00
|
|
|
|
|
|
|
if (is_mba_sc(r)) {
|
|
|
|
d->mbps_val[closid] = bw_val;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-07-28 17:06:26 +00:00
|
|
|
cfg->new_ctrl = bw_val;
|
|
|
|
cfg->have_new_ctrl = true;
|
2017-04-07 17:33:57 -07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-10-28 15:04:47 -07:00
|
|
|
/*
|
2020-07-08 16:39:28 +00:00
|
|
|
* Check whether a cache bit mask is valid.
|
2023-10-10 12:42:37 +02:00
|
|
|
* On Intel CPUs, non-contiguous 1s value support is indicated by CPUID:
|
|
|
|
* - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
|
|
|
|
* - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
|
|
|
|
*
|
|
|
|
* Haswell does not support a non-contiguous 1s value and additionally
|
|
|
|
* requires at least two bits set.
|
2020-07-08 16:39:28 +00:00
|
|
|
* AMD allows non-contiguous bitmasks.
|
2016-10-28 15:04:47 -07:00
|
|
|
*/
|
2020-07-08 16:39:28 +00:00
|
|
|
static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
|
2016-10-28 15:04:47 -07:00
|
|
|
{
|
2017-04-07 17:33:56 -07:00
|
|
|
unsigned long first_bit, zero_bit, val;
|
2017-04-14 13:00:36 +02:00
|
|
|
unsigned int cbm_len = r->cache.cbm_len;
|
2017-04-07 17:33:56 -07:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = kstrtoul(buf, 16, &val);
|
2017-09-25 16:39:34 -07:00
|
|
|
if (ret) {
|
2018-11-21 20:28:43 +00:00
|
|
|
rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
|
2017-04-07 17:33:56 -07:00
|
|
|
return false;
|
2017-09-25 16:39:34 -07:00
|
|
|
}
|
2016-10-28 15:04:47 -07:00
|
|
|
|
2022-09-27 15:16:36 -05:00
|
|
|
if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) {
|
2018-11-21 20:28:43 +00:00
|
|
|
rdt_last_cmd_puts("Mask out of range\n");
|
2016-10-28 15:04:47 -07:00
|
|
|
return false;
|
2017-09-25 16:39:34 -07:00
|
|
|
}
|
2016-10-28 15:04:47 -07:00
|
|
|
|
2017-04-07 17:33:56 -07:00
|
|
|
first_bit = find_first_bit(&val, cbm_len);
|
|
|
|
zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
|
2016-10-28 15:04:47 -07:00
|
|
|
|
2023-10-10 12:42:36 +02:00
|
|
|
/* Are non-contiguous bitmasks allowed? */
|
|
|
|
if (!r->cache.arch_has_sparse_bitmasks &&
|
2020-07-08 16:39:28 +00:00
|
|
|
(find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
|
2018-11-21 20:28:43 +00:00
|
|
|
rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
|
2016-10-28 15:04:47 -07:00
|
|
|
return false;
|
2017-09-25 16:39:34 -07:00
|
|
|
}
|
2016-10-28 15:04:47 -07:00
|
|
|
|
2017-09-25 16:39:34 -07:00
|
|
|
if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
|
2018-11-21 20:28:43 +00:00
|
|
|
rdt_last_cmd_printf("Need at least %d bits in the mask\n",
|
2017-09-25 16:39:34 -07:00
|
|
|
r->cache.min_cbm_bits);
|
2016-10-28 15:04:47 -07:00
|
|
|
return false;
|
2017-09-25 16:39:34 -07:00
|
|
|
}
|
2017-04-07 17:33:56 -07:00
|
|
|
|
|
|
|
*data = val;
|
2016-10-28 15:04:47 -07:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read one cache bit mask (hex). Check that it is valid for the current
|
|
|
|
* resource type.
|
|
|
|
*/
|
2021-07-28 17:06:22 +00:00
|
|
|
int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
|
2018-09-15 14:58:19 -07:00
|
|
|
struct rdt_domain *d)
|
2016-10-28 15:04:47 -07:00
|
|
|
{
|
2018-06-22 15:42:04 -07:00
|
|
|
struct rdtgroup *rdtgrp = data->rdtgrp;
|
2021-07-28 17:06:27 +00:00
|
|
|
struct resctrl_staged_config *cfg;
|
2021-07-28 17:06:22 +00:00
|
|
|
struct rdt_resource *r = s->res;
|
2018-06-22 15:42:02 -07:00
|
|
|
u32 cbm_val;
|
2016-10-28 15:04:47 -07:00
|
|
|
|
2021-07-28 17:06:27 +00:00
|
|
|
cfg = &d->staged_config[s->conf_type];
|
2021-07-28 17:06:26 +00:00
|
|
|
if (cfg->have_new_ctrl) {
|
2018-11-21 20:28:43 +00:00
|
|
|
rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
|
2017-04-03 14:44:16 -07:00
|
|
|
return -EINVAL;
|
2017-09-25 16:39:34 -07:00
|
|
|
}
|
2017-04-03 14:44:16 -07:00
|
|
|
|
2018-06-22 15:42:22 -07:00
|
|
|
/*
|
|
|
|
* Cannot set up more than one pseudo-locked region in a cache
|
|
|
|
* hierarchy.
|
|
|
|
*/
|
|
|
|
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
|
|
|
|
rdtgroup_pseudo_locked_in_hierarchy(d)) {
|
2018-11-27 11:19:36 -08:00
|
|
|
rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n");
|
2018-06-22 15:42:22 -07:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2020-07-08 16:39:28 +00:00
|
|
|
if (!cbm_validate(data->buf, &cbm_val, r))
|
2018-06-22 15:42:04 -07:00
|
|
|
return -EINVAL;
|
|
|
|
|
2018-06-22 15:42:22 -07:00
|
|
|
if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
|
|
|
|
rdtgrp->mode == RDT_MODE_SHAREABLE) &&
|
|
|
|
rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
|
2018-11-27 11:19:36 -08:00
|
|
|
rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n");
|
2018-06-22 15:42:22 -07:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2018-06-22 15:42:04 -07:00
|
|
|
/*
|
|
|
|
* The CBM may not overlap with the CBM of another closid if
|
|
|
|
* either is exclusive.
|
|
|
|
*/
|
2021-07-28 17:06:22 +00:00
|
|
|
if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) {
|
2018-11-27 11:19:36 -08:00
|
|
|
rdt_last_cmd_puts("Overlaps with exclusive group\n");
|
2016-10-28 15:04:47 -07:00
|
|
|
return -EINVAL;
|
2018-06-22 15:42:04 -07:00
|
|
|
}
|
|
|
|
|
2021-07-28 17:06:22 +00:00
|
|
|
if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) {
|
2018-06-22 15:42:17 -07:00
|
|
|
if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
|
|
|
|
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
|
2018-11-27 11:19:36 -08:00
|
|
|
rdt_last_cmd_puts("Overlaps with other group\n");
|
2018-06-22 15:42:04 -07:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
2018-06-22 15:42:02 -07:00
|
|
|
|
2021-07-28 17:06:26 +00:00
|
|
|
cfg->new_ctrl = cbm_val;
|
|
|
|
cfg->have_new_ctrl = true;
|
2016-10-28 15:04:47 -07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For each domain in this resource we expect to find a series of:
|
|
|
|
* id=mask
|
2017-04-03 14:44:16 -07:00
|
|
|
* separated by ";". The "id" is in decimal, and must match one of
|
|
|
|
* the "id"s for this resource.
|
2016-10-28 15:04:47 -07:00
|
|
|
*/
|
2021-07-28 17:06:22 +00:00
|
|
|
static int parse_line(char *line, struct resctrl_schema *s,
|
2018-06-22 15:42:04 -07:00
|
|
|
struct rdtgroup *rdtgrp)
|
2016-10-28 15:04:47 -07:00
|
|
|
{
|
2021-07-28 17:06:27 +00:00
|
|
|
enum resctrl_conf_type t = s->conf_type;
|
2021-07-28 17:06:26 +00:00
|
|
|
struct resctrl_staged_config *cfg;
|
2021-07-28 17:06:22 +00:00
|
|
|
struct rdt_resource *r = s->res;
|
2018-09-15 14:58:19 -07:00
|
|
|
struct rdt_parse_data data;
|
2016-10-28 15:04:47 -07:00
|
|
|
char *dom = NULL, *id;
|
|
|
|
struct rdt_domain *d;
|
|
|
|
unsigned long dom_id;
|
|
|
|
|
x86/resctrl: Separate arch and fs resctrl locks
resctrl has one mutex that is taken by the architecture-specific code, and the
filesystem parts. The two interact via cpuhp, where the architecture code
updates the domain list. Filesystem handlers that walk the domains list should
not run concurrently with the cpuhp callback modifying the list.
Exposing a lock from the filesystem code means the interface is not cleanly
defined, and creates the possibility of cross-architecture lock ordering
headaches. The interaction only exists so that certain filesystem paths are
serialised against CPU hotplug. The CPU hotplug code already has a mechanism to
do this using cpus_read_lock().
MPAM's monitors have an overflow interrupt, so it needs to be possible to walk
the domains list in irq context. RCU is ideal for this, but some paths need to
be able to sleep to allocate memory.
Because resctrl_{on,off}line_cpu() take the rdtgroup_mutex as part of a cpuhp
callback, cpus_read_lock() must always be taken first.
rdtgroup_schemata_write() already does this.
Most of the filesystem code's domain list walkers are currently protected by
the rdtgroup_mutex taken in rdtgroup_kn_lock_live(). The exceptions are
rdt_bit_usage_show() and the mon_config helpers which take the lock directly.
Make the domain list protected by RCU. An architecture-specific lock prevents
concurrent writers. rdt_bit_usage_show() could walk the domain list using RCU,
but to keep all the filesystem operations the same, this is changed to call
cpus_read_lock(). The mon_config helpers send multiple IPIs, take the
cpus_read_lock() in these cases.
The other filesystem list walkers need to be able to sleep. Add
cpus_read_lock() to rdtgroup_kn_lock_live() so that the cpuhp callbacks can't
be invoked when file system operations are occurring.
Add lockdep_assert_cpus_held() in the cases where the rdtgroup_kn_lock_live()
call isn't obvious.
Resctrl's domain online/offline calls now need to take the rdtgroup_mutex
themselves.
[ bp: Fold in a build fix: https://lore.kernel.org/r/87zfvwieli.ffs@tglx ]
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Babu Moger <babu.moger@amd.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com> # arm64
Link: https://lore.kernel.org/r/20240213184438.16675-25-james.morse@arm.com
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2024-02-13 18:44:38 +00:00
|
|
|
/* Walking r->domains, ensure it can't race with cpuhp */
|
|
|
|
lockdep_assert_cpus_held();
|
|
|
|
|
2018-09-15 14:58:24 -07:00
|
|
|
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
|
2023-01-13 09:20:32 -06:00
|
|
|
(r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
|
2018-09-15 14:58:24 -07:00
|
|
|
rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2017-04-03 14:44:16 -07:00
|
|
|
next:
|
|
|
|
if (!line || line[0] == '\0')
|
|
|
|
return 0;
|
|
|
|
dom = strsep(&line, ";");
|
|
|
|
id = strsep(&dom, "=");
|
2017-09-25 16:39:34 -07:00
|
|
|
if (!dom || kstrtoul(id, 10, &dom_id)) {
|
|
|
|
rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
|
2017-04-03 14:44:16 -07:00
|
|
|
return -EINVAL;
|
2017-09-25 16:39:34 -07:00
|
|
|
}
|
2017-04-19 16:50:03 -07:00
|
|
|
dom = strim(dom);
|
2016-10-28 15:04:47 -07:00
|
|
|
list_for_each_entry(d, &r->domains, list) {
|
2017-04-03 14:44:16 -07:00
|
|
|
if (d->id == dom_id) {
|
2018-06-22 15:42:04 -07:00
|
|
|
data.buf = dom;
|
|
|
|
data.rdtgrp = rdtgrp;
|
2021-07-28 17:06:22 +00:00
|
|
|
if (r->parse_ctrlval(&data, s, d))
|
2017-04-03 14:44:16 -07:00
|
|
|
return -EINVAL;
|
2018-06-22 15:42:22 -07:00
|
|
|
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
|
2021-07-28 17:06:27 +00:00
|
|
|
cfg = &d->staged_config[t];
|
2018-06-22 15:42:22 -07:00
|
|
|
/*
|
|
|
|
* In pseudo-locking setup mode and just
|
|
|
|
* parsed a valid CBM that should be
|
|
|
|
* pseudo-locked. Only one locked region per
|
|
|
|
* resource group and domain so just do
|
|
|
|
* the required initialization for single
|
|
|
|
* region and return.
|
|
|
|
*/
|
2021-07-28 17:06:23 +00:00
|
|
|
rdtgrp->plr->s = s;
|
2018-06-22 15:42:22 -07:00
|
|
|
rdtgrp->plr->d = d;
|
2021-07-28 17:06:26 +00:00
|
|
|
rdtgrp->plr->cbm = cfg->new_ctrl;
|
2018-06-22 15:42:22 -07:00
|
|
|
d->plr = rdtgrp->plr;
|
|
|
|
return 0;
|
|
|
|
}
|
2017-04-03 14:44:16 -07:00
|
|
|
goto next;
|
|
|
|
}
|
2016-10-28 15:04:47 -07:00
|
|
|
}
|
2017-04-03 14:44:16 -07:00
|
|
|
return -EINVAL;
|
2016-10-28 15:04:47 -07:00
|
|
|
}
|
|
|
|
|
2021-07-28 17:06:33 +00:00
|
|
|
/*
 * Map a closid and configuration type to an index into the per-domain
 * control value array.
 *
 * CDP_CODE and CDP_DATA interleave: each closid occupies two consecutive
 * slots, data in the even one and code in the odd one. CDP_NONE, and any
 * other type value, maps the closid to itself.
 */
static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
{
	if (type == CDP_CODE)
		return closid * 2 + 1;

	if (type == CDP_DATA)
		return closid * 2;

	/* CDP_NONE and anything unrecognised. */
	return closid;
}
|
|
|
|
|
2022-09-02 15:48:19 +00:00
|
|
|
/*
 * Write a single control value for one closid in one domain.
 *
 * @r:       resource to update.
 * @d:       domain to update.
 * @closid:  class of service the value belongs to.
 * @t:       configuration type, used together with @closid to select the
 *           slot in the domain's control value array.
 * @cfg_val: value to write.
 *
 * The update is performed on the calling CPU, so the caller must already
 * be running on a CPU that belongs to @d.
 *
 * Return: 0 on success, -EINVAL if the current CPU is not in @d's CPU mask.
 */
int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
			    u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
	u32 slot = get_config_index(closid, t);
	/* Update exactly one slot: the half-open range [slot, slot + 1). */
	struct msr_param update = {
		.res	= r,
		.dom	= d,
		.low	= slot,
		.high	= slot + 1,
	};

	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
		return -EINVAL;

	/* Update the cached copy first, then push it out via the callback. */
	hw_dom->ctrl_val[slot] = cfg_val;
	hw_res->msr_update(&update);

	return 0;
}
|
|
|
|
|
2021-07-28 17:06:28 +00:00
|
|
|
/*
 * Apply the staged configuration of @closid to every domain of @r.
 *
 * For each domain, every staged value that differs from the cached control
 * value is copied into the cache, and the range of touched slots is
 * accumulated. If anything changed, rdt_ctrl_update() is invoked with that
 * range on one CPU of the domain and waited for.
 *
 * The caller must hold the CPU hotplug lock.
 *
 * Return: always 0.
 */
int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
{
	struct resctrl_staged_config *staged;
	struct rdt_hw_domain *hw_dom;
	enum resctrl_conf_type type;
	struct msr_param update;
	struct rdt_domain *dom;
	u32 slot;

	/* Walking r->domains, ensure it can't race with cpuhp */
	lockdep_assert_cpus_held();

	list_for_each_entry(dom, &r->domains, list) {
		hw_dom = resctrl_to_arch_dom(dom);

		/* A NULL ->res means "nothing to update for this domain". */
		update.res = NULL;

		for (type = 0; type < CDP_NUM_TYPES; type++) {
			staged = &hw_dom->d_resctrl.staged_config[type];
			if (!staged->have_new_ctrl)
				continue;

			slot = get_config_index(closid, type);
			if (staged->new_ctrl == hw_dom->ctrl_val[slot])
				continue;

			hw_dom->ctrl_val[slot] = staged->new_ctrl;

			if (update.res) {
				/* Widen the range to cover this slot too. */
				update.low = min(update.low, slot);
				update.high = max(update.high, slot + 1);
				continue;
			}

			/* First change for this domain: start a new range. */
			update.res = r;
			update.dom = dom;
			update.low = slot;
			update.high = slot + 1;
		}

		if (!update.res)
			continue;

		smp_call_function_any(&dom->cpu_mask, rdt_ctrl_update, &update, 1);
	}

	return 0;
}
|
|
|
|
|
2018-06-22 15:42:04 -07:00
|
|
|
static int rdtgroup_parse_resource(char *resname, char *tok,
|
|
|
|
struct rdtgroup *rdtgrp)
|
2017-04-19 16:50:04 -07:00
|
|
|
{
|
2021-07-28 17:06:19 +00:00
|
|
|
struct resctrl_schema *s;
|
2017-04-19 16:50:04 -07:00
|
|
|
|
2021-07-28 17:06:19 +00:00
|
|
|
list_for_each_entry(s, &resctrl_schema_all, list) {
|
2021-07-28 17:06:25 +00:00
|
|
|
if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid)
|
2021-07-28 17:06:22 +00:00
|
|
|
return parse_line(tok, s, rdtgrp);
|
2017-04-19 16:50:04 -07:00
|
|
|
}
|
2018-11-21 20:28:43 +00:00
|
|
|
rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname);
|
2017-04-19 16:50:04 -07:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2016-10-28 15:04:47 -07:00
|
|
|
/*
 * Handle a write to a resource group's schemata file.
 *
 * @of:     open kernfs file; of->kn identifies the resource group.
 * @buf:    data written; it is modified in place while being tokenised.
 * @nbytes: number of bytes in @buf.
 * @off:    file offset (not used).
 *
 * The input consists of one "<resource>:<domain list>" entry per line and
 * must end in a newline. All lines are parsed and staged first; only when
 * every line was accepted are the staged values applied to each schema's
 * resource. For a group in pseudo-lock setup mode the pseudo-locked region
 * is then created.
 *
 * Return: @nbytes on success, -EINVAL on malformed input, -ENOENT if the
 * resource group no longer exists, or the error of the step that failed.
 */
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct resctrl_schema *schema;
	struct rdt_resource *res;
	struct rdtgroup *grp;
	char *line, *name;
	int err = 0;

	/* Valid input requires a trailing newline */
	if (nbytes == 0 || buf[nbytes - 1] != '\n')
		return -EINVAL;
	buf[nbytes - 1] = '\0';

	grp = rdtgroup_kn_lock_live(of->kn);
	if (!grp) {
		rdtgroup_kn_unlock(of->kn);
		return -ENOENT;
	}
	rdt_last_cmd_clear();

	/*
	 * No changes to pseudo-locked region allowed. It has to be removed
	 * and re-created instead.
	 */
	if (grp->mode == RDT_MODE_PSEUDO_LOCKED) {
		err = -EINVAL;
		rdt_last_cmd_puts("Resource group is pseudo-locked\n");
		goto out;
	}

	rdt_staged_configs_clear();

	/* Pass 1: parse and stage every line; nothing is applied yet. */
	for (line = strsep(&buf, "\n"); line; line = strsep(&buf, "\n")) {
		name = strim(strsep(&line, ":"));
		if (!line) {
			rdt_last_cmd_puts("Missing ':'\n");
			err = -EINVAL;
			goto out;
		}
		if (line[0] == '\0') {
			rdt_last_cmd_printf("Missing '%s' value\n", name);
			err = -EINVAL;
			goto out;
		}

		err = rdtgroup_parse_resource(name, line, grp);
		if (err)
			goto out;
	}

	/* Pass 2: apply what was staged. */
	list_for_each_entry(schema, &resctrl_schema_all, list) {
		res = schema->res;

		/*
		 * Writes to mba_sc resources update the software controller,
		 * not the control MSR.
		 */
		if (is_mba_sc(res))
			continue;

		err = resctrl_arch_update_domains(res, grp->closid);
		if (err)
			goto out;
	}

	/*
	 * If pseudo-locking fails we keep the resource group in
	 * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
	 * active and updated for just the domain the pseudo-locked
	 * region was requested for.
	 */
	if (grp->mode == RDT_MODE_PSEUDO_LOCKSETUP)
		err = rdtgroup_pseudo_lock_create(grp);

out:
	rdt_staged_configs_clear();
	rdtgroup_kn_unlock(of->kn);

	return err ? err : nbytes;
}
|
|
|
|
|
2021-08-11 16:38:31 +00:00
|
|
|
/*
 * Return the cached control value of @closid / @type in domain @d.
 *
 * @r is not used by this implementation. The value is read from the
 * domain's ctrl_val[] array at the slot chosen by get_config_index().
 */
u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
			    u32 closid, enum resctrl_conf_type type)
{
	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);

	return hw_dom->ctrl_val[get_config_index(closid, type)];
}
|
|
|
|
|
2021-07-28 17:06:22 +00:00
|
|
|
static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
|
2016-10-28 15:04:47 -07:00
|
|
|
{
|
2021-07-28 17:06:22 +00:00
|
|
|
struct rdt_resource *r = schema->res;
|
2016-10-28 15:04:47 -07:00
|
|
|
struct rdt_domain *dom;
|
|
|
|
bool sep = false;
|
2018-04-20 15:36:19 -07:00
|
|
|
u32 ctrl_val;
|
2016-10-28 15:04:47 -07:00
|
|
|
|
x86/resctrl: Separate arch and fs resctrl locks
resctrl has one mutex that is taken by the architecture-specific code, and the
filesystem parts. The two interact via cpuhp, where the architecture code
updates the domain list. Filesystem handlers that walk the domains list should
not run concurrently with the cpuhp callback modifying the list.
Exposing a lock from the filesystem code means the interface is not cleanly
defined, and creates the possibility of cross-architecture lock ordering
headaches. The interaction only exists so that certain filesystem paths are
serialised against CPU hotplug. The CPU hotplug code already has a mechanism to
do this using cpus_read_lock().
MPAM's monitors have an overflow interrupt, so it needs to be possible to walk
the domains list in irq context. RCU is ideal for this, but some paths need to
be able to sleep to allocate memory.
Because resctrl_{on,off}line_cpu() take the rdtgroup_mutex as part of a cpuhp
callback, cpus_read_lock() must always be taken first.
rdtgroup_schemata_write() already does this.
Most of the filesystem code's domain list walkers are currently protected by
the rdtgroup_mutex taken in rdtgroup_kn_lock_live(). The exceptions are
rdt_bit_usage_show() and the mon_config helpers which take the lock directly.
Make the domain list protected by RCU. An architecture-specific lock prevents
concurrent writers. rdt_bit_usage_show() could walk the domain list using RCU,
but to keep all the filesystem operations the same, this is changed to call
cpus_read_lock(). The mon_config helpers send multiple IPIs, take the
cpus_read_lock() in these cases.
The other filesystem list walkers need to be able to sleep. Add
cpus_read_lock() to rdtgroup_kn_lock_live() so that the cpuhp callbacks can't
be invoked when file system operations are occurring.
Add lockdep_assert_cpus_held() in the cases where the rdtgroup_kn_lock_live()
call isn't obvious.
Resctrl's domain online/offline calls now need to take the rdtgroup_mutex
themselves.
[ bp: Fold in a build fix: https://lore.kernel.org/r/87zfvwieli.ffs@tglx ]
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Babu Moger <babu.moger@amd.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com> # arm64
Link: https://lore.kernel.org/r/20240213184438.16675-25-james.morse@arm.com
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2024-02-13 18:44:38 +00:00
|
|
|
/* Walking r->domains, ensure it can't race with cpuhp */
|
|
|
|
lockdep_assert_cpus_held();
|
|
|
|
|
2021-07-28 17:06:25 +00:00
|
|
|
seq_printf(s, "%*s:", max_name_width, schema->name);
|
2016-10-28 15:04:47 -07:00
|
|
|
list_for_each_entry(dom, &r->domains, list) {
|
|
|
|
if (sep)
|
|
|
|
seq_puts(s, ";");
|
2018-04-20 15:36:19 -07:00
|
|
|
|
2022-09-02 15:48:17 +00:00
|
|
|
if (is_mba_sc(r))
|
|
|
|
ctrl_val = dom->mbps_val[closid];
|
|
|
|
else
|
|
|
|
ctrl_val = resctrl_arch_get_config(r, dom, closid,
|
|
|
|
schema->conf_type);
|
|
|
|
|
2017-04-07 17:33:56 -07:00
|
|
|
seq_printf(s, r->format_str, dom->id, max_data_width,
|
2018-04-20 15:36:19 -07:00
|
|
|
ctrl_val);
|
2016-10-28 15:04:47 -07:00
|
|
|
sep = true;
|
|
|
|
}
|
|
|
|
seq_puts(s, "\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
int rdtgroup_schemata_show(struct kernfs_open_file *of,
|
|
|
|
struct seq_file *s, void *v)
|
|
|
|
{
|
2021-07-28 17:06:19 +00:00
|
|
|
struct resctrl_schema *schema;
|
2016-10-28 15:04:47 -07:00
|
|
|
struct rdtgroup *rdtgrp;
|
2017-07-25 14:14:38 -07:00
|
|
|
int ret = 0;
|
|
|
|
u32 closid;
|
2016-10-28 15:04:47 -07:00
|
|
|
|
|
|
|
rdtgrp = rdtgroup_kn_lock_live(of->kn);
|
|
|
|
if (rdtgrp) {
|
2018-06-22 15:42:17 -07:00
|
|
|
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
|
2021-07-28 17:06:19 +00:00
|
|
|
list_for_each_entry(schema, &resctrl_schema_all, list) {
|
2021-07-28 17:06:25 +00:00
|
|
|
seq_printf(s, "%s:uninitialized\n", schema->name);
|
2021-07-28 17:06:19 +00:00
|
|
|
}
|
2018-06-22 15:42:23 -07:00
|
|
|
} else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
|
2018-10-12 15:51:01 -07:00
|
|
|
if (!rdtgrp->plr->d) {
|
|
|
|
rdt_last_cmd_clear();
|
|
|
|
rdt_last_cmd_puts("Cache domain offline\n");
|
|
|
|
ret = -ENODEV;
|
|
|
|
} else {
|
|
|
|
seq_printf(s, "%s:%d=%x\n",
|
2021-07-28 17:06:23 +00:00
|
|
|
rdtgrp->plr->s->res->name,
|
2018-10-12 15:51:01 -07:00
|
|
|
rdtgrp->plr->d->id,
|
|
|
|
rdtgrp->plr->cbm);
|
|
|
|
}
|
2018-06-22 15:42:17 -07:00
|
|
|
} else {
|
|
|
|
closid = rdtgrp->closid;
|
2021-07-28 17:06:19 +00:00
|
|
|
list_for_each_entry(schema, &resctrl_schema_all, list) {
|
2021-07-28 17:06:20 +00:00
|
|
|
if (closid < schema->num_closid)
|
2021-07-28 17:06:22 +00:00
|
|
|
show_doms(s, schema, closid);
|
2018-06-22 15:42:17 -07:00
|
|
|
}
|
2016-10-28 15:04:47 -07:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ret = -ENOENT;
|
|
|
|
}
|
|
|
|
rdtgroup_kn_unlock(of->kn);
|
|
|
|
return ret;
|
|
|
|
}
|
2017-07-25 14:14:38 -07:00
|
|
|
|
x86/resctrl: Queue mon_event_read() instead of sending an IPI
Intel is blessed with an abundance of monitors, one per RMID, that can be
read from any CPU in the domain. MPAM's monitors reside in the MMIO MSC,
the number implemented is up to the manufacturer. This means when there are
fewer monitors than needed, they need to be allocated and freed.
MPAM's CSU monitors are used to back the 'llc_occupancy' monitor file. The
CSU counter is allowed to return 'not ready' for a small number of
micro-seconds after programming. To allow one CSU hardware monitor to be
used for multiple control or monitor groups, the CPU accessing the
monitor needs to be able to block when configuring and reading the
counter.
Worse, the domain may be broken up into slices, and the MMIO accesses
for each slice may need performing from different CPUs.
These two details mean MPAM's monitor code needs to be able to sleep, and
IPI another CPU in the domain to read from a resource that has been sliced.
mon_event_read() already invokes mon_event_count() via IPI, which means
this isn't possible. On systems using nohz-full, some CPUs need to be
interrupted to run kernel work as they otherwise stay in user-space
running realtime workloads. Interrupting these CPUs should be avoided,
and scheduling work on them may never complete.
Change mon_event_read() to pick a housekeeping CPU, (one that is not using
nohz_full) and schedule mon_event_count() and wait. If all the CPUs
in a domain are using nohz-full, then an IPI is used as the fallback.
This function is only used in response to a user-space filesystem request
(not the timing sensitive overflow code).
This allows MPAM to hide the slice behaviour from resctrl, and to keep
the monitor-allocation in monitor.c. When the IPI fallback is used on
machines where MPAM needs to make an access on multiple CPUs, the counter
read will always fail.
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Peter Newman <peternewman@google.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Babu Moger <babu.moger@amd.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com> # arm64
Link: https://lore.kernel.org/r/20240213184438.16675-14-james.morse@arm.com
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2024-02-13 18:44:27 +00:00
|
|
|
/*
 * Adapter giving mon_event_count() the int-returning signature under which
 * it is handed to smp_call_on_cpu(). Always reports success.
 */
static int smp_mon_event_count(void *arg)
{
	mon_event_count(arg);
	return 0;
}
|
|
|
|
|
2020-05-05 15:36:16 -07:00
|
|
|
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
|
|
|
|
struct rdt_domain *d, struct rdtgroup *rdtgrp,
|
|
|
|
int evtid, int first)
|
2017-07-25 14:14:38 -07:00
|
|
|
{
|
x86/resctrl: Queue mon_event_read() instead of sending an IPI
Intel is blessed with an abundance of monitors, one per RMID, that can be
read from any CPU in the domain. MPAMs monitors reside in the MMIO MSC,
the number implemented is up to the manufacturer. This means when there are
fewer monitors than needed, they need to be allocated and freed.
MPAM's CSU monitors are used to back the 'llc_occupancy' monitor file. The
CSU counter is allowed to return 'not ready' for a small number of
micro-seconds after programming. To allow one CSU hardware monitor to be
used for multiple control or monitor groups, the CPU accessing the
monitor needs to be able to block when configuring and reading the
counter.
Worse, the domain may be broken up into slices, and the MMIO accesses
for each slice may need performing from different CPUs.
These two details mean MPAMs monitor code needs to be able to sleep, and
IPI another CPU in the domain to read from a resource that has been sliced.
mon_event_read() already invokes mon_event_count() via IPI, which means
this isn't possible. On systems using nohz-full, some CPUs need to be
interrupted to run kernel work as they otherwise stay in user-space
running realtime workloads. Interrupting these CPUs should be avoided,
and scheduling work on them may never complete.
Change mon_event_read() to pick a housekeeping CPU, (one that is not using
nohz_full) and schedule mon_event_count() and wait. If all the CPUs
in a domain are using nohz-full, then an IPI is used as the fallback.
This function is only used in response to a user-space filesystem request
(not the timing sensitive overflow code).
This allows MPAM to hide the slice behaviour from resctrl, and to keep
the monitor-allocation in monitor.c. When the IPI fallback is used on
machines where MPAM needs to make an access on multiple CPUs, the counter
read will always fail.
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Peter Newman <peternewman@google.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Babu Moger <babu.moger@amd.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com> # arm64
Link: https://lore.kernel.org/r/20240213184438.16675-14-james.morse@arm.com
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2024-02-13 18:44:27 +00:00
|
|
|
int cpu;
|
|
|
|
|
x86/resctrl: Separate arch and fs resctrl locks
resctrl has one mutex that is taken by the architecture-specific code, and the
filesystem parts. The two interact via cpuhp, where the architecture code
updates the domain list. Filesystem handlers that walk the domains list should
not run concurrently with the cpuhp callback modifying the list.
Exposing a lock from the filesystem code means the interface is not cleanly
defined, and creates the possibility of cross-architecture lock ordering
headaches. The interaction only exists so that certain filesystem paths are
serialised against CPU hotplug. The CPU hotplug code already has a mechanism to
do this using cpus_read_lock().
MPAM's monitors have an overflow interrupt, so it needs to be possible to walk
the domains list in irq context. RCU is ideal for this, but some paths need to
be able to sleep to allocate memory.
Because resctrl_{on,off}line_cpu() take the rdtgroup_mutex as part of a cpuhp
callback, cpus_read_lock() must always be taken first.
rdtgroup_schemata_write() already does this.
Most of the filesystem code's domain list walkers are currently protected by
the rdtgroup_mutex taken in rdtgroup_kn_lock_live(). The exceptions are
rdt_bit_usage_show() and the mon_config helpers which take the lock directly.
Make the domain list protected by RCU. An architecture-specific lock prevents
concurrent writers. rdt_bit_usage_show() could walk the domain list using RCU,
but to keep all the filesystem operations the same, this is changed to call
cpus_read_lock(). The mon_config helpers send multiple IPIs, take the
cpus_read_lock() in these cases.
The other filesystem list walkers need to be able to sleep. Add
cpus_read_lock() to rdtgroup_kn_lock_live() so that the cpuhp callbacks can't
be invoked when file system operations are occurring.
Add lockdep_assert_cpus_held() in the cases where the rdtgroup_kn_lock_live()
call isn't obvious.
Resctrl's domain online/offline calls now need to take the rdtgroup_mutex
themselves.
[ bp: Fold in a build fix: https://lore.kernel.org/r/87zfvwieli.ffs@tglx ]
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Babu Moger <babu.moger@amd.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com> # arm64
Link: https://lore.kernel.org/r/20240213184438.16675-25-james.morse@arm.com
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2024-02-13 18:44:38 +00:00
|
|
|
/* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
|
|
|
|
lockdep_assert_cpus_held();
|
|
|
|
|
2017-07-25 14:14:38 -07:00
|
|
|
/*
|
x86/resctrl: Queue mon_event_read() instead of sending an IPI
Intel is blessed with an abundance of monitors, one per RMID, that can be
read from any CPU in the domain. MPAMs monitors reside in the MMIO MSC,
the number implemented is up to the manufacturer. This means when there are
fewer monitors than needed, they need to be allocated and freed.
MPAM's CSU monitors are used to back the 'llc_occupancy' monitor file. The
CSU counter is allowed to return 'not ready' for a small number of
micro-seconds after programming. To allow one CSU hardware monitor to be
used for multiple control or monitor groups, the CPU accessing the
monitor needs to be able to block when configuring and reading the
counter.
Worse, the domain may be broken up into slices, and the MMIO accesses
for each slice may need performing from different CPUs.
These two details mean MPAMs monitor code needs to be able to sleep, and
IPI another CPU in the domain to read from a resource that has been sliced.
mon_event_read() already invokes mon_event_count() via IPI, which means
this isn't possible. On systems using nohz-full, some CPUs need to be
interrupted to run kernel work as they otherwise stay in user-space
running realtime workloads. Interrupting these CPUs should be avoided,
and scheduling work on them may never complete.
Change mon_event_read() to pick a housekeeping CPU, (one that is not using
nohz_full) and schedule mon_event_count() and wait. If all the CPUs
in a domain are using nohz-full, then an IPI is used as the fallback.
This function is only used in response to a user-space filesystem request
(not the timing sensitive overflow code).
This allows MPAM to hide the slice behaviour from resctrl, and to keep
the monitor-allocation in monitor.c. When the IPI fallback is used on
machines where MPAM needs to make an access on multiple CPUs, the counter
read will always fail.
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Peter Newman <peternewman@google.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Babu Moger <babu.moger@amd.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com> # arm64
Link: https://lore.kernel.org/r/20240213184438.16675-14-james.morse@arm.com
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2024-02-13 18:44:27 +00:00
|
|
|
* Setup the parameters to pass to mon_event_count() to read the data.
|
2017-07-25 14:14:38 -07:00
|
|
|
*/
|
|
|
|
rr->rgrp = rdtgrp;
|
|
|
|
rr->evtid = evtid;
|
2020-05-05 15:36:16 -07:00
|
|
|
rr->r = r;
|
2017-07-25 14:14:45 -07:00
|
|
|
rr->d = d;
|
2017-07-25 14:14:38 -07:00
|
|
|
rr->val = 0;
|
2017-07-25 14:14:46 -07:00
|
|
|
rr->first = first;
|
2024-02-13 18:44:29 +00:00
|
|
|
rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
|
|
|
|
if (IS_ERR(rr->arch_mon_ctx)) {
|
|
|
|
rr->err = -EINVAL;
|
|
|
|
return;
|
|
|
|
}
|
2017-07-25 14:14:38 -07:00
|
|
|
|
2024-02-13 18:44:35 +00:00
|
|
|
cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU);
|
x86/resctrl: Queue mon_event_read() instead of sending an IPI
Intel is blessed with an abundance of monitors, one per RMID, that can be
read from any CPU in the domain. MPAMs monitors reside in the MMIO MSC,
the number implemented is up to the manufacturer. This means when there are
fewer monitors than needed, they need to be allocated and freed.
MPAM's CSU monitors are used to back the 'llc_occupancy' monitor file. The
CSU counter is allowed to return 'not ready' for a small number of
micro-seconds after programming. To allow one CSU hardware monitor to be
used for multiple control or monitor groups, the CPU accessing the
monitor needs to be able to block when configuring and reading the
counter.
Worse, the domain may be broken up into slices, and the MMIO accesses
for each slice may need performing from different CPUs.
These two details mean MPAMs monitor code needs to be able to sleep, and
IPI another CPU in the domain to read from a resource that has been sliced.
mon_event_read() already invokes mon_event_count() via IPI, which means
this isn't possible. On systems using nohz-full, some CPUs need to be
interrupted to run kernel work as they otherwise stay in user-space
running realtime workloads. Interrupting these CPUs should be avoided,
and scheduling work on them may never complete.
Change mon_event_read() to pick a housekeeping CPU, (one that is not using
nohz_full) and schedule mon_event_count() and wait. If all the CPUs
in a domain are using nohz-full, then an IPI is used as the fallback.
This function is only used in response to a user-space filesystem request
(not the timing sensitive overflow code).
This allows MPAM to hide the slice behaviour from resctrl, and to keep
the monitor-allocation in monitor.c. When the IPI fallback is used on
machines where MPAM needs to make an access on multiple CPUs, the counter
read will always fail.
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Peter Newman <peternewman@google.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Babu Moger <babu.moger@amd.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com> # arm64
Link: https://lore.kernel.org/r/20240213184438.16675-14-james.morse@arm.com
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2024-02-13 18:44:27 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* cpumask_any_housekeeping() prefers housekeeping CPUs, but
|
|
|
|
* are all the CPUs nohz_full? If yes, pick a CPU to IPI.
|
|
|
|
* MPAM's resctrl_arch_rmid_read() is unable to read the
|
|
|
|
* counters on some platforms if its called in IRQ context.
|
|
|
|
*/
|
|
|
|
if (tick_nohz_full_cpu(cpu))
|
|
|
|
smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
|
|
|
|
else
|
|
|
|
smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
|
2024-02-13 18:44:29 +00:00
|
|
|
|
|
|
|
resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
|
2017-07-25 14:14:38 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
int rdtgroup_mondata_show(struct seq_file *m, void *arg)
|
|
|
|
{
|
|
|
|
struct kernfs_open_file *of = m->private;
|
|
|
|
u32 resid, evtid, domid;
|
|
|
|
struct rdtgroup *rdtgrp;
|
|
|
|
struct rdt_resource *r;
|
|
|
|
union mon_data_bits md;
|
|
|
|
struct rdt_domain *d;
|
|
|
|
struct rmid_read rr;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
rdtgrp = rdtgroup_kn_lock_live(of->kn);
|
2019-10-29 13:25:02 +08:00
|
|
|
if (!rdtgrp) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
2017-07-25 14:14:38 -07:00
|
|
|
|
|
|
|
md.priv = of->kn->priv;
|
|
|
|
resid = md.u.rid;
|
|
|
|
domid = md.u.domid;
|
|
|
|
evtid = md.u.evtid;
|
|
|
|
|
2022-09-02 15:48:29 +00:00
|
|
|
r = &rdt_resources_all[resid].r_resctrl;
|
2017-07-25 14:14:38 -07:00
|
|
|
d = rdt_find_domain(r, domid, NULL);
|
2018-12-10 14:31:13 -08:00
|
|
|
if (IS_ERR_OR_NULL(d)) {
|
2017-07-25 14:14:38 -07:00
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2020-05-05 15:36:16 -07:00
|
|
|
mon_event_read(&rr, r, d, rdtgrp, evtid, false);
|
2017-07-25 14:14:38 -07:00
|
|
|
|
2022-09-02 15:48:23 +00:00
|
|
|
if (rr.err == -EIO)
|
2017-07-25 14:14:38 -07:00
|
|
|
seq_puts(m, "Error\n");
|
2022-09-02 15:48:23 +00:00
|
|
|
else if (rr.err == -EINVAL)
|
2017-07-25 14:14:38 -07:00
|
|
|
seq_puts(m, "Unavailable\n");
|
|
|
|
else
|
2022-09-02 15:48:29 +00:00
|
|
|
seq_printf(m, "%llu\n", rr.val);
|
2017-07-25 14:14:38 -07:00
|
|
|
|
|
|
|
out:
|
|
|
|
rdtgroup_kn_unlock(of->kn);
|
|
|
|
return ret;
|
|
|
|
}
|