linux/arch/x86/kernel/cpu/perf_event_intel_cqm.c

768 lines
17 KiB
C
Raw Normal View History

/*
* Intel Cache Quality-of-Service Monitoring (CQM) support.
*
* Based very, very heavily on work by Peter Zijlstra.
*/
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"
#define MSR_IA32_PQR_ASSOC 0x0c8f
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
static unsigned int cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
struct intel_cqm_state {
raw_spinlock_t lock;
int rmid;
int cnt;
};
static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
/*
* Protects cache_cgroups and cqm_rmid_lru.
*/
static DEFINE_MUTEX(cache_mutex);
/*
* Groups of events that have the same target(s), one RMID per group.
*/
static LIST_HEAD(cache_groups);
/*
* Mask of CPUs for reading CQM values. We only need one per-socket.
*/
static cpumask_t cqm_cpumask;
#define RMID_VAL_ERROR (1ULL << 63)
#define RMID_VAL_UNAVAIL (1ULL << 62)
#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
static u64 __rmid_read(unsigned long rmid)
{
u64 val;
/*
* Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
* it just says that to increase confusion.
*/
wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
rdmsrl(MSR_IA32_QM_CTR, val);
/*
* Aside from the ERROR and UNAVAIL bits, assume this thing returns
* the number of cachelines tagged with @rmid.
*/
return val;
}
struct cqm_rmid_entry {
u64 rmid;
struct list_head list;
};
/*
* A least recently used list of RMIDs.
*
* Oldest entry at the head, newest (most recently used) entry at the
* tail. This list is never traversed, it's only used to keep track of
* the lru order. That is, we only pick entries of the head or insert
* them on the tail.
*
* All entries on the list are 'free', and their RMIDs are not currently
* in use. To mark an RMID as in use, remove its entry from the lru
* list.
*
* This list is protected by cache_mutex.
*/
static LIST_HEAD(cqm_rmid_lru);
/*
* We use a simple array of pointers so that we can lookup a struct
* cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
* and __put_rmid() from having to worry about dealing with struct
* cqm_rmid_entry - they just deal with rmids, i.e. integers.
*
* Once this array is initialized it is read-only. No locks are required
* to access it.
*
* All entries for all RMIDs can be looked up in the this array at all
* times.
*/
static struct cqm_rmid_entry **cqm_rmid_ptrs;
static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
{
struct cqm_rmid_entry *entry;
entry = cqm_rmid_ptrs[rmid];
WARN_ON(entry->rmid != rmid);
return entry;
}
/*
* Returns < 0 on fail.
*
* We expect to be called with cache_mutex held.
*/
static int __get_rmid(void)
{
struct cqm_rmid_entry *entry;
lockdep_assert_held(&cache_mutex);
if (list_empty(&cqm_rmid_lru))
return -EAGAIN;
entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list);
list_del(&entry->list);
return entry->rmid;
}
static void __put_rmid(int rmid)
{
struct cqm_rmid_entry *entry;
lockdep_assert_held(&cache_mutex);
entry = __rmid_entry(rmid);
list_add_tail(&entry->list, &cqm_rmid_lru);
}
static int intel_cqm_setup_rmid_cache(void)
{
struct cqm_rmid_entry *entry;
int r;
cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
(cqm_max_rmid + 1), GFP_KERNEL);
if (!cqm_rmid_ptrs)
return -ENOMEM;
for (r = 0; r <= cqm_max_rmid; r++) {
struct cqm_rmid_entry *entry;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
goto fail;
INIT_LIST_HEAD(&entry->list);
entry->rmid = r;
cqm_rmid_ptrs[r] = entry;
list_add_tail(&entry->list, &cqm_rmid_lru);
}
/*
* RMID 0 is special and is always allocated. It's used for all
* tasks that are not monitored.
*/
entry = __rmid_entry(0);
list_del(&entry->list);
return 0;
fail:
while (r--)
kfree(cqm_rmid_ptrs[r]);
kfree(cqm_rmid_ptrs);
return -ENOMEM;
}
/*
* Determine if @a and @b measure the same set of tasks.
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
*
* If @a and @b measure the same set of tasks then we want to share a
* single RMID.
*/
static bool __match_event(struct perf_event *a, struct perf_event *b)
{
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
/* Per-cpu and task events don't mix */
if ((a->attach_state & PERF_ATTACH_TASK) !=
(b->attach_state & PERF_ATTACH_TASK))
return false;
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
#ifdef CONFIG_CGROUP_PERF
if (a->cgrp != b->cgrp)
return false;
#endif
/* If not task event, we're machine wide */
if (!(b->attach_state & PERF_ATTACH_TASK))
return true;
/*
* Events that target same task are placed into the same cache group.
*/
if (a->hw.cqm_target == b->hw.cqm_target)
return true;
/*
* Are we an inherited event?
*/
if (b->parent == a)
return true;
return false;
}
#ifdef CONFIG_CGROUP_PERF
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
if (event->attach_state & PERF_ATTACH_TASK)
return perf_cgroup_from_task(event->hw.cqm_target);
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
return event->cgrp;
}
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
#endif
/*
* Determine if @a's tasks intersect with @b's tasks
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
*
* There are combinations of events that we explicitly prohibit,
*
* PROHIBITS
* system-wide -> cgroup and task
* cgroup -> system-wide
* -> task in cgroup
* task -> system-wide
* -> task in cgroup
*
* Call this function before allocating an RMID.
*/
static bool __conflict_event(struct perf_event *a, struct perf_event *b)
{
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
#ifdef CONFIG_CGROUP_PERF
/*
* We can have any number of cgroups but only one system-wide
* event at a time.
*/
if (a->cgrp && b->cgrp) {
struct perf_cgroup *ac = a->cgrp;
struct perf_cgroup *bc = b->cgrp;
/*
* This condition should have been caught in
* __match_event() and we should be sharing an RMID.
*/
WARN_ON_ONCE(ac == bc);
if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
return true;
return false;
}
if (a->cgrp || b->cgrp) {
struct perf_cgroup *ac, *bc;
/*
* cgroup and system-wide events are mutually exclusive
*/
if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
(b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
return true;
/*
* Ensure neither event is part of the other's cgroup
*/
ac = event_to_cgroup(a);
bc = event_to_cgroup(b);
if (ac == bc)
return true;
/*
* Must have cgroup and non-intersecting task events.
*/
if (!ac || !bc)
return false;
/*
* We have cgroup and task events, and the task belongs
* to a cgroup. Check for for overlap.
*/
if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
return true;
return false;
}
#endif
/*
* If one of them is not a task, same story as above with cgroups.
*/
if (!(a->attach_state & PERF_ATTACH_TASK) ||
!(b->attach_state & PERF_ATTACH_TASK))
return true;
/*
* Must be non-overlapping.
*/
return false;
}
/*
* Find a group and setup RMID.
*
* If we're part of a group, we use the group's RMID.
*/
static int intel_cqm_setup_event(struct perf_event *event,
struct perf_event **group)
{
struct perf_event *iter;
int rmid;
list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
if (__match_event(iter, event)) {
/* All tasks in a group share an RMID */
event->hw.cqm_rmid = iter->hw.cqm_rmid;
*group = iter;
return 0;
}
if (__conflict_event(iter, event))
return -EBUSY;
}
rmid = __get_rmid();
if (rmid < 0)
return rmid;
event->hw.cqm_rmid = rmid;
return 0;
}
static void intel_cqm_event_read(struct perf_event *event)
{
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
unsigned long rmid;
u64 val;
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
/*
* Task events are handled by intel_cqm_event_count().
*/
if (event->cpu == -1)
return;
rmid = event->hw.cqm_rmid;
val = __rmid_read(rmid);
/*
* Ignore this reading on error states and do not update the value.
*/
if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
return;
local64_set(&event->count, val);
}
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
struct rmid_read {
unsigned int rmid;
atomic64_t value;
};
static void __intel_cqm_event_count(void *info)
{
struct rmid_read *rr = info;
u64 val;
val = __rmid_read(rr->rmid);
if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
return;
atomic64_add(val, &rr->value);
}
static inline bool cqm_group_leader(struct perf_event *event)
{
return !list_empty(&event->hw.cqm_groups_entry);
}
static u64 intel_cqm_event_count(struct perf_event *event)
{
struct rmid_read rr = {
.rmid = event->hw.cqm_rmid,
.value = ATOMIC64_INIT(0),
};
/*
* We only need to worry about task events. System-wide events
* are handled like usual, i.e. entirely with
* intel_cqm_event_read().
*/
if (event->cpu != -1)
return __perf_event_count(event);
/*
* Only the group leader gets to report values. This stops us
* reporting duplicate values to userspace, and gives us a clear
* rule for which task gets to report the values.
*
* Note that it is impossible to attribute these values to
* specific packages - we forfeit that ability when we create
* task events.
*/
if (!cqm_group_leader(event))
return 0;
on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
local64_set(&event->count, atomic64_read(&rr.value));
return __perf_event_count(event);
}
static void intel_cqm_event_start(struct perf_event *event, int mode)
{
struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
unsigned long rmid = event->hw.cqm_rmid;
unsigned long flags;
if (!(event->hw.cqm_state & PERF_HES_STOPPED))
return;
event->hw.cqm_state &= ~PERF_HES_STOPPED;
raw_spin_lock_irqsave(&state->lock, flags);
if (state->cnt++)
WARN_ON_ONCE(state->rmid != rmid);
else
WARN_ON_ONCE(state->rmid);
state->rmid = rmid;
wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
raw_spin_unlock_irqrestore(&state->lock, flags);
}
static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
unsigned long flags;
if (event->hw.cqm_state & PERF_HES_STOPPED)
return;
event->hw.cqm_state |= PERF_HES_STOPPED;
raw_spin_lock_irqsave(&state->lock, flags);
intel_cqm_event_read(event);
if (!--state->cnt) {
state->rmid = 0;
wrmsrl(MSR_IA32_PQR_ASSOC, 0);
} else {
WARN_ON_ONCE(!state->rmid);
}
raw_spin_unlock_irqrestore(&state->lock, flags);
}
static int intel_cqm_event_add(struct perf_event *event, int mode)
{
int rmid;
event->hw.cqm_state = PERF_HES_STOPPED;
rmid = event->hw.cqm_rmid;
WARN_ON_ONCE(!rmid);
if (mode & PERF_EF_START)
intel_cqm_event_start(event, mode);
return 0;
}
static void intel_cqm_event_del(struct perf_event *event, int mode)
{
intel_cqm_event_stop(event, mode);
}
static void intel_cqm_event_destroy(struct perf_event *event)
{
struct perf_event *group_other = NULL;
mutex_lock(&cache_mutex);
/*
* If there's another event in this group...
*/
if (!list_empty(&event->hw.cqm_group_entry)) {
group_other = list_first_entry(&event->hw.cqm_group_entry,
struct perf_event,
hw.cqm_group_entry);
list_del(&event->hw.cqm_group_entry);
}
/*
* And we're the group leader..
*/
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
if (cqm_group_leader(event)) {
/*
* If there was a group_other, make that leader, otherwise
* destroy the group and return the RMID.
*/
if (group_other) {
list_replace(&event->hw.cqm_groups_entry,
&group_other->hw.cqm_groups_entry);
} else {
int rmid = event->hw.cqm_rmid;
__put_rmid(rmid);
list_del(&event->hw.cqm_groups_entry);
}
}
mutex_unlock(&cache_mutex);
}
static struct pmu intel_cqm_pmu;
static int intel_cqm_event_init(struct perf_event *event)
{
struct perf_event *group = NULL;
int err;
if (event->attr.type != intel_cqm_pmu.type)
return -ENOENT;
if (event->attr.config & ~QOS_EVENT_MASK)
return -EINVAL;
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
event->attr.exclude_hv ||
event->attr.exclude_idle ||
event->attr.exclude_host ||
event->attr.exclude_guest ||
event->attr.sample_period) /* no sampling */
return -EINVAL;
INIT_LIST_HEAD(&event->hw.cqm_group_entry);
INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
event->destroy = intel_cqm_event_destroy;
mutex_lock(&cache_mutex);
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
/* Will also set rmid */
err = intel_cqm_setup_event(event, &group);
if (err)
goto out;
if (group) {
list_add_tail(&event->hw.cqm_group_entry,
&group->hw.cqm_group_entry);
} else {
list_add_tail(&event->hw.cqm_groups_entry,
&cache_groups);
}
out:
mutex_unlock(&cache_mutex);
return err;
}
EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
static struct attribute *intel_cqm_events_attr[] = {
EVENT_PTR(intel_cqm_llc),
EVENT_PTR(intel_cqm_llc_pkg),
EVENT_PTR(intel_cqm_llc_unit),
EVENT_PTR(intel_cqm_llc_scale),
EVENT_PTR(intel_cqm_llc_snapshot),
NULL,
};
static struct attribute_group intel_cqm_events_group = {
.name = "events",
.attrs = intel_cqm_events_attr,
};
PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *intel_cqm_formats_attr[] = {
&format_attr_event.attr,
NULL,
};
static struct attribute_group intel_cqm_format_group = {
.name = "format",
.attrs = intel_cqm_formats_attr,
};
static const struct attribute_group *intel_cqm_attr_groups[] = {
&intel_cqm_events_group,
&intel_cqm_format_group,
NULL,
};
static struct pmu intel_cqm_pmu = {
.attr_groups = intel_cqm_attr_groups,
.task_ctx_nr = perf_sw_context,
.event_init = intel_cqm_event_init,
.add = intel_cqm_event_add,
.del = intel_cqm_event_del,
.start = intel_cqm_event_start,
.stop = intel_cqm_event_stop,
.read = intel_cqm_event_read,
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
.count = intel_cqm_event_count,
};
static inline void cqm_pick_event_reader(int cpu)
{
int phys_id = topology_physical_package_id(cpu);
int i;
for_each_cpu(i, &cqm_cpumask) {
if (phys_id == topology_physical_package_id(i))
return; /* already got reader for this socket */
}
cpumask_set_cpu(cpu, &cqm_cpumask);
}
static void intel_cqm_cpu_prepare(unsigned int cpu)
{
struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
struct cpuinfo_x86 *c = &cpu_data(cpu);
raw_spin_lock_init(&state->lock);
state->rmid = 0;
state->cnt = 0;
WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
}
static void intel_cqm_cpu_exit(unsigned int cpu)
{
int phys_id = topology_physical_package_id(cpu);
int i;
/*
* Is @cpu a designated cqm reader?
*/
if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
return;
for_each_online_cpu(i) {
if (i == cpu)
continue;
if (phys_id == topology_physical_package_id(i)) {
cpumask_set_cpu(i, &cqm_cpumask);
break;
}
}
}
static int intel_cqm_cpu_notifier(struct notifier_block *nb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
intel_cqm_cpu_prepare(cpu);
break;
case CPU_DOWN_PREPARE:
intel_cqm_cpu_exit(cpu);
break;
case CPU_STARTING:
cqm_pick_event_reader(cpu);
break;
}
return NOTIFY_OK;
}
static const struct x86_cpu_id intel_cqm_match[] = {
{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
{}
};
static int __init intel_cqm_init(void)
{
char *str, scale[20];
int i, cpu, ret;
if (!x86_match_cpu(intel_cqm_match))
return -ENODEV;
cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
/*
* It's possible that not all resources support the same number
* of RMIDs. Instead of making scheduling much more complicated
* (where we have to match a task's RMID to a cpu that supports
* that many RMIDs) just find the minimum RMIDs supported across
* all cpus.
*
* Also, check that the scales match on all cpus.
*/
cpu_notifier_register_begin();
for_each_online_cpu(cpu) {
struct cpuinfo_x86 *c = &cpu_data(cpu);
if (c->x86_cache_max_rmid < cqm_max_rmid)
cqm_max_rmid = c->x86_cache_max_rmid;
if (c->x86_cache_occ_scale != cqm_l3_scale) {
pr_err("Multiple LLC scale values, disabling\n");
ret = -EINVAL;
goto out;
}
}
snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
str = kstrdup(scale, GFP_KERNEL);
if (!str) {
ret = -ENOMEM;
goto out;
}
event_attr_intel_cqm_llc_scale.event_str = str;
ret = intel_cqm_setup_rmid_cache();
if (ret)
goto out;
for_each_online_cpu(i) {
intel_cqm_cpu_prepare(i);
cqm_pick_event_reader(i);
}
__perf_cpu_notifier(intel_cqm_cpu_notifier);
perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming <matt.fleming@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Kanaka Juvva <kanaka.d.juvva@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com> Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-23 18:45:46 +00:00
ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
PERF_TYPE_INTEL_CQM);
if (ret)
pr_err("Intel CQM perf registration failed: %d\n", ret);
else
pr_info("Intel CQM monitoring enabled\n");
out:
cpu_notifier_register_done();
return ret;
}
device_initcall(intel_cqm_init);