Mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git, synced 2025-10-31 16:54:21 +00:00

Commit fb4c77c21a
Due to the avoidance of IPIs to idle CPUs arch_freq_get_on_cpu() can return 0 when the last sample was too long ago. show_cpuinfo() has a fallback to cpufreq_quick_get() and, if that fails, to cpu_khz, but the readout code for the per CPU scaling frequency in sysfs does not. Move that fallback into arch_freq_get_on_cpu() so the behaviour is the same when reading /proc/cpuinfo and /sys/..../cur_scaling_freq.

Suggested-by: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Doug Smythies <dsmythies@telus.net>
Link: https://lore.kernel.org/r/87pml5180p.ffs@tglx
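With the fallback moved into arch_freq_get_on_cpu(), both readouts mentioned above are intended to behave the same: a fresh APERF/MPERF sample if available, otherwise cpufreq_quick_get(), otherwise cpu_khz. The following is a minimal userspace sketch for comparing the two interfaces side by side; it is not part of the kernel change and assumes an x86 Linux machine where CPU 0 exposes a cpufreq policy. Note that scaling_cur_freq reports kHz while the "cpu MHz" field in /proc/cpuinfo reports MHz.

/* Illustrative userspace reader, assumptions as stated above */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	unsigned long khz = 0;
	double mhz = 0.0;
	char line[256];
	FILE *f;

	/* Per-CPU sysfs readout, backed by arch_freq_get_on_cpu() on x86 */
	f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", "r");
	if (f) {
		if (fscanf(f, "%lu", &khz) != 1)
			khz = 0;
		fclose(f);
	}

	/* "cpu MHz" line of the first CPU listed in /proc/cpuinfo */
	f = fopen("/proc/cpuinfo", "r");
	if (f) {
		while (fgets(line, sizeof(line), f)) {
			if (!strncmp(line, "cpu MHz", 7)) {
				char *colon = strchr(line, ':');

				if (colon)
					mhz = atof(colon + 1);
				break;
			}
		}
		fclose(f);
	}

	printf("scaling_cur_freq: %lu kHz\n", khz);
	printf("cpu MHz         : %.3f MHz\n", mhz);
	return 0;
}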
		
			
				
	
	
		
// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller and
 * our P-state setting is little more than a request/hint, we need to observe
 * the effective frequency 'BusyMHz', i.e. the average frequency over a time
 * interval after discarding idle time. This is given by:
 *
 *   BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio causes the ratio
 * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
 */

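/*
 * Worked example with assumed numbers: with a 100 MHz bus clock, a base
 * (max non-turbo) ratio of 20 (2.0 GHz) and a 4C turbo ratio of 28 (2.8 GHz),
 * intel_set_max_freq_ratio() below computes
 *
 *   arch_turbo_freq_ratio = 28 * SCHED_CAPACITY_SCALE / 20 = 1433
 *
 * MPERF counts at a fixed (base) rate while the CPU is busy and APERF at the
 * effective rate, so a tick interval spent entirely at 2.8 GHz gives
 * acnt/mcnt = 1.4, and scale_freq_tick() computes
 *
 *   freq_scale = (acnt << 2 * SCHED_CAPACITY_SHIFT) / (mcnt * 1433)
 *              = 1.4 * 2^20 / 1433 ~= 1024
 *
 * which is clamped to SCHED_CAPACITY_SCALE, i.e. the CPU ran at freq_max.
 */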
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;     /* max P state */
	*turbo_freq = *turbo_freq & 0x3F;           /* 1C turbo    */

	return true;
}

#define X86_MATCH(model)					\
	X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,		\
		INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(XEON_PHI_KNL),
	X86_MATCH(XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(ATOM_GOLDMONT),
	X86_MATCH(ATOM_GOLDMONT_D),
	X86_MATCH(ATOM_GOLDMONT_PLUS),
	{}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	    /* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}

static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;    /* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;         /* 4C turbo    */

	/* The CPU may have less than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;         /* 1C turbo    */

	return true;
}

static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio())
		freq_invariance_enable();
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
	static_branch_disable(&arch_scale_freq_key);
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;

static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)
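/*
 * E.g. with CONFIG_HZ=250 the limit above is 5 jiffies, with CONFIG_HZ=1000
 * it is 20 jiffies; both correspond to the 20ms mentioned above.
 */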

unsigned int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}
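/*
 * Example with assumed numbers: with cpu_khz = 2000000 (a 2.0 GHz base clock)
 * and a fresh sample where acnt/mcnt = 1.4, the readout above reports
 * 2000000 * 1.4 = 2800000 kHz. With a stale sample, or without APERF/MPERF,
 * it falls back to cpufreq_quick_get() and finally to cpu_khz, matching the
 * /proc/cpuinfo behaviour described in the commit message.
 */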

static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}