// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * POWERNV cpufreq driver for the IBM POWER processors
 *
 * (C) Copyright IBM 2014
 *
 * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
 */
/*
 * On an idle system we want the global pstate to ramp-down from max value to
 * min over a span of ~5 secs. Also we want it to initially ramp-down slowly and
 * then ramp-down rapidly later on.
 */
#define MAX_RAMP_DOWN_TIME				5120
/*
 * This gives a percentage rampdown for time elapsed in milliseconds.
 * ramp_down_percentage = ((ms * ms) >> 18)
 *			~= 3.8 * (sec * sec)
 *
 * At 0 ms	ramp_down_percent = 0
 * At 5120 ms	ramp_down_percent = 100
 *
 * The argument is parenthesized so that callers may safely pass an
 * expression (e.g. a sum) without operator-precedence surprises.
 */
#define ramp_down_percent(time)		(((time) * (time)) >> 18)

/* Interval after which the timer is queued to bring down global pstate */
#define GPSTATE_TIMER_INTERVAL				2000
/**
 * struct global_pstate_info -	Per policy data structure to maintain history of
 *				global pstates
 * @highest_lpstate_idx:	The local pstate index from which we are
 *				ramping down
 * @elapsed_time:		Time in ms spent in ramping down from
 *				highest_lpstate_idx
 * @last_sampled_time:		Time from boot in ms when global pstates were
 *				last set
 * @last_lpstate_idx:		Last set value of local pstate and global
 * @last_gpstate_idx:		pstate in terms of cpufreq table index
 * @gpstate_lock:		A spinlock to maintain synchronization between
 *				routines called by the timer handler and
 *				governor's target_index calls
 * @timer:			Is used for ramping down if cpu goes idle for
 *				a long time with global pstate held high
 * @policy:			Associated CPUFreq policy
 */
struct global_pstate_info {
	int highest_lpstate_idx;
	unsigned int elapsed_time;
	unsigned int last_sampled_time;
	int last_lpstate_idx;
	int last_gpstate_idx;
	spinlock_t gpstate_lock;
	struct timer_list timer;
	struct cpufreq_policy *policy;
};
static DEFINE_HASHTABLE(pstate_revmap, POWERNV_MAX_PSTATES_ORDER);

/**
 * struct pstate_idx_revmap_data - Entry in the hashmap pstate_revmap
 *				   indexed by a function of pstate id.
 *
 * @pstate_id:		pstate id for this entry.
 *
 * @cpufreq_table_idx:	Index into the powernv_freqs
 *			cpufreq_frequency_table for frequency
 *			corresponding to pstate_id.
 *
 * @hentry:		hlist_node that hooks this entry into the pstate_revmap
 *			hashtable
 */
struct pstate_idx_revmap_data {
	u8 pstate_id;
	unsigned int cpufreq_table_idx;
	struct hlist_node hentry;
};
/* * Note: * The set of pstates consists of contiguous integers. * powernv_pstate_info stores the index of the frequency table for * max, min and nominal frequencies. It also stores number of * available frequencies. * * powernv_pstate_info.nominal indicates the index to the highest * non-turbo frequency.
*/ staticstruct powernv_pstate_info { unsignedint min; unsignedint max; unsignedint nominal; unsignedint nr_pstates; bool wof_enabled;
} powernv_pstate_info;
/* Use following functions for conversions between pstate_id and index */
/* * idx_to_pstate : Returns the pstate id corresponding to the * frequency in the cpufreq frequency table * powernv_freqs indexed by @i. * * If @i is out of bound, this will return the pstate * corresponding to the nominal frequency.
*/ staticinline u8 idx_to_pstate(unsignedint i)
{ if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
pr_warn_once("idx_to_pstate: index %u is out of bound\n", i); return powernv_freqs[powernv_pstate_info.nominal].driver_data;
}
return powernv_freqs[i].driver_data;
}
/* * pstate_to_idx : Returns the index in the cpufreq frequencytable * powernv_freqs for the frequency whose corresponding * pstate id is @pstate. * * If no frequency corresponding to @pstate is found, * this will return the index of the nominal * frequency.
*/ staticunsignedint pstate_to_idx(u8 pstate)
{ unsignedint key = pstate % POWERNV_MAX_PSTATES; struct pstate_idx_revmap_data *revmap_data;
/* * Initialize the freq table based on data obtained * from the firmware passed via device-tree
*/ staticint init_powernv_pstates(void)
{ struct device_node *power_mgt; int i, nr_pstates = 0; const __be32 *pstate_ids, *pstate_freqs;
u32 len_ids, len_freqs;
u32 pstate_min, pstate_max, pstate_nominal;
u32 pstate_turbo, pstate_ultra_turbo; int rc = -ENODEV;
power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); if (!power_mgt) {
pr_warn("power-mgt node not found\n"); return -ENODEV;
}
if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) {
pr_warn("ibm,pstate-min node not found\n"); goto out;
}
if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) {
pr_warn("ibm,pstate-max node not found\n"); goto out;
}
if (of_property_read_u32(power_mgt, "ibm,pstate-nominal",
&pstate_nominal)) {
pr_warn("ibm,pstate-nominal not found\n"); goto out;
}
next:
pr_info("cpufreq pstate min 0x%x nominal 0x%x max 0x%x\n", pstate_min,
pstate_nominal, pstate_max);
pr_info("Workload Optimized Frequency is %s in the platform\n",
str_enabled_disabled(powernv_pstate_info.wof_enabled));
pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids); if (!pstate_ids) {
pr_warn("ibm,pstate-ids not found\n"); goto out;
}
pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz",
&len_freqs); if (!pstate_freqs) {
pr_warn("ibm,pstate-frequencies-mhz not found\n"); goto out;
}
if (len_ids != len_freqs) {
pr_warn("Entries in ibm,pstate-ids and " "ibm,pstate-frequencies-mhz does not match\n");
}
if (id == pstate_max)
powernv_pstate_info.max = i; if (id == pstate_nominal)
powernv_pstate_info.nominal = i; if (id == pstate_min)
powernv_pstate_info.min = i;
if (powernv_pstate_info.wof_enabled && id == pstate_turbo) { int j;
for (j = i - 1; j >= (int)powernv_pstate_info.max; j--)
powernv_freqs[j].flags = CPUFREQ_BOOST_FREQ;
}
}
/* End of list marker entry */
powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
/*
 * pstate_id_to_freq : Returns the CPU frequency corresponding to @pstate_id.
 *
 *		       If @pstate_id is not found in the pstate table, the
 *		       nominal frequency is reported instead (with a warning).
 */
static unsigned int pstate_id_to_freq(u8 pstate_id)
{
	int i;

	i = pstate_to_idx(pstate_id);
	/*
	 * pstate_to_idx() returns the nominal index on a failed lookup, but
	 * guard against out-of-range values anyway before indexing the table.
	 */
	if (i >= powernv_pstate_info.nr_pstates || i < 0) {
		pr_warn("PState id 0x%x outside of PState table, reporting nominal id 0x%x instead\n",
			pstate_id, idx_to_pstate(powernv_pstate_info.nominal));
		i = powernv_pstate_info.nominal;
	}

	return powernv_freqs[i].frequency;
}
/*
 * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by
 * the firmware
 */
static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy,
					 char *buf)
{
	unsigned int nominal_freq;

	nominal_freq = powernv_freqs[powernv_pstate_info.nominal].frequency;

	return sprintf(buf, "%u\n", nominal_freq);
}
case SPRN_PMICR:
mtspr(SPRN_PMICR, val); return;
}
BUG();
}
/*
 * Use objects of this type to query/update
 * pstates on a remote CPU via smp_call_function.
 */
struct powernv_smp_call_data {
	unsigned int freq;	/* frequency in kHz, filled in by powernv_read_cpu_freq() */
	u8 pstate_id;		/* local pstate id to set/read */
	u8 gpstate_id;		/* global pstate id to set/read */
};
/* * powernv_read_cpu_freq: Reads the current frequency on this CPU. * * Called via smp_call_function. * * Note: The caller of the smp_call_function should pass an argument of * the type 'struct powernv_smp_call_data *' along with this function. * * The current frequency on this CPU will be returned via * ((struct powernv_smp_call_data *)arg)->freq;
*/ staticvoid powernv_read_cpu_freq(void *arg)
{ unsignedlong pmspr_val; struct powernv_smp_call_data *freq_data = arg;
/* * powernv_cpufreq_get: Returns the CPU frequency as reported by the * firmware for CPU 'cpu'. This value is reported through the sysfs * file cpuinfo_cur_freq.
*/ staticunsignedint powernv_cpufreq_get(unsignedint cpu)
{ struct powernv_smp_call_data freq_data;
/*
 * set_pstate: Sets the pstate on this CPU.
 *
 * This is called via an smp_call_function.
 *
 * The caller must ensure that freq_data is of the type
 * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
 * on this CPU should be present in freq_data->pstate_id.
 */
static void set_pstate(void *data)
{
	unsigned long val;
	struct powernv_smp_call_data *freq_data = data;
	unsigned long pstate_ul = freq_data->pstate_id;
	unsigned long gpstate_ul = freq_data->gpstate_id;

	val = get_pmspr(SPRN_PMCR);
	/* Clear the global and local pstate fields before setting them */
	val = val & 0x0000FFFFFFFFFFFFULL;

	/* Set both global(bits 56..63) and local(bits 48..55) PStates */
	val = val | (gpstate_ul << 56) | (pstate_ul << 48);

	pr_debug("Setting cpu %d pmcr to %016lX\n",
		 raw_smp_processor_id(), val);
	set_pmspr(SPRN_PMCR, val);
}
/*
 * get_nominal_index: Returns the index corresponding to the nominal
 * pstate in the cpufreq table
 */
static inline unsigned int get_nominal_index(void)
{
	return powernv_pstate_info.nominal;
}
/* Check for Pmax Capping */
pmsr_pmax = extract_max_pstate(pmsr);
pmsr_pmax_idx = pstate_to_idx(pmsr_pmax); if (pmsr_pmax_idx != powernv_pstate_info.max) { if (chip->throttled) goto next;
chip->throttled = true; if (pmsr_pmax_idx > powernv_pstate_info.nominal) {
pr_warn_once("CPU %d on Chip %u has Pmax(0x%x) reduced below that of nominal frequency(0x%x)\n",
cpu, chip->id, pmsr_pmax,
idx_to_pstate(powernv_pstate_info.nominal));
chip->throttle_sub_turbo++;
} else {
chip->throttle_turbo++;
}
trace_powernv_throttle(chip->id,
throttle_reason[chip->throttle_reason],
pmsr_pmax);
} elseif (chip->throttled) {
chip->throttled = false;
trace_powernv_throttle(chip->id,
throttle_reason[chip->throttle_reason],
pmsr_pmax);
}
/* Check if Psafe_mode_active is set in PMSR. */
next: if (pmsr & PMSR_PSAFE_ENABLE) {
throttled = true;
pr_info("Pstate set to safe frequency\n");
}
/* Check if SPR_EM_DISABLE is set in PMSR */ if (pmsr & PMSR_SPR_EM_DISABLE) {
throttled = true;
pr_info("Frequency Control disabled from OS\n");
}
if (throttled) {
pr_info("PMSR = %16lx\n", pmsr);
pr_warn("CPU Frequency could be throttled\n");
}
}
/** * calc_global_pstate - Calculate global pstate * @elapsed_time: Elapsed time in milliseconds * @local_pstate_idx: New local pstate * @highest_lpstate_idx: pstate from which its ramping down * * Finds the appropriate global pstate based on the pstate from which its * ramping down and the time elapsed in ramping down. It follows a quadratic * equation which ensures that it reaches ramping down to pmin in 5sec.
*/ staticinlineint calc_global_pstate(unsignedint elapsed_time, int highest_lpstate_idx, int local_pstate_idx)
{ int index_diff;
/* * Using ramp_down_percent we get the percentage of rampdown * that we are expecting to be dropping. Difference between * highest_lpstate_idx and powernv_pstate_info.min will give a absolute * number of how many pstates we will drop eventually by the end of * 5 seconds, then just scale it get the number pstates to be dropped.
*/
index_diff = ((int)ramp_down_percent(elapsed_time) *
(powernv_pstate_info.min - highest_lpstate_idx)) / 100;
/* Ensure that global pstate is >= to local pstate */ if (highest_lpstate_idx + index_diff >= local_pstate_idx) return local_pstate_idx; else return highest_lpstate_idx + index_diff;
}
/* * Setting up timer to fire after GPSTATE_TIMER_INTERVAL ms, But * if it exceeds MAX_RAMP_DOWN_TIME ms for ramp down time. * Set timer such that it fires exactly at MAX_RAMP_DOWN_TIME * seconds of ramp down time.
*/ if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
> MAX_RAMP_DOWN_TIME)
timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time; else
timer_interval = GPSTATE_TIMER_INTERVAL;
/** * gpstate_timer_handler * * @t: Timer context used to fetch global pstate info struct * * This handler brings down the global pstate closer to the local pstate * according quadratic equation. Queues a new timer if it is still not equal * to local pstate
*/ staticvoid gpstate_timer_handler(struct timer_list *t)
{ struct global_pstate_info *gpstates = timer_container_of(gpstates, t,
timer); struct cpufreq_policy *policy = gpstates->policy; int gpstate_idx, lpstate_idx; unsignedlong val; unsignedint time_diff = jiffies_to_msecs(jiffies)
- gpstates->last_sampled_time; struct powernv_smp_call_data freq_data;
if (!spin_trylock(&gpstates->gpstate_lock)) return; /* * If the timer has migrated to the different cpu then bring * it back to one of the policy->cpus
*/ if (!cpumask_test_cpu(raw_smp_processor_id(), policy->cpus)) {
gpstates->timer.expires = jiffies + msecs_to_jiffies(1);
add_timer_on(&gpstates->timer, cpumask_first(policy->cpus));
spin_unlock(&gpstates->gpstate_lock); return;
}
/* * If PMCR was last updated was using fast_switch then * We may have wrong in gpstate->last_lpstate_idx * value. Hence, read from PMCR to get correct data.
*/
val = get_pmspr(SPRN_PMCR);
freq_data.gpstate_id = extract_global_pstate(val);
freq_data.pstate_id = extract_local_pstate(val); if (freq_data.gpstate_id == freq_data.pstate_id) {
reset_gpstates(policy);
spin_unlock(&gpstates->gpstate_lock); return;
}
if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
gpstate_idx = pstate_to_idx(freq_data.pstate_id);
lpstate_idx = gpstate_idx;
reset_gpstates(policy);
gpstates->highest_lpstate_idx = gpstate_idx;
} else {
lpstate_idx = pstate_to_idx(freq_data.pstate_id);
gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
gpstates->highest_lpstate_idx,
lpstate_idx);
}
freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
gpstates->last_gpstate_idx = gpstate_idx;
gpstates->last_lpstate_idx = lpstate_idx; /* * If local pstate is equal to global pstate, rampdown is over * So timer is not required to be queued.
*/ if (gpstate_idx != gpstates->last_lpstate_idx)
queue_gpstate_timer(gpstates);
/* * powernv_cpufreq_target_index: Sets the frequency corresponding to * the cpufreq table entry indexed by new_index on the cpus in the * mask policy->cpus
*/ staticint powernv_cpufreq_target_index(struct cpufreq_policy *policy, unsignedint new_index)
{ struct powernv_smp_call_data freq_data; unsignedint cur_msec, gpstate_idx; struct global_pstate_info *gpstates = policy->driver_data;
if (unlikely(rebooting) && new_index != get_nominal_index()) return 0;
if (!throttled) { /* we don't want to be preempted while * checking if the CPU frequency has been throttled
*/
preempt_disable();
powernv_cpufreq_throttle_check(NULL);
preempt_enable();
}
if (gpstates->last_gpstate_idx < new_index) {
gpstates->elapsed_time += cur_msec -
gpstates->last_sampled_time;
/* * If its has been ramping down for more than MAX_RAMP_DOWN_TIME * we should be resetting all global pstate related data. Set it * equal to local pstate to start fresh.
*/ if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
reset_gpstates(policy);
gpstates->highest_lpstate_idx = new_index;
gpstate_idx = new_index;
} else { /* Elaspsed_time is less than 5 seconds, continue to rampdown */
gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
gpstates->highest_lpstate_idx,
new_index);
}
} else {
reset_gpstates(policy);
gpstates->highest_lpstate_idx = new_index;
gpstate_idx = new_index;
}
/* * If local pstate is equal to global pstate, rampdown is over * So timer is not required to be queued.
*/ if (gpstate_idx != new_index)
queue_gpstate_timer(gpstates); else
timer_delete_sync(&gpstates->timer);
no_gpstate: /* * Use smp_call_function to send IPI and execute the * mtspr on target CPU. We could do that without IPI * if current CPU is within policy->cpus (core)
*/
smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1); return 0;
}
for (i = 0; i < threads_per_core; i++)
cpumask_set_cpu(base + i, policy->cpus);
kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name); if (!kn) { int ret;
ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp); if (ret) {
pr_info("Failed to create throttle stats directory for cpu %d\n",
policy->cpu); return ret;
}
} else {
kernfs_put(kn);
}
switch (omsg.type) { case OCC_RESET:
occ_reset = true;
pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n"); /* * powernv_cpufreq_throttle_check() is called in * target() callback which can detect the throttle state * for governors like ondemand. * But static governors will not call target() often thus * report throttling here.
*/ if (!throttled) {
throttled = true;
pr_warn("CPU frequency is throttled for duration\n");
}
break; case OCC_LOAD:
pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n"); break; case OCC_THROTTLE:
omsg.chip = be64_to_cpu(msg->params[1]);
omsg.throttle_status = be64_to_cpu(msg->params[2]);
if (occ_reset) {
occ_reset = false;
throttled = false;
pr_info("OCC Active, CPU frequency is no longer throttled\n");
for (i = 0; i < nr_chips; i++) {
chips[i].restore = true;
schedule_work(&chips[i].throttle);
}
return 0;
}
for (i = 0; i < nr_chips; i++) if (chips[i].id == omsg.chip) break;
staticint init_chip_info(void)
{ unsignedint *chip; unsignedint cpu, i; unsignedint prev_chip_id = UINT_MAX;
cpumask_t *chip_cpu_mask; int ret = 0;
chip = kcalloc(num_possible_cpus(), sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM;
/* Allocate a chip cpu mask large enough to fit mask for all chips */
chip_cpu_mask = kcalloc(MAX_NR_CHIPS, sizeof(cpumask_t), GFP_KERNEL); if (!chip_cpu_mask) {
ret = -ENOMEM; goto free_and_return;
}
for_each_possible_cpu(cpu) { unsignedint id = cpu_to_chip_id(cpu);
return 0;
cleanup:
clean_chip_info();
out:
pr_info("Platform driver disabled. System does not support PState control\n"); return rc;
}
module_init(powernv_cpufreq_init);
/*
 * NOTE(review): The following text is website boilerplate that was
 * accidentally appended to this source file; it is not part of the driver
 * and, left as raw prose, breaks compilation. Preserved here as a comment,
 * translated from German:
 *
 * "The information on this web page was carefully compiled to the best of
 *  our knowledge. However, neither completeness, nor correctness, nor the
 *  quality of the provided information is guaranteed.
 *  Note: The colored syntax highlighting and the measurement are still
 *  experimental."
 */