/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in other CPU reading this CPU's IRQ time and can
 * race with irq/vtime_account on this CPU. We would either get old
 * or new value with a side effect of accounting a slice of IRQ time to wrong
 * task when IRQ is in progress while we read rq->clock. That is a worthy
 * compromise in place of having locks on each IRQ in account_system_time.
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
/* * Called after incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit.
*/ void irqtime_account_irq(struct task_struct *curr, unsignedint offset)
{ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); unsignedint pc;
s64 delta; int cpu;
if (!irqtime_enabled()) return;
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
pc = irq_count() - offset;
/* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run.
*/ if (pc & HARDIRQ_MASK)
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); elseif ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
/*
 * Charge @tmp nanoseconds of cputime to cpustat bucket @index, both in
 * the per-CPU kernel_cpustat and in @p's cgroup hierarchy.
 */
static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);

	cgroup_account_cputime_field(p, index, tmp);
}
/*
 * Account user CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in user space since the last update
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);

	/* Niced tasks are charged to CPUTIME_NICE instead of CPUTIME_USER. */
	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for user time used */
	acct_account_cputime(p);
}
/*
 * Account guest CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in virtual machine since the last update
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process; guest time also counts as user time. */
	p->utime += cputime;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat; niced guests go to the *_NICE buckets. */
	if (task_nice(p) > 0) {
		task_group_account_field(p, CPUTIME_NICE, cputime);
		cpustat[CPUTIME_GUEST_NICE] += cputime;
	} else {
		task_group_account_field(p, CPUTIME_USER, cputime);
		cpustat[CPUTIME_GUEST] += cputime;
	}
}
/*
 * Account system CPU time to a process and desired cpustat field
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in kernel space since the last update
 * @index: the cpustat field that has to be updated
 */
void account_system_index_time(struct task_struct *p,
			       u64 cputime, enum cpu_usage_stat index)
{
	/* Add system time to process. */
	p->stime += cputime;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for system time used */
	acct_account_cputime(p);
}
/*
 * Account system CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the CPU time spent in kernel space since the last update
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
	int index;

	/*
	 * Demultiplex to the matching cpustat bucket: hardirq context
	 * (beyond @hardirq_offset) goes to CPUTIME_IRQ, softirq service
	 * to CPUTIME_SOFTIRQ, everything else is plain system time.
	 */
	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	account_system_index_time(p, cputime, index);
}
/*
 * Account for involuntary wait time.
 * @cputime: the CPU time spent in involuntary wait
 */
void account_steal_time(u64 cputime)
{
	/* Steal time is aggregated per CPU only, never per task. */
	kcpustat_this_cpu->cpustat[CPUTIME_STEAL] += cputime;
}
/* * Account for idle time. * @cputime: the CPU time spent in idle wait
*/ void account_idle_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat; struct rq *rq = this_rq();
/* * When a guest is interrupted for a longer amount of time, missed clock * ticks are not redelivered later. Due to that, this function may on * occasion account more time than the calling functions think elapsed.
*/ static __always_inline u64 steal_account_process_time(u64 maxtime)
{ #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) {
u64 steal;
/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	u64 utime, stime;
	struct task_struct *t;
	unsigned int seq, nextseq;
	unsigned long flags;

	/*
	 * Update current task runtime to account pending time since last
	 * scheduler action or thread_group_cputime() call. This thread group
	 * might have other running tasks on different CPUs, but updating
	 * their runtime can affect syscall performance, so we skip account
	 * those pending times and rely only on values updated on tick or
	 * other scheduler action.
	 */
	if (same_thread_group(current, tsk))
		(void) task_sched_runtime(current);

	rcu_read_lock();
	/* Attempt a lockless read on the first round. */
	nextseq = 0;
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		/* Start from the accumulated totals of already-dead threads. */
		times->utime = sig->utime;
		times->stime = sig->stime;
		times->sum_exec_runtime = sig->sum_sched_runtime;

		/* Add the pending cputime of every live thread in the group. */
		for_each_thread(tsk, t) {
			task_cputime(t, &utime, &stime);
			times->utime += utime;
			times->stime += stime;
			times->sum_exec_runtime += read_sum_exec_runtime(t);
		}
		/* If lockless access failed, take the lock. */
		nextseq = 1;
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
	rcu_read_unlock();
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the CPU time gets accounted to
 * @user_tick: is the tick from userspace
 * @ticks: number of ticks being accounted
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * Check for hardirq is done both for system and user time as there is
 * no timer going off while we are on hardirq and hence we may never get an
 * opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on IRQ
 * softirq as those do not count in task exec_runtime any more.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 int ticks)
{
	u64 other, cputime = TICK_NSEC * ticks;

	/*
	 * When returning from idle, many ticks can get accounted at
	 * once, including some ticks of steal, IRQ, and softirq time.
	 * Subtract those ticks from the amount of time accounted to
	 * idle, or potentially user or system time. Due to rounding,
	 * other time can exceed ticks occasionally.
	 */
	other = account_other_time(ULONG_MAX);
	if (other >= cputime)
		return;

	cputime -= other;

	if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time do not get accounted in cpu_softirq_time.
		 * So, we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime);
	} else if (p == this_rq()->idle) {
		account_idle_time(cputime);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime);
	} else {
		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
	}
}
/* * Account a single tick of CPU time. * @p: the process that the CPU time gets accounted to * @user_tick: indicates if the tick is a user or a system tick
*/ void account_process_tick(struct task_struct *p, int user_tick)
{
u64 cputime, steal;
if (vtime_accounting_enabled_this_cpu()) return;
if (irqtime_enabled()) {
irqtime_account_process_tick(p, user_tick, 1); return;
}
/* * Adjust tick based cputime random precision against scheduler runtime * accounting. * * Tick based cputime accounting depend on random scheduling timeslices of a * task to be interrupted or not by the timer. Depending on these * circumstances, the number of these interrupts may be over or * under-optimistic, matching the real user and system cputime with a variable * precision. * * Fix this by scaling these tick based values against the total runtime * accounted by the CFS scheduler. * * This code provides the following guarantees: * * stime + utime == rtime * stime_i+1 >= stime_i, utime_i+1 >= utime_i * * Assuming that rtime_i+1 >= rtime_i.
*/ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
u64 *ut, u64 *st)
{
u64 rtime, stime, utime; unsignedlong flags;
/* Serialize concurrent callers such that we can honour our guarantees */
raw_spin_lock_irqsave(&prev->lock, flags);
rtime = curr->sum_exec_runtime;
/* * This is possible under two circumstances: * - rtime isn't monotonic after all (a bug); * - we got reordered by the lock. * * In both cases this acts as a filter such that the rest of the code * can assume it is monotonic regardless of anything else.
*/ if (prev->stime + prev->utime >= rtime) goto out;
stime = curr->stime;
utime = curr->utime;
/* * If either stime or utime are 0, assume all runtime is userspace. * Once a task gets some ticks, the monotonicity code at 'update:' * will ensure things converge to the observed ratio.
*/ if (stime == 0) {
utime = rtime; goto update;
}
if (utime == 0) {
stime = rtime; goto update;
}
stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); /* * Because mul_u64_u64_div_u64() can approximate on some * achitectures; enforce the constraint that: a*b/(b+c) <= a.
*/ if (unlikely(stime > rtime))
stime = rtime;
update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. * * utime_i+1 = rtime_i+1 - stime_i * = rtime_i+1 - (rtime_i - utime_i) * = (rtime_i+1 - rtime_i) + utime_i * >= utime_i
*/ if (stime < prev->stime)
stime = prev->stime;
utime = rtime - stime;
/* * Make sure utime doesn't go backwards; this still preserves * monotonicity for stime, analogous argument to above.
*/ if (utime < prev->utime) {
utime = prev->utime;
stime = rtime - utime;
}
/* * Unlike tick based timing, vtime based timing never has lost * ticks, and no need for steal time accounting to make up for * lost ticks. Vtime accounts a rounded version of actual * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative.
*/
other = account_other_time(delta);
WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
vtime->starttime += delta;
/*
 * Mark @tsk as running guest (virtual machine) code for vtime accounting.
 *
 * The flags must be updated under the lock with
 * the vtime_starttime flush and update.
 * That enforces a right ordering and update sequence
 * synchronization against the reader (task_gtime())
 * that can thus safely catch up with a tickless delta.
 */
void vtime_guest_enter(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	write_seqcount_begin(&vtime->seqcount);
	/* Flush pending system time before switching accounting state. */
	vtime_account_system(tsk, vtime);
	tsk->flags |= PF_VCPU;
	vtime->state = VTIME_GUEST;
	write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
} while (read_seqcount_retry(&vtime->seqcount, seq));
return gtime;
}
/*
 * Fetch cputime raw values from fields of task_struct and
 * add up the pending nohz execution time since the last
 * cputime snapshot.
 *
 * Returns true when a pending nohz delta was added, false when the
 * task was sleeping or idle and the snapshot was used unchanged.
 */
bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
	struct vtime *vtime = &t->vtime;
	unsigned int seq;
	u64 delta;
	int ret;

	do {
		ret = false;
		seq = read_seqcount_begin(&vtime->seqcount);

		*utime = t->utime;
		*stime = t->stime;

		/* Task is sleeping or idle, nothing to add */
		if (vtime->state < VTIME_SYS)
			continue;

		ret = true;
		delta = vtime_delta(vtime);

		/*
		 * Task runs either in user (including guest) or kernel space,
		 * add pending nohz time to the right place.
		 */
		if (vtime->state == VTIME_SYS)
			*stime += vtime->stime + delta;
		else
			*utime += vtime->utime + delta;
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return ret;
}
staticint vtime_state_fetch(struct vtime *vtime, int cpu)
{ int state = READ_ONCE(vtime->state);
/* * We raced against a context switch, fetch the * kcpustat task again.
*/ if (vtime->cpu != cpu && vtime->cpu != -1) return -EAGAIN;
/* * Two possible things here: * 1) We are seeing the scheduling out task (prev) or any past one. * 2) We are seeing the scheduling in task (next) but it hasn't * passed though vtime_task_switch() yet so the pending * cputime of the prev task may not be flushed yet. * * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
*/ if (state == VTIME_INACTIVE) return -EAGAIN;
state = vtime_state_fetch(vtime, cpu); if (state < 0) return state;
*val = cpustat[usage];
/* * Nice VS unnice cputime accounting may be inaccurate if * the nice value has changed since the last vtime update. * But proper fix would involve interrupting target on nice * updates which is a no go on nohz_full (although the scheduler * may still interrupt the target if rescheduling is needed...)
*/ switch (usage) { case CPUTIME_SYSTEM: if (state == VTIME_SYS)
*val += vtime->stime + vtime_delta(vtime); break; case CPUTIME_USER: if (task_nice(tsk) <= 0)
*val += kcpustat_user_vtime(vtime); break; case CPUTIME_NICE: if (task_nice(tsk) > 0)
*val += kcpustat_user_vtime(vtime); break; case CPUTIME_GUEST: if (state == VTIME_GUEST && task_nice(tsk) <= 0)
*val += vtime->gtime + vtime_delta(vtime); break; case CPUTIME_GUEST_NICE: if (state == VTIME_GUEST && task_nice(tsk) > 0)
*val += vtime->gtime + vtime_delta(vtime); break; default: break;
}
} while (read_seqcount_retry(&vtime->seqcount, seq));
return 0;
}
u64 kcpustat_field(struct kernel_cpustat *kcpustat, enum cpu_usage_stat usage, int cpu)
{
u64 *cpustat = kcpustat->cpustat;
u64 val = cpustat[usage]; struct rq *rq; int err;
if (!vtime_accounting_enabled_cpu(cpu)) return val;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.