// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel internal timers
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 *  2002-05-31  Move sys_sysinfo here and make its locking sane, Robert Love
 *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
 *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
 *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
 */
/*
 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
 * level has a different granularity.
 *
 * The level granularity is:		LVL_CLK_DIV ^ level
 * The level clock frequency is:	HZ / (LVL_CLK_DIV ^ level)
 *
 * The array level of a newly armed timer depends on the relative expiry
 * time. The farther the expiry time is away the higher the array level and
 * therefore the granularity becomes.
 *
 * Contrary to the original timer wheel implementation, which aims for 'exact'
 * expiry of the timers, this implementation removes the need for recascading
 * the timers into the lower array levels. The previous 'classic' timer wheel
 * implementation of the kernel already violated the 'exact' expiry by adding
 * slack to the expiry time to provide batched expiration. The granularity
 * levels provide implicit batching.
 *
 * This is an optimization of the original timer wheel implementation for the
 * majority of the timer wheel use cases: timeouts. The vast majority of
 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
 * the timeout expires it indicates that normal operation is disturbed, so it
 * does not matter much whether the timeout comes with a slight delay.
 *
 * The only exception to this are networking timers with a small expiry
 * time. They rely on the granularity. Those fit into the first wheel level,
 * which has HZ granularity.
 *
 * We don't have cascading anymore. Timers with an expiry time above the
 * capacity of the last wheel level are force expired at the maximum timeout
 * value of the last wheel level. From data sampling we know that the maximum
 * value observed is 5 days (network connection tracking), so this should not
 * be an issue.
 *
 * The currently chosen array constants values are a good compromise between
 * array size and granularity.
 *
 * This results in the following granularity and range levels:
 *
 * HZ 1000 steps
 * Level Offset  Granularity            Range
 *  0      0         1 ms                0 ms -         63 ms
 *  1     64         8 ms               64 ms -        511 ms
 *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
 *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
 *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
 *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
 *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
 *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
 *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
 *
 * HZ  300
 * Level Offset  Granularity            Range
 *  0      0         3 ms                0 ms -        210 ms
 *  1     64        26 ms              213 ms -       1703 ms (213ms - ~1s)
 *  2    128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
 *  3    192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
 *  4    256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
 *  5    320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
 *  6    384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
 *  7    448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
 *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
 *
 * HZ  250
 * Level Offset  Granularity            Range
 *  0      0         4 ms                0 ms -        255 ms
 *  1     64        32 ms              256 ms -       2047 ms (256ms - ~2s)
 *  2    128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
 *  3    192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
 *  4    256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
 *  5    320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
 *  6    384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
 *  7    448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
 *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
 *
 * HZ  100
 * Level Offset  Granularity            Range
 *  0      0        10 ms                0 ms -        630 ms
 *  1     64        80 ms              640 ms -       5110 ms (640ms - ~5s)
 *  2    128       640 ms             5120 ms -      40950 ms (~5s - ~40s)
 *  3    192      5120 ms (~5s)      40960 ms -     327670 ms (~40s - ~5m)
 *  4    256     40960 ms (~40s)    327680 ms -    2621430 ms (~5m - ~43m)
 *  5    320    327680 ms (~5m)    2621440 ms -   20971510 ms (~43m - ~5h)
 *  6    384   2621440 ms (~43m)  20971520 ms -  167772150 ms (~5h - ~1d)
 *  7    448  20971520 ms (~5h)  167772160 ms - 1342177270 ms (~1d - ~15d)
 */
/*
 * The time start value for each level to select the bucket at enqueue
 * time. We start from the last possible delta of the previous level
 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
 */
#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
/* The cutoff (max. capacity of the wheel) */
#define WHEEL_TIMEOUT_CUTOFF	(LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX	(WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
/*
 * The resulting wheel size. If NOHZ is configured we allocate extra
 * wheels (see NR_BASES below) so we have separate storage for the
 * deferrable timers.
 */
#define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH)
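/*
 * Illustrative sketch, not part of the original file: how the level
 * constants translate into tick granularity. The tables above imply
 * LVL_CLK_DIV == 8, i.e. LVL_CLK_SHIFT == 3: each level is 8x coarser
 * than the previous one, so at HZ=1000 a level 2 bucket is
 * 1 << (3 * 2) = 64 ticks = 64 ms wide.
 */
static inline unsigned long example_lvl_gran_ticks(unsigned int lvl)
{
	/* Granularity of @lvl in ticks: LVL_CLK_DIV ^ lvl */
	return 1UL << (LVL_CLK_SHIFT * lvl);
}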
#ifdef CONFIG_NO_HZ_COMMON
/*
 * If multiple bases need to be locked, use the base ordering for lock
 * nesting, i.e. lowest number first.
 */
# define NR_BASES	3
# define BASE_LOCAL	0
# define BASE_GLOBAL	1
# define BASE_DEF	2
#else
# define NR_BASES	1
# define BASE_LOCAL	0
# define BASE_GLOBAL	0
# define BASE_DEF	0
#endif
/**
 * struct timer_base - Per CPU timer base (number of base depends on config)
 * @lock:		Lock protecting the timer_base
 * @running_timer:	When expiring timers, the lock is dropped. To make
 *			sure not to race against deleting/modifying a
 *			currently running timer, the pointer is set to the
 *			timer, which expires at the moment. If no timer is
 *			running, the pointer is NULL.
 * @expiry_lock:	PREEMPT_RT only: Lock is taken in softirq around
 *			timer expiry callback execution and when trying to
 *			delete a running timer and it wasn't successful at
 *			the first attempt. It prevents priority inversion
 *			when the callback was preempted on a remote CPU and
 *			a caller tries to delete the running timer. It also
 *			prevents a live lock, when the task which tries to
 *			delete a timer preempted the softirq thread which
 *			is running the timer callback function.
 * @timer_waiters:	PREEMPT_RT only: Tells, if there is a waiter
 *			waiting for the end of the timer callback function
 *			execution.
 * @clk:		clock of the timer base; is updated before enqueue
 *			of a timer; during expiry, it is 1 offset ahead of
 *			jiffies to avoid endless requeuing to current
 *			jiffies
 * @next_expiry:	expiry value of the first timer; it is updated when
 *			finding the next timer and during enqueue; the
 *			value is not valid, when next_expiry_recalc is set
 * @cpu:		Number of CPU the timer base belongs to
 * @next_expiry_recalc: States, whether a recalculation of next_expiry is
 *			required. Value is set true, when a timer was
 *			deleted.
 * @is_idle:		Is set, when timer_base is idle. It is triggered by
 *			NOHZ code. This state is only used in the standard
 *			base. Deferrable timers, which are enqueued remotely,
 *			never wake up an idle CPU, so there is no need to
 *			support it for this base.
 * @timers_pending:	Is set, when a timer is pending in the base. It is
 *			only reliable when next_expiry_recalc is not set.
 * @pending_map:	bitmap of the timer wheel; each bit reflects a
 *			bucket of the wheel. When a bit is set, at least a
 *			single timer is enqueued in the related bucket.
 * @vectors:		Array of lists; Each array member reflects a bucket
 *			of the timer wheel. The list contains all timers
 *			which are enqueued into a specific bucket.
 */
struct timer_base {
	raw_spinlock_t		lock;
	struct timer_list	*running_timer;
#ifdef CONFIG_PREEMPT_RT
	spinlock_t		expiry_lock;
	atomic_t		timer_waiters;
#endif
	unsigned long		clk;
	unsigned long		next_expiry;
	unsigned int		cpu;
	bool			next_expiry_recalc;
	bool			is_idle;
	bool			timers_pending;
	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
	struct hlist_head	vectors[WHEEL_SIZE];
} ____cacheline_aligned;
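/*
 * Hypothetical sketch, not part of the original file, of the lock-nesting
 * rule stated above the NR_BASES defines: when two bases of a CPU must be
 * held, take the lower-numbered base (BASE_LOCAL) before the
 * higher-numbered one (BASE_GLOBAL).
 */
static inline void example_lock_two_bases(struct timer_base *base_local,
					  struct timer_base *base_global)
{
	/* Lowest base number first, per the ordering rule above */
	raw_spin_lock(&base_local->lock);
	raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);

	/* ... operate on both bases ... */

	/* Release in reverse order */
	raw_spin_unlock(&base_global->lock);
	raw_spin_unlock(&base_local->lock);
}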
static unsigned long round_jiffies_common(unsigned long j, int cpu,
					  bool force_up)
{
	int rem;
	unsigned long original = j;

	/*
	 * We don't want all cpus firing their timers at once hitting the
	 * same lock or cachelines, so we skew each extra cpu with an extra
	 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
	 * already did this.
	 * The skew is done by adding 3*cpunr, then round, then subtract this
	 * extra offset again.
	 */
	j += cpu * 3;

	rem = j % HZ;

	/*
	 * If the target jiffy is just after a whole second (which can happen
	 * due to delays of the timer irq, long irq off times etc etc) then
	 * we should round down to the whole second, not up. Use 1/4th second
	 * as cutoff for this rounding as an extreme upper bound for this.
	 * But never round down if @force_up is set.
	 */
	if (rem < HZ/4 && !force_up) /* round down */
		j = j - rem;
	else /* round up */
		j = j - rem + HZ;

	/* now that we have rounded, subtract the extra skew again */
	j -= cpu * 3;

	/*
	 * Make sure j is still in the future. Otherwise return the
	 * unmodified value.
	 */
	return time_is_after_jiffies(j) ? j : original;
}
/**
 * __round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
	unsigned long j0 = jiffies;

	/* Use j0 because jiffies might change while we run */
	return round_jiffies_common(j + j0, cpu, false) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);
/**
 * round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies(unsigned long j)
{
	return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);
/**
 * round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
	return __round_jiffies_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);
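/*
 * Hypothetical usage sketch, not part of the original file: a periodic
 * housekeeping timer that only needs roughly one-second resolution.
 * example_rearm_housekeeping() is an illustrative name; the rounding
 * batches wakeups of such timers across the system to save power.
 */
static void example_rearm_housekeeping(struct timer_list *t)
{
	/* Fire again in ~10 s, aligned to (approximately) a full second */
	mod_timer(t, jiffies + round_jiffies_relative(10 * HZ));
}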
/**
 * __round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies_relative() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
	unsigned long j0 = jiffies;

	/* Use j0 because jiffies might change while we run */
	return round_jiffies_common(j + j0, cpu, true) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
/**
 * round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * This is the same as round_jiffies() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long round_jiffies_up(unsigned long j)
{
	return round_jiffies_common(j, raw_smp_processor_id(), true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);
/**
 * round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * This is the same as round_jiffies_relative() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
	return __round_jiffies_up_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
/*
 * Helper function to calculate the array index for a given expiry
 * time.
 */
static inline unsigned calc_index(unsigned long expires, unsigned lvl,
				  unsigned long *bucket_expiry)
{
	/*
	 * The timer wheel has to guarantee that a timer does not fire
	 * early. Early expiry can happen due to:
	 * - Timer is armed at the edge of a tick
	 * - Truncation of the expiry time in the outer wheel levels
	 *
	 * Round up with level granularity to prevent this.
	 */
	expires = (expires >> LVL_SHIFT(lvl)) + 1;
	*bucket_expiry = expires << LVL_SHIFT(lvl);
	return LVL_OFFS(lvl) + (expires & LVL_MASK);
}
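/*
 * Worked example, illustrative and not part of the original file: with
 * HZ=1000 and LVL_CLK_SHIFT == 3, LVL_SHIFT(2) == 6. An expiry of 1000
 * ticks placed into level 2 becomes (1000 >> 6) + 1 = 16, so the bucket
 * expiry is 16 << 6 = 1024 ticks. The timer may fire up to one level
 * granularity (64 ticks) late, but never early.
 */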
static void trigger_dyntick_cpu(struct timer_base *base,
				struct timer_list *timer)
{
	/*
	 * Deferrable timers do not prevent the CPU from entering dynticks and
	 * are not taken into account on the idle/nohz_full path. An IPI when a
	 * new deferrable timer is enqueued will wake up the remote CPU but
	 * nothing will be done with the deferrable timer base. Therefore skip
	 * the remote IPI for deferrable timers completely.
	 */
	if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
		return;

	/*
	 * We might have to IPI the remote CPU if the base is idle and the
	 * timer is pinned. If it is a non pinned timer, it is only queued
	 * on the remote CPU, when timer was running during queueing. Then
	 * everything is handled by remote CPU anyway. If the other CPU is
	 * on the way to idle then it can't set base->is_idle as we hold
	 * the base lock:
	 */
	if (base->is_idle) {
		WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
			       tick_nohz_full_cpu(base->cpu)));
		wake_up_nohz_cpu(base->cpu);
	}
}
/*
 * Enqueue the timer into the hash bucket, mark it pending in
 * the bitmap, store the index in the timer flags then wake up
 * the target CPU if needed.
 */
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
			  unsigned int idx, unsigned long bucket_expiry)
{
	hlist_add_head(&timer->entry, base->vectors + idx);
	__set_bit(idx, base->pending_map);
	timer_set_idx(timer, idx);

	trace_timer_start(timer, bucket_expiry);

	/*
	 * Check whether this is the new first expiring timer. The
	 * effective expiry time of the timer is required here
	 * (bucket_expiry) instead of timer->expires.
	 */
	if (time_before(bucket_expiry, base->next_expiry)) {
		/*
		 * Set the next expiry time and kick the CPU so it
		 * can reevaluate the wheel:
		 */
		WRITE_ONCE(base->next_expiry, bucket_expiry);
		base->timers_pending = true;
		base->next_expiry_recalc = false;
		trigger_dyntick_cpu(base, timer);
	}
}
/* Stub timer callback for improperly used timers. */
static void stub_timer(struct timer_list *unused)
{
	WARN_ON(1);
}
/*
 * timer_fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 */
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_NOTAVAILABLE:
		timer_setup(timer, stub_timer, 0);
		return true;

	case ODEBUG_STATE_ACTIVE:
		WARN_ON(1);
		fallthrough;
	default:
		return false;
	}
}
/*
 * timer_fixup_free is called when:
 * - an active object is freed
 */
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		timer_delete_sync(timer);
		debug_object_free(timer, &timer_debug_descr);
		return true;
	default:
		return false;
	}
}

/**
 * timer_init_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 *       sync lock dependencies
 *
 * timer_init_key() must be done to a timer prior to calling *any* of the
 * other timer functions.
 */
void timer_init_key(struct timer_list *timer,
		    void (*func)(struct timer_list *),
		    unsigned int flags, const char *name,
		    struct lock_class_key *key)
{
	debug_init(timer);
	do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(timer_init_key);
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;

	/*
	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
	 * to use the deferrable base.
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
		index = BASE_DEF;

	return per_cpu_ptr(&timer_bases[index], cpu);
}
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;

	/*
	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
	 * to use the deferrable base.
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
		index = BASE_DEF;

	return this_cpu_ptr(&timer_bases[index]);
}
static inline void __forward_timer_base(struct timer_base *base,
					unsigned long basej)
{
	/*
	 * Check whether we can forward the base. We can only do that when
	 * @basej is past base->clk otherwise we might rewind base->clk.
	 */
	if (time_before_eq(basej, base->clk))
		return;

	/*
	 * If the next expiry value is > jiffies, then we fast forward to
	 * jiffies otherwise we forward to the next expiry value.
	 */
	if (time_after(base->next_expiry, basej)) {
		base->clk = basej;
	} else {
		if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
			return;
		base->clk = base->next_expiry;
	}
}
/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
 * that all timers which are tied to this base are locked, and the base itself
 * is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found in the base->vectors array.
 *
 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
 * to wait until the migration is done.
 */
static struct timer_base *lock_timer_base(struct timer_list *timer,
					  unsigned long *flags)
	__acquires(timer->base->lock)
{
	for (;;) {
		struct timer_base *base;
		u32 tf;

		/*
		 * We need to use READ_ONCE() here, otherwise the compiler
		 * might re-read @tf between the check for TIMER_MIGRATING
		 * and spin_lock().
		 */
		tf = READ_ONCE(timer->flags);

		if (!(tf & TIMER_MIGRATING)) {
			base = get_timer_base(tf);
			raw_spin_lock_irqsave(&base->lock, *flags);
			if (timer->flags == tf)
				return base;
			raw_spin_unlock_irqrestore(&base->lock, *flags);
		}
		cpu_relax();
	}
}
static inline int __mod_timer(struct timer_list *timer, unsigned long expires,
			      unsigned int options)
{
	unsigned long clk = 0, flags, bucket_expiry;
	struct timer_base *base, *new_base;
	unsigned int idx = UINT_MAX;
	int ret = 0;

	debug_assert_init(timer);

	/*
	 * This is a common optimization triggered by the networking code - if
	 * the timer is re-modified to have the same timeout or ends up in the
	 * same array bucket then just return:
	 */
	if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
		/*
		 * The downside of this optimization is that it can result in
		 * larger granularity than you would get from adding a new
		 * timer with this expiry.
		 */
		long diff = timer->expires - expires;

		if (!diff)
			return 1;
		if (options & MOD_TIMER_REDUCE && diff <= 0)
			return 1;

		/*
		 * We lock timer base and calculate the bucket index right
		 * here. If the timer ends up in the same bucket, then we
		 * just update the expiry time and avoid the whole
		 * dequeue/enqueue dance.
		 */
		base = lock_timer_base(timer, &flags);
		/*
		 * Has @timer been shutdown? This needs to be evaluated
		 * while holding base lock to prevent a race against the
		 * shutdown code.
		 */
		if (!timer->function)
			goto out_unlock;

		forward_timer_base(base);

		if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
		    time_before_eq(timer->expires, expires)) {
			ret = 1;
			goto out_unlock;
		}

		clk = base->clk;
		idx = calc_wheel_index(expires, clk, &bucket_expiry);

		/*
		 * Retrieve and compare the array index of the pending
		 * timer. If it matches set the expiry to the new value so a
		 * subsequent call will exit in the expires check above.
		 */
		if (idx == timer_get_idx(timer)) {
			if (!(options & MOD_TIMER_REDUCE))
				timer->expires = expires;
			else if (time_after(timer->expires, expires))
				timer->expires = expires;
			ret = 1;
			goto out_unlock;
		}
	} else {
		base = lock_timer_base(timer, &flags);
		/*
		 * Has @timer been shutdown? This needs to be evaluated
		 * while holding base lock to prevent a race against the
		 * shutdown code.
		 */
		if (!timer->function)
			goto out_unlock;

		forward_timer_base(base);
	}

	ret = detach_if_pending(timer, base, false);
	if (!ret && (options & MOD_TIMER_PENDING_ONLY))
		goto out_unlock;

	new_base = get_timer_this_cpu_base(timer->flags);

	if (base != new_base) {
		/*
		 * We are trying to schedule the timer on the new base.
		 * However we can't change the timer's base while it is
		 * running, otherwise timer_delete_sync() can't detect that
		 * the timer's handler has not finished yet. This also
		 * guarantees that the timer is serialized wrt itself.
		 */
		if (likely(base->running_timer != timer)) {
			/* See the comment in lock_timer_base() */
			timer->flags |= TIMER_MIGRATING;

			raw_spin_unlock(&base->lock);
			base = new_base;
			raw_spin_lock(&base->lock);
			WRITE_ONCE(timer->flags,
				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
			forward_timer_base(base);
		}
	}

	debug_timer_activate(timer);

	timer->expires = expires;
	/*
	 * If 'idx' was calculated above and the base time did not advance
	 * between calculating 'idx' and possibly switching the base, only
	 * enqueue_timer() is required. Otherwise we need to (re)calculate
	 * the wheel index via internal_add_timer().
	 */
	if (idx != UINT_MAX && clk == base->clk)
		enqueue_timer(base, timer, idx, bucket_expiry);
	else
		internal_add_timer(base, timer);

out_unlock:
	raw_spin_unlock_irqrestore(&base->lock, flags);

	return ret;
}
/**
 * mod_timer_pending - Modify a pending timer's timeout
 * @timer:	The pending timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * mod_timer_pending() is the same for pending timers as mod_timer(), but
 * will not activate inactive timers.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and not modified or was in
 *	  shutdown state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires
 */
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);
/**
 * mod_timer - Modify a timer's timeout
 * @timer:	The timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     timer_delete(timer); timer->expires = expires; add_timer(timer);
 *
 * mod_timer() is more efficient than the above open coded sequence. In
 * case that the timer is inactive, the timer_delete() part is a NOP. The
 * timer is in any case activated with the new expiry time @expires.
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded. In this case the return value is 0 and meaningless.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *	  state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *	  the timer was active and not modified because @expires did
 *	  not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);
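/*
 * Hypothetical usage sketch, not part of the original file: a watchdog
 * pattern where each observed event pushes the timeout further out.
 * example_feed_watchdog() and its parameter are illustrative names.
 */
static void example_feed_watchdog(struct timer_list *example_wdog)
{
	/*
	 * Safe against concurrent users: mod_timer() activates an
	 * inactive timer and requeues a pending one in a single call.
	 */
	mod_timer(example_wdog, jiffies + 5 * HZ);
}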
/**
 * timer_reduce - Modify a timer's timeout if it would reduce the timeout
 * @timer:	The timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * timer_reduce() is very similar to mod_timer(), except that it will only
 * modify an enqueued timer if that would reduce the expiration time. If
 * @timer is not enqueued it starts the timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *	  state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *	  the timer was active and not modified because @expires
 *	  did not change the effective expiry time such that the
 *	  timer would expire earlier than already scheduled
 */
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
}
EXPORT_SYMBOL(timer_reduce);
/**
 * add_timer - Start a timer
 * @timer:	The timer to be started
 *
 * Start @timer to expire at @timer->expires in the future. @timer->expires
 * is the absolute expiry time measured in 'jiffies'. When the timer expires
 * timer->function(timer) will be invoked from soft interrupt context.
 *
 * The @timer->expires and @timer->function fields must be set prior
 * to calling this function.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * If @timer->expires is already in the past @timer will be queued to
 * expire at the next timer tick.
 *
 * This can only operate on an inactive timer. Attempts to invoke this on
 * an active timer are rejected with a warning.
 */
void add_timer(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);
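/*
 * Hypothetical usage sketch, not part of the original file: one-shot
 * arming of an initialized timer. example_arm_once() is an illustrative
 * name; timer_setup() must have been called on the timer beforehand and
 * the timer must not already be pending, otherwise add_timer() warns and
 * rejects the operation.
 */
static void example_arm_once(struct timer_list *example_timer)
{
	example_timer->expires = jiffies + msecs_to_jiffies(100);
	add_timer(example_timer);
}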
/**
 * add_timer_local() - Start a timer on the local CPU
 * @timer:	The timer to be started
 *
 * Same as add_timer() except that the timer flag TIMER_PINNED is set.
 *
 * See add_timer() for further details.
 */
void add_timer_local(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	timer->flags |= TIMER_PINNED;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer_local);
/**
 * add_timer_global() - Start a timer without TIMER_PINNED flag set
 * @timer:	The timer to be started
 *
 * Same as add_timer() except that the timer flag TIMER_PINNED is unset.
 *
 * See add_timer() for further details.
 */
void add_timer_global(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	timer->flags &= ~TIMER_PINNED;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer_global);
/**
 * add_timer_on - Start a timer on a particular CPU
 * @timer:	The timer to be started
 * @cpu:	The CPU to start it on
 *
 * Same as add_timer() except that it starts the timer on the given CPU and
 * the TIMER_PINNED flag is set. When the timer shouldn't be a pinned timer
 * in the next round, add_timer_global() should be used instead as it unsets
 * the TIMER_PINNED flag.
 *
 * See add_timer() for further details.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
	struct timer_base *new_base, *base;
	unsigned long flags;

	debug_assert_init(timer);

	if (WARN_ON_ONCE(timer_pending(timer)))
		return;

	/* Make sure timer flags have TIMER_PINNED flag set */
	timer->flags |= TIMER_PINNED;

	new_base = get_timer_cpu_base(timer->flags, cpu);

	/*
	 * If @timer was on a different CPU, it should be migrated with the
	 * old base locked to prevent other operations proceeding with the
	 * wrong base locked. See lock_timer_base().
	 */
	base = lock_timer_base(timer, &flags);
	/*
	 * Has @timer been shutdown? This needs to be evaluated while
	 * holding base lock to prevent a race against the shutdown code.
	 */
	if (!timer->function)
		goto out_unlock;

	if (base != new_base) {
		timer->flags |= TIMER_MIGRATING;

		raw_spin_unlock(&base->lock);
		base = new_base;
		raw_spin_lock(&base->lock);
		WRITE_ONCE(timer->flags,
			   (timer->flags & ~TIMER_BASEMASK) | cpu);
	}
	forward_timer_base(base);

	debug_timer_activate(timer);
	internal_add_timer(base, timer);
out_unlock:
	raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
/**
 * __timer_delete - Internal function: Deactivate a timer
 * @timer:	The timer to be deactivated
 * @shutdown:	If true, this indicates that the timer is about to be
 *		shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the timer. In that
 * case any attempt to rearm @timer after this function returns will be
 * silently ignored.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete(struct timer_list *timer, bool shutdown)
{
	struct timer_base *base;
	unsigned long flags;
	int ret = 0;

	debug_assert_init(timer);

	/*
	 * If @shutdown is set then the lock has to be taken whether the
	 * timer is pending or not to protect against a concurrent rearm
	 * which might hit between the lockless pending check and the lock
	 * acquisition. By taking the lock it is ensured that such a newly
	 * enqueued timer is dequeued and cannot end up with
	 * timer->function == NULL in the expiry code.
	 *
	 * If timer->function is currently executed, then this makes sure
	 * that the callback cannot requeue the timer.
	 */
	if (timer_pending(timer) || shutdown) {
		base = lock_timer_base(timer, &flags);
		ret = detach_if_pending(timer, base, true);
		if (shutdown)
			timer->function = NULL;
		raw_spin_unlock_irqrestore(&base->lock, flags);
	}

	return ret;
}
/**
 * timer_delete - Deactivate a timer
 * @timer:	The timer to be deactivated
 *
 * The function only deactivates a pending timer, but contrary to
 * timer_delete_sync() it does not take into account whether the timer's
 * callback function is concurrently executed on a different CPU or not.
 * Neither does it prevent rearming of the timer. If @timer can be rearmed
 * concurrently then the return value of this function is meaningless.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete(struct timer_list *timer)
{
	return __timer_delete(timer, false);
}
EXPORT_SYMBOL(timer_delete);
/**
 * timer_shutdown - Deactivate a timer and prevent rearming
 * @timer:	The timer to be deactivated
 *
 * The function does not wait for a possibly running timer callback on a
 * different CPU, but it prevents rearming of the timer. Any attempt to arm
 * @timer after this function returns will be silently ignored.
 *
 * This function is useful for teardown code and should only be used when
 * timer_shutdown_sync() cannot be invoked due to locking or context
 * constraints.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown(struct timer_list *timer)
{
	return __timer_delete(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown);
/**
 * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
 * @timer:	Timer to deactivate
 * @shutdown:	If true, this indicates that the timer is about to be
 *		shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the timer. Any
 * attempt to rearm @timer after this function returns will be silently
 * ignored.
 *
 * This function cannot guarantee that the timer cannot be rearmed
 * right after dropping the base lock if @shutdown is false. That
 * needs to be prevented by the calling code if necessary.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
{
	struct timer_base *base;
	unsigned long flags;
	int ret = -1;

	debug_assert_init(timer);

	base = lock_timer_base(timer, &flags);

	if (base->running_timer != timer)
		ret = detach_if_pending(timer, base, true);
	if (shutdown)
		timer->function = NULL;

	raw_spin_unlock_irqrestore(&base->lock, flags);

	return ret;
}
/**
 * timer_delete_sync_try - Try to deactivate a timer
 * @timer:	Timer to deactivate
 *
 * This function tries to deactivate a timer. On success the timer is not
 * queued and the timer callback function is not running on any CPU.
 *
 * This function does not guarantee that the timer cannot be rearmed right
 * after dropping the base lock. That needs to be prevented by the calling
 * code if necessary.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
int timer_delete_sync_try(struct timer_list *timer)
{
	return __try_to_del_timer_sync(timer, false);
}
EXPORT_SYMBOL(timer_delete_sync_try);
#ifdef CONFIG_PREEMPT_RT
static inline void timer_base_init_expiry_lock(struct timer_base *base)
{
	spin_lock_init(&base->expiry_lock);
}

static inline void timer_base_lock_expiry(struct timer_base *base)
{
	spin_lock(&base->expiry_lock);
}

static inline void timer_base_unlock_expiry(struct timer_base *base)
{
	spin_unlock(&base->expiry_lock);
}

/*
 * The counterpart to del_timer_wait_running().
 *
 * If there is a waiter for base->expiry_lock, then it was waiting for the
 * timer callback to finish. Drop expiry_lock and reacquire it. That allows
 * the waiter to acquire the lock and make progress.
 */
static void timer_sync_wait_running(struct timer_base *base)
	__releases(&base->lock) __releases(&base->expiry_lock)
	__acquires(&base->expiry_lock) __acquires(&base->lock)
{
	if (atomic_read(&base->timer_waiters)) {
		raw_spin_unlock_irq(&base->lock);
		spin_unlock(&base->expiry_lock);
		spin_lock(&base->expiry_lock);
		raw_spin_lock_irq(&base->lock);
	}
}

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion, if the softirq thread on a remote CPU
 * got preempted, and it prevents a live lock when the task which tries to
 * delete a timer preempted the softirq thread running the timer callback
 * function.
 */
static void del_timer_wait_running(struct timer_list *timer)
{
	u32 tf;

	tf = READ_ONCE(timer->flags);
	if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
		struct timer_base *base = get_timer_base(tf);

		/*
		 * Mark the base as contended and grab the expiry lock,
		 * which is held by the softirq across the timer
		 * callback. Drop the lock immediately so the softirq can
		 * expire the next timer. In theory the timer could already
		 * be running again, but that's more than unlikely and just
		 * causes another wait loop.
		 */
		atomic_inc(&base->timer_waiters);
		spin_lock_bh(&base->expiry_lock);
		atomic_dec(&base->timer_waiters);
		spin_unlock_bh(&base->expiry_lock);
	}
}
#else
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif
/**
 * __timer_delete_sync - Internal function: Deactivate a timer and wait
 *			 for the handler to finish.
 * @timer:	The timer to be deactivated
 * @shutdown:	If true, @timer->function will be set to NULL under the
 *		timer base lock which prevents rearming of @timer
 *
 * If @shutdown is not set the timer can be rearmed later. If the timer can
 * be rearmed concurrently, i.e. after dropping the base lock, then the
 * return value is meaningless.
 *
 * If @shutdown is set then @timer->function is set to NULL under timer
 * base lock which prevents rearming of the timer. Any attempt to rearm
 * a shutdown timer is silently ignored.
 *
 * If the timer should be reused after shutdown it has to be initialized
 * again.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
{
	int ret;

#ifdef CONFIG_LOCKDEP
	unsigned long flags;

	/*
	 * If lockdep gives a backtrace here, please reference
	 * the synchronization rules above.
	 */
	local_irq_save(flags);
	lock_map_acquire(&timer->lockdep_map);
	lock_map_release(&timer->lockdep_map);
	local_irq_restore(flags);
#endif
	/*
	 * don't use it in hardirq context, because it
	 * could lead to deadlock.
	 */
	WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));

	/*
	 * Must be able to sleep on PREEMPT_RT because of the slowpath in
	 * del_timer_wait_running().
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
		lockdep_assert_preemption_enabled();

	do {
		ret = __try_to_del_timer_sync(timer, shutdown);

		if (unlikely(ret < 0)) {
			del_timer_wait_running(timer);
			cpu_relax();
		}
	} while (ret < 0);

	return ret;
}
/**
 * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
 * @timer:	The timer to be deactivated
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts unless the timer is an irqsafe one. The caller must
 * not hold locks which would prevent completion of the timer's callback
 * function. The timer's handler must not call add_timer_on(). Upon exit
 * the timer is not queued and the handler is not running on any CPU.
 *
 * For !irqsafe timers, the caller must not hold locks that are held in
 * interrupt context. Even if the lock has nothing to do with the timer in
 * question. Here's why::
 *
 *    CPU0                             CPU1
 *    ----                             ----
 *                                     <SOFTIRQ>
 *                                       call_timer_fn();
 *                                       base->running_timer = mytimer;
 *    spin_lock_irq(somelock);
 *                                     <IRQ>
 *                                        spin_lock(somelock);
 *    timer_delete_sync(mytimer);
 *    while (base->running_timer == mytimer);
 *
 * Now timer_delete_sync() will never return and never release somelock.
 * The interrupt on the other CPU is waiting to grab somelock but it has
 * interrupted the softirq that CPU0 is waiting to finish.
 *
 * This function cannot guarantee that the timer is not rearmed again by
 * some concurrent or preempting code, right after it dropped the base
 * lock. If there is the possibility of a concurrent rearm then the return
 * value of the function is meaningless.
 *
 * If such a guarantee is needed, e.g. for teardown situations then use
 * timer_shutdown_sync() instead.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete_sync(struct timer_list *timer)
{
	return __timer_delete_sync(timer, false);
}
EXPORT_SYMBOL(timer_delete_sync);
/**
 * timer_shutdown_sync - Shutdown a timer and prevent rearming
 * @timer:	The timer to be shutdown
 *
 * When the function returns it is guaranteed that:
 *   - @timer is not queued
 *   - The callback function of @timer is not running
 *   - @timer cannot be enqueued again. Any attempt to rearm
 *     @timer is silently ignored.
 *
 * See timer_delete_sync() for synchronization rules.
 *
 * This function is useful for final teardown of an infrastructure where
 * the timer is subject to a circular dependency problem.
 *
 * A common pattern for this is a timer and a workqueue where the timer can
 * schedule work and work can arm the timer. On shutdown the workqueue must
 * be destroyed and the timer must be prevented from rearming. Unless the
 * code has conditionals like 'if (mything->in_shutdown)' to prevent that
 * there is no way to get this correct with timer_delete_sync().
 *
 * timer_shutdown_sync() solves the problem. The correct ordering of calls
 * in this case is:
 *
 *	timer_shutdown_sync(&mything->timer);
 *	workqueue_destroy(&mything->workqueue);
 *
 * After this 'mything' can be safely freed.
 *
 * This obviously implies that the timer is not required to be functional
 * for the rest of the shutdown operation.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown_sync(struct timer_list *timer)
{
	return __timer_delete_sync(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown_sync);
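/*
 * Hypothetical sketch, not part of the original file, of the teardown
 * ordering described above. 'example_mything', its timer and its
 * workqueue are illustrative; destroy_workqueue() is the in-tree
 * workqueue teardown API. The timer is shut down first so the work item
 * can no longer rearm it, then the workqueue is drained and destroyed.
 */
struct example_mything {
	struct timer_list timer;
	struct workqueue_struct *wq;
};

static void example_teardown(struct example_mything *mything)
{
	/* After this, any attempt to rearm the timer is ignored */
	timer_shutdown_sync(&mything->timer);
	/* Now the timer callback cannot schedule new work anymore */
	destroy_workqueue(mything->wq);
	kfree(mything);
}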
static void call_timer_fn(struct timer_list *timer,
			  void (*fn)(struct timer_list *),
			  unsigned long baseclk)
{
	int count = preempt_count();

#ifdef CONFIG_LOCKDEP
	/*
	 * It is permissible to free the timer from inside the
	 * function that is called from it, this we need to take into
	 * account for lockdep too. To avoid bogus "held lock freed"
	 * warnings as well as problems when looking into
	 * timer->lockdep_map, make a copy and use that here.
	 */
	struct lockdep_map lockdep_map;

	lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
	/*
	 * Couple the lock chain with the lock chain at
	 * timer_delete_sync() by acquiring the lock_map around the fn()
	 * call here and in timer_delete_sync().
	 */
	lock_map_acquire(&lockdep_map);

	trace_timer_expire_entry(timer, baseclk);
	fn(timer);
	trace_timer_expire_exit(timer);

	lock_map_release(&lockdep_map);

	if (count != preempt_count()) {
		WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
			  fn, count, preempt_count());
		/*
		 * Restore the preempt count. That gives us a decent
		 * chance to survive and extract information. If the
		 * callback kept a lock held, bad luck, but not worse
		 * than the BUG() we had.
		 */
		preempt_count_set(count);
	}
}
static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
	/*
	 * This value is required only for tracing. base->clk was
	 * incremented directly before expire_timers was called. But expiry
	 * is related to the old base->clk value.
	 */
	unsigned long baseclk = base->clk - 1;

	while (!hlist_empty(head)) {
		struct timer_list *timer;
		void (*fn)(struct timer_list *);

		timer = hlist_entry(head->first, struct timer_list, entry);

		base->running_timer = timer;
		detach_timer(timer, true);

		fn = timer->function;

		if (timer->flags & TIMER_IRQSAFE) {
			raw_spin_unlock(&base->lock);
			call_timer_fn(timer, fn, baseclk);
			raw_spin_lock(&base->lock);
			base->running_timer = NULL;
		} else {
			raw_spin_unlock_irq(&base->lock);
			call_timer_fn(timer, fn, baseclk);
			raw_spin_lock_irq(&base->lock);
			base->running_timer = NULL;
			timer_sync_wait_running(base);
		}
	}
}

static int collect_expired_timers(struct timer_base *base,
				  struct hlist_head *heads)
{
	unsigned long clk = base->clk = base->next_expiry;
	struct hlist_head *vec;
	int i, levels = 0;
	unsigned int idx;

	for (i = 0; i < LVL_DEPTH; i++) {
		idx = (clk & LVL_MASK) + i * LVL_SIZE;

		if (__test_and_clear_bit(idx, base->pending_map)) {
			vec = base->vectors + idx;
			hlist_move_list(vec, heads++);
			levels++;
		}
		/* Is it time to look at the next level? */
		if (clk & LVL_CLK_MASK)
			break;
		/* Shift clock for the next level granularity */
		clk >>= LVL_CLK_SHIFT;
	}
	return levels;
}
/*
 * Find the next pending bucket of a level. Search from level start (@offset)
 * + @clk upwards and if nothing there, search from start of the level
 * (@offset) up to @offset + clk.
 */
static int next_pending_bucket(struct timer_base *base, unsigned offset,
			       unsigned clk)
{
	unsigned pos, start = offset + clk;
	unsigned end = offset + LVL_SIZE;

	pos = find_next_bit(base->pending_map, end, start);
	if (pos < end)
		return pos - start;

	pos = find_next_bit(base->pending_map, start, offset);
	return pos < start ? pos + LVL_SIZE - start : -1;
}
/*
 * Search the first expiring timer in the various clock levels. Caller must
 * hold base->lock.
 *
 * Store next expiry time in base->next_expiry.
 */
static void timer_recalc_next_expiry(struct timer_base *base)
{
	unsigned long clk, next, adj;
	unsigned lvl, offset = 0;

	next = base->clk + TIMER_NEXT_MAX_DELTA;
	clk = base->clk;
	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
		int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
		unsigned long lvl_clk = clk & LVL_CLK_MASK;

		if (pos >= 0) {
			unsigned long tmp = clk + (unsigned long) pos;

			tmp <<= LVL_SHIFT(lvl);
			if (time_before(tmp, next))
				next = tmp;

			/*
			 * If the next expiration happens before we reach
			 * the next level, no need to check further.
			 */
			if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
				break;
		}
		/*
		 * Clock for the next level. If the current level clock lower
		 * bits are zero, we look at the next level as is. If not we
		 * need to advance it by one because that's going to be the
		 * next expiring bucket in that level. base->clk is the next
		 * expiring jiffy. So in case of:
		 *
		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
		 *  0    0    0    0    0    0
		 *
		 * we have to look at all levels @index 0. With
		 *
		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
		 *  0    0    0    0    0    2
		 *
		 * LVL0 has the next expiring bucket @index 2. The upper
		 * levels have the next expiring bucket @index 1.
		 *
		 * In case that the propagation wraps the next level the same
		 * rules apply:
		 *
		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
		 *  0    0    0    0    F    2
		 *
		 * So after looking at LVL0 we get:
		 *
		 * LVL5 LVL4 LVL3 LVL2 LVL1
		 *  0    0    0    1    0
		 *
		 * So no propagation from LVL1 to LVL2 because that happened
		 * with the add already, but then we need to propagate further
		 * from LVL2 to LVL3.
		 *
		 * So the simple check whether the lower bits of the current
		 * level are 0 or not is sufficient for all cases.
		 */
		adj = lvl_clk ? 1 : 0;
		clk >>= LVL_CLK_SHIFT;
		clk += adj;
	}

	WRITE_ONCE(base->next_expiry, next);
	base->next_expiry_recalc = false;
	base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA);
}
#ifdef CONFIG_NO_HZ_COMMON
/*
 * Check, if the next hrtimer event is before the next timer wheel
 * event:
 */
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
	u64 nextevt = hrtimer_get_next_event();

	/*
	 * If high resolution timers are enabled
	 * hrtimer_get_next_event() returns KTIME_MAX.
	 */
	if (expires <= nextevt)
		return expires;

	/*
	 * If the next timer is already expired, return the tick base
	 * time so the tick is fired immediately.
	 */
	if (nextevt <= basem)
		return basem;

	/*
	 * Round up to the next jiffy. High resolution timers are
	 * off, so the hrtimers are expired in the tick and we need to
	 * make sure that this tick really expires the timer to avoid
	 * a ping pong of the nohz stop code.
	 *
	 * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
	 */
	return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
}
static unsigned long next_timer_interrupt(struct timer_base *base,
					  unsigned long basej)
{
	if (base->next_expiry_recalc)
		timer_recalc_next_expiry(base);

	/*
	 * Move next_expiry for the empty base into the future to prevent an
	 * unnecessary raise of the timer softirq when the next_expiry value
	 * will be reached even if there is no timer pending.
	 *
	 * This update is also required to make timer_base::next_expiry values
	 * easy comparable to find out which base holds the first pending timer.
	 */
	if (!base->timers_pending)
		WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA);

	return base->next_expiry;
}
static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem,
						struct timer_base *base_local,
						struct timer_base *base_global,
						struct timer_events *tevt)
{
	unsigned long nextevt, nextevt_local, nextevt_global;
	bool local_first;

	nextevt_local = next_timer_interrupt(base_local, basej);
	nextevt_global = next_timer_interrupt(base_global, basej);

	local_first = time_before_eq(nextevt_local, nextevt_global);

	nextevt = local_first ? nextevt_local : nextevt_global;

	/*
	 * If the @nextevt is at max. one tick away, use @nextevt and store
	 * it in the local expiry value. The next global event is irrelevant in
	 * this case and can be left as KTIME_MAX.
	 */
	if (time_before_eq(nextevt, basej + 1)) {
		/* If we missed a tick already, force 0 delta */
		if (time_before(nextevt, basej))
			nextevt = basej;
		tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC;

		/*
		 * This is required for the remote check only but it doesn't
		 * hurt, when it is done for both call sites:
		 *
		 * * The remote callers will only take care of the global timers
		 *   as local timers will be handled by the CPU itself. When not
		 *   updating tevt->global with the already missed first global
		 *   timer, it is possible that it will be missed completely.
		 *
		 * * The local callers will ignore the tevt->global anyway, when
		 *   nextevt is max. one tick away.
		 */
		if (!local_first)
			tevt->global = tevt->local;
		return nextevt;
	}

	/*
	 * Update tevt.* values:
	 *
	 * If the local queue expires first, then the global event can be
	 * ignored. If the global queue is empty, nothing to do either.
	 */
	if (!local_first && base_global->timers_pending)
		tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC;

	if (base_local->timers_pending)
		tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC;

	return nextevt;
}
# ifdef CONFIG_SMP
/**
 * fetch_next_timer_interrupt_remote() - Store next timers into @tevt
 * @basej:	base time jiffies
 * @basem:	base time clock monotonic
 * @tevt:	Pointer to the storage for the expiry values
 * @cpu:	Remote CPU
 *
 * Stores the next pending local and global timer expiry values in the
 * struct pointed to by @tevt. If a queue is empty the corresponding
 * field is set to KTIME_MAX. If local event expires before global
 * event, global event is set to KTIME_MAX as well.
 *
 * Caller needs to make sure timer base locks are held (use
 * timer_lock_remote_bases() for this purpose).
 */
void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
				       struct timer_events *tevt,
				       unsigned int cpu)
{
	struct timer_base *base_local, *base_global;

	/* Preset local / global events */
	tevt->local = tevt->global = KTIME_MAX;

	base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
	base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

	lockdep_assert_held(&base_local->lock);
	lockdep_assert_held(&base_global->lock);

	fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt);
}