/*
 * TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 * These mean you can really definitely utterly forget about
 * writing to user space from interrupts. (Its not allowed anyway).
 *
 * Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 * More scalable flush, from Andi Kleen
 *
 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */
/*
 * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
 * stored in cpu_tlb_state.last_user_mm_spec.
 */
#define LAST_USER_MM_IBPB	0x1UL
#define LAST_USER_MM_L1D_FLUSH	0x2UL
#define LAST_USER_MM_SPEC_MASK	(LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH)

/* Bits to set when tlbstate and flush is (re)initialized */
#define LAST_USER_MM_INIT	LAST_USER_MM_IBPB
/*
 * The x86 feature is called PCID (Process Context IDentifier). It is similar
 * to what is traditionally called ASID on the RISC processors.
 *
 * We don't use the traditional ASID implementation, where each process/mm gets
 * its own ASID and flush/restart when we run out of ASID space.
 *
 * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
 * that came by on this CPU, allowing cheaper switch_mm between processes on
 * this CPU.
 *
 * We end up with different spaces for different things. To avoid confusion we
 * use different names for each of them:
 *
 * ASID  - [0, TLB_NR_DYN_ASIDS-1]
 *         the canonical identifier for an mm, dynamically allocated on each CPU
 *         [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
 *         the canonical, global identifier for an mm, identical across all CPUs
 *
 * kPCID - [1, MAX_ASID_AVAILABLE]
 *         the value we write into the PCID part of CR3; corresponds to the
 *         ASID+1, because PCID 0 is special.
 *
 * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
 *         for KPTI each mm has two address spaces and thus needs two
 *         PCID values, but we can still do with a single ASID denomination
 *         for each mm. Corresponds to kPCID + 2048.
 */
/*
 * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
 * user/kernel switches
 */
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS	1
#else
# define PTI_CONSUMED_PCID_BITS	0
#endif

/*
 * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
 * for them being zero-based.  Another -1 is because PCID 0 is reserved for
 * use by non-PCID-aware users.
 */
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Make sure that the dynamic ASID space does not conflict with the * bit we are using to switch between user and kernel ASIDs.
*/
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
/* * The ASID being passed in here should have respected the * MAX_ASID_AVAILABLE and thus never have the switch bit set.
*/
VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); #endif /* * The dynamically-assigned ASIDs that get passed in are small * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, * so do not bother to clear it. * * If PCID is on, ASID-aware code paths put the ASID+1 into the * PCID bits. This serves two purposes. It prevents a nasty * situation in which PCID-unaware code saves CR3, loads some other * value (with PCID == 0), and then restores CR3, thus corrupting * the TLB for ASID 0 if the saved ASID was nonzero. It also means * that any bugs involving loading a PCID-enabled CR3 with * CR4.PCIDE off will trigger deterministically.
*/ return asid + 1;
}
/* * Given @asid, compute uPCID
*/ staticinline u16 user_pcid(u16 asid)
{
u16 ret = kern_pcid(asid); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; #endif return ret;
}
/*
 * Build a CR3 value for @pgd/@asid/@lam with the no-flush bit set, so
 * that loading it does not flush the TLB entries tagged with that PCID.
 */
static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid, unsigned long lam)
{
	/*
	 * Use boot_cpu_has() instead of this_cpu_has() as this function
	 * might be called during early boot. This should work even after
	 * boot because all CPUs are expected to have the same capabilities.
	 */
	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));

	return build_cr3(pgd, asid, lam) | CR3_NOFLUSH;
}
/*
 * We get here when we do something requiring a TLB invalidation but
 * could not go invalidate all of the contexts. Instead of flushing
 * directly, zap each inactive slot's 'ctx_id'; that forces a full
 * flush the next time a context is loaded into that slot.
 */
static void clear_asid_other(void)
{
	u16 slot;

	/*
	 * This is only expected to be set if we have disabled
	 * kernel _PAGE_GLOBAL pages (i.e. with PTI).
	 */
	if (WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_PTI)))
		return;

	for (slot = 0; slot < TLB_NR_DYN_ASIDS; slot++) {
		/* The currently loaded ASID does not need flushing. */
		if (slot == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
			continue;

		/* Force a flush the next time this ASID is switched to. */
		this_cpu_write(cpu_tlbstate.ctxs[slot].ctx_id, 0);
	}

	this_cpu_write(cpu_tlbstate.invalidate_other, false);
}
/* * TLB consistency for global ASIDs is maintained with hardware assisted * remote TLB flushing. Global ASIDs are always up to date.
*/ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
u16 global_asid = mm_global_asid(next);
/* * We don't currently own an ASID slot on this CPU. * Allocate a slot.
*/
ns.asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; if (ns.asid >= TLB_NR_DYN_ASIDS) {
ns.asid = 0;
this_cpu_write(cpu_tlbstate.next_asid, 1);
}
ns.need_flush = true;
return ns;
}
/*
 * Global ASIDs are allocated for multi-threaded processes that are
 * active on multiple CPUs simultaneously, giving each of those
 * processes the same PCID on every CPU, for use with hardware-assisted
 * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
 *
 * These global ASIDs are held for the lifetime of the process.
 */
static DEFINE_RAW_SPINLOCK(global_asid_lock);
/* Where the last search for a free global ASID ended. */
static u16 last_global_asid = MAX_ASID_AVAILABLE;
/* ASIDs currently handed out to processes. */
static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
/* ASIDs released by exiting processes; reusable only after a rollover flush. */
static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
/* How many global ASIDs can still be handed out. */
static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
/*
 * Make previously freed global ASIDs reusable again.
 *
 * When the search for a free ASID in the global ASID space reaches
 * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
 * freed global ASIDs are safe to re-use. This way the global flush
 * only needs to happen at ASID rollover time, and not at ASID
 * allocation time.
 */
static void reset_global_asid_space(void)
{
	lockdep_assert_held(&global_asid_lock);

	/* Wipe any stale translations tagged with freed global ASIDs. */
	invlpgb_flush_all_nonglobals();

	/* After the flush above, the freed ASIDs may be handed out again. */
	bitmap_andnot(global_asid_used, global_asid_used,
			global_asid_freed, MAX_ASID_AVAILABLE);
	bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);

	/* Restart the search at the bottom of the global ASID range. */
	last_global_asid = TLB_NR_DYN_ASIDS;
}
/*
 * Hand out a previously unused global ASID.
 *
 * Returns 0 (an invalid ASID) if the global ASID space is exhausted.
 * Must be called with global_asid_lock held.
 */
static u16 allocate_global_asid(void)
{
	u16 asid;

	lockdep_assert_held(&global_asid_lock);

	/* The previous allocation hit the edge of available address space */
	if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
		reset_global_asid_space();

	/*
	 * Find the first unused slot at or above the last allocation.
	 * This assignment was missing, leaving 'asid' uninitialized
	 * (undefined behavior) before the range check below.
	 */
	asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);

	if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
		/* This should never happen. */
		VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n",
				global_asid_available);
		return 0;
	}

	/* Claim this global ASID. */
	__set_bit(asid, global_asid_used);
	last_global_asid = asid;
	global_asid_available--;
	return asid;
}
/* * Check whether a process is currently active on more than @threshold CPUs. * This is a cheap estimation on whether or not it may make sense to assign * a global ASID to this process, and use broadcast TLB invalidation.
*/ staticbool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
{ int count = 0; int cpu;
/* This quick check should eliminate most single threaded programs. */ if (cpumask_weight(mm_cpumask(mm)) <= threshold) returnfalse;
/* Slower check to make sure. */
for_each_cpu(cpu, mm_cpumask(mm)) { /* Skip the CPUs that aren't really running this process. */ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) continue;
if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) continue;
if (++count > threshold) returntrue;
} returnfalse;
}
/*
 * Assign a global ASID to @mm. Multiple threads of the process may race
 * to get here; the lock plus the re-check below ensure the assignment
 * happens exactly once.
 */
static void use_global_asid(struct mm_struct *mm)
{
	u16 asid;

	guard(raw_spinlock_irqsave)(&global_asid_lock);

	/* Another thread won the race: broadcast invalidation already set up. */
	if (mm_global_asid(mm))
		return;

	/*
	 * The last global ASID was consumed while waiting for the lock.
	 *
	 * If this fires, a more aggressive ASID reuse scheme might be
	 * needed.
	 */
	if (!global_asid_available) {
		VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
		return;
	}

	asid = allocate_global_asid();
	if (!asid)
		return;

	mm_assign_global_asid(mm, asid);
}
void mm_free_global_asid(struct mm_struct *mm)
{ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) return;
if (!mm_global_asid(mm)) return;
guard(raw_spinlock_irqsave)(&global_asid_lock);
/* The global ASID can be re-used only after flush at wrap-around. */ #ifdef CONFIG_BROADCAST_TLB_FLUSH
__set_bit(mm->context.global_asid, global_asid_freed);
/* * Is the mm transitioning from a CPU-local ASID to a global ASID?
*/ staticbool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
{
u16 global_asid = mm_global_asid(mm);
if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) returnfalse;
/* Process is transitioning to a global ASID */ if (global_asid && asid != global_asid) returntrue;
returnfalse;
}
/*
 * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
 * systems have over 8k CPUs. Because of this potential ASID shortage,
 * global ASIDs are handed out to processes that have frequent TLB
 * flushes and are active on 4 or more CPUs simultaneously.
 */
static void consider_global_asid(struct mm_struct *mm)
{
	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
		return;

	/* Rate-limit: only evaluate when pid and jiffies happen to line up. */
	if ((current->pid & 0x1f) != (jiffies & 0x1f))
		return;

	/* Hand out a global ASID once the process spans 4 or more CPUs. */
	if (mm_active_cpus_exceeds(mm, 3))
		use_global_asid(mm);
}
/*
 * Complete the transition of @info->mm to its global ASID: nudge any
 * CPUs still running the mm under an old dynamic ASID, and clear the
 * transition flag once every CPU running the mm uses the global ASID.
 */
static void finish_asid_transition(struct flush_tlb_info *info)
{
	struct mm_struct *mm = info->mm;
	int bc_asid = mm_global_asid(mm);
	int cpu;

	if (!mm_in_asid_transition(mm))
		return;

	for_each_cpu(cpu, mm_cpumask(mm)) {
		/*
		 * The remote CPU is context switching. Wait for that to
		 * finish, to catch the unlikely case of it switching to
		 * the target mm with an out of date ASID.
		 */
		while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
			cpu_relax();

		if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
			continue;

		/*
		 * If at least one CPU is not using the global ASID yet,
		 * send a TLB flush IPI. The IPI should cause stragglers
		 * to transition soon.
		 *
		 * This can race with the CPU switching to another task;
		 * that results in a (harmless) extra IPI.
		 */
		if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
			flush_tlb_multi(mm_cpumask(info->mm), info);
			return;
		}
	}

	/* All the CPUs running this process are using the global ASID. */
	mm_clear_asid_transition(mm);
}
/* * TLB flushes with INVLPGB are kicked off asynchronously. * The inc_mm_tlb_gen() guarantees page table updates are done * before these TLB flushes happen.
*/ if (info->end == TLB_FLUSH_ALL) {
invlpgb_flush_single_pcid_nosync(kern_pcid(asid)); /* Do any CPUs supporting INVLPGB need PTI? */ if (cpu_feature_enabled(X86_FEATURE_PTI))
invlpgb_flush_single_pcid_nosync(user_pcid(asid));
} elsedo { unsignedlong nr = 1;
if (info->stride_shift <= PMD_SHIFT) {
nr = (info->end - addr) >> info->stride_shift;
nr = clamp_val(nr, 1, invlpgb_count_max);
}
invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); if (cpu_feature_enabled(X86_FEATURE_PTI))
invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
addr += nr << info->stride_shift;
} while (addr < info->end);
finish_asid_transition(info);
/* Wait for the INVLPGBs kicked off above to finish. */
__tlbsync();
}
/* * Given an ASID, flush the corresponding user ASID. We can delay this * until the next time we switch to it. * * See SWITCH_TO_USER_CR3.
*/ staticinlinevoid invalidate_user_asid(u16 asid)
{ /* There is no user ASID if address space separation is off */ if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) return;
/* * We only have a single ASID if PCID is off and the CR3 * write will have flushed it.
*/ if (!cpu_feature_enabled(X86_FEATURE_PCID)) return;
/* * Caution: many callers of this function expect * that load_cr3() is serializing and orders TLB * fills with respect to the mm_cpumask writes.
*/
write_cr3(new_mm_cr3);
}
/* * It's plausible that we're in lazy TLB mode while our mm is init_mm. * If so, our callers still expect us to flush the TLB, but there * aren't any user TLB entries in init_mm to worry about. * * This needs to happen before any other sanity checks due to * intel_idle's shenanigans.
*/ if (loaded_mm == &init_mm) return;
/* Warn if we're not lazy. */
WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));
/*
 * Invoked from return to user/guest by a task that opted-in to L1D
 * flushing but ended up running on an SMT enabled core due to wrong
 * affinity settings or CPU hotplug. This is part of the paranoid L1D
 * flush contract which this task requested: deliver SIGBUS.
 */
static void l1d_flush_force_sigbus(struct callback_head *ch)
{
	force_sig(SIGBUS);
}
staticvoid l1d_flush_evaluate(unsignedlong prev_mm, unsignedlong next_mm, struct task_struct *next)
{ /* Flush L1D if the outgoing task requests it */ if (prev_mm & LAST_USER_MM_L1D_FLUSH)
wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
/* Check whether the incoming task opted in for L1D flush */ if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) return;
/* * Validate that it is not running on an SMT sibling as this would * make the exercise pointless because the siblings share L1D. If * it runs on a SMT sibling, notify it with SIGBUS on return to * user/guest
*/ if (this_cpu_read(cpu_info.smt_active)) {
clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH);
next->l1d_flush_kill.func = l1d_flush_force_sigbus;
task_work_add(next, &next->l1d_flush_kill, TWA_RESUME);
}
}
/* * Ensure that the bit shift above works as expected and the two flags * end up in bit 0 and 1.
*/
BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1);
/* * Avoid user->user BTB/RSB poisoning by flushing them when switching * between processes. This stops one process from doing Spectre-v2 * attacks on another. * * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the * same process. Using the mm pointer instead of mm->context.ctx_id * opens a hypothetical hole vs. mm_struct reuse, which is more or * less impossible to control by an attacker. Aside of that it * would only affect the first schedule so the theoretically * exposed data is not really interesting.
*/ if (static_branch_likely(&switch_mm_cond_ibpb)) { /* * This is a bit more complex than the always mode because * it has to handle two cases: * * 1) Switch from a user space task (potential attacker) * which has TIF_SPEC_IB set to a user space task * (potential victim) which has TIF_SPEC_IB not set. * * 2) Switch from a user space task (potential attacker) * which has TIF_SPEC_IB not set to a user space task * (potential victim) which has TIF_SPEC_IB set. * * This could be done by unconditionally issuing IBPB when * a task which has TIF_SPEC_IB set is either scheduled in * or out. Though that results in two flushes when: * * - the same user space task is scheduled out and later * scheduled in again and only a kernel thread ran in * between. * * - a user space task belonging to the same process is * scheduled in after a kernel thread ran in between * * - a user space task belonging to the same process is * scheduled in immediately. * * Optimize this with reasonably small overhead for the * above cases. Mangle the TIF_SPEC_IB bit into the mm * pointer of the incoming task which is stored in * cpu_tlbstate.last_user_mm_spec for comparison. * * Issue IBPB only if the mm's are different and one or * both have the IBPB bit set.
*/ if (next_mm != prev_mm &&
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
indirect_branch_prediction_barrier();
}
if (static_branch_unlikely(&switch_mm_always_ibpb)) { /* * Only flush when switching to a user space task with a * different context than the user space task which ran * last on this CPU.
*/ if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != (unsignedlong)next->mm)
indirect_branch_prediction_barrier();
}
if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) { /* * Flush L1D when the outgoing task requested it and/or * check whether the incoming task requested L1D flushing * and ended up on an SMT sibling.
*/ if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH))
l1d_flush_evaluate(prev_mm, next_mm, next);
}
/* * This optimizes when not actually switching mm's. Some architectures use the * 'unused' argument for this optimization, but x86 must use * 'cpu_tlbstate.loaded_mm' instead because it does not always keep * 'current->active_mm' up to date.
*/ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, struct task_struct *tsk)
{ struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); unsignedlong new_lam; struct new_asid ns;
u64 next_tlb_gen;
/* We don't want flush_tlb_func() to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
/* * Verify that CR3 is what we think it is. This will catch * hypothetical buggy code that directly switches to swapper_pg_dir * without going through leave_mm() / switch_mm_irqs_off() or that * does something like write_cr3(read_cr3_pa()). * * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3() * isn't free.
*/ #ifdef CONFIG_DEBUG_VM if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
tlbstate_lam_cr3_mask()))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. * Try to recover instead by ignoring the error and doing * a global flush to minimize the chance of corruption. * * (This is far from being a fully correct recovery. * Architecturally, the CPU could prefetch something * back into an incorrect ASID slot and leave it there * to cause trouble down the road. It's better than * nothing, though.)
*/
__flush_tlb_all();
} #endif if (was_lazy)
this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
/* * The membarrier system call requires a full memory barrier and * core serialization before returning to user-space, after * storing to rq->curr, when changing mm. This is because * membarrier() sends IPIs to all CPUs that are in the target mm * to make them issue memory barriers. However, if another CPU * switches to/from the target mm concurrently with * membarrier(), it can cause that CPU not to receive an IPI * when it really should issue a memory barrier. Writing to CR3 * provides that full memory barrier and core serializing * instruction.
*/ if (prev == next) { /* Not actually switching mm's */
VM_WARN_ON(is_dyn_asid(prev_asid) &&
this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
/* * If this races with another thread that enables lam, 'new_lam' * might not match tlbstate_lam_cr3_mask().
*/
/* * Even in lazy TLB mode, the CPU should stay set in the * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
/* Check if the current mm is transitioning to a global ASID */ if (mm_needs_global_asid(next, prev_asid)) {
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
ns = choose_new_asid(next, next_tlb_gen); goto reload_tlb;
}
/* * Broadcast TLB invalidation keeps this ASID up to date * all the time.
*/ if (is_global_asid(prev_asid)) return;
/* * If the CPU is not in lazy TLB mode, we are just switching * from one thread in a process to another thread in the same * process. No TLB flush required.
*/ if (!was_lazy) return;
/* * Read the tlb_gen to check whether a flush is needed. * If the TLB is up to date, just use it. * The barrier synchronizes with the tlb_gen increment in * the TLB shootdown code.
*/
smp_mb();
next_tlb_gen = atomic64_read(&next->context.tlb_gen); if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
next_tlb_gen) return;
/* * TLB contents went out of date while we were in lazy * mode. Fall through to the TLB switching code below.
*/
ns.asid = prev_asid;
ns.need_flush = true;
} else { /* * Apply process to process speculation vulnerability * mitigations if applicable.
*/
cond_mitigation(tsk);
/* * Indicate that CR3 is about to change. nmi_uaccess_okay() * and others are sensitive to the window where mm_cpumask(), * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
*/
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
/* * Make sure this CPU is set in mm_cpumask() such that we'll * receive invalidation IPIs. * * Rely on the smp_mb() implied by cpumask_set_cpu()'s atomic * operation, or explicitly provide one. Such that: * * switch_mm_irqs_off() flush_tlb_mm_range() * smp_store_release(loaded_mm, SWITCHING); atomic64_inc_return(tlb_gen) * smp_mb(); // here // smp_mb() implied * atomic64_read(tlb_gen); this_cpu_read(loaded_mm); * * we properly order against flush_tlb_mm_range(), where the * loaded_mm load can happen in mative_flush_tlb_multi() -> * should_flush_tlb(). * * This way switch_mm() must see the new tlb_gen or * flush_tlb_mm_range() must see the new loaded_mm, or both.
*/ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
cpumask_set_cpu(cpu, mm_cpumask(next)); else
smp_mb();
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else { /* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, ns.asid, new_lam, false);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
/* Make sure we write CR3 before loaded_mm. */
barrier();
if (next != prev) {
cr4_update_pce_mm(next);
switch_ldt(prev, next);
}
}
/* * Please ignore the name of this function. It should be called * switch_to_kernel_thread(). * * enter_lazy_tlb() is a hint from the scheduler that we are entering a * kernel thread or other context without an mm. Acceptable implementations * include doing nothing whatsoever, switching to init_mm, or various clever * lazy tricks to try to minimize TLB flushes. * * The scheduler reserves the right to call enter_lazy_tlb() several times * in a row. It will notify us that we're going back to a real mm by * calling switch_mm_irqs_off().
*/ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{ if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return;
/*
 * Using a temporary mm allows to set temporary mappings that are not accessible
 * by other CPUs. Such mappings are needed to perform sensitive memory writes
 * that override the kernel memory protections (e.g., W^X), without exposing the
 * temporary page-table mappings that are required for these write operations to
 * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
 * mapping is torn down. Temporary mms can also be used for EFI runtime service
 * calls or similar functionality.
 *
 * It is illegal to schedule while using a temporary mm -- the context switch
 * code is unaware of the temporary mm and does not know how to context switch.
 * Use a real (non-temporary) mm in a kernel thread if you need to sleep.
 *
 * Note: For sensitive memory writes, the temporary mm needs to be used
 *       exclusively by a single core, and IRQs should be disabled while the
 *       temporary mm is loaded, thereby preventing interrupt handler bugs from
 *       overriding the kernel memory protection.
 *
 * Returns the previously loaded mm, which the caller must pass back to
 * the matching unuse call to restore the original address space.
 */
struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm)
{
	struct mm_struct *prev_mm;

	/*
	 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
	 * with a stale address space WITHOUT being in lazy mode after
	 * restoring the previous mm.
	 */
	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
		leave_mm();

	/*
	 * Record the outgoing mm BEFORE switching, so the caller can
	 * restore it later. The body as found passed 'prev_mm' to
	 * switch_mm_irqs_off() while it was still uninitialized and
	 * never returned a value.
	 */
	prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	switch_mm_irqs_off(NULL, temp_mm, current);

	/*
	 * If breakpoints are enabled, disable them while the temporary mm is
	 * used. Userspace might set up watchpoints on addresses that are used
	 * in the temporary mm, which would lead to wrong signals being sent or
	 * crashes.
	 *
	 * Note that breakpoints are not disabled selectively, which also causes
	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
	 * undesirable, but still seems reasonable as the code that runs in the
	 * temporary mm should be short.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_disable();

	return prev_mm;
}
/* * Call this when reinitializing a CPU. It fixes the following potential * problems: * * - The ASID changed from what cpu_tlbstate thinks it is (most likely * because the CPU was taken down and came back up with CR3's PCID * bits clear. CPU hotplug can do this. * * - The TLB contains junk in slots corresponding to inactive ASIDs. * * - The CPU went so far out to lunch that it may have missed a TLB * flush.
*/ void initialize_tlbstate_and_flush(void)
{ int i; struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); unsignedlong lam = mm_lam_cr3_mask(mm); unsignedlong cr3 = __read_cr3();
/* Assert that CR3 already references the right mm. */
WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
/* LAM expected to be disabled */
WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
WARN_ON(lam);
/* * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization * doesn't work like other CR4 bits because it can only be set from * long mode.)
*/
WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
!(cr4_read_shadow() & X86_CR4_PCIDE));
/* Disable LAM, force ASID 0 and force a TLB flush. */
write_cr3(build_cr3(mm->pgd, 0, 0));
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}
/* * flush_tlb_func()'s memory ordering requirement is that any * TLB fills that happen after we flush the TLB are ordered after we * read active_mm's tlb_gen. We don't need any explicit barriers * because all x86 flush operations are serializing and the * atomic64_read operation won't be reordered by the compiler.
*/ staticvoid flush_tlb_func(void *info)
{ /* * We have three different tlb_gen values in here. They are: * * - mm_tlb_gen: the latest generation. * - local_tlb_gen: the generation that this CPU has already caught * up to. * - f->new_tlb_gen: the generation that the requester of the flush * wants us to catch up to.
*/ conststruct flush_tlb_info *f = info; struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
u64 local_tlb_gen; bool local = smp_processor_id() == f->initiating_cpu; unsignedlong nr_invalidate = 0;
u64 mm_tlb_gen;
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
if (!local) {
inc_irq_stat(irq_tlb_count);
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
}
/* The CPU was left in the mm_cpumask of the target mm. Clear it. */ if (f->mm && f->mm != loaded_mm) {
cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm));
trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); return;
}
if (unlikely(loaded_mm == &init_mm)) return;
/* Reload the ASID if transitioning into or out of a global ASID */ if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) {
switch_mm_irqs_off(NULL, loaded_mm, NULL);
loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
}
/* Broadcast ASIDs are always kept up to date with INVLPGB. */ if (is_global_asid(loaded_mm_asid)) return;
if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) { /* * We're in lazy mode. We need to at least flush our * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. * * This should be rare, with native_flush_tlb_multi() skipping * IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL); return;
}
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
f->new_tlb_gen <= local_tlb_gen)) { /* * The TLB is already up to date in respect to f->new_tlb_gen. * While the core might be still behind mm_tlb_gen, checking * mm_tlb_gen unnecessarily would have negative caching effects * so avoid it.
*/ return;
}
/* * Defer mm_tlb_gen reading as long as possible to avoid cache * contention.
*/
mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
if (unlikely(local_tlb_gen == mm_tlb_gen)) { /* * There's nothing to do: we're already up to date. This can * happen if two concurrent flushes happen -- the first flush to * be handled can catch us all the way up, leaving no work for * the second flush.
*/ goto done;
}
/* * If we get to this point, we know that our TLB is out of date. * This does not strictly imply that we need to flush (it's * possible that f->new_tlb_gen <= local_tlb_gen), but we're * going to need to flush in the very near future, so we might * as well get it over with. * * The only question is whether to do a full or partial flush. * * We do a partial flush if requested and two extra conditions * are met: * * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that * we've always done all needed flushes to catch up to * local_tlb_gen. If, for example, local_tlb_gen == 2 and * f->new_tlb_gen == 3, then we know that the flush needed to bring * us up to date for tlb_gen 3 is the partial flush we're * processing. * * As an example of why this check is needed, suppose that there * are two concurrent flushes. The first is a full flush that * changes context.tlb_gen from 1 to 2. The second is a partial * flush that changes context.tlb_gen from 2 to 3. If they get * processed on this CPU in reverse order, we'll see * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. * If we were to use __flush_tlb_one_user() and set local_tlb_gen to * 3, we'd be break the invariant: we'd update local_tlb_gen above * 1 without the full flush that's needed for tlb_gen 2. * * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. * Partial TLB flushes are not all that much cheaper than full TLB * flushes, so it seems unlikely that it would be a performance win * to do a partial flush if that won't bring our TLB fully up to * date. By doing a full flush instead, we can increase * local_tlb_gen all the way to mm_tlb_gen and we can probably * avoid another flush in the very near future.
*/ if (f->end != TLB_FLUSH_ALL &&
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) { /* Partial flush */ unsignedlong addr = f->start;
/* * Order the 'loaded_mm' and 'is_lazy' against their * write ordering in switch_mm_irqs_off(). Ensure * 'is_lazy' is at least as new as 'loaded_mm'.
*/
smp_rmb();
/* Lazy TLB will get flushed at the next context switch. */ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) returnfalse;
/* No mm means kernel memory flush. */ if (!info->mm) returntrue;
/* * While switching, the remote CPU could have state from * either the prev or next mm. Assume the worst and flush.
*/ if (loaded_mm == LOADED_MM_SWITCHING) returntrue;
/* The target mm is loaded, and the CPU is not lazy. */ if (loaded_mm == info->mm) returntrue;
/* In cpumask, but not the loaded mm? Periodically remove by flushing. */ if (info->trim_cpumask) returntrue;
STATIC_NOPV void native_flush_tlb_multi(conststruct cpumask *cpumask, conststruct flush_tlb_info *info)
{ /* * Do accounting and tracing. Note that there are (and have always been) * cases in which a remote TLB flush will be traced, but eventually * would not happen.
*/
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); else
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
(info->end - info->start) >> PAGE_SHIFT);
/* * If no page tables were freed, we can skip sending IPIs to * CPUs in lazy TLB mode. They will flush the CPU themselves * at the next context switch. * * However, if page tables are getting freed, we need to send the * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping * up on the new contents of what used to be page tables, while * doing a speculative memory access.
*/ if (info->freed_tables || mm_in_asid_transition(info->mm))
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); else
on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
(void *)info, 1, cpumask);
}
/*
 * See Documentation/arch/x86/tlb.rst for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
#ifdef CONFIG_DEBUG_VM /* * Ensure that the following code is non-reentrant and flush_tlb_info * is not overwritten. This means no TLB flushing is initiated by * interrupt handlers and machine-check exception handlers.
*/
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); #endif
/* * If the number of flushes is so large that a full flush * would be faster, do a full flush.
*/ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
start = 0;
end = TLB_FLUSH_ALL;
}
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
new_tlb_gen);
/* * flush_tlb_multi() is not optimized for the common case in which only * a local TLB flush is needed. Optimize this use-case by calling * flush_tlb_func_local() directly in this case.
*/ if (mm_global_asid(mm)) {
broadcast_tlb_flush(info);
} elseif (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
consider_global_asid(mm);
} elseif (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
flush_tlb_func(info);
local_irq_enable();
}
/* First try (faster) hardware-assisted TLB invalidation. */ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
invlpgb_flush_all(); else /* Fall back to the IPI-based invalidation. */
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
/* Flush an arbitrarily large range of memory with INVLPGB. */
static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
{
	unsigned long addr, nr;

	for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
		/* Pages remaining in the range, starting at 'addr'. */
		nr = (info->end - addr) >> PAGE_SHIFT;

		/*
		 * INVLPGB has a limit on the size of ranges it can
		 * flush. Break up large flushes.
		 */
		nr = clamp_val(nr, 1, invlpgb_count_max);

		invlpgb_flush_addr_nosync(addr, nr);
	}
	/* Wait for all the queued invalidations to complete. */
	__tlbsync();
}
/*
 * This can be used from process context to figure out what the value of
 * CR3 is without needing to do a (slow) __read_cr3().
 *
 * It's intended to be used for code like KVM that sneakily changes CR3
 * and needs to restore it.  It needs to be used very carefully.
 */
unsigned long __get_current_cr3_fast(void)
{
	unsigned long cr3 =
		build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
			  this_cpu_read(cpu_tlbstate.loaded_mm_asid),
			  tlbstate_lam_cr3_mask());

	/* For now, be very restrictive about when this can be called. */
	VM_WARN_ON(in_nmi() || preemptible());

	/* The cached value must match what the hardware actually holds. */
	VM_BUG_ON(cr3 != __read_cr3());
	return cr3;
}
/* * Flush one page in the kernel mapping
*/ void flush_tlb_one_kernel(unsignedlong addr)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
/* * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its * paravirt equivalent. Even with PCID, this is sufficient: we only * use PCID if we also use global PTEs for the kernel mapping, and * INVLPG flushes global translations across all address spaces. * * If PTI is on, then the kernel is mapped with non-global PTEs, and * __flush_tlb_one_user() will flush the given address for the current * kernel address space and for its usermode counterpart, but it does * not flush it for other address spaces.
*/
flush_tlb_one_user(addr);
if (!static_cpu_has(X86_FEATURE_PTI)) return;
/* * See above. We need to propagate the flush to all other address * spaces. In principle, we only need to propagate it to kernelmode * address spaces, but the extra bookkeeping we would need is not * worth it.
*/
this_cpu_write(cpu_tlbstate.invalidate_other, true);
}
/* * Flush one page in the user mapping
*/
STATIC_NOPV void native_flush_tlb_one_user(unsignedlong addr)
{
u32 loaded_mm_asid; bool cpu_pcide;
/* Flush 'addr' from the kernel PCID: */
invlpg(addr);
/* If PTI is off there is no user PCID and nothing to flush. */ if (!static_cpu_has(X86_FEATURE_PTI)) return;
/* * invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0. Check * 'cpu_pcide' to ensure that *this* CPU will not trigger those * #GP's even if called before CR4.PCIDE has been initialized.
*/ if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide)
invpcid_flush_one(user_pcid(loaded_mm_asid), addr); else
invalidate_user_asid(loaded_mm_asid);
}
if (static_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. * * Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all(); return;
}
/* * Read-modify-write to CR4 - protect it from preemption and * from interrupts. (Use the raw variant because this code can * be called from deep inside debugging code.)
*/
raw_local_irq_save(flags);
/* * Flush the entire current user mapping
*/
STATIC_NOPV void native_flush_tlb_local(void)
{ /* * Preemption or interrupts must be disabled to protect the access * to the per CPU variable and to prevent being preempted between * read_cr3() and write_cr3().
*/
WARN_ON_ONCE(preemptible());
/*
 * Flush everything
 */
void __flush_tlb_all(void)
{
	/*
	 * This is to catch users with enabled preemption and the PGE feature
	 * and don't trigger the warning in __native_flush_tlb().
	 */
	VM_WARN_ON_ONCE(preemptible());
	if (cpu_feature_enabled(X86_FEATURE_PGE)) {
		/* PGE set: a global flush is needed to zap global mappings. */
		__flush_tlb_global();
	} else {
		/*
		 * !PGE -> !PCID (setup_pcid()), thus every flush is total.
		 */
		flush_tlb_local();
	}
}
EXPORT_SYMBOL_GPL(__flush_tlb_all);
info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
TLB_GENERATION_INVALID); /* * flush_tlb_multi() is not optimized for the common case in which only * a local TLB flush is needed. Optimize this use-case by calling * flush_tlb_func_local() directly in this case.
*/ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
invlpgb_flush_all_nonglobals();
batch->unmapped_pages = false;
} elseif (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} elseif (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
flush_tlb_func(info);
local_irq_enable();
}
cpumask_clear(&batch->cpumask);
put_flush_tlb_info();
put_cpu();
}
/* * Blindly accessing user memory from NMI context can be dangerous * if we're in the middle of switching the current user task or * switching the loaded mm. It can also be dangerous if we * interrupted some kernel code that was temporarily using a * different mm.
*/ bool nmi_uaccess_okay(void)
{ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); struct mm_struct *current_mm = current->mm;
VM_WARN_ON_ONCE(!loaded_mm);
/* * The condition we want to check is * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, * if we're running in a VM with shadow paging, and nmi_uaccess_okay() * is supposed to be reasonably fast. * * Instead, we check the almost equivalent but somewhat conservative * condition below, and we rely on the fact that switch_mm_irqs_off() * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
*/ if (loaded_mm != current_mm) returnfalse;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.