// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * x86 SMP booting functions
 *
 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
 * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
 * Copyright 2001 Andi Kleen, SuSE Labs.
 *
 * Much of the core SMP work is based on previous work by Thomas Radke, to
 * whom a great many thanks are extended.
 *
 * Thanks to Intel for making available several different Pentium,
 * Pentium Pro and Pentium-II/Xeon MP machines.
 * Original development of Linux SMP code supported by Caldera.
 *
 * Fixes
 *	Felix Koop	:	NR_CPUS used properly
 *	Jose Renau	:	Handle single CPU case.
 *	Alan Cox	:	By repeated request 8) - Total BogoMIPS report.
 *	Greg Wright	:	Fix for kernel stacks panic.
 *	Erich Boleyn	:	MP v1.4 and additional changes.
 *	Matthias Sattler:	Changes for 2.1 kernel map.
 *	Michel Lespinasse:	Changes for 2.1 kernel map.
 *	Michael Chastain:	Change trampoline.S to gnu as.
 *	Alan Cox	:	Dumb bug: 'B' step PPro's are fine
 *	Ingo Molnar	:	Added APIC timers, based on code from Jose Renau
 *	Ingo Molnar	:	various cleanups and rewrites
 *	Tigran Aivazian	:	fixed "0.00 in /proc/uptime on SMP" bug.
 *	Maciej W. Rozycki:	Bits for genuine 82489DX APICs
 *	Andi Kleen	:	Changed for SMP boot into long mode.
 *	Martin J. Bligh	:	Added support for multi-quad systems
 *	Dave Jones	:	Report invalid combinations of Athlon CPUs.
 *	Rusty Russell	:	Hacked into shape for new "hotplug" boot process.
 *	Andi Kleen	:	Converted to new state machine.
 *	Ashok Raj	:	CPU hotplug support
 *	Glauber Costa	:	i386 and x86_64 integration
 */
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);

/* representing HT and core siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);

/* representing HT, core, and die siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
EXPORT_PER_CPU_SYMBOL(cpu_die_map);

/* CPUs which are the primary SMT threads */
struct cpumask __cpu_primary_thread_mask __read_mostly;

/* Representing CPUs for which sibling maps can be computed */
static cpumask_var_t cpu_sibling_setup_mask;

/*
 * Cache line aligned data for mwait_play_dead(). Separate on purpose so
 * that it's unlikely to be touched by other CPUs.
 */
static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);

/* Maximum number of SMT threads on any online core */
int __read_mostly __max_smt_threads = 1;

/* Flag to indicate if a complete sched domain rebuild is required */
bool x86_topology_update;
/*
 * NOTE(review): this block appears garbled by extraction. The signature is
 * arch_update_cpu_topology(), but the body matches the AP boot "callin"
 * sequence: 'cpuid' is used without a visible declaration, 'retval' is
 * computed but never returned, and a non-void function falls off the end.
 * Reconcile against the original file before relying on this code.
 */
int arch_update_cpu_topology(void)
{ int retval = x86_topology_update;
	/*
	 * If woken up by an INIT in an 82489DX configuration the alive
	 * synchronization guarantees that the CPU does not reach this
	 * point before an INIT_deassert IPI reaches the local APIC, so it
	 * is now safe to touch the local APIC.
	 *
	 * Set up this CPU, first the APIC, which is probably redundant on
	 * most boards.
	 */
	apic_ap_setup();

	/* Save the processor parameters. */
	identify_secondary_cpu(cpuid);

	/*
	 * The topology information must be up to date before
	 * notify_cpu_starting().
	 */
	set_cpu_sibling_map(cpuid);

	ap_init_aperfmperf();

	pr_debug("Stack at about %p\n", &cpuid);
	wmb();

	/*
	 * This runs the AP through all the cpuhp states to its target
	 * state CPUHP_ONLINE.
	 */
	notify_cpu_starting(cpuid);
}
/* Calibrate this AP's delay loop; must run after the TSC sync check. */
static void ap_calibrate_delay(void)
{
	/*
	 * Calibrate the delay loop and update loops_per_jiffy in cpu_data.
	 * identify_secondary_cpu() stored a value that is close but not as
	 * accurate as the value just calculated.
	 *
	 * As this is invoked after the TSC synchronization check,
	 * calibrate_delay_is_known() will skip the calibration routine
	 * when TSC is synchronized across sockets.
	 */
	calibrate_delay();
	cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy;
}
/*
 * Activate a secondary processor.
 *
 * NOTE(review): extraction damage in this block — 'staticvoid' below is a
 * fused token, and the trailing WARN_ONCE/return statement uses 'c', 'o',
 * 'cpu1', 'name' and 'cpu2', none of which are declared here; that tail
 * appears to be spliced in from topology_sane(). Reconcile with the
 * original file.
 */
staticvoid notrace __noendbr start_secondary(void *unused)
{
	/*
	 * Don't put *anything* except direct CPU state initialization
	 * before cpu_init(), SMP booting is too fragile that we want to
	 * limit the things done here to the most necessary things.
	 */
	cr4_init();

	/*
	 * 32-bit specific. 64-bit reaches this code with the correct page
	 * table established. Yet another historical divergence.
	 */
	if (IS_ENABLED(CONFIG_X86_32)) {
		/* switch away from the initial page table */
		load_cr3(swapper_pg_dir);
		__flush_tlb_all();
	}

	cpu_init_exception_handling(false);

	/*
	 * Load the microcode before reaching the AP alive synchronization
	 * point below so it is not part of the full per CPU serialized
	 * bringup part when "parallel" bringup is enabled.
	 *
	 * That's even safe when hyperthreading is enabled in the CPU as
	 * the core code starts the primary threads first and leaves the
	 * secondary threads waiting for SIPI. Loading microcode on
	 * physical cores concurrently is a safe operation.
	 *
	 * This covers both the Intel specific issue that concurrent
	 * microcode loading on SMT siblings must be prohibited and the
	 * vendor independent issue that microcode loading which changes
	 * CPUID, MSRs etc. must be strictly serialized to maintain
	 * software state correctness.
	 */
	load_ucode_ap();

	/*
	 * Synchronization point with the hotplug core. Sets this CPUs
	 * synchronization state to ALIVE and spin-waits for the control CPU to
	 * release this CPU for further bringup.
	 */
	cpuhp_ap_sync_alive();

	/* Check TSC synchronization with the control CPU. */
	check_tsc_sync_target();

	/*
	 * Calibrate the delay loop after the TSC synchronization check.
	 * This allows to skip the calibration when TSC is synchronized
	 * across sockets.
	 */
	ap_calibrate_delay();

	speculative_store_bypass_ht_init();

	/*
	 * Lock vector_lock, set CPU online and bring the vector
	 * allocator online. Online must be set with vector_lock held
	 * to prevent a concurrent irq setup/teardown from seeing a
	 * half valid vector space.
	 */
	lock_vector_lock();
	set_cpu_online(smp_processor_id(), true);
	lapic_online();
	unlock_vector_lock();

	x86_platform.nmi_init();

	/* NOTE(review): spliced tail — belongs to topology_sane(), not here. */
	return !WARN_ONCE(!topology_same_node(c, o),
		"sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
		"[node: %d != %d]. Ignoring dependency.\n",
		cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
}
/*
 * Symmetrically link two CPUs in the topology mask returned by @mfunc:
 * c1 is set in c2's mask and c2 is set in c1's mask.
 */
#define link_mask(mfunc, c1, c2)					\
do {									\
	cpumask_set_cpu((c1), mfunc(c2));				\
	cpumask_set_cpu((c2), mfunc(c1));				\
} while (0)
/*
 * NOTE(review): extraction damage — this span fuses match_smt() with the
 * tail of what appears to be match_l2c() (the L2-cache-id comparison and
 * topology_sane(c, o, "l2c") at the end). The braces do not balance (one
 * closing brace is missing), 'cpu2' is only declared inside the TOPOEXT
 * branch, and 'staticbool'/'returnfalse' are fused tokens. All tokens are
 * preserved as-is; reconcile with the original file.
 */
staticbool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
		int cpu1 = c->cpu_index, cpu2 = o->cpu_index;

		if (c->topo.pkg_id == o->topo.pkg_id &&
		    c->topo.die_id == o->topo.die_id &&
		    c->topo.amd_node_id == o->topo.amd_node_id &&
		    per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) {
			/* Same core within same pkg/die/node/LLC -> SMT siblings */
			if (c->topo.core_id == o->topo.core_id)
				return topology_sane(c, o, "smt");

			/* Same compute unit also counts (0xff means invalid cu_id) */
			if ((c->topo.cu_id != 0xff) &&
			    (o->topo.cu_id != 0xff) &&
			    (c->topo.cu_id == o->topo.cu_id))
				return topology_sane(c, o, "smt");
		}

	/* If the arch didn't set up l2c_id, fall back to SMT */
	if (per_cpu_l2c_id(cpu1) == BAD_APICID)
		return match_smt(c, o);

	/* Do not match if L2 cache id does not match: */
	if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2))
		returnfalse;

	return topology_sane(c, o, "l2c");
}
/* * Unlike the other levels, we do not enforce keeping a * multicore group inside a NUMA node. If this happens, we will * discard the MC level of the topology later.
*/ staticbool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{ if (c->topo.pkg_id == o->topo.pkg_id) returntrue; returnfalse;
}
/*
 * NOTE(review): the span below (through the booted_cores loop) is a garbled
 * mix of fragments with missing function headers: an LLC-match tail
 * ('cpu1', 'cpu2', 'c', 'o', 'intel_snc' are undeclared here), the
 * x86_has_numa_in_package flag, a scheduler-topology fragment referencing
 * 'x86_topology'/'topology'/'cpu_smt_num_threads', and a booted_cores
 * accounting loop using 'i', 'o', 'c', 'cpu' and 'threads' without
 * declarations. Fused tokens ('returnfalse', 'staticbool', 'unsignedint',
 * 'elseif') are preserved as-is. Reconcile with the original file.
 *
 * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
 *
 * Any Intel CPU that has multiple nodes per package and does not
 * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
 *
 * When in SNC mode, these CPUs enumerate an LLC that is shared
 * by multiple NUMA nodes. The LLC is shared for off-package data
 * access but private to the NUMA node (half of the package) for
 * on-package access. CPUID (the source of the information about
 * the LLC) can only enumerate the cache as shared or unshared,
 * but not this particular configuration.
 */

	/* Do not match if we do not have a valid APICID for cpu: */
	if (per_cpu_llc_id(cpu1) == BAD_APICID)
		returnfalse;

	/* Do not match if LLC id does not match: */
	if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
		returnfalse;

	/*
	 * Allow the SNC topology without warning. Return of false
	 * means 'c' does not share the LLC of 'o'. This will be
	 * reflected to userspace.
	 */
	if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
		returnfalse;

/*
 * Set if a package/die has multiple NUMA nodes inside.
 * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
 * Sub-NUMA Clustering have this.
 */
staticbool x86_has_numa_in_package;

	/*
	 * When there is NUMA topology inside the package invalidate the
	 * PKG domain since the NUMA domains will auto-magically create the
	 * right spanning domains based on the SLIT.
	 */
	if (x86_has_numa_in_package) {
		unsignedint pkgdom = ARRAY_SIZE(x86_topology) - 2;

	/*
	 * Drop the SMT domains if there is only one thread per-core
	 * since it'll get degenerated by the scheduler anyways.
	 */
	if (cpu_smt_num_threads <= 1)
		++topology;

	/*
	 * This needs a separate iteration over the cpus because we rely on all
	 * topology_sibling_cpumask links to be set-up.
	 */
	for_each_cpu(i, cpu_sibling_setup_mask) {
		o = &cpu_data(i);

		/*
		 * Does this new cpu bringup a new core?
		 */
		if (threads == 1) {
			/*
			 * for each core in package, increment
			 * the booted_cores for this new cpu
			 */
			if (cpumask_first(
			    topology_sibling_cpumask(i)) == i)
				c->booted_cores++;
			/*
			 * increment the core count for all
			 * the other cpus in this package
			 */
			if (i != cpu)
				cpu_data(i).booted_cores++;
		} elseif (i != cpu && !c->booted_cores)
			c->booted_cores = cpu_data(i).booted_cores;
	}
}
}
/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
	/* CPUs sharing the last-level cache form the multi-core group. */
	return cpu_llc_shared_mask(cpu);
}
/*
 * The Multiprocessor Specification 1.4 (1997) example code suggests
 * that there should be a 10ms delay between the BSP asserting INIT
 * and de-asserting INIT, when starting a remote processor.
 * But that slows boot and resume on modern processors, which include
 * many cores and don't require that delay.
 *
 * Cmdline "cpu_init_udelay=" is available to override this delay.
 */
#define UDELAY_10MS_LEGACY 10000

/* Pick the INIT deassert delay: 0 on modern CPUs, 10ms on legacy ones. */
static void __init smp_set_init_udelay(void)
{
	/* if cmdline changed it from default, leave it alone */
	if (init_udelay != UINT_MAX)
		return;

	/* if modern processor, use no delay */
	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) ||
	    (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 >= 0x18) ||
	    (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xF)) {
		init_udelay = 0;
		return;
	}
	/* else, use legacy delay */
	init_udelay = UDELAY_10MS_LEGACY;
}
/*
 * Send the INIT assert / INIT deassert pair to the target CPU — the first
 * part of the INIT, INIT, STARTUP wakeup sequence.
 */
static void send_init_sequence(u32 phys_apicid)
{
	int maxlvt = lapic_get_maxlvt();

	/* Be paranoid about clearing APIC errors. */
	if (APIC_INTEGRATED(boot_cpu_apic_version)) {
		/* Due to the Pentium erratum 3AP. */
		if (maxlvt > 3)
			apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
	}

	/* Assert INIT on the target CPU */
	apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid);
	safe_apic_wait_icr_idle();

	/* MP spec 1.4 mandated 10ms here; init_udelay is 0 on modern CPUs */
	udelay(init_udelay);

	/* Deassert INIT on the target CPU */
	apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
	safe_apic_wait_icr_idle();
}
/*
 * Wake up AP by INIT, INIT, STARTUP sequence.
 *
 * NOTE(review): truncated by extraction — the body ends after the
 * num_starts selection; the STARTUP IPI loop and the return statement are
 * missing, and 'staticint'/'unsignedlong'/'unsignedint' are fused tokens.
 * Reconcile with the original file.
 */
staticint wakeup_secondary_cpu_via_init(u32 phys_apicid, unsignedlong start_eip, unsignedint cpu)
{
	unsignedlong send_status = 0, accept_status = 0;
	int num_starts, j, maxlvt;

	/*
	 * Should we send STARTUP IPIs ?
	 *
	 * Determine this based on the APIC version.
	 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
	 */
	if (APIC_INTEGRATED(boot_cpu_apic_version))
		num_starts = 2;
	else
		num_starts = 0;
/*
 * Reduce the number of lines printed when booting a large cpu count system.
 *
 * NOTE(review): garbled by extraction — announce_cpu()'s body is cut off
 * after its local declarations, and the remainder (irqstack init, 32-bit
 * stack setup, 'return 0') appears to belong to a different, int-returning
 * function: 'ret' and 'idle' are not declared here. Reconcile with the
 * original file.
 */
staticvoid announce_cpu(int cpu, int apicid)
{
	staticint width, node_width, first = 1;
	staticint current_node = NUMA_NO_NODE;
	int node = early_cpu_to_node(cpu);

	/* Initialize the interrupt stack(s) */
	ret = irq_init_percpu_irqstack(cpu);
	if (ret)
		return ret;

#ifdef CONFIG_X86_32
	/* Stack for startup_32 can be just as for start_secondary onwards */
	per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#endif
	return 0;
}
/*
 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
 * Returns zero if startup was successfully sent, else error code from
 * ->wakeup_secondary_cpu.
 */
static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
{
	unsigned long start_ip = real_mode_header->trampoline_start;
	int ret;

#ifdef CONFIG_X86_64
	/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
	if (apic->wakeup_secondary_cpu_64)
		start_ip = real_mode_header->trampoline_start64;
#endif
	idle->thread.sp = (unsigned long)task_pt_regs(idle);
	initial_code = (unsigned long)start_secondary;

	/* Enable the espfix hack for this CPU */
	init_espfix_ap(cpu);

	/* So we see what's up */
	announce_cpu(cpu, apicid);

	/*
	 * This grunge runs the startup process for
	 * the targeted processor.
	 */
	if (x86_platform.legacy.warm_reset) {
		pr_debug("Setting warm reset code and vector.\n");
		smpboot_setup_warm_reset_vector(start_ip);
		/*
		 * Be paranoid about clearing APIC errors.
		 */
		if (APIC_INTEGRATED(boot_cpu_apic_version)) {
			apic_write(APIC_ESR, 0);
			apic_read(APIC_ESR);
		}
	}

	smp_mb();

	/*
	 * Wake up a CPU in different cases:
	 * - Use a method from the APIC driver if one defined, with wakeup
	 *   straight to 64-bit mode preferred over wakeup to RM.
	 * Otherwise,
	 * - Use an INIT boot APIC message
	 */
	if (apic->wakeup_secondary_cpu_64)
		ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu);
	else if (apic->wakeup_secondary_cpu)
		ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu);
	else
		ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu);

	/* If the wakeup mechanism failed, cleanup the warm reset vector */
	if (ret)
		arch_cpuhp_cleanup_kick_cpu(cpu);
	return ret;
}
int native_kick_ap(unsignedint cpu, struct task_struct *tidle)
{
u32 apicid = apic->cpu_present_to_apicid(cpu); int err;
lockdep_assert_irqs_enabled();
pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
if (apicid == BAD_APICID || !apic_id_valid(apicid)) {
pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid); return -EINVAL;
}
if (!test_bit(apicid, phys_cpu_present_map)) {
pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid); return -EINVAL;
}
/* * Save current MTRR state in case it was changed since early boot * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
*/
mtrr_save_state();
/* the FPU context is blank, nobody can own it */
per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
err = common_cpu_up(cpu, tidle); if (err) return err;
err = do_boot_cpu(apicid, cpu, tidle); if (err)
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
void arch_cpuhp_cleanup_kick_cpu(unsignedint cpu)
{ /* Cleanup possible dangling ends... */ if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset)
smpboot_restore_warm_reset_vector();
}
void arch_cpuhp_cleanup_dead_cpu(unsignedint cpu)
{ if (smp_ops.cleanup_dead_cpu)
smp_ops.cleanup_dead_cpu(cpu);
if (system_state == SYSTEM_RUNNING)
pr_info("CPU %u is now offline\n", cpu);
}
/* Optional platform hook: poll the CPU hotplug synchronization state. */
void arch_cpuhp_sync_state_poll(void)
{
	if (!smp_ops.poll_sync_state)
		return;

	smp_ops.poll_sync_state();
}
/**
 * arch_disable_smp_support() - Disables SMP support for x86 at boottime
 *
 * Only action here is tearing down IO-APIC support; the system then runs
 * without SMP interrupt routing.
 */
void __init arch_disable_smp_support(void)
{
	disable_ioapic_support();
}
/*
 * Fall back to non SMP mode after errors.
 *
 * RED-PEN audit/test this more. I bet there is more state messed up here.
 *
 * NOTE(review): truncated by extraction — disable_smp()'s body ends after
 * the pr_info(), and the CONFIG_X86_64 section below opens
 * arch_cpuhp_init_parallel_bringup() whose body is also incomplete
 * ('returnfalse' is a fused token and the function is not closed here).
 * Reconcile with the original file.
 */
static __init void disable_smp(void)
{
	pr_info("SMP disabled\n");

#ifdef CONFIG_X86_64
/* Establish whether parallel bringup can be supported. */
bool __init arch_cpuhp_init_parallel_bringup(void)
{
	if (!x86_cpuinit.parallel_bringup) {
		pr_info("Parallel CPU startup disabled by the platform\n");
		returnfalse;
	}
/*
 * Prepare for SMP bootup.
 * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
 * for common interface support.
 *
 * NOTE(review): truncated by extraction — the function is not closed after
 * the final setup_percpu_clockev() call; its tail is missing from this
 * chunk, and 'unsignedint' below is a fused token.
 */
void __init native_smp_prepare_cpus(unsignedint max_cpus)
{
	smp_prepare_cpus_common();

	switch (apic_intr_mode) {
	case APIC_PIC:
	case APIC_VIRTUAL_WIRE_NO_CONFIG:
		/* No usable APIC interrupt routing: stay uniprocessor */
		disable_smp();
		return;
	case APIC_SYMMETRIC_IO_NO_ROUTING:
		disable_smp();
		/* Setup local timer */
		x86_init.timers.setup_percpu_clockev();
		return;
	case APIC_VIRTUAL_WIRE:
	case APIC_SYMMETRIC_IO:
		break;
	}

	/* Setup local timer */
	x86_init.timers.setup_percpu_clockev();
/*
 * NOTE(review): orphan fragment — this loop has no enclosing function
 * header in this chunk (it matches the core-map teardown normally found in
 * remove_siblinginfo()); 'cpu' and 'sibling' are not declared here and the
 * closing brace does not balance. The original comment opener
 * (slash-star-slash) was also damaged. Reconcile with the original file.
 */
for_each_cpu(sibling, topology_core_cpumask(cpu)) {
	cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
	/*
	 * last thread sibling in this cpu core going down
	 */
	if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
		cpu_data(sibling).booted_cores--;
}
/*
 * Common teardown when taking a CPU offline: drop it from the sibling
 * maps, forbid kernel-mode FPU use, remove it from the online maps under
 * vector_lock, migrate its interrupts and take its local APIC offline.
 */
void cpu_disable_common(void)
{
	int this_cpu = smp_processor_id();

	remove_siblinginfo(this_cpu);

	/*
	 * Stop allowing kernel-mode FPU. This is needed so that if the CPU is
	 * brought online again, the initial state is not allowed:
	 */
	this_cpu_write(kernel_fpu_allowed, false);

	/* It's now safe to remove this processor from the online map */
	lock_vector_lock();
	remove_cpu_from_maps(this_cpu);
	unlock_vector_lock();

	fixup_irqs();
	lapic_offline();
}
/*
 * Take the current CPU out of service: check that its local APIC can be
 * unplugged, run the common offline teardown, then soft-disable the APIC.
 * Returns 0 on success or the error from lapic_can_unplug_cpu().
 */
int native_cpu_disable(void)
{
	int ret = lapic_can_unplug_cpu();

	if (ret)
		return ret;

	cpu_disable_common();

	/*
	 * Disable the local APIC. Otherwise IPI broadcasts will reach
	 * it. It still responds normally to INIT, NMI, SMI, and SIPI
	 * messages.
	 *
	 * Disabling the APIC must happen after cpu_disable_common()
	 * which invokes fixup_irqs().
	 *
	 * Disabling the APIC preserves already set bits in IRR, but
	 * an interrupt arriving after disabling the local APIC does not
	 * set the corresponding IRR bit.
	 *
	 * fixup_irqs() scans IRR for set bits so it can raise a not
	 * yet handled interrupt on the new destination CPU via an IPI
	 * but obviously it can't do so for IRR bits which are not set.
	 * IOW, interrupts arriving after disabling the local APIC will
	 * be lost.
	 */
	apic_soft_disable();

	return 0;
}
/*
 * Common preparation before parking a dying CPU: detach the idle task,
 * report DEAD to the hotplug core, and shut off local interrupts.
 * The three calls below are order-dependent; do not reorder.
 */
void play_dead_common(void)
{
	idle_task_exit();
	/* Synchronization point with the hotplug control CPU */
	cpuhp_ap_report_dead();
	/* From here on this CPU must not service interrupts */
	local_irq_disable();
}
/*
 * We need to flush the caches before going to sleep, lest we have
 * dirty data in our caches when we come back up.
 *
 * Parks the dying CPU in an MWAIT loop with @eax_hint selecting the
 * target C-state, watching its per-CPU mwait_cpu_dead control word so
 * kexec can kick it into a HLT loop instead.
 *
 * NOTE(review): 'unsignedint' below is a fused token from extraction.
 */
void __noreturn mwait_play_dead(unsignedint eax_hint)
{
	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);

	/* Set up state for the kexec() hack below */
	md->status = CPUDEAD_MWAIT_WAIT;
	md->control = CPUDEAD_MWAIT_WAIT;

	/* Flush dirty cache lines before sleeping */
	wbinvd();

	while (1) {
		/*
		 * The CLFLUSH is a workaround for erratum AAI65 for
		 * the Xeon 7400 series. It's not clear it is actually
		 * needed, but it should be harmless in either case.
		 * The WBINVD is insufficient due to the spurious-wakeup
		 * case where we return around the loop.
		 */
		mb();
		clflush(md);
		mb();
		/* Arm the monitor on the control word, then sleep */
		__monitor(md, 0, 0);
		mb();
		__mwait(eax_hint, 0);

		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
			/*
			 * Kexec is about to happen. Don't go back into mwait() as
			 * the kexec kernel might overwrite text and data including
			 * page tables and stack. So mwait() would resume when the
			 * monitor cache line is written to and then the CPU goes
			 * south due to overwritten text, page tables and stack.
			 *
			 * Note: This does _NOT_ protect against a stray MCE, NMI,
			 * SMI. They will resume execution at the instruction
			 * following the HLT instruction and run into the problem
			 * which this is trying to prevent.
			 */
			WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
			while(1)
				native_halt();
		}
	}
}
/*
 * Kick all "offline" CPUs out of mwait on kexec(). See comment in
 * mwait_play_dead().
 *
 * NOTE(review): garbled by extraction — the for_each_cpu()-style loop
 * header and the assignment of 'md' (the per-CPU mwait_cpu_dead pointer)
 * are missing, so the 'continue' and 'md' uses below have no visible
 * context, and 'unsignedint' is a fused token. Reconcile with the
 * original file.
 */
void smp_kick_mwait_play_dead(void)
{
	u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
	struct mwait_cpu_dead *md;
	unsignedint cpu, i;

		/* Does it sit in mwait_play_dead() ? */
		if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
			continue;

		/* Wait up to 5ms */
		for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
			/* Bring it out of mwait */
			WRITE_ONCE(md->control, newstate);
			udelay(5);
		}

		if (READ_ONCE(md->status) != newstate)
			pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
	}
}
/*
 * Park the dying CPU in a HLT loop. On CPUs that have WBINVD (family 4
 * and later) flush dirty cache lines first so no data is lost while the
 * CPU sleeps.
 */
void __noreturn hlt_play_dead(void)
{
	if (__this_cpu_read(cpu_info.x86) >= 4)
		wbinvd();

	for (;;)
		native_halt();
}
/*
 * native_play_dead() is essentially a __noreturn function, but it can't
 * be marked as such as the compiler may complain about it.
 *
 * NOTE(review): truncated by extraction — the body ends after the IBRS
 * handling; the rest of the play-dead sequence and the closing brace are
 * missing from this chunk. Reconcile with the original file.
 */
void native_play_dead(void)
{
	/* Presumably resets SPEC_CTRL so the SMT sibling regains resources — verify */
	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
		__update_spec_ctrl(0);
(Extraction residue — a website disclaimer, not part of the source file. Translated from German:)
The information on this web page was compiled carefully and to the best of our knowledge.
However, no guarantee is made as to the completeness, correctness, or quality of the
information provided.
Note:
The colored syntax rendering and the timing measurement are still experimental.