#ifndef CONFIG_PREEMPTION /* * Some hypercalls issued by the toolstack can take many 10s of * seconds. Allow tasks running hypercalls via the privcmd driver to * be voluntarily preempted even if full kernel preemption is * disabled. * * Such preemptible hypercalls are bracketed by * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() * calls.
*/
/*
 * Per-CPU boolean used by the preemptible-hypercall mechanism; it is
 * read on the local CPU by get_and_clear_inhcall().
 * NOTE(review): presumably set and cleared by
 * xen_preemptible_hcall_begin()/xen_preemptible_hcall_end() - confirm
 * against their definitions, which are not visible here.
 * Exported GPL-only so that modules can reference the symbol.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
/* * In case of scheduling the flag must be cleared and restored after * returning from schedule as the task might move to a different CPU.
*/ static __always_inline bool get_and_clear_inhcall(void)
{ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
/*
 * Report the lazy mode recorded for the current CPU.
 *
 * Returns XEN_LAZY_NONE whenever in_interrupt() is true; otherwise
 * returns this CPU's xen_lazy_mode value. The per-CPU variable is only
 * read when not in interrupt context.
 */
enum xen_lazy_mode xen_get_lazy_mode(void)
{
	return in_interrupt() ? XEN_LAZY_NONE : this_cpu_read(xen_lazy_mode);
}
/*
 * Updating the 3 TLS descriptors in the GDT on every task switch is
 * surprisingly expensive, so we avoid updating them if they haven't
 * changed. Since Xen writes different descriptors than the ones
 * passed in the update_descriptor hypercall, we keep shadow copies to
 * compare against.
 *
 * shadow_tls_desc is the per-CPU storage for those shadow copies.
 */
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
for (reg = 0; reg < MTRR_MAX_VAR_RANGES; reg++) {
op.u.read_memtype.reg = reg; if (HYPERVISOR_platform_op(&op)) break;
/* * Only called in dom0, which has all RAM PFNs mapped at * RAM MFNs, and all PCI space etc. is identity mapped. * This means we can treat MFN == PFN regarding MTRR settings.
*/
var[reg].base_lo = op.u.read_memtype.type;
var[reg].base_lo |= op.u.read_memtype.mfn << PAGE_SHIFT;
var[reg].base_hi = op.u.read_memtype.mfn >> (32 - PAGE_SHIFT);
mask = ~((op.u.read_memtype.nr_mfns << PAGE_SHIFT) - 1);
mask &= (1UL << width) - 1; if (mask)
mask |= MTRR_PHYSMASK_V;
var[reg].mask_lo = mask;
var[reg].mask_hi = mask >> 32;
}
/* Only overwrite MTRR state if any MTRR could be got from Xen. */ if (reg)
guest_force_mtrr_state(var, reg, MTRR_TYPE_UNCACHABLE); #endif
}
staticvoid __init xen_pv_init_platform(void)
{ /* PV guests can't operate virtio devices without grants. */ if (IS_ENABLED(CONFIG_XEN_VIRTIO))
virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc);
/* * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible.
*/ switch (*ax) { case 0x1: /* Replace initial APIC ID in bits 24-31 of EBX. */ /* See xen_pv_smp_config() for related topology preparations. */
maskebx = 0x00ffffff;
or_ebx = smp_processor_id() << 24; break;
case CPUID_LEAF_MWAIT: /* Synthesize the values.. */
*ax = 0;
*bx = 0;
*cx = cpuid_leaf5_ecx_val;
*dx = cpuid_leaf5_edx_val; return;
/* We need to determine whether it is OK to expose the MWAIT * capability to the kernel to harvest deeper than C3 states from ACPI * _CST using the processor_harvest_xen.c module. For this to work, we * need to gather the MWAIT_LEAF values (which the cstate.c code * checks against). The hypervisor won't expose the MWAIT flag because * it would break backwards compatibility; so we will find out directly * from the hardware and hypercall.
*/ if (!xen_initial_domain()) returnfalse;
/* * When running under platform earlier than Xen4.2, do not expose * mwait, to avoid the risk of loading native acpi pad driver
*/ if (!xen_running_on_version_or_later(4, 2)) returnfalse;
/* We need to emulate the MWAIT_LEAF and for that we need both * ecx and edx. The hypercall provides only partial information.
*/
ax = CPUID_LEAF_MWAIT;
bx = 0;
cx = 0;
dx = 0;
native_cpuid(&ax, &bx, &cx, &dx);
/* Ask the Hypervisor whether to clear ACPI_PROC_CAP_C_C2C3_FFH. If so, * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
*/
buf[0] = ACPI_PDC_REVISION_ID;
buf[1] = 1;
buf[2] = (ACPI_PROC_CAP_C_CAPABILITY_SMP | ACPI_PROC_CAP_EST_CAPABILITY_SWSMP);
/* * Xen PV would need some work to support PCID: CR3 handling as well * as xen_flush_tlb_others() would need updating.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
if (!xen_initial_domain())
setup_clear_cpu_cap(X86_FEATURE_ACPI);
if (xen_check_mwait())
setup_force_cpu_cap(X86_FEATURE_MWAIT); else
setup_clear_cpu_cap(X86_FEATURE_MWAIT);
if (!xen_check_xsave()) {
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_OSXSAVE);
}
}
/* * Set the page permissions for a particular virtual address. If the * address is a vmalloc mapping (or other non-linear mapping), then * find the linear mapping of the page and also set its protections to * match.
*/ staticvoid set_aliased_prot(void *v, pgprot_t prot)
{ int level;
pte_t *ptep;
pte_t pte; unsignedlong pfn; unsignedchar dummy; void *va;
/*
 * Careful: update_va_mapping() will fail if the virtual address
 * we're poking isn't populated in the page tables.  We don't
 * need to worry about the direct map (that's always in the page
 * tables), but we need to be careful about vmap space.  In
 * particular, the top level page table can lazily propagate
 * entries between processes, so if we've switched mms since we
 * vmapped the target in the first place, we might not have the
 * top-level page table entry populated.
 *
 * We disable preemption because we want the same mm active when
 * we probe the target and when we issue the hypercall.  We'll
 * have the same nominal mm, but if we're a kernel thread, lazy
 * mm dropping could change our pgd.
 *
 * Out of an abundance of caution, this uses
 * copy_from_kernel_nofault() to fault in the target address just
 * in case there's some obscure case in which the target address
 * isn't readable.
 */
preempt_disable();
copy_from_kernel_nofault(&dummy, v, 1);
if (HYPERVISOR_update_va_mapping((unsignedlong)v, pte, 0))
BUG();
va = __va(PFN_PHYS(pfn));
if (va != v && HYPERVISOR_update_va_mapping((unsignedlong)va, pte, 0))
BUG();
/*
 * We need to mark all the aliases of the LDT pages RO.  We
 * don't need to call vm_flush_aliases(), though, since that's
 * only responsible for flushing aliases out of the TLBs, not the
 * page tables, and Xen will flush the TLB for us if needed.
 *
 * To avoid confusing future readers: none of this is necessary
 * to load the LDT.  The hypervisor only checks this when the
 * LDT is faulted in due to subsequent descriptor access.
 */
for (i = 0; i < entries; i += entries_per_page)
set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}
/* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
BUG_ON(size > PAGE_SIZE);
BUG_ON(va & ~PAGE_MASK);
/* * The GDT is per-cpu and is in the percpu data area. * That can be virtually mapped, so we need to do a * page-walk to get the underlying MFN for the * hypercall. The page can also be in the kernel's * linear range, so we need to RO that mapping too.
*/
ptep = lookup_address(va, &level);
BUG_ON(ptep == NULL);
if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
BUG();
}
/* * load_gdt for early boot, when the gdt is only mapped once
*/ staticvoid __init xen_load_gdt_boot(conststruct desc_ptr *dtr)
{ unsignedlong va = dtr->address; unsignedint size = dtr->size + 1; unsignedlong pfn, mfn;
pte_t pte;
/* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
BUG_ON(size > PAGE_SIZE);
BUG_ON(va & ~PAGE_MASK);
staticvoid xen_load_tls(struct thread_struct *t, unsignedint cpu)
{ /* * In lazy mode we need to zero %fs, otherwise we may get an * exception between the new %fs descriptor being loaded and * %fs being effectively cleared at __switch_to().
*/ if (xen_get_lazy_mode() == XEN_LAZY_CPU)
loadsegment(fs, 0);
xen_mc_flush(); if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
BUG();
preempt_enable();
}
/*
 * Forward declaration of the debug-exception handler that
 * xenpv_exc_debug() calls when the trap was taken from user mode.
 * NOTE(review): the definition is outside this view - presumably the
 * non-IST flavour of the native #DB handler; confirm.
 */
void noist_exc_debug(struct pt_regs *regs);
/*
 * Xen PV entry point for NMI: forwards straight to exc_nmi() with the
 * trapped register state.
 */
DEFINE_IDTENTRY_RAW(xenpv_exc_nmi)
{
	/* On Xen PV, NMI doesn't use IST.  The C part is the same as native. */
	exc_nmi(regs);
}
/*
 * Xen PV entry point for double fault: forwards the register state and
 * the error code to exc_double_fault().
 */
DEFINE_IDTENTRY_RAW_ERRORCODE(xenpv_exc_double_fault)
{
	/* On Xen PV, DF doesn't use IST.  The C part is the same as native. */
	exc_double_fault(regs, error_code);
}
/*
 * Xen PV entry point for the debug exception.
 *
 * Xen PV has no IST, yet the trap still has to reach the right
 * handler: exc_debug() for traps taken in kernel mode,
 * noist_exc_debug() for traps taken in user mode.
 */
DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
{
	if (!user_mode(regs)) {
		exc_debug(regs);
		return;
	}

	noist_exc_debug(regs);
}
/*
 * Handler for a trap that has no known Xen PV handler: log an error
 * and BUG().
 */
DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap)
{
	/* This should never happen and there is no way to handle it. */
	/* The logging and BUG() are bracketed as an instrumentable region. */
	instrumentation_begin();
	pr_err("Unknown trap in Xen PV mode.");
	BUG();
	instrumentation_end();
}
#ifdef CONFIG_X86_MCE
/*
 * Xen PV entry point for machine check (only built with
 * CONFIG_X86_MCE).
 *
 * Xen PV has no IST, yet the trap still has to reach the right
 * handler: exc_machine_check() for traps taken in kernel mode,
 * noist_exc_machine_check() for traps taken in user mode.
 */
DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
{
	if (!user_mode(regs)) {
		exc_machine_check(regs);
		return;
	}

	noist_exc_machine_check(regs);
}
#endif
/* * Replace trap handler addresses by Xen specific ones. * Check for known traps using IST and whitelist them. * The debugger ones are the only ones we care about. * Xen will handle faults like double_fault, so we should never see * them. Warn if there's an unexpected IST-using fault handler.
*/ for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) { struct trap_array_entry *entry = trap_array + nr;
if (*addr == entry->orig) {
*addr = entry->xen;
ist_okay = entry->ist_okay;
found = true; break;
}
}
/*
 * Locations of each CPU's IDT: a per-CPU struct desc_ptr whose
 * .address and .size fields xen_write_idt_entry() reads to work out
 * the range covered by the current CPU's IDT.
 */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
/* Set an IDT entry. If the entry is part of the current IDT, then
also update Xen. */ staticvoid xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
{ unsignedlong p = (unsignedlong)&dt[entrynum]; unsignedlong start, end;
trace_xen_cpu_write_idt_entry(dt, entrynum, g);
preempt_disable();
start = __this_cpu_read(idt_desc.address);
end = start + __this_cpu_read(idt_desc.size) + 1;
/* Load a new IDT into Xen. In principle this can be per-CPU, so we hold a spinlock to protect the static traps[] array (static because
it avoids allocation, and saves stack space). */ staticvoid xen_load_idt(conststruct desc_ptr *desc)
{ static DEFINE_SPINLOCK(lock); staticstruct trap_info traps[257]; staticconststruct trap_info zero = { }; unsigned out;
xen_mc_flush(); if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
BUG();
}
}
preempt_enable();
}
/* * Version of write_gdt_entry for use at early boot-time needed to * update an entry as simply as possible.
*/ staticvoid __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, constvoid *desc, int type)
{
trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
switch (type) { case DESC_LDT: case DESC_TSS: /* ignore */ break;
/*
 * Read an MSR, supporting both the plain and the "safe" read variants.
 *
 * @msr: index of the MSR to read.
 * @err: when non-NULL, native_read_msr_safe() is used and its return
 *       value is stored in *err; when NULL, native_read_msr() is used.
 *
 * If pmu_msr_chk_emulated() returns true, the value it stored is
 * returned immediately and *err is not touched.
 *
 * Returns the value read; for MSR_IA32_APICBASE the X2APIC_ENABLE bit
 * is cleared and the BSP bit is set only when running on CPU 0.
 */
static u64 xen_do_read_msr(u32 msr, int *err)
{
	u64 msr_val = 0;	/* Avoid uninitialized value for safe variant. */

	if (pmu_msr_chk_emulated(msr, &msr_val, true))
		return msr_val;

	if (!err)
		msr_val = native_read_msr(msr);
	else
		*err = native_read_msr_safe(msr, &msr_val);

	if (msr == MSR_IA32_APICBASE) {
		msr_val &= ~X2APIC_ENABLE;

		/* Start with BSP cleared, then set it again on CPU 0 only. */
		msr_val &= ~MSR_IA32_APICBASE_BSP;
		if (smp_processor_id() == 0)
			msr_val |= MSR_IA32_APICBASE_BSP;
	}

	return msr_val;
}
/* * Support write_msr_safe() and write_msr() semantics. * With err == NULL write_msr() semantics are selected. * Supplying an err pointer requires err to be pre-initialized with 0.
*/ staticvoid xen_do_write_msr(u32 msr, u64 val, int *err)
{ switch (msr) { case MSR_FS_BASE:
set_seg(SEGBASE_FS, val); break;
case MSR_KERNEL_GS_BASE:
set_seg(SEGBASE_GS_USER, val); break;
case MSR_GS_BASE:
set_seg(SEGBASE_GS_KERNEL, val); break;
case MSR_STAR: case MSR_CSTAR: case MSR_LSTAR: case MSR_SYSCALL_MASK: case MSR_IA32_SYSENTER_CS: case MSR_IA32_SYSENTER_ESP: case MSR_IA32_SYSENTER_EIP: /* Fast syscall setup is all done in hypercalls, so these are all ignored. Stub them out here to stop
Xen console noise. */ break;
default: if (pmu_msr_chk_emulated(msr, &val, false)) return;
/* Construct a value which looks like it came from port 0x61. */ if (test_bit(_XEN_NMIREASON_io_error,
&HYPERVISOR_shared_info->arch.nmi_reason))
reason |= NMI_REASON_IOCHK; if (test_bit(_XEN_NMIREASON_pci_serr,
&HYPERVISOR_shared_info->arch.nmi_reason))
reason |= NMI_REASON_SERR;
op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
op.u.firmware_info.index = nr;
ret = HYPERVISOR_platform_op(&op); if (ret) break;
mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
}
boot_params.edd_mbr_sig_buf_entries = nr; #endif
}
/* * Set up the GDT and segment registers for -fstack-protector. Until * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel.
*/ staticvoid __init xen_setup_gdt(int cpu)
{
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
pv_ops.cpu.load_gdt = xen_load_gdt_boot;
/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
{ struct physdev_set_iopl set_iopl; unsignedlong initrd_start = 0; int rc;
xen_domain_type = XEN_PV_DOMAIN;
xen_start_flags = xen_start_info->flags; /* Interrupts are guaranteed to be off initially. */
early_boot_irqs_disabled = true;
static_call_update_early(xen_hypercall, xen_hypercall_pv);
/* * Setup xen_vcpu early because it is needed for * local_irq_disable(), irqs_disabled(), e.g. in printk(). * * Don't do the full vcpu_info placement stuff until we have * the cpu_possible_mask and a non-dummy shared_info.
*/
xen_vcpu_info_reset(0);
/* * Set up some pagetable state before starting to set any ptes.
*/
xen_setup_machphys_mapping();
xen_init_mmu_ops();
/* Prevent unwanted bits from being set in PTEs. */
__supported_pte_mask &= ~_PAGE_GLOBAL;
__default_kernel_pte_mask &= ~_PAGE_GLOBAL;
/* Get mfn list */
xen_build_dynamic_phys_to_machine();
/* Work out if we support NX */
get_cpu_cap(&boot_cpu_data);
x86_configure_nx();
/* * Set up kernel GDT and segment registers, mainly so that * -fstack-protector code can be executed.
*/
xen_setup_gdt(0);
/* Determine virtual and physical address sizes */
get_cpu_address_sizes(&boot_cpu_data);
/* Let's presume PV guests always boot on vCPU with id 0. */
per_cpu(xen_vcpu_id, 0) = 0;
idt_setup_early_handler();
xen_init_capabilities();
/* * set up the basic apic ops.
*/
xen_init_apic();
machine_ops = xen_machine_ops;
/* * The only reliable way to retain the initial address of the * percpu gdt_page is to remember it here, so we can go and * mark it RW later, when the initial percpu area is freed.
*/
xen_initial_gdt = &per_cpu(gdt_page, 0);
xen_smp_init();
#ifdef CONFIG_ACPI_NUMA
/*
 * The pages we get from Xen are not related to machine pages, so
 * any NUMA information the kernel tries to get from ACPI will
 * be meaningless.  Prevent it from trying.
 */
disable_srat(); #endif
WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
local_irq_disable();
xen_raw_console_write("mapping kernel into physical memory\n");
xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
xen_start_info->nr_pages);
xen_reserve_special_pages();
/*
 * We used to do this in xen_arch_setup, but that is too late
 * on AMD, where early_cpu_init (run before ->arch_setup()) calls
 * early_amd_init, which pokes the 0xcf8 port.
 */
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); if (rc != 0)
xen_raw_printk("physdev_op failed %d\n", rc);
if (xen_start_info->mod_start) { if (xen_start_info->flags & SIF_MOD_START_PFN)
initrd_start = PFN_PHYS(xen_start_info->mod_start); else
initrd_start = __pa(xen_start_info->mod_start);
}
/* Poke various useful things into boot_params */
boot_params.hdr.type_of_loader = (9 << 4) | 0;
boot_params.hdr.ramdisk_image = initrd_start;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
if (HYPERVISOR_platform_op(&op) == 0)
boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
/* Make sure ACS will be enabled */
pci_request_acs();
xen_acpi_sleep_register();
xen_boot_params_init_edd();
#ifdef CONFIG_ACPI /* * Disable selecting "Firmware First mode" for correctable * memory errors, as this is the duty of the hypervisor to * decide.
*/
acpi_disable_cmcff = 1; #endif
}
xen_add_preferred_consoles();
#ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */
pci_probe &= ~PCI_PROBE_BIOS; #endif
xen_raw_console_write("about to get started...\n");
/* We need this for printk timestamps */
xen_setup_runstate_info(0);
xen_efi_init(&boot_params);
/* Start the world */
cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
x86_64_start_reservations((char *)__pa_symbol(&boot_params));
}
staticint xen_cpu_up_prepare_pv(unsignedint cpu)
{ int rc;
if (per_cpu(xen_vcpu, cpu) == NULL) return -ENODEV;
xen_setup_timer(cpu);
rc = xen_smp_intr_init(cpu); if (rc) {
WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
cpu, rc); return rc;
}
rc = xen_smp_intr_init_pv(cpu); if (rc) {
WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
cpu, rc); return rc;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.