// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2024, Microsoft Corporation. * * The main part of the mshv_root module, providing APIs to create * and manage guest partitions. * * Authors: Microsoft Linux virtualization team
*/
/* TODO move this to mshyperv.h when needed outside driver */ staticinlinebool hv_parent_partition(void)
{ return hv_root_partition();
}
/* TODO move this to another file when debugfs code is added */
enum hv_stats_vp_counters {			/* HV_THREAD_COUNTER */
#if defined(CONFIG_X86)
	VpRootDispatchThreadBlocked	= 201,
#elif defined(CONFIG_ARM64)
	VpRootDispatchThreadBlocked	= 94,
#endif
	VpStatsMaxCounter
};
/* Once we implement the fast extended hypercall ABI they can go away. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;
/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};
/*
 * NOTE(review): fragment — the enclosing function header (apparently the
 * hypercall-passthrough ioctl) is not visible in this chunk; args, i,
 * is_async, ret, status, input_pg, output_pg and user_args are declared in
 * the missing part. Confirm against the full file.
 */
/* Output, if requested, must fit within a single hypervisor page. */
if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) return -EINVAL;
/* Only allowlisted hypercall codes may be passed through. */
for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i) if (args.code == mshv_passthru_hvcalls[i]) break;
if (i >= ARRAY_SIZE(mshv_passthru_hvcalls)) return -EINVAL;
is_async = mshv_hvcall_is_async(args.code); if (is_async) { /* async hypercalls can only be called from partition fd */ if (!partition_locked) return -EINVAL;
ret = mshv_init_async_handler(partition); if (ret) return ret;
}
if (copy_from_user(input_pg, (void __user *)args.in_ptr,
args.in_sz)) {
ret = -EFAULT; goto free_pages_out;
}
/* * NOTE: This only works because all the allowed hypercalls' input * structs begin with a u64 partition_id field.
*/
*(u64 *)input_pg = partition->pt_id;
if (args.reps)
status = hv_do_rep_hypercall(args.code, args.reps, 0,
input_pg, output_pg); else
status = hv_do_hypercall(args.code, input_pg, output_pg);
if (hv_result(status) == HV_STATUS_CALL_PENDING) { if (is_async) {
mshv_async_hvcall_handler(partition, &status);
} else { /* Paranoia check. This shouldn't happen! */
ret = -EBADFD; goto free_pages_out;
}
}
/* On memory pressure, deposit one page with the partition and let the VMM retry. */
if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1); if (!ret)
ret = -EAGAIN;
} elseif (!hv_result_success(status)) {
ret = hv_result_to_errno(status);
}
/* * Always return the status and output data regardless of result. * The VMM may need it to determine how to proceed. E.g. the status may * contain the number of reps completed if a rep hypercall partially * succeeded.
*/
args.status = hv_result(status);
args.reps = args.reps ? hv_repcomp(status) : 0; if (copy_to_user(user_args, &args, sizeof(args)))
ret = -EFAULT;
if (output_pg &&
copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
ret = -EFAULT;
/*
 * Explicitly suspend a guest vCPU and report whether an intercept message
 * is still in flight for it.
 *
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested
 * by a dom0 vCPU on behalf of the guest vCPU) and thus it can race with an
 * "intercept" suspend done by the hypervisor. An "intercept" suspend leads
 * to asynchronous message delivery to dom0, which should be awaited to keep
 * the VP loop consistent (i.e. no message pending upon VP resume).
 *
 * A VP intercept suspend can't happen while the VP is already explicitly
 * suspended, so only two race orderings are possible:
 * 1. intercept suspend bit set -> explicit suspend bit set -> message sent
 * 2. intercept suspend bit set -> message sent -> explicit suspend bit set
 * Reading the intercept suspend bit after the explicit suspend request has
 * succeeded therefore reliably tells us, in either ordering, whether a
 * message must be received and delivered to the VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc ex_susp = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc in_susp = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	int ret;

	/* Request the explicit suspend first ... */
	ex_susp.value.explicit_suspend.suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &ex_susp);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	/* ... then sample the intercept suspend state (see ordering above). */
	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &in_susp);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = in_susp.value.intercept_suspend.suspended;

	return 0;
}
/* * This function is used when VPs are scheduled by the hypervisor's * scheduler. * * Caller has to make sure the registers contain cleared * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers * exactly in this order (the hypervisor clears them sequentially) to avoid * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the * opposite order.
*/ staticlong mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{ long ret; struct hv_register_assoc suspend_regs[2] = {
{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
};
size_t count = ARRAY_SIZE(suspend_regs);
/* Resume VP execution */
ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
count, suspend_regs); if (ret) {
vp_err(vp, "Failed to resume vp execution. %lx\n", ret); return ret;
}
/* Block until the hypervisor kicks this VP again. */
ret = wait_event_interruptible(vp->run.vp_suspend_queue,
vp->run.kicked_by_hv == 1); if (ret) { bool message_in_flight;
/* * Otherwise the waiting was interrupted by a signal: suspend * the vCPU explicitly and copy message in flight (if any).
*/
ret = mshv_suspend_vp(vp, &message_in_flight); if (ret) return ret;
/* Return if no message in flight */ if (!message_in_flight) return -EINTR;
/* Wait for the message in flight. */
wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
}
/* * Reset the flag to make the wait_event call above work * next time.
*/
vp->run.kicked_by_hv = 0;
/*
 * NOTE(review): th_flags and work_flags below are not declared anywhere in
 * the visible scope — this loop looks like a fragment of a different
 * function (the pre-guest-mode work loop) spliced in by the extraction.
 * Confirm against the full file before touching this code.
 */
th_flags = read_thread_flags(); while (th_flags & work_flags) { int ret;
/* nb: following will call schedule */
ret = mshv_do_pre_guest_mode_work(th_flags);
if (ret) return ret;
th_flags = read_thread_flags();
}
return 0;
}
/* Must be called with interrupts enabled */ staticlong mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{ long ret;
if (vp->run.flags.root_sched_blocked) { /* * Dispatch state of this VP is blocked. Need to wait * for the hypervisor to clear the blocked state before * dispatching it.
*/
ret = mshv_vp_wait_for_hv_kick(vp); if (ret) return ret;
}
do {
u32 flags = 0; struct hv_output_dispatch_vp output;
ret = mshv_pre_guest_mode_work(vp); if (ret) break;
if (vp->run.flags.intercept_suspend)
flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;
if (mshv_vp_interrupt_pending(vp))
flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;
ret = mshv_vp_dispatch(vp, flags, &output); if (ret) break;
vp->run.flags.intercept_suspend = 0;
if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { if (output.dispatch_event ==
HV_VP_DISPATCH_EVENT_SUSPEND) { /* * TODO: remove the warning once VP canceling * is supported
*/
WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), "%s: vp#%d: unexpected explicit suspend\n",
__func__, vp->vp_index); /* * Need to clear explicit suspend before * dispatching. * Explicit suspend is either: * - set right after the first VP dispatch or * - set explicitly via hypercall * Since the latter case is not yet supported, * simply clear it here.
*/
ret = mshv_vp_clear_explicit_suspend(vp); if (ret) break;
ret = mshv_vp_wait_for_hv_kick(vp); if (ret) break;
} else {
vp->run.flags.root_sched_blocked = 1;
ret = mshv_vp_wait_for_hv_kick(vp); if (ret) break;
}
} else { /* HV_VP_DISPATCH_STATE_READY */ if (output.dispatch_event ==
HV_VP_DISPATCH_EVENT_INTERCEPT)
vp->run.flags.intercept_suspend = 1;
}
} while (!vp->run.flags.intercept_suspend);
return ret;
}
/* The intercept message returned to userspace must fit the run-VP buffer. */
static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
staticlong mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{ long rc;
/*
 * NOTE(review): fragment — the lines below (page_count, user_pfn, check,
 * args, state_data, data_sz, ...) belong to the VP get/set-state ioctl
 * helpers, whose headers were dropped by the extraction. Confirm against
 * the full file.
 */
if (page_count > INT_MAX) return -EINVAL; /* * Check the arithmetic for wraparound/overflow. * The last page address in the buffer is: * (user_pfn + (page_count - 1)) * PAGE_SIZE
*/ if (check_add_overflow(user_pfn, (page_count - 1), &check)) return -EOVERFLOW; if (check_mul_overflow(check, PAGE_SIZE, &check)) return -EOVERFLOW;
/* Pin user pages so hypervisor can copy directly to them */
pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM;
if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) return -EFAULT;
switch (args.type) { case MSHV_VP_STATE_LAPIC:
state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
data_sz = HV_HYP_PAGE_SIZE; break; case MSHV_VP_STATE_XSAVE:
{
u64 data_sz_64;
/* Query which xsave states the partition supports ... */
ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
HV_PARTITION_PROPERTY_XSAVE_STATES,
&state_data.xsave.states.as_uint64); if (ret) return ret;
/* ... and the maximum xsave data size for the required buffer. */
ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
&data_sz_64); if (ret) return ret;
/* Report the required buffer size back to userspace. */
if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) return -EFAULT;
if (data_sz > args.buf_sz) return -EINVAL;
/* If the data is transmitted via pfns, delegate to helper */ if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { unsignedlong user_pfn = PFN_DOWN(args.buf_ptr);
size_t page_count = PFN_DOWN(args.buf_sz);
/*
 * NOTE(review): fragment — the switch below validates vma->vm_pgoff for the
 * VP mmap handler; the enclosing function header is not visible here.
 */
switch (vma->vm_pgoff) { case MSHV_VP_MMAP_OFFSET_REGISTERS: if (!vp->vp_register_page) return -ENODEV; break; case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: if (!vp->vp_intercept_msg_page) return -ENODEV; break; case MSHV_VP_MMAP_OFFSET_GHCB: if (!vp->vp_ghcb_page) return -ENODEV; break; default: return -EINVAL;
}
/*
 * NOTE(review): from here down this appears to be the create-VP ioctl body:
 * validate the requested index, create the VP in the hypervisor, map its
 * state pages, then publish it in pt_vp_array. Its header is also missing
 * from this chunk.
 */
if (copy_from_user(&args, arg, sizeof(args))) return -EFAULT;
if (args.vp_index >= MSHV_MAX_VPS) return -EINVAL;
if (partition->pt_vp_array[args.vp_index]) return -EEXIST;
ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
0 /* Only valid for root partition VPs */); if (ret) return ret;
ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
input_vtl_zero,
&intercept_message_page); if (ret) goto destroy_vp;
/* The register page is only available for non-confidential partitions. */
if (!mshv_partition_encrypted(partition)) {
ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
HV_VP_STATE_PAGE_REGISTERS,
input_vtl_zero,
&#174;ister_page); if (ret) goto unmap_intercept_message_page;
}
if (mshv_partition_encrypted(partition) &&
is_ghcb_mapping_available()) {
ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
HV_VP_STATE_PAGE_GHCB,
input_vtl_normal,
&ghcb_page); if (ret) goto unmap_register_page;
}
if (hv_parent_partition()) {
ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
stats_pages); if (ret) goto unmap_ghcb_page;
}
/* NOTE(review): ret is not set to -ENOMEM on this failure path — verify. */
vp = kzalloc(sizeof(*vp), GFP_KERNEL); if (!vp) goto unmap_stats_pages;
vp->vp_partition = mshv_partition_get(partition); if (!vp->vp_partition) {
ret = -EBADF; goto free_vp;
}
if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
vp->vp_ghcb_page = page_to_virt(ghcb_page);
if (hv_parent_partition())
memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
/* * Keep anon_inode_getfd last: it installs fd in the file struct and * thus makes the state accessible in user space.
*/
ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
O_RDWR | O_CLOEXEC); if (ret < 0) goto put_partition;
/* already exclusive with the partition mutex for all ioctls */
partition->pt_vp_count++;
partition->pt_vp_array[args.vp_index] = vp;
/* Only one async hypercall may be outstanding per partition at a time. */
staticint mshv_init_async_handler(struct mshv_partition *partition)
{ if (completion_done(&partition->async_hypercall)) {
pt_err(partition, "Cannot issue async hypercall while another one in progress!\n"); return -EPERM;
}
/*
 * NOTE(review): fragment — the lines below belong to the memory-region
 * creation/pinning helper; region, userspace_addr, nr_pages, pages,
 * page_offset, is_mmio and regionpp come from surrounding code dropped by
 * the extraction. Confirm against the full file.
 */
/* * Pinning assuming 4k pages works for large pages too. * All page structs within the large page are returned. * * Pin requests are batched because pin_user_pages_fast * with the FOLL_LONGTERM flag does a large temporary * allocation of contiguous memory.
*/ if (region->flags.range_pinned)
ret = pin_user_pages_fast(userspace_addr,
nr_pages,
FOLL_WRITE | FOLL_LONGTERM,
pages); else
ret = -EOPNOTSUPP;
if (ret < 0) goto release_pages;
}
if (PageHuge(region->pages[page_offset]))
region->flags.large_pages = true;
/* Note: large_pages flag populated when we pin the pages */ if (!is_mmio)
region->flags.range_pinned = true;
region->partition = partition;
*regionpp = region;
return 0;
}
/* * Map guest ram. if snp, make sure to release that from the host first * Side Effects: In case of failure, pages are unpinned when feasible.
*/ staticint
mshv_partition_mem_region_map(struct mshv_mem_region *region)
{ struct mshv_partition *partition = region->partition; int ret;
ret = mshv_region_populate(region); if (ret) {
pt_err(partition, "Failed to populate memory region: %d\n",
ret); goto err_out;
}
/* * For an SNP partition it is a requirement that for every memory region * that we are going to map for this partition we should make sure that * host access to that region is released. This is ensured by doing an * additional hypercall which will update the SLAT to release host * access to guest memory regions.
*/ if (mshv_partition_encrypted(partition)) {
ret = mshv_partition_region_unshare(region); if (ret) {
pt_err(partition, "Failed to unshare memory region (guest_pfn: %llu): %d\n",
region->start_gfn, ret); goto evict_region;
}
}
/* On map failure for an SNP partition, restore host access before unpinning. */
ret = mshv_region_map(region); if (ret && mshv_partition_encrypted(partition)) { int shrc;
shrc = mshv_partition_region_share(region); if (!shrc) goto evict_region;
pt_err(partition, "Failed to share memory region (guest_pfn: %llu): %d\n",
region->start_gfn, shrc); /* * Don't unpin if marking shared failed because pages are no * longer mapped in the host, ie root, anymore.
*/ goto err_out;
}
/*
 * NOTE(review): fragment — the success path and the evict_region/err_out
 * labels are not visible in this chunk.
 */
/* * This maps two things: guest RAM and for pci passthru mmio space. * * mmio: * - vfio overloads vm_pgoff to store the mmio start pfn/spa. * - Two things need to happen for mapping mmio range: * 1. mapped in the uaddr so VMM can access it. * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. * * This function takes care of the second. The first one is managed by vfio, * and hence is taken care of via vfio_pci_mmap_fault().
*/ staticlong
mshv_map_user_memory(struct mshv_partition *partition, struct mshv_user_mem_region mem)
{ struct mshv_mem_region *region; struct vm_area_struct *vma; bool is_mmio;
ulong mmio_pfn; long ret;
if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
!access_ok((constvoid *)mem.userspace_addr, mem.size)) return -EINVAL;
/*
 * NOTE(review): is_mmio/mmio_pfn/vma are consumed below but never assigned
 * in the visible lines — the vma lookup that sets them appears to have been
 * dropped by the extraction; confirm against the full file.
 */
ret = mshv_partition_create_region(partition, &mem, &#174;ion,
is_mmio); if (ret) return ret;
if (is_mmio)
ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
mmio_pfn, HVPFN_DOWN(mem.size)); else
ret = mshv_partition_mem_region_map(region);
if (ret) goto errout;
/* Install the new region */
hlist_add_head(&#174;ion->hnode, &partition->pt_mem_regions);
return 0;
errout:
vfree(region); return ret;
}
/* Called for unmapping both the guest ram and the mmio space */ staticlong
mshv_unmap_user_memory(struct mshv_partition *partition, struct mshv_user_mem_region mem)
{ struct mshv_mem_region *region;
u32 unmap_flags = 0;
if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) return -EINVAL;
region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); if (!region) return -EINVAL;
if (region->flags.large_pages)
unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
/* ignore unmap failures and continue as process may be exiting */
hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
region->nr_pages, unmap_flags);
/*
 * NOTE(review): fragment — the remainder of the unmap path (region removal
 * and eviction) is missing; the lines below belong to the GPA access-state
 * bitmap ioctl, whose header is likewise not visible here. states, args,
 * hv_flags, written, i and the buffer sizes come from the missing part.
 */
states = vzalloc(states_buf_sz); if (!states) return -ENOMEM;
ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
args.gpap_base, hv_flags, &written,
states); if (ret) goto free_return;
/* * Overwrite states buffer with bitmap - the bits in hv_type_mask * correspond to bitfields in hv_gpa_page_access_state
*/ for (i = 0; i < written; ++i)
__assign_bit(i, (ulong *)states,
states[i].as_uint8 & hv_type_mask);
/* zero the unused bits in the last byte(s) of the returned bitmap */ for (i = written; i < bitmap_buf_sz * 8; ++i)
__clear_bit(i, (ulong *)states);
if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
ret = -EFAULT;
/*
 * NOTE(review): fragment — this is the body of the partition ioctl
 * dispatcher; its header and the matching mutex_unlock are not visible in
 * this chunk. Partition ioctls are serialized under pt_mutex.
 */
if (mutex_lock_killable(&partition->pt_mutex)) return -EINTR;
switch (ioctl) { case MSHV_INITIALIZE_PARTITION:
ret = mshv_partition_ioctl_initialize(partition); break; case MSHV_SET_GUEST_MEMORY:
ret = mshv_partition_ioctl_set_memory(partition, uarg); break; case MSHV_CREATE_VP:
ret = mshv_partition_ioctl_create_vp(partition, uarg); break; case MSHV_IRQFD:
ret = mshv_partition_ioctl_irqfd(partition, uarg); break; case MSHV_IOEVENTFD:
ret = mshv_partition_ioctl_ioeventfd(partition, uarg); break; case MSHV_SET_MSI_ROUTING:
ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); break; case MSHV_GET_GPAP_ACCESS_BITMAP:
ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
uarg); break; case MSHV_ROOT_HVCALL:
ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); break; default:
ret = -ENOTTY;
}
/*
 * NOTE(review): the loop below belongs to a signal-draining helper
 * (hv_signal_count/vp_signal_count are not in the visible scope) — another
 * extraction splice; confirm against the full file.
 */
/* * There should be at most 1 outstanding notification, but be extra * careful anyway.
*/ while (hv_signal_count != vp_signal_count) {
WARN_ON(hv_signal_count - vp_signal_count != 1);
staticvoid drain_all_vps(conststruct mshv_partition *partition)
{ int i; struct mshv_vp *vp;
/* * VPs are reachable from ISR. It is safe to not take the partition * lock because nobody else can enter this function and drop the * partition from the list.
*/ for (i = 0; i < MSHV_MAX_VPS; i++) {
vp = partition->pt_vp_array[i]; if (!vp) continue; /* * Disable dispatching of the VP in the hypervisor. After this * the hypervisor guarantees it won't generate any signals for * the VP and the hypervisor's VP signal count won't change.
*/
disable_vp_dispatch(vp);
drain_vp_signals(vp);
}
}
/* * Tear down a partition and remove it from the list. * Partition's refcount must be 0
*/ staticvoid destroy_partition(struct mshv_partition *partition)
{ struct mshv_vp *vp; struct mshv_mem_region *region; int i, ret; struct hlist_node *n;
if (refcount_read(&partition->pt_ref_count)) {
pt_err(partition, "Attempt to destroy partition but refcount > 0\n"); return;
}
if (partition->pt_initialized) { /* * We only need to drain signals for root scheduler. This should be * done before removing the partition from the partition list.
*/ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
drain_all_vps(partition);
/* Remove vps */ for (i = 0; i < MSHV_MAX_VPS; ++i) {
vp = partition->pt_vp_array[i]; if (!vp) continue;
if (hv_parent_partition())
mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);
/*
 * NOTE(review): per-VP state-page unmapping appears to have been dropped
 * here by the extraction, and finalize would normally run once after the
 * loop rather than per VP — confirm against the full file.
 */
/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
hv_call_finalize_partition(partition->pt_id);
partition->pt_initialized = false;
}
remove_partition(partition);
/* Remove regions, regain access to the memory and unpin the pages */
hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
hnode) {
hlist_del(&#174;ion->hnode);
/* For SNP partitions, regain host access before unpinning. */
if (mshv_partition_encrypted(partition)) {
ret = mshv_partition_region_share(region); if (ret) {
pt_err(partition, "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
ret); return;
}
}
mshv_region_evict(region);
vfree(region);
}
/* Withdraw and free all pages we deposited */
hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
hv_call_delete_partition(partition->pt_id);
/* NOTE(review): fragment — the function's closing brace is missing from this chunk. */
/* Retrieve and stash the supported scheduler type */ staticint __init mshv_retrieve_scheduler_type(struct device *dev)
{ int ret;
ret = hv_retrieve_scheduler_type(&hv_scheduler_type); if (ret) return ret;
dev_info(dev, "Hypervisor using %s\n",
scheduler_type_to_string(hv_scheduler_type));
/* Reject scheduler types this driver does not know how to drive. */
switch (hv_scheduler_type) { case HV_SCHEDULER_TYPE_CORE_SMT: case HV_SCHEDULER_TYPE_LP_SMT: case HV_SCHEDULER_TYPE_ROOT: case HV_SCHEDULER_TYPE_LP: /* Supported scheduler, nothing to do */ break; default:
dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
hv_scheduler_type); return -EOPNOTSUPP;
}
/* NOTE(review): fragment — the trailing "return 0;" and closing brace are missing from this chunk. */
/*
 * NOTE(review): the German boilerplate below ("information compiled to the
 * best of our knowledge, no warranty of completeness/correctness/quality";
 * "syntax highlighting and measurement are still experimental") is web-page
 * residue from the source extraction, not part of this driver. Preserved
 * here as a comment pending removal so the file remains compilable:
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder
 * Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten
 * Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */