// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2024, Microsoft Corporation. * * The main part of the mshv_root module, providing APIs to create * and manage guest partitions. * * Authors: Microsoft Linux virtualization team
*/
/* TODO move this to mshyperv.h when needed outside driver */ staticinlinebool hv_parent_partition(void)
{ return hv_root_partition();
}
/* TODO move this to another file when debugfs code is added */
enum hv_stats_vp_counters {			/* HV_THREAD_COUNTER */
#if defined(CONFIG_X86)
	VpRootDispatchThreadBlocked	= 201,
#elif defined(CONFIG_ARM64)
	VpRootDispatchThreadBlocked	= 94,
#endif
	VpStatsMaxCounter
};
/* Once we implement the fast extended hypercall ABI they can go away. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;
/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};
/*
 * NOTE(review): fragment — the enclosing function header (apparently the
 * hypercall-passthrough ioctl) is not visible in this chunk; args, i,
 * is_async, ret, status, input_pg, output_pg and user_args are declared in
 * the missing part. Confirm against the full file.
 */
/* Output, if requested, must fit within a single hypervisor page. */
if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) return -EINVAL;
/* Only allowlisted hypercall codes may be passed through. */
for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i) if (args.code == mshv_passthru_hvcalls[i]) break;
if (i >= ARRAY_SIZE(mshv_passthru_hvcalls)) return -EINVAL;
is_async = mshv_hvcall_is_async(args.code); if (is_async) { /* async hypercalls can only be called from partition fd */ if (!partition_locked) return -EINVAL;
ret = mshv_init_async_handler(partition); if (ret) return ret;
}
if (copy_from_user(input_pg, (void __user *)args.in_ptr,
args.in_sz)) {
ret = -EFAULT; goto free_pages_out;
}
/* * NOTE: This only works because all the allowed hypercalls' input * structs begin with a u64 partition_id field.
*/
*(u64 *)input_pg = partition->pt_id;
if (args.reps)
status = hv_do_rep_hypercall(args.code, args.reps, 0,
input_pg, output_pg); else
status = hv_do_hypercall(args.code, input_pg, output_pg);
if (hv_result(status) == HV_STATUS_CALL_PENDING) { if (is_async) {
mshv_async_hvcall_handler(partition, &status);
} else { /* Paranoia check. This shouldn't happen! */
ret = -EBADFD; goto free_pages_out;
}
}
/* On memory pressure, deposit one page with the partition and let the VMM retry. */
if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1); if (!ret)
ret = -EAGAIN;
} elseif (!hv_result_success(status)) {
ret = hv_result_to_errno(status);
}
/* * Always return the status and output data regardless of result. * The VMM may need it to determine how to proceed. E.g. the status may * contain the number of reps completed if a rep hypercall partially * succeeded.
*/
args.status = hv_result(status);
args.reps = args.reps ? hv_repcomp(status) : 0; if (copy_to_user(user_args, &args, sizeof(args)))
ret = -EFAULT;
if (output_pg &&
copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
ret = -EFAULT;
/*
 * Explicitly suspend a guest vCPU and report whether an intercept message
 * is still in flight for it.
 *
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested
 * by a dom0 vCPU on behalf of the guest vCPU) and thus it can race with an
 * "intercept" suspend done by the hypervisor. An "intercept" suspend leads
 * to asynchronous message delivery to dom0, which should be awaited to keep
 * the VP loop consistent (i.e. no message pending upon VP resume).
 *
 * A VP intercept suspend can't happen while the VP is already explicitly
 * suspended, so only two race orderings are possible:
 * 1. intercept suspend bit set -> explicit suspend bit set -> message sent
 * 2. intercept suspend bit set -> message sent -> explicit suspend bit set
 * Reading the intercept suspend bit after the explicit suspend request has
 * succeeded therefore reliably tells us, in either ordering, whether a
 * message must be received and delivered to the VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc ex_susp = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc in_susp = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	int ret;

	/* Request the explicit suspend first ... */
	ex_susp.value.explicit_suspend.suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &ex_susp);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	/* ... then sample the intercept suspend state (see ordering above). */
	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &in_susp);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = in_susp.value.intercept_suspend.suspended;

	return 0;
}
/* * This function is used when VPs are scheduled by the hypervisor's * scheduler. * * Caller has to make sure the registers contain cleared * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers * exactly in this order (the hypervisor clears them sequentially) to avoid * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the * opposite order.
*/ staticlong mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{ long ret; struct hv_register_assoc suspend_regs[2] = {
{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
};
size_t count = ARRAY_SIZE(suspend_regs);
/* Resume VP execution */
ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
count, suspend_regs); if (ret) {
vp_err(vp, "Failed to resume vp execution. %lx\n", ret); return ret;
}
/* Block until the hypervisor kicks this VP again. */
ret = wait_event_interruptible(vp->run.vp_suspend_queue,
vp->run.kicked_by_hv == 1); if (ret) { bool message_in_flight;
/* * Otherwise the waiting was interrupted by a signal: suspend * the vCPU explicitly and copy message in flight (if any).
*/
ret = mshv_suspend_vp(vp, &message_in_flight); if (ret) return ret;
/* Return if no message in flight */ if (!message_in_flight) return -EINTR;
/* Wait for the message in flight. */
wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
}
/* * Reset the flag to make the wait_event call above work * next time.
*/
vp->run.kicked_by_hv = 0;
/*
 * NOTE(review): th_flags and work_flags below are not declared anywhere in
 * the visible scope — this loop looks like a fragment of a different
 * function (the pre-guest-mode work loop) spliced in by the extraction.
 * Confirm against the full file before touching this code.
 */
th_flags = read_thread_flags(); while (th_flags & work_flags) { int ret;
/* nb: following will call schedule */
ret = mshv_do_pre_guest_mode_work(th_flags);
if (ret) return ret;
th_flags = read_thread_flags();
}
return 0;
}
/* Must be called with interrupts enabled */ staticlong mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{ long ret;
if (vp->run.flags.root_sched_blocked) { /* * Dispatch state of this VP is blocked. Need to wait * for the hypervisor to clear the blocked state before * dispatching it.
*/
ret = mshv_vp_wait_for_hv_kick(vp); if (ret) return ret;
}
do {
u32 flags = 0; struct hv_output_dispatch_vp output;
ret = mshv_pre_guest_mode_work(vp); if (ret) break;
if (vp->run.flags.intercept_suspend)
flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;
if (mshv_vp_interrupt_pending(vp))
flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;
ret = mshv_vp_dispatch(vp, flags, &output); if (ret) break;
vp->run.flags.intercept_suspend = 0;
if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { if (output.dispatch_event ==
HV_VP_DISPATCH_EVENT_SUSPEND) { /* * TODO: remove the warning once VP canceling * is supported
*/
WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), "%s: vp#%d: unexpected explicit suspend\n",
__func__, vp->vp_index); /* * Need to clear explicit suspend before * dispatching. * Explicit suspend is either: * - set right after the first VP dispatch or * - set explicitly via hypercall * Since the latter case is not yet supported, * simply clear it here.
*/
ret = mshv_vp_clear_explicit_suspend(vp); if (ret) break;
ret = mshv_vp_wait_for_hv_kick(vp); if (ret) break;
} else {
vp->run.flags.root_sched_blocked = 1;
ret = mshv_vp_wait_for_hv_kick(vp); if (ret) break;
}
} else { /* HV_VP_DISPATCH_STATE_READY */ if (output.dispatch_event ==
HV_VP_DISPATCH_EVENT_INTERCEPT)
vp->run.flags.intercept_suspend = 1;
}
} while (!vp->run.flags.intercept_suspend);
return ret;
}
/* The intercept message returned to userspace must fit the run-VP buffer. */
static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
staticlong mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{ long rc;
/*
 * NOTE(review): fragment — the lines below (page_count, user_pfn, check,
 * args, state_data, data_sz, ...) belong to the VP get/set-state ioctl
 * helpers, whose headers were dropped by the extraction. Confirm against
 * the full file.
 */
if (page_count > INT_MAX) return -EINVAL; /* * Check the arithmetic for wraparound/overflow. * The last page address in the buffer is: * (user_pfn + (page_count - 1)) * PAGE_SIZE
*/ if (check_add_overflow(user_pfn, (page_count - 1), &check)) return -EOVERFLOW; if (check_mul_overflow(check, PAGE_SIZE, &check)) return -EOVERFLOW;
/* Pin user pages so hypervisor can copy directly to them */
pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM;
if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) return -EFAULT;
switch (args.type) { case MSHV_VP_STATE_LAPIC:
state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
data_sz = HV_HYP_PAGE_SIZE; break; case MSHV_VP_STATE_XSAVE:
{
u64 data_sz_64;
/* Query which xsave states the partition supports ... */
ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
HV_PARTITION_PROPERTY_XSAVE_STATES,
&state_data.xsave.states.as_uint64); if (ret) return ret;
/* ... and the maximum xsave data size for the required buffer. */
ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
&data_sz_64); if (ret) return ret;
/* Report the required buffer size back to userspace. */
if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) return -EFAULT;
if (data_sz > args.buf_sz) return -EINVAL;
/* If the data is transmitted via pfns, delegate to helper */ if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { unsignedlong user_pfn = PFN_DOWN(args.buf_ptr);
size_t page_count = PFN_DOWN(args.buf_sz);
/*
 * NOTE(review): fragment — the switch below validates vma->vm_pgoff for the
 * VP mmap handler; the enclosing function header is not visible here.
 */
switch (vma->vm_pgoff) { case MSHV_VP_MMAP_OFFSET_REGISTERS: if (!vp->vp_register_page) return -ENODEV; break; case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: if (!vp->vp_intercept_msg_page) return -ENODEV; break; case MSHV_VP_MMAP_OFFSET_GHCB: if (!vp->vp_ghcb_page) return -ENODEV; break; default: return -EINVAL;
}
/*
 * NOTE(review): from here down this appears to be the create-VP ioctl body:
 * validate the requested index, create the VP in the hypervisor, map its
 * state pages, then publish it in pt_vp_array. Its header is also missing
 * from this chunk.
 */
if (copy_from_user(&args, arg, sizeof(args))) return -EFAULT;
if (args.vp_index >= MSHV_MAX_VPS) return -EINVAL;
if (partition->pt_vp_array[args.vp_index]) return -EEXIST;
ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
0 /* Only valid for root partition VPs */); if (ret) return ret;
ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
input_vtl_zero,
&intercept_message_page); if (ret) goto destroy_vp;
/* The register page is only available for non-confidential partitions. */
if (!mshv_partition_encrypted(partition)) {
ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
HV_VP_STATE_PAGE_REGISTERS,
input_vtl_zero,
&#174;ister_page); if (ret) goto unmap_intercept_message_page;
}
if (mshv_partition_encrypted(partition) &&
is_ghcb_mapping_available()) {
ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
HV_VP_STATE_PAGE_GHCB,
input_vtl_normal,
&ghcb_page); if (ret) goto unmap_register_page;
}
if (hv_parent_partition()) {
ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
stats_pages); if (ret) goto unmap_ghcb_page;
}
/* NOTE(review): ret is not set to -ENOMEM on this failure path — verify. */
vp = kzalloc(sizeof(*vp), GFP_KERNEL); if (!vp) goto unmap_stats_pages;
vp->vp_partition = mshv_partition_get(partition); if (!vp->vp_partition) {
ret = -EBADF; goto free_vp;
}
if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
vp->vp_ghcb_page = page_to_virt(ghcb_page);
if (hv_parent_partition())
memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
/* * Keep anon_inode_getfd last: it installs fd in the file struct and * thus makes the state accessible in user space.
*/
ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
O_RDWR | O_CLOEXEC); if (ret < 0) goto put_partition;
/* already exclusive with the partition mutex for all ioctls */
partition->pt_vp_count++;
partition->pt_vp_array[args.vp_index] = vp;
/* Only one async hypercall may be outstanding per partition at a time. */
staticint mshv_init_async_handler(struct mshv_partition *partition)
{ if (completion_done(&partition->async_hypercall)) {
pt_err(partition, "Cannot issue async hypercall while another one in progress!\n"); return -EPERM;
}
/*
 * NOTE(review): fragment — the lines below belong to the memory-region
 * creation/pinning helper; region, userspace_addr, nr_pages, pages,
 * page_offset, is_mmio and regionpp come from surrounding code dropped by
 * the extraction. Confirm against the full file.
 */
/* * Pinning assuming 4k pages works for large pages too. * All page structs within the large page are returned. * * Pin requests are batched because pin_user_pages_fast * with the FOLL_LONGTERM flag does a large temporary * allocation of contiguous memory.
*/ if (region->flags.range_pinned)
ret = pin_user_pages_fast(userspace_addr,
nr_pages,
FOLL_WRITE | FOLL_LONGTERM,
pages); else
ret = -EOPNOTSUPP;
if (ret < 0) goto release_pages;
}
if (PageHuge(region->pages[page_offset]))
region->flags.large_pages = true;
/* Note: large_pages flag populated when we pin the pages */ if (!is_mmio)
region->flags.range_pinned = true;
region->partition = partition;
*regionpp = region;
return 0;
}
/* * Map guest ram. if snp, make sure to release that from the host first * Side Effects: In case of failure, pages are unpinned when feasible.
*/ staticint
mshv_partition_mem_region_map(struct mshv_mem_region *region)
{ struct mshv_partition *partition = region->partition; int ret;
ret = mshv_region_populate(region); if (ret) {
pt_err(partition, "Failed to populate memory region: %d\n",
ret); goto err_out;
}
/* * For an SNP partition it is a requirement that for every memory region * that we are going to map for this partition we should make sure that * host access to that region is released. This is ensured by doing an * additional hypercall which will update the SLAT to release host * access to guest memory regions.
*/ if (mshv_partition_encrypted(partition)) {
ret = mshv_partition_region_unshare(region); if (ret) {
pt_err(partition, "Failed to unshare memory region (guest_pfn: %llu): %d\n",
region->start_gfn, ret); goto evict_region;
}
}
/* On map failure for an SNP partition, restore host access before unpinning. */
ret = mshv_region_map(region); if (ret && mshv_partition_encrypted(partition)) { int shrc;
shrc = mshv_partition_region_share(region); if (!shrc) goto evict_region;
pt_err(partition, "Failed to share memory region (guest_pfn: %llu): %d\n",
region->start_gfn, shrc); /* * Don't unpin if marking shared failed because pages are no * longer mapped in the host, ie root, anymore.
*/ goto err_out;
}
/*
 * NOTE(review): fragment — the success path and the evict_region/err_out
 * labels are not visible in this chunk.
 */
/* * This maps two things: guest RAM and for pci passthru mmio space. * * mmio: * - vfio overloads vm_pgoff to store the mmio start pfn/spa. * - Two things need to happen for mapping mmio range: * 1. mapped in the uaddr so VMM can access it. * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. * * This function takes care of the second. The first one is managed by vfio, * and hence is taken care of via vfio_pci_mmap_fault().
*/ staticlong
mshv_map_user_memory(struct mshv_partition *partition, struct mshv_user_mem_region mem)
{ struct mshv_mem_region *region; struct vm_area_struct *vma; bool is_mmio;
ulong mmio_pfn; long ret;
if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
!access_ok((constvoid *)mem.userspace_addr, mem.size)) return -EINVAL;
/*
 * NOTE(review): is_mmio/mmio_pfn/vma are consumed below but never assigned
 * in the visible lines — the vma lookup that sets them appears to have been
 * dropped by the extraction; confirm against the full file.
 */
ret = mshv_partition_create_region(partition, &mem, &#174;ion,
is_mmio); if (ret) return ret;
if (is_mmio)
ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
mmio_pfn, HVPFN_DOWN(mem.size)); else
ret = mshv_partition_mem_region_map(region);
if (ret) goto errout;
/* Install the new region */
hlist_add_head(&#174;ion->hnode, &partition->pt_mem_regions);
return 0;
errout:
vfree(region); return ret;
}
/* Called for unmapping both the guest ram and the mmio space */ staticlong
mshv_unmap_user_memory(struct mshv_partition *partition, struct mshv_user_mem_region mem)
{ struct mshv_mem_region *region;
u32 unmap_flags = 0;
if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) return -EINVAL;
region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); if (!region) return -EINVAL;
if (region->flags.large_pages)
unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
/* ignore unmap failures and continue as process may be exiting */
hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
region->nr_pages, unmap_flags);
/*
 * NOTE(review): fragment — the remainder of the unmap path (region removal
 * and eviction) is missing; the lines below belong to the GPA access-state
 * bitmap ioctl, whose header is likewise not visible here. states, args,
 * hv_flags, written, i and the buffer sizes come from the missing part.
 */
states = vzalloc(states_buf_sz); if (!states) return -ENOMEM;
ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
args.gpap_base, hv_flags, &written,
states); if (ret) goto free_return;
/* * Overwrite states buffer with bitmap - the bits in hv_type_mask * correspond to bitfields in hv_gpa_page_access_state
*/ for (i = 0; i < written; ++i)
__assign_bit(i, (ulong *)states,
states[i].as_uint8 & hv_type_mask);
/* zero the unused bits in the last byte(s) of the returned bitmap */ for (i = written; i < bitmap_buf_sz * 8; ++i)
__clear_bit(i, (ulong *)states);
if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
ret = -EFAULT;
/*
 * NOTE(review): fragment — this is the body of the partition ioctl
 * dispatcher; its header and the matching mutex_unlock are not visible in
 * this chunk. Partition ioctls are serialized under pt_mutex.
 */
if (mutex_lock_killable(&partition->pt_mutex)) return -EINTR;
switch (ioctl) { case MSHV_INITIALIZE_PARTITION:
ret = mshv_partition_ioctl_initialize(partition); break; case MSHV_SET_GUEST_MEMORY:
ret = mshv_partition_ioctl_set_memory(partition, uarg); break; case MSHV_CREATE_VP:
ret = mshv_partition_ioctl_create_vp(partition, uarg); break; case MSHV_IRQFD:
ret = mshv_partition_ioctl_irqfd(partition, uarg); break; case MSHV_IOEVENTFD:
ret = mshv_partition_ioctl_ioeventfd(partition, uarg); break; case MSHV_SET_MSI_ROUTING:
ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); break; case MSHV_GET_GPAP_ACCESS_BITMAP:
ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
uarg); break; case MSHV_ROOT_HVCALL:
ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); break; default:
ret = -ENOTTY;
}
/*
 * NOTE(review): the loop below belongs to a signal-draining helper
 * (hv_signal_count/vp_signal_count are not in the visible scope) — another
 * extraction splice; confirm against the full file.
 */
/* * There should be at most 1 outstanding notification, but be extra * careful anyway.
*/ while (hv_signal_count != vp_signal_count) {
WARN_ON(hv_signal_count - vp_signal_count != 1);
staticvoid drain_all_vps(conststruct mshv_partition *partition)
{ int i; struct mshv_vp *vp;
/* * VPs are reachable from ISR. It is safe to not take the partition * lock because nobody else can enter this function and drop the * partition from the list.
*/ for (i = 0; i < MSHV_MAX_VPS; i++) {
vp = partition->pt_vp_array[i]; if (!vp) continue; /* * Disable dispatching of the VP in the hypervisor. After this * the hypervisor guarantees it won't generate any signals for * the VP and the hypervisor's VP signal count won't change.
*/
disable_vp_dispatch(vp);
drain_vp_signals(vp);
}
}
/* * Tear down a partition and remove it from the list. * Partition's refcount must be 0
*/ staticvoid destroy_partition(struct mshv_partition *partition)
{ struct mshv_vp *vp; struct mshv_mem_region *region; int i, ret; struct hlist_node *n;
if (refcount_read(&partition->pt_ref_count)) {
pt_err(partition, "Attempt to destroy partition but refcount > 0\n"); return;
}
if (partition->pt_initialized) { /* * We only need to drain signals for root scheduler. This should be * done before removing the partition from the partition list.
*/ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
drain_all_vps(partition);
/* Remove vps */ for (i = 0; i < MSHV_MAX_VPS; ++i) {
vp = partition->pt_vp_array[i]; if (!vp) continue;
if (hv_parent_partition())
mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);
/*
 * NOTE(review): per-VP state-page unmapping appears to have been dropped
 * here by the extraction, and finalize would normally run once after the
 * loop rather than per VP — confirm against the full file.
 */
/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
hv_call_finalize_partition(partition->pt_id);
partition->pt_initialized = false;
}
remove_partition(partition);
/* Remove regions, regain access to the memory and unpin the pages */
hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
hnode) {
hlist_del(&#174;ion->hnode);
/* For SNP partitions, regain host access before unpinning. */
if (mshv_partition_encrypted(partition)) {
ret = mshv_partition_region_share(region); if (ret) {
pt_err(partition, "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
ret); return;
}
}
mshv_region_evict(region);
vfree(region);
}
/* Withdraw and free all pages we deposited */
hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
hv_call_delete_partition(partition->pt_id);
/* NOTE(review): fragment — the function's closing brace is missing from this chunk. */
/* Retrieve and stash the supported scheduler type */ staticint __init mshv_retrieve_scheduler_type(struct device *dev)
{ int ret;
ret = hv_retrieve_scheduler_type(&hv_scheduler_type); if (ret) return ret;
dev_info(dev, "Hypervisor using %s\n",
scheduler_type_to_string(hv_scheduler_type));
/* Reject scheduler types this driver does not know how to drive. */
switch (hv_scheduler_type) { case HV_SCHEDULER_TYPE_CORE_SMT: case HV_SCHEDULER_TYPE_LP_SMT: case HV_SCHEDULER_TYPE_ROOT: case HV_SCHEDULER_TYPE_LP: /* Supported scheduler, nothing to do */ break; default:
dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
hv_scheduler_type); return -EOPNOTSUPP;
}
/* NOTE(review): fragment — the trailing "return 0;" and closing brace are missing from this chunk. */
/*
 * NOTE(review): the German boilerplate below ("information compiled to the
 * best of our knowledge, no warranty of completeness/correctness/quality";
 * "syntax highlighting and measurement are still experimental") is web-page
 * residue from the source extraction, not part of this driver. Preserved
 * here as a comment pending removal so the file remains compilable:
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder
 * Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten
 * Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */