SSL kvm_main.c Sprache: unbekannt

rahmenlose Ansicht.c DruckansichtUnknown {[0] [0] [0]}zum Wurzelverzeichnis wechseln

// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine (KVM) Hypervisor
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
*   Avi Kivity   <avi@qumranet.com>
*   Yaniv Kamay  <yaniv@qumranet.com>
*/

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
#include "vfio.h"

#include <trace/events/ipi.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

/*
* Worst case buffer size needed for holding an integer as a decimal string:
* up to 10 digits, an optional sign and the terminating NUL (assuming a
* 32-bit int).  kvm_create_vm_debugfs() sizes its "<pid>-<fdname>" directory
* name buffer as twice this value.
*/
#define ITOA_MAX_LEN 12

/* Module metadata. */
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
MODULE_LICENSE("GPL");

/*
* Halt-polling tunables.  Each one is a module parameter with mode 0644 and is
* exported (GPL-only) so that other modules can read it.
*/

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default halves per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink = 2;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

/*
* Allow direct access (from KVM or the CPU) without MMU notifier protection
* to unpinned pages.
*
* Exposed as a read-only (0444) module parameter; it defaults to false because
* it is a zero-initialized static.
*/
static bool allow_unsafe_mappings;
module_param(allow_unsafe_mappings, bool, 0444);

/*
* Ordering of locks:
*
* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
*/

/* kvm_lock is held while a VM is added to vm_list (see kvm_create_vm()). */
DEFINE_MUTEX(kvm_lock);
/* List of VMs, linked through kvm->vm_list. */
LIST_HEAD(vm_list);

/* Slab cache that vCPU structures are returned to in kvm_vcpu_destroy(). */
static struct kmem_cache *kvm_vcpu_cache;

/* Preempt notifier callbacks installed on every vCPU by kvm_vcpu_init(). */
static __read_mostly struct preempt_ops kvm_preempt_ops;
/* Per-CPU pointer to the vCPU loaded on that CPU; set by vcpu_load(), cleared by vcpu_put(). */
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

/* Parent debugfs directory under which the per-VM directories are created. */
static struct dentry *kvm_debugfs_dir;

/* File operations used for every per-VM stat file created in debugfs. */
static const struct file_operations stat_fops_per_vm;

/* Forward declarations of the vCPU ioctl entry points. */
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
/* Wire the supplied handler in as the compat ioctl of a file_operations. */
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * Architectures without compat support get two layers of protection:
 *  - a compat task is refused when it tries to open /dev/kvm, and
 *  - if a 64-bit task opened the KVM fd and handed it to a compat task,
 *    every ioctl issued by the latter fails.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg)
{
	return -EINVAL;
}

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	if (is_compat_task())
		return -ENODEV;

	return 0;
}
/* The argument is ignored: install the rejecting ioctl/open handlers. */
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);

/*
* Default (weak) implementation: does nothing.  Invoked by
* kvm_flush_shadow_all() and by the invalidate_range_start() MMU notifier
* once at least one memslot has been found.  Being __weak, an architecture
* can supply its own definition to replace this one.
*/
__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}

/*
 * Make @vcpu the vCPU loaded on the current CPU.  The switch stays in
 * effect until the matching vcpu_put().
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int this_cpu;

	/* Everything up to put_cpu() runs on the CPU returned here. */
	this_cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, this_cpu);

	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

/*
* Undo vcpu_load(): with preemption disabled, let the architecture put the
* vCPU, unregister its preempt notifier and clear this CPU's
* kvm_running_vcpu pointer.
*/
void vcpu_put(struct kvm_vcpu *vcpu)
{
preempt_disable();
kvm_arch_vcpu_put(vcpu);
preempt_notifier_unregister(&vcpu->preempt_notifier);
__this_cpu_write(kvm_running_vcpu, NULL);
preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

/*
 * Decide whether @vcpu has to be sent an IPI for request @req, based on the
 * mode reported by kvm_vcpu_exiting_guest_mode().
 *
 * TODO: merge with kvm_arch_vcpu_should_kick
 */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	const int vcpu_mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * Without KVM_REQUEST_WAIT only a running vCPU needs to be kicked;
	 * otherwise there is nothing to do.
	 */
	if (!(req & KVM_REQUEST_WAIT))
		return vcpu_mode == IN_GUEST_MODE;

	/*
	 * A waiting request must also wait for the vCPU to re-enable
	 * interrupts and to leave READING_SHADOW_PAGE_TABLES mode.
	 */
	return vcpu_mode != OUTSIDE_GUEST_MODE;
}

/*
* Intentionally empty IPI callback: kvm_kick_many_cpus() passes it to
* smp_call_function_many(), so the cross-CPU call itself is the only effect.
*/
static void ack_kick(void *_completed)
{
}

/*
 * Run the no-op ack_kick() callback on every CPU in @cpus, optionally
 * waiting for completion.  Returns true if a cross-CPU call was issued and
 * false if @cpus was empty.
 */
static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
	if (!cpumask_empty(cpus)) {
		smp_call_function_many(cpus, ack_kick, NULL, wait);
		return true;
	}

	return false;
}

/*
 * Post @req to @vcpu and, if the vCPU has to be interrupted to notice it,
 * add the CPU it is running on to @tmp.  @current_cpu is never added, as the
 * caller is already running there.
 */
static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
				  struct cpumask *tmp, int current_cpu)
{
	int target_cpu;

	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
		__kvm_make_request(req, vcpu);

	/* A successful wake-up means no IPI is needed. */
	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
		return;

	/*
	 * The vCPU may migrate to another pCPU at any time after
	 * kvm_request_needs_ipi(), in which case the IPI goes to the previous
	 * pCPU.  That is fine: the IPI only exists to get the vCPU back to
	 * OUTSIDE_GUEST_MODE, and a migration achieves exactly that.  A vCPU
	 * entering READING_SHADOW_PAGE_TABLES after this point is fine as
	 * well, because KVM only has to wait for vCPUs that were reading
	 * SPTEs _before_ any changes were finalized.  See kvm_vcpu_kick() for
	 * more details on handling requests.
	 */
	if (!kvm_request_needs_ipi(vcpu, req))
		return;

	target_cpu = READ_ONCE(vcpu->cpu);
	if (target_cpu == -1 || target_cpu == current_cpu)
		return;

	__cpumask_set_cpu(target_cpu, tmp);
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
     unsigned long *vcpu_bitmap)
{
struct kvm_vcpu *vcpu;
struct cpumask *cpus;
int i, me;
bool called;

me = get_cpu();

cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
cpumask_clear(cpus);

for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
  vcpu = kvm_get_vcpu(kvm, i);
  if (!vcpu)
   continue;
  kvm_make_vcpu_request(vcpu, req, cpus, me);
}

called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
put_cpu();

return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
struct kvm_vcpu *vcpu;
struct cpumask *cpus;
unsigned long i;
bool called;
int me;

me = get_cpu();

cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
cpumask_clear(cpus);

kvm_for_each_vcpu(i, vcpu, kvm)
  kvm_make_vcpu_request(vcpu, req, cpus, me);

called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
put_cpu();

return called;
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);

/*
 * Flush the TLBs of all vCPUs of @kvm.  The architecture hook is tried
 * first; if it returns non-zero, a KVM_REQ_TLB_FLUSH request is sent to
 * every vCPU instead.  The remote_tlb_flush statistic is bumped when the
 * arch hook returned zero or when the request kicked at least one CPU.
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	++kvm->stat.generic.remote_tlb_flush_requests;

	/*
	 * Modifications to the page tables must be published before the vCPU
	 * mode is read.  This pairs with a memory barrier in arch code:
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * kvm_make_all_cpus_request() already has an smp_mb__after_atomic()
	 * before it reads vcpu->mode; that barrier is reused here.
	 */
	if (kvm_arch_flush_remote_tlbs(kvm) &&
	    !kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		return;

	++kvm->stat.generic.remote_tlb_flush;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);

/*
 * Flush the TLB entries covering @nr_pages guest pages starting at @gfn on
 * all vCPUs of @kvm.
 */
void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
	/*
	 * Fall back to flushing the entire TLBs if the architecture's
	 * range-based TLB invalidation is unsupported or can't be performed
	 * for whatever reason (signalled by a non-zero return value).
	 */
	if (kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
		kvm_flush_remote_tlbs(kvm);
}

/*
* Flush the remote TLBs for the whole guest frame range covered by @memslot,
* i.e. memslot->npages pages starting at memslot->base_gfn.  The caller must
* hold kvm->slots_lock.
*/
void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
       const struct kvm_memory_slot *memslot)
{
/*
  * All current use cases for flushing the TLBs for a specific memslot
  * are related to dirty logging, and many do the TLB flush out of
  * mmu_lock. The interaction between the various operations on memslot
  * must be serialized by slots_lock to ensure the TLB flush from one
  * operation is observed by any other operation on the same memslot.
  */
lockdep_assert_held(&kvm->slots_lock);
kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}

/*
* Have the architecture flush all shadow state for @kvm, then notify it that
* guest memory has been reclaimed.
*/
static void kvm_flush_shadow_all(struct kvm *kvm)
{
kvm_arch_flush_shadow_all(kvm);
kvm_arch_guest_memory_reclaimed(kvm);
}

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
/*
 * Allocate a single object for @mc: from mc->kmem_cache if one is set,
 * otherwise a whole page.  mc->gfp_zero is always OR'ed into @gfp_flags.
 * A freshly allocated page is filled with mc->init_value when that value is
 * non-zero.  Returns NULL if the allocation fails.
 */
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	void *obj;

	gfp_flags |= mc->gfp_zero;

	if (!mc->kmem_cache) {
		obj = (void *)__get_free_page(gfp_flags);
		if (obj && mc->init_value)
			memset64(obj, mc->init_value, PAGE_SIZE / sizeof(u64));
		return obj;
	}

	return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
}

/*
 * Refill @mc up to @capacity objects if it currently holds fewer than @min.
 *
 * The object array is allocated on first use.  Returns 0 when the cache
 * already holds at least @min objects, when it was filled completely, or when
 * an allocation failed after @min objects were available.  Returns -ENOMEM
 * when memory runs out before that, and -EIO on misuse (zero capacity, a
 * capacity that differs from an earlier top-up, or an init_value combined
 * with a kmem_cache or gfp_zero).
 *
 * NOTE(review): when the fill loop completes, 0 is returned without
 * re-checking @min, so callers presumably never pass @min > @capacity -
 * confirm.
 */
int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
{
	gfp_t gfp;
	void *obj;

	if (mc->nobjs >= min)
		return 0;

	gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;

	if (unlikely(!mc->objects)) {
		if (WARN_ON_ONCE(!capacity))
			return -EIO;

		/*
		 * Custom init values can be used only for page allocations,
		 * and obviously conflict with __GFP_ZERO.
		 */
		if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
			return -EIO;

		mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
		if (!mc->objects)
			return -ENOMEM;

		mc->capacity = capacity;
	}

	/* It is illegal to request a different capacity across topups. */
	if (WARN_ON_ONCE(mc->capacity != capacity))
		return -EIO;

	for (; mc->nobjs < mc->capacity; mc->nobjs++) {
		obj = mmu_memory_cache_alloc_obj(mc, gfp);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs] = obj;
	}

	return 0;
}

/*
* Top up @mc so that it holds at least @min objects, using the architecture's
* default capacity (KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE).  Return values are
* those of __kvm_mmu_topup_memory_cache().
*/
int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
}

/* Return the number of objects currently held in @mc. */
int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
return mc->nobjs;
}

/*
 * Release every object still held in @mc, then the object array itself, and
 * reset the cache so that a later top-up allocates the array again.
 */
void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	void *obj;

	while (mc->nobjs) {
		obj = mc->objects[--mc->nobjs];

		/* Objects came either from the kmem_cache or from the page allocator. */
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, obj);
		else
			free_page((unsigned long)obj);
	}

	kvfree(mc->objects);

	mc->objects = NULL;
	mc->capacity = 0;
}

/*
 * Take one object out of @mc.  An empty cache triggers a warning and falls
 * back to an atomic allocation; if that fails too, the kernel BUG()s, so the
 * return value is never NULL.
 */
void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *obj;

	if (!WARN_ON(!mc->nobjs))
		obj = mc->objects[--mc->nobjs];
	else
		obj = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);

	BUG_ON(!obj);
	return obj;
}
#endif

/*
* Initialize the architecture-independent part of @vcpu: bind it to @kvm
* with vCPU id @id, mark it as not running on any CPU, set up its locks,
* wait queue and preempt notifier, and build its stats id string.
*/
static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
mutex_init(&vcpu->mutex);
/* -1: not on any CPU (kvm_make_vcpu_request() skips vCPUs whose cpu is -1). */
vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
vcpu->pid = NULL;
rwlock_init(&vcpu->pid_lock);
#ifndef __KVM_HAVE_ARCH_WQP
rcuwait_init(&vcpu->wait);
#endif
kvm_async_pf_vcpu_init(vcpu);

kvm_vcpu_set_in_spin_loop(vcpu, false);
kvm_vcpu_set_dy_eligible(vcpu, false);
vcpu->preempted = false;
vcpu->ready = false;
preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
vcpu->last_used_slot = NULL;

/*
  * Fill the stats id string for the vcpu.
  * NOTE(review): @id is unsigned but is formatted with %d; the output is
  * the same as with %u as long as ids stay below INT_MAX - confirm.
  */
snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
   task_pid_nr(current), id);
}

/*
* Tear down @vcpu: architecture state first, then the dirty ring, the
* reference on the owning pid, the vcpu->run page and finally the vCPU
* structure itself, which goes back to kvm_vcpu_cache.
*/
static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
kvm_arch_vcpu_destroy(vcpu);
kvm_dirty_ring_free(&vcpu->dirty_ring);

/*
  * No need for rcu_read_lock as VCPU_RUN is the only place that changes
  * the vcpu->pid pointer, and at destruction time all file descriptors
  * are already gone.
  */
put_pid(vcpu->pid);

free_page((unsigned long)vcpu->run);
kmem_cache_free(kvm_vcpu_cache, vcpu);
}

/*
 * Destroy every vCPU of @kvm, remove it from kvm->vcpu_array and reset the
 * online vCPU count to zero.
 */
void kvm_destroy_vcpus(struct kvm *kvm)
{
	struct kvm_vcpu *victim;
	unsigned long idx;

	kvm_for_each_vcpu(idx, victim, kvm) {
		kvm_vcpu_destroy(victim);
		xa_erase(&kvm->vcpu_array, idx);

		/*
		 * The vCPU must no longer be visible in any way.  Otherwise a
		 * VM-wide request triggered by destroying vCPUs (e.g. a remote
		 * TLB flush when tearing down MMUs, or marking the VM dead
		 * after a KVM_BUG_ON()) could result in a use-after-free.
		 */
		WARN_ON_ONCE(xa_load(&kvm->vcpu_array, idx) || kvm_get_vcpu(kvm, idx));
	}

	atomic_set(&kvm->online_vcpus, 0);
}
EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);

#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
/* Map an MMU notifier back to the struct kvm that embeds it. */
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
return container_of(mn, struct kvm, mmu_notifier);
}

/*
* Per-memslot handler called by kvm_handle_hva_range() with the guest frame
* range to act on; the boolean results of all calls are OR'ed together.
*/
typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

/*
* Callback run by kvm_handle_hva_range() right after it has taken mmu_lock,
* i.e. once the first intersecting memslot has been found.
*/
typedef void (*on_lock_fn_t)(struct kvm *kvm);

/* Description of one address-range operation for kvm_handle_hva_range(). */
struct kvm_mmu_notifier_range {
/*
  * 64-bit addresses, as KVM notifiers can operate on host virtual
  * addresses (unsigned long) and guest physical addresses (64-bit).
  * kvm_handle_hva_range() treats the range as [start, end).
  */
u64 start;
u64 end;
/* Copied verbatim into each kvm_gfn_range handed to the handler. */
union kvm_mmu_notifier_arg arg;
/* Called for every intersecting memslot; may be the kvm_null_fn stub. */
gfn_handler_t handler;
/* Called once after mmu_lock is taken; may be the kvm_null_fn stub. */
on_lock_fn_t on_lock;
/* Flush remote TLBs after the walk if any handler call returned true. */
bool flush_on_ret;
/* Forwarded to the handler through kvm_gfn_range.may_block. */
bool may_block;
/* Walk without taking mmu_lock; on_lock must then be the null stub. */
bool lockless;
};

/*
* The inner-most helper returns a tuple containing the return value from the
* arch- and action-specific handler, plus a flag indicating whether or not at
* least one memslot was found, i.e. if the handler found guest memory.
*
* Note, most notifiers are averse to booleans, so even though KVM tracks the
* return from arch code as a bool, outer helpers will cast it to an int. :-(
*/
typedef struct kvm_mmu_notifier_return {
/* OR of the values returned by all handler invocations. */
bool ret;
/* True if at least one memslot intersected the requested range. */
bool found_memslot;
} kvm_mn_ret_t;

/*
* Use a dedicated stub instead of NULL to indicate that there is no callback
* function/handler.  The compiler technically can't guarantee that a real
* function will have a non-zero address, and so it will generate code to
* check for !NULL, whereas comparing against a stub will be elided at compile
* time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
*/
static void kvm_null_fn(void)
{

}
/* True if @fn is the kvm_null_fn stub, i.e. no callback was supplied. */
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

/*
 * Iterate over each memslot intersecting the inclusive range [start, last].
 *
 * @node:  struct interval_tree_node cursor, assigned on every iteration
 * @slots: pointer to the struct kvm_memslots whose hva_tree is walked
 * @start: first address of the range
 * @last:  last address of the range (inclusive)
 *
 * The final line deliberately has no trailing backslash, so the macro ends
 * with the for-header and cannot absorb whatever line happens to follow it.
 */
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	       \
	for (node = interval_tree_iter_first(&(slots)->hva_tree, start, last); \
	     node;							       \
	     node = interval_tree_iter_next(node, start, last))

/*
* Apply @range to every memslot, in every address space, whose host virtual
* address range intersects [range->start, range->end).
*
* For each such memslot the overlap is converted to a guest frame range and
* passed to range->handler.  Unless the walk is lockless, mmu_lock is taken
* when the first memslot is found and range->on_lock is run at that point;
* if the handler is the null stub the walk stops right after on_lock.  When
* range->flush_on_ret is set and any handler call returned true, remote TLBs
* are flushed before the lock is dropped.
*
* Returns the OR of the handler results together with a flag telling whether
* any memslot was found.  Invalid requests (empty range, no callback at all,
* on_lock combined with lockless) warn once and return {false, false}.
*/
static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm,
        const struct kvm_mmu_notifier_range *range)
{
struct kvm_mmu_notifier_return r = {
  .ret = false,
  .found_memslot = false,
};
struct kvm_gfn_range gfn_range;
struct kvm_memory_slot *slot;
struct kvm_memslots *slots;
int i, idx;

if (WARN_ON_ONCE(range->end <= range->start))
  return r;

/* A null handler is allowed if and only if on_lock() is provided. */
if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
    IS_KVM_NULL_FN(range->handler)))
  return r;

/* on_lock will never be called for lockless walks */
if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock)))
  return r;

/* The memslots are walked under the SRCU read lock. */
idx = srcu_read_lock(&kvm->srcu);

for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  struct interval_tree_node *node;

  slots = __kvm_memslots(kvm, i);
  /* The iterator takes an inclusive last address, hence "end - 1". */
  kvm_for_each_memslot_in_hva_range(node, slots,
        range->start, range->end - 1) {
   unsigned long hva_start, hva_end;

   slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
   /* Clamp the requested range to the part covered by this memslot. */
   hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
   hva_end = min_t(unsigned long, range->end,
     slot->userspace_addr + (slot->npages << PAGE_SHIFT));

   /*
    * To optimize for the likely case where the address
    * range is covered by zero or one memslots, don't
    * bother making these conditional (to avoid writes on
    * the second or later invocation of the handler).
    */
   gfn_range.arg = range->arg;
   gfn_range.may_block = range->may_block;
   /*
    * HVA-based notifications aren't relevant to private
    * mappings as they don't have a userspace mapping.
    */
   gfn_range.attr_filter = KVM_FILTER_SHARED;

   /*
    * {gfn(page) | page intersects with [hva_start, hva_end)} =
    * {gfn_start, gfn_start+1, ..., gfn_end-1}.
    */
   gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
   gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
   gfn_range.slot = slot;
   gfn_range.lockless = range->lockless;

   /* First hit: take mmu_lock (unless lockless) and run on_lock once. */
   if (!r.found_memslot) {
    r.found_memslot = true;
    if (!range->lockless) {
     KVM_MMU_LOCK(kvm);
     if (!IS_KVM_NULL_FN(range->on_lock))
      range->on_lock(kvm);

     /* Nothing to call per memslot: skip the rest of the walk. */
     if (IS_KVM_NULL_FN(range->handler))
      goto mmu_unlock;
    }
   }
   r.ret |= range->handler(kvm, &gfn_range);
  }
}

if (range->flush_on_ret && r.ret)
  kvm_flush_remote_tlbs(kvm);

mmu_unlock:
/* mmu_lock was only taken if a memslot was found on a locked walk. */
if (r.found_memslot && !range->lockless)
  KVM_MMU_UNLOCK(kvm);

srcu_read_unlock(&kvm->srcu, idx);

return r;
}

/*
 * Run the aging @handler over the memslots intersecting the host virtual
 * range [@start, @end) of the VM that owns @mn.  The walk never blocks, uses
 * no on_lock callback and is lockless when CONFIG_KVM_MMU_LOCKLESS_AGING is
 * enabled.  Returns the accumulated handler result as an int.
 */
static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn,
					     unsigned long start,
					     unsigned long end,
					     gfn_handler_t handler,
					     bool flush_on_ret)
{
	const struct kvm_mmu_notifier_range age_range = {
		.start		= start,
		.end		= end,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= flush_on_ret,
		.may_block	= false,
		.lockless	= IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING),
	};

	return kvm_handle_hva_range(mmu_notifier_to_kvm(mn), &age_range).ret;
}

/* Same as kvm_age_hva_range(), but never flushes remote TLBs afterwards. */
static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn,
            unsigned long start,
            unsigned long end,
            gfn_handler_t handler)
{
return kvm_age_hva_range(mn, start, end, handler, false);
}

/*
 * Mark the start of an MMU invalidation.  Must be called with mmu_lock held
 * for write.  The first of possibly several nested invalidations also resets
 * the tracked range to "empty" (INVALID_GPA).
 */
void kvm_mmu_invalidate_begin(struct kvm *kvm)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_invalidate_in_progress++;

	/* Another invalidation is already in flight: keep its range. */
	if (unlikely(kvm->mmu_invalidate_in_progress != 1))
		return;

	kvm->mmu_invalidate_range_start = INVALID_GPA;
	kvm->mmu_invalidate_range_end = INVALID_GPA;
}

/*
 * Add the range from @start to @end to the range tracked for the
 * invalidation(s) in progress.  Must be called with mmu_lock held for write
 * and after kvm_mmu_invalidate_begin().
 */
void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);

	/* First range of this invalidation: record it as is. */
	if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
		kvm->mmu_invalidate_range_start = start;
		kvm->mmu_invalidate_range_end = end;
		return;
	}

	/*
	 * Fully tracking multiple concurrent ranges has diminishing returns.
	 * Keep things simple and just track the minimal range that includes
	 * both the current and the new range.  As there won't be enough
	 * information to subtract a range after its invalidate completes,
	 * any ranges invalidated concurrently will accumulate and persist
	 * until all outstanding invalidates complete.
	 */
	kvm->mmu_invalidate_range_start =
		min(kvm->mmu_invalidate_range_start, start);
	kvm->mmu_invalidate_range_end =
		max(kvm->mmu_invalidate_range_end, end);
}

/*
* Record @range in the invalidation range being tracked, then unmap it.
* Returns whatever kvm_unmap_gfn_range() returns.
*/
bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
return kvm_unmap_gfn_range(kvm, range);
}

/*
* MMU notifier callback for the start of an invalidation of @range.
*
* In order: bumps mn_active_invalidate_count, invalidates the pfn caches that
* overlap the range, unmaps the range from every intersecting memslot (with
* a remote TLB flush if anything was unmapped), and notifies the architecture
* if any memslot was found.  Always returns 0.
*/
static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
     const struct mmu_notifier_range *range)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
const struct kvm_mmu_notifier_range hva_range = {
  .start  = range->start,
  .end  = range->end,
  .handler = kvm_mmu_unmap_gfn_range,
  .on_lock = kvm_mmu_invalidate_begin,
  .flush_on_ret = true,
  .may_block = mmu_notifier_range_blockable(range),
};

trace_kvm_unmap_hva_range(range->start, range->end);

/*
  * Prevent memslot modification between range_start() and range_end()
  * so that conditionally locking provides the same result in both
  * functions.  Without that guarantee, the mmu_invalidate_in_progress
  * adjustments will be imbalanced.
  *
  * Pairs with the decrement in range_end().
  */
spin_lock(&kvm->mn_invalidate_lock);
kvm->mn_active_invalidate_count++;
spin_unlock(&kvm->mn_invalidate_lock);

/*
  * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
  * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
  * each cache's lock.  There are relatively few caches in existence at
  * any given time, and the caches themselves can check for hva overlap,
  * i.e. don't need to rely on memslot overlap checks for performance.
  * Because this runs without holding mmu_lock, the pfn caches must use
  * mn_active_invalidate_count (see above) instead of
  * mmu_invalidate_in_progress.
  */
gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);

/*
  * If one or more memslots were found and thus zapped, notify arch code
  * that guest memory has been reclaimed.  This needs to be done *after*
  * dropping mmu_lock, as x86's reclaim path is slooooow.
  */
if (kvm_handle_hva_range(kvm, &hva_range).found_memslot)
  kvm_arch_guest_memory_reclaimed(kvm);

return 0;
}

/*
* Mark the end of an MMU invalidation: bump the invalidation sequence count
* and then drop the in-progress count.  Must be called with mmu_lock held
* for write.
*/
void kvm_mmu_invalidate_end(struct kvm *kvm)
{
lockdep_assert_held_write(&kvm->mmu_lock);

/*
  * This sequence increase will notify the kvm page fault that
  * the page that is going to be mapped in the spte could have
  * been freed.
  */
kvm->mmu_invalidate_seq++;
smp_wmb();
/*
  * The above sequence increase must be visible before the
  * below count decrease, which is ensured by the smp_wmb above
  * in conjunction with the smp_rmb in mmu_invalidate_retry().
  */
kvm->mmu_invalidate_in_progress--;
/* An unbalanced end() would drive the count negative. */
KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);

/*
  * Assert that at least one range was added between start() and end().
  * Not adding a range isn't fatal, but it is a KVM bug.
  */
WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
}

/*
* MMU notifier callback for the end of an invalidation of @range.
*
* Runs kvm_mmu_invalidate_end() under mmu_lock if any memslot intersects the
* range, then drops mn_active_invalidate_count and wakes the waiter on
* mn_memslots_update_rcuwait once the count reaches zero.
*/
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
     const struct mmu_notifier_range *range)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
/* Null handler: only on_lock runs, and only if a memslot is found. */
const struct kvm_mmu_notifier_range hva_range = {
  .start  = range->start,
  .end  = range->end,
  .handler = (void *)kvm_null_fn,
  .on_lock = kvm_mmu_invalidate_end,
  .flush_on_ret = false,
  .may_block = mmu_notifier_range_blockable(range),
};
bool wake;

kvm_handle_hva_range(kvm, &hva_range);

/* Pairs with the increment in range_start(). */
spin_lock(&kvm->mn_invalidate_lock);
/* Warn instead of underflowing if the count is already zero. */
if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
  --kvm->mn_active_invalidate_count;
wake = !kvm->mn_active_invalidate_count;
spin_unlock(&kvm->mn_invalidate_lock);

/*
  * There can only be one waiter, since the wait happens under
  * slots_lock.
  */
if (wake)
  rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
}

/*
 * MMU notifier callback: age the guest mappings of the host virtual range
 * [@start, @end) and flush remote TLBs if anything was aged, unless
 * CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG is enabled.
 */
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	const bool flush = !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG);

	trace_kvm_age_hva(start, end);

	return kvm_age_hva_range(mn, start, end, kvm_age_gfn, flush);
}

/*
* MMU notifier callback: age the guest mappings of the host virtual range
* [@start, @end) without flushing remote TLBs.
*/
static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
     struct mm_struct *mm,
     unsigned long start,
     unsigned long end)
{
trace_kvm_age_hva(start, end);

/*
  * Even though we do not flush TLB, this will still adversely
  * affect performance on pre-Haswell Intel EPT, where there is
  * no EPT Access Bit to clear so that we have to tear down EPT
  * tables instead. If we find this unacceptable, we can always
  * add a parameter to kvm_age_hva so that it effectively doesn't
  * do anything on clear_young.
  *
  * Also note that currently we never issue secondary TLB flushes
  * from clear_young, leaving this job up to the regular system
  * cadence. If we find this inaccurate, we might come up with a
  * more sophisticated heuristic later.
  */
return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}

/*
 * MMU notifier callback: run kvm_test_age_gfn() on the guest mapping of the
 * single host virtual address @address.  No TLB flush is performed.
 */
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	/* The range helpers take an exclusive end address. */
	const unsigned long end = address + 1;

	trace_kvm_test_age_hva(address);

	return kvm_age_hva_range_no_flush(mn, address, end, kvm_test_age_gfn);
}

/*
 * MMU notifier release callback: flush all shadow state of the VM that owns
 * @mn while holding the SRCU read lock.
 */
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int srcu_idx = srcu_read_lock(&kvm->srcu);

	kvm_flush_shadow_all(kvm);

	srcu_read_unlock(&kvm->srcu, srcu_idx);
}

/* MMU notifier callbacks installed on every VM by kvm_init_mmu_notifier(). */
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
.clear_young  = kvm_mmu_notifier_clear_young,
.test_young  = kvm_mmu_notifier_test_young,
.release  = kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */

/* Without CONFIG_KVM_GENERIC_MMU_NOTIFIER there is nothing to register. */
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
return 0;
}

#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
/*
* PM notifier callback: forward the power-management @state to the
* architecture code of the VM that embeds @bl.
*/
static int kvm_pm_notifier_call(struct notifier_block *bl,
    unsigned long state,
    void *unused)
{
struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);

return kvm_arch_pm_notifier(kvm, state);
}

static void kvm_init_pm_notifier(struct kvm *kvm)
{
kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
/* Suspend KVM before we suspend ftrace, RCU, etc. */
kvm->pm_notifier.priority = INT_MAX;
register_pm_notifier(&kvm->pm_notifier);
}

/* Unregister the notifier installed by kvm_init_pm_notifier(). */
static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
unregister_pm_notifier(&kvm->pm_notifier);
}
#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
/* No-op: CONFIG_HAVE_KVM_PM_NOTIFIER is not set, so nothing is registered. */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}

/* No-op: CONFIG_HAVE_KVM_PM_NOTIFIER is not set, so nothing was registered. */
static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */

/* Free @memslot's dirty bitmap, if it has one, and clear the pointer. */
static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (memslot->dirty_bitmap) {
		vfree(memslot->dirty_bitmap);
		memslot->dirty_bitmap = NULL;
	}
}

/*
* Free @slot and everything hanging off it: the guest_memfd binding (if the
* slot has KVM_MEM_GUEST_MEMFD set), the dirty bitmap and the
* architecture-specific data.
*
* This does not remove the slot from struct kvm_memslots data structures.
*/
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
if (slot->flags & KVM_MEM_GUEST_MEMFD)
  kvm_gmem_unbind(slot);

kvm_destroy_dirty_bitmap(slot);

kvm_arch_free_memslot(kvm, slot);

kfree(slot);
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
struct hlist_node *idnode;
struct kvm_memory_slot *memslot;
int bkt;

/*
  * The same memslot objects live in both active and inactive sets,
  * arbitrarily free using index '1' so the second invocation of this
  * function isn't operating over a structure with dangling pointers
  * (even though this function isn't actually touching them).
  */
if (!slots->node_idx)
  return;

hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
  kvm_free_memslot(kvm, memslot);
}

/*
 * Pick the debugfs file mode for a stat: instantaneous stats are read-only
 * (0444); every other type, i.e. cumulative, peak and anything unknown, is
 * 0644.
 */
static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
{
	if ((pdesc->desc.flags & KVM_STATS_TYPE_MASK) == KVM_STATS_TYPE_INSTANT)
		return 0444;

	return 0644;
}

/*
 * Remove the VM's debugfs directory and free the per-file stat data.  Does
 * nothing if the directory was never created, i.e. debugfs_dentry still
 * holds an error pointer.
 */
static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
	const int nr_entries = kvm_vm_stats_header.num_desc +
			       kvm_vcpu_stats_header.num_desc;
	int idx;

	if (IS_ERR(kvm->debugfs_dentry))
		return;

	debugfs_remove_recursive(kvm->debugfs_dentry);

	if (!kvm->debugfs_stat_data)
		return;

	/* One entry per VM stat followed by one per vCPU stat. */
	for (idx = 0; idx < nr_entries; idx++)
		kfree(kvm->debugfs_stat_data[idx]);
	kfree(kvm->debugfs_stat_data);
}

static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
{
static DEFINE_MUTEX(kvm_debugfs_lock);
struct dentry *dent;
char dir_name[ITOA_MAX_LEN * 2];
struct kvm_stat_data *stat_data;
const struct _kvm_stats_desc *pdesc;
int i, ret = -ENOMEM;
int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
          kvm_vcpu_stats_header.num_desc;

if (!debugfs_initialized())
  return 0;

snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
mutex_lock(&kvm_debugfs_lock);
dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
if (dent) {
  pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
  dput(dent);
  mutex_unlock(&kvm_debugfs_lock);
  return 0;
}
dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
mutex_unlock(&kvm_debugfs_lock);
if (IS_ERR(dent))
  return 0;

kvm->debugfs_dentry = dent;
kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
      sizeof(*kvm->debugfs_stat_data),
      GFP_KERNEL_ACCOUNT);
if (!kvm->debugfs_stat_data)
  goto out_err;

for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
  pdesc = &kvm_vm_stats_desc[i];
  stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
  if (!stat_data)
   goto out_err;

  stat_data->kvm = kvm;
  stat_data->desc = pdesc;
  stat_data->kind = KVM_STAT_VM;
  kvm->debugfs_stat_data[i] = stat_data;
  debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
        kvm->debugfs_dentry, stat_data,
        &stat_fops_per_vm);
}

for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
  pdesc = &kvm_vcpu_stats_desc[i];
  stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
  if (!stat_data)
   goto out_err;

  stat_data->kvm = kvm;
  stat_data->desc = pdesc;
  stat_data->kind = KVM_STAT_VCPU;
  kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
  debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
        kvm->debugfs_dentry, stat_data,
        &stat_fops_per_vm);
}

kvm_arch_create_vm_debugfs(kvm);
return 0;
out_err:
kvm_destroy_vm_debugfs(kvm);
return ret;
}

/*
* Called just after removing the VM from the vm_list, but before doing any
* other destruction.
*
* This weak default does nothing.
*/
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

/*
* Called after the per-VM debugfs directory has been created.  At that point
* kvm->debugfs_dentry is already set up, so arch-specific debugfs entries can
* be created under it.  Cleanup is done automatically by the recursive
* removal in kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not
* needed.
*
* This weak default does nothing.
*/
void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
}

/*
* Allocate and initialize a new VM.
*
* @type:   VM type, handed to kvm_arch_init_vm().
* @fdname: handed to kvm_create_vm_debugfs().
*
* The VM pins current->mm with mmgrab() and starts with users_count == 1.  On
* success the VM has been added to vm_list and is returned.  On failure the
* error labels at the bottom unwind the steps that had already completed, in
* reverse order, and an ERR_PTR() carrying the error code is returned.
*/
static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
struct kvm *kvm = kvm_arch_alloc_vm();
struct kvm_memslots *slots;
int r, i, j;

if (!kvm)
  return ERR_PTR(-ENOMEM);

KVM_MMU_LOCK_INIT(kvm);
mmgrab(current->mm);
kvm->mm = current->mm;
kvm_eventfd_init(kvm);
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
mutex_init(&kvm->slots_arch_lock);
spin_lock_init(&kvm->mn_invalidate_lock);
rcuwait_init(&kvm->mn_memslots_update_rcuwait);
xa_init(&kvm->vcpu_array);
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
xa_init(&kvm->mem_attr_array);
#endif

INIT_LIST_HEAD(&kvm->gpc_list);
spin_lock_init(&kvm->gpc_lock);

INIT_LIST_HEAD(&kvm->devices);
kvm->max_vcpus = KVM_MAX_VCPUS;

BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

/*
  * Force subsequent debugfs file creations to fail if the VM directory
  * is not created (by kvm_create_vm_debugfs()).
  */
kvm->debugfs_dentry = ERR_PTR(-ENOENT);

snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
   task_pid_nr(current));

r = -ENOMEM;
if (init_srcu_struct(&kvm->srcu))
  goto out_err_no_srcu;
if (init_srcu_struct(&kvm->irq_srcu))
  goto out_err_no_irq_srcu;

r = kvm_init_irq_routing(kvm);
if (r)
  goto out_err_no_irq_routing;

/* The creator holds the initial reference. */
refcount_set(&kvm->users_count, 1);

/*
  * Each address space has two memslot sets; set 0 starts out as the
  * active one.
  */
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  for (j = 0; j < 2; j++) {
   slots = &kvm->__memslots[i][j];

   atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
   slots->hva_tree = RB_ROOT_CACHED;
   slots->gfn_tree = RB_ROOT;
   hash_init(slots->id_hash);
   slots->node_idx = j;

   /* Generations must be different for each address space. */
   slots->generation = i;
  }

  rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
}

r = -ENOMEM;
for (i = 0; i < KVM_NR_BUSES; i++) {
  rcu_assign_pointer(kvm->buses[i],
   kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
  if (!kvm->buses[i])
   goto out_err_no_arch_destroy_vm;
}

r = kvm_arch_init_vm(kvm, type);
if (r)
  goto out_err_no_arch_destroy_vm;

r = kvm_enable_virtualization();
if (r)
  goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

r = kvm_init_mmu_notifier(kvm);
if (r)
  goto out_err_no_mmu_notifier;

r = kvm_coalesced_mmio_init(kvm);
if (r < 0)
  goto out_no_coalesced_mmio;

r = kvm_create_vm_debugfs(kvm, fdname);
if (r)
  goto out_err_no_debugfs;

mutex_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
mutex_unlock(&kvm_lock);

preempt_notifier_inc();
kvm_init_pm_notifier(kvm);

return kvm;

/*
* Error unwinding: each label undoes the steps that had completed before
* the corresponding failure, then falls through to the earlier labels.
*/
out_err_no_debugfs:
kvm_coalesced_mmio_free(kvm);
out_no_coalesced_mmio:
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
if (kvm->mmu_notifier.ops)
  mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
kvm_disable_virtualization();
out_err_no_disable:
kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
/* Drop the initial reference; warn if that does not bring the count to zero. */
WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
for (i = 0; i < KVM_NR_BUSES; i++)
  kfree(kvm_get_bus(kvm, i));
kvm_free_irq_routing(kvm);
out_err_no_irq_routing:
cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
kvm_arch_free_vm(kvm);
mmdrop(current->mm);
return ERR_PTR(r);
}

/*
 * Unlink every device from kvm->devices and hand it to its ops->destroy()
 * callback.
 */
static void kvm_destroy_devices(struct kvm *kvm)
{
	struct kvm_device *device, *next;

	/*
	 * kvm->lock is not taken: by now nobody else holds a reference to
	 * the struct kvm, so nobody else can reach the devices list.
	 *
	 * The list is normally handled as an rculist, yet plain list_del()
	 * is used here on purpose.  Should a KVM bug ever add a reader that
	 * is not backed by a reference on the kvm struct, the hope is that
	 * it trips over the poisoned forward pointer rather than running
	 * into a use-after-free, although that cannot be guaranteed.
	 */
	list_for_each_entry_safe(device, next, &kvm->devices, vm_node) {
		list_del(&device->vm_node);
		device->ops->destroy(device);
	}
}

/*
* Tear down @kvm and release everything it owns.  Called from kvm_put_kvm()
* once users_count has dropped to zero.
*
* kvm->mm is read into a local up front; the final mmdrop() runs after
* kvm_arch_free_vm(), presumably because @kvm must not be dereferenced once
* the arch code has freed it.
*/
static void kvm_destroy_vm(struct kvm *kvm)
{
int i;
struct mm_struct *mm = kvm->mm;

kvm_destroy_pm_notifier(kvm);
kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
kvm_destroy_vm_debugfs(kvm);
mutex_lock(&kvm_lock);
list_del(&kvm->vm_list);
mutex_unlock(&kvm_lock);
kvm_arch_pre_destroy_vm(kvm);

kvm_free_irq_routing(kvm);
/* Destroy each I/O bus that exists and clear its slot. */
for (i = 0; i < KVM_NR_BUSES; i++) {
  struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

  if (bus)
   kvm_io_bus_destroy(bus);
  kvm->buses[i] = NULL;
}
kvm_coalesced_mmio_free(kvm);
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
/*
  * At this point, pending calls to invalidate_range_start()
  * have completed but no more MMU notifiers will run, so
  * mn_active_invalidate_count may remain unbalanced.
  * No threads can be waiting in kvm_swap_active_memslots() as the
  * last reference on KVM has been dropped, but freeing
  * memslots would deadlock without this manual intervention.
  *
  * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
  * notifier between a start() and end(), then there shouldn't be any
  * in-progress invalidations.
  */
WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
if (kvm->mn_active_invalidate_count)
  kvm->mn_active_invalidate_count = 0;
else
  WARN_ON(kvm->mmu_invalidate_in_progress);
#else
kvm_flush_shadow_all(kvm);
#endif
kvm_arch_destroy_vm(kvm);
kvm_destroy_devices(kvm);
/* Free both memslot sets of every address space. */
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
  kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
}
cleanup_srcu_struct(&kvm->irq_srcu);
cleanup_srcu_struct(&kvm->srcu);
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
xa_destroy(&kvm->mem_attr_array);
#endif
kvm_arch_free_vm(kvm);
preempt_notifier_dec();
kvm_disable_virtualization();
/* Use the cached pointer, not kvm->mm. */
mmdrop(mm);
}

/*
* Take an additional reference on @kvm by incrementing users_count.  The
* reference is dropped again with kvm_put_kvm().
*/
void kvm_get_kvm(struct kvm *kvm)
{
refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

/*
* Safe variant of kvm_get_kvm(): take a reference only if users_count has not
* already reached zero, i.e. only if the VM is not being destroyed.  Returns
* true if the reference was taken, false otherwise.
*/
bool kvm_get_kvm_safe(struct kvm *kvm)
{
return refcount_inc_not_zero(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);

/*
 * Drop one reference on @kvm.  Whoever drops the last reference destroys
 * the VM.
 */
void kvm_put_kvm(struct kvm *kvm)
{
	/* Other references remain: nothing more to do. */
	if (!refcount_dec_and_test(&kvm->users_count))
		return;

	kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Drop a reference that had been taken on behalf of an object tied to a
 * user-visible file descriptor (e.g. a vcpu or a device) when installing the
 * new file descriptor failed and the reference therefore cannot be handed
 * over to its final owner.  The caller is still actively using @kvm in that
 * situation and would fail miserably if the refcount unexpectedly hit zero,
 * so reaching zero here only triggers a warning; the VM is not destroyed.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
	bool dropped_last = refcount_dec_and_test(&kvm->users_count);

	WARN_ON(dropped_last);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

/*
* Release handler for a VM file: calls kvm_irqfd_release() for the VM stored
* in filp->private_data and then drops the reference on it with
* kvm_put_kvm().  @inode is unused.  Always returns 0.
*/
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
struct kvm *kvm = filp->private_data;

kvm_irqfd_release(kvm);

/* May destroy the VM if this was the last reference. */
kvm_put_kvm(kvm);
return 0;
}

/*
 * Try to take vcpu->mutex of every vCPU of @kvm without sleeping.  The caller
 * must hold kvm->lock.
 *
 * Returns 0 with all vCPU mutexes held, or -EINTR with none of them held if
 * any single trylock failed.
 */
int kvm_trylock_all_vcpus(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu, *held;
	unsigned long failed, idx;

	lockdep_assert_held(&kvm->lock);

	kvm_for_each_vcpu(failed, vcpu, kvm) {
		if (mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
			continue;

		/* Back out: release every mutex taken before the failing one. */
		kvm_for_each_vcpu(idx, held, kvm) {
			if (idx == failed)
				break;
			mutex_unlock(&held->mutex);
		}
		return -EINTR;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_trylock_all_vcpus);

/*
 * Take vcpu->mutex of every vCPU of @kvm, using a killable lock for each.
 * The caller must hold kvm->lock.
 *
 * Returns 0 with all vCPU mutexes held.  If taking any of them fails, the
 * mutexes acquired so far are released again and the error returned by
 * mutex_lock_killable_nest_lock() is passed on.
 */
int kvm_lock_all_vcpus(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu, *held;
	unsigned long failed, idx;
	int r;

	lockdep_assert_held(&kvm->lock);

	kvm_for_each_vcpu(failed, vcpu, kvm) {
		r = mutex_lock_killable_nest_lock(&vcpu->mutex, &kvm->lock);
		if (!r)
			continue;

		/* Back out: release every mutex taken before the failing one. */
		kvm_for_each_vcpu(idx, held, kvm) {
			if (idx == failed)
				break;
			mutex_unlock(&held->mutex);
		}
		return r;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_lock_all_vcpus);

void kvm_unlock_all_vcpus(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
unsigned long i;

lockdep_assert_held(&kvm->lock);

kvm_for_each_vcpu(i, vcpu, kvm)
  mutex_unlock(&vcpu->mutex);
}
EXPORT_SYMBOL_GPL(kvm_unlock_all_vcpus);

/*
 * Allocate the dirty bitmap of @memslot.  Twice the size of the actual dirty
 * bitmap is allocated; see kvm_vm_ioctl_get_dirty_log() for why this is
 * needed.
 *
 * Returns 0 on success, -ENOMEM if the allocation failed.
 */
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	memslot->dirty_bitmap = __vcalloc(2, kvm_dirty_bitmap_bytes(memslot),
					  GFP_KERNEL_ACCOUNT);

	return memslot->dirty_bitmap ? 0 : -ENOMEM;
}

static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
{
struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
int node_idx_inactive = active->node_idx ^ 1;

return &kvm->__memslots[as_id][node_idx_inactive];
}

/*
 * Return the address space ID shared by memslots @a and @b, either of which
 * (but not both) may be NULL.  Doubles as a sanity check: warns and returns 0
 * if both pointers are NULL, and warns if the two slots disagree on their
 * address space ID (the ID of @a is returned in that case).
 */
static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
				  struct kvm_memory_slot *b)
{
	if (WARN_ON_ONCE(!a && !b))
		return 0;

	if (a && b)
		WARN_ON_ONCE(a->as_id != b->as_id);

	return a ? a->as_id : b->as_id;
}

/*
 * Insert @slot into the gfn tree of @slots, keyed by base_gfn.  The rb-node
 * used is slot->gfn_node[slots->node_idx].  Finding an existing slot with the
 * same base_gfn is a BUG().
 */
static void kvm_insert_gfn_node(struct kvm_memslots *slots,
				struct kvm_memory_slot *slot)
{
	struct rb_node **link = &slots->gfn_tree.rb_node;
	struct rb_node *parent = NULL;
	int idx = slots->node_idx;

	/* Descend to the empty link where @slot belongs. */
	while (*link) {
		struct kvm_memory_slot *cur;

		parent = *link;
		cur = container_of(parent, struct kvm_memory_slot, gfn_node[idx]);

		if (slot->base_gfn == cur->base_gfn)
			BUG();

		if (slot->base_gfn < cur->base_gfn)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	rb_link_node(&slot->gfn_node[idx], parent, link);
	rb_insert_color(&slot->gfn_node[idx], &slots->gfn_tree);
}

/* Remove @slot from the gfn tree of @slots. */
static void kvm_erase_gfn_node(struct kvm_memslots *slots,
			       struct kvm_memory_slot *slot)
{
	int idx = slots->node_idx;

	rb_erase(&slot->gfn_node[idx], &slots->gfn_tree);
}

/*
 * Swap @old for @new in the gfn tree of @slots without rebalancing.  Both
 * slots are expected to have the same base_gfn; a mismatch triggers a
 * one-time warning.
 */
static void kvm_replace_gfn_node(struct kvm_memslots *slots,
				 struct kvm_memory_slot *old,
				 struct kvm_memory_slot *new)
{
	const int idx = slots->node_idx;

	WARN_ON_ONCE(new->base_gfn != old->base_gfn);

	rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
			&slots->gfn_tree);
}

/*
* Replace @old with @new in the inactive memslots.
*
* With NULL @old this simply adds @new.
* With NULL @new this simply removes @old.
*
* If @new is non-NULL its hva_node[slots_idx] range has to be set
* appropriately.
*
* The address space is derived from whichever of @old/@new is non-NULL.
* Three structures of the inactive set are updated: the id hash, the hva
* interval tree and the gfn rb-tree, plus the cached last_used_slot.
*/
static void kvm_replace_memslot(struct kvm *kvm,
    struct kvm_memory_slot *old,
    struct kvm_memory_slot *new)
{
int as_id = kvm_memslots_get_as_id(old, new);
struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
int idx = slots->node_idx;

if (old) {
  hash_del(&old->id_node[idx]);
  interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);

  /*
   * If the cached last-used slot is @old, repoint the cache at @new
   * (which is NULL when @old is being removed).
   */
  if ((long)old == atomic_long_read(&slots->last_used_slot))
   atomic_long_set(&slots->last_used_slot, (long)new);

  /* Pure removal: drop @old from the gfn tree and stop. */
  if (!new) {
   kvm_erase_gfn_node(slots, old);
   return;
  }
}

/*
  * Initialize @new's hva range.  Do this even when replacing an @old
  * slot, kvm_copy_memslot() deliberately does not touch node data.
  */
new->hva_node[idx].start = new->userspace_addr;
new->hva_node[idx].last = new->userspace_addr +
      (new->npages << PAGE_SHIFT) - 1;

/*
  * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
  * hva_node needs to be swapped with remove+insert even though hva can't
  * change when replacing an existing slot.
  */
hash_add(slots->id_hash, &new->id_node[idx], new->id);
interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);

/*
  * If the memslot gfn is unchanged, rb_replace_node() can be used to
  * switch the node in the gfn tree instead of removing the old and
  * inserting the new as two separate operations. Replacement is a
  * single O(1) operation versus two O(log(n)) operations for
  * remove+insert.
  */
if (old && old->base_gfn == new->base_gfn) {
  kvm_replace_gfn_node(slots, old, new);
} else {
  if (old)
   kvm_erase_gfn_node(slots, old);
  kvm_insert_gfn_node(slots, new);
}
}

/*
* Memslot flags that do not access any of the extra space of struct
* kvm_userspace_memory_region2, i.e. dirty logging and read-only.  These are
* the only flags that KVM_SET_USER_MEMORY_REGION_V1_FLAGS allows.
*/
#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
(KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)

/*
 * Validate the flags of a memory region against what @kvm supports.
 *
 * Returns 0 if every flag set in mem->flags is allowed, -EINVAL otherwise.
 */
static int check_memory_region_flags(struct kvm *kvm,
				     const struct kvm_userspace_memory_region2 *mem)
{
	const u32 gmem = mem->flags & KVM_MEM_GUEST_MEMFD;
	u32 valid_flags = 0;

	/* Dirty logging private memory is not currently supported. */
	if (!gmem)
		valid_flags |= KVM_MEM_LOG_DIRTY_PAGES;

	if (kvm_arch_has_private_mem(kvm))
		valid_flags |= KVM_MEM_GUEST_MEMFD;

	/*
	 * Read-only memslots cannot be combined with GUEST_MEMFD: writes to
	 * read-only memslots have emulated MMIO, not page fault, semantics,
	 * and KVM doesn't allow emulated MMIO for private memory.
	 */
	if (kvm_arch_has_readonly_mem(kvm) && !gmem)
		valid_flags |= KVM_MEM_READONLY;

	return (mem->flags & ~valid_flags) ? -EINVAL : 0;
}

/*
* Make the currently inactive memslot set of address space @as_id the active
* one.
*
* Must be entered with kvm->slots_arch_lock held; the lock is released here
* before waiting for SRCU readers.  The new set is published only once no MMU
* notifier invalidation is in flight, and its generation is bumped after the
* SRCU grace period.
*/
static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
{
struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);

/* Grab the generation from the active memslots. */
u64 gen = __kvm_memslots(kvm, as_id)->generation;

WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

/*
  * Do not store the new memslots while there are invalidations in
  * progress, otherwise the locking in invalidate_range_start and
  * invalidate_range_end will be unbalanced.
  */
spin_lock(&kvm->mn_invalidate_lock);
prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
/* Sleep with the spinlock dropped until the count reaches zero. */
while (kvm->mn_active_invalidate_count) {
  set_current_state(TASK_UNINTERRUPTIBLE);
  spin_unlock(&kvm->mn_invalidate_lock);
  schedule();
  spin_lock(&kvm->mn_invalidate_lock);
}
finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
rcu_assign_pointer(kvm->memslots[as_id], slots);
spin_unlock(&kvm->mn_invalidate_lock);

/*
  * Acquired in kvm_set_memslot. Must be released before synchronize
  * SRCU below in order to avoid deadlock with another thread
  * acquiring the slots_arch_lock in an srcu critical section.
  */
mutex_unlock(&kvm->slots_arch_lock);

synchronize_srcu_expedited(&kvm->srcu);

/*
  * Increment the new memslot generation a second time, dropping the
  * update in-progress flag and incrementing the generation based on
  * the number of address spaces.  This provides a unique and easily
  * identifiable generation number while the memslots are in flux.
  */
gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

/*
  * Generations must be unique even across address spaces.  We do not need
  * a global counter for that, instead the generation space is evenly split
  * across address spaces.  For example, with two address spaces, address
  * space 0 will use generations 0, 2, 4, ... while address space 1 will
  * use generations 1, 3, 5, ...
  */
gen += kvm_arch_nr_memslot_as_ids(kvm);

/* Notify arch code before the new generation is stored. */
kvm_arch_memslots_updated(kvm, gen);

slots->generation = gen;
}

/*
 * Prepare @new for a memslot change: settle its dirty bitmap, then let the
 * architecture prepare the region.
 *
 * Returns 0 on success or a negative error code.  A dirty bitmap that was
 * allocated here is freed again if the arch hook fails.
 */
static int kvm_prepare_memory_region(struct kvm *kvm,
				     const struct kvm_memory_slot *old,
				     struct kvm_memory_slot *new,
				     enum kvm_mr_change change)
{
	int r;

	/*
	 * Dirty bitmap handling, skipped for DELETE:
	 *  - logging disabled in @new: no bitmap; the old one is freed on
	 *    "commit";
	 *  - @old already has a bitmap: reuse it;
	 *  - otherwise, unless KVM is not using a dirty bitmap: allocate and
	 *    initialize a fresh one.
	 */
	if (change != KVM_MR_DELETE) {
		if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
			new->dirty_bitmap = NULL;
		} else if (old && old->dirty_bitmap) {
			new->dirty_bitmap = old->dirty_bitmap;
		} else if (kvm_use_dirty_bitmap(kvm)) {
			r = kvm_alloc_dirty_bitmap(new);
			if (r)
				return r;

			if (kvm_dirty_log_manual_protect_and_init_set(kvm))
				bitmap_set(new->dirty_bitmap, 0, new->npages);
		}
	}

	r = kvm_arch_prepare_memory_region(kvm, old, new, change);
	if (!r)
		return 0;

	/* Failure: release the bitmap only if it was allocated above. */
	if (new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
		kvm_destroy_dirty_bitmap(new);

	return r;
}

/*
 * Finish a memslot change after the new memslots have been installed.
 *
 * @old:    the previous slot, NULL for KVM_MR_CREATE.
 * @new:    the new slot, NULL for KVM_MR_DELETE.
 * @change: the kind of change being committed.
 *
 * Updates the page and dirty-logging accounting, calls the arch commit hook
 * and finally frees whatever part of @old is no longer needed.
 */
static void kvm_commit_memory_region(struct kvm *kvm,
				     struct kvm_memory_slot *old,
				     const struct kvm_memory_slot *new,
				     enum kvm_mr_change change)
{
	int old_flags = old ? old->flags : 0;
	int new_flags = new ? new->flags : 0;

	/*
	 * Update the total number of memslot pages before calling the arch
	 * hook so that architectures can consume the result directly.
	 */
	if (change == KVM_MR_DELETE)
		kvm->nr_memslot_pages -= old->npages;
	else if (change == KVM_MR_CREATE)
		kvm->nr_memslot_pages += new->npages;

	/*
	 * Dirty logging was toggled: adjust the count of memslots that have
	 * it enabled.  The adjustment is deliberately not named "change", so
	 * that it does not shadow the function parameter used by the switch
	 * statement below.
	 */
	if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
		int delta = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;

		atomic_set(&kvm->nr_memslots_dirty_logging,
			   atomic_read(&kvm->nr_memslots_dirty_logging) + delta);
	}

	kvm_arch_commit_memory_region(kvm, old, new, change);

	switch (change) {
	case KVM_MR_CREATE:
		/* Nothing more to do. */
		break;
	case KVM_MR_DELETE:
		/* Free the old memslot and all its metadata. */
		kvm_free_memslot(kvm, old);
		break;
	case KVM_MR_MOVE:
	case KVM_MR_FLAGS_ONLY:
		/*
		 * Free the dirty bitmap as needed; the below check encompasses
		 * both the flags and whether a ring buffer is being used.
		 */
		if (old->dirty_bitmap && !new->dirty_bitmap)
			kvm_destroy_dirty_bitmap(old);

		/*
		 * The final quirk.  Free the detached, old slot, but only its
		 * memory, not any metadata.  Metadata, including arch specific
		 * data, may be reused by @new.
		 */
		kfree(old);
		break;
	default:
		BUG();
	}
}

/*
 * Activate @new.  The caller must already have installed @new in the
 * inactive memslots.  The active and inactive sets are swapped first; once
 * that is done the formerly active set is unreachable, so @old can safely be
 * replaced by @new there as well.
 *
 * A NULL @old means @new is simply added to both sets.  A NULL @new means
 * @old is simply removed from both sets.
 */
static void kvm_activate_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *old,
				 struct kvm_memory_slot *new)
{
	kvm_swap_active_memslots(kvm, kvm_memslots_get_as_id(old, new));

	/* Mirror the change into the set that has just become inactive. */
	kvm_replace_memslot(kvm, old, new);
}

/*
 * Copy the payload fields of @src into @dest.  Only the fields listed below
 * are copied; the tree/hash node members (hva_node, gfn_node, id_node) of
 * @dest are left untouched.
 */
static void kvm_copy_memslot(struct kvm_memory_slot *dest,
			     const struct kvm_memory_slot *src)
{
	/* Identity. */
	dest->as_id = src->as_id;
	dest->id = src->id;

	/* Guest and host ranges. */
	dest->base_gfn = src->base_gfn;
	dest->npages = src->npages;
	dest->userspace_addr = src->userspace_addr;

	/* Attributes and associated metadata. */
	dest->flags = src->flags;
	dest->dirty_bitmap = src->dirty_bitmap;
	dest->arch = src->arch;
}

/*
* Replace @old with an INVALID copy of itself (@invalid_slot, supplied by the
* caller) and make that copy active, then flush the arch mappings for @old.
*
* Entered with kvm->slots_arch_lock held.  The lock is dropped by
* kvm_swap_active_memslots() and re-acquired before returning, so it is held
* again on return.
*/
static void kvm_invalidate_memslot(struct kvm *kvm,
       struct kvm_memory_slot *old,
       struct kvm_memory_slot *invalid_slot)
{
/*
  * Mark the current slot INVALID.  As with all memslot modifications,
  * this must be done on an unreachable slot to avoid modifying the
  * current slot in the active tree.
  */
kvm_copy_memslot(invalid_slot, old);
invalid_slot->flags |= KVM_MEMSLOT_INVALID;
kvm_replace_memslot(kvm, old, invalid_slot);

/*
  * Activate the slot that is now marked INVALID, but don't propagate
  * the slot to the now inactive slots. The slot is either going to be
  * deleted or recreated as a new slot.
  */
kvm_swap_active_memslots(kvm, old->as_id);

/*
  * From this point no new shadow pages pointing to a deleted, or moved,
  * memslot will be created.  Validation of sp->gfn happens in:
  * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
  * - kvm_is_visible_gfn (mmu_check_root)
  */
kvm_arch_flush_shadow_memslot(kvm, old);
kvm_arch_guest_memory_reclaimed(kvm);

/* Was released by kvm_swap_active_memslots(), reacquire. */
mutex_lock(&kvm->slots_arch_lock);

/*
  * Copy the arch-specific field of the newly-installed slot back to the
  * old slot as the arch data could have changed between releasing
  * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
  * above.  Writers are required to retrieve memslots *after* acquiring
  * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
  */
old->arch = invalid_slot->arch;
}

/*
* Install @new as a brand-new memslot.  It is first added to the inactive
* set; kvm_activate_memslot() then swaps the sets and adds @new to the other
* set as well.  The order of the two calls matters.
*/
static void kvm_create_memslot(struct kvm *kvm,
          struct kvm_memory_slot *new)
{
/* Add the new memslot to the inactive set and activate. */
kvm_replace_memslot(kvm, NULL, new);
kvm_activate_memslot(kvm, NULL, new);
}

/*
* Remove a memslot from both memslot sets.  @old is the original slot, still
* present in the inactive set; @invalid_slot is its INVALID stand-in in the
* active set.  Neither slot is freed here.
*/
static void kvm_delete_memslot(struct kvm *kvm,
          struct kvm_memory_slot *old,
          struct kvm_memory_slot *invalid_slot)
{
/*
  * Remove the old memslot (in the inactive memslots) by passing NULL as
  * the "new" slot, and for the invalid version in the active slots.
  */
kvm_replace_memslot(kvm, old, NULL);
kvm_activate_memslot(kvm, invalid_slot, NULL);
}

/*
* Complete a MOVE: @new takes the place of @old in the inactive set and,
* after the sets have been swapped, the place of @invalid_slot in the other
* set.  Neither @old nor @invalid_slot is freed here.
*/
static void kvm_move_memslot(struct kvm *kvm,
        struct kvm_memory_slot *old,
        struct kvm_memory_slot *new,
        struct kvm_memory_slot *invalid_slot)
{
/*
  * Replace the old memslot in the inactive slots, and then swap slots
  * and replace the current INVALID with the new as well.
  */
kvm_replace_memslot(kvm, old, new);
kvm_activate_memslot(kvm, invalid_slot, new);
}

/*
* Complete a FLAGS_ONLY change: @new replaces @old in the inactive set, the
* sets are swapped, and @new then replaces @old in the other set too.  @old
* is not freed here.
*/
static void kvm_update_flags_memslot(struct kvm *kvm,
         struct kvm_memory_slot *old,
         struct kvm_memory_slot *new)
{
/*
  * Similar to the MOVE case, but the slot doesn't need to be zapped as
  * an intermediate step. Instead, the old memslot is simply replaced
  * with a new, updated copy in both memslot sets.
  */
kvm_replace_memslot(kvm, old, new);
kvm_activate_memslot(kvm, old, new);
}

/*
* Apply one memslot change (CREATE, DELETE, MOVE or FLAGS_ONLY).
*
* @old: the existing slot, NULL for CREATE.
* @new: the replacement slot, NULL for DELETE.
*
* Takes kvm->slots_arch_lock; on every path the lock has been released again
* by the time the function returns, either directly or through
* kvm_swap_active_memslots().
*
* Returns 0 on success.  Returns -ENOMEM if the temporary INVALID slot cannot
* be allocated, or the error from kvm_prepare_memory_region(); in the latter
* case an invalidation done for DELETE/MOVE is reverted first.
*/
static int kvm_set_memslot(struct kvm *kvm,
      struct kvm_memory_slot *old,
      struct kvm_memory_slot *new,
      enum kvm_mr_change change)
{
struct kvm_memory_slot *invalid_slot;
int r;

/*
  * Released in kvm_swap_active_memslots().
  *
  * Must be held from before the current memslots are copied until after
  * the new memslots are installed with rcu_assign_pointer, then
  * released before the synchronize srcu in kvm_swap_active_memslots().
  *
  * When modifying memslots outside of the slots_lock, must be held
  * before reading the pointer to the current memslots until after all
  * changes to those memslots are complete.
  *
  * These rules ensure that installing new memslots does not lose
  * changes made to the previous memslots.
  */
mutex_lock(&kvm->slots_arch_lock);

/*
  * Invalidate the old slot if it's being deleted or moved.  This is
  * done prior to actually deleting/moving the memslot to allow vCPUs to
  * continue running by ensuring there are no mappings or shadow pages
  * for the memslot when it is deleted/moved.  Without pre-invalidation
  * (and without a lock), a window would exist between effecting the
  * delete/move and committing the changes in arch code where KVM or a
  * guest could access a non-existent memslot.
  *
  * Modifications are done on a temporary, unreachable slot.  The old
  * slot needs to be preserved in case a later step fails and the
  * invalidation needs to be reverted.
  */
if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
  invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
  if (!invalid_slot) {
   mutex_unlock(&kvm->slots_arch_lock);
   return -ENOMEM;
  }
  kvm_invalidate_memslot(kvm, old, invalid_slot);
}

r = kvm_prepare_memory_region(kvm, old, new, change);
if (r) {
  /*
   * For DELETE/MOVE, revert the above INVALID change.  No
   * modifications required since the original slot was preserved
   * in the inactive slots.  Changing the active memslots also
   * releases slots_arch_lock.
   */
  if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
   kvm_activate_memslot(kvm, invalid_slot, old);
   kfree(invalid_slot);
  } else {
   mutex_unlock(&kvm->slots_arch_lock);
  }
  return r;
}

/*
  * For DELETE and MOVE, invalid_slot is now active as the INVALID
  * version of the old slot, and the old slot itself is preserved in the
  * inactive set.  For CREATE, there is no old slot.  For FLAGS_ONLY, the
  * old slot is replaced by the new slot in both memslot sets.
  */
if (change == KVM_MR_CREATE)
  kvm_create_memslot(kvm, new);
else if (change == KVM_MR_DELETE)
  kvm_delete_memslot(kvm, old, invalid_slot);
else if (change == KVM_MR_MOVE)
  kvm_move_memslot(kvm, old, new, invalid_slot);
else if (change == KVM_MR_FLAGS_ONLY)
  kvm_update_flags_memslot(kvm, old, new);
else
  BUG();

/* Free the temporary INVALID slot used for DELETE and MOVE. */
if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
  kfree(invalid_slot);

/*
  * No need to refresh new->arch, changes after dropping slots_arch_lock
  * will directly hit the final, active memslot.  Architectures are
  * responsible for knowing that new->arch may be stale.
  */
kvm_commit_memory_region(kvm, old, new, change);

return 0;
}

/*
 * Walk the memslots of @slots in the gfn range given by @start and @end and
 * report whether any of them belongs to a slot other than @id.
 *
 * Returns true if such a slot exists, false otherwise.
 */
static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
				      gfn_t start, gfn_t end)
{
	struct kvm_memslot_iter it;
	bool overlaps = false;

	kvm_for_each_memslot_in_gfn_range(&it, slots, start, end) {
		/* The slot being changed does not conflict with itself. */
		if (it.slot->id == id)
			continue;

		overlaps = true;
		break;
	}

	return overlaps;
}

/*
* Validate a memory region request and apply it as a memslot CREATE, DELETE,
* MOVE or FLAGS_ONLY change.
*
* mem->slot encodes the address space ID in its upper 16 bits and the slot ID
* in its lower 16 bits.  A memory_size of zero requests deletion of an
* existing slot.
*
* The caller must hold kvm->slots_lock.
*
* Returns 0 on success (including when nothing needs to change), -EINVAL for
* an invalid request, -EEXIST if a created or moved slot would overlap another
* slot, -ENOMEM on allocation failure, -EIO if the page accounting is found to
* be inconsistent on delete, or an error from kvm_gmem_bind() or
* kvm_set_memslot().
*/
static int kvm_set_memory_region(struct kvm *kvm,
     const struct kvm_userspace_memory_region2 *mem)
{
struct kvm_memory_slot *old, *new;
struct kvm_memslots *slots;
enum kvm_mr_change change;
unsigned long npages;
gfn_t base_gfn;
int as_id, id;
int r;

lockdep_assert_held(&kvm->slots_lock);

r = check_memory_region_flags(kvm, mem);
if (r)
  return r;

as_id = mem->slot >> 16;
id = (u16)mem->slot;

/* General sanity checks */
/* Size must be page aligned and representable as an unsigned long. */
if ((mem->memory_size & (PAGE_SIZE - 1)) ||
     (mem->memory_size != (unsigned long)mem->memory_size))
  return -EINVAL;
if (mem->guest_phys_addr & (PAGE_SIZE - 1))
  return -EINVAL;
/* We can read the guest memory with __xxx_user() later on. */
if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
     (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
      !access_ok((void __user *)(unsigned long)mem->userspace_addr,
   mem->memory_size))
  return -EINVAL;
/* guest_memfd offset must be page aligned and offset + size must not wrap. */
if (mem->flags & KVM_MEM_GUEST_MEMFD &&
     (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
      mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
  return -EINVAL;
if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
  return -EINVAL;
/* guest_phys_addr + memory_size must not wrap. */
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
  return -EINVAL;

/*
  * The size of userspace-defined memory regions is restricted in order
  * to play nice with dirty bitmap operations, which are indexed with an
  * "unsigned int".  KVM's internal memory regions don't support dirty
  * logging, and so are exempt.
  */
if (id < KVM_USER_MEM_SLOTS &&
     (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
  return -EINVAL;

slots = __kvm_memslots(kvm, as_id);

/*
  * Note, the old memslot (and the pointer itself!) may be invalidated
  * and/or destroyed by kvm_set_memslot().
  */
old = id_to_memslot(slots, id);

/* Zero size: delete the slot, which must exist and be non-empty. */
if (!mem->memory_size) {
  if (!old || !old->npages)
   return -EINVAL;

  if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
   return -EIO;

  return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
}

base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
npages = (mem->memory_size >> PAGE_SHIFT);

if (!old || !old->npages) {
  change = KVM_MR_CREATE;

  /*
   * To simplify KVM internals, the total number of pages across
   * all memslots must fit in an unsigned long.
   */
  if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
   return -EINVAL;
} else { /* Modify an existing slot. */
  /* Private memslots are immutable, they can only be deleted. */
  if (mem->flags & KVM_MEM_GUEST_MEMFD)
   return -EINVAL;
  /* Host address, size and the READONLY flag cannot be changed. */
  if ((mem->userspace_addr != old->userspace_addr) ||
      (npages != old->npages) ||
      ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
   return -EINVAL;

  if (base_gfn != old->base_gfn)
   change = KVM_MR_MOVE;
  else if (mem->flags != old->flags)
   change = KVM_MR_FLAGS_ONLY;
  else /* Nothing to change. */
   return 0;
}

if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
     kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
  return -EEXIST;

/* Allocate a slot that will persist in the memslot. */
new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
if (!new)
  return -ENOMEM;

new->as_id = as_id;
new->id = id;
new->base_gfn = base_gfn;
new->npages = npages;
new->flags = mem->flags;
new->userspace_addr = mem->userspace_addr;
if (mem->flags & KVM_MEM_GUEST_MEMFD) {
  r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
  if (r)
   goto out;
}

r = kvm_set_memslot(kvm, old, new, change);
if (r)
  goto out_unbind;

return 0;

/* Failure after a successful bind: undo the bind before freeing @new. */
out_unbind:
if (mem->flags & KVM_MEM_GUEST_MEMFD)
  kvm_gmem_unbind(new);
out:
kfree(new);
return r;
}

/*
 * Set a KVM-internal memslot.  Only slot numbers at or above
 * KVM_USER_MEM_SLOTS and regions without any flags are accepted; anything
 * else triggers a one-time warning and fails with -EINVAL.  Otherwise the
 * result of kvm_set_memory_region() is returned.
 */
int kvm_set_internal_memslot(struct kvm *kvm,
			     const struct kvm_userspace_memory_region2 *mem)
{
	int r = -EINVAL;

	if (!WARN_ON_ONCE(mem->slot < KVM_USER_MEM_SLOTS) &&
	    !WARN_ON_ONCE(mem->flags))
		r = kvm_set_memory_region(kvm, mem);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_internal_memslot);

static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
       struct kvm_userspace_memory_region2 *mem)
{
if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
  return -EINVAL;

guard(mutex)(&kvm->slots_lock);
return kvm_set_memory_region(kvm, mem);
}

#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
 * kvm_get_dirty_log - get a snapshot of dirty pages
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 * @is_dirty:	set to '1' if any dirty pages were found
 * @memslot:	set to the associated memslot, always valid on success
 *
 * Return: 0 on success, -ENXIO if KVM is not using a dirty bitmap, -EINVAL
 * for a bad address space or slot id, -ENOENT if the slot does not exist or
 * has no dirty bitmap, -EFAULT if the bitmap cannot be copied to userspace.
 */
int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
		      int *is_dirty, struct kvm_memory_slot **memslot)
{
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	unsigned long nbytes;
	int as_id, id, word;
	int found = 0;

	/* Dirty ring tracking may be exclusive to dirty log tracking */
	if (!kvm_use_dirty_bitmap(kvm))
		return -ENXIO;

	*memslot = NULL;
	*is_dirty = 0;

	/* Upper 16 bits: address space, lower 16 bits: slot id. */
	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	slot = id_to_memslot(slots, id);
	*memslot = slot;
	if (!slot || !slot->dirty_bitmap)
		return -ENOENT;

	kvm_arch_sync_dirty_log(kvm, slot);

	nbytes = kvm_dirty_bitmap_bytes(slot);

	/* Look for the first non-zero word before copying the bitmap out. */
	for (word = 0; word < nbytes / sizeof(long); ++word) {
		if (slot->dirty_bitmap[word]) {
			found = 1;
			break;
		}
	}

	if (copy_to_user(log->dirty_bitmap, slot->dirty_bitmap, nbytes))
		return -EFAULT;

	if (found)
		*is_dirty = 1;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log);

#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
/**
* kvm_get_dirty_log_protect - get a snapshot of dirty pages
* and reenable dirty page tracking for the corresponding pages.
* @kvm: pointer to kvm instance
* @log: slot id and address to which we copy the log
*
* We need to keep it in mind that VCPU threads can write to the bitmap
* concurrently. So, to avoid losing track of dirty pages we keep the
* following order:
*
*    1. Take a snapshot of the bit and clear it if needed.
*    2. Write protect the corresponding page.
*    3. Copy the snapshot to the userspace.
*    4. Upon return caller flushes TLB's if needed.
*
* Between 2 and 4, the guest may write to the page using the remaining TLB
* entry.  This is not a problem because the page is reported dirty using
* the snapshot taken before and step 4 ensures that writes done after
* exiting to userspace will be logged for the next call.
*
* Return: 0 on success, -ENXIO if KVM is not using a dirty bitmap, -EINVAL
* for a bad address space or slot id, -ENOENT if the slot does not exist or
* has no dirty bitmap, -EFAULT if the snapshot cannot be copied to userspace.
*/
static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i, as_id, id;
unsigned long n;
unsigned long *dirty_bitmap;
unsigned long *dirty_bitmap_buffer;
bool flush;

/* Dirty ring tracking may be exclusive to dirty log tracking */
if (!kvm_use_dirty_bitmap(kvm))
  return -ENXIO;

/* Upper 16 bits: address space, lower 16 bits: slot id. */
as_id = log->slot >> 16;
id = (u16)log->slot;
if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
  return -EINVAL;

slots = __kvm_memslots(kvm, as_id);
memslot = id_to_memslot(slots, id);
if (!memslot || !memslot->dirty_bitmap)
  return -ENOENT;

dirty_bitmap = memslot->dirty_bitmap;

kvm_arch_sync_dirty_log(kvm, memslot);

n = kvm_dirty_bitmap_bytes(memslot);
flush = false;
if (kvm->manual_dirty_log_protect) {
  /*
   * Unlike in the branch below, flush is always left false here,
   * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
   * is some code duplication between this function and
   * kvm_get_dirty_log, but hopefully all architectures
   * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
   * can be eliminated.
   */
  dirty_bitmap_buffer = dirty_bitmap;
} else {
  /* The snapshot is built in the second half of the allocation. */
  dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
  memset(dirty_bitmap_buffer, 0, n);

  KVM_MMU_LOCK(kvm);
  for (i = 0; i < n / sizeof(long); i++) {
   unsigned long mask;
   gfn_t offset;

   if (!dirty_bitmap[i])
    continue;

   flush = true;
   /* Atomically fetch and clear the word (step 1). */
   mask = xchg(&dirty_bitmap[i], 0);
   dirty_bitmap_buffer[i] = mask;

   /* Re-arm dirty tracking for the pages in this word (step 2). */
   offset = i * BITS_PER_LONG;
   kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
        offset, mask);
  }
  KVM_MMU_UNLOCK(kvm);
}

if (flush)
  kvm_flush_remote_tlbs_memslot(kvm, memslot);

/* Step 3: hand the snapshot to userspace. */
if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
  return -EFAULT;
return 0;
}

/**
* kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
* @kvm: kvm instance
* @log: slot id and address to which we copy the log
*
* Steps 1-4 below provide general overview of dirty page logging. See
* kvm_get_dirty_log_protect() function description for additional details.
*
* We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
* always flush the TLB (step 4) even if the previous step failed and the dirty
* bitmap may be corrupt. Regardless of previous outcome the KVM logging API
* does not preclude user space subsequent dirty log read. Flushing TLB ensures
--> --------------------

--> maximum size reached

--> --------------------

SSL kvm_main.c Sprache: unbekannt

[ Verzeichnis aufwärts0.71unsichere Verbindung ]