/* drivers/nvme/host/core.c — Linux 6.17.9 */

 
// SPDX-License-Identifier: GPL-2.0
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 */


#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-integrity.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/pm_qos.h>
#include <linux/ratelimit.h>
#include <linux/unaligned.h>

#include "nvme.h"
#include "fabrics.h"
#include <linux/nvme-auth.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#define NVME_MINORS  (1U << MINORBITS)

/*
 * Per-namespace attributes and identifiers collected while scanning a
 * controller's namespaces.  (NOTE(review): field semantics inferred from
 * names where the producing code is outside this chunk — confirm against
 * the Identify-parsing helpers.)
 */
struct nvme_ns_info {
 struct nvme_ns_ids ids; /* unique namespace identifiers */
 u32 nsid;  /* namespace ID */
 __le32 anagrpid; /* ANA group ID, kept in wire (little-endian) format */
 u8 pi_offset;  /* protection information offset */
 u16 endgid;  /* endurance group ID */
 u64 runs;
 bool is_shared;
 bool is_readonly;
 bool is_ready;
 bool is_removed;
 bool is_rotational;
 bool no_vwc;  /* namespace reports no volatile write cache */
};

/*
 * Module parameters.  Those with 0644 permissions are also writable at
 * runtime through sysfs; 0444 ones are load-time only.
 */
unsigned int admin_timeout = 60;
module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);

unsigned int nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);

static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static u8 nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");

static unsigned long default_ps_max_latency_us = 100000;
module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us,
   "max power saving latency for new devices; use PM QOS to change per device");

static bool force_apst;
module_param(force_apst, bool, 0644);
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");

/* APST (Autonomous Power State Transition) heuristic tunables. */
static unsigned long apst_primary_timeout_ms = 100;
module_param(apst_primary_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(apst_primary_timeout_ms,
 "primary APST timeout in ms");

static unsigned long apst_secondary_timeout_ms = 2000;
module_param(apst_secondary_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(apst_secondary_timeout_ms,
 "secondary APST timeout in ms");

static unsigned long apst_primary_latency_tol_us = 15000;
module_param(apst_primary_latency_tol_us, ulong, 0644);
MODULE_PARM_DESC(apst_primary_latency_tol_us,
 "primary APST latency tolerance in us");

static unsigned long apst_secondary_latency_tol_us = 100000;
module_param(apst_secondary_latency_tol_us, ulong, 0644);
MODULE_PARM_DESC(apst_secondary_latency_tol_us,
 "secondary APST latency tolerance in us");

/*
 * Older kernels didn't enable protection information if it was at an offset.
 * Newer kernels do, so it breaks reads on the upgrade if such formats were
 * used in prior kernels since the metadata written did not contain a valid
 * checksum.
 */

static bool disable_pi_offsets = false;
module_param(disable_pi_offsets, bool, 0444);
MODULE_PARM_DESC(disable_pi_offsets,
 "disable protection information if it has an offset");

/*
 * nvme_wq - hosts nvme related works that are not reset or delete
 * nvme_reset_wq - hosts nvme reset works
 * nvme_delete_wq - hosts nvme delete works
 *
 * nvme_wq will host works such as scan, aen handling, fw activation,
 * keep-alive, periodic reconnects etc. nvme_reset_wq
 * runs reset works which also flush works hosted on nvme_wq for
 * serialization purposes. nvme_delete_wq host controller deletion
 * works which flush reset works for serialization.
 */

struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);

struct workqueue_struct *nvme_reset_wq;
EXPORT_SYMBOL_GPL(nvme_reset_wq);

struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);

/* All known NVMe subsystems, protected by nvme_subsystems_lock. */
static LIST_HEAD(nvme_subsystems);
DEFINE_MUTEX(nvme_subsystems_lock);

/* Controller instance numbers and the char-device region/classes. */
static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
static const struct class nvme_class = {
 .name = "nvme",
 .dev_uevent = nvme_class_uevent,
};

static const struct class nvme_subsys_class = {
 .name = "nvme-subsystem",
};

/* Minor numbers and class for the per-namespace generic char devices. */
static DEFINE_IDA(nvme_ns_chr_minor_ida);
static dev_t nvme_ns_chr_devt;
static const struct class nvme_ns_chr_class = {
 .name = "nvme-generic",
};

static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
        unsigned nsid);
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
       struct nvme_command *cmd);
static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
  u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);

/*
 * Schedule the controller's namespace scan work on nvme_wq.
 */
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
 /*
  * Only queue new scan work when admin and IO queues are both alive
  * (LIVE state and an allocated I/O tagset).
  */
 if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
  queue_work(nvme_wq, &ctrl->scan_work);
}

/*
 * Use this function to proceed with scheduling reset_work for a controller
 * that had previously been set to the resetting state. This is intended for
 * code paths that can't be interrupted by other reset attempts. A hot removal
 * may prevent this from succeeding.
 */

int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
{
 /*
  * Schedule only if we are still RESETTING and the work is not already
  * pending; any other combination means someone else owns the reset.
  */
 if (nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING &&
     queue_work(nvme_reset_wq, &ctrl->reset_work))
  return 0;
 return -EBUSY;
}
EXPORT_SYMBOL_GPL(nvme_try_sched_reset);

/*
 * Delayed work: once the fast_io_fail timeout expires while the controller
 * is still CONNECTING, mark fail-fast as expired and kick the multipath
 * requeue lists so queued I/O can fail over or error out.
 */
static void nvme_failfast_work(struct work_struct *work)
{
 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
   struct nvme_ctrl, failfast_work);

 /* The controller recovered (or died) before the timeout fired. */
 if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
  return;

 set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 dev_info(ctrl->device, "failfast expired\n");
 nvme_kick_requeue_lists(ctrl);
}

/* Arm the fail-fast timer; -1 means "never fail fast". */
static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
{
 if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
  return;

 schedule_delayed_work(&ctrl->failfast_work,
         ctrl->opts->fast_io_fail_tmo * HZ);
}

/* Cancel the fail-fast timer and clear any recorded expiry. */
static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
{
 if (!ctrl->opts)
  return;

 cancel_delayed_work_sync(&ctrl->failfast_work);
 clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
}


/*
 * Move the controller to RESETTING and schedule its reset work.
 * Returns -EBUSY if the state change is not permitted or the work was
 * already pending.
 */
int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) &&
     queue_work(nvme_reset_wq, &ctrl->reset_work))
  return 0;
 return -EBUSY;
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl);

/*
 * Synchronous variant: schedule a reset and wait for it to finish.
 * Returns -ENETRESET if the controller did not come back LIVE.
 */
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
{
 int ret = nvme_reset_ctrl(ctrl);

 if (ret)
  return ret;

 flush_work(&ctrl->reset_work);
 if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
  return -ENETRESET;
 return 0;
}

/*
 * Tear down a controller: wait out any in-flight reset, stop background
 * work, remove namespaces, then hand off to the transport's delete_ctrl
 * and uninitialize.  Ordering matters: namespaces must be gone before the
 * transport frees its resources.
 */
static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
{
 dev_info(ctrl->device,
   "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));

 flush_work(&ctrl->reset_work);
 nvme_stop_ctrl(ctrl);
 nvme_remove_namespaces(ctrl);
 ctrl->ops->delete_ctrl(ctrl);
 nvme_uninit_ctrl(ctrl);
}

/* Work wrapper around nvme_do_delete_ctrl() for nvme_delete_wq. */
static void nvme_delete_ctrl_work(struct work_struct *work)
{
 struct nvme_ctrl *ctrl =
  container_of(work, struct nvme_ctrl, delete_work);

 nvme_do_delete_ctrl(ctrl);
}

/* Asynchronously delete a controller; -EBUSY if already being deleted. */
int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
  return -EBUSY;
 if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
  return -EBUSY;
 return 0;
}
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);

/* Synchronously delete a controller in the caller's context. */
void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
{
 /*
  * Keep a reference until nvme_do_delete_ctrl() completes, since
  * ->delete_ctrl can free the controller.
  */
 nvme_get_ctrl(ctrl);
 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
  nvme_do_delete_ctrl(ctrl);
 nvme_put_ctrl(ctrl);
}

/*
 * Translate an NVMe completion status (SCT+SC bits) into the block
 * layer's blk_status_t so upper layers see a generic error class.
 * Anything unrecognized maps to BLK_STS_IOERR.
 */
static blk_status_t nvme_error_status(u16 status)
{
 switch (status & NVME_SCT_SC_MASK) {
 case NVME_SC_SUCCESS:
  return BLK_STS_OK;
 case NVME_SC_CAP_EXCEEDED:
  return BLK_STS_NOSPC;
 case NVME_SC_LBA_RANGE:
 case NVME_SC_CMD_INTERRUPTED:
 case NVME_SC_NS_NOT_READY:
  return BLK_STS_TARGET;
 case NVME_SC_BAD_ATTRIBUTES:
 case NVME_SC_INVALID_OPCODE:
 case NVME_SC_INVALID_FIELD:
 case NVME_SC_INVALID_NS:
  return BLK_STS_NOTSUPP;
 case NVME_SC_WRITE_FAULT:
 case NVME_SC_READ_ERROR:
 case NVME_SC_UNWRITTEN_BLOCK:
 case NVME_SC_ACCESS_DENIED:
 case NVME_SC_READ_ONLY:
 case NVME_SC_COMPARE_FAILED:
  return BLK_STS_MEDIUM;
 case NVME_SC_GUARD_CHECK:
 case NVME_SC_APPTAG_CHECK:
 case NVME_SC_REFTAG_CHECK:
 case NVME_SC_INVALID_PI:
  return BLK_STS_PROTECTION;
 case NVME_SC_RESERVATION_CONFLICT:
  return BLK_STS_RESV_CONFLICT;
 case NVME_SC_HOST_PATH_ERROR:
  return BLK_STS_TRANSPORT;
 case NVME_SC_ZONE_TOO_MANY_ACTIVE:
  return BLK_STS_ZONE_ACTIVE_RESOURCE;
 case NVME_SC_ZONE_TOO_MANY_OPEN:
  return BLK_STS_ZONE_OPEN_RESOURCE;
 default:
  return BLK_STS_IOERR;
 }
}

/*
 * Requeue a failed request for another attempt, honoring the Command
 * Retry Delay (CRD) the controller asked for: the CRD field selects one
 * of the controller's crdt[] entries, which are in units of 100 ms.
 */
static void nvme_retry_req(struct request *req)
{
 unsigned long delay = 0;
 u16 crd;

 /* The mask and shift result must be <= 3 */
 crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11;
 if (crd)
  delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;

 nvme_req(req)->retries++;
 blk_mq_requeue_request(req, false);
 blk_mq_delay_kick_requeue_list(req->q, delay);
}

/*
 * Rate-limited error log for a failed non-passthrough command.  Uses the
 * I/O opcode table when a namespace is attached to the queue, otherwise
 * falls back to an admin-style line on the controller device.
 */
static void nvme_log_error(struct request *req)
{
 struct nvme_ns *ns = req->q->queuedata;
 struct nvme_request *nr = nvme_req(req);

 if (ns) {
  pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
         ns->disk ? ns->disk->disk_name : "?",
         nvme_get_opcode_str(nr->cmd->common.opcode),
         nr->cmd->common.opcode,
         nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
         blk_rq_bytes(req) >> ns->head->lba_shift,
         nvme_get_error_status_str(nr->status),
         NVME_SCT(nr->status),  /* Status Code Type */
         nr->status & NVME_SC_MASK, /* Status Code */
         nr->status & NVME_STATUS_MORE ? "MORE " : "",
         nr->status & NVME_STATUS_DNR  ? "DNR "  : "");
  return;
 }

 pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
      dev_name(nr->ctrl->device),
      nvme_get_admin_opcode_str(nr->cmd->common.opcode),
      nr->cmd->common.opcode,
      nvme_get_error_status_str(nr->status),
      NVME_SCT(nr->status), /* Status Code Type */
      nr->status & NVME_SC_MASK, /* Status Code */
      nr->status & NVME_STATUS_MORE ? "MORE " : "",
      nr->status & NVME_STATUS_DNR  ? "DNR "  : "");
}

/*
 * Rate-limited error log for a failed passthrough command; additionally
 * dumps cdw10..cdw15 so the full user-supplied command can be diagnosed.
 */
static void nvme_log_err_passthru(struct request *req)
{
 struct nvme_ns *ns = req->q->queuedata;
 struct nvme_request *nr = nvme_req(req);

 pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s"
  "cdw10=0x%x cdw11=0x%x cdw12=0x%x cdw13=0x%x cdw14=0x%x cdw15=0x%x\n",
  ns ? ns->disk->disk_name : dev_name(nr->ctrl->device),
  ns ? nvme_get_opcode_str(nr->cmd->common.opcode) :
       nvme_get_admin_opcode_str(nr->cmd->common.opcode),
  nr->cmd->common.opcode,
  nvme_get_error_status_str(nr->status),
  NVME_SCT(nr->status),  /* Status Code Type */
  nr->status & NVME_SC_MASK, /* Status Code */
  nr->status & NVME_STATUS_MORE ? "MORE " : "",
  nr->status & NVME_STATUS_DNR  ? "DNR "  : "",
  le32_to_cpu(nr->cmd->common.cdw10),
  le32_to_cpu(nr->cmd->common.cdw11),
  le32_to_cpu(nr->cmd->common.cdw12),
  le32_to_cpu(nr->cmd->common.cdw13),
  le32_to_cpu(nr->cmd->common.cdw14),
  le32_to_cpu(nr->cmd->common.cdw15));
}

/* What to do with a completed request. */
enum nvme_disposition {
 COMPLETE, /* finish the request with its current status */
 RETRY,  /* requeue for another attempt on this path */
 FAILOVER, /* hand to multipath for another path */
 AUTHENTICATE, /* re-authenticate, then retry */
};

/*
 * Classify a completed request.  Success, no-retry flags, DNR status or
 * exhausted retries complete immediately; auth failures trigger
 * re-authentication; path errors on multipath requests fail over.
 */
static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
{
 if (likely(nvme_req(req)->status == 0))
  return COMPLETE;

 if (blk_noretry_request(req) ||
     (nvme_req(req)->status & NVME_STATUS_DNR) ||
     nvme_req(req)->retries >= nvme_max_retries)
  return COMPLETE;

 if ((nvme_req(req)->status & NVME_SCT_SC_MASK) == NVME_SC_AUTH_REQUIRED)
  return AUTHENTICATE;

 if (req->cmd_flags & REQ_NVME_MPATH) {
  if (nvme_is_path_error(nvme_req(req)->status) ||
      blk_queue_dying(req->q))
   return FAILOVER;
 } else {
  /* Non-multipath request on a dying queue: nowhere to retry. */
  if (blk_queue_dying(req->q))
   return COMPLETE;
 }

 return RETRY;
}

/*
 * For zone append completions, write back the LBA the device actually
 * placed the data at (returned in the CQE result) as the request sector.
 */
static inline void nvme_end_req_zoned(struct request *req)
{
 if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
     req_op(req) == REQ_OP_ZONE_APPEND) {
  struct nvme_ns *ns = req->q->queuedata;

  req->__sector = nvme_lba_to_sect(ns->head,
   le64_to_cpu(nvme_req(req)->result.u64));
 }
}

/*
 * Common completion bookkeeping: log errors (unless RQF_QUIET), fix up
 * zone-append sectors, trace, and notify multipath accounting.
 */
static inline void __nvme_end_req(struct request *req)
{
 if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
  if (blk_rq_is_passthrough(req))
   nvme_log_err_passthru(req);
  else
   nvme_log_error(req);
 }
 nvme_end_req_zoned(req);
 nvme_trace_bio_complete(req);
 if (req->cmd_flags & REQ_NVME_MPATH)
  nvme_mpath_end_request(req);
}

/* Finish a request toward the block layer with a translated status. */
void nvme_end_req(struct request *req)
{
 blk_status_t status = nvme_error_status(nvme_req(req)->status);

 __nvme_end_req(req);
 blk_mq_end_request(req, status);
}

/*
 * Main completion entry point: decide whether to end, retry, fail over
 * or re-authenticate the request based on its NVMe status.
 */
void nvme_complete_rq(struct request *req)
{
 struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;

 trace_nvme_complete_rq(req);
 nvme_cleanup_cmd(req);

 /*
  * Completions of long-running commands should not be able to
  * defer sending of periodic keep alives, since the controller
  * may have completed processing such commands a long time ago
  * (arbitrarily close to command submission time).
  * req->deadline - req->timeout is the command submission time
  * in jiffies.
  */
 if (ctrl->kas &&
     req->deadline - req->timeout >= ctrl->ka_last_check_time)
  ctrl->comp_seen = true;

 switch (nvme_decide_disposition(req)) {
 case COMPLETE:
  nvme_end_req(req);
  return;
 case RETRY:
  nvme_retry_req(req);
  return;
 case FAILOVER:
  nvme_failover_req(req);
  return;
 case AUTHENTICATE:
#ifdef CONFIG_NVME_HOST_AUTH
  /* Kick re-authentication, then retry once it completes. */
  queue_work(nvme_wq, &ctrl->dhchap_auth_work);
  nvme_retry_req(req);
#else
  /* No auth support built in: nothing to do but fail. */
  nvme_end_req(req);
#endif
  return;
 }
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);

/*
 * Batch completion variant: does the common bookkeeping only; the caller
 * ends the requests via the batch interface.
 */
void nvme_complete_batch_req(struct request *req)
{
 trace_nvme_complete_rq(req);
 nvme_cleanup_cmd(req);
 __nvme_end_req(req);
}
EXPORT_SYMBOL_GPL(nvme_complete_batch_req);

/*
 * Called to unwind from ->queue_rq on a failed command submission so that the
 * multipathing code gets called to potentially failover to another path.
 * The caller needs to unwind all transport specific resource allocations and
 * must return the propagated return value.
 */

blk_status_t nvme_host_path_error(struct request *req)
{
 nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
 blk_mq_set_request_complete(req);
 nvme_complete_rq(req);
 return BLK_STS_OK;
}
EXPORT_SYMBOL_GPL(nvme_host_path_error);

/*
 * Tagset iterator callback: abort one in-flight request with
 * NVME_SC_HOST_ABORTED_CMD.  @data is the owning nvme_ctrl.  Always
 * returns true so iteration continues over the whole tagset.
 */
bool nvme_cancel_request(struct request *req, void *data)
{
 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
    "Cancelling I/O %d", req->tag);

 /* don't abort one completed or idle request */
 if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT)
  return true;

 nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
 nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 blk_mq_complete_request(req);
 return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);

/*
 * Cancel all in-flight I/O requests and wait for their completions.
 * A controller without an I/O tagset is a no-op.
 */
void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
{
 struct blk_mq_tag_set *set = ctrl->tagset;

 if (!set)
  return;
 blk_mq_tagset_busy_iter(set, nvme_cancel_request, ctrl);
 blk_mq_tagset_wait_completed_request(set);
}
EXPORT_SYMBOL_GPL(nvme_cancel_tagset);

/*
 * Same as nvme_cancel_tagset(), but for the admin queue's tagset.
 */
void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
{
 struct blk_mq_tag_set *set = ctrl->admin_tagset;

 if (!set)
  return;
 blk_mq_tagset_busy_iter(set, nvme_cancel_request, ctrl);
 blk_mq_tagset_wait_completed_request(set);
}
EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);

/*
 * Attempt a controller state transition under ctrl->lock.  Only the
 * transitions encoded below are legal; everything else leaves the state
 * untouched and returns false.  On success, waiters on ctrl->state_wq
 * are woken, and fail-fast work is started/stopped to match the new
 * state.
 *
 * Allowed transitions:
 *   CONNECTING                 -> LIVE
 *   NEW, LIVE                  -> RESETTING
 *   NEW, RESETTING             -> CONNECTING
 *   LIVE, RESETTING, CONNECTING-> DELETING
 *   DELETING, DEAD             -> DELETING_NOIO
 *   DELETING                   -> DEAD
 */
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
  enum nvme_ctrl_state new_state)
{
 enum nvme_ctrl_state old_state;
 unsigned long flags;
 bool changed = false;

 spin_lock_irqsave(&ctrl->lock, flags);

 old_state = nvme_ctrl_state(ctrl);
 switch (new_state) {
 case NVME_CTRL_LIVE:
  switch (old_state) {
  case NVME_CTRL_CONNECTING:
   changed = true;
   fallthrough;
  default:
   break;
  }
  break;
 case NVME_CTRL_RESETTING:
  switch (old_state) {
  case NVME_CTRL_NEW:
  case NVME_CTRL_LIVE:
   changed = true;
   fallthrough;
  default:
   break;
  }
  break;
 case NVME_CTRL_CONNECTING:
  switch (old_state) {
  case NVME_CTRL_NEW:
  case NVME_CTRL_RESETTING:
   changed = true;
   fallthrough;
  default:
   break;
  }
  break;
 case NVME_CTRL_DELETING:
  switch (old_state) {
  case NVME_CTRL_LIVE:
  case NVME_CTRL_RESETTING:
  case NVME_CTRL_CONNECTING:
   changed = true;
   fallthrough;
  default:
   break;
  }
  break;
 case NVME_CTRL_DELETING_NOIO:
  switch (old_state) {
  case NVME_CTRL_DELETING:
  case NVME_CTRL_DEAD:
   changed = true;
   fallthrough;
  default:
   break;
  }
  break;
 case NVME_CTRL_DEAD:
  switch (old_state) {
  case NVME_CTRL_DELETING:
   changed = true;
   fallthrough;
  default:
   break;
  }
  break;
 default:
  break;
 }

 if (changed) {
  WRITE_ONCE(ctrl->state, new_state);
  wake_up_all(&ctrl->state_wq);
 }

 spin_unlock_irqrestore(&ctrl->lock, flags);
 if (!changed)
  return false;

 /* Adjust fail-fast handling outside the lock. */
 if (new_state == NVME_CTRL_LIVE) {
  if (old_state == NVME_CTRL_CONNECTING)
   nvme_stop_failfast_work(ctrl);
  nvme_kick_requeue_lists(ctrl);
 } else if (new_state == NVME_CTRL_CONNECTING &&
  old_state == NVME_CTRL_RESETTING) {
  nvme_start_failfast_work(ctrl);
 }
 return changed;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);

/*
 * Waits for the controller state to become RESETTING, or returns false if
 * it is not possible to ever transition to that state (the controller
 * reached a terminal state first).
 */

bool nvme_wait_reset(struct nvme_ctrl *ctrl)
{
 wait_event(ctrl->state_wq,
     nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
     nvme_state_terminal(ctrl));
 return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
}
EXPORT_SYMBOL_GPL(nvme_wait_reset);

/* kref release: free an ns_head and everything it owns. */
static void nvme_free_ns_head(struct kref *ref)
{
 struct nvme_ns_head *head =
  container_of(ref, struct nvme_ns_head, ref);

 nvme_mpath_put_disk(head);
 ida_free(&head->subsys->ns_ida, head->instance);
 cleanup_srcu_struct(&head->srcu);
 nvme_put_subsystem(head->subsys);
 kfree(head->plids);
 kfree(head);
}

/* Take an ns_head reference unless it already dropped to zero. */
bool nvme_tryget_ns_head(struct nvme_ns_head *head)
{
 return kref_get_unless_zero(&head->ref);
}

/* Drop an ns_head reference; frees it on the last put. */
void nvme_put_ns_head(struct nvme_ns_head *head)
{
 kref_put(&head->ref, nvme_free_ns_head);
}

/* kref release: free a namespace and drop its head/ctrl references. */
static void nvme_free_ns(struct kref *kref)
{
 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);

 put_disk(ns->disk);
 nvme_put_ns_head(ns->head);
 nvme_put_ctrl(ns->ctrl);
 kfree(ns);
}

/* Take a namespace reference unless it already dropped to zero. */
bool nvme_get_ns(struct nvme_ns *ns)
{
 return kref_get_unless_zero(&ns->kref);
}

/* Drop a namespace reference; frees it on the last put. */
void nvme_put_ns(struct nvme_ns *ns)
{
 kref_put(&ns->kref, nvme_free_ns);
}
EXPORT_SYMBOL_NS_GPL(nvme_put_ns, "NVME_TARGET_PASSTHRU");

/*
 * Reset the driver-private part of a request and mark it prepared
 * (RQF_DONTPREP) so a later requeue does not re-run preparation.
 */
static inline void nvme_clear_nvme_request(struct request *req)
{
 nvme_req(req)->status = 0;
 nvme_req(req)->retries = 0;
 nvme_req(req)->flags = 0;
 req->rq_flags |= RQF_DONTPREP;
}

/*
 * Initialize a passthrough request: pick the right timeout and error-log
 * policy for I/O vs admin queues, quiet logging if disabled, clear any
 * SGL flags userspace set, and copy in the caller's command.
 */
void nvme_init_request(struct request *req, struct nvme_command *cmd)
{
 struct nvme_request *nr = nvme_req(req);
 bool logging_enabled;

 if (req->q->queuedata) {
  struct nvme_ns *ns = req->q->disk->private_data;

  logging_enabled = ns->head->passthru_err_log_enabled;
  req->timeout = NVME_IO_TIMEOUT;
 } else { /* no queuedata implies admin queue */
  logging_enabled = nr->ctrl->passthru_err_log_enabled;
  req->timeout = NVME_ADMIN_TIMEOUT;
 }

 if (!logging_enabled)
  req->rq_flags |= RQF_QUIET;

 /* passthru commands should let the driver set the SGL flags */
 cmd->common.flags &= ~NVME_CMD_SGL_ALL;

 req->cmd_flags |= REQ_FAILFAST_DRIVER;
 if (req->mq_hctx->type == HCTX_TYPE_POLL)
  req->cmd_flags |= REQ_POLLED;
 nvme_clear_nvme_request(req);
 memcpy(nr->cmd, cmd, sizeof(*cmd));
}
EXPORT_SYMBOL_GPL(nvme_init_request);

/*
 * For something we're not in a state to send to the device the default action
 * is to busy it and retry it after the controller state is recovered.  However,
 * if the controller is deleting or if anything is marked for failfast or
 * nvme multipath it is immediately failed.
 *
 * Note: commands used to initialize the controller will be marked for failfast.
 * Note: nvme cli/ioctl commands are marked for failfast.
 */

blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
  struct request *rq)
{
 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

 /* Recoverable state and nothing demanding an immediate failure: busy. */
 if (state != NVME_CTRL_DELETING_NOIO &&
     state != NVME_CTRL_DELETING &&
     state != NVME_CTRL_DEAD &&
     !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
     !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
  return BLK_STS_RESOURCE;

 if (!(rq->rq_flags & RQF_DONTPREP))
  nvme_clear_nvme_request(rq);

 /* Complete as a host path error so multipath can fail over. */
 return nvme_host_path_error(rq);
}
EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);

/*
 * Decide whether a request may be issued right now given the controller
 * state.  Returns false to have the caller busy/fail the request.
 */
bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
  bool queue_live, enum nvme_ctrl_state state)
{
 struct nvme_request *req = nvme_req(rq);

 /*
  * Currently we have a problem sending passthru commands
  * on the admin_q if the controller is not LIVE because we can't
  * make sure that they are going out after the admin connect,
  * controller enable and/or other commands in the initialization
  * sequence. Until the controller will be LIVE, fail with
  * BLK_STS_RESOURCE so that they will be rescheduled.
  */
 if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
  return false;

 if (ctrl->ops->flags & NVME_F_FABRICS) {
  /*
   * Only allow commands on a live queue, except for the connect
   * command, which is required to set the queue live in the
   * appropriate states.
   */
  switch (state) {
  case NVME_CTRL_CONNECTING:
   if (blk_rq_is_passthrough(rq) &&
       nvme_is_fabrics(req->cmd) &&
       (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
        req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
        req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
    return true;
   break;
  case NVME_CTRL_DEAD:
   return false;
  default:
   break;
  }
 }

 return queue_live;
}
EXPORT_SYMBOL_GPL(__nvme_check_ready);

/* Build a Flush command targeting the namespace of @ns. */
static inline void nvme_setup_flush(struct nvme_ns *ns,
  struct nvme_command *cmnd)
{
 memset(cmnd, 0, sizeof(*cmnd));
 cmnd->common.opcode = nvme_cmd_flush;
 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
}

/*
 * Build a Dataset Management (Deallocate) command from a discard request,
 * converting each bio into one DSM range.  The range buffer is attached
 * as a special payload and freed in nvme_cleanup_cmd().
 */
static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
  struct nvme_command *cmnd)
{
 unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
 struct nvme_dsm_range *range;
 struct bio *bio;

 /*
  * Some devices do not consider the DSM 'Number of Ranges' field when
  * determining how much data to DMA. Always allocate memory for maximum
  * number of segments to prevent device reading beyond end of buffer.
  */
 static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;

 range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
 if (!range) {
  /*
   * If we fail allocation our range, fallback to the controller
   * discard page. If that's also busy, it's safe to return
   * busy, as we know we can make progress once that's freed.
   */
  if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
   return BLK_STS_RESOURCE;

  range = page_address(ns->ctrl->discard_page);
 }

 if (queue_max_discard_segments(req->q) == 1) {
  u64 slba = nvme_sect_to_lba(ns->head, blk_rq_pos(req));
  u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);

  range[0].cattr = cpu_to_le32(0);
  range[0].nlb = cpu_to_le32(nlb);
  range[0].slba = cpu_to_le64(slba);
  n = 1;
 } else {
  __rq_for_each_bio(bio, req) {
   u64 slba = nvme_sect_to_lba(ns->head,
          bio->bi_iter.bi_sector);
   u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;

   if (n < segments) {
    range[n].cattr = cpu_to_le32(0);
    range[n].nlb = cpu_to_le32(nlb);
    range[n].slba = cpu_to_le64(slba);
   }
   n++;
  }
 }

 /* Bio count and reported segment count must agree; bail out if not. */
 if (WARN_ON_ONCE(n != segments)) {
  if (virt_to_page(range) == ns->ctrl->discard_page)
   clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
  else
   kfree(range);
  return BLK_STS_IOERR;
 }

 memset(cmnd, 0, sizeof(*cmnd));
 cmnd->dsm.opcode = nvme_cmd_dsm;
 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
 cmnd->dsm.nr = cpu_to_le32(segments - 1); /* 0's based count */
 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

 bvec_set_virt(&req->special_vec, range, alloc_size);
 req->rq_flags |= RQF_SPECIAL_PAYLOAD;

 return BLK_STS_OK;
}

/* Copy the bio integrity application tag into the command, mask all bits. */
static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd)
{
 cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag);
 cmnd->rw.lbatm = cpu_to_le16(0xffff);
}

/*
 * Fill in the initial reference tag for PI-protected I/O.  No-op for
 * namespaces without a reftag (only Type 1/2 PI formats have one).
 */
static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
         struct request *req)
{
 u32 upper, lower;
 u64 ref48;

 /* only type1 and type 2 PI formats have a reftag */
 switch (ns->head->pi_type) {
 case NVME_NS_DPS_PI_TYPE1:
 case NVME_NS_DPS_PI_TYPE2:
  break;
 default:
  return;
 }

 /* both rw and write zeroes share the same reftag format */
 switch (ns->head->guard_type) {
 case NVME_NVM_NS_16B_GUARD:
  cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
  break;
 case NVME_NVM_NS_64B_GUARD:
  /* 48-bit reftag split across reftag (low 32) and cdw3 (high 16). */
  ref48 = ext_pi_ref_tag(req);
  lower = lower_32_bits(ref48);
  upper = upper_32_bits(ref48);

  cmnd->rw.reftag = cpu_to_le32(lower);
  cmnd->rw.cdw3 = cpu_to_le32(upper);
  break;
 default:
  break;
 }
}

/*
 * Build a Write Zeroes command.  Controllers quirked with
 * NVME_QUIRK_DEALLOCATE_ZEROES use Deallocate (DSM) instead.
 */
static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
  struct request *req, struct nvme_command *cmnd)
{
 memset(cmnd, 0, sizeof(*cmnd));

 if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
  return nvme_setup_discard(ns, req, cmnd);

 cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
 cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
 cmnd->write_zeroes.slba =
  cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
 cmnd->write_zeroes.length =
  cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);

 /* Request deallocation when supported and the caller did not opt out. */
 if (!(req->cmd_flags & REQ_NOUNMAP) &&
     (ns->head->features & NVME_NS_DEAC))
  cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);

 if (nvme_ns_has_pi(ns->head)) {
  cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
  nvme_set_ref_tag(ns, cmnd, req);
 }

 return BLK_STS_OK;
}

/*
 * NVMe does not support a dedicated command to issue an atomic write. A write
 * which does adhere to the device atomic limits will silently be executed
 * non-atomically. The request issuer should ensure that the write is within
 * the queue atomic writes limits, but just validate this in case it is not.
 */

static bool nvme_valid_atomic_write(struct request *req)
{
 struct request_queue *q = req->q;
 u32 boundary_bytes = queue_atomic_write_boundary_bytes(q);

 if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q))
  return false;

 if (boundary_bytes) {
  u64 mask = boundary_bytes - 1, imask = ~mask;
  u64 start = blk_rq_pos(req) << SECTOR_SHIFT;
  u64 end = start + blk_rq_bytes(req) - 1;

  /* If greater then must be crossing a boundary */
  if (blk_rq_bytes(req) > boundary_bytes)
   return false;

  if ((start & imask) != (end & imask))
   return false;
 }

 return true;
}

static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
  struct request *req, struct nvme_command *cmnd,
  enum nvme_opcode op)
{
 u16 control = 0;
 u32 dsmgmt = 0;

 if (req->cmd_flags & REQ_FUA)
  control |= NVME_RW_FUA;
 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
  control |= NVME_RW_LR;

 if (req->cmd_flags & REQ_RAHEAD)
  dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

 if (op == nvme_cmd_write && ns->head->nr_plids) {
  u16 write_stream = req->bio->bi_write_stream;

  if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
   return BLK_STS_INVAL;

  if (write_stream) {
   dsmgmt |= ns->head->plids[write_stream - 1] << 16;
   control |= NVME_RW_DTYPE_DPLCMT;
  }
 }

 if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
  return BLK_STS_INVAL;

 cmnd->rw.opcode = op;
 cmnd->rw.flags = 0;
 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
 cmnd->rw.cdw2 = 0;
 cmnd->rw.cdw3 = 0;
 cmnd->rw.metadata = 0;
 cmnd->rw.slba =
  cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
 cmnd->rw.length =
  cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
 cmnd->rw.reftag = 0;
 cmnd->rw.lbat = 0;
 cmnd->rw.lbatm = 0;

 if (ns->head->ms) {
  /*
 * If formatted with metadata, the block layer always provides a
 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
 * we enable the PRACT bit for protection information or set the
 * namespace capacity to zero to prevent any I/O.
 */

  if (!blk_integrity_rq(req)) {
   if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
    return BLK_STS_NOTSUPP;
   control |= NVME_RW_PRINFO_PRACT;
   nvme_set_ref_tag(ns, cmnd, req);
  }

  if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
   control |= NVME_RW_PRINFO_PRCHK_GUARD;
  if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) {
   control |= NVME_RW_PRINFO_PRCHK_REF;
   if (op == nvme_cmd_zone_append)
    control |= NVME_RW_APPEND_PIREMAP;
   nvme_set_ref_tag(ns, cmnd, req);
  }
  if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) {
   control |= NVME_RW_PRINFO_PRCHK_APP;
   nvme_set_app_tag(req, cmnd);
  }
 }

 cmnd->rw.control = cpu_to_le16(control);
 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
 return 0;
}

/*
 * Release the special payload attached by nvme_setup_discard(): either
 * unlock the shared controller discard page or free the kzalloc'd range
 * buffer.
 */
void nvme_cleanup_cmd(struct request *req)
{
 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
  struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;

  if (req->special_vec.bv_page == ctrl->discard_page)
   clear_bit_unlock(0, &ctrl->discard_page_busy);
  else
   kfree(bvec_virt(&req->special_vec));
  req->rq_flags &= ~RQF_SPECIAL_PAYLOAD;
 }
}
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);

/*
 * Translate a block layer request into the NVMe command stored in the
 * request's driver data, dispatching on the request operation.
 */
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
{
 struct nvme_command *cmd = nvme_req(req)->cmd;
 blk_status_t ret = BLK_STS_OK;

 if (!(req->rq_flags & RQF_DONTPREP))
  nvme_clear_nvme_request(req);

 switch (req_op(req)) {
 case REQ_OP_DRV_IN:
 case REQ_OP_DRV_OUT:
  /* these are setup prior to execution in nvme_init_request() */
  break;
 case REQ_OP_FLUSH:
  nvme_setup_flush(ns, cmd);
  break;
 case REQ_OP_ZONE_RESET_ALL:
 case REQ_OP_ZONE_RESET:
  ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
  break;
 case REQ_OP_ZONE_OPEN:
  ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
  break;
 case REQ_OP_ZONE_CLOSE:
  ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
  break;
 case REQ_OP_ZONE_FINISH:
  ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
  break;
 case REQ_OP_WRITE_ZEROES:
  ret = nvme_setup_write_zeroes(ns, req, cmd);
  break;
 case REQ_OP_DISCARD:
  ret = nvme_setup_discard(ns, req, cmd);
  break;
 case REQ_OP_READ:
  ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
  break;
 case REQ_OP_WRITE:
  ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
  break;
 case REQ_OP_ZONE_APPEND:
  ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
  break;
 default:
  WARN_ON_ONCE(1);
  return BLK_STS_IOERR;
 }

 cmd->common.command_id = nvme_cid(req);
 trace_nvme_setup_cmd(req, cmd);
 return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);

/*
 * Execute a prepared request synchronously.
 *
 * Return values:
 * 0:  success
 * >0: nvme controller's cqe status response
 * <0: kernel error in lieu of controller response
 *     (-EINTR if the request was cancelled before completing)
 */

int nvme_execute_rq(struct request *rq, bool at_head)
{
 blk_status_t status;

 status = blk_execute_rq(rq, at_head);
 if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
  return -EINTR;
 if (nvme_req(rq)->status)
  return nvme_req(rq)->status;
 return blk_status_to_errno(status);
}
EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, "NVME_TARGET_PASSTHRU");

/*
 * Allocate, set up and synchronously execute an NVMe command on @q,
 * optionally mapping a kernel buffer and returning the CQE result.
 * @qid selects a specific hardware queue, or NVME_QID_ANY.
 *
 * Returns 0 on success.  If the result is negative, it's a Linux error
 * code; if the result is positive, it's an NVM Express status code.
 */

int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
  union nvme_result *result, void *buffer, unsigned bufflen,
  int qid, nvme_submit_flags_t flags)
{
 struct request *req;
 int ret;
 blk_mq_req_flags_t blk_flags = 0;

 if (flags & NVME_SUBMIT_NOWAIT)
  blk_flags |= BLK_MQ_REQ_NOWAIT;
 if (flags & NVME_SUBMIT_RESERVED)
  blk_flags |= BLK_MQ_REQ_RESERVED;
 if (qid == NVME_QID_ANY)
  req = blk_mq_alloc_request(q, nvme_req_op(cmd), blk_flags);
 else
  req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), blk_flags,
      qid - 1);

 if (IS_ERR(req))
  return PTR_ERR(req);
 nvme_init_request(req, cmd);
 if (flags & NVME_SUBMIT_RETRY)
  req->cmd_flags &= ~REQ_FAILFAST_DRIVER;

 if (buffer && bufflen) {
  ret = blk_rq_map_kern(req, buffer, bufflen, GFP_KERNEL);
  if (ret)
   goto out;
 }

 ret = nvme_execute_rq(req, flags & NVME_SUBMIT_AT_HEAD);
 if (result && ret >= 0)
  *result = nvme_req(req)->result;
 out:
 blk_mq_free_request(req);
 return ret;
}
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);

/*
 * Convenience wrapper around __nvme_submit_sync_cmd() for callers that need
 * neither the CQE result value nor any special submit flags or queue
 * selection.  Same return convention: 0 on success, negative errno on kernel
 * error, positive NVMe status code from the controller.
 */
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
  void *buffer, unsigned bufflen)
{
 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
   NVME_QID_ANY, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);

u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
{
 u32 effects = 0;

 if (ns) {
  effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
  if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
   dev_warn_once(ctrl->device,
    "IO command:%02x has unusual effects:%08x\n",
    opcode, effects);

  /*
 * NVME_CMD_EFFECTS_CSE_MASK causes a freeze all I/O queues,
 * which would deadlock when done on an I/O command.  Note that
 * We already warn about an unusual effect above.
 */

  effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
 } else {
  effects = le32_to_cpu(ctrl->effects->acs[opcode]);

  /* Ignore execution restrictions if any relaxation bits are set */
  if (effects & NVME_CMD_EFFECTS_CSER_MASK)
   effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
 }

 return effects;
}
EXPORT_SYMBOL_NS_GPL(nvme_command_effects, "NVME_TARGET_PASSTHRU");

u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
{
	u32 effects = nvme_command_effects(ctrl, ns, opcode);

	if (!(effects & NVME_CMD_EFFECTS_CSE_MASK))
		return effects;

	/*
	 * For simplicity, IO to all namespaces is quiesced even if the command
	 * effects say only one namespace is affected.  Lock order (scan_lock,
	 * then subsys->lock) matches the unlock order in nvme_passthru_end().
	 */
	mutex_lock(&ctrl->scan_lock);
	mutex_lock(&ctrl->subsys->lock);
	nvme_mpath_start_freeze(ctrl->subsys);
	nvme_mpath_wait_freeze(ctrl->subsys);
	nvme_start_freeze(ctrl);
	nvme_wait_freeze(ctrl);

	return effects;
}
EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, "NVME_TARGET_PASSTHRU");

void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
         struct nvme_command *cmd, int status)
{
 if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
  nvme_unfreeze(ctrl);
  nvme_mpath_unfreeze(ctrl->subsys);
  mutex_unlock(&ctrl->subsys->lock);
  mutex_unlock(&ctrl->scan_lock);
 }
 if (effects & NVME_CMD_EFFECTS_CCC) {
  if (!test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY,
          &ctrl->flags)) {
   dev_info(ctrl->device,
"controller capabilities changed, reset may be required to take effect.\n");
  }
 }
 if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
  nvme_queue_scan(ctrl);
  flush_work(&ctrl->scan_work);
 }
 if (ns)
  return;

 switch (cmd->common.opcode) {
 case nvme_admin_set_features:
  switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
  case NVME_FEAT_KATO:
   /*
 * Keep alive commands interval on the host should be
 * updated when KATO is modified by Set Features
 * commands.
 */

   if (!status)
    nvme_update_keep_alive(ctrl, cmd);
   break;
  default:
   break;
  }
  break;
 default:
  break;
 }
}
EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, "NVME_TARGET_PASSTHRU");

/*
 * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
 *
 *   The host should send Keep Alive commands at half of the Keep Alive Timeout
 *   accounting for transport roundtrip times [..].
 */
static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
{
	unsigned long half_kato = ctrl->kato * HZ / 2;

	/*
	 * When using Traffic Based Keep Alive, we need to run
	 * nvme_keep_alive_work at twice the normal frequency, as one
	 * command completion can postpone sending a keep alive command
	 * by up to twice the delay between runs.
	 */
	if (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS)
		return half_kato / 2;

	return half_kato;
}

/*
 * Schedule the keep alive work so it fires at the next check time computed
 * from the last check timestamp, or immediately when that time has passed.
 */
static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
{
	unsigned long next_check = ctrl->ka_last_check_time +
				   nvme_keep_alive_work_period(ctrl);
	unsigned long now = jiffies;
	unsigned long delay;

	delay = time_after(now, next_check) ? 0 : next_check - now;

	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
}

/*
 * Completion handler for the Keep Alive command issued by
 * nvme_keep_alive_work().  Measures the command round-trip time and
 * re-arms the delayed work so the effective send frequency stays at the
 * period computed by nvme_keep_alive_work_period().
 */
static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
       blk_status_t status)
{
 struct nvme_ctrl *ctrl = rq->end_io_data;
 /* rq->deadline - rq->timeout is the jiffy the command was started at */
 unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
 unsigned long delay = nvme_keep_alive_work_period(ctrl);
 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

 /*
  * Subtract off the keepalive RTT so nvme_keep_alive_work runs
  * at the desired frequency.
  */
 if (rtt <= delay) {
  delay -= rtt;
 } else {
  /* RTT exceeded the whole period: re-run as soon as possible */
  dev_warn(ctrl->device, "long keepalive RTT (%u ms)\n",
    jiffies_to_msecs(rtt));
  delay = 0;
 }

 blk_mq_free_request(rq);

 if (status) {
  /* failed keep alive: do not re-arm the timer here */
  dev_err(ctrl->device,
   "failed nvme_keep_alive_end_io error=%d\n",
    status);
  return RQ_END_IO_NONE;
 }

 ctrl->ka_last_check_time = jiffies;
 ctrl->comp_seen = false;
 /* only keep the timer running while the controller is usable */
 if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING)
  queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
 return RQ_END_IO_NONE;
}

/*
 * Delayed work that sends a Keep Alive command to the controller.
 *
 * With Traffic Based Keep Alive (TBKAS), a command completion observed
 * since the last run counts as keep alive traffic, so the command is
 * skipped and the timer is simply rescheduled.
 */
static void nvme_keep_alive_work(struct work_struct *work)
{
 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
   struct nvme_ctrl, ka_work);
 bool comp_seen = ctrl->comp_seen;
 struct request *rq;

 ctrl->ka_last_check_time = jiffies;

 if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
  dev_dbg(ctrl->device,
   "reschedule traffic based keep-alive timer\n");
  ctrl->comp_seen = false;
  nvme_queue_keep_alive_work(ctrl);
  return;
 }

 /* reserved tag and no waiting: must not block on a busy admin queue */
 rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
      BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
 if (IS_ERR(rq)) {
  /* allocation failure, reset the controller */
  dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
  nvme_reset_ctrl(ctrl);
  return;
 }
 nvme_init_request(rq, &ctrl->ka_cmd);

 rq->timeout = ctrl->kato * HZ;
 rq->end_io = nvme_keep_alive_end_io;
 rq->end_io_data = ctrl;
 /* completion is handled asynchronously in nvme_keep_alive_end_io() */
 blk_execute_rq_nowait(rq, false);
}

static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
{
	/* a zero KATO means keep alive is disabled for this controller */
	if (likely(ctrl->kato))
		nvme_queue_keep_alive_work(ctrl);
}

void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
{
	/* nothing to cancel when keep alive was never armed */
	if (likely(ctrl->kato))
		cancel_delayed_work_sync(&ctrl->ka_work);
}
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);

/*
 * Re-arm the keep alive machinery after a passthrough Set Features command
 * changed KATO.  cdw11 carries the new timeout in milliseconds while
 * ctrl->kato is kept in seconds, rounded up.  The logged values are the
 * send intervals (half the timeout, per NVMe 1.4 section 7.12.1).
 */
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
       struct nvme_command *cmd)
{
 unsigned int new_kato =
  DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);

 dev_info(ctrl->device,
   "keep alive interval updated from %u ms to %u ms\n",
   ctrl->kato * 1000 / 2, new_kato * 1000 / 2);

 nvme_stop_keep_alive(ctrl);
 ctrl->kato = new_kato;
 nvme_start_keep_alive(ctrl);
}

/*
 * Check whether a CNS value can be safely sent to this controller.  The
 * width of the Identify CNS field grew over the spec revisions, so older
 * controllers would truncate larger values and misinterpret the command.
 */
static bool nvme_id_cns_ok(struct nvme_ctrl *ctrl, u8 cns)
{
	u8 max_cns;

	/* the CNS field occupies a full byte starting with NVMe 1.2 */
	if (ctrl->vs >= NVME_VS(1, 2, 0))
		return true;

	/*
	 * NVMe 1.1 expanded the CNS value to two bits, which means values
	 * larger than that could get truncated and treated as an incorrect
	 * value.
	 *
	 * Qemu implemented 1.0 behavior for controllers claiming 1.1
	 * compliance, so they need to be quirked here.
	 */
	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS))
		max_cns = 3;
	else
		max_cns = 1;	/* NVMe 1.0 used a single bit */

	return cns <= max_cns;
}

static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
 struct nvme_command c = { };
 int error;

 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
 c.identify.opcode = nvme_admin_identify;
 c.identify.cns = NVME_ID_CNS_CTRL;

 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
 if (!*id)
  return -ENOMEM;

 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
   sizeof(struct nvme_id_ctrl));
 if (error) {
  kfree(*id);
  *id = NULL;
 }
 return error;
}

/*
 * Parse a single Namespace Identification Descriptor and copy the
 * identifier it carries into @ids.
 *
 * Returns the descriptor payload length (NIDL) consumed on success so the
 * caller can advance to the next descriptor, or -1 when the controller
 * reported a bogus length for a known descriptor type.  *csi_seen is set
 * when a Command Set Identifier descriptor was processed.
 */
static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
  struct nvme_ns_id_desc *cur, bool *csi_seen)
{
 const char *warn_str = "ctrl returned bogus length:";
 void *data = cur;

 switch (cur->nidt) {
 case NVME_NIDT_EUI64:
  if (cur->nidl != NVME_NIDT_EUI64_LEN) {
   dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
     warn_str, cur->nidl);
   return -1;
  }
  /* quirky controllers report non-unique IDs: skip the copy */
  if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
   return NVME_NIDT_EUI64_LEN;
  memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
  return NVME_NIDT_EUI64_LEN;
 case NVME_NIDT_NGUID:
  if (cur->nidl != NVME_NIDT_NGUID_LEN) {
   dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
     warn_str, cur->nidl);
   return -1;
  }
  if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
   return NVME_NIDT_NGUID_LEN;
  memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
  return NVME_NIDT_NGUID_LEN;
 case NVME_NIDT_UUID:
  if (cur->nidl != NVME_NIDT_UUID_LEN) {
   dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
     warn_str, cur->nidl);
   return -1;
  }
  if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
   return NVME_NIDT_UUID_LEN;
  uuid_copy(&ids->uuid, data + sizeof(*cur));
  return NVME_NIDT_UUID_LEN;
 case NVME_NIDT_CSI:
  if (cur->nidl != NVME_NIDT_CSI_LEN) {
   dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
     warn_str, cur->nidl);
   return -1;
  }
  memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
  *csi_seen = true;
  return NVME_NIDT_CSI_LEN;
 default:
  /* Skip unknown types */
  return cur->nidl;
 }
}

/*
 * Issue Identify CNS 03h (Namespace Identification Descriptor list) for
 * @info->nsid and fill info->ids from the returned descriptors.
 *
 * Silently returns 0 when the descriptor list is unsupported (pre-1.3
 * single-command-set controllers or quirked devices).  For controllers
 * with multiple command sets, a missing CSI descriptor is an error.
 */
static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
  struct nvme_ns_info *info)
{
 struct nvme_command c = { };
 bool csi_seen = false;
 int status, pos, len;
 void *data;

 if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
  return 0;
 if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
  return 0;

 c.identify.opcode = nvme_admin_identify;
 c.identify.nsid = cpu_to_le32(info->nsid);
 c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;

 data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
 if (!data)
  return -ENOMEM;

 status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
          NVME_IDENTIFY_DATA_SIZE);
 if (status) {
  dev_warn(ctrl->device,
   "Identify Descriptors failed (nsid=%u, status=0x%x)\n",
   info->nsid, status);
  goto free_data;
 }

 /* walk the packed descriptor list; a zero NIDL terminates it */
 for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
  struct nvme_ns_id_desc *cur = data + pos;

  if (cur->nidl == 0)
   break;

  /* returns the payload length; bogus descriptors end the walk */
  len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
  if (len < 0)
   break;

  /* advance past the descriptor header as well */
  len += sizeof(*cur);
 }

 if (nvme_multi_css(ctrl) && !csi_seen) {
  dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
    info->nsid);
  status = -EINVAL;
 }

free_data:
 kfree(data);
 return status;
}

int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
   struct nvme_id_ns **id)
{
 struct nvme_command c = { };
 int error;

 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
 c.identify.opcode = nvme_admin_identify;
 c.identify.nsid = cpu_to_le32(nsid);
 c.identify.cns = NVME_ID_CNS_NS;

 *id = kmalloc(sizeof(**id), GFP_KERNEL);
 if (!*id)
  return -ENOMEM;

 error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
 if (error) {
  dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
  kfree(*id);
  *id = NULL;
 }
 return error;
}

static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
  struct nvme_ns_info *info)
{
 struct nvme_ns_ids *ids = &info->ids;
 struct nvme_id_ns *id;
 int ret;

 ret = nvme_identify_ns(ctrl, info->nsid, &id);
 if (ret)
  return ret;

 if (id->ncap == 0) {
  /* namespace not allocated or attached */
  info->is_removed = true;
  ret = -ENODEV;
  goto error;
 }

 info->anagrpid = id->anagrpid;
 info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
 info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
 info->is_ready = true;
 info->endgid = le16_to_cpu(id->endgid);
 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
  dev_info(ctrl->device,
    "Ignoring bogus Namespace Identifiers\n");
 } else {
  if (ctrl->vs >= NVME_VS(1, 1, 0) &&
      !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
   memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
  if (ctrl->vs >= NVME_VS(1, 2, 0) &&
      !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
   memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
 }

error:
 kfree(id);
 return ret;
}

static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
  struct nvme_ns_info *info)
{
 struct nvme_id_ns_cs_indep *id;
 struct nvme_command c = {
  .identify.opcode = nvme_admin_identify,
  .identify.nsid  = cpu_to_le32(info->nsid),
  .identify.cns  = NVME_ID_CNS_NS_CS_INDEP,
 };
 int ret;

 id = kmalloc(sizeof(*id), GFP_KERNEL);
 if (!id)
  return -ENOMEM;

 ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
 if (!ret) {
  info->anagrpid = id->anagrpid;
  info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
  info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
  info->is_ready = id->nstat & NVME_NSTAT_NRDY;
  info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
  info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
  info->endgid = le16_to_cpu(id->endgid);
 }
 kfree(id);
 return ret;
}

/*
 * Common implementation for Get Features / Set Features admin commands.
 * When the submission succeeds and @result is non-NULL, the CQE dword0
 * value is stored through it.  Return convention follows
 * __nvme_submit_sync_cmd().
 */
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
  unsigned int dword11, void *buffer, size_t buflen, u32 *result)
{
 union nvme_result res = { 0 };
 struct nvme_command c = { };
 int ret;

 c.features.opcode = op;
 c.features.fid = cpu_to_le32(fid);
 c.features.dword11 = cpu_to_le32(dword11);

 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
   buffer, buflen, NVME_QID_ANY, 0);
 /* positive NVMe status codes still come with a valid result dword */
 if (ret >= 0 && result)
  *result = le32_to_cpu(res.u32);
 return ret;
}

/* Set Features wrapper; @result (if set) receives CQE dword0. */
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
        unsigned int dword11, void *buffer, size_t buflen,
        void *result)
{
 return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
        buflen, result);
}
EXPORT_SYMBOL_GPL(nvme_set_features);

/* Get Features wrapper; @result (if set) receives CQE dword0. */
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
        unsigned int dword11, void *buffer, size_t buflen,
        void *result)
{
 return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
        buflen, result);
}
EXPORT_SYMBOL_GPL(nvme_get_features);

int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
 u32 q_count = (*count - 1) | ((*count - 1) << 16);
 u32 result;
 int status, nr_io_queues;

 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
   &result);

 /*
 * It's either a kernel error or the host observed a connection
 * lost. In either case it's not possible communicate with the
 * controller and thus enter the error code path.
 */

 if (status < 0 || status == NVME_SC_HOST_PATH_ERROR)
  return status;

 /*
 * Degraded controllers might return an error when setting the queue
 * count.  We still want to be able to bring them online and offer
 * access to the admin queue, as that might be only way to fix them up.
 */

 if (status > 0) {
  dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
  *count = 0;
 } else {
  nr_io_queues = min(result & 0xffff, result >> 16) + 1;
  *count = min(*count, nr_io_queues);
 }

 return 0;
}
EXPORT_SYMBOL_GPL(nvme_set_queue_count);

#define NVME_AEN_SUPPORTED \
 (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
  NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)

/*
 * Enable the supported subset of asynchronous event notifications and arm
 * an async event request.
 */
static void nvme_enable_aen(struct nvme_ctrl *ctrl)
{
	u32 supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
	u32 result;
	int status;

	if (!supported_aens)
		return;

	status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
			NULL, 0, &result);
	if (status)
		dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
			 supported_aens);

	/* (re)arm the async event request regardless of the outcome */
	queue_work(nvme_wq, &ctrl->async_event_work);
}

/* Take the namespace and transport module references needed for an open. */
static int nvme_ns_open(struct nvme_ns *ns)
{
	/* should never be called due to GENHD_FL_HIDDEN */
	if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
		return -ENXIO;
	if (!nvme_get_ns(ns))
		return -ENXIO;
	if (!try_module_get(ns->ctrl->ops->module)) {
		nvme_put_ns(ns);
		return -ENXIO;
	}
	return 0;
}

/* Drop the references taken in nvme_ns_open(). */
static void nvme_ns_release(struct nvme_ns *ns)
{
	module_put(ns->ctrl->ops->module);
	nvme_put_ns(ns);
}

/* block_device_operations open hook */
static int nvme_open(struct gendisk *disk, blk_mode_t mode)
{
	return nvme_ns_open(disk->private_data);
}

/* block_device_operations release hook */
static void nvme_release(struct gendisk *disk)
{
	nvme_ns_release(disk->private_data);
}

/* Synthesize a legacy CHS geometry: 64 heads, 32 sectors per track. */
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	geo->heads = 64;
	geo->sectors = 32;
	/* cylinders = capacity / (heads * sectors) = capacity >> 11 */
	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
	return 0;
}

/*
 * Translate the namespace's metadata/protection information configuration
 * into a block layer integrity profile in lim->integrity.
 *
 * Returns false only when block layer metadata cannot be supported for
 * this format (the caller then zeroes the capacity); true otherwise.
 */
static bool nvme_init_integrity(struct nvme_ns_head *head,
  struct queue_limits *lim, struct nvme_ns_info *info)
{
 struct blk_integrity *bi = &lim->integrity;

 memset(bi, 0, sizeof(*bi));

 /* no metadata at all: nothing to describe, nothing to reject */
 if (!head->ms)
  return true;

 /*
  * PI can always be supported as we can ask the controller to simply
  * insert/strip it, which is not possible for other kinds of metadata.
  */
 if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
     !(head->features & NVME_NS_METADATA_SUPPORTED))
  return nvme_ns_has_pi(head);

 switch (head->pi_type) {
 case NVME_NS_DPS_PI_TYPE3:
  /* Type 3: both app and ref tag space is available to the host */
  switch (head->guard_type) {
  case NVME_NVM_NS_16B_GUARD:
   bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
   bi->tag_size = sizeof(u16) + sizeof(u32);
   bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
   break;
  case NVME_NVM_NS_64B_GUARD:
   bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
   bi->tag_size = sizeof(u16) + 6;
   bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
   break;
  default:
   break;
  }
  break;
 case NVME_NS_DPS_PI_TYPE1:
 case NVME_NS_DPS_PI_TYPE2:
  /* Type 1/2: ref tag is checked, only the app tag is free */
  switch (head->guard_type) {
  case NVME_NVM_NS_16B_GUARD:
   bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
   bi->tag_size = sizeof(u16);
   bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
         BLK_INTEGRITY_REF_TAG;
   break;
  case NVME_NVM_NS_64B_GUARD:
   bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
   bi->tag_size = sizeof(u16);
   bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
         BLK_INTEGRITY_REF_TAG;
   break;
  default:
   break;
  }
  break;
 default:
  /* no PI: expose plain non-integrity metadata (csum_type stays 0) */
  break;
 }

 bi->metadata_size = head->ms;
 if (bi->csum_type) {
  bi->pi_tuple_size = head->pi_size;
  bi->pi_offset = info->pi_offset;
 }
 return true;
}

static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
{
 struct nvme_ctrl *ctrl = ns->ctrl;

 if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
  lim->max_hw_discard_sectors =
   nvme_lba_to_sect(ns->head, ctrl->dmrsl);
 else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
  lim->max_hw_discard_sectors = UINT_MAX;
 else
  lim->max_hw_discard_sectors = 0;

 lim->discard_granularity = lim->logical_block_size;

 if (ctrl->dmrl)
  lim->max_discard_segments = ctrl->dmrl;
 else
  lim->max_discard_segments = NVME_DSM_MAX_RANGES;
}

static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
{
 return uuid_equal(&a->uuid, &b->uuid) &&
  memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
  memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
  a->csi == b->csi;
}

static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
  struct nvme_id_ns_nvm **nvmp)
{
 struct nvme_command c = {
  .identify.opcode = nvme_admin_identify,
  .identify.nsid  = cpu_to_le32(nsid),
  .identify.cns  = NVME_ID_CNS_CS_NS,
  .identify.csi  = NVME_CSI_NVM,
 };
 struct nvme_id_ns_nvm *nvm;
 int ret;

 nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
 if (!nvm)
  return -ENOMEM;

 ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
 if (ret)
  kfree(nvm);
 else
  *nvmp = nvm;
 return ret;
}

static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
  struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
{
 u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
 u8 guard_type;

 /* no support for storage tag formats right now */
 if (nvme_elbaf_sts(elbaf))
  return;

 guard_type = nvme_elbaf_guard_type(elbaf);
 if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) &&
      guard_type == NVME_NVM_NS_QTYPE_GUARD)
  guard_type = nvme_elbaf_qualified_guard_type(elbaf);

 head->guard_type = guard_type;
 switch (head->guard_type) {
 case NVME_NVM_NS_64B_GUARD:
  head->pi_size = sizeof(struct crc64_pi_tuple);
  break;
 case NVME_NVM_NS_16B_GUARD:
  head->pi_size = sizeof(struct t10_pi_tuple);
  break;
 default:
  break;
 }
}

/*
 * Derive the metadata/protection-information configuration for a namespace
 * from its Identify data (and, for extended LBA formats, from the NVM
 * command set specific Identify data in @nvm, which may be NULL).
 *
 * Outputs: head->ms, pi_type, pi_size, guard_type, the
 * NVME_NS_{METADATA_SUPPORTED,EXT_LBAS} feature bits, and info->pi_offset.
 */
static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
  struct nvme_ns_head *head, struct nvme_id_ns *id,
  struct nvme_id_ns_nvm *nvm, struct nvme_ns_info *info)
{
 head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
 head->pi_type = 0;
 head->pi_size = 0;
 head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
 /* nothing to do without metadata or transport support for it */
 if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
  return;

 if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
  /* extended LBA formats: guard type comes from the ELBAF entry */
  nvme_configure_pi_elbas(head, id, nvm);
 } else {
  head->pi_size = sizeof(struct t10_pi_tuple);
  head->guard_type = NVME_NVM_NS_16B_GUARD;
 }

 /* PI is only usable when the metadata area can actually hold it */
 if (head->pi_size && head->ms >= head->pi_size)
  head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
 if (!(id->dps & NVME_NS_DPS_PI_FIRST)) {
  if (disable_pi_offsets)
   head->pi_type = 0;
  else
   info->pi_offset = head->ms - head->pi_size;
 }

 if (ctrl->ops->flags & NVME_F_FABRICS) {
  /*
   * The NVMe over Fabrics specification only supports metadata as
   * part of the extended data LBA.  We rely on HCA/HBA support to
   * remap the separate metadata buffer from the block layer.
   */
  if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
   return;

  head->features |= NVME_NS_EXT_LBAS;

  /*
   * The current fabrics transport drivers support namespace
   * metadata formats only if nvme_ns_has_pi() returns true.
   * Suppress support for all other formats so the namespace will
   * have a 0 capacity and not be usable through the block stack.
   *
   * Note, this check will need to be modified if any drivers
   * gain the ability to use other metadata formats.
   */
  if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
   head->features |= NVME_NS_METADATA_SUPPORTED;
 } else {
  /*
   * For PCIe controllers, we can't easily remap the separate
   * metadata buffer from the block layer and thus require a
   * separate metadata buffer for block layer metadata/PI support.
   * We allow extended LBAs for the passthrough interface, though.
   */
  if (id->flbas & NVME_NS_FLBAS_META_EXT)
   head->features |= NVME_NS_EXT_LBAS;
  else
   head->features |= NVME_NS_METADATA_SUPPORTED;
 }
}


/*
 * Fill the atomic write queue limits from the namespace (NAWUPF/NABSPF) or
 * controller wide (AWUPF) atomic parameters.  @bs is the logical block
 * size in bytes; returns the atomic write size in bytes actually granted.
 */
static u32 nvme_configure_atomic_write(struct nvme_ns *ns,
  struct nvme_id_ns *id, struct queue_limits *lim, u32 bs)
{
 u32 atomic_bs, boundary = 0;

 /*
  * We do not support an offset for the atomic boundaries.
  */
 if (id->nabo)
  return bs;

 if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) {
  /*
   * Use the per-namespace atomic write unit when available.
   * NAWUPF/NABSPF are 0-based counts of logical blocks.
   */
  atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
  if (id->nabspf)
   boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
 } else {
  /*
   * Use the controller wide atomic write unit.  This sucks
   * because the limit is defined in terms of logical blocks while
   * namespaces can have different formats, and because there is
   * no clear language in the specification prohibiting different
   * values for different controllers in the subsystem.
   */
  atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
 }

 lim->atomic_write_hw_max = atomic_bs;
 lim->atomic_write_hw_boundary = boundary;
 lim->atomic_write_hw_unit_min = bs;
 lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
 lim->features |= BLK_FEAT_ATOMIC_WRITES;
 return atomic_bs;
}

/*
 * Worst-case number of DMA segments for a max_hw_sectors transfer when
 * every segment is a single controller page (plus one for misalignment).
 */
static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
{
 return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
}

/*
 * Apply controller-wide transport limits to a namespace's queue limits:
 * maximum transfer size, segment count/size, and DMA constraints (virt
 * boundary matching the controller page size, 4-byte length alignment).
 */
static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
  struct queue_limits *lim)
{
 lim->max_hw_sectors = ctrl->max_hw_sectors;
 lim->max_segments = min_t(u32, USHRT_MAX,
  min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
 lim->max_integrity_segments = ctrl->max_integrity_segments;
 lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
 lim->max_segment_size = UINT_MAX;
 lim->dma_alignment = 3;
}

/*
 * Fill the block size, I/O granularity and write-zeroes queue limits from
 * the Identify Namespace data.
 *
 * Returns false when the LBA size is unusable by the block layer; the
 * caller then zeroes the capacity so no block I/O is possible.
 */
static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
  struct queue_limits *lim)
{
 struct nvme_ns_head *head = ns->head;
 u32 bs = 1U << head->lba_shift;
 u32 atomic_bs, phys_bs, io_opt = 0;
 bool valid = true;

 /*
  * The block layer can't support LBA sizes larger than the page size
  * or smaller than a sector size yet, so catch this early and don't
  * allow block I/O.
  */
 if (blk_validate_block_size(bs)) {
  bs = (1 << 9);
  valid = false;
 }

 phys_bs = bs;
 atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs);

 if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
  /* NPWG = Namespace Preferred Write Granularity */
  phys_bs = bs * (1 + le16_to_cpu(id->npwg));
  /* NOWS = Namespace Optimal Write Size */
  if (id->nows)
   io_opt = bs * (1 + le16_to_cpu(id->nows));
 }

 /*
  * Linux filesystems assume writing a single physical block is
  * an atomic operation. Hence limit the physical block size to the
  * value of the Atomic Write Unit Power Fail parameter.
  */
 lim->logical_block_size = bs;
 lim->physical_block_size = min(phys_bs, atomic_bs);
 lim->io_min = phys_bs;
 lim->io_opt = io_opt;
 /* quirked devices deallocate (and read back zeroes) on Write Zeroes */
 if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
     (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM))
  lim->max_write_zeroes_sectors = UINT_MAX;
 else
  lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
 return valid;
}

/* A namespace is read-only when the controller says so or userspace forced it. */
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
{
 return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
}

static inline bool nvme_first_scan(struct gendisk *disk)
{
 /* nvme_alloc_ns() scans the disk prior to adding it */
 return !disk_live(disk);
}

/*
 * Set lim->chunk_sectors from the namespace I/O boundary (NOIOB) or from
 * the stripe-size quirk.  Non-power-of-two boundaries and zoned namespaces
 * are skipped, with a warning printed only on the first scan.
 */
static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
  struct queue_limits *lim)
{
 struct nvme_ctrl *ctrl = ns->ctrl;
 u32 iob;

 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
     is_power_of_2(ctrl->max_hw_sectors))
  iob = ctrl->max_hw_sectors;
 else
  iob = nvme_lba_to_sect(ns->head, le16_to_cpu(id->noiob));

 /* no boundary advertised */
 if (!iob)
  return;

 if (!is_power_of_2(iob)) {
  if (nvme_first_scan(ns->disk))
   pr_warn("%s: ignoring unaligned IO boundary:%u\n",
    ns->disk->disk_name, iob);
  return;
 }

 /* zoned namespaces derive their chunking from the zone layout instead */
 if (blk_queue_is_zoned(ns->disk->queue)) {
  if (nvme_first_scan(ns->disk))
   pr_warn("%s: ignoring zoned namespace IO boundary\n",
    ns->disk->disk_name);
  return;
 }

 lim->chunk_sectors = iob;
}

/*
 * Minimal namespace update for command sets the driver cannot fully
 * support: only the controller limits and the read-only state are applied.
 *
 * Deliberately turns success into -ENODEV so the caller keeps the block
 * interface hidden for these namespaces.
 */
static int nvme_update_ns_info_generic(struct nvme_ns *ns,
  struct nvme_ns_info *info)
{
 struct queue_limits lim;
 unsigned int memflags;
 int ret;

 lim = queue_limits_start_update(ns->disk->queue);
 nvme_set_ctrl_limits(ns->ctrl, &lim);

 memflags = blk_mq_freeze_queue(ns->disk->queue);
 ret = queue_limits_commit_update(ns->disk->queue, &lim);
 set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
 blk_mq_unfreeze_queue(ns->disk->queue, memflags);

 /* Hide the block-interface for these devices */
 if (!ret)
  ret = -ENODEV;
 return ret;
}

/*
 * Read the FDP Configurations log page for @info->endgid and store the
 * Reclaim Unit Nominal Size (RUNS) of the configuration selected by
 * @fdp_idx in info->runs.
 *
 * Most failures are treated as "proceed without FDP": return 0 with
 * info->runs left untouched, so the namespace scan is not aborted.
 */
static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
          struct nvme_ns_info *info, u8 fdp_idx)
{
 struct nvme_fdp_config_log hdr, *h;
 struct nvme_fdp_config_desc *desc;
 size_t size = sizeof(hdr);
 void *log, *end;
 int i, n, ret;

 /* first fetch just the header to learn the full log length */
 ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
          NVME_CSI_NVM, &hdr, size, 0, info->endgid);
 if (ret) {
  dev_warn(ctrl->device,
    "FDP configs log header status:0x%x endgid:%d\n", ret,
    info->endgid);
  return ret;
 }

 size = le32_to_cpu(hdr.sze);
 if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) {
  /* refuse absurdly large logs, but do not fail the scan */
  dev_warn(ctrl->device, "FDP config size too large:%zu\n",
    size);
  return 0;
 }

 h = kvmalloc(size, GFP_KERNEL);
 if (!h)
  return -ENOMEM;

 ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
          NVME_CSI_NVM, h, size, 0, info->endgid);
 if (ret) {
  dev_warn(ctrl->device,
    "FDP configs log status:0x%x endgid:%d\n", ret,
    info->endgid);
  goto out;
 }

 /* numfdpc is 0-based, so n is the number of config descriptors */
 n = le16_to_cpu(h->numfdpc) + 1;
 /*
  * NOTE(review): valid indices are 0..n-1, so '>' also lets
  * fdp_idx == n through; that case is still caught by the
  * end-of-list check in the walk below.
  */
 if (fdp_idx > n) {
  dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
    fdp_idx, n);
  /* Proceed without registering FDP streams */
  ret = 0;
  goto out;
 }

 log = h + 1;
 desc = log;
 end = log + size - sizeof(*h);
 /* walk the variable-size descriptor list up to the requested index */
 for (i = 0; i < fdp_idx; i++) {
  log += le16_to_cpu(desc->dsze);
  desc = log;
  if (log >= end) {
   dev_warn(ctrl->device,
     "FDP invalid config descriptor list\n");
   ret = 0;
   goto out;
  }
 }

 if (le32_to_cpu(desc->nrg) > 1) {
  dev_warn(ctrl->device, "FDP NRG > 1 not supported\n");
  ret = 0;
  goto out;
 }

 info->runs = le64_to_cpu(desc->runs);
out:
 kvfree(h);
 return ret;
}

/*
 * Query the Flexible Data Placement configuration of a namespace and
 * cache its placement identifiers in head->plids / head->nr_plids.
 *
 * Returns 0 when FDP is disabled or already registered; errors from the
 * feature/log/io-mgmt queries are propagated.
 */
static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
 struct nvme_ns_head *head = ns->head;
 struct nvme_ctrl *ctrl = ns->ctrl;
 struct nvme_fdp_ruh_status *ruhs;
 struct nvme_fdp_config fdp;
 struct nvme_command c = {};
 size_t size;
 int i, ret;

 /*
  * The FDP configuration is static for the lifetime of the namespace,
  * so return immediately if we've already registered this namespace's
  * streams.
  */
 if (head->nr_plids)
  return 0;

 ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0,
    &fdp);
 if (ret) {
  dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret);
  return ret;
 }

 /* FDP not enabled for this endurance group */
 if (!(fdp.flags & FDPCFG_FDPE))
  return 0;

 ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx);
 /* without a granularity there is nothing to register */
 if (!info->runs)
  return ret;

 /* fetch the reclaim unit handle status list (bounded by S8_MAX entries) */
 size = struct_size(ruhs, ruhsd, S8_MAX - 1);
 ruhs = kzalloc(size, GFP_KERNEL);
 if (!ruhs)
  return -ENOMEM;

 c.imr.opcode = nvme_cmd_io_mgmt_recv;
 c.imr.nsid = cpu_to_le32(head->ns_id);
 c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS;
 c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size));
 ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
 if (ret) {
  dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret);
  goto free;
 }

 head->nr_plids = le16_to_cpu(ruhs->nruhsd);
 if (!head->nr_plids)
  goto free;

 head->plids = kcalloc(head->nr_plids, sizeof(*head->plids),
         GFP_KERNEL);
 if (!head->plids) {
  dev_warn(ctrl->device,
    "failed to allocate %u FDP placement IDs\n",
    head->nr_plids);
  head->nr_plids = 0;
  ret = -ENOMEM;
  goto free;
 }

 for (i = 0; i < head->nr_plids; i++)
  head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
free:
 kfree(ruhs);
 return ret;
}

static int nvme_update_ns_info_block(struct nvme_ns *ns,
  struct nvme_ns_info *info)
{
 struct queue_limits lim;
 struct nvme_id_ns_nvm *nvm = NULL;
 struct nvme_zone_info zi = {};
 struct nvme_id_ns *id;
 unsigned int memflags;
 sector_t capacity;
 unsigned lbaf;
 int ret;

 ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
 if (ret)
  return ret;

 if (id->ncap == 0) {
  /* namespace not allocated or attached */
  info->is_removed = true;
  ret = -ENXIO;
  goto out;
 }
 lbaf = nvme_lbaf_index(id->flbas);

 if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
  ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
  if (ret < 0)
   goto out;
 }

 if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
     ns->head->ids.csi == NVME_CSI_ZNS) {
  ret = nvme_query_zone_info(ns, lbaf, &zi);
  if (ret < 0)
   goto out;
 }

 if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
  ret = nvme_query_fdp_info(ns, info);
  if (ret < 0)
   goto out;
 }

 lim = queue_limits_start_update(ns->disk->queue);

 memflags = blk_mq_freeze_queue(ns->disk->queue);
 ns->head->lba_shift = id->lbaf[lbaf].ds;
 ns->head->nuse = le64_to_cpu(id->nuse);
 capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
 nvme_set_ctrl_limits(ns->ctrl, &lim);
 nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
 nvme_set_chunk_sectors(ns, id, &lim);
 if (!nvme_update_disk_info(ns, id, &lim))
  capacity = 0;

 nvme_config_discard(ns, &lim);
 if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
     ns->head->ids.csi == NVME_CSI_ZNS)
  nvme_update_zone_info(ns, &lim, &zi);

 if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc)
  lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
 else
  lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);

 if (info->is_rotational)
  lim.features |= BLK_FEAT_ROTATIONAL;

 /*
 * Register a metadata profile for PI, or the plain non-integrity NVMe
 * metadata masquerading as Type 0 if supported, otherwise reject block
 * I/O to namespaces with metadata except when the namespace supports
 * PI, as it can strip/insert in that case.
 */

 if (!nvme_init_integrity(ns->head, &lim, info))
  capacity = 0;

 lim.max_write_streams = ns->head->nr_plids;
 if (lim.max_write_streams)
  lim.write_stream_granularity = min(info->runs, U32_MAX);
 else
  lim.write_stream_granularity = 0;

 /*
 * Only set the DEAC bit if the device guarantees that reads from
 * deallocated data return zeroes.  While the DEAC bit does not
 * require that, it must be a no-op if reads from deallocated data
 * do not return zeroes.
 */

 if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) {
  ns->head->features |= NVME_NS_DEAC;
  lim.max_hw_wzeroes_unmap_sectors = lim.max_write_zeroes_sectors;
 }

 ret = queue_limits_commit_update(ns->disk->queue, &lim);
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5
C=97 H=92 G=94

¤ Dauer der Verarbeitung: 0.51 Sekunden  ¤

*© Formatika GbR, Deutschland






Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.