/*
 * Define the socket priority to use for connections where it is desirable
 * that the NIC consider performing optimized packet processing or filtering.
 * A non-zero value being sufficient to indicate general consideration of any
 * possible optimization. Making it a module param allows for alternative
 * values that may be unique for some NIC implementations.
 */
static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
/* * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu affinity * from sysfs.
*/ staticbool wq_unbound;
module_param(wq_unbound, bool, 0644);
MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)");
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * lockdep can detect a circular dependency of the form
 *   sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
 * because dependencies are tracked for both nvme-tcp and user contexts. Using
 * a separate class prevents lockdep from conflating nvme-tcp socket use with
 * user-space socket API use.
 */
/* Index 0/1 pairs — presumably admin vs. I/O queue classes; TODO confirm at use site. */
static struct lock_class_key nvme_tcp_sk_key[2];
static struct lock_class_key nvme_tcp_slock_key[2];
staticinlinebool nvme_tcp_recv_pdu_supported(enum nvme_tcp_pdu_type type)
{ switch (type) { case nvme_tcp_c2h_term: case nvme_tcp_c2h_data: case nvme_tcp_r2t: case nvme_tcp_rsp: returntrue; default: returnfalse;
}
}
/* * Check if the queue is TLS encrypted
*/ staticinlinebool nvme_tcp_queue_tls(struct nvme_tcp_queue *queue)
{ if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) return 0;
return queue->tls_enabled;
}
/* * Check if TLS is configured for the controller.
*/ staticinlinebool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl)
{ if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) return 0;
/*
 * Return a pointer to where the H2C data PDU is built for this request.
 * The data PDU overlays the tail of the command PDU allocation, so the two
 * never need separate buffers.
 */
static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
{
	/* use the pdu space in the back for the data pdu */
	return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) -
		sizeof(struct nvme_tcp_data_pdu);
}
/* Fragment: tail of the request-queueing path (enclosing function header is not in this view). */
	/*
	 * if we're the first on the send_list and we can try to send
	 * directly, otherwise queue io_work. Also, only do that if we
	 * are on the same cpu, so we don't introduce contention.
	 */
	if (queue->io_cpu == raw_smp_processor_id() &&
	    empty && mutex_trylock(&queue->send_mutex)) {
		nvme_tcp_send_all(queue);
		mutex_unlock(&queue->send_mutex);
	}

	/* Could not send inline (or more work remains): kick io_work on the queue's cpu. */
	if (last && nvme_tcp_queue_has_pending(queue))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
/* Fragment: tail of the C2H data PDU handler (enclosing function header is not in this view). */
	/* SUCCESS set without LAST_PDU is a protocol violation — trigger error recovery. */
	if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
	    unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d tag %#x SUCCESS set but not last PDU\n",
			nvme_tcp_queue_id(queue), rq->tag);
		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
		return -EPROTO;
	}

	return 0;
}
/*
 * Handle a response capsule PDU: AEN completions are special-cased (no
 * struct request exists for them), everything else is matched to its
 * request via the CQE.
 *
 * NOTE(review): the original text was truncated after the else-branch; the
 * closing "return ret;" was restored to make the function well-formed.
 */
static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
		struct nvme_tcp_rsp_pdu *pdu)
{
	struct nvme_completion *cqe = &pdu->cqe;
	int ret = 0;

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts. We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
				     cqe->command_id)))
		nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
				&cqe->result);
	else
		ret = nvme_tcp_process_nvme_cqe(queue, cqe);

	return ret;
}
/* Fragment: interior/tail of the PDU receive dispatcher (enclosing function header is not in this view). */
	hdr = queue->pdu;
	/*
	 * All supported PDU types here share the rsp PDU header length;
	 * a different hlen on a supported type is a protocol error.
	 */
	if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) {
		if (!nvme_tcp_recv_pdu_supported(hdr->type))
			goto unsupported_pdu;

		dev_err(queue->ctrl->ctrl.device,
			"pdu type %d has unexpected header length (%d)\n",
			hdr->type, hdr->hlen);
		return -EPROTO;
	}

	if (unlikely(hdr->type == nvme_tcp_c2h_term)) {
		/*
		 * C2HTermReq never includes Header or Data digests.
		 * Skip the checks.
		 */
		nvme_tcp_handle_c2h_term(queue, (void *)queue->pdu);
		return -EINVAL;
	}

	/* Verify header digest, if negotiated, before trusting the PDU. */
	if (queue->hdr_digest) {
		ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
		if (unlikely(ret))
			return ret;
	}

	if (queue->data_digest) {
		ret = nvme_tcp_check_ddgst(queue, queue->pdu);
		if (unlikely(ret))
			return ret;
	}

	/* Dispatch by PDU type; rsp/r2t reset the receive context first. */
	switch (hdr->type) {
	case nvme_tcp_c2h_data:
		return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
	case nvme_tcp_rsp:
		nvme_tcp_init_recv_ctx(queue);
		return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
	case nvme_tcp_r2t:
		nvme_tcp_init_recv_ctx(queue);
		return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
	default:
		goto unsupported_pdu;
	}

unsupported_pdu:
	dev_err(queue->ctrl->ctrl.device,
		"unsupported pdu type (%d)\n", hdr->type);
	return -EINVAL;
}
/*
 * Complete a request with the given NVMe status and an empty result.
 * The status is shifted into the phase-tag-excluded status field layout
 * expected by nvme_try_complete_req().
 */
static inline void nvme_tcp_end_request(struct request *rq, u16 status)
{
	union nvme_result res = {};

	if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
		nvme_complete_rq(rq);
}
/* Fragment: interior of the data receive loop (enclosing function header is not in this view). */
		recv_len = min_t(size_t, *len, queue->data_remaining);
		if (!recv_len)
			break;

		/* Current bio is exhausted: advance to the next one in the chain. */
		if (!iov_iter_count(&req->iter)) {
			req->curr_bio = req->curr_bio->bi_next;

			/*
			 * If we don't have any bios it means that controller
			 * sent more data than we requested, hence error
			 */
			if (!req->curr_bio) {
				dev_err(queue->ctrl->ctrl.device,
					"queue %d no space in request %#x",
					nvme_tcp_queue_id(queue), rq->tag);
				nvme_tcp_init_recv_ctx(queue);
				return -EIO;
			}
			nvme_tcp_init_iter(req, ITER_DEST);
		}

		/* we can read only from what is left in this bio */
		recv_len = min_t(size_t, recv_len,
				iov_iter_count(&req->iter));

		/* Copy from the skb, folding in the data digest CRC when negotiated. */
		if (queue->data_digest)
			ret = skb_copy_and_crc32c_datagram_iter(skb, *offset,
				&req->iter, recv_len, &queue->rcv_crc);
		else
			ret = skb_copy_datagram_iter(skb, *offset,
				&req->iter, recv_len);
		if (ret) {
			dev_err(queue->ctrl->ctrl.device,
				"queue %d failed to copy request %#x data",
				nvme_tcp_queue_id(queue), rq->tag);
			return ret;
		}
/* Fragment: interior of the socket state-change callback (function header and the
 * matching read_unlock_bh()/done label are not in this view). */
	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (!queue)
		goto done;

	/* Any half-closed/closing TCP state means the connection is going away. */
	switch (sk->sk_state) {
	case TCP_CLOSE:
	case TCP_CLOSE_WAIT:
	case TCP_LAST_ACK:
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
		break;
	default:
		dev_info(queue->ctrl->ctrl.device,
			"queue %d socket state %d\n",
			nvme_tcp_queue_id(queue), sk->sk_state);
	}
/* Fragment: interior/tail of the data send loop (enclosing function header is not in this view). */
		/* Fall back to copying when the pages aren't eligible for zero-copy splice. */
		if (!sendpages_ok(page, len, offset))
			msg.msg_flags &= ~MSG_SPLICE_PAGES;

		bvec_set_page(&bvec, page, len, offset);
		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
		ret = sock_sendmsg(queue->sock, &msg);
		if (ret <= 0)
			return ret;

		/* Fold the bytes actually sent into the running data digest. */
		if (queue->data_digest)
			nvme_tcp_ddgst_update(&queue->snd_crc, page,
					offset, ret);

		/*
		 * update the request iterator except for the last payload send
		 * in the request where we don't want to modify it as we may
		 * compete with the RX path completing the request.
		 */
		if (req_data_sent + ret < req_data_len)
			nvme_tcp_advance_req(req, ret);

		/* fully successful last send in current PDU */
		if (last && ret == len) {
			if (queue->data_digest) {
				req->ddgst =
					nvme_tcp_ddgst_final(queue->snd_crc);
				req->state = NVME_TCP_SEND_DDGST;
				req->offset = 0;
			} else {
				if (h2cdata_left)
					nvme_tcp_setup_h2c_data_pdu(req);
				else
					nvme_tcp_done_send_req(queue);
			}
			return 1;
		}
	}
	return -EAGAIN;
}
/* Fragment: tail of the io_work processing loop (enclosing function header is not in this view). */
		/* Best-effort send: skip this round if another context holds the mutex. */
		if (mutex_trylock(&queue->send_mutex)) {
			result = nvme_tcp_try_send(queue);
			mutex_unlock(&queue->send_mutex);
			if (result > 0)
				pending = true;
			else if (unlikely(result < 0))
				break;
		}

		result = nvme_tcp_try_recv(queue);
		if (result > 0)
			pending = true;
		else if (unlikely(result < 0))
			return;

		/* did we get some space after spending time in recv? */
		if (nvme_tcp_queue_has_pending(queue) &&
		    sk_stream_is_writeable(queue->sock->sk))
			pending = true;

		if (!pending || !queue->rd_enabled)
			return;

	} while (!time_after(jiffies, deadline)); /* quota is exhausted */
/* Fragment: interior of queue teardown (enclosing function header is not in this view). */
	/* Only tear down once: ALLOCATED acts as the ownership flag. */
	if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
		return;

	page_frag_cache_drain(&queue->pf_cache);

	/* Avoid reclaim re-entering this path while the socket is released. */
	noreclaim_flag = memalloc_noreclaim_save();
	/* ->sock will be released by fput() */
	fput(queue->sock->file);
	queue->sock = NULL;
	memalloc_noreclaim_restore(noreclaim_flag);
/*
 * Track the number of queues assigned to each cpu using a global per-cpu
 * counter and select the least used cpu from the mq_map. Our goal is to spread
 * different controllers I/O threads across different cpu cores.
 *
 * Note that the accounting is not 100% perfect, but we don't need to be, we're
 * simply putting our best effort to select the best candidate cpu core that we
 * find at any given point.
 */
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
	struct blk_mq_tag_set *set = &ctrl->tag_set;
	int qid = nvme_tcp_queue_id(queue) - 1;
	unsigned int *mq_map = NULL;
	int cpu, best_cpu = WORK_CPU_UNBOUND, least_load = INT_MAX;

	/* Unbound workqueues pick their own cpu; nothing to select here. */
	if (wq_unbound)
		goto out;

	/* Pick the mq_map matching this queue's type. */
	if (nvme_tcp_default_queue(queue))
		mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
	else if (nvme_tcp_read_queue(queue))
		mq_map = set->map[HCTX_TYPE_READ].mq_map;
	else if (nvme_tcp_poll_queue(queue))
		mq_map = set->map[HCTX_TYPE_POLL].mq_map;

	if (WARN_ON(!mq_map))
		goto out;

	/* Of the online cpus mapped to this qid, take the least loaded one. */
	for_each_online_cpu(cpu) {
		int load;

		if (mq_map[cpu] != qid)
			continue;

		load = atomic_read(&nvme_tcp_cpu_queues[cpu]);
		if (load < least_load) {
			best_cpu = cpu;
			least_load = load;
		}
	}

	if (best_cpu != WORK_CPU_UNBOUND) {
		queue->io_cpu = best_cpu;
		atomic_inc(&nvme_tcp_cpu_queues[best_cpu]);
		set_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags);
	}
out:
	dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
		qid, queue->io_cpu);
}
/* Fragment: socket setup and connect tail of queue allocation (enclosing function
 * header, and the code that set up err_sock/err_destroy_mutex state, are not in this view). */
	/* Single syn retry */
	tcp_sock_set_syncnt(queue->sock->sk, 1);

	/* Set TCP no delay */
	tcp_sock_set_nodelay(queue->sock->sk);

	/*
	 * Cleanup whatever is sitting in the TCP transmit queue on socket
	 * close. This is done to prevent stale data from being sent should
	 * the network connection be restored before TCP times out.
	 */
	sock_no_linger(queue->sock->sk);

	if (so_priority > 0)
		sock_set_priority(queue->sock->sk, so_priority);

	/* Set socket type of service */
	if (nctrl->opts->tos >= 0)
		ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);

	/* Set 10 seconds timeout for icresp recvmsg */
	queue->sock->sk->sk_rcvtimeo = 10 * HZ;

	ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
		sizeof(ctrl->addr), 0);
	if (ret) {
		dev_err(nctrl->device, "failed to connect socket: %d\n", ret);
		goto err_rcv_pdu;
	}

	/* If PSKs are configured try to start TLS */
	if (nvme_tcp_tls_configured(nctrl) && pskid) {
		ret = nvme_tcp_start_tls(nctrl, queue, pskid);
		if (ret)
			goto err_init_connect;
	}

	ret = nvme_tcp_init_connection(queue);
	if (ret)
		goto err_init_connect;

	set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);

	return 0;

	/* Error unwind, in reverse order of acquisition. */
err_init_connect:
	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
err_rcv_pdu:
	kfree(queue->pdu);
err_sock:
	/* ->sock will be released by fput() */
	fput(queue->sock->file);
	queue->sock = NULL;
err_destroy_mutex:
	mutex_destroy(&queue->send_mutex);
	mutex_destroy(&queue->queue_lock);
	return ret;
}
/*
 * Allocate all I/O queues (queue 0 is the admin queue and is handled
 * elsewhere).  When TLS is configured, validate the PSK state up front;
 * on any allocation failure, free the queues allocated so far.
 */
static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
	int qid, ret;

	if (nvme_tcp_tls_configured(ctrl)) {
		if (!ctrl->opts->concat) {
			/* Static PSK must have been negotiated already. */
			if (!ctrl->tls_pskid) {
				dev_err(ctrl->device, "no PSK negotiated\n");
				return -ENOKEY;
			}
		} else {
			/*
			 * The generated PSK is stored in the
			 * fabric options
			 */
			if (!ctrl->opts->tls_key) {
				dev_err(ctrl->device, "no PSK generated\n");
				return -ENOKEY;
			}
			/* Drop a PSK id left over from a previous generation. */
			if (ctrl->tls_pskid &&
			    ctrl->tls_pskid != key_serial(ctrl->opts->tls_key)) {
				dev_err(ctrl->device, "Stale PSK id %08x\n", ctrl->tls_pskid);
				ctrl->tls_pskid = 0;
			}
		}
	}

	for (qid = 1; qid < ctrl->queue_count; qid++) {
		ret = nvme_tcp_alloc_queue(ctrl, qid,
				ctrl->tls_pskid);
		if (ret)
			goto out_free_queues;
	}

	return 0;

out_free_queues:
	for (qid--; qid >= 1; qid--)
		nvme_tcp_free_queue(ctrl, qid);

	return ret;
}
static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
unsigned int nr_io_queues;
int ret;
nr_io_queues = nvmf_nr_io_queues(ctrl->opts);
ret = nvme_set_queue_count(ctrl, &nr_io_queues);
if (ret)
return ret;
if (nr_io_queues == 0) {
dev_err(ctrl->device, "unable to set any I/O queues\n");
return -ENOMEM;
}
/*
 * Allocate, tag-set-register (on first connect) and start the I/O queues.
 * NOTE(review): truncated in this chunk — the success return and the
 * out_wait_freeze_timed_out/out_cleanup_connect_q/out_free_io_queues error
 * labels referenced below are not visible here.
 */
static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
{
	int ret, nr_queues;

	ret = nvme_tcp_alloc_io_queues(ctrl);
	if (ret)
		return ret;

	/* Fresh controller: the I/O tag set does not exist yet. */
	if (new) {
		ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
				&nvme_tcp_mq_ops,
				ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
				sizeof(struct nvme_tcp_request));
		if (ret)
			goto out_free_io_queues;
	}

	/*
	 * Only start IO queues for which we have allocated the tagset
	 * and limited it to the available queues. On reconnects, the
	 * queue number might have changed.
	 */
	nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
	ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues);
	if (ret)
		goto out_cleanup_connect_q;

	/* Reconnect: quiesce I/O while the hw queue count is updated. */
	if (!new) {
		nvme_start_freeze(ctrl);
		nvme_unquiesce_io_queues(ctrl);
		if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
			/*
			 * If we timed out waiting for freeze we are likely to
			 * be stuck.  Fail the controller initialization just
			 * to be safe.
			 */
			ret = -ENODEV;
			nvme_unfreeze(ctrl);
			goto out_wait_freeze_timed_out;
		}
		blk_mq_update_nr_hw_queues(ctrl->tagset,
			ctrl->queue_count - 1);
		nvme_unfreeze(ctrl);
	}

	/*
	 * If the number of queues has increased (reconnect case)
	 * start all new queues now.
	 */
	ret = nvme_tcp_start_io_queues(ctrl, nr_queues,
				       ctrl->tagset->nr_hw_queues + 1);
	if (ret)
		goto out_wait_freeze_timed_out;
/*
 * Decide whether a failed controller should be reconnected or removed.
 * NOTE(review): truncated in this chunk — the actual reconnect/remove
 * decision that normally follows the state check is not visible here.
 */
static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
		int status)
{
	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

	/* If we are resetting/deleting then do nothing */
	if (state != NVME_CTRL_CONNECTING) {
		WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
		return;
	}
/*
* The TLS key is set by secure concatenation after negotiation has been
* completed on the admin queue. We need to revoke the key when:
* - concatenation is enabled (otherwise it's a static key set by the user)
* and
* - the generated key is present in ctrl->tls_key (otherwise there's nothing
* to revoke)
* and
* - a valid PSK key ID has been set in ctrl->tls_pskid (otherwise TLS
* negotiation has not run).
*
* We cannot always revoke the key as nvme_tcp_alloc_admin_queue() is called
* twice during secure concatenation, once on a 'normal' connection to run the
* DH-HMAC-CHAP negotiation (which generates the key, so it _must not_ be set),
* and once after the negotiation (which uses the key, so it _must_ be set).
*/
static bool nvme_tcp_key_revoke_needed(struct nvme_ctrl *ctrl)
{
return ctrl->opts->concat && ctrl->opts->tls_key && ctrl->tls_pskid;
}
/*
 * Bring a controller up: admin queue, capability checks, I/O queues, and the
 * transition to LIVE.
 * NOTE(review): truncated in this chunk — the success path and the
 * destroy_io/destroy_admin error labels referenced below are not visible here.
 */
static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
{
	struct nvmf_ctrl_options *opts = ctrl->opts;
	int ret;

	ret = nvme_tcp_configure_admin_queue(ctrl, new);
	if (ret)
		return ret;

	/* Secure concatenation needs a second admin connection using the generated PSK. */
	if (ctrl->opts->concat && !ctrl->tls_pskid) {
		/* See comments for nvme_tcp_key_revoke_needed() */
		dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n");
		nvme_stop_keep_alive(ctrl);
		nvme_tcp_teardown_admin_queue(ctrl, false);
		ret = nvme_tcp_configure_admin_queue(ctrl, false);
		if (ret)
			goto destroy_admin;
	}

	/* In-capsule data offset is not supported by this driver. */
	if (ctrl->icdoff) {
		ret = -EOPNOTSUPP;
		dev_err(ctrl->device, "icdoff is not supported!\n");
		goto destroy_admin;
	}

	/* NVMe/TCP requires SGL support. */
	if (!nvme_ctrl_sgl_supported(ctrl)) {
		ret = -EOPNOTSUPP;
		dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
		goto destroy_admin;
	}

	if (ctrl->queue_count > 1) {
		ret = nvme_tcp_configure_io_queues(ctrl, new);
		if (ret)
			goto destroy_admin;
	}

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
		/*
		 * state change failure is ok if we started ctrl delete,
		 * unless we're during creation of a new controller to
		 * avoid races with teardown flow.
		 */
		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		WARN_ON_ONCE(new);
		ret = -EINVAL;
		goto destroy_io;
	}
/* Fragment: interior of the error-recovery work handler (enclosing function header is not in this view). */
	if (nvme_tcp_key_revoke_needed(ctrl))
		nvme_auth_revoke_tls_key(ctrl);
	nvme_stop_keep_alive(ctrl);
	flush_work(&ctrl->async_event_work);
	nvme_tcp_teardown_io_queues(ctrl, false);
	/* unquiesce to fail fast pending requests */
	nvme_unquiesce_io_queues(ctrl);
	nvme_tcp_teardown_admin_queue(ctrl, false);
	nvme_unquiesce_admin_queue(ctrl);
	nvme_auth_stop(ctrl);

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		return;
	}
/* Fragment: interior of the controller reset work handler (enclosing function
 * header and the out_fail path are not in this view). */
	if (nvme_tcp_key_revoke_needed(ctrl))
		nvme_auth_revoke_tls_key(ctrl);
	nvme_stop_ctrl(ctrl);
	nvme_tcp_teardown_ctrl(ctrl, false);

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		return;
	}

	ret = nvme_tcp_setup_ctrl(ctrl, false);
	if (ret)
		goto out_fail;
Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig
zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
experimentell.