/*
 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/in.h>
#include <linux/export.h>
#include <linux/sched/clock.h>
#include <linux/time.h>
#include <linux/rds.h>

#include "rds.h"
/*
 * Process all extension headers that come with this message.
 */
static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
{
	struct rds_header *hdr = &inc->i_hdr;
	unsigned int pos = 0, type, len;
	union {
		struct rds_ext_header_version version;
		struct rds_ext_header_rdma rdma;
		struct rds_ext_header_rdma_dest rdma_dest;
	} buffer;
	while (1) {
		len = sizeof(buffer);
		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
		if (type == RDS_EXTHDR_NONE)
			break;
		/* Process extension header here */
		switch (type) {
		case RDS_EXTHDR_RDMA:
			rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
			break;

		case RDS_EXTHDR_RDMA_DEST:
			/* We ignore the size for now. We could stash it
			 * somewhere and use it for error checking. */
			inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie(
					be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
					be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
			break;
		}
	}
}
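/*
 * For reference: the cookie stored above packs the peer's R_Key and the
 * destination offset into a single u64.  A minimal sketch of the packing
 * done by rds_rdma_make_cookie() in rds.h looks roughly like:
 *
 *	rds_rdma_cookie_t cookie = r_key | (u64)offset << 32;
 *
 * i.e. the low 32 bits carry the rkey and the high 32 bits the offset,
 * which rds_rdma_cookie_key() and rds_rdma_cookie_offset() unpack again.
 */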
static void rds_recv_hs_exthdrs(struct rds_header *hdr,
				struct rds_connection *conn)
{
	unsigned int pos = 0, type, len;
	union {
		struct rds_ext_header_version version;
		u16 rds_npaths;
		u32 rds_gen_num;
	} buffer;
	u32 new_peer_gen_num = 0;

	while (1) {
		len = sizeof(buffer);
		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
		if (type == RDS_EXTHDR_NONE)
			break;
		/* Process extension header here */
		switch (type) {
		case RDS_EXTHDR_NPATHS:
			conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
					       be16_to_cpu(buffer.rds_npaths));
			break;
		case RDS_EXTHDR_GEN_NUM:
			new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
			break;
		default:
			pr_warn_ratelimited("ignoring unknown exthdr type 0x%x\n",
					    type);
		}
	}
	/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
	conn->c_npaths = max_t(int, conn->c_npaths, 1);
	conn->c_ping_triggered = 0;
	rds_conn_peer_gen_update(conn, new_peer_gen_num);
}
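/*
 * A note on the iteration contract used above: extension headers live in
 * the fixed h_exthdr area of struct rds_header as a packed sequence of
 * one type byte followed by a type-specific payload.  Each call to
 * rds_message_next_extension() roughly advances *pos past one such entry,
 * copies the payload into the caller's buffer, shrinks *len to the payload
 * size, and returns the type byte (RDS_EXTHDR_NONE once the area is
 * exhausted), which is why len must be reset before every call.
 */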
/* rds_start_mprds() will synchronously start multiple paths when appropriate.
 * The scheme is based on the following rules:
 *
 * 1. rds_sendmsg on first connect attempt sends the probe ping, with the
 *    sender's npaths (s_npaths)
 * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It
 *    sends back a probe-pong with r_npaths. After that, if rcvr is the
 *    smaller ip addr, it starts rds_conn_path_connect_if_down on all
 *    mprds_paths.
 * 3. sender gets woken up, and can move to rds_conn_path_connect_if_down.
 *    If it is the smaller ipaddr, rds_conn_path_connect_if_down can be
 *    called after reception of the probe-pong on all mprds_paths.
 *    Otherwise (sender of probe-ping is not the smaller ip addr): just call
 *    rds_conn_path_connect_if_down on the hashed path. (see rule 4)
 * 4. rds_connect_worker must only trigger a connection if laddr < faddr.
 * 5. sender may end up queuing the packet on the cp; it will get sent out
 *    later when the connection is completed.
 */
static void rds_start_mprds(struct rds_connection *conn)
{
	int i;
	struct rds_conn_path *cp;

	if (conn->c_npaths > 1 &&
	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
		for (i = 0; i < conn->c_npaths; i++) {
			cp = &conn->c_path[i];
			rds_conn_path_connect_if_down(cp);
		}
	}
}
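/*
 * For context, the handshake probes referenced above are ordinary RDS
 * messages distinguished only by their port pair; rds.h defines this
 * roughly as:
 *
 *	#define RDS_FLAG_PROBE_PORT	1
 *	#define RDS_HS_PROBE(sport, dport) \
 *		((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \
 *		 (sport == 0 && dport == RDS_FLAG_PROBE_PORT))
 *
 * so a probe-ping travels as (sport = 1, dport = 0) and the matching
 * probe-pong as (sport = 0, dport = 1).
 */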
/*
 * The transport must make sure that this is serialized against other
 * rx and conn reset on this specific conn.
 *
 * We currently assert that only one fragmented message will be sent
 * down a connection at a time.  This lets us reassemble in the conn
 * instead of per-flow which means that we don't have to go digging through
 * flows to tear down partial reassembly progress on conn failure and
 * we save flow lookup and locking for each frag arrival.  It does mean
 * that small messages will wait behind large ones.  Fragmenting at all
 * is only to reduce the memory consumption of pre-posted buffers.
 *
 * The caller passes in saddr and daddr instead of us getting it from the
 * conn.  This lets loopback, who only has one conn for both directions,
 * tell us which roles the addrs in the conn are playing for this message.
 */
void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
		       struct in6_addr *daddr, struct rds_incoming *inc,
		       gfp_t gfp)
{
	struct rds_sock *rs = NULL;
	struct sock *sk;
	unsigned long flags;
	struct rds_conn_path *cp;

	inc->i_conn = conn;
	inc->i_rx_jiffies = jiffies;
	if (conn->c_trans->t_mp_capable)
		cp = inc->i_conn_path;
	else
		cp = &conn->c_path[0];
rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " "flags 0x%x rx_jiffies %lu\n", conn,
(unsignedlonglong)cp->cp_next_rx_seq,
inc,
(unsignedlonglong)be64_to_cpu(inc->i_hdr.h_sequence),
be32_to_cpu(inc->i_hdr.h_len),
be16_to_cpu(inc->i_hdr.h_sport),
be16_to_cpu(inc->i_hdr.h_dport),
inc->i_hdr.h_flags,
inc->i_rx_jiffies);
	/*
	 * Sequence numbers should only increase.  Messages get their
	 * sequence number as they're queued in a sending conn.  They
	 * can be dropped, though, if the sending socket is closed before
	 * they hit the wire.  So sequence numbers can skip forward
	 * under normal operation.  They can also drop back in the conn
	 * failover case as previously sent messages are resent down the
	 * new instance of a conn.  We drop those, otherwise we have
	 * to assume that the next valid seq does not come after a
	 * hole in the fragment stream.
	 *
	 * The headers don't give us a way to realize if fragments of
	 * a message have been dropped.  We assume that frags that arrive
	 * to a flow are part of the current message on the flow that is
	 * being reassembled.  This means that senders can't drop messages
	 * from the sending conn until all their frags are sent.
	 *
	 * XXX we could spend more on the wire to get more robust failure
	 * detection, arguably worth it to avoid data corruption.
	 */
	if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
	    (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
		rds_stats_inc(s_recv_drop_old_seq);
		goto out;
	}
	cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
		if (inc->i_hdr.h_sport == 0) {
			rdsdebug("ignore ping with 0 sport from %pI6c\n",
				 saddr);
			goto out;
		}
		rds_stats_inc(s_recv_ping);
		rds_send_pong(cp, inc->i_hdr.h_sport);
		/* if this is a handshake ping, start multipath if necessary */
		if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport),
				 be16_to_cpu(inc->i_hdr.h_dport))) {
			rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
			rds_start_mprds(cp->cp_conn);
		}
		goto out;
	}
	if (be16_to_cpu(inc->i_hdr.h_dport) == RDS_FLAG_PROBE_PORT &&
	    inc->i_hdr.h_sport == 0) {
		rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
		/* if this is a handshake pong, start multipath if necessary */
		rds_start_mprds(cp->cp_conn);
		wake_up(&cp->cp_conn->c_hs_waitq);
		goto out;
	}
	rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if);
	if (!rs) {
		rds_stats_inc(s_recv_drop_no_sock);
		goto out;
	}

	/* Process extension headers */
	rds_recv_incoming_exthdrs(inc, rs);

	/* We can be racing with rds_release() which marks the socket dead. */
	sk = rds_rs_to_sk(rs);

	/* serialize with rds_release -> sock_orphan */
	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!sock_flag(sk, SOCK_DEAD)) {
		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
		rds_stats_inc(s_recv_queued);
		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				      be32_to_cpu(inc->i_hdr.h_len),
				      inc->i_hdr.h_dport);
		if (sock_flag(sk, SOCK_RCVTSTAMP))
			inc->i_usercopy.rx_tstamp = ktime_get_real();
		rds_inc_addref(inc);
		inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
		__rds_wake_sk_sleep(sk);
	} else {
		rds_stats_inc(s_recv_drop_dead_sock);
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

out:
	if (rs)
		rds_sock_put(rs);
}
EXPORT_SYMBOL_GPL(rds_recv_incoming);
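/*
 * Illustrative caller (not part of this file): a transport rx completion
 * handler hands each reassembled message to the core along these lines,
 * with "tinc" standing in for the transport's private incoming wrapper:
 *
 *	rds_inc_init(&tinc->ti_inc, conn, &conn->c_faddr);
 *	rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
 *			  &tinc->ti_inc, GFP_ATOMIC);
 *	rds_inc_put(&tinc->ti_inc);
 *
 * On loopback the same conn serves both directions, which is why saddr
 * and daddr are passed explicitly rather than read out of the conn.
 */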
/*
 * Be very careful here.  This is called as the condition in
 * wait_event_*() and so must cope with being called many times.
 */
static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
{
	unsigned long flags;
if (!*inc) {
		read_lock_irqsave(&rs->rs_recv_lock, flags);
		if (!list_empty(&rs->rs_recv_queue)) {
*inc = list_entry(rs->rs_recv_queue.next, struct rds_incoming,
i_item);
rds_inc_addref(*inc);
}
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
}
return *inc != NULL;
}
static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
			    int drop)
{
	struct sock *sk = rds_rs_to_sk(rs);
	int ret = 0;
	unsigned long flags;
	struct rds_incoming *to_drop = NULL;
	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!list_empty(&inc->i_item)) {
		ret = 1;
		if (drop) {
			/* XXX make sure this i_conn is reliable */
			rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
					      -be32_to_cpu(inc->i_hdr.h_len),
					      inc->i_hdr.h_dport);
			list_del_init(&inc->i_item);
			to_drop = inc;
		}
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

	if (to_drop)
		rds_inc_put(to_drop);

	rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
	return ret;
}
/*
 * Pull errors off the error queue.
 * If msghdr is NULL, we will just purge the error queue.
 */
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
{
	struct rds_notifier *notifier;
	struct rds_rdma_notify cmsg;
	unsigned int count = 0, max_messages = ~0U;
	unsigned long flags;
	LIST_HEAD(copy);
	int err = 0;

	memset(&cmsg, 0, sizeof(cmsg));	/* fill holes with zero */
	/* put_cmsg copies to user space and thus may sleep.  We can't do this
	 * with rs_lock held, so first grab as many notifications as we can
	 * stuff in the user provided cmsg buffer.  We don't try to copy more,
	 * to avoid losing notifications - except when the buffer is so small
	 * that it wouldn't even hold a single notification.  Then we give the
	 * caller as much of this single msg as we can squeeze in, and set
	 * MSG_CTRUNC.
	 */
	if (msghdr) {
		max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
		if (!max_messages)
			max_messages = 1;
	}

	spin_lock_irqsave(&rs->rs_lock, flags);
	while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
		notifier = list_entry(rs->rs_notify_queue.next,
				      struct rds_notifier, n_list);
		list_move(&notifier->n_list, &copy);
		count++;
	}
	spin_unlock_irqrestore(&rs->rs_lock, flags);

	if (!count)
		return 0;

	while (!list_empty(&copy)) {
		notifier = list_entry(copy.next, struct rds_notifier, n_list);

		if (msghdr) {
			cmsg.user_token = notifier->n_user_token;
			cmsg.status = notifier->n_status;

			err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
				       sizeof(cmsg), &cmsg);
			if (err)
				break;
		}

		list_del_init(&notifier->n_list);
		kfree(notifier);
	}

	/* If we bailed out because of an error in put_cmsg,
	 * we may be left with one or more notifications that we
	 * didn't process.  Return them to the head of the list. */
	if (!list_empty(&copy)) {
		spin_lock_irqsave(&rs->rs_lock, flags);
		list_splice(&copy, &rs->rs_notify_queue);
		spin_unlock_irqrestore(&rs->rs_lock, flags);
	}

	return err;
}
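/*
 * Illustrative user-space counterpart (assumed application code, not part
 * of this file): the notifications queued above arrive as
 * RDS_CMSG_RDMA_STATUS control messages and could be drained roughly so:
 *
 *	char buf[CMSG_SPACE(sizeof(struct rds_rdma_notify)) * 8];
 *	struct msghdr mh = { .msg_control = buf, .msg_controllen = sizeof(buf) };
 *	struct cmsghdr *c;
 *
 *	recvmsg(fd, &mh, 0);
 *	for (c = CMSG_FIRSTHDR(&mh); c; c = CMSG_NXTHDR(&mh, c))
 *		if (c->cmsg_level == SOL_RDS &&
 *		    c->cmsg_type == RDS_CMSG_RDMA_STATUS) {
 *			struct rds_rdma_notify n;
 *
 *			memcpy(&n, CMSG_DATA(c), sizeof(n));
 *			handle_rdma_status(n.user_token, n.status);
 *		}
 *
 * handle_rdma_status() is a hypothetical application callback.
 */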
/*
 * Receive any control messages.
 */
static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
			 struct rds_sock *rs)
{
	int ret = 0;
	if (inc->i_usercopy.rdma_cookie) {
		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
			       sizeof(inc->i_usercopy.rdma_cookie),
			       &inc->i_usercopy.rdma_cookie);
		if (ret)
			goto out;
	}
	if ((inc->i_usercopy.rx_tstamp != 0) &&
	    sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
		struct __kernel_old_timeval tv =
			ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp);

		if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) {
			ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
				       sizeof(tv), &tv);
		} else {
			struct __kernel_sock_timeval sk_tv;

			sk_tv.tv_sec = tv.tv_sec;
			sk_tv.tv_usec = tv.tv_usec;

			ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
				       sizeof(sk_tv), &sk_tv);
		}

		if (ret)
			goto out;
	}

out:
	return ret;
}

int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
		int msg_flags)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	long timeo;
	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
	struct rds_incoming *inc = NULL;

	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
	timeo = sock_rcvtimeo(sk, nonblock);

	rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);

	if (msg_flags & MSG_OOB)
		goto out;
	if (msg_flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR);
	while (1) {
		/* If there are pending notifications, do those - and nothing else */
		if (!list_empty(&rs->rs_notify_queue)) {
			ret = rds_notify_queue_get(rs, msg);
			break;
		}

		if (rs->rs_cong_notify) {
			ret = rds_notify_cong(rs, msg);
			break;
		}

		if (!rds_next_incoming(rs, &inc)) {
			if (nonblock) {
				bool reaped = rds_recvmsg_zcookie(rs, msg);

				ret = reaped ? 0 : -EAGAIN;
				break;
			}

			timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
					(!list_empty(&rs->rs_notify_queue) ||
					 rs->rs_cong_notify ||
					 rds_next_incoming(rs, &inc)), timeo);
			rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
				 timeo);
			if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
				continue;

			ret = timeo;
			if (ret == 0)
				ret = -ETIMEDOUT;
			break;
		}
rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
&inc->i_conn->c_faddr,
ntohs(inc->i_hdr.h_sport));
ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); if (ret < 0) break;
/* * if the message we just copied isn't at the head of the * recv queue then someone else raced us to return it, try * to get the next message.
*/ if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
rds_inc_put(inc);
inc = NULL;
rds_stats_inc(s_recv_deliver_raced);
iov_iter_revert(&msg->msg_iter, ret); continue;
}
		if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
			if (msg_flags & MSG_TRUNC)
				ret = be32_to_cpu(inc->i_hdr.h_len);
			msg->msg_flags |= MSG_TRUNC;
		}
		if (rds_cmsg_recv(inc, msg, rs)) {
			ret = -EFAULT;
			break;
		}
		rds_recvmsg_zcookie(rs, msg);

		break;
	}

	if (inc)
		rds_inc_put(inc);

out:
	return ret;
}
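/*
 * Usage note (illustrative, assumed application code): because the copy
 * above caps ret at the iov size and only sets MSG_TRUNC when the message
 * was larger, a caller can probe a message's full length without consuming
 * it by combining MSG_PEEK with MSG_TRUNC:
 *
 *	ssize_t len = recvmsg(fd, &mh, MSG_PEEK | MSG_TRUNC);
 *
 * which returns the untruncated h_len while leaving the message queued,
 * since rds_still_queued() is called with drop == 0 for MSG_PEEK.
 */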
/*
 * The socket is being shut down and we're asked to drop messages that were
 * queued for recvmsg.  The caller has unbound the socket so the receive path
 * won't queue any more incoming fragments or messages on the socket.
 */
void rds_clear_recv_queue(struct rds_sock *rs)
{
	struct sock *sk = rds_rs_to_sk(rs);
	struct rds_incoming *inc, *tmp;
	unsigned long flags;
	LIST_HEAD(to_drop);

	write_lock_irqsave(&rs->rs_recv_lock, flags);
	list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				      -be32_to_cpu(inc->i_hdr.h_len),
				      inc->i_hdr.h_dport);
		list_move(&inc->i_item, &to_drop);
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

	list_for_each_entry_safe(inc, tmp, &to_drop, i_item) {
		list_del_init(&inc->i_item);
		rds_inc_put(inc);
	}
}
/*
 * inc->i_saddr isn't used here because it is only set in the receive
 * path.
 */
void rds_inc_info_copy(struct rds_incoming *inc,
		       struct rds_info_iterator *iter,
		       __be32 saddr, __be32 daddr, int flip)
{
	struct rds_info_message minfo;

	minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
	minfo.len = be32_to_cpu(inc->i_hdr.h_len);

	if (flip) {
		minfo.laddr = daddr;
		minfo.faddr = saddr;
		minfo.lport = inc->i_hdr.h_dport;
		minfo.fport = inc->i_hdr.h_sport;
	} else {
		minfo.laddr = saddr;
		minfo.faddr = daddr;
		minfo.lport = inc->i_hdr.h_sport;
		minfo.fport = inc->i_hdr.h_dport;
	}

	minfo.flags = 0;

	rds_info_copy(iter, &minfo, sizeof(minfo));
}