// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. ** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. ** ** *******************************************************************************
******************************************************************************/
/* * lowcomms.c * * This is the "low-level" comms layer. * * It is responsible for sending/receiving messages * from other nodes in the cluster. * * Cluster nodes are referred to by their nodeids. nodeids are * simply 32 bit numbers to the locking module - if they need to * be expanded for the cluster infrastructure then that is its * responsibility. It is this layer's * responsibility to resolve these into IP address or * whatever it needs for inter-node communication. * * The comms level is two kernel threads that deal mainly with * the receiving of messages from other nodes and passing them * up to the mid-level comms layer (which understands the * message format) for execution by the locking core, and * a send thread which does all the setting up of connections * to remote nodes and the sending of data. Threads are not allowed * to send their own data because it may cause them to wait in times * of high load. Also, this way, the sending thread can collect together * messages bound for one node and send them in one block. * * lowcomms will choose to use either TCP or SCTP as its transport layer * depending on the configuration variable 'protocol'. This should be set * to 0 (default) for TCP or 1 for SCTP. It should be configured using a * cluster-wide mechanism as it must be the same on all nodes of the cluster * for the DLM to function. *
*/
struct connection { struct socket *sock; /* NULL if not connected */
uint32_t nodeid; /* So we know who we are in the list */ /* this semaphore is used to allow parallel recv/send in read * lock mode. When we release a sock we need to held the write lock. * * However this is locking code and not nice. When we remove the * othercon handling we can look into other mechanism to synchronize * io handling to call sock_release() at the right time.
*/ struct rw_semaphore sock_lock; unsignedlong flags; #define CF_APP_LIMITED 0 #define CF_RECV_PENDING 1 #define CF_SEND_PENDING 2 #define CF_RECV_INTR 3 #define CF_IO_STOP 4 #define CF_IS_OTHERCON 5 struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock; int retries; struct hlist_node list; /* due some connect()/accept() races we currently have this cross over * connection attempt second connection for one node. * * There is a solution to avoid the race by introducing a connect * rule as e.g. our_nodeid > nodeid_to_connect who is allowed to * connect. Otherside can connect but will only be considered that * the other side wants to have a reconnect. * * However changing to this behaviour will break backwards compatible. * In a DLM protocol major version upgrade we should remove this!
*/ struct connection *othercon; struct work_struct rwork; /* receive worker */ struct work_struct swork; /* send worker */
wait_queue_head_t shutdown_wait; unsignedchar rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE]; int rx_leftover; int mark; int addr_count; int curr_addr_index; struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT];
spinlock_t addrs_lock; struct rcu_head rcu;
}; #define sock2con(x) ((struct connection *)(x)->sk_user_data)
/* An entry waiting to be sent */
struct writequeue_entry {
	struct list_head list;		/* linkage on con->writequeue */
	struct page *page;		/* backing page holding the payload */
	int offset;			/* start of unsent data within the page */
	int len;			/* bytes remaining to be sent */
	int end;			/* end of used space within the page */
	int users;			/* writers still filling this entry */
	bool dirty;			/* true once partially transmitted */
	struct connection *con;		/* owning connection */
	struct list_head msgs;		/* dlm_msg objects carried by this entry */
	struct kref ref;		/* refcount for entry lifetime */
};
/* need to held writequeue_lock */ staticstruct writequeue_entry *con_next_wq(struct connection *con)
{ struct writequeue_entry *e;
e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry,
list); /* if len is zero nothing is to send, if there are users filling * buffers we wait until the users are done so we can send more.
*/ if (!e || e->users || e->len == 0) return NULL;
return e;
}
staticstruct connection *__find_con(int nodeid, int r)
{ struct connection *con;
/* * If 'allocation' is zero then we don't attempt to create a new * connection structure for this node.
*/ staticstruct connection *nodeid2con(int nodeid, gfp_t alloc)
{ struct connection *con, *tmp; int r;
r = nodeid_hash(nodeid);
con = __find_con(nodeid, r); if (con || !alloc) return con;
con = kzalloc(sizeof(*con), alloc); if (!con) return NULL;
dlm_con_init(con, nodeid);
spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. Instead of locking hot path __find_con() * we just check in rare cases of recently added nodes again * under protection of connections_lock. If this is the case we * abort our connection creation and return the existing connection.
*/
tmp = __find_con(nodeid, r); if (tmp) {
spin_unlock(&connections_lock);
kfree(con); return tmp;
}
/* Data available on socket or listen socket received a connect */ staticvoid lowcomms_data_ready(struct sock *sk)
{ struct connection *con = sock2con(sk);
/* Socket state-change callback installed on DLM sockets.
 * SCTP layer is not calling sk_data_ready when the connection
 * is done, so we catch the signal through here and treat a
 * received shutdown like incoming data so the recv worker runs.
 */
static void lowcomms_state_change(struct sock *sk)
{
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		lowcomms_data_ready(sk);
}
/* NOTE(review): this extract is garbled — "staticvoid" below is a fused
 * token from text extraction (should read "static void"), and the function
 * tail appears to have been replaced by an unrelated fragment (see note
 * further down). Code left byte-identical; confirm against upstream
 * fs/dlm/lowcomms.c before building.
 */
/* Close a remote connection and tidy up */ staticvoid close_connection(struct connection *con, bool and_other)
{ struct writequeue_entry *e;
/* first tear down the crossed-connection twin, if requested */
if (con->othercon && and_other)
close_connection(con->othercon, false);
/* write lock excludes concurrent recv/send while the sock is released */
down_write(&con->sock_lock); if (!con->sock) {
up_write(&con->sock_lock); return;
}
dlm_close_sock(&con->sock);
/* if we send a writequeue entry only a half way, we drop the * whole entry because reconnection and that we not start of the * middle of a msg which will confuse the other end. * * we can always drop messages because retransmits, but what we * cannot allow is to transmit half messages which may be processed * at the other side. * * our policy is to start on a clean state when disconnects, we don't * know what's send/received on transport layer in this case.
*/
spin_lock_bh(&con->writequeue_lock); if (!list_empty(&con->writequeue)) {
e = list_first_entry(&con->writequeue, struct writequeue_entry,
list); if (e->dirty)
free_entry(e);
}
spin_unlock_bh(&con->writequeue_lock);
/* NOTE(review): the lines below reference "pentry", "processqueue_count",
 * "processqueue_wq" and "processqueue_lock", none of which are declared in
 * this function — the original tail of close_connection (resetting
 * rx_leftover/retries, clearing CF_* bits, up_write of sock_lock) appears
 * lost and replaced by a fragment of a processqueue helper. TODO: restore
 * from the upstream source.
 */
list_del(&pentry->list); if (atomic_dec_and_test(&processqueue_count))
wake_up(&processqueue_wq);
spin_unlock_bh(&processqueue_lock);
}
}
/* NOTE(review): garbled extract — "staticint" is a fused token (should be
 * "static int"), and the actual socket receive step is missing: `ret` is
 * read below before ever being assigned, and `msg`/`iov` are declared but
 * never used. Upstream fills pentry->buf (leftover memcpy + kernel_recvmsg
 * with EAGAIN/EOF handling) between the allocation and the validation step.
 * Code left byte-identical; restore from upstream before building.
 */
/* Data received from remote end */ staticint receive_from_sock(struct connection *con, int buflen)
{ struct processqueue_entry *pentry; int ret, buflen_real; struct msghdr msg; struct kvec iov;
pentry = new_processqueue_entry(con->nodeid, buflen); if (!pentry) return DLM_IO_RESCHED;
/* new buflen according readed bytes and leftover from last receive */
buflen_real = ret + con->rx_leftover;
/* validation trims the buffer to whole DLM messages; returns bytes usable */
ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf,
buflen_real); if (ret < 0) {
free_processqueue_entry(pentry); return ret;
}
pentry->buflen = ret;
/* calculate leftover bytes from process and put it into begin of * the receive buffer, so next receive we have the full message * at the start address of the receive buffer.
*/
con->rx_leftover = buflen_real - ret;
memmove(con->rx_leftover_buf, pentry->buf + ret,
con->rx_leftover);
/* hand the complete messages to the process worker */
spin_lock_bh(&processqueue_lock);
ret = atomic_inc_return(&processqueue_count);
list_add_tail(&pentry->list, &processqueue); if (!process_dlm_messages_pending) {
process_dlm_messages_pending = true;
queue_work(process_workqueue, &process_work);
}
spin_unlock_bh(&processqueue_lock);
/* backpressure: tell the caller to wait until the queue drains */
if (ret > DLM_MAX_PROCESS_BUFFERS) return DLM_IO_FLUSH;
return DLM_IO_SUCCESS;
}
/* Listening socket is busy, accept a connection */ staticint accept_from_sock(void)
{ struct sockaddr_storage peeraddr; int len, idx, result, nodeid; struct connection *newcon; struct socket *newsock; unsignedint mark;
result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK); if (result == -EAGAIN) return DLM_IO_END; elseif (result < 0) goto accept_err;
/* Get the connected socket's peer */
memset(&peeraddr, 0, sizeof(peeraddr));
len = newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, 2); if (len < 0) {
result = -ECONNABORTED; goto accept_err;
}
/* Get the new node's NODEID */
make_sockaddr(&peeraddr, 0, &len); if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) { switch (peeraddr.ss_family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr;
log_print("connect from non cluster IPv4 node %pI4",
&sin->sin_addr); break;
} #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr;
log_print("connect from non cluster IPv6 node %pI6c",
&sin6->sin6_addr); break;
} #endif default:
log_print("invalid family from non cluster node"); break;
}
sock_release(newsock); return -1;
}
log_print("got connection from %d", nodeid);
/* Check to see if we already have a connection to this node. This * could happen if the two nodes initiate a connection at roughly * the same time and the connections cross on the wire. * In this case we store the incoming one in "othercon"
*/
idx = srcu_read_lock(&connections_srcu);
newcon = nodeid2con(nodeid, 0); if (WARN_ON_ONCE(!newcon)) {
srcu_read_unlock(&connections_srcu, idx);
result = -ENOENT; goto accept_err;
}
sock_set_mark(newsock->sk, mark);
down_write(&newcon->sock_lock); if (newcon->sock) { struct connection *othercon = newcon->othercon;
if (!othercon) {
othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) {
log_print("failed to allocate incoming socket");
up_write(&newcon->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
result = -ENOMEM; goto accept_err;
}
dlm_con_init(othercon, nodeid);
lockdep_set_subclass(&othercon->sock_lock, 1);
newcon->othercon = othercon;
set_bit(CF_IS_OTHERCON, &othercon->flags);
} else { /* close other sock con if we have something new */
close_connection(othercon, false);
}
/* check if we receved something while adding */
lock_sock(othercon->sock->sk);
lowcomms_queue_rwork(othercon);
release_sock(othercon->sock->sk);
up_write(&othercon->sock_lock);
} else { /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will
result in calling sk_error_report recursively. */
add_sock(newsock, newcon);
/* check if we receved something while adding */
lock_sock(newcon->sock->sk);
lowcomms_queue_rwork(newcon);
release_sock(newcon->sock->sk);
}
up_write(&newcon->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
return DLM_IO_SUCCESS;
accept_err: if (newsock)
sock_release(newsock);
return result;
}
/*
 * writequeue_entry_complete - account completed bytes and free if done
 * @e: write queue entry to try to delete
 * @completed: bytes completed
 *
 * writequeue_lock must be held.
 */
static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
{
	e->offset += completed;
	e->len -= completed;
	/* signal that page was half way transmitted */
	e->dirty = true;

	/* free only when fully sent and no writers are still filling it */
	if (e->len == 0 && e->users == 0)
		free_entry(e);
}
/*
 * sctp_bind_addrs - bind a SCTP socket to all our local addresses
 *
 * The first address uses kernel_bind(); subsequent ones are added with
 * sock_bind_add() (SCTP multi-homing). Stops and returns the error of the
 * first failing bind, or 0 on success.
 */
static int sctp_bind_addrs(struct socket *sock, __be16 port)
{
	struct sockaddr_storage localaddr;
	struct sockaddr *addr = (struct sockaddr *)&localaddr;
	int i, addr_len, result = 0;

	for (i = 0; i < dlm_local_count; i++) {
		memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr));
		make_sockaddr(&localaddr, port, &addr_len);

		if (!i)
			result = kernel_bind(sock, addr, addr_len);
		else
			result = sock_bind_add(sock->sk, addr, addr_len);

		if (result < 0) {
			log_print("Can't bind to %d addr number %d, %d.\n",
				  port, i + 1, result);
			break;
		}
	}
	return result;
}
/* Get local addresses */ staticvoid init_local(void)
{ struct sockaddr_storage sas; int i;
dlm_local_count = 0; for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) { if (dlm_our_addr(&sas, i)) break;
/* avoid false positive for nodes_srcu, unlock happens in * dlm_lowcomms_commit_msg which is a must call if success
*/ #ifndef __CHECKER__ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, char **ppc, void (*cb)(void *data), void *data)
{ struct connection *con; struct dlm_msg *msg; int idx;
if (len > DLM_MAX_SOCKET_BUFSIZE ||
len < sizeof(struct dlm_header)) {
BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE);
log_print("failed to allocate a buffer of size %d", len);
WARN_ON_ONCE(1); return NULL;
}
idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, 0); if (WARN_ON_ONCE(!con)) {
srcu_read_unlock(&connections_srcu, idx); return NULL;
}
/* does not held connections_srcu, usage lowcomms_error_report only */ int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
{ struct dlm_msg *msg_resend; char *ppc;
/* for debugging we print when we are done to compare with other * messages in between. This function need to be correctly synchronized * with io handling
*/
log_print("closing connection to node %d done", nodeid);
down_read(&con->sock_lock); if (!con->sock) {
up_read(&con->sock_lock); return;
}
buflen = READ_ONCE(dlm_config.ci_buffer_size); do {
ret = receive_from_sock(con, buflen);
} while (ret == DLM_IO_SUCCESS);
up_read(&con->sock_lock);
switch (ret) { case DLM_IO_END: /* CF_RECV_PENDING cleared */ break; case DLM_IO_EOF:
close_connection(con, false);
wake_up(&con->shutdown_wait); /* CF_RECV_PENDING cleared */ break; case DLM_IO_FLUSH: /* we can't flush the process_workqueue here because a * WQ_MEM_RECLAIM workequeue can occurr a deadlock for a non * WQ_MEM_RECLAIM workqueue such as process_workqueue. Instead * we have a waitqueue to wait until all messages are * processed. * * This handling is only necessary to backoff the sender and * not queue all messages from the socket layer into DLM * processqueue. When DLM is capable to parse multiple messages * on an e.g. per socket basis this handling can might be * removed. Especially in a message burst we are too slow to * process messages and the queue will fill up memory.
*/
wait_event(processqueue_wq, !atomic_read(&processqueue_count));
fallthrough; case DLM_IO_RESCHED:
cond_resched();
queue_work(io_workqueue, &con->rwork); /* CF_RECV_PENDING not cleared */ break; default: if (ret < 0) { if (test_bit(CF_IS_OTHERCON, &con->flags)) {
close_connection(con, false);
} else {
spin_lock_bh(&con->writequeue_lock);
lowcomms_queue_swork(con);
spin_unlock_bh(&con->writequeue_lock);
}
/* CF_RECV_PENDING cleared for othercon * we trigger send queue if not already done * and process_send_sockets will handle it
*/ break;
}
WARN_ON_ONCE(1); break;
}
}
staticvoid process_listen_recv_socket(struct work_struct *work)
{ int ret;
if (WARN_ON_ONCE(!listen_con.sock)) return;
do {
ret = accept_from_sock();
} while (ret == DLM_IO_SUCCESS);
memset(&addr, 0, sizeof(addr));
result = nodeid_to_addr(con->nodeid, &addr, NULL,
dlm_proto_ops->try_new_addr, &mark); if (result < 0) {
log_print("no address for nodeid %d", con->nodeid); return result;
}
/* Create a socket to communicate with */
result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) return result;
result = dlm_proto_ops->bind(sock); if (result < 0) {
sock_release(sock); return result;
}
add_sock(sock, con);
log_print_ratelimited("connecting to %d", con->nodeid);
make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len);
result = kernel_connect(sock, (struct sockaddr *)&addr, addr_len, 0); switch (result) { case -EINPROGRESS: /* not an error */
fallthrough; case 0: break; default: if (result < 0)
dlm_close_sock(&con->sock);
break;
}
return result;
}
/* Send worker function */ staticvoid process_send_sockets(struct work_struct *work)
{ struct connection *con = container_of(work, struct connection, swork); int ret;
down_read(&con->sock_lock); if (!con->sock) {
up_read(&con->sock_lock);
down_write(&con->sock_lock); if (!con->sock) {
ret = dlm_connect(con); switch (ret) { case 0: break; default: /* CF_SEND_PENDING not cleared */
up_write(&con->sock_lock);
log_print("connect to node %d try %d error %d",
con->nodeid, con->retries++, ret);
msleep(1000); /* For now we try forever to reconnect. In * future we should send a event to cluster * manager to fence itself after certain amount * of retries.
*/
queue_work(io_workqueue, &con->swork); return;
}
}
downgrade_write(&con->sock_lock);
}
do {
ret = send_to_sock(con);
} while (ret == DLM_IO_SUCCESS);
up_read(&con->sock_lock);
switch (ret) { case DLM_IO_END: /* CF_SEND_PENDING cleared */ break; case DLM_IO_RESCHED: /* CF_SEND_PENDING not cleared */
cond_resched();
queue_work(io_workqueue, &con->swork); break; default: if (ret < 0) {
close_connection(con, false);
/* Bind to our cluster-known address connecting to avoid * routing problems.
*/
memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
make_sockaddr(&src_addr, 0, &addr_len);
result = kernel_bind(sock, (struct sockaddr *)&src_addr,
addr_len); if (result < 0) { /* This *may* not indicate a critical error */
log_print("could not bind for connect: %d", result);
}
return 0;
}
staticint dlm_tcp_listen_validate(void)
{ /* We don't support multi-homed hosts */ if (dlm_local_count > 1) {
log_print("Detect multi-homed hosts but use only the first IP address.");
log_print("Try SCTP, if you want to enable multi-link.");
}
/*
 * NOTE(review): the following is a German-language website disclaimer that
 * was scraped into this file by mistake — it is not part of this source.
 * English translation kept for reference:
 *
 * "The information on this web page has been compiled carefully to the best
 * of our knowledge. However, neither completeness, nor correctness, nor
 * quality of the provided information is guaranteed.
 * Note: the colored syntax display and the measurement are still
 * experimental."
 */