// SPDX-License-Identifier: GPL-2.0-only /* * VMware vSockets Driver * * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
*/
/* Implementation notes: * * - There are two kinds of sockets: those created by user action (such as * calling socket(2)) and those created by incoming connection request packets. * * - There are two "global" tables, one for bound sockets (sockets that have * specified an address that they are responsible for) and one for connected * sockets (sockets that have established a connection with another socket). * These tables are "global" in that all sockets on the system are placed * within them. - Note, though, that the bound table contains an extra entry * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in * that list. The bound table is used solely for lookup of sockets when packets * are received and that's not necessary for SOCK_DGRAM sockets since we create * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM * sockets out of the bound hash buckets will reduce the chance of collisions * when looking for SOCK_STREAM sockets and prevents us from having to check the * socket type in the hash table lookups. * * - Sockets created by user action will either be "client" sockets that * initiate a connection or "server" sockets that listen for connections; we do * not support simultaneous connects (two "client" sockets connecting). * * - "Server" sockets are referred to as listener sockets throughout this * implementation because they are in the TCP_LISTEN state. When a * connection request is received (the second kind of socket mentioned above), * we create a new socket and refer to it as a pending socket. These pending * sockets are placed on the pending connection list of the listener socket. * When future packets are received for the address the listener socket is * bound to, we check if the source of the packet is from one that has an * existing pending connection. If it does, we process the packet for the * pending socket. 
When that socket reaches the connected state, it is removed * from the listener socket's pending list and enqueued in the listener * socket's accept queue. Callers of accept(2) will accept connected sockets * from the listener socket's accept queue. If the socket cannot be accepted * for some reason then it is marked rejected. Once the connection is * accepted, it is owned by the user process and the responsibility for cleanup * falls with that user process. * * - It is possible that these pending sockets will never reach the connected * state; in fact, we may never receive another packet after the connection * request. Because of this, we must schedule a cleanup function to run in the * future, after some amount of time passes where a connection should have been * established. This function ensures that the socket is off all lists so it * cannot be retrieved, then drops all references to the socket so it is cleaned * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this * function will also cleanup rejected sockets, those that reach the connected * state but leave it before they have been accepted. * * - Lock ordering for pending or accept queue sockets is: * * lock_sock(listener); * lock_sock_nested(pending, SINGLE_DEPTH_NESTING); * * Using explicit nested locking keeps lockdep happy since normally only one * lock of a given class may be taken at a time. * * - Sockets created by user action will be cleaned up when the user process * calls close(2), causing our release implementation to be called. Our release * implementation will perform some cleanup then drop the last reference so our * sk_destruct implementation is invoked. Our sk_destruct implementation will * perform additional cleanup that's common for both types of sockets. * * - A socket's reference count is what ensures that the structure won't be * freed. 
Each entry in a list (such as the "global" bound and connected tables * and the listener socket's pending list and connected queue) ensures a * reference. When we defer work until process context and pass a socket as our * argument, we must ensure the reference count is increased to ensure the * socket isn't freed before the function is run; the deferred function will * then drop the reference. * * - sk->sk_state uses the TCP state constants because they are widely used by * other address families and exposed to userspace tools like ss(8): * * TCP_CLOSE - unconnected * TCP_SYN_SENT - connecting * TCP_ESTABLISHED - connected * TCP_CLOSING - disconnecting * TCP_LISTEN - listening
*/
/* The default peer timeout indicates how long we will wait for a peer response
 * to a control message.
 */
#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)

/* Transport used for host->guest communication */
static const struct vsock_transport *transport_h2g;

/* Transport used for guest->host communication */
static const struct vsock_transport *transport_g2h;

/* Transport used for DGRAM communication */
static const struct vsock_transport *transport_dgram;

/* Transport used for local communication */
static const struct vsock_transport *transport_local;

/* Serializes registration/deregistration and reads of the transport
 * pointers above (see vsock_registered_transport_cid()).
 */
static DEFINE_MUTEX(vsock_register_mutex);
/**** UTILS ****/

/* Each bound VSocket is stored in the bind hash table and each connected
 * VSocket is stored in the connected hash table.
 *
 * Unbound sockets are all put on the same list attached to the end of the hash
 * table (vsock_unbound_sockets). Bound sockets are added to the hash table in
 * the bucket that their local address hashes to (vsock_bound_sockets(addr)
 * represents the list that addr hashes to).
 *
 * Specifically, we initialize the vsock_bind_table array to a size of
 * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
 * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
 * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function
 * mods with VSOCK_HASH_SIZE to ensure this.
 */

/* Maximum number of candidate ports tried when auto-assigning a local port. */
#define MAX_PORT_RETRIES 24
/* Autobind this socket to the local address if necessary. */ staticint vsock_auto_bind(struct vsock_sock *vsk)
{ struct sock *sk = sk_vsock(vsk); struct sockaddr_vm local_addr;
void vsock_remove_sock(struct vsock_sock *vsk)
{ /* Transport reassignment must not remove the binding. */ if (sock_flag(sk_vsock(vsk), SOCK_DEAD))
vsock_remove_bound(vsk);
/* Assign a transport to a socket and call the .init transport callback. * * Note: for connection oriented socket this must be called when vsk->remote_addr * is set (e.g. during the connect() or when a connection request on a listener * socket is received). * The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if * g2h is not loaded, will use local transport; * - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field * includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport; * - remote CID > VMADDR_CID_HOST will use host->guest transport;
*/ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
{ conststruct vsock_transport *new_transport; struct sock *sk = sk_vsock(vsk); unsignedint remote_cid = vsk->remote_addr.svm_cid;
__u8 remote_flags; int ret;
/* If the packet is coming with the source and destination CIDs higher * than VMADDR_CID_HOST, then a vsock channel where all the packets are * forwarded to the host should be established. Then the host will * need to forward the packets to the guest. * * The flag is set on the (listen) receive path (psk is not NULL). On * the connect path the flag can be set by the user space application.
*/ if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST &&
vsk->remote_addr.svm_cid > VMADDR_CID_HOST)
vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST;
remote_flags = vsk->remote_addr.svm_flags;
mutex_lock(&vsock_register_mutex);
switch (sk->sk_type) { case SOCK_DGRAM:
new_transport = transport_dgram; break; case SOCK_STREAM: case SOCK_SEQPACKET: if (vsock_use_local_transport(remote_cid))
new_transport = transport_local; elseif (remote_cid <= VMADDR_CID_HOST || !transport_h2g ||
(remote_flags & VMADDR_FLAG_TO_HOST))
new_transport = transport_g2h; else
new_transport = transport_h2g; break; default:
ret = -ESOCKTNOSUPPORT; goto err;
}
if (vsk->transport && vsk->transport == new_transport) {
ret = 0; goto err;
}
/* We increase the module refcnt to prevent the transport unloading * while there are open sockets assigned to it.
*/ if (!new_transport || !try_module_get(new_transport->module)) {
ret = -ENODEV; goto err;
}
/* It's safe to release the mutex after a successful try_module_get(). * Whichever transport `new_transport` points at, it won't go away until * the last module_put() below or in vsock_deassign_transport().
*/
mutex_unlock(&vsock_register_mutex);
if (vsk->transport) { /* transport->release() must be called with sock lock acquired. * This path can only be taken during vsock_connect(), where we * have already held the sock lock. In the other cases, this * function is called on a new socket which is not assigned to * any transport.
*/
vsk->transport->release(vsk);
vsock_deassign_transport(vsk);
/* transport's release() and destruct() can touch some socket * state, since we are reassigning the socket to a new transport * during vsock_connect(), let's reset these fields to have a * clean state.
*/
sock_reset_flag(sk, SOCK_DONE);
sk->sk_state = TCP_CLOSE;
vsk->peer_shutdown = 0;
}
if (sk->sk_type == SOCK_SEQPACKET) { if (!new_transport->seqpacket_allow ||
!new_transport->seqpacket_allow(remote_cid)) {
module_put(new_transport->module); return -ESOCKTNOSUPPORT;
}
}
ret = new_transport->init(vsk, psk); if (ret) {
module_put(new_transport->module); return ret;
}
/* * Provide safe access to static transport_{h2g,g2h,dgram,local} callbacks. * Otherwise we may race with module removal. Do not use on `vsk->transport`.
*/ static u32 vsock_registered_transport_cid(conststruct vsock_transport **transport)
{
u32 cid = VMADDR_CID_ANY;
mutex_lock(&vsock_register_mutex); if (*transport)
cid = (*transport)->get_local_cid();
mutex_unlock(&vsock_register_mutex);
return cid;
}
bool vsock_find_cid(unsignedint cid)
{ if (cid == vsock_registered_transport_cid(&transport_g2h)) returntrue;
if (transport_h2g && cid == VMADDR_CID_HOST) returntrue;
if (transport_local && cid == VMADDR_CID_LOCAL) returntrue;
list_del_init(&vconnected->accept_queue);
sock_put(listener); /* The caller will need a reference on the connected socket so we let * it call sock_put().
*/
if (vsock_is_pending(sk)) {
vsock_remove_pending(listener, sk);
sk_acceptq_removed(listener);
} elseif (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We * just need to drop our references to the sockets and be on * our way.
*/
cleanup = false; goto out;
}
/* We need to remove ourself from the global connected sockets list so * incoming packets can't find this socket, and to reduce the reference * count.
*/
vsock_remove_connected(vsk);
sk->sk_state = TCP_CLOSE;
out:
release_sock(sk);
release_sock(listener); if (cleanup)
sock_put(sk);
if (addr->svm_port == VMADDR_PORT_ANY) { bool found = false; unsignedint i;
for (i = 0; i < MAX_PORT_RETRIES; i++) { if (port == VMADDR_PORT_ANY ||
port <= LAST_RESERVED_PORT)
port = LAST_RESERVED_PORT + 1;
new_addr.svm_port = port++;
if (!__vsock_find_bound_socket(&new_addr)) {
found = true; break;
}
}
if (!found) return -EADDRNOTAVAIL;
} else { /* If port is in reserved range, ensure caller * has necessary privileges.
*/ if (addr->svm_port <= LAST_RESERVED_PORT &&
!capable(CAP_NET_BIND_SERVICE)) { return -EACCES;
}
if (__vsock_find_bound_socket(&new_addr)) return -EADDRINUSE;
}
/* Remove connection oriented sockets from the unbound list and add them * to the hash table for easy lookup by its address. The unbound list * is simply an extra entry at the end of the hash table, a trick used * by AF_UNIX.
*/
__vsock_remove_bound(vsk);
__vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);
/* First ensure this socket isn't already bound. */ if (vsock_addr_bound(&vsk->local_addr)) return -EINVAL;
/* Now bind to the provided address or select appropriate values if * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that * like AF_INET prevents binding to a non-local IP address (in most * cases), we only allow binding to a local CID.
*/ if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid)) return -EADDRNOTAVAIL;
switch (sk->sk_socket->type) { case SOCK_STREAM: case SOCK_SEQPACKET:
spin_lock_bh(&vsock_table_lock);
retval = __vsock_bind_connectible(vsk, addr);
spin_unlock_bh(&vsock_table_lock); break;
case SOCK_DGRAM:
retval = __vsock_bind_dgram(vsk, addr); break;
sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); if (!sk) return NULL;
sock_init_data(sock, sk);
/* sk->sk_type is normally set in sock_init_data, but only if sock is * non-NULL. We make sure that our sockets always have a type by * setting it here if needed.
*/ if (!sock)
sk->sk_type = type;
/* When "level" is SINGLE_DEPTH_NESTING, use the nested * version to avoid the warning "possible recursive locking * detected". When "level" is 0, lock_sock_nested(sk, level) * is the same as lock_sock(sk).
*/
lock_sock_nested(sk, level);
/* Indicate to vsock_remove_sock() that the socket is being released and * can be removed from the bound_table. Unlike transport reassignment * case, where the socket must remain bound despite vsock_remove_sock() * being called from the transport release() callback.
*/
sock_set_flag(sk, SOCK_DEAD);
if (vsk->transport)
vsk->transport->release(vsk); elseif (sock_type_connectible(sk->sk_type))
vsock_remove_sock(vsk);
sock_orphan(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
skb_queue_purge(&sk->sk_receive_queue);
/* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) {
__vsock_release(pending, SINGLE_DEPTH_NESTING);
sock_put(pending);
}
/* When clearing these addresses, there's no need to set the family and * possibly register the address family with the kernel.
*/
vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
put_cred(vsk->owner);
}
staticint vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{ int err;
err = sock_queue_rcv_skb(sk, skb); if (err)
kfree_skb(skb);
if (vsock_stream_has_data(vsk) >= sk->sk_rcvlowat ||
sock_flag(sk, SOCK_DONE))
sk->sk_data_ready(sk);
}
EXPORT_SYMBOL_GPL(vsock_data_ready);
/* Dummy callback required by sockmap.
 * See unconditional call of saved_close() in sock_map_close().
 */
static void vsock_close(struct sock *sk, long timeout)
{
	/* Intentionally a no-op: sock_map_close() invokes saved_close()
	 * unconditionally, so a non-NULL callback must exist.
	 */
}
/* sys_getsockname() and sys_getpeername() pass us a * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately * that macro is defined in socket.c instead of .h, so we hardcode its * value here.
*/
BUILD_BUG_ON(sizeof(*vm_addr) > 128);
memcpy(addr, vm_addr, sizeof(*vm_addr));
err = sizeof(*vm_addr);
timeout = sk->sk_lingertime; if (!timeout) return;
/* Transports must implement `unsent_bytes` if they want to support * SOCK_LINGER through `vsock_linger()` since we use it to check when * the socket can be closed.
*/
unsent = vsk->transport->unsent_bytes; if (!unsent) return;
add_wait_queue(sk_sleep(sk), &wait);
do { if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait)) break;
} while (!signal_pending(current) && timeout);
staticint vsock_shutdown(struct socket *sock, int mode)
{ int err; struct sock *sk;
/* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode * here like the other address families do. Note also that the * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3), * which is what we want.
*/
mode++;
if ((mode & ~SHUTDOWN_MASK) || !mode) return -EINVAL;
/* If this is a connection oriented socket and it is not connected then * bail out immediately. If it is a DGRAM socket then we must first * kick the socket so that it wakes up from any sleeping calls, for * example recv(), and then afterwards return the error.
*/
if (sk_is_readable(sk))
mask |= EPOLLIN | EPOLLRDNORM;
if (sock->type == SOCK_DGRAM) { /* For datagram sockets we can read if there is something in * the queue and write as long as the socket isn't shutdown for * sending.
*/ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
(sk->sk_shutdown & RCV_SHUTDOWN)) {
mask |= EPOLLIN | EPOLLRDNORM;
}
/* Listening sockets that have connections in their accept * queue can be read.
*/ if (sk->sk_state == TCP_LISTEN
&& !vsock_is_accept_queue_empty(sk))
mask |= EPOLLIN | EPOLLRDNORM;
/* If there is something in the queue then we can read. */ if (transport && transport->stream_is_active(vsk) &&
!(sk->sk_shutdown & RCV_SHUTDOWN)) { bool data_ready_now = false; int target = sock_rcvlowat(sk, 0, INT_MAX); int ret = transport->notify_poll_in(
vsk, target, &data_ready_now); if (ret < 0) {
mask |= EPOLLERR;
} else { if (data_ready_now)
mask |= EPOLLIN | EPOLLRDNORM;
}
}
/* Sockets whose connections have been closed, reset, or * terminated should also be considered read, and we check the * shutdown flag for that.
*/ if (sk->sk_shutdown & RCV_SHUTDOWN ||
vsk->peer_shutdown & SEND_SHUTDOWN) {
mask |= EPOLLIN | EPOLLRDNORM;
}
/* Connected sockets that can produce data can be written. */ if (transport && sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out(
vsk, 1, &space_avail_now); if (ret < 0) {
mask |= EPOLLERR;
} else { if (space_avail_now) /* Remove EPOLLWRBAND since INET * sockets are not setting it.
*/
mask |= EPOLLOUT | EPOLLWRNORM;
}
}
}
/* Simulate INET socket poll behaviors, which sets * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown.
*/ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { if (!(sk->sk_shutdown & SEND_SHUTDOWN))
mask |= EPOLLOUT | EPOLLWRNORM;
/* For now, MSG_DONTWAIT is always assumed... */
err = 0;
sk = sock->sk;
vsk = vsock_sk(sk);
lock_sock(sk);
transport = vsk->transport;
err = vsock_auto_bind(vsk); if (err) goto out;
/* If the provided message contains an address, use that. Otherwise * fall back on the socket's remote handle (if it has been connected).
*/ if (msg->msg_name &&
vsock_addr_cast(msg->msg_name, msg->msg_namelen,
&remote_addr) == 0) { /* Ensure this address is of the right type and is a valid * destination.
*/
if (remote_addr->svm_cid == VMADDR_CID_ANY)
remote_addr->svm_cid = transport->get_local_cid();
/* sock map disallows redirection of non-TCP sockets with sk_state != * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams. * * This doesn't seem to be abnormal state for datagram sockets, as the * same approach can be see in other datagram socket types as well * (such as unix sockets).
*/
sk->sk_state = TCP_ESTABLISHED;
staticint vsock_connect(struct socket *sock, struct sockaddr *addr, int addr_len, int flags)
{ int err; struct sock *sk; struct vsock_sock *vsk; conststruct vsock_transport *transport; struct sockaddr_vm *remote_addr; long timeout;
DEFINE_WAIT(wait);
err = 0;
sk = sock->sk;
vsk = vsock_sk(sk);
lock_sock(sk);
/* XXX AF_UNSPEC should make us disconnect like AF_INET. */ switch (sock->state) { case SS_CONNECTED:
err = -EISCONN; goto out; case SS_DISCONNECTING:
err = -EINVAL; goto out; case SS_CONNECTING: /* This continues on so we can move sock into the SS_CONNECTED * state once the connection has completed (at which point err * will be set to zero also). Otherwise, we will either wait * for the connection or return -EALREADY should this be a * non-blocking call.
*/
err = -EALREADY; if (flags & O_NONBLOCK) goto out; break; default: if ((sk->sk_state == TCP_LISTEN) ||
vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
err = -EINVAL; goto out;
}
/* Set the remote address that we are connecting to. */
memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
err = vsock_assign_transport(vsk, NULL); if (err) goto out;
transport = vsk->transport;
/* The hypervisor and well-known contexts do not have socket * endpoints.
*/ if (!transport ||
!transport->stream_allow(remote_addr->svm_cid,
remote_addr->svm_port)) {
err = -ENETUNREACH; goto out;
}
if (vsock_msgzerocopy_allow(transport)) {
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
} elseif (sock_flag(sk, SOCK_ZEROCOPY)) { /* If this option was set before 'connect()', * when transport was unknown, check that this * feature is supported here.
*/
err = -EOPNOTSUPP; goto out;
}
err = vsock_auto_bind(vsk); if (err) goto out;
sk->sk_state = TCP_SYN_SENT;
err = transport->connect(vsk); if (err < 0) goto out;
/* sk_err might have been set as a result of an earlier * (failed) connect attempt.
*/
sk->sk_err = 0;
/* Mark sock as connecting and set the error code to in * progress in case this is a non-blocking connect.
*/
sock->state = SS_CONNECTING;
err = -EINPROGRESS;
}
/* The receive path will handle all communication until we are able to * enter the connected state. Here we wait for the connection to be * completed or a notification of an error.
*/
timeout = vsk->connect_timeout;
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
/* If the socket is already closing or it is in an error state, there * is no point in waiting.
*/ while (sk->sk_state != TCP_ESTABLISHED &&
sk->sk_state != TCP_CLOSING && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection * attempt, in case the peer doesn't respond in a * timely manner. We hold on to the socket until the * timeout fires.
*/
sock_hold(sk);
/* If the timeout function is already scheduled, * reschedule it, then ungrab the socket refcount to * keep it balanced.
*/ if (mod_delayed_work(system_wq, &vsk->connect_work,
timeout))
sock_put(sk);
/* Skip ahead to preserve error code set above. */ goto out_wait;
}
/* Wait for children sockets to appear; these are the new sockets * created upon connection establishment.
*/
timeout = sock_rcvtimeo(listener, arg->flags & O_NONBLOCK);
prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
/* If the listener socket has received an error, then we should * reject this socket and return. Note that we simply mark the * socket rejected, drop our reference, and let the cleanup * function handle the cleanup; the fact that we found it in * the listener's accept queue guarantees that the cleanup * function hasn't run yet.
*/ if (err) {
vconnected->rejected = true;
} else {
newsock->state = SS_CONNECTED;
sock_graft(connected, newsock); if (vsock_msgzerocopy_allow(vconnected->transport))
set_bit(SOCK_SUPPORT_ZC,
&connected->sk_socket->flags);
}
release_sock(connected);
sock_put(connected);
}
out:
release_sock(listener); return err;
}
staticint vsock_listen(struct socket *sock, int backlog)
{ int err; struct sock *sk; struct vsock_sock *vsk;
sk = sock->sk;
lock_sock(sk);
if (!sock_type_connectible(sk->sk_type)) {
err = -EOPNOTSUPP; goto out;
}
/* Callers should not provide a destination with connection oriented * sockets.
*/ if (msg->msg_namelen) {
err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out;
}
/* Send data only if both sides are not shutdown in the direction. */ if (sk->sk_shutdown & SEND_SHUTDOWN ||
vsk->peer_shutdown & RCV_SHUTDOWN) {
err = -EPIPE; goto out;
}
/* These checks occur both as part of and after the loop * conditional since we need to check before and after * sleeping.
*/ if (sk->sk_err) {
err = -sk->sk_err; goto out_err;
} elseif ((sk->sk_shutdown & SEND_SHUTDOWN) ||
(vsk->peer_shutdown & RCV_SHUTDOWN)) {
err = -EPIPE; goto out_err;
}
err = transport->notify_send_pre_enqueue(vsk, &send_data); if (err < 0) goto out_err;
/* Note that enqueue will only write as many bytes as are free * in the produce queue, so we don't need to ensure len is * smaller than the queue size. It is the caller's * responsibility to check how many bytes we were able to send.
*/
if (sk->sk_type == SOCK_SEQPACKET) {
written = transport->seqpacket_enqueue(vsk,
msg, len - total_written);
} else {
written = transport->stream_enqueue(vsk,
msg, len - total_written);
}
/* Internal transport error when checking for available * data. XXX This should be changed to a connection * reset in a later change.
*/ if (data < 0) return -ENOMEM;
return data;
}
staticint __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg,
size_t len, int flags)
{ struct vsock_transport_recv_notify_data recv_data; conststruct vsock_transport *transport; struct vsock_sock *vsk;
ssize_t copied;
size_t target; long timeout; int err;
DEFINE_WAIT(wait);
vsk = vsock_sk(sk);
transport = vsk->transport;
/* We must not copy less than target bytes into the user's buffer * before returning successfully, so we wait for the consume queue to * have that much data to consume before dequeueing. Note that this * makes it impossible to handle cases where target is greater than the * queue size.
*/
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); if (target >= transport->stream_rcvhiwat(vsk)) {
err = -ENOMEM; goto out;
}
timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
copied = 0;
if (sk->sk_err) {
err = -sk->sk_err;
} elseif (sk->sk_shutdown & RCV_SHUTDOWN) {
err = 0;
} else { /* User sets MSG_TRUNC, so return real length of * packet.
*/ if (flags & MSG_TRUNC)
err = msg_len; else
err = len - msg_data_left(msg);
/* Always set MSG_TRUNC if real length of packet is * bigger than user's buffer.
*/ if (msg_len > len)
msg->msg_flags |= MSG_TRUNC;
}
out: return err;
}
int
__vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags)
{ struct sock *sk; struct vsock_sock *vsk; conststruct vsock_transport *transport; int err;
sk = sock->sk;
if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR);
vsk = vsock_sk(sk);
err = 0;
lock_sock(sk);
transport = vsk->transport;
if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occurred with the * SOCK_DONE flag.
*/ if (sock_flag(sk, SOCK_DONE))
err = 0; else
err = -ENOTCONN;
/* We don't check peer_shutdown flag here since peer may actually shut * down, but there can be data in the queue that a local socket can * receive.
*/ if (sk->sk_shutdown & RCV_SHUTDOWN) {
err = 0; goto out;
}
/* It is valid on Linux to pass in a zero-length receive buffer. This * is not an error. We may as well bail out now.
*/ if (!len) {
err = 0; goto out;
}
staticint vsock_create(struct net *net, struct socket *sock, int protocol, int kern)
{ struct vsock_sock *vsk; struct sock *sk; int ret;
if (!sock) return -EINVAL;
if (protocol && protocol != PF_VSOCK) return -EPROTONOSUPPORT;
switch (sock->type) { case SOCK_DGRAM:
sock->ops = &vsock_dgram_ops; break; case SOCK_STREAM:
sock->ops = &vsock_stream_ops; break; case SOCK_SEQPACKET:
sock->ops = &vsock_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT;
}
sock->state = SS_UNCONNECTED;
sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern); if (!sk) return -ENOMEM;
vsk = vsock_sk(sk);
if (sock->type == SOCK_DGRAM) {
ret = vsock_assign_transport(vsk, NULL); if (ret < 0) {
sock->sk = NULL;
sock_put(sk); return ret;
}
}
/* SOCK_DGRAM doesn't have 'setsockopt' callback set in its * proto_ops, so there is no handler for custom logic.
*/ if (sock_type_connectible(sock->type))
set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
switch (cmd) { case IOCTL_VM_SOCKETS_GET_LOCAL_CID: /* To be compatible with the VMCI behavior, we prioritize the * guest CID instead of well-know host CID (VMADDR_CID_HOST).
*/
cid = vsock_registered_transport_cid(&transport_g2h); if (cid == VMADDR_CID_ANY)
cid = vsock_registered_transport_cid(&transport_h2g); if (cid == VMADDR_CID_ANY)
cid = vsock_registered_transport_cid(&transport_local);
if (put_user(cid, p) != 0)
retval = -EFAULT; break;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung: (vorverarbeitet)
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.