/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ /* * Header file for the io_uring interface. * * Copyright (C) 2019 Jens Axboe * Copyright (C) 2019 Christoph Hellwig
*/ #ifndef LINUX_IO_URING_H #define LINUX_IO_URING_H
#include <linux/fs.h> #include <linux/types.h> /* * this file is shared with liburing and that has to autodetect * if linux/time_types.h is available or not, it can * define UAPI_LINUX_IO_URING_H_SKIP_LINUX_TIME_TYPES_H * if linux/time_types.h is not available
*/ #ifndef UAPI_LINUX_IO_URING_H_SKIP_LINUX_TIME_TYPES_H #include <linux/time_types.h> #endif
#ifdef __cplusplus extern"C" { #endif
/*
 * IO submission data structure (Submission Queue Entry)
 *
 * NOTE: this is kernel<->user ABI; field sizes, order and union overlays
 * are fixed. The unions below are interpreted according to sqe->opcode.
 */
struct io_uring_sqe {
	__u8	opcode;		/* type of operation for this sqe */
	__u8	flags;		/* IOSQE_ flags */
	__u16	ioprio;		/* ioprio for the request */
	__s32	fd;		/* file descriptor to do IO on */
	union {
		__u64	off;	/* offset into file */
		__u64	addr2;	/* second address, for ops that take two */
		struct {
			__u32	cmd_op;	/* command opcode (uring_cmd) */
			__u32	__pad1;
		};
	};
	union {
		__u64	addr;	/* pointer to buffer or iovecs */
		__u64	splice_off_in;
		struct {
			/* level/optname pair — socket option ops */
			__u32	level;
			__u32	optname;
		};
	};
	__u32	len;		/* buffer size or number of iovecs */
	/* per-opcode flags word; exactly one member applies per request */
	union {
		__u32	rw_flags;
		__u32	fsync_flags;
		__u16	poll_events;	/* compatibility */
		__u32	poll32_events;	/* word-reversed for BE */
		__u32	sync_range_flags;
		__u32	msg_flags;
		__u32	timeout_flags;
		__u32	accept_flags;
		__u32	cancel_flags;
		__u32	open_flags;
		__u32	statx_flags;
		__u32	fadvise_advice;
		__u32	splice_flags;
		__u32	rename_flags;
		__u32	unlink_flags;
		__u32	hardlink_flags;
		__u32	xattr_flags;
		__u32	msg_ring_flags;
		__u32	uring_cmd_flags;
		__u32	waitid_flags;
		__u32	futex_flags;
		__u32	install_fd_flags;
		__u32	nop_flags;
		__u32	pipe_flags;
	};
	__u64	user_data;	/* data to be passed back at completion time */
	/* pack this to avoid bogus arm OABI complaints */
	union {
		/* index into fixed buffers, if used */
		__u16	buf_index;
		/* for grouped buffer selection */
		__u16	buf_group;
	} __attribute__((packed));
	/* personality to use, if used */
	__u16	personality;
	union {
		__s32	splice_fd_in;
		__u32	file_index;
		__u32	zcrx_ifq_idx;
		__u32	optlen;
		struct {
			__u16	addr_len;
			__u16	__pad3[1];
		};
		struct {
			__u8	write_stream;
			__u8	__pad4[3];
		};
	};
	union {
		struct {
			__u64	addr3;
			__u64	__pad2[1];
		};
		struct {
			__u64	attr_ptr;	/* pointer to attribute information */
			__u64	attr_type_mask;	/* bit mask of attributes */
		};
		__u64	optval;
		/*
		 * If the ring is initialized with IORING_SETUP_SQE128, then
		 * this field is used for 80 bytes of arbitrary command data
		 */
		__u8	cmd[0];
	};
};
/*
 * If sqe->file_index is set to this for opcodes that instantiate a new
 * direct descriptor (like openat/openat2/accept), then io_uring will allocate
 * an available direct descriptor instead of having the application pass one
 * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE
 * if the space is full.
 */
#define IORING_FILE_INDEX_ALLOC		(~0U)
/*
 * sqe->flags
 */
/* use fixed fileset */
#define IOSQE_FIXED_FILE	(1U << IOSQE_FIXED_FILE_BIT)
/* issue after inflight IO */
#define IOSQE_IO_DRAIN		(1U << IOSQE_IO_DRAIN_BIT)
/* links next sqe */
#define IOSQE_IO_LINK		(1U << IOSQE_IO_LINK_BIT)
/* like LINK, but stronger */
#define IOSQE_IO_HARDLINK	(1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC		(1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT	(1U << IOSQE_BUFFER_SELECT_BIT)
/* don't post CQE if request succeeded */
#define IOSQE_CQE_SKIP_SUCCESS	(1U << IOSQE_CQE_SKIP_SUCCESS_BIT)
/*
 * io_uring_setup() flags
 */
#define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
#define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
#define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
#define IORING_SETUP_CLAMP	(1U << 4)	/* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
#define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
#define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
/*
 * Cooperative task running. When requests complete, they often require
 * forcing the submitter to transition to the kernel to complete. If this
 * flag is set, work will be done when the task transitions anyway, rather
 * than force an inter-processor interrupt reschedule. This avoids interrupting
 * a task running in userspace, and saves an IPI.
 */
#define IORING_SETUP_COOP_TASKRUN	(1U << 8)
/*
 * If COOP_TASKRUN is set, get notified if task work is available for
 * running and a kernel transition would be needed to run it. This sets
 * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
 */
#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)
#define IORING_SETUP_SQE128		(1U << 10) /* SQEs are 128 byte */
#define IORING_SETUP_CQE32		(1U << 11) /* CQEs are 32 byte */
/*
 * Only one task is allowed to submit requests
 */
#define IORING_SETUP_SINGLE_ISSUER	(1U << 12)

/*
 * Defer running task work to get events.
 * Rather than running bits of task work whenever the task transitions
 * try to do it just before it is needed.
 */
#define IORING_SETUP_DEFER_TASKRUN	(1U << 13)

/*
 * Application provides the memory for the rings
 */
#define IORING_SETUP_NO_MMAP		(1U << 14)

/*
 * Register the ring fd in itself for use with
 * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
 * than an fd.
 */
#define IORING_SETUP_REGISTERED_FD_ONLY	(1U << 15)

/*
 * Removes indirection through the SQ index array.
 */
#define IORING_SETUP_NO_SQARRAY		(1U << 16)

/* Use hybrid poll in iopoll process */
#define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)
/* this goes last, obviously */
IORING_OP_LAST,
};
/*
 * sqe->uring_cmd_flags		top 8bits aren't available for userspace
 * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
 *				along with setting sqe->buf_index.
 */
#define IORING_URING_CMD_FIXED	(1U << 0)
#define IORING_URING_CMD_MASK	IORING_URING_CMD_FIXED
/*
 * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
 * command flags for POLL_ADD are stored in sqe->len.
 *
 * IORING_POLL_ADD_MULTI	Multishot poll. Sets IORING_CQE_F_MORE if
 *				the poll handler will continue to report
 *				CQEs on behalf of the same SQE.
 *
 * IORING_POLL_UPDATE		Update existing poll request, matching
 *				sqe->addr as the old user_data field.
 *
 * IORING_POLL_LEVEL		Level triggered poll.
 */
#define IORING_POLL_ADD_MULTI		(1U << 0)
#define IORING_POLL_UPDATE_EVENTS	(1U << 1)
#define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
#define IORING_POLL_ADD_LEVEL		(1U << 3)
/*
 * ASYNC_CANCEL flags.
 *
 * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
 * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
 *				request 'user_data'
 * IORING_ASYNC_CANCEL_ANY	Match any request
 * IORING_ASYNC_CANCEL_FD_FIXED	'fd' passed in is a fixed descriptor
 * IORING_ASYNC_CANCEL_USERDATA	Match on user_data, default for no other key
 * IORING_ASYNC_CANCEL_OP	Match request based on opcode
 */
#define IORING_ASYNC_CANCEL_ALL		(1U << 0)
#define IORING_ASYNC_CANCEL_FD		(1U << 1)
#define IORING_ASYNC_CANCEL_ANY		(1U << 2)
#define IORING_ASYNC_CANCEL_FD_FIXED	(1U << 3)
#define IORING_ASYNC_CANCEL_USERDATA	(1U << 4)
#define IORING_ASYNC_CANCEL_OP		(1U << 5)
/*
 * send/sendmsg and recv/recvmsg flags (sqe->ioprio)
 *
 * IORING_RECVSEND_POLL_FIRST	If set, instead of first attempting to send
 *				or receive and arm poll if that yields an
 *				-EAGAIN result, arm poll upfront and skip
 *				the initial transfer attempt.
 *
 * IORING_RECV_MULTISHOT	Multishot recv. Sets IORING_CQE_F_MORE if
 *				the handler will continue to report
 *				CQEs on behalf of the same SQE.
 *
 * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
 *				the buf_index field.
 *
 * IORING_SEND_ZC_REPORT_USAGE
 *				If set, SEND[MSG]_ZC should report
 *				the zerocopy usage in cqe.res
 *				for the IORING_CQE_F_NOTIF cqe.
 *				0 is reported if zerocopy was actually possible.
 *				IORING_NOTIF_USAGE_ZC_COPIED if data was copied
 *				(at least partially).
 *
 * IORING_RECVSEND_BUNDLE	Used with IOSQE_BUFFER_SELECT. If set, send or
 *				recv will grab as many buffers from the buffer
 *				group ID given and send them all. The completion
 *				result 	will be the number of buffers send, with
 *				the starting buffer ID in cqe->flags as per
 *				usual for provided buffer usage. The buffers
 *				will be	contiguous from the starting buffer ID.
 *
 * IORING_SEND_VECTORIZED	If set, SEND[_ZC] will take a pointer to a io_vec
 *				to allow vectorized send operations.
 */
#define IORING_RECVSEND_POLL_FIRST	(1U << 0)
#define IORING_RECV_MULTISHOT		(1U << 1)
#define IORING_RECVSEND_FIXED_BUF	(1U << 2)
#define IORING_SEND_ZC_REPORT_USAGE	(1U << 3)
#define IORING_RECVSEND_BUNDLE		(1U << 4)
#define IORING_SEND_VECTORIZED		(1U << 5)
/*
 * cqe.res for IORING_CQE_F_NOTIF if
 * IORING_SEND_ZC_REPORT_USAGE was requested
 *
 * It should be treated as a flag, all other
 * bits of cqe.res should be treated as reserved!
 */
#define IORING_NOTIF_USAGE_ZC_COPIED	(1U << 31)
/*
 * IORING_OP_MSG_RING command types, stored in sqe->addr
 */
enum io_uring_msg_ring_flags {
	IORING_MSG_DATA,	/* pass sqe->len as 'res' and off as user_data */
	IORING_MSG_SEND_FD,	/* send a registered fd to another ring */
};
/*
 * IORING_OP_MSG_RING flags (sqe->msg_ring_flags)
 *
 * IORING_MSG_RING_CQE_SKIP	Don't post a CQE to the target ring. Not
 *				applicable for IORING_MSG_DATA, obviously.
 */
#define IORING_MSG_RING_CQE_SKIP	(1U << 0)
/* Pass through the flags from sqe->file_index to cqe->flags */
#define IORING_MSG_RING_FLAGS_PASS	(1U << 1)
/*
 * IORING_OP_FIXED_FD_INSTALL flags (sqe->install_fd_flags)
 *
 * IORING_FIXED_FD_NO_CLOEXEC	Don't mark the fd as O_CLOEXEC
 */
#define IORING_FIXED_FD_NO_CLOEXEC	(1U << 0)
/*
 * IO completion data structure (Completion Queue Entry)
 *
 * NOTE: kernel<->user ABI; layout is fixed.
 */
struct io_uring_cqe {
	__u64	user_data;	/* sqe->user_data value passed back */
	__s32	res;		/* result code for this event */
	__u32	flags;		/* IORING_CQE_F_* flags */

	/*
	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
	 * contains 16-bytes of padding, doubling the size of the CQE.
	 */
	__u64 big_cqe[];
};
/*
 * cqe->flags
 *
 * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
 * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
 * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
 * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinct
 * 			them from sends.
 * IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
 *			more completions. In other words, the buffer is being
 *			partially consumed, and will be used by the kernel for
 *			more completions. This is only set for buffers used via
 *			the incremental buffer consumption, as provided by
 *			a ring buffer setup with IOU_PBUF_RING_INC. For any
 *			other provided buffer type, all completions with a
 *			buffer passed back is automatically returned to the
 *			application.
 */
#define IORING_CQE_F_BUFFER		(1U << 0)
#define IORING_CQE_F_MORE		(1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
#define IORING_CQE_F_NOTIF		(1U << 3)
#define IORING_CQE_F_BUF_MORE		(1U << 4)

/* shift to extract the buffer ID from cqe->flags (IORING_CQE_F_BUFFER) */
#define IORING_CQE_BUFFER_SHIFT		16
/*
 * Magic offsets for the application to mmap the data it needs
 */
#define IORING_OFF_SQ_RING		0ULL
#define IORING_OFF_CQ_RING		0x8000000ULL
#define IORING_OFF_SQES			0x10000000ULL
#define IORING_OFF_PBUF_RING		0x80000000ULL
#define IORING_OFF_PBUF_SHIFT		16
#define IORING_OFF_MMAP_MASK		0xf8000000ULL
/* shared ring of provided buffers; kernel<->user ABI, layout is fixed */
struct io_uring_buf_ring {
	union {
		/*
		 * To avoid spilling into more pages than we need to, the
		 * ring tail is overlaid with the io_uring_buf->resv field.
		 */
		struct {
			__u64	resv1;
			__u32	resv2;
			__u16	resv3;
			__u16	tail;	/* ring tail, advanced by the app */
		};
		__DECLARE_FLEX_ARRAY(struct io_uring_buf, bufs);
	};
};
/*
 * Flags for IORING_REGISTER_PBUF_RING.
 *
 * IOU_PBUF_RING_MMAP:	If set, kernel will allocate the memory for the ring.
 *			The application must not set a ring_addr in struct
 *			io_uring_buf_reg, instead it must subsequently call
 *			mmap(2) with the offset set as:
 *			IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
 *			to get a virtual mapping for the ring.
 * IOU_PBUF_RING_INC:	If set, buffers consumed from this buffer ring can be
 *			consumed incrementally. Normally one (or more) buffers
 *			are fully consumed. With incremental consumptions, it's
 *			feasible to register big ranges of buffers, and each
 *			use of it will consume only as much as it needs. This
 *			requires that both the kernel and application keep
 *			track of where the current read/recv index is at.
 */
enum io_uring_register_pbuf_ring_flags {
	IOU_PBUF_RING_MMAP	= 1,
	IOU_PBUF_RING_INC	= 2,
};
/* a io_uring_napi_op value */
__u8 opcode;
__u8 pad[2];
/* * for IO_URING_NAPI_REGISTER_OP, it is a * io_uring_napi_tracking_strategy value. * * for IO_URING_NAPI_STATIC_ADD_ID/IO_URING_NAPI_STATIC_DEL_ID * it is the napi id to add/del from napi_list.
*/
__u32 op_param;
__u32 resv;
};
/* Require sqe flags (these flags must be set on each submission) */
IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3,
IORING_RESTRICTION_LAST
};
/* flags for struct io_uring_reg_wait ->flags; TS = the ts field is valid */
enum {
	IORING_REG_WAIT_TS		= (1U << 0),
};
/*
 * Argument for io_uring_enter(2) with
 * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
 * is an index into a previously registered fixed wait region described by
 * the below structure.
 *
 * NOTE: kernel<->user ABI; layout is fixed.
 */
struct io_uring_reg_wait {
	struct __kernel_timespec	ts;		/* wait timeout, valid if IORING_REG_WAIT_TS set */
	__u32				min_wait_usec;
	__u32				flags;		/* IORING_REG_WAIT_* flags */
	__u64				sigmask;	/* user pointer to a signal mask */
	__u32				sigmask_sz;	/* size of that signal mask */
	__u32				pad[3];
	__u64				pad2[2];
};
/*
 * Argument for IORING_REGISTER_FILE_ALLOC_RANGE
 * The range is specified as [off, off + len)
 */
struct io_uring_file_index_range {
	__u32	off;	/* first direct descriptor index in the range */
	__u32	len;	/* number of indices in the range */
	__u64	resv;	/* reserved, must be zero */
};
/*
 * Argument for IORING_OP_URING_CMD when file is a socket
 */
enum io_uring_socket_op {
	SOCKET_URING_OP_SIOCINQ		= 0,
	SOCKET_URING_OP_SIOCOUTQ,
	SOCKET_URING_OP_GETSOCKOPT,
	SOCKET_URING_OP_SETSOCKOPT,
	SOCKET_URING_OP_TX_TIMESTAMP,
};
/*
 * SOCKET_URING_OP_TX_TIMESTAMP definitions
 */
#define IORING_TIMESTAMP_HW_SHIFT	16
/* The cqe->flags bit from which the timestamp type is stored */
#define IORING_TIMESTAMP_TYPE_SHIFT	(IORING_TIMESTAMP_HW_SHIFT + 1)
/* The cqe->flags flag signifying whether it's a hardware timestamp */
#define IORING_CQE_F_TSTAMP_HW		((__u32)1 << IORING_TIMESTAMP_HW_SHIFT)
/* The bit from which area id is encoded into offsets */
#define IORING_ZCRX_AREA_SHIFT	48
/* mask selecting the area-id bits (top 16 bits of a 64-bit offset) */
#define IORING_ZCRX_AREA_MASK	(~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.