/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC	0x1
#define TRACE_BLK_OPT_CGROUP	0x2
#define TRACE_BLK_OPT_CGNAME	0x4
if (blk_tracer) {
buffer = blk_tr->array_buffer.buffer;
trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len + cgid_len,
trace_ctx); if (!event) return;
t = ring_buffer_event_data(event); goto record_it;
}
if (!bt->rchan) return;
t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) {
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->time = ktime_to_ns(ktime_get());
record_it:
t->device = bt->dev;
t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
t->pid = pid;
t->cpu = cpu;
t->pdu_len = len + cgid_len; if (cgid_len)
memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
}
}
/* * Send out a notify for this process, if we haven't done so since a trace * started
*/ staticvoid trace_note_tsk(struct task_struct *tsk)
{ unsignedlong flags; struct blk_trace *bt;
/*
 * Move a request flag from its REQ_##__name bit position up to the
 * (ilog2(BLK_TC_##__name) + BLK_TC_SHIFT) bit position, i.e. translate a
 * REQ_* flag into the matching BLK_TC_* category bit in the action word.
 */
/* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \
(ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
/* * The worker for the various blk_add_trace*() types. Fills out a * blk_io_trace structure and places it in a per-cpu subbuffer.
*/ staticvoid __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, const blk_opf_t opf, u32 what, int error, int pdu_len, void *pdu_data, u64 cgid)
{ struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; struct blk_io_trace *t; unsignedlong flags = 0; unsignedlong *sequence; unsignedint trace_ctx = 0;
pid_t pid; int cpu; bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0; constenum req_op op = opf & REQ_OP_MASK;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return;
what |= ddir_act[op_is_write(op) ? WRITE : READ];
what |= MASK_TC_BIT(opf, SYNC);
what |= MASK_TC_BIT(opf, RAHEAD);
what |= MASK_TC_BIT(opf, META);
what |= MASK_TC_BIT(opf, PREFLUSH);
what |= MASK_TC_BIT(opf, FUA); if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH)
what |= BLK_TC_ACT(BLK_TC_FLUSH); if (cgid)
what |= __BLK_TA_CGROUP;
pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) return;
cpu = raw_smp_processor_id();
if (blk_tracer) {
tracing_record_cmdline(current);
if (unlikely(tsk->btrace_seq != blktrace_seq))
trace_note_tsk(tsk);
/* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq * from coming in and stepping on our toes.
*/
local_irq_save(flags);
t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) {
sequence = per_cpu_ptr(bt->sequence, cpu);
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->sequence = ++(*sequence);
t->time = ktime_to_ns(ktime_get());
record_it: /* * These two are not needed in ftrace as they are in the * generic trace_entry, filled by tracing_generic_entry_update, * but for the trace_event->bin() synthesizer benefit we do it * here too.
*/
t->cpu = cpu;
t->pid = pid;
/* * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created * under 'q->debugfs_dir', thus lookup and remove them.
*/ if (!bt->dir) {
debugfs_lookup_and_remove("dropped", q->debugfs_dir);
debugfs_lookup_and_remove("msg", q->debugfs_dir);
} else {
debugfs_remove(bt->dir);
}
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
}
/* * some device names have larger paths - convert the slashes * to underscores for this to work as expected
*/
strreplace(buts->name, '/', '_');
/* * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be.
*/ if (rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex))) {
pr_warn("Concurrent blktraces are not allowed on %s\n",
buts->name); return -EBUSY;
}
bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM;
ret = -ENOMEM;
bt->sequence = alloc_percpu(unsignedlong); if (!bt->sequence) goto err;
bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); if (!bt->msg_data) goto err;
/* * When tracing the whole disk reuse the existing debugfs directory * created by the block layer on init. For partitions block devices, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends.
*/ if (bdev && !bdev_is_partition(bdev))
dir = q->debugfs_dir; else
bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
/* * As blktrace relies on debugfs for its interface the debugfs directory * is required, contrary to the usual mantra of not checking for debugfs * files or directories.
*/ if (IS_ERR_OR_NULL(dir)) {
pr_warn("debugfs_dir not present for %s so skipping\n",
buts->name);
ret = -ENOENT; goto err;
}
/* * When reading or writing the blktrace sysfs files, the references to the * opened sysfs or device files should prevent the underlying block device * from being removed. So no further delete protection is really needed.
*/
/**
 * blk_trace_ioctl - handle the ioctls associated with tracing
 * @bdev: the block device
 * @cmd: the ioctl cmd
 * @arg: the argument data, if any
 *
 **/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
	struct request_queue *q = bdev_get_queue(bdev);
	int ret, start = 0;
	char b[BDEVNAME_SIZE];

	switch (cmd) {
	case BLKTRACESETUP:
		snprintf(b, sizeof(b), "%pg", bdev);
		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
		break;
/* Preprocessor conditionals must start their own line: the source had
 * "#ifdefined(...)" fused onto the previous statement, which is not a
 * valid directive. */
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
	case BLKTRACESETUP32:
		snprintf(b, sizeof(b), "%pg", bdev);
		ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
		break;
#endif
	case BLKTRACESTART:
		start = 1;
		fallthrough;
	case BLKTRACESTOP:
		ret = blk_trace_startstop(q, start);
		break;
	case BLKTRACETEARDOWN:
		ret = blk_trace_remove(q);
		break;
	default:
		ret = -ENOTTY;
		break;
	}
	return ret;
}
/**
 * blk_trace_shutdown - stop and cleanup trace structures
 * @q: the request queue associated with the device
 *
 **/
void blk_trace_shutdown(struct request_queue *q)
{
	struct blk_trace *bt;

	/* Caller holds q->debugfs_mutex, so a plain protected deref is safe. */
	bt = rcu_dereference_protected(q->blk_trace,
				       lockdep_is_held(&q->debugfs_mutex));
	if (bt)
		__blk_trace_remove(q);
}
/* We don't use the 'bt' value here except as an optimization... */
bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0;
static u64
blk_trace_request_get_cgid(struct request *rq)
{
	struct bio *first = rq->bio;

	/* Derive the cgroup id from the request's first bio, if it has one. */
	return first ? blk_trace_bio_get_cgid(rq->q, first) : 0;
}
/* * blktrace probes
*/
/** * blk_add_trace_rq - Add a trace for a request oriented action * @rq: the source request * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. *
**/ staticvoid blk_add_trace_rq(struct request *rq, blk_status_t error, unsignedint nr_bytes, u32 what, u64 cgid)
{ struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) {
rcu_read_unlock(); return;
}
if (blk_rq_is_passthrough(rq))
what |= BLK_TC_ACT(BLK_TC_PC); else
what |= BLK_TC_ACT(BLK_TC_FS);
/** * blk_add_trace_bio - Add a trace for a bio oriented action * @q: queue the io is for * @bio: the source bio * @what: the action * @error: error, if any * * Description: * Records an action against a bio. Will log the bio offset + size. *
**/ staticvoid blk_add_trace_bio(struct request_queue *q, struct bio *bio,
u32 what, int error)
{ struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace); if (likely(!bt)) {
rcu_read_unlock(); return;
}
/** * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) * @bio: the source bio * @dev: source device * @from: source sector * * Called after a bio is remapped to a different device and/or sector.
**/ staticvoid blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
sector_t from)
{ struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace); if (likely(!bt)) {
rcu_read_unlock(); return;
}
cgroup_path_from_kernfs_id(id, blkcg_name_buf, sizeof(blkcg_name_buf));
trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
MAJOR(t->device), MINOR(t->device),
blkcg_name_buf, act, rwbs);
} else { /* * The cgid portion used to be "INO,GEN". Userland * builds a FILEID_INO32_GEN fid out of them and * opens the cgroup using open_by_handle_at(2). * While 32bit ino setups are still the same, 64bit * ones now use the 64bit ino as the whole ID and * no longer use generation. * * Regardless of the content, always output * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can * be mapped back to @id on both 64 and 32bit ino * setups. See __kernfs_fh_to_dentry().
*/
trace_seq_printf(&iter->seq, "%3d,%-3d %llx,%-llx %2s %3s ",
MAJOR(t->device), MINOR(t->device),
id & U32_MAX, id >> 32, act, rwbs);
}
} else
trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
MAJOR(t->device), MINOR(t->device), act, rwbs);
}
staticvoid blk_log_dump_pdu(struct trace_seq *s, conststruct trace_entry *ent, bool has_cg)
{ constunsignedchar *pdu_buf; int pdu_len; int i, end;
/* * stop when the rest is just zeros and indicate so * with a ".." appended
*/ if (i == end && end != pdu_len - 1) {
trace_seq_puts(s, " ..) "); return;
}
}
staticint __init init_blk_tracer(void)
{ if (!register_trace_event(&trace_blk_event)) {
pr_warn("Warning: could not register block events\n"); return 1;
}
if (register_tracer(&blk_tracer) != 0) {
pr_warn("Warning: could not register the block tracer\n");
unregister_trace_event(&trace_blk_event); return 1;
}
if (attr == &dev_attr_act_mask) { if (kstrtoull(buf, 0, &value)) { /* Assume it is a list of trace category names */
ret = blk_trace_str2mask(buf); if (ret < 0) goto out;
value = ret;
}
} else { if (kstrtoull(buf, 0, &value)) goto out;
}
mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { if (!!value == !!bt) {
ret = 0; goto out_unlock_bdev;
} if (value)
ret = blk_trace_setup_queue(q, bdev); else
ret = blk_trace_remove_queue(q); goto out_unlock_bdev;
}
ret = 0; if (bt == NULL) {
ret = blk_trace_setup_queue(q, bdev);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex));
}
out_unlock_bdev:
mutex_unlock(&q->debugfs_mutex);
out: return ret ? ret : count;
} #endif/* CONFIG_BLK_DEV_IO_TRACE */
#ifdef CONFIG_EVENT_TRACING
/**
 * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
 * @rwbs: buffer to be filled
 * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint
 *
 * Description:
 *     Maps each request operation and flag to a single character and fills the
 *     buffer provided by the caller with resulting string.
 *
 **/
void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
{
	int i = 0;

	if (opf & REQ_PREFLUSH)
		rwbs[i++] = 'F';

	switch (opf & REQ_OP_MASK) {
	case REQ_OP_WRITE:
		rwbs[i++] = 'W';
		break;
	case REQ_OP_DISCARD:
		rwbs[i++] = 'D';
		break;
	case REQ_OP_SECURE_ERASE:
		rwbs[i++] = 'D';
		rwbs[i++] = 'E';
		break;
	case REQ_OP_FLUSH:
		rwbs[i++] = 'F';
		break;
	case REQ_OP_READ:
		rwbs[i++] = 'R';
		break;
	case REQ_OP_ZONE_APPEND:
		rwbs[i++] = 'Z';
		rwbs[i++] = 'A';
		break;
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_RESET_ALL:
		rwbs[i++] = 'Z';
		rwbs[i++] = 'R';
		if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL)
			rwbs[i++] = 'A';
		break;
	case REQ_OP_ZONE_FINISH:
		rwbs[i++] = 'Z';
		rwbs[i++] = 'F';
		break;
	case REQ_OP_ZONE_OPEN:
		rwbs[i++] = 'Z';
		rwbs[i++] = 'O';
		break;
	case REQ_OP_ZONE_CLOSE:
		rwbs[i++] = 'Z';
		rwbs[i++] = 'C';
		break;
	default:
		rwbs[i++] = 'N';
	}

	if (opf & REQ_FUA)
		rwbs[i++] = 'F';
	if (opf & REQ_RAHEAD)
		rwbs[i++] = 'A';
	if (opf & REQ_SYNC)
		rwbs[i++] = 'S';
	if (opf & REQ_META)
		rwbs[i++] = 'M';
	if (opf & REQ_ATOMIC)
		rwbs[i++] = 'U';

	/*
	 * NOTE(review): the source chunk was truncated here (replaced by
	 * unrelated junk text). rwbs is consumed as a C string by the
	 * tracepoints, so it must be NUL-terminated; the terminator and
	 * closing brace are restored below — confirm against upstream.
	 */
	rwbs[i] = '\0';
}