/* * blkfront.c * * XenLinux virtual block device driver. * * Copyright (c) 2003-2004, Keir Fraser & Steve Hand * Modifications by Mark A. Williamson are (c) Intel Research Cambridge * Copyright (c) 2004, Christian Limpach * Copyright (c) 2004, Andrew Warfield * Copyright (c) 2005, Christopher Clark * Copyright (c) 2005, XenSource Ltd * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation; or, when distributed * separately from the Linux kernel or incorporated into other * software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE.
*/
/*
 * The minimal size of segment supported by the block framework is PAGE_SIZE.
 * When Linux is using a different page size than Xen, it may not be possible
 * to put all the data in a single segment.
 * This can happen when the backend doesn't support indirect descriptors and
 * therefore the maximum amount of data that a request can carry is
 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
 *
 * Note that we only support one extra request. So the Linux page size
 * should be <= (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
 * 88KB.
 *
 * HAS_EXTRA_REQ evaluates to true when BLKIF_MAX_SEGMENTS_PER_REQUEST is
 * smaller than XEN_PFN_PER_PAGE, i.e. (per the explanation above) when a
 * single ring request cannot cover a whole Linux page.
 */
#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
/*
 * Maximum number of segments in indirect requests, the actual value used by
 * the frontend driver is the minimum of this value and the value provided
 * by the backend driver.
 *
 * Exposed read-only (0444) as the "max_indirect_segments" module parameter.
 */
static unsigned int xen_blkif_max_segments = 32;
module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444);
MODULE_PARM_DESC(max_indirect_segments,
		 "Maximum amount of segments in indirect requests (default is 32)");
/*
 * Upper bound on the number of hardware queues/rings used per virtual disk.
 * Exposed read-only (0444) as the "max_queues" module parameter.
 */
static unsigned int xen_blkif_max_queues = 4;
module_param_named(max_queues, xen_blkif_max_queues, uint, 0444);
MODULE_PARM_DESC(max_queues,
		 "Maximum number of hardware queues/rings used per virtual disk");
/* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used.
*/ staticunsignedint xen_blkif_max_ring_order;
module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
/*
 * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
 * characters are enough. Define to 20 to keep consistent with backend.
 */
#define RINGREF_NAME_LEN (20)

/*
 * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
 *
 * NOTE: each #define must sit on its own line. A comment that spans a line
 * break is replaced by a single space before directives are processed, so
 * placing this comment after the previous #define on the same line would
 * fold this directive into the previous macro's replacement text.
 */
#define QUEUE_NAME_LEN (17)
/*
 * Per-ring info.
 * Every blkfront device can associate with one or more blkfront_ring_info,
 * depending on how many hardware queues/rings to be used.
 */
struct blkfront_ring_info {
	/* Lock to protect data in every ring buffer. */
	spinlock_t ring_lock;
	struct blkif_front_ring ring;
	/* Ring page references; published to the backend as "ring-ref". */
	unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
	unsigned int evtchn, irq;
	struct work_struct work;
	struct gnttab_free_callback callback;
	/* Pre-allocated pages handed out for indirect grefs. */
	struct list_head indirect_pages;
	/* Grant entries available for reuse by later requests. */
	struct list_head grants;
	/* Count of entries on 'grants' that still carry a valid gref. */
	unsigned int persistent_gnts_c;
	unsigned long shadow_free;
	/* Device this ring belongs to. */
	struct blkfront_info *dev_info;
	/* Per-request bookkeeping, indexed by request id. */
	struct blk_shadow shadow[];
};
/* * We have one of these per vbd, whether ide, scsi or 'other'. They * hang in private_data off the gendisk structure. We may end up * putting all kinds of interesting stuff here :-)
*/ struct blkfront_info
{ struct mutex mutex; struct xenbus_device *xbdev; struct gendisk *gd;
u16 sector_size; unsignedint physical_sector_size; unsignedlong vdisk_info; int vdevice;
blkif_vdev_t handle; enum blkif_state connected; /* Number of pages per ring buffer. */ unsignedint nr_ring_pages; struct request_queue *rq; unsignedint feature_flush:1; unsignedint feature_fua:1; unsignedint feature_discard:1; unsignedint feature_secdiscard:1; /* Connect-time cached feature_persistent parameter */ unsignedint feature_persistent_parm:1; /* Persistent grants feature negotiation result */ unsignedint feature_persistent:1; unsignedint bounce:1; unsignedint discard_granularity; unsignedint discard_alignment; /* Number of 4KB segments handled */ unsignedint max_indirect_segments; int is_ready; struct blk_mq_tag_set tag_set; struct blkfront_ring_info *rinfo; unsignedint nr_rings; unsignedint rinfo_size; /* Save uncomplete reqs and bios for migration. */ struct list_head requests; struct bio_list bio_list; struct list_head info_list;
};
/*
 * Grants are always the same size as a Xen page (i.e 4KB).
 * A physical segment is always the same size as a Linux page.
 *
 * GRANTS_PER_PSEG is the number of grants per physical segment, computed
 * as PAGE_SIZE / XEN_PAGE_SIZE.
 */
#define GRANTS_PER_PSEG (PAGE_SIZE / XEN_PAGE_SIZE)
staticstruct grant *get_grant(grant_ref_t *gref_head, unsignedlong gfn, struct blkfront_ring_info *rinfo)
{ struct grant *gnt_list_entry = get_free_grant(rinfo); struct blkfront_info *info = rinfo->dev_info;
if (gnt_list_entry->gref != INVALID_GRANT_REF) return gnt_list_entry;
/* Assign a gref to this page */
gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
BUG_ON(gnt_list_entry->gref == -ENOSPC); if (info->bounce)
grant_foreign_access(gnt_list_entry, info); else { /* Grant access to the GFN passed by the caller */
gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
info->xbdev->otherend_id,
gfn, 0);
}
return gnt_list_entry;
}
staticstruct grant *get_indirect_grant(grant_ref_t *gref_head, struct blkfront_ring_info *rinfo)
{ struct grant *gnt_list_entry = get_free_grant(rinfo); struct blkfront_info *info = rinfo->dev_info;
if (gnt_list_entry->gref != INVALID_GRANT_REF) return gnt_list_entry;
/* Assign a gref to this page */
gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
BUG_ON(gnt_list_entry->gref == -ENOSPC); if (!info->bounce) { struct page *indirect_page;
/* Fetch a pre-allocated page to use for indirect grefs */
BUG_ON(list_empty(&rinfo->indirect_pages));
indirect_page = list_first_entry(&rinfo->indirect_pages, struct page, lru);
list_del(&indirect_page->lru);
gnt_list_entry->page = indirect_page;
}
grant_foreign_access(gnt_list_entry, info);
staticint blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
{ /* We don't have real geometry info, but let's at least return
values consistent with the size of the device */
sector_t nsect = get_capacity(bd->bd_disk);
sector_t cylinders = nsect;
/* Copy the request to the ring page. */
*final_ring_req = *ring_req;
rinfo->shadow[id].status = REQ_WAITING;
return 0;
}
struct setup_rw_req { unsignedint grant_idx; struct blkif_request_segment *segments; struct blkfront_ring_info *rinfo; struct blkif_request *ring_req;
grant_ref_t gref_head; unsignedint id; /* Only used when persistent grant is used and it's a write request */ bool need_copy; unsignedint bvec_off; char *bvec_data;
staticvoid blkif_setup_rw_req_grant(unsignedlong gfn, unsignedint offset, unsignedint len, void *data)
{ struct setup_rw_req *setup = data; int n, ref; struct grant *gnt_list_entry; unsignedint fsect, lsect; /* Convenient aliases */ unsignedint grant_idx = setup->grant_idx; struct blkif_request *ring_req = setup->ring_req; struct blkfront_ring_info *rinfo = setup->rinfo; /* * We always use the shadow of the first request to store the list * of grant associated to the block I/O request. This made the * completion more easy to handle even if the block I/O request is * split.
*/ struct blk_shadow *shadow = &rinfo->shadow[setup->id];
if (unlikely(setup->require_extra_req &&
grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) { /* * We are using the second request, setup grant_idx * to be the index of the segment array.
*/
grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
ring_req = setup->extra_ring_req;
}
if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
(grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { if (setup->segments)
kunmap_atomic(setup->segments);
gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
ref = gnt_list_entry->gref; /* * All the grants are stored in the shadow of the first * request. Therefore we have to use the global index.
*/
shadow->grants_used[setup->grant_idx] = gnt_list_entry;
if (setup->need_copy) { void *shared_data;
shared_data = kmap_atomic(gnt_list_entry->page); /* * this does not wipe data stored outside the * range sg->offset..sg->offset+sg->length. * Therefore, blkback *could* see data from * previous requests. This is OK as long as * persistent grants are shared with just one * domain. It may need refactoring if this * changes
*/
memcpy(shared_data + offset,
setup->bvec_data + setup->bvec_off,
len);
/* * The second request is only present when the first request uses * all its segments. It's always the continuity of the first one.
*/
first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
/* * Used to store if we are able to queue the request by just using * existing persistent grants, or if we have to get new grants, * as there are not sufficiently many free.
*/ bool new_persistent_gnts = false; struct scatterlist *sg; int num_sg, max_grefs, num_grant;
max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG; if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) /* * If we are using indirect segments we need to account * for the indirect grefs used in the request.
*/
max_grefs += INDIRECT_GREFS(max_grefs);
/* Check if we have enough persistent grants to allocate a requests */ if (rinfo->persistent_gnts_c < max_grefs) {
new_persistent_gnts = true;
/* Fill out a communications ring structure. */
id = blkif_ring_get_request(rinfo, req, &final_ring_req);
ring_req = &rinfo->shadow[id].req;
num_sg = blk_rq_map_sg(req, rinfo->shadow[id].sg);
num_grant = 0; /* Calculate the number of grant used */
for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
num_grant += gnttab_count_grant(sg->offset, sg->length);
rinfo->shadow[id].num_sg = num_sg; if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
likely(!require_extra_req)) { /* * The indirect operation can only be a BLKIF_OP_READ or * BLKIF_OP_WRITE
*/
BUG_ON(req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA);
ring_req->operation = BLKIF_OP_INDIRECT;
ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
BLKIF_OP_WRITE : BLKIF_OP_READ;
ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
ring_req->u.indirect.handle = info->handle;
ring_req->u.indirect.nr_segments = num_grant;
} else {
ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
ring_req->u.rw.handle = info->handle;
ring_req->operation = rq_data_dir(req) ?
BLKIF_OP_WRITE : BLKIF_OP_READ; if (req_op(req) == REQ_OP_FLUSH ||
(req_op(req) == REQ_OP_WRITE && (req->cmd_flags & REQ_FUA))) { /* * Ideally we can do an unordered flush-to-disk. * In case the backend onlysupports barriers, use that. * A barrier request a superset of FUA, so we can * implement it the same way. (It's also a FLUSH+FUA, * since it is guaranteed ordered WRT previous writes.) * * Note that can end up here with a FUA write and the * flags cleared. This happens when the flag was * run-time disabled after a failing I/O, and we'll * simplify submit it as a normal write.
*/ if (info->feature_flush && info->feature_fua)
ring_req->operation =
BLKIF_OP_WRITE_BARRIER; elseif (info->feature_flush)
ring_req->operation =
BLKIF_OP_FLUSH_DISKCACHE;
}
ring_req->u.rw.nr_segments = num_grant; if (unlikely(require_extra_req)) {
extra_id = blkif_ring_get_request(rinfo, req,
&final_extra_ring_req);
extra_ring_req = &rinfo->shadow[extra_id].req;
/* * Only the first request contains the scatter-gather * list.
*/
rinfo->shadow[extra_id].num_sg = 0;
blkif_setup_extra_req(ring_req, extra_ring_req);
/* Link the 2 requests together */
rinfo->shadow[extra_id].associated_id = id;
rinfo->shadow[id].associated_id = extra_id;
}
}
setup.ring_req = ring_req;
setup.id = id;
setup.require_extra_req = require_extra_req; if (unlikely(require_extra_req))
setup.extra_ring_req = extra_ring_req;
/* * Check if the backend actually supports flushes. * * While the block layer won't send us flushes if we don't claim to * support them, the Xen protocol allows the backend to revoke support * at any time. That is of course a really bad idea and dangerous, but * has been allowed for 10+ years. In that case we simply clear the * flags, and directly return here for an empty flush and ignore the * FUA flag later on.
*/ if (unlikely(req_op(qd->rq) == REQ_OP_FLUSH && !info->feature_flush)) goto complete;
if (RING_FULL(&rinfo->ring)) goto out_busy; if (blkif_queue_request(qd->rq, rinfo)) goto out_busy;
if ((info->vdevice>>EXT_SHIFT) > 1) { /* this is above the extended range; something is wrong */
printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice); return -ENODEV;
}
if (!VDEV_IS_EXTENDED(info->vdevice)) {
err = xen_translate_vdev(info->vdevice, &minor, &offset); if (err) return err;
nr_parts = PARTS_PER_DISK;
} else {
minor = BLKIF_MINOR_EXT(info->vdevice);
nr_parts = PARTS_PER_EXT_DISK;
offset = minor / nr_parts; if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with " "emulated IDE disks,\n\t choose an xvd device name" "from xvde on\n", info->vdevice);
} if (minor >> MINORBITS) {
pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
info->vdevice, minor); return -ENODEV;
}
if ((minor % nr_parts) == 0)
nr_minors = nr_parts;
err = xlbd_reserve_minors(minor, nr_minors); if (err) return err;
memset(&info->tag_set, 0, sizeof(info->tag_set));
info->tag_set.ops = &blkfront_mq_ops;
info->tag_set.nr_hw_queues = info->nr_rings; if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { /* * When indirect descriptior is not supported, the I/O request * will be split between multiple request in the ring. * To avoid problems when sending the request, divide by * 2 the depth of the queue.
*/
info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
} else
info->tag_set.queue_depth = BLK_RING_SIZE(info);
info->tag_set.numa_node = NUMA_NO_NODE;
info->tag_set.cmd_size = sizeof(struct blkif_req);
info->tag_set.driver_data = info;
err = blk_mq_alloc_tag_set(&info->tag_set); if (err) goto out_release_minors;
if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
kick_pending_request_queues(rinfo);
}
staticvoid blkif_free_ring(struct blkfront_ring_info *rinfo)
{ struct grant *persistent_gnt, *n; struct blkfront_info *info = rinfo->dev_info; int i, j, segs;
/* * Remove indirect pages, this only happens when using indirect * descriptors but not persistent grants
*/ if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n;
BUG_ON(info->bounce);
list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
list_del(&indirect_page->lru);
__free_page(indirect_page);
}
}
/* Remove all persistent grants. */ if (!list_empty(&rinfo->grants)) {
list_for_each_entry_safe(persistent_gnt, n,
&rinfo->grants, node) {
list_del(&persistent_gnt->node); if (persistent_gnt->gref != INVALID_GRANT_REF) {
gnttab_end_foreign_access(persistent_gnt->gref,
NULL);
rinfo->persistent_gnts_c--;
} if (info->bounce)
__free_page(persistent_gnt->page);
kfree(persistent_gnt);
}
}
BUG_ON(rinfo->persistent_gnts_c != 0);
for (i = 0; i < BLK_RING_SIZE(info); i++) { /* * Clear persistent grants present in requests already * on the shared ring
*/ if (!rinfo->shadow[i].request) goto free_shadow;
if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT) /* * If this is not an indirect operation don't try to * free indirect segments
*/ goto free_shadow;
/* Prevent new requests being issued until we fix things up. */
info->connected = suspend ?
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; /* No more blkif_request(). */ if (info->rq)
blk_mq_stop_hw_queues(info->rq);
staticenum blk_req_status blkif_rsp_to_req_status(int rsp)
{ switch (rsp)
{ case BLKIF_RSP_OKAY: return REQ_DONE; case BLKIF_RSP_EOPNOTSUPP: return REQ_EOPNOTSUPP; case BLKIF_RSP_ERROR: default: return REQ_ERROR;
}
}
/* * Get the final status of the block request based on the two ring responses
*/ staticint blkif_get_final_status(enum blk_req_status s1, enum blk_req_status s2)
{
BUG_ON(s1 < REQ_DONE);
BUG_ON(s2 < REQ_DONE);
/* The I/O request may be split in two. */ if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) { struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
/* Keep the status of the current response in shadow. */
s->status = blkif_rsp_to_req_status(bret->status);
/* Wait the second response if not yet here. */ if (s2->status < REQ_DONE) return 0;
/* * All the grants is stored in the first shadow in order * to make the completion code simpler.
*/
num_grant += s2->req.u.rw.nr_segments;
/* * The two responses may not come in order. Only the * first request will store the scatter-gather list.
*/ if (s2->num_sg != 0) { /* Update "id" with the ID of the first response. */
*id = s->associated_id;
s = s2;
}
/* * We don't need anymore the second request, so recycling * it now.
*/ if (add_id_to_freelist(rinfo, s->associated_id))
WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
info->gd->disk_name, s->associated_id);
}
kunmap_atomic(data.bvec_data);
}
} /* Add the persistent grant into the list of free grants */ for (i = 0; i < num_grant; i++) { if (!gnttab_try_end_foreign_access(s->grants_used[i]->gref)) { /* * If the grant is still mapped by the backend (the * backend has chosen to make this grant persistent) * we add it at the head of the list, so it will be * reused first.
*/ if (!info->feature_persistent) {
pr_alert("backed has not unmapped grant: %u\n",
s->grants_used[i]->gref); return -1;
}
list_add(&s->grants_used[i]->node, &rinfo->grants);
rinfo->persistent_gnts_c++;
} else { /* * If the grant is not mapped by the backend we add it * to the tail of the list, so it will not be picked * again unless we run out of persistent grants.
*/
s->grants_used[i]->gref = INVALID_GRANT_REF;
list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
}
} if (s->req.operation == BLKIF_OP_INDIRECT) { for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { if (!gnttab_try_end_foreign_access(s->indirect_grants[i]->gref)) { if (!info->feature_persistent) {
pr_alert("backed has not unmapped grant: %u\n",
s->indirect_grants[i]->gref); return -1;
}
list_add(&s->indirect_grants[i]->node, &rinfo->grants);
rinfo->persistent_gnts_c++;
} else { struct page *indirect_page;
/* * Add the used indirect page back to the list of * available pages for indirect grefs.
*/ if (!info->bounce) {
indirect_page = s->indirect_grants[i]->page;
list_add(&indirect_page->lru, &rinfo->indirect_pages);
}
s->indirect_grants[i]->gref = INVALID_GRANT_REF;
list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
}
}
}
if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); return IRQ_HANDLED;
}
spin_lock_irqsave(&rinfo->ring_lock, flags);
again:
rp = READ_ONCE(rinfo->ring.sring->rsp_prod);
virt_rmb(); /* Ensure we see queued responses up to 'rp'. */ if (RING_RESPONSE_PROD_OVERFLOW(&rinfo->ring, rp)) {
pr_alert("%s: illegal number of responses %u\n",
info->gd->disk_name, rp - rinfo->ring.rsp_cons); goto err;
}
for (i = rinfo->ring.rsp_cons; i != rp; i++) { unsignedlong id; unsignedint op;
eoiflag = 0;
RING_COPY_RESPONSE(&rinfo->ring, i, &bret);
id = bret.id;
/* * The backend has messed up and given us an id that we would * never have given to it (we stamp it up to BLK_RING_SIZE - * look in get_id_from_freelist.
*/ if (id >= BLK_RING_SIZE(info)) {
pr_alert("%s: response has incorrect id (%ld)\n",
info->gd->disk_name, id); goto err;
} if (rinfo->shadow[id].status != REQ_WAITING) {
pr_alert("%s: response references no pending request\n",
info->gd->disk_name); goto err;
}
op = rinfo->shadow[id].req.operation; if (op == BLKIF_OP_INDIRECT)
op = rinfo->shadow[id].req.u.indirect.indirect_op; if (bret.operation != op) {
pr_alert("%s: response has wrong operation (%u instead of %u)\n",
info->gd->disk_name, bret.operation, op); goto err;
}
if (bret.operation != BLKIF_OP_DISCARD) { int ret;
/* * We may need to wait for an extra response if the * I/O request is split in 2
*/
ret = blkif_completion(&id, rinfo, &bret); if (!ret) continue; if (unlikely(ret < 0)) goto err;
}
if (add_id_to_freelist(rinfo, id)) {
WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
info->gd->disk_name, op_name(bret.operation), id); continue;
}
switch (bret.operation) { case BLKIF_OP_DISCARD: if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq;
pr_warn_ratelimited("blkfront: %s: %s op failed\n",
info->gd->disk_name, op_name(bret.operation));
blkif_req(req)->error = BLK_STS_NOTSUPP;
info->feature_discard = 0;
info->feature_secdiscard = 0;
blk_queue_disable_discard(rq);
blk_queue_disable_secure_erase(rq);
} break; case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) {
pr_warn_ratelimited("blkfront: %s: %s op failed\n",
info->gd->disk_name, op_name(bret.operation));
blkif_req(req)->error = BLK_STS_NOTSUPP;
} if (unlikely(bret.status == BLKIF_RSP_ERROR &&
rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
pr_warn_ratelimited("blkfront: %s: empty %s op failed\n",
info->gd->disk_name, op_name(bret.operation));
blkif_req(req)->error = BLK_STS_NOTSUPP;
} if (unlikely(blkif_req(req)->error)) { if (blkif_req(req)->error == BLK_STS_NOTSUPP)
blkif_req(req)->error = BLK_STS_OK;
info->feature_fua = 0;
info->feature_flush = 0;
}
fallthrough; case BLKIF_OP_READ: case BLKIF_OP_WRITE: if (unlikely(bret.status != BLKIF_RSP_OKAY))
dev_dbg_ratelimited(&info->xbdev->dev, "Bad return from blkdev data request: %#x\n",
bret.status);
break; default:
BUG();
}
if (likely(!blk_should_fake_timeout(req->q)))
blk_mq_complete_request(req);
}
rinfo->ring.rsp_cons = i;
if (i != rinfo->ring.req_prod_pvt) { int more_to_do;
RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do); if (more_to_do) goto again;
} else
rinfo->ring.sring->rsp_event = i + 1;
kick_pending_request_queues_locked(rinfo);
spin_unlock_irqrestore(&rinfo->ring_lock, flags);
xen_irq_lateeoi(irq, eoiflag);
return IRQ_HANDLED;
err:
info->connected = BLKIF_STATE_ERROR;
spin_unlock_irqrestore(&rinfo->ring_lock, flags);
/* No EOI in order to avoid further interrupts. */
pr_alert("%s disabled for further use\n", info->gd->disk_name); return IRQ_HANDLED;
}
/* * Write out per-ring/queue nodes including ring-ref and event-channel; each * ring buffer may span multiple pages depending on ->nr_ring_pages.
*/ staticint write_per_ring_nodes(struct xenbus_transaction xbt, struct blkfront_ring_info *rinfo, constchar *dir)
{ int err; unsignedint i; constchar *message = NULL; struct blkfront_info *info = rinfo->dev_info;
if (info->nr_ring_pages == 1) {
err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]); if (err) {
message = "writing ring-ref"; goto abort_transaction;
}
} else { for (i = 0; i < info->nr_ring_pages; i++) { char ring_ref_name[RINGREF_NAME_LEN];
abort_transaction:
xenbus_transaction_end(xbt, 1); if (message)
xenbus_dev_fatal(info->xbdev, err, "%s", message);
return err;
}
/* Enable the persistent grants feature. */ staticbool feature_persistent = true;
module_param(feature_persistent, bool, 0644);
MODULE_PARM_DESC(feature_persistent, "Enables the persistent grants feature");
/* Common code used when first setting up, and when resuming. */ staticint talk_to_blkback(struct xenbus_device *dev, struct blkfront_info *info)
{ constchar *message = NULL; struct xenbus_transaction xbt; int err; unsignedint i, max_page_order; unsignedint ring_page_order; struct blkfront_ring_info *rinfo;
if (!info) return -ENODEV;
/* Check if backend is trusted. */
info->bounce = !xen_blkif_trusted ||
!xenbus_read_unsigned(dev->nodename, "trusted", 1);
/* We already got the number of queues/rings in _probe */ if (info->nr_rings == 1) {
err = write_per_ring_nodes(xbt, info->rinfo, dev->nodename); if (err) goto destroy_blkring;
} else { char *path;
size_t pathsize;
/* Check if backend supports multiple queues. */
backend_max_queues = xenbus_read_unsigned(info->xbdev->otherend, "multi-queue-max-queues", 1);
info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); /* We need at least one ring. */ if (!info->nr_rings)
info->nr_rings = 1;
/* * Entry point to this code when a new device is created. Allocate the basic * structures and the ring buffer for communication with the backend, and * inform the backend of the appropriate details for those. Switch to * Initialised state.
*/ staticint blkfront_probe(struct xenbus_device *dev, conststruct xenbus_device_id *id)
{ int err, vdevice; struct blkfront_info *info;
/* FIXME: Use dynamic device id if this is not set. */
err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device", "%i", &vdevice); if (err != 1) { /* go looking in the extended area instead */
err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext", "%i", &vdevice); if (err != 1) {
xenbus_dev_fatal(dev, err, "reading virtual-device"); return err;
}
}
if (xen_hvm_domain()) { char *type; int len; /* no unplug has been done: do not hook devices != xen vbds */ if (xen_has_pv_and_legacy_disk_devices()) { int major;
if (!VDEV_IS_EXTENDED(vdevice))
major = BLKIF_MAJOR(vdevice); else
major = XENVBD_MAJOR;
if (major != XENVBD_MAJOR) {
printk(KERN_INFO "%s: HVM does not support vbd %d as xen block device\n",
__func__, vdevice); return -ENODEV;
}
} /* do not create a PV cdrom device if we are an HVM guest */
type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len); if (IS_ERR(type)) return -ENODEV; if (strncmp(type, "cdrom", 5) == 0) {
kfree(type); return -ENODEV;
}
kfree(type);
}
info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) {
xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); return -ENOMEM;
}
/* Front end dir is a number, which is used as the id. */
info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
dev_set_drvdata(&dev->dev, info);
while ((bio = bio_list_pop(&info->bio_list)) != NULL) { /* Traverse the list of pending bios and re-queue them */
submit_bio(bio);
}
return 0;
}
/* * We are reconnecting to the backend, due to a suspend/resume, or a backend * driver restart. We tear down our blkif structure and recreate it, but * leave the device-layer structures intact so that this is transparent to the * rest of the kernel.
*/ staticint blkfront_resume(struct xenbus_device *dev)
{ struct blkfront_info *info = dev_get_drvdata(&dev->dev); int err = 0; unsignedint i, j; struct blkfront_ring_info *rinfo;
for (j = 0; j < BLK_RING_SIZE(info); j++) { /* Not in use? */ if (!shadow[j].request) continue;
/* * Get the bios in the request so we can re-queue them.
*/ if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
req_op(shadow[j].request) == REQ_OP_DISCARD ||
req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
shadow[j].request->cmd_flags & REQ_FUA) { /* * Flush operations don't contain bios, so * we need to requeue the whole request * * XXX: but this doesn't make any sense for a * write with the FUA flag set..
*/
list_add(&shadow[j].request->queuelist, &info->requests); continue;
}
merge_bio.head = shadow[j].request->bio;
merge_bio.tail = shadow[j].request->biotail;
bio_list_merge(&info->bio_list, &merge_bio);
shadow[j].request->bio = NULL;
blk_mq_end_request(shadow[j].request, BLK_STS_OK);
}
}
if (info->max_indirect_segments == 0) { if (!HAS_EXTRA_REQ)
grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; else { /* * When an extra req is required, the maximum * grants supported is related to the size of the * Linux block segment.
*/
grants = GRANTS_PER_PSEG;
}
} else
grants = info->max_indirect_segments;
psegs = DIV_ROUND_UP(grants, GRANTS_PER_PSEG);
if (!info->bounce && info->max_indirect_segments) { /* * We are using indirect descriptors but don't have a bounce * buffer, we need to allocate a set of pages that can be * used for mapping indirect grefs
*/ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { struct page *indirect_page = alloc_page(GFP_KERNEL |
__GFP_ZERO); if (!indirect_page) goto out_of_memory;
list_add(&indirect_page->lru, &rinfo->indirect_pages);
}
}
for (i = 0; i < BLK_RING_SIZE(info); i++) {
rinfo->shadow[i].grants_used =
kvcalloc(grants, sizeof(rinfo->shadow[i].grants_used[0]),
GFP_KERNEL);
rinfo->shadow[i].sg = kvcalloc(psegs, sizeof(rinfo->shadow[i].sg[0]),
GFP_KERNEL); if (info->max_indirect_segments)
rinfo->shadow[i].indirect_grants =
kvcalloc(INDIRECT_GREFS(grants), sizeof(rinfo->shadow[i].indirect_grants[0]),
GFP_KERNEL); if ((rinfo->shadow[i].grants_used == NULL) ||
(rinfo->shadow[i].sg == NULL) ||
(info->max_indirect_segments &&
(rinfo->shadow[i].indirect_grants == NULL))) goto out_of_memory;
sg_init_table(rinfo->shadow[i].sg, psegs);
}
memalloc_noio_restore(memflags);
return 0;
out_of_memory: for (i = 0; i < BLK_RING_SIZE(info); i++) {
kvfree(rinfo->shadow[i].grants_used);
rinfo->shadow[i].grants_used = NULL;
kvfree(rinfo->shadow[i].sg);
rinfo->shadow[i].sg = NULL;
kvfree(rinfo->shadow[i].indirect_grants);
rinfo->shadow[i].indirect_grants = NULL;
} if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n;
list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
list_del(&indirect_page->lru);
__free_page(indirect_page);
}
}
/* * If there's no "feature-barrier" defined, then it means * we're dealing with a very old backend which writes * synchronously; nothing to do. * * If there are barriers, then we use flush.
*/ if (xenbus_read_unsigned(info->xbdev->otherend, "feature-barrier", 0)) {
info->feature_flush = 1;
info->feature_fua = 1;
}
/* * And if there is "feature-flush-cache" use that above * barriers.
*/ if (xenbus_read_unsigned(info->xbdev->otherend, "feature-flush-cache",
0)) {
info->feature_flush = 1;
info->feature_fua = 0;
}
if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0))
blkfront_setup_discard(info);
if (info->feature_persistent_parm)
info->feature_persistent =
!!xenbus_read_unsigned(info->xbdev->otherend, "feature-persistent", 0); if (info->feature_persistent)
info->bounce = true;
/* * Invoked when the backend is finally 'ready' (and has produced * the details about the physical device - #sectors, size, etc).
*/ staticvoid blkfront_connect(struct blkfront_info *info)
{ unsignedlonglong sectors; int err, i; struct blkfront_ring_info *rinfo;
switch (info->connected) { case BLKIF_STATE_CONNECTED: /* * Potentially, the back-end may be signalling * a capacity change; update the capacity.
*/
err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, "sectors", "%Lu", §ors); if (XENBUS_EXIST_ERR(err)) return;
printk(KERN_INFO "Setting capacity to %Lu\n",
sectors);
set_capacity_and_notify(info->gd, sectors);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.