/* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/
/* * Each of the three rwsem locks (devices, clients, client_data) protects the * xarray of the same name. Specifically it allows the caller to assert that * the MARK will/will not be changing under the lock, and for devices and * clients, that the value in the xarray is still a valid pointer. Change of * the MARK is linked to the object state, so holding the lock and testing the * MARK also asserts that the contained object is in a certain state. * * This is used to build a two stage register/unregister flow where objects * can continue to be in the xarray even though they are still in progress to * register/unregister. * * The xarray itself provides additional locking, and restartable iteration, * which is also relied on. * * Locks should not be nested, with the exception of client_data, which is * allowed to nest under the read side of the other two locks. * * The devices_rwsem also protects the device name list, any change or * assignment of device name must also hold the write side to guarantee unique * names.
*/
/*
 * devices contains devices that have had their names assigned. The
 * devices may not be registered. Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
/* XA mark set on a devices entry once the device is fully registered */
#define DEVICE_REGISTERED XA_MARK_1
/* Drop a reference on @client; signal uses_zero on the final put. */
static void ib_client_put(struct ib_client *client)
{
	if (!refcount_dec_and_test(&client->uses))
		return;
	complete(&client->uses_zero);
}
/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

/* Id for the rdma per-net data; NOTE(review): confirm against the
 * register_pernet_device()/net_generic() usage (not visible in this chunk).
 */
unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require a init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode, "Share device among net namespaces; default=1 (shared)"); /** * rdma_dev_access_netns() - Return whether an rdma device can be accessed * from a specified net namespace or not. * @dev: Pointer to rdma device which needs to be checked * @net: Pointer to net namesapce for which access to be checked * * When the rdma device is in shared mode, it ignores the net namespace. * When the rdma device is exclusive to a net namespace, rdma device net * namespace is checked against the specified one.
*/ bool rdma_dev_access_netns(conststruct ib_device *dev, conststruct net *net)
{ return (ib_devices_shared_netns ||
net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);
/*
 * NOTE(review): this span is truncated by extraction. The tail of
 * rdma_dev_has_raw_cap() (its capability check and closing brace) and the
 * tail of xan_find_marked() (the *indexp update and return after
 * rcu_read_unlock()) are missing here. Code is left byte-identical;
 * recover the missing statements from the full file before editing.
 */
/** * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has * CAP_NET_RAW capability or not. * * @dev: Pointer to rdma device whose capability to be checked * * Returns true if a rdma device's owning user namespace has CAP_NET_RAW * capability, otherwise false. When rdma subsystem is in legacy shared network, * namespace mode, the default net namespace is considered.
*/ bool rdma_dev_has_raw_cap(conststruct ib_device *dev)
{ conststruct net *net;
/* Network namespace is the resource whose user namespace * to be considered. When in shared mode, there is no reliable * network namespace resource, so consider the default net namespace.
*/ if (ib_devices_shared_netns)
net = &init_net; else
net = read_pnet(&dev->coredev.rdma_net);
/* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in * the array. This does the same thing as xa_for_each except that it also * returns NULL valued entries if the array is allocating. Simplified to only * work on simple xarrays.
*/ staticvoid *xan_find_marked(struct xarray *xa, unsignedlong *indexp,
xa_mark_t filter)
{
XA_STATE(xas, xa, *indexp); void *entry;
rcu_read_lock(); do {
entry = xas_find_marked(&xas, ULONG_MAX, filter); if (xa_is_zero(entry)) break;
} while (xas_retry(&xas, entry));
rcu_read_unlock();
/* Forward declaration; defined later in this file. */
staticint rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net);
/* Pointer to the RCU head at the start of the ib_port_data array */
struct ib_port_data_rcu {
	/* Placed first so the whole array can be freed via kfree_rcu(). */
	struct rcu_head rcu_head;
	/* Flexible array of per-port data; callers index it 1-based. */
	struct ib_port_data pdata[];
};
/**
 * ib_device_put - Release IB device reference
 * @device: device whose reference to be released
 *
 * ib_device_put() releases reference to the IB device to allow it to be
 * unregistered and eventually free.
 */
void ib_device_put(struct ib_device *device)
{
	if (!refcount_dec_and_test(&device->refcount))
		return;
	/* Last reference gone: wake the unregistration waiter. */
	complete(&device->unreg_completion);
}
EXPORT_SYMBOL(ib_device_put);
/*
 * NOTE(review): truncated span. The lines above belong to the tail of a
 * name-lookup helper (presumably __ib_device_get_by_name — its header is
 * missing here), and ib_device_get_by_name() below is missing its
 * down_read(&devices_rwsem) and the lookup that assigns 'device' before the
 * 'if (device)' test — as shown, 'device' would be used uninitialized.
 * Code left byte-identical; restore from the full file before editing.
 */
xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device;
return NULL;
}
/** * ib_device_get_by_name - Find an IB device by name * @name: The name to look for * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device by its name. The caller must call * ib_device_put() on the returned pointer.
*/ struct ib_device *ib_device_get_by_name(constchar *name, enum rdma_driver_id driver_id)
{ struct ib_device *device;
if (device) { if (!ib_device_try_get(device))
device = NULL;
}
up_read(&devices_rwsem); return device;
}
EXPORT_SYMBOL(ib_device_get_by_name);
/*
 * Propagate a device rename to every compat device under
 * compat_devs_mutex; stops at the first failure and returns its code.
 */
static int rename_compat_devs(struct ib_device *device)
{
	struct ib_core_device *cdev;
	unsigned long index;
	int ret = 0;

	mutex_lock(&device->compat_devs_mutex);
	xa_for_each (&device->compat_devs, index, cdev) {
		ret = device_rename(&cdev->dev, dev_name(&device->dev));
		if (!ret)
			continue;
		dev_warn(&cdev->dev,
			 "Fail to rename compatdev to new name %s\n",
			 dev_name(&device->dev));
		break;
	}
	mutex_unlock(&device->compat_devs_mutex);
	return ret;
}
/*
 * NOTE(review): ib_device_rename() is truncated by extraction — the client
 * rename-notification loop, the unlock of devices_rwsem and the final return
 * are missing. The BUILD_BUG_ON lines at the end belong to a different
 * function (layout check for ib_core_device) whose surrounding code was
 * dropped. Code left byte-identical.
 */
int ib_device_rename(struct ib_device *ibdev, constchar *name)
{ unsignedlong index; void *client_data; int ret;
down_write(&devices_rwsem); if (!strcmp(name, dev_name(&ibdev->dev))) {
up_write(&devices_rwsem); return 0;
}
if (__ib_device_get_by_name(name)) {
up_write(&devices_rwsem); return -EEXIST;
}
ret = device_rename(&ibdev->dev, name); if (ret) {
up_write(&devices_rwsem); return ret;
}
strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
ret = rename_compat_devs(ibdev);
/* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. * dev must be the first element as ib_core and providers * driver uses it. Adding anything in ib_core_device before * device will break this assumption.
*/
BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
offsetof(struct ib_device, dev));
/*
 * NOTE(review): _ib_alloc_device() is truncated — the function's trailing
 * statements (including 'return device;' and the closing brace) are missing
 * after the cq_pools initialization loop. Code left byte-identical.
 */
/** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * @net: network namespace device should be located in, namespace * must stay valid until ib_register_device() is completed. * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device().
*/ struct ib_device *_ib_alloc_device(size_t size, struct net *net)
{ struct ib_device *device; unsignedint i;
if (WARN_ON(size < sizeof(struct ib_device))) return NULL;
device = kzalloc(size, GFP_KERNEL); if (!device) return NULL;
if (rdma_restrack_init(device)) {
kfree(device); return NULL;
}
/* ib_devices_shared_netns can't change while we have active namespaces * in the system which means either init_net is passed or the user has * no idea what they are doing. * * To avoid breaking backward compatibility, when in shared mode, * force to init the device in the init_net.
*/
net = ib_devices_shared_netns ? &init_net : net;
rdma_init_coredev(&device->coredev, device, net);
INIT_LIST_HEAD(&device->event_handler_list);
spin_lock_init(&device->qp_open_list_lock);
init_rwsem(&device->event_handler_rwsem);
mutex_init(&device->unregistration_lock); /* * client_data needs to be alloc because we don't want our mark to be * destroyed if the user stores NULL in the client data.
*/
xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
init_rwsem(&device->client_data_rwsem);
xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
mutex_init(&device->compat_devs_mutex);
init_completion(&device->unreg_completion);
INIT_WORK(&device->unregistration_work, ib_unregister_work);
spin_lock_init(&device->cq_pools_lock); for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++)
INIT_LIST_HEAD(&device->cq_pools[i]);
/*
 * NOTE(review): ib_dealloc_device() is truncated — everything after
 * up_write(&devices_rwsem) (sanity WARNs, netdev cleanup, restrack/kfree)
 * is missing here. Code left byte-identical.
 */
/** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device().
*/ void ib_dealloc_device(struct ib_device *device)
{ if (device->ops.dealloc_driver)
device->ops.dealloc_driver(device);
/* * ib_unregister_driver() requires all devices to remain in the xarray * while their ops are callable. The last op we call is dealloc_driver * above. This is needed to create a fence on op callbacks prior to * allowing the driver module to unload.
*/
down_write(&devices_rwsem); if (xa_load(&devices, device->index) == device)
xa_erase(&devices, device->index);
up_write(&devices_rwsem);
/*
 * NOTE(review): this span interleaves three functions, each incomplete:
 * add_client_context() (missing its out/out_unlock error labels), the middle
 * of remove_client_context() (header and tail dropped), and the body of the
 * port-data allocation helper (header, locals and loop header dropped).
 * Code left byte-identical; do not edit without the full file.
 */
/* * add_client_context() and remove_client_context() must be safe against * parallel calls on the same device - registration/unregistration of both the * device and client can be occurring in parallel. * * The routines need to be a fence, any caller must not return until the add * or remove is fully completed.
*/ staticint add_client_context(struct ib_device *device, struct ib_client *client)
{ int ret = 0;
if (!device->kverbs_provider && !client->no_kverbs_req) return 0;
down_write(&device->client_data_rwsem); /* * So long as the client is registered hold both the client and device * unregistration locks.
*/ if (!refcount_inc_not_zero(&client->uses)) goto out_unlock;
refcount_inc(&device->refcount);
/* * Another caller to add_client_context got here first and has already * completely initialized context.
*/ if (xa_get_mark(&device->client_data, client->client_id,
CLIENT_DATA_REGISTERED)) goto out;
ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
GFP_KERNEL)); if (ret) goto out;
downgrade_write(&device->client_data_rwsem); if (client->add) { if (client->add(device)) { /* * If a client fails to add then the error code is * ignored, but we won't call any more ops on this * client.
*/
xa_erase(&device->client_data, client->client_id);
up_read(&device->client_data_rwsem);
ib_device_put(device);
ib_client_put(client); return 0;
}
}
/* Readers shall not see a client until add has been completed */
xa_set_mark(&device->client_data, client->client_id,
CLIENT_DATA_REGISTERED);
up_read(&device->client_data_rwsem); return 0;
/* * Notice we cannot be holding any exclusive locks when calling the * remove callback as the remove callback can recurse back into any * public functions in this module and thus try for any locks those * functions take. * * For this reason clients and drivers should not call the * unregistration functions will holdling any locks.
*/ if (client->remove)
client->remove(device, client_data);
/* This can only be called once the physical port range is defined */ if (WARN_ON(!device->phys_port_cnt)) return -EINVAL;
/* Reserve U32_MAX so the logic to go over all the ports is sane */ if (WARN_ON(device->phys_port_cnt == U32_MAX)) return -EINVAL;
/* * device->port_data is indexed directly by the port number to make * access to this data as efficient as possible. * * Therefore port_data is declared as a 1 based array with potential * empty slots at the beginning.
*/
pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
size_add(rdma_end_port(device), 1)),
GFP_KERNEL); if (!pdata_rcu) return -ENOMEM; /* * The rcu_head is put in front of the port data array and the stored * pointer is adjusted since we never need to see that member until * kfree_rcu.
*/
device->port_data = pdata_rcu->pdata;
ret = device->ops.get_port_immutable(device, port,
&pdata->immutable); if (ret) return ret;
if (verify_immutable(device, port)) return -EINVAL;
} return 0;
}
/** * ib_port_immutable_read() - Read rdma port's immutable data * @dev: IB device * @port: port number whose immutable data to read. It starts with index 1 and * valid upto including rdma_end_port().
*/ conststruct ib_port_immutable*
ib_port_immutable_read(struct ib_device *dev, unsignedint port)
{
WARN_ON(!rdma_is_port_valid(dev, port)); return &dev->port_data[port].immutable;
}
EXPORT_SYMBOL(ib_port_immutable_read);
/*
 * NOTE(review): mangled span. It contains the body of add_one_compat_dev()
 * without its header/locals and without its done/err labels, then a fragment
 * of a compat-device removal loop (its enclosing function header is gone),
 * then add_all_compat_devs() missing its down_read(&devices_rwsem) and the
 * outer xa_for_each_marked loop header — hence the apparently unbalanced
 * braces. Code left byte-identical; restore from the full file before editing.
 */
lockdep_assert_held(&rdma_nets_rwsem); if (!ib_devices_shared_netns) return 0;
/* * Create and add compat device in all namespaces other than where it * is currently bound to.
*/ if (net_eq(read_pnet(&rnet->net),
read_pnet(&device->coredev.rdma_net))) return 0;
/* * The first of init_net() or ib_register_device() to take the * compat_devs_mutex wins and gets to add the device. Others will wait * for completion here.
*/
mutex_lock(&device->compat_devs_mutex);
cdev = xa_load(&device->compat_devs, rnet->id); if (cdev) {
ret = 0; goto done;
}
ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); if (ret) goto done;
cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) {
ret = -ENOMEM; goto cdev_err;
}
cdev->dev.parent = device->dev.parent;
rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
cdev->dev.release = compatdev_release;
ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); if (ret) goto add_err;
ret = device_add(&cdev->dev); if (ret) goto add_err;
ret = ib_setup_port_attrs(cdev); if (ret) goto port_err;
ret = xa_err(xa_store(&device->compat_devs, rnet->id,
cdev, GFP_KERNEL)); if (ret) goto insert_err;
/* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread.
*/
down_read(&rdma_nets_rwsem);
xa_for_each (&dev->compat_devs, c_index, cdev)
remove_one_compat_dev(dev, c_index);
up_read(&rdma_nets_rwsem);
}
up_read(&devices_rwsem);
}
staticint add_all_compat_devs(void)
{ struct rdma_dev_net *rnet; struct ib_device *dev; unsignedlong index; int ret = 0;
/* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread.
*/
down_read(&rdma_nets_rwsem);
xa_for_each (&rdma_nets, net_index, rnet) {
ret = add_one_compat_dev(dev, rnet); if (ret) break;
}
up_read(&rdma_nets_rwsem);
}
up_read(&devices_rwsem); if (ret)
remove_all_compat_devs(); return ret;
}
/*
 * Switch the subsystem between shared and exclusive net-namespace mode.
 * Refused with -EBUSY while any non-default namespace entry exists in
 * rdma_nets.
 */
int rdma_compatdev_set(u8 enable)
{
	struct rdma_dev_net *rnet;
	unsigned long index;
	bool busy = false;

	down_write(&rdma_nets_rwsem);
	if (ib_devices_shared_netns == enable) {
		up_write(&rdma_nets_rwsem);
		return 0;
	}

	/* enable/disable of compat devices is not supported
	 * when more than default init_net exists.
	 */
	xa_for_each (&rdma_nets, index, rnet) {
		busy = true;
		break;
	}
	if (!busy)
		ib_devices_shared_netns = enable;
	up_write(&rdma_nets_rwsem);
	if (busy)
		return -EBUSY;

	if (enable)
		return add_all_compat_devs();
	remove_all_compat_devs();
	return 0;
}
/*
 * NOTE(review): rdma_dev_exit_net() is truncated — the statements after the
 * rdma_dev_change_netns() call (re-take of devices_rwsem, put_device, final
 * xa_erase/rdma_nl_net_exit and closing brace) are missing. Code left
 * byte-identical.
 */
staticvoid rdma_dev_exit_net(struct net *net)
{ struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); struct ib_device *dev; unsignedlong index; int ret;
down_write(&rdma_nets_rwsem); /* * Prevent the ID from being re-used and hide the id from xa_for_each.
*/
ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
WARN_ON(ret);
up_write(&rdma_nets_rwsem);
down_read(&devices_rwsem);
xa_for_each (&devices, index, dev) {
get_device(&dev->dev); /* * Release the devices_rwsem so that pontentially blocking * device_del, doesn't hold the devices_rwsem for too long.
*/
up_read(&devices_rwsem);
remove_one_compat_dev(dev, rnet->id);
/* * If the real device is in the NS then move it back to init.
*/
rdma_dev_change_netns(dev, net, &init_net);
/*
 * Pernet init: record the namespace, bring up per-net netlink, and (outside
 * of init_net) allocate an rdma_nets id and create compat devices for every
 * already-registered ib_device. Rolls everything back on failure.
 */
static __net_init int rdma_dev_init_net(struct net *net)
{
	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
	struct ib_device *ibdev;
	unsigned long dev_index;
	int rc;

	write_pnet(&rnet->net, net);

	rc = rdma_nl_net_init(rnet);
	if (rc)
		return rc;

	/* No need to create any compat devices in default init_net. */
	if (net_eq(net, &init_net))
		return 0;

	rc = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
	if (rc) {
		rdma_nl_net_exit(rnet);
		return rc;
	}

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, dev_index, ibdev, DEVICE_REGISTERED) {
		/* Hold nets_rwsem so that netlink command cannot change
		 * system configuration for device sharing mode.
		 */
		down_read(&rdma_nets_rwsem);
		rc = add_one_compat_dev(ibdev, rnet);
		up_read(&rdma_nets_rwsem);
		if (rc)
			break;
	}
	up_read(&devices_rwsem);

	if (rc)
		rdma_dev_exit_net(net);

	return rc;
}
/* * Assign the unique string device name and the unique device index. This is * undone by ib_dealloc_device.
*/ staticint assign_name(struct ib_device *device, constchar *name)
{ static u32 last_id; int ret;
down_write(&devices_rwsem); /* Assign a unique name to the device */ if (strchr(name, '%'))
ret = alloc_name(device, name); else
ret = dev_set_name(&device->dev, name); if (ret) goto out;
if (__ib_device_get_by_name(dev_name(&device->dev))) {
ret = -ENFILE; goto out;
}
strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
&last_id, GFP_KERNEL); if (ret > 0)
ret = 0;
out:
up_write(&devices_rwsem); return ret;
}
/*
 * NOTE(review): this span interleaves two incomplete functions:
 * setup_device() is cut off after the query_device error path, and the
 * remainder belongs to the middle of the device-disable path (its header,
 * locals such as 'cid', and surrounding statements were dropped).
 * Code left byte-identical.
 */
/* * setup_device() allocates memory and sets up data that requires calling the * device ops, this is the only reason these actions are not done during * ib_alloc_device. It is undone by ib_dealloc_device().
*/ staticint setup_device(struct ib_device *device)
{ struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret;
ib_device_check_mandatory(device);
ret = setup_port_data(device); if (ret) {
dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret;
}
memset(&device->attrs, 0, sizeof(device->attrs));
ret = device->ops.query_device(device, &device->attrs, &uhw); if (ret) {
dev_warn(&device->dev, "Couldn't query the device attributes\n"); return ret;
}
/* * Remove clients in LIFO order, see assign_client_id. This could be * more efficient if xarray learns to reverse iterate. Since no new * clients can be added to this ib_device past this point we only need * the maximum possible client_id value here.
*/
down_read(&clients_rwsem);
cid = highest_client_id;
up_read(&clients_rwsem); while (cid) {
cid--;
remove_client_context(device, cid);
}
ib_cq_pool_cleanup(device);
/* Pairs with refcount_set in enable_device */
ib_device_put(device);
wait_for_completion(&device->unreg_completion);
/* * compat devices must be removed after device refcount drops to zero. * Otherwise init_net() may add more compatdevs after removing compat * devices and before device is disabled.
*/
remove_compat_devs(device);
}
/*
 * An enabled device is visible to all clients and to all the public facing
 * APIs that return a device pointer. This always returns with a new get, even
 * if it fails.
 */
static int enable_device_and_get(struct ib_device *device)
{
	struct ib_client *client;
	unsigned long cid;
	int ret = 0;

	/*
	 * One ref belongs to the xa and the other belongs to this
	 * thread. This is needed to guard against parallel unregistration.
	 */
	refcount_set(&device->refcount, 2);

	down_write(&devices_rwsem);
	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
	/*
	 * By using downgrade_write() we ensure that no other thread can clear
	 * DEVICE_REGISTERED while we are completing the client setup.
	 */
	downgrade_write(&devices_rwsem);

	if (device->ops.enable_driver) {
		ret = device->ops.enable_driver(device);
		if (ret)
			goto out;
	}

	/* Notify every registered client of the new device. */
	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, cid, client, CLIENT_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret)
			break;
	}
	up_read(&clients_rwsem);

	if (!ret)
		ret = add_compat_devs(device);
out:
	up_read(&devices_rwsem);
	return ret;
}
/*
 * NOTE(review): orphaned fragment — the enclosing function's header and its
 * local declarations ('port', 'netdev', 'ret') and the down_read that pairs
 * with the up_read below were dropped by extraction. It fires the KOBJ_ADD
 * uevent and the RDMA netlink register/netdev-attach notifications.
 * Code left byte-identical.
 */
/* Mark for userspace that device is ready */
kobject_uevent(&device->dev.kobj, KOBJ_ADD);
ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) goto out;
rdma_for_each_port(device, port) {
netdev = ib_device_get_netdev(device, port); if (!netdev) continue;
ret = rdma_nl_notify_event(device, port,
RDMA_NETDEV_ATTACH_EVENT);
dev_put(netdev); if (ret) goto out;
}
out:
up_read(&devices_rwsem);
}
/*
 * NOTE(review): ib_register_device() is truncated — the success return and
 * the dev_cleanup/cg_cleanup/cache_cleanup error labels referenced by the
 * gotos below are missing after the final dev_set_uevent_suppress().
 * Code left byte-identical; restore the tail from the full file.
 */
/** * ib_register_device - Register an IB device with IB core * @device: Device to register * @name: unique string device name. This may include a '%' which will * cause a unique index to be added to the passed device name. * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB * device will be used. In this case the caller should fully * setup the ibdev for DMA. This usually means using dma_virt_ops. * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). * * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() * asynchronously then the device pointer may become freed as soon as this * function returns.
*/ int ib_register_device(struct ib_device *device, constchar *name, struct device *dma_device)
{ int ret;
ret = assign_name(device, name); if (ret) return ret;
/* * If the caller does not provide a DMA capable device then the IB core * will set up ib_sge and scatterlist structures that stash the kernel * virtual address into the address field.
*/
WARN_ON(dma_device && !dma_device->dma_parms);
device->dma_device = dma_device;
ret = setup_device(device); if (ret) return ret;
ret = ib_cache_setup_one(device); if (ret) {
dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); return ret;
}
device->groups[0] = &ib_dev_attr_group;
device->groups[1] = device->ops.device_group;
ret = ib_setup_device_attrs(device); if (ret) goto cache_cleanup;
ib_device_register_rdmacg(device);
rdma_counter_init(device);
/* * Ensure that ADD uevent is not fired because it * is too early amd device is not initialized yet.
*/
dev_set_uevent_suppress(&device->dev, true);
ret = device_add(&device->dev); if (ret) goto cg_cleanup;
ret = ib_setup_port_attrs(&device->coredev); if (ret) {
dev_warn(&device->dev, "Couldn't register device with driver model\n"); goto dev_cleanup;
}
ret = enable_device_and_get(device); if (ret) { void (*dealloc_fn)(struct ib_device *);
/* * If we hit this error flow then we don't want to * automatically dealloc the device since the caller is * expected to call ib_dealloc_device() after * ib_register_device() fails. This is tricky due to the * possibility for a parallel unregistration along with this * error flow. Since we have a refcount here we know any * parallel flow is stopped in disable_device and will see the * special dealloc_driver pointer, causing the responsibility to * ib_dealloc_device() to revert back to this thread.
*/
dealloc_fn = device->ops.dealloc_driver;
device->ops.dealloc_driver = prevent_dealloc_device;
ib_device_put(device);
__ib_unregister_device(device);
device->ops.dealloc_driver = dealloc_fn;
dev_set_uevent_suppress(&device->dev, false); return ret;
}
dev_set_uevent_suppress(&device->dev, false);
/*
 * NOTE(review): orphaned fragment of the internal unregister path — the
 * function header (taking 'ib_dev') and the disable/free steps before the
 * mutex_lock were dropped by extraction. Code left byte-identical.
 */
/* * We have a registration lock so that all the calls to unregister are * fully fenced, once any unregister returns the device is truely * unregistered even if multiple callers are unregistering it at the * same time. This also interacts with the registration flow and * provides sane semantics if register and unregister are racing.
*/
mutex_lock(&ib_dev->unregistration_lock); if (!refcount_read(&ib_dev->refcount)) goto out;
/* * Drivers using the new flow may not call ib_dealloc_device except * in error unwind prior to registration success.
*/ if (ib_dev->ops.dealloc_driver &&
ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
ib_dealloc_device(ib_dev);
}
out:
mutex_unlock(&ib_dev->unregistration_lock);
}
/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device. All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
	struct device *dev = &ib_dev->dev;

	/* Keep the struct device alive across the unregister itself. */
	get_device(dev);
	__ib_unregister_device(ib_dev);
	put_device(dev);
}
EXPORT_SYMBOL(ib_unregister_device);
/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the driver_unregister callback to clean up
 * their resources associated with the device and dealloc it.
 */
void ib_unregister_device_and_put(struct ib_device *ib_dev)
{
	struct device *dev = &ib_dev->dev;

	WARN_ON(!ib_dev->ops.dealloc_driver);
	/* Pin the struct device, then return the caller's refcount 'get'. */
	get_device(dev);
	ib_device_put(ib_dev);
	__ib_unregister_device(ib_dev);
	put_device(dev);
}
EXPORT_SYMBOL(ib_unregister_device_and_put);
/*
 * NOTE(review): ib_unregister_driver() is truncated — only its kdoc, header
 * and local declarations survived extraction; the iteration over devices is
 * missing. Code left byte-identical.
 */
/** * ib_unregister_driver - Unregister all IB devices for a driver * @driver_id: The driver to unregister * * This implements a fence for device unregistration. It only returns once all * devices associated with the driver_id have fully completed their * unregistration and returned from ib_unregister_device*(). * * If device's are not yet unregistered it goes ahead and starts unregistering * them. * * This does not block creation of new devices with the given driver_id, that * is the responsibility of the caller.
*/ void ib_unregister_driver(enum rdma_driver_id driver_id)
{ struct ib_device *ib_dev; unsignedlong index;
/**
 * ib_unregister_device_queued - Unregister a device using a work queue
 * @ib_dev: The device to unregister
 *
 * This schedules an asynchronous unregistration using a WQ for the device. A
 * driver should use this to avoid holding locks while doing unregistration,
 * such as holding the RTNL lock.
 *
 * Drivers using this API must use ib_unregister_driver before module unload
 * to ensure that all scheduled unregistrations have completed.
 */
void ib_unregister_device_queued(struct ib_device *ib_dev)
{
	struct device *dev = &ib_dev->dev;

	WARN_ON(!refcount_read(&ib_dev->refcount));
	WARN_ON(!ib_dev->ops.dealloc_driver);
	/* The queued work owns this reference; drop it if scheduling failed. */
	get_device(dev);
	if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work))
		put_device(dev);
}
EXPORT_SYMBOL(ib_unregister_device_queued);
/*
 * NOTE(review): rdma_dev_change_netns() is missing statements dropped by
 * extraction between the refcount check and the write_pnet() — in particular
 * the steps that take the device down (uevent removal and device disable)
 * before the namespace switch. Code left byte-identical; do not edit without
 * the full file.
 */
/* * The caller must pass in a device that has the kref held and the refcount * released. If the device is in cur_net and still registered then it is moved * into net.
*/ staticint rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net)
{ int ret2 = -EINVAL; int ret;
mutex_lock(&device->unregistration_lock);
/* * If a device not under ib_device_get() or if the unregistration_lock * is not held, the namespace can be changed, or it can be unregistered. * Check again under the lock.
*/ if (refcount_read(&device->refcount) == 0 ||
!net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
ret = -ENODEV; goto out;
}
/* * At this point no one can be using the device, so it is safe to * change the namespace.
*/
write_pnet(&device->coredev.rdma_net, net);
down_read(&devices_rwsem); /* * Currently rdma devices are system wide unique. So the device name * is guaranteed free in the new namespace. Publish the new namespace * at the sysfs level.
*/
ret = device_rename(&device->dev, dev_name(&device->dev));
up_read(&devices_rwsem); if (ret) {
dev_warn(&device->dev, "%s: Couldn't rename device after namespace change\n",
__func__); /* Try and put things back and re-enable the device */
write_pnet(&device->coredev.rdma_net, cur_net);
}
ret2 = enable_device_and_get(device); if (ret2) { /* * This shouldn't really happen, but if it does, let the user * retry at later point. So don't disable the device.
*/
dev_warn(&device->dev, "%s: Couldn't re-enable device after namespace change\n",
__func__);
}
kobject_uevent(&device->dev.kobj, KOBJ_ADD);
ib_device_put(device);
out:
mutex_unlock(&device->unregistration_lock); if (ret) return ret; return ret2;
}
/*
 * ib_device_set_netns_put - Move @dev into the net namespace named by @ns_fd.
 * The caller's device reference is always released before returning, and the
 * namespace reference obtained from @ns_fd is dropped on every path.
 * Returns 0 on success or a negative errno.
 */
int ib_device_set_netns_put(struct sk_buff *skb,
			    struct ib_device *dev, u32 ns_fd)
{
	struct net *net;
	int ret;

	net = get_net_ns_by_fd(ns_fd);
	if (IS_ERR(net)) {
		ret = PTR_ERR(net);
		goto net_err;
	}

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
		ret = -EPERM;
		goto ns_err;
	}

	/*
	 * All the ib_clients, including uverbs, are reset when the namespace is
	 * changed and this cannot be blocked waiting for userspace to do
	 * something, so disassociation is mandatory.
	 */
	if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) {
		ret = -EOPNOTSUPP;
		goto ns_err;
	}

	/*
	 * Hold a kref across the namespace change while dropping the
	 * registration refcount, as rdma_dev_change_netns() requires.
	 */
	get_device(&dev->dev);
	ib_device_put(dev);
	ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
	put_device(&dev->dev);

	put_net(net);
	return ret;

ns_err:
	put_net(net);
net_err:
	ib_device_put(dev);
	return ret;
}
staticint assign_client_id(struct ib_client *client)
{ int ret;
lockdep_assert_held(&clients_rwsem); /* * The add/remove callbacks must be called in FIFO/LIFO order. To * achieve this we assign client_ids so they are sorted in * registration order.
*/
client->client_id = highest_client_id;
ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); if (ret) return ret;
/*
 * Erase @client from the clients xarray and shrink highest_client_id back
 * down so it is one past the highest id still in use.
 */
static void remove_client_id(struct ib_client *client)
{
	down_write(&clients_rwsem);
	xa_erase(&clients, client->client_id);
	for (; highest_client_id; highest_client_id--)
		if (xa_load(&clients, highest_client_id - 1))
			break;
	up_write(&clients_rwsem);
}
/** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered.
*/ int ib_register_client(struct ib_client *client)
{ struct ib_device *device; unsignedlong index; bool need_unreg = false; int ret;
/* * The devices_rwsem is held in write mode to ensure that a racing * ib_register_device() sees a consisent view of clients and devices.
*/
down_write(&devices_rwsem);
down_write(&clients_rwsem);
ret = assign_client_id(client); if (ret) goto out;
need_unreg = true;
xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
ret = add_client_context(device, client); if (ret) goto out;
}
ret = 0;
out:
up_write(&clients_rwsem);
up_write(&devices_rwsem); if (need_unreg && ret)
ib_unregister_client(client); return ret;
}
EXPORT_SYMBOL(ib_register_client);
/** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. * * This is a full fence, once it returns no client callbacks will be called, * or are running in another thread.
*/ void ib_unregister_client(struct ib_client *client)
{ struct ib_device *device; unsignedlong index;
/* We do not want to have locks while calling client->remove() */
rcu_read_lock();
xa_for_each (&devices, index, device) { if (!ib_device_try_get(device)) continue;
rcu_read_unlock();
/* * remove_client_context() is not a fence, it can return even though a * removal is ongoing. Wait until all removals are completed.
*/
wait_for_completion(&client->uses_zero);
remove_client_id(client);
}
EXPORT_SYMBOL(ib_unregister_client);
staticint __ib_get_global_client_nl_info(constchar *client_name, struct ib_client_nl_info *res)
{ struct ib_client *client; unsignedlong index; int ret = -ENOENT;
down_read(&clients_rwsem);
xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { if (strcmp(client->name, client_name) != 0) continue; if (!client->get_global_nl_info) {
ret = -EOPNOTSUPP; break;
}
ret = client->get_global_nl_info(res); if (WARN_ON(ret == -ENOENT))
ret = -EINVAL; if (!ret && res->cdev)
get_device(res->cdev); break;
}
up_read(&clients_rwsem); return ret;
}
staticint __ib_get_client_nl_info(struct ib_device *ibdev, constchar *client_name, struct ib_client_nl_info *res)
{ unsignedlong index; void *client_data; int ret = -ENOENT;
if (!client || strcmp(client->name, client_name) != 0) continue; if (!client->get_nl_info) {
ret = -EOPNOTSUPP; break;
}
ret = client->get_nl_info(ibdev, client_data, res); if (WARN_ON(ret == -ENOENT))
ret = -EINVAL;
/* * The cdev is guaranteed valid as long as we are inside the * client_data_rwsem as remove_one can't be called. Keep it * valid for the caller.
*/ if (!ret && res->cdev)
get_device(res->cdev); break;
}
up_read(&ibdev->client_data_rwsem);
return ret;
}
/** * ib_get_client_nl_info - Fetch the nl_info from a client * @ibdev: IB device * @client_name: Name of the client * @res: Result of the query
*/ int ib_get_client_nl_info(struct ib_device *ibdev, constchar *client_name, struct ib_client_nl_info *res)
{ int ret;
if (ibdev)
ret = __ib_get_client_nl_info(ibdev, client_name, res); else
ret = __ib_get_global_client_nl_info(client_name, res); #ifdef CONFIG_MODULES if (ret == -ENOENT) {
request_module("rdma-client-%s", client_name); if (ibdev)
ret = __ib_get_client_nl_info(ibdev, client_name, res); else
ret = __ib_get_global_client_nl_info(client_name, res);
} #endif if (ret) { if (ret == -ENOENT) return -EOPNOTSUPP; return ret;
}
if (WARN_ON(!res->cdev)) return -EINVAL; return 0;
}
/** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context data that can be retrieved with * ib_get_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called.
*/ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data)
{ void *rc;
/** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback occurs in workqueue context.
*/ void ib_register_event_handler(struct ib_event_handler *event_handler)
{
down_write(&event_handler->device->event_handler_rwsem);
list_add_tail(&event_handler->list,
&event_handler->device->event_handler_list);
up_write(&event_handler->device->event_handler_rwsem);
}
EXPORT_SYMBOL(ib_register_event_handler);
/** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler().
*/ void ib_unregister_event_handler(struct ib_event_handler *event_handler)
{
down_write(&event_handler->device->event_handler_rwsem);
list_del(&event_handler->list);
up_write(&event_handler->device->event_handler_rwsem);
}
EXPORT_SYMBOL(ib_unregister_event_handler);
/** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer.
*/ int ib_query_port(struct ib_device *device,
u32 port_num, struct ib_port_attr *port_attr)
{ if (!rdma_is_port_valid(device, port_num)) return -EINVAL;
/*
 * NOTE(review): the remainder of ib_query_port's body appears to be missing
 * here. The code below operates on ndev_hash_lock and pdata->ndev_hash_link
 * (neither declared in this scope) and looks like the body of a separate
 * static netdev-hash maintenance helper that was spliced in by a faulty
 * extraction. Restore both functions from the original file before building.
 */
spin_lock_irqsave(&ndev_hash_lock, flags); if (hash_hashed(&pdata->ndev_hash_link)) {
hash_del_rcu(&pdata->ndev_hash_link);
spin_unlock_irqrestore(&ndev_hash_lock, flags); /* * We cannot do hash_add_rcu after a hash_del_rcu until the * grace period
*/
synchronize_rcu();
spin_lock_irqsave(&ndev_hash_lock, flags);
} if (pdata->netdev)
hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
(uintptr_t)pdata->netdev);
spin_unlock_irqrestore(&ndev_hash_lock, flags);
}
/** * ib_device_set_netdev - Associate the ib_dev with an underlying net_device * @ib_dev: Device to modify * @ndev: net_device to affiliate, may be NULL * @port: IB port the net_device is connected to * * Drivers should use this to link the ib_device to a netdev so the netdev * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be * affiliated with any port. * * The caller must ensure that the given ndev is not unregistered or * unregistering, and that either the ib_device is unregistered or * ib_device_set_netdev() is called with NULL when the ndev sends a * NETDEV_UNREGISTER event.
*/ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
u32 port)
{ enum rdma_nl_notify_event_type etype; struct net_device *old_ndev; struct ib_port_data *pdata; unsignedlong flags; int ret;
if (!rdma_is_port_valid(ib_dev, port)) return -EINVAL;
/* * Drivers wish to call this before ib_register_driver, so we have to * setup the port data early.
*/
ret = alloc_port_data(ib_dev); if (ret) return ret;
/*
 * NOTE(review): the middle of this function is missing — pdata is never
 * assigned, the netdev_lock is unlocked at the end without a visible lock,
 * and the unmatched closing braces below indicate the body was truncated
 * during extraction. Also note "unsignedlong" above is a fused token.
 * Restore from the original file before building.
 */
/* * If this is the last dev_put there is still a * synchronize_rcu before the netdev is kfreed, so we * can continue to rely on unlocked pointer * comparisons after the put
*/
rcu_assign_pointer(pdata->netdev, NULL);
netdev_put(ndev, &pdata->netdev_tracker);
}
spin_unlock_irqrestore(&pdata->netdev_lock, flags);
}
}
/*
 * NOTE(review): headerless fragment — the enclosing function's signature is
 * missing (from its shape — port validation returning NULL, ops.get_netdev
 * fallback, netdev_lock-protected rcu_dereference — this is the interior of
 * a get-netdev accessor, presumably ib_device_get_netdev; verify against the
 * original file). Neither `res` nor the function's opening brace is visible.
 */
if (!rdma_is_port_valid(ib_dev, port)) return NULL;
if (!ib_dev->port_data) return NULL;
pdata = &ib_dev->port_data[port];
/* * New drivers should use ib_device_set_netdev() not the legacy * get_netdev().
*/ if (ib_dev->ops.get_netdev)
res = ib_dev->ops.get_netdev(ib_dev, port); else {
spin_lock(&pdata->netdev_lock);
res = rcu_dereference_protected(
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
/* Hold a reference so the caller may use res after the lock is dropped */
dev_hold(res);
spin_unlock(&pdata->netdev_lock);
}
/** * ib_query_netdev_port - Query the port number of a net_device * associated with an ibdev * @ibdev: IB device * @ndev: Network device * @port: IB port the net_device is connected to
*/ int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev,
u32 *port)
{ struct net_device *ib_ndev;
u32 port_num;
/*
 * NOTE(review): function body truncated after the local declarations —
 * the lookup logic, return statements and closing brace are missing.
 * Restore from the original file before building.
 */
/** * ib_device_get_by_netdev - Find an IB device associated with a netdev * @ndev: netdev to locate * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device that is associated with a netdev via * ib_device_set_netdev(). The caller must call ib_device_put() on the * returned pointer.
*/ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id)
{ struct ib_device *res = NULL; struct ib_port_data *cur;
/*
 * NOTE(review): function body truncated after the local declarations —
 * the hash lookup, reference acquisition and closing brace are missing.
 * Restore from the original file before building.
 */
/** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all of the physical RoCE ports of ib_dev * which are related to netdevice and calls callback() on each * device for which filter() function returns non zero.
*/ void ib_enum_roce_netdev(struct ib_device *ib_dev,
roce_netdev_filter filter, void *filter_cookie,
roce_netdev_callback cb, void *cookie)
{
u32 port;
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.14 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.