/* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/
int ipoib_open(struct net_device *dev)
{ struct ipoib_dev_priv *priv = ipoib_priv(dev);
ipoib_dbg(priv, "bringing up interface\n");
netif_carrier_off(dev);
set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; goto err_disable;
}
ipoib_ib_dev_up(dev);
if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv;
/* Bring up any child interfaces too */
netdev_lock_ops_to_full(dev);
list_for_each_entry(cpriv, &priv->child_intfs, list)
ipoib_schedule_ifupdown_task(cpriv->dev, true);
netdev_unlock_full_to_ops(dev);
} elseif (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
if (!test_bit(IPOIB_FLAG_ADMIN_UP, &ppriv->flags))
ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n",
ppriv->dev->name);
}
netif_start_queue(dev);
break; case AF_INET6: if (IS_ENABLED(CONFIG_IPV6) &&
ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1)) returntrue;
break;
} returnfalse;
}
/* * Find the L2 master net_device on top of the given net_device. * @dev: base IPoIB net_device * * Returns the L2 master net_device with a reference held if the L2 master * exists (such as a bond netdevice), or returns the same netdev with a reference * held when no master exists or when the master is an L3 master (such as a VRF netdev).
*/ staticstruct net_device *ipoib_get_master_net_dev(struct net_device *dev)
{ struct net_device *master;
rcu_read_lock();
master = netdev_master_upper_dev_get_rcu(dev); if (!master || netif_is_l3_master(master))
master = dev;
/*
 * ipoib_upper_walk - per-device callback that checks one upper device
 * against the address being searched for.
 * @upper: upper net_device currently being visited
 * @priv:  nested walk context; priv->data points to a struct ipoib_walk_data
 *
 * When @upper matches data->addr, a reference is taken on @upper with
 * dev_hold() and the device is stored in data->result.
 *
 * Returns 1 when a match was recorded, 0 otherwise.
 * NOTE(review): presumably a non-zero return stops the walk in the caller -
 * confirm against the walker that invokes this callback.
 */
static int ipoib_upper_walk(struct net_device *upper,
			    struct netdev_nested_priv *priv)
{
	struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data;
	int ret = 0;

	if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) {
		/* hand the matching device back with a reference held */
		dev_hold(upper);
		data->result = upper;
		ret = 1;
	}

	return ret;
}
/** * ipoib_get_net_dev_match_addr - Find a net_device matching * the given address, which is an upper device of the given net_device. * * @addr: IP address to look for. * @dev: base IPoIB net_device * * If found, returns the net_device with a reference held. Otherwise return * NULL.
*/ staticstruct net_device *ipoib_get_net_dev_match_addr( conststruct sockaddr *addr, struct net_device *dev)
{ struct netdev_nested_priv priv; struct ipoib_walk_data data = {
.addr = addr,
};
/* returns the number of IPoIB netdevs on top of a given ipoib device matching a * pkey_index and address, if one exists. * * @found_net_dev: contains a matching net_device if the return value >= 1,
* with a reference held. */ staticint ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, constunion ib_gid *gid,
u16 pkey_index, conststruct sockaddr *addr, int nesting, struct net_device **found_net_dev)
{ struct ipoib_dev_priv *child_priv; struct net_device *net_dev = NULL; int matches = 0;
if (priv->pkey_index == pkey_index &&
(!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { if (!addr) {
net_dev = ipoib_get_master_net_dev(priv->dev);
} else { /* Verify the net_device matches the IP address, as
* IPoIB child devices currently share a GID. */
net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
} if (net_dev) { if (!*found_net_dev)
*found_net_dev = net_dev; else
dev_put(net_dev);
++matches;
}
}
if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) return matches;
/* Returns the number of matching net_devs found (between 0 and 2). Also * return the matching net_device in the @net_dev parameter, holding a
* reference to the net_device, if the number of matches >= 1 */ staticint __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port,
u16 pkey_index, constunion ib_gid *gid, conststruct sockaddr *addr, struct net_device **net_dev)
{ struct ipoib_dev_priv *priv; int matches = 0;
*net_dev = NULL;
list_for_each_entry(priv, dev_list, list) { if (priv->port != port) continue;
ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); if (ret) return NULL;
/* See if we can find a unique device matching the pkey and GID */
matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
gid, NULL, &net_dev);
switch (matches) { case 0: return NULL; case 1: return net_dev;
}
dev_put(net_dev);
/* Couldn't find a unique device with pkey and GID only. Use L3
* address to uniquely match the net device */
matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
gid, addr, &net_dev); switch (matches) { case 0: return NULL; default:
dev_warn_ratelimited(&dev->dev, "duplicate IP address detected\n");
fallthrough; case 1: return net_dev;
}
}
if (!status)
ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
be32_to_cpu(sa_path_get_dlid(pathrec)),
pathrec->dgid.raw); else
ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
status, path->pathrec.dgid.raw);
if (!IS_ERR_OR_NULL(ah)) { /* * pathrec.dgid is used as the database key from the LLADDR, * it must remain unchanged even if the SA returns a different * GID to use in the AH.
*/ if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw, sizeof(union ib_gid))) {
ipoib_dbg(
priv, "%s got PathRec for gid %pI6 while asked for %pI6\n",
dev->name, pathrec->dgid.raw,
path->pathrec.dgid.raw);
memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw, sizeof(union ib_gid));
}
path->pathrec = *pathrec;
old_ah = path->ah;
path->ah = ah;
ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
ah, be32_to_cpu(sa_path_get_dlid(pathrec)),
pathrec->sl);
while ((skb = __skb_dequeue(&path->queue)))
__skb_queue_tail(&skqueue, skb);
list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { if (neigh->ah) {
WARN_ON(neigh->ah != old_ah); /* * Dropping the ah reference inside * priv->lock is safe here, because we * will hold one more reference from * the original value of path->ah (ie * old_ah).
*/
ipoib_put_ah(neigh->ah);
}
kref_get(&path->ah->ref);
neigh->ah = path->ah;
if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh))
ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
path,
neigh)); if (!ipoib_cm_get(neigh)) {
ipoib_neigh_free(neigh); continue;
}
}
/* To avoid race condition, make sure that the * neigh will be added only once.
*/ if (unlikely(!list_empty(&neigh->list))) {
spin_unlock_irqrestore(&priv->lock, flags); return neigh;
}
path = __path_find(dev, daddr + 4); if (!path) {
path = path_rec_create(dev, daddr + 4); if (!path) goto err_path;
__path_add(dev, path);
}
list_add_tail(&neigh->list, &path->neigh_list);
if (path->ah && path->ah->valid) {
kref_get(&path->ah->ref);
neigh->ah = path->ah;
/* unicast, arrange "switch" according to probability */ switch (header->proto) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_TIPC):
neigh = ipoib_neigh_get(dev, phdr->hwaddr); if (unlikely(!neigh)) {
neigh = neigh_add_path(skb, phdr->hwaddr, dev); if (likely(!neigh)) return NETDEV_TX_OK;
} break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): /* for unicast ARP and RARP should always perform path find */
unicast_arp_send(skb, dev, phdr); return NETDEV_TX_OK; default: /* ethertype not supported by IPoIB */
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb); return NETDEV_TX_OK;
}
send_using_neigh: /* note we now hold a ref to neigh */ if (ipoib_cm_get(neigh)) { if (ipoib_cm_up(neigh)) {
ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); goto unref;
}
} elseif (neigh->ah && neigh->ah->valid) {
neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
IPOIB_QPN(phdr->hwaddr)); goto unref;
} elseif (neigh->ah) {
neigh_refresh_path(neigh, phdr->hwaddr, dev);
}
/* * we don't rely on dst_entry structure, always stuff the * destination address into skb hard header so we can figure out where * to send the packet later.
*/
push_pseudo_header(skb, daddr);
static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
{ /* * Use only the address parts that contributes to spreading * The subnet prefix is not used as one can not connect to * same remote port (GUID) using the same remote QPN via two * different subnets.
*/ /* qpn octets[1:4) & port GUID octets[12:20) */
u32 *d32 = (u32 *) daddr;
u32 hv;
/* neigh is obsolete if it was idle for two GC periods */
dt = 2 * arp_tbl.gc_interval;
neigh_obsolete = jiffies - dt;
for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i];
while ((neigh = rcu_dereference_protected(*np,
lockdep_is_held(&priv->lock))) != NULL) { /* was the neigh idle for two GC periods */ if (time_after(neigh_obsolete, neigh->alive)) {
/* need to add a new neigh, but maybe some other thread succeeded? * recalc hash, maybe hash resize took place so we do a search
*/
hash_val = ipoib_addr_hash(htbl, daddr); for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
lockdep_is_held(&priv->lock));
neigh != NULL;
neigh = rcu_dereference_protected(neigh->hnext,
lockdep_is_held(&priv->lock))) { if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { /* found, take one ref on behalf of the caller */ if (!refcount_inc_not_zero(&neigh->refcnt)) { /* deleted */
neigh = NULL; break;
}
neigh->alive = jiffies; goto out_unlock;
}
}
neigh = ipoib_neigh_ctor(daddr, dev); if (!neigh) goto out_unlock;
/* one ref on behalf of the hash table */
refcount_inc(&neigh->refcnt);
neigh->alive = jiffies; /* put in hash */
rcu_assign_pointer(neigh->hnext,
rcu_dereference_protected(htbl->buckets[hash_val],
lockdep_is_held(&priv->lock)));
rcu_assign_pointer(htbl->buckets[hash_val], neigh);
atomic_inc(&ntbl->entries);
out_unlock:
return neigh;
}
/*
 * ipoib_neigh_dtor - release a neigh whose reference count dropped to zero.
 * @neigh: the neigh entry to destroy
 *
 * Drops the neigh's address handle reference (if any), frees every skb still
 * sitting on the neigh queue while accounting each one as a TX drop, destroys
 * the connected-mode TX context when one is attached, and finally frees the
 * neigh itself. The neigh table entry counter is decremented afterwards.
 */
void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
{
	struct net_device *ndev = neigh->dev;
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
	struct sk_buff *pending;

	if (neigh->ah)
		ipoib_put_ah(neigh->ah);

	/* packets still queued on this neigh are counted as dropped */
	for (pending = __skb_dequeue(&neigh->queue); pending;
	     pending = __skb_dequeue(&neigh->queue)) {
		ndev->stats.tx_dropped++;
		dev_kfree_skb_any(pending);
	}

	if (ipoib_cm_get(neigh))
		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));

	ipoib_dbg(priv, "neigh free for %06x %pI6\n",
		  IPOIB_QPN(neigh->daddr), neigh->daddr + 4);

	kfree(neigh);

	/*
	 * Once the entry count reaches zero, signal the "flushed" completion,
	 * but only when the table-flush flag is set.
	 */
	if (atomic_dec_and_test(&priv->ntbl.entries) &&
	    test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
		complete(&priv->ntbl.flushed);
}
/*
 * ipoib_neigh_reclaim - drop a reference on a neigh removed from the hash
 * table.
 * @rp: the rcu_head embedded (as member "rcu") in the struct ipoib_neigh
 *
 * Recovers the enclosing ipoib_neigh from @rp and releases one reference
 * with ipoib_neigh_put().
 */
static void ipoib_neigh_reclaim(struct rcu_head *rp)
{
	/* Called as a result of removal from hash table */
	struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);

	/* note TX context may hold another ref */
	ipoib_neigh_put(neigh);
}
/*
 * ipoib_dev_init - allocate the per-device resources of an IPoIB net_device.
 * @dev: the IPoIB net_device being initialized
 *
 * In order: allocates the ordered workqueue (priv->wq), allocates the
 * protection domain (priv->pd), runs the rdma netdev ndo_init hook,
 * initializes the neigh hash, and - if the interface is already flagged
 * IFF_UP - opens the IB device.
 *
 * Returns 0 on success. On failure every resource acquired so far is released
 * in reverse order and a negative errno is returned: -ENOMEM when the
 * workqueue cannot be allocated, the error carried by ib_alloc_pd(), the
 * value returned by ndo_init / ipoib_neigh_hash_init(), or -ENODEV when the
 * device cannot be opened.
 */
static int ipoib_dev_init(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int ret = -ENOMEM;

	priv->qp = NULL;

	/*
	 * the various IPoIB tasks assume they will never race against
	 * themselves, so always use a single thread workqueue
	 */
	priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM);
	if (!priv->wq) {
		pr_warn("%s: failed to allocate device WQ\n", dev->name);
		goto out;
	}

	/* create pd, which is used both for control and datapath */
	priv->pd = ib_alloc_pd(priv->ca, 0);
	if (IS_ERR(priv->pd)) {
		pr_warn("%s: failed to allocate PD\n", priv->ca->name);
		/*
		 * Report the real error and do not leave an ERR_PTR behind:
		 * the cleanup code below tests priv->pd with a plain non-NULL
		 * check, which an error pointer would pass.
		 */
		ret = PTR_ERR(priv->pd);
		priv->pd = NULL;
		goto clean_wq;
	}

	ret = priv->rn_ops->ndo_init(dev);
	if (ret) {
		pr_warn("%s failed to init HW resource\n", dev->name);
		goto out_free_pd;
	}

	ret = ipoib_neigh_hash_init(priv);
	if (ret) {
		pr_warn("%s failed to init neigh hash\n", dev->name);
		goto out_dev_uninit;
	}

	if (dev->flags & IFF_UP) {
		if (ipoib_ib_dev_open(dev)) {
			pr_warn("%s failed to open device\n", dev->name);
			ret = -ENODEV;
			goto out_hash_uninit;
		}
	}

	return 0;

out_hash_uninit:
	ipoib_neigh_hash_uninit(dev);

out_dev_uninit:
	ipoib_ib_dev_cleanup(dev);

out_free_pd:
	if (priv->pd) {
		ib_dealloc_pd(priv->pd);
		priv->pd = NULL;
	}

clean_wq:
	if (priv->wq) {
		destroy_workqueue(priv->wq);
		priv->wq = NULL;
	}

out:
	return ret;
}
/*
 * This must be called before doing an unregister_netdev on a parent device to
 * shutdown the IB event handler.
 *
 * @ndev: the parent IPoIB net_device about to be unregistered
 *
 * Clears IFF_UP on the device under the rtnl lock, unregisters the IB event
 * handler, and then flushes ipoib_workqueue.
 */
static void ipoib_parent_unregister_pre(struct net_device *ndev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);

	/*
	 * ipoib_set_mac checks netif_running before pushing work, clearing
	 * running ensures that it will not add more work.
	 */
	rtnl_lock();
	dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL);
	rtnl_unlock();

	/* ipoib_event() cannot be running once this returns */
	ib_unregister_event_handler(&priv->event_handler);

	/*
	 * Work on the queue grabs the rtnl lock, so this cannot be done while
	 * also holding it.
	 */
	flush_workqueue(ipoib_workqueue);
}
result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); if (result) {
pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n",
priv->ca->name, priv->port, result); return result;
}
result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid); if (result) {
pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n",
priv->ca->name, priv->port, result); return result;
}
dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid));
SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent);
priv->dev->dev_port = priv->port - 1; /* Let's set this one too for backwards compatibility. */
priv->dev->dev_id = priv->port - 1;
/* * ipoib_remove_one guarantees the children are removed before the * parent, and that is the only place where a parent can be removed.
*/
WARN_ON(!list_empty(&priv->child_intfs));
if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
/* no more works over the priv->wq */ if (priv->wq) { /* See ipoib_mcast_carrier_on_task() */
WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags));
destroy_workqueue(priv->wq);
priv->wq = NULL;
}
dev_put(priv->parent);
}
/*
 * NOTE(review): this definition looks truncated or spliced from two routines.
 * It is declared to return a value but contains no return statement, and
 * vf, link_state and priv are never used; the only statement sets
 * dev->needs_free_netdev. The fused "staticint" is presumably "static int".
 * Confirm against the complete source before relying on this block.
 */
staticint ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
{ struct ipoib_dev_priv *priv = ipoib_priv(dev);
/* * unregister_netdev always frees the netdev, we use this mode * consistently to unify all the various unregister paths, including * those connected to rtnl_link_ops which require it.
*/
dev->needs_free_netdev = true;
}
rn->clnt_priv = priv; /* * Only the child register_netdev flows can handle priv_destructor * being set, so we force it to NULL here and handle manually until it * is safe to turn on.
*/
priv->next_priv_destructor = dev->priv_destructor;
dev->priv_destructor = NULL;
/* * Upon success the caller must ensure ipoib_intf_free is called or * register_netdevice succeeded and priv_destructor is set to * ipoib_intf_free.
*/ return dev;
}
dev->priv_destructor = priv->next_priv_destructor; if (dev->priv_destructor)
dev->priv_destructor(dev);
/* * There are some error flows around register_netdev failing that may * attempt to call priv_destructor twice, prevent that from happening.
*/
dev->priv_destructor = NULL;
/* unregister/destroy is very complicated. Make bugs more obvious. */
rn->clnt_priv = NULL;
staticint ipoib_check_lladdr(struct net_device *dev, struct sockaddr_storage *ss)
{ union ib_gid *gid = (union ib_gid *)(ss->__data + 4); int ret = 0;
netif_addr_lock_bh(dev);
/* Make sure the QPN, reserved and subnet prefix match the current * lladdr, it also makes sure the lladdr is unicast.
*/ if (memcmp(dev->dev_addr, ss->__data,
4 + sizeof(gid->global.subnet_prefix)) ||
gid->global.interface_id == 0)
ret = -EINVAL;
return ret ? ret : count;
} static DEVICE_ATTR_WO(create_child);
/*
 * delete_child_store - store handler of the write-only "delete_child"
 * device attribute.
 * @dev:   device the attribute belongs to; converted with to_net_dev()
 * @attr:  the attribute being written (unused)
 * @buf:   user-supplied text holding the pkey; parsed with "%i", so decimal,
 *         octal (leading 0) and hex (leading 0x) are accepted
 * @count: number of bytes written
 *
 * Returns @count on success, -EINVAL when @buf does not start with an integer
 * or the value is outside 0..0xffff, or the non-zero value returned by
 * ipoib_vlan_delete().
 */
static ssize_t delete_child_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	/* the pkey is passed on only if it fits in 16 bits */
	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	ret = ipoib_vlan_delete(to_net_dev(dev), pkey);

	return ret ? ret : count;
}
static DEVICE_ATTR_WO(delete_child);
/*
 * ipoib_add_pkey_attr - create the file for the dev_attr_pkey attribute on
 * the device embedded in @dev.
 * @dev: net_device whose embedded struct device receives the attribute
 *
 * Returns the result of device_create_file() unchanged.
 */
int ipoib_add_pkey_attr(struct net_device *dev)
{
	int rc = device_create_file(&dev->dev, &dev_attr_pkey);

	return rc;
}
/* * We erroneously exposed the iface's port number in the dev_id * sysfs field long after dev_port was introduced for that purpose[1], * and we need to stop everyone from relying on that. * Let's overload the shower routine for the dev_id file here * to gently bring the issue up. * * [1] https://www.spinics.net/lists/netdev/msg272123.html
*/ static ssize_t dev_id_show(struct device *dev, struct device_attribute *attr, char *buf)
{ struct net_device *ndev = to_net_dev(dev);
/* * ndev->dev_port will be equal to 0 in old kernel prior to commit * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface * port numbers") Zero was chosen as special case for user space * applications to fallback and query dev_id to check if it has * different value or not. * * Don't print warning in such scenario. * * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358
*/ if (ndev->dev_port && ndev->dev_id == ndev->dev_port)
netdev_info_once(ndev, "\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n",
current->comm);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.