/* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/
/*
 * Join-state value for a send-only member request that is allowed to
 * create the multicast group (mcg).
 */
#define SENDONLY_FULLMEMBER_JOIN 8
/*
 * (Re)queue priv->mcast_task on priv->wq.
 *
 * @priv:  device private data; nothing is scheduled unless
 *         IPOIB_FLAG_OPER_UP is set in priv->flags
 * @mcast: group whose join failed, or NULL when no specific group is
 *         being retried
 * @delay: true to request a delayed retry, false to run right away
 *
 * Any previously queued instance of the task is cancelled first.
 *
 * This should be called with the priv->lock held
 */
static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv,
					       struct ipoib_mcast *mcast,
					       bool delay)
{
	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
		return;

	/*
	 * We will be scheduling *something*, so cancel whatever is
	 * currently scheduled first
	 */
	cancel_delayed_work(&priv->mcast_task);

	if (mcast && delay) {
		/*
		 * We had a failure and want to schedule a retry later:
		 * double the per-group backoff, capped at
		 * IPOIB_MAX_BACKOFF_SECONDS.
		 */
		mcast->backoff *= 2;
		if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
			mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
		mcast->delay_until = jiffies + (mcast->backoff * HZ);
		/*
		 * Mark this mcast for its delay, but restart the
		 * task immediately.  The join task will make sure to
		 * clear out all entries without delays, and then
		 * schedule itself to run again when the earliest
		 * delay expires
		 */
		queue_delayed_work(priv->wq, &priv->mcast_task, 0);
	} else if (delay) {
		/*
		 * Special case of retrying after a failure to
		 * allocate the broadcast multicast group, wait
		 * 1 second and try again
		 */
		queue_delayed_work(priv->wq, &priv->mcast_task, HZ);
	} else {
		/* No delay requested: run the join task as soon as possible */
		queue_delayed_work(priv->wq, &priv->mcast_task, 0);
	}
}
/* Set the multicast MTU and cached Q_Key before we attach if it's * the broadcast group.
*/ if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid))) {
spin_lock_irq(&priv->lock); if (!priv->broadcast) {
spin_unlock_irq(&priv->lock); return -EAGAIN;
} /*update priv member according to the new mcast*/
priv->broadcast->mcmember.qkey = mcmember->qkey;
priv->broadcast->mcmember.mtu = mcmember->mtu;
priv->broadcast->mcmember.traffic_class = mcmember->traffic_class;
priv->broadcast->mcmember.rate = mcmember->rate;
priv->broadcast->mcmember.sl = mcmember->sl;
priv->broadcast->mcmember.flow_label = mcmember->flow_label;
priv->broadcast->mcmember.hop_limit = mcmember->hop_limit; /* assume if the admin and the mcast are the same both can be changed */
mtu = rdma_mtu_enum_to_int(priv->ca, priv->port,
priv->broadcast->mcmember.mtu); if (priv->mcast_mtu == priv->admin_mtu)
priv->admin_mtu = IPOIB_UD_MTU(mtu);
priv->mcast_mtu = IPOIB_UD_MTU(mtu);
rn->mtu = priv->mcast_mtu;
if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
ipoib_warn(priv, "multicast group %pI6 already attached\n",
mcast->mcmember.mgid.raw);
return 0;
}
ret = rn->attach_mcast(dev, priv->ca, &mcast->mcmember.mgid,
be16_to_cpu(mcast->mcmember.mlid),
set_qkey, priv->qkey); if (ret < 0) {
ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n",
mcast->mcmember.mgid.raw);
if (ib_query_port(priv->ca, priv->port, &attr) ||
attr.state != IB_PORT_ACTIVE) {
ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return;
} /* * Take rtnl_lock to avoid racing with ipoib_stop() and * turning the carrier back on while a device is being * removed. However, ipoib_stop() will attempt to flush * the workqueue while holding the rtnl lock, so loop * on trylock until either we get the lock or we see * FLAG_OPER_UP go away as that signals that we are bailing * and can safely ignore the carrier on work.
*/ while (!rtnl_trylock()) { if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) return; else
msleep(20);
} if (!ipoib_cm_admin_enabled(priv->dev))
dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu));
netif_carrier_on(priv->dev);
rtnl_unlock();
}
/* We trap for port events ourselves. */ if (status == -ENETRESET) {
status = 0; goto out;
}
if (!status)
status = ipoib_mcast_join_finish(mcast, &multicast->rec);
if (!status) {
mcast->backoff = 1;
mcast->delay_until = jiffies;
/* * Defer carrier on work to priv->wq to avoid a * deadlock on rtnl_lock here. Requeue our multicast * work too, which will end up happening right after * our carrier on task work and will allow us to * send out all of the non-broadcast joins
*/ if (mcast == priv->broadcast) {
spin_lock_irq(&priv->lock);
queue_work(priv->wq, &priv->carrier_on_task);
__ipoib_mcast_schedule_join_thread(priv, NULL, 0); goto out_locked;
}
} else { bool silent_fail =
test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
status == -EINVAL;
if (mcast->logcount < 20) { if (status == -ETIMEDOUT || status == -EAGAIN ||
silent_fail) {
ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n",
test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
mcast->mcmember.mgid.raw, status);
} else {
ipoib_warn(priv, "%smulticast join failed for %pI6, status %d\n",
test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
mcast->mcmember.mgid.raw, status);
}
if (!silent_fail)
mcast->logcount++;
}
if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
mcast->backoff >= 2) { /* * We only retry sendonly joins once before we drop * the packet and quit trying to deal with the * group. However, we leave the group in the * mcast list as an unjoined group. If we want to * try joining again, we simply queue up a packet * and restart the join thread. The empty queue * is why the join thread ignores this group.
*/
mcast->backoff = 1;
netif_tx_lock_bh(dev); while (!skb_queue_empty(&mcast->pkt_queue)) {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
}
netif_tx_unlock_bh(dev);
} else {
spin_lock_irq(&priv->lock); /* Requeue this join task with a backoff delay */
__ipoib_mcast_schedule_join_thread(priv, mcast, 1); goto out_locked;
}
}
out:
spin_lock_irq(&priv->lock);
out_locked: /* * Make sure to set mcast->mc before we clear the busy flag to avoid * racing with code that checks for BUSY before checking mcast->mc
*/ if (status)
mcast->mc = NULL; else
mcast->mc = multicast;
clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
spin_unlock_irq(&priv->lock);
complete(&mcast->done);
return status;
}
/*
 * Issue an SA multicast join request for @mcast.
 *
 * @dev:   net device whose ipoib private data is used for the join
 * @mcast: multicast group to join
 *
 * Returns 0 when the join request was handed to ib_sa_join_multicast(),
 * -EINVAL when there is no broadcast group yet or IPOIB_FLAG_OPER_UP is
 * not set, or the error encoded in the pointer returned by
 * ib_sa_join_multicast().  On that last failure the join task is
 * rescheduled with a backoff delay, IPOIB_MCAST_FLAG_BUSY is cleared and
 * mcast->done is completed before returning.
 *
 * Caller must hold 'priv->lock'
 */
static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ib_sa_multicast *multicast;
	struct ib_sa_mcmember_rec rec = {
		.join_state = 1
	};
	/*
	 * Start from an empty mask: it is OR-ed into below for non-broadcast
	 * groups and passed as-is for the broadcast group, so it must never
	 * be read uninitialized.
	 */
	ib_sa_comp_mask comp_mask = 0;
	int ret = 0;

	if (!priv->broadcast ||
	    !test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
		return -EINVAL;

	/*
	 * NOTE(review): nothing visible here fills in rec's MGID, port GID or
	 * P_Key, sets the matching component-mask bits (including the one for
	 * join_state), or copies the broadcast group's parameters into rec for
	 * the fields selected below - confirm against the complete file.
	 */
	if (mcast != priv->broadcast) {
		/*
		 * RFC 4391:
		 *  The MGID MUST use the same P_Key, Q_Key, SL, MTU,
		 *  and HopLimit as those used in the broadcast-GID.  The rest
		 *  of attributes SHOULD follow the values used in the
		 *  broadcast-GID as well.
		 */
		comp_mask |=
			IB_SA_MCMEMBER_REC_QKEY			|
			IB_SA_MCMEMBER_REC_MTU_SELECTOR		|
			IB_SA_MCMEMBER_REC_MTU			|
			IB_SA_MCMEMBER_REC_TRAFFIC_CLASS	|
			IB_SA_MCMEMBER_REC_RATE_SELECTOR	|
			IB_SA_MCMEMBER_REC_RATE			|
			IB_SA_MCMEMBER_REC_SL			|
			IB_SA_MCMEMBER_REC_FLOW_LABEL		|
			IB_SA_MCMEMBER_REC_HOP_LIMIT;

		/*
		 * Send-only IB Multicast joins work at the core IB layer but
		 * require specific SM support.
		 * We can use such joins here only if the current SM supports that feature.
		 * However, if not, we emulate an Ethernet multicast send,
		 * which does not require a multicast subscription and will
		 * still send properly. The most appropriate thing to
		 * do is to create the group if it doesn't exist as that
		 * most closely emulates the behavior, from a user space
		 * application perspective, of Ethernet multicast operation.
		 */
		if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
			rec.join_state = SENDONLY_FULLMEMBER_JOIN;
	}

	/*
	 * ipoib_mcast_join_complete and mcast are passed as the final two
	 * arguments; the allocation uses GFP_ATOMIC.
	 */
	multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
					 &rec, comp_mask, GFP_ATOMIC,
					 ipoib_mcast_join_complete, mcast);
	if (IS_ERR(multicast)) {
		ret = PTR_ERR(multicast);
		ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
		/* Requeue this join task with a backoff delay */
		__ipoib_mcast_schedule_join_thread(priv, mcast, 1);
		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
		complete(&mcast->done);
		return ret;
	}

	return 0;
}
if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) return;
if (ib_query_port(priv->ca, priv->port, &port_attr)) {
ipoib_dbg(priv, "ib_query_port() failed\n"); return;
} if (port_attr.state != IB_PORT_ACTIVE) {
ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n",
port_attr.state); return;
}
priv->local_lid = port_attr.lid;
netif_addr_lock_bh(dev);
if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
netif_addr_unlock_bh(dev); return;
}
netif_addr_unlock_bh(dev);
spin_lock_irq(&priv->lock); if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) goto out;
if (!priv->broadcast) { struct ipoib_mcast *broadcast;
broadcast = ipoib_mcast_alloc(dev); if (!broadcast) {
ipoib_warn(priv, "failed to allocate broadcast group\n"); /* * Restart us after a 1 second delay to retry * creating our broadcast group and attaching to * it. Until this succeeds, this ipoib dev is * completely stalled (multicast wise).
*/
__ipoib_mcast_schedule_join_thread(priv, NULL, 1); goto out;
}
/* * We'll never get here until the broadcast group is both allocated * and attached
*/
list_for_each_entry(mcast, &priv->multicast_list, list) { if (IS_ERR_OR_NULL(mcast->mc) &&
!test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
(!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ||
!skb_queue_empty(&mcast->pkt_queue))) { if (mcast->backoff == 1 ||
time_after_eq(jiffies, mcast->delay_until)) { /* Found the next unjoined group */ if (ipoib_mcast_join(dev, mcast)) {
spin_unlock_irq(&priv->lock); return;
}
} elseif (!delay_until ||
time_before(mcast->delay_until, delay_until))
delay_until = mcast->delay_until;
}
}
mcast = NULL;
ipoib_dbg_mcast(priv, "successfully started all multicast joins\n");
out: if (delay_until) {
cancel_delayed_work(&priv->mcast_task);
queue_delayed_work(priv->wq, &priv->mcast_task,
delay_until - jiffies);
} if (mcast)
ipoib_mcast_join(dev, mcast);
if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n");
if (!IS_ERR_OR_NULL(mcast->mc))
ib_sa_free_multicast(mcast->mc);
if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
ipoib_dbg_mcast(priv, "leaving MGID %pI6\n",
mcast->mcmember.mgid.raw);
/* Remove ourselves from the multicast group */
ret = rn->detach_mcast(dev, priv->ca, &mcast->mcmember.mgid,
be16_to_cpu(mcast->mcmember.mlid)); if (ret)
ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
} elseif (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
ipoib_dbg(priv, "leaving with no mcmember but not a " "SENDONLY join\n");
return 0;
}
/* * Check if the multicast group is sendonly. If so remove it from the maps * and add to the remove list
*/ void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid, struct list_head *remove_list)
{ /* Is this multicast ? */ if (*mgid == 0xff) { struct ipoib_mcast *mcast = __ipoib_mcast_find(priv->dev, mgid);
/* * make sure the in-flight joins have finished before we attempt * to leave
*/
list_for_each_entry_safe(mcast, tmcast, remove_list, list) if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
wait_for_completion(&mcast->done);
mcast = __ipoib_mcast_find(dev, mgid); if (!mcast || !mcast->ah) { if (!mcast) { /* Let's create a new send only group now */
ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n",
mgid);
mcast = ipoib_mcast_alloc(dev); if (!mcast) {
ipoib_warn(priv, "unable to allocate memory " "for multicast structure\n");
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb); goto unlock;
}
set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
__ipoib_mcast_add(dev, mcast);
list_add_tail(&mcast->list, &priv->multicast_list);
} if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) { /* put pseudoheader back on for next time */
skb_push(skb, sizeof(struct ipoib_pseudo_header));
skb_queue_tail(&mcast->pkt_queue, skb);
} else {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
} if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
__ipoib_mcast_schedule_join_thread(priv, NULL, 0);
}
} else { struct ipoib_neigh *neigh;
spin_unlock_irqrestore(&priv->lock, flags);
neigh = ipoib_neigh_get(dev, daddr);
spin_lock_irqsave(&priv->lock, flags); if (!neigh) {
neigh = ipoib_neigh_alloc(daddr, dev); /* Make sure that the neigh will be added only * once to mcast list.
*/ if (neigh && list_empty(&neigh->list)) {
kref_get(&mcast->ah->ref);
neigh->ah = mcast->ah;
neigh->ah->valid = 1;
list_add_tail(&neigh->list, &mcast->neigh_list);
}
}
spin_unlock_irqrestore(&priv->lock, flags);
mcast->ah->last_send = rn->send(dev, skb, mcast->ah->ah,
IB_MULTICAST_QPN); if (neigh)
ipoib_neigh_put(neigh); return;
}
/* * Unfortunately, the networking core only gives us a list of all of * the multicast hardware addresses. We need to figure out which ones * are new and which ones have been removed
*/
/* Clear out the found flag */
list_for_each_entry(mcast, &priv->multicast_list, list)
clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
/* Mark all of the entries that are found or don't exist */
netdev_for_each_mc_addr(ha, dev) { union ib_gid mgid;
if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) continue;
/* ignore group which is directly joined by userspace */ if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) &&
!ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) {
ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n",
mgid.raw); continue;
}
/* Not found or send-only group, let's add a new entry */
ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n",
mgid.raw);
nmcast = ipoib_mcast_alloc(dev); if (!nmcast) {
ipoib_warn(priv, "unable to allocate memory for multicast structure\n"); continue;
}
set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);
nmcast->mcmember.mgid = mgid;
if (mcast) { /* Destroy the send only entry */
list_move_tail(&mcast->list, &remove_list);
/* * Double check that we are still up
*/ if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
spin_lock_irq(&priv->lock);
__ipoib_mcast_schedule_join_thread(priv, NULL, 0);
spin_unlock_irq(&priv->lock);
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.