/* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/
/* * If we can't allocate a new RX buffer, dump * this packet and reuse the old buffer.
*/ if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
++dev->stats.rx_dropped; goto repost;
}
/* * Drop packets that this interface sent, ie multicast packets * that the HCA has replicated.
*/ if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) { int need_repost = 1;
/* * As the result of a completion error the QP Can be transferred to SQE states. * The function checks if the (send)QP is in SQE state and * moves it back to RTS state, that in order to have it functional again.
*/ staticvoid ipoib_qp_state_validate_work(struct work_struct *work)
{ struct ipoib_qp_state_validate *qp_work =
container_of(work, struct ipoib_qp_state_validate, work);
/*
 * Drain up to one batch of send completions from the send CQ and hand
 * each one to the appropriate TX completion handler (connected mode if
 * the wr_id carries IPOIB_OP_CM, datagram mode otherwise).
 *
 * Returns nonzero when a full batch of MAX_SEND_CQE completions was
 * reaped, i.e. the CQ may still hold more work and the caller should
 * poll again.
 */
static int poll_tx(struct ipoib_dev_priv *priv)
{
	int n, i;
	struct ib_wc *wc;

	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
	for (i = 0; i < n; ++i) {
		wc = priv->send_wc + i;
		if (wc->wr_id & IPOIB_OP_CM)
			ipoib_cm_handle_tx_wc(priv->dev, priv->send_wc + i);
		else
			ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
	}
	return n == MAX_SEND_CQE;
}
int ipoib_rx_poll(struct napi_struct *napi, int budget)
{ struct ipoib_dev_priv *priv =
container_of(napi, struct ipoib_dev_priv, recv_napi); struct net_device *dev = priv->dev; int done; int t; int n, i;
done = 0;
poll_more: while (done < budget) { int max = (budget - done);
t = min(IPOIB_NUM_WC, max);
n = ib_poll_cq(priv->recv_cq, t, priv->ibwc);
for (i = 0; i < n; i++) { struct ib_wc *wc = priv->ibwc + i;
/*
 * Force-schedule the send NAPI context.
 *
 * napi_schedule() can fail transiently; retry with a short sleep for as
 * long as the TX queue is stopped and the interface is still initialized,
 * so a stopped queue cannot be left without a pending NAPI poll.
 */
void ipoib_napi_schedule_work(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, reschedule_napi_work);

	for (;;) {
		if (napi_schedule(&priv->send_napi))
			break;
		msleep(3);
		/* Give up once the queue is running again or we are torn down */
		if (!netif_queue_stopped(priv->dev) ||
		    !test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
			break;
	}
}
ret = napi_schedule(&priv->send_napi); /* * if the queue is closed the driver must be able to schedule napi, * otherwise we can end with closed queue forever, because no new * packets to send and napi callback might not get new event after * its re-arm of the napi.
*/ if (!ret && netif_queue_stopped(priv->dev))
schedule_work(&priv->reschedule_napi_work);
}
/* * We put the skb into the tx_ring _before_ we call post_send() * because it's entirely possible that the completion handler will * run before we execute anything after the post_send(). That * means we have to make sure everything is properly recorded and * our state is consistent before we call post_send().
*/
tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
tx_req->skb = skb; if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
++dev->stats.tx_errors;
dev_kfree_skb_any(skb); return -1;
}
if (skb->ip_summed == CHECKSUM_PARTIAL)
priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM; else
priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; /* increase the tx_head after send success, but use it for queue state */ if ((priv->global_tx_head - priv->global_tx_tail) ==
ipoib_sendq_size - 1) {
ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
netif_stop_queue(dev);
}
skb_orphan(skb);
skb_dst_drop(skb);
if (netif_queue_stopped(dev)) if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
IB_CQ_REPORT_MISSED_EVENTS) < 0)
ipoib_warn(priv, "request notify on send CQ failed\n");
/*
 * Stop the periodic address-handle reaper: flag it to stop and cancel
 * any queued run of the delayed work.
 */
static void ipoib_stop_ah_reaper(struct ipoib_dev_priv *priv)
{
	set_bit(IPOIB_STOP_REAPER, &priv->flags);
	cancel_delayed_work(&priv->ah_reap_task);
	/*
	 * After ipoib_stop_ah_reaper() we always go through
	 * ipoib_reap_dead_ahs() which ensures the work is really stopped and
	 * does a final flush out of the dead_ah's list
	 */
}
staticint recvs_pending(struct net_device *dev)
{ struct ipoib_dev_priv *priv = ipoib_priv(dev); int pending = 0; int i;
for (i = 0; i < ipoib_recvq_size; ++i) if (priv->rx_ring[i].skb)
++pending;
ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr); if (ret) {
ipoib_warn(priv, "%s: Failed to query QP\n", __func__); return;
} /* print according to the new-state and the previous state.*/ if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET)
ipoib_dbg(priv, "Failed modify QP, IB_QPS_RESET to IB_QPS_ERR, acceptable\n"); else
ipoib_warn(priv, "Failed to modify QP to state: %d from state: %d\n",
new_state, qp_attr.qp_state);
}
int ipoib_ib_dev_stop_default(struct net_device *dev)
{ struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_qp_attr qp_attr; unsignedlong begin; struct ipoib_tx_buf *tx_req; int i;
if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
ipoib_napi_disable(dev);
ipoib_cm_dev_stop(dev);
/* * Move our QP to the error state and then reinitialize in * when all work requests have completed or have been flushed.
*/
qp_attr.qp_state = IB_QPS_ERR; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR);
/* Wait for all sends and receives to complete */
begin = jiffies;
while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) { if (time_after(jiffies, begin + 5 * HZ)) {
ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
priv->tx_head - priv->tx_tail,
recvs_pending(dev));
/* * assume the HW is wedged and just free up * all our pending work requests.
*/ while ((int)priv->tx_tail - (int)priv->tx_head < 0) {
tx_req = &priv->tx_ring[priv->tx_tail &
(ipoib_sendq_size - 1)];
ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(tx_req->skb);
++priv->tx_tail;
++priv->global_tx_tail;
}
for (i = 0; i < ipoib_recvq_size; ++i) { struct ipoib_rx_buf *rx_req;
void ipoib_drain_cq(struct net_device *dev)
{ struct ipoib_dev_priv *priv = ipoib_priv(dev); int i, n;
/* * We call completion handling routines that expect to be * called from the BH-disabled NAPI poll context, so disable * BHs here too.
*/
local_bh_disable();
do {
n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); for (i = 0; i < n; ++i) { /* * Convert any successful completions to flush * errors to avoid passing packets up the * stack after bringing the device down.
*/ if (priv->ibwc[i].status == IB_WC_SUCCESS)
priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
/*
 * Takes whatever value which is in pkey index 0 and updates priv->pkey
 * returns 0 if the pkey value was changed.
 */
static inline int update_parent_pkey(struct ipoib_dev_priv *priv)
{
	int result;
	u16 prev_pkey;

	prev_pkey = priv->pkey;
	result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
	if (result) {
		ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n",
			   priv->port, result);
		return result;
	}

	/* Always force the full-membership bit before comparing/storing */
	priv->pkey |= 0x8000;

	if (prev_pkey != priv->pkey) {
		ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n",
			  prev_pkey, priv->pkey);
		/*
		 * Update the pkey in the broadcast address, while making sure to set
		 * the full membership bit, so that we join the right broadcast group.
		 */
		priv->dev->broadcast[8] = priv->pkey >> 8;
		priv->dev->broadcast[9] = priv->pkey & 0xff;
		return 0;
	}

	return 1;
}

/*
 * returns 0 if pkey value was found in a different slot.
*/ staticinlineint update_child_pkey(struct ipoib_dev_priv *priv)
{
u16 old_index = priv->pkey_index;
/* * returns true if the device address of the ipoib interface has changed and the * new address is a valid one (i.e in the gid table), return false otherwise.
*/ staticbool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
{ union ib_gid search_gid; union ib_gid gid0; int err;
u16 index;
u32 port; bool ret = false;
if (rdma_query_gid(priv->ca, priv->port, 0, &gid0)) returnfalse;
netif_addr_lock_bh(priv->dev);
/* The subnet prefix may have changed, update it now so we won't have * to do it later
*/
priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix;
dev_addr_mod(priv->dev, 4, (u8 *)&gid0.global.subnet_prefix, sizeof(gid0.global.subnet_prefix));
search_gid.global.subnet_prefix = gid0.global.subnet_prefix;
if (search_gid.global.interface_id !=
priv->local_gid.global.interface_id) /* There was a change while we were looking up the gid, bail * here and let the next work sort this out
*/ goto out;
/* The next section of code needs some background: * Per IB spec the port GUID can't change if the HCA is powered on. * port GUID is the basis for GID at index 0 which is the basis for * the default device address of a ipoib interface. * * so it seems the flow should be: * if user_changed_dev_addr && gid in gid tbl * set bit dev_addr_set * return true * else * return false * * The issue is that there are devices that don't follow the spec, * they change the port GUID when the HCA is powered, so in order * not to break userspace applications, We need to check if the * user wanted to control the device address and we assume that * if he sets the device address back to be based on GID index 0, * he no longer wishs to control it. * * If the user doesn't control the device address, * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means * the port GUID has changed and GID at index 0 has changed * so we need to change priv->local_gid and priv->dev->dev_addr * to reflect the new GID.
*/ if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { if (!err && port == priv->port) {
set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); if (index == 0)
clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL,
&priv->flags); else
set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags);
ret = true;
} else {
ret = false;
}
} else { if (!err && port == priv->port) {
ret = true;
} else { if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) {
memcpy(&priv->local_gid, &gid0, sizeof(priv->local_gid));
dev_addr_mod(priv->dev, 4, (u8 *)&gid0, sizeof(priv->local_gid));
ret = true;
}
}
}
if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
level != IPOIB_FLUSH_HEAVY) { /* Make sure the dev_addr is set even if not flushing */ if (level == IPOIB_FLUSH_LIGHT)
ipoib_dev_addr_changed_valid(priv);
ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return;
}
if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { /* interface is down. update pkey and leave. */ if (level == IPOIB_FLUSH_HEAVY) { if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
update_parent_pkey(priv); else
update_child_pkey(priv);
} elseif (level == IPOIB_FLUSH_LIGHT)
ipoib_dev_addr_changed_valid(priv);
ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); return;
}
if (level == IPOIB_FLUSH_HEAVY) { /* child devices chase their origin pkey value, while non-child * (parent) devices should always takes what present in pkey index 0
*/ if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
result = update_child_pkey(priv); if (result) { /* restart QP only if P_Key index is changed */
ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); return;
}
} else {
result = update_parent_pkey(priv); /* restart QP only if P_Key value changed */ if (result) {
ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n"); return;
}
}
}
if (level == IPOIB_FLUSH_LIGHT) { int oper_up;
ipoib_mark_paths_invalid(dev); /* Set IPoIB operation as down to prevent races between: * the flush flow which leaves MCG and on the fly joins * which can happen during that time. mcast restart task * should deal with join requests we missed.
*/
oper_up = test_and_clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
ipoib_mcast_dev_flush(dev); if (oper_up)
set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
ipoib_reap_dead_ahs(priv);
}
if (level >= IPOIB_FLUSH_NORMAL)
ipoib_ib_dev_down(dev);
if (level == IPOIB_FLUSH_HEAVY) {
netdev_lock_ops(dev); if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
ipoib_ib_dev_stop(dev);
result = ipoib_ib_dev_open(dev);
netdev_unlock_ops(dev);
if (result) return;
if (netif_queue_stopped(dev))
netif_start_queue(dev);
}
/* * The device could have been brought down between the start and when * we get here, don't bring it back up if it's not configured up
*/ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { if (level >= IPOIB_FLUSH_NORMAL)
ipoib_ib_dev_up(dev); if (ipoib_dev_addr_changed_valid(priv))
ipoib_mcast_restart_task(&priv->restart_task);
}
}
ipoib_dbg(priv, "cleaning up ib_dev\n"); /* * We must make sure there are no more (path) completions * that may wish to touch priv fields that are no longer valid
*/
ipoib_flush_paths(dev);
/* * All of our ah references aren't free until after * ipoib_mcast_dev_flush(), ipoib_flush_paths, and * the neighbor garbage collection is stopped and reaped. * That should all be done now, so make a final ah flush.
*/
ipoib_reap_dead_ahs(priv);
clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
priv->rn_ops->ndo_uninit(dev);
if (priv->pd) {
ib_dealloc_pd(priv->pd);
priv->pd = NULL;
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.