/* * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/
/* Try to lock GW access, this stage doesn't return * EBUSY because locked GW does not mean that other PF * already started the reset.
*/
ret = mlx5_vsc_gw_lock(dev); if (ret == -EBUSY) return -EINVAL; if (ret) return ret;
state = lock ? MLX5_VSC_LOCK : MLX5_VSC_UNLOCK; /* At this stage, if the return status == EBUSY, then we know * for sure that another PF started the reset, so don't allow * another reset.
*/
ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, state); if (ret)
mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n");
/* The reset only needs to be issued by one PF. The health buffer is
 * shared between all functions, and will be cleared during a reset.
 * Check again to avoid a redundant 2nd reset. If the fatal error was
 * PCI related, a reset won't help.
 */
fatal_error = mlx5_health_check_fatal_sensors(dev); if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
fatal_error == MLX5_SENSOR_NIC_DISABLED ||
fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help."); returnfalse;
}
mlx5_core_warn(dev, "Issuing FW Reset\n"); /* Write the NIC interface field to initiate the reset, the command * interface address also resides here, don't overwrite it.
*/
mlx5_set_nic_state(dev, MLX5_INITIAL_SEG_NIC_INTERFACE_SW_RESET);
/* Mark the device as fatal in order to abort FW commands */ if ((mlx5_health_check_fatal_sensors(dev) || force) &&
dev->state == MLX5_DEVICE_STATE_UP) {
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
err_detected = true;
}
mutex_lock(&dev->intf_state_mutex); if (!err_detected && dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) goto unlock;/* a previous error is still being handled */
recover_from_sw_reset: /* Recover from SW reset */
end = jiffies + msecs_to_jiffies(delay_ms); do { if (mlx5_get_nic_state(dev) == MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED) break; if (pci_channel_offline(dev->pdev)) {
mlx5_core_err(dev, "PCI channel offline, stop waiting for NIC IFC\n"); goto unlock;
}
msleep(20);
} while (!time_after(jiffies, end));
if (mlx5_get_nic_state(dev) != MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED) {
dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n",
mlx5_get_nic_state(dev), delay_ms);
}
/* Release FW semaphore if you are the lock owner */ if (!lock)
lock_sem_sw_reset(dev, false);
switch (nic_interface) { case MLX5_INITIAL_SEG_NIC_INTERFACE_FULL_DRIVER:
mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n"); break;
case MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED:
mlx5_core_warn(dev, "starting teardown\n"); break;
case MLX5_INITIAL_SEG_NIC_INTERFACE_NO_DRAM_NIC:
mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n"); break;
case MLX5_INITIAL_SEG_NIC_INTERFACE_SW_RESET: /* The IFC mode field is 3 bits, so it will read 0x7 in 2 cases: * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded * and this is a VF), this is not recoverable by SW reset. * Logging of this is handled elsewhere. * 2. FW reset has been issued by another function, driver can * be reloaded to recover after the mode switches to * MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED.
*/ if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR)
mlx5_core_warn(dev, "NIC SW reset in progress\n"); break;
default:
mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n",
nic_interface);
}
mlx5_disable_device(dev);
}
/*
 * Poll until sensor_pci_not_working() stops reporting a problem for @dev.
 *
 * The wait is bounded by the FW_RESET timeout obtained from mlx5_tout_ms()
 * and the sensor is re-checked every 100 ms.
 *
 * Return:
 *   0          - sensor_pci_not_working() returned false
 *   -ETIMEDOUT - the FW_RESET timeout expired while still waiting
 *   -ENODEV    - MLX5_BREAK_FW_WAIT is set in dev->intf_state
 *   -EACCES    - pci_channel_offline() reports the channel as offline
 */
int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev)
{
	unsigned long end;

	end = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FW_RESET));

	while (sensor_pci_not_working(dev)) {
		/* Deadline is checked first, before any other abort reason. */
		if (time_after(jiffies, end))
			return -ETIMEDOUT;

		if (test_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state)) {
			mlx5_core_warn(dev, "device is being removed, stop waiting for PCI\n");
			return -ENODEV;
		}

		if (pci_channel_offline(dev->pdev)) {
			mlx5_core_err(dev, "PCI channel offline, stop waiting for PCI\n");
			return -EACCES;
		}

		msleep(100);
	}

	return 0;
}
staticint mlx5_health_try_recover(struct mlx5_core_dev *dev)
{
mlx5_core_warn(dev, "handling bad device here\n");
mlx5_handle_bad_state(dev); if (mlx5_health_wait_pci_up(dev)) {
mlx5_core_err(dev, "health recovery flow aborted, PCI reads still not working\n"); return -EIO;
}
mlx5_core_err(dev, "starting health recovery flow\n"); if (mlx5_recover_device(dev) || mlx5_health_check_fatal_sensors(dev)) {
mlx5_core_err(dev, "health recovery failed\n"); return -EIO;
}
/*
 * Translate a health syndrome code into a short human-readable description.
 *
 * Every returned string is a static literal without a trailing newline, so
 * the caller decides how the line is terminated. Codes that are not listed
 * map to "unrecognized error".
 */
static const char *hsynd_str(u8 synd)
{
	switch (synd) {
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_INTERNAL_ERR:
		return "firmware internal error";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_DEAD_IRISC:
		return "irisc not responding";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_HW_FATAL_ERR:
		return "unrecoverable hardware error";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_CRC_ERR:
		return "firmware CRC error";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_FETCH_PCI_ERR:
		return "ICM fetch PCI error";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PAGE_ERR:
		/*
		 * Previously "HW fatal error\n": the only entry with an
		 * embedded newline, and its wording did not match the
		 * syndrome it describes.
		 */
		return "ICM page error";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ASYNCHRONOUS_EQ_BUF_OVERRUN:
		return "async EQ buffer overrun";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_IN_ERR:
		return "EQ error";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_INV:
		return "Invalid EQ referenced";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_FFSER_ERR:
		return "FFSER error";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR:
		return "High temperature";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PCI_POISONED_ERR:
		return "ICM fetch PCI data poisoned error";
	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_TRUST_LOCKDOWN_ERR:
		return "Trust lockdown error";
	default:
		return "unrecognized error";
	}
}
staticconstchar *mlx5_loglevel_str(int level)
{ switch (level) { case LOGLEVEL_EMERG: return"EMERGENCY"; case LOGLEVEL_ALERT: return"ALERT"; case LOGLEVEL_CRIT: return"CRITICAL"; case LOGLEVEL_ERR: return"ERROR"; case LOGLEVEL_WARNING: return"WARNING"; case LOGLEVEL_NOTICE: return"NOTICE"; case LOGLEVEL_INFO: return"INFO"; case LOGLEVEL_DEBUG: return"DEBUG";
} return"Unknown log level";
}
mutex_lock(&dev->intf_state_mutex); if (test_bit(MLX5_DROP_HEALTH_WORK, &health->flags)) {
mlx5_core_err(dev, "health works are not permitted at this stage\n");
mutex_unlock(&dev->intf_state_mutex); return;
}
mutex_unlock(&dev->intf_state_mutex);
enter_error_state(dev, false); if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) {
devl_lock(devlink); if (mlx5_health_try_recover(dev))
mlx5_core_err(dev, "health recovery failed\n");
devl_unlock(devlink); return;
}
fw_reporter_ctx.err_synd = health->synd;
fw_reporter_ctx.miss_counter = health->miss_counter; if (devlink_health_report(health->fw_fatal_reporter, "FW fatal error reported", &fw_reporter_ctx) == -ECANCELED) { /* If recovery wasn't performed, due to grace period, * unload the driver. This ensures that the driver * closes all its resources and it is not subjected to * requests from the kernel.
*/
mlx5_core_err(dev, "Driver is in error state. Unloading\n");
mlx5_unload_one(dev, false);
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.