uint num_vls = HFI1_MAX_VLS_SUPPORTED;
module_param(num_vls, uint, S_IRUGO);
MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
/* * Default time to aggregate two 10K packets from the idle state * (timer not running). The timer starts at the end of the first packet, * so only the time for one 10K packet and header plus a bit extra is needed. * 10 * 1024 + 64 header byte = 10304 byte * 10304 byte / 12.5 GB/s = 824.32ns
*/
uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
module_param(rcv_intr_timeout, uint, S_IRUGO);
MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
uint rcv_intr_count = 16; /* same as qib */
module_param(rcv_intr_count, uint, S_IRUGO);
MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
ushort link_crc_mask = SUPPORTED_CRCS;
module_param(link_crc_mask, ushort, S_IRUGO);
MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
/*
 * Error interrupt table entry.  This is used as input to the interrupt
 * "clear down" routine used for all second tier error interrupt register.
 * Second tier interrupt registers have a single bit representing them
 * in the top-level CceIntStatus.
 */
struct err_reg_info {
	u32 status;		/* status CSR offset */
	u32 clear;		/* clear CSR offset */
	u32 mask;		/* mask CSR offset */
	/* per-register handler; may be unset (callers test it before use) */
	void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
	const char *desc;	/* descriptive name of the register group */
};
/*
 * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
 * register cannot be derived from the MTU value because 10K is not
 * a power of 2.  Therefore, we need a constant.  Everything else can
 * be calculated.
 */
#define DCC_CFG_PORT_MTU_CAP_10240 7
/* * Table of the DC grouping of error interrupts. Each entry refers to * another register containing more information.
*/ staticconststruct err_reg_info dc_errs[NUM_DC_ERRS] = { /* 0*/ DC_EE1(DCC_ERR, handle_dcc_err, "DCC Err"), /* 1*/ DC_EE2(DC_LCB_ERR, handle_lcb_err, "LCB Err"), /* 2*/ DC_EE2(DC_DC8051_ERR, handle_8051_interrupt, "DC8051 Interrupt"), /* 3*/ /* dc_lbm_int - special, see is_dc_int() */ /* the rest are reserved */
};
/* Description of one counter and how to access it. */
struct cntr_entry {
	/*
	 * counter name
	 */
	char *name;

	/*
	 * csr to read for name (if applicable)
	 */
	u64 csr;

	/*
	 * offset into dd or ppd to store the counter's value
	 */
	int offset;

	/*
	 * flags
	 */
	u8 flags;

	/*
	 * accessor for stat element, context either dd or ppd
	 */
	u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl,
		       int mode, u64 data);
};
/**
 * hfi1_addr_from_offset - return addr for readq/writeq
 * @dd: the dd device
 * @offset: the offset of the CSR within bar0
 *
 * This routine selects the appropriate base address
 * based on the indicated offset: offsets at or above dd->base2_start
 * are rebased onto dd->kregbase2, everything below uses dd->kregbase1.
 */
static inline void __iomem *hfi1_addr_from_offset(
	const struct hfi1_devdata *dd,
	u32 offset)
{
	if (offset >= dd->base2_start)
		return dd->kregbase2 + (offset - dd->base2_start);
	return dd->kregbase1 + offset;
}
/**
 * read_csr - read CSR at the indicated offset
 * @dd: the dd device
 * @offset: the offset of the CSR within bar0
 *
 * Return: the value read or all FF's if there
 * is no mapping (HFI1_PRESENT is not set in dd->flags)
 */
u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
{
	if (dd->flags & HFI1_PRESENT)
		return readq(hfi1_addr_from_offset(dd, offset));
	/* -1 converted to u64 is all ones */
	return -1;
}
/**
 * write_csr - write CSR at the indicated offset
 * @dd: the dd device
 * @offset: the offset of the CSR within bar0
 * @value: value to write
 *
 * Nothing is written when HFI1_PRESENT is not set in dd->flags, or when
 * the offset lies in [RCV_ARRAY, dd->base2_start); the latter also
 * triggers a WARN_ON.
 */
void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
{
	if (dd->flags & HFI1_PRESENT) {
		void __iomem *base = hfi1_addr_from_offset(dd, offset);

		/* avoid write to RcvArray */
		if (WARN_ON(offset >= RCV_ARRAY && offset < dd->base2_start))
			return;
		writeq(value, base);
	}
}
/**
 * get_csr_addr - return the iomem address for offset
 * @dd: the dd device
 * @offset: the offset of the CSR within bar0
 *
 * Return: The iomem address to use in subsequent
 * writeq/readq operations, or NULL when HFI1_PRESENT is not set
 * in dd->flags.
 */
void __iomem *get_csr_addr(
	const struct hfi1_devdata *dd,
	u32 offset)
{
	if (dd->flags & HFI1_PRESENT)
		return hfi1_addr_from_offset(dd, offset);
	return NULL;
}
/*
 * Read or zero a per-CPU software counter.
 *
 * @dd: device, used here only for error logging
 * @z_val: baseline subtracted from the all-CPU total on a read; a zeroing
 *	   write stores the current total here
 * @cntr: the per-CPU counter
 * @vl: must be CNTR_INVALID_VL; any other value returns 0 immediately
 * @mode: CNTR_MODE_R or CNTR_MODE_W; anything else is logged as an error
 * @data: value for a write; only 0 is accepted
 *
 * Return: the total relative to *z_val for a read, 0 in every other case.
 */
static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
			  u64 __percpu *cntr,
			  int vl, int mode, u64 data)
{
	u64 ret = 0;

	if (vl != CNTR_INVALID_VL)
		return 0;

	if (mode == CNTR_MODE_R) {
		ret = get_all_cpu_total(cntr) - *z_val;
	} else if (mode == CNTR_MODE_W) {
		/* A write can only zero the counter */
		if (data == 0)
			*z_val = get_all_cpu_total(cntr);
		else
			dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
	} else {
		dd_dev_err(dd, "Invalid cntr sw cpu access mode");
		return 0;
	}

	return ret;
}
static u64 access_sw_cpu_intr(conststruct cntr_entry *entry, void *context, int vl, int mode, u64 data)
{ struct hfi1_devdata *dd = context;
/*
 * Software counter for the aggregate of
 * individual CceErrStatus counters
 *
 * @context is the device data; @entry, @vl, @mode and @data are not used.
 * Returns the current value of dd->sw_cce_err_status_aggregate.
 */
static u64 access_sw_cce_err_status_aggregated_cnt(
				const struct cntr_entry *entry,
				void *context, int vl, int mode, u64 data)
{
	struct hfi1_devdata *dd = context;

	return dd->sw_cce_err_status_aggregate;
}
/* * Software counters corresponding to each of the * error status bits within CceErrStatus
*/ static u64 access_cce_msix_csr_parity_err_cnt(conststruct cntr_entry *entry, void *context, int vl, int mode,
u64 data)
{ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
/* * Software counters corresponding to each of the * error status bits within RcvErrStatus
*/ static u64 access_rx_csr_parity_err_cnt(conststruct cntr_entry *entry, void *context, int vl, int mode,
u64 data)
{ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
/* * Software counters corresponding to each of the * error status bits within SendPioErrStatus
*/ static u64 access_pio_pec_sop_head_parity_err_cnt( conststruct cntr_entry *entry, void *context, int vl, int mode, u64 data)
{ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
/* * Software counters corresponding to each of the * error status bits within SendDmaErrStatus
*/ static u64 access_sdma_pcie_req_tracking_cor_err_cnt( conststruct cntr_entry *entry, void *context, int vl, int mode, u64 data)
{ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
/* * Software counters corresponding to each of the * error status bits within SendEgressErrStatus
*/ static u64 access_tx_read_pio_memory_csr_unc_err_cnt( conststruct cntr_entry *entry, void *context, int vl, int mode, u64 data)
{ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
/* * Software counters corresponding to each of the * error status bits within SendErrStatus
*/ static u64 access_send_csr_write_bad_addr_err_cnt( conststruct cntr_entry *entry, void *context, int vl, int mode, u64 data)
{ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
/* * Software counters corresponding to each of the * error status bits within SendCtxtErrStatus
*/ static u64 access_pio_write_out_of_bounds_err_cnt( conststruct cntr_entry *entry, void *context, int vl, int mode, u64 data)
{ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
/* * Software counters corresponding to each of the * error status bits within SendDmaEngErrStatus
*/ static u64 access_sdma_header_request_fifo_cor_err_cnt( conststruct cntr_entry *entry, void *context, int vl, int mode, u64 data)
{ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
/* * Append string s to buffer buf. Arguments curp and len are the current * position and remaining length, respectively. * * return 0 on success, 1 on out of room
*/ staticint append_str(char *buf, char **curp, int *lenp, constchar *s)
{ char *p = *curp; int len = *lenp; int result = 0; /* success */ char c;
/* add a comma, if first in the buffer */ if (p != buf) { if (len == 0) {
result = 1; /* out of room */ goto done;
}
*p++ = ',';
len--;
}
/* copy the string */ while ((c = *s++) != 0) { if (len == 0) {
result = 1; /* out of room */ goto done;
}
*p++ = c;
len--;
}
/* * Using the given flag table, print a comma separated string into * the buffer. End in '*' if the buffer is too short.
*/ staticchar *flag_string(char *buf, int buf_len, u64 flags, conststruct flag_table *table, int table_size)
{ char extra[32]; char *p = buf; int len = buf_len; int no_room = 0; int i;
/* make sure there is at least 2 so we can form "*" */ if (len < 2) return"";
len--; /* leave room for a nul */ for (i = 0; i < table_size; i++) { if (flags & table[i].flag) {
no_room = append_str(buf, &p, &len, table[i].str); if (no_room) break;
flags &= ~table[i].flag;
}
}
/*
 * Handle a CCE error status value: log it, start freeze handling for the
 * one error that needs it, and bump the software counters.
 *
 * @dd: the device
 * @unused: not referenced
 * @reg: CCE error status bits to process
 */
static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
{
	char buf[96];
	int i = 0;

	/*
	 * For most of these errors, there is nothing that can be done except
	 * report or record it.
	 */
	dd_dev_info(dd, "CCE Error: %s\n",
		    cce_err_status_string(buf, sizeof(buf), reg));

	if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) &&
	    is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) {
		/* this error requires a manual drop into SPC freeze mode */
		/* then a fix up */
		start_freeze_handling(dd->pport, FREEZE_SELF);
	}

	for (i = 0; i < NUM_CCE_ERR_STATUS_COUNTERS; i++) {
		if (reg & (1ull << i)) {
			incr_cntr64(&dd->cce_err_status_cnt[i]);
			/* maintain a counter over all cce_err_status errors */
			incr_cntr64(&dd->sw_cce_err_status_aggregate);
		}
	}
}
/* * Check counters for receive errors that do not have an interrupt * associated with them.
*/ #define RCVERR_CHECK_TIME 10 staticvoid update_rcverr_timer(struct timer_list *t)
{ struct hfi1_devdata *dd = timer_container_of(dd, t, rcverr_timer); struct hfi1_pportdata *ppd = dd->pport;
u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
/* * Freeze mode recovery is disabled for the errors * in RXE_FREEZE_ABORT_MASK
*/ if (is_ax(dd) && (reg & RXE_FREEZE_ABORT_MASK))
flags = FREEZE_ABORT;
start_freeze_handling(dd->pport, flags);
}
for (i = 0; i < NUM_RCV_ERR_STATUS_COUNTERS; i++) { if (reg & (1ull << i))
incr_cntr64(&dd->rcv_err_status_cnt[i]);
}
}
/*
 * Handle a misc error status value: log the decoded bits and increment
 * the per-bit software counter for every bit set in @reg.
 *
 * @dd: the device
 * @unused: not referenced
 * @reg: misc error status bits to process
 */
static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
{
	char buf[96];
	int i = 0;

	/* terminate the message with a newline like the other handlers */
	dd_dev_info(dd, "Misc Error: %s\n",
		    misc_err_status_string(buf, sizeof(buf), reg));
	for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) {
		if (reg & (1ull << i))
			incr_cntr64(&dd->misc_err_status_cnt[i]);
	}
}
staticvoid handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
{ char buf[96]; int i = 0;
/* * We have had a "disallowed packet" error during egress. Determine the * integrity check which failed, and update relevant error counter, etc. * * Note that the SEND_EGRESS_ERR_INFO register has only a single * bit of state per integrity check, and so we can miss the reason for an * egress error if more than one packet fails the same integrity check * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
*/ staticvoid handle_send_egress_err_info(struct hfi1_devdata *dd, int vl)
{ struct hfi1_pportdata *ppd = dd->pport;
u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO); char buf[96];
/* clear down all observed info as quickly as possible after read */
write_csr(dd, SEND_EGRESS_ERR_INFO, info);
/* Eventually add other counters for each bit */ if (info & PORT_DISCARD_EGRESS_ERRS) { int weight, i;
/* * Count all applicable bits as individual errors and * attribute them to the packet that triggered this handler. * This may not be completely accurate due to limitations * on the available hardware error information. There is * a single information register and any number of error * packets may have occurred and contributed to it before * this routine is called. This means that: * a) If multiple packets with the same error occur before * this routine is called, earlier packets are missed. * There is only a single bit for each error type. * b) Errors may not be attributed to the correct VL. * The driver is attributing all bits in the info register * to the packet that triggered this call, but bits * could be an accumulation of different packets with * different VLs. * c) A single error packet may have multiple counts attached * to it. There is no way for the driver to know if * multiple bits set in the info register are due to a * single packet or multiple packets. The driver assumes * multiple packets.
*/
weight = hweight64(info & PORT_DISCARD_EGRESS_ERRS); for (i = 0; i < weight; i++) {
__count_port_discards(ppd); if (vl >= 0 && vl < TXE_NUM_DATA_VL)
incr_cntr64(&ppd->port_xmit_discards_vl[vl]); elseif (vl == 15)
incr_cntr64(&ppd->port_xmit_discards_vl
[C_VL_15]);
}
}
}
/*
 * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
 * register.  Does it represent a 'port inactive' error?
 *
 * Returns non-zero when posn lies in the inclusive range
 * SEES(TX_LINKDOWN) .. SEES(TX_INCORRECT_LINK_STATE).
 */
static inline int port_inactive_err(u64 posn)
{
	return (posn >= SEES(TX_LINKDOWN) &&
		posn <= SEES(TX_INCORRECT_LINK_STATE));
}
/*
 * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
 * register.  Does it represent a 'disallowed packet' error?
 *
 * Returns non-zero when posn lies in the inclusive range
 * SEES(TX_SDMA0_DISALLOWED_PACKET) .. SEES(TX_SDMA15_DISALLOWED_PACKET).
 */
static inline int disallowed_pkt_err(int posn)
{
	return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) &&
		posn <= SEES(TX_SDMA15_DISALLOWED_PACKET));
}
/*
 * Input value is a bit position of one of the SDMA engine disallowed
 * packet errors.  Return which engine.  Use of this must be guarded by
 * disallowed_pkt_err().
 */
static inline int disallowed_pkt_engine(int posn)
{
	return posn - SEES(TX_SDMA0_DISALLOWED_PACKET);
}
/*
 * Translate an SDMA engine to a VL.  Return -1 if the translation cannot
 * be done.
 *
 * The lookup reads dd->sdma_map inside an RCU read-side critical section.
 */
static int engine_to_vl(struct hfi1_devdata *dd, int engine)
{
	struct sdma_vl_map *m;
	int vl;

	/* range check */
	if (engine < 0 || engine >= TXE_NUM_SDMA_ENGINES)
		return -1;

	rcu_read_lock();
	m = rcu_dereference(dd->sdma_map);
	vl = m->engine_to_vl[engine];
	rcu_read_unlock();

	return vl;
}
/*
 * Translate the send context (software index) into a VL.  Return -1 if the
 * translation cannot be done.
*/ staticint sc_to_vl(struct hfi1_devdata *dd, int sw_index)
{ struct send_context_info *sci; struct send_context *sc; int i;
sci = &dd->send_contexts[sw_index];
/* there is no information for user (PSM) and ack contexts */ if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15)) return -1;
sc = sci->sc; if (!sc) return -1; if (dd->vld[15].sc == sc) return 15; for (i = 0; i < num_vls; i++) if (dd->vld[i].sc == sc) return i;
while (reg_copy) { int posn = fls64(reg_copy); /* fls64() returns a 1-based offset, we want it zero based */ int shift = posn - 1;
u64 mask = 1ULL << shift;
if (port_inactive_err(shift)) {
count_port_inactive(dd);
handled |= mask;
} elseif (disallowed_pkt_err(shift)) { int vl = engine_to_vl(dd, disallowed_pkt_engine(shift));
for (i = 0; i < NUM_SEND_ERR_STATUS_COUNTERS; i++) { if (reg & (1ull << i))
incr_cntr64(&dd->send_err_status_cnt[i]);
}
}
/*
 * The maximum number of times the error clear down will loop before
 * blocking a repeating error.  This value is arbitrary.
 */
#define MAX_CLEAR_COUNT 20
/* * Clear and handle an error register. All error interrupts are funneled * through here to have a central location to correctly handle single- * or multi-shot errors. * * For non per-context registers, call this routine with a context value * of 0 so the per-context offset is zero. * * If the handler loops too many times, assume that something is wrong * and can't be fixed, so mask the error bits.
*/ staticvoid interrupt_clear_down(struct hfi1_devdata *dd,
u32 context, conststruct err_reg_info *eri)
{
u64 reg;
u32 count;
/* read in a loop until no more errors are seen */
count = 0; while (1) {
reg = read_kctxt_csr(dd, context, eri->status); if (reg == 0) break;
write_kctxt_csr(dd, context, eri->clear, reg); if (likely(eri->handler))
eri->handler(dd, context, reg);
count++; if (count > MAX_CLEAR_COUNT) {
u64 mask;
/* * Send context error interrupt. Source (hw_context) is < 160. * * All send context errors cause the send context to halt. The normal * clear-down mechanism cannot be used because we cannot clear the * error bits until several other long-running items are done first. * This is OK because with the context halted, nothing else is going * to happen on it anyway.
*/ staticvoid is_sendctxt_err_int(struct hfi1_devdata *dd, unsignedint hw_context)
{ struct send_context_info *sci; struct send_context *sc; char flags[96];
u64 status;
u32 sw_index; int i = 0; unsignedlong irq_flags;
sw_index = dd->hw_to_sw[hw_context]; if (sw_index >= dd->num_send_contexts) {
dd_dev_err(dd, "out of range sw index %u for send context %u\n",
sw_index, hw_context); return;
}
sci = &dd->send_contexts[sw_index];
spin_lock_irqsave(&dd->sc_lock, irq_flags);
sc = sci->sc; if (!sc) {
dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
sw_index, hw_context);
spin_unlock_irqrestore(&dd->sc_lock, irq_flags); return;
}
/* tell the software that a halt has begun */
sc_stop(sc, SCF_HALTED);
status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index));
/* * Automatically restart halted kernel contexts out of interrupt * context. User contexts must ask the driver to restart the context.
*/ if (sc->type != SC_USER)
queue_work(dd->pport->hfi1_wq, &sc->halt_work);
spin_unlock_irqrestore(&dd->sc_lock, irq_flags);
/* * Update the counters for the corresponding status bits. * Note that these particular counters are aggregated over all * 160 contexts.
*/ for (i = 0; i < NUM_SEND_CTXT_ERR_STATUS_COUNTERS; i++) { if (status & (1ull << i))
incr_cntr64(&dd->sw_ctxt_err_status_cnt[i]);
}
}
/*
 * Handle an SDMA engine error status value by incrementing the software
 * counter of every status bit that is set.
 *
 * @dd: the device
 * @source: not referenced in the code visible here
 * @status: SDMA engine error status bits
 */
static void handle_sdma_eng_err(struct hfi1_devdata *dd,
				unsigned int source, u64 status)
{
	/* NOTE(review): sde is declared but never used in this view */
	struct sdma_engine *sde;
	int i = 0;

	/*
	 * Update the counters for the corresponding status bits.
	 * Note that these particular counters are aggregated over
	 * all 16 DMA engines.
	 */
	for (i = 0; i < NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS; i++) {
		if (status & (1ull << i))
			incr_cntr64(&dd->sw_send_dma_eng_err_status_cnt[i]);
	}
}
/* * TCritInt cannot go through interrupt_clear_down() * because it is not a second tier interrupt. The handler * should be called directly.
*/ if (source == TCRIT_INT_SOURCE)
handle_temp_err(dd); elseif (eri->handler)
interrupt_clear_down(dd, 0, eri); else
dd_dev_info(dd, "%s: Unimplemented/reserved interrupt %d\n",
__func__, source);
}
if (reg & QSFP_HFI0_MODPRST_N) { if (!qsfp_mod_present(ppd)) {
dd_dev_info(dd, "%s: QSFP module removed\n",
__func__);
ppd->driver_link_ready = 0; /* * Cable removed, reset all our information about the * cache and cable capabilities
*/
spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); /* * We don't set cache_refresh_required here as we expect * an interrupt when a cable is inserted
*/
ppd->qsfp_info.cache_valid = 0;
ppd->qsfp_info.reset_needed = 0;
ppd->qsfp_info.limiting_active = 0;
spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
flags); /* Invert the ModPresent pin now to detect plug-in */
write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
ASIC_QSFP1_INVERT, qsfp_int_mgmt);
if (ppd->host_link_state == HLS_DN_POLL) { /* * The link is still in POLL. This means * that the normal link down processing * will not happen. We have to do it here * before turning the DC off.
*/
queue_work(ppd->link_wq, &ppd->link_down_work);
}
} else {
dd_dev_info(dd, "%s: QSFP module inserted\n",
__func__);
if (reg & QSFP_HFI0_INT_N) {
dd_dev_info(dd, "%s: Interrupt received from QSFP module\n",
__func__);
spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
ppd->qsfp_info.check_interrupt_flags = 1;
spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
}
/* Schedule the QSFP work only if there is a cable attached. */ if (qsfp_mod_present(ppd))
queue_work(ppd->link_wq, &ppd->qsfp_info.qsfp_work);
}
/*
 * Send the HCMD_MISC_REQUEST_LCB_ACCESS command to the 8051.
 *
 * A failure is logged unless HFI1_SHUTDOWN is set in dd->flags.
 *
 * Returns 0 when the command reports HCMD_SUCCESS, -EBUSY otherwise.
 */
static int request_host_lcb_access(struct hfi1_devdata *dd)
{
	int ret;

	ret = do_8051_command(dd, HCMD_MISC,
			      (u64)HCMD_MISC_REQUEST_LCB_ACCESS <<
			      LOAD_DATA_FIELD_ID_SHIFT, NULL);
	if (ret != HCMD_SUCCESS && !(dd->flags & HFI1_SHUTDOWN)) {
		dd_dev_err(dd, "%s: command failed with error %d\n",
			   __func__, ret);
	}
	return ret == HCMD_SUCCESS ? 0 : -EBUSY;
}
/*
 * Send the HCMD_MISC_GRANT_LCB_ACCESS command to the 8051.
 *
 * Any failure is logged.
 *
 * Returns 0 when the command reports HCMD_SUCCESS, -EBUSY otherwise.
 */
static int request_8051_lcb_access(struct hfi1_devdata *dd)
{
	int ret;

	ret = do_8051_command(dd, HCMD_MISC,
			      (u64)HCMD_MISC_GRANT_LCB_ACCESS <<
			      LOAD_DATA_FIELD_ID_SHIFT, NULL);
	if (ret != HCMD_SUCCESS) {
		dd_dev_err(dd, "%s: command failed with error %d\n",
			   __func__, ret);
	}
	return ret == HCMD_SUCCESS ? 0 : -EBUSY;
}
/*
 * Set the LCB selector - allow host access.  The DCC selector always
 * points to the host.
 */
static inline void set_host_lcb_access(struct hfi1_devdata *dd)
{
	write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
		  DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK |
		  DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
}
/*
 * Clear the LCB selector - allow 8051 access.  The DCC selector always
 * points to the host.
 */
static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
{
	write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
		  DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
}
/*
 * Take LCB access for the host.  When the host already holds it, only the
 * use count is bumped; otherwise the 8051 is asked to hand access over and
 * the selector is switched to the host.
 *
 * @dd: the device
 * @sleep_ok: non-zero if the caller may sleep on the link state mutex;
 *	      zero makes the routine poll the mutex with a 1us delay
 *
 * Returns:
 *	0 on success
 *	-EBUSY if the 8051 has control and cannot be disturbed
 *	-errno if unable to acquire access from the 8051
 */
int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
{
	struct hfi1_pportdata *ppd = dd->pport;
	int ret = 0;

	/*
	 * The host link state lock makes { link state check, selector
	 * change, count increment } a single unit with respect to a link
	 * state change; without it the state change could race with the
	 * count increment.
	 */
	if (!sleep_ok) {
		while (!mutex_trylock(&ppd->hls_lock))
			udelay(1);
	} else {
		mutex_lock(&ppd->hls_lock);
	}

	/* LCB access is meaningful only while the link is up */
	if (ppd->host_link_state & HLS_DOWN) {
		dd_dev_info(dd, "%s: link state %s not up\n",
			    __func__, link_state_name(ppd->host_link_state));
		ret = -EBUSY;
		goto unlock;
	}

	/* first user: get access from the 8051 before pointing at the host */
	if (!dd->lcb_access_count) {
		ret = request_host_lcb_access(dd);
		if (ret) {
			if (!(dd->flags & HFI1_SHUTDOWN))
				dd_dev_err(dd,
					   "%s: unable to acquire LCB access, err %d\n",
					   __func__, ret);
			goto unlock;
		}
		set_host_lcb_access(dd);
	}
	dd->lcb_access_count++;

unlock:
	mutex_unlock(&ppd->hls_lock);
	return ret;
}
/*
 * Drop one reference on host LCB access.  When the last reference goes
 * away (count moving from 1 to 0) control is handed back to the 8051.
 *
 * @dd: the device
 * @sleep_ok: non-zero if the caller may sleep on the link state mutex;
 *	      zero makes the routine poll the mutex with a 1us delay
 *
 * Returns:
 *	0 on success
 *	-errno if unable to release access to the 8051
 */
int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
{
	struct mutex *hls_lock = &dd->pport->hls_lock;
	int ret = 0;

	/*
	 * The acquire side uses the host link state lock, so the release
	 * side does too.  Here it keeps { selector change, count decrement }
	 * together as a unit.
	 */
	if (!sleep_ok) {
		while (!mutex_trylock(hls_lock))
			udelay(1);
	} else {
		mutex_lock(hls_lock);
	}

	if (!dd->lcb_access_count) {
		dd_dev_err(dd, "%s: LCB access count is zero.  Skipping.\n",
			   __func__);
		goto unlock;
	}

	/* last user: switch the selector, then tell the 8051 */
	if (dd->lcb_access_count == 1) {
		set_8051_lcb_access(dd);
		ret = request_8051_lcb_access(dd);
		if (ret) {
			dd_dev_err(dd,
				   "%s: unable to release LCB access, err %d\n",
				   __func__, ret);
			/* the grant failed, so point the selector back at the host */
			set_host_lcb_access(dd);
			goto unlock;
		}
	}
	dd->lcb_access_count--;

unlock:
	mutex_unlock(hls_lock);
	return ret;
}
/*
 * Initialize LCB access variables and state.  Called during driver load,
 * after most of the initialization is finished.
 *
 * The DC default is LCB access on for the host.  The driver defaults to
 * leaving access to the 8051.  Assign access now - this constrains the call
 * to this routine to be after all LCB set-up is done.  In particular, after
 * hfi1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
 */
static void init_lcb_access(struct hfi1_devdata *dd)
{
	dd->lcb_access_count = 0;
}
/*
 * Write a response back to a 8051 request.
 *
 * @dd: the device
 * @return_code: placed in the RETURN_CODE field of DC_DC8051_CFG_EXT_DEV_0
 * @rsp_data: placed in the RSP_DATA field of the same register
 *
 * The COMPLETED bit is set in the same write.
 */
static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
{
	write_csr(dd, DC_DC8051_CFG_EXT_DEV_0,
		  DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK |
		  (u64)return_code <<
		  DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT |
		  (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
}
/* * Handle host requests from the 8051.
*/ staticvoid handle_8051_request(struct hfi1_pportdata *ppd)
{ struct hfi1_devdata *dd = ppd->dd;
u64 reg;
u16 data = 0;
u8 type;
reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1); if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0) return; /* no request */
/* zero out COMPLETED so the response is seen */
write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
/* extract request details */
type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
& DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
& DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
switch (type) { case HREQ_LOAD_CONFIG: case HREQ_SAVE_CONFIG: case HREQ_READ_CONFIG: case HREQ_SET_TX_EQ_ABS: case HREQ_SET_TX_EQ_REL: case HREQ_ENABLE:
dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
type);
hreq_response(dd, HREQ_NOT_SUPPORTED, 0); break; case HREQ_LCB_RESET: /* Put the LCB, RX FPE and TX FPE into reset */
write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_INTO_RESET); /* Make sure the write completed */
(void)read_csr(dd, DCC_CFG_RESET); /* Hold the reset long enough to take effect */
udelay(1); /* Take the LCB, RX FPE and TX FPE out of reset */
write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_OUT_OF_RESET);
hreq_response(dd, HREQ_SUCCESS, 0);
break; case HREQ_CONFIG_DONE:
hreq_response(dd, HREQ_SUCCESS, 0); break;
/*
 * Program the allocation unit value (vAU) into SEND_CM_GLOBAL_CREDIT.
 *
 * @dd: the device
 * @vau: value for the AU field
 */
void set_up_vau(struct hfi1_devdata *dd, u8 vau)
{
	u64 credit = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
	u64 au_bits = (u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT;

	/* swap in the new AU field; all other bits are written back as read */
	credit = (credit & ~SEND_CM_GLOBAL_CREDIT_AU_SMASK) | au_bits;
	write_csr(dd, SEND_CM_GLOBAL_CREDIT, credit);
}
/* * Set up initial VL15 credits of the remote. Assumes the rest of * the CM credit registers are zero from a previous global or credit reset. * Shared limit for VL15 will always be 0.
*/ void set_up_vl15(struct hfi1_devdata *dd, u16 vl15buf)
{
u64 reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
/* set initial values for total and shared credit limit */
reg &= ~(SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK |
SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK);
/* * Set total limit to be equal to VL15 credits. * Leave shared limit at 0.
*/
reg |= (u64)vl15buf << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
/*
 * Wipe the credit state left over from the previous connection and
 * reset the CM manager's internal counters.
 */
void reset_link_credits(struct hfi1_devdata *dd)
{
	int vl;

	/* clear the credit limit of every data VL, then VL15 */
	for (vl = 0; vl < TXE_NUM_DATA_VL; vl++)
		write_csr(dd, SEND_CM_CREDIT_VL + (8 * vl), 0);
	write_csr(dd, SEND_CM_CREDIT_VL15, 0);

	write_csr(dd, SEND_CM_GLOBAL_CREDIT, 0);

	/* reset the CM block */
	pio_send_control(dd, PSC_CM_RESET);

	/* the cached VL15 buffer value is no longer valid */
	dd->vl15buf_cached = 0;
}
/*
 * convert a vCU to a CU: CU = 2^vCU
 *
 * The shift is done on an unsigned operand so that vcu == 31 yields
 * 0x80000000 instead of overflowing a signed int.
 */
static u32 vcu_to_cu(u8 vcu)
{
	return 1U << vcu;
}
/* convert a CU to a vCU: the base-2 logarithm of cu, as given by ilog2() */
static u8 cu_to_vcu(u32 cu)
{
	u8 vcu = ilog2(cu);

	return vcu;
}
/*
 * convert a vAU to an AU: AU = 8 * 2^vAU
 *
 * Unsigned arithmetic keeps the result well defined when it reaches
 * bit 31 (vau == 28), where the signed form would overflow.
 */
static u32 vau_to_au(u8 vau)
{
	return 8U * (1U << vau);
}
/*
 * Graceful LCB shutdown.  This leaves the LCB FIFOs in reset.
 *
 * @dd: the device
 * @abort: when non-zero, the LCB and RX FPE are left in reset and the
 *	   saved DC_LCB_ERR_EN value is not written back; when zero, both
 *	   are restored after a short hold
 */
static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
{
	u64 reg;

	/* clear lcb run: LCB_CFG_RUN.EN = 0 */
	write_csr(dd, DC_LCB_CFG_RUN, 0);
	/* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
	write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
		  1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
	/* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
	/* save the error enables first so they can be restored later */
	dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
	reg = read_csr(dd, DCC_CFG_RESET);
	write_csr(dd, DCC_CFG_RESET, reg |
		  DCC_CFG_RESET_RESET_LCB | DCC_CFG_RESET_RESET_RX_FPE);
	(void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
	if (!abort) {
		udelay(1);    /* must hold for the longer of 16cclks or 20ns */
		write_csr(dd, DCC_CFG_RESET, reg);
		write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
	}
}
/*
 * This routine should be called after the link has been transitioned to
 * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
 * reset).
 *
 * The expectation is that the caller of this routine would have taken
 * care of properly transitioning the link into the correct state.
 * NOTE: the caller needs to acquire the dd->dc8051_lock lock
 *       before calling this function.
 *
 * Does nothing when dd->dc_shutdown is already set.
 */
static void _dc_shutdown(struct hfi1_devdata *dd)
{
	lockdep_assert_held(&dd->dc8051_lock);

	if (dd->dc_shutdown)
		return;

	dd->dc_shutdown = 1;
	/* Shutdown the LCB */
	lcb_shutdown(dd, 1);
	/*
	 * Going to OFFLINE would have caused the 8051 to put the
	 * SerDes into reset already.  Just need to shut down the 8051
	 * itself.
	 */
	write_csr(dd, DC_DC8051_CFG_RST, 0x1);
}
/*
 * Calling this after the DC has been brought out of reset should not
 * do any damage.
 * NOTE: the caller needs to acquire the dd->dc8051_lock lock
 *       before calling this function.
 *
 * Does nothing when dd->dc_shutdown is not set.  A timeout while waiting
 * for the 8051 is logged, and the remaining steps are still performed.
 */
static void _dc_start(struct hfi1_devdata *dd)
{
	lockdep_assert_held(&dd->dc8051_lock);

	if (!dd->dc_shutdown)
		return;

	/* Take the 8051 out of reset */
	write_csr(dd, DC_DC8051_CFG_RST, 0ull);
	/* Wait until 8051 is ready */
	if (wait_fm_ready(dd, TIMEOUT_8051_START))
		dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
			   __func__);

	/* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
	write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_OUT_OF_RESET);
	/* lcb_shutdown() with abort=1 does not restore these */
	write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
	dd->dc_shutdown = 0;
}
/* * These LCB adjustments are for the Aurora SerDes core in the FPGA.
*/ staticvoid adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
{
u64 rx_radr, tx_radr;
u32 version;
if (dd->icode != ICODE_FPGA_EMULATION) return;
/* * These LCB defaults on emulator _s are good, nothing to do here: * LCB_CFG_TX_FIFOS_RADR * LCB_CFG_RX_FIFOS_RADR * LCB_CFG_LN_DCLK * LCB_CFG_IGNORE_LOST_RCLK
*/ if (is_emulator_s(dd)) return; /* else this is _p */
version = emulator_rev(dd); if (!is_ax(dd))
version = 0x2d; /* all B0 use 0x2d or higher settings */
if (version <= 0x12) { /* release 0x12 and below */
/*
 * Process one SMA idle message.
 *
 * Runs from a work queue, outside of the interrupt.  @work is the
 * sma_message_work member embedded in the port data.
 */
void handle_sma_message(struct work_struct *work)
{
	struct hfi1_pportdata *ppd =
		container_of(work, struct hfi1_pportdata, sma_message_work);
	struct hfi1_devdata *dd = ppd->dd;
	u64 msg;

	/*
	 * msg is bytes 1-4 of the 40-bit idle message - the command code
	 * is stripped off
	 */
	if (read_idle_sma(dd, &msg))
		return;

	dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg);

	/* Byte[1] (0 for us) carries the command to react to. */
	switch (msg & 0xff) {
	case SMA_IDLE_ARM:
		/*
		 * See OPAv1 table 9-14 - HFI and External Switch Ports Key
		 * State Transitions
		 *
		 * Only expected in INIT or ARMED, discard otherwise.
		 */
		if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
			ppd->neighbor_normal = 1;
		break;
	case SMA_IDLE_ACTIVE:
		/*
		 * See OPAv1 table 9-14 - HFI and External Switch Ports Key
		 * State Transitions
		 *
		 * Can activate the node.  Discard otherwise.
		 */
		if (ppd->host_link_state != HLS_UP_ARMED ||
		    !ppd->is_active_optimize_enabled)
			break;
		ppd->neighbor_normal = 1;
		if (set_link_state(ppd, HLS_UP_ACTIVE))
			dd_dev_err(dd,
				   "%s: received Active SMA idle message, couldn't set link to Active\n",
				   __func__);
		break;
	default:
		dd_dev_err(dd,
			   "%s: received unexpected SMA idle message 0x%llx\n",
			   __func__, msg);
		break;
	}
}
/* * Called from all interrupt handlers to start handling an SPC freeze.
*/ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
{ struct hfi1_devdata *dd = ppd->dd; struct send_context *sc; int i; int sc_flags;
if (flags & FREEZE_SELF)
write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
/* enter frozen mode */
dd->flags |= HFI1_FROZEN;
/* notify all SDMA engines that they are going into a freeze */
sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
sc_flags = SCF_FROZEN | SCF_HALTED | (flags & FREEZE_LINK_DOWN ?
SCF_LINK_DOWN : 0); /* do halt pre-handling on all enabled send contexts */ for (i = 0; i < dd->num_send_contexts; i++) {
sc = dd->send_contexts[i].sc; if (sc && (sc->flags & SCF_ENABLED))
sc_stop(sc, sc_flags);
}
/* Send context are frozen. Notify user space */
hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
/* * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen, * depending on the "freeze" parameter. * * No need to return an error if it times out, our only option * is to proceed anyway.
*/ staticvoid wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
{ unsignedlong timeout;
u64 reg;
timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT); while (1) {
reg = read_csr(dd, CCE_STATUS); if (freeze) { /* waiting until all indicators are set */ if ((reg & ALL_FROZE) == ALL_FROZE) return; /* all done */
} else { /* waiting until all indicators are clear */ if ((reg & ALL_FROZE) == 0) return; /* all done */
}
/*
 * Do all freeze handling for the RXE block: clear the receive port
 * enable bit, then issue a context-disable for every receive context
 * index below dd->num_rcv_contexts.
 */
static void rxe_freeze(struct hfi1_devdata *dd)
{
	int ctxt;

	/* disable port */
	clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);

	/* disable all receive contexts */
	for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++) {
		struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index(dd, ctxt);

		/* every lookup is paired with a put, whatever it returned */
		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, rcd);
		hfi1_rcd_put(rcd);
	}
}
/*
 * Unfreeze handling for the RXE block - kernel contexts only.
 * This will also enable the port.  User contexts will do unfreeze
 * handling on a per-context basis as they call into the driver.
 *
 * A context is re-enabled when it exists and is either below
 * dd->first_dyn_alloc_ctxt or is flagged as a vnic context.
 */
static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
{
	u16 idx;

	/* enable all kernel contexts */
	for (idx = 0; idx < dd->num_rcv_contexts; idx++) {
		struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index(dd, idx);

		/* Ensure all non-user contexts(including vnic) are enabled */
		if (rcd &&
		    (idx < dd->first_dyn_alloc_ctxt || rcd->is_vnic)) {
			u32 rcvmask = HFI1_RCVCTRL_CTXT_ENB;

			/*
			 * HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set
			 * explicitly
			 */
			if (hfi1_rcvhdrtail_kvaddr(rcd))
				rcvmask |= HFI1_RCVCTRL_TAILUPD_ENB;
			else
				rcvmask |= HFI1_RCVCTRL_TAILUPD_DIS;

			hfi1_rcvctrl(dd, rcvmask, rcd);
		}

		/* the put is issued for skipped (and missing) contexts too */
		hfi1_rcd_put(rcd);
	}

	/* enable port */
	add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
}
/*
 * Non-interrupt SPC freeze handling.
 *
 * Work-queue function that runs outside of the triggering interrupt.
 * It waits for the hardware to report frozen, runs the per-block freeze
 * steps, releases the freeze, runs the per-block kernel unfreeze steps,
 * and only then clears HFI1_FROZEN and wakes dd->event_queue.
 */
void handle_freeze(struct work_struct *work)
{
	struct hfi1_devdata *dd =
		container_of(work, struct hfi1_pportdata, freeze_work)->dd;

	/* block until every affected block reports frozen */
	wait_for_freeze_status(dd, 1);

	/*
	 * SPC is now frozen.  Freeze steps, in order: send PIO, send DMA,
	 * (send egress has nothing to do), receive.
	 */
	pio_freeze(dd);
	sdma_freeze(dd);
	rxe_freeze(dd);

	/*
	 * Unfreeze the hardware - clear the freeze, wait for each
	 * block's frozen bit to clear, then clear the frozen flag.
	 */
	write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
	wait_for_freeze_status(dd, 0);

	/*
	 * Unfreeze steps, in order: send PIO (kernel contexts), send DMA,
	 * (send egress has nothing to do), receive (kernel contexts).
	 */
	pio_kernel_unfreeze(dd);
	sdma_unfreeze(dd);
	rxe_kernel_unfreeze(dd);

	/*
	 * The unfreeze procedure touches global device registers when
	 * it disables and re-enables RXE.  Mark the device unfrozen
	 * after all that is done so other parts of the driver waiting
	 * for the device to unfreeze don't do things out of order.
	 *
	 * In other words HFI1_FROZEN means "the device has gone into
	 * freeze mode and freeze mode handling is still in progress";
	 * it is dropped only once that processing has completed.
	 */
	dd->flags &= ~HFI1_FROZEN;
	wake_up(&dd->event_queue);

	/* no longer frozen */
}
/**
 * update_xmit_counters - update PortXmitWait/PortVlXmitWait
 * counters.
 * @ppd: info of physical Hfi port
 * @link_width: new link width after link up or downgrade
 *
 * Update the PortXmitWait and PortVlXmitWait counters after
 * a link up or downgrade event to reflect a link width change.
 */
static void update_xmit_counters(struct hfi1_pportdata *ppd, u16 link_width)
{
	int i;
	u16 tx_width;
	u16 link_speed;

	/*
	 * Both values must be derived before the loop; previously they were
	 * handed to get_xmit_wait_counters() uninitialized and @link_width
	 * was ignored.
	 * NOTE(review): tx_link_width() and get_link_speed() are defined
	 * outside the visible part of this file - confirm they are in scope.
	 */
	tx_width = tx_link_width(link_width);
	link_speed = get_link_speed(ppd->link_speed_active);

	/*
	 * There are C_VL_COUNT number of PortVLXmitWait counters.
	 * Adding 1 to C_VL_COUNT to include the PortXmitWait counter.
	 */
	for (i = 0; i < C_VL_COUNT + 1; i++)
		get_xmit_wait_counters(ppd, tx_width, link_speed, i);
}
/*
 * Handle a link up interrupt from the 8051.
 *
 * Work-queue function, run outside of the interrupt.  Moves the port
 * to HLS_UP_INIT, refreshes the link-up state and then bounces the link
 * if the active speed is not one of the enabled speeds.
 */
void handle_link_up(struct work_struct *work)
{
	struct hfi1_pportdata *ppd;
	struct hfi1_devdata *dd;

	ppd = container_of(work, struct hfi1_pportdata, link_up_work);
	dd = ppd->dd;

	set_link_state(ppd, HLS_UP_INIT);

	/* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
	read_ltp_rtt(dd);

	/*
	 * OPA specifies that certain counters are cleared on a transition
	 * to link up, so do that, and (re)set the link up default values.
	 */
	clear_linkup_counters(dd);
	set_linkup_defaults(ppd);

	/*
	 * Set VL15 credits.  Use cached value from verify cap interrupt.
	 * In case of quick linkup or simulator, vl15 value will be set by
	 * handle_linkup_change.  VerifyCap interrupt handler will not be
	 * called in those scenarios.
	 */
	if (!quick_linkup && dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
		set_up_vl15(dd, dd->vl15buf_cached);

	/* enforce link speed enabled: nothing more to do when it matches */
	if (ppd->link_speed_active & ppd->link_speed_enabled)
		return;

	/* oops - current speed is not enabled, bounce */
	dd_dev_err(dd, "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
		   ppd->link_speed_active, ppd->link_speed_enabled);
	set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
			     OPA_LINKDOWN_REASON_SPEED_POLICY);
	set_link_state(ppd, HLS_DN_OFFLINE);
	start_link(ppd);
}
/*
 * Several pieces of LNI information were cached for SMA in ppd.
 * Reset these on link down: the neighbor GUID, port number, type and
 * FM security fields are all zeroed.
 */
static void reset_neighbor_info(struct hfi1_pportdata *ppd)
{
	ppd->neighbor_guid = 0;
	ppd->neighbor_port_number = 0;
	ppd->neighbor_type = 0;
	ppd->neighbor_fm_security = 0;
}
/*
 * Return the neighbor link down reason string.
 *
 * @reason indexes link_down_reason_strs[].  An index past the end of the
 * table, or one whose slot is NULL, yields "(invalid)" so the result can
 * always be printed with %s.
 */
static const char *link_down_reason_str(u8 reason)
{
	const char *str;

	if (reason >= ARRAY_SIZE(link_down_reason_strs))
		return "(invalid)";

	str = link_down_reason_strs[reason];
	return str ? str : "(invalid)";
}
/* * Handle a link down interrupt from the 8051. * * This is a work-queue function outside of the interrupt.
*/ void handle_link_down(struct work_struct *work)
{
u8 lcl_reason, neigh_reason = 0;
u8 link_down_reason; struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
link_down_work); int was_up; staticconstchar ldr_str[] = "Link down reason: ";
/* Go offline first, then deal with reading/writing through 8051 */
was_up = !!(ppd->host_link_state & HLS_UP);
set_link_state(ppd, HLS_DN_OFFLINE);
xchg(&ppd->is_link_down_queued, 0);
if (was_up) {
lcl_reason = 0; /* link down reason is only valid if the link was up */
read_link_down_reason(ppd->dd, &link_down_reason); switch (link_down_reason) { case LDR_LINK_TRANSFER_ACTIVE_LOW: /* the link went down, no idle message reason */
dd_dev_info(ppd->dd, "%sUnexpected link down\n",
ldr_str); break; case LDR_RECEIVED_LINKDOWN_IDLE_MSG: /* * The neighbor reason is only valid if an idle message * was received for it.
*/
read_planned_down_reason_code(ppd->dd, &neigh_reason);
dd_dev_info(ppd->dd, "%sNeighbor link down message %d, %s\n",
ldr_str, neigh_reason,
link_down_reason_str(neigh_reason)); break; case LDR_RECEIVED_HOST_OFFLINE_REQ:
dd_dev_info(ppd->dd, "%sHost requested link to go offline\n",
ldr_str); break; default:
dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
ldr_str, link_down_reason); break;
}
/* * If no reason, assume peer-initiated but missed * LinkGoingDown idle flits.
*/ if (neigh_reason == 0)
lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
} else { /* went down while polling or going up */
lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
}
/* inform the SMA when the link transitions from up to down */ if (was_up && ppd->local_link_down_reason.sma == 0 &&
ppd->neigh_link_down_reason.sma == 0) {
ppd->local_link_down_reason.sma =
ppd->local_link_down_reason.latest;
ppd->neigh_link_down_reason.sma =
ppd->neigh_link_down_reason.latest;
}
reset_neighbor_info(ppd);
/* disable the port */
clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
/* * If there is no cable attached, turn the DC off. Otherwise, * start the link bring up.
*/ if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd))
dc_shutdown(ppd->dd); else
start_link(ppd);
}
/* * Only do something if the link is currently up.
*/ if (ppd->host_link_state & HLS_UP) {
set_link_state(ppd, HLS_DN_OFFLINE);
start_link(ppd);
} else {
dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
__func__, link_state_name(ppd->host_link_state));
}
}
/*
 * Mask conversion: Capability exchange to Port LTP.  The capability
 * exchange has an implicit 16b CRC that is mandatory, so
 * PORT_LTP_CRC_MODE_16 is always present in the result; each CAP_CRC_*
 * bit set in @cap adds its PORT_LTP_CRC_MODE_* counterpart.
 */
static int cap_to_port_ltp(int cap)
{
	int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */

	if (cap & CAP_CRC_14B)
		port_ltp |= PORT_LTP_CRC_MODE_14;
	if (cap & CAP_CRC_48B)
		port_ltp |= PORT_LTP_CRC_MODE_48;
	if (cap & CAP_CRC_12B_16B_PER_LANE)
		port_ltp |= PORT_LTP_CRC_MODE_PER_LANE;

	return port_ltp;
}
/*
 * Convert an OPA Port LTP mask to capability mask.
 *
 * Each PORT_LTP_CRC_MODE_{14,48,PER_LANE} bit set in @port_ltp
 * contributes the matching CAP_CRC_* bit; every other bit of the input
 * is ignored.
 */
int port_ltp_to_cap(int port_ltp)
{
	int caps = 0;

	caps |= (port_ltp & PORT_LTP_CRC_MODE_14) ? CAP_CRC_14B : 0;
	caps |= (port_ltp & PORT_LTP_CRC_MODE_48) ? CAP_CRC_48B : 0;
	caps |= (port_ltp & PORT_LTP_CRC_MODE_PER_LANE) ?
		CAP_CRC_12B_16B_PER_LANE : 0;

	return caps;
}
/* * Convert a single DC LCB CRC mode to an OPA Port LTP mask.
*/ staticint lcb_to_port_ltp(int lcb_crc)
{ int port_ltp = 0;
/*
 * Convert the given link width (a lane count) to the OPA link width
 * bitmask.
 *
 * 0 maps to "no lanes up" (0) except on the simulator or with quick
 * linkup, where it is treated as 4x.  1 through 4 map to the matching
 * OPA_LINK_WIDTH_nX value.  Anything larger is logged and treated as 4x.
 */
static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
{
	if (width == 0) {
		/*
		 * Simulator and quick linkup do not set the width.
		 * Just set it to 4x without complaint.
		 */
		if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
			return OPA_LINK_WIDTH_4X;
		return 0; /* no lanes up */
	}

	if (width > 4) {
		dd_dev_info(dd, "%s: invalid width %d, using 4\n",
			    __func__, width);
		return OPA_LINK_WIDTH_4X;
	}

	/* width is 1..4 from here on */
	switch (width) {
	case 1:
		return OPA_LINK_WIDTH_1X;
	case 2:
		return OPA_LINK_WIDTH_2X;
	case 3:
		return OPA_LINK_WIDTH_3X;
	default:
		return OPA_LINK_WIDTH_4X;
	}
}
/* * Do a population count on the bottom nibble.
*/ staticconst u8 bit_counts[16] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
};
/* * Read the active lane information from the 8051 registers and return * their widths. * * Active lane information is found in these 8051 registers: * enable_lane_tx * enable_lane_rx
*/ staticvoid get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
u16 *rx_width)
{
u16 tx, rx;
u8 enable_lane_rx;
u8 enable_lane_tx;
u8 tx_polarity_inversion;
u8 rx_polarity_inversion;
u8 max_rate;
/* read the active lanes */
read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
&rx_polarity_inversion, &max_rate);
read_local_lni(dd, &enable_lane_rx);
/* * Set link_speed_active here, overriding what was set in * handle_verify_cap(). The ASIC 8051 firmware does not correctly * set the max_rate field in handle_verify_cap until v0.19.
*/ if ((dd->icode == ICODE_RTL_SILICON) &&
(dd->dc8051_ver < dc8051_ver(0, 19, 0))) { /* max_rate: 0 = 12.5G, 1 = 25G */ switch (max_rate) { case 0:
dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G; break; case 1:
dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G; break; default:
dd_dev_err(dd, "%s: unexpected max rate %d, using 25Gb\n",
__func__, (int)max_rate);
dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G; break;
}
}
/*
 * Read verify_cap_local_fm_link_width[1] to obtain the link widths.
 * Valid after the end of VerifyCap and during LinkUp.  Does not change
 * after link up.  I.e. look elsewhere for downgrade information.
 *
 * Bits are:
 *	+ bits [7:4] contain the number of active transmitters
 *	+ bits [3:0] contain the number of active receivers
 * These are numbers 1 through 4 and can be different values if the
 * link is asymmetric.
 *
 * verify_cap_local_fm_link_width[0] retains its original value.
 *
 * On return *tx_width and *rx_width hold the OPA link width bitmasks for
 * those two counts.
 */
static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
			      u16 *rx_width)
{
	u16 widths, tx, rx;
	u8 misc_bits, local_flags;
	u16 active_tx, active_rx;

	/*
	 * The out-parameters used to be left unwritten.  Fetch the local
	 * link mode and decode byte [1], i.e. the upper byte of the 16-bit
	 * widths value.
	 * NOTE(review): read_vc_local_link_mode() is defined outside the
	 * visible part of this file - confirm its name and argument order.
	 */
	read_vc_local_link_mode(dd, &misc_bits, &local_flags, &widths);
	tx = widths >> 12;
	rx = (widths >> 8) & 0xf;

	*tx_width = link_width_to_bits(dd, tx);
	*rx_width = link_width_to_bits(dd, rx);

	/* print the active widths */
	get_link_widths(dd, &active_tx, &active_rx);
}
/*
 * Set ppd->link_width_active and ppd->link_width_downgrade_active using
 * hardware information when the link first comes up.
 *
 * The link width is not available until after VerifyCap.AllFramesReceived
 * (the trigger for handle_verify_cap), so this is outside that routine
 * and should be called when the 8051 signals linkup.
 */
void get_linkup_link_widths(struct hfi1_pportdata *ppd)
{
	u16 tx_bits, rx_bits;

	/* per OPA spec, on link up LWD.E resets to LWD.S */
	ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;

	/* get end-of-LNI link widths */
	get_linkup_widths(ppd->dd, &tx_bits, &rx_bits);

	/*
	 * Use the tx width as the link is supposed to be symmetric on link
	 * up; the rx width that was read is deliberately not consulted.
	 */
	ppd->link_width_active = tx_bits;

	/* link width downgrade active (LWD.A) starts out matching LW.A */
	ppd->link_width_downgrade_tx_active = ppd->link_width_active;
	ppd->link_width_downgrade_rx_active = ppd->link_width_active;

	/* cache the active egress rate (units [10^6 bits/sec]) */
	ppd->current_egress_rate = active_egress_rate(ppd);
}
/* * Handle a verify capabilities interrupt from the 8051. * * This is a work-queue function outside of the interrupt.
*/ void handle_verify_cap(struct work_struct *work)
{ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
link_vc_work); struct hfi1_devdata *dd = ppd->dd;
u64 reg;
u8 power_management;
u8 continuous;
u8 vcu;
u8 vau;
u8 z;
u16 vl15buf;
u16 link_widths;
u16 crc_mask;
u16 crc_val;
u16 device_id;
u16 active_tx, active_rx;
u8 partner_supported_crc;
u8 remote_tx_rate;
u8 device_rev;
/* print the active widths */
get_link_widths(dd, &active_tx, &active_rx);
dd_dev_info(dd, "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
(int)power_management, (int)continuous);
dd_dev_info(dd, "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
(int)vau, (int)z, (int)vcu, (int)vl15buf,
(int)partner_supported_crc);
dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
(u32)remote_tx_rate, (u32)link_widths);
dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
(u32)device_id, (u32)device_rev); /* * The peer vAU value just read is the peer receiver value. HFI does * not support a transmit vAU of 0 (AU == 8). We advertised that * with Z=1 in the fabric capabilities sent to the peer. The peer * will see our Z=1, and, if it advertised a vAU of 0, will move its * receive to vAU of 1 (AU == 16). Do the same here. We do not care * about the peer Z value - our sent vAU is 3 (hardwired) and is not * subject to the Z value exception.
*/ if (vau == 0)
vau = 1;
set_up_vau(dd, vau);
/* * Set VL15 credits to 0 in global credit register. Cache remote VL15 * credits value and wait for link-up interrupt ot set it.
*/
set_up_vl15(dd, 0);
dd->vl15buf_cached = vl15buf;
/* set up the LCB CRC mode */
crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
/* order is important: use the lowest bit in common */ if (crc_mask & CAP_CRC_14B)
crc_val = LCB_CRC_14B; elseif (crc_mask & CAP_CRC_48B)
crc_val = LCB_CRC_48B; elseif (crc_mask & CAP_CRC_12B_16B_PER_LANE)
crc_val = LCB_CRC_12B_16B_PER_LANE; else
crc_val = LCB_CRC_16B;
/* * Cache the values of the supported, enabled, and active * LTP CRC modes to return in 'portinfo' queries. But the bit * flags that are returned in the portinfo query differ from * what's in the link_crc_mask, crc_sizes, and crc_val * variables. Convert these here.
*/
ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8; /* supported crc modes */
ppd->port_ltp_crc_mode |=
cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4; /* enabled crc modes */
ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val); /* active crc mode */
/* set up the remote credit return table */
assign_remote_cm_au_table(dd, vcu);
/* * The LCB is reset on entry to handle_verify_cap(), so this must * be applied on every link up. * * Adjust LCB error kill enable to kill the link if * these RBUF errors are seen: * REPLAY_BUF_MBE_SMASK * FLIT_INPUT_BUF_MBE_SMASK
*/ if (is_ax(dd)) { /* fixed in B0 */
reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
| DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
}
/* pull LCB fifos out of reset - all fifo clocks must be stable */
write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
/* give 8051 access to the LCB CSRs */
write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
set_8051_lcb_access(dd);
/* tell the 8051 to go to LinkUp */
set_link_state(ppd, HLS_GOING_UP);
}
/** * apply_link_downgrade_policy - Apply the link width downgrade enabled * policy against the current active link widths. * @ppd: info of physical Hfi port * @refresh_widths: True indicates link downgrade event * @return: True indicates a successful link downgrade. False indicates * link downgrade event failed and the link will bounce back to * default link width. * * Called when the enabled policy changes or the active link widths * change. * Refresh_widths indicates that a link downgrade occurred. The * link_downgraded variable is set by refresh_widths and * determines the success/failure of the policy application.
*/ bool apply_link_downgrade_policy(struct hfi1_pportdata *ppd, bool refresh_widths)
{ int do_bounce = 0; int tries;
u16 lwde;
u16 tx, rx; bool link_downgraded = refresh_widths;
/* use the hls lock to avoid a race with actual link up */
tries = 0;
retry:
mutex_lock(&ppd->hls_lock); /* only apply if the link is up */ if (ppd->host_link_state & HLS_DOWN) { /* still going up..wait and retry */ if (ppd->host_link_state & HLS_GOING_UP) { if (++tries < 1000) {
mutex_unlock(&ppd->hls_lock);
usleep_range(100, 120); /* arbitrary */ goto retry;
}
dd_dev_err(ppd->dd, "%s: giving up waiting for link state change\n",
__func__);
} goto done;
}
if (ppd->link_width_downgrade_tx_active == 0 ||
ppd->link_width_downgrade_rx_active == 0) { /* the 8051 reported a dead link as a downgrade */
dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
link_downgraded = false;
} elseif (lwde == 0) { /* downgrade is disabled */
/* bounce if not at starting active width */ if ((ppd->link_width_active !=
ppd->link_width_downgrade_tx_active) ||
(ppd->link_width_active !=
ppd->link_width_downgrade_rx_active)) {
dd_dev_err(ppd->dd, "Link downgrade is disabled and link has downgraded, downing link\n");
dd_dev_err(ppd->dd, " original 0x%x, tx active 0x%x, rx active 0x%x\n",
ppd->link_width_active,
ppd->link_width_downgrade_tx_active,
ppd->link_width_downgrade_rx_active);
do_bounce = 1;
link_downgraded = false;
}
} elseif ((lwde & ppd->link_width_downgrade_tx_active) == 0 ||
(lwde & ppd->link_width_downgrade_rx_active) == 0) { /* Tx or Rx is outside the enabled policy */
dd_dev_err(ppd->dd, "Link is outside of downgrade allowed, downing link\n");
dd_dev_err(ppd->dd, " enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
lwde, ppd->link_width_downgrade_tx_active,
ppd->link_width_downgrade_rx_active);
do_bounce = 1;
link_downgraded = false;
}
/*
 * Handle a link downgrade interrupt from the 8051.
 *
 * Work-queue function, run outside of the interrupt.  The transmit wait
 * counters are refreshed only when apply_link_downgrade_policy() returns
 * true for the downgrade.
 */
void handle_link_downgrade(struct work_struct *work)
{
	struct hfi1_pportdata *ppd;

	ppd = container_of(work, struct hfi1_pportdata, link_downgrade_work);

	dd_dev_info(ppd->dd, "8051: Link width downgrade\n");

	if (!apply_link_downgrade_policy(ppd, true))
		return;

	update_xmit_counters(ppd, ppd->link_width_downgrade_tx_active);
}
/* look at the flags */ if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) { /* 8051 information set by firmware */ /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
& DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
host_msg = (info >>
DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
& DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
/* * Handle error flags.
*/ if (err & FAILED_LNI) { /* * LNI error indications are cleared by the 8051 * only when starting polling. Only pay attention * to them when in the states that occur during * LNI.
*/ if (ppd->host_link_state
& (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
queue_link_down = 1;
dd_dev_info(dd, "Link error: %s\n",
dc8051_info_err_string(buf, sizeof(buf),
err &
FAILED_LNI));
}
err &= ~(u64)FAILED_LNI;
} /* unknown frames can happen durning LNI, just count */ if (err & UNKNOWN_FRAME) {
ppd->unknown_frame_count++;
err &= ~(u64)UNKNOWN_FRAME;
} if (err) { /* report remaining errors, but do not do anything */
dd_dev_err(dd, "8051 info error: %s\n",
dc8051_info_err_string(buf, sizeof(buf),
err));
}
/* * Handle host message flags.
*/ if (host_msg & HOST_REQ_DONE) { /* * Presently, the driver does a busy wait for * host requests to complete. This is only an * informational message. * NOTE: The 8051 clears the host message * information *on the next 8051 command*. * Therefore, when linkup is achieved, * this flag will still be set.
*/
host_msg &= ~(u64)HOST_REQ_DONE;
} if (host_msg & BC_SMA_MSG) {
queue_work(ppd->link_wq, &ppd->sma_message_work);
host_msg &= ~(u64)BC_SMA_MSG;
} if (host_msg & LINKUP_ACHIEVED) {
dd_dev_info(dd, "8051: Link up\n");
queue_work(ppd->link_wq, &ppd->link_up_work);
host_msg &= ~(u64)LINKUP_ACHIEVED;
} if (host_msg & EXT_DEVICE_CFG_REQ) {
handle_8051_request(ppd);
host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
} if (host_msg & VERIFY_CAP_FRAME) {
queue_work(ppd->link_wq, &ppd->link_vc_work);
host_msg &= ~(u64)VERIFY_CAP_FRAME;
} if (host_msg & LINK_GOING_DOWN) { constchar *extra = ""; /* no downgrade action needed if going down */ if (host_msg & LINK_WIDTH_DOWNGRADED) {
host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
extra = " (ignoring downgrade)";
}
dd_dev_info(dd, "8051: Link down%s\n", extra);
queue_link_down = 1;
host_msg &= ~(u64)LINK_GOING_DOWN;
} if (host_msg & LINK_WIDTH_DOWNGRADED) {
queue_work(ppd->link_wq, &ppd->link_downgrade_work);
host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
} if (host_msg) { /* report remaining messages, but do not do anything */
dd_dev_info(dd, "8051 info host message: %s\n",
dc8051_info_host_msg_string(buf, sizeof(buf),
host_msg));
}
reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
} if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) { /* * Lost the 8051 heartbeat. If this happens, we * receive constant interrupts about it. Disable * the interrupt after the first.
*/
dd_dev_err(dd, "Lost 8051 heartbeat\n");
write_csr(dd, DC_DC8051_ERR_EN,
read_csr(dd, DC_DC8051_ERR_EN) &
~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
} if (reg) { /* report the error, but do not do anything */
dd_dev_err(dd, "8051 error: %s\n",
dc8051_err_string(buf, sizeof(buf), reg));
}
if (queue_link_down) { /* * if the link is already going down or disabled, do not * queue another. If there's a link down entry already * queued, don't queue another one.
*/ if ((ppd->host_link_state &
(HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) ||
ppd->link_enabled == 0) {
dd_dev_info(dd, "%s: not queuing link down. host_link_state %x, link_enabled %x\n",
__func__, ppd->host_link_state,
ppd->link_enabled);
} else { if (xchg(&ppd->is_link_down_queued, 1) == 1)
dd_dev_info(dd, "%s: link down request already queued\n",
__func__); else
queue_work(ppd->link_wq, &ppd->link_down_work);
}
}
}
/*
 * Text for the FM configuration error codes, indexed by code.
 * Code 7 has no entry, so that slot is NULL.
 */
static const char * const fm_config_txt[] = {
	[0] = "BadHeadDist: Distance violation between two head flits",
	[1] = "BadTailDist: Distance violation between two tail flits",
	[2] = "BadCtrlDist: Distance violation between two credit control flits",
	[3] = "BadCrdAck: Credits return for unsupported VL",
	[4] = "UnsupportedVLMarker: Received VL Marker",
	[5] = "BadPreempt: Exceeded the preemption nesting level",
	[6] = "BadControlFlit: Received unsupported control flit",
	/* no 7 */
	[8] = "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
};
/*
 * Text for the port receive error codes, indexed by code.
 * Codes 0, 8 and 10 have no entry, so those slots are NULL.
 */
static const char * const port_rcv_txt[] = {
	/* no 0 */
	[1] = "BadPktLen: Illegal PktLen",
	[2] = "PktLenTooLong: Packet longer than PktLen",
	[3] = "PktLenTooShort: Packet shorter than PktLen",
	[4] = "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
	[5] = "BadDLID: Illegal DLID (0, doesn't match HFI)",
	[6] = "BadL2: Illegal L2 opcode",
	[7] = "BadSC: Unsupported SC",
	/* no 8 */
	[9] = "BadRC: Illegal RC",
	/* no 10 */
	[11] = "PreemptError: Preempting with same VL",
	[12] = "PreemptVL15: Preempting a VL15 packet",
};
if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) { if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK; /* set status bit */
dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
}
reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
}
if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) { struct hfi1_pportdata *ppd = dd->pport; /* this counter saturates at (2^32) - 1 */ if (ppd->link_downed < (u32)UINT_MAX)
ppd->link_downed++;
reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
}
if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
u8 reason_valid = 1;
info = read_csr(dd, DCC_ERR_INFO_FMCONFIG); if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK; /* set status bit */
dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
} switch (info) { case 0: case 1: case 2: case 3: case 4: case 5: case 6:
extra = fm_config_txt[info]; break; case 8:
extra = fm_config_txt[info]; if (ppd->port_error_action &
OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
do_bounce = 1; /* * lcl_reason cannot be derived from info * for this error
*/
lcl_reason =
OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
} break; default:
reason_valid = 0;
snprintf(buf, sizeof(buf), "reserved%lld", info);
extra = buf; break;
}
if (reason_valid && !do_bounce) {
do_bounce = ppd->port_error_action &
(1 << (OPA_LDR_FMCONFIG_OFFSET + info));
lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
}
/* just report this */
dd_dev_info_ratelimited(dd, "DCC Error: fmconfig error: %s\n",
extra);
reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
}
if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
u8 reason_valid = 1;
info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1); if (!(dd->err_info_rcvport.status_and_code &
OPA_EI_STATUS_SMASK)) {
dd->err_info_rcvport.status_and_code =
info & OPA_EI_CODE_SMASK; /* set status bit */
dd->err_info_rcvport.status_and_code |=
OPA_EI_STATUS_SMASK; /* * save first 2 flits in the packet that caused * the error
*/
dd->err_info_rcvport.packet_flit1 = hdr0;
dd->err_info_rcvport.packet_flit2 = hdr1;
} switch (info) { case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 9: case 11: case 12:
extra = port_rcv_txt[info]; break; default:
reason_valid = 0;
snprintf(buf, sizeof(buf), "reserved%lld", info);
extra = buf; break;
}
if (reason_valid && !do_bounce) {
do_bounce = ppd->port_error_action &
(1 << (OPA_LDR_PORTRCV_OFFSET + info));
lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
}
/* just report this */
dd_dev_info_ratelimited(dd, "DCC Error: PortRcv error: %s\n" " hdr0 0x%llx, hdr1 0x%llx\n",
extra, hdr0, hdr1);
reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
}
if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) { /* informative only */
dd_dev_info_ratelimited(dd, "8051 access to LCB blocked\n");
reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
} if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) { /* informative only */
dd_dev_info_ratelimited(dd, "host access to LCB blocked\n");
reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
}
if (unlikely(hfi1_dbg_fault_suppress_err(&dd->verbs_dev)))
reg &= ~DCC_ERR_FLG_LATE_EBP_ERR_SMASK;
/* report any remaining errors */ if (reg)
dd_dev_info_ratelimited(dd, "DCC Error: %s\n",
dcc_err_string(buf, sizeof(buf), reg));
if (lcl_reason == 0)
lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
if (eri->handler) {
interrupt_clear_down(dd, 0, eri);
} elseif (source == 3 /* dc_lbm_int */) { /* * This indicates that a parity error has occurred on the * address/control lines presented to the LBM. The error * is a single pulse, there is no associated error flag, * and it is non-maskable. This is because if a parity * error occurs on the request the request is dropped. * This should never occur, but it is nice to know if it * ever does.
*/
dd_dev_err(dd, "Parity error in DC LBM block\n");
} else {
dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
}
}
if (likely(what < 3 && which < dd->num_sdma)) {
sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
} else { /* should not happen */
dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
}
}
/** * is_rcv_avail_int() - User receive context available IRQ handler * @dd: valid dd * @source: logical IRQ source (offset from IS_RCVAVAIL_START) * * RX block receive available interrupt. Source is < 160. * * This is the general interrupt handler for user (PSM) receive contexts, * and can only be used for non-threaded IRQs.
*/ staticvoid is_rcv_avail_int(struct hfi1_devdata *dd, unsignedint source)
{ struct hfi1_ctxtdata *rcd; char *err_detail;
if (likely(source < dd->num_rcv_contexts)) {
rcd = hfi1_rcd_get_by_index(dd, source); if (rcd) {
handle_user_interrupt(rcd);
hfi1_rcd_put(rcd); return; /* OK */
} /* received an interrupt, but no rcd */
err_detail = "dataless";
} else { /* received an interrupt, but are not using that context */
err_detail = "out of range";
}
dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
err_detail, source);
}
/** * is_rcv_urgent_int() - User receive context urgent IRQ handler * @dd: valid dd * @source: logical IRQ source (offset from IS_RCVURGENT_START) * * RX block receive urgent interrupt. Source is < 160. * * NOTE: kernel receive contexts specifically do NOT enable this IRQ.
*/ staticvoid is_rcv_urgent_int(struct hfi1_devdata *dd, unsignedint source)
{ struct hfi1_ctxtdata *rcd; char *err_detail;
if (likely(source < dd->num_rcv_contexts)) {
rcd = hfi1_rcd_get_by_index(dd, source); if (rcd) {
handle_user_interrupt(rcd);
hfi1_rcd_put(rcd); return; /* OK */
} /* received an interrupt, but no rcd */
err_detail = "dataless";
} else { /* received an interrupt, but are not using that context */
err_detail = "out of range";
}
dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
err_detail, source);
}
/* * Reserved range interrupt. Should not be called in normal operation.
*/ staticvoid is_reserved_int(struct hfi1_devdata *dd, unsignedint source)
{ char name[64];
/* * Interrupt source interrupt - called when the given source has an interrupt. * Source is a bit index into an array of 64-bit integers.
*/ staticvoid is_interrupt(struct hfi1_devdata *dd, unsignedint source)
{ conststruct is_table *entry;
/* avoids a double compare by walking the table in-order */ for (entry = &is_table[0]; entry->is_name; entry++) { if (source <= entry->end) {
trace_hfi1_interrupt(dd, entry, source);
entry->is_int(dd, source - entry->start); return;
}
} /* fell off the end */
dd_dev_err(dd, "invalid interrupt source %u\n", source);
}
/** * general_interrupt - General interrupt handler * @irq: MSIx IRQ vector * @data: hfi1 devdata * * This is able to correctly handle all non-threaded interrupts. Receive * context DATA IRQs are threaded and are not supported by this handler. *
*/
irqreturn_t general_interrupt(int irq, void *data)
{ struct hfi1_devdata *dd = data;
u64 regs[CCE_NUM_INT_CSRS];
u32 bit; int i;
irqreturn_t handled = IRQ_NONE;
this_cpu_inc(*dd->int_counter);
/* phase 1: scan and clear all handled interrupts */ for (i = 0; i < CCE_NUM_INT_CSRS; i++) { if (dd->gi_mask[i] == 0) {
regs[i] = 0; /* used later */ continue;
}
regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) &
dd->gi_mask[i]; /* only clear if anything is set */ if (regs[i])
write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]);
}
/* This read_csr is really bad in the hot path */
status = read_csr(dd,
CCE_INT_STATUS + (8 * (IS_SDMA_START / 64)))
& sde->imask; if (likely(status)) { /* clear the interrupt(s) */
write_csr(dd,
CCE_INT_CLEAR + (8 * (IS_SDMA_START / 64)),
status);
/* handle the interrupt(s) */
sdma_engine_interrupt(sde, status);
} else {
dd_dev_info_ratelimited(dd, "SDMA engine %u interrupt, but no status bits set\n",
sde->this_idx);
} return IRQ_HANDLED;
}
/*
 * Clear the receive interrupt for a context.
 *
 * The clear CSR is read back after the write to ensure that the write has
 * completed.  This does NOT guarantee that queued DMA writes to memory
 * from the chip have been pushed.
 */
static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd)
{
	u32 clear_csr = CCE_INT_CLEAR + (8 * rcd->ireg);
	struct hfi1_devdata *dd = rcd->dd;

	write_csr(dd, clear_csr, rcd->imask);
	/* read back to force the write onto the chip; the value is unused */
	(void)read_csr(dd, clear_csr);
}
/*
 * Force the receive interrupt: write the context's interrupt mask to the
 * interrupt force CSR selected by the context's ireg.
 */
void force_recv_intr(struct hfi1_ctxtdata *rcd)
{
	struct hfi1_devdata *dd = rcd->dd;

	write_csr(dd, CCE_INT_FORCE + (8 * rcd->ireg), rcd->imask);
}
/*
 * Return non-zero if a packet is present.
 *
 * Called when rechecking for packets after the RcvAvail interrupt has been
 * cleared down.  Memory is checked first because that is cheap.  Only when
 * nothing is found there is the context tail read from its CSR, which is
 * expensive but gives the actual tail: there is no way to push pending
 * DMAs to memory other than an interrupt, and the caller is trying to
 * decide whether one must be forced.
 */
static inline int check_packet_present(struct hfi1_ctxtdata *rcd)
{
	u32 hw_tail;

	if (!hfi1_packet_present(rcd)) {
		/* CSR fallback, correct independent of DMA_RTAIL */
		hw_tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt,
					      RCV_HDR_TAIL);
		return hfi1_rcd_head(rcd) != hw_tail;
	}

	return 1;
}
/* * Common code for receive contexts interrupt handlers. * Update traces, increment kernel IRQ counter and * setup ASPM when needed.
*/ staticvoid receive_interrupt_common(struct hfi1_ctxtdata *rcd)
{ struct hfi1_devdata *dd = rcd->dd;
/*
 * __hfi1_rcd_eoi_intr() - Make HW issue receive interrupt
 * when there are packets present in the queue.  When calling
 * with interrupts enabled please use hfi1_rcd_eoi_intr.
 *
 * @rcd: valid receive context
 *
 * Nothing is done for a context without a receive header queue.
 */
static void __hfi1_rcd_eoi_intr(struct hfi1_ctxtdata *rcd)
{
	if (rcd->rcvhdrq) {
		clear_recv_intr(rcd);
		/* a packet is present after the clear: raise the IRQ again */
		if (check_packet_present(rcd))
			force_recv_intr(rcd);
	}
}
/** * hfi1_rcd_eoi_intr() - End of Interrupt processing action * * @rcd: Ptr to hfi1_ctxtdata of receive context * * Hold IRQs so we can safely clear the interrupt and * recheck for a packet that may have arrived after the previous * check and the interrupt clear. If a packet arrived, force another * interrupt. This routine can be called at the end of receive packet * processing in interrupt service routines, interrupt service thread * and softirqs
*/ staticvoid hfi1_rcd_eoi_intr(struct hfi1_ctxtdata *rcd)
{ unsignedlong flags;
/**
 * hfi1_netdev_rx_napi - napi poll function to move eoi inline
 * @napi: pointer to napi object
 * @budget: netdev budget
 *
 * Runs the receive context's do_interrupt handler with @budget.  When less
 * than the full budget was consumed, polling is completed and the
 * end-of-interrupt action is performed on the context.
 *
 * Return: the value returned by the context's do_interrupt handler.
 */
int hfi1_netdev_rx_napi(struct napi_struct *napi, int budget)
{
	struct hfi1_netdev_rxq *rxq =
		container_of(napi, struct hfi1_netdev_rxq, napi);
	struct hfi1_ctxtdata *rcd = rxq->rcd;
	int processed = rcd->do_interrupt(rcd, budget);

	if (processed >= budget)
		return processed;

	napi_complete_done(napi, processed);
	hfi1_rcd_eoi_intr(rcd);

	return processed;
}
/*
 * Receive packet napi handler for netdevs VNIC and AIP.
 *
 * Schedules the context's napi instance when it can be scheduled.  In every
 * other case (scheduling was refused, or no napi instance is attached,
 * which also triggers a one-time warning) the end-of-interrupt action is
 * run directly.  Always returns IRQ_HANDLED.
 */
irqreturn_t receive_context_interrupt_napi(int irq, void *data)
{
	struct hfi1_ctxtdata *rcd = data;

	receive_interrupt_common(rcd);

	if (likely(rcd->napi)) {
		if (likely(napi_schedule_prep(rcd->napi))) {
			__napi_schedule_irqoff(rcd->napi);
			return IRQ_HANDLED;
		}
	} else {
		WARN_ONCE(1, "Napi IRQ handler without napi set up ctxt=%d\n",
			  rcd->ctxt);
	}

	/* napi was not scheduled: finish the interrupt here */
	__hfi1_rcd_eoi_intr(rcd);

	return IRQ_HANDLED;
}
/* * Receive packet IRQ handler. This routine expects to be on its own IRQ. * This routine will try to handle packets immediately (latency), but if * it finds too many, it will invoke the thread handler (bandwitdh). The * chip receive interrupt is *not* cleared down until this or the thread (if * invoked) is finished. The intent is to avoid extra interrupts while we * are processing packets anyway.
*/
irqreturn_t receive_context_interrupt(int irq, void *data)
{ struct hfi1_ctxtdata *rcd = data; int disposition;
/* * Too many packets were seen while processing packets in this * IRQ handler. Invoke the handler thread. The receive interrupt * remains blocked.
*/ if (disposition == RCV_PKT_LIMIT) return IRQ_WAKE_THREAD;
__hfi1_rcd_eoi_intr(rcd); return IRQ_HANDLED;
}
/* * Receive packet thread handler. This expects to be invoked with the * receive interrupt still blocked.
*/
irqreturn_t receive_context_thread(int irq, void *data)
{ struct hfi1_ctxtdata *rcd = data;
/* receive interrupt is still blocked from the IRQ handler */
(void)rcd->do_interrupt(rcd, 1);
/* register is an index of LCB registers: (offset - base) / 8 */
regno = (addr - DC_LCB_CFG_RUN) >> 3;
ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data); if (ret != HCMD_SUCCESS) return -EBUSY; return 0;
}
/*
 * Provide a cache for some of the LCB registers in case the LCB is
 * unavailable.
 * (The LCB is unavailable in certain link states, for example.)
 *
 * One cache entry pairs a register offset with the value last saved for it.
*/ struct lcb_datum {
u32 off; /* register offset; the key read_lcb_cache() matches on */
u64 val; /* saved register value; refreshed by update_lcb_cache() */
};
/*
 * Refresh every entry of lcb_cache from its LCB CSR.
 *
 * An entry keeps its previous value when the read reports -EBUSY.
 */
static void update_lcb_cache(struct hfi1_devdata *dd)
{
	int idx = 0;

	while (idx < ARRAY_SIZE(lcb_cache)) {
		u64 fresh;
		int rc = read_lcb_csr(dd, lcb_cache[idx].off, &fresh);

		/* only overwrite the cached value with good data */
		if (likely(rc != -EBUSY))
			lcb_cache[idx].val = fresh;
		idx++;
	}
}
/*
 * Look up a cached LCB register value.
 *
 * @off: register offset to look for
 * @val: receives the cached value on success; untouched on failure
 *
 * Return 0 when @off is one of the cached registers, otherwise warn and
 * return -1.
 */
static int read_lcb_cache(u32 off, u64 *val)
{
	int slot;

	for (slot = 0; slot < ARRAY_SIZE(lcb_cache); slot++) {
		if (lcb_cache[slot].off != off)
			continue;

		*val = lcb_cache[slot].val;
		return 0;
	}

	pr_warn("%s bad offset 0x%x\n", __func__, off);
	return -1;
}
/* * Read an LCB CSR. Access may not be in host control, so check. * Return 0 on success, -EBUSY on failure.
*/ int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
{ struct hfi1_pportdata *ppd = dd->pport;
/* if up, go through the 8051 for the value */ if (ppd->host_link_state & HLS_UP) return read_lcb_via_8051(dd, addr, data); /* if going up or down, check the cache, otherwise, no access */ if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE)) { if (read_lcb_cache(addr, data)) return -EBUSY; return 0;
}
/* register is an index of LCB registers: (offset - base) / 8 */
regno = (addr - DC_LCB_CFG_RUN) >> 3;
ret = do_8051_command(dd, HCMD_WRITE_LCB_CSR, regno, &data); if (ret != HCMD_SUCCESS) return -EBUSY; return 0;
}
/*
 * Write an LCB CSR.  Access may not be in host control, so check.
 *
 * - link up: the write is routed through the 8051 and its result returned
 * - link going up or going offline: no access, -EBUSY
 * - otherwise: the host has access and writes the CSR directly
 *
 * Return 0 on success, -EBUSY on failure.
 */
int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
{
	struct hfi1_pportdata *ppd = dd->pport;
	int rc = 0;

	if (ppd->host_link_state & HLS_UP)
		rc = write_lcb_via_8051(dd, addr, data);
	else if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
		rc = -EBUSY;
	else
		write_csr(dd, addr, data);

	return rc;
}
/* * Returns: * < 0 = Linux error, not able to get access * > 0 = 8051 command RETURN_CODE
*/ staticint do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
u64 *out_data)
{
u64 reg, completed; int return_code; unsignedlong timeout;
hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
mutex_lock(&dd->dc8051_lock);
/* We can't send any commands to the 8051 if it's in reset */ if (dd->dc_shutdown) {
return_code = -ENODEV; goto fail;
}
/* * If an 8051 host command timed out previously, then the 8051 is * stuck. * * On first timeout, attempt to reset and restart the entire DC * block (including 8051). (Is this too big of a hammer?) * * If the 8051 times out a second time, the reset did not bring it * back to healthy life. In that case, fail any subsequent commands.
*/ if (dd->dc8051_timed_out) { if (dd->dc8051_timed_out > 1) {
dd_dev_err(dd, "Previous 8051 host command timed out, skipping command %u\n",
type);
return_code = -ENXIO; goto fail;
}
_dc_shutdown(dd);
_dc_start(dd);
}
/* * If there is no timeout, then the 8051 command interface is * waiting for a command.
*/
/* * When writing a LCB CSR, out_data contains the full value to * be written, while in_data contains the relative LCB * address in 7:0. Do the work here, rather than the caller, * of distributing the write data to where it needs to go: * * Write data * 39:00 -> in_data[47:8] * 47:40 -> DC8051_CFG_EXT_DEV_0.RETURN_CODE * 63:48 -> DC8051_CFG_EXT_DEV_0.RSP_DATA
*/ if (type == HCMD_WRITE_LCB_CSR) {
in_data |= ((*out_data) & 0xffffffffffull) << 8; /* must preserve COMPLETED - it is tied to hardware */
reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_0);
reg &= DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK;
reg |= ((((*out_data) >> 40) & 0xff) <<
DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT)
| ((((*out_data) >> 48) & 0xffff) <<
DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, reg);
}
/* * Do two writes: the first to stabilize the type and req_data, the * second to activate.
*/
reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
<< DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
| (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
<< DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
/*
 * Issue an HCMD_LOAD_CONFIG_DATA host command to the 8051.
 *
 * @field_id:    configuration field id
 * @lane_id:     lane id
 * @config_data: value to load
 *
 * The three arguments are packed into one 64-bit command word at their
 * LOAD_DATA_*_SHIFT positions.
 *
 * Return the do_8051_command() result unchanged; anything other than
 * HCMD_SUCCESS is also logged.
 */
int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
		     u8 lane_id, u32 config_data)
{
	u64 frame = 0;
	int rc;

	frame |= (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT;
	frame |= (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT;
	frame |= (u64)config_data << LOAD_DATA_DATA_SHIFT;

	rc = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, frame, NULL);
	if (rc == HCMD_SUCCESS)
		return rc;

	dd_dev_err(dd, "load 8051 config: field id %d, lane %d, err %d\n",
		   (int)field_id, (int)lane_id, rc);
	return rc;
}
/* * Read the 8051 firmware "registers". Use the RAM directly. Always * set the result, even on error. * Return 0 on success, -errno on failure
*/ int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
u32 *result)
{
u64 big_data;
u32 addr; int ret;
/* no need to mask, all variable sizes match field widths */
frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT
| tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
| rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
| max_rate << MAX_RATE_SHIFT; return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
}
/*
 * Read an idle LCB message.
 *
 * @type:     message type, passed as the request data of the 8051 command
 * @data_out: receives the message; on success it is shifted down by
 *            IDLE_PAYLOAD_SHIFT so that only the payload is left
 *
 * Returns 0 on success, -EINVAL on error
 */
static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
{
	int rc = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out);

	if (rc == HCMD_SUCCESS) {
		dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__,
			    *data_out);
		/* the caller already knows the type; hand back the payload */
		*data_out >>= IDLE_PAYLOAD_SHIFT;
		return 0;
	}

	dd_dev_err(dd, "read idle message: type %d, err %d\n",
		   (u32)type, rc);
	return -EINVAL;
}
/*
 * Read an idle SMA message.  To be done in response to a notification from
 * the 8051.
 *
 * Thin wrapper: requests the IDLE_SMA message type from read_idle_message().
 *
 * Returns 0 on success, -EINVAL on error
 */
static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
{
	u64 sma_type = (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT;

	return read_idle_message(dd, sma_type, data);
}
/* * Send an idle LCB message. * * Returns 0 on success, -EINVAL on error
*/ staticint send_idle_message(struct hfi1_devdata *dd, u64 data)
{ int ret;
/* * Initialize the LCB then do a quick link up. This may or may not be * in loopback. * * return 0 on success, -errno on error
*/ staticint do_quick_linkup(struct hfi1_devdata *dd)
{ int ret;
if (!loopback) { /* * When doing quick linkup and not in loopback, both * sides must be done with LCB set-up before either * starts the quick linkup. Put a delay here so that * both sides can be started and have a chance to be * done with LCB set up before resuming.
*/
dd_dev_err(dd, "Pausing for peer to be finished with LCB set up\n");
msleep(5000);
dd_dev_err(dd, "Continuing with quick linkup\n");
}
/* * State "quick" LinkUp request sets the physical link state to * LinkUp without a verify capability sequence. * This state is in simulator v37 and later.
*/
ret = set_physical_link_state(dd, PLS_QUICK_LINKUP); if (ret != HCMD_SUCCESS) {
dd_dev_err(dd, "%s: set physical link state to quick LinkUp failed with return %d\n",
__func__, ret);
/* * Do all special steps to set up loopback.
*/ staticint init_loopback(struct hfi1_devdata *dd)
{
dd_dev_info(dd, "Entering loopback mode\n");
/* all loopbacks should disable self GUID check */
write_csr(dd, DC_DC8051_CFG_MODE,
(read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
/* * The simulator has only one loopback option - LCB. Switch * to that option, which includes quick link up. * * Accept all valid loopback values.
*/ if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) &&
(loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
loopback == LOOPBACK_CABLE)) {
loopback = LOOPBACK_LCB;
quick_linkup = 1; return 0;
}
/* * SerDes loopback init sequence is handled in set_local_link_attributes
*/ if (loopback == LOOPBACK_SERDES) return 0;
/* LCB loopback - handled at poll time */ if (loopback == LOOPBACK_LCB) {
quick_linkup = 1; /* LCB is always quick linkup */
/* not supported in emulation due to emulation RTL changes */ if (dd->icode == ICODE_FPGA_EMULATION) {
dd_dev_err(dd, "LCB loopback not supported in emulation\n"); return -EINVAL;
} return 0;
}
/* external cable loopback requires no extra steps */ if (loopback == LOOPBACK_CABLE) return 0;
/*
 * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits
 * used in the Verify Capability link width attribute.
 *
 * Every opa_link_xlate entry whose ->from bits overlap @opa_widths
 * contributes its ->to bits to the result.
 */
static u16 opa_to_vc_link_widths(u16 opa_widths)
{
	u16 vc_widths = 0;
	int n;

	for (n = 0; n < ARRAY_SIZE(opa_link_xlate); n++) {
		if (!(opa_widths & opa_link_xlate[n].from))
			continue;
		vc_widths |= opa_link_xlate[n].to;
	}

	return vc_widths;
}
/*
 * Set link attributes before moving to polling.
 *
 * Resets the fabric SerDes, then programs, in this order: the TX settings,
 * the host interface version, the local PHY, fabric and link-mode values
 * (write_vc_local_*) and the local device id.  The sequence stops at the
 * first step that reports failure.
 *
 * Returns 0 when every step succeeds.  Otherwise the return value of the
 * failing step is logged and returned unchanged.
*/ staticint set_local_link_attributes(struct hfi1_pportdata *ppd)
{ struct hfi1_devdata *dd = ppd->dd;
u8 enable_lane_tx;
u8 tx_polarity_inversion;
u8 rx_polarity_inversion; int ret;
u32 misc_bits = 0; /* reset our fabric serdes to clear any lingering problems */
fabric_serdes_reset(dd);
/* set the local tx rate - need to read-modify-write */
ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
&rx_polarity_inversion, &ppd->local_tx_rate); if (ret) goto set_local_link_attributes_fail;
/* firmware before 0.20.0 takes a single rate value, later firmware a bit mask */
if (dd->dc8051_ver < dc8051_ver(0, 20, 0)) { /* set the tx rate to the fastest enabled */ if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
ppd->local_tx_rate = 1; else
ppd->local_tx_rate = 0;
} else { /* set the tx rate to all enabled */
ppd->local_tx_rate = 0; if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
ppd->local_tx_rate |= 2; if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
ppd->local_tx_rate |= 1;
}
enable_lane_tx = 0xF; /* enable all four lanes */
/* polarity values are written back exactly as they were read above */
ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
rx_polarity_inversion, ppd->local_tx_rate); if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail;
ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); if (ret != HCMD_SUCCESS) {
dd_dev_err(dd, "Failed to set host interface version, return 0x%x\n",
ret); goto set_local_link_attributes_fail;
}
/* * DC supports continuous updates.
*/
ret = write_vc_local_phy(dd,
0 /* no power management */,
1 /* continuous updates */); if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail;
/* z=1 in the next call: AU of 0 is not supported by the hardware */
ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
ppd->port_crc_mode_enabled); if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail;
/* * SerDes loopback init sequence requires * setting bit 0 of MISC_CONFIG_BITS
*/ if (loopback == LOOPBACK_SERDES)
misc_bits |= 1 << LOOPBACK_SERDES_CONFIG_BIT_MASK_SHIFT;
/* * An external device configuration request is used to reset the LCB * to retry to obtain operational lanes when the first attempt is * unsuccessful.
*/ if (dd->dc8051_ver >= dc8051_ver(1, 25, 0))
misc_bits |= 1 << EXT_CFG_LCB_RESET_SUPPORTED_SHIFT;
ret = write_vc_local_link_mode(dd, misc_bits, 0,
opa_to_vc_link_widths(
ppd->link_width_enabled)); if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail;
/* let peer know who we are */
ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev); if (ret == HCMD_SUCCESS) return 0;
/* reached by every failing step above, and by falling through when the device id write fails */
set_local_link_attributes_fail:
dd_dev_err(dd, "Failed to set local link attributes, return 0x%x\n",
ret); return ret;
}
/* * Call this to start the link. * Do not do anything if the link is disabled. * Returns 0 if link is disabled, moved to polling, or the driver is not ready.
*/ int start_link(struct hfi1_pportdata *ppd)
{ /* * Tune the SerDes to a ballpark setting for optimal signal and bit * error rate. Needs to be done before starting the link.
*/
tune_serdes(ppd);
if (!ppd->driver_link_ready) {
dd_dev_info(ppd->dd, "%s: stopping link start because driver is not ready\n",
__func__); return 0;
}
/* * FULL_MGMT_P_KEY is cleared from the pkey table, so that the * pkey table can be configured properly if the HFI unit is connected * to switch port with MgmtAllowed=NO
*/
clear_full_mgmt_pkey(ppd);
/* * Some QSFP cables have a quirk that asserts the IntN line as a side * effect of power up on plug-in. We ignore this false positive * interrupt until the module has finished powering up by waiting for * a minimum timeout of the module inrush initialization time of * 500 ms (SFF 8679 Table 5-6) to ensure the voltage rails in the * module have stabilized.
*/
msleep(500);
/* * Check for QSFP interrupt for t_init (SFF 8679 Table 8-1)
*/
timeout = jiffies + msecs_to_jiffies(2000); while (1) {
mask = read_csr(dd, dd->hfi1_id ?
ASIC_QSFP2_IN : ASIC_QSFP1_IN); if (!(mask & QSFP_HFI0_INT_N)) break; if (time_after(jiffies, timeout)) {
dd_dev_info(dd, "%s: No IntN detected, reset complete\n",
__func__); break;
}
udelay(2);
}
}
/* * Allow INT_N to trigger the QSFP interrupt to watch * for alarms and warnings
*/
set_qsfp_int_n(ppd, 1);
/* * After the reset, AOC transmitters are enabled by default. They need * to be turned off to complete the QSFP setup before they can be * enabled again.
*/ return set_qsfp_tx(ppd, 0);
}
if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
(qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
dd_dev_err(dd, "%s: QSFP cable temperature too high\n",
__func__);
if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
(qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
dd_dev_err(dd, "%s: QSFP cable temperature too low\n",
__func__);
/* * The remaining alarms/warnings don't matter if the link is down.
*/ if (ppd->host_link_state & HLS_DOWN) return 0;
if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
(qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
dd_dev_err(dd, "%s: QSFP supply voltage too high\n",
__func__);
if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
(qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
dd_dev_err(dd, "%s: QSFP supply voltage too low\n",
__func__);
/* Byte 2 is vendor specific */
if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
(qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
dd_dev_err(dd, "%s: Cable RX channel 1/2 power too high\n",
__func__);
if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
(qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
dd_dev_err(dd, "%s: Cable RX channel 1/2 power too low\n",
__func__);
if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
(qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
dd_dev_err(dd, "%s: Cable RX channel 3/4 power too high\n",
__func__);
if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
(qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
dd_dev_err(dd, "%s: Cable RX channel 3/4 power too low\n",
__func__);
if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
(qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
dd_dev_err(dd, "%s: Cable TX channel 1/2 bias too high\n",
__func__);
if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
(qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
dd_dev_err(dd, "%s: Cable TX channel 1/2 bias too low\n",
__func__);
if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
(qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
dd_dev_err(dd, "%s: Cable TX channel 3/4 bias too high\n",
__func__);
if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
(qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
dd_dev_err(dd, "%s: Cable TX channel 3/4 bias too low\n",
__func__);
if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
(qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
dd_dev_err(dd, "%s: Cable TX channel 1/2 power too high\n",
__func__);
if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
(qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
dd_dev_err(dd, "%s: Cable TX channel 1/2 power too low\n",
__func__);
if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
(qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
dd_dev_err(dd, "%s: Cable TX channel 3/4 power too high\n",
__func__);
if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
(qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
dd_dev_err(dd, "%s: Cable TX channel 3/4 power too low\n",
__func__);
/* Bytes 9-10 and 11-12 are reserved */ /* Bytes 13-15 are vendor specific */
return 0;
}
/* This routine will only be scheduled if the QSFP module present is asserted */ void qsfp_event(struct work_struct *work)
{ struct qsfp_data *qd; struct hfi1_pportdata *ppd; struct hfi1_devdata *dd;
qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); /* Clear current status to avoid spurious interrupts */
write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR,
qsfp_mask);
write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
qsfp_mask);
set_qsfp_int_n(ppd, 0);
/* Handle active low nature of INT_N and MODPRST_N pins */ if (qsfp_mod_present(ppd))
qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
write_csr(dd,
dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
qsfp_mask);
/* Enable the appropriate QSFP IRQ source */ if (!dd->hfi1_id)
set_intr_bits(dd, QSFP1_INT, QSFP1_INT, true); else
set_intr_bits(dd, QSFP2_INT, QSFP2_INT, true);
}
/*
 * Do a one-time initialize of the LCB block.
 *
 * Nothing is written on the functional simulator, which does not correctly
 * handle LCB cclk loopback.  The DC has already been reset earlier in the
 * driver load.
 */
static void init_lcb(struct hfi1_devdata *dd)
{
	if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR) {
		/* assert the TX FIFO reset for the duration of the setup */
		write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x01);

		/* set LCB for cclk loopback on the port */
		write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0x00);
		write_csr(dd, DC_LCB_CFG_REINIT_AS_SLAVE, 0x00);
		write_csr(dd, DC_LCB_CFG_CNT_FOR_SKIP_STALL, 0x110);
		write_csr(dd, DC_LCB_CFG_CLK_CNTR, 0x08);
		write_csr(dd, DC_LCB_CFG_LOOPBACK, 0x02);

		/* setup done: release the TX FIFO reset */
		write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x00);
	}
}
/*
 * Perform a test read on the QSFP.
 *
 * Success is also reported when the port is not a QSFP, or when it is a
 * QSFP but no cable is present.  Otherwise one byte is read at offset 2,
 * the status byte; its value is not examined.
 *
 * Return 0 on success, -ERRNO on error (-EIO when the read did not
 * return exactly one byte).
 */
static int test_qsfp_read(struct hfi1_pportdata *ppd)
{
	u8 status;
	int nread;

	if (ppd->port_type == PORT_TYPE_QSFP && qsfp_mod_present(ppd)) {
		nread = one_qsfp_read(ppd, ppd->dd->hfi1_id, 2, &status, 1);
		if (nread < 0)
			return nread;
		if (nread != 1)
			return -EIO;
	}

	return 0; /* success */
}
/*
 * Values for QSFP retry.
 *
 * Give up after 10s (20 x 500ms).  The overall timeout was empirically
 * arrived at from experience on a large cluster.
 *
 * Each macro must be on its own line: a second #define on the same line
 * becomes part of the first macro's replacement text and is never defined.
 */
#define MAX_QSFP_RETRIES 20
#define QSFP_RETRY_WAIT 500 /* msec */
/*
 * Try a QSFP read.  If it fails, schedule a retry for later.
 * Called on first link activation after driver load.
 *
 * A successful read resets the retry counter and starts the link.  A
 * failed read queues start_link_work again after QSFP_RETRY_WAIT msec,
 * unless MAX_QSFP_RETRIES attempts have already been made, in which case
 * the function gives up.
 */
static void try_start_link(struct hfi1_pportdata *ppd)
{
	if (!test_qsfp_read(ppd)) {
		ppd->qsfp_retry_count = 0;
		start_link(ppd);
		return;
	}

	/* read failed */
	if (ppd->qsfp_retry_count < MAX_QSFP_RETRIES) {
		dd_dev_info(ppd->dd, "QSFP not responding, waiting and retrying %d\n",
			    (int)ppd->qsfp_retry_count);
		ppd->qsfp_retry_count++;
		queue_delayed_work(ppd->link_wq, &ppd->start_link_work,
				   msecs_to_jiffies(QSFP_RETRY_WAIT));
	} else {
		dd_dev_err(ppd->dd, "QSFP not responding, giving up\n");
	}
}
/*
 * Workqueue function to start the link after a delay.
 *
 * @work: the work member of a port's start_link_work; the owning
 *        hfi1_pportdata is recovered from it and handed to try_start_link().
 */
void handle_start_link(struct work_struct *work)
{
	try_start_link(container_of(work, struct hfi1_pportdata,
				    start_link_work.work));
}
int bringup_serdes(struct hfi1_pportdata *ppd)
{ struct hfi1_devdata *dd = ppd->dd;
u64 guid; int ret;
if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
guid = ppd->guids[HFI1_PORT_GUID_INDEX]; if (!guid) { if (dd->base_guid)
guid = dd->base_guid + ppd->port - 1;
ppd->guids[HFI1_PORT_GUID_INDEX] = guid;
}
/* Set linkinit_reason on power up per OPA spec */
ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
/* one-time init of the LCB */
init_lcb(dd);
if (loopback) {
ret = init_loopback(dd); if (ret < 0) return ret;
}
/* * Shut down the link and keep it down. First turn off that the * driver wants to allow the link to be up (driver_link_ready). * Then make sure the link is not automatically restarted * (link_enabled). Cancel any pending restart. And finally * go offline.
*/
ppd->driver_link_ready = 0;
ppd->link_enabled = 0;
ppd->qsfp_retry_count = MAX_QSFP_RETRIES; /* prevent more retries */
flush_delayed_work(&ppd->start_link_work);
cancel_delayed_work_sync(&ppd->start_link_work);
/* * index is the index into the receive array
*/ void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
u32 type, unsignedlong pa, u16 order)
{
u64 reg;
if (!(dd->flags & HFI1_PRESENT)) goto done;
if (type == PT_INVALID || type == PT_INVALID_FLUSH) {
pa = 0;
order = 0;
} elseif (type > PT_INVALID) {
dd_dev_err(dd, "unexpected receive array type %u for index %u, not handled\n",
type, index); goto done;
}
trace_hfi1_put_tid(dd, index, type, pa, order);
/*
 * Map an IB config selector to its printable name.
 *
 * Return the matching ib_cfg_name_strings entry, or "invalid" when @which
 * is negative or beyond the end of that table.
 */
static const char *ib_cfg_name(int which)
{
	if (which >= 0 && which < ARRAY_SIZE(ib_cfg_name_strings))
		return ib_cfg_name_strings[which];

	return "invalid";
}
int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
{ struct hfi1_devdata *dd = ppd->dd; int val = 0;
switch (which) { case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
val = ppd->link_width_enabled; break; case HFI1_IB_CFG_LWID: /* currently active Link-width */
val = ppd->link_width_active; break; case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
val = ppd->link_speed_enabled; break; case HFI1_IB_CFG_SPD: /* current Link speed */
val = ppd->link_speed_active; break;
case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */ case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */ case HFI1_IB_CFG_LINKLATENCY: goto unimplemented;
case HFI1_IB_CFG_OP_VLS:
val = ppd->actual_vls_operational; break; case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
val = VL_ARB_HIGH_PRIO_TABLE_SIZE; break; case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
val = VL_ARB_LOW_PRIO_TABLE_SIZE; break; case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
val = ppd->overrun_threshold; break; case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
val = ppd->phy_error_threshold; break; case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
val = HLS_DEFAULT; break;
case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */ case HFI1_IB_CFG_PMA_TICKS: default:
unimplemented: if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
dd_dev_info(
dd, "%s: which %s: not implemented\n",
__func__,
ib_cfg_name(which)); break;
}
return val;
}
/* * The largest MAD packet size.
*/ #define MAX_MAD_PACKET 2048
/*
 * lrh_max_header_bytes - largest header, in bytes, that can go on the wire
 * @dd: device data
 *
 * The count includes the ICRC, which is not part of the packet held in
 * memory but is appended by the HW.  The result depends on the device's
 * receive header entry size.  HFI allows that size to be set per receive
 * context, but the driver presently enforces a single global value.
 */
u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
{
	/*
	 * The maximum non-payload (MTU) bytes in LRH.PktLen are the
	 * Receive Header Entry Size, less the PBC (or RHF), plus one DW
	 * for the ICRC that the HW appends.
	 *
	 * The entry size of rcd[0] is in DW, hence the final shift to
	 * convert to bytes.  rcd[0] is used because every context has the
	 * same value, and the first kernel context has been allocated by
	 * now, so the value is valid.
	 */
	return (get_hdrqentsize(dd->rcd[0])
		- 2	/* PBC/RHF */
		+ 1	/* ICRC */) << 2;
}
/*
 * Set Send Length
 * @ppd: per port data
 *
 * Set the MTU by limiting how many DWs may be sent.  The SendLenCheck*
 * registers compare against LRH.PktLen, so use the max bytes included
 * in the LRH.
 *
 * This routine changes all VL values except VL15, which it maintains at
 * the same value.
 *
 * After writing SEND_LEN_CHECK0/1 it also recomputes the credit return
 * threshold of the send contexts selected for each VL (and of VL15's
 * context), and reprograms the MTU cap field of DCC_CFG_PORT_CONFIG from
 * the largest MTU found across VL15 and the supported data VLs.
 */
static void set_send_length(struct hfi1_pportdata *ppd)
{
	struct hfi1_devdata *dd = ppd->dd;
	u32 max_hb = lrh_max_header_bytes(dd), dcmtu;
	u32 maxvlmtu = dd->vld[15].mtu;
	/* len2 starts out holding the (unchanged) VL15 length field */
	u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
			      & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
		SEND_LEN_CHECK1_LEN_VL15_SHIFT;
	int i, j;
	u32 thres;

	/* VLs 0-3 are packed into len1, the remaining data VLs into len2 */
	for (i = 0; i < ppd->vls_supported; i++) {
		if (dd->vld[i].mtu > maxvlmtu)
			maxvlmtu = dd->vld[i].mtu;
		if (i <= 3)
			len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
				 & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
				((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
		else
			len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
				 & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
				((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
	}
	write_csr(dd, SEND_LEN_CHECK0, len1);
	write_csr(dd, SEND_LEN_CHECK1, len2);

	/* adjust kernel credit return thresholds based on new MTUs */
	/* all kernel receive contexts have the same hdrqentsize */
	for (i = 0; i < ppd->vls_supported; i++) {
		thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
			    sc_mtu_to_threshold(dd->vld[i].sc,
						dd->vld[i].mtu,
						get_hdrqentsize(dd->rcd[0])));
		for (j = 0; j < INIT_SC_PER_VL; j++)
			sc_set_cr_threshold(
					pio_select_send_context_vl(dd, j, i),
					    thres);
	}
	/*
	 * VL15: read the entry size through the same accessor used by the
	 * loop above and by lrh_max_header_bytes() instead of reaching
	 * into the context structure directly.
	 */
	thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
		    sc_mtu_to_threshold(dd->vld[15].sc,
					dd->vld[15].mtu,
					get_hdrqentsize(dd->rcd[0])));
	sc_set_cr_threshold(dd->vld[15].sc, thres);

	/*
	 * Adjust maximum MTU for the port in DC.  10K is not a power of
	 * two, so it has its own encoding; every other value is derived
	 * from log2 of the MTU.
	 */
	dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
		(ilog2(maxvlmtu >> 8) + 1);
	len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
	len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
	len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
		DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
	write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
}
/* * Program 0 in CSR if port lid is extended. This prevents * 9B packets being sent out for large lids.
*/
lid = (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) ? 0 : ppd->lid;
c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
| DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
c1 |= ((lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
<< DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) |
((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
<< DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
/* * Iterate over all the send contexts and set their SLID check
*/
sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
(((lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
for (i = 0; i < chip_send_contexts(dd); i++) {
hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
i, (u32)sreg);
write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
}
/* Now we have to do the same thing for the sdma engines */
sdma_update_lmc(dd, mask, lid);
}
if (completed < ARRAY_SIZE(state_completed)) return state_completed[completed];
return"unknown";
}
/*
 * Text for the reason codes listed in state_complete_reasons[] below.
 * Strings that are used by more than one code are given their own named
 * array so the text is written only once.  The table is indexed directly
 * by the numeric reason code through designated initializers; any code
 * without an initializer is left as a NULL pointer.
 */
staticconstchar all_lanes_dead_timeout_expired[] = "All lanes were inactive – was the interconnect media removed?"; staticconstchar tx_out_of_policy[] = "Passing lanes on local port do not meet the local link width policy"; staticconstchar no_state_complete[] = "State timeout occurred before link partner completed the state"; staticconstchar * const state_complete_reasons[] = {
[0x00] = "Reason unknown",
[0x01] = "Link was halted by driver, refer to LinkDownReason",
[0x02] = "Link partner reported failure",
[0x10] = "Unable to achieve frame sync on any lane",
[0x11] = "Unable to find a common bit rate with the link partner",
[0x12] = "Unable to achieve frame sync on sufficient lanes to meet the local link width policy",
[0x13] = "Unable to identify preset equalization on sufficient lanes to meet the local link width policy",
[0x14] = no_state_complete,
[0x15] = "State timeout occurred before link partner identified equalization presets",
[0x16] = "Link partner completed the EstablishComm state, but the passing lanes do not meet the local link width policy",
[0x17] = tx_out_of_policy,
[0x20] = all_lanes_dead_timeout_expired,
[0x21] = "Unable to achieve acceptable BER on sufficient lanes to meet the local link width policy",
[0x22] = no_state_complete,
[0x23] = "Link partner completed the OptimizeEq state, but the passing lanes do not meet the local link width policy",
[0x24] = tx_out_of_policy,
[0x30] = all_lanes_dead_timeout_expired,
[0x31] = "State timeout occurred waiting for host to process received frames",
[0x32] = no_state_complete,
[0x33] = "Link partner completed the VerifyCap state, but the passing lanes do not meet the local link width policy",
[0x34] = tx_out_of_policy,
[0x35] = "Negotiated link width is mutually exclusive",
[0x36] = "Timed out before receiving verifycap frames in VerifyCap.Exchange",
[0x37] = "Unable to resolve secure data exchange",
};
dd_dev_err(dd, "Last %s LNI state complete frame 0x%08x:\n",
prefix, frame);
dd_dev_err(dd, " last reported state state: %s (0x%x)\n",
state_completed_string(state), state);
dd_dev_err(dd, " state successfully completed: %s\n",
success ? "yes" : "no");
dd_dev_err(dd, " fail reason 0x%x: %s\n",
reason, state_complete_reason_code_string(ppd, reason));
dd_dev_err(dd, " passing lane mask: 0x%x", lanes);
}
/* * Read the last state complete frames and explain them. This routine * expects to be called if the link went down during link negotiation * and initialization (LNI). That is, anywhere between polling and link up.
*/ staticvoid check_lni_states(struct hfi1_pportdata *ppd)
{
u32 last_local_state;
u32 last_remote_state;
/* * Don't report anything if there is nothing to report. A value of * 0 means the link was taken down while polling and there was no * training in-process.
*/ if (last_local_state == 0 && last_remote_state == 0) return;
/*
 * Poll DC_LCB_STS_LINK_TRANSFER_ACTIVE until it reads non-zero.
 *
 * @dd: device data
 * @wait_ms: how long to keep polling, in milliseconds
 *
 * Returns 0 as soon as the CSR is non-zero.  If the deadline passes
 * first, an error is logged and -ETIMEDOUT is returned.
 */
static int wait_link_transfer_active(struct hfi1_devdata *dd, int wait_ms)
{
	unsigned long deadline = jiffies + msecs_to_jiffies(wait_ms);

	/* watch LCB_STS_LINK_TRANSFER_ACTIVE */
	for (;;) {
		if (read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE))
			return 0;

		if (time_after(jiffies, deadline)) {
			dd_dev_err(dd,
				   "timeout waiting for LINK_TRANSFER_ACTIVE\n");
			return -ETIMEDOUT;
		}
		udelay(2);
	}
}
/*
 * Called when the logical link state is not down as it should be.
 *
 * @ppd: per port data
 *
 * Writes the LCB configuration CSRs in the order shown below and then
 * logs that the logical state was forced to LINK_DOWN.  Nothing is
 * returned and no CSR is read back here.
 *
 * NOTE(review): the register order is presumably significant to the
 * hardware - confirm against the LCB documentation before reordering.
 */
staticvoid force_logical_link_state_down(struct hfi1_pportdata *ppd)
{ struct hfi1_devdata *dd = ppd->dd;
/*
 * Bring link up in LCB loopback
 */
write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 1);
write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
/*
 * Bring the link down again: reset the TX FIFOs, disallow link up and
 * clear the ignore-lost-RCLK setting written above.
 */
write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 1);
write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, 0);
write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, 0);
dd_dev_info(ppd->dd, "logical state forced to LINK_DOWN\n");
}
/*
 * Helper for set_link_state().  Do not call except from that routine.
 * Expects ppd->hls_lock (the mutex set_link_state() takes) to be held.
 *
 * @ppd: per port data
 * @rem_reason: value to be sent to the neighbor
 *
 * LinkDownReasons only set if transition succeeds.
 *
 * Returns 0 once the port is offline and the 8051 is ready for host
 * requests.  Returns -EINVAL when the offline request was not accepted,
 * or the error from whichever wait (offline substates, physical state,
 * wait_fm_ready) failed.
 *
 * NOTE(review): in the text below, 'ret' is tested and 'previous_state'
 * is consulted without a visible assignment to either, and 'rem_reason'
 * is never used.  The statements that record the previous state and
 * issue the offline request appear to be missing ahead of the first
 * 'if' - confirm against the complete file.
 */
staticint goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
{ struct hfi1_devdata *dd = ppd->dd;
u32 previous_state; int offline_state_ret; int ret;
/* NOTE(review): 'ret' has not been assigned at this point - see above */
if (ret != HCMD_SUCCESS) {
dd_dev_err(dd, "Failed to transition to Offline link state, return %d\n",
ret); return -EINVAL;
} if (ppd->offline_disabled_reason ==
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
ppd->offline_disabled_reason =
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
/* a negative result is an error; otherwise it is compared with PLS_OFFLINE_QUIET below */
offline_state_ret = wait_phys_link_offline_substates(ppd, 10000); if (offline_state_ret < 0) return offline_state_ret;
/* Disabling AOC transmitters */ if (ppd->port_type == PORT_TYPE_QSFP &&
ppd->qsfp_info.limiting_active &&
qsfp_mod_present(ppd)) { int ret;
/* this inner 'ret' shadows the function-level one for the QSFP access only */
ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT); if (ret == 0) {
set_qsfp_tx(ppd, 0);
release_chip_resource(dd, qsfp_resource(dd));
} else { /* not fatal, but should warn */
dd_dev_err(dd, "Unable to acquire lock to turn off QSFP TX\n");
}
}
/*
 * Wait for the offline.Quiet transition if it hasn't happened yet. It
 * can take a while for the link to go down.
 */ if (offline_state_ret != PLS_OFFLINE_QUIET) {
ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 30000); if (ret < 0) return ret;
}
/*
 * Now in charge of LCB - must be after the physical state is
 * offline.quiet and before host_link_state is changed.
 */
set_host_lcb_access(dd);
write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
/* make sure the logical state is also down */
ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); if (ret)
force_logical_link_state_down(ppd);
/*
 * The LNI has a mandatory wait time after the physical state
 * moves to Offline.Quiet.  The wait time may be different
 * depending on how the link went down.  The 8051 firmware
 * will observe the needed wait time and only move to ready
 * when that is completed.  The largest of the quiet timeouts
 * is 6s, so wait that long and then at least 0.5s more for
 * other transitions, and another 0.5s for a buffer.
 */
ret = wait_fm_ready(dd, 7000); if (ret) {
dd_dev_err(dd, "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n"); /* state is really offline, so make it so */
ppd->host_link_state = HLS_DN_OFFLINE; return ret;
}
/*
 * The state is now offline and the 8051 is ready to accept host
 * requests.
 *	- change our state
 *	- notify others if we were previously in a linkup state
 */
ppd->host_link_state = HLS_DN_OFFLINE; if (previous_state & HLS_UP) { /* went down while link was up */
handle_linkup_change(dd, 0);
} elseif (previous_state
& (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { /* went down while attempting link up */
check_lni_states(ppd);
/* The QSFP doesn't need to be reset on LNI failure */
ppd->qsfp_info.reset_needed = 0;
}
/* the active link width (downgrade) is 0 on link down */
ppd->link_width_active = 0;
ppd->link_width_downgrade_tx_active = 0;
ppd->link_width_downgrade_rx_active = 0;
ppd->current_egress_rate = 0; return 0;
}
/*
 * Return the printable name of a host link state.  The table is indexed
 * by the state's bit position (ilog2 of the value); positions beyond the
 * table or without an entry are reported as "unknown".
 */
static const char *link_state_name(u32 state)
{
	static const char * const hls_names[] = {
		[__HLS_UP_INIT_BP]	 = "INIT",
		[__HLS_UP_ARMED_BP]	 = "ARMED",
		[__HLS_UP_ACTIVE_BP]	 = "ACTIVE",
		[__HLS_DN_DOWNDEF_BP]	 = "DOWNDEF",
		[__HLS_DN_POLL_BP]	 = "POLL",
		[__HLS_DN_DISABLE_BP]	 = "DISABLE",
		[__HLS_DN_OFFLINE_BP]	 = "OFFLINE",
		[__HLS_VERIFY_CAP_BP]	 = "VERIFY_CAP",
		[__HLS_GOING_UP_BP]	 = "GOING_UP",
		[__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
		[__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
	};
	int bit = ilog2(state);

	if (bit < ARRAY_SIZE(hls_names) && hls_names[bit])
		return hls_names[bit];

	return "unknown";
}
/*
 * Return the link state reason name.  Only HLS_UP_INIT carries a reason:
 * the port's linkinit_reason is rendered in parentheses.  Every other
 * state, and any unrecognized reason, yields an empty string.
 */
static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
{
	if (state != HLS_UP_INIT)
		return "";

	switch (ppd->linkinit_reason) {
	case OPA_LINKINIT_REASON_LINKUP:
		return "(LINKUP)";
	case OPA_LINKINIT_REASON_FLAPPING:
		return "(FLAPPING)";
	case OPA_LINKINIT_OUTSIDE_POLICY:
		return "(OUTSIDE_POLICY)";
	case OPA_LINKINIT_QUARANTINED:
		return "(QUARANTINED)";
	case OPA_LINKINIT_INSUFIC_CAPABILITY:
		return "(INSUFIC_CAPABILITY)";
	default:
		return "";
	}
}
/* * driver_pstate - convert the driver's notion of a port's * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*). * Return -1 (converted to a u32) to indicate error.
*/
u32 driver_pstate(struct hfi1_pportdata *ppd)
{ switch (ppd->host_link_state) { case HLS_UP_INIT: case HLS_UP_ARMED: case HLS_UP_ACTIVE: return IB_PORTPHYSSTATE_LINKUP; case HLS_DN_POLL: return IB_PORTPHYSSTATE_POLLING; case HLS_DN_DISABLE: return IB_PORTPHYSSTATE_DISABLED; case HLS_DN_OFFLINE: return OPA_PORTPHYSSTATE_OFFLINE; case HLS_VERIFY_CAP: return IB_PORTPHYSSTATE_TRAINING; case HLS_GOING_UP: return IB_PORTPHYSSTATE_TRAINING; case HLS_GOING_OFFLINE: return OPA_PORTPHYSSTATE_OFFLINE; case HLS_LINK_COOLDOWN: return OPA_PORTPHYSSTATE_OFFLINE; case HLS_DN_DOWNDEF: default:
dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
ppd->host_link_state); return -1;
}
}
/*
 * driver_lstate - map the driver's host link state to a logical state
 * @ppd: per port data
 *
 * Converts ppd->host_link_state (an HLS_*) into an IB_PORT_* value.  Any
 * state with an HLS_DOWN bit set is IB_PORT_DOWN; otherwise the HLS_UP
 * bits select INIT, ARMED or ACTIVE.  Anything else is logged as invalid
 * and reported as -1 (converted to a u32).
 */
u32 driver_lstate(struct hfi1_pportdata *ppd)
{
	u32 up_bits;

	if (ppd->host_link_state && (ppd->host_link_state & HLS_DOWN))
		return IB_PORT_DOWN;

	up_bits = ppd->host_link_state & HLS_UP;
	if (up_bits == HLS_UP_INIT)
		return IB_PORT_INIT;
	if (up_bits == HLS_UP_ARMED)
		return IB_PORT_ARMED;
	if (up_bits == HLS_UP_ACTIVE)
		return IB_PORT_ACTIVE;

	dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
		   ppd->host_link_state);
	return -1;
}
/** * data_vls_operational() - Verify if data VL BCT credits and MTU * are both set. * @ppd: pointer to hfi1_pportdata structure * * Return: true - Ok, false -otherwise.
*/ staticinlinebool data_vls_operational(struct hfi1_pportdata *ppd)
{ int i;
u64 reg;
if (!ppd->actual_vls_operational) returnfalse;
for (i = 0; i < ppd->vls_supported; i++) {
reg = read_csr(ppd->dd, SEND_CM_CREDIT_VL + (8 * i)); if ((reg && !ppd->dd->vld[i].mtu) ||
(!reg && ppd->dd->vld[i].mtu)) returnfalse;
}
returntrue;
}
/*
 * Change the physical and/or logical link state.
 *
 * @ppd: per port data
 * @state: requested host link state (an HLS_* value); HLS_DN_DOWNDEF is
 *	   replaced by HLS_DEFAULT before it is acted on
 *
 * Do not call this routine while inside an interrupt.  It contains
 * calls to routines that can take multiple seconds to finish.
 *
 * The whole transition runs with ppd->hls_lock held.  When the port
 * reaches HLS_UP_ACTIVE an IB_EVENT_PORT_ACTIVE event is prepared under
 * the lock and dispatched only after the lock has been released.
 *
 * Returns 0 on success, -errno on failure.
 */
int set_link_state(struct hfi1_pportdata *ppd, u32 state)
{ struct hfi1_devdata *dd = ppd->dd; struct ib_event event = {.device = NULL}; int ret1, ret = 0; int orig_new_state, poll_bounce;
mutex_lock(&ppd->hls_lock);
/* keep the caller's value for logging; act on the default it maps to */
orig_new_state = state; if (state == HLS_DN_DOWNDEF)
state = HLS_DEFAULT;
/* interpret poll -> poll as a link bounce */
poll_bounce = ppd->host_link_state == HLS_DN_POLL &&
state == HLS_DN_POLL;
dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
link_state_name(ppd->host_link_state),
link_state_name(orig_new_state),
poll_bounce ? "(bounce) " : "",
link_state_reason_name(ppd, state));
/* * If we're going to a (HLS_*) link state that implies the logical * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then * reset is_sm_config_started to 0.
*/ if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
ppd->is_sm_config_started = 0;
/* * Do nothing if the states match. Let a poll to poll link bounce * go through.
*/ if (ppd->host_link_state == state && !poll_bounce) goto done;
switch (state) { case HLS_UP_INIT: if (ppd->host_link_state == HLS_DN_POLL &&
(quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) { /* * Quick link up jumps from polling to here. * * Whether in normal or loopback mode, the * simulator jumps from polling to link up. * Accept that here.
*/ /* OK */
} elseif (ppd->host_link_state != HLS_GOING_UP) { goto unexpected;
}
/* * Wait for Link_Up physical state. * Physical and Logical states should already be * transitioned to LinkUp and LinkInit respectively.
*/
ret = wait_physical_linkstate(ppd, PLS_LINKUP, 1000); if (ret) {
dd_dev_err(dd, "%s: physical state did not change to LINK-UP\n",
__func__); break;
}
ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000); if (ret) {
dd_dev_err(dd, "%s: logical state did not change to INIT\n",
__func__); break;
}
/* clear old transient LINKINIT_REASON code */ if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
ppd->linkinit_reason =
OPA_LINKINIT_REASON_LINKUP;
/* enable the port */
add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
/* * After link up, a new link width will have been set. * Update the xmit counters with regards to the new * link width.
*/
update_xmit_counters(ppd, ppd->link_width_active);
ppd->host_link_state = HLS_UP_INIT;
update_statusp(ppd, IB_PORT_INIT); break; case HLS_UP_ARMED: if (ppd->host_link_state != HLS_UP_INIT) goto unexpected;
/* refuse to arm unless every data VL has consistent credits and MTU */
if (!data_vls_operational(ppd)) {
dd_dev_err(dd, "%s: Invalid data VL credits or mtu\n",
__func__);
ret = -EINVAL; break;
}
set_logical_state(dd, LSTATE_ARMED);
ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000); if (ret) {
dd_dev_err(dd, "%s: logical state did not change to ARMED\n",
__func__); break;
}
ppd->host_link_state = HLS_UP_ARMED;
update_statusp(ppd, IB_PORT_ARMED); /* * The simulator does not currently implement SMA messages, * so neighbor_normal is not set. Set it here when we first * move to Armed.
*/ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
ppd->neighbor_normal = 1; break; case HLS_UP_ACTIVE: if (ppd->host_link_state != HLS_UP_ARMED) goto unexpected;
set_logical_state(dd, LSTATE_ACTIVE);
ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000); if (ret) {
dd_dev_err(dd, "%s: logical state did not change to ACTIVE\n",
__func__);
} else { /* tell all engines to go running */
sdma_all_running(dd);
ppd->host_link_state = HLS_UP_ACTIVE;
update_statusp(ppd, IB_PORT_ACTIVE);
/* Signal the IB layer that the port has gone active */
/* only filled in here; dispatched at 'done' once hls_lock is dropped */
event.device = &dd->verbs_dev.rdi.ibdev;
event.element.port_num = ppd->port;
event.event = IB_EVENT_PORT_ACTIVE;
} break; case HLS_DN_POLL: if ((ppd->host_link_state == HLS_DN_DISABLE ||
ppd->host_link_state == HLS_DN_OFFLINE) &&
dd->dc_shutdown)
dc_start(dd); /* Hand LED control to the DC */
write_csr(dd, DCC_CFG_LED_CNTRL, 0);
if (ppd->host_link_state != HLS_DN_OFFLINE) {
/* remember link_enabled so it can be restored if going offline fails */
u8 tmp = ppd->link_enabled;
ret = goto_offline(ppd, ppd->remote_link_down_reason); if (ret) {
ppd->link_enabled = tmp; break;
}
ppd->remote_link_down_reason = 0;
if (ppd->driver_link_ready)
ppd->link_enabled = 1;
}
set_all_slowpath(ppd->dd);
ret = set_local_link_attributes(ppd); if (ret) break;
ppd->port_error_action = 0;
if (quick_linkup) { /* quick linkup does not go into polling */
ret = do_quick_linkup(dd);
} else {
ret1 = set_physical_link_state(dd, PLS_POLLING); if (!ret1)
ret1 = wait_phys_link_out_of_offline(ppd,
3000); if (ret1 != HCMD_SUCCESS) {
dd_dev_err(dd, "Failed to transition to Polling link state, return 0x%x\n",
ret1);
ret = -EINVAL;
}
}
/* * Change the host link state after requesting DC8051 to * change its physical state so that we can ignore any * interrupt with stale LNI(XX) error, which will not be * cleared until DC8051 transitions to Polling state.
*/
ppd->host_link_state = HLS_DN_POLL;
ppd->offline_disabled_reason =
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); /* * If an error occurred above, go back to offline. The * caller may reschedule another attempt.
*/ if (ret)
goto_offline(ppd, 0); else
log_physical_state(ppd, PLS_POLLING); break; case HLS_DN_DISABLE: /* link is disabled */
ppd->link_enabled = 0;
/* allow any state to transition to disabled */
/* must transition to offline first */ if (ppd->host_link_state != HLS_DN_OFFLINE) {
ret = goto_offline(ppd, ppd->remote_link_down_reason); if (ret) break;
ppd->remote_link_down_reason = 0;
}
/* the physical transition is skipped when the DC is already shut down */
if (!dd->dc_shutdown) {
ret1 = set_physical_link_state(dd, PLS_DISABLED); if (ret1 != HCMD_SUCCESS) {
dd_dev_err(dd, "Failed to transition to Disabled link state, return 0x%x\n",
ret1);
ret = -EINVAL; break;
}
ret = wait_physical_linkstate(ppd, PLS_DISABLED, 10000); if (ret) {
dd_dev_err(dd, "%s: physical state did not change to DISABLED\n",
__func__); break;
}
dc_shutdown(dd);
}
ppd->host_link_state = HLS_DN_DISABLE; break; case HLS_DN_OFFLINE: if (ppd->host_link_state == HLS_DN_DISABLE)
dc_start(dd);
/* allow any state to transition to offline */
ret = goto_offline(ppd, ppd->remote_link_down_reason); if (!ret)
ppd->remote_link_down_reason = 0; break; case HLS_VERIFY_CAP: if (ppd->host_link_state != HLS_DN_POLL) goto unexpected;
ppd->host_link_state = HLS_VERIFY_CAP;
log_physical_state(ppd, PLS_CONFIGPHY_VERIFYCAP); break; case HLS_GOING_UP: if (ppd->host_link_state != HLS_VERIFY_CAP) goto unexpected;
ret1 = set_physical_link_state(dd, PLS_LINKUP); if (ret1 != HCMD_SUCCESS) {
dd_dev_err(dd, "Failed to transition to link up state, return 0x%x\n",
ret1);
ret = -EINVAL; break;
}
ppd->host_link_state = HLS_GOING_UP; break;
case HLS_GOING_OFFLINE: /* transient within goto_offline() */ case HLS_LINK_COOLDOWN: /* transient within goto_offline() */ default:
dd_dev_info(dd, "%s: state 0x%x: not supported\n",
__func__, state);
ret = -EINVAL; break;
}
goto done;
/* reached by goto when the current state may not move to the requested one */
unexpected:
dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
__func__, link_state_name(ppd->host_link_state),
link_state_name(state));
ret = -EINVAL;
done:
mutex_unlock(&ppd->hls_lock);
/* event.device is non-NULL only when the port just went active */
if (event.device)
ib_dispatch_event(&event);
return ret;
}
/*
 * Apply one per-port configuration item.
 *
 * @ppd: per port data
 * @which: HFI1_IB_CFG_* selector of the item to change
 * @val: new value; ignored by selectors that take their input from @ppd
 *
 * Returns 0 on success.  -EINVAL is returned when the link default is
 * anything other than HLS_DN_POLL, or when the operational VL count is
 * changed on a port whose port number is 0.  Selectors that are not
 * implemented return 0, after an informational message when the
 * PRINT_UNIMPL capability is set.
 */
int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
{
	int ret = 0;

	switch (which) {
	case HFI1_IB_CFG_LIDLMC:
		set_lidlmc(ppd);
		break;

	case HFI1_IB_CFG_VL_HIGH_LIMIT: {
		u64 limit_reg;

		/*
		 * The VL Arbitrator high limit is sent in units of 4k
		 * bytes, while HFI stores it in units of 64 bytes.
		 */
		val *= 4096 / 64;
		limit_reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK)
			    << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
		write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, limit_reg);
		break;
	}

	case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
		/* HFI only supports POLL as the default link down state */
		if (val != HLS_DN_POLL)
			ret = -EINVAL;
		break;

	case HFI1_IB_CFG_OP_VLS:
		if (ppd->vls_operational == val)
			break;
		ppd->vls_operational = val;
		if (!ppd->port)
			ret = -EINVAL;
		break;

	/*
	 * For link width, link width downgrade, and speed enable, always
	 * AND the setting with what is actually supported.  This has two
	 * benefits.  First, enabled can't have unsupported values, no
	 * matter what the SM or FM might want.  Second, the ALL_SUPPORTED
	 * wildcards that mean "fill in with your supported value" have
	 * all the bits in the field set, so simply ANDing with supported
	 * has the desired result.
	 */
	case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */
		ppd->link_width_enabled = val & ppd->link_width_supported;
		break;

	case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */
		ppd->link_width_downgrade_enabled =
			val & ppd->link_width_downgrade_supported;
		break;

	case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
		ppd->link_speed_enabled = val & ppd->link_speed_supported;
		break;

	case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
		/*
		 * HFI does not follow IB specs, save this value
		 * so we can report it, if asked.
		 */
		ppd->overrun_threshold = val;
		break;

	case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
		/*
		 * HFI does not follow IB specs, save this value
		 * so we can report it, if asked.
		 */
		ppd->phy_error_threshold = val;
		break;

	case HFI1_IB_CFG_MTU:
		set_send_length(ppd);
		break;

	case HFI1_IB_CFG_PKEYS:
		if (HFI1_CAP_IS_KSET(PKEY_CHECK))
			set_partition_keys(ppd);
		break;

	default:
		if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
			dd_dev_info(ppd->dd,
				    "%s: which %s, val 0x%x: not implemented\n",
				    __func__, ib_cfg_name(which), val);
		break;
	}

	return ret;
}
/* begin functions related to vl arbitration table caching */

/*
 * Prepare the per-port VL arbitration caches for use by initializing
 * the spinlock of each of the MAX_PRIO_TABLE cache entries.
 */
static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
{
	int table = 0;

	/*
	 * Note that we always return values directly from the
	 * 'vl_arb_cache' (and do no CSR reads) in response to a
	 * 'Get(VLArbTable)'.  This is obviously correct after a
	 * 'Set(VLArbTable)', since the cache will then be up to
	 * date.  But it's also correct prior to any 'Set(VLArbTable)'
	 * since then both the cache, and the relevant h/w registers
	 * will be zeroed.
	 */
	while (table < MAX_PRIO_TABLE) {
		spin_lock_init(&ppd->vl_arb_cache[table].lock);
		table++;
	}
}
/* * vl_arb_lock_cache * * All other vl_arb_* functions should be called only after locking * the cache.
*/ staticinlinestruct vl_arb_cache *
vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
{ if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE) return NULL;
spin_lock(&ppd->vl_arb_cache[idx].lock); return &ppd->vl_arb_cache[idx];
}
staticinlinevoid vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
{
spin_unlock(&ppd->vl_arb_cache[idx].lock);
}
/* end functions related to vl arbitration table caching */
staticint set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
u32 size, struct ib_vl_weight_elem *vl)
{ struct hfi1_devdata *dd = ppd->dd;
u64 reg; unsignedint i, is_up = 0; int drain, ret = 0;
mutex_lock(&ppd->hls_lock);
if (ppd->host_link_state & HLS_UP)
is_up = 1;
drain = !is_ax(dd) && is_up;
if (drain) /* * Before adjusting VL arbitration weights, empty per-VL * FIFOs, otherwise a packet whose VL weight is being * set to 0 could get stuck in a FIFO with no chance to * egress.
*/
ret = stop_drain_data_vls(dd);
if (ret) {
dd_dev_err(
dd, "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
__func__); goto err;
}
for (i = 0; i < size; i++, vl++) { /* * NOTE: The low priority shift and mask are used here, but * they are the same for both the low and high registers.
*/
reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK)
<< SEND_LOW_PRIORITY_LIST_VL_SHIFT)
| (((u64)vl->weight
& SEND_LOW_PRIORITY_LIST_WEIGHT_MASK)
<< SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT);
write_csr(dd, target + (i * 8), reg);
}
pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
if (drain)
open_fill_data_vls(dd); /* reopen all VLs */
if (reg == 0) return; /* success */ if (time_after(jiffies, timeout)) break; /* timed out */
udelay(1);
}
dd_dev_err(dd, "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
which, VL_STATUS_CLEAR_TIMEOUT, mask, reg); /* * If this occurs, it is likely there was a credit loss on the link. * The only recovery from that is a link bounce.
*/
dd_dev_err(dd, "Continuing anyway. A credit loss may occur. Suggest a link bounce\n");
}
/*
 * set_buffer_control() - apply a new set of per-VL credit limits
 * @ppd: port whose limits are being changed
 * @new_bc: requested limits; the 16-bit fields are read through
 *	be16_to_cpu().  Entries for VLs rejected by valid_vl() are passed
 *	to nonzero_msg() and then zeroed in place, so the caller's
 *	structure is modified.
 *
 * The number of credits on the VLs may be changed while everything
 * is "live", but the following algorithm must be followed due to
 * how the hardware is actually implemented. In particular,
 * Return_Credit_Status[] is the only correct status check.
 *
 * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
 *     set Global_Shared_Credit_Limit = 0
 *     use_all_vl = 1
 * mask0 = all VLs that are changing either dedicated or shared limits
 * set Shared_Limit[mask0] = 0
 * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
 * if (changing any dedicated limit)
 *     mask1 = all VLs that are lowering dedicated limits
 *     lower Dedicated_Limit[mask1]
 *     spin until Return_Credit_Status[mask1] == 0
 *     raise Dedicated_Limits
 * raise Shared_Limits
 * raise Global_Shared_Credit_Limit
 *
 * lower = if the new limit is lower, set the limit to the new value
 * raise = if the new limit is higher than the current value (may be changed
 *	earlier in the algorithm), set the new limit to the new value
 *
 * Returns 0 on success, otherwise the non-zero value returned by
 * sdma_map_init() or pio_map_init().
 *
 * NOTE(review): the body below does not match the algorithm above.  It
 * never zeroes the shared limits, never lowers a dedicated limit and never
 * waits on Return_Credit_Status; use_all_mask is never assigned; all_mask,
 * changing_mask, ld_mask, changing[], lowering_dedicated[] and
 * any_shared_limit_changing are set up but never read.  The closing brace
 * after the "raise all dedicated" loop also ends the function before the
 * remaining statements.  Text appears to be missing from this copy -
 * compare against a pristine source before relying on it.
 */ int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *new_bc)
{ struct hfi1_devdata *dd = ppd->dd;
u64 changing_mask, ld_mask, stat_mask; int change_count; int i, use_all_mask; int this_shared_changing; int vl_count = 0, ret; /*
 * A0: add the variable any_shared_limit_changing below and in the
 * algorithm above. If removing A0 support, it can be removed.
 */ int any_shared_limit_changing; struct buffer_control cur_bc;
u8 changing[OPA_MAX_VLS];
u8 lowering_dedicated[OPA_MAX_VLS];
u16 cur_total;
u32 new_total = 0; const u64 all_mask =
SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
| SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
| SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
| SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
| SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
| SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
| SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
| SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
| SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15) #define NUM_USABLE_VLS 16 /* look at VL15 and less */
/* find the new total credits, do sanity check on unused VLs */ for (i = 0; i < OPA_MAX_VLS; i++) { if (valid_vl(i)) {
new_total += be16_to_cpu(new_bc->vl[i].dedicated); continue;
}
nonzero_msg(dd, i, "dedicated",
be16_to_cpu(new_bc->vl[i].dedicated));
nonzero_msg(dd, i, "shared",
be16_to_cpu(new_bc->vl[i].shared));
new_bc->vl[i].dedicated = 0;
new_bc->vl[i].shared = 0;
}
new_total += be16_to_cpu(new_bc->overall_shared_limit);
/* fetch the current values */
get_buffer_control(dd, &cur_bc, &cur_total);
/*
 * Create the masks we will use.
 */
memset(changing, 0, sizeof(changing));
memset(lowering_dedicated, 0, sizeof(lowering_dedicated)); /*
 * NOTE: Assumes that the individual VL bits are adjacent and in
 * increasing order, because stat_mask is advanced with a one-bit left
 * shift per loop iteration.
 */
stat_mask =
SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
changing_mask = 0;
ld_mask = 0;
change_count = 0;
any_shared_limit_changing = 0; for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) { if (!valid_vl(i)) continue;
this_shared_changing = new_bc->vl[i].shared
!= cur_bc.vl[i].shared; if (this_shared_changing)
any_shared_limit_changing = 1; if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated ||
this_shared_changing) {
changing[i] = 1;
changing_mask |= stat_mask;
change_count++;
} if (be16_to_cpu(new_bc->vl[i].dedicated) <
be16_to_cpu(cur_bc.vl[i].dedicated)) {
lowering_dedicated[i] = 1;
ld_mask |= stat_mask;
}
}
/* bracket the credit change with a total adjustment: grow the global limit before raising anything */ if (new_total > cur_total)
set_global_limit(dd, new_total);
/* now raise all dedicated that are going up */ for (i = 0; i < NUM_USABLE_VLS; i++) { if (!valid_vl(i)) continue;
if (be16_to_cpu(new_bc->vl[i].dedicated) >
be16_to_cpu(cur_bc.vl[i].dedicated))
set_vl_dedicated(dd, i,
be16_to_cpu(new_bc->
vl[i].dedicated));
}
}
/* next raise all shared that are going up */ for (i = 0; i < NUM_USABLE_VLS; i++) { if (!valid_vl(i)) continue;
if (be16_to_cpu(new_bc->vl[i].shared) >
be16_to_cpu(cur_bc.vl[i].shared))
set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
}
/* finally raise the global shared */ if (be16_to_cpu(new_bc->overall_shared_limit) >
be16_to_cpu(cur_bc.overall_shared_limit))
set_global_shared(dd,
be16_to_cpu(new_bc->overall_shared_limit));
/* bracket the credit change with a total adjustment: shrink the global limit only after everything else is done */ if (new_total < cur_total)
set_global_limit(dd, new_total);
/*
 * Determine the actual number of operational VLS using the number of
 * dedicated and shared credits for each VL.  A data VL counts when
 * either limit is non-zero; when no VL counts, the SDMA and PIO maps
 * are built with ppd->vls_operational instead.
 */ if (change_count > 0) { for (i = 0; i < TXE_NUM_DATA_VL; i++) if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 ||
be16_to_cpu(new_bc->vl[i].shared) > 0)
vl_count++;
ppd->actual_vls_operational = vl_count;
ret = sdma_map_init(dd, ppd->port - 1, vl_count ?
ppd->actual_vls_operational :
ppd->vls_operational,
NULL); if (ret == 0)
ret = pio_map_init(dd, ppd->port - 1, vl_count ?
ppd->actual_vls_operational :
ppd->vls_operational, NULL); if (ret) return ret;
} return 0;
}
/*
 * Read the given fabric manager table into the caller's buffer.
 *
 * @ppd: port to read the table from
 * @which: FM_TBL_* selector
 * @t: destination buffer, handed to the per-table getter
 *
 * Return the size of the table (in bytes) on success, and -EINVAL when
 * @which is not a known table.  For FM_TBL_VL_PREEMPT_MATRIX only the
 * size is reported; @t is not written.
 */
int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
{
	struct vl_arb_cache *cache;
	int prio;

	switch (which) {
	case FM_TBL_VL_HIGH_ARB:
	case FM_TBL_VL_LOW_ARB:
		prio = (which == FM_TBL_VL_HIGH_ARB) ? HI_PRIO_TABLE :
						       LO_PRIO_TABLE;
		cache = vl_arb_lock_cache(ppd, prio);
		vl_arb_get_cache(cache, t);
		vl_arb_unlock_cache(ppd, prio);
		/*
		 * OPA specifies 128 elements (of 2 bytes each), though
		 * HFI supports only 16 elements in h/w.
		 */
		return 256;

	case FM_TBL_BUFFER_CONTROL:
		return get_buffer_control(ppd->dd, t, NULL);

	case FM_TBL_SC2VLNT:
		return get_sc2vlnt(ppd->dd, t);

	case FM_TBL_VL_PREEMPT_ELEMS:
		get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
		/* OPA specifies 128 elements, of 2 bytes each */
		return 256;

	case FM_TBL_VL_PREEMPT_MATRIX:
		/*
		 * OPA specifies that this is the same size as the VL
		 * arbitration tables (i.e., 256 bytes).
		 */
		return 256;

	default:
		return -EINVAL;
	}
}
/*
 * Write the given fabric manager table.
 *
 * @ppd: port to update
 * @which: FM_TBL_* selector
 * @t: new table contents, handed to the per-table setter
 *
 * For the two VL arbitration tables the cached copy is compared first;
 * when it already matches @t nothing is written and 0 is returned.
 *
 * Return 0 on success, the result of set_vl_weights() or
 * set_buffer_control() for those tables, or -EINVAL for an unknown table.
 */
int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
{
	struct vl_arb_cache *cache;
	int unchanged;

	switch (which) {
	case FM_TBL_VL_HIGH_ARB:
		cache = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
		unchanged = vl_arb_match_cache(cache, t) ? 1 : 0;
		if (!unchanged)
			vl_arb_set_cache(cache, t);
		vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
		if (unchanged)
			return 0;
		/* the hardware is written after the cache lock is dropped */
		return set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST,
				      VL_ARB_HIGH_PRIO_TABLE_SIZE, t);

	case FM_TBL_VL_LOW_ARB:
		cache = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
		unchanged = vl_arb_match_cache(cache, t) ? 1 : 0;
		if (!unchanged)
			vl_arb_set_cache(cache, t);
		vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
		if (unchanged)
			return 0;
		return set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
				      VL_ARB_LOW_PRIO_TABLE_SIZE, t);

	case FM_TBL_BUFFER_CONTROL:
		return set_buffer_control(ppd, t);

	case FM_TBL_SC2VLNT:
		set_sc2vlnt(ppd->dd, t);
		return 0;

	default:
		return -EINVAL;
	}
}
/*
 * Disable all data VLs.
 *
 * @dd: device data
 *
 * Nothing is done when is_ax(dd) is true; otherwise
 * PSC_DATA_VL_DISABLE is sent through pio_send_control().
 *
 * Return 0 if disabled, non-zero if the VLs cannot be disabled.
 */
static int disable_data_vls(struct hfi1_devdata *dd)
{
	if (is_ax(dd))
		return 1;

	pio_send_control(dd, PSC_DATA_VL_DISABLE);

	return 0;
}
/*
 * open_fill_data_vls() - the counterpart to stop_drain_data_vls().
 * Just re-enables all data VLs (the "fill" part happens
 * automatically - the name was chosen for symmetry with
 * stop_drain_data_vls()).
 *
 * @dd: device data
 *
 * Return 0 if successful, non-zero if the VLs cannot be enabled
 * (which is the case whenever is_ax(dd) is true).
 */
int open_fill_data_vls(struct hfi1_devdata *dd)
{
	int rc = 1;	/* until proven otherwise, the VLs stay closed */

	if (!is_ax(dd)) {
		pio_send_control(dd, PSC_DATA_VL_ENABLE);
		rc = 0;
	}

	return rc;
}
/*
 * drain_data_vls() - assumes that disable_data_vls() has been called,
 * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA
 * engines to drop to 0.
 *
 * @dd: device data
 *
 * Calls sc_wait(), sdma_wait() and pause_for_credit_return() in that
 * order.
 */
static void drain_data_vls(struct hfi1_devdata *dd)
{
	sc_wait(dd);
	sdma_wait(dd);
	pause_for_credit_return(dd);
}
/*
 * stop_drain_data_vls() - disable, then drain all per-VL fifos.
 *
 * @dd: device data
 *
 * Use open_fill_data_vls() to resume using data VLs.  This pair is
 * meant to be used like this:
 *
 * stop_drain_data_vls(dd);
 * // do things with per-VL resources
 * open_fill_data_vls(dd);
 *
 * Return the result of disable_data_vls(); the drain step only runs
 * when that result is 0.
 */
int stop_drain_data_vls(struct hfi1_devdata *dd)
{
	int rc = disable_data_vls(dd);

	if (rc)
		return rc;

	drain_data_vls(dd);
	return 0;
}
/*
 * Convert a nanosecond time to a cclock count.  No matter how slow
 * the cclock, a non-zero ns will always have a non-zero result.
 *
 * @dd: device data; dd->icode selects the FPGA or ASIC cclock period
 * @ns: time in nanoseconds
 *
 * The nanosecond-to-picosecond scaling is done in 64 bits.  A 32-bit
 * "ns * 1000" wraps once ns exceeds 4294967 (about 4.29 ms) and would
 * silently produce a far too small count.  The quotient is stored back
 * into a u32, as the return type requires.
 *
 * NOTE(review): this uses a plain 64-bit division - confirm the driver
 * is only built for 64-bit targets, otherwise a div_u64()-style helper
 * is needed.
 */
u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
{
	u32 cclocks;

	if (dd->icode == ICODE_FPGA_EMULATION)
		cclocks = (ns * 1000ULL) / FPGA_CCLOCK_PS;
	else  /* simulation pretends to be ASIC */
		cclocks = (ns * 1000ULL) / ASIC_CCLOCK_PS;

	if (ns && !cclocks) /* if ns nonzero, must be at least 1 */
		cclocks = 1;

	return cclocks;
}
/*
 * Convert a cclock count to nanoseconds.  No matter how slow
 * the cclock, a non-zero cclocks will always have a non-zero result.
 *
 * @dd: device data; dd->icode selects the FPGA or ASIC cclock period
 * @cclocks: number of cclock ticks
 *
 * The tick-to-picosecond product is formed in 64 bits; as a 32-bit
 * product it wraps as soon as cclocks * period exceeds 2^32 - 1 and
 * would return a bogus time.  The quotient is stored back into a u32,
 * as the return type requires.
 *
 * NOTE(review): this uses a plain 64-bit division - confirm the driver
 * is only built for 64-bit targets, otherwise a div_u64()-style helper
 * is needed.
 */
u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
{
	u32 ns;

	if (dd->icode == ICODE_FPGA_EMULATION)
		ns = ((unsigned long long)cclocks * FPGA_CCLOCK_PS) / 1000;
	else  /* simulation pretends to be ASIC */
		ns = ((unsigned long long)cclocks * ASIC_CCLOCK_PS) / 1000;

	if (cclocks && !ns) /* if cclocks nonzero, must be at least 1 */
		ns = 1;

	return ns;
}
/*
 * Dynamically adjust the receive interrupt timeout for a context based on
 * incoming packet rate.
 *
 * @rcd: receive context whose timeout is adjusted
 * @npkts: number of packets received in this interrupt
 *
 * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
 */
static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
{
	struct hfi1_devdata *dd = rcd->dd;
	u32 timeout = rcd->rcvavail_timeout;

	/*
	 * This algorithm doubles or halves the timeout depending on whether
	 * the number of packets received in this interrupt was less than, or
	 * greater than or equal to, the interrupt count.
	 *
	 * The calculations below do not allow a steady state to be achieved.
	 * Only at the endpoints is it possible to have an unchanging
	 * timeout.
	 */
	if (npkts < rcv_intr_count) {
		/*
		 * Not enough packets arrived before the timeout, adjust
		 * timeout downward.
		 */
		if (timeout < 2) /* already at minimum? */
			return;
		timeout >>= 1;
	} else {
		/*
		 * More than enough packets arrived before the timeout, adjust
		 * timeout upward.
		 */
		if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */
			return;
		timeout = min(timeout << 1, dd->rcv_intr_timeout_csr);
	}

	rcd->rcvavail_timeout = timeout;
	/*
	 * timeout cannot be larger than rcv_intr_timeout_csr which has already
	 * been verified to be in range
	 */
	write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
			(u64)timeout <<
			RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
}
/* * Need to write timeout register before updating RcvHdrHead to ensure * that a new value is used when the HW decides to restart counting.
*/ if (intr_adjust)
adjust_rcv_timeout(rcd, npkts); if (updegr) {
reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
<< RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
}
reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
(((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
<< RCV_HDR_HEAD_HEAD_SHIFT);
write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
}
/*
 * Context Control and Receive Array encoding for buffer size:
 *	0x0 invalid
 *	0x1   4 KB
 *	0x2   8 KB
 *	0x3  16 KB
 *	0x4  32 KB
 *	0x5  64 KB
 *	0x6 128 KB
 *	0x7 256 KB
 *	0x8 512 KB (Receive Array only)
 *	0x9   1 MB (Receive Array only)
 *	0xa   2 MB (Receive Array only)
 *
 *	0xB-0xF - reserved (Receive Array only)
 *
 * Each code stands for twice the size of the previous one, so the table
 * is walked by doubling a candidate size starting at 4 KB.
 *
 * This routine assumes that the value has already been sanity checked;
 * any size that is not in the table is encoded as the 4 KB minimum.
 */
static u32 encoded_size(u32 size)
{
	u32 candidate = 4 * 1024;
	u32 code;

	for (code = 0x1; code <= 0xa; code++) {
		if (size == candidate)
			return code;
		candidate <<= 1;
	}

	return 0x1; /* if invalid, go with the minimum size */
}
/**
 * encode_rcv_header_entry_size - return chip specific encoding for size
 * @size: size in dwords
 *
 * Convert a receive header entry size to the encoding used in the CSR.
 * There are only 3 valid receive header entry sizes: 2, 16 and 32.
 *
 * Return a zero if the given size is invalid, otherwise the encoding.
 */
u8 encode_rcv_header_entry_size(u8 size)
{
	u8 encoding;

	switch (size) {
	case 2:
		encoding = 1;
		break;
	case 16:
		encoding = 2;
		break;
	case 32:
		encoding = 4;
		break;
	default:
		encoding = 0; /* invalid */
		break;
	}

	return encoding;
}
/**
 * hfi1_validate_rcvhdrcnt - validate hdrcnt
 * @dd: the device data
 * @thecnt: the header count
 *
 * A count is rejected when it is less than or equal to
 * HFI1_MIN_HDRQ_EGRBUF_CNT, greater than HFI1_MAX_HDRQ_EGRBUF_CNT, or not
 * a multiple of HDRQ_INCREMENT.  Each rejection is logged with
 * dd_dev_err().
 *
 * Return: 0 when @thecnt is acceptable, -EINVAL otherwise.
 */
int hfi1_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt)
{
	if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
		dd_dev_err(dd, "Receive header queue count too small\n");
		return -EINVAL;
	}

	if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
		dd_dev_err(dd,
			   "Receive header queue count cannot be greater than %u\n",
			   HFI1_MAX_HDRQ_EGRBUF_CNT);
		return -EINVAL;
	}

	if (thecnt % HDRQ_INCREMENT) {
		/* thecnt is unsigned, so it is printed with %u rather than %d */
		dd_dev_err(dd, "Receive header queue count %u must be divisible by %lu\n",
			   thecnt, HDRQ_INCREMENT);
		return -EINVAL;
	}

	return 0;
}
/**
 * set_hdrq_regs - set header queue registers for context
 * @dd: the device data
 * @ctxt: the context
 * @entsize: the dword entry size
 * @hdrcnt: the number of header entries
 *
 * NOTE(review): as it stands here the body only programs
 * RCV_HDR_TAIL_ADDR; @entsize, @hdrcnt and the local reg are never used.
 * Statements appear to be missing from this copy - compare against a
 * pristine source.
 */ void set_hdrq_regs(struct hfi1_devdata *dd, u8 ctxt, u8 entsize, u16 hdrcnt)
{
u64 reg;
/*
 * Program dummy tail address for every receive context
 * before enabling any receive context
 */
write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
dd->rcvhdrtail_dummy_dma);
}
rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL); /* if the context already enabled, don't do the extra steps */ if ((op & HFI1_RCVCTRL_CTXT_ENB) &&
!(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) { /* reset the tail and hdr addresses, and sequence count */
write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
rcd->rcvhdrq_dma); if (hfi1_rcvhdrtail_kvaddr(rcd))
write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
rcd->rcvhdrqtailaddr_dma);
hfi1_set_seq_cnt(rcd, 1);
/* reset the cached receive header queue head value */
hfi1_set_rcd_head(rcd, 0);
/* * Zero the receive header queue so we don't get false * positives when checking the sequence number. The * sequence numbers could land exactly on the same spot. * E.g. a rcd restart before the receive header wrapped.
*/
memset(rcd->rcvhdrq, 0, rcvhdrq_size(rcd));
/* enable the context */
rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
/* clean the egr buffer size first */
rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
& RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
<< RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
/* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
did_enable = 1;
/* zero RcvEgrIndexHead */
write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
/* set eager count and base index */
reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
& RCV_EGR_CTRL_EGR_CNT_MASK)
<< RCV_EGR_CTRL_EGR_CNT_SHIFT) |
(((rcd->eager_base >> RCV_SHIFT)
& RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
<< RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
/* * Set TID (expected) count and base index. * rcd->expected_count is set to individual RcvArray entries, * not pairs, and the CSR takes a pair-count in groups of * four, so divide by 8.
*/
reg = (((rcd->expected_count >> RCV_SHIFT)
& RCV_TID_CTRL_TID_PAIR_CNT_MASK)
<< RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
(((rcd->expected_base >> RCV_SHIFT)
& RCV_TID_CTRL_TID_BASE_INDEX_MASK)
<< RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg); if (ctxt == HFI1_CTRL_CTXT)
write_csr(dd, RCV_VL15, HFI1_CTRL_CTXT);
} if (op & HFI1_RCVCTRL_CTXT_DIS) {
write_csr(dd, RCV_VL15, 0); /* * When receive context is being disabled turn on tail * update with a dummy tail address and then disable * receive context.
*/ if (dd->rcvhdrtail_dummy_dma) {
write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
dd->rcvhdrtail_dummy_dma); /* Enabling RcvCtxtCtrl.TailUpd is intentional. */
rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
}
rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
} if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) {
set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt,
IS_RCVAVAIL_START + rcd->ctxt, true);
rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
} if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) {
set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt,
IS_RCVAVAIL_START + rcd->ctxt, false);
rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
} if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && hfi1_rcvhdrtail_kvaddr(rcd))
rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; if (op & HFI1_RCVCTRL_TAILUPD_DIS) { /* See comment on RcvCtxtCtrl.TailUpd above */ if (!(op & HFI1_RCVCTRL_CTXT_DIS))
rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
} if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) { /* * In one-packet-per-eager mode, the size comes from * the RcvArray entry.
*/
rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
} if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK; if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK; if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK; if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; if (op & HFI1_RCVCTRL_URGENT_ENB)
set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
IS_RCVURGENT_START + rcd->ctxt, true); if (op & HFI1_RCVCTRL_URGENT_DIS)
set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
IS_RCVURGENT_START + rcd->ctxt, false);
if (did_enable) { /* * The interrupt timeout and count must be set after * the context is enabled to take effect.
*/ /* set interrupt timeout */
write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
(u64)rcd->rcvavail_timeout <<
RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
/* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
}
if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS)) /* * If the context has been disabled and the Tail Update has * been cleared, set the RCV_HDR_TAIL_ADDR CSR to dummy address * so it doesn't contain an address that is invalid.
*/
write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
dd->rcvhdrtail_dummy_dma);
}
/*
 * hfi1_read_cntrs() - hand out the device counter names or values
 * @dd: device data
 * @namep: when non-NULL, receives dd->cntrnames and no counter is read
 * @cntrp: when @namep is NULL, receives dd->cntrs, which is then refreshed
 *
 * With @namep NULL, every dev_cntrs[] entry that is not flagged
 * CNTR_DISABLED is read through its rw_cntr() hook in CNTR_MODE_R and
 * stored in dd->cntrs starting at entry->offset.  CNTR_VL entries take
 * C_VL_COUNT consecutive slots, CNTR_SDMA entries one slot per SDMA
 * engine, all others a single slot.
 *
 * Return: dd->cntrnameslen when names were requested, otherwise
 * dd->ndevcntrs * sizeof(u64).
 */
u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp)
{
	const struct cntr_entry *entry;
	int ret;
	u64 val;
	int i, j;

	if (namep) {
		ret = dd->cntrnameslen;
		*namep = dd->cntrnames;
		return ret;
	}

	ret = (dd->ndevcntrs) * sizeof(u64);

	/* Get the start of the block of counters */
	*cntrp = dd->cntrs;

	/* Now go and fill in each counter in the block. */
	for (i = 0; i < DEV_CNTR_LAST; i++) {
		entry = &dev_cntrs[i];
		hfi1_cdbg(CNTR, "reading %s", entry->name);

		if (entry->flags & CNTR_DISABLED) {
			/* Nothing */
			hfi1_cdbg(CNTR, "\tDisabled");
			continue;
		}

		if (entry->flags & CNTR_VL) {
			hfi1_cdbg(CNTR, "\tPer VL");
			for (j = 0; j < C_VL_COUNT; j++) {
				val = entry->rw_cntr(entry, dd, j,
						     CNTR_MODE_R, 0);
				hfi1_cdbg(CNTR, "\t\tRead 0x%llx for %d",
					  val, j);
				dd->cntrs[entry->offset + j] = val;
			}
		} else if (entry->flags & CNTR_SDMA) {
			hfi1_cdbg(CNTR, "\t Per SDMA Engine");
			for (j = 0; j < chip_sdma_engines(dd); j++) {
				val = entry->rw_cntr(entry, dd, j,
						     CNTR_MODE_R, 0);
				hfi1_cdbg(CNTR, "\t\tRead 0x%llx for %d",
					  val, j);
				dd->cntrs[entry->offset + j] = val;
			}
		} else {
			val = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
					     CNTR_MODE_R, 0);
			dd->cntrs[entry->offset] = val;
			hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
		}
	}

	return ret;
}
/* * Used by sysfs to create files for hfi stats to read
*/
u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp)
{ int ret;
u64 val = 0;
if (namep) {
ret = ppd->dd->portcntrnameslen;
*namep = ppd->dd->portcntrnames;
} else { conststruct cntr_entry *entry; int i, j;
ret = ppd->dd->nportcntrs * sizeof(u64);
*cntrp = ppd->cntrs;
for (i = 0; i < PORT_CNTR_LAST; i++) {
entry = &port_cntrs[i];
hfi1_cdbg(CNTR, "reading %s", entry->name); if (entry->flags & CNTR_DISABLED) { /* Nothing */
hfi1_cdbg(CNTR, "\tDisabled"); continue;
}
val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
/* If its a synthetic counter there is more work we need to do */ if (entry->flags & CNTR_SYNTH) { if (sval == CNTR_MAX) { /* No need to read already saturated */ return CNTR_MAX;
}
if (entry->flags & CNTR_32BIT) { /* 32bit counters can wrap multiple times */
u64 upper = sval >> 32;
u64 lower = (sval << 32) >> 32;
if (lower > val) { /* hw wrapped */ if (upper == CNTR_32BIT_MAX)
val = CNTR_MAX; else
upper++;
}
if (val != CNTR_MAX)
val = (upper << 32) | val;
} else { /* If we rolled we are saturated */ if ((val < sval) || (val > CNTR_MAX))
val = CNTR_MAX;
}
}
if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
(index <= C_RCV_HDR_OVF_LAST)) { /* We do not want to bother for disabled contexts */ return 0;
}
if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
(index <= C_RCV_HDR_OVF_LAST)) { /* We do not want to bother for disabled contexts */ return 0;
}
/* * Rather than keep beating on the CSRs pick a minimal set that we can * check to watch for potential roll over. We can do this by looking at * the number of flits sent/recv. If the total flits exceeds 32bits then * we have to iterate all the counters and update.
*/
entry = &dev_cntrs[C_DC_RCV_FLITS];
cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) { /* * May not be strictly necessary to update but it won't hurt and * simplifies the logic here.
*/
update = 1;
hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
dd->unit);
} else {
total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx);
hfi1_cdbg(CNTR, "[%d] total flits 0x%llx limit 0x%llx", dd->unit,
total_flits, (u64)CNTR_32BIT_MAX); if (total_flits >= CNTR_32BIT_MAX) {
hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
dd->unit);
update = 1;
}
}
if (update) {
hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit); for (i = 0; i < DEV_CNTR_LAST; i++) {
entry = &dev_cntrs[i]; if (entry->flags & CNTR_VL) { for (vl = 0; vl < C_VL_COUNT; vl++)
read_dev_cntr(dd, i, vl);
} else {
read_dev_cntr(dd, i, CNTR_INVALID_VL);
}
}
ppd = (struct hfi1_pportdata *)(dd + 1); for (i = 0; i < dd->num_pports; i++, ppd++) { for (j = 0; j < PORT_CNTR_LAST; j++) {
entry = &port_cntrs[j]; if (entry->flags & CNTR_VL) { for (vl = 0; vl < C_VL_COUNT; vl++)
read_port_cntr(ppd, j, vl);
} else {
read_port_cntr(ppd, j, CNTR_INVALID_VL);
}
}
}
/* * We want the value in the register. The goal is to keep track * of the number of "ticks" not the counter value. In other * words if the register rolls we want to notice it and go ahead * and force an update.
*/
entry = &dev_cntrs[C_DC_XMIT_FLITS];
dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
CNTR_MODE_R, 0);
/* Counter is 32 bits */ if (dev_cntrs[i].flags & CNTR_32BIT) {
memcpy(p, bit_type_32, bit_type_32_sz);
p += bit_type_32_sz;
}
*p++ = '\n';
}
}
/*********************/ /* per port counters */ /*********************/
/* * Go through the counters for the overflows and disable the ones we * don't need. This varies based on platform so we need to do it * dynamically here.
*/
rcv_ctxts = dd->num_rcv_contexts; for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
i <= C_RCV_HDR_OVF_LAST; i++) {
port_cntrs[i].flags |= CNTR_DISABLED;
}
/* size port counter names and determine how many we have*/
sz = 0;
dd->nportcntrs = 0; for (i = 0; i < PORT_CNTR_LAST; i++) { if (port_cntrs[i].flags & CNTR_DISABLED) {
hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name); continue;
}
/* Counter is 32 bits */ if (port_cntrs[i].flags & CNTR_32BIT) {
memcpy(p, bit_type_32, bit_type_32_sz);
p += bit_type_32_sz;
}
*p++ = '\n';
}
}
/* allocate per port storage for counter values */
ppd = (struct hfi1_pportdata *)(dd + 1); for (i = 0; i < dd->num_pports; i++, ppd++) {
ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL); if (!ppd->cntrs) goto bail;
ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL); if (!ppd->scntrs) goto bail;
}
/* CPU counters need to be allocated and zeroed */ if (init_cpu_counters(dd)) goto bail;
dd->update_cntr_wq = alloc_ordered_workqueue("hfi1_update_cntr_%d",
WQ_MEM_RECLAIM, dd->unit); if (!dd->update_cntr_wq) goto bail;
/*
 * Translate a chip logical link state (LSTATE_*) into the matching
 * IB_PORT_* value.
 *
 * @dd: device data, used only for error reporting
 * @chip_lstate: logical state as reported by the chip
 *
 * An unrecognised state is logged and reported as IB_PORT_DOWN.
 */
static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
{
	u32 ib_state = IB_PORT_DOWN;

	switch (chip_lstate) {
	case LSTATE_DOWN:
		break;
	case LSTATE_INIT:
		ib_state = IB_PORT_INIT;
		break;
	case LSTATE_ARMED:
		ib_state = IB_PORT_ARMED;
		break;
	case LSTATE_ACTIVE:
		ib_state = IB_PORT_ACTIVE;
		break;
	default:
		dd_dev_err(dd,
			   "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
			   chip_lstate);
		break;
	}

	return ib_state;
}
/*
 * Translate a chip physical link state into the matching
 * IB/OPA port physical state value.
 *
 * @dd: device data, used only for error reporting
 * @chip_pstate: physical state as reported by the chip
 *
 * Only the HFI meta-state (bits 7:4 of @chip_pstate) is examined.  An
 * unexpected meta-state is logged and reported as
 * IB_PORTPHYSSTATE_DISABLED.
 */
u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
{
	/* look at the HFI meta-states only */
	const u32 meta_state = chip_pstate & 0xf0;
	u32 phys_state = IB_PORTPHYSSTATE_DISABLED;

	switch (meta_state) {
	case PLS_DISABLED:
		break;
	case PLS_OFFLINE:
		phys_state = OPA_PORTPHYSSTATE_OFFLINE;
		break;
	case PLS_POLLING:
		phys_state = IB_PORTPHYSSTATE_POLLING;
		break;
	case PLS_CONFIGPHY:
		phys_state = IB_PORTPHYSSTATE_TRAINING;
		break;
	case PLS_LINKUP:
		phys_state = IB_PORTPHYSSTATE_LINKUP;
		break;
	case PLS_PHYTEST:
		phys_state = IB_PORTPHYSSTATE_PHY_TEST;
		break;
	default:
		dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
			   chip_pstate);
		break;
	}

	return phys_state;
}
/*
 * Return the OPA port physical state name.
 *
 * @pstate: port physical state value, used as an index into the name table
 *
 * Values past the end of the table yield "unknown".
 */
const char *opa_pstate_name(u32 pstate)
{
	static const char * const port_physical_names[] = {
		"PHYS_NOP",
		"reserved1",
		"PHYS_POLL",
		"PHYS_DISABLED",
		"PHYS_TRAINING",
		"PHYS_LINKUP",
		"PHYS_LINK_ERR_RECOVER",
		"PHYS_PHY_TEST",
		"reserved8",
		"PHYS_OFFLINE",
		"PHYS_GANGED",
		"PHYS_TEST",
	};

	if (pstate < ARRAY_SIZE(port_physical_names))
		return port_physical_names[pstate];

	return "unknown";
}
/**
 * update_statusp - Update userspace status flag
 * @ppd: Port data structure
 * @state: port state information
 *
 * Actual port status is determined by the host_link_state value
 * in the ppd.
 *
 * host_link_state MUST be updated before updating the user space
 * statusp.
 *
 * Nothing is done when ppd->statusp is NULL or @state is not one of the
 * four IB_PORT_* values handled below.
 */
static void update_statusp(struct hfi1_pportdata *ppd, u32 state)
{
	/*
	 * Set port status flags in the page mapped into userspace
	 * memory. Do it here to ensure a reliable state - this is
	 * the only function called by all state handling code.
	 * Always set the flags due to the fact that the cache value
	 * might have been changed explicitly outside of this
	 * function.
	 */
	if (!ppd->statusp)
		return;

	switch (state) {
	case IB_PORT_DOWN:
	case IB_PORT_INIT:
		*ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
				   HFI1_STATUS_IB_READY);
		break;
	case IB_PORT_ARMED:
		*ppd->statusp |= HFI1_STATUS_IB_CONF;
		break;
	case IB_PORT_ACTIVE:
		*ppd->statusp |= HFI1_STATUS_IB_READY;
		break;
	default:
		/* any other state leaves the flags untouched */
		break;
	}
}
/**
 * wait_logical_linkstate - wait for an IB link state change to occur
 * @ppd: port device
 * @state: the state to wait for
 * @msecs: the number of milliseconds to wait
 *
 * Wait up to msecs milliseconds for IB link state change to occur.
 * For now, take the easy polling route: the chip's logical state is
 * read and converted with chip_to_opa_lstate() every 20 ms.
 * Returns 0 if state reached, otherwise -ETIMEDOUT.
 *
 * NOTE(review): the statements after the polling loop do not belong to
 * this routine as shown - they use ib_pstate, which is not declared here,
 * and there is no "return 0" on the success path.  This copy appears to
 * have lost the end of this function and the start of the next one;
 * compare against a pristine source.
 */ staticint wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs)
{ unsignedlong timeout;
u32 new_state;
/* absolute deadline, in jiffies */
timeout = jiffies + msecs_to_jiffies(msecs); while (1) {
new_state = chip_to_opa_lstate(ppd->dd,
read_logical_state(ppd->dd)); if (new_state == state) break; if (time_after(jiffies, timeout)) {
dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n",
state); return -ETIMEDOUT;
}
msleep(20);
}
dd_dev_info(ppd->dd, "physical state changed to %s (0x%x), phy 0x%x\n",
opa_pstate_name(ib_pstate), ib_pstate, state);
}
/*
 * Read the physical hardware link state and check if it matches the host
 * driver's anticipated state.
 *
 * @ppd: port device
 * @state: the physical state the driver expects the chip to be in
 *
 * On a match the transition is reported through log_state_transition();
 * on a mismatch both values are logged as an error.
 */
static void log_physical_state(struct hfi1_pportdata *ppd, u32 state)
{
	u32 read_state = read_physical_state(ppd->dd);

	if (read_state == state) {
		log_state_transition(ppd, state);
	} else {
		dd_dev_err(ppd->dd,
			   "anticipated phy link state 0x%x, read 0x%x\n",
			   state, read_state);
	}
}
/*
 * wait_physical_linkstate - wait for a physical link state change to occur
 * @ppd: port device
 * @state: the state to wait for
 * @msecs: the number of milliseconds to wait
 *
 * Wait up to msecs milliseconds for physical link state change to occur.
 * The chip is polled roughly every 2 ms.  Once the state is reached it is
 * reported through log_state_transition().
 * Returns 0 if state reached, otherwise -ETIMEDOUT.
 */
static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state,
				   int msecs)
{
	u32 read_state;
	unsigned long timeout;

	timeout = jiffies + msecs_to_jiffies(msecs);
	while (1) {
		read_state = read_physical_state(ppd->dd);
		if (read_state == state)
			break;
		if (time_after(jiffies, timeout)) {
			dd_dev_err(ppd->dd,
				   "timeout waiting for phy link state 0x%x\n",
				   state);
			return -ETIMEDOUT;
		}
		usleep_range(1950, 2050); /* sleep 2ms-ish */
	}

	log_state_transition(ppd, state);
	return 0;
}
/*
 * wait_phys_link_offline_substates - wait for any offline substate
 * @ppd: port device
 * @msecs: the number of milliseconds to wait
 *
 * Wait up to msecs milliseconds for any offline physical link
 * state change to occur.
 * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT.
*/ staticint wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, int msecs)
{
u32 read_state; unsignedlong timeout;
timeout = jiffies + msecs_to_jiffies(msecs); while (1) {
read_state = read_physical_state(ppd->dd); if ((read_state & 0xF0) == PLS_OFFLINE) break; if (time_after(jiffies, timeout)) {
dd_dev_err(ppd->dd, "timeout waiting for phy link offline.quiet substates. Read state 0x%x, %dms\n",
read_state, msecs); return -ETIMEDOUT;
}
usleep_range(1950, 2050); /* sleep 2ms-ish */
}
/* * wait_phys_link_out_of_offline - wait for any out of offline state * @ppd: port device * @msecs: the number of milliseconds to wait * * Wait up to msecs milliseconds for any out of offline physical link * state change to occur. * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT.
*/ staticint wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd, int msecs)
{
u32 read_state; unsignedlong timeout;
timeout = jiffies + msecs_to_jiffies(msecs); while (1) {
read_state = read_physical_state(ppd->dd); if ((read_state & 0xF0) != PLS_OFFLINE) break; if (time_after(jiffies, timeout)) {
dd_dev_err(ppd->dd, "timeout waiting for phy link out of offline. Read state 0x%x, %dms\n",
read_state, msecs); return -ETIMEDOUT;
}
usleep_range(1950, 2050); /* sleep 2ms-ish */
}
/** * read_mod_write() - Calculate the IRQ register index and set/clear the bits * @dd: valid devdata * @src: IRQ source to determine register index from * @bits: the bits to set or clear * @set: true == set the bits, false == clear the bits *
*/ staticvoid read_mod_write(struct hfi1_devdata *dd, u16 src, u64 bits, bool set)
{
u64 reg;
u16 idx = src / BITS_PER_REGISTER; unsignedlong flags;
/**
 * set_intr_bits() - Enable/disable a range (one or more) IRQ sources
 * @dd: valid devdata
 * @first: first IRQ source to set/clear
 * @last: last IRQ source (inclusive) to set/clear
 * @set: true == set the bits, false == clear the bits
 *
 * If first == last, set the exact source.
 *
 * Bits are collected one register's worth at a time and flushed with
 * read_mod_write() whenever the walk crosses into the next register, and
 * once more at the end for the final register.
 *
 * NOTE(review): a source equal to NUM_INTERRUPT_SOURCES passes the range
 * check below - confirm whether that value is meant to be valid.
 *
 * Return: 0 on success, -EINVAL when either bound exceeds
 * NUM_INTERRUPT_SOURCES, -ERANGE when @last < @first.
 */
int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set)
{
	u64 pending = 0;	/* bits gathered for the current register */
	u64 pos;
	u16 irq;

	if (first > NUM_INTERRUPT_SOURCES)
		return -EINVAL;
	if (last > NUM_INTERRUPT_SOURCES)
		return -EINVAL;

	if (last < first)
		return -ERANGE;

	irq = first;
	while (irq <= last) {
		pos = irq % BITS_PER_REGISTER;
		/* wrapped to next register? flush what belongs to the old one */
		if (pos == 0 && pending != 0) {
			read_mod_write(dd, irq - 1, pending, set);
			pending = 0;
		}
		pending |= BIT_ULL(pos);
		irq++;
	}
	read_mod_write(dd, last, pending, set);

	return 0;
}
/* * Clear all interrupt sources on the chip.
*/ staticvoid clear_all_interrupts(struct hfi1_devdata *dd)
{ int i;
for (i = 0; i < CCE_NUM_INT_CSRS; i++)
write_csr(dd, CCE_INT_CLEAR + (8 * i), ~(u64)0);
write_csr(dd, CCE_ERR_CLEAR, ~(u64)0);
write_csr(dd, MISC_ERR_CLEAR, ~(u64)0);
write_csr(dd, RCV_ERR_CLEAR, ~(u64)0);
write_csr(dd, SEND_ERR_CLEAR, ~(u64)0);
write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0);
write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0);
write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0); for (i = 0; i < chip_send_contexts(dd); i++)
write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0); for (i = 0; i < chip_sdma_engines(dd); i++)
write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0);
/*
 * Remap the interrupt source from the general handler to the given MSI-X
 * interrupt.
 *
 * @dd: device data
 * @isrc: chip interrupt source number
 * @msix_intr: MSI-X interrupt to route the source to (low 8 bits are used)
 *
 * The source's bit is first removed from dd->gi_mask[] (64 sources per
 * entry), then its byte in the CCE_INT_MAP CSRs (8 sources per CSR) is
 * rewritten.  A source beyond CCE_NUM_INT_CSRS mask entries is logged and
 * ignored.
 */
void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
{
	int mask_idx = isrc / 64;
	int mask_bit = isrc % 64;
	int map_idx = isrc / 8;
	int map_shift = 8 * (isrc % 8);
	u64 map;

	if (!likely(mask_idx < CCE_NUM_INT_CSRS)) {
		dd_dev_err(dd, "remap interrupt err\n");
		return;
	}

	/* clear from the handled mask of the general interrupt */
	dd->gi_mask[mask_idx] &= ~((u64)1 << mask_bit);

	/* direct the chip source to the given MSI-X interrupt */
	map = read_csr(dd, CCE_INT_MAP + (8 * map_idx));
	map &= ~((u64)0xff << map_shift);
	map |= ((u64)msix_intr & 0xff) << map_shift;
	write_csr(dd, CCE_INT_MAP + (8 * map_idx), map);
}
/*
 * Steer every interrupt source belonging to one SDMA engine to the given
 * MSI-X vector.
 *
 * SDMA engine interrupt sources are grouped by type rather than by
 * engine, so an engine owns one source in each of these groups:
 *	SDMA
 *	SDMAProgress
 *	SDMAIdle
 */
void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr)
{
	/* first source of each per-type group, in remap order */
	const int group_start[] = {
		IS_SDMA_START,
		IS_SDMA_PROGRESS_START,
		IS_SDMA_IDLE_START,
	};
	unsigned int g;

	for (g = 0; g < sizeof(group_start) / sizeof(group_start[0]); g++)
		remap_intr(dd, group_start[g] + engine, msix_intr);
}
/*
 * Return interrupt routing to its initial state: the general handler
 * accepts every source, and every chip interrupt source is mapped back
 * to MSI-X vector 0.
 */
void reset_interrupts(struct hfi1_devdata *dd)
{
	int idx;

	/* every source is handled by the general handler */
	idx = 0;
	while (idx < CCE_NUM_INT_CSRS) {
		dd->gi_mask[idx] = ~0ull;
		idx++;
	}

	/* zero each map register so all sources point at MSI-X 0 */
	idx = 0;
	while (idx < CCE_NUM_INT_MAP_CSRS) {
		write_csr(dd, CCE_INT_MAP + (8 * idx), 0);
		idx++;
	}
}
/**
 * set_up_interrupts() - Initialize the IRQ resources and state
 * @dd: valid devdata
 *
 * Masks every interrupt source, clears anything pending, resets the
 * general-handler mask and chip MSI-X mappings, then initializes MSI-X
 * and requests the IRQs.  If requesting the IRQs fails, the MSI-X state
 * is cleaned up again before returning.
 *
 * Return: 0 on success, otherwise the error returned by
 * msix_initialize() or msix_request_irqs().
 */
static int set_up_interrupts(struct hfi1_devdata *dd)
{
	int ret;

	/* mask all interrupts */
	set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);

	/* clear all pending interrupts */
	clear_all_interrupts(dd);

	/* reset general handler mask, chip MSI-X mappings */
	reset_interrupts(dd);

	/* ask for MSI-X interrupts */
	ret = msix_initialize(dd);
	if (ret)
		return ret;

	ret = msix_request_irqs(dd);
	if (ret)
		msix_clean_up_interrupts(dd);

	return ret;
}
/*
 * Set up context values in dd.  Sets:
 *
 *	num_rcv_contexts - number of contexts being used
 *	n_krcv_queues - number of kernel contexts
 *	first_dyn_alloc_ctxt - first dynamically allocated context
 *                             in array of contexts
 *	freectxts - number of free user contexts
 *	num_send_contexts - number of PIO send contexts being used
 *	num_netdev_contexts - number of contexts reserved for netdev
 *
 * Also fills in dd->num_user_contexts and the dd->rcv_entries group
 * bookkeeping (group_size, ngroups, nctxt_extra).
 *
 * Return: 0 on success, -EINVAL if the RSM map table cannot hold the
 * requested contexts even after squeezing user contexts, or the negative
 * value returned by init_sc_pools_and_sizes().
 */
static int set_up_context_variables(struct hfi1_devdata *dd)
{
	unsigned long num_kernel_contexts;
	u16 num_netdev_contexts;
	int ret;
	unsigned ngroups;
	int rmt_count;
	u32 n_usr_ctxts;
	u32 send_contexts = chip_send_contexts(dd);
	u32 rcv_contexts = chip_rcv_contexts(dd);

	/*
	 * Kernel receive contexts:
	 * - Context 0 - control context (VL15/multicast/error)
	 * - Context 1 - first kernel context
	 * - Context 2 - second kernel context
	 * ...
	 */
	if (n_krcvqs)
		/*
		 * n_krcvqs is the sum of module parameter kernel receive
		 * contexts, krcvqs[].  It does not include the control
		 * context, so add that.
		 */
		num_kernel_contexts = n_krcvqs + 1;
	else
		num_kernel_contexts = DEFAULT_KRCVQS + 1;

	/*
	 * Every kernel receive context needs an ACK send context.
	 * one send context is allocated for each VL{0-7} and VL15
	 */
	if (num_kernel_contexts > (send_contexts - num_vls - 1)) {
		dd_dev_err(dd,
			   "Reducing # kernel rcv contexts to: %d, from %lu\n",
			   send_contexts - num_vls - 1,
			   num_kernel_contexts);
		num_kernel_contexts = send_contexts - num_vls - 1;
	}

	/*
	 * User contexts:
	 *	- default to 1 user context per real (non-HT) CPU core if
	 *	  num_user_contexts is negative
	 */
	if (num_user_contexts < 0)
		n_usr_ctxts = cpumask_weight(&node_affinity.real_cpu_mask);
	else
		n_usr_ctxts = num_user_contexts;

	/*
	 * Adjust the counts given a global max.
	 */
	if (num_kernel_contexts + n_usr_ctxts > rcv_contexts) {
		dd_dev_err(dd,
			   "Reducing # user receive contexts to: %u, from %u\n",
			   (u32)(rcv_contexts - num_kernel_contexts),
			   n_usr_ctxts);
		/* recalculate */
		n_usr_ctxts = rcv_contexts - num_kernel_contexts;
	}

	/* whatever is left over is offered to netdev */
	num_netdev_contexts =
		hfi1_num_netdev_contexts(dd, rcv_contexts -
					 (num_kernel_contexts + n_usr_ctxts),
					 &node_affinity.real_cpu_mask);

	/*
	 * RMT entries are allocated as follows:
	 * 1. QOS (0 to 128 entries)
	 * 2. FECN (num_kernel_context - 1 [a] + num_user_contexts +
	 *          num_netdev_contexts [b])
	 * 3. netdev (NUM_NETDEV_MAP_ENTRIES)
	 *
	 * Notes:
	 * [a] Kernel contexts (except control) are included in FECN if kernel
	 *     TID_RDMA is active.
	 * [b] Netdev and user contexts are randomly allocated from the same
	 *     context pool, so FECN must cover all contexts in the pool.
	 */
	rmt_count = qos_rmt_entries(num_kernel_contexts - 1, NULL, NULL)
		    + (HFI1_CAP_IS_KSET(TID_RDMA) ? num_kernel_contexts - 1
						  : 0)
		    + n_usr_ctxts
		    + num_netdev_contexts
		    + NUM_NETDEV_MAP_ENTRIES;
	if (rmt_count > NUM_MAP_ENTRIES) {
		int over = rmt_count - NUM_MAP_ENTRIES;

		/* try to squish user contexts, minimum of 1 */
		if (over >= n_usr_ctxts) {
			dd_dev_err(dd, "RMT overflow: reduce the requested number of contexts\n");
			return -EINVAL;
		}
		dd_dev_err(dd, "RMT overflow: reducing # user contexts from %u to %u\n",
			   n_usr_ctxts, n_usr_ctxts - over);
		n_usr_ctxts -= over;
	}

	/* the first N are kernel contexts, the rest are user/netdev contexts */
	dd->num_rcv_contexts =
		num_kernel_contexts + n_usr_ctxts + num_netdev_contexts;
	dd->n_krcv_queues = num_kernel_contexts;
	dd->first_dyn_alloc_ctxt = num_kernel_contexts;
	dd->num_netdev_contexts = num_netdev_contexts;
	dd->num_user_contexts = n_usr_ctxts;
	dd->freectxts = n_usr_ctxts;
	dd_dev_info(dd,
		    "rcv contexts: chip %d, used %d (kernel %d, netdev %u, user %u)\n",
		    rcv_contexts,
		    (int)dd->num_rcv_contexts,
		    (int)dd->n_krcv_queues,
		    dd->num_netdev_contexts,
		    dd->num_user_contexts);

	/*
	 * Receive array allocation:
	 *   All RcvArray entries are divided into groups of 8. This
	 *   is required by the hardware and will speed up writes to
	 *   consecutive entries by using write-combining of the entire
	 *   cacheline.
	 *
	 *   The number of groups are evenly divided among all contexts.
	 *   any left over groups will be given to the first N user
	 *   contexts.
	 */
	dd->rcv_entries.group_size = RCV_INCREMENT;
	ngroups = chip_rcv_array_count(dd) / dd->rcv_entries.group_size;
	dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
	dd->rcv_entries.nctxt_extra = ngroups -
		(dd->num_rcv_contexts * dd->rcv_entries.ngroups);
	dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
		    dd->rcv_entries.ngroups,
		    dd->rcv_entries.nctxt_extra);

	/* cap the per-context entry count at twice MAX_EAGER_ENTRIES */
	if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
	    MAX_EAGER_ENTRIES * 2) {
		dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
			dd->rcv_entries.group_size;
		dd_dev_info(dd,
			    "RcvArray group count too high, change to %u\n",
			    dd->rcv_entries.ngroups);
		dd->rcv_entries.nctxt_extra = 0;
	}

	/*
	 * PIO send contexts
	 */
	ret = init_sc_pools_and_sizes(dd);
	if (ret >= 0) {	/* success */
		/* a non-negative return is the number of send contexts */
		dd->num_send_contexts = ret;
		dd_dev_info(
			dd,
			"send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
			send_contexts,
			dd->num_send_contexts,
			dd->sc_sizes[SC_KERNEL].count,
			dd->sc_sizes[SC_ACK].count,
			dd->sc_sizes[SC_USER].count,
			dd->sc_sizes[SC_VL15].count);
		ret = 0;	/* success */
	}

	return ret;
}
/*
 * Enable the hardware partition key check for the port's device.
 *
 * The original header describes this routine as setting the device/port
 * partition key table, with the MAD code ensuring that, at least, the
 * partial management partition key is present in the table.
 *
 * NOTE(review): the body visible here writes no partition key table
 * entries; it only turns on the receive-side pkey check.  Confirm whether
 * the table programming lives elsewhere or was lost from this routine.
 */
static void set_partition_keys(struct hfi1_pportdata *ppd)
{
	struct hfi1_devdata *dd = ppd->dd;

	/* Always enable HW pkeys check when pkeys table is set */
	add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
}
/*
 * These CSRs and memories are uninitialized on reset and must be
 * written before reading to set the ECC/parity bits.
 *
 * NOTE: All user context CSRs that are not mmaped write-only
 * (e.g. the TID flows) must be initialized even if the driver never
 * reads them.
 *
 * Everything written here is written as zero, except the RcvArray
 * entries, which are written through hfi1_put_tid() with
 * PT_INVALID_FLUSH.
 */
static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
{
	int i, j;

	/* CceIntMap */
	for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
		write_csr(dd, CCE_INT_MAP + (8 * i), 0);

	/* SendCtxtCreditReturnAddr */
	for (i = 0; i < chip_send_contexts(dd); i++)
		write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);

	/* PIO Send buffers */
	/* SDMA Send buffers */
	/*
	 * These are not normally read, and (presently) have no method
	 * to be read, so are not pre-initialized
	 */

	/* RcvHdrAddr */
	/* RcvHdrTailAddr */
	/* RcvTidFlowTable */
	for (i = 0; i < chip_rcv_contexts(dd); i++) {
		write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
		write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
		for (j = 0; j < RXE_NUM_TID_FLOWS; j++)
			write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), 0);
	}

	/* RcvArray */
	for (i = 0; i < chip_rcv_array_count(dd); i++)
		hfi1_put_tid(dd, i, PT_INVALID_FLUSH, 0, 0);

	/* RcvQPMapTable */
	for (i = 0; i < 32; i++)
		write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
}
/*
 * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus.
 *
 * @dd: device data
 * @status_bits: CceStatus bits that should end up clear
 * @ctrl_bits: value written to CceCtrl to request the clear
 *
 * Returns immediately if none of @status_bits is set.  Otherwise writes
 * @ctrl_bits and polls CceStatus, with a 1us delay between reads, until
 * the bits clear or CCE_STATUS_TIMEOUT milliseconds have passed.  A
 * timeout is logged but not reported to the caller.
 */
static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
			     u64 ctrl_bits)
{
	unsigned long timeout;
	u64 reg;

	/* is the condition present? */
	reg = read_csr(dd, CCE_STATUS);
	if ((reg & status_bits) == 0)
		return;

	/* clear the condition */
	write_csr(dd, CCE_CTRL, ctrl_bits);

	/* wait for the condition to clear */
	timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
	while (1) {
		reg = read_csr(dd, CCE_STATUS);
		if ((reg & status_bits) == 0)
			return;
		if (time_after(jiffies, timeout)) {
			dd_dev_err(dd,
				   "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
				   status_bits, reg & status_bits);
			return;
		}
		udelay(1);
	}
}
/*
 * Set CCE CSRs to chip reset defaults.
 *
 * Read-only registers and the *_FORCE registers are deliberately left
 * alone; they are listed in comments so the walk through the register
 * set stays complete.
 */
static void reset_cce_csrs(struct hfi1_devdata *dd)
{
	int i;

	/* CCE_REVISION read-only */
	/* CCE_REVISION2 read-only */
	/* CCE_CTRL - bits clear automatically */
	/* CCE_STATUS read-only, use CceCtrl to clear */
	clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
	clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
	clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
	for (i = 0; i < CCE_NUM_SCRATCH; i++)
		write_csr(dd, CCE_SCRATCH + (8 * i), 0);
	/* CCE_ERR_STATUS read-only */
	write_csr(dd, CCE_ERR_MASK, 0);
	write_csr(dd, CCE_ERR_CLEAR, ~0ull);
	/* CCE_ERR_FORCE leave alone */
	for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
		write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
	write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
	/* CCE_PCIE_CTRL leave alone */
	for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
		write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
		write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
			  CCE_MSIX_TABLE_UPPER_RESETCSR);
	}
	/*
	 * NOTE(review): both writes below use the same offset on every
	 * iteration.  Left unchanged because nothing visible here shows
	 * whether these registers are arrays - confirm against the CSR map.
	 */
	for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
		/* CCE_MSIX_PBA read-only */
		write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
		write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
	}
	/*
	 * CceIntMap is an array of CSRs 8 bytes apart; index each one so
	 * that every map register is zeroed, not just the first.
	 */
	for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
		write_csr(dd, CCE_INT_MAP + (8 * i), 0);
	for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
		/* CCE_INT_STATUS read-only */
		write_csr(dd, CCE_INT_MASK + (8 * i), 0);
		write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
		/* CCE_INT_FORCE leave alone */
		/* CCE_INT_BLOCKED read-only */
	}
	for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
		write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
}
/*
 * set MISC CSRs to chip reset defaults
 *
 * NOTE(review): despite the name and the heading above, every register
 * written in this body is a TXE per-send-context or per-SDMA-engine CSR.
 * Confirm that this is the intended content of this routine.
 *
 * Mask/control/check registers are written as zero and the *_ERR_CLEAR
 * registers as all ones.  Read-only and *_FORCE registers are skipped.
 */
static void reset_misc_csrs(struct hfi1_devdata *dd)
{
	int i;

	/*
	 * TXE Per-Context CSRs
	 */
	for (i = 0; i < chip_send_contexts(dd); i++) {
		write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
		write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
		write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
		write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
		write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
		write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
		write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
	}

	/*
	 * TXE Per-SDMA CSRs
	 */
	for (i = 0; i < chip_sdma_engines(dd); i++) {
		write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
		/* SEND_DMA_STATUS read-only */
		write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
		write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
		write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
		/* SEND_DMA_HEAD read-only */
		write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
		write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
		/* SEND_DMA_IDLE_CNT read-only */
		write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
		write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
		/* SEND_DMA_DESC_FETCHED_CNT read-only */
		/* SEND_DMA_ENG_ERR_STATUS read-only */
		write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
		write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
		/* SEND_DMA_ENG_ERR_FORCE leave alone */
		write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
		write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
		write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
		write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
		write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
		write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
		write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
	}
}
/*
 * Wait for receive DMA to go idle, then start the receive buffer init
 * and wait for it to finish.
 *
 * Expect on entry:
 * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
 *
 * Both waits are bounded polling loops.  If either times out, an error
 * is logged and the routine carries on; nothing is reported to the
 * caller.
 */
static void init_rbufs(struct hfi1_devdata *dd)
{
	u64 reg;
	int count;

	/*
	 * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
	 * clear.
	 */
	count = 0;
	while (1) {
		reg = read_csr(dd, RCV_STATUS);
		if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
			    | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
			break;
		/*
		 * Give up after 1ms - maximum wait time.
		 *
		 * RBuf size is 136KiB.  Slowest possible is PCIe Gen1 x1 at
		 * 250MB/s bandwidth.  Lower rate to 66% for overhead to get:
		 *	136 KB / (66% * 250MB/s) = 844us
		 */
		if (count++ > 500) {
			dd_dev_err(dd,
				   "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
				   __func__, reg);
			break;
		}
		udelay(2); /* do not busy-wait the CSR */
	}

	/* start the init - expect RcvCtrl to be 0 */
	write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);

	/*
	 * Read to force the write of RcvCtrl.RxRbufInit.  There is a brief
	 * period after the write before RcvStatus.RxRbufInitDone is valid.
	 * The delay in the first run through the loop below is sufficient and
	 * required before the first read of RcvStatus.RxRbufInitDone.
	 */
	read_csr(dd, RCV_CTRL);

	/* wait for the init to finish */
	count = 0;
	while (1) {
		/* delay is required first time through - see above */
		udelay(2); /* do not busy-wait the CSR */
		reg = read_csr(dd, RCV_STATUS);
		if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
			break;

		/* give up after 100us - slowest possible at 33MHz is 73us */
		if (count++ > 50) {
			dd_dev_err(dd,
				   "%s: RcvStatus.RxRbufInit not set, continuing\n",
				   __func__);
			break;
		}
	}
}
/* set RXE CSRs to chip reset defaults */ staticvoid reset_rxe_csrs(struct hfi1_devdata *dd)
{ int i, j;
/* * RXE Kernel CSRs
*/
write_csr(dd, RCV_CTRL, 0);
init_rbufs(dd); /* RCV_STATUS read-only */ /* RCV_CONTEXTS read-only */ /* RCV_ARRAY_CNT read-only */ /* RCV_BUF_SIZE read-only */
write_csr(dd, RCV_BTH_QP, 0);
write_csr(dd, RCV_MULTICAST, 0);
write_csr(dd, RCV_BYPASS, 0);
write_csr(dd, RCV_VL15, 0); /* this is a clear-down */
write_csr(dd, RCV_ERR_INFO,
RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); /* RCV_ERR_STATUS read-only */
write_csr(dd, RCV_ERR_MASK, 0);
write_csr(dd, RCV_ERR_CLEAR, ~0ull); /* RCV_ERR_FORCE leave alone */ for (i = 0; i < 32; i++)
write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0); for (i = 0; i < 4; i++)
write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0); for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0); for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0); for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++)
clear_rsm_rule(dd, i); for (i = 0; i < 32; i++)
write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
/* * RXE Kernel and User Per-Context CSRs
*/ for (i = 0; i < chip_rcv_contexts(dd); i++) { /* kernel */
write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0); /* RCV_CTXT_STATUS read-only */
write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
/* user */ /* RCV_HDR_TAIL read-only */
write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0); /* RCV_EGR_INDEX_TAIL read-only */
write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0); /* RCV_EGR_OFFSET_TAIL read-only */ for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
write_uctxt_csr(dd, i,
RCV_TID_FLOW_TABLE + (8 * j), 0);
}
}
}
/*
 * Set sc2vl tables.
 *
 * They power on to zeros, so to avoid send context errors
 * they need to be set:
 *
 *	SC 0-7 -> VL 0-7 (respectively)
 *	SC 15  -> VL 15
 *	otherwise
 *	       -> VL 0
 *
 * NOTE(review): the body visible here fills only the driver's cached copy
 * (dd->sc2vl, treated as 32 bytes); no chip register is written in this
 * routine.  Confirm where the hardware tables are programmed.
 */
static void init_sc2vl_tables(struct hfi1_devdata *dd)
{
	int i;

	/* init per architecture spec, constrained by hardware capability */

	/* initialize the cached sc2vl values consistently with h/w */
	for (i = 0; i < 32; i++) {
		if (i < 8 || i == 15)
			*((u8 *)(dd->sc2vl) + i) = (u8)i;
		else
			*((u8 *)(dd->sc2vl) + i) = 0;
	}
}
/*
 * Read chip sizes and then reset parts to sane, disabled, values.  We cannot
 * depend on the chip going through a power-on reset - a driver may be loaded
 * and unloaded many times.
 *
 * Do not write any CSR values to the chip in this routine - there may be
 * a reset following the (possible) FLR in this routine.
 * NOTE(review): the routine itself writes CSRs to quiesce the device;
 * presumably the warning is about values that must persist - confirm.
 *
 * Return: 0 on success, or the error from restore_pci_variables() if the
 * PCI state could not be restored after an FLR.
 */
static int init_chip(struct hfi1_devdata *dd)
{
	int i;
	int ret = 0;

	/*
	 * Put the HFI CSRs in a known state.
	 * Combine this with a DC reset.
	 *
	 * Stop the device from doing anything while we do a
	 * reset.  We know there are no other active users of
	 * the device since we are now in charge.  Turn off
	 * all outbound and inbound traffic and make sure
	 * the device does not generate any interrupts.
	 */

	/* disable send contexts and SDMA engines */
	write_csr(dd, SEND_CTRL, 0);
	for (i = 0; i < chip_send_contexts(dd); i++)
		write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
	for (i = 0; i < chip_sdma_engines(dd); i++)
		write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
	/* disable port (turn off RXE inbound traffic) and contexts */
	write_csr(dd, RCV_CTRL, 0);
	/*
	 * RcvCtxtCtrl is a per-context CSR: address it with the context
	 * index, the same way reset_rxe_csrs() does, so that every receive
	 * context is disabled rather than the first one repeatedly.
	 */
	for (i = 0; i < chip_rcv_contexts(dd); i++)
		write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
	/* mask all interrupt sources */
	for (i = 0; i < CCE_NUM_INT_CSRS; i++)
		write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);

	/*
	 * DC Reset: do a full DC reset before the register clear.
	 * A recommended length of time to hold is one CSR read,
	 * so reread the CceDcCtrl.  Then, hold the DC in reset
	 * across the clear.
	 */
	write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
	(void)read_csr(dd, CCE_DC_CTRL);

	if (use_flr) {
		/*
		 * A FLR will reset the SPC core and part of the PCIe.
		 * The parts that need to be restored have already been
		 * saved.
		 */
		dd_dev_info(dd, "Resetting CSRs with FLR\n");

		/* do the FLR, the DC reset will remain */
		pcie_flr(dd->pcidev);

		/* restore command and BARs */
		ret = restore_pci_variables(dd);
		if (ret) {
			dd_dev_err(dd, "%s: Could not restore PCI variables\n",
				   __func__);
			return ret;
		}

		/* on Ax parts the FLR and restore are done a second time */
		if (is_ax(dd)) {
			dd_dev_info(dd, "Resetting CSRs with FLR\n");
			pcie_flr(dd->pcidev);
			ret = restore_pci_variables(dd);
			if (ret) {
				dd_dev_err(dd, "%s: Could not restore PCI variables\n",
					   __func__);
				return ret;
			}
		}
	} else {
		dd_dev_info(dd, "Resetting CSRs with writes\n");
		reset_cce_csrs(dd);
		reset_txe_csrs(dd);
		reset_rxe_csrs(dd);
		reset_misc_csrs(dd);
	}
	/* clear the DC reset */
	write_csr(dd, CCE_DC_CTRL, 0);

	/* Set the LED off */
	setextled(dd, 0);

	/*
	 * Clear the QSFP reset.
	 * An FLR enforces a 0 on all out pins.  The driver does not touch
	 * ASIC_QSFPn_OUT otherwise.  This leaves RESET_N low and
	 * anything plugged constantly in reset, if it pays attention
	 * to RESET_N.
	 * Prime examples of this are optical cables.  Set all pins high.
	 * I2CCLK and I2CDAT will change per direction, and INT_N and
	 * MODPRS_N are input only and their value is ignored.
	 */
	write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
	write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
	init_chip_resources(dd);
	return ret;
}
staticvoid init_early_variables(struct hfi1_devdata *dd)
{ int i;
/* assign link credit variables */
dd->vau = CM_VAU;
dd->link_credits = CM_GLOBAL_CREDITS; if (is_ax(dd))
dd->link_credits--;
dd->vcu = cu_to_vcu(hfi1_cu); /* enough room for 8 MAD packets plus header - 17K */
dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau); if (dd->vl15_init > dd->link_credits)
dd->vl15_init = dd->link_credits;
write_uninitialized_csrs_and_memories(dd);
if (HFI1_CAP_IS_KSET(PKEY_CHECK)) for (i = 0; i < dd->num_pports; i++) { struct hfi1_pportdata *ppd = &dd->pport[i];
/**
 * hfi1_get_qp_map - get qp map
 * @dd: device data
 * @idx: index to read
 *
 * Each 64-bit RcvQPMapTable register holds eight one-byte entries, so
 * @idx selects the register (idx / 8) and the byte within it (idx % 8).
 *
 * Return: the byte stored at @idx.
 */
u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx)
{
	int csr_off = (idx / 8) * 8;	/* byte offset of the register */
	int shift = (idx % 8) * 8;	/* bit position of the entry */
	u64 entry = read_csr(dd, RCV_QP_MAP_TABLE + csr_off) >> shift;

	/* only the low byte belongs to this entry */
	return (u8)entry;
}
/** * init_qpmap_table - init qp map * @dd: device data * @first_ctxt: first context * @last_ctxt: last context * * This routine sets the qpn mapping table that * is indexed by qpn[8:1]. * * The routine will round robin the 256 settings * from first_ctxt to last_ctxt. * * The first/last looks ahead to having specialized * receive contexts for mgmt and bypass. Normal * verbs traffic will be assumed to be on a range * of receive contexts.
*/ staticvoid init_qpmap_table(struct hfi1_devdata *dd,
u32 first_ctxt,
u32 last_ctxt)
{
u64 reg = 0;
u64 regno = RCV_QP_MAP_TABLE; int i;
u64 ctxt = first_ctxt;
for (i = 0; i < 256; i++) {
reg |= ctxt << (8 * (i % 8));
ctxt++; if (ctxt > last_ctxt)
ctxt = first_ctxt; if (i % 8 == 7) {
write_csr(dd, regno, reg);
reg = 0;
regno += 8;
}
}
/* * Return an initialized RMT map table for users to fill in. OK if it * returns NULL, indicating no table.
*/ staticstruct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
{ struct rsm_map_table *rmt;
u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */
/* * Write the final RMT map table to the chip and free the table. OK if * table is NULL.
*/ staticvoid complete_rsm_map_table(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
{ int i;
if (rmt) { /* write table to chip */ for (i = 0; i < NUM_MAP_REGS; i++)
write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]);
/* return the number of RSM map table entries that will be used for QOS */ staticint qos_rmt_entries(unsignedint n_krcv_queues, unsignedint *mp, unsignedint *np)
{ int i; unsignedint m, n;
uint max_by_vl = 0;
/* is QOS active at all? */ if (n_krcv_queues < MIN_KERNEL_KCTXTS ||
num_vls == 1 ||
krcvqsset <= 1) goto no_qos;
/* determine bits for qpn */ for (i = 0; i < min_t(unsignedint, num_vls, krcvqsset); i++) if (krcvqs[i] > max_by_vl)
max_by_vl = krcvqs[i]; if (max_by_vl > 32) goto no_qos;
m = ilog2(__roundup_pow_of_two(max_by_vl));
/* determine bits for vl */
n = ilog2(__roundup_pow_of_two(num_vls));
/* reject if too much is used */ if ((m + n) > 7) goto no_qos;
if (mp)
*mp = m; if (np)
*np = n;
return 1 << (m + n);
no_qos: if (mp)
*mp = 0; if (np)
*np = 0; return 0;
}
/** * init_qos - init RX qos * @dd: device data * @rmt: RSM map table * * This routine initializes Rule 0 and the RSM map table to implement * quality of service (qos). * * If all of the limit tests succeed, qos is applied based on the array * interpretation of krcvqs where entry 0 is VL0. * * The number of vl bits (n) and the number of qpn bits (m) are computed to * feed both the RSM map table and the single rule.
*/ staticvoid init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
{ struct rsm_rule_data rrd; unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; unsignedint rmt_entries;
u64 reg;
if (!rmt) goto bail;
rmt_entries = qos_rmt_entries(dd->n_krcv_queues - 1, &m, &n); if (rmt_entries == 0) goto bail;
qpns_per_vl = 1 << m;
/* enough room in the map table? */
rmt_entries = 1 << (m + n); if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES) goto bail;
/* add qos entries to the RSM map table */ for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) { unsigned tctxt;
/* there needs to be enough room in the map table */ if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) {
dd_dev_err(dd, "FECN handling disabled - too many contexts allocated\n"); return;
}
/* * RSM will extract the destination context as an index into the * map table. The destination contexts are a sequential block * in the range start...num_rcv_contexts-1 (inclusive). * Map entries are accessed as offset + extracted value. Adjust * the added offset so this sequence can be placed anywhere in * the table - as long as the entries themselves do not wrap. * There are only enough bits in offset for the table size, so * start with that to allow for a "negative" offset.
*/
offset = (u8)(NUM_MAP_ENTRIES + rmt->used - start);
/* * For RSM intercept of Expected FECN packets: * o packet type 0 - expected * o match on F (bit 95), using select/match 1, and * o match on SH (bit 133), using select/match 2. * * Use index 1 to extract the 8-bit receive context from DestQP * (start at bit 64). Use that as the RSM map table index.
*/
rrd.offset = offset;
rrd.pkt_type = 0;
rrd.field1_off = 95;
rrd.field2_off = 133;
rrd.index1_off = 64;
rrd.index1_width = 8;
rrd.index2_off = 0;
rrd.index2_width = 0;
rrd.mask1 = 1;
rrd.value1 = 1;
rrd.mask2 = 1;
rrd.value2 = 1;
staticbool hfi1_netdev_update_rmt(struct hfi1_devdata *dd)
{
u8 i, j;
u8 ctx_id = 0;
u64 reg;
u32 regoff; int rmt_start = hfi1_netdev_get_free_rmt_idx(dd); int ctxt_count = hfi1_netdev_ctxt_count(dd);
/* We already have contexts mapped in RMT */ if (has_rsm_rule(dd, RSM_INS_VNIC) || has_rsm_rule(dd, RSM_INS_AIP)) {
dd_dev_info(dd, "Contexts are already mapped in RMT\n"); returntrue;
}
if (hfi1_is_rmt_full(rmt_start, NUM_NETDEV_MAP_ENTRIES)) {
dd_dev_err(dd, "Not enough RMT entries used = %d\n",
rmt_start); returnfalse;
}
/*
 * Drop one user of the AIP RSM rule and clear the rule when the last
 * user goes away.
 *
 * The user count is decremented atomically unless it is already 0.  The
 * rule is cleared only when the count was 1 before this call.
 */
void hfi1_deinit_aip_rsm(struct hfi1_devdata *dd)
{
	int users_before;

	users_before = atomic_fetch_add_unless(&dd->ipoib_rsm_usr_num, -1, 0);

	/* someone else still uses the rule, or nobody ever did */
	if (users_before != 1)
		return;

	clear_rsm_rule(dd, RSM_INS_AIP);
}
/* enable all receive errors */
write_csr(dd, RCV_ERR_MASK, ~0ull);
rmt = alloc_rsm_map_table(dd); if (!rmt) return -ENOMEM;
/* set up QOS, including the QPN map table */
init_qos(dd, rmt);
init_fecn_handling(dd, rmt);
complete_rsm_map_table(dd, rmt); /* record number of used rsm map entries for netdev */
hfi1_netdev_set_free_rmt_idx(dd, rmt->used);
kfree(rmt);
/* * make sure RcvCtrl.RcvWcb <= PCIe Device Control * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config * space, PciCfgCap2.MaxPayloadSize in HFI). There is only one * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and * Max_PayLoad_Size set to its minimum of 128. * * Presently, RcvCtrl.RcvWcb is not modified from its default of 0 * (64 bytes). Max_Payload_Size is possibly modified upward in * tune_pcie_caps() which is called after this routine.
*/
/* Have 16 bytes (4DW) of bypass header available in header queue */
val = read_csr(dd, RCV_BYPASS);
val &= ~RCV_BYPASS_HDR_SIZE_SMASK;
val |= ((4ull & RCV_BYPASS_HDR_SIZE_MASK) <<
RCV_BYPASS_HDR_SIZE_SHIFT);
write_csr(dd, RCV_BYPASS, val); return 0;
}
/*
 * Fill out the given AU table using the given CU.  A CU is defined in
 * terms of AUs.  The table is an encoding: given the index, how many AUs
 * does that represent?
 *
 * @dd: device data
 * @cu: the CU value, in AUs
 * @csr0to3: offset of the CSR holding table entries 0-3
 * @csr4to7: offset of the CSR holding table entries 4-7
 *
 * Entry 0 is 0 and entry 1 is 1; entries 2 through 7 are 2, 4, 8, 16, 32
 * and 64 times @cu respectively.
 *
 * NOTE: Assumes that the register layout is the same for the
 * local and remote tables.
 */
static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
			       u32 csr0to3, u32 csr4to7)
{
	write_csr(dd, csr0to3,
		  (0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT) |
		  (1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT) |
		  ((2ull * cu) <<
		   SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT) |
		  ((4ull * cu) <<
		   SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT));
	write_csr(dd, csr4to7,
		  ((8ull * cu) <<
		   SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT) |
		  ((16ull * cu) <<
		   SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT) |
		  ((32ull * cu) <<
		   SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT) |
		  ((64ull * cu) <<
		   SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT));
}
/*
 * Initialize the send side (TXE): unmask all send error sources, program
 * the local CU to AU table from dd->vcu, and set a default credit return
 * timer (skipped on the functional simulator).
 */
static void init_txe(struct hfi1_devdata *dd)
{
	int i;

	/* enable all PIO, SDMA, general, and Egress errors */
	write_csr(dd, SEND_PIO_ERR_MASK, ~0ull);
	write_csr(dd, SEND_DMA_ERR_MASK, ~0ull);
	write_csr(dd, SEND_ERR_MASK, ~0ull);
	write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull);

	/* enable all per-context and per-SDMA engine errors */
	for (i = 0; i < chip_send_contexts(dd); i++)
		write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull);
	for (i = 0; i < chip_sdma_engines(dd); i++)
		write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull);

	/* set the local CU to AU mapping */
	assign_local_cm_au_table(dd, dd->vcu);

	/*
	 * Set reasonable default for Credit Return Timer
	 * Don't set on Simulator - causes it to choke.
	 */
	if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
		write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
}
hw_ctxt = rcd->sc->hw_context;
write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_JOB_KEY, 0); /* * Disable send-side J_KEY integrity check, unless this is A0 h/w. * This check would not have been enabled for A0 h/w, see * set_ctxt_jkey().
*/ if (!is_ax(dd)) {
reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE);
reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg);
} /* Turn off the J_KEY on the receive side */
write_kctxt_csr(dd, rcd->ctxt, RCV_KEY_CTRL, 0);
/*
 * First stage of chip clean up.  Clean up happens in multiple stages;
 * this one only runs the four teardown steps below, in this order.
 */
void hfi1_start_cleanup(struct hfi1_devdata *dd)
{
	/* leave ASPM handling */
	aspm_exit(dd);

	/* release the counter and receive-error bookkeeping */
	free_cntrs(dd);
	free_rcverr(dd);

	/* finally let go of the chip resources */
	finish_chip_resources(dd);
}
/*
 * Information can be shared between the two HFIs on the same ASIC
 * in the same OS.  This function finds the peer device and sets
 * up a shared structure.
 *
 * Return: 0 on success, -ENOMEM if the asic structure cannot be
 * allocated, otherwise the result of set_up_i2c().
 *
 * NOTE(review): as visible here the body looks incomplete:
 *  - "peer" is tested below but never assigned in this function;
 *  - "asic_data" is allocated but never stored in dd or freed;
 *  - "index" is declared but not used;
 *  - set_up_i2c() is handed dd->asic_data, not the new allocation.
 * The peer lookup and the dd->asic_data assignment appear to be
 * missing - restore them before relying on this routine.
 */ staticint init_asic_data(struct hfi1_devdata *dd)
{ unsignedlong index; struct hfi1_devdata *peer; struct hfi1_asic_data *asic_data; int ret = 0;
/* pre-allocate the asic structure in case we are the first device */
asic_data = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL); if (!asic_data) return -ENOMEM;
/* first one through (no peer found) - set up i2c devices */ if (!peer)
ret = set_up_i2c(dd, dd->asic_data);
return ret;
}
/*
 * Set dd->boardname.  Use a generic name if a name is not returned from
 * EFI variable space.
 *
 * The name is read from the "description" EFI variable.  When that read
 * fails, a heap copy of the generic description is stored instead.
 *
 * Return 0 on success, -ENOMEM if space could not be allocated.
 */
static int obtain_boardname(struct hfi1_devdata *dd)
{
	/* generic board description */
	const char generic[] =
		"Cornelis Omni-Path Host Fabric Interface Adapter 100 Series";
	unsigned long size;	/* filled in by the EFI read, not used here */
	int ret;

	ret = read_hfi1_efi_var(dd, "description", &size,
				(void **)&dd->boardname);
	if (ret) {
		dd_dev_info(dd, "Board description not found\n");
		/* use generic description */
		dd->boardname = kstrdup(generic, GFP_KERNEL);
		if (!dd->boardname)
			return -ENOMEM;
	}
	return 0;
}
/*
 * Check the interrupt registers to make sure that they are mapped correctly.
 * It is intended to help user identify any mismapping by VMM when the driver
 * is running in a VM.  This function should only be called before interrupt
 * is set up properly.
 *
 * The check masks CceIntMask[0], clears and then forces every bit of the
 * first interrupt status register, and verifies each step by reading the
 * register back.  On success the forced status bits are cleared again and
 * the saved mask is restored, so the registers are left as they were
 * found.  On failure only the saved mask is restored.
 *
 * Return 0 on success, -EINVAL on failure.
 */
static int check_int_registers(struct hfi1_devdata *dd)
{
	u64 reg;
	u64 all_bits = ~(u64)0;
	u64 mask;

	/* Clear CceIntMask[0] to avoid raising any interrupts */
	mask = read_csr(dd, CCE_INT_MASK);
	write_csr(dd, CCE_INT_MASK, 0ull);
	reg = read_csr(dd, CCE_INT_MASK);
	if (reg)
		goto err_exit;

	/* Clear all interrupt status bits */
	write_csr(dd, CCE_INT_CLEAR, all_bits);
	reg = read_csr(dd, CCE_INT_STATUS);
	if (reg)
		goto err_exit;

	/* Set all interrupt status bits */
	write_csr(dd, CCE_INT_FORCE, all_bits);
	reg = read_csr(dd, CCE_INT_STATUS);
	if (reg != all_bits)
		goto err_exit;

	/*
	 * Undo the test: drop the status bits we just forced and put the
	 * original mask back, otherwise every source stays pending and
	 * the mask stays zeroed after a successful check.
	 */
	write_csr(dd, CCE_INT_CLEAR, all_bits);
	write_csr(dd, CCE_INT_MASK, mask);

	return 0;

err_exit:
	write_csr(dd, CCE_INT_MASK, mask);
	dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n");
	return -EINVAL;
}
/** * hfi1_init_dd() - Initialize most of the dd structure. * @dd: the dd device * * This is global, and is called directly at init to set up the * chip-specific function pointers for later use.
*/ int hfi1_init_dd(struct hfi1_devdata *dd)
{ struct pci_dev *pdev = dd->pcidev; struct hfi1_pportdata *ppd;
u64 reg; int i, ret; staticconstchar * const inames[] = { /* implementation names */ "RTL silicon", "RTL VCS simulation", "RTL FPGA emulation", "Functional simulator"
}; struct pci_dev *parent = pdev->bus->self;
u32 sdma_engines = chip_sdma_engines(dd);
ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) { int vl; /* init common fields */
hfi1_init_pportdata(pdev, ppd, dd, 0, 1); /* DC supports 4 link widths */
ppd->link_width_supported =
OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
ppd->link_width_downgrade_supported =
ppd->link_width_supported; /* start out enabling only 4X */
ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
ppd->link_width_downgrade_enabled =
ppd->link_width_downgrade_supported; /* link width active is 0 when link is down */ /* link width downgrade active is 0 when link is down */
if (num_vls < HFI1_MIN_VLS_SUPPORTED ||
num_vls > HFI1_MAX_VLS_SUPPORTED) {
dd_dev_err(dd, "Invalid num_vls %u, using %u VLs\n",
num_vls, HFI1_MAX_VLS_SUPPORTED);
num_vls = HFI1_MAX_VLS_SUPPORTED;
}
ppd->vls_supported = num_vls;
ppd->vls_operational = ppd->vls_supported; /* Set the default MTU. */ for (vl = 0; vl < num_vls; vl++)
dd->vld[vl].mtu = hfi1_max_mtu;
dd->vld[15].mtu = MAX_MAD_PACKET; /* * Set the initial values to reasonable default, will be set * for real when link is up.
*/
ppd->overrun_threshold = 0x4;
ppd->phy_error_threshold = 0xf;
ppd->port_crc_mode_enabled = link_crc_mask; /* initialize supported LTP CRC mode */
ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8; /* initialize enabled LTP CRC mode */
ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4; /* start in offline */
ppd->host_link_state = HLS_DN_OFFLINE;
init_vl_arb_caches(ppd);
}
/* * Do remaining PCIe setup and save PCIe values in dd. * Any error printing is already done by the init code. * On return, we have the chip mapped.
*/
ret = hfi1_pcie_ddinit(dd, pdev); if (ret < 0) goto bail_free;
/* Save PCI space registers to rewrite after device reset */
ret = save_pci_variables(dd); if (ret < 0) goto bail_cleanup;
/* * Check interrupt registers mapping if the driver has no access to * the upstream component. In this case, it is likely that the driver * is running in a VM.
*/ if (!parent) {
ret = check_int_registers(dd); if (ret) goto bail_cleanup;
}
/* * obtain the hardware ID - NOT related to unit, which is a * software enumeration
*/
reg = read_csr(dd, CCE_REVISION2);
dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
& CCE_REVISION2_HFI_ID_MASK; /* the variable size will remove unwanted bits */
dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
dd->icode < ARRAY_SIZE(inames) ?
inames[dd->icode] : "unknown", (int)dd->irev);
/* speeds the hardware can support */
dd->pport->link_speed_supported = OPA_LINK_SPEED_25G; /* speeds allowed to run at */
dd->pport->link_speed_enabled = dd->pport->link_speed_supported; /* give a reasonable active value, will be set on link up */
dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
/* fix up link widths for emulation _p */
ppd = dd->pport; if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
ppd->link_width_supported =
ppd->link_width_enabled =
ppd->link_width_downgrade_supported =
ppd->link_width_downgrade_enabled =
OPA_LINK_WIDTH_1X;
} /* insure num_vls isn't larger than number of sdma engines */ if (HFI1_CAP_IS_KSET(SDMA) && num_vls > sdma_engines) {
dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
num_vls, sdma_engines);
num_vls = sdma_engines;
ppd->vls_supported = sdma_engines;
ppd->vls_operational = ppd->vls_supported;
}
/* * Convert the ns parameter to the 64 * cclocks used in the CSR. * Limit the max if larger than the field holds. If timeout is * non-zero, then the calculated field will be at least 1. * * Must be after icode is set up - the cclock rate depends * on knowing the hardware being used.
*/
dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64; if (dd->rcv_intr_timeout_csr >
RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
dd->rcv_intr_timeout_csr =
RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK; elseif (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
dd->rcv_intr_timeout_csr = 1;
/* needs to be done before we look for the peer device */
read_guid(dd);
/* set up shared ASIC data with peer device */
ret = init_asic_data(dd); if (ret) goto bail_cleanup;
/* obtain chip sizes, reset chip CSRs */
ret = init_chip(dd); if (ret) goto bail_cleanup;
/* read in the PCIe link speed information */
ret = pcie_speeds(dd); if (ret) goto bail_cleanup;
/* call before get_platform_config(), after init_chip_resources() */
ret = eprom_init(dd); if (ret) goto bail_free_rcverr;
/* Needs to be called before hfi1_firmware_init */
get_platform_config(dd);
/* read in firmware */
ret = hfi1_firmware_init(dd); if (ret) goto bail_cleanup;
/* * In general, the PCIe Gen3 transition must occur after the * chip has been idled (so it won't initiate any PCIe transactions * e.g. an interrupt) and before the driver changes any registers * (the transition will reset the registers). * * In particular, place this call after: * - init_chip() - the chip will not initiate any PCIe transactions * - pcie_speeds() - reads the current link speed * - hfi1_firmware_init() - the needed firmware is ready to be * downloaded
*/
ret = do_pcie_gen3_transition(dd); if (ret) goto bail_cleanup;
/* * This should probably occur in hfi1_pcie_init(), but historically * occurs after the do_pcie_gen3_transition() code.
*/
tune_pcie_caps(dd);
/* start setting dd values and adjusting CSRs */
init_early_variables(dd);
parse_platform_config(dd);
ret = obtain_boardname(dd); if (ret) goto bail_cleanup;
/* alloc VNIC/AIP rx data */
ret = hfi1_alloc_rx(dd); if (ret) goto bail_cleanup;
ret = set_up_context_variables(dd); if (ret) goto bail_cleanup;
/* set initial RXE CSRs */
ret = init_rxe(dd); if (ret) goto bail_cleanup;
/* set initial TXE CSRs */
init_txe(dd); /* set initial non-RXE, non-TXE CSRs */
init_other(dd); /* set up KDETH QP prefix in both RX and TX CSRs */
init_kdeth_qp(dd);
ret = hfi1_dev_affinity_init(dd); if (ret) goto bail_cleanup;
/* send contexts must be set up before receive contexts */
ret = init_send_contexts(dd); if (ret) goto bail_cleanup;
ret = hfi1_create_kctxts(dd); if (ret) goto bail_cleanup;
/* * Initialize aspm, to be done after gen3 transition and setting up * contexts and before enabling interrupts
*/
aspm_init(dd);
ret = init_pervl_scs(dd); if (ret) goto bail_cleanup;
/* sdma init */ for (i = 0; i < dd->num_pports; ++i) {
ret = sdma_init(dd, i); if (ret) goto bail_cleanup;
}
/* use contexts created by hfi1_create_kctxts */
ret = set_up_interrupts(dd); if (ret) goto bail_cleanup;
ret = hfi1_comp_vectors_set_up(dd); if (ret) goto bail_clear_intr;
/* set up LCB access - must be after set_up_interrupts() */
init_lcb_access(dd);
/* * Serial number is created from the base guid: * [27:24] = base guid [38:35] * [23: 0] = base guid [23: 0]
*/
snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
(dd->base_guid & 0xFFFFFF) |
((dd->base_guid >> 11) & 0xF000000));
/** * create_pbc - build a pbc for transmission * @ppd: info of physical Hfi port * @flags: special case flags or-ed in built pbc * @srate_mbs: static rate * @vl: vl * @dw_len: dword length (header words + data words + pbc words) * * Create a PBC with the given flags, rate, VL, and length. * * NOTE: The PBC created will not insert any HCRC - all callers but one are * for verbs, which does not use this PSM feature. The lone other caller * is for the diagnostic interface which calls this if the user does not * supply their own PBC.
*/
u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
u32 dw_len)
{
u64 pbc, delay = 0;
if (unlikely(srate_mbs))
delay = delay_cycles(ppd, srate_mbs, dw_len);
/* * Initialize the thermal sensor. * * After initialization, enable polling of thermal sensor through * SBus interface. In order for this to work, the SBus Master * firmware has to be loaded due to the fact that the HW polling * logic uses SBus interrupts, which are not supported with * default firmware. Otherwise, no data will be returned through * the ASIC_STS_THERM CSR.
*/ staticint thermal_init(struct hfi1_devdata *dd)
{ int ret = 0;
if (dd->icode != ICODE_RTL_SILICON ||
check_chip_resource(dd, CR_THERM_INIT, NULL)) return ret;
ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); if (ret) {
THERM_FAILURE(dd, ret, "Acquire SBus"); return ret;
}
dd_dev_info(dd, "Initializing thermal sensor\n"); /* Disable polling of thermal readings */
write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
msleep(100); /* Thermal Sensor Initialization */ /* Step 1: Reset the Thermal SBus Receiver */
ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
RESET_SBUS_RECEIVER, 0); if (ret) {
THERM_FAILURE(dd, ret, "Bus Reset"); goto done;
} /* Step 2: Set Reset bit in Thermal block */
ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
WRITE_SBUS_RECEIVER, 0x1); if (ret) {
THERM_FAILURE(dd, ret, "Therm Block Reset"); goto done;
} /* Step 3: Write clock divider value (100MHz -> 2MHz) */
ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1,
WRITE_SBUS_RECEIVER, 0x32); if (ret) {
THERM_FAILURE(dd, ret, "Write Clock Div"); goto done;
} /* Step 4: Select temperature mode */
ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3,
WRITE_SBUS_RECEIVER,
SBUS_THERM_MONITOR_MODE); if (ret) {
THERM_FAILURE(dd, ret, "Write Mode Sel"); goto done;
} /* Step 5: De-assert block reset and start conversion */
ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
WRITE_SBUS_RECEIVER, 0x2); if (ret) {
THERM_FAILURE(dd, ret, "Write Reset Deassert"); goto done;
} /* Step 5.1: Wait for first conversion (21.5ms per spec) */
msleep(22);
/* Enable polling of thermal readings */
write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
/* Set initialized flag */
ret = acquire_chip_resource(dd, CR_THERM_INIT, 0); if (ret)
THERM_FAILURE(dd, ret, "Unable to set thermal init flag");
/*
 * Thermal critical interrupt handling.
 *
 * Forces the device into freeze mode, drops the link to offline and shuts
 * the DC down.  This is an emergency path meant to protect the chip, not a
 * graceful shutdown.
 *
 * @dd: the device that reported the critical temperature
 */
static void handle_temp_err(struct hfi1_devdata *dd)
{
	/* Only the first port of the device is acted upon. */
	struct hfi1_pportdata *port = dd->pport;

	dd_dev_emerg(dd, "Critical temperature reached! Forcing device into freeze mode!\n");

	/* Record the forced freeze before kicking off freeze handling. */
	dd->flags = dd->flags | HFI1_FORCED_FREEZE;
	start_freeze_handling(port, FREEZE_SELF | FREEZE_ABORT);

	/*
	 * Shut DC down as much and as quickly as possible.
	 *
	 * Step 1: Take the link down to OFFLINE.  This will cause the
	 *         8051 to put the Serdes in reset.  The full link state
	 *         machine is deliberately bypassed because the goal is to
	 *         shut down ASAP; this is an attempt to save the chip, not
	 *         a graceful shutdown.  The sequence is almost the same as
	 *         quiet_serdes() minus the extra work and the sleeps.
	 */
	port->driver_link_ready = 0;
	port->link_enabled = 0;
	set_physical_link_state(dd,
				PLS_OFFLINE |
				(OPA_LINKDOWN_REASON_SMA_DISABLED << 8));

	/*
	 * Step 2: Shutdown LCB and 8051.
	 *         After shutdown, do not restore DC_CFG_RESET value.
	 */
	dc_shutdown(dd);
}
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.611 Sekunden
(vorverarbeitet am 2026-04-29)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.