/* * Copyright 2023 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. *
*/ #include"umc_v12_0.h" #include"amdgpu_ras.h" #include"amdgpu_umc.h" #include"amdgpu.h" #include"umc/umc_12_0_0_offset.h" #include"umc/umc_12_0_0_sh_mask.h" #include"mp/mp_13_0_6_sh_mask.h"
/* NOTE: node_inst is converted by adev->umc.active_mask and the range is [0-3],
 * which can be used as die ID directly */
/* Identify the MCM (multi-chip-module) location of this UMC instance:
 * socket comes from the SMUIO callback, die is the node instance itself. */
struct amdgpu_smuio_mcm_config_info mcm_info = {
	.socket_id = adev->smuio.funcs->get_socket_id(adev),
	.die_id = node_inst,
};
/* Select the physical-address "flip" bit used for bad-page retirement,
 * which depends on the VRAM type and (for HBM) the NPS partition mode.
 * Fix: the mangled token `elseif` is not valid C — restored `else if`. */
switch (vram_type) {
case AMDGPU_VRAM_TYPE_HBM:
	/* other nps modes are taken as nps1 */
	if (nps == AMDGPU_NPS2_PARTITION_MODE)
		flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT;
	else if (nps == AMDGPU_NPS4_PARTITION_MODE)
		flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT;
	break;
case AMDGPU_VRAM_TYPE_HBM3E:
	flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT;
	/* HBM3E additionally relocates the extra row bit to R12 */
	flip_bits->flip_row_bit = 12;
/* Start from the converted SoC physical address and channel index. */
soc_pa = paddr_out->pa.pa;
channel_index = paddr_out->pa.channel_idx;
/* clear loop bits in soc physical address */
for (i = 0; i < bit_num; i++)
	soc_pa &= ~BIT_ULL(flip_bits[i]);
paddr_out->pa.pa = soc_pa;
/* get column bit 0 and 1 in mca address */
col_lower = (err_addr >> 1) & 0x3ULL;
/* extra row bit will be handled later */
/* lower 13 row bits come straight from the MCA address ... */
row_lower = (err_addr >> UMC_V12_0_MA_R0_BIT) & 0x1fffULL;
/* ... except the flip row bit, which is cleared here and re-derived
 * per retirement column in the loop below. */
row_lower &= ~BIT_ULL(adev->umc.flip_bits.flip_row_bit);

/* NOTE(review): GC IP >= 9.5.0 presumably means a larger row address —
 * confirm against the register spec for that generation. */
if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 5, 0)) {
	row_high = (soc_pa >> adev->umc.flip_bits.r13_in_pa) & 0x3ULL;
	/* it's 2.25GB in each channel, from MCA address to PA
	 * [R14 R13] is converted if the two bits value are 0x3,
	 * get them from PA instead of MCA address.
	 */
	row_lower |= (row_high << 13);
}
/* Nothing to report and nothing to print — skip the expansion loop. */
if (!err_data && !dump_addr)
	goto out;

/* loop for all possibilities of retired bits */
for (column = 0; column < retire_unit; column++) {
	/* Rebuild the candidate PA: base address OR'ed with this
	 * column's combination of the flip bits. */
	soc_pa = paddr_out->pa.pa;
	for (i = 0; i < bit_num; i++)
		soc_pa |= (((column >> i) & 0x1ULL) << flip_bits[i]);

	/* Column = 3 flip bits (upper) + the 2 bits taken from the MCA
	 * address (lower). */
	col = ((column & 0x7) << 2) | col_lower;
	/* handle extra row bit */
	if (bit_num == RETIRE_FLIP_BITS_NUM)
		row = ((column >> 3) << adev->umc.flip_bits.flip_row_bit) |
			row_lower;

	if (dump_addr)
		dev_info(adev->dev, "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
			soc_pa, row, col, bank, channel_index);

	/* Record one retirement entry per candidate address. */
	if (err_data)
		amdgpu_umc_fill_error_record(err_data, err_addr,
			soc_pa, channel_index, umc_inst);
}
/* set ce error interrupt type to APIC based interrupt */
odecc_cnt_sel = REG_SET_FIELD(odecc_cnt_sel, UMCCH0_OdEccCntSel,
				OdEccErrInt, 0x1);
/* NOTE(review): register offsets appear to be dword indices, hence the
 * *4 to form a byte address for the PCIe write — confirm. */
WREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4, odecc_cnt_sel);

/* set error count to initial value */
WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V12_0_CE_CNT_INIT);
/**
 * umc_v12_0_query_ras_poison_mode - report whether RAS poison mode is on
 * @adev: amdgpu device handle (unused; kept for the callback signature)
 *
 * Always reports poison mode as enabled, because regUMCCH0_EccCtrl is not
 * accessible from the host side to query the real setting.
 *
 * Fix: the mangled tokens `staticbool` and `returntrue` are not valid C —
 * restored `static bool` and `return true;`.
 */
static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev)
{
	/*
	 * Force return true, because regUMCCH0_EccCtrl
	 * is not accessible from host side
	 */
	return true;
}
/* The IP block decode of consumption is SMU */
/* Reject entries that are not from the v12.0 UMC hardware block; count
 * them as consumption-queue events and report success. */
if (hwid != MCA_UMC_HWID_V12_0 || mcatype != MCA_UMC_MCATYPE_V12_0) {
	con->umc_ecc_log.consumption_q_count++;
	return 0;
}

/* An empty status register means no error to process. */
if (!status)
	return 0;

/* Only deferred errors are handled by this path. */
if (!umc_v12_0_is_deferred_error(adev, status))
	return 0;
/* Reserve memory */
/* Reserve every retired page frame so it is never handed out again. */
for (i = 0; i < count; i++)
	amdgpu_ras_reserve_page(adev, page_pfn[i]);
/* The problem case is as follows:
 * 1. GPU A triggers a gpu ras reset, and GPU A drives
 * GPU B to also perform a gpu ras reset.
 * 2. After gpu B ras reset started, gpu B queried a DE
 * data. Since the DE data was queried in the ras reset
 * thread instead of the page retirement thread, bad
 * page retirement work would not be triggered. Then
 * even if all gpu resets are completed, the bad pages
 * will be cached in RAM until GPU B's bad page retirement
 * work is triggered again and then saved to eeprom.
 * Trigger delayed work to save the bad pages to eeprom in time
 * after gpu ras reset is completed.
 */
if (amdgpu_ras_in_recovery(adev))
	schedule_delayed_work(&con->page_retirement_dwork,
		msecs_to_jiffies(DELAYED_TIME_FOR_GPU_RESET));
/* Append one error record per retired page; stop on the first failure
 * (amdgpu_umc_fill_error_record returns non-zero on error). */
for (i = 0; i < count; i++) {
	ret = amdgpu_umc_fill_error_record(err_data,
			ecc_err->addr,
			page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
			ecc_err->channel_idx,
			MCA_IPID_2_UMC_INST(ecc_err->ipid));
	if (ret)
		break;
}
/* we only calculate die id for nps1 mode right now */
/* Die ID bit 0: XOR-fold of a fixed set of physical-address bits.
 * NOTE(review): the selected bit positions presumably mirror the
 * hardware address-hashing scheme — confirm against the UMC spec. */
die += ((((retired_page >> 12) & 0x1ULL) ^
	((retired_page >> 20) & 0x1ULL) ^
	((retired_page >> 27) & 0x1ULL) ^
	((retired_page >> 34) & 0x1ULL) ^
	((retired_page >> 41) & 0x1ULL)) << 0);
/* the original PA_C4 and PA_R13 may be cleared in retired_page, so
 * get them from mca_addr.
 */
/* Die ID bit 1: same XOR-fold, with two bits sourced from mca_addr. */
die += ((((retired_page >> 13) & 0x1ULL) ^
	((mca_addr >> 5) & 0x1ULL) ^
	((retired_page >> 28) & 0x1ULL) ^
	((mca_addr >> 23) & 0x1ULL) ^
	((retired_page >> 42) & 0x1ULL)) << 1);
/* Clamp to the valid die range [0-3]. */
die &= 3;
/*
 * NOTE(review): the following German website disclaimer is extraneous text
 * accidentally appended to this source file (it would not compile as C).
 * Preserved as a comment pending removal:
 * "Die Informationen auf dieser Webseite wurden nach bestem Wissen
 * sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch
 * Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.
 * Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
 * experimentell."
 */