/* * Copyright 2019 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. *
*/
/* * DO NOT use these for err/warn/info/debug messages. * Use dev_err, dev_warn, dev_info and dev_dbg instead. * They are more MGPU friendly.
*/ #undef pr_err #undef pr_warn #undef pr_info #undef pr_debug
/* possible frequency drift (1Mhz) */ #define EPSILON 1
#define smnPCIE_ESM_CTRL 0x111003D0
/* * SMU supports ECCTABLE since version 68.42.0, * use this to check whether the ECCTABLE feature is supported
*/ #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
/* * SMU supports mca_ceumc_addr in ECCTABLE since version 68.55.0, * use this to check whether the mca_ceumc_addr record is supported
*/ #define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700
/* * SMU supports the BAD CHANNEL info MSG since version 68.51.00, * use this to check whether the bad-channel-info message is supported
*/ #define SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION 0x00443300
/* The message only works on master die and NACK will be sent * back for other dies, only send it on master die.
*/ if (adev->smuio.funcs->get_socket_id(adev) ||
adev->smuio.funcs->get_die_id(adev)) return 0;
/* NOTE(review): the lines below appear to belong to a different helper
 * (likely an "is primary die" predicate) whose opening is not visible in
 * this chunk — confirm against the full file before editing.
 */
if (adev->smuio.funcs && adev->smuio.funcs->get_die_id) return adev->smuio.funcs->get_die_id(adev) == 0;
/* Fall back to treating the device as primary when no die-id callback
 * exists. NOTE(review): "returntrue" is whitespace-mangled "return true"
 * — restore the space when reformatting this file.
 */
returntrue;
}
/*
 * aldebaran_run_board_btc - trigger board power calibration on the SMU.
 * @smu: SMU context
 *
 * Only the primary die accepts this message, and only PMFW versions newer
 * than 0x00441d00 implement it; in either other case the calibration is
 * silently skipped and 0 is returned.
 *
 * Return: 0 on success or when skipped, negative errno from the SMU
 * message transport otherwise.
 */
static int aldebaran_run_board_btc(struct smu_context *smu)
{
	int ret;

	/* Secondary dies NACK this message; send it only on the primary. */
	if (!aldebaran_is_primary(smu))
		return 0;

	/* Board power calibration is unsupported on older PMFW. */
	if (smu->smc_fw_version <= 0x00441d00)
		return 0;

	ret = smu_cmn_send_smc_msg(smu, SMU_MSG_BoardPowerCalibration, NULL);
	if (ret)
		dev_err(smu->adev->dev, "Board power calibration failed!\n");

	return ret;
}
/*
 * aldebaran_run_btc - run DC BTC (board tuning/calibration), then the
 * board power calibration on success.
 *
 * NOTE(review): the trailing "return ret;" and closing brace of this
 * function are not visible in this chunk; "staticint" is whitespace-mangled
 * "static int".
 */
staticint aldebaran_run_btc(struct smu_context *smu)
{ int ret;
ret = smu_cmn_send_smc_msg(smu, SMU_MSG_RunDcBtc, NULL); if (ret)
dev_err(smu->adev->dev, "RunDcBtc failed!\n"); else
ret = aldebaran_run_board_btc(smu);
/* NOTE(review): fragment of a clock-id -> metrics-member mapping; the
 * enclosing function is not visible in this chunk. CurrClock values are
 * accurate only while the corresponding DPM feature is enabled, so each
 * case falls back to the Average_* metric when the feature is off.
 */
switch (clk_id) { case PPCLK_GFXCLK: /* * CurrClock[clk_id] can provide accurate * output only when the dpm feature is enabled. * We can use Average_* for dpm disabled case. * But this is available for gfxclk/uclk/socclk/vclk/dclk.
*/ if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_GFXCLK_BIT))
member_type = METRICS_CURR_GFXCLK; else
member_type = METRICS_AVERAGE_GFXCLK; break; case PPCLK_UCLK: if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_UCLK_BIT))
member_type = METRICS_CURR_UCLK; else
member_type = METRICS_AVERAGE_UCLK; break; case PPCLK_SOCCLK: if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_SOCCLK_BIT))
member_type = METRICS_CURR_SOCCLK; else
member_type = METRICS_AVERAGE_SOCCLK; break; case PPCLK_VCLK: if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_VCN_PG_BIT))
member_type = METRICS_CURR_VCLK; else
/* VCLK/DCLK gate on VCN power-gating rather than a per-clock DPM bit. */
member_type = METRICS_AVERAGE_VCLK; break; case PPCLK_DCLK: if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_VCN_PG_BIT))
member_type = METRICS_CURR_DCLK; else
member_type = METRICS_AVERAGE_DCLK; break; case PPCLK_FCLK:
member_type = METRICS_CURR_FCLK; break; default: return -EINVAL;
}
/* NOTE(review): fragment of a clock-level listing loop; the enclosing
 * function and the remainder of the loop body are not visible here. A
 * level is presumably flagged as "current" when it sits within EPSILON
 * of cur_value, or unconditionally when only one level exists — confirm
 * against aldebaran_freqs_in_same_level().
 */
case SMU_MCLK: case SMU_SOCCLK: case SMU_FCLK: case SMU_VCLK: case SMU_DCLK: for (i = 0; i < clocks.num_levels; i++) {
clock_mhz = clocks.data[i].clocks_in_khz / 1000;
freq_match = aldebaran_freqs_in_same_level(clock_mhz, cur_value);
freq_match |= (clocks.num_levels == 1);
/* NOTE(review): fragment of a force-clock-levels handler; the enclosing
 * function is not visible in this chunk. For SCLK the requested
 * [soft_min_level, soft_max_level] range is validated against the gfx
 * DPM table and pushed to firmware as new min/max boot levels.
 */
switch (type) { case SMU_SCLK:
single_dpm_table = &(dpm_context->dpm_tables.gfx_table); if (soft_max_level >= single_dpm_table->count) {
dev_err(smu->adev->dev, "Clock level specified %d is over max allowed %d\n",
soft_max_level, single_dpm_table->count - 1);
ret = -EINVAL; break;
}
/* Program the soft minimum first (second arg false = min level). */
ret = aldebaran_upload_dpm_level(smu, false,
FEATURE_MASK(FEATURE_DPM_GFXCLK_BIT),
soft_min_level); if (ret) {
dev_err(smu->adev->dev, "Failed to upload boot level to lowest!\n"); break;
}
/* Then the soft maximum (second arg true = max level). */
ret = aldebaran_upload_dpm_level(smu, true,
FEATURE_MASK(FEATURE_DPM_GFXCLK_BIT),
soft_max_level); if (ret)
dev_err(smu->adev->dev, "Failed to upload dpm max level to highest!\n");
break;
case SMU_MCLK: case SMU_SOCCLK: case SMU_FCLK: /* * Should not arrive here since aldebaran does not * support mclk/socclk/fclk softmin/softmax settings
*/
ret = -EINVAL; break;
/*
 * aldebaran_get_current_activity_percent - read GFX or MEM busy percent.
 * @smu: SMU context
 * @sensor: AMDGPU_PP_SENSOR_GPU_LOAD or AMDGPU_PP_SENSOR_MEM_LOAD
 * @value: output, average activity as reported by the SMU metrics table
 *
 * Return: 0 on success, -EINVAL for a NULL output pointer or an
 * unsupported sensor, otherwise the error from the metrics read.
 */
static int aldebaran_get_current_activity_percent(struct smu_context *smu,
						  enum amd_pp_sensors sensor,
						  uint32_t *value)
{
	int ret = 0;

	if (!value)
		return -EINVAL;

	switch (sensor) {
	case AMDGPU_PP_SENSOR_GPU_LOAD:
		ret = aldebaran_get_smu_metrics_data(smu,
						     METRICS_AVERAGE_GFXACTIVITY,
						     value);
		break;
	case AMDGPU_PP_SENSOR_MEM_LOAD:
		ret = aldebaran_get_smu_metrics_data(smu,
						     METRICS_AVERAGE_MEMACTIVITY,
						     value);
		break;
	default:
		dev_err(smu->adev->dev, "Invalid sensor for retrieving clock activity\n");
		return -EINVAL;
	}

	return ret;
}
/*
 * aldebaran_thermal_get_temperature - read a temperature sensor.
 * @smu: SMU context
 * @sensor: one of AMDGPU_PP_SENSOR_{HOTSPOT,EDGE,MEM}_TEMP
 * @value: output, temperature as reported by the SMU metrics table
 *
 * Return: 0 on success, -EINVAL for a NULL output pointer or an
 * unsupported sensor, otherwise the error from the metrics read.
 */
static int aldebaran_thermal_get_temperature(struct smu_context *smu,
					     enum amd_pp_sensors sensor,
					     uint32_t *value)
{
	int ret = 0;

	if (!value)
		return -EINVAL;

	switch (sensor) {
	case AMDGPU_PP_SENSOR_HOTSPOT_TEMP:
		ret = aldebaran_get_smu_metrics_data(smu,
						     METRICS_TEMPERATURE_HOTSPOT,
						     value);
		break;
	case AMDGPU_PP_SENSOR_EDGE_TEMP:
		ret = aldebaran_get_smu_metrics_data(smu,
						     METRICS_TEMPERATURE_EDGE,
						     value);
		break;
	case AMDGPU_PP_SENSOR_MEM_TEMP:
		ret = aldebaran_get_smu_metrics_data(smu,
						     METRICS_TEMPERATURE_MEM,
						     value);
		break;
	default:
		dev_err(smu->adev->dev, "Invalid sensor for retrieving temp\n");
		return -EINVAL;
	}

	return ret;
}
/*
 * aldebaran_read_sensor - generic sensor read entry point, dispatching to
 * the activity/temperature/clock helpers. All outputs are 4-byte values.
 *
 * NOTE(review): the trailing "return ret;" and closing brace are not
 * visible in this chunk; "staticint" is whitespace-mangled "static int".
 */
staticint aldebaran_read_sensor(struct smu_context *smu, enum amd_pp_sensors sensor, void *data, uint32_t *size)
{ int ret = 0;
/* During a RAS interrupt storm, skip sensor traffic entirely. */
if (amdgpu_ras_intr_triggered()) return 0;
if (!data || !size) return -EINVAL;
switch (sensor) { case AMDGPU_PP_SENSOR_MEM_LOAD: case AMDGPU_PP_SENSOR_GPU_LOAD:
ret = aldebaran_get_current_activity_percent(smu,
sensor,
(uint32_t *)data);
*size = 4; break; case AMDGPU_PP_SENSOR_GPU_AVG_POWER:
ret = aldebaran_get_smu_metrics_data(smu,
METRICS_AVERAGE_SOCKETPOWER,
(uint32_t *)data);
*size = 4; break; case AMDGPU_PP_SENSOR_HOTSPOT_TEMP: case AMDGPU_PP_SENSOR_EDGE_TEMP: case AMDGPU_PP_SENSOR_MEM_TEMP:
ret = aldebaran_thermal_get_temperature(smu, sensor,
(uint32_t *)data);
*size = 4; break; case AMDGPU_PP_SENSOR_GFX_MCLK:
ret = aldebaran_get_current_clk_freq_by_table(smu, SMU_UCLK, (uint32_t *)data); /* the output clock frequency in 10K unit */
*(uint32_t *)data *= 100;
*size = 4; break; case AMDGPU_PP_SENSOR_GFX_SCLK:
ret = aldebaran_get_current_clk_freq_by_table(smu, SMU_GFXCLK, (uint32_t *)data);
/* Convert MHz to the expected 10 kHz unit. */
*(uint32_t *)data *= 100;
*size = 4; break; case AMDGPU_PP_SENSOR_VDDGFX:
ret = smu_v13_0_get_gfx_vdd(smu, (uint32_t *)data);
*size = 4; break; case AMDGPU_PP_SENSOR_GPU_INPUT_POWER: default:
ret = -EOPNOTSUPP; break;
}
/* NOTE(review): fragment of a power-limit query; the function opening is
 * not visible in this chunk. When the PPT feature is disabled every
 * requested limit is reported as 0; otherwise the current limit comes
 * from firmware (primary die only) with the pptable value as fallback.
 */
if (!smu_cmn_feature_is_enabled(smu, SMU_FEATURE_PPT_BIT)) { if (current_power_limit)
*current_power_limit = 0; if (default_power_limit)
*default_power_limit = 0; if (max_power_limit)
*max_power_limit = 0; if (min_power_limit)
*min_power_limit = 0;
dev_warn(smu->adev->dev, "PPT feature is not enabled, power values can't be fetched.");
return 0;
}
/* Valid power data is available only from primary die. * For secondary die show the value as 0.
*/ if (aldebaran_is_primary(smu)) {
ret = smu_cmn_send_smc_msg(smu, SMU_MSG_GetPptLimit,
&power_limit);
if (ret) { /* the last hope to figure out the ppt limit */ if (!pptable) {
dev_err(smu->adev->dev, "Cannot get PPT limit due to pptable missing!"); return -EINVAL;
}
power_limit = pptable->PptLimit;
}
}
if (current_power_limit)
*current_power_limit = power_limit; if (default_power_limit)
*default_power_limit = power_limit;
if (max_power_limit) { if (pptable)
*max_power_limit = pptable->PptLimit;
}
if (min_power_limit)
*min_power_limit = 0;
return 0;
}
/*
 * aldebaran_set_power_limit - apply a power (PPT) limit.
 * @smu: SMU context
 * @limit_type: which limit to program
 * @limit: new limit value
 *
 * Only the primary die accepts power-limit programming; requests on
 * secondary dies fail with -EINVAL.
 */
static int aldebaran_set_power_limit(struct smu_context *smu,
				     enum smu_ppt_limit_type limit_type,
				     uint32_t limit)
{
	/* Power limit can be set only through primary die */
	if (aldebaran_is_primary(smu))
		return smu_v13_0_set_power_limit(smu, limit_type, limit);

	return -EINVAL;
}
/*
 * aldebaran_system_features_control - enable/disable SMU features, running
 * BTC calibration after a successful enable.
 *
 * NOTE(review): the trailing "return ret;" and closing brace are not
 * visible in this chunk; "staticint" is whitespace-mangled "static int".
 */
staticint aldebaran_system_features_control(struct smu_context *smu, bool enable)
{ int ret;
ret = smu_v13_0_system_features_control(smu, enable); if (!ret && enable)
ret = aldebaran_run_btc(smu);
/* NOTE(review): fragment of a set-performance-level handler; the function
 * opening and closing are not visible in this chunk.
 */
/* Disable determinism if switching to another mode */ if ((smu_dpm->dpm_level == AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM) &&
(level != AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM)) {
smu_cmn_send_smc_msg(smu, SMU_MSG_DisableDeterminism, NULL);
/* Restore the full gfx range once determinism is turned off. */
pstate_table->gfxclk_pstate.curr.max = gfx_table->max;
}
switch (level) {
/* Determinism mode is fully configured elsewhere; nothing more to do. */
case AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM: return 0; case AMD_DPM_FORCED_LEVEL_AUTO:
r = smu_v13_0_set_performance_level(smu, level); if (!r)
smu_v13_0_reset_custom_level(smu); return r; case AMD_DPM_FORCED_LEVEL_HIGH: case AMD_DPM_FORCED_LEVEL_LOW: case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD: case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_SCLK: case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_MCLK: case AMD_DPM_FORCED_LEVEL_PROFILE_PEAK: default: break;
}
/* NOTE(review): fragment of a soft-frequency-range setter; the function
 * opening and closing are not visible in this chunk. Only GFXCLK is
 * supported, and only in manual or determinism mode. In determinism mode
 * the default min/max are restored before EnableDeterminism is sent with
 * the requested max.
 */
if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) return -EINVAL;
if ((smu_dpm->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL)
&& (smu_dpm->dpm_level != AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM)) return -EINVAL;
if (smu_dpm->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL) { if (min >= max) {
dev_err(smu->adev->dev, "Minimum GFX clk should be less than the maximum allowed clock\n"); return -EINVAL;
}
/* Skip the firmware round-trip when the range is unchanged. */
if ((min == pstate_table->gfxclk_pstate.curr.min) &&
(max == pstate_table->gfxclk_pstate.curr.max)) return 0;
ret = smu_v13_0_set_soft_freq_limited_range(smu, SMU_GFXCLK,
min, max, false); if (!ret) {
pstate_table->gfxclk_pstate.curr.min = min;
pstate_table->gfxclk_pstate.curr.max = max;
}
return ret;
}
if (smu_dpm->dpm_level == AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM) { if (!max || (max < dpm_context->dpm_tables.gfx_table.min) ||
(max > dpm_context->dpm_tables.gfx_table.max)) {
dev_warn(adev->dev, "Invalid max frequency %d MHz specified for determinism\n", max); return -EINVAL;
}
/* Restore default min/max clocks and enable determinism */
min_clk = dpm_context->dpm_tables.gfx_table.min;
max_clk = dpm_context->dpm_tables.gfx_table.max;
ret = smu_v13_0_set_soft_freq_limited_range(smu, SMU_GFXCLK, min_clk, max_clk, false); if (!ret) {
/* Give firmware a moment to settle before enabling determinism. */
usleep_range(500, 1000);
ret = smu_cmn_send_smc_msg_with_param(smu,
SMU_MSG_EnableDeterminism,
max, NULL); if (ret) {
dev_err(adev->dev, "Failed to enable determinism at GFX clock %d MHz\n", max);
} else {
pstate_table->gfxclk_pstate.curr.min = min_clk;
pstate_table->gfxclk_pstate.curr.max = max;
}
}
}
/* NOTE(review): fragment of an overdrive-table edit handler; the function
 * opening and closing are not visible in this chunk. "elseif" below is
 * whitespace-mangled "else if" — restore the space when reformatting.
 * input[0] selects min (0) or max (1); input[1] is the clock in MHz,
 * validated against the gfx DPM table bounds.
 */
/* Only allowed in manual or determinism mode */ if ((smu_dpm->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL)
&& (smu_dpm->dpm_level != AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM)) return -EINVAL;
switch (type) { case PP_OD_EDIT_SCLK_VDDC_TABLE: if (size != 2) {
dev_err(smu->adev->dev, "Input parameter number not correct\n"); return -EINVAL;
}
if (input[0] == 0) { if (input[1] < dpm_context->dpm_tables.gfx_table.min) {
dev_warn(smu->adev->dev, "Minimum GFX clk (%ld) MHz specified is less than the minimum allowed (%d) MHz\n",
input[1], dpm_context->dpm_tables.gfx_table.min);
/* Reject and reset the custom min back to the current value. */
pstate_table->gfxclk_pstate.custom.min =
pstate_table->gfxclk_pstate.curr.min; return -EINVAL;
}
pstate_table->gfxclk_pstate.custom.min = input[1];
} elseif (input[0] == 1) { if (input[1] > dpm_context->dpm_tables.gfx_table.max) {
dev_warn(smu->adev->dev, "Maximum GFX clk (%ld) MHz specified is greater than the maximum allowed (%d) MHz\n",
input[1], dpm_context->dpm_tables.gfx_table.max);
pstate_table->gfxclk_pstate.custom.max =
pstate_table->gfxclk_pstate.curr.max; return -EINVAL;
}
pstate_table->gfxclk_pstate.custom.max = input[1];
} else { return -EINVAL;
} break; case PP_OD_RESTORE_DEFAULT_TABLE: if (size != 0) {
dev_err(smu->adev->dev, "Input parameter number not correct\n"); return -EINVAL;
} else { /* Use the default frequencies for manual and determinism mode */
min_clk = dpm_context->dpm_tables.gfx_table.min;
max_clk = dpm_context->dpm_tables.gfx_table.max;
ret = aldebaran_set_soft_freq_limited_range(
smu, SMU_GFXCLK, min_clk, max_clk, false); if (ret) return ret;
smu_v13_0_reset_custom_level(smu);
} break; case PP_OD_COMMIT_DPM_TABLE: if (size != 0) {
dev_err(smu->adev->dev, "Input parameter number not correct\n"); return -EINVAL;
} else { if (!pstate_table->gfxclk_pstate.custom.min)
/* Fall back to the current range for any unset custom bound. */
pstate_table->gfxclk_pstate.custom.min =
pstate_table->gfxclk_pstate.curr.min;
if (!pstate_table->gfxclk_pstate.custom.max)
pstate_table->gfxclk_pstate.custom.max =
pstate_table->gfxclk_pstate.curr.max;
/* NOTE(review): fragment of an SMU software-I2C request builder; the
 * enclosing loop opening and function are not visible in this chunk.
 */
if ((dir ^ msg[i].flags) & I2C_M_RD) { /* The direction changes.
*/
/* A read/write direction flip requires a repeated-START condition. */
dir = msg[i].flags & I2C_M_RD;
cmd->CmdConfig |= CMDCONFIG_RESTART_MASK;
}
req->NumCmds++;
/* * Insert STOP if we are at the last byte of either last * message for the transaction or the client explicitly * requires a STOP at this particular message.
*/ if ((j == msg[i].len - 1) &&
((i == num_msgs - 1) || (msg[i].flags & I2C_M_STOP))) {
cmd->CmdConfig &= ~CMDCONFIG_RESTART_MASK;
cmd->CmdConfig |= CMDCONFIG_STOP_MASK;
}
}
}
/* NOTE(review): fragment of the I2C transfer path; the enclosing function
 * and the read-back loop body are not visible in this chunk. The request
 * table is pushed to the SMU, then read data is copied back for each
 * I2C_M_RD message.
 */
mutex_lock(&adev->pm.mutex);
r = smu_cmn_update_table(smu, SMU_TABLE_I2C_COMMANDS, 0, req, true); if (r) goto fail;
/* c tracks the flat command index across all messages. */
for (c = i = 0; i < num_msgs; i++) { if (!(msg[i].flags & I2C_M_RD)) {
c += msg[i].len; continue;
} for (j = 0; j < msg[i].len; j++, c++) {
SwI2cCmd_t *cmd = &res->SwI2cCmds[c];
/* NOTE(review): fragment of a gpu_metrics population routine; the
 * enclosing function is not visible in this chunk.
 */
/* Valid power data is available only from primary die */ if (aldebaran_is_primary(smu)) {
gpu_metrics->average_socket_power = metrics.AverageSocketPower;
/* Reassemble the 64-bit energy accumulator from its two 32-bit halves. */
gpu_metrics->energy_accumulator =
(uint64_t)metrics.EnergyAcc64bitHigh << 32 |
metrics.EnergyAcc64bitLow;
} else {
gpu_metrics->average_socket_power = 0;
gpu_metrics->energy_accumulator = 0;
}
/* NOTE(review): fragment of an ECC-table extraction routine; the function
 * opening is not visible in this chunk. "elseif" below is
 * whitespace-mangled "else if". V2 tables additionally carry the
 * correctable-error UMC address (mca_ceumc_addr).
 */
if (table_version == 1) { for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
ecc_info_per_channel = &(eccinfo->ecc[i]);
ecc_info_per_channel->ce_count_lo_chip =
ecc_table->EccInfo[i].ce_count_lo_chip;
ecc_info_per_channel->ce_count_hi_chip =
ecc_table->EccInfo[i].ce_count_hi_chip;
ecc_info_per_channel->mca_umc_status =
ecc_table->EccInfo[i].mca_umc_status;
ecc_info_per_channel->mca_umc_addr =
ecc_table->EccInfo[i].mca_umc_addr;
}
} elseif (table_version == 2) { for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
ecc_info_per_channel = &(eccinfo->ecc[i]);
ecc_info_per_channel->ce_count_lo_chip =
ecc_table->EccInfo_V2[i].ce_count_lo_chip;
ecc_info_per_channel->ce_count_hi_chip =
ecc_table->EccInfo_V2[i].ce_count_hi_chip;
ecc_info_per_channel->mca_umc_status =
ecc_table->EccInfo_V2[i].mca_umc_status;
ecc_info_per_channel->mca_umc_addr =
ecc_table->EccInfo_V2[i].mca_umc_addr;
ecc_info_per_channel->mca_ceumc_addr =
ecc_table->EccInfo_V2[i].mca_ceumc_addr;
}
eccinfo->record_ce_addr_supported = 1;
}
return ret;
}
/*
 * aldebaran_mode1_reset - request a mode-1 GPU reset from the PMFW.
 *
 * NOTE(review): this chunk appears to be missing code — the else branch
 * computes fatal_err/param but no message send for the
 * SMU_MSG_GfxDeviceDriverReset path is visible, and the brace structure
 * does not balance. Compare against the full file before editing.
 * "staticint" is whitespace-mangled "static int".
 */
staticint aldebaran_mode1_reset(struct smu_context *smu)
{
u32 fatal_err, param; int ret = 0; struct amdgpu_device *adev = smu->adev;
fatal_err = 0;
param = SMU_RESET_MODE_1;
/* * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
*/ if (smu->smc_fw_version < 0x00440700) {
ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
} else { /* fatal error triggered by ras, PMFW supports the flag
from 68.44.0 */ if ((smu->smc_fw_version >= 0x00442c00) &&
amdgpu_ras_get_fed_status(adev))
fatal_err = 1;
/* Allow time for the reset to take effect before returning. */
if (!ret)
msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
return ret;
}
/*
 * aldebaran_mode2_reset - request a mode-2 (soft) GPU reset from the PMFW.
 *
 * NOTE(review): the rest of this function (older-firmware path, response
 * handling, unlock, return) is not visible in this chunk; the brace
 * structure here does not balance on its own. "staticint" is
 * whitespace-mangled "static int".
 */
staticint aldebaran_mode2_reset(struct smu_context *smu)
{ int ret = 0, index; struct amdgpu_device *adev = smu->adev; int timeout = 10;
index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_MSG,
SMU_MSG_GfxDeviceDriverReset); if (index < 0 ) return -EINVAL;
mutex_lock(&smu->message_lock); if (smu->smc_fw_version >= 0x00441400) {
/* Fire-and-forget: the GPU resets underneath us, so don't wait here. */
ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index, SMU_RESET_MODE_2); /* This is similar to FLR, wait till max FLR timeout */
msleep(100);
dev_dbg(smu->adev->dev, "restore config space...\n"); /* Restore the config space saved during init */
amdgpu_device_load_pci_state(adev->pdev);
dev_dbg(smu->adev->dev, "wait for reset ack\n"); while (ret == -ETIME && timeout) {
ret = smu_cmn_wait_for_response(smu); /* Wait a bit more time for getting ACK */ if (ret == -ETIME) {
--timeout;
usleep_range(500, 1000); continue;
}
/*
 * aldebaran_smu_send_hbm_bad_page_num - report the retired HBM bad-page
 * count to the SMU over SMUBUS.
 *
 * NOTE(review): the trailing "return ret;" and closing brace are not
 * visible in this chunk; "staticint" is whitespace-mangled "static int".
 */
staticint aldebaran_smu_send_hbm_bad_page_num(struct smu_context *smu,
uint32_t size)
{ int ret = 0;
/* message SMU to update the bad page number on SMUBUS */
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetNumBadHbmPagesRetired, size, NULL); if (ret)
dev_err(smu->adev->dev, "[%s] failed to message SMU to update HBM bad pages number\n",
__func__);
/*
 * aldebaran_send_hbm_bad_channel_flag - report per-channel HBM bad-page
 * retire flags to the SMU, after verifying firmware support.
 *
 * NOTE(review): the trailing "return ret;" and closing brace are not
 * visible in this chunk; "staticint" is whitespace-mangled "static int".
 */
staticint aldebaran_send_hbm_bad_channel_flag(struct smu_context *smu,
uint32_t size)
{ int ret = 0;
/* Older PMFW lacks this message; bail out early if unsupported. */
ret = aldebaran_check_bad_channel_info_support(smu); if (ret) return ret;
/* message SMU to update the bad channel info on SMUBUS */
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetBadHBMPagesRetiredFlagsPerChannel, size, NULL); if (ret)
dev_err(smu->adev->dev, "[%s] failed to message SMU to update HBM bad channel info\n",
__func__);
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.17Bemerkung:
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.