/* * Copyright 2018 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * *
*/ #include <linux/list.h> #include"amdgpu.h" #include"amdgpu_xgmi.h" #include"amdgpu_ras.h" #include"soc15.h" #include"df/df_3_6_offset.h" #include"xgmi/xgmi_4_0_0_smn.h" #include"xgmi/xgmi_4_0_0_sh_mask.h" #include"xgmi/xgmi_6_1_0_sh_mask.h" #include"wafl/wafl2_4_0_0_smn.h" #include"wafl/wafl2_4_0_0_sh_mask.h"
/*
 * NOTE(review): fragment — the enclosing function's header is outside this
 * chunk. It decodes an XGMI state register value into a link status.
 */
/* Low byte carries the link state; DISABLE means no link is present. */
if ((xgmi_state_reg_val & 0xFF) == XGMI_STATE_DISABLE) return -ENOLINK;
/* LS0 is the fully active link state. */
if ((xgmi_state_reg_val & 0xFF) == XGMI_STATE_LS0) return XGMI_LINK_ACTIVE;
/* Any other state: link exists but is not currently active. */
return XGMI_LINK_INACTIVE;
}
/**
 * DOC: AMDGPU XGMI Support
 *
 * XGMI is a high speed interconnect that joins multiple GPU cards
 * into a homogeneous memory space that is organized by a collective
 * hive ID and individual node IDs, both of which are 64-bit numbers.
 *
 * The file xgmi_device_id contains the unique per GPU device ID and
 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
 *
 * Inside the device directory a sub-directory 'xgmi_hive_info' is
 * created which contains the hive ID and the list of nodes.
 *
 * The hive ID is stored in:
 * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
 *
 * The node information is stored in numbered directories:
 * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
 *
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
 *
 * The XGMI memory space is built by contiguously adding the power of
 * two padded VRAM space from each node to each other.
 *
 */
/*
 * NOTE(review): fragment — tail of a sysfs "show" callback whose header is
 * outside this chunk. Prints one two-digit hex value per hive node ("%02x "
 * is 3 bytes per node, hence the 3 * i offset).
 */
for (i = 0; i < top->num_nodes; i++)
sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_links);
/*
 * NOTE(review): sysfs_emit() is given @buf as both destination and a format
 * argument, so source and destination overlap. vsnprintf() with overlapping
 * buffers is formally undefined — confirm against current upstream whether
 * this should use a separate scratch buffer.
 */
return sysfs_emit(buf, "%s\n", buf);
}
/**
 * amdgpu_xgmi_show_connected_port_num - sysfs show for per-link port mapping
 * @dev: device the attribute belongs to
 * @attr: the sysfs attribute being read
 * @buf: PAGE_SIZE output buffer
 *
 * Prints one "src_node:src_port -> dst_node:dst_port" line per XGMI link of
 * every node in the hive. Returns the number of bytes written, or -EINVAL
 * if the current device cannot be found in its own topology info.
 *
 * Fix (review): the chunk was truncated — the trailing "return size;" and
 * the function's closing brace were missing, so the callback returned
 * nothing. Restored here.
 */
static ssize_t amdgpu_xgmi_show_connected_port_num(struct device *dev,
						   struct device_attribute *attr,
						   char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i, j, size = 0;
	int current_node;
	/*
	 * get the node id in the sysfs for the current socket and show
	 * it in the port num info output in the sysfs for easy reading.
	 * it is NOT the one retrieved from xgmi ta.
	 */
	for (i = 0; i < top->num_nodes; i++) {
		if (top->nodes[i].node_id == adev->gmc.xgmi.node_id) {
			current_node = i;
			break;
		}
	}

	/* Current device not present in its own topology: inconsistent data. */
	if (i == top->num_nodes)
		return -EINVAL;

	for (i = 0; i < top->num_nodes; i++) {
		for (j = 0; j < top->nodes[i].num_links; j++)
			/* node id in sysfs starts from 1 rather than 0 so +1 here */
			size += sysfs_emit_at(buf, size, "%02x:%02x -> %02x:%02x\n",
					      current_node + 1,
					      top->nodes[i].port_num[j].src_xgmi_port_num,
					      i + 1,
					      top->nodes[i].port_num[j].dst_xgmi_port_num);
	}

	return size;
}
/**
 * amdgpu_xgmi_sysfs_add_dev_info - create per-device XGMI sysfs entries
 * @adev: device to create the attributes for
 * @hive: hive the device belongs to
 *
 * Creates the xgmi_* attribute files on the device and cross-links the
 * device directory with the shared hive info folder. Returns 0 on success
 * or a negative errno; on link failure the attribute files are removed.
 *
 * Fixes (review): the original had a fused "staticint" token (compile
 * error), and the "remove_file"/"remove_link" goto targets were missing
 * because the function tail was truncated — the cleanup path is restored
 * here (TODO: confirm the removal list matches current upstream).
 */
static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
	int ret = 0;
	char node[10] = { 0 };

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	ret = device_create_file(adev->dev, &dev_attr_xgmi_physical_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_physical_id\n");
		return ret;
	}

	/* Create xgmi error file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	if (ret)
		pr_err("failed to create xgmi_error\n");

	/* Create xgmi num hops file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_num_hops);
	if (ret)
		pr_err("failed to create xgmi_num_hops\n");

	/* Create xgmi num links file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_num_links);
	if (ret)
		pr_err("failed to create xgmi_num_links\n");

	/* Create xgmi port num file if supported */
	if (adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) {
		ret = device_create_file(adev->dev, &dev_attr_xgmi_port_num);
		if (ret)
			dev_err(adev->dev, "failed to create xgmi_port_num\n");
	}

	/* Create sysfs link to hive info folder on the first device */
	if (hive->kobj.parent != (&adev->dev->kobj)) {
		ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_file;
		}
	}

	sprintf(node, "node%d", atomic_read(&hive->number_devices));
	/* Create sysfs link form the hive folder to yourself */
	ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	goto success;

remove_link:
	/* Undo the device -> hive link created above. */
	sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

remove_file:
	/* Remove every attribute file created earlier in this function. */
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_error);
	device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
	device_remove_file(adev->dev, &dev_attr_xgmi_num_links);

success:
	return ret;
}
/*
 * NOTE(review): fragment of amdgpu_get_xgmi_hive() — the function header,
 * the xgmi_mutex lock and the search of the existing hive list are outside
 * this chunk. This part allocates and initializes a brand-new hive object.
 */
hive = kzalloc(sizeof(*hive), GFP_KERNEL); if (!hive) {
dev_err(adev->dev, "XGMI: allocation failed\n");
ret = -ENOMEM;
/* hive is already NULL here; kept for symmetry with the kobject path. */
hive = NULL; goto pro_end;
}
/* initialize new hive if not exist */
ret = kobject_init_and_add(&hive->kobj,
&amdgpu_xgmi_hive_type,
&adev->dev->kobj, "%s", "xgmi_hive_info"); if (ret) {
dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
/* kobject_put() releases the kzalloc'd hive via the kobject release hook. */
kobject_put(&hive->kobj);
hive = NULL; goto pro_end;
}
/** * Only init hive->reset_domain for none SRIOV configuration. For SRIOV, * Host driver decide how to reset the GPU either through FLR or chain reset. * Guest side will get individual notifications from the host for the FLR * if necessary.
*/ if (!amdgpu_sriov_vf(adev)) { /** * Avoid recreating reset domain when hive is reconstructed for the case * of reset the devices in the XGMI hive during probe for passthrough GPU * See https://www.spinics.net/lists/amd-gfx/msg58836.html
*/ if (adev->reset_domain->type != XGMI_HIVE) {
hive->reset_domain =
amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive"); if (!hive->reset_domain) {
dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
ret = -ENOMEM;
kobject_put(&hive->kobj);
hive = NULL; goto pro_end;
}
} else {
/* Reuse the device's existing XGMI reset domain (take a reference). */
amdgpu_reset_get_reset_domain(adev->reset_domain);
hive->reset_domain = adev->reset_domain;
}
}
/* * hive pstate on boot is high in vega20 so we have to go to low * pstate on after boot.
*/
hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
list_add_tail(&hive->node, &xgmi_hive_list);
/* Common exit: hand the caller a reference when a hive exists. */
pro_end: if (hive)
kobject_get(&hive->kobj);
mutex_unlock(&xgmi_mutex); return hive;
}
/**
 * amdgpu_put_xgmi_hive - drop a reference to an XGMI hive
 * @hive: hive returned by amdgpu_get_xgmi_hive(), may be NULL
 *
 * Releases the kobject reference taken when the hive was looked up.
 * A NULL hive is silently ignored.
 */
void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
{
	if (!hive)
		return;

	kobject_put(&hive->kobj);
}
/*
 * NOTE(review): fragment of amdgpu_xgmi_set_pstate() — the tail (the "out:"
 * label, pstate bookkeeping, mutex_unlock and return) is outside this chunk.
 * Several statements appear to have been lost in extraction:
 *  - request_adev and init_low are read below but never assigned here
 *    (upstream initializes them from hive->hi_req_gpu / hive->pstate —
 *    TODO confirm);
 *  - the "!hive" check is duplicated (first and second if below).
 */
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{ int ret = 0; struct amdgpu_hive_info *hive; struct amdgpu_device *request_adev; bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; bool init_low;
hive = amdgpu_get_xgmi_hive(adev); if (!hive) return 0;
/* Only Vega20 uses driver-managed hive pstate. */
if (!hive || adev->asic_type != CHIP_VEGA20) return 0;
mutex_lock(&hive->hive_lock);
if (is_hi_req)
hive->hi_req_count++; else
hive->hi_req_count--;
/* * Vega20 only needs single peer to request pstate high for the hive to * go high but all peers must request pstate low for the hive to go low
*/ if (hive->pstate == pstate ||
(!is_hi_req && hive->hi_req_count && !init_low)) goto out;
ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate); if (ret) {
dev_err(request_adev->dev, "XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
request_adev->gmc.xgmi.node_id,
request_adev->gmc.xgmi.hive_id, ret); goto out;
}
/**
 * amdgpu_xgmi_update_topology - push the latest hive topology to the PSP
 * @hive: hive whose current device count is reported
 * @adev: device whose PSP receives the topology
 *
 * Returns 0 on success (or under SRIOV, where the host owns topology and
 * the guest does nothing), otherwise the PSP error code.
 */
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret;

	if (amdgpu_sriov_vf(adev))
		return 0;

	/* Each psp need to set the latest topology */
	ret = psp_xgmi_set_topology_info(&adev->psp,
					 atomic_read(&hive->number_devices),
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev, "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}
/*
 * NOTE psp_xgmi_node_info.num_hops layout is as follows:
 * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved)
 * num_hops[5:3] = reserved
 * num_hops[2:0] = number of hops
 */
/**
 * amdgpu_xgmi_get_hops_count - hop count from @adev to @peer_adev
 * @adev: local device whose topology table is consulted
 * @peer_adev: peer to look up by XGMI node id
 *
 * Returns the hop count (low 3 bits of num_hops), or 0 when XGMI is
 * unsupported or the peer is not found (an error is logged in that case).
 */
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	uint8_t hops_mask = 0x7;	/* num_hops[2:0] per the layout above */
	int node;

	if (!adev->gmc.xgmi.supported)
		return 0;

	for (node = 0; node < top->num_nodes; ++node) {
		if (top->nodes[node].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[node].num_hops & hops_mask;
	}

	dev_err(adev->dev, "Failed to get xgmi hops count for peer %d.\n",
		peer_adev->gmc.xgmi.physical_node_id);

	return 0;
}
/*
 * NOTE(review): this span splices two different functions together. Lines up
 * to the "elseif" belong to amdgpu_xgmi_get_bandwidth() (its min/max
 * computation and return are missing); the "Sharing should always be
 * enabled" part onward is the tail of a separate sharing-enabled predicate
 * whose header is not visible. Fused tokens that will not compile as-is:
 * "elseif", "returntrue", "returnfalse".
 */
int amdgpu_xgmi_get_bandwidth(struct amdgpu_device *adev, struct amdgpu_device *peer_adev, enum amdgpu_xgmi_bw_mode bw_mode, enum amdgpu_xgmi_bw_unit bw_unit,
uint32_t *min_bw, uint32_t *max_bw)
{ bool peer_mode = bw_mode == AMDGPU_XGMI_BW_MODE_PER_PEER; int unit_scale = bw_unit == AMDGPU_XGMI_BW_UNIT_MBYTES ? 1000 : 1; int num_lanes = adev->gmc.xgmi.max_width; int speed = adev->gmc.xgmi.max_speed; int num_links = !peer_mode ? 1 : -1;
/* Both output pointers are mandatory. */
if (!(min_bw && max_bw)) return -EINVAL;
*min_bw = 0;
*max_bw = 0;
if (!adev->gmc.xgmi.supported) return -ENODATA;
/* Per-peer bandwidth needs a peer to count links against. */
if (peer_mode && !peer_adev) return -EINVAL;
if (peer_mode) { struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; int i;
for (i = 0 ; i < top->num_nodes; ++i) { if (top->nodes[i].node_id != peer_adev->gmc.xgmi.node_id) continue;
num_links = top->nodes[i].num_links; break;
}
}
/* num_links stays -1 when the peer was not found in the topology. */
if (num_links == -1) {
dev_err(adev->dev, "Failed to get number of xgmi links for peer %d.\n",
peer_adev->gmc.xgmi.physical_node_id);
} elseif (num_links) { int per_link_bw = (speed * num_lanes * unit_scale)/BITS_PER_BYTE;
/* Sharing should always be enabled for non-SRIOV. */ if (!amdgpu_sriov_vf(adev)) returntrue;
for (i = 0 ; i < top->num_nodes; ++i) if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) return !!top->nodes[i].is_sharing_enabled;
returnfalse;
}
/*
 * Devices that support extended data require the entire hive to initialize with
 * the shared memory buffer flag set.
 *
 * Hive locks and conditions apply - see amdgpu_xgmi_add_device
 *
 * Fixes (review): the original had a fused "staticint" token and was
 * truncated — the loop close, "return 0;" and closing brace were missing.
 */
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
							  bool set_extended_data)
{
	struct amdgpu_device *tmp_adev;
	int ret;

	/* Re-initialize every PSP XGMI session in the hive with the flag. */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
		if (ret) {
			dev_err(tmp_adev->dev,
				"XGMI: Failed to initialize xgmi session for data partition %i\n",
				set_extended_data);
			return ret;
		}
	}

	return 0;
}
/*
 * NOTE(review): fragment of amdgpu_xgmi_add_device() — the function header,
 * the declarations of top_info/count/tmp_adev, and the exit/exit_unlock
 * labels are outside this chunk. Also note the fused token "gotoexit" below
 * (should be "goto exit") — compile error as-is.
 */
/* With a PSP block present, hive/node ids come from the XGMI TA. */
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
ret = psp_xgmi_initialize(&adev->psp, false, true); if (ret) {
dev_err(adev->dev, "XGMI: Failed to initialize xgmi session\n"); return ret;
}
ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); if (ret) {
dev_err(adev->dev, "XGMI: Failed to get hive id\n"); return ret;
}
ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); if (ret) {
dev_err(adev->dev, "XGMI: Failed to get node id\n"); return ret;
}
} else {
/* No PSP: synthesize fixed ids (e.g. emulation) from the physical node. */
adev->gmc.xgmi.hive_id = 16;
adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
}
hive = amdgpu_get_xgmi_hive(adev); if (!hive) {
ret = -EINVAL;
dev_err(adev->dev, "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id); gotoexit;
}
mutex_lock(&hive->hive_lock);
if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
/* Tell every existing member about the newcomer, then push topology. */
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { /* update node list for other device in the hive */ if (tmp_adev != adev) {
top_info = &tmp_adev->psp.xgmi_context.top_info;
top_info->nodes[count - 1].node_id =
adev->gmc.xgmi.node_id;
top_info->num_nodes = count;
}
ret = amdgpu_xgmi_update_topology(hive, tmp_adev); if (ret) goto exit_unlock;
}
if (amdgpu_sriov_vf(adev) &&
adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) { /* only get topology for VF being init if it can support full duplex */
ret = psp_xgmi_get_topology_info(&adev->psp, count,
&adev->psp.xgmi_context.top_info, false); if (ret) {
dev_err(adev->dev, "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
adev->gmc.xgmi.node_id,
adev->gmc.xgmi.hive_id, ret); /* To do: continue with some node failed or disable the whole hive*/ goto exit_unlock;
}
/* fill the topology info for peers instead of getting from PSP */
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
amdgpu_xgmi_fill_topology_info(adev, tmp_adev);
}
} else { /* get latest topology info for each device from psp */
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
&tmp_adev->psp.xgmi_context.top_info, false); if (ret) {
dev_err(tmp_adev->dev, "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
tmp_adev->gmc.xgmi.node_id,
tmp_adev->gmc.xgmi.hive_id, ret); /* To do : continue with some node failed or disable the whole hive */ goto exit_unlock;
}
}
}
/* get topology again for hives that support extended data */ if (adev->psp.xgmi_context.supports_extended_data) {
/* initialize the hive to get extended data. */
ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true); if (ret) goto exit_unlock;
/* get the extended data. */
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
&tmp_adev->psp.xgmi_context.top_info, true); if (ret) {
dev_err(tmp_adev->dev, "XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
tmp_adev->gmc.xgmi.node_id,
tmp_adev->gmc.xgmi.hive_id, ret); goto exit_unlock;
}
}
/* initialize the hive to get non-extended data for the next round. */
ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false); if (ret) goto exit_unlock;
}
}
if (!ret)
ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
/*
 * NOTE(review): this span splices tails of several different functions, each
 * with its header outside the chunk:
 *  1) hive teardown when the last device leaves (likely
 *     amdgpu_xgmi_remove_device);
 *  2) per-ASIC PCS error-status clearing (likely a reset_ras_error_count
 *     helper) — note "walf_pcs_..." is the spelling used in the code;
 *  3) a loop querying v6.4.0 MCA error counts;
 *  4) per-device bad-page init plus a reset_on_init flush.
 * Variables i, mcm_info, err_data, tmp_adev, r and reset_scheduled are
 * declared outside this view.
 */
/* Last device gone: unlink the hive from the global list. */
if (atomic_dec_return(&hive->number_devices) == 0) { /* Remove the hive from global hive list */
mutex_lock(&xgmi_mutex);
list_del(&hive->node);
mutex_unlock(&xgmi_mutex);
/* Clear XGMI PCS error status registers for the legacy ASIC list. */
switch (adev->asic_type) { case CHIP_ARCTURUS: for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
pcs_clear_status(adev,
xgmi_pcs_err_status_reg_arct[i]); break; case CHIP_VEGA20: for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
pcs_clear_status(adev,
xgmi_pcs_err_status_reg_vg20[i]); break; case CHIP_ALDEBARAN: for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
pcs_clear_status(adev,
xgmi3x16_pcs_err_status_reg_aldebaran[i]); for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
pcs_clear_status(adev,
walf_pcs_err_status_reg_aldebaran[i]); break; default: break;
}
/* Newer IPs are keyed by XGMI HW IP version instead of asic_type. */
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { case IP_VERSION(6, 4, 0): case IP_VERSION(6, 4, 1): for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++)
pcs_clear_status(adev,
xgmi3x16_pcs_err_status_reg_v6_4[i]); break; default: break;
}
}
for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
__xgmi_v6_4_0_query_error_count(adev, &mcm_info, xgmi_v6_4_0_mca_base_array[i], err_data);
}
/* -EHWPOISON is an expected outcome here, not an initialization failure. */
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
r = amdgpu_ras_init_badpage_info(tmp_adev); if (r && r != -EHWPOISON)
dev_err(tmp_adev->dev, "error during bad page data initialization");
}
}
if (reset_scheduled)
flush_work(&hive->reset_on_init_work);
return 0;
}
/**
 * amdgpu_xgmi_request_nps_change - request a new NPS mode for the whole hive
 * @adev: device placing the request on behalf of the hive
 * @hive: hive whose member devices receive the request
 * @req_nps_mode: requested memory (NPS) partition mode
 *
 * This is expected to be called only during unload of driver. The request
 * needs to be placed only once for all devices in the hive. If one of them
 * fail, revert the request for previous successful devices. After placing
 * the request, make hive mode as UNKNOWN so that other devices don't
 * request anymore.
 *
 * Returns 0 on success (or when the request was already placed by another
 * device), else the error from the failing per-device request.
 *
 * Fixes (review): the chunk was truncated — the trailing "return r;" and
 * closing brace were missing; additionally @r is now initialized so the
 * "if (r)" check is defined even when the hive device list is empty.
 */
int amdgpu_xgmi_request_nps_change(struct amdgpu_device *adev,
				   struct amdgpu_hive_info *hive,
				   int req_nps_mode)
{
	struct amdgpu_device *tmp_adev;
	int cur_nps_mode, r = 0;

	mutex_lock(&hive->hive_lock);
	/* UNKNOWN means another hive member already placed the request. */
	if (atomic_read(&hive->requested_nps_mode) ==
	    UNKNOWN_MEMORY_PARTITION_MODE) {
		dev_dbg(adev->dev, "Unexpected entry for hive NPS change");
		mutex_unlock(&hive->hive_lock);
		return 0;
	}
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		r = adev->gmc.gmc_funcs->request_mem_partition_mode(
			tmp_adev, req_nps_mode);
		if (r)
			break;
	}
	if (r) {
		/* Request back current mode if one of the requests failed */
		cur_nps_mode =
			adev->gmc.gmc_funcs->query_mem_partition_mode(tmp_adev);
		list_for_each_entry_continue_reverse(
			tmp_adev, &hive->device_list, gmc.xgmi.head)
			adev->gmc.gmc_funcs->request_mem_partition_mode(
				tmp_adev, cur_nps_mode);
	}
	/* Set to UNKNOWN so that other devices don't request anymore */
	atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);
	mutex_unlock(&hive->hive_lock);

	return r;
}
/*
 * NOTE(review): the following text was German website-disclaimer residue
 * left over from extraction and is not part of the source file; it should
 * be removed. English translation for reference: "The information on this
 * website was carefully compiled to the best of our knowledge. However,
 * neither completeness, nor correctness, nor quality of the provided
 * information is guaranteed. Note: the colored syntax rendering and the
 * measurement are still experimental."
 */