/* SPDX-License-Identifier: MIT */
/*
* Copyright 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice andthis permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
// This shader is to clean LDS, SGPRs and VGPRs. It is first 64 Dwords or 256 bytes of 192 Dwords cleaner shader.
//To turn this shader program on for complitaion change this to main and lower shader main to main_1
// MI300 : Clear SGPRs, VGPRs and LDS
// Uses two kernels launched separately:
// 1. Clean VGPRs, LDS, and lower SGPRs
// Launches one workgroup per CU, each workgroup with 4x wave64 per SIMD in the CU
// Waves are "wave64"and have 128 VGPRs each, which uses all 512 VGPRs per SIMD
// Waves in the workgroup share the 64KB of LDS
// Each wave clears SGPRs 0 - 95. Because there are 4 waves/SIMD, this is physical SGPRs 0-383
// Each wave clears 128 VGPRs, so all 512 in the SIMD
// The first wave of the workgroup clears its 64KB of LDS
// The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup
// before any wave in the workgroup could end. Without this, it is possible not all SGPRs get cleared.
// 2. Clean remaining SGPRs
// Launches a workgroup with 24 waves per workgroup, yielding 6 waves per SIMD in each CU
// Waves are allocating 96 SGPRs
// CP sets up SPI_RESOURCE_RESERVE_* registers to prevent these waves from allocating SGPRs 0-223.
// As such, these 6 waves per SIMD are allocated physical SGPRs 224-799
// Barriers do not work for >16 waves per workgroup, so we cannot start with S_BARRIER
// Instead, the shader starts with an S_SETHALT 1. Once all waves are launched CP will send unhalt command
// The shader then clears all SGPRs allocated to it, cleaning out physical SGPRs 224-799
shader main
asic(MI300)
type(CS)
wave_size(64)
// Note: original source code from SQ team
s_cmp_eq_u32 s0, 1 // Bit0 is set, sgpr0 is set then clear VGPRS and LDS as FW set COMPUTE_USER_DATA_3
s_cbranch_scc0 label_0023 // Clean VGPRs and LDS if sgpr0 of wave is set, scc = (s3 == 1)
S_BARRIER
s_mov_b32 s2, 0x80000000 // Bit31 is first_wave
s_and_b32 s2, s2, s1 // sgpr0 has tg_size (first_wave) term as in ucode only COMPUTE_PGM_RSRC2.tg_size_en is set
s_cbranch_scc0 label_clean_sgpr_1 // Clean LDS if its first wave of ThreadGroup/WorkGroup
// CLEAR LDS
//
s_mov_b32 exec_lo, 0xffffffff
s_mov_b32 exec_hi, 0xffffffff
v_mbcnt_lo_u32_b32 v1, exec_hi, 0 // Set V1 to thread-ID (0..63)
v_mbcnt_hi_u32_b32 v1, exec_lo, v1 // Set V1 to thread-ID (0..63)
v_mul_u32_u24 v1, 0x00000008, v1 // * 8, so each thread is a double-dword address (8byte)
s_mov_b32 s2, 0x00000003f // 64 loop iteraions
s_mov_b32 m0, 0xffffffff
// Clear all of LDS space
// Each FirstWave of WorkGroup clears 64kbyte block
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.19Bemerkung:
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.