/*
 * Do the module global initialization once and return its result.
 * It can be done on any cpu.  It's always called with interrupts
 * disabled.
 */
static int try_init_module_global(void)
{
        struct tdx_module_args args = {};
        static DEFINE_RAW_SPINLOCK(sysinit_lock);
        static bool sysinit_done;
        static int sysinit_ret;

        lockdep_assert_irqs_disabled();

        raw_spin_lock(&sysinit_lock);

        if (sysinit_done)
                goto out;
        /* RCX is module attributes and all bits are reserved */
        args.rcx = 0;
        sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);

        /*
         * The first SEAMCALL also detects the TDX module, thus
         * it can fail if the TDX module is not loaded.  Dump a
         * message to let the user know.
         */
        if (sysinit_ret == -ENODEV)
                pr_err("module not loaded\n");
/**
 * tdx_cpu_enable - Enable TDX on local cpu
 *
 * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
 * global initialization SEAMCALL if not done) on local cpu to make this
 * cpu be ready to run any other SEAMCALLs.
 *
 * Always call this function via IPI function calls.
 *
 * Return 0 on success, otherwise errors.
 */
int tdx_cpu_enable(void)
{
        struct tdx_module_args args = {};
        int ret;

        if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
                return -ENODEV;

        lockdep_assert_irqs_disabled();

        if (__this_cpu_read(tdx_lp_initialized))
                return 0;
        /*
         * The TDX module global initialization is the very first step
         * to enable TDX.  Need to do it first (if it hasn't been done)
         * before the per-cpu initialization.
         */
        ret = try_init_module_global();
        if (ret)
                return ret;

        ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
        if (ret)
                return ret;
__this_cpu_write(tdx_lp_initialized, true);
return 0;
}
EXPORT_SYMBOL_GPL(tdx_cpu_enable);
/*
 * Add a memory region as a TDX memory block.  The caller must make sure
 * all memory regions are added in address ascending order and don't
 * overlap.
 */
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
                            unsigned long end_pfn, int nid)
{
        struct tdx_memblock *tmb;

        tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
        if (!tmb)
                return -ENOMEM;
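        /*
         * Elided in this excerpt (sketch): record the region in the new
         * block.  The field names are assumed from the later uses of
         * tmb->start_pfn, tmb->end_pfn and tmb->nid in this file.
         */
        INIT_LIST_HEAD(&tmb->list);
        tmb->start_pfn = start_pfn;
        tmb->end_pfn = end_pfn;
        tmb->nid = nid;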
        /* @tmb_list is protected by mem_hotplug_lock */
        list_add_tail(&tmb->list, tmb_list);

        return 0;
}
static void free_tdx_memlist(struct list_head *tmb_list)
{
        /* @tmb_list is protected by mem_hotplug_lock */
        while (!list_empty(tmb_list)) {
                struct tdx_memblock *tmb = list_first_entry(tmb_list,
                                struct tdx_memblock, list);
list_del(&tmb->list);
kfree(tmb);
}
}
/*
 * Ensure that all memblock memory regions are convertible to TDX
 * memory.  Once this has been established, stash the memblock
 * ranges off in a secondary structure because memblock is modified
 * in memory hotplug while TDX memory regions are fixed.
 */
static int build_tdx_memlist(struct list_head *tmb_list)
{
        unsigned long start_pfn, end_pfn;
        int i, nid, ret;
        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
                /*
                 * The first 1MB is not reported as TDX convertible memory.
                 * Although the first 1MB is always reserved and won't end up
                 * in the page allocator, it is still in memblock's memory
                 * regions.  Skip them manually to exclude them as TDX memory.
                 */
                start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
                if (start_pfn >= end_pfn)
                        continue;

                /*
                 * Add the memory regions as TDX memory.  The regions in
                 * memblock are already guaranteed to be in address
                 * ascending order and to not overlap.
                 */
                ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
                if (ret)
                        goto err;
        }
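        return 0;
err:
        free_tdx_memlist(tmb_list);
        return ret;
}

/*
 * Sketch of the helper that the TDH.SYS.RD fragment below belongs to.
 * The name read_sys_metadata_field and the exact signature are assumed
 * (elided from this excerpt); the body follows from the comment and
 * the args usage below.
 */
static int read_sys_metadata_field(u64 field_id, u64 *data)
{
        struct tdx_module_args args = {};
        int ret;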
        /*
         * TDH.SYS.RD -- reads one global metadata field
         *  - RDX (in): the field to read
         *  - R8 (out): the field data
         */
        args.rdx = field_id;
        ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
        if (ret)
                return ret;
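        /* Sketch of the elided tail (assumed): hand the R8 output back. */
        *data = args.r8;

        return 0;
}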
/* Calculate the actual TDMR size */
static int tdmr_size_single(u16 max_reserved_per_tdmr)
{
        int tdmr_sz;

        /*
         * The actual size of TDMR depends on the maximum
         * number of reserved areas.
         */
        tdmr_sz = sizeof(struct tdmr_info);
        tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
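        /*
         * Sketch of the elided return (assumed): round the size up to the
         * alignment the TDX module requires for each TDMR_INFO entry.
         * The macro name TDMR_INFO_ALIGNMENT is assumed, not taken from
         * this excerpt.
         */
        return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
}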
        /*
         * To keep things simple, allocate all TDMRs together.
         * The buffer needs to be physically contiguous to make
         * sure each TDMR is physically contiguous.
         */
        tdmr_array = alloc_pages_exact(tdmr_array_sz,
                        GFP_KERNEL | __GFP_ZERO);
        if (!tdmr_array)
                return -ENOMEM;
tdmr_list->tdmrs = tdmr_array;
        /*
         * Keep the size of TDMR to find the target TDMR
         * at a given index in the TDMR list.
         */
tdmr_list->tdmr_sz = tdmr_sz;
tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
tdmr_list->nr_consumed_tdmrs = 0;
/* Get the TDMR from the list at the given index. */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list, int idx)
{
        int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
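        /*
         * Sketch of the elided return (assumed): index into the flat
         * buffer allocated above using the per-entry size.
         */
        return (void *)tdmr_list->tdmrs + tdmr_info_offset;
}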
/*
 * Take the memory referenced in @tmb_list and populate the
 * preallocated @tdmr_list, following all the special alignment
 * and size rules for TDMR.
 */
static int fill_out_tdmrs(struct list_head *tmb_list,
                          struct tdmr_info_list *tdmr_list)
{
        struct tdx_memblock *tmb;
        int tdmr_idx = 0;

        /*
         * Loop over TDX memory regions and fill out TDMRs to cover them.
         * To keep it simple, always try to use one TDMR to cover one
         * memory region.
         *
         * In practice TDX supports at least 64 TDMRs.  A 2-socket system
         * typically only consumes less than 10 of those.  This code is
         * dumb and simple and may use more TDMRs than is strictly
         * required.
         */
        list_for_each_entry(tmb, tmb_list, list) {
                struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
u64 start, end;
start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
                /*
                 * A valid size indicates the current TDMR has already
                 * been filled out to cover the previous memory region(s).
                 */
                if (tdmr->size) {
                        /*
                         * Loop to the next if the current memory region
                         * has already been fully covered.
                         */
                        if (end <= tdmr_end(tdmr))
                                continue;

                        /* Otherwise, skip the already covered part. */
                        if (start < tdmr_end(tdmr))
                                start = tdmr_end(tdmr);

                        /*
                         * Create a new TDMR to cover the current memory
                         * region, or the remaining part of it.
                         */
                        tdmr_idx++;
                        if (tdmr_idx >= tdmr_list->max_tdmrs) {
                                pr_warn("initialization failed: TDMRs exhausted.\n");
                                return -ENOSPC;
                        }

                        tdmr = tdmr_entry(tdmr_list, tdmr_idx);
                }
tdmr->base = start;
tdmr->size = end - start;
}
/* @tdmr_idx is always the index of the last valid TDMR. */
tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
        /*
         * Warn early that kernel is about to run out of TDMRs.
         *
         * This is an indication that TDMR allocation has to be
         * reworked to be smarter to not run into an issue.
         */
        if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
tdmr_list->nr_consumed_tdmrs,
tdmr_list->max_tdmrs);
return 0;
}
/*
 * Calculate PAMT size given a TDMR and a page size.  The returned
 * PAMT size is always aligned up to 4K page boundary.
 */
static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
                                      u16 pamt_entry_size)
{
        unsigned long pamt_sz, nr_pamt_entries;
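        /*
         * Elided in this excerpt (sketch): derive the number of PAMT
         * entries for this page size, assuming one entry per page of
         * the given size covered by the TDMR.
         */
        switch (pgsz) {
        case TDX_PS_4K:
                nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
                break;
        case TDX_PS_2M:
                nr_pamt_entries = tdmr->size >> PMD_SHIFT;
                break;
        case TDX_PS_1G:
                nr_pamt_entries = tdmr->size >> PUD_SHIFT;
                break;
        default:
                WARN_ON_ONCE(1);
                return 0;
        }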
        pamt_sz = nr_pamt_entries * pamt_entry_size;
        /* TDX requires PAMT size must be 4K aligned */
pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
return pamt_sz;
}
/*
 * Locate a NUMA node which should hold the allocation of the @tdmr
 * PAMT.  This node will have some memory covered by the TDMR.  The
 * relative amount of memory covered is not considered.
 */
static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
{
        struct tdx_memblock *tmb;

        /*
         * A TDMR must cover at least part of one TMB.  That TMB will end
         * after the TDMR begins.  But, that TMB may have started before
         * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
         * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
         */
        list_for_each_entry(tmb, tmb_list, list) {
                if (tmb->end_pfn > PHYS_PFN(tdmr->base))
                        return tmb->nid;
        }

        /*
         * Fall back to allocating the TDMR's metadata from node 0 when
         * no TDX memory block can be found.  This should never happen
         * since TDMRs originate from TDX memory blocks.
         */
        pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
                tdmr->base, tdmr_end(tdmr));
        return 0;
}
/*
 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
 * within @tdmr, and set up PAMTs for @tdmr.
 */
static int tdmr_set_up_pamt(struct tdmr_info *tdmr, struct list_head *tmb_list,
                            u16 pamt_entry_size[])
{
        unsigned long pamt_base[TDX_PS_NR];
        unsigned long pamt_size[TDX_PS_NR];
        unsigned long tdmr_pamt_base;
        unsigned long tdmr_pamt_size;
        struct page *pamt;
        int pgsz, nid;
nid = tdmr_get_nid(tdmr, tmb_list);
        /*
         * Calculate the PAMT size for each TDX supported page size
         * and the total PAMT size.
         */
        tdmr_pamt_size = 0;
        for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
pamt_entry_size[pgsz]);
tdmr_pamt_size += pamt_size[pgsz];
}
        /*
         * Allocate one chunk of physically contiguous memory for all
         * PAMTs.  This helps minimize the PAMT's use of reserved areas
         * in overlapped TDMRs.
         */
        pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
                        nid, &node_online_map);
        if (!pamt)
                return -ENOMEM;

        /*
         * Break the contiguous allocation back up into the
         * individual PAMTs for each page size.
         */
        tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
        for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
pamt_base[pgsz] = tdmr_pamt_base;
tdmr_pamt_base += pamt_size[pgsz];
}
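        /*
         * Sketch of the elided tail of this function (assumed; the
         * pamt_4k/2m/1g field names are taken from their uses below):
         * record the per-page-size PAMT base and size in the TDMR.
         */
        tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
        tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
        tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
        tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
        tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
        tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];

        return 0;
}

/*
 * Sketch of the helper the fragment below belongs to: tdmr_get_pamt()
 * is called later in this file; the signature and local variables are
 * assumed.  It returns a TDMR's overall PAMT location.
 */
static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
                          unsigned long *pamt_size)
{
        unsigned long pamt_bs, pamt_sz;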
        /*
         * The PAMT was allocated in one contiguous unit.  The 4K PAMT
         * should always point to the beginning of that allocation.
         */
        pamt_bs = tdmr->pamt_4k_base;
        pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
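        /* Sketch of the elided tail (assumed): hand the results back. */
        *pamt_base = pamt_bs;
        *pamt_size = pamt_sz;
}

/*
 * Sketch of the elided tdmr_free_pamt() used by the function below
 * (the name is taken from the call below; the body is assumed): free
 * the contiguous PAMT allocation backing @tdmr, if any.
 */
static void tdmr_free_pamt(struct tdmr_info *tdmr)
{
        unsigned long pamt_base, pamt_size;

        tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);

        /* Do nothing if PAMT hasn't been allocated for this TDMR */
        if (!pamt_size)
                return;

        if (WARN_ON_ONCE(!pamt_base))
                return;

        free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
}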
static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
{
        int i;
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
tdmr_free_pamt(tdmr_entry(tdmr_list, i));
}
/* Allocate and set up PAMTs for all TDMRs */
static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
                                 struct list_head *tmb_list,
                                 u16 pamt_entry_size[])
{
        int i, ret = 0;
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
                ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
                                pamt_entry_size);
                if (ret)
                        goto err;
}
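        /*
         * Sketch of the elided tail (assumed from the 'goto err' above):
         * unwind any PAMTs that were already set up if a later one failed.
         */
        return 0;
err:
        tdmrs_free_pamt_all(tdmr_list);
        return ret;
}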
/*
 * Convert TDX private pages back to normal by using MOVDIR64B to
 * clear these pages.  Note this function doesn't flush cache of
 * these TDX private pages.  The caller should make sure of that.
 */
static void reset_tdx_pages(unsigned long base, unsigned long size)
{
        const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
        unsigned long phys, end;

        end = base + size;
        for (phys = base; phys < end; phys += 64)
                movdir64b(__va(phys), zero_page);

        /*
         * MOVDIR64B uses WC protocol.  Use memory barrier to
         * make sure any later user of these pages sees the
         * updated data.
         */
mb();
}
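/*
 * Sketch of the helper the fragment below belongs to: the name matches
 * the tdmr_add_rsvd_area() calls later in this file; the exact
 * signature and locals are assumed.  It adds one reserved area to @tdmr.
 */
static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
                              u64 size, u16 max_reserved_per_tdmr)
{
        struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
        int idx = *p_idx;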
        /* Reserved area must be 4K aligned in offset and size */
        if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
                return -EINVAL;

        if (idx >= max_reserved_per_tdmr) {
                pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
                        tdmr->base, tdmr_end(tdmr));
                return -ENOSPC;
        }

        /*
         * Consume one reserved area per call.  Make no effort to
         * optimize or reduce the number of reserved areas which are
         * consumed by contiguous reserved areas, for instance.
         */
rsvd_areas[idx].offset = addr - tdmr->base;
rsvd_areas[idx].size = size;
*p_idx = idx + 1;
return 0;
}
/*
 * Go through @tmb_list to find holes between memory areas.  If any of
 * those holes fall within @tdmr, set up a TDMR reserved area to cover
 * the hole.
 */
static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
                                    struct tdmr_info *tdmr, int *rsvd_idx,
                                    u16 max_reserved_per_tdmr)
{
        struct tdx_memblock *tmb;
        u64 prev_end;
        int ret;

        /*
         * Start looking for reserved blocks at the
         * beginning of the TDMR.
         */
prev_end = tdmr->base;
list_for_each_entry(tmb, tmb_list, list) {
u64 start, end;
start = PFN_PHYS(tmb->start_pfn);
end = PFN_PHYS(tmb->end_pfn);
                /* Break if this region is after the TDMR */
                if (start >= tdmr_end(tdmr))
                        break;

                /* Exclude regions before this TDMR */
                if (end < tdmr->base)
                        continue;

                /*
                 * Skip over memory areas that
                 * have already been dealt with.
                 */
                if (start <= prev_end) {
                        prev_end = end;
                        continue;
                }

                /* Add the hole before this region */
                ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
                                start - prev_end,
                                max_reserved_per_tdmr);
                if (ret)
                        return ret;
prev_end = end;
}
        /* Add the hole after the last region if it exists. */
        if (prev_end < tdmr_end(tdmr)) {
                ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
                                tdmr_end(tdmr) - prev_end,
                                max_reserved_per_tdmr);
                if (ret)
                        return ret;
        }
return 0;
}
/*
 * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
 * overlaps with @tdmr, set up a TDMR reserved area to cover the
 * overlapping part.
 */
static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
                                    struct tdmr_info *tdmr, int *rsvd_idx,
                                    u16 max_reserved_per_tdmr)
{
        int i, ret;

        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
                struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
                unsigned long pamt_base, pamt_size, pamt_end;

                tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
                /* Each TDMR must already have PAMT allocated */
                WARN_ON_ONCE(!pamt_size || !pamt_base);

                pamt_end = pamt_base + pamt_size;
                /* Skip PAMTs outside of the given TDMR */
                if ((pamt_end <= tdmr->base) ||
                    (pamt_base >= tdmr_end(tdmr)))
                        continue;

                /* Only mark the part within the TDMR as reserved */
                if (pamt_base < tdmr->base)
                        pamt_base = tdmr->base;
                if (pamt_end > tdmr_end(tdmr))
                        pamt_end = tdmr_end(tdmr);

                ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
                                pamt_end - pamt_base,
                                max_reserved_per_tdmr);
                if (ret)
                        return ret;
}
return 0;
}
/* Compare function called by sort() for TDMR reserved areas */
static int rsvd_area_cmp_func(const void *a, const void *b)
{
        struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
        struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;

        if (r1->offset + r1->size <= r2->offset)
                return -1;
        if (r1->offset >= r2->offset + r2->size)
                return 1;

        /* Reserved areas cannot overlap.  The caller must guarantee. */
        WARN_ON_ONCE(1);
        return -1;
}
/*
 * Populate reserved areas for the given @tdmr, including memory holes
 * (via @tmb_list) and PAMTs (via @tdmr_list).
 */
static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
                                    struct list_head *tmb_list,
                                    struct tdmr_info_list *tdmr_list,
                                    u16 max_reserved_per_tdmr)
{
        int ret, rsvd_idx = 0;

        ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
                        max_reserved_per_tdmr);
        if (ret)
                return ret;

        ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
                        max_reserved_per_tdmr);
        if (ret)
                return ret;
/* TDX requires reserved areas listed in address ascending order */
sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
rsvd_area_cmp_func, NULL);
return 0;
}
/*
 * Populate reserved areas for all TDMRs in @tdmr_list, including memory
 * holes (via @tmb_list) and PAMTs.
 */
static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
                                         struct list_head *tmb_list,
                                         u16 max_reserved_per_tdmr)
{
        int i;

        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
                int ret;

                ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
                                tmb_list, tdmr_list, max_reserved_per_tdmr);
                if (ret)
                        return ret;
}
return 0;
}
/*
 * Construct a list of TDMRs on the preallocated space in @tdmr_list
 * to cover all TDX memory regions in @tmb_list based on the TDX module
 * TDMR global information in @sysinfo_tdmr.
 */
static int construct_tdmrs(struct list_head *tmb_list,
                           struct tdmr_info_list *tdmr_list,
                           struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
u16 pamt_entry_size[TDX_PS_NR] = {
sysinfo_tdmr->pamt_4k_entry_size,
sysinfo_tdmr->pamt_2m_entry_size,
sysinfo_tdmr->pamt_1g_entry_size,
        };
        int ret;
        ret = fill_out_tdmrs(tmb_list, tdmr_list);
        if (ret)
                return ret;

        ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
        if (ret)
                return ret;

        ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
                        sysinfo_tdmr->max_reserved_per_tdmr);
        if (ret)
                tdmrs_free_pamt_all(tdmr_list);

        /*
         * The tdmr_info_list is read-only from here on out.
         * Ensure that these writes are seen by other CPUs.
         * Pairs with a smp_rmb() in is_pamt_page().
         */
smp_wmb();
return ret;
}
static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
{
        struct tdx_module_args args = {};
        u64 *tdmr_pa_array;
        size_t array_sz;
        int i, ret;

        /*
         * TDMRs are passed to the TDX module via an array of physical
         * addresses of each TDMR.  The array itself also has certain
         * alignment requirement.
         */
        array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
        array_sz = roundup_pow_of_two(array_sz);
        if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
                array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;

        tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
        if (!tdmr_pa_array)
                return -ENOMEM;
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
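        /*
         * Sketch of the elided configuration SEAMCALL (the TDH.SYS.CONFIG
         * name and register layout are assumed, not taken from this
         * excerpt): hand the TDMR array and the global KeyID to the TDX
         * module, then free the temporary array.
         */
        args.rcx = __pa(tdmr_pa_array);
        args.rdx = tdmr_list->nr_consumed_tdmrs;
        args.r8 = global_keyid;
        ret = seamcall_prerr(TDH_SYS_CONFIG, &args);

        /* Free the array as it is not required anymore. */
        kfree(tdmr_pa_array);

        return ret;
}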
/*
 * Attempt to configure the global KeyID on all physical packages.
 *
 * This requires running code on at least one CPU in each package.
 * TDMR initialization will fail if any package in the system has no
 * online CPUs.
 *
 * This code takes no affirmative steps to online CPUs.  Callers (aka.
 * KVM) can ensure success by ensuring sufficient CPUs are online and
 * can run SEAMCALLs.
 */
static int config_global_keyid(void)
{
        cpumask_var_t packages;
        int cpu, ret = -EINVAL;

        if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
                return -ENOMEM;
        /*
         * Hardware doesn't guarantee cache coherency across different
         * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
         * (associated with KeyID 0) before the TDX module can use the
         * global KeyID to access the PAMT.  Given PAMTs are potentially
         * large (~1/256th of system RAM), just use WBINVD.
         */
wbinvd_on_all_cpus();
        for_each_online_cpu(cpu) {
                /*
                 * The key configuration only needs to be done once per
                 * package and will return an error if configured more
                 * than once.  Avoid doing it multiple times per package.
                 */
                if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
                                packages))
                        continue;

                /*
                 * TDH.SYS.KEY.CONFIG cannot run concurrently on
                 * different cpus.  Do it one by one.
                 */
                ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
                if (ret)
                        break;
        }
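        /*
         * Sketch of the elided tail (assumed): release the package mask
         * and return the last SEAMCALL result.
         */
        free_cpumask_var(packages);
        return ret;
}

/*
 * Sketch of the function the TDH.SYS.TDMR.INIT loop below belongs to.
 * The name init_tdmr matches the call in init_tdmrs() below; the
 * signature and the 'next' local are assumed.
 */
static int init_tdmr(struct tdmr_info *tdmr)
{
        u64 next;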
        /*
         * Initializing a TDMR can be time consuming.  To avoid long
         * SEAMCALLs, the TDX module may only initialize a part of the
         * TDMR in each call.
         */
        do {
                struct tdx_module_args args = {
                        .rcx = tdmr->base,
                };
                int ret;

                ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
                if (ret)
                        return ret;

                /*
                 * RDX contains 'next-to-initialize' address if
                 * TDH.SYS.TDMR.INIT did not fully complete and
                 * should be retried.
                 */
                next = args.rdx;
                cond_resched();
                /* Keep making SEAMCALLs until the TDMR is done */
        } while (next < tdmr->base + tdmr->size);
return 0;
}
static int init_tdmrs(struct tdmr_info_list *tdmr_list)
{
        int i;

        /*
         * This operation is costly.  It can be parallelized,
         * but keep it simple for now.
         */
        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
                int ret;

                ret = init_tdmr(tdmr_entry(tdmr_list, i));
                if (ret)
                        return ret;
}
return 0;
}
static int init_tdx_module(void)
{
        int ret;

        ret = get_tdx_sys_info(&tdx_sysinfo);
        if (ret)
                return ret;

        /* Check whether the kernel can support this module */
        ret = check_features(&tdx_sysinfo);
        if (ret)
                return ret;

        /*
         * To keep things simple, assume that all TDX-protected memory
         * will come from the page allocator.  Make sure all pages in the
         * page allocator are TDX-usable memory.
         *
         * Build the list of "TDX-usable" memory regions which cover all
         * pages in the page allocator to guarantee that.  Do it while
         * holding mem_hotplug_lock read-lock as the memory hotplug code
         * path reads the @tdx_memlist to reject any new memory.
         */
        get_online_mems();

        ret = build_tdx_memlist(&tdx_memlist);
        if (ret)
                goto out_put_tdxmem;
        /* Allocate enough space for constructing TDMRs */
        ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
        if (ret)
                goto err_free_tdxmem;

        /* Cover all TDX-usable memory regions in TDMRs */
        ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
        if (ret)
                goto err_free_tdmrs;

        /* Pass the TDMRs and the global KeyID to the TDX module */
        ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
        if (ret)
                goto err_free_pamts;

        /* Config the key of global KeyID on all packages */
        ret = config_global_keyid();
        if (ret)
                goto err_reset_pamts;

        /* Initialize TDMRs to complete the TDX module initialization */
        ret = init_tdmrs(&tdx_tdmr_list);
        if (ret)
                goto err_reset_pamts;

        pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
out_put_tdxmem:
        /*
         * @tdx_memlist is written here and read at memory hotplug time.
         * Lock out memory hotplug code while building it.
         */
        put_online_mems();
        return ret;
err_reset_pamts:
        /*
         * Part of PAMTs may already have been initialized by the
         * TDX module.  Flush cache before returning PAMTs back
         * to the kernel.
         */
        wbinvd_on_all_cpus();
        /*
         * According to the TDX hardware spec, if the platform
         * doesn't have the "partial write machine check"
         * erratum, any kernel read/write will never cause #MC
         * in kernel space, thus it's OK to not convert PAMTs
         * back to normal.  But do the conversion anyway here
         * as suggested by the TDX spec.
         */
        tdmrs_reset_pamt_all(&tdx_tdmr_list);
err_free_pamts:
tdmrs_free_pamt_all(&tdx_tdmr_list);
err_free_tdmrs:
free_tdmr_list(&tdx_tdmr_list);
err_free_tdxmem:
        free_tdx_memlist(&tdx_memlist);
        goto out_put_tdxmem;
}
static int __tdx_enable(void)
{
        int ret;

        ret = init_tdx_module();
        if (ret) {
                pr_err("module initialization failed (%d)\n", ret);
                tdx_module_status = TDX_MODULE_ERROR;
                return ret;
        }
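        /*
         * Sketch of the elided success path (assumed): record that the
         * module is now initialized so later callers can short-circuit.
         */
        pr_info("module initialized\n");
        tdx_module_status = TDX_MODULE_INITIALIZED;

        return 0;
}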
/**
 * tdx_enable - Enable TDX module to make it ready to run TDX guests
 *
 * This function assumes the caller has: 1) held read lock of CPU hotplug
 * lock to prevent any new cpu from becoming online; 2) done both VMXON
 * and tdx_cpu_enable() on all online cpus.
 *
 * This function requires there's at least one online cpu for each CPU
 * package to succeed.
 *
 * This function can be called in parallel by multiple callers.
 *
 * Return 0 if TDX is enabled successfully, otherwise error.
 */
int tdx_enable(void)
{
        int ret;

        if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
                return -ENODEV;
lockdep_assert_cpus_held();
mutex_lock(&tdx_module_lock);
        switch (tdx_module_status) {
        case TDX_MODULE_UNINITIALIZED:
                ret = __tdx_enable();
                break;
        case TDX_MODULE_INITIALIZED:
                /* Already initialized, great, tell the caller. */
                ret = 0;
                break;
        default:
                /* Failed to initialize in the previous attempts */
                ret = -EINVAL;
                break;
}
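        /*
         * Sketch of the elided tail (assumed): drop the module lock and
         * return the result.
         */
        mutex_unlock(&tdx_module_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(tdx_enable);

/*
 * Sketch of the helper the fragment below belongs to: is_pamt_page() is
 * named by the construct_tdmrs() comment above; the signature and the
 * locals are assumed.  It checks whether @phys falls inside any PAMT.
 */
static bool is_pamt_page(unsigned long phys)
{
        struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
        int i;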
/* Ensure that all remote 'tdmr_list' writes are visible: */
smp_rmb();
        /*
         * The TDX module is no longer returning TDX_SYS_NOT_READY and
         * is initialized.  The 'tdmr_list' was initialized long ago
         * and is now read-only.
         */
        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
                unsigned long base, size;
                if (phys >= base && phys < (base + size))
                        return true;
}
        return false;
}
/*
 * Return whether the memory page at the given physical address is TDX
 * private memory or not.
 *
 * This can be imprecise for two known reasons:
 * 1. PAMTs are private memory and exist before the TDX module is
 *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
 *    short window that occurs once per boot.
 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
 *    page.  However, the page can still cause #MC until it has been
 *    fully converted to shared using 64-byte writes like MOVDIR64B.
 *    Buggy hosts might still leave #MC-causing memory in place which
 *    this function can not detect.
 */
static bool paddr_is_tdx_private(unsigned long phys)
{
        struct tdx_module_args args = {
                .rcx = phys & PAGE_MASK,
        };
        u64 sret;

        if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
                return false;
/* Get page type from the TDX module */
sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
        /*
         * The SEAMCALL will not return success unless there is a
         * working, "ready" TDX module.  Assume an absence of TDX
         * private pages until SEAMCALL is working.
         */
        if (sret)
                return false;
        /*
         * SEAMCALL was successful -- read page type (via RCX):
         *
         *  - PT_NDA:  Page is not used by the TDX module
         *  - PT_RSVD: Reserved for Non-TDX use
         *  - Others:  Page is used by the TDX module
         *
         * Note PAMT pages are marked as PT_RSVD but they are also TDX
         * private memory.
         */
        switch (args.rcx) {
        case PT_NDA:
                return false;
        case PT_RSVD:
                return is_pamt_page(phys);
        default:
                return true;
}
}
/*
 * Some TDX-capable CPUs have an erratum.  A write to TDX private
 * memory poisons that memory, and a subsequent read of that memory
 * triggers #MC.
 *
 * Help distinguish erratum-triggered #MCs from a normal hardware one.
 * Just print additional message to show such #MC may be result of the
 * erratum.
 */
const char *tdx_dump_mce_info(struct mce *m)
{
        if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
                return NULL;

        if (!paddr_is_tdx_private(m->addr))
                return NULL;

        return "TDX private memory error. Possible kernel bug.";
}
static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
u32 *nr_tdx_keyids)
{
        u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
        int ret;
        /*
         * IA32_MKTME_KEYID_PARTITIONING:
         *   Bit [31:0]:  Number of MKTME KeyIDs.
         *   Bit [63:32]: Number of TDX private KeyIDs.
         */
        ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
                        &_nr_tdx_keyids);
        if (ret || !_nr_tdx_keyids)
                return -EINVAL;
/* TDX KeyIDs start after the last MKTME KeyID. */
_tdx_keyid_start = _nr_mktme_keyids + 1;
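        /*
         * Sketch of the elided tail (assumed): hand the partitioning back
         * to the caller.
         */
        *tdx_keyid_start = _tdx_keyid_start;
        *nr_tdx_keyids = _nr_tdx_keyids;

        return 0;
}

/*
 * Sketch of the helper the fragment below belongs to: the name
 * is_tdx_memory matches the call further below; the signature is
 * assumed.  It checks whether [start_pfn, end_pfn) is covered by the
 * TDX memory list.
 */
static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
{
        struct tdx_memblock *tmb;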
        /*
         * This check assumes that the start_pfn<->end_pfn range does not
         * cross multiple @tdx_memlist entries.  A single memory online
         * event across multiple memblocks (from which @tdx_memlist
         * entries are derived at the time of module initialization) is
         * not possible.  This is because memory offline/online is done
         * on granularity of 'struct memory_block', and the hotpluggable
         * memory region (one memblock) must be multiple of memory_block.
         */
        list_for_each_entry(tmb, &tdx_memlist, list) {
                if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
                        return true;
        }

        return false;
}
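/*
 * Sketch of the memory hotplug notifier the fragment below belongs to.
 * The function name, the notifier wiring and the 'mn' argument are
 * assumed; only the checks below appear in this excerpt.
 */
static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
                               void *v)
{
        struct memory_notify *mn = v;

        if (action != MEM_GOING_ONLINE)
                return NOTIFY_OK;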
        /*
         * Empty list means TDX isn't enabled.  Allow any memory
         * to go online.
         */
        if (list_empty(&tdx_memlist))
                return NOTIFY_OK;

        /*
         * The TDX memory configuration is static and can not be
         * changed.  Reject onlining any memory which is outside of
         * the static configuration whether it supports TDX or not.
         */
        if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
                return NOTIFY_OK;
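        /*
         * Sketch of the elided tail (assumed): anything else falls outside
         * the fixed TDX configuration, so refuse to online it.
         */
        return NOTIFY_BAD;
}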
static void __init check_tdx_erratum(void)
{
        /*
         * These CPUs have an erratum.  A partial write from non-TD
         * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
         * private memory poisons that memory, and a subsequent read of
         * that memory triggers #MC.
         */
        switch (boot_cpu_data.x86_vfm) {
        case INTEL_SAPPHIRERAPIDS_X:
        case INTEL_EMERALDRAPIDS_X:
setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
}
}
void __init tdx_init(void)
{
        u32 tdx_keyid_start, nr_tdx_keyids;
        int err;

        err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
        if (err)
                return;
        /*
         * The TDX module itself requires one 'global KeyID' to protect
         * its metadata.  If there's only one TDX KeyID, there won't be
         * any left for TDX guests thus there's no point to enable TDX
         * at all.
         */
        if (nr_tdx_keyids < 2) {
                pr_err("initialization failed: too few private KeyIDs available.\n");
                return;
}
        /*
         * At this point, hibernation_available() indicates whether or
         * not hibernation support has been permanently disabled.
         */
        if (hibernation_available()) {
                pr_err("initialization failed: Hibernation support is enabled\n");
                return;
}
#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
        pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
        acpi_suspend_lowlevel = NULL;
#endif
        /*
         * Just use the first TDX KeyID as the 'global KeyID' and
         * leave the rest for TDX guests.
         */
tdx_global_keyid = tdx_keyid_start;
tdx_guest_keyid_start = tdx_keyid_start + 1;
tdx_nr_guest_keyids = nr_tdx_keyids - 1;
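        /*
         * Sketch of the elided tail of tdx_init() (assumed): mark the
         * platform as TDX-capable and apply the erratum check defined
         * above.
         */
        setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);

        check_tdx_erratum();
}

/*
 * Sketch of the accessor the fragment below belongs to.  The name
 * tdx_get_sysinfo and the return type are assumed; it exposes
 * @tdx_sysinfo to callers once the module is initialized.
 */
const struct tdx_sys_info *tdx_get_sysinfo(void)
{
        const struct tdx_sys_info *p = NULL;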
/* Make sure all fields in @tdx_sysinfo have been populated */
        mutex_lock(&tdx_module_lock);
        if (tdx_module_status == TDX_MODULE_INITIALIZED)
                p = (const struct tdx_sys_info *)&tdx_sysinfo;
mutex_unlock(&tdx_module_lock);
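        /* Sketch of the elided tail (assumed). */
        return p;
}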
/*
 * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
 * a CLFLUSH of pages is required before handing them to the TDX module.
 * Be conservative and make the code simpler by doing the CLFLUSH
 * unconditionally.
 */
static void tdx_clflush_page(struct page *page)
{
clflush_cache_range(page_to_virt(page), PAGE_SIZE);
}
/*
 * TDX ABI defines output operands as PT, OWNER and SIZE.  These are
 * TDX-defined formats.  So despite the names, they must be interpreted
 * specially as described by the spec.  Return them only for error
 * reporting purposes.
 */
u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner,
                            u64 *tdx_size)
{
        struct tdx_module_args args = {
.rcx = page_to_phys(page),
};
u64 ret;
ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args);
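        /*
         * Sketch of the elided tail (assumed from the comment above):
         * hand back the PT/OWNER/SIZE output operands for error
         * reporting and return the SEAMCALL result.
         */
        *tdx_pt = args.rcx;
        *tdx_owner = args.rdx;
        *tdx_size = args.r8;

        return ret;
}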