	/*
	 * The drivers that bind to nodes in the platform-facilities
	 * hierarchy don't support node removal, and the removal directive
	 * from firmware is always followed by an add of an equivalent
	 * node. The capability (e.g. RNG, encryption, compression)
	 * represented by the node is never interrupted by the migration.
	 * So ignore changes to this part of the tree.
	 */
	if (is_platfac) {
		pr_notice("ignoring remove operation for %pOFfp\n", dn);
		return 0;
	}

	pr_debug("removing node %pOFfp\n", dn);
	dlpar_detach_node(dn);
	return 0;
}
	/* A negative 'vd' value indicates that only part of the new property
	 * value is contained in the buffer and we need to call
	 * ibm,update-properties again to get the rest of the value.
	 *
	 * A negative value is also the two's complement of the actual value.
	 */
	if (vd & 0x80000000) {
		vd = ~vd + 1;
		more = 1;
	}
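	/*
	 * Worked example: vd = 0xfffff000 decodes above to
	 * ~0xfffff000 + 1 = 0x1000, i.e. 4096 bytes of the value are
	 * present in this buffer and at least one more
	 * ibm,update-properties call is needed for the remainder.
	 */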
	if (new_prop) {
		/* partial property fixup */
		char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);

		if (!new_data)
			return -ENOMEM;

		/* Append this chunk ('value' is the caller-supplied data). */
		memcpy(new_data, new_prop->value, new_prop->length);
		memcpy(new_data + new_prop->length, value, vd);

		kfree(new_prop->value);
		new_prop->value = new_data;
		new_prop->length += vd;
	}
	/* On the first call to ibm,update-properties for a node the
	 * first property value descriptor contains an empty
	 * property name, the property value length encoded as u32,
	 * and the property value is the node path being updated.
	 */
	if (*prop_data == 0) {
		prop_data++;
		vd = be32_to_cpu(*(__be32 *)prop_data);
		prop_data += vd + sizeof(vd);
		nprops--;
	}
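	/*
	 * Each subsequent descriptor in the buffer has the same shape
	 * as the first one described above: a NUL-terminated property
	 * name, a __be32 value length, then the value bytes.
	 */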
	dn = dlpar_configure_connector(drc_index, parent_dn);
	if (!dn)
		return -ENOENT;

	/*
	 * Since delete_dt_node() ignores this node type, this is the
	 * necessary counterpart. We also know that a platform-facilities
	 * node returned from dlpar_configure_connector() has children
	 * attached, and dlpar_attach_node() only adds the parent, leaking
	 * the children. So ignore these on the add side for now.
	 */
	if (of_node_is_type(dn, "ibm,platform-facilities")) {
		pr_notice("ignoring add operation for %pOF\n", dn);
		dlpar_free_cc_nodes(dn);
		return 0;
	}

	rc = dlpar_attach_node(dn, parent_dn);
	if (rc)
		dlpar_free_cc_nodes(dn);

	pr_debug("added node %pOFfp\n", dn);

	return rc;
}
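/*
 * Note: for ADD_DT_NODE actions, pseries_devicetree_update() below
 * consumes an extra word from the RTAS work area and passes it to
 * add_dt_node() as drc_index.
 */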
static int pseries_devicetree_update(s32 scope)
{
	char *rtas_buf;
	__be32 *data;
	int update_nodes_token;
	int rc;

	update_nodes_token = rtas_function_token(RTAS_FN_IBM_UPDATE_NODES);
	if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
		return 0;

	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
	if (!rtas_buf)
		return -ENOMEM;
do {
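		/*
		 * ibm,update-nodes returns 1 when more node updates
		 * remain than fit in one work area; the enclosing
		 * do/while repeats the call until it returns 0 or an
		 * error.
		 */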
		rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
		if (rc && rc != 1)
			break;
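		/* Skip the four-word header at the start of the work area. */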
		data = (__be32 *)rtas_buf + 4;
		while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
			int i;
			u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
			u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;

			data++;

			for (i = 0; i < node_count; i++) {
				struct device_node *np;
				__be32 phandle = *data++;
				__be32 drc_index;

				np = of_find_node_by_phandle(be32_to_cpu(phandle));
				if (!np) {
					pr_warn("Failed lookup: phandle 0x%x for action 0x%x\n",
						be32_to_cpu(phandle), action);
					continue;
				}

				switch (action) {
				case DELETE_DT_NODE:
					delete_dt_node(np);
					break;
				case UPDATE_DT_NODE:
					update_dt_node(np, scope);
					break;
				case ADD_DT_NODE:
					drc_index = *data++;
					add_dt_node(np, drc_index);
					break;
				}

				of_node_put(np);
				cond_resched();
			}
		}

		cond_resched();
	} while (rc == 1);
	kfree(rtas_buf);
	return rc;
}
void post_mobility_fixup(void)
{
	int rc;

	rtas_activate_firmware();

	/*
	 * We don't want CPUs to go online/offline while the device
	 * tree is being updated.
	 */
	cpus_read_lock();
	/*
	 * It's common for the destination firmware to replace cache
	 * nodes. Release all of the cacheinfo hierarchy's references
	 * before updating the device tree.
	 */
	cacheinfo_teardown();

	rc = pseries_devicetree_update(MIGRATION_SCOPE);
	if (rc)
		pr_err("device tree update failed: %d\n", rc);

	cacheinfo_rebuild();

	cpus_read_unlock();
/* Possibly switch to a new L1 flush type */
pseries_setup_security_mitigations();
/* Reinitialise system information for hv-24x7 */
read_24x7_sys_info();
return;
}
static int poll_vasi_state(u64 handle, unsigned long *res)
{
	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
	long hvrc;
	int ret;

	hvrc = plpar_hcall(H_VASI_STATE, retbuf, handle);
	switch (hvrc) {
	case H_SUCCESS:
		ret = 0;
		*res = retbuf[0];
		break;
	case H_PARAMETER:
		ret = -EINVAL;
		break;
	case H_FUNCTION:
		ret = -EOPNOTSUPP;
		break;
	case H_HARDWARE:
	default:
		pr_err("unexpected H_VASI_STATE result %ld\n", hvrc);
		ret = -EIO;
		break;
	}
	return ret;
}
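/*
 * Note for callers of poll_vasi_state(): -EINVAL (H_PARAMETER) can mean
 * the session handle is no longer valid, e.g. after the hypervisor has
 * cleaned up a completed migration, and -EOPNOTSUPP (H_FUNCTION) means
 * H_VASI_STATE is not implemented on this platform.
 */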
static int wait_for_vasi_session_suspending(u64 handle)
{
	unsigned long state;
	int ret;

	/*
	 * Wait for transition from H_VASI_ENABLED to
	 * H_VASI_SUSPENDING. Treat anything else as an error.
	 */
	while (true) {
		ret = poll_vasi_state(handle, &state);

		if (ret != 0 || state == H_VASI_SUSPENDING) {
			break;
		} else if (state == H_VASI_ENABLED) {
			ssleep(1);
		} else {
			pr_err("unexpected H_VASI_STATE result %lu\n", state);
			ret = -EIO;
			break;
		}
	}
	/*
	 * Proceed even if H_VASI_STATE is unavailable. If H_JOIN or
	 * ibm,suspend-me are also unimplemented, we'll recover then.
	 */
	if (ret == -EOPNOTSUPP)
		ret = 0;

	return ret;
}
static void wait_for_vasi_session_completed(u64 handle)
{
	unsigned long state = 0;
	int ret;

	pr_info("waiting for memory transfer to complete...\n");

	/*
	 * Wait for transition from H_VASI_RESUMED to H_VASI_COMPLETED.
	 */
	while (true) {
		ret = poll_vasi_state(handle, &state);

		/*
		 * If the memory transfer is already complete and the
		 * migration has been cleaned up by the hypervisor,
		 * H_PARAMETER is returned, which is translated to
		 * -EINVAL by poll_vasi_state().
		 */
		if (ret == -EINVAL || (!ret && state == H_VASI_COMPLETED)) {
			pr_info("memory transfer completed.\n");
			break;
		}

		if (ret) {
			pr_err("H_VASI_STATE returned error (%d)\n", ret);
			break;
		}

		if (state != H_VASI_RESUMED) {
			pr_err("unexpected H_VASI_STATE result %lu\n", state);
			break;
		}

		msleep(500);
	}
}
static void prod_single(unsigned int target_cpu)
{
	long hvrc;
	int hwid;

	hwid = get_hard_smp_processor_id(target_cpu);
	hvrc = plpar_hcall_norets(H_PROD, hwid);
	if (hvrc == H_SUCCESS)
		return;
	pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n",
			   target_cpu, hwid, hvrc);
}
static void prod_others(void)
{
	unsigned int cpu;

	for_each_online_cpu(cpu) {
		if (cpu != smp_processor_id())
			prod_single(cpu);
	}
}
static int do_suspend(void)
{
	u16 saved_slb_size;
	int status;
	int ret;

	pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id());

	/*
	 * The destination processor model may have fewer SLB entries
	 * than the source. We reduce mmu_slb_size to a safe minimum
	 * before suspending in order to minimize the possibility of
	 * programming non-existent entries on the destination. If
	 * suspend fails, we restore it before returning. On success
	 * the OF reconfig path will update it from the new device
	 * tree after resuming on the destination.
	 */
	saved_slb_size = clamp_slb_size();

	ret = rtas_ibm_suspend_me(&status);
	if (ret != 0) {
		pr_err("ibm,suspend-me error: %d\n", status);
		slb_set_size(saved_slb_size);
	}

	return ret;
}
/**
 * struct pseries_suspend_info - State shared between CPUs for join/suspend.
 * @counter: Threads are to increment this upon resuming from suspend
 *           or if an error is received from H_JOIN. The thread which performs
 *           the first increment (i.e. sets it to 1) is responsible for
 *           waking the other threads.
 * @done: False if join/suspend is in progress. True if the operation is
 *        complete (successful or not).
 */
struct pseries_suspend_info {
	atomic_t counter;
	bool done;
};
static int do_join(void *arg)
{
	struct pseries_suspend_info *info = arg;
	atomic_t *counter = &info->counter;
	long hvrc;
	int ret;

retry:
	/* Must ensure MSR.EE off for H_JOIN. */
	hard_irq_disable();
	hvrc = plpar_hcall_norets(H_JOIN);

	switch (hvrc) {
	case H_CONTINUE:
		/*
		 * All other CPUs are offline or in H_JOIN. This CPU
		 * attempts the suspend.
		 */
		ret = do_suspend();
		break;
	case H_SUCCESS:
		/*
		 * The suspend is complete and this cpu has received a
		 * prod, or we've received a stray prod from unrelated
		 * code (e.g. paravirt spinlocks) and we need to join
		 * again.
		 *
		 * This barrier orders the return from H_JOIN above vs
		 * the load of info->done. It pairs with the barrier
		 * in the wakeup/prod path below.
		 */
		smp_mb();
		if (READ_ONCE(info->done) == false) {
			pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying",
					    smp_processor_id());
			goto retry;
		}
		ret = 0;
		break;
	case H_BAD_MODE:
	case H_HARDWARE:
	default:
		ret = -EIO;
		pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",
				   hvrc, smp_processor_id());
		break;
	}

	if (atomic_inc_return(counter) == 1) {
		pr_info("CPU %u waking all threads\n", smp_processor_id());
		WRITE_ONCE(info->done, true);
		/*
		 * This barrier orders the store to info->done vs subsequent
		 * H_PRODs to wake the other CPUs. It pairs with the barrier
		 * in the H_SUCCESS case above.
		 */
		smp_mb();
		prod_others();
	}
	/*
	 * Execution may have been suspended for several seconds, so reset
	 * the watchdogs. touch_nmi_watchdog() also touches the soft lockup
	 * watchdog.
	 */
	rcu_cpu_stall_reset();
	touch_nmi_watchdog();

	return ret;
}
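/*
 * In short: every online CPU enters do_join() under stop_machine(). All
 * but one block in H_JOIN; the one that receives H_CONTINUE performs
 * the actual ibm,suspend-me. After the partition resumes, the first CPU
 * through atomic_inc_return() sets info->done and H_PRODs the rest out
 * of H_JOIN.
 */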
/*
 * Abort reason code byte 0. We use only the 'Migrating partition' value.
 */
enum vasi_aborting_entity {
ORCHESTRATOR = 1,
VSP_SOURCE = 2,
PARTITION_FIRMWARE = 3,
PLATFORM_FIRMWARE = 4,
VSP_TARGET = 5,
MIGRATING_PARTITION = 6,
};
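/*
 * The aborting entity occupies byte 0 (the most significant byte) of
 * the 32-bit abort reason code passed to H_VASI_SIGNAL; the low three
 * bytes carry entity-specific detail.
 */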
static void pseries_cancel_migration(u64 handle, int err)
{
	u32 reason_code;
	u32 detail;
	u8 entity;
	long hvrc;

	entity = MIGRATING_PARTITION;
	detail = abs(err) & 0xffffff;
	reason_code = (entity << 24) | detail;

	hvrc = plpar_hcall_norets(H_VASI_SIGNAL, handle,
				  H_VASI_SIGNAL_CANCEL, reason_code);
	if (hvrc != H_SUCCESS)
		pr_err("H_VASI_SIGNAL error: %ld\n", hvrc);
}

static int pseries_suspend(u64 handle)
{
	const unsigned int max_attempts = 5;
	unsigned int retry_interval_ms = 1;
	unsigned int attempt = 1;
	int ret;

	while (true) {
		struct pseries_suspend_info info;
		unsigned long vasi_state;
		int vasi_err;

		info = (struct pseries_suspend_info) {
			.counter = ATOMIC_INIT(0),
			.done = false,
		};

		ret = stop_machine(do_join, &info, cpu_online_mask);
		if (ret == 0)
			break;
		/*
		 * Encountered an error. If the VASI stream is still
		 * in Suspending state, it's likely a transient
		 * condition related to some device in the partition
		 * and we can retry in the hope that the cause has
		 * cleared after some delay.
		 *
		 * A better design would allow drivers etc to prepare
		 * for the suspend and avoid conditions which prevent
		 * the suspend from succeeding. For now, we have this
		 * mitigation.
		 */
		pr_notice("Partition suspend attempt %u of %u error: %d\n",
			  attempt, max_attempts, ret);

		if (attempt == max_attempts)
			break;

		vasi_err = poll_vasi_state(handle, &vasi_state);
		if (vasi_err == 0) {
			if (vasi_state != H_VASI_SUSPENDING) {
				pr_notice("VASI state %lu after failed suspend\n",
					  vasi_state);
				break;
			}
		} else if (vasi_err != -EOPNOTSUPP) {
			pr_err("VASI state poll error: %d", vasi_err);
			break;
		}

		pr_notice("Will retry partition suspend after %u ms\n",
			  retry_interval_ms);

		msleep(retry_interval_ms);
		retry_interval_ms *= 2;
		attempt++;
	}

	return ret;
}
static int pseries_migrate_partition(u64 handle)
{
	int ret;
	unsigned int factor = 0;

#ifdef CONFIG_PPC_WATCHDOG
	factor = nmi_wd_lpm_factor;
#endif

	/*
	 * When the migration is initiated, the hypervisor changes VAS
	 * mappings to prepare before OS gets the notification and
	 * closes all VAS windows. NX generates continuous faults during
	 * this time and the user space can not differentiate these
	 * faults from the migration event. So reduce this time window
	 * by closing VAS windows at the beginning of this function.
	 */
	vas_migration_handler(VAS_SUSPEND);

	ret = wait_for_vasi_session_suspending(handle);
	if (ret)
		goto out;

	if (factor)
		watchdog_hardlockup_set_timeout_pct(factor);
	ret = pseries_suspend(handle);
	if (ret == 0) {
		post_mobility_fixup();
		/*
		 * Wait until the memory transfer is complete, so that the user
		 * space process returns from the syscall after the transfer is
		 * complete. This allows the user hooks to be executed at the
		 * right time.
		 */
		wait_for_vasi_session_completed(handle);
	} else
		pseries_cancel_migration(handle, ret);

	if (factor)
		watchdog_hardlockup_set_timeout_pct(0);

out:
	vas_migration_handler(VAS_RESUME);

	return ret;
}
int rtas_syscall_dispatch_ibm_suspend_me(u64 handle)
{
	return pseries_migrate_partition(handle);
}
static ssize_t migration_store(const struct class *class,
			       const struct class_attribute *attr,
			       const char *buf, size_t count)
{
	u64 streamid;
	int rc;

	rc = kstrtou64(buf, 0, &streamid);
	if (rc)
		return rc;

	rc = pseries_migrate_partition(streamid);
	if (rc)
		return rc;

	return count;
}
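/*
 * User space (drmgr) initiates a migration by writing the VASI stream
 * id of the session to the sysfs attribute backed by migration_store()
 * above.
 */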
/*
 * Used by drmgr to determine the kernel behavior of the migration interface.
 *
 * Version 1: Performs all PAPR requirements for migration including
 *	firmware activation and device tree update.
 */
#define MIGRATION_API_VERSION	1
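/*
 * drmgr is assumed to read this version value through a read-only
 * sysfs attribute exported elsewhere in this file.
 */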