/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
migf->state = MLX5_MIGF_STATE_ERROR;
wake_up_interruptible(&migf->poll_wait);
}
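
/*
 * Illustration only: a minimal sketch, not the in-tree handler, of how a
 * poll callback on the saving FD can surface the MLX5_MIGF_STATE_ERROR set
 * by mlx5vf_mark_err() to userspace. The function name is hypothetical.
 */
static __poll_t example_save_poll(struct file *filp,
				  struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	/* register on the waitqueue that mlx5vf_mark_err() wakes */
	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		/* report readable + error so a blocked reader can bail out */
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLERR;
	mutex_unlock(&migf->lock);

	return pollflags;
}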
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int num_chunks;
	int ret;
	int i;

if (mvdev->chunk_mode) {
size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
/* from firmware perspective at least 'state_size' buffer should be set */
inc_state_size = max(state_size, chunk_size);
	} else {
		if (track) {
			/*
			 * Let's be ready for a stop_copy size that might grow
			 * by 10 percent.
			 */
			if (check_add_overflow(state_size, state_size / 10,
					       &inc_state_size))
				inc_state_size = state_size;
		} else {
			inc_state_size = state_size;
		}
	}

/* let's not overflow the device specification max SAVE size */
inc_state_size = min_t(size_t, inc_state_size,
(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
	for (i = 0; i < num_chunks; i++) {
buf = mlx5vf_get_data_buffer(
migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE),
			DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
}
migf->buf[i] = buf;
buf = mlx5vf_get_data_buffer(
migf,
DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
PAGE_SIZE),
			DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
}
		migf->buf_header[i] = buf;
		if (mvdev->chunk_mode) {
migf->buf[i]->stop_copy_chunk_num = i + 1;
migf->buf_header[i]->stop_copy_chunk_num = i + 1;
INIT_WORK(&migf->save_data[i].work,
mlx5vf_mig_file_save_work);
migf->save_data[i].chunk_num = i + 1;
}
}
	ret = mlx5vf_add_stop_copy_header(migf, track);
	if (ret)
		goto err;

	return 0;

err:
	for (i = 0; i < num_chunks; i++) {
		if (migf->buf[i]) {
mlx5vf_put_data_buffer(migf->buf[i]);
migf->buf[i] = NULL;
		}
		if (migf->buf_header[i]) {
mlx5vf_put_data_buffer(migf->buf_header[i]);
migf->buf_header[i] = NULL;
}
	}

	return ret;
}
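
/*
 * Excerpt: mlx5vf_precopy_ioctl(), the VFIO_MIG_GET_PRECOPY_INFO handler.
 * Its prologue, which rejects other ioctl numbers and computes 'minsz'
 * from struct vfio_precopy_info, is elided here.
 */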
	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
}
	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query
	 * for extra bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command, so the code below is safe with the
		 * proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
}
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
}
if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
}
	mutex_unlock(&migf->lock);

	/*
	 * We finished transferring the current state and the device has a
	 * dirty state; save a new state so it is ready to be read.
	 */
buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE),
				     DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
}
	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
}
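
/*
 * Excerpt: mlx5vf_pci_save_device_data(), which builds the migration file
 * that userspace reads the device state from. The 'done' and error-unwind
 * tail of the ioctl above is elided.
 */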
	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		kfree(migf);
		return ERR_PTR(ret);
}
migf->mvdev = mvdev;
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
complete(&migf->save_comp);
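	/*
	 * Start with the "semaphore" available: a SAVE submitter waits on
	 * save_comp before issuing a command, and the asynchronous
	 * completion path completes it again once the command is done.
	 */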
mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
	if (ret)
		goto out_pd;

	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
	if (ret)
		goto out_pd;

	if (track) {
		/* leave the allocated buffer ready for the stop-copy phase */
buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages,
					       DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_pd;
}
} else {
buf = migf->buf[0];
migf->buf[0] = NULL;
}
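
/*
 * Excerpt: mlx5vf_resume_write(), the write() handler that parses the
 * migration stream produced by the save side. 'vhca_buf' and
 * 'vhca_buf_header' are set up in its elided prologue.
 */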
	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
}
while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
&buf, &len, pos,
						      &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
{
u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE);
if (vhca_buf_header->npages < npages) {
mlx5vf_free_data_buffer(vhca_buf_header);
migf->buf_header[0] = mlx5vf_alloc_data_buffer(
					migf, npages, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
}
vhca_buf_header = migf->buf_header[0];
}
vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							   &buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
{
u64 size = max(migf->record_size,
migf->stop_copy_prep_size);
u32 npages = DIV_ROUND_UP(size, PAGE_SIZE);
if (vhca_buf->npages < npages) {
mlx5vf_free_data_buffer(vhca_buf);
migf->buf[0] = mlx5vf_alloc_data_buffer(
					migf, npages, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
}
vhca_buf = migf->buf[0];
}
vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
ret = mlx5vf_resume_read_image(migf, vhca_buf,
migf->record_size,
						     &buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
vhca_buf->length = 0;
			break;
		default:
			break;
}
}
out_unlock:
	if (ret)
migf->state = MLX5_MIGF_STATE_ERROR;
mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}
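
/*
 * The write() handler above is a state machine over migf->load_state: each
 * image in the stream is consumed as READ_HEADER, optionally
 * PREP_HEADER_DATA/READ_HEADER_DATA for header data, then
 * PREP_IMAGE/READ_IMAGE for the device state itself, and finally
 * LOAD_IMAGE, which hands the buffer to the device and rewinds the machine
 * to READ_HEADER for the next image.
 */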
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}
/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
mvdev->deferred_reset = false;
spin_unlock(&mvdev->reset_lock);
mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev, NULL);
		goto again;
}
mutex_unlock(&mvdev->state_mutex);
spin_unlock(&mvdev->reset_lock);
}
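
/* Excerpt: mlx5vf_pci_aer_reset_done(), the PCI reset-done handler. */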
	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock we need to prevent ABBA deadlock
	 * with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
}
spin_unlock(&mvdev->reset_lock);
mlx5vf_state_mutex_unlock(mvdev);
}
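
/*
 * Together, mlx5vf_state_mutex_unlock() and the reset handler above form a
 * handoff that keeps reset from sleeping on state_mutex: reset sets
 * 'deferred_reset' and cleans up itself only when the trylock succeeds;
 * otherwise whichever context holds state_mutex observes 'deferred_reset'
 * on unlock and performs the cleanup there.
 */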