/* * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. * Copyright (c) 2020, Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/
spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
push_mkey_locked(ent, mkey_out->mkey);
ent->pending--; /* If we are doing fill_to_high_water then keep going. */
queue_adjust_cache_locked(ent);
spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
kfree(mkey_out);
}
staticint get_mkc_octo_size(unsignedint access_mode, unsignedint ndescs)
{ int ret = 0;
switch (access_mode) { case MLX5_MKC_ACCESS_MODE_MTT:
ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / sizeof(struct mlx5_mtt)); break; case MLX5_MKC_ACCESS_MODE_KSM:
ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / sizeof(struct mlx5_klm)); break; default:
WARN_ON(1);
} return ret;
}
/* Asynchronously schedule new MRs to be populated in the cache. */ staticint add_keys(struct mlx5_cache_ent *ent, unsignedint num)
{ struct mlx5r_async_create_mkey *async_create; void *mkc; int err = 0; int i;
for (i = 0; i < num; i++) {
async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
GFP_KERNEL); if (!async_create) return -ENOMEM;
mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
memory_key_mkey_entry);
set_cache_mkc(ent, mkc);
async_create->ent = ent;
err = kstrtou32_from_user(buf, count, 0, &target); if (err) return err;
/* * Target is the new value of total_mrs the user requests, however we * cannot free MRs that are in use. Compute the target value for stored * mkeys.
*/
spin_lock_irq(&ent->mkeys_queue.lock); if (target < ent->in_use) {
err = -EINVAL; goto err_unlock;
}
target = target - ent->in_use; if (target < ent->limit || target > ent->limit*2) {
err = -EINVAL; goto err_unlock;
}
err = resize_available_mrs(ent, target, false); if (err) goto err_unlock;
spin_unlock_irq(&ent->mkeys_queue.lock);
err = kstrtou32_from_user(buf, count, 0, &var); if (err) return err;
/* * Upon set we immediately fill the cache to high water mark implied by * the limit.
*/
spin_lock_irq(&ent->mkeys_queue.lock);
ent->limit = var;
err = resize_available_mrs(ent, 0, true);
spin_unlock_irq(&ent->mkeys_queue.lock); if (err) return err; return count;
}
mutex_lock(&cache->rb_lock); for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
ent = rb_entry(node, struct mlx5_cache_ent, node);
spin_lock_irq(&ent->mkeys_queue.lock);
ret = ent->mkeys_queue.ci < ent->limit;
spin_unlock_irq(&ent->mkeys_queue.lock); if (ret) {
mutex_unlock(&cache->rb_lock); returntrue;
}
}
mutex_unlock(&cache->rb_lock); returnfalse;
}
/* * Check if the bucket is outside the high/low water mark and schedule an async * update. The cache refill has hysteresis, once the low water mark is hit it is * refilled up to the high mark.
*/ staticvoid queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
lockdep_assert_held(&ent->mkeys_queue.lock);
if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp) return; if (ent->mkeys_queue.ci < ent->limit) {
ent->fill_to_high_water = true;
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
} elseif (ent->fill_to_high_water &&
ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) { /* * Once we start populating due to hitting a low water mark * continue until we pass the high water mark.
*/
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
} elseif (ent->mkeys_queue.ci == 2 * ent->limit) {
ent->fill_to_high_water = false;
} elseif (ent->mkeys_queue.ci > 2 * ent->limit) { /* Queue deletion of excess entries */
ent->fill_to_high_water = false; if (ent->pending)
queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
secs_to_jiffies(1)); else
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
}
}
spin_lock_irq(&ent->mkeys_queue.lock); if (ent->disabled) goto out;
if (ent->fill_to_high_water &&
ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
!READ_ONCE(dev->fill_delay)) {
spin_unlock_irq(&ent->mkeys_queue.lock);
err = add_keys(ent, 1);
spin_lock_irq(&ent->mkeys_queue.lock); if (ent->disabled) goto out; if (err) { /* * EAGAIN only happens if there are pending MRs, so we * will be rescheduled when storing them. The only * failure path here is ENOMEM.
*/ if (err != -EAGAIN) {
mlx5_ib_warn(
dev, "add keys command failed, err %d\n",
err);
queue_delayed_work(cache->wq, &ent->dwork,
secs_to_jiffies(1));
}
}
} elseif (ent->mkeys_queue.ci > 2 * ent->limit) { bool need_delay;
/* * The remove_cache_mr() logic is performed as garbage * collection task. Such task is intended to be run when no * other active processes are running. * * The need_resched() will return TRUE if there are user tasks * to be activated in near future. * * In such case, we don't execute remove_cache_mr() and postpone * the garbage collection work to try to run in next cycle, in * order to free CPU resources to other tasks.
*/
spin_unlock_irq(&ent->mkeys_queue.lock);
need_delay = need_resched() || someone_adding(cache) ||
!time_after(jiffies,
READ_ONCE(cache->last_add) + 300 * HZ);
spin_lock_irq(&ent->mkeys_queue.lock); if (ent->disabled) goto out; if (need_delay) {
queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); goto out;
}
remove_cache_mr_locked(ent);
queue_adjust_cache_locked(ent);
}
out:
spin_unlock_irq(&ent->mkeys_queue.lock);
}
ent = container_of(work, struct mlx5_cache_ent, dwork.work); /* temp entries are never filled, only cleaned */ if (ent->is_tmp)
clean_keys(ent->dev, ent); else
__cache_work_func(ent);
}
/*
 * Total ordering over cache bucket keys for the rb-tree.
 *
 * Returns <0, 0 or >0 in the usual comparator convention. All properties
 * are compared for an exact match; ndescs is deliberately compared last
 * (see comment below).
 */
static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
			     struct mlx5r_cache_rb_key key2)
{
	int res;

	res = key1.ats - key2.ats;
	if (res)
		return res;

	res = key1.access_mode - key2.access_mode;
	if (res)
		return res;

	res = key1.access_flags - key2.access_flags;
	if (res)
		return res;

	res = key1.st_index - key2.st_index;
	if (res)
		return res;

	res = key1.ph - key2.ph;
	if (res)
		return res;

	/*
	 * keep ndescs the last in the compare table since the find function
	 * searches for an exact match on all properties and only closest
	 * match in size.
	 */
	return key1.ndescs - key2.ndescs;
}
/* Figure out where to put new node */ while (*new) {
cur = rb_entry(*new, struct mlx5_cache_ent, node);
parent = *new;
cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key); if (cmp > 0) new = &((*new)->rb_left); if (cmp < 0) new = &((*new)->rb_right); if (cmp == 0) return -EEXIST;
}
/* Add new node and rebalance tree. */
rb_link_node(&ent->node, parent, new);
rb_insert_color(&ent->node, &cache->rb_root);
/* * Find the smallest ent with order >= requested_order.
*/ while (node) {
cur = rb_entry(node, struct mlx5_cache_ent, node);
cmp = cache_ent_key_cmp(cur->rb_key, rb_key); if (cmp > 0) {
smallest = cur;
node = node->rb_left;
} if (cmp < 0)
node = node->rb_right; if (cmp == 0) return cur;
}
/* * Limit the usage of mkeys larger than twice the required size while * also allowing the usage of smallest cache entry for small MRs.
*/
ndescs_limit = max_t(u64, rb_key.ndescs * 2,
MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
ret = mlx5_cache_ent_insert(&dev->cache, ent); if (ret) goto ent_insert_err;
if (persistent_entry) { if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
order = MLX5_IMR_KSM_CACHE_ENTRY; else
order = order_base_2(rb_key.ndescs) - 2;
/* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
MLX5_SET64(mkc, mkc, len, iova + length);
MLX5_SET(mkc, mkc, free, 0);
MLX5_SET(mkc, mkc, umr_en, 0);
err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) goto err_2;
inlen = MLX5_ST_SZ_BYTES(create_mkey_in); if (populate)
inlen += sizeof(*pas) *
roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
in = kvzalloc(inlen, GFP_KERNEL); if (!in) {
err = -ENOMEM; goto err_1;
}
pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); if (populate) { if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
err = -EINVAL; goto err_2;
}
mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
pg_cap ? MLX5_IB_MTT_PRESENT : 0);
}
/* The pg_access bit allows setting the access flags * in the page list submitted with the command.
*/
MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
populate ? pd : dev->umrc.pd); /* In case a data direct flow, overwrite the pdn field by its internal kernel PD */ if (umem->is_dmabuf && ksm_mode)
MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);
switch (mdm->type) { case MLX5_IB_UAPI_DM_TYPE_MEMIC: if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS) return ERR_PTR(-EINVAL);
mode = MLX5_MKC_ACCESS_MODE_MEMIC;
start_addr -= pci_resource_start(dev->pdev, 0); break; case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM: if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS) return ERR_PTR(-EINVAL);
if (xlt_with_umr) { /* * If the MR was created with reg_create then it will be * configured properly but left disabled. It is safe to go ahead * and configure it again via UMR while enabling it.
*/
err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); if (err) {
mlx5_ib_dereg_mr(&mr->ibmr, NULL); return ERR_PTR(err);
}
} return &mr->ibmr;
}
staticstruct ib_mr *
reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
u64 length, u64 virt_addr, int fd, int access_flags)
{ struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_data_direct_dev *data_direct_dev; struct ib_mr *crossing_mr; struct ib_mr *crossed_mr; int ret = 0;
/* As of HW behaviour the IOVA must be page aligned in KSM mode */ if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND)) return ERR_PTR(-EOPNOTSUPP);
mutex_lock(&dev->data_direct_lock);
data_direct_dev = dev->data_direct_dev; if (!data_direct_dev) {
ret = -EINVAL; goto end;
}
/* The device's 'data direct mkey' was created without RO flags to * simplify things and allow for a single mkey per device. * Since RO is not a must, mask it out accordingly.
*/
access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
offset, length, virt_addr, fd,
access_flags, MLX5_MKC_ACCESS_MODE_KSM,
NULL); if (IS_ERR(crossed_mr)) {
ret = PTR_ERR(crossed_mr); goto end;
}
mutex_lock(&dev->slow_path_mutex);
crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
crossed_mr->lkey);
mutex_unlock(&dev->slow_path_mutex); if (IS_ERR(crossing_mr)) {
__mlx5_ib_dereg_mr(crossed_mr);
ret = PTR_ERR(crossing_mr); goto end;
}
/* * True if the change in access flags can be done via UMR, only some access * flags can be updated.
*/ staticbool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, unsignedint current_access_flags, unsignedint target_access_flags)
{ unsignedint diffs = current_access_flags ^ target_access_flags;
/* We only track the allocated sizes of MRs from the cache */ if (!mr->mmkey.cache_ent) returnfalse; if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) returnfalse;
staticint umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, int access_flags, int flags, struct ib_umem *new_umem,
u64 iova, unsignedlong page_size)
{ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; struct ib_umem *old_umem = mr->umem; int err;
/* * To keep everything simple the MR is revoked before we start to mess * with it. This ensure the change is atomic relative to any use of the * MR.
*/
err = mlx5r_umr_revoke_mr(mr); if (err) return err;
mr->ibmr.iova = iova;
mr->ibmr.length = new_umem->length;
mr->page_shift = order_base_2(page_size);
mr->umem = new_umem;
err = mlx5r_umr_update_mr_pas(mr, upd_flags); if (err) { /* * The MR is revoked at this point so there is no issue to free * new_umem.
*/
mr->umem = old_umem; return err;
}
if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) return ERR_PTR(-EOPNOTSUPP);
if (!(flags & IB_MR_REREG_ACCESS))
new_access_flags = mr->access_flags; if (!(flags & IB_MR_REREG_PD))
new_pd = ib_mr->pd;
if (!(flags & IB_MR_REREG_TRANS)) { struct ib_umem *umem;
/* Fast path for PD/access change */ if (can_use_umr_rereg_access(dev, mr->access_flags,
new_access_flags)) {
err = mlx5r_umr_rereg_pd_access(mr, new_pd,
new_access_flags); if (err) return ERR_PTR(err); return NULL;
} /* DM or ODP MR's don't have a normal umem so we can't re-use it */ if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) goto recreate;
/* * Only one active MR can refer to a umem at one time, revoke * the old MR before assigning the umem to the new one.
*/
err = mlx5r_umr_revoke_mr(mr); if (err) return ERR_PTR(err);
umem = mr->umem;
mr->umem = NULL;
atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
/* * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does * but the logic around releasing the umem is different
*/ if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) goto recreate;
new_umem = ib_umem_get(&dev->ib_dev, start, length,
new_access_flags); if (IS_ERR(new_umem)) return ERR_CAST(new_umem);
/* Fast path for PAS change */ if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
&page_size)) {
err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
new_umem, iova, page_size); if (err) {
ib_umem_release(new_umem); return ERR_PTR(err);
} return NULL;
} return create_real_mr(new_pd, new_umem, iova, new_access_flags, NULL);
}
/* * Everything else has no state we can preserve, just create a new MR * from scratch
*/
recreate: return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
new_access_flags, NULL, udata);
}
staticint
mlx5_alloc_priv_descs(struct ib_device *device, struct mlx5_ib_mr *mr, int ndescs, int desc_size)
{ struct mlx5_ib_dev *dev = to_mdev(device); struct device *ddev = &dev->mdev->pdev->dev; int size = ndescs * desc_size; int add_size; int ret;
add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) { int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));
add_size = min_t(int, end - size, add_size);
}
mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); if (!mr->descs_alloc) return -ENOMEM;
if (mr->mmkey.cacheable && !mlx5_umr_revoke_mr_with_lock(mr) &&
!cache_ent_find_and_store(dev, mr)) {
ent = mr->mmkey.cache_ent; /* upon storing to a clean temp entry - schedule its cleanup */
spin_lock_irq(&ent->mkeys_queue.lock); if (from_cache)
ent->in_use--; if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
secs_to_jiffies(30));
ent->tmp_cleanup_scheduled = true;
}
spin_unlock_irq(&ent->mkeys_queue.lock); return 0;
}
if (ent) {
spin_lock_irq(&ent->mkeys_queue.lock);
ent->in_use--;
mr->mmkey.cache_ent = NULL;
spin_unlock_irq(&ent->mkeys_queue.lock);
}
if (is_odp)
mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
if (is_odp_dma_buf)
dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
NULL);
ret = destroy_mkey(dev, mr); if (is_odp) { if (!ret)
to_ib_umem_odp(mr->umem)->private = NULL;
mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
}
if (is_odp_dma_buf) { if (!ret)
to_ib_umem_dmabuf(mr->umem)->private = NULL;
dma_resv_unlock(
to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
} return ret;
}
/* * Any async use of the mr must hold the refcount, once the refcount * goes to zero no other thread, such as ODP page faults, prefetch, any * UMR activity, etc can touch the mkey. Thus it is safe to destroy it.
*/ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
refcount_read(&mr->mmkey.usecount) != 0 &&
xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
mlx5r_deref_wait_odp_mkey(&mr->mmkey);
if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
mr->sig, NULL, GFP_KERNEL);
if (mr->mtt_mr) {
rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); if (rc) return rc;
mr->mtt_mr = NULL;
} if (mr->klm_mr) {
rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); if (rc) return rc;
mr->klm_mr = NULL;
}
if (mlx5_core_destroy_psv(dev->mdev,
mr->sig->psv_memory.psv_idx))
mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
mr->sig->psv_memory.psv_idx); if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
mr->sig->psv_wire.psv_idx);
kfree(mr->sig);
mr->sig = NULL;
}
if (mr->data_direct) return dereg_crossing_data_direct_mr(dev, mr);
return __mlx5_ib_dereg_mr(ibmr);
}
/*
 * Fill in the mkey context of a UMR-enabled mkey that starts out in the
 * free state. Used for kernel-owned mkeys only, so binding the PD here
 * is safe.
 */
static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	void *mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	/* access_mode is split across two mkc fields: bits [1:0] and [4:2]. */
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
	if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
	    access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		MLX5_SET(mkc, mkc, ma_translation_mode,
			 MLX5_CAP_GEN(dev->mdev, ats));
}
staticint _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, int ndescs, int desc_size, int page_shift, int access_mode, u32 *in, int inlen)
{ struct mlx5_ib_dev *dev = to_mdev(pd->device); int err;
/*
 * NOTE(review): extraction residue, not source code — a German website
 * disclaimer was accidentally appended to this file. Preserved below,
 * commented out so the file remains valid C:
 *
 * "Die Informationen auf dieser Webseite wurden nach bestem Wissen
 *  sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch
 *  Richtigkeit, noch Qualität der bereit gestellten Informationen
 *  zugesichert. Bemerkung: Die farbliche Syntaxdarstellung und die Messung
 *  sind noch experimentell."
 *
 * (English: "The information on this website was compiled carefully to the
 *  best of our knowledge. However, neither completeness, correctness, nor
 *  quality of the provided information is guaranteed. Note: the colored
 *  syntax highlighting and the measurement are still experimental.")
 */