diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index 2184af413b91..f3b66b55acfb 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -34,6 +34,7 @@
 #include "xe_res_cursor.h"
 #include "xe_sa.h"
 #include "xe_sched_job.h"
+#include "xe_sriov_vf_ccs.h"
 #include "xe_sync.h"
 #include "xe_trace_bo.h"
 #include "xe_validation.h"
@@ -1103,12 +1104,16 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
 	u32 batch_size, batch_size_allocated;
 	struct xe_device *xe = gt_to_xe(gt);
 	struct xe_res_cursor src_it, ccs_it;
+	struct xe_sriov_vf_ccs_ctx *ctx;
+	struct xe_sa_manager *bb_pool;
 	u64 size = xe_bo_size(src_bo);
 	struct xe_bb *bb = NULL;
 	u64 src_L0, src_L0_ofs;
 	u32 src_L0_pt;
 	int err;
 
+	ctx = &xe->sriov.vf.ccs.contexts[read_write];
+
 	xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
 	xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
@@ -1141,11 +1146,15 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
 		size -= src_L0;
 	}
 
+	bb_pool = ctx->mem.ccs_bb_pool;
+	guard(mutex)(xe_sa_bo_swap_guard(bb_pool));
+	xe_sa_bo_swap_shadow(bb_pool);
+
 	bb = xe_bb_ccs_new(gt, batch_size, read_write);
 	if (IS_ERR(bb)) {
 		drm_err(&xe->drm, "BB allocation failed.\n");
 		err = PTR_ERR(bb);
-		goto err_ret;
+		return err;
 	}
 
 	batch_size_allocated = batch_size;
@@ -1194,10 +1203,52 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
 	xe_assert(xe, (batch_size_allocated == bb->len));
 	src_bo->bb_ccs[read_write] = bb;
+	xe_sriov_vf_ccs_rw_update_bb_addr(ctx);
+	xe_sa_bo_sync_shadow(bb->bo);
 
 	return 0;
+}
 
-err_ret:
-	return err;
+/**
+ * xe_migrate_ccs_rw_copy_clear() - Clear the CCS read/write batch buffer
+ * content.
+ * @src_bo: The buffer object whose CCS batch buffer is to be cleared.
+ * @read_write: The CCS read/write context whose batch buffer is cleared.
+ *
+ * Directly clearing the BB lacks atomicity and can lead to undefined
+ * behavior if the vCPU is halted mid-operation during the clearing
+ * process. To avoid this issue, we use a shadow buffer object approach.
+ *
+ * First swap the SA BO address with the shadow BO, perform the clearing
+ * operation on the BB, update the shadow BO in the ring buffer, then
+ * sync the shadow and the actual buffer to maintain consistency.
+ *
+ * Returns: None.
+ */
+void xe_migrate_ccs_rw_copy_clear(struct xe_bo *src_bo,
+				  enum xe_sriov_vf_ccs_rw_ctxs read_write)
+{
+	struct xe_bb *bb = src_bo->bb_ccs[read_write];
+	struct xe_device *xe = xe_bo_device(src_bo);
+	struct xe_sriov_vf_ccs_ctx *ctx;
+	struct xe_sa_manager *bb_pool;
+	u32 *cs;
+
+	xe_assert(xe, IS_SRIOV_VF(xe));
+
+	ctx = &xe->sriov.vf.ccs.contexts[read_write];
+	bb_pool = ctx->mem.ccs_bb_pool;
+
+	guard(mutex)(xe_sa_bo_swap_guard(bb_pool));
+	xe_sa_bo_swap_shadow(bb_pool);
+
+	cs = xe_sa_bo_cpu_addr(bb->bo);
+	memset(cs, MI_NOOP, bb->len * sizeof(u32));
+	xe_sriov_vf_ccs_rw_update_bb_addr(ctx);
+
+	xe_sa_bo_sync_shadow(bb->bo);
+
+	xe_bb_free(bb, NULL);
+	src_bo->bb_ccs[read_write] = NULL;
 }
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
index 260e298e5dd7..464c05dde1ba 100644
--- a/drivers/gpu/drm/xe/xe_migrate.h
+++ b/drivers/gpu/drm/xe/xe_migrate.h
@@ -134,6 +134,9 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
 			   struct xe_bo *src_bo,
 			   enum xe_sriov_vf_ccs_rw_ctxs read_write);
 
+void xe_migrate_ccs_rw_copy_clear(struct xe_bo *src_bo,
+				  enum xe_sriov_vf_ccs_rw_ctxs read_write);
+
 struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate);
 struct xe_exec_queue *xe_migrate_exec_queue(struct xe_migrate *migrate);
 struct dma_fence *xe_migrate_vram_copy_chunk(struct xe_bo *vram_bo, u64 vram_offset,
diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
index 9959d619addc..33f4238604e1 100644
--- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
+++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
@@ -150,7 +150,8 @@ static int alloc_bb_pool(struct xe_tile *tile, struct xe_sriov_vf_ccs_ctx *ctx)
 	xe_sriov_info(xe, "Allocating %s CCS BB pool size = %lldMB\n",
 		      ctx->ctx_id ? "Restore" : "Save", bb_pool_size / SZ_1M);
 
-	sa_manager = xe_sa_bo_manager_init(tile, bb_pool_size, SZ_16);
+	sa_manager = __xe_sa_bo_manager_init(tile, bb_pool_size, SZ_4K, SZ_16,
+					     XE_SA_BO_MANAGER_FLAG_SHADOW);
 
 	if (IS_ERR(sa_manager)) {
 		xe_sriov_err(xe, "Suballocator init failed with error: %pe\n",
@@ -384,6 +385,18 @@ err_ret:
 	return err;
 }
 
+#define XE_SRIOV_VF_CCS_RW_BB_ADDR_OFFSET	(2 * sizeof(u32))
+void xe_sriov_vf_ccs_rw_update_bb_addr(struct xe_sriov_vf_ccs_ctx *ctx)
+{
+	u64 addr = xe_sa_manager_gpu_addr(ctx->mem.ccs_bb_pool);
+	struct xe_lrc *lrc = xe_exec_queue_lrc(ctx->mig_q);
+	struct xe_device *xe = gt_to_xe(ctx->mig_q->gt);
+
+	xe_device_wmb(xe);
+	xe_map_wr(xe, &lrc->bo->vmap, XE_SRIOV_VF_CCS_RW_BB_ADDR_OFFSET, u32, addr);
+	xe_device_wmb(xe);
+}
+
 /**
  * xe_sriov_vf_ccs_attach_bo - Insert CCS read write commands in the BO.
  * @bo: the &buffer object to which batch buffer commands will be added.
@@ -444,9 +457,7 @@ int xe_sriov_vf_ccs_detach_bo(struct xe_bo *bo)
 		if (!bb)
 			continue;
 
-		memset(bb->cs, MI_NOOP, bb->len * sizeof(u32));
-		xe_bb_free(bb, NULL);
-		bo->bb_ccs[ctx_id] = NULL;
+		xe_migrate_ccs_rw_copy_clear(bo, ctx_id);
 	}
 	return 0;
 }
diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h
index f8ca6efce9ee..00e58b36c510 100644
--- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h
+++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.h
@@ -20,6 +20,7 @@ int xe_sriov_vf_ccs_detach_bo(struct xe_bo *bo);
 int xe_sriov_vf_ccs_register_context(struct xe_device *xe);
 void xe_sriov_vf_ccs_rebase(struct xe_device *xe);
 void xe_sriov_vf_ccs_print(struct xe_device *xe, struct drm_printer *p);
+void xe_sriov_vf_ccs_rw_update_bb_addr(struct xe_sriov_vf_ccs_ctx *ctx);
 
 static inline bool xe_sriov_vf_ccs_ready(struct xe_device *xe)
 {
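
---
For context, both the copy path and the new clear path follow the same swap/modify/republish/sync discipline. The sketch below restates that sequence in one place; it is illustrative only and not part of the patch. The helper name ccs_bb_swap_and_clear() is hypothetical, and it assumes (as the patch's guard(mutex)(...) usage implies) that xe_sa_bo_swap_guard() returns the struct mutex * serializing the pool's live/shadow swap.

/*
 * Illustrative sketch only (not part of the patch): the shadow-BO
 * discipline used by xe_migrate_ccs_rw_copy_clear() above. All callees
 * are the ones this patch introduces or uses; only the helper name is
 * hypothetical.
 */
static void ccs_bb_swap_and_clear(struct xe_sriov_vf_ccs_ctx *ctx,
				  struct xe_bb *bb)
{
	struct xe_sa_manager *bb_pool = ctx->mem.ccs_bb_pool;
	u32 *cs;

	/* Serialize swaps; the scoped lock drops at end of function. */
	guard(mutex)(xe_sa_bo_swap_guard(bb_pool));

	/* 1) Retarget the pool at the shadow BO so the hardware keeps
	 *    seeing a valid, unmodified BB throughout the update.
	 */
	xe_sa_bo_swap_shadow(bb_pool);

	/* 2) Mutate the now-inactive copy: overwrite the BB with MI_NOOPs. */
	cs = xe_sa_bo_cpu_addr(bb->bo);
	memset(cs, MI_NOOP, bb->len * sizeof(u32));

	/* 3) Republish: point the ring/LRC at the pool's new GPU address. */
	xe_sriov_vf_ccs_rw_update_bb_addr(ctx);

	/* 4) Copy the cleared contents back so both BOs agree again. */
	xe_sa_bo_sync_shadow(bb->bo);
}

Because the live BB is never modified in place, a vCPU halted at any point in this sequence leaves the ring referencing either the old, intact BB or the fully cleared one, never a half-cleared buffer.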