From 4a3f98cc34b8bfde694c6e1509989637973aff6b Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen
Date: Sat, 20 Jun 2020 03:50:34 +0200
Subject: [PATCH] amdgpu: Add SDMA copy support.

For SCANOUT images we need to use USWC memory. However, USWC memory is
very slow to read from (~25 MiB/s).

At the same time, for video decoding there are some images that are
allocated with SCANOUT but frequently accessed from the CPU. Just
mapping these results in unsatisfactory performance, so this patch adds
a DMA step to copy them to memory that is faster to access from the CPU.

Benchmarked on Grunt with
android.hardware.camera2.cts.RecordingTest#testVideoPreviewSurfaceSharing

The time that an image (~500 KiB) is kept mapped for processing,
including the time for mapping and unmapping:

  plain GTT (cacheable): 1.5-2 ms
  USWC:                  45-50 ms
  USWC w/ memcpy:        20-30 ms
  USWC w/ SDMA copy:     3.5-5.5 ms

We can clearly see that the Android video processing code only gets a
throughput of ~10 MiB/s with USWC memory. memcpy is slightly more
efficient at 20-30 MiB/s, but neither is suitable for 30+ fps video.

With the SDMA copy, the timing breaks down roughly as follows:

map:
 - allocate plain GTT BO: ~400-800 us
 - map src & dst BO into the GPU VM: ~25 us
 - submit SDMA copy: ~80 us
 - wait until the SDMA copy finishes: ~400 us
 - unmap src & dst BO from the GPU VM: ~15 us
 - map dst BO into the CPU: ~30 us

unmap:
 - unmap dst BO from the CPU: ~30 us
 - copy back not benchmarked (avoided for read-only mappings)
 - delete BO: ~100 us

Ideas for further improvement:
 - a BO cache
 - rely on implicit sync and don't wait for the copy during unmapping

Alternatives that have been rejected:
 - Use radeonsi + the DRI interface: each plane gets mapped into its own
   BO, which is an issue for gralloc.
 - Map each BO into the GPU VM more persistently: this needs proper
   address space management, which adds complexity. libdrm_amdgpu can do
   it for us, but brings its own can of worms with deduplication of the
   drm fd (which makes e.g. implicit sync not work with any radeonsi
   instances in the same process).
 - Use SDMA instead of DRI/radeonsi for more images: this is an issue
   because SDMA for images is a whole mess with lots of corner cases and
   lots of changes per generation. Furthermore, it wouldn't work for DCC
   compressed images.

TEST=Run android.hardware.camera2.cts.RecordingTest#testVideoPreviewSurfaceSharing
on Grunt.
BUG=b:152378755

Change-Id: I8f5e00ff4b6d9e31f78fd4de7eb62d0d3aa66438
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/minigbm/+/2256228
Tested-by: Bas Nieuwenhuizen
Reviewed-by: Gurchetan Singh
Commit-Queue: Bas Nieuwenhuizen
---
 amdgpu.c | 370 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 362 insertions(+), 8 deletions(-)

diff --git a/amdgpu.c b/amdgpu.c
index 15ceb45..3dce33a 100644
--- a/amdgpu.c
+++ b/amdgpu.c
@@ -29,6 +29,19 @@ struct amdgpu_priv {
         struct dri_driver dri;
         int drm_version;
+
+        /* sdma */
+        struct drm_amdgpu_info_device dev_info;
+        uint32_t sdma_ctx;
+        uint32_t sdma_cmdbuf_bo;
+        uint64_t sdma_cmdbuf_addr;
+        uint64_t sdma_cmdbuf_size;
+        uint32_t *sdma_cmdbuf_map;
+};
+
+struct amdgpu_linear_vma_priv {
+        uint32_t handle;
+        uint32_t map_flags;
 };

 const static uint32_t render_target_formats[] = { DRM_FORMAT_ABGR8888, DRM_FORMAT_ARGB8888,
@@ -39,6 +52,252 @@ const static uint32_t texture_source_formats[] = { DRM_FORMAT_GR88, DRM_FO
                                                     DRM_FORMAT_NV21, DRM_FORMAT_NV12,
                                                     DRM_FORMAT_YVU420_ANDROID, DRM_FORMAT_YVU420 };

+static int query_dev_info(int fd, struct drm_amdgpu_info_device *dev_info)
+{
+        struct drm_amdgpu_info info_args = { 0 };
+
+        info_args.return_pointer = (uintptr_t)dev_info;
+        info_args.return_size = sizeof(*dev_info);
+        info_args.query = AMDGPU_INFO_DEV_INFO;
+
+        return drmCommandWrite(fd, DRM_AMDGPU_INFO, &info_args, sizeof(info_args));
+}
+
+static int sdma_init(struct amdgpu_priv *priv, int fd)
+{
+        union drm_amdgpu_ctx ctx_args = { { 0 } };
+        union drm_amdgpu_gem_create gem_create = { { 0 } };
+        struct drm_amdgpu_gem_va va_args = { 0 };
+        union drm_amdgpu_gem_mmap gem_map = { { 0 } };
+        struct drm_gem_close gem_close = { 0 };
+        int ret;
+
+        /* Ensure we can make a submission without BO lists. */
+        if (priv->drm_version < 27)
+                return 0;
+
+        /* Anything outside this range needs adjustments to the SDMA copy commands */
+        if (priv->dev_info.family < AMDGPU_FAMILY_CI || priv->dev_info.family > AMDGPU_FAMILY_NV)
+                return 0;
+
+        ctx_args.in.op = AMDGPU_CTX_OP_ALLOC_CTX;
+
+        ret = drmCommandWriteRead(fd, DRM_AMDGPU_CTX, &ctx_args, sizeof(ctx_args));
+        if (ret < 0)
+                return ret;
+
+        priv->sdma_ctx = ctx_args.out.alloc.ctx_id;
+
+        priv->sdma_cmdbuf_size = ALIGN(4096, priv->dev_info.virtual_address_alignment);
+        gem_create.in.bo_size = priv->sdma_cmdbuf_size;
+        gem_create.in.alignment = 4096;
+        gem_create.in.domains = AMDGPU_GEM_DOMAIN_GTT;
+
+        ret = drmCommandWriteRead(fd, DRM_AMDGPU_GEM_CREATE, &gem_create, sizeof(gem_create));
+        if (ret < 0)
+                goto fail_ctx;
+
+        priv->sdma_cmdbuf_bo = gem_create.out.handle;
+
+        priv->sdma_cmdbuf_addr =
+                ALIGN(priv->dev_info.virtual_address_offset, priv->dev_info.virtual_address_alignment);
+
+        /* Map the buffer into the GPU address space so we can use it from the GPU */
+        va_args.handle = priv->sdma_cmdbuf_bo;
+        va_args.operation = AMDGPU_VA_OP_MAP;
+        va_args.flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_EXECUTABLE;
+        va_args.va_address = priv->sdma_cmdbuf_addr;
+        va_args.offset_in_bo = 0;
+        va_args.map_size = priv->sdma_cmdbuf_size;
+
+        ret = drmCommandWrite(fd, DRM_AMDGPU_GEM_VA, &va_args, sizeof(va_args));
+        if (ret)
+                goto fail_bo;
+
+        gem_map.in.handle = priv->sdma_cmdbuf_bo;
+        ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &gem_map);
+        if (ret)
+                goto fail_va;
+
+        priv->sdma_cmdbuf_map = mmap(0, priv->sdma_cmdbuf_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                                     fd, gem_map.out.addr_ptr);
+        if (priv->sdma_cmdbuf_map == MAP_FAILED) {
+                priv->sdma_cmdbuf_map = NULL;
+                ret = -ENOMEM;
+                goto fail_va;
+        }
+
+        return 0;
+fail_va:
+        va_args.operation = AMDGPU_VA_OP_UNMAP;
+        va_args.flags = 0;
+        drmCommandWrite(fd, DRM_AMDGPU_GEM_VA, &va_args, sizeof(va_args));
+fail_bo:
+        gem_close.handle = priv->sdma_cmdbuf_bo;
+        drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+fail_ctx:
+        memset(&ctx_args, 0, sizeof(ctx_args));
+        ctx_args.in.op = AMDGPU_CTX_OP_FREE_CTX;
+        ctx_args.in.ctx_id = priv->sdma_ctx;
+        drmCommandWriteRead(fd, DRM_AMDGPU_CTX, &ctx_args, sizeof(ctx_args));
+        return ret;
+}
+
+static void sdma_finish(struct amdgpu_priv *priv, int fd)
+{
+        union drm_amdgpu_ctx ctx_args = { { 0 } };
+        struct drm_amdgpu_gem_va va_args = { 0 };
+        struct drm_gem_close gem_close = { 0 };
+
+        if (!priv->sdma_cmdbuf_map)
+                return;
+
+        va_args.handle = priv->sdma_cmdbuf_bo;
+        va_args.operation = AMDGPU_VA_OP_UNMAP;
+        va_args.flags = 0;
+        va_args.va_address = priv->sdma_cmdbuf_addr;
+        va_args.offset_in_bo = 0;
+        va_args.map_size = priv->sdma_cmdbuf_size;
+        drmCommandWrite(fd, DRM_AMDGPU_GEM_VA, &va_args, sizeof(va_args));
+
+        gem_close.handle = priv->sdma_cmdbuf_bo;
+        drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+
+        ctx_args.in.op = AMDGPU_CTX_OP_FREE_CTX;
+        ctx_args.in.ctx_id = priv->sdma_ctx;
+        drmCommandWriteRead(fd, DRM_AMDGPU_CTX, &ctx_args, sizeof(ctx_args));
+}
+
+static int sdma_copy(struct amdgpu_priv *priv, int fd, uint32_t src_handle, uint32_t dst_handle,
+                     uint64_t size)
+{
+        const uint64_t max_size_per_cmd = 0x3fff00;
+        const uint32_t cmd_size = 7 * sizeof(uint32_t); /* 7 dwords, see loop below. */
+        const uint64_t max_commands = priv->sdma_cmdbuf_size / cmd_size;
+        uint64_t src_addr = priv->sdma_cmdbuf_addr + priv->sdma_cmdbuf_size;
+        uint64_t dst_addr = src_addr + size;
+        struct drm_amdgpu_gem_va va_args = { 0 };
+        unsigned cmd = 0;
+        uint64_t remaining_size = size;
+        uint64_t cur_src_addr = src_addr;
+        uint64_t cur_dst_addr = dst_addr;
+        struct drm_amdgpu_cs_chunk_ib ib = { 0 };
+        struct drm_amdgpu_cs_chunk chunks[2] = { { 0 } };
+        uint64_t chunk_ptrs[2];
+        union drm_amdgpu_cs cs = { { 0 } };
+        struct drm_amdgpu_bo_list_in bo_list = { 0 };
+        struct drm_amdgpu_bo_list_entry bo_list_entries[3] = { { 0 } };
+        union drm_amdgpu_wait_cs wait_cs = { { 0 } };
+        int ret = 0;
+
+        if (size > UINT64_MAX - max_size_per_cmd ||
+            DIV_ROUND_UP(size, max_size_per_cmd) > max_commands)
+                return -ENOMEM;
+
+        /* Map both buffers into the GPU address space so we can access them from the GPU. */
+        va_args.handle = src_handle;
+        va_args.operation = AMDGPU_VA_OP_MAP;
+        va_args.flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_DELAY_UPDATE;
+        va_args.va_address = src_addr;
+        va_args.map_size = size;
+
+        ret = drmCommandWrite(fd, DRM_AMDGPU_GEM_VA, &va_args, sizeof(va_args));
+        if (ret)
+                return ret;
+
+        va_args.handle = dst_handle;
+        va_args.flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_DELAY_UPDATE;
+        va_args.va_address = dst_addr;
+
+        ret = drmCommandWrite(fd, DRM_AMDGPU_GEM_VA, &va_args, sizeof(va_args));
+        if (ret)
+                goto unmap_src;
+
+        while (remaining_size) {
+                uint64_t cur_size = remaining_size;
+                if (cur_size > max_size_per_cmd)
+                        cur_size = max_size_per_cmd;
+
+                priv->sdma_cmdbuf_map[cmd++] = 0x01; /* linear copy */
+                priv->sdma_cmdbuf_map[cmd++] =
+                        priv->dev_info.family >= AMDGPU_FAMILY_AI ? (cur_size - 1) : cur_size;
+                priv->sdma_cmdbuf_map[cmd++] = 0;
+                priv->sdma_cmdbuf_map[cmd++] = cur_src_addr;
+                priv->sdma_cmdbuf_map[cmd++] = cur_src_addr >> 32;
+                priv->sdma_cmdbuf_map[cmd++] = cur_dst_addr;
+                priv->sdma_cmdbuf_map[cmd++] = cur_dst_addr >> 32;
+
+                remaining_size -= cur_size;
+                cur_src_addr += cur_size;
+                cur_dst_addr += cur_size;
+        }
+
+        ib.va_start = priv->sdma_cmdbuf_addr;
+        ib.ib_bytes = cmd * 4;
+        ib.ip_type = AMDGPU_HW_IP_DMA;
+
+        chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB;
+        chunks[1].length_dw = sizeof(ib) / 4;
+        chunks[1].chunk_data = (uintptr_t)&ib;
+
+        bo_list_entries[0].bo_handle = priv->sdma_cmdbuf_bo;
+        bo_list_entries[0].bo_priority = 8; /* Middle of range, like RADV. */
+        bo_list_entries[1].bo_handle = src_handle;
+        bo_list_entries[1].bo_priority = 8;
+        bo_list_entries[2].bo_handle = dst_handle;
+        bo_list_entries[2].bo_priority = 8;
+
+        bo_list.bo_number = 3;
+        bo_list.bo_info_size = sizeof(bo_list_entries[0]);
+        bo_list.bo_info_ptr = (uintptr_t)bo_list_entries;
+
+        chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
+        chunks[0].length_dw = sizeof(bo_list) / 4;
+        chunks[0].chunk_data = (uintptr_t)&bo_list;
+
+        chunk_ptrs[0] = (uintptr_t)&chunks[0];
+        chunk_ptrs[1] = (uintptr_t)&chunks[1];
+
+        cs.in.ctx_id = priv->sdma_ctx;
+        cs.in.num_chunks = 2;
+        cs.in.chunks = (uintptr_t)chunk_ptrs;
+
+        ret = drmCommandWriteRead(fd, DRM_AMDGPU_CS, &cs, sizeof(cs));
+        if (ret) {
+                drv_log("SDMA copy command buffer submission failed %d\n", ret);
+                goto unmap_dst;
+        }
+
+        wait_cs.in.handle = cs.out.handle;
+        wait_cs.in.ip_type = AMDGPU_HW_IP_DMA;
+        wait_cs.in.ctx_id = priv->sdma_ctx;
+        wait_cs.in.timeout = INT64_MAX;
+
+        ret = drmCommandWriteRead(fd, DRM_AMDGPU_WAIT_CS, &wait_cs, sizeof(wait_cs));
+        if (ret) {
+                drv_log("Could not wait for CS to finish\n");
+        } else if (wait_cs.out.status) {
+                drv_log("Infinite wait timed out, likely GPU hang.\n");
+                ret = -ENODEV;
+        }
+
+unmap_dst:
+        va_args.handle = dst_handle;
+        va_args.operation = AMDGPU_VA_OP_UNMAP;
+        va_args.flags = AMDGPU_VM_DELAY_UPDATE;
+        va_args.va_address = dst_addr;
+        drmCommandWrite(fd, DRM_AMDGPU_GEM_VA, &va_args, sizeof(va_args));
+
+unmap_src:
+        va_args.handle = src_handle;
+        va_args.operation = AMDGPU_VA_OP_UNMAP;
+        va_args.flags = AMDGPU_VM_DELAY_UPDATE;
+        va_args.va_address = src_addr;
+        drmCommandWrite(fd, DRM_AMDGPU_GEM_VA, &va_args, sizeof(va_args));
+
+        return ret;
+}
+
 static int amdgpu_init(struct driver *drv)
 {
         struct amdgpu_priv *priv;
@@ -61,12 +320,23 @@ static int amdgpu_init(struct driver *drv)
         drv->priv = priv;

+        if (query_dev_info(drv_get_fd(drv), &priv->dev_info)) {
+                free(priv);
+                drv->priv = NULL;
+                return -ENODEV;
+        }
         if (dri_init(drv, DRI_PATH, "radeonsi")) {
                 free(priv);
                 drv->priv = NULL;
                 return -ENODEV;
         }

+        if (sdma_init(priv, drv_get_fd(drv))) {
+                drv_log("SDMA init failed\n");
+
+                /* Continue, as we can still successfully map things without SDMA. */
+        }
+
         metadata.tiling = TILE_TYPE_LINEAR;
         metadata.priority = 1;
         metadata.modifier = DRM_FORMAT_MOD_LINEAR;
@@ -127,6 +397,7 @@ static int amdgpu_init(struct driver *drv)

 static void amdgpu_close(struct driver *drv)
 {
+        sdma_finish(drv->priv, drv_get_fd(drv));
         dri_close(drv);
         free(drv->priv);
         drv->priv = NULL;
@@ -138,6 +409,7 @@ static int amdgpu_create_bo_linear(struct bo *bo, uint32_t width, uint32_t heigh
         int ret;
         uint32_t plane, stride;
         union drm_amdgpu_gem_create gem_create;
+        struct amdgpu_priv *priv = bo->drv->priv;

         stride = drv_stride_from_format(format, width, 0);
         stride = ALIGN(stride, 256);
@@ -145,7 +417,8 @@ static int amdgpu_create_bo_linear(struct bo *bo, uint32_t width, uint32_t heigh
         drv_bo_from_format(bo, stride, height, format);

         memset(&gem_create, 0, sizeof(gem_create));
-        gem_create.in.bo_size = bo->meta.total_size;
+        gem_create.in.bo_size =
+                ALIGN(bo->meta.total_size, priv->dev_info.virtual_address_alignment);
         gem_create.in.alignment = 256;
         gem_create.in.domain_flags = 0;

@@ -251,33 +524,114 @@ static int amdgpu_destroy_bo(struct bo *bo)

 static void *amdgpu_map_bo(struct bo *bo, struct vma *vma, size_t plane, uint32_t map_flags)
 {
+        void *addr = MAP_FAILED;
         int ret;
         union drm_amdgpu_gem_mmap gem_map;
+        struct drm_amdgpu_gem_create_in bo_info = { 0 };
+        struct drm_amdgpu_gem_op gem_op = { 0 };
+        uint32_t handle = bo->handles[plane].u32;
+        struct amdgpu_linear_vma_priv *priv = NULL;
+        struct amdgpu_priv *drv_priv;

         if (bo->priv)
                 return dri_bo_map(bo, vma, plane, map_flags);

+        drv_priv = bo->drv->priv;
+        gem_op.handle = handle;
+        gem_op.op = AMDGPU_GEM_OP_GET_GEM_CREATE_INFO;
+        gem_op.value = (uintptr_t)&bo_info;
+
+        ret = drmCommandWriteRead(bo->drv->fd, DRM_AMDGPU_GEM_OP, &gem_op, sizeof(gem_op));
+        if (ret)
+                return MAP_FAILED;
+
+        vma->length = bo_info.bo_size;
+
+        if (((bo_info.domains & AMDGPU_GEM_DOMAIN_VRAM) ||
+             (bo_info.domain_flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)) &&
+            drv_priv->sdma_cmdbuf_map) {
+                union drm_amdgpu_gem_create gem_create = { { 0 } };
+
+                priv = calloc(1, sizeof(struct amdgpu_linear_vma_priv));
+                if (!priv)
+                        return MAP_FAILED;
+
+                gem_create.in.bo_size = bo_info.bo_size;
+                gem_create.in.alignment = 4096;
+                gem_create.in.domains = AMDGPU_GEM_DOMAIN_GTT;
+
+                ret = drmCommandWriteRead(bo->drv->fd, DRM_AMDGPU_GEM_CREATE, &gem_create,
+                                          sizeof(gem_create));
+                if (ret < 0) {
+                        drv_log("GEM create failed\n");
+                        free(priv);
+                        return MAP_FAILED;
+                }
+
+                priv->map_flags = map_flags;
+                handle = priv->handle = gem_create.out.handle;
+
+                ret = sdma_copy(bo->drv->priv, bo->drv->fd, bo->handles[0].u32, priv->handle,
+                                bo_info.bo_size);
+                if (ret) {
+                        drv_log("SDMA copy for read failed\n");
+                        goto fail;
+                }
+        }
+
         memset(&gem_map, 0, sizeof(gem_map));
-        gem_map.in.handle = bo->handles[plane].u32;
+        gem_map.in.handle = handle;

         ret = drmIoctl(bo->drv->fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &gem_map);
         if (ret) {
                 drv_log("DRM_IOCTL_AMDGPU_GEM_MMAP failed\n");
-                return MAP_FAILED;
+                goto fail;
         }

-        vma->length = bo->meta.total_size;
-
-        return mmap(0, bo->meta.total_size, drv_get_prot(map_flags), MAP_SHARED, bo->drv->fd,
+        addr = mmap(0, bo->meta.total_size, drv_get_prot(map_flags), MAP_SHARED, bo->drv->fd,
                     gem_map.out.addr_ptr);
+        if (addr == MAP_FAILED)
+                goto fail;
+
+        vma->priv = priv;
+        return addr;
+
+fail:
+        if (priv) {
+                struct drm_gem_close gem_close = { 0 };
+                gem_close.handle = priv->handle;
+                drmIoctl(bo->drv->fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+                free(priv);
+        }
+        return MAP_FAILED;
 }

 static int amdgpu_unmap_bo(struct bo *bo, struct vma *vma)
 {
         if (bo->priv)
                 return dri_bo_unmap(bo, vma);
-        else
-                return munmap(vma->addr, vma->length);
+        else {
+                int r = munmap(vma->addr, vma->length);
+                if (r)
+                        return r;
+
+                if (vma->priv) {
+                        struct amdgpu_linear_vma_priv *priv = vma->priv;
+                        struct drm_gem_close gem_close = { 0 };
+
+                        if (BO_MAP_WRITE & priv->map_flags) {
+                                r = sdma_copy(bo->drv->priv, bo->drv->fd, priv->handle,
+                                              bo->handles[0].u32, vma->length);
+                                if (r)
+                                        return r;
+                        }
+
+                        gem_close.handle = priv->handle;
+                        r = drmIoctl(bo->drv->fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+                }
+
+                return 0;
+        }
 }

 static int amdgpu_bo_invalidate(struct bo *bo, struct mapping *mapping)
-- 
2.11.0
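
Note (not part of the patch): the inner loop of sdma_copy() above emits one 7-dword
SDMA linear-copy packet per chunk of at most max_size_per_cmd bytes. The sketch below
just restates that packet with named fields for readability; the helper name
emit_linear_copy and the field labels are illustrative, and the byte-count convention
(count on CI/VI, count minus one on GFX9 and newer) follows the values the patch uses.

#include <stdint.h>

/* Header dword: opcode COPY (bits 7:0) with sub-op LINEAR (bits 15:8) == 0. */
#define SDMA_OP_COPY_LINEAR 0x01

static uint32_t emit_linear_copy(uint32_t *cmdbuf, uint32_t cmd, uint64_t src_addr,
                                 uint64_t dst_addr, uint32_t byte_count, int is_gfx9_plus)
{
        cmdbuf[cmd++] = SDMA_OP_COPY_LINEAR;                        /* header: linear copy */
        cmdbuf[cmd++] = is_gfx9_plus ? byte_count - 1 : byte_count; /* copy size */
        cmdbuf[cmd++] = 0;                                          /* parameters: none */
        cmdbuf[cmd++] = (uint32_t)src_addr;                         /* src GPU VA, low 32 bits */
        cmdbuf[cmd++] = (uint32_t)(src_addr >> 32);                 /* src GPU VA, high 32 bits */
        cmdbuf[cmd++] = (uint32_t)dst_addr;                         /* dst GPU VA, low 32 bits */
        cmdbuf[cmd++] = (uint32_t)(dst_addr >> 32);                 /* dst GPU VA, high 32 bits */
        return cmd;                                                 /* next free dword index */
}

In the patch itself this is open-coded in the while loop of sdma_copy(), which also
advances the source and destination GPU addresses between packets before submitting
the whole command buffer as a single IB on the DMA ring.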