drm/nouveau/gf100-: track chan progress with non-WFI semaphore release

From VOLTA_CHANNEL_GPFIFO_A onwards, HW no longer updates the GET/GP_GET pointers in USERD following channel progress, but instead updates on a timer for compatibility, and SW is expected to implement its own method of tracking channel progress (typically via non-WFI semaphore release). Nouveau has been making use of the compatibility mode up until now, however, from BLACKWELL_CHANNEL_GPFIFO_A HW no longer supports USERD writeback at all. Allocate a per-channel buffer in system memory, and append a non-WFI semaphore release to the end of each push buffer segment to simulate the pointers previously read from USERD. This change is implemented for Fermi (which is the first to support non- WFI semaphore release) onwards, as readback from system memory is likely faster than BAR1 reads. Signed-off-by: Ben Skeggs <bskeggs@nvidia.com> Reviewed-by: Dave Airlie <airlied@redhat.com> Reviewed-by: Timur Tabi <ttabi@nvidia.com> Tested-by: Timur Tabi <ttabi@nvidia.com> Signed-off-by: Dave Airlie <airlied@redhat.com>
2025-08-05 16:54:27 +00:00 · 2024-06-19 14:23:04 +10:00 · 2024-06-19 14:23:04 +10:00 · 862450a85b
commit 862450a85b
parent d1fb887a08
12 changed files with 245 additions and 18 deletions
--- a/drivers/gpu/drm/nouveau/include/nvif/chan.h
+++ b/drivers/gpu/drm/nouveau/include/nvif/chan.h
@ -17,7 +17,13 @@ struct nvif_chan {
 			void (*push)(struct nvif_chan *, bool main, u64 addr, u32 size,
 				     bool no_prefetch);
 			void (*kick)(struct nvif_chan *);
+			int (*post)(struct nvif_chan *, u32 gpptr, u32 pbptr);
+			u32 post_size;
 		} gpfifo;
+
+		struct {
+			int (*release)(struct nvif_chan *, u64 addr, u32 data);
+		} sem;
 	} *func;

 	struct {
@ -31,6 +37,11 @@ struct nvif_chan {
 		int free;
 	} gpfifo;

+	struct {
+		struct nvif_map map;
+		u64 addr;
+	} sema;
+
 	struct nvif_push push;

 	struct nvif_user *usermode;
@ -43,14 +54,23 @@ void nvif_chan_gpfifo_ctor(const struct nvif_chan_func *, void *userd, void *gpf
 			   void *push, u64 push_addr, u32 push_size, struct nvif_chan *);
 int nvif_chan_gpfifo_wait(struct nvif_chan *, u32 gpfifo_nr, u32 push_nr);
 void nvif_chan_gpfifo_push(struct nvif_chan *, u64 addr, u32 size, bool no_prefetch);
+int nvif_chan_gpfifo_post(struct nvif_chan *);
+
+void nvif_chan506f_gpfifo_push(struct nvif_chan *, bool main, u64 addr, u32 size, bool no_prefetch);
+void nvif_chan506f_gpfifo_kick(struct nvif_chan *);
+
+int nvif_chan906f_ctor_(const struct nvif_chan_func *, void *userd, void *gpfifo, u32 gpfifo_size,
+			void *push, u64 push_addr, u32 push_size, void *sema, u64 sema_addr,
+			struct nvif_chan *);
+u32 nvif_chan906f_read_get(struct nvif_chan *);
+u32 nvif_chan906f_gpfifo_read_get(struct nvif_chan *);
+int nvif_chan906f_gpfifo_post(struct nvif_chan *, u32 gpptr, u32 pbptr);

 int nvif_chan506f_ctor(struct nvif_chan *, void *userd, void *gpfifo, u32 gpfifo_size,
 		       void *push, u64 push_addr, u32 push_size);
-u32 nvif_chan506f_read_get(struct nvif_chan *);
-u32 nvif_chan506f_gpfifo_read_get(struct nvif_chan *);
-void nvif_chan506f_gpfifo_push(struct nvif_chan *, bool main, u64 addr, u32 size, bool no_prefetch);
-
+int nvif_chan906f_ctor(struct nvif_chan *, void *userd, void *gpfifo, u32 gpfifo_size,
+		       void *push, u64 push_addr, u32 push_size, void *sema, u64 sema_addr);
 int nvif_chanc36f_ctor(struct nvif_chan *, void *userd, void *gpfifo, u32 gpfifo_size,
-		       void *push, u64 push_addr, u32 push_size,
+		       void *push, u64 push_addr, u32 push_size, void *sema, u64 sema_addr,
 		       struct nvif_user *usermode, u32 doorbell_token);
 #endif
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@ -456,6 +456,28 @@ nouveau_bo_new_map(struct nouveau_cli *cli, u32 domain, u32 size, struct nouveau
 	return 0;
 }

+int
+nouveau_bo_new_map_gpu(struct nouveau_cli *cli, u32 domain, u32 size,
+		       struct nouveau_bo **pnvbo, struct nouveau_vma **pvma)
+{
+	struct nouveau_vmm *vmm = nouveau_cli_vmm(cli);
+	struct nouveau_bo *nvbo;
+	int ret;
+
+	ret = nouveau_bo_new_map(cli, domain, size, &nvbo);
+	if (ret)
+		return ret;
+
+	ret = nouveau_vma_new(nvbo, vmm, pvma);
+	if (ret) {
+		nouveau_bo_unpin_del(&nvbo);
+		return ret;
+	}
+
+	*pnvbo = nvbo;
+	return 0;
+}
+
 static void
 set_placement_range(struct nouveau_bo *nvbo, uint32_t domain)
 {
--- a/drivers/gpu/drm/nouveau/nouveau_bo.h
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.h
@ -92,6 +92,8 @@ void nouveau_bo_del_io_reserve_lru(struct ttm_buffer_object *bo);

 int nouveau_bo_new_pin(struct nouveau_cli *, u32 domain, u32 size, struct nouveau_bo **);
 int nouveau_bo_new_map(struct nouveau_cli *, u32 domain, u32 size, struct nouveau_bo **);
+int nouveau_bo_new_map_gpu(struct nouveau_cli *, u32 domain, u32 size,
+			   struct nouveau_bo **, struct nouveau_vma **);
 void nouveau_bo_unpin_del(struct nouveau_bo **);

 /* TODO: submit equivalent to TTM generic API upstream? */
--- a/drivers/gpu/drm/nouveau/nouveau_chan.c
+++ b/drivers/gpu/drm/nouveau/nouveau_chan.c
@ -103,6 +103,8 @@ nouveau_channel_del(struct nouveau_channel **pchan)
 		nvif_event_dtor(&chan->kill);
 		nvif_object_dtor(&chan->user);
 		nvif_mem_dtor(&chan->mem_userd);
+		nouveau_vma_del(&chan->sema.vma);
+		nouveau_bo_unpin_del(&chan->sema.bo);
 		nvif_object_dtor(&chan->push.ctxdma);
 		nouveau_vma_del(&chan->push.vma);
 		nouveau_bo_unpin_del(&chan->push.buffer);
@ -189,8 +191,10 @@ nouveau_channel_prep(struct nouveau_cli *cli,

 		chan->push.addr = chan->push.vma->addr;

-		if (device->info.family >= NV_DEVICE_INFO_V0_FERMI)
-			return 0;
+		if (device->info.family >= NV_DEVICE_INFO_V0_FERMI) {
+			return nouveau_bo_new_map_gpu(cli, NOUVEAU_GEM_DOMAIN_GART, PAGE_SIZE,
+						      &chan->sema.bo, &chan->sema.vma);
+		}

 		args.target = NV_DMA_V0_TARGET_VM;
 		args.access = NV_DMA_V0_ACCESS_VM;
@ -429,16 +433,25 @@ nouveau_channel_init(struct nouveau_channel *chan, u32 vram, u32 gart)
 		chan->user_get = 0x44;
 		chan->dma.max = (0x10000 / 4) - 2;
 	} else
-	if (chan->user.oclass < VOLTA_CHANNEL_GPFIFO_A) {
+	if (chan->user.oclass < FERMI_CHANNEL_GPFIFO) {
 		ret = nvif_chan506f_ctor(&chan->chan, chan->userd->map.ptr,
 					 (u8*)chan->push.buffer->kmap.virtual + 0x10000, 0x2000,
 					 chan->push.buffer->kmap.virtual, chan->push.addr, 0x10000);
 		if (ret)
 			return ret;
+	} else
+	if (chan->user.oclass < VOLTA_CHANNEL_GPFIFO_A) {
+		ret = nvif_chan906f_ctor(&chan->chan, chan->userd->map.ptr,
+					 (u8*)chan->push.buffer->kmap.virtual + 0x10000, 0x2000,
+					 chan->push.buffer->kmap.virtual, chan->push.addr, 0x10000,
+					 chan->sema.bo->kmap.virtual, chan->sema.vma->addr);
+		if (ret)
+			return ret;
 	} else {
 		ret = nvif_chanc36f_ctor(&chan->chan, chan->userd->map.ptr,
 					 (u8*)chan->push.buffer->kmap.virtual + 0x10000, 0x2000,
 					 chan->push.buffer->kmap.virtual, chan->push.addr, 0x10000,
+					 chan->sema.bo->kmap.virtual, chan->sema.vma->addr,
 					 &drm->client.device.user, chan->token);
 		if (ret)
 			return ret;
--- a/drivers/gpu/drm/nouveau/nouveau_chan.h
+++ b/drivers/gpu/drm/nouveau/nouveau_chan.h
@ -43,6 +43,11 @@ struct nouveau_channel {
 	u32 user_get;
 	u32 user_put;

+	struct {
+		struct nouveau_bo *bo;
+		struct nouveau_vma *vma;
+	} sema;
+
 	struct nvif_object user;
 	struct nvif_object blit;

--- a/drivers/gpu/drm/nouveau/nouveau_exec.c
+++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
@ -146,6 +146,8 @@ nouveau_exec_job_run(struct nouveau_job *job)
 		nvif_chan_gpfifo_push(&chan->chan, p->va, p->va_len, no_prefetch);
 	}

+	nvif_chan_gpfifo_post(&chan->chan);
+
 	ret = nouveau_fence_emit(fence);
 	if (ret) {
 		nouveau_fence_unref(&exec_job->fence);
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@ -866,6 +866,8 @@ revalidate:

 			nvif_chan_gpfifo_push(&chan->chan, addr, length, no_prefetch);
 		}
+
+		nvif_chan_gpfifo_post(&chan->chan);
 	} else
 	if (drm->client.device.info.chipset >= 0x25) {
 		ret = PUSH_WAIT(&chan->chan.push, req->nr_push * 2);
--- a/drivers/gpu/drm/nouveau/nvif/Kbuild
+++ b/drivers/gpu/drm/nouveau/nvif/Kbuild
@ -17,6 +17,7 @@ nvif-y += nvif/vmm.o
 # Channel classes
 nvif-y += nvif/chan.o
 nvif-y += nvif/chan506f.o
+nvif-y += nvif/chan906f.o
 nvif-y += nvif/chanc36f.o

 # Usermode classes
--- a/drivers/gpu/drm/nouveau/nvif/chan.c
+++ b/drivers/gpu/drm/nouveau/nvif/chan.c
@ -9,7 +9,16 @@ nvif_chan_gpfifo_push_kick(struct nvif_push *push)
 {
 	struct nvif_chan *chan = container_of(push, typeof(*chan), push);
 	u32 put = push->bgn - (u32 *)chan->push.mem.object.map.ptr;
-	u32 cnt = push->cur - push->bgn;
+	u32 cnt;
+
+	if (chan->func->gpfifo.post) {
+		if (push->end - push->cur < chan->func->gpfifo.post_size)
+			push->end = push->cur + chan->func->gpfifo.post_size;
+
+		WARN_ON(nvif_chan_gpfifo_post(chan));
+	}
+
+	cnt = push->cur - push->bgn;

 	chan->func->gpfifo.push(chan, true, chan->push.addr + (put << 2), cnt << 2, false);
 	chan->func->gpfifo.kick(chan);
@ -23,6 +32,16 @@ nvif_chan_gpfifo_push_wait(struct nvif_push *push, u32 push_nr)
 	return nvif_chan_gpfifo_wait(chan, 1, push_nr);
 }

+int
+nvif_chan_gpfifo_post(struct nvif_chan *chan)
+{
+	const u32 *map = chan->push.mem.object.map.ptr;
+	const u32 pbptr = (chan->push.cur - map) + chan->func->gpfifo.post_size;
+	const u32 gpptr = (chan->gpfifo.cur + 1) & chan->gpfifo.max;
+
+	return chan->func->gpfifo.post(chan, gpptr, pbptr);
+}
+
 void
 nvif_chan_gpfifo_push(struct nvif_chan *chan, u64 addr, u32 size, bool no_prefetch)
 {
@ -35,6 +54,14 @@ nvif_chan_gpfifo_wait(struct nvif_chan *chan, u32 gpfifo_nr, u32 push_nr)
 	struct nvif_push *push = &chan->push;
 	int ret = 0, time = 1000000;

+	if (gpfifo_nr) {
+		/* Account for pushbuf space needed by nvif_chan_gpfifo_post(),
+		 * if used after pushing userspace GPFIFO entries.
+		 */
+		if (chan->func->gpfifo.post)
+			push_nr += chan->func->gpfifo.post_size;
+	}
+
 	/* Account for the GPFIFO entry needed to submit pushbuf. */
 	if (push_nr)
 		gpfifo_nr++;
@ -89,6 +116,8 @@ nvif_chan_dma_wait(struct nvif_chan *chan, u32 nr)
 	u32 cur = push->cur - (u32 *)push->mem.object.map.ptr;
 	u32 free, time = 1000000;

+	nr += chan->func->gpfifo.post_size;
+
 	do {
 		u32 get = chan->func->push.read_get(chan);

@ -122,6 +151,6 @@ nvif_chan_dma_wait(struct nvif_chan *chan, u32 nr)

 	push->bgn = (u32 *)push->mem.object.map.ptr + cur;
 	push->cur = push->bgn;
-	push->end = push->bgn + free;
+	push->end = push->bgn + free - chan->func->gpfifo.post_size;
 	return 0;
 }
--- a/drivers/gpu/drm/nouveau/nvif/chan506f.c
+++ b/drivers/gpu/drm/nouveau/nvif/chan506f.c
@ -4,7 +4,7 @@
 */
 #include <nvif/chan.h>

-static void
+void
 nvif_chan506f_gpfifo_kick(struct nvif_chan *chan)
 {
 	wmb();
@ -31,13 +31,13 @@ nvif_chan506f_gpfifo_push(struct nvif_chan *chan, bool main, u64 addr, u32 size,
 		chan->push.end = chan->push.cur;
 }

-u32
+static u32
 nvif_chan506f_gpfifo_read_get(struct nvif_chan *chan)
 {
 	return nvif_rd32(&chan->userd, 0x88);
 }

-u32
+static u32
 nvif_chan506f_read_get(struct nvif_chan *chan)
 {
 	u32 tlgetlo = nvif_rd32(&chan->userd, 0x58);
--- a/drivers/gpu/drm/nouveau/nvif/chan906f.c
+++ b/drivers/gpu/drm/nouveau/nvif/chan906f.c
@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ */
+#include <nvif/chan.h>
+#include <nvif/user.h>
+#include <nvif/push906f.h>
+
+#include <nvhw/class/cl906f.h>
+
+/* Limits GPFIFO size to 1MiB, and "main" push buffer size to 64KiB. */
+#define NVIF_CHAN906F_PBPTR_BITS  15
+#define NVIF_CHAN906F_PBPTR_MASK  ((1 << NVIF_CHAN906F_PBPTR_BITS) - 1)
+
+#define NVIF_CHAN906F_GPPTR_SHIFT NVIF_CHAN906F_PBPTR_BITS
+#define NVIF_CHAN906F_GPPTR_BITS  (32 - NVIF_CHAN906F_PBPTR_BITS)
+#define NVIF_CHAN906F_GPPTR_MASK  ((1 << NVIF_CHAN906F_GPPTR_BITS) - 1)
+
+#define NVIF_CHAN906F_SEM_RELEASE_SIZE 5
+
+static int
+nvif_chan906f_sem_release(struct nvif_chan *chan, u64 addr, u32 data)
+{
+	struct nvif_push *push = &chan->push;
+	int ret;
+
+	ret = PUSH_WAIT(push, NVIF_CHAN906F_SEM_RELEASE_SIZE);
+	if (ret)
+		return ret;
+
+	PUSH_MTHD(push, NV906F, SEMAPHOREA,
+		  NVVAL(NV906F, SEMAPHOREA, OFFSET_UPPER, upper_32_bits(addr)),
+
+				SEMAPHOREB, lower_32_bits(addr),
+
+				SEMAPHOREC, data,
+
+				SEMAPHORED,
+		  NVDEF(NV906F, SEMAPHORED, OPERATION, RELEASE) |
+		  NVDEF(NV906F, SEMAPHORED, RELEASE_WFI, DIS) |
+		  NVDEF(NV906F, SEMAPHORED, RELEASE_SIZE, 16BYTE));
+
+	return 0;
+}
+
+int
+nvif_chan906f_gpfifo_post(struct nvif_chan *chan, u32 gpptr, u32 pbptr)
+{
+	return chan->func->sem.release(chan, chan->sema.addr,
+				       (gpptr << NVIF_CHAN906F_GPPTR_SHIFT) | pbptr);
+}
+
+u32
+nvif_chan906f_gpfifo_read_get(struct nvif_chan *chan)
+{
+	return nvif_rd32(&chan->sema, 0) >> NVIF_CHAN906F_GPPTR_SHIFT;
+}
+
+u32
+nvif_chan906f_read_get(struct nvif_chan *chan)
+{
+	return nvif_rd32(&chan->sema, 0) & NVIF_CHAN906F_PBPTR_MASK;
+}
+
+static const struct nvif_chan_func
+nvif_chan906f = {
+	.push.read_get = nvif_chan906f_read_get,
+	.gpfifo.read_get = nvif_chan906f_gpfifo_read_get,
+	.gpfifo.push = nvif_chan506f_gpfifo_push,
+	.gpfifo.kick = nvif_chan506f_gpfifo_kick,
+	.gpfifo.post = nvif_chan906f_gpfifo_post,
+	.gpfifo.post_size = NVIF_CHAN906F_SEM_RELEASE_SIZE,
+	.sem.release = nvif_chan906f_sem_release,
+};
+
+int
+nvif_chan906f_ctor_(const struct nvif_chan_func *func, void *userd, void *gpfifo, u32 gpfifo_size,
+		    void *push, u64 push_addr, u32 push_size, void *sema, u64 sema_addr,
+		    struct nvif_chan *chan)
+{
+	nvif_chan_gpfifo_ctor(func, userd, gpfifo, gpfifo_size, push, push_addr, push_size, chan);
+	chan->sema.map.ptr = sema;
+	chan->sema.addr = sema_addr;
+	return 0;
+}
+
+int
+nvif_chan906f_ctor(struct nvif_chan *chan, void *userd, void *gpfifo, u32 gpfifo_size,
+		   void *push, u64 push_addr, u32 push_size, void *sema, u64 sema_addr)
+{
+	return nvif_chan906f_ctor_(&nvif_chan906f, userd, gpfifo, gpfifo_size,
+				   push, push_addr, push_size, sema, sema_addr, chan);
+}
--- a/drivers/gpu/drm/nouveau/nvif/chanc36f.c
+++ b/drivers/gpu/drm/nouveau/nvif/chanc36f.c
@ -5,6 +5,9 @@
 #include <nvif/chan.h>
 #include <nvif/user.h>

+#include <nvif/push906f.h>
+#include <nvhw/class/clc36f.h>
+
 static void
 nvif_chanc36f_gpfifo_kick(struct nvif_chan *chan)
 {
@ -18,21 +21,56 @@ nvif_chanc36f_gpfifo_kick(struct nvif_chan *chan)
 	usermode->func->doorbell(usermode, chan->doorbell_token);
 }

+#define NVIF_CHANC36F_SEM_RELEASE_SIZE 6
+
+static int
+nvif_chanc36f_sem_release(struct nvif_chan *chan, u64 addr, u32 data)
+{
+	struct nvif_push *push = &chan->push;
+	int ret;
+
+	ret = PUSH_WAIT(push, NVIF_CHANC36F_SEM_RELEASE_SIZE);
+	if (ret)
+		return ret;
+
+	PUSH_MTHD(push, NVC36F, SEM_ADDR_LO, lower_32_bits(addr),
+
+				SEM_ADDR_HI, upper_32_bits(addr),
+
+				SEM_PAYLOAD_LO, data);
+
+	PUSH_MTHD(push, NVC36F, SEM_EXECUTE,
+		  NVDEF(NVC36F, SEM_EXECUTE, OPERATION, RELEASE) |
+		  NVDEF(NVC36F, SEM_EXECUTE, RELEASE_WFI, DIS) |
+		  NVDEF(NVC36F, SEM_EXECUTE, PAYLOAD_SIZE, 32BIT) |
+		  NVDEF(NVC36F, SEM_EXECUTE, RELEASE_TIMESTAMP, DIS));
+
+	return 0;
+}
+
 static const struct nvif_chan_func
 nvif_chanc36f = {
-	.push.read_get = nvif_chan506f_read_get,
-	.gpfifo.read_get = nvif_chan506f_gpfifo_read_get,
+	.push.read_get = nvif_chan906f_read_get,
+	.gpfifo.read_get = nvif_chan906f_gpfifo_read_get,
 	.gpfifo.push = nvif_chan506f_gpfifo_push,
 	.gpfifo.kick = nvif_chanc36f_gpfifo_kick,
+	.gpfifo.post = nvif_chan906f_gpfifo_post,
+	.gpfifo.post_size = NVIF_CHANC36F_SEM_RELEASE_SIZE,
+	.sem.release = nvif_chanc36f_sem_release,
 };

 int
 nvif_chanc36f_ctor(struct nvif_chan *chan, void *userd, void *gpfifo, u32 gpfifo_size,
-		   void *push, u64 push_addr, u32 push_size,
+		   void *push, u64 push_addr, u32 push_size, void *sema, u64 sema_addr,
 		   struct nvif_user *usermode, u32 doorbell_token)
 {
-	nvif_chan_gpfifo_ctor(&nvif_chanc36f, userd, gpfifo, gpfifo_size,
-			      push, push_addr, push_size, chan);
+	int ret;
+
+	ret = nvif_chan906f_ctor_(&nvif_chanc36f, userd, gpfifo, gpfifo_size,
+				  push, push_addr, push_size, sema, sema_addr, chan);
+	if (ret)
+		return ret;
+
 	chan->usermode = usermode;
 	chan->doorbell_token = doorbell_token;
 	return 0;