linux/drivers/dma-buf/dma-fence-unwrap.c

205 lines
5.1 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* dma-fence-util: misc functions for dma_fence objects
*
* Copyright (C) 2022 Advanced Micro Devices, Inc.
* Authors:
* Christian König <christian.koenig@amd.com>
*/
#include <linux/dma-fence.h>
#include <linux/dma-fence-array.h>
#include <linux/dma-fence-chain.h>
#include <linux/dma-fence-unwrap.h>
#include <linux/slab.h>
dma-fence: Use kernel's sort for merging fences One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@amd.com/ is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Gustavo Padovan <gustavo@padovan.org> Cc: Friedrich Vock <friedrich.vock@gmx.de> Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursulin@igalia.com
2024-11-15 10:21:50 +00:00
#include <linux/sort.h>
/* Internal helper to start new array iteration, don't use directly */
static struct dma_fence *
__dma_fence_unwrap_array(struct dma_fence_unwrap *cursor)
{
cursor->array = dma_fence_chain_contained(cursor->chain);
cursor->index = 0;
return dma_fence_array_first(cursor->array);
}
/**
* dma_fence_unwrap_first - return the first fence from fence containers
* @head: the entrypoint into the containers
* @cursor: current position inside the containers
*
* Unwraps potential dma_fence_chain/dma_fence_array containers and return the
* first fence.
*/
struct dma_fence *dma_fence_unwrap_first(struct dma_fence *head,
struct dma_fence_unwrap *cursor)
{
cursor->chain = dma_fence_get(head);
return __dma_fence_unwrap_array(cursor);
}
EXPORT_SYMBOL_GPL(dma_fence_unwrap_first);
/**
* dma_fence_unwrap_next - return the next fence from a fence containers
* @cursor: current position inside the containers
*
* Continue unwrapping the dma_fence_chain/dma_fence_array containers and return
* the next fence from them.
*/
struct dma_fence *dma_fence_unwrap_next(struct dma_fence_unwrap *cursor)
{
struct dma_fence *tmp;
++cursor->index;
tmp = dma_fence_array_next(cursor->array, cursor->index);
if (tmp)
return tmp;
cursor->chain = dma_fence_chain_walk(cursor->chain);
return __dma_fence_unwrap_array(cursor);
}
EXPORT_SYMBOL_GPL(dma_fence_unwrap_next);
dma-fence: Use kernel's sort for merging fences One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@amd.com/ is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Gustavo Padovan <gustavo@padovan.org> Cc: Friedrich Vock <friedrich.vock@gmx.de> Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursulin@igalia.com
2024-11-15 10:21:50 +00:00
static int fence_cmp(const void *_a, const void *_b)
{
struct dma_fence *a = *(struct dma_fence **)_a;
struct dma_fence *b = *(struct dma_fence **)_b;
if (a->context < b->context)
return -1;
else if (a->context > b->context)
return 1;
if (dma_fence_is_later(b, a))
return 1;
else if (dma_fence_is_later(a, b))
return -1;
return 0;
}
/**
* dma_fence_dedup_array - Sort and deduplicate an array of dma_fence pointers
* @fences: Array of dma_fence pointers to be deduplicated
* @num_fences: Number of entries in the @fences array
*
* Sorts the input array by context, then removes duplicate
* fences with the same context, keeping only the most recent one.
*
* The array is modified in-place and unreferenced duplicate fences are released
* via dma_fence_put(). The function returns the new number of fences after
* deduplication.
*
* Return: Number of unique fences remaining in the array.
*/
int dma_fence_dedup_array(struct dma_fence **fences, int num_fences)
{
int i, j;
sort(fences, num_fences, sizeof(*fences), fence_cmp, NULL);
/*
* Only keep the most recent fence for each context.
*/
j = 0;
for (i = 1; i < num_fences; i++) {
if (fences[i]->context == fences[j]->context)
dma_fence_put(fences[i]);
else
fences[++j] = fences[i];
}
return ++j;
}
EXPORT_SYMBOL_GPL(dma_fence_dedup_array);
/* Implementation for the dma_fence_merge() marco, don't use directly */
struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
struct dma_fence **fences,
struct dma_fence_unwrap *iter)
{
dma-fence: Add a single fence fast path for fence merging Testing some workloads in two different scenarios, such as games running under Gamescope on a Steam Deck, or vkcube under a Plasma desktop, shows that in a significant portion of calls the dma_fence_unwrap_merge helper is called with just a single unsignalled fence. Therefore it is worthile to add a fast path for that case and so bypass the memory allocation and insertion sort attempts. Tested scenarios: 1) Hogwarts Legacy under Gamescope ~1500 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.85% 1 69.80% -> The new fast path. 2-9 29.36% 9% (Ie. 91% of this bucket flattened to 1 fence) 10-19 20-40 50+ 2) Cyberpunk 2077 under Gamescope ~2400 calls per second. N Before After 0 0.71% 1 52.53% -> The new fast path. 2-9 44.38% 50.60% (Ie. half resolved to a single fence) 10-19 2.34% 20-40 0.06% 50+ 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-9 100% 0% (Ie. all resolved to a single fence) 10-19 20-40 50+ In the case of vkcube all invocations in the 2-9 bucket were actually just two input fences. v2: * Correct local variable name and hold on to unsignaled reference. (Chistian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Cc: Christian König <christian.koenig@amd.com> Cc: Friedrich Vock <friedrich.vock@gmx.de> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-4-tursulin@igalia.com
2024-11-15 10:21:51 +00:00
struct dma_fence *tmp, *unsignaled = NULL, **array;
struct dma_fence_array *result;
ktime_t timestamp;
int i, count;
count = 0;
timestamp = ns_to_ktime(0);
for (i = 0; i < num_fences; ++i) {
dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) {
if (!dma_fence_is_signaled(tmp)) {
dma-fence: Add a single fence fast path for fence merging Testing some workloads in two different scenarios, such as games running under Gamescope on a Steam Deck, or vkcube under a Plasma desktop, shows that in a significant portion of calls the dma_fence_unwrap_merge helper is called with just a single unsignalled fence. Therefore it is worthile to add a fast path for that case and so bypass the memory allocation and insertion sort attempts. Tested scenarios: 1) Hogwarts Legacy under Gamescope ~1500 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.85% 1 69.80% -> The new fast path. 2-9 29.36% 9% (Ie. 91% of this bucket flattened to 1 fence) 10-19 20-40 50+ 2) Cyberpunk 2077 under Gamescope ~2400 calls per second. N Before After 0 0.71% 1 52.53% -> The new fast path. 2-9 44.38% 50.60% (Ie. half resolved to a single fence) 10-19 2.34% 20-40 0.06% 50+ 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-9 100% 0% (Ie. all resolved to a single fence) 10-19 20-40 50+ In the case of vkcube all invocations in the 2-9 bucket were actually just two input fences. v2: * Correct local variable name and hold on to unsignaled reference. (Chistian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Cc: Christian König <christian.koenig@amd.com> Cc: Friedrich Vock <friedrich.vock@gmx.de> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-4-tursulin@igalia.com
2024-11-15 10:21:51 +00:00
dma_fence_put(unsignaled);
unsignaled = dma_fence_get(tmp);
++count;
} else {
ktime_t t = dma_fence_timestamp(tmp);
if (ktime_after(t, timestamp))
timestamp = t;
}
}
}
/*
* If we couldn't find a pending fence just return a private signaled
* fence with the timestamp of the last signaled one.
dma-fence: Add a single fence fast path for fence merging Testing some workloads in two different scenarios, such as games running under Gamescope on a Steam Deck, or vkcube under a Plasma desktop, shows that in a significant portion of calls the dma_fence_unwrap_merge helper is called with just a single unsignalled fence. Therefore it is worthile to add a fast path for that case and so bypass the memory allocation and insertion sort attempts. Tested scenarios: 1) Hogwarts Legacy under Gamescope ~1500 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.85% 1 69.80% -> The new fast path. 2-9 29.36% 9% (Ie. 91% of this bucket flattened to 1 fence) 10-19 20-40 50+ 2) Cyberpunk 2077 under Gamescope ~2400 calls per second. N Before After 0 0.71% 1 52.53% -> The new fast path. 2-9 44.38% 50.60% (Ie. half resolved to a single fence) 10-19 2.34% 20-40 0.06% 50+ 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-9 100% 0% (Ie. all resolved to a single fence) 10-19 20-40 50+ In the case of vkcube all invocations in the 2-9 bucket were actually just two input fences. v2: * Correct local variable name and hold on to unsignaled reference. (Chistian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Cc: Christian König <christian.koenig@amd.com> Cc: Friedrich Vock <friedrich.vock@gmx.de> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-4-tursulin@igalia.com
2024-11-15 10:21:51 +00:00
*
* Or if there was a single unsignaled fence left we can return it
* directly and early since that is a major path on many workloads.
*/
if (count == 0)
return dma_fence_allocate_private_stub(timestamp);
dma-fence: Add a single fence fast path for fence merging Testing some workloads in two different scenarios, such as games running under Gamescope on a Steam Deck, or vkcube under a Plasma desktop, shows that in a significant portion of calls the dma_fence_unwrap_merge helper is called with just a single unsignalled fence. Therefore it is worthile to add a fast path for that case and so bypass the memory allocation and insertion sort attempts. Tested scenarios: 1) Hogwarts Legacy under Gamescope ~1500 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.85% 1 69.80% -> The new fast path. 2-9 29.36% 9% (Ie. 91% of this bucket flattened to 1 fence) 10-19 20-40 50+ 2) Cyberpunk 2077 under Gamescope ~2400 calls per second. N Before After 0 0.71% 1 52.53% -> The new fast path. 2-9 44.38% 50.60% (Ie. half resolved to a single fence) 10-19 2.34% 20-40 0.06% 50+ 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-9 100% 0% (Ie. all resolved to a single fence) 10-19 20-40 50+ In the case of vkcube all invocations in the 2-9 bucket were actually just two input fences. v2: * Correct local variable name and hold on to unsignaled reference. (Chistian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Cc: Christian König <christian.koenig@amd.com> Cc: Friedrich Vock <friedrich.vock@gmx.de> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-4-tursulin@igalia.com
2024-11-15 10:21:51 +00:00
else if (count == 1)
return unsignaled;
dma_fence_put(unsignaled);
array = kmalloc_array(count, sizeof(*array), GFP_KERNEL);
if (!array)
return NULL;
count = 0;
dma-fence: Use kernel's sort for merging fences One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@amd.com/ is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Gustavo Padovan <gustavo@padovan.org> Cc: Friedrich Vock <friedrich.vock@gmx.de> Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursulin@igalia.com
2024-11-15 10:21:50 +00:00
for (i = 0; i < num_fences; ++i) {
dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) {
if (!dma_fence_is_signaled(tmp)) {
array[count++] = dma_fence_get(tmp);
} else {
dma-fence: Use kernel's sort for merging fences One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@amd.com/ is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Gustavo Padovan <gustavo@padovan.org> Cc: Friedrich Vock <friedrich.vock@gmx.de> Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursulin@igalia.com
2024-11-15 10:21:50 +00:00
ktime_t t = dma_fence_timestamp(tmp);
dma-fence: Use kernel's sort for merging fences One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@amd.com/ is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Gustavo Padovan <gustavo@padovan.org> Cc: Friedrich Vock <friedrich.vock@gmx.de> Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursulin@igalia.com
2024-11-15 10:21:50 +00:00
if (ktime_after(t, timestamp))
timestamp = t;
}
}
}
dma-fence: Use kernel's sort for merging fences One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@amd.com/ is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Gustavo Padovan <gustavo@padovan.org> Cc: Friedrich Vock <friedrich.vock@gmx.de> Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursulin@igalia.com
2024-11-15 10:21:50 +00:00
if (count == 0 || count == 1)
goto return_fastpath;
count = dma_fence_dedup_array(array, count);
dma-fence: Use kernel's sort for merging fences One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@amd.com/ is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Gustavo Padovan <gustavo@padovan.org> Cc: Friedrich Vock <friedrich.vock@gmx.de> Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursulin@igalia.com
2024-11-15 10:21:50 +00:00
if (count > 1) {
result = dma_fence_array_create(count, array,
dma_fence_context_alloc(1),
1, false);
if (!result) {
for (i = 0; i < count; i++)
dma_fence_put(array[i]);
tmp = NULL;
goto return_tmp;
}
return &result->base;
}
dma-fence: Use kernel's sort for merging fences One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@amd.com/ is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Gustavo Padovan <gustavo@padovan.org> Cc: Friedrich Vock <friedrich.vock@gmx.de> Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursulin@igalia.com
2024-11-15 10:21:50 +00:00
return_fastpath:
if (count == 0)
tmp = dma_fence_allocate_private_stub(timestamp);
else
tmp = array[0];
return_tmp:
kfree(array);
return tmp;
}
EXPORT_SYMBOL_GPL(__dma_fence_unwrap_merge);