linux/drivers/gpu/drm/vkms/vkms_drv.h

304 lines
9.7 KiB
C
Raw Permalink Normal View History

/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _VKMS_DRV_H_
#define _VKMS_DRV_H_
#include <linux/hrtimer.h>
#include <drm/drm.h>
#include <drm/drm_framebuffer.h>
#include <drm/drm_gem.h>
#include <drm/drm_gem_atomic_helper.h>
#include <drm/drm_encoder.h>
#include <drm/drm_writeback.h>
#define DEFAULT_DEVICE_NAME "vkms"
#define XRES_MIN 10
#define YRES_MIN 10
#define XRES_DEF 1024
#define YRES_DEF 768
#define XRES_MAX 8192
#define YRES_MAX 8192
#define NUM_OVERLAY_PLANES 8
#define VKMS_LUT_SIZE 256
/**
* struct vkms_frame_info - Structure to store the state of a frame
*
* @fb: backing drm framebuffer
* @src: source rectangle of this frame in the source framebuffer, stored in 16.16 fixed-point form
* @dst: destination rectangle in the crtc buffer, stored in whole pixel units
* @map: see @drm_shadow_plane_state.data
* @rotation: rotation applied to the source.
*
* @src and @dst should have the same size modulo the rotation.
*/
struct vkms_frame_info {
struct drm_framebuffer *fb;
struct drm_rect src, dst;
struct iosys_map map[DRM_FORMAT_MAX_PLANES];
unsigned int rotation;
drm/vkms: Implement CRC debugfs API This patch implement the necessary functions to compute and add CRCs entries: - Implement the set_crc_source() callback. - Compute CRC using crc32 on the visible part of the framebuffer. - Use ordered workqueue per output to compute and add CRC at the end of a vblank. - Use appropriate synchronization methods since the CRC computation must be atomic wrt the generated vblank event for a given atomic update, by using spinlock across atomic_begin/atomic_flush to wrap the event handling code completely and match the flip event with the CRC. Since vkms_crc_work_handle() can sleep, spinlock can't be acquired while accessing vkms_output->primary_crc to compute CRC. To make sure the data is updated and released without conflict with the vkms_crc_work_handle(), the work_struct is flushed @crtc_destroy and the data is updated before scheduling the work handle again, as follow: * CRC data update: 1- store vkms_crc_data {fb, src} per plane_state 2- @plane_duplicate_state -> allocate vkms_crc_data 3- during atomic commit (@atomic_update) -> a) copy {fb, src} to plane_state->crc_data b) get reference to fb, 3- @plane_destroy_state -> a) if (fb refcount) remove reference to fb b) deallocate crc_data * Atomic Commit: 1- vkms_plane_atomic_check 2- vkms_prepare_fb -> vmap vkms_gem_obj->vaddr 3- atomic_begin -> hold crc spinlock 4- atomic_plane_update -> a) update vkms_output->primary_crc b) get reference to fb 5- atomic_flush -> a) send vblank event while holding event_lock b) release crc spinlock * hrtimer regular callback: 1- hold crc spinlock 2- drm_crtc_handle_vblank() 3- queue vkms_crc_work_handle 4- release crc spinlock * cleanup: 1- @cleanup_fb ->vunmap vkms_gem_obj->vaddr 2- @crtc_destroy -> flush work struct 3- @plane_destroy -> a) if (fb refcount) remove reference to fb b) deallocate crc_data Signed-off-by: Haneen Mohammed <hamohammed.sa@gmail.com> [seanpaul fixed typo in vkms_crtc s/vblamk/vblank/] Signed-off-by: Sean Paul <seanpaul@chromium.org> Link: https://patchwork.freedesktop.org/patch/msgid/b948327f48c3e70ab232b4a0848ee6d033b26484.1533171495.git.hamohammed.sa@gmail.com
2018-08-02 04:10:26 +03:00
};
/**
* struct pixel_argb_u16 - Internal representation of a pixel color.
* @a: Alpha component value, stored in 16 bits, without padding, using
* machine endianness
* @r: Red component value, stored in 16 bits, without padding, using
* machine endianness
* @g: Green component value, stored in 16 bits, without padding, using
* machine endianness
* @b: Blue component value, stored in 16 bits, without padding, using
* machine endianness
*
* The goal of this structure is to keep enough precision to ensure
* correct composition results in VKMS and simplifying color
* manipulation by splitting each component into its own field.
* Caution: the byte ordering of this structure is machine-dependent,
* you can't cast it directly to AR48 or xR48.
*/
struct pixel_argb_u16 {
u16 a, r, g, b;
};
struct line_buffer {
size_t n_pixels;
struct pixel_argb_u16 *pixels;
};
/**
* typedef pixel_write_t - These functions are used to read a pixel from a
* &struct pixel_argb_u16, convert it in a specific format and write it in the @out_pixel
* buffer.
*
* @out_pixel: destination address to write the pixel
* @in_pixel: pixel to write
*/
typedef void (*pixel_write_t)(u8 *out_pixel, const struct pixel_argb_u16 *in_pixel);
struct vkms_writeback_job {
struct iosys_map data[DRM_FORMAT_MAX_PLANES];
struct vkms_frame_info wb_frame_info;
pixel_write_t pixel_write;
};
/**
* enum pixel_read_direction - Enum used internally by VKMS to represent a reading direction in a
* plane.
*/
enum pixel_read_direction {
READ_BOTTOM_TO_TOP,
READ_TOP_TO_BOTTOM,
READ_RIGHT_TO_LEFT,
READ_LEFT_TO_RIGHT
};
drm/vkms: Re-introduce line-per-line composition algorithm Re-introduce a line-by-line composition algorithm for each pixel format. This allows more performance by not requiring an indirection per pixel read. This patch is focused on readability of the code. Line-by-line composition was introduced by [1] but rewritten back to pixel-by-pixel algorithm in [2]. At this time, nobody noticed the impact on performance, and it was merged. This patch is almost a revert of [2], but in addition efforts have been made to increase readability and maintainability of the rotation handling. The blend function is now divided in two parts: - Transformation of coordinates from the output referential to the source referential - Line conversion and blending Most of the complexity of the rotation management is avoided by using drm_rect_* helpers. The remaining complexity is around the clipping, to avoid reading/writing outside source/destination buffers. The pixel conversion is now done line-by-line, so the read_pixel_t was replaced with read_pixel_line_t callback. This way the indirection is only required once per line and per plane, instead of once per pixel and per plane. The read_line_t callbacks are very similar for most pixel format, but it is required to avoid performance impact. Some helpers for color conversion were introduced to avoid code repetition: - *_to_argb_u16: perform colors conversion. They should be inlined by the compiler, and they are used to avoid repetition between multiple variants of the same format (argb/xrgb and maybe in the future for formats like bgr formats). This new algorithm was tested with: - kms_plane (for color conversions) - kms_rotation_crc (for rotations of planes) - kms_cursor_crc (for translations of planes) - kms_rotation (for all rotations and formats combinations) [3] The performance gain was mesured with kms_fb_stress [4] with some modification to fix the writeback format. The performance improvement is around 5 to 10%. [1]: commit 8ba1648567e2 ("drm: vkms: Refactor the plane composer to accept new formats") https://lore.kernel.org/all/20220905190811.25024-7-igormtorrente@gmail.com/ [2]: commit 322d716a3e8a ("drm/vkms: isolate pixel conversion functionality") https://lore.kernel.org/all/20230418130525.128733-2-mcanal@igalia.com/ [3]: https://lore.kernel.org/igt-dev/20240313-new_rotation-v2-0-6230fd5cae59@bootlin.com/ [4]: https://lore.kernel.org/all/20240422-kms_fb_stress-dev-v5-0-0c577163dc88@riseup.net/ Reviewed-by: José Expósito <jose.exposito89@gmail.com> Acked-by: Pekka Paalanen <pekka.paalanen@collabora.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241118-yuv-v14-8-2dbc2f1e222c@bootlin.com Signed-off-by: Louis Chauvet <louis.chauvet@bootlin.com>
2024-11-18 19:28:23 +01:00
struct vkms_plane_state;
/**
drm/vkms: Re-introduce line-per-line composition algorithm Re-introduce a line-by-line composition algorithm for each pixel format. This allows more performance by not requiring an indirection per pixel read. This patch is focused on readability of the code. Line-by-line composition was introduced by [1] but rewritten back to pixel-by-pixel algorithm in [2]. At this time, nobody noticed the impact on performance, and it was merged. This patch is almost a revert of [2], but in addition efforts have been made to increase readability and maintainability of the rotation handling. The blend function is now divided in two parts: - Transformation of coordinates from the output referential to the source referential - Line conversion and blending Most of the complexity of the rotation management is avoided by using drm_rect_* helpers. The remaining complexity is around the clipping, to avoid reading/writing outside source/destination buffers. The pixel conversion is now done line-by-line, so the read_pixel_t was replaced with read_pixel_line_t callback. This way the indirection is only required once per line and per plane, instead of once per pixel and per plane. The read_line_t callbacks are very similar for most pixel format, but it is required to avoid performance impact. Some helpers for color conversion were introduced to avoid code repetition: - *_to_argb_u16: perform colors conversion. They should be inlined by the compiler, and they are used to avoid repetition between multiple variants of the same format (argb/xrgb and maybe in the future for formats like bgr formats). This new algorithm was tested with: - kms_plane (for color conversions) - kms_rotation_crc (for rotations of planes) - kms_cursor_crc (for translations of planes) - kms_rotation (for all rotations and formats combinations) [3] The performance gain was mesured with kms_fb_stress [4] with some modification to fix the writeback format. The performance improvement is around 5 to 10%. [1]: commit 8ba1648567e2 ("drm: vkms: Refactor the plane composer to accept new formats") https://lore.kernel.org/all/20220905190811.25024-7-igormtorrente@gmail.com/ [2]: commit 322d716a3e8a ("drm/vkms: isolate pixel conversion functionality") https://lore.kernel.org/all/20230418130525.128733-2-mcanal@igalia.com/ [3]: https://lore.kernel.org/igt-dev/20240313-new_rotation-v2-0-6230fd5cae59@bootlin.com/ [4]: https://lore.kernel.org/all/20240422-kms_fb_stress-dev-v5-0-0c577163dc88@riseup.net/ Reviewed-by: José Expósito <jose.exposito89@gmail.com> Acked-by: Pekka Paalanen <pekka.paalanen@collabora.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241118-yuv-v14-8-2dbc2f1e222c@bootlin.com Signed-off-by: Louis Chauvet <louis.chauvet@bootlin.com>
2024-11-18 19:28:23 +01:00
* typedef pixel_read_line_t - These functions are used to read a pixel line in the source frame,
* convert it to `struct pixel_argb_u16` and write it to @out_pixel.
*
drm/vkms: Re-introduce line-per-line composition algorithm Re-introduce a line-by-line composition algorithm for each pixel format. This allows more performance by not requiring an indirection per pixel read. This patch is focused on readability of the code. Line-by-line composition was introduced by [1] but rewritten back to pixel-by-pixel algorithm in [2]. At this time, nobody noticed the impact on performance, and it was merged. This patch is almost a revert of [2], but in addition efforts have been made to increase readability and maintainability of the rotation handling. The blend function is now divided in two parts: - Transformation of coordinates from the output referential to the source referential - Line conversion and blending Most of the complexity of the rotation management is avoided by using drm_rect_* helpers. The remaining complexity is around the clipping, to avoid reading/writing outside source/destination buffers. The pixel conversion is now done line-by-line, so the read_pixel_t was replaced with read_pixel_line_t callback. This way the indirection is only required once per line and per plane, instead of once per pixel and per plane. The read_line_t callbacks are very similar for most pixel format, but it is required to avoid performance impact. Some helpers for color conversion were introduced to avoid code repetition: - *_to_argb_u16: perform colors conversion. They should be inlined by the compiler, and they are used to avoid repetition between multiple variants of the same format (argb/xrgb and maybe in the future for formats like bgr formats). This new algorithm was tested with: - kms_plane (for color conversions) - kms_rotation_crc (for rotations of planes) - kms_cursor_crc (for translations of planes) - kms_rotation (for all rotations and formats combinations) [3] The performance gain was mesured with kms_fb_stress [4] with some modification to fix the writeback format. The performance improvement is around 5 to 10%. [1]: commit 8ba1648567e2 ("drm: vkms: Refactor the plane composer to accept new formats") https://lore.kernel.org/all/20220905190811.25024-7-igormtorrente@gmail.com/ [2]: commit 322d716a3e8a ("drm/vkms: isolate pixel conversion functionality") https://lore.kernel.org/all/20230418130525.128733-2-mcanal@igalia.com/ [3]: https://lore.kernel.org/igt-dev/20240313-new_rotation-v2-0-6230fd5cae59@bootlin.com/ [4]: https://lore.kernel.org/all/20240422-kms_fb_stress-dev-v5-0-0c577163dc88@riseup.net/ Reviewed-by: José Expósito <jose.exposito89@gmail.com> Acked-by: Pekka Paalanen <pekka.paalanen@collabora.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241118-yuv-v14-8-2dbc2f1e222c@bootlin.com Signed-off-by: Louis Chauvet <louis.chauvet@bootlin.com>
2024-11-18 19:28:23 +01:00
* @plane: plane used as source for the pixel value
* @x_start: X (width) coordinate of the first pixel to copy. The caller must ensure that x_start
* is non-negative and smaller than @plane->frame_info->fb->width.
* @y_start: Y (height) coordinate of the first pixel to copy. The caller must ensure that y_start
* is non-negative and smaller than @plane->frame_info->fb->height.
* @direction: direction to use for the copy, starting at @x_start/@y_start
* @count: number of pixels to copy
* @out_pixel: pointer where to write the pixel values. They will be written from @out_pixel[0]
* (included) to @out_pixel[@count] (excluded). The caller must ensure that out_pixel have a
* length of at least @count.
*/
drm/vkms: Re-introduce line-per-line composition algorithm Re-introduce a line-by-line composition algorithm for each pixel format. This allows more performance by not requiring an indirection per pixel read. This patch is focused on readability of the code. Line-by-line composition was introduced by [1] but rewritten back to pixel-by-pixel algorithm in [2]. At this time, nobody noticed the impact on performance, and it was merged. This patch is almost a revert of [2], but in addition efforts have been made to increase readability and maintainability of the rotation handling. The blend function is now divided in two parts: - Transformation of coordinates from the output referential to the source referential - Line conversion and blending Most of the complexity of the rotation management is avoided by using drm_rect_* helpers. The remaining complexity is around the clipping, to avoid reading/writing outside source/destination buffers. The pixel conversion is now done line-by-line, so the read_pixel_t was replaced with read_pixel_line_t callback. This way the indirection is only required once per line and per plane, instead of once per pixel and per plane. The read_line_t callbacks are very similar for most pixel format, but it is required to avoid performance impact. Some helpers for color conversion were introduced to avoid code repetition: - *_to_argb_u16: perform colors conversion. They should be inlined by the compiler, and they are used to avoid repetition between multiple variants of the same format (argb/xrgb and maybe in the future for formats like bgr formats). This new algorithm was tested with: - kms_plane (for color conversions) - kms_rotation_crc (for rotations of planes) - kms_cursor_crc (for translations of planes) - kms_rotation (for all rotations and formats combinations) [3] The performance gain was mesured with kms_fb_stress [4] with some modification to fix the writeback format. The performance improvement is around 5 to 10%. [1]: commit 8ba1648567e2 ("drm: vkms: Refactor the plane composer to accept new formats") https://lore.kernel.org/all/20220905190811.25024-7-igormtorrente@gmail.com/ [2]: commit 322d716a3e8a ("drm/vkms: isolate pixel conversion functionality") https://lore.kernel.org/all/20230418130525.128733-2-mcanal@igalia.com/ [3]: https://lore.kernel.org/igt-dev/20240313-new_rotation-v2-0-6230fd5cae59@bootlin.com/ [4]: https://lore.kernel.org/all/20240422-kms_fb_stress-dev-v5-0-0c577163dc88@riseup.net/ Reviewed-by: José Expósito <jose.exposito89@gmail.com> Acked-by: Pekka Paalanen <pekka.paalanen@collabora.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241118-yuv-v14-8-2dbc2f1e222c@bootlin.com Signed-off-by: Louis Chauvet <louis.chauvet@bootlin.com>
2024-11-18 19:28:23 +01:00
typedef void (*pixel_read_line_t)(const struct vkms_plane_state *plane, int x_start,
int y_start, enum pixel_read_direction direction, int count,
struct pixel_argb_u16 out_pixel[]);
drm/vkms: Add YUV support Add support to the YUV formats bellow: - NV12/NV16/NV24 - NV21/NV61/NV42 - YUV420/YUV422/YUV444 - YVU420/YVU422/YVU444 The conversion from yuv to rgb is done with fixed-point arithmetic, using 32.32 fixed-point numbers and the drm_fixed helpers. To do the conversion, a specific matrix must be used for each color range (DRM_COLOR_*_RANGE) and encoding (DRM_COLOR_*). This matrix is stored in the `conversion_matrix` struct, along with the specific y_offset needed. This matrix is queried only once, in `vkms_plane_atomic_update` and stored in a `vkms_plane_state`. Those conversion matrices of each encoding and range were obtained by rounding the values of the original conversion matrices multiplied by 2^32. This is done to avoid the use of floating point operations. The same reading function is used for YUV and YVU formats. As the only difference between those two category of formats is the order of field, a simple swap in conversion matrix columns allows using the same function. [Louis Chauvet: - Adapted Arthur's work - Implemented the read_line_t callbacks for yuv - add struct conversion_matrix - store the whole conversion_matrix in the plane state - remove struct pixel_yuv_u8 - update the commit message - Merge the modifications from Arthur] Signed-off-by: Arthur Grillo <arthurgrillo@riseup.net> Acked-by: Maxime Ripard <mripard@kernel.org> Link: https://lore.kernel.org/r/20250415-yuv-v18-2-f2918f71ec4b@bootlin.com Signed-off-by: Louis Chauvet <louis.chauvet@bootlin.com>
2025-04-15 15:55:33 +02:00
/**
* struct conversion_matrix - Matrix to use for a specific encoding and range
*
* @matrix: Conversion matrix from yuv to rgb. The matrix is stored in a row-major manner and is
* used to compute rgb values from yuv values:
* [[r],[g],[b]] = @matrix * [[y],[u],[v]]
* OR for yvu formats:
* [[r],[g],[b]] = @matrix * [[y],[v],[u]]
* The values of the matrix are signed fixed-point values with 32 bits fractional part.
* @y_offset: Offset to apply on the y value.
*/
struct conversion_matrix {
s64 matrix[3][3];
int y_offset;
};
/**
* struct vkms_plane_state - Driver specific plane state
* @base: base plane state
* @frame_info: data required for composing computation
drm/vkms: Re-introduce line-per-line composition algorithm Re-introduce a line-by-line composition algorithm for each pixel format. This allows more performance by not requiring an indirection per pixel read. This patch is focused on readability of the code. Line-by-line composition was introduced by [1] but rewritten back to pixel-by-pixel algorithm in [2]. At this time, nobody noticed the impact on performance, and it was merged. This patch is almost a revert of [2], but in addition efforts have been made to increase readability and maintainability of the rotation handling. The blend function is now divided in two parts: - Transformation of coordinates from the output referential to the source referential - Line conversion and blending Most of the complexity of the rotation management is avoided by using drm_rect_* helpers. The remaining complexity is around the clipping, to avoid reading/writing outside source/destination buffers. The pixel conversion is now done line-by-line, so the read_pixel_t was replaced with read_pixel_line_t callback. This way the indirection is only required once per line and per plane, instead of once per pixel and per plane. The read_line_t callbacks are very similar for most pixel format, but it is required to avoid performance impact. Some helpers for color conversion were introduced to avoid code repetition: - *_to_argb_u16: perform colors conversion. They should be inlined by the compiler, and they are used to avoid repetition between multiple variants of the same format (argb/xrgb and maybe in the future for formats like bgr formats). This new algorithm was tested with: - kms_plane (for color conversions) - kms_rotation_crc (for rotations of planes) - kms_cursor_crc (for translations of planes) - kms_rotation (for all rotations and formats combinations) [3] The performance gain was mesured with kms_fb_stress [4] with some modification to fix the writeback format. The performance improvement is around 5 to 10%. [1]: commit 8ba1648567e2 ("drm: vkms: Refactor the plane composer to accept new formats") https://lore.kernel.org/all/20220905190811.25024-7-igormtorrente@gmail.com/ [2]: commit 322d716a3e8a ("drm/vkms: isolate pixel conversion functionality") https://lore.kernel.org/all/20230418130525.128733-2-mcanal@igalia.com/ [3]: https://lore.kernel.org/igt-dev/20240313-new_rotation-v2-0-6230fd5cae59@bootlin.com/ [4]: https://lore.kernel.org/all/20240422-kms_fb_stress-dev-v5-0-0c577163dc88@riseup.net/ Reviewed-by: José Expósito <jose.exposito89@gmail.com> Acked-by: Pekka Paalanen <pekka.paalanen@collabora.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241118-yuv-v14-8-2dbc2f1e222c@bootlin.com Signed-off-by: Louis Chauvet <louis.chauvet@bootlin.com>
2024-11-18 19:28:23 +01:00
* @pixel_read_line: function to read a pixel line in this plane. The creator of a
* struct vkms_plane_state must ensure that this pointer is valid
drm/vkms: Add YUV support Add support to the YUV formats bellow: - NV12/NV16/NV24 - NV21/NV61/NV42 - YUV420/YUV422/YUV444 - YVU420/YVU422/YVU444 The conversion from yuv to rgb is done with fixed-point arithmetic, using 32.32 fixed-point numbers and the drm_fixed helpers. To do the conversion, a specific matrix must be used for each color range (DRM_COLOR_*_RANGE) and encoding (DRM_COLOR_*). This matrix is stored in the `conversion_matrix` struct, along with the specific y_offset needed. This matrix is queried only once, in `vkms_plane_atomic_update` and stored in a `vkms_plane_state`. Those conversion matrices of each encoding and range were obtained by rounding the values of the original conversion matrices multiplied by 2^32. This is done to avoid the use of floating point operations. The same reading function is used for YUV and YVU formats. As the only difference between those two category of formats is the order of field, a simple swap in conversion matrix columns allows using the same function. [Louis Chauvet: - Adapted Arthur's work - Implemented the read_line_t callbacks for yuv - add struct conversion_matrix - store the whole conversion_matrix in the plane state - remove struct pixel_yuv_u8 - update the commit message - Merge the modifications from Arthur] Signed-off-by: Arthur Grillo <arthurgrillo@riseup.net> Acked-by: Maxime Ripard <mripard@kernel.org> Link: https://lore.kernel.org/r/20250415-yuv-v18-2-f2918f71ec4b@bootlin.com Signed-off-by: Louis Chauvet <louis.chauvet@bootlin.com>
2025-04-15 15:55:33 +02:00
* @conversion_matrix: matrix used for yuv formats to convert to rgb
*/
struct vkms_plane_state {
struct drm_shadow_plane_state base;
struct vkms_frame_info *frame_info;
drm/vkms: Re-introduce line-per-line composition algorithm Re-introduce a line-by-line composition algorithm for each pixel format. This allows more performance by not requiring an indirection per pixel read. This patch is focused on readability of the code. Line-by-line composition was introduced by [1] but rewritten back to pixel-by-pixel algorithm in [2]. At this time, nobody noticed the impact on performance, and it was merged. This patch is almost a revert of [2], but in addition efforts have been made to increase readability and maintainability of the rotation handling. The blend function is now divided in two parts: - Transformation of coordinates from the output referential to the source referential - Line conversion and blending Most of the complexity of the rotation management is avoided by using drm_rect_* helpers. The remaining complexity is around the clipping, to avoid reading/writing outside source/destination buffers. The pixel conversion is now done line-by-line, so the read_pixel_t was replaced with read_pixel_line_t callback. This way the indirection is only required once per line and per plane, instead of once per pixel and per plane. The read_line_t callbacks are very similar for most pixel format, but it is required to avoid performance impact. Some helpers for color conversion were introduced to avoid code repetition: - *_to_argb_u16: perform colors conversion. They should be inlined by the compiler, and they are used to avoid repetition between multiple variants of the same format (argb/xrgb and maybe in the future for formats like bgr formats). This new algorithm was tested with: - kms_plane (for color conversions) - kms_rotation_crc (for rotations of planes) - kms_cursor_crc (for translations of planes) - kms_rotation (for all rotations and formats combinations) [3] The performance gain was mesured with kms_fb_stress [4] with some modification to fix the writeback format. The performance improvement is around 5 to 10%. [1]: commit 8ba1648567e2 ("drm: vkms: Refactor the plane composer to accept new formats") https://lore.kernel.org/all/20220905190811.25024-7-igormtorrente@gmail.com/ [2]: commit 322d716a3e8a ("drm/vkms: isolate pixel conversion functionality") https://lore.kernel.org/all/20230418130525.128733-2-mcanal@igalia.com/ [3]: https://lore.kernel.org/igt-dev/20240313-new_rotation-v2-0-6230fd5cae59@bootlin.com/ [4]: https://lore.kernel.org/all/20240422-kms_fb_stress-dev-v5-0-0c577163dc88@riseup.net/ Reviewed-by: José Expósito <jose.exposito89@gmail.com> Acked-by: Pekka Paalanen <pekka.paalanen@collabora.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241118-yuv-v14-8-2dbc2f1e222c@bootlin.com Signed-off-by: Louis Chauvet <louis.chauvet@bootlin.com>
2024-11-18 19:28:23 +01:00
pixel_read_line_t pixel_read_line;
drm/vkms: Add YUV support Add support to the YUV formats bellow: - NV12/NV16/NV24 - NV21/NV61/NV42 - YUV420/YUV422/YUV444 - YVU420/YVU422/YVU444 The conversion from yuv to rgb is done with fixed-point arithmetic, using 32.32 fixed-point numbers and the drm_fixed helpers. To do the conversion, a specific matrix must be used for each color range (DRM_COLOR_*_RANGE) and encoding (DRM_COLOR_*). This matrix is stored in the `conversion_matrix` struct, along with the specific y_offset needed. This matrix is queried only once, in `vkms_plane_atomic_update` and stored in a `vkms_plane_state`. Those conversion matrices of each encoding and range were obtained by rounding the values of the original conversion matrices multiplied by 2^32. This is done to avoid the use of floating point operations. The same reading function is used for YUV and YVU formats. As the only difference between those two category of formats is the order of field, a simple swap in conversion matrix columns allows using the same function. [Louis Chauvet: - Adapted Arthur's work - Implemented the read_line_t callbacks for yuv - add struct conversion_matrix - store the whole conversion_matrix in the plane state - remove struct pixel_yuv_u8 - update the commit message - Merge the modifications from Arthur] Signed-off-by: Arthur Grillo <arthurgrillo@riseup.net> Acked-by: Maxime Ripard <mripard@kernel.org> Link: https://lore.kernel.org/r/20250415-yuv-v18-2-f2918f71ec4b@bootlin.com Signed-off-by: Louis Chauvet <louis.chauvet@bootlin.com>
2025-04-15 15:55:33 +02:00
struct conversion_matrix conversion_matrix;
};
struct vkms_plane {
struct drm_plane base;
};
struct vkms_color_lut {
struct drm_color_lut *base;
size_t lut_length;
s64 channel_value2index_ratio;
};
/**
* struct vkms_crtc_state - Driver specific CRTC state
*
* @base: base CRTC state
* @composer_work: work struct to compose and add CRC entries
*
* @num_active_planes: Number of active planes
* @active_planes: List containing all the active planes (counted by
* @num_active_planes). They should be stored in z-order.
* @active_writeback: Current active writeback job
* @gamma_lut: Look up table for gamma used in this CRTC
* @crc_pending: Protected by @vkms_output.composer_lock, true when the frame CRC is not computed
* yet. Used by vblank to detect if the composer is too slow.
* @wb_pending: Protected by @vkms_output.composer_lock, true when a writeback frame is requested.
* @frame_start: Protected by @vkms_output.composer_lock, saves the frame number before the start
* of the composition process.
* @frame_end: Protected by @vkms_output.composer_lock, saves the last requested frame number.
* This is used to generate enough CRC entries when the composition worker is too slow.
*/
struct vkms_crtc_state {
struct drm_crtc_state base;
struct work_struct composer_work;
drm/vkms: Fix crc worker races The issue we have is that the crc worker might fall behind. We've tried to handle this by tracking both the earliest frame for which it still needs to compute a crc, and the last one. Plus when the crtc_state changes, we have a new work item, which are all run in order due to the ordered workqueue we allocate for each vkms crtc. Trouble is there's been a few small issues in the current code: - we need to capture frame_end in the vblank hrtimer, not in the worker. The worker might run much later, and then we generate a lot of crc for which there's already a different worker queued up. - frame number might be 0, so create a new crc_pending boolean to track this without confusion. - we need to atomically grab frame_start/end and clear it, so do that all in one go. This is not going to create a new race, because if we race with the hrtimer then our work will be re-run. - only race that can happen is the following: 1. worker starts 2. hrtimer runs and updates frame_end 3. worker grabs frame_start/end, already reading the new frame_end, and clears crc_pending 4. hrtimer calls queue_work() 5. worker completes 6. worker gets re-run, crc_pending is false Explain this case a bit better by rewording the comment. v2: Demote warning level output to debug when we fail to requeue, this is expected under high load when the crc worker can't quite keep up. Cc: Shayenne Moura <shayenneluzmoura@gmail.com> Cc: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Cc: Haneen Mohammed <hamohammed.sa@gmail.com> Cc: Daniel Vetter <daniel@ffwll.ch> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Reviewed-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Tested-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Signed-off-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190606222751.32567-2-daniel.vetter@ffwll.ch
2019-06-07 00:27:42 +02:00
drm/vkms: totally reworked crc data tracking The crc computation worker needs to be able to get at some data structures and framebuffer mappings, while potentially more atomic updates are going on. The solution thus far is to copy relevant bits around, but that's very tedious. Here's a new approach, which tries to be more clever, but relies on a few not-so-obvious things: - crtc_state is always updated when a plane_state changes. Therefore we can just stuff plane_state pointers into a crtc_state. That solves the problem of easily getting at the needed plane_states. - with the flushing changes from previous patches the above also holds without races due to the next atomic update being a bit eager with cleaning up pending work - we always wait for all crc work items to complete before unmapping framebuffers. - we also need to make sure that the hrtimer fires off the right worker. Keep a new distinct crc_state pointer, under the vkms_output->lock protection for this. Note that crtc->state is updated very early in the atomic commit, way before we arm the vblank event - the vblank event should always match the buffers we use to compute the crc. This also solves an issue in the hrtimer, where we've accessed drm_crtc->state without holding the right locks (we held none - oops). - in the worker itself we can then just access the plane states we need, again solving a bunch of ordering and locking issues. Accessing plane->state requires locks, accessing the private vkms_crtc_state->active_planes pointer only requires that the memory doesn't get freed too early. The idea behind vkms_crtc_state->active_planes is that this would contain all visible planes, in z-order, as a first step towards a more generic blending implementation. Note that this patch also fixes races between prepare_fb/cleanup_fb and the crc worker accessing ->vaddr. Cc: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Cc: Haneen Mohammed <hamohammed.sa@gmail.com> Cc: Daniel Vetter <daniel@ffwll.ch> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Reviewed-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Tested-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Signed-off-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190606222751.32567-10-daniel.vetter@ffwll.ch
2019-06-07 00:27:50 +02:00
int num_active_planes;
struct vkms_plane_state **active_planes;
struct vkms_writeback_job *active_writeback;
struct vkms_color_lut gamma_lut;
drm/vkms: totally reworked crc data tracking The crc computation worker needs to be able to get at some data structures and framebuffer mappings, while potentially more atomic updates are going on. The solution thus far is to copy relevant bits around, but that's very tedious. Here's a new approach, which tries to be more clever, but relies on a few not-so-obvious things: - crtc_state is always updated when a plane_state changes. Therefore we can just stuff plane_state pointers into a crtc_state. That solves the problem of easily getting at the needed plane_states. - with the flushing changes from previous patches the above also holds without races due to the next atomic update being a bit eager with cleaning up pending work - we always wait for all crc work items to complete before unmapping framebuffers. - we also need to make sure that the hrtimer fires off the right worker. Keep a new distinct crc_state pointer, under the vkms_output->lock protection for this. Note that crtc->state is updated very early in the atomic commit, way before we arm the vblank event - the vblank event should always match the buffers we use to compute the crc. This also solves an issue in the hrtimer, where we've accessed drm_crtc->state without holding the right locks (we held none - oops). - in the worker itself we can then just access the plane states we need, again solving a bunch of ordering and locking issues. Accessing plane->state requires locks, accessing the private vkms_crtc_state->active_planes pointer only requires that the memory doesn't get freed too early. The idea behind vkms_crtc_state->active_planes is that this would contain all visible planes, in z-order, as a first step towards a more generic blending implementation. Note that this patch also fixes races between prepare_fb/cleanup_fb and the crc worker accessing ->vaddr. Cc: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Cc: Haneen Mohammed <hamohammed.sa@gmail.com> Cc: Daniel Vetter <daniel@ffwll.ch> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Reviewed-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Tested-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Signed-off-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190606222751.32567-10-daniel.vetter@ffwll.ch
2019-06-07 00:27:50 +02:00
drm/vkms: Fix crc worker races The issue we have is that the crc worker might fall behind. We've tried to handle this by tracking both the earliest frame for which it still needs to compute a crc, and the last one. Plus when the crtc_state changes, we have a new work item, which are all run in order due to the ordered workqueue we allocate for each vkms crtc. Trouble is there's been a few small issues in the current code: - we need to capture frame_end in the vblank hrtimer, not in the worker. The worker might run much later, and then we generate a lot of crc for which there's already a different worker queued up. - frame number might be 0, so create a new crc_pending boolean to track this without confusion. - we need to atomically grab frame_start/end and clear it, so do that all in one go. This is not going to create a new race, because if we race with the hrtimer then our work will be re-run. - only race that can happen is the following: 1. worker starts 2. hrtimer runs and updates frame_end 3. worker grabs frame_start/end, already reading the new frame_end, and clears crc_pending 4. hrtimer calls queue_work() 5. worker completes 6. worker gets re-run, crc_pending is false Explain this case a bit better by rewording the comment. v2: Demote warning level output to debug when we fail to requeue, this is expected under high load when the crc worker can't quite keep up. Cc: Shayenne Moura <shayenneluzmoura@gmail.com> Cc: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Cc: Haneen Mohammed <hamohammed.sa@gmail.com> Cc: Daniel Vetter <daniel@ffwll.ch> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Reviewed-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Tested-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Signed-off-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190606222751.32567-2-daniel.vetter@ffwll.ch
2019-06-07 00:27:42 +02:00
bool crc_pending;
bool wb_pending;
u64 frame_start;
u64 frame_end;
};
/**
* struct vkms_output - Internal representation of all output components in VKMS
*
* @crtc: Base CRTC in DRM
* @encoder: DRM encoder used for this output
* @connector: DRM connector used for this output
* @wb_connecter: DRM writeback connector used for this output
* @vblank_hrtimer: Timer used to trigger the vblank
* @period_ns: vblank period, in nanoseconds, used to configure @vblank_hrtimer and to compute
* vblank timestamps
* @composer_workq: Ordered workqueue for @composer_state.composer_work.
* @lock: Lock used to protect concurrent access to the composer
* @composer_enabled: Protected by @lock, true when the VKMS composer is active (crc needed or
* writeback)
* @composer_state: Protected by @lock, current state of this VKMS output
* @composer_lock: Lock used internally to protect @composer_state members
*/
struct vkms_output {
struct drm_crtc crtc;
struct drm_writeback_connector wb_connector;
struct drm_encoder wb_encoder;
struct hrtimer vblank_hrtimer;
ktime_t period_ns;
struct workqueue_struct *composer_workq;
drm/vkms: Implement CRC debugfs API This patch implement the necessary functions to compute and add CRCs entries: - Implement the set_crc_source() callback. - Compute CRC using crc32 on the visible part of the framebuffer. - Use ordered workqueue per output to compute and add CRC at the end of a vblank. - Use appropriate synchronization methods since the CRC computation must be atomic wrt the generated vblank event for a given atomic update, by using spinlock across atomic_begin/atomic_flush to wrap the event handling code completely and match the flip event with the CRC. Since vkms_crc_work_handle() can sleep, spinlock can't be acquired while accessing vkms_output->primary_crc to compute CRC. To make sure the data is updated and released without conflict with the vkms_crc_work_handle(), the work_struct is flushed @crtc_destroy and the data is updated before scheduling the work handle again, as follow: * CRC data update: 1- store vkms_crc_data {fb, src} per plane_state 2- @plane_duplicate_state -> allocate vkms_crc_data 3- during atomic commit (@atomic_update) -> a) copy {fb, src} to plane_state->crc_data b) get reference to fb, 3- @plane_destroy_state -> a) if (fb refcount) remove reference to fb b) deallocate crc_data * Atomic Commit: 1- vkms_plane_atomic_check 2- vkms_prepare_fb -> vmap vkms_gem_obj->vaddr 3- atomic_begin -> hold crc spinlock 4- atomic_plane_update -> a) update vkms_output->primary_crc b) get reference to fb 5- atomic_flush -> a) send vblank event while holding event_lock b) release crc spinlock * hrtimer regular callback: 1- hold crc spinlock 2- drm_crtc_handle_vblank() 3- queue vkms_crc_work_handle 4- release crc spinlock * cleanup: 1- @cleanup_fb ->vunmap vkms_gem_obj->vaddr 2- @crtc_destroy -> flush work struct 3- @plane_destroy -> a) if (fb refcount) remove reference to fb b) deallocate crc_data Signed-off-by: Haneen Mohammed <hamohammed.sa@gmail.com> [seanpaul fixed typo in vkms_crtc s/vblamk/vblank/] Signed-off-by: Sean Paul <seanpaul@chromium.org> Link: https://patchwork.freedesktop.org/patch/msgid/b948327f48c3e70ab232b4a0848ee6d033b26484.1533171495.git.hamohammed.sa@gmail.com
2018-08-02 04:10:26 +03:00
spinlock_t lock;
bool composer_enabled;
struct vkms_crtc_state *composer_state;
drm/vkms: totally reworked crc data tracking The crc computation worker needs to be able to get at some data structures and framebuffer mappings, while potentially more atomic updates are going on. The solution thus far is to copy relevant bits around, but that's very tedious. Here's a new approach, which tries to be more clever, but relies on a few not-so-obvious things: - crtc_state is always updated when a plane_state changes. Therefore we can just stuff plane_state pointers into a crtc_state. That solves the problem of easily getting at the needed plane_states. - with the flushing changes from previous patches the above also holds without races due to the next atomic update being a bit eager with cleaning up pending work - we always wait for all crc work items to complete before unmapping framebuffers. - we also need to make sure that the hrtimer fires off the right worker. Keep a new distinct crc_state pointer, under the vkms_output->lock protection for this. Note that crtc->state is updated very early in the atomic commit, way before we arm the vblank event - the vblank event should always match the buffers we use to compute the crc. This also solves an issue in the hrtimer, where we've accessed drm_crtc->state without holding the right locks (we held none - oops). - in the worker itself we can then just access the plane states we need, again solving a bunch of ordering and locking issues. Accessing plane->state requires locks, accessing the private vkms_crtc_state->active_planes pointer only requires that the memory doesn't get freed too early. The idea behind vkms_crtc_state->active_planes is that this would contain all visible planes, in z-order, as a first step towards a more generic blending implementation. Note that this patch also fixes races between prepare_fb/cleanup_fb and the crc worker accessing ->vaddr. Cc: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Cc: Haneen Mohammed <hamohammed.sa@gmail.com> Cc: Daniel Vetter <daniel@ffwll.ch> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Reviewed-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Tested-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Signed-off-by: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190606222751.32567-10-daniel.vetter@ffwll.ch
2019-06-07 00:27:50 +02:00
spinlock_t composer_lock;
};
struct vkms_config;
/**
* struct vkms_device - Description of a VKMS device
*
* @drm - Base device in DRM
* @faux_dev - Associated faux device
* @output - Configuration and sub-components of the VKMS device
* @config: Configuration used in this VKMS device
*/
struct vkms_device {
struct drm_device drm;
struct faux_device *faux_dev;
const struct vkms_config *config;
};
/*
* The following helpers are used to convert a member of a struct into its parent.
*/
#define drm_crtc_to_vkms_output(target) \
container_of(target, struct vkms_output, crtc)
#define drm_device_to_vkms_device(target) \
container_of(target, struct vkms_device, drm)
#define to_vkms_crtc_state(target)\
container_of(target, struct vkms_crtc_state, base)
#define to_vkms_plane_state(target)\
container_of(target, struct vkms_plane_state, base.base)
/**
* vkms_crtc_init() - Initialize a CRTC for VKMS
* @dev: DRM device associated with the VKMS buffer
* @crtc: uninitialized CRTC device
* @primary: primary plane to attach to the CRTC
* @cursor: plane to attach to the CRTC
*/
struct vkms_output *vkms_crtc_init(struct drm_device *dev,
struct drm_plane *primary,
struct drm_plane *cursor);
/**
* vkms_output_init() - Initialize all sub-components needed for a VKMS device.
*
* @vkmsdev: VKMS device to initialize
*/
int vkms_output_init(struct vkms_device *vkmsdev);
/**
* vkms_plane_init() - Initialize a plane
*
* @vkmsdev: VKMS device containing the plane
* @type: type of plane to initialize
*/
struct vkms_plane *vkms_plane_init(struct vkms_device *vkmsdev,
enum drm_plane_type type);
drm/vkms: Implement CRC debugfs API This patch implement the necessary functions to compute and add CRCs entries: - Implement the set_crc_source() callback. - Compute CRC using crc32 on the visible part of the framebuffer. - Use ordered workqueue per output to compute and add CRC at the end of a vblank. - Use appropriate synchronization methods since the CRC computation must be atomic wrt the generated vblank event for a given atomic update, by using spinlock across atomic_begin/atomic_flush to wrap the event handling code completely and match the flip event with the CRC. Since vkms_crc_work_handle() can sleep, spinlock can't be acquired while accessing vkms_output->primary_crc to compute CRC. To make sure the data is updated and released without conflict with the vkms_crc_work_handle(), the work_struct is flushed @crtc_destroy and the data is updated before scheduling the work handle again, as follow: * CRC data update: 1- store vkms_crc_data {fb, src} per plane_state 2- @plane_duplicate_state -> allocate vkms_crc_data 3- during atomic commit (@atomic_update) -> a) copy {fb, src} to plane_state->crc_data b) get reference to fb, 3- @plane_destroy_state -> a) if (fb refcount) remove reference to fb b) deallocate crc_data * Atomic Commit: 1- vkms_plane_atomic_check 2- vkms_prepare_fb -> vmap vkms_gem_obj->vaddr 3- atomic_begin -> hold crc spinlock 4- atomic_plane_update -> a) update vkms_output->primary_crc b) get reference to fb 5- atomic_flush -> a) send vblank event while holding event_lock b) release crc spinlock * hrtimer regular callback: 1- hold crc spinlock 2- drm_crtc_handle_vblank() 3- queue vkms_crc_work_handle 4- release crc spinlock * cleanup: 1- @cleanup_fb ->vunmap vkms_gem_obj->vaddr 2- @crtc_destroy -> flush work struct 3- @plane_destroy -> a) if (fb refcount) remove reference to fb b) deallocate crc_data Signed-off-by: Haneen Mohammed <hamohammed.sa@gmail.com> [seanpaul fixed typo in vkms_crtc s/vblamk/vblank/] Signed-off-by: Sean Paul <seanpaul@chromium.org> Link: https://patchwork.freedesktop.org/patch/msgid/b948327f48c3e70ab232b4a0848ee6d033b26484.1533171495.git.hamohammed.sa@gmail.com
2018-08-02 04:10:26 +03:00
/* CRC Support */
const char *const *vkms_get_crc_sources(struct drm_crtc *crtc,
size_t *count);
int vkms_set_crc_source(struct drm_crtc *crtc, const char *src_name);
int vkms_verify_crc_source(struct drm_crtc *crtc, const char *source_name,
size_t *values_cnt);
/* Composer Support */
void vkms_composer_worker(struct work_struct *work);
void vkms_set_composer(struct vkms_output *out, bool enabled);
void vkms_writeback_row(struct vkms_writeback_job *wb, const struct line_buffer *src_buffer, int y);
/* Writeback */
int vkms_enable_writeback_connector(struct vkms_device *vkmsdev, struct vkms_output *vkms_out);
drm/vkms: Implement CRC debugfs API This patch implement the necessary functions to compute and add CRCs entries: - Implement the set_crc_source() callback. - Compute CRC using crc32 on the visible part of the framebuffer. - Use ordered workqueue per output to compute and add CRC at the end of a vblank. - Use appropriate synchronization methods since the CRC computation must be atomic wrt the generated vblank event for a given atomic update, by using spinlock across atomic_begin/atomic_flush to wrap the event handling code completely and match the flip event with the CRC. Since vkms_crc_work_handle() can sleep, spinlock can't be acquired while accessing vkms_output->primary_crc to compute CRC. To make sure the data is updated and released without conflict with the vkms_crc_work_handle(), the work_struct is flushed @crtc_destroy and the data is updated before scheduling the work handle again, as follow: * CRC data update: 1- store vkms_crc_data {fb, src} per plane_state 2- @plane_duplicate_state -> allocate vkms_crc_data 3- during atomic commit (@atomic_update) -> a) copy {fb, src} to plane_state->crc_data b) get reference to fb, 3- @plane_destroy_state -> a) if (fb refcount) remove reference to fb b) deallocate crc_data * Atomic Commit: 1- vkms_plane_atomic_check 2- vkms_prepare_fb -> vmap vkms_gem_obj->vaddr 3- atomic_begin -> hold crc spinlock 4- atomic_plane_update -> a) update vkms_output->primary_crc b) get reference to fb 5- atomic_flush -> a) send vblank event while holding event_lock b) release crc spinlock * hrtimer regular callback: 1- hold crc spinlock 2- drm_crtc_handle_vblank() 3- queue vkms_crc_work_handle 4- release crc spinlock * cleanup: 1- @cleanup_fb ->vunmap vkms_gem_obj->vaddr 2- @crtc_destroy -> flush work struct 3- @plane_destroy -> a) if (fb refcount) remove reference to fb b) deallocate crc_data Signed-off-by: Haneen Mohammed <hamohammed.sa@gmail.com> [seanpaul fixed typo in vkms_crtc s/vblamk/vblank/] Signed-off-by: Sean Paul <seanpaul@chromium.org> Link: https://patchwork.freedesktop.org/patch/msgid/b948327f48c3e70ab232b4a0848ee6d033b26484.1533171495.git.hamohammed.sa@gmail.com
2018-08-02 04:10:26 +03:00
#endif /* _VKMS_DRV_H_ */