/* Source path: linux/drivers/media/platform/verisilicon/hantro_postproc.c */

// SPDX-License-Identifier: GPL-2.0
/*
* Hantro G1 post-processor support
*
* Copyright (C) 2019 Collabora, Ltd.
*/
#include <linux/dma-mapping.h>
#include <linux/types.h>
#include "hantro.h"
#include "hantro_hw.h"
#include "hantro_g1_regs.h"
#include "hantro_g2_regs.h"
#include "hantro_v4l2.h"
/*
 * Write @val to the G1 post-processor register field @reg_name, where
 * @reg_name is a member of hantro_g1_postproc_regs, via hantro_reg_write().
 *
 * The body is wrapped in do { } while (0) so that the macro followed by a
 * semicolon forms exactly one statement and can be used as the unbraced
 * body of an if/else.
 */
#define HANTRO_PP_REG_WRITE(vpu, reg_name, val) \
do { \
	hantro_reg_write(vpu, \
			 &hantro_g1_postproc_regs.reg_name, \
			 val); \
} while (0)
/*
 * Same as the non-relaxed G1 post-processor register write, but goes through
 * hantro_reg_write_relaxed(). @reg_name is a member of
 * hantro_g1_postproc_regs.
 *
 * The body is wrapped in do { } while (0) so that the macro followed by a
 * semicolon forms exactly one statement and can be used as the unbraced
 * body of an if/else.
 */
#define HANTRO_PP_REG_WRITE_RELAXED(vpu, reg_name, val) \
do { \
	hantro_reg_write_relaxed(vpu, \
				 &hantro_g1_postproc_regs.reg_name, \
				 val); \
} while (0)
/*
 * Pixel-format codes for the G1 post-processor.
 *
 * VPU_PP_IN_* values go into the input_fmt register field and VPU_PP_OUT_*
 * values into the output_fmt field. In this file only VPU_PP_IN_NV12 and
 * VPU_PP_OUT_YUYV are actually programmed.
 *
 * NOTE(review): "YUV240" in VPU_PP_IN_YUV240_TILED looks like a typo for
 * "YUV420" - confirm before renaming, other files may reference it.
 */
#define VPU_PP_IN_YUYV 0x0
#define VPU_PP_IN_NV12 0x1
#define VPU_PP_IN_YUV420 0x2
#define VPU_PP_IN_YUV240_TILED 0x5
#define VPU_PP_OUT_RGB 0x0
#define VPU_PP_OUT_YUYV 0x3
/*
 * Register field descriptors for the G1 post-processor, addressed by name
 * through the HANTRO_PP_REG_WRITE*() macros.
 *
 * NOTE(review): each initializer is presumably {register offset, bit shift,
 * field mask} - confirm against the field descriptor struct declaration,
 * which is not part of this file.
 */
static const struct hantro_postproc_regs hantro_g1_postproc_regs = {
/* Pipeline (decoder + post-processor) enable bit. */
.pipeline_en = {G1_REG_PP_INTERRUPT, 1, 0x1},
/* Device configuration fields. */
.max_burst = {G1_REG_PP_DEV_CONFIG, 0, 0x1f},
.clk_gate = {G1_REG_PP_DEV_CONFIG, 1, 0x1},
.out_swap32 = {G1_REG_PP_DEV_CONFIG, 5, 0x1},
.out_endian = {G1_REG_PP_DEV_CONFIG, 6, 0x1},
/* Destination buffer address. */
.out_luma_base = {G1_REG_PP_OUT_LUMA_BASE, 0, 0xffffffff},
/* Input size; programmed in macroblock units by hantro_postproc_g1_enable(). */
.input_width = {G1_REG_PP_INPUT_SIZE, 0, 0x1ff},
.input_height = {G1_REG_PP_INPUT_SIZE, 9, 0x1ff},
/* Output size and formats; sizes are programmed in pixels. */
.output_width = {G1_REG_PP_CONTROL, 4, 0x7ff},
.output_height = {G1_REG_PP_CONTROL, 15, 0x7ff},
.input_fmt = {G1_REG_PP_CONTROL, 29, 0x7},
.output_fmt = {G1_REG_PP_CONTROL, 26, 0x7},
.orig_width = {G1_REG_PP_MASK1_ORIG_WIDTH, 23, 0x1ff},
.display_width = {G1_REG_PP_DISPLAY_WIDTH_IN_EXT, 0, 0xfff},
/* Extension fields for the input size (upper bits of the MB counts). */
.input_width_ext = {G1_REG_PP_DISPLAY_WIDTH_IN_EXT, 26, 0x7},
.input_height_ext = {G1_REG_PP_DISPLAY_WIDTH_IN_EXT, 29, 0x7},
};
/*
 * Tell whether the post-processor has to be used for @fmt on @ctx.
 *
 * Encoder contexts never use it. For any other context it is needed when
 * the context itself requests it (ctx->need_postproc) or when the format
 * is flagged as post-processed.
 *
 * Returns true if the post-processor is required, false otherwise.
 */
bool hantro_needs_postproc(const struct hantro_ctx *ctx,
			   const struct hantro_fmt *fmt)
{
	return !ctx->is_encoder &&
	       (ctx->need_postproc || fmt->postprocessed);
}
/*
 * Program the G1 post-processor for the current job.
 *
 * Enables pipeline mode, selects the input/output pixel formats, points the
 * output at the next destination (capture) buffer of @ctx and programs the
 * picture geometry taken from ctx->dst_fmt.
 *
 * Only V4L2_PIX_FMT_YUYV is handled as an output format; anything else
 * triggers a WARN and falls back to output format code 0.
 */
static void hantro_postproc_g1_enable(struct hantro_ctx *ctx)
{
	struct hantro_dev *vpu = ctx->dev;
	struct vb2_v4l2_buffer *dst_buf;
	u32 src_pp_fmt, dst_pp_fmt;
	dma_addr_t dst_dma;

	/* Turn on pipeline mode. Must be done first. */
	HANTRO_PP_REG_WRITE(vpu, pipeline_en, 0x1);

	/* The post-processor input is always programmed as NV12 here. */
	src_pp_fmt = VPU_PP_IN_NV12;

	switch (ctx->vpu_dst_fmt->fourcc) {
	case V4L2_PIX_FMT_YUYV:
		dst_pp_fmt = VPU_PP_OUT_YUYV;
		break;
	default:
		WARN(1, "output format %d not supported by the post-processor, this wasn't expected.",
		     ctx->vpu_dst_fmt->fourcc);
		dst_pp_fmt = 0;
		break;
	}

	dst_buf = v4l2_m2m_next_dst_buf(ctx->fh.m2m_ctx);
	dst_dma = vb2_dma_contig_plane_dma_addr(&dst_buf->vb2_buf, 0);

	HANTRO_PP_REG_WRITE(vpu, clk_gate, 0x1);
	HANTRO_PP_REG_WRITE(vpu, out_endian, 0x1);
	HANTRO_PP_REG_WRITE(vpu, out_swap32, 0x1);
	HANTRO_PP_REG_WRITE(vpu, max_burst, 16);
	HANTRO_PP_REG_WRITE(vpu, out_luma_base, dst_dma);

	/* Input geometry is expressed in macroblocks, output in pixels. */
	HANTRO_PP_REG_WRITE(vpu, input_width, MB_WIDTH(ctx->dst_fmt.width));
	HANTRO_PP_REG_WRITE(vpu, input_height, MB_HEIGHT(ctx->dst_fmt.height));
	HANTRO_PP_REG_WRITE(vpu, input_fmt, src_pp_fmt);
	HANTRO_PP_REG_WRITE(vpu, output_fmt, dst_pp_fmt);
	HANTRO_PP_REG_WRITE(vpu, output_width, ctx->dst_fmt.width);
	HANTRO_PP_REG_WRITE(vpu, output_height, ctx->dst_fmt.height);
	HANTRO_PP_REG_WRITE(vpu, orig_width, MB_WIDTH(ctx->dst_fmt.width));
	HANTRO_PP_REG_WRITE(vpu, display_width, ctx->dst_fmt.width);

	/*
	 * The *_ext fields receive the upper bits of the macroblock counts,
	 * so convert to macroblocks first and shift afterwards, for the
	 * height exactly as for the width.
	 *
	 * NOTE(review): the height used to be shifted in pixels before the
	 * macroblock conversion, which produced a non-zero extension for
	 * any height >= 256. The shift amounts (9 and 8) are kept as they
	 * were; confirm the field widths against the hardware manual.
	 */
	HANTRO_PP_REG_WRITE(vpu, input_width_ext, MB_WIDTH(ctx->dst_fmt.width) >> 9);
	HANTRO_PP_REG_WRITE(vpu, input_height_ext, MB_HEIGHT(ctx->dst_fmt.height) >> 8);
}
static int down_scale_factor(struct hantro_ctx *ctx)
{
media: verisilicon: Do not enable G2 postproc downscale if source is narrower than destination In case of encoded input VP9 data width that is not multiple of macroblock size, which is 16 (e.g. 1080x1920 frames, where 1080 is multiple of 8), the width is padded to be a multiple of macroblock size (for 1080x1920 frames, that is 1088x1920). The hantro_postproc_g2_enable() checks whether the encoded data width is equal to decoded frame width, and if not, enables down-scale mode. For a frame where input is 1080x1920 and output is 1088x1920, this is incorrect as no down-scale happens, the frame is only padded. Enabling the down-scale mode in this case results in corrupted frames. Fix this by adjusting the check to test whether encoded data width is greater than decoded frame width, and only in that case enable the down-scale mode. To generate input test data to trigger this bug, use e.g.: $ gst-launch-1.0 videotestsrc ! video/x-raw,width=272,height=256,format=I420 ! \ vp9enc ! matroskamux ! filesink location=/tmp/test.vp9 To trigger the bug upon decoding (note that the NV12 must be forced, as that assures the output data would pass the G2 postproc): $ gst-launch-1.0 filesrc location=/tmp/test.vp9 ! matroskademux ! vp9parse ! \ v4l2slvp9dec ! video/x-raw,format=NV12 ! videoconvert ! fbdevsink Fixes: 79c987de8b35 ("media: hantro: Use post processor scaling capacities") Signed-off-by: Marek Vasut <marex@denx.de> Reviewed-by: Benjamin Gaignard <benjamin.gaignard@collabora.com> Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
2023-08-24 03:39:35 +02:00
if (ctx->src_fmt.width <= ctx->dst_fmt.width)
return 0;
return DIV_ROUND_CLOSEST(ctx->src_fmt.width, ctx->dst_fmt.width);
}
static void hantro_postproc_g2_enable(struct hantro_ctx *ctx)
{
struct hantro_dev *vpu = ctx->dev;
struct vb2_v4l2_buffer *dst_buf;
int down_scale = down_scale_factor(ctx);
int out_depth;
size_t chroma_offset;
dma_addr_t dst_dma;
dst_buf = hantro_get_dst_buf(ctx);
dst_dma = vb2_dma_contig_plane_dma_addr(&dst_buf->vb2_buf, 0);
chroma_offset = ctx->dst_fmt.plane_fmt[0].bytesperline *
ctx->dst_fmt.height;
if (down_scale) {
hantro_reg_write(vpu, &g2_down_scale_e, 1);
hantro_reg_write(vpu, &g2_down_scale_y, down_scale >> 2);
hantro_reg_write(vpu, &g2_down_scale_x, down_scale >> 2);
hantro_write_addr(vpu, G2_DS_DST, dst_dma);
hantro_write_addr(vpu, G2_DS_DST_CHR, dst_dma + (chroma_offset >> down_scale));
} else {
hantro_write_addr(vpu, G2_RS_OUT_LUMA_ADDR, dst_dma);
hantro_write_addr(vpu, G2_RS_OUT_CHROMA_ADDR, dst_dma + chroma_offset);
}
out_depth = hantro_get_format_depth(ctx->dst_fmt.pixelformat);
if (ctx->dev->variant->legacy_regs) {
u8 pp_shift = 0;
if (out_depth > 8)
pp_shift = 16 - out_depth;
hantro_reg_write(ctx->dev, &g2_rs_out_bit_depth, out_depth);
hantro_reg_write(ctx->dev, &g2_pp_pix_shift, pp_shift);
} else {
hantro_reg_write(vpu, &g2_output_8_bits, out_depth > 8 ? 0 : 1);
hantro_reg_write(vpu, &g2_output_format, out_depth > 8 ? 1 : 0);
}
hantro_reg_write(vpu, &g2_out_rs_e, 1);
}
/*
 * Enumerate the frame sizes the G2 scaler can produce for @ctx.
 *
 * fsize->index is used as a power-of-two exponent for the divisor applied
 * to the source size: index 0 is the unscaled size, indexes 1, 2 and 3
 * divide width and height by 2, 4 and 8.
 *
 * Returns 0 and fills @fsize with a discrete size, or -EINVAL when the
 * index is above 3 or the source format has a zero width or height.
 */
static int hantro_postproc_g2_enum_framesizes(struct hantro_ctx *ctx,
					      struct v4l2_frmsizeenum *fsize)
{
	unsigned int shift = fsize->index;

	if (shift > 3 || !ctx->src_fmt.width || !ctx->src_fmt.height)
		return -EINVAL;

	fsize->type = V4L2_FRMSIZE_TYPE_DISCRETE;
	fsize->discrete.width = ctx->src_fmt.width >> shift;
	fsize->discrete.height = ctx->src_fmt.height >> shift;

	return 0;
}
/*
 * Release every intermediate decoder buffer owned by @ctx.
 *
 * Walks ctx->postproc.dec_q up to the capture queue's max_num_buffers and
 * frees each slot that currently holds an allocation. The cpu pointer of a
 * freed slot is reset to NULL, so calling this again is harmless.
 */
void hantro_postproc_free(struct hantro_ctx *ctx)
{
	struct vb2_queue *capture_q = &ctx->fh.m2m_ctx->cap_q_ctx.q;
	unsigned int idx;

	for (idx = 0; idx < capture_q->max_num_buffers; idx++) {
		struct hantro_aux_buf *aux = &ctx->postproc.dec_q[idx];

		/* Nothing allocated in this slot. */
		if (!aux->cpu)
			continue;

		dma_free_attrs(ctx->dev->dev, aux->size, aux->cpu,
			       aux->dma, aux->attrs);
		aux->cpu = NULL;
	}
}
static unsigned int hantro_postproc_buffer_size(struct hantro_ctx *ctx)
{
unsigned int buf_size;
buf_size = ctx->ref_fmt.plane_fmt[0].sizeimage;
if (ctx->vpu_src_fmt->fourcc == V4L2_PIX_FMT_H264_SLICE)
buf_size += hantro_h264_mv_size(ctx->ref_fmt.width,
ctx->ref_fmt.height);
else if (ctx->vpu_src_fmt->fourcc == V4L2_PIX_FMT_VP9_FRAME)
buf_size += hantro_vp9_mv_size(ctx->ref_fmt.width,
ctx->ref_fmt.height);
media: verisilicon: Add reference buffer compression feature Reference frame compression is a feature added in the G2 decoder to compress frame buffers so that the bandwidth of storing/loading reference frames can be reduced, especially with high resolution decoded streams. The impact of compressed frames is confirmed when using perf to monitor the number of memory accesses with or without the compression feature. The following command: perf stat -a -e \ imx8_ddr0/cycles/,imx8_ddr0/read-cycles/,imx8_ddr0/write-cycles/ \ gst-launch-1.0 filesrc \ location=Jockey_3840x2160_120fps_420_8bit_HEVC_RAW.hevc ! queue ! \ h265parse ! v4l2slh265dec ! video/x-raw,format=NV12 ! fakesink Gives us these results without the compression feature: Performance counter stats for 'system wide': 1711300345 imx8_ddr0/cycles/ 892207924 imx8_ddr0/read-cycles/ 1291785864 imx8_ddr0/write-cycles/ 13.760048353 seconds time elapsed With the compression feature: Performance counter stats for 'system wide': 274526799 imx8_ddr0/cycles/ 453120194 imx8_ddr0/read-cycles/ 833391434 imx8_ddr0/write-cycles/ 18.257831534 seconds time elapsed As expected the number of read/write cycles are really lower when compression is used. Since storing the compression data requires more memory a module parameter named 'hevc_use_compression' is used to enable/disable this feature and, by default, compression isn't used. Enabling the compression feature means that the output-frames of the decoder are stored with a specific compression pixel-format. Since this pixel format is unknown, this patch restrains the compression feature usage to the cases where post-processor pixel-formats (NV12 or NV15) are selected by the applications. The Fluster compliance HEVC test suite score is still 141/147 with this patch. 
Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com> Tested-by: Nicolas Dufresne <nicolas.dufresne@collabora.com> Signed-off-by: Sebastian Fricke <sebastian.fricke@collabora.com> Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
2024-05-16 10:41:07 +02:00
else if (ctx->vpu_src_fmt->fourcc == V4L2_PIX_FMT_HEVC_SLICE) {
buf_size += hantro_hevc_mv_size(ctx->ref_fmt.width,
ctx->ref_fmt.height);
media: verisilicon: Add reference buffer compression feature Reference frame compression is a feature added in the G2 decoder to compress frame buffers so that the bandwidth of storing/loading reference frames can be reduced, especially with high resolution decoded streams. The impact of compressed frames is confirmed when using perf to monitor the number of memory accesses with or without the compression feature. The following command: perf stat -a -e \ imx8_ddr0/cycles/,imx8_ddr0/read-cycles/,imx8_ddr0/write-cycles/ \ gst-launch-1.0 filesrc \ location=Jockey_3840x2160_120fps_420_8bit_HEVC_RAW.hevc ! queue ! \ h265parse ! v4l2slh265dec ! video/x-raw,format=NV12 ! fakesink Gives us these results without the compression feature: Performance counter stats for 'system wide': 1711300345 imx8_ddr0/cycles/ 892207924 imx8_ddr0/read-cycles/ 1291785864 imx8_ddr0/write-cycles/ 13.760048353 seconds time elapsed With the compression feature: Performance counter stats for 'system wide': 274526799 imx8_ddr0/cycles/ 453120194 imx8_ddr0/read-cycles/ 833391434 imx8_ddr0/write-cycles/ 18.257831534 seconds time elapsed As expected the number of read/write cycles are really lower when compression is used. Since storing the compression data requires more memory a module parameter named 'hevc_use_compression' is used to enable/disable this feature and, by default, compression isn't used. Enabling the compression feature means that the output-frames of the decoder are stored with a specific compression pixel-format. Since this pixel format is unknown, this patch restrains the compression feature usage to the cases where post-processor pixel-formats (NV12 or NV15) are selected by the applications. The Fluster compliance HEVC test suite score is still 141/147 with this patch. 
Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com> Tested-by: Nicolas Dufresne <nicolas.dufresne@collabora.com> Signed-off-by: Sebastian Fricke <sebastian.fricke@collabora.com> Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
2024-05-16 10:41:07 +02:00
if (ctx->hevc_dec.use_compression)
buf_size += hantro_hevc_compressed_size(ctx->ref_fmt.width,
ctx->ref_fmt.height);
media: verisilicon: Add reference buffer compression feature Reference frame compression is a feature added in the G2 decoder to compress frame buffers so that the bandwidth of storing/loading reference frames can be reduced, especially with high resolution decoded streams. The impact of compressed frames is confirmed when using perf to monitor the number of memory accesses with or without the compression feature. The following command: perf stat -a -e \ imx8_ddr0/cycles/,imx8_ddr0/read-cycles/,imx8_ddr0/write-cycles/ \ gst-launch-1.0 filesrc \ location=Jockey_3840x2160_120fps_420_8bit_HEVC_RAW.hevc ! queue ! \ h265parse ! v4l2slh265dec ! video/x-raw,format=NV12 ! fakesink Gives us these results without the compression feature: Performance counter stats for 'system wide': 1711300345 imx8_ddr0/cycles/ 892207924 imx8_ddr0/read-cycles/ 1291785864 imx8_ddr0/write-cycles/ 13.760048353 seconds time elapsed With the compression feature: Performance counter stats for 'system wide': 274526799 imx8_ddr0/cycles/ 453120194 imx8_ddr0/read-cycles/ 833391434 imx8_ddr0/write-cycles/ 18.257831534 seconds time elapsed As expected the number of read/write cycles are really lower when compression is used. Since storing the compression data requires more memory a module parameter named 'hevc_use_compression' is used to enable/disable this feature and, by default, compression isn't used. Enabling the compression feature means that the output-frames of the decoder are stored with a specific compression pixel-format. Since this pixel format is unknown, this patch restrains the compression feature usage to the cases where post-processor pixel-formats (NV12 or NV15) are selected by the applications. The Fluster compliance HEVC test suite score is still 141/147 with this patch. 
Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com> Tested-by: Nicolas Dufresne <nicolas.dufresne@collabora.com> Signed-off-by: Sebastian Fricke <sebastian.fricke@collabora.com> Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
2024-05-16 10:41:07 +02:00
}
else if (ctx->vpu_src_fmt->fourcc == V4L2_PIX_FMT_AV1_FRAME)
buf_size += hantro_av1_mv_size(ctx->ref_fmt.width,
ctx->ref_fmt.height);
return buf_size;
}
static int hantro_postproc_alloc(struct hantro_ctx *ctx, int index)
{
struct hantro_dev *vpu = ctx->dev;
struct hantro_aux_buf *priv = &ctx->postproc.dec_q[index];
unsigned int buf_size = hantro_postproc_buffer_size(ctx);
if (!buf_size)
return -EINVAL;
/*
* The buffers on this queue are meant as intermediate
* buffers for the decoder, so no mapping is needed.
*/
priv->attrs = DMA_ATTR_NO_KERNEL_MAPPING;
priv->cpu = dma_alloc_attrs(vpu->dev, buf_size, &priv->dma,
GFP_KERNEL, priv->attrs);
if (!priv->cpu)
return -ENOMEM;
priv->size = buf_size;
return 0;
}
/*
 * Allocate one intermediate decoder buffer per buffer currently present on
 * the capture queue of @ctx.
 *
 * If any allocation fails, everything allocated so far is released through
 * hantro_postproc_free() and the error is propagated.
 *
 * Returns 0 on success or the negative error code of the failed allocation.
 */
int hantro_postproc_init(struct hantro_ctx *ctx)
{
	struct vb2_queue *capture_q = &ctx->fh.m2m_ctx->cap_q_ctx.q;
	unsigned int count = vb2_get_num_buffers(capture_q);
	unsigned int idx = 0;

	while (idx < count) {
		int err = hantro_postproc_alloc(ctx, idx);

		if (err) {
			hantro_postproc_free(ctx);
			return err;
		}
		idx++;
	}

	return 0;
}
/*
 * Return the DMA address of the intermediate decoder buffer in slot @index.
 *
 * An existing buffer that is smaller than the currently required size is
 * released, and a missing buffer is (re)allocated on demand.
 *
 * Returns the DMA address of the buffer, or 0 when no buffer could be
 * allocated.
 */
dma_addr_t
hantro_postproc_get_dec_buf_addr(struct hantro_ctx *ctx, int index)
{
	struct hantro_aux_buf *aux = &ctx->postproc.dec_q[index];
	unsigned int needed = hantro_postproc_buffer_size(ctx);

	/* Drop a buffer that has become too small for the current format. */
	if (aux->cpu && aux->size < needed) {
		dma_free_attrs(ctx->dev->dev, aux->size, aux->cpu,
			       aux->dma, aux->attrs);
		aux->cpu = NULL;
	}

	/* No buffer in this slot (any more): try to get a fresh one. */
	if (!aux->cpu && hantro_postproc_alloc(ctx, index))
		return 0;

	return aux->cpu ? aux->dma : 0;
}
/* Turn the G1 post-processor off by clearing its pipeline enable field. */
static void hantro_postproc_g1_disable(struct hantro_ctx *ctx)
{
	HANTRO_PP_REG_WRITE(ctx->dev, pipeline_en, 0x0);
}
/* Turn the G2 post-processor off by clearing the g2_out_rs_e field. */
static void hantro_postproc_g2_disable(struct hantro_ctx *ctx)
{
	hantro_reg_write(ctx->dev, &g2_out_rs_e, 0);
}
void hantro_postproc_disable(struct hantro_ctx *ctx)
{
struct hantro_dev *vpu = ctx->dev;
if (vpu->variant->postproc_ops && vpu->variant->postproc_ops->disable)
vpu->variant->postproc_ops->disable(ctx);
}
void hantro_postproc_enable(struct hantro_ctx *ctx)
{
struct hantro_dev *vpu = ctx->dev;
if (vpu->variant->postproc_ops && vpu->variant->postproc_ops->enable)
vpu->variant->postproc_ops->enable(ctx);
}
int hanto_postproc_enum_framesizes(struct hantro_ctx *ctx,
struct v4l2_frmsizeenum *fsize)
{
struct hantro_dev *vpu = ctx->dev;
if (vpu->variant->postproc_ops && vpu->variant->postproc_ops->enum_framesizes)
return vpu->variant->postproc_ops->enum_framesizes(ctx, fsize);
return -EINVAL;
}
/*
 * Post-processor callbacks for the G1 core. No enum_framesizes callback is
 * provided, so hanto_postproc_enum_framesizes() returns -EINVAL for variants
 * using these ops.
 */
const struct hantro_postproc_ops hantro_g1_postproc_ops = {
.enable = hantro_postproc_g1_enable,
.disable = hantro_postproc_g1_disable,
};
/*
 * Post-processor callbacks for the G2 core, including frame size
 * enumeration for its down-scaler.
 */
const struct hantro_postproc_ops hantro_g2_postproc_ops = {
.enable = hantro_postproc_g2_enable,
.disable = hantro_postproc_g2_disable,
.enum_framesizes = hantro_postproc_g2_enum_framesizes,
};