drm: vkms: Refactor the plane composer to accept new formats

Currently the blend function only accepts XRGB_8888 and ARGB_8888 as a color input. This patch refactors all the functions related to the plane composition to overcome this limitation. The pixels blend is done using the new internal format. And new handlers are being added to convert a specific format to/from this internal format. So the blend operation depends on these handlers to convert to this common format. The blended result, if necessary, is converted to the writeback buffer format. This patch introduces three major differences to the blend function. 1 - All the planes are blended at once. 2 - The blend calculus is done as per line instead of per pixel. 3 - It is responsible to calculates the CRC and writing the writeback buffer(if necessary). These changes allow us to allocate way less memory in the intermediate buffer to compute these operations. Because now we don't need to have the entire intermediate image lines at once, just one line is enough. | Memory consumption (output dimensions) | |:--------------------------------------:| | Current | This patch | |:------------------:|:-----------------:| | Width * Heigth | 2 * Width | Beyond memory, we also have a minor performance benefit from all these changes. Results running the IGT[1] test `igt@kms_cursor_crc@pipe-a-cursor-512x512-onscreen` ten times: | Frametime | |:------------------------------------------:| | Implementation | Current | This commit | |:---------------:|:---------:|:------------:| | frametime range | 9~22 ms | 5~17 ms | | Average | 11.4 ms | 7.8 ms | [1] IGT commit id: bc3f6833a12221a46659535dac06ebb312490eb4 V2: Improves the performance drastically, by performing the operations per-line and not per-pixel(Pekka Paalanen). Minor improvements(Pekka Paalanen). V3: Changes the code to blend the planes all at once. This improves performance, memory consumption, and removes much of the weirdness of the V2(Pekka Paalanen and me). Minor improvements(Pekka Paalanen and me). V4: Rebase the code and adapt it to the new NUM_OVERLAY_PLANES constant. V5: Minor checkpatch fixes and the removal of TO-DO item(Melissa Wen). Several security/robustness improvents(Pekka Paalanen). Removes check_planes_x_bounds function and allows partial partly off-screen(Pekka Paalanen). V6: Fix a mismatch of some variable sizes (Pekka Paalanen). Several minor improvements (Pekka Paalanen). Reviewed-by: Melissa Wen <mwen@igalia.com> Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Igor Torrente <igormtorrente@gmail.com> Signed-off-by: Melissa Wen <melissa.srw@gmail.com> Link: https://patchwork.freedesktop.org/patch/msgid/20220905190811.25024-7-igormtorrente@gmail.com
author: Igor Torrente <igormtorrente@gmail.com> 2022-09-05 16:08:08 -0300
committer: Melissa Wen <melissa.srw@gmail.com> 2022-09-05 20:18:35 -0100
commit: 8ba1648567e289c90fa4f65b4204d0f160e22ac3 (patch)
tree: fcdfd9f1b1ca844da802f072b5a64836f0b01225 /drivers/gpu/drm/vkms/vkms_composer.c
parent: drm: vkms: Add fb information to `vkms_writeback_job` (diff)
download: linux-dev-8ba1648567e289c90fa4f65b4204d0f160e22ac3.tar.xz
linux-dev-8ba1648567e289c90fa4f65b4204d0f160e22ac3.zip
1 files changed, 144 insertions, 178 deletions
diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c
index bca049d879e1..5b1a8bdd8268 100644
--- a/drivers/gpu/drm/vkms/vkms_composer.c
+++ b/drivers/gpu/drm/vkms/vkms_composer.c
@@ -7,204 +7,188 @@
 #include <drm/drm_fourcc.h>
 #include <drm/drm_gem_framebuffer_helper.h>
 #include <drm/drm_vblank.h>
+#include <linux/minmax.h>
 
 #include "vkms_drv.h"
 
-static u32 get_pixel_from_buffer(int x, int y, const u8 *buffer,
-				 const struct vkms_frame_info *frame_info)
+static u16 pre_mul_blend_channel(u16 src, u16 dst, u16 alpha)
 {
-	u32 pixel;
-	int src_offset = frame_info->offset + (y * frame_info->pitch)
-					    + (x * frame_info->cpp);
+	u32 new_color;
 
-	pixel = *(u32 *)&buffer[src_offset];
+	new_color = (src * 0xffff + dst * (0xffff - alpha));
 
-	return pixel;
+	return DIV_ROUND_CLOSEST(new_color, 0xffff);
 }
 
 /**
- * compute_crc - Compute CRC value on output frame
+ * pre_mul_alpha_blend - alpha blending equation
+ * @src_frame_info: source framebuffer's metadata
+ * @stage_buffer: The line with the pixels from src_plane
+ * @output_buffer: A line buffer that receives all the blends output
  *
- * @vaddr: address to final framebuffer
- * @frame_info: framebuffer's metadata
+ * Using the information from the `frame_info`, this blends only the
+ * necessary pixels from the `stage_buffer` to the `output_buffer`
+ * using premultiplied blend formula.
  *
- * returns CRC value computed using crc32 on the visible portion of
- * the final framebuffer at vaddr_out
+ * The current DRM assumption is that pixel color values have been already
+ * pre-multiplied with the alpha channel values. See more
+ * drm_plane_create_blend_mode_property(). Also, this formula assumes a
+ * completely opaque background.
  */
-static uint32_t compute_crc(const u8 *vaddr,
-			    const struct vkms_frame_info *frame_info)
+static void pre_mul_alpha_blend(struct vkms_frame_info *frame_info,
+				struct line_buffer *stage_buffer,
+				struct line_buffer *output_buffer)
 {
-	int x, y;
-	u32 crc = 0, pixel = 0;
-	int x_src = frame_info->src.x1 >> 16;
-	int y_src = frame_info->src.y1 >> 16;
-	int h_src = drm_rect_height(&frame_info->src) >> 16;
-	int w_src = drm_rect_width(&frame_info->src) >> 16;
-
-	for (y = y_src; y < y_src + h_src; ++y) {
-		for (x = x_src; x < x_src + w_src; ++x) {
-			pixel = get_pixel_from_buffer(x, y, vaddr, frame_info);
-			crc = crc32_le(crc, (void *)&pixel, sizeof(u32));
-		}
+	int x_dst = frame_info->dst.x1;
+	struct pixel_argb_u16 *out = output_buffer->pixels + x_dst;
+	struct pixel_argb_u16 *in = stage_buffer->pixels;
+	int x_limit = min_t(size_t, drm_rect_width(&frame_info->dst),
+			    stage_buffer->n_pixels);
+
+	for (int x = 0; x < x_limit; x++) {
+		out[x].a = (u16)0xffff;
+		out[x].r = pre_mul_blend_channel(in[x].r, out[x].r, in[x].a);
+		out[x].g = pre_mul_blend_channel(in[x].g, out[x].g, in[x].a);
+		out[x].b = pre_mul_blend_channel(in[x].b, out[x].b, in[x].a);
 	}
-
-	return crc;
 }
 
-static u8 blend_channel(u8 src, u8 dst, u8 alpha)
+static bool check_y_limit(struct vkms_frame_info *frame_info, int y)
 {
-	u32 pre_blend;
-	u8 new_color;
-
-	pre_blend = (src * 255 + dst * (255 - alpha));
-
-	/* Faster div by 255 */
-	new_color = ((pre_blend + ((pre_blend + 257) >> 8)) >> 8);
+	if (y >= frame_info->dst.y1 && y < frame_info->dst.y2)
+		return true;
 
-	return new_color;
+	return false;
 }
 
 /**
- * alpha_blend - alpha blending equation
- * @argb_src: src pixel on premultiplied alpha mode
- * @argb_dst: dst pixel completely opaque
+ * @wb_frame_info: The writeback frame buffer metadata
+ * @crtc_state: The crtc state
+ * @crc32: The crc output of the final frame
+ * @output_buffer: A buffer of a row that will receive the result of the blend(s)
+ * @stage_buffer: The line with the pixels from plane being blend to the output
  *
- * blend pixels using premultiplied blend formula. The current DRM assumption
- * is that pixel color values have been already pre-multiplied with the alpha
- * channel values. See more drm_plane_create_blend_mode_property(). Also, this
- * formula assumes a completely opaque background.
+ * This function blends the pixels (Using the `pre_mul_alpha_blend`)
+ * from all planes, calculates the crc32 of the output from the former step,
+ * and, if necessary, convert and store the output to the writeback buffer.
  */
-static void alpha_blend(const u8 *argb_src, u8 *argb_dst)
+static void blend(struct vkms_writeback_job *wb,
+		  struct vkms_crtc_state *crtc_state,
+		  u32 *crc32, struct line_buffer *stage_buffer,
+		  struct line_buffer *output_buffer, size_t row_size)
 {
-	u8 alpha;
+	struct vkms_plane_state **plane = crtc_state->active_planes;
+	struct vkms_frame_info *primary_plane_info = plane[0]->frame_info;
+	u32 n_active_planes = crtc_state->num_active_planes;
+
+	int y_dst = primary_plane_info->dst.y1;
+	int h_dst = drm_rect_height(&primary_plane_info->dst);
+	int y_limit = y_dst + h_dst;
+
+	for (size_t y = y_dst; y < y_limit; y++) {
+		plane[0]->plane_read(output_buffer, primary_plane_info, y);
+
+		/* If there are other planes besides primary, we consider the active
+		 * planes should be in z-order and compose them associatively:
+		 * ((primary <- overlay) <- cursor)
+		 */
+		for (size_t i = 1; i < n_active_planes; i++) {
+			if (!check_y_limit(plane[i]->frame_info, y))
+				continue;
+
+			plane[i]->plane_read(stage_buffer, plane[i]->frame_info, y);
+			pre_mul_alpha_blend(plane[i]->frame_info, stage_buffer,
+					    output_buffer);
+		}
 
-	alpha = argb_src[3];
-	argb_dst[0] = blend_channel(argb_src[0], argb_dst[0], alpha);
-	argb_dst[1] = blend_channel(argb_src[1], argb_dst[1], alpha);
-	argb_dst[2] = blend_channel(argb_src[2], argb_dst[2], alpha);
-}
+		*crc32 = crc32_le(*crc32, (void *)output_buffer->pixels, row_size);
 
-/**
- * x_blend - blending equation that ignores the pixel alpha
- *
- * overwrites RGB color value from src pixel to dst pixel.
- */
-static void x_blend(const u8 *xrgb_src, u8 *xrgb_dst)
-{
-	memcpy(xrgb_dst, xrgb_src, sizeof(u8) * 3);
-}
-
-/**
- * blend - blend value at vaddr_src with value at vaddr_dst
- * @vaddr_dst: destination address
- * @vaddr_src: source address
- * @dst_frame_info: destination framebuffer's metadata
- * @src_frame_info: source framebuffer's metadata
- * @pixel_blend: blending equation based on plane format
- *
- * Blend the vaddr_src value with the vaddr_dst value using a pixel blend
- * equation according to the supported plane formats DRM_FORMAT_(A/XRGB8888)
- * and clearing alpha channel to an completely opaque background. This function
- * uses buffer's metadata to locate the new composite values at vaddr_dst.
- *
- * TODO: completely clear the primary plane (a = 0xff) before starting to blend
- * pixel color values
- */
-static void blend(void *vaddr_dst, void *vaddr_src,
-		  struct vkms_frame_info *dst_frame_info,
-		  struct vkms_frame_info *src_frame_info,
-		  void (*pixel_blend)(const u8 *, u8 *))
-{
-	int i, j, j_dst, i_dst;
-	int offset_src, offset_dst;
-	u8 *pixel_dst, *pixel_src;
-
-	int x_src = src_frame_info->src.x1 >> 16;
-	int y_src = src_frame_info->src.y1 >> 16;
-
-	int x_dst = src_frame_info->dst.x1;
-	int y_dst = src_frame_info->dst.y1;
-	int h_dst = drm_rect_height(&src_frame_info->dst);
-	int w_dst = drm_rect_width(&src_frame_info->dst);
-
-	int y_limit = y_src + h_dst;
-	int x_limit = x_src + w_dst;
-
-	for (i = y_src, i_dst = y_dst; i < y_limit; ++i) {
-		for (j = x_src, j_dst = x_dst; j < x_limit; ++j) {
-			offset_dst = dst_frame_info->offset
-				     + (i_dst * dst_frame_info->pitch)
-				     + (j_dst++ * dst_frame_info->cpp);
-			offset_src = src_frame_info->offset
-				     + (i * src_frame_info->pitch)
-				     + (j * src_frame_info->cpp);
-
-			pixel_src = (u8 *)(vaddr_src + offset_src);
-			pixel_dst = (u8 *)(vaddr_dst + offset_dst);
-			pixel_blend(pixel_src, pixel_dst);
-			/* clearing alpha channel (0xff)*/
-			pixel_dst[3] = 0xff;
-		}
-		i_dst++;
+		if (wb)
+			wb->wb_write(&wb->wb_frame_info, output_buffer, y);
 	}
 }
 
-static void compose_plane(struct vkms_frame_info *primary_plane_info,
-			  struct vkms_frame_info *plane_frame_info,
-			  void *vaddr_out)
+static int check_format_funcs(struct vkms_crtc_state *crtc_state,
+			      struct vkms_writeback_job *active_wb)
 {
-	struct drm_framebuffer *fb = plane_frame_info->fb;
-	void *vaddr;
-	void (*pixel_blend)(const u8 *p_src, u8 *p_dst);
+	struct vkms_plane_state **planes = crtc_state->active_planes;
+	u32 n_active_planes = crtc_state->num_active_planes;
 
-	if (WARN_ON(iosys_map_is_null(&plane_frame_info->map[0])))
-		return;
+	for (size_t i = 0; i < n_active_planes; i++)
+		if (!planes[i]->plane_read)
+			return -1;
 
-	vaddr = plane_frame_info->map[0].vaddr;
+	if (active_wb && !active_wb->wb_write)
+		return -1;
 
-	if (fb->format->format == DRM_FORMAT_ARGB8888)
-		pixel_blend = &alpha_blend;
-	else
-		pixel_blend = &x_blend;
-
-	blend(vaddr_out, vaddr, primary_plane_info,
-	      plane_frame_info, pixel_blend);
+	return 0;
 }
 
-static int compose_active_planes(void **vaddr_out,
-				 struct vkms_frame_info *primary_plane_info,
-				 struct vkms_crtc_state *crtc_state)
+static int compose_active_planes(struct vkms_writeback_job *active_wb,
+				 struct vkms_crtc_state *crtc_state,
+				 u32 *crc32)
 {
-	struct drm_framebuffer *fb = primary_plane_info->fb;
-	struct drm_gem_object *gem_obj = drm_gem_fb_get_obj(fb, 0);
-	const void *vaddr;
-	int i;
-
-	if (!*vaddr_out) {
-		*vaddr_out = kvzalloc(gem_obj->size, GFP_KERNEL);
-		if (!*vaddr_out) {
-			DRM_ERROR("Cannot allocate memory for output frame.");
-			return -ENOMEM;
-		}
+	size_t line_width, pixel_size = sizeof(struct pixel_argb_u16);
+	struct vkms_frame_info *primary_plane_info = NULL;
+	struct line_buffer output_buffer, stage_buffer;
+	struct vkms_plane_state *act_plane = NULL;
+	int ret = 0;
+
+	/*
+	 * This check exists so we can call `crc32_le` for the entire line
+	 * instead doing it for each channel of each pixel in case
+	 * `struct `pixel_argb_u16` had any gap added by the compiler
+	 * between the struct fields.
+	 */
+	static_assert(sizeof(struct pixel_argb_u16) == 8);
+
+	if (crtc_state->num_active_planes >= 1) {
+		act_plane = crtc_state->active_planes[0];
+		if (act_plane->base.base.plane->type == DRM_PLANE_TYPE_PRIMARY)
+			primary_plane_info = act_plane->frame_info;
 	}
 
+	if (!primary_plane_info)
+		return -EINVAL;
+
 	if (WARN_ON(iosys_map_is_null(&primary_plane_info->map[0])))
 		return -EINVAL;
 
-	vaddr = primary_plane_info->map[0].vaddr;
+	if (WARN_ON(check_format_funcs(crtc_state, active_wb)))
+		return -EINVAL;
 
-	memcpy(*vaddr_out, vaddr, gem_obj->size);
+	line_width = drm_rect_width(&primary_plane_info->dst);
+	stage_buffer.n_pixels = line_width;
+	output_buffer.n_pixels = line_width;
 
-	/* If there are other planes besides primary, we consider the active
-	 * planes should be in z-order and compose them associatively:
-	 * ((primary <- overlay) <- cursor)
-	 */
-	for (i = 1; i < crtc_state->num_active_planes; i++)
-		compose_plane(primary_plane_info,
-			      crtc_state->active_planes[i]->frame_info,
-			      *vaddr_out);
+	stage_buffer.pixels = kvmalloc(line_width * pixel_size, GFP_KERNEL);
+	if (!stage_buffer.pixels) {
+		DRM_ERROR("Cannot allocate memory for the output line buffer");
+		return -ENOMEM;
+	}
 
-	return 0;
+	output_buffer.pixels = kvmalloc(line_width * pixel_size, GFP_KERNEL);
+	if (!output_buffer.pixels) {
+		DRM_ERROR("Cannot allocate memory for intermediate line buffer");
+		ret = -ENOMEM;
+		goto free_stage_buffer;
+	}
+
+	if (active_wb) {
+		struct vkms_frame_info *wb_frame_info = &active_wb->wb_frame_info;
+
+		wb_frame_info->src = primary_plane_info->src;
+		wb_frame_info->dst = primary_plane_info->dst;
+	}
+
+	blend(active_wb, crtc_state, crc32, &stage_buffer,
+	      &output_buffer, line_width * pixel_size);
+
+	kvfree(output_buffer.pixels);
+free_stage_buffer:
+	kvfree(stage_buffer.pixels);
+
+	return ret;
 }
 
 /**
@@ -222,13 +206,11 @@ void vkms_composer_worker(struct work_struct *work)
 						struct vkms_crtc_state,
 						composer_work);
 	struct drm_crtc *crtc = crtc_state->base.crtc;
+	struct vkms_writeback_job *active_wb = crtc_state->active_writeback;
 	struct vkms_output *out = drm_crtc_to_vkms_output(crtc);
-	struct vkms_frame_info *primary_plane_info = NULL;
-	struct vkms_plane_state *act_plane = NULL;
 	bool crc_pending, wb_pending;
-	void *vaddr_out = NULL;
-	u32 crc32 = 0;
 	u64 frame_start, frame_end;
+	u32 crc32 = 0;
 	int ret;
 
 	spin_lock_irq(&out->composer_lock);
@@ -248,35 +230,19 @@ void vkms_composer_worker(struct work_struct *work)
 	if (!crc_pending)
 		return;
 
-	if (crtc_state->num_active_planes >= 1) {
-		act_plane = crtc_state->active_planes[0];
-		if (act_plane->base.base.plane->type == DRM_PLANE_TYPE_PRIMARY)
-			primary_plane_info = act_plane->frame_info;
-	}
-
-	if (!primary_plane_info)
-		return;
-
 	if (wb_pending)
-		vaddr_out = crtc_state->active_writeback->data[0].vaddr;
+		ret = compose_active_planes(active_wb, crtc_state, &crc32);
+	else
+		ret = compose_active_planes(NULL, crtc_state, &crc32);
 
-	ret = compose_active_planes(&vaddr_out, primary_plane_info,
-				    crtc_state);
-	if (ret) {
-		if (ret == -EINVAL && !wb_pending)
-			kvfree(vaddr_out);
+	if (ret)
 		return;
-	}
-
-	crc32 = compute_crc(vaddr_out, primary_plane_info);
 
 	if (wb_pending) {
 		drm_writeback_signal_completion(&out->wb_connector, 0);
 		spin_lock_irq(&out->composer_lock);
 		crtc_state->wb_pending = false;
 		spin_unlock_irq(&out->composer_lock);
-	} else {
-		kvfree(vaddr_out);
 	}
 
 	/*
author	Igor Torrente <igormtorrente@gmail.com>	2022-09-05 16:08:08 -0300
committer	Melissa Wen <melissa.srw@gmail.com>	2022-09-05 20:18:35 -0100
commit	8ba1648567e289c90fa4f65b4204d0f160e22ac3 (patch)
tree	fcdfd9f1b1ca844da802f072b5a64836f0b01225 /drivers/gpu/drm/vkms/vkms_composer.c
parent	drm: vkms: Add fb information to `vkms_writeback_job` (diff)
download	linux-dev-8ba1648567e289c90fa4f65b4204d0f160e22ac3.tar.xz linux-dev-8ba1648567e289c90fa4f65b4204d0f160e22ac3.zip