From b84c583a26e6f9145cde5de0f5d1b175603c3fc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Deuc=D0=B5?= <shurd@sasktel.net>
Date: Wed, 23 Oct 2024 00:37:12 -0400
Subject: [PATCH] Rejigger width interpolation.

The old code expected the L1 cache to be fairly large, and the
prefetcher to be fairly smart, and did updates by columns to save
some math.  This change performs width interpolation row-by-row
so even the dumbest prefetcher can get it right, and there's no
need to keep the whole source and destination images in the cache.

This may help out older processors when scaling with interpolation
(most commonly used in fullscreen).

It's entirely possible, though, that this won't be enough and those
older processors will still need to use "External" scaling.
---
 src/conio/scale.c       | 45 ++++++++++++++++++++++++-----------------
 src/conio/xbr.c         | 11 +++++-----
 src/syncterm/syncterm.c |  1 -
 3 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/src/conio/scale.c b/src/conio/scale.c
index bcd60d83bb..afddfa3a2a 100644
--- a/src/conio/scale.c
+++ b/src/conio/scale.c
@@ -352,15 +352,15 @@ do_scale(struct rectlist* rect, int fwidth, int fheight)
 
 #if 0
 fprintf(stderr, "Plan:\n"
-"start:       %dx%d\n"
+"start:       %zux%zu\n"
 "pointymulti: %d\n"
 "pointy5:     %d\n"
 "pointy3:     %d\n"
 "xBR4:        %d\n"
 "xBR2:        %d\n"
 "Multiply:    %dx%d\n"
-"hinterp:     %zu -> %zu\n"
-"winterp:     %zu -> %zu\n",
+"hinterp:     %zu -> %d\n"
+"winterp:     %zu -> %d\n",
 csrc->w, csrc->h, pointymult, pointy5, pointy3, xbr4, xbr2, xmult, ymult, csrc->h * yscale, fheight, csrc->w * xscale, fwidth);
 #endif
 	// And scale...
@@ -762,7 +762,7 @@ struct YCoCg_data {
 	signed Cg;
 };
 
-static void
+static inline void
 RGB_to_YCoCg(const uint32_t RGB, struct YCoCg_data *YCoCg)
 {
 	signed R, G, B, tmp;
@@ -777,7 +777,7 @@ RGB_to_YCoCg(const uint32_t RGB, struct YCoCg_data *YCoCg)
 	YCoCg->Y = tmp + (YCoCg->Cg >> 1);
 }
 
-static uint32_t
+static inline uint32_t
 YCoCg_to_RGB(struct YCoCg_data *YCoCg)
 {
 	signed Ri, Gi, Bi, tmp;
@@ -793,7 +793,7 @@ YCoCg_to_RGB(struct YCoCg_data *YCoCg)
 	return (R << 16) | (G << 8) | B;
 }
 
-static uint32_t
+static inline uint32_t
 blend_YCoCg(const uint32_t c1, const uint32_t c2, const uint16_t weight)
 {
 	const uint16_t iw = 65535 - weight;
@@ -823,34 +823,41 @@ interpolate_width(uint32_t const* src, uint32_t* dst, const int width, const int
 	int x, y;
 	const double mult = (double)width / newwidth;
 	uint32_t *s = dst;
+	const int wm1 = width - 1;
 
-	for (x = 0; x < newwidth; x++) {
-		// First, calculate which two pixels this is between.
-		const double xpos = mult * x;
-		const int xposi = xpos;
-		const uint16_t weight = xpos * 65536;
-		dst = &s[x];
-		for (y = 0; y < height; y++) {
+	int srow_start = 0;
+	int drow_start = 0;
+	for (y = 0; y < height; y++) {
+		double xpos = 0.0;
+		dst = &s[drow_start];
+		for (x = 0; x < newwidth; x++) {
+			// First, calculate which two pixels this is between.
+			const int xposi = xpos;
+			const uint16_t weight = xpos * 65536;
+			const int yposi = srow_start + xposi;
 			if (weight == 0) {
 				// Exact match!
-				*dst = src[width * y + xposi];
+				*dst = src[yposi];
 			}
 			else {
 				// Now pick the two pixels
-				const uint32_t pix1 = src[y * width + xposi];
+				const uint32_t pix1 = src[yposi];
 				uint32_t pix2;
-				if (xposi < width - 1)
-					pix2 = src[y * width + xposi + 1];
+				if (xposi < wm1)
+					pix2 = src[yposi + 1];
 				else
-					pix2 = src[y * width + xposi];
+					pix2 = src[yposi];
 				if (pix1 == pix2)
 					*dst = pix1;
 				else {
 					*dst = blend_YCoCg(pix1, pix2, weight);
 				}
 			}
-			dst += newwidth;
+			xpos += mult;
+			dst++;
 		}
+		srow_start += width;
+		drow_start += newwidth;
 	}
 }
 
diff --git a/src/conio/xbr.c b/src/conio/xbr.c
index 1c4b02e625..b817e0385a 100644
--- a/src/conio/xbr.c
+++ b/src/conio/xbr.c
@@ -30,7 +30,6 @@
 
 #include <inttypes.h>
 #include <stdlib.h>
-#include "scale.h"
 
 #define LB_MASK       0x00FEFEFE
 #define RED_BLUE_MASK 0x00FF00FF
@@ -46,7 +45,7 @@ struct YCoCg_data {
 	signed Cg;
 };
 
-static void
+static inline void
 RGB_to_YCoCg(const uint32_t RGB, struct YCoCg_data *YCoCg)
 {
 	int R, G, B, tmp;
@@ -61,7 +60,7 @@ RGB_to_YCoCg(const uint32_t RGB, struct YCoCg_data *YCoCg)
 	YCoCg->Y = tmp + (YCoCg->Cg >> 1);
 }
 
-static uint32_t pixel_diff(const uint32_t x, const uint32_t y)
+static inline uint32_t pixel_diff(const uint32_t x, const uint32_t y)
 {
 	struct YCoCg_data yccx;
 	struct YCoCg_data yccy;
@@ -71,9 +70,9 @@ static uint32_t pixel_diff(const uint32_t x, const uint32_t y)
 	RGB_to_YCoCg(x, &yccx);
 	RGB_to_YCoCg(y, &yccy);
 
-    return (ABSDIFF(yccx.Y, yccy.Y)) +
-           (ABSDIFF(yccx.Co, yccy.Co) >> 1) +
-           (ABSDIFF(yccx.Cg, yccy.Cg) >> 1);
+	return (ABSDIFF(yccx.Y, yccy.Y)) +
+	    (ABSDIFF(yccx.Co, yccy.Co) >> 1) +
+	    (ABSDIFF(yccx.Cg, yccy.Cg) >> 1);
 }
 
 #define ALPHA_BLEND_128_W(a, b) ((((a) & LB_MASK) >> 1) + (((b) & LB_MASK) >> 1))
diff --git a/src/syncterm/syncterm.c b/src/syncterm/syncterm.c
index c10fad142f..51469c7b94 100644
--- a/src/syncterm/syncterm.c
+++ b/src/syncterm/syncterm.c
@@ -82,7 +82,6 @@ enum {
 #include "ssh.h"
 #endif
 #include "fonts.h"
-#include "scale.h"
 #include "syncterm.h"
 #include "term.h"
 #include "uifcinit.h"
-- 
GitLab