Optimizations:

1) Keep a rectangle updated per-screen rather than regenerating it each time.
2) Strip palette info when putting pixels into rectangles rather than during scaling.
3) Tighten up the screen locks a bit.
4) Don't require a full resend of both screens on an update request.
5) Only force a redraw for cursor movement when the cursor is visible (and force it whenever the cursor changes).
6) Avoid doubles in interpolation.
7) Heavily optimize interpolate_height(). interpolate_width() likely doesn't need it because it's generally not used, and it also reads from the next pixel in memory, making the prefetcher's job easier.
8) Fix some memory-leak-on-error issues.
9) For ARGB8 XImages, manipulate the data directly rather than through XPutPixel().

At this point, the scaling and X11 output time is heavily dominated by cache misses. The only really effective way to reduce this hit is to spread the work across all the L3 caches in the system or move it into the GPU. With the latest updates, at the SyncTERM menu, over 90% of the time is spent in the rendering pipeline, and over 90% of that time is spent thrashing the caches... the only real easy win left is vectorizing, but that's highly compiler-specific. To that end, I've switched to -O3 for release builds. There was a comment that -finline-functions broke Baja "badly", but that's clearly false since -finline-functions has been part of -O2 for quite a while now, and Baja doesn't seem any more broken than it ever was.

Optimizations:
b5488bb3 · Deucе · 27c79cad · b5488bb3 · b5488bb3 · b5488bb3
Commit b5488bb3 authored 4 years ago by Deucе
--- a/src/build/Common.gmake
+++ b/src/build/Common.gmake
@@ -437,9 +437,9 @@ ifdef DEBUG
 CFLAGS	+=	-D_DEBUG
 CFLAGS +=	-Wall -Wno-char-subscripts
 else # RELEASE
- # -finline functions breaks the baja build badly.
- # This also means that -O3 won't work either.
- CFLAGS	:= -O2 -fomit-frame-pointer -ffast-math -funroll-loops $(CFLAGS)
+ # -finline-functions (used to) break the baja build badly.
+ # This also meant that -O3 wouldn't work either.
+ CFLAGS	:= -O3 -fomit-frame-pointer -ffast-math -funroll-loops $(CFLAGS)
 endif

 -include targets.mk

--- a/src/conio/bitmap_con.c
+++ b/src/conio/bitmap_con.c
@@ -69,6 +69,7 @@ struct bitmap_screen {
 	int		screenheight;
 	pthread_mutex_t	screenlock;
 	int		update_pixels;
+	struct rectlist *rect;
 };

 struct bitmap_callbacks {
@@ -114,10 +115,11 @@ static int bitmap_draw_one_char(unsigned int xpos, unsigned int ypos);
 static void cb_flush(void);
 static int check_redraw(void);
 static void blinker_thread(void *data);
-static __inline struct bitmap_screen *current_screen(void);
+static __inline void both_screens(struct bitmap_screen** current, struct bitmap_screen** noncurrent);
 static int update_from_vmem(int force);
 static int bitmap_vmem_puttext_locked(int sx, int sy, int ex, int ey, struct vmem_cell *fill);
 static uint32_t color_value(uint32_t col);
+void bitmap_drv_free_rect(struct rectlist *rect);

 /**************************************************************/
 /* These functions get called from the driver and ciolib only */
@@ -286,7 +288,8 @@ static int bitmap_vmem_puttext_locked(int sx, int sy, int ex, int ey, struct vme
 	return(1);
 }

-static void set_vmem_cell(struct vstat_vmem *vmem_ptr, size_t pos, uint16_t cell, uint32_t fg, uint32_t bg)
+static void
+set_vmem_cell(struct vstat_vmem *vmem_ptr, size_t pos, uint16_t cell, uint32_t fg, uint32_t bg)
 {
 	int		altfont;
 	int		font;
@@ -436,16 +439,15 @@ static struct rectlist *alloc_full_rect(struct bitmap_screen *screen)
 static uint32_t color_value(uint32_t col)
 {
 	if (col & 0x80000000)
-		return col;
+		return col & 0xffffff;
 	if ((col & 0xffffff) < sizeof(palette) / sizeof(palette[0]))
-		return (col & 0xff000000) | palette[col & 0xffffff];
+		return palette[col & 0xffffff] & 0xffffff;
 	fprintf(stderr, "Invalid colour value: %08x\n", col);
-	return 0xff000000;
+	return 0;
 }

 static struct rectlist *get_full_rectangle_locked(struct bitmap_screen *screen)
 {
-	size_t i;
 	struct rectlist *rect;

 	// TODO: Some sort of caching here would make things faster...?
@@ -453,8 +455,7 @@ static struct rectlist *get_full_rectangle_locked(struct bitmap_screen *screen)
 		rect = alloc_full_rect(screen);
 		if (!rect)
 			return rect;
-		for (i=0; i<screen->screenwidth*screen->screenheight; i++)
-			rect->data[i] = color_value(screen->screen[i]);
+		memcpy(rect->data, screen->rect->data, sizeof(*rect->data) * screen->screenwidth * screen->screenheight);
 		return rect;
 	}
 	return NULL;
@@ -486,6 +487,7 @@ static int bitmap_draw_one_char(unsigned int xpos, unsigned int ypos)
 	uint8_t fb = 0;
 	int		y;
 	int		fontoffset;
+	int		pixeloffset;
 	unsigned char *this_font;
 	WORD	sch;
 	struct vstat_vmem *vmem_ptr;
@@ -501,22 +503,6 @@ static int bitmap_draw_one_char(unsigned int xpos, unsigned int ypos)
 		return(-1);
 	}

-	pthread_mutex_lock(&screena.screenlock);
-	pthread_mutex_lock(&screenb.screenlock);
-
-	if ((xoffset + vstat.charwidth > screena.screenwidth) || (yoffset + vstat.charheight > screena.screenheight) ||
-	    (xoffset + vstat.charwidth > screenb.screenwidth) || (yoffset + vstat.charheight > screenb.screenheight)) {
-		pthread_mutex_unlock(&screenb.screenlock);
-		pthread_mutex_unlock(&screena.screenlock);
-		return(-1);
-	}
-
-	if((!screena.screen) || (!screenb.screen)) {
-		pthread_mutex_unlock(&screenb.screenlock);
-		pthread_mutex_unlock(&screena.screenlock);
-		return(-1);
-	}
-
 	sch=vmem_ptr->vmem[(ypos-1)*cio_textinfo.screenwidth+(xpos-1)].legacy_attr << 8 | vmem_ptr->vmem[(ypos-1)*cio_textinfo.screenwidth+(xpos-1)].ch;
 	fg = vmem_ptr->vmem[(ypos-1)*cio_textinfo.screenwidth+(xpos-1)].fg;
 	bg = vmem_ptr->vmem[(ypos-1)*cio_textinfo.screenwidth+(xpos-1)].bg;
@@ -539,8 +525,6 @@ static int bitmap_draw_one_char(unsigned int xpos, unsigned int ypos)
 					this_font = (unsigned char *)conio_fontdata[vmem_ptr->vmem[(ypos-1)*cio_textinfo.screenwidth+(xpos-1)].font].eight_by_sixteen;
 					break;
 				default:
-					pthread_mutex_unlock(&screenb.screenlock);
-					pthread_mutex_unlock(&screena.screenlock);
 					return(-1);
 			}
 		}
@@ -550,7 +534,24 @@ static int bitmap_draw_one_char(unsigned int xpos, unsigned int ypos)
 	fdw = vstat.charwidth - (vstat.flags & VIDMODES_FLAG_EXPAND) ? 1 : 0;
 	fontoffset=(sch & 0xff) * (vstat.charheight * ((fdw + 7) / 8));

+	pthread_mutex_lock(&screena.screenlock);
+	pthread_mutex_lock(&screenb.screenlock);
+
+	if ((xoffset + vstat.charwidth > screena.screenwidth) || (yoffset + vstat.charheight > screena.screenheight) ||
+	    (xoffset + vstat.charwidth > screenb.screenwidth) || (yoffset + vstat.charheight > screenb.screenheight)) {
+		pthread_mutex_unlock(&screenb.screenlock);
+		pthread_mutex_unlock(&screena.screenlock);
+		return(-1);
+	}
+
+	if((!screena.screen) || (!screenb.screen)) {
+		pthread_mutex_unlock(&screenb.screenlock);
+		pthread_mutex_unlock(&screena.screenlock);
+		return(-1);
+	}
+
 	draw_fg = ((!(sch & 0x8000)) || vstat.no_blink);
+	pixeloffset = PIXEL_OFFSET(screena, xoffset, yoffset);
 	for(y=0; y<vstat.charheight; y++) {
 		for(x=0; x<vstat.charwidth; x++) {
 			fdx = x;
@@ -574,34 +575,39 @@ static int bitmap_draw_one_char(unsigned int xpos, unsigned int ypos)
 			}

 			if(fb & (0x80 >> (fdx & 7)) && draw_fg) {
-				if (screena.screen[PIXEL_OFFSET(screena, xoffset + x, yoffset + y)] != fg) {
+				if (screena.screen[pixeloffset] != fg) {
 					screena.update_pixels = 1;
-					screena.screen[PIXEL_OFFSET(screena, xoffset + x, yoffset + y)] = fg;
+					screena.screen[pixeloffset] = fg;
+					screena.rect->data[pixeloffset] = color_value(fg);
 				}
 			}
 			else {
-				if (screena.screen[PIXEL_OFFSET(screena, xoffset + x, yoffset + y)] != bg) {
+				if (screena.screen[pixeloffset] != bg) {
 					screena.update_pixels = 1;
-					screena.screen[PIXEL_OFFSET(screena, xoffset + x, yoffset + y)] = bg;
+					screena.screen[pixeloffset] = bg;
+					screena.rect->data[pixeloffset] = color_value(bg);
 				}
 			}

 			if(fb & (0x80 >> (fdx & 7))) {
-				if (screenb.screen[PIXEL_OFFSET(screenb, xoffset + x, yoffset + y)]!=fg) {
+				if (screenb.screen[pixeloffset] != fg) {
 					screenb.update_pixels = 1;
-					screenb.screen[PIXEL_OFFSET(screenb, xoffset + x, yoffset + y)]=fg;
+					screenb.screen[pixeloffset] = fg;
+					screenb.rect->data[pixeloffset] = color_value(fg);
 				}
 			}
 			else {
-				if (screenb.screen[PIXEL_OFFSET(screenb, xoffset+x, yoffset+y)]!=bg) {
+				if (screenb.screen[pixeloffset] != bg) {
 					screenb.update_pixels = 1;
-					screenb.screen[PIXEL_OFFSET(screenb, xoffset+x, yoffset+y)]=bg;
+					screenb.screen[pixeloffset]=bg;
+					screenb.rect->data[pixeloffset] = color_value(bg);
 				}
 			}
-
+			pixeloffset++;
 		}
 		if (x & 0x07)
 			fontoffset++;
+		pixeloffset += screena.screenwidth - vstat.charwidth;
 	}
 	pthread_mutex_unlock(&screenb.screenlock);
 	pthread_mutex_unlock(&screena.screenlock);
@@ -642,6 +648,7 @@ static void blinker_thread(void *data)
 	int curs_changed;
 	int blink_changed;
 	struct bitmap_screen *screen;
+	struct bitmap_screen *ncscreen;

 	SetThreadName("Blinker");
 	while(1) {
@@ -649,7 +656,7 @@ static void blinker_thread(void *data)
 		blink_changed = 0;
 		do {
 			SLEEP(10);
-			screen = current_screen();
+			both_screens(&screen, &ncscreen);
 		} while (screen->screen == NULL);
 		count++;
 		if (count==25) {
@@ -690,7 +697,13 @@ static void blinker_thread(void *data)
 					request_redraw();
 		}
 		pthread_mutex_lock(&screen->screenlock);
+		// TODO: Maybe we can optimize the blink_changed forced update?
 		if (screen->update_pixels || curs_changed || blink_changed) {
+			// If the other screen is update_pixels == 2, clear it.
+			pthread_mutex_lock(&ncscreen->screenlock);
+			if (ncscreen->update_pixels == 2)
+				ncscreen->update_pixels = 0;
+			pthread_mutex_unlock(&ncscreen->screenlock);
 			rect = get_full_rectangle_locked(screen);
 			screen->update_pixels = 0;
 			pthread_mutex_unlock(&screen->screenlock);
@@ -711,6 +724,13 @@ static void blinker_thread(void *data)
 	}
 }

+static __inline struct bitmap_screen *noncurrent_screen_locked(void)
+{
+	if (vstat.blink)
+		return &screenb;
+	return &screena;
+}
+
 static __inline struct bitmap_screen *current_screen_locked(void)
 {
 	if (vstat.blink)
@@ -718,14 +738,12 @@ static __inline struct bitmap_screen *current_screen_locked(void)
 	return(&screenb);
 }

-static __inline struct bitmap_screen *current_screen(void)
+static __inline void both_screens(struct bitmap_screen** current, struct bitmap_screen** noncurrent)
 {
-	struct bitmap_screen *ret;
-
 	pthread_mutex_lock(&vstatlock);
-	ret = current_screen_locked();
+	*current = current_screen_locked();
+	*noncurrent = noncurrent_screen_locked();
 	pthread_mutex_unlock(&vstatlock);
-	return(ret);
 }

 /*
@@ -901,6 +919,7 @@ void bitmap_gotoxy(int x, int y)
 		cio_textinfo.cury=y;
 		vstat.curs_col = x + cio_textinfo.winleft - 1;
 		vstat.curs_row = y + cio_textinfo.wintop - 1;
+		if (cursor_visible_locked())
 			force_cursor = 1;
 	}
 	pthread_mutex_unlock(&vstatlock);
@@ -921,10 +940,12 @@ void bitmap_setcursortype(int type)
 		case _SOLIDCURSOR:
 			vstat.curs_start=0;
 			vstat.curs_end=vstat.charheight-1;
+			force_cursor = 1;
 			break;
 		default:
 		    vstat.curs_start = vstat.default_curs_start;
 		    vstat.curs_end = vstat.default_curs_end;
+			force_cursor = 1;
 			break;
 	}
 	pthread_mutex_unlock(&vstatlock);
@@ -1159,6 +1180,7 @@ int bitmap_movetext(int x, int y, int ex, int ey, int tox, int toy)
 	pthread_mutex_lock(&screena.screenlock);
 	for(screeny=0; screeny < height*vstat.charheight; screeny++) {
 		memmove(&(screena.screen[ssourcepos+sdestoffset]), &(screena.screen[ssourcepos]), sizeof(screena.screen[0])*width*vstat.charwidth);
+		memmove(&(screena.rect->data[ssourcepos+sdestoffset]), &(screena.rect->data[ssourcepos]), sizeof(screena.screen[0])*width*vstat.charwidth);
 		ssourcepos += direction * vstat.scrnwidth;
 	}
 	screena.update_pixels = 1;
@@ -1176,6 +1198,7 @@ int bitmap_movetext(int x, int y, int ex, int ey, int tox, int toy)
 	pthread_mutex_lock(&screenb.screenlock);
 	for(screeny=0; screeny < height*vstat.charheight; screeny++) {
 		memmove(&(screenb.screen[ssourcepos+sdestoffset]), &(screenb.screen[ssourcepos]), sizeof(screenb.screen[0])*width*vstat.charwidth);
+		memmove(&(screenb.rect->data[ssourcepos+sdestoffset]), &(screenb.rect->data[ssourcepos]), sizeof(screenb.screen[0])*width*vstat.charwidth);
 		ssourcepos += direction * vstat.scrnwidth;
 	}
 	screenb.update_pixels = 1;
@@ -1265,6 +1288,7 @@ void bitmap_setcustomcursor(int s, int e, int r, int b, int v)
 		vstat.curs_blinks=b;
 	if(v>=0)
 		vstat.curs_visible=v;
+	force_cursor = 1;
 	pthread_mutex_unlock(&vstatlock);
 	pthread_mutex_unlock(&blinker_lock);
 }
@@ -1344,6 +1368,7 @@ int bitmap_setpixel(uint32_t x, uint32_t y, uint32_t colour)
 		if (screena.screen[PIXEL_OFFSET(screena, x, y)] != colour) {
 			screena.screen[PIXEL_OFFSET(screena, x, y)]=colour;
 			screena.update_pixels = 1;
+			screena.rect->data[PIXEL_OFFSET(screena, x, y)]=color_value(colour);
 		}
 	}
 	pthread_mutex_unlock(&screena.screenlock);
@@ -1353,6 +1378,7 @@ int bitmap_setpixel(uint32_t x, uint32_t y, uint32_t colour)
 		if (screenb.screen[PIXEL_OFFSET(screenb, x, y)] != colour) {
 			screenb.screen[PIXEL_OFFSET(screenb, x, y)]=colour;
 			screenb.update_pixels = 1;
+			screenb.rect->data[PIXEL_OFFSET(screenb, x, y)]=color_value(colour);
 		}
 	}
 	pthread_mutex_unlock(&screenb.screenlock);
@@ -1401,10 +1427,14 @@ int bitmap_setpixels(uint32_t sx, uint32_t sy, uint32_t ex, uint32_t ey, uint32_
 		if (mask == NULL) {
 			for (x = sx; x <= ex; x++) {
 				screena.screen[PIXEL_OFFSET(screena, x, y)] = pixels->pixels[pos];
-				if (pixels->pixelsb)
+				if (pixels->pixelsb) {
 					screenb.screen[PIXEL_OFFSET(screenb, x, y)] = pixels->pixelsb[pos];
-				else
+					screenb.rect->data[PIXEL_OFFSET(screenb, x, y)] = color_value(pixels->pixelsb[pos]);
+				}
+				else {
 					screenb.screen[PIXEL_OFFSET(screenb, x, y)] = pixels->pixels[pos];
+					screenb.rect->data[PIXEL_OFFSET(screenb, x, y)] = color_value(pixels->pixels[pos]);
+				}
 				pos++;
 			}
 		}
@@ -1415,10 +1445,15 @@ int bitmap_setpixels(uint32_t sx, uint32_t sy, uint32_t ex, uint32_t ey, uint32_
 				mask_bit = 0x80 >> mask_bit;
 				if (m[mask_byte] & mask_bit) {
 					screena.screen[PIXEL_OFFSET(screena, x, y)] = pixels->pixels[pos];
-					if (pixels->pixelsb)
+					screena.rect->data[PIXEL_OFFSET(screena, x, y)] = color_value(pixels->pixels[pos]);
+					if (pixels->pixelsb) {
 						screenb.screen[PIXEL_OFFSET(screenb, x, y)] = pixels->pixelsb[pos];
-					else
+						screenb.rect->data[PIXEL_OFFSET(screenb, x, y)] = color_value(pixels->pixelsb[pos]);
+					}
+					else {
 						screenb.screen[PIXEL_OFFSET(screenb, x, y)] = pixels->pixels[pos];
+						screenb.rect->data[PIXEL_OFFSET(screenb, x, y)] = color_value(pixels->pixels[pos]);
+					}
 				}
 				pos++;
 			}
@@ -1597,6 +1632,13 @@ static int init_screen(struct bitmap_screen *screen, int *width, int *height)
 	screen->screen = newscreen;
 	memset_u32(screen->screen, vstat.palette[0], screen->screenwidth * screen->screenheight);
 	screen->update_pixels = 1;
+	bitmap_drv_free_rect(screen->rect);
+	screen->rect = alloc_full_rect(screen);
+	if (screen->rect == NULL) {
+		pthread_mutex_unlock(&screen->screenlock);
+		return(-1);
+	}
+	memset_u32(screen->rect->data, color_value(vstat.palette[0]), screen->rect->rect.width * screen->rect->rect.height);
 	pthread_mutex_unlock(&screen->screenlock);
 	return(0);
 }
@@ -1714,12 +1756,13 @@ int bitmap_drv_init(void (*drawrect_cb) (struct rectlist *data)

 void bitmap_drv_request_pixels(void)
 {
-	// TODO: We may need something extra now... this results in two updates.
 	pthread_mutex_lock(&screena.screenlock);
-	screena.update_pixels = 1;
+	if (screena.update_pixels == 0)
+		screena.update_pixels = 2;
 	pthread_mutex_unlock(&screena.screenlock);
 	pthread_mutex_lock(&screenb.screenlock);
-	screenb.update_pixels = 1;
+	if (screenb.update_pixels == 0)
+		screenb.update_pixels = 2;
 	pthread_mutex_unlock(&screenb.screenlock);
 }

@@ -1731,6 +1774,8 @@ void bitmap_drv_request_some_pixels(int x, int y, int width, int height)

 void bitmap_drv_free_rect(struct rectlist *rect)
 {
+	if (rect == NULL)
+		return;
 	pthread_mutex_lock(&free_rect_lock);
 	rect->next = free_rects;
 	free_rects = rect;

--- a/src/conio/scale.c
+++ b/src/conio/scale.c
@@ -185,6 +185,8 @@ get_buffer(void)
 void
 release_buffer(struct graphics_buffer *buf)
 {
+	if (buf == NULL)
+		return;
 	buf->next = free_list;
 	free_list = buf;
 }
@@ -724,30 +726,29 @@ pointy_scale3(uint32_t* src, uint32_t* dest, int width, int height)
 	}
 }

-static
-uint32_t blend(const uint32_t c1, const uint32_t c2, const double weight)
+static uint32_t
+blend(const uint32_t c1, const uint32_t c2, int weight)
 {
 	uint8_t yuv1[4];
 	uint8_t yuv2[4];
-	int y, u, v;
-	const double iw = 1.0 - weight;
+	uint8_t yuv3[4];
+	const double iw = 256 - weight;

 	*(uint32_t *)yuv1 = r2y[c1];
 	*(uint32_t *)yuv2 = r2y[c2];
 #ifdef __BIG_ENDIAN__
-	y = yuv1[1] * iw + yuv2[1] * weight;
-	u = yuv1[2] * iw + yuv2[2] * weight;
-	v = yuv1[3] * iw + yuv2[3] * weight;
+	yuv3[0] = 0;
+	yuv3[1] = (yuv1[1] * iw + yuv2[1] * weight) / 256;
+	yuv3[2] = (yuv1[2] * iw + yuv2[2] * weight) / 256;
+	yuv3[3] = (yuv1[3] * iw + yuv2[3] * weight) / 256;
 #else
-	y = yuv1[2] * iw + yuv2[2] * weight;
-	u = yuv1[1] * iw + yuv2[1] * weight;
-	v = yuv1[0] * iw + yuv2[0] * weight;
+	yuv3[3] = 0;
+	yuv3[2] = (yuv1[2] * iw + yuv2[2] * weight) / 256;
+	yuv3[1] = (yuv1[1] * iw + yuv2[1] * weight) / 256;
+	yuv3[0] = (yuv1[0] * iw + yuv2[0] * weight) / 256;
 #endif
-	CLAMP(y);
-	CLAMP(u);
-	CLAMP(v);

-	return y2r[(y<<16)|(u<<8)|v];
+	return y2r[*(uint32_t*)yuv3];
 }

 /*
@@ -773,12 +774,12 @@ interpolate_width(uint32_t* src, uint32_t* dst, int width, int height, int newwi
 			else {
 				const double weight = xpos - xposi;
 				// Now pick the two pixels
-				const uint32_t pix1 = src[y * width + xposi] & 0xffffff;
+				const uint32_t pix1 = src[y * width + xposi];
 				uint32_t pix2;
 				if (xposi < width - 1)
-					pix2 = src[y * width + xposi + 1] & 0xffffff;
+					pix2 = src[y * width + xposi + 1];
 				else
-					pix2 = src[y * width + xposi] & 0xffffff;
+					pix2 = src[y * width + xposi];
 				if (pix1 == pix2)
 					*dst = pix1;
 				else {
@@ -799,23 +800,53 @@ static void
 interpolate_height(uint32_t* src, uint32_t* dst, int width, int height, int newheight)
 {
 	int x, y;
-	bool em = false;
 	const double mult = (double)height / newheight;
-
+	double ypos = 0;
+	int last_yposi = 0;
+	int ywn = width;
+	static uint32_t *nline = NULL;
+	static uint32_t *tline = NULL;
+	static size_t nsz = 0;
+	static size_t tsz = 0;
+	uint32_t *stmp;
+
+	if (nsz < width * 4) {
+		stmp = realloc(nline, width * 4);
+		if (stmp == NULL)
+			goto fail;
+		nline = stmp;
+		nsz = width * 4;
+	}
+	if (tsz < width * 4) {
+		stmp = realloc(tline, width * 4);
+		if (stmp == NULL)
+			goto fail;
+		tline = stmp;
+		tsz = width * 4;
+	}
+
+	memcpy(tline, src, width * sizeof(*tline));
+	memcpy(nline, src + width, width * sizeof(*tline));
 	for (y = 0; y < newheight; y++) {
-		const double ypos = mult * y;
 		const int yposi = ypos;
-		em = (y == ypos || yposi >= height - 1);
-		if (em) {
-			memcpy(dst, &src[yposi * width], width * sizeof(dst[0]));
+		if (yposi != last_yposi) {
+			ywn += width;
+			last_yposi = yposi;
+			stmp = tline;
+			tline = nline;
+			nline = stmp;
+			memcpy(nline, &src[ywn], nsz);
+		}
+		if (y == ypos || yposi >= height - 1) {
+			memcpy(dst, tline, tsz);
 			dst += width;
 		}
 		else {
-			const double weight = ypos - yposi;
+			const uint8_t weight = ypos * 256;
 			for (x = 0; x < width; x++) {
 				// Now pick the two pixels
-				const uint32_t pix1 = src[yposi * width + x] & 0xffffff;
-				const uint32_t pix2 = src[(yposi + 1) * width + x] & 0xffffff;
+				const uint32_t pix1 = tline[x];
+				const uint32_t pix2 = nline[x];
 				if (pix1 == pix2)
 					*dst = pix1;
 				else
@@ -823,7 +854,19 @@ interpolate_height(uint32_t* src, uint32_t* dst, int width, int height, int newh
 				dst++;
 			}
 		}
+		ypos += mult;
 	}
+
+	return;
+fail:
+	free(nline);
+	free(tline);
+	nline = NULL;
+	tline = NULL;
+	nsz = 0;
+	tsz = 0;
+	memcpy(src, dst, width * height * sizeof(*src));
+	fprintf(stderr, "Allocation failure in interpolate_height()!");
 }

 static void

--- a/src/conio/x_events.c
+++ b/src/conio/x_events.c
@@ -464,7 +464,8 @@ static int video_init()
    return(0);
 }

-static void local_draw_rect(struct rectlist *rect)
+static void
+local_draw_rect(struct rectlist *rect)
 {
 	int x, y, xoff = 0, yoff = 0;
 	unsigned int r, g, b;
@@ -476,10 +477,12 @@ static void local_draw_rect(struct rectlist *rect)
 	int idx;
 	uint32_t last_pixel = 0x55555555;
 	struct graphics_buffer *source;
-	int lines;
+	bool isRGB8 = false;

-	if (bitmap_width != rect->rect.width || bitmap_height != rect->rect.height)
+	if (bitmap_width != rect->rect.width || bitmap_height != rect->rect.height) {
+		bitmap_drv_free_rect(rect);
 		return;
+	}

 	xoff = (x11_window_width - xim->width) / 2;
 	if (xoff < 0)
@@ -490,12 +493,12 @@ static void local_draw_rect(struct rectlist *rect)

 	// Scale...
 	source = do_scale(rect, x_cvstat.scaling, x_cvstat.scaling, x_cvstat.aspect_width, x_cvstat.aspect_height);
+	if (source == NULL) {
 		bitmap_drv_free_rect(rect);
-	if (source == NULL)
 		return;
+	}
 	cleft = source->w;
 	ctop = source->h;
-	lines = 0;

 	xoff = (x11_window_width - source->w) / 2;
 	if (xoff < 0)
@@ -511,6 +514,9 @@ static void local_draw_rect(struct rectlist *rect)

 	/* TODO: Translate into local colour depth */
 	idx = 0;
+	if (visual.red_mask == 0xff0000 && visual.green_mask == 0x00ff00 && visual.blue_mask == 0x0000ff)
+		isRGB8 = true;
+
 	for (y = 0; y < source->h; y++) {
 		for (x = 0; x < source->w; x++) {
 			if (last) {
@@ -529,6 +535,11 @@ static void local_draw_rect(struct rectlist *rect)
 					continue;
 				}
 			}
+			if (isRGB8) {
+				pixel = source->data[idx];
+				((uint32_t*)xim->data)[idx] = pixel;
+			}
+			else {
 				if (last_pixel != source->data[idx]) {
 					last_pixel = source->data[idx];
 					r = source->data[idx] >> 16 & 0xff;
@@ -552,17 +563,16 @@ static void local_draw_rect(struct rectlist *rect)
 						pixel |= (b >> (0-b_shift)) & visual.blue_mask;
 				}
 #ifdef XPutPixel
-			XPutPixel(xim, (x + rect->rect.x), (y + rect->rect.y), pixel);
+				XPutPixel(xim, x, y, pixel);
 #else
-			x11.XPutPixel(xim, (x + rect->rect.x), (y + rect->rect.y), pixel);
+				x11.XPutPixel(xim, x, y, pixel);
 #endif
+			}
 			idx++;
 		}
-		lines++;
 		/* This line was changed */
 		// TODO: Previously this did one update per display line...
-		if (last && (cbottom != y || y == source->h - 1) && cright >= 0) {
-			lines = 0;
+		if (last && cright >= 0 && (cbottom != y || y == source->h - 1)) {
 			x11.XPutImage(dpy, win, gc, xim, cleft, ctop
 			    , cleft + xoff, ctop + yoff
 			    , (cright - cleft + 1), (cbottom - ctop + 1));
@@ -573,7 +583,7 @@ static void local_draw_rect(struct rectlist *rect)
 	}

 	if (last == NULL)
-		x11.XPutImage(dpy, win, gc, xim, rect->rect.x, rect->rect.y, rect->rect.x + xoff, rect->rect.y + yoff, source->w, source->h);
+		x11.XPutImage(dpy, win, gc, xim, 0, 0, xoff, yoff, source->w, source->h);
 	else
 		release_buffer(last);
 	last = source;
@@ -1231,7 +1241,7 @@ void x11_event_thread(void *args)
 			case -1:
 				/*
 				* Errno might be wrong, so we just select again.
-				* This could cause a problem is something really
+				* This could cause a problem if something really
 				* was wrong with select....
 				*/