From 5e4964c4b2e80c9d413a38716fed8045ab1b2de2 Mon Sep 17 00:00:00 2001
From: "Rob Swindell (on Windows 11)" <rob@synchro.net>
Date: Tue, 6 Feb 2024 11:52:10 -0800
Subject: [PATCH] Add utf8_decode_firstbyte()

for use when all you need is the length (and validity) of a UTF-8 sequence,
as determined from its first byte.
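
A minimal caller sketch (illustrative only; utf8_count_chars() is a
hypothetical helper and is not part of this patch):

	#include <stddef.h>
	#include "utf8.h"

	// Count the UTF-8 sequences in a NUL-terminated string, stopping at
	// the first invalid lead byte. Continuation bytes are skipped here,
	// not validated.
	static size_t utf8_count_chars(const char* str)
	{
		size_t count = 0;
		for (const char* p = str; *p != '\0'; ) {
			int len = utf8_decode_firstbyte(*p);
			if (len < 1)
				break;	// invalid lead byte
			p += len;
			++count;
		}
		return count;
	}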
---
 src/encode/utf8.c | 14 ++++++++++++++
 src/encode/utf8.h |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/src/encode/utf8.c b/src/encode/utf8.c
index 16cc15c398..56c5bd710f 100644
--- a/src/encode/utf8.c
+++ b/src/encode/utf8.c
@@ -23,6 +23,20 @@
 #include "unicode.h"
 #include <string.h>
 
+int utf8_decode_firstbyte(char ch)
+{
+	/* Determine the sequence length from the lead (first) byte */
+	if ((ch & 0x80) == 0)
+		return 1;
+	if ((ch & 0xe0) == 0xc0)
+		return 2;
+	if ((ch & 0xf0) == 0xe0)
+		return 3;
+	if ((ch & 0xf8) == 0xf0)
+		return 4;
+	return 0; // error
+}
+
 char* utf8_normalize_str(char* str)
 {
 	char* dest = str;
diff --git a/src/encode/utf8.h b/src/encode/utf8.h
index a16d9e3ffe..99e8e50c62 100644
--- a/src/encode/utf8.h
+++ b/src/encode/utf8.h
@@ -32,6 +32,9 @@
 extern "C" {
 #endif
 
+// Decode the first byte of a UTF-8 sequence; returns the sequence length (1-4) or 0 on error
+int utf8_decode_firstbyte(char ch);
+
 // Returns true if the string is valid UTF-8
 bool utf8_str_is_valid(const char*);
 
-- 
GitLab