From 5e4964c4b2e80c9d413a38716fed8045ab1b2de2 Mon Sep 17 00:00:00 2001
From: "Rob Swindell (on Windows 11)" <rob@synchro.net>
Date: Tue, 6 Feb 2024 11:52:10 -0800
Subject: [PATCH] Add utf8_decode_firstbyte() for use when all you need is the
 length (and validity) of the first byte in a UTF-8 sequence.

---
 src/encode/utf8.c | 14 ++++++++++++++
 src/encode/utf8.h |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/src/encode/utf8.c b/src/encode/utf8.c
index 16cc15c398..56c5bd710f 100644
--- a/src/encode/utf8.c
+++ b/src/encode/utf8.c
@@ -23,6 +23,20 @@
 #include "unicode.h"
 #include <string.h>
 
+int utf8_decode_firstbyte(char ch)
+{
+	/* Check syntax and work out the encoded value (if correct) */
+	if ((ch & 0x80) == 0)
+		return 1;
+	if ((ch & 0xe0) == 0xc0)
+		return 2;
+	if ((ch & 0xf0) == 0xe0)
+		return 3;
+	if ((ch & 0xf8) == 0xf0)
+		return 4;
+	return 0; // error
+}
+
 char* utf8_normalize_str(char* str)
 {
 	char* dest = str;
diff --git a/src/encode/utf8.h b/src/encode/utf8.h
index a16d9e3ffe..99e8e50c62 100644
--- a/src/encode/utf8.h
+++ b/src/encode/utf8.h
@@ -32,6 +32,9 @@
 extern "C" {
 #endif
 
+// Decode a UTF-8 first byte, returns length of character sequence (1-4) or 0 on error
+int utf8_decode_firstbyte(char ch);
+
 // Returns true if the string is valid UTF-8
 bool utf8_str_is_valid(const char*);
 
-- 
GitLab
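
A minimal usage sketch of the new function (not part of the patch): walking a
UTF-8 string one character at a time by skipping ahead by the length reported
for each lead byte. It assumes the header above is available as "utf8.h"; the
helper utf8_char_count() is hypothetical and only for illustration. Note that
utf8_decode_firstbyte() validates only the lead byte, so continuation bytes
(and truncated trailing sequences) are not checked here; utf8_str_is_valid()
remains the right call for full validation.

    // Sketch only: count characters in a UTF-8 string using the new
    // utf8_decode_firstbyte(). Assumes src/encode/utf8.h is on the include path.
    #include <stdio.h>
    #include "utf8.h"

    static int utf8_char_count(const char* str)
    {
        int count = 0;
        while (*str != '\0') {
            int len = utf8_decode_firstbyte(*str);
            if (len < 1)        // invalid lead byte (e.g. a stray continuation byte)
                return -1;
            str += len;         // skip the continuation bytes (not validated here)
            ++count;
        }
        return count;
    }

    int main(void)
    {
        // "naïve" is 6 bytes but 5 characters; prints 5
        printf("%d\n", utf8_char_count("na\xc3\xafve"));
        return 0;
    }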