diff --git a/src/encode/utf8.c b/src/encode/utf8.c
index 16cc15c39863320d83b469ea7fd5844deb2b104f..56c5bd710f29cbc599c1f4e90e9c3fcf8576143a 100644
--- a/src/encode/utf8.c
+++ b/src/encode/utf8.c
@@ -23,6 +23,35 @@
 #include "unicode.h"
 #include <string.h>
 
+/*
+ * Classify the first byte of a UTF-8 sequence.
+ *
+ * Returns the total length in bytes (1-4) of the sequence that `ch`
+ * introduces, or 0 if `ch` can never start a well-formed sequence:
+ *   0x80-0xBF  continuation byte
+ *   0xC0-0xC1  lead of an overlong two-byte encoding
+ *   0xF5-0xFF  lead of a code point above U+10FFFF, or not UTF-8 at all
+ * Only the lead byte is examined; the continuation bytes still have to be
+ * checked by the caller.
+ */
+int utf8_decode_firstbyte(char ch)
+{
+    /* Plain char may be signed, so compare the byte as an unsigned value. */
+    const unsigned char lead = (unsigned char)ch;
+
+    if (lead < 0x80)
+        return 1; /* 0xxxxxxx: ASCII */
+    if (lead < 0xc2)
+        return 0; /* continuation byte or overlong lead */
+    if (lead < 0xe0)
+        return 2; /* 110xxxxx */
+    if (lead < 0xf0)
+        return 3; /* 1110xxxx */
+    if (lead < 0xf5)
+        return 4; /* 11110xxx, capped at U+10FFFF */
+    return 0; /* 0xF5-0xFF */
+}
+
 char* utf8_normalize_str(char* str)
 {
     char* dest = str;
diff --git a/src/encode/utf8.h b/src/encode/utf8.h
index a16d9e3ffe910017ecb99c4ae55abb60d98d0efe..99e8e50c62218e6c8c14da6856d45e90e24feb11 100644
--- a/src/encode/utf8.h
+++ b/src/encode/utf8.h
@@ -32,6 +32,9 @@
 extern "C" {
 #endif
 
+// Decode a UTF-8 first byte, returns length of character sequence (1-4) or 0 on error
+int utf8_decode_firstbyte(char ch);
+
 // Returns true if the string is valid UTF-8
 bool utf8_str_is_valid(const char*);
 