diff --git a/src/encode/utf8.c b/src/encode/utf8.c index 270ffd7940edd79ec72d21c0caa0123c38dcaa7b..8579c2c1f0cdbd88146c57ed83fc75439f18570d 100644 --- a/src/encode/utf8.c +++ b/src/encode/utf8.c @@ -251,10 +251,12 @@ int cp437_to_utf8_str(const char* str, char* dest, size_t maxlen, unsigned char return retval; } +#define is_unicode_surrogate(value) \ + (value >= UNICODE_BLOCK_SURROGATE_BEGIN && value <= UNICODE_BLOCK_SURROGATE_END) // From openssl/crypto/asn1/a_utf8.c: /* - * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 1995-2021 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. You can obtain a copy @@ -309,6 +311,8 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val) value |= *p++ & 0x3f; if (value < 0x800) return -4; + if (is_unicode_surrogate(value)) + return -2; ret = 3; } else if ((*p & 0xf8) == 0xf0) { if (len < 4) @@ -324,40 +328,6 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val) if (value < 0x10000) return -4; ret = 4; - } else if ((*p & 0xfc) == 0xf8) { - if (len < 5) - return -1; - if (((p[1] & 0xc0) != 0x80) - || ((p[2] & 0xc0) != 0x80) - || ((p[3] & 0xc0) != 0x80) - || ((p[4] & 0xc0) != 0x80)) - return -3; - value = ((unsigned long)(*p++ & 0x3)) << 24; - value |= ((unsigned long)(*p++ & 0x3f)) << 18; - value |= ((unsigned long)(*p++ & 0x3f)) << 12; - value |= (*p++ & 0x3f) << 6; - value |= *p++ & 0x3f; - if (value < 0x200000) - return -4; - ret = 5; - } else if ((*p & 0xfe) == 0xfc) { - if (len < 6) - return -1; - if (((p[1] & 0xc0) != 0x80) - || ((p[2] & 0xc0) != 0x80) - || ((p[3] & 0xc0) != 0x80) - || ((p[4] & 0xc0) != 0x80) - || ((p[5] & 0xc0) != 0x80)) - return -3; - value = ((unsigned long)(*p++ & 0x1)) << 30; - value |= ((unsigned long)(*p++ & 0x3f)) << 24; - value |= ((unsigned long)(*p++ & 0x3f)) << 18; - value |= ((unsigned long)(*p++ & 0x3f)) << 12; - value |= (*p++ & 0x3f) << 6; - value |= *p++ & 0x3f; - if (value < 0x4000000) - return -4; - ret = 6; } else return -2; if(val != NULL) @@ -368,15 +338,15 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val) /* * This takes a character 'value' and writes the UTF8 encoded value in 'str' * where 'str' is a buffer containing 'len' characters. Returns the number of - * characters written or -1 if 'len' is too small. 'str' can be set to NULL - * in which case it just returns the number of characters. It will need at - * most 6 characters. + * characters written, -1 if 'len' is too small or -2 if 'value' is out of + * range. 'str' can be set to NULL in which case it just returns the number of + * characters. It will need at most 4 characters. */ int utf8_putc(char *str, size_t len, enum unicode_codepoint value) { if (!str) - len = 6; /* Maximum we will need */ + len = 4; /* Maximum we will need */ else if (len <= 0) return -1; if (value < 0x80) { @@ -394,6 +364,8 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value) return 2; } if (value < 0x10000) { + if (is_unicode_surrogate(value)) + return -2; if (len < 3) return -1; if (str) { @@ -403,7 +375,7 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value) } return 3; } - if (value < 0x200000) { + if (value < UNICODE_LIMIT) { if (len < 4) return -1; if (str) { @@ -414,27 +386,5 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value) } return 4; } - if (value < 0x4000000) { - if (len < 5) - return -1; - if (str) { - *str++ = (unsigned char)(((value >> 24) & 0x3) | 0xf8); - *str++ = (unsigned char)(((value >> 18) & 0x3f) | 0x80); - *str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80); - *str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80); - *str = (unsigned char)((value & 0x3f) | 0x80); - } - return 5; - } - if (len < 6) - return -1; - if (str) { - *str++ = (unsigned char)(((value >> 30) & 0x1) | 0xfc); - *str++ = (unsigned char)(((value >> 24) & 0x3f) | 0x80); - *str++ = (unsigned char)(((value >> 18) & 0x3f) | 0x80); - *str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80); - *str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80); - *str = (unsigned char)((value & 0x3f) | 0x80); - } - return 6; + return -2; } diff --git a/src/xpdev/unicode_defs.h b/src/xpdev/unicode_defs.h index 790836fc53672ea5c1a4db3b6ed324285c3fc8fc..7618abc83e92fdde086039bfaeaf85ded05dff5c 100644 --- a/src/xpdev/unicode_defs.h +++ b/src/xpdev/unicode_defs.h @@ -692,7 +692,10 @@ enum unicode_codepoint { UNICODE_HALFWIDTH_BLACK_SQUARE = 0xFFED, UNICODE_HALFWIDTH_WHITE_CIRCLE = 0xFFEE, - UNICODE_REPLACEMENT_CHARACTER = 0xFFFD + UNICODE_REPLACEMENT_CHARACTER = 0xFFFD, + + UNICODE_MAX = 0x10FFFF, + UNICODE_LIMIT }; // Blocks @@ -728,6 +731,8 @@ enum unicode_codepoint { #define UNICIDE_BLOCK_YI_RADICALS_END 0xA4CF // Fullwidth #define UNICIDE_BLOCK_HANGUL_SYLLABLES_BEGIN 0xAC00 // Fullwidth #define UNICIDE_BLOCK_HANGUL_SYLLABLES_END 0xD7AF // Fullwidth +#define UNICODE_BLOCK_SURROGATE_BEGIN 0xD800 +#define UNICODE_BLOCK_SURROGATE_END 0xDFFF #define UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_BEGIN 0xF900 // Fullwidth #define UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_END 0xFAFF // Fullwidth #define UNICODE_BLOCK_VERTICAL_FORMS_BEGIN 0xFE10 // Fullwidth