Skip to content
Snippets Groups Projects
Commit 46b1f86f authored by Rob Swindell
Browse files

Re-synchronize utf8_getc/putc with OpenSSL 3.0

utf8_getc/putc now correctly reject UTF-8 encoded Unicode surrogates
(U+D800 through U+DFFF) and no longer support 5- and 6-byte UTF-8 encodings.

For reference:
https://github.com/openssl/openssl/commit/ba64e5a92a6f009e311ad1c3565817820a1632a4
parent 334e768c
No related branches found
No related tags found
1 merge request: !463 "MRC mods by Codefenix (2024-10-20)"
Pipeline #2454 failed
...@@ -251,10 +251,12 @@ int cp437_to_utf8_str(const char* str, char* dest, size_t maxlen, unsigned char ...@@ -251,10 +251,12 @@ int cp437_to_utf8_str(const char* str, char* dest, size_t maxlen, unsigned char
return retval; return retval;
} }
/*
 * Evaluates to non-zero when 'value' lies within the Unicode surrogate block
 * (UNICODE_BLOCK_SURROGATE_BEGIN through UNICODE_BLOCK_SURROGATE_END,
 * inclusive), otherwise zero.
 *
 * The argument is parenthesized so that expressions containing operators of
 * lower precedence than the relational operators (e.g. 'a | b') are compared
 * as a whole. 'value' is evaluated twice; do not pass an expression with
 * side effects.
 */
#define is_unicode_surrogate(value) \
	((value) >= UNICODE_BLOCK_SURROGATE_BEGIN && (value) <= UNICODE_BLOCK_SURROGATE_END)
// From openssl/crypto/asn1/a_utf8.c: // From openssl/crypto/asn1/a_utf8.c:
/* /*
* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. * Copyright 1995-2021 The OpenSSL Project Authors. All Rights Reserved.
* *
* Licensed under the Apache License 2.0 (the "License"). You may not use * Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy * this file except in compliance with the License. You can obtain a copy
...@@ -309,6 +311,8 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val) ...@@ -309,6 +311,8 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val)
value |= *p++ & 0x3f; value |= *p++ & 0x3f;
if (value < 0x800) if (value < 0x800)
return -4; return -4;
if (is_unicode_surrogate(value))
return -2;
ret = 3; ret = 3;
} else if ((*p & 0xf8) == 0xf0) { } else if ((*p & 0xf8) == 0xf0) {
if (len < 4) if (len < 4)
...@@ -324,40 +328,6 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val) ...@@ -324,40 +328,6 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val)
if (value < 0x10000) if (value < 0x10000)
return -4; return -4;
ret = 4; ret = 4;
} else if ((*p & 0xfc) == 0xf8) {
if (len < 5)
return -1;
if (((p[1] & 0xc0) != 0x80)
|| ((p[2] & 0xc0) != 0x80)
|| ((p[3] & 0xc0) != 0x80)
|| ((p[4] & 0xc0) != 0x80))
return -3;
value = ((unsigned long)(*p++ & 0x3)) << 24;
value |= ((unsigned long)(*p++ & 0x3f)) << 18;
value |= ((unsigned long)(*p++ & 0x3f)) << 12;
value |= (*p++ & 0x3f) << 6;
value |= *p++ & 0x3f;
if (value < 0x200000)
return -4;
ret = 5;
} else if ((*p & 0xfe) == 0xfc) {
if (len < 6)
return -1;
if (((p[1] & 0xc0) != 0x80)
|| ((p[2] & 0xc0) != 0x80)
|| ((p[3] & 0xc0) != 0x80)
|| ((p[4] & 0xc0) != 0x80)
|| ((p[5] & 0xc0) != 0x80))
return -3;
value = ((unsigned long)(*p++ & 0x1)) << 30;
value |= ((unsigned long)(*p++ & 0x3f)) << 24;
value |= ((unsigned long)(*p++ & 0x3f)) << 18;
value |= ((unsigned long)(*p++ & 0x3f)) << 12;
value |= (*p++ & 0x3f) << 6;
value |= *p++ & 0x3f;
if (value < 0x4000000)
return -4;
ret = 6;
} else } else
return -2; return -2;
if(val != NULL) if(val != NULL)
...@@ -368,15 +338,15 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val) ...@@ -368,15 +338,15 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val)
/* /*
* This takes a character 'value' and writes the UTF8 encoded value in 'str' * This takes a character 'value' and writes the UTF8 encoded value in 'str'
* where 'str' is a buffer containing 'len' characters. Returns the number of * where 'str' is a buffer containing 'len' characters. Returns the number of
* characters written or -1 if 'len' is too small. 'str' can be set to NULL * characters written, -1 if 'len' is too small or -2 if 'value' is out of
* in which case it just returns the number of characters. It will need at * range. 'str' can be set to NULL in which case it just returns the number of
* most 6 characters. * characters. It will need at most 4 characters.
*/ */
int utf8_putc(char *str, size_t len, enum unicode_codepoint value) int utf8_putc(char *str, size_t len, enum unicode_codepoint value)
{ {
if (!str) if (!str)
len = 6; /* Maximum we will need */ len = 4; /* Maximum we will need */
else if (len <= 0) else if (len <= 0)
return -1; return -1;
if (value < 0x80) { if (value < 0x80) {
...@@ -394,6 +364,8 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value) ...@@ -394,6 +364,8 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value)
return 2; return 2;
} }
if (value < 0x10000) { if (value < 0x10000) {
if (is_unicode_surrogate(value))
return -2;
if (len < 3) if (len < 3)
return -1; return -1;
if (str) { if (str) {
...@@ -403,7 +375,7 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value) ...@@ -403,7 +375,7 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value)
} }
return 3; return 3;
} }
if (value < 0x200000) { if (value < UNICODE_LIMIT) {
if (len < 4) if (len < 4)
return -1; return -1;
if (str) { if (str) {
...@@ -414,27 +386,5 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value) ...@@ -414,27 +386,5 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value)
} }
return 4; return 4;
} }
if (value < 0x4000000) { return -2;
if (len < 5)
return -1;
if (str) {
*str++ = (unsigned char)(((value >> 24) & 0x3) | 0xf8);
*str++ = (unsigned char)(((value >> 18) & 0x3f) | 0x80);
*str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80);
*str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80);
*str = (unsigned char)((value & 0x3f) | 0x80);
}
return 5;
}
if (len < 6)
return -1;
if (str) {
*str++ = (unsigned char)(((value >> 30) & 0x1) | 0xfc);
*str++ = (unsigned char)(((value >> 24) & 0x3f) | 0x80);
*str++ = (unsigned char)(((value >> 18) & 0x3f) | 0x80);
*str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80);
*str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80);
*str = (unsigned char)((value & 0x3f) | 0x80);
}
return 6;
} }
...@@ -692,7 +692,10 @@ enum unicode_codepoint { ...@@ -692,7 +692,10 @@ enum unicode_codepoint {
UNICODE_HALFWIDTH_BLACK_SQUARE = 0xFFED, UNICODE_HALFWIDTH_BLACK_SQUARE = 0xFFED,
UNICODE_HALFWIDTH_WHITE_CIRCLE = 0xFFEE, UNICODE_HALFWIDTH_WHITE_CIRCLE = 0xFFEE,
UNICODE_REPLACEMENT_CHARACTER = 0xFFFD UNICODE_REPLACEMENT_CHARACTER = 0xFFFD,
UNICODE_MAX = 0x10FFFF,
UNICODE_LIMIT
}; };
// Blocks // Blocks
...@@ -728,6 +731,8 @@ enum unicode_codepoint { ...@@ -728,6 +731,8 @@ enum unicode_codepoint {
#define UNICIDE_BLOCK_YI_RADICALS_END 0xA4CF // Fullwidth #define UNICIDE_BLOCK_YI_RADICALS_END 0xA4CF // Fullwidth
#define UNICIDE_BLOCK_HANGUL_SYLLABLES_BEGIN 0xAC00 // Fullwidth #define UNICIDE_BLOCK_HANGUL_SYLLABLES_BEGIN 0xAC00 // Fullwidth
#define UNICIDE_BLOCK_HANGUL_SYLLABLES_END 0xD7AF // Fullwidth #define UNICIDE_BLOCK_HANGUL_SYLLABLES_END 0xD7AF // Fullwidth
#define UNICODE_BLOCK_SURROGATE_BEGIN 0xD800
#define UNICODE_BLOCK_SURROGATE_END 0xDFFF
#define UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_BEGIN 0xF900 // Fullwidth #define UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_BEGIN 0xF900 // Fullwidth
#define UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_END 0xFAFF // Fullwidth #define UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_END 0xFAFF // Fullwidth
#define UNICODE_BLOCK_VERTICAL_FORMS_BEGIN 0xFE10 // Fullwidth #define UNICODE_BLOCK_VERTICAL_FORMS_BEGIN 0xFE10 // Fullwidth
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment