Synchronet now requires the libarchive development package (e.g. libarchive-dev on Debian-based Linux distros, libarchive.org for more info) to build successfully.

Commit 46b1f86f authored by Rob Swindell
Browse files

Re-synchronize utf8_getc/putc with OpenSSL 3.0

Now correctly rejects UTF-8 encoded Unicode surrogates and no longer supports
5- and 6-byte UTF-8 encodings.

For reference:
https://github.com/openssl/openssl/commit/ba64e5a92a6f009e311ad1c3565817820a1632a4
parent 334e768c
Pipeline #2454 failed with stage
in 12 minutes and 8 seconds
......@@ -251,10 +251,12 @@ int cp437_to_utf8_str(const char* str, char* dest, size_t maxlen, unsigned char
return retval;
}
/*
 * Evaluates to nonzero when 'value' lies within the Unicode surrogate block
 * (UNICODE_BLOCK_SURROGATE_BEGIN through UNICODE_BLOCK_SURROGATE_END, inclusive).
 * Every use of the argument is parenthesized so that compound expressions
 * (e.g. 'a | b') are compared as a whole. Note: 'value' is evaluated twice,
 * so avoid arguments with side effects.
 */
#define is_unicode_surrogate(value) \
(((value) >= UNICODE_BLOCK_SURROGATE_BEGIN) && ((value) <= UNICODE_BLOCK_SURROGATE_END))
// From openssl/crypto/asn1/a_utf8.c:
/*
* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 1995-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
......@@ -309,6 +311,8 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val)
value |= *p++ & 0x3f;
if (value < 0x800)
return -4;
if (is_unicode_surrogate(value))
return -2;
ret = 3;
} else if ((*p & 0xf8) == 0xf0) {
if (len < 4)
......@@ -324,40 +328,6 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val)
if (value < 0x10000)
return -4;
ret = 4;
} else if ((*p & 0xfc) == 0xf8) {
if (len < 5)
return -1;
if (((p[1] & 0xc0) != 0x80)
|| ((p[2] & 0xc0) != 0x80)
|| ((p[3] & 0xc0) != 0x80)
|| ((p[4] & 0xc0) != 0x80))
return -3;
value = ((unsigned long)(*p++ & 0x3)) << 24;
value |= ((unsigned long)(*p++ & 0x3f)) << 18;
value |= ((unsigned long)(*p++ & 0x3f)) << 12;
value |= (*p++ & 0x3f) << 6;
value |= *p++ & 0x3f;
if (value < 0x200000)
return -4;
ret = 5;
} else if ((*p & 0xfe) == 0xfc) {
if (len < 6)
return -1;
if (((p[1] & 0xc0) != 0x80)
|| ((p[2] & 0xc0) != 0x80)
|| ((p[3] & 0xc0) != 0x80)
|| ((p[4] & 0xc0) != 0x80)
|| ((p[5] & 0xc0) != 0x80))
return -3;
value = ((unsigned long)(*p++ & 0x1)) << 30;
value |= ((unsigned long)(*p++ & 0x3f)) << 24;
value |= ((unsigned long)(*p++ & 0x3f)) << 18;
value |= ((unsigned long)(*p++ & 0x3f)) << 12;
value |= (*p++ & 0x3f) << 6;
value |= *p++ & 0x3f;
if (value < 0x4000000)
return -4;
ret = 6;
} else
return -2;
if(val != NULL)
......@@ -368,15 +338,15 @@ int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val)
/*
* This takes a character 'value' and writes the UTF8 encoded value in 'str'
* where 'str' is a buffer containing 'len' characters. Returns the number of
* characters written or -1 if 'len' is too small. 'str' can be set to NULL
* in which case it just returns the number of characters. It will need at
* most 6 characters.
* characters written, -1 if 'len' is too small or -2 if 'value' is out of
* range. 'str' can be set to NULL in which case it just returns the number of
* characters. It will need at most 4 characters.
*/
int utf8_putc(char *str, size_t len, enum unicode_codepoint value)
{
if (!str)
len = 6; /* Maximum we will need */
len = 4; /* Maximum we will need */
else if (len <= 0)
return -1;
if (value < 0x80) {
......@@ -394,6 +364,8 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value)
return 2;
}
if (value < 0x10000) {
if (is_unicode_surrogate(value))
return -2;
if (len < 3)
return -1;
if (str) {
......@@ -403,7 +375,7 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value)
}
return 3;
}
if (value < 0x200000) {
if (value < UNICODE_LIMIT) {
if (len < 4)
return -1;
if (str) {
......@@ -414,27 +386,5 @@ int utf8_putc(char *str, size_t len, enum unicode_codepoint value)
}
return 4;
}
if (value < 0x4000000) {
if (len < 5)
return -1;
if (str) {
*str++ = (unsigned char)(((value >> 24) & 0x3) | 0xf8);
*str++ = (unsigned char)(((value >> 18) & 0x3f) | 0x80);
*str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80);
*str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80);
*str = (unsigned char)((value & 0x3f) | 0x80);
}
return 5;
}
if (len < 6)
return -1;
if (str) {
*str++ = (unsigned char)(((value >> 30) & 0x1) | 0xfc);
*str++ = (unsigned char)(((value >> 24) & 0x3f) | 0x80);
*str++ = (unsigned char)(((value >> 18) & 0x3f) | 0x80);
*str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80);
*str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80);
*str = (unsigned char)((value & 0x3f) | 0x80);
}
return 6;
return -2;
}
......@@ -692,7 +692,10 @@ enum unicode_codepoint {
UNICODE_HALFWIDTH_BLACK_SQUARE = 0xFFED,
UNICODE_HALFWIDTH_WHITE_CIRCLE = 0xFFEE,
UNICODE_REPLACEMENT_CHARACTER = 0xFFFD
UNICODE_REPLACEMENT_CHARACTER = 0xFFFD,
UNICODE_MAX = 0x10FFFF,
UNICODE_LIMIT
};
// Blocks
......@@ -728,6 +731,8 @@ enum unicode_codepoint {
#define UNICIDE_BLOCK_YI_RADICALS_END 0xA4CF // Fullwidth
#define UNICIDE_BLOCK_HANGUL_SYLLABLES_BEGIN 0xAC00 // Fullwidth
#define UNICIDE_BLOCK_HANGUL_SYLLABLES_END 0xD7AF // Fullwidth
#define UNICODE_BLOCK_SURROGATE_BEGIN 0xD800
#define UNICODE_BLOCK_SURROGATE_END 0xDFFF
#define UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_BEGIN 0xF900 // Fullwidth
#define UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_END 0xFAFF // Fullwidth
#define UNICODE_BLOCK_VERTICAL_FORMS_BEGIN 0xFE10 // Fullwidth
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment