From 7c9fbc7e77eb80e60f5094b38b7c2a3cedf5006d Mon Sep 17 00:00:00 2001 From: rswindell <> Date: Wed, 10 Jul 2019 00:02:40 +0000 Subject: [PATCH] enum-ification (use enum unicode_codepoint instead of uint32_t). Replaced unicode_is_zerowidth() with unicode_width(), in preparation for "fullwidth" char support. Added UNICODE_UNDEFINED definition (0x0000) (UNICODE_NULL is already defined, at least in MSVC). --- src/encode/utf8.c | 12 ++++++------ src/encode/utf8.h | 8 ++++---- src/xpdev/unicode.c | 20 +++++++++++++++----- src/xpdev/unicode.h | 10 +++++----- src/xpdev/unicode_defs.h | 37 ++++++++++++++++++++++++------------- 5 files changed, 54 insertions(+), 33 deletions(-) diff --git a/src/encode/utf8.c b/src/encode/utf8.c index b8afb75edc..c7c427e072 100644 --- a/src/encode/utf8.c +++ b/src/encode/utf8.c @@ -141,7 +141,7 @@ char* utf8_normalize_str(char* str) /* Replace all multi-byte UTF-8 sequences with 'ch' or 'zwch' (when non-zero) */ /* When ch and zwch are 0, effectively strips all UTF-8 chars from str */ -char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_ch, char unsupported_zwch, char error_ch) +char* utf8_replace_chars(char* str, char (*lookup)(enum unicode_codepoint), char unsupported_ch, char unsupported_zwch, char error_ch) { char* end = str + strlen(str); char* dest = str; @@ -153,7 +153,7 @@ char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_c len = 1; continue; } - uint32_t codepoint = 0; + enum unicode_codepoint codepoint = 0; len = utf8_getc(src, end - src, &codepoint); if(len < 2) { if(error_ch) @@ -168,7 +168,7 @@ char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_c continue; } } - if(unicode_is_zerowidth(codepoint)) { + if(unicode_width(codepoint) == 0) { if(unsupported_zwch) *dest++ = unsupported_zwch; } @@ -200,7 +200,7 @@ int cp437_to_utf8_str(const char* str, char* dest, size_t maxlen, unsigned char retval = -1; break; } - uint32_t codepoint = 0; + enum 
unicode_codepoint codepoint = 0; if(*p >= minval) codepoint = cp437_unicode_tbl[*p]; if(codepoint) { @@ -241,7 +241,7 @@ int cp437_to_utf8_str(const char* str, char* dest, size_t maxlen, unsigned char * -4 = character encoded incorrectly (not minimal length). */ -int utf8_getc(const char *str, size_t len, uint32_t* val) +int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val) { const unsigned char *p; unsigned long value; @@ -339,7 +339,7 @@ int utf8_getc(const char *str, size_t len, uint32_t* val) * most 6 characters. */ -int utf8_putc(char *str, size_t len, uint32_t value) +int utf8_putc(char *str, size_t len, enum unicode_codepoint value) { if (!str) len = 6; /* Maximum we will need */ diff --git a/src/encode/utf8.h b/src/encode/utf8.h index 2d3293cb17..e0a3cb8d04 100644 --- a/src/encode/utf8.h +++ b/src/encode/utf8.h @@ -36,9 +36,9 @@ #ifndef UTF8_H_ #define UTF8_H_ -#include <stdint.h> #include <stdbool.h> #include <stdlib.h> +#include "unicode_defs.h" #define UTF8_MAX_LEN 6 // Longest possible UTF-8 sequence @@ -57,17 +57,17 @@ char* utf8_normalize_str(char* str); // 'unsupported_ch' is the character used to replace unsupported Unicode codepoints (optional) // 'unsupported_zwch' is the character used to replace unsupported zero-width Unicode codepoints (optional) // 'error_ch' is the character used to replace invalid UTF-8 sequence bytes (optional) -char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_ch, char unsupported_zwch, char error_ch); +char* utf8_replace_chars(char* str, char (*lookup)(enum unicode_codepoint), char unsupported_ch, char unsupported_zwch, char error_ch); // Convert a CP437 char string (src) to UTF-8 string (dest) up to 'maxlen' chars long (sans NUL-terminator) // 'minval' can be used to limit the range of converted chars int cp437_to_utf8_str(const char* src, char* dest, size_t maxlen, unsigned char minval); // Decode a UTF-8 sequence to a UNICODE code point -int utf8_getc(const char* str, 
size_t len, uint32_t* codepoint); +int utf8_getc(const char* str, size_t len, enum unicode_codepoint* codepoint); // Encode a UNICODE code point into a UTF-8 sequence (str) -int utf8_putc(char* str, size_t len, uint32_t codepoint); +int utf8_putc(char* str, size_t len, enum unicode_codepoint codepoint); #if defined(__cplusplus) } diff --git a/src/xpdev/unicode.c b/src/xpdev/unicode.c index 355132b18f..8fea13355f 100644 --- a/src/xpdev/unicode.c +++ b/src/xpdev/unicode.c @@ -47,7 +47,7 @@ // CP437 character to/from UNICODE code point conversion // The CP437 character value is the index into the table. // If the value at that index is 0, no translation is needed (1:1 mapping). -uint32_t cp437_unicode_tbl[] = +enum unicode_codepoint cp437_unicode_tbl[] = { /* 0x00 */ 0, /* 0x01 */ 0x263A, @@ -308,7 +308,7 @@ uint32_t cp437_unicode_tbl[] = /* 0xFF */ 0x00A0 }; -bool unicode_is_zerowidth(uint32_t u) +size_t unicode_width(enum unicode_codepoint u) { switch(u) { case UNICODE_ZERO_WIDTH_SPACE: @@ -331,12 +331,13 @@ bool unicode_is_zerowidth(uint32_t u) case UNICODE_VARIATION_SELECTOR_15: case UNICODE_VARIATION_SELECTOR_16: case UNICODE_ZERO_WIDTH_NO_BREAK_SPACE: - return true; + return 0; + /* TODO: return 2 for "fullwidth" chars */ } - return false; + return 1; } -char unicode_to_cp437(uint32_t codepoint) +char unicode_to_cp437(enum unicode_codepoint codepoint) { switch(codepoint) { case 0: return '\0'; @@ -444,6 +445,15 @@ char unicode_to_cp437(uint32_t codepoint) case UNICODE_EM_SPACE: return ' '; + case UNICODE_SQUARE_ROOT: return CP437_CHAR_SQUARE_ROOT; + case UNICODE_CHECK_MARK: + case UNICODE_HEAVY_CHECK_MARK: return CP437_CHAR_CHECK_MARK; + + case UNICODE_MULTIPLICATION_X: + case UNICODE_HEAVY_MULTIPLICATION_X: + case UNICODE_BALLOT_X: + case UNICODE_HEAVY_BALLOT_X: return 'x'; + case UNICODE_OVERLINE: case 0x2500: // Box Drawings Light Horizontal case 0x2501: // Box Drawings Heavy Horizontal diff --git a/src/xpdev/unicode.h b/src/xpdev/unicode.h index 
6dda706351..2b534a79d4 100644 --- a/src/xpdev/unicode.h +++ b/src/xpdev/unicode.h @@ -36,16 +36,16 @@ #ifndef UNICODE_H_ #define UNICODE_H_ -#include <stdint.h> -#include <stdbool.h> +#include <stdlib.h> +#include "unicode_defs.h" #if defined(__cplusplus) extern "C" { #endif -extern uint32_t cp437_unicode_tbl[]; -bool unicode_is_zerowidth(uint32_t); -char unicode_to_cp437(uint32_t); +extern enum unicode_codepoint cp437_unicode_tbl[]; +size_t unicode_width(enum unicode_codepoint); +char unicode_to_cp437(enum unicode_codepoint); #if defined(__cplusplus) } diff --git a/src/xpdev/unicode_defs.h b/src/xpdev/unicode_defs.h index 97bf03aca6..555bdab7fd 100644 --- a/src/xpdev/unicode_defs.h +++ b/src/xpdev/unicode_defs.h @@ -37,8 +37,9 @@ #define UNICODE_DEFS_H_ enum unicode_codepoint { - UNICODE_NO_BREAK_SPACE = 0x00A0, + UNICODE_UNDEFINED = 0x0000, // UNICODE_NULL is already defined (at least in MSVC) + UNICODE_NO_BREAK_SPACE = 0x00A0, UNICODE_INVERTED_EXCLAMATION_MARK = 0x00A1, UNICODE_CENT_SIGN = 0x00A2, UNICODE_POUND_SIGN = 0x00A3, @@ -121,18 +122,6 @@ enum unicode_codepoint { UNICODE_LATIN_SMALL_LETTER_Y_WITH_DIAERESIS = 0x00FF, UNICODE_LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS = 0x0178, - UNICODE_EN_QUAD = 0x2000, - UNICODE_EM_QUAD = 0x2001, - UNICODE_EN_SPACE = 0x2002, - UNICODE_EM_SPACE = 0x2003, - UNICODE_ZERO_WIDTH_SPACE = 0x200B, - UNICODE_ZERO_WIDTH_NON_JOINER = 0x200C, - UNICODE_ZERO_WIDTH_JOINER = 0x200D, - UNICODE_EM_DASH = 0x2014, - UNICODE_BULLET = 0x2022, - UNICODE_DOUBLE_EXCLAMATION_MARK = 0x203c, - UNICODE_OVERLINE = 0x203E, - UNICODE_GREEK_CAPITAL_LETTER_HETA = 0x0370, UNICODE_GREEK_SMALL_LETTER_HETA = 0x0371, UNICODE_GREEK_CAPITAL_LETTER_ARCHAIC_SAMPI = 0x0372, @@ -239,8 +228,30 @@ enum unicode_codepoint { UNICODE_GREEK_LETTER_SAMPI = 0x03E0, UNICODE_GREEK_SMALL_LETTER_SAMPI = 0x03E1, + UNICODE_EN_QUAD = 0x2000, + UNICODE_EM_QUAD = 0x2001, + UNICODE_EN_SPACE = 0x2002, + UNICODE_EM_SPACE = 0x2003, + UNICODE_ZERO_WIDTH_SPACE = 0x200B, + UNICODE_ZERO_WIDTH_NON_JOINER = 0x200C, + 
UNICODE_ZERO_WIDTH_JOINER = 0x200D, + UNICODE_EM_DASH = 0x2014, + UNICODE_BULLET = 0x2022, + UNICODE_DOUBLE_EXCLAMATION_MARK = 0x203c, + UNICODE_OVERLINE = 0x203E, + + UNICODE_SQUARE_ROOT = 0x221A, + UNICODE_BLACK_SQUARE = 0x25A0, + UNICODE_CHECK_MARK = 0x2713, + UNICODE_HEAVY_CHECK_MARK = 0x2714, + + UNICODE_MULTIPLICATION_X = 0x2715, + UNICODE_HEAVY_MULTIPLICATION_X = 0x2716, + UNICODE_BALLOT_X = 0x2717, + UNICODE_HEAVY_BALLOT_X = 0x2718, + UNICODE_VARIATION_SELECTOR_1 = 0xFE00, UNICODE_VARIATION_SELECTOR_2 = 0xFE01, UNICODE_VARIATION_SELECTOR_3 = 0xFE02, -- GitLab