Skip to content
Snippets Groups Projects
Commit 7c9fbc7e authored by rswindell's avatar rswindell
Browse files

enum-ification (use enum unicode_codepoint instead of uint32_t).

Replaced unicode_is_zerowidth() with unicode_width(), in preparation for
"fullwidth" char support.
Added UNICODE_UNDEFINED definition (0x0000)
(UNICODE_NULL is already defined, at least in MSVC).
parent aa9c0dfa
No related branches found
No related tags found
No related merge requests found
......@@ -141,7 +141,7 @@ char* utf8_normalize_str(char* str)
/* Replace all multi-byte UTF-8 sequences with 'unsupported_ch' or 'unsupported_zwch' (when non-zero) */
/* When unsupported_ch and unsupported_zwch are 0, effectively strips all UTF-8 chars from str */
char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_ch, char unsupported_zwch, char error_ch)
char* utf8_replace_chars(char* str, char (*lookup)(enum unicode_codepoint), char unsupported_ch, char unsupported_zwch, char error_ch)
{
char* end = str + strlen(str);
char* dest = str;
......@@ -153,7 +153,7 @@ char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_c
len = 1;
continue;
}
uint32_t codepoint = 0;
enum unicode_codepoint codepoint = 0;
len = utf8_getc(src, end - src, &codepoint);
if(len < 2) {
if(error_ch)
......@@ -168,7 +168,7 @@ char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_c
continue;
}
}
if(unicode_is_zerowidth(codepoint)) {
if(unicode_width(codepoint) == 0) {
if(unsupported_zwch)
*dest++ = unsupported_zwch;
}
......@@ -200,7 +200,7 @@ int cp437_to_utf8_str(const char* str, char* dest, size_t maxlen, unsigned char
retval = -1;
break;
}
uint32_t codepoint = 0;
enum unicode_codepoint codepoint = 0;
if(*p >= minval)
codepoint = cp437_unicode_tbl[*p];
if(codepoint) {
......@@ -241,7 +241,7 @@ int cp437_to_utf8_str(const char* str, char* dest, size_t maxlen, unsigned char
* -4 = character encoded incorrectly (not minimal length).
*/
int utf8_getc(const char *str, size_t len, uint32_t* val)
int utf8_getc(const char *str, size_t len, enum unicode_codepoint* val)
{
const unsigned char *p;
unsigned long value;
......@@ -339,7 +339,7 @@ int utf8_getc(const char *str, size_t len, uint32_t* val)
* most 6 characters.
*/
int utf8_putc(char *str, size_t len, uint32_t value)
int utf8_putc(char *str, size_t len, enum unicode_codepoint value)
{
if (!str)
len = 6; /* Maximum we will need */
......
......@@ -36,9 +36,9 @@
#ifndef UTF8_H_
#define UTF8_H_
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include "unicode_defs.h"
#define UTF8_MAX_LEN 6 // Longest possible UTF-8 sequence
......@@ -57,17 +57,17 @@ char* utf8_normalize_str(char* str);
// 'unsupported_ch' is the character used to replace unsupported Unicode codepoints (optional)
// 'unsupported_zwch' is the character used to replace unsupported zero-width Unicode codepoints (optional)
// 'error_ch' is the character used to replace invalid UTF-8 sequence bytes (optional)
char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_ch, char unsupported_zwch, char error_ch);
char* utf8_replace_chars(char* str, char (*lookup)(enum unicode_codepoint), char unsupported_ch, char unsupported_zwch, char error_ch);
// Convert a CP437 char string (src) to UTF-8 string (dest) up to 'maxlen' chars long (sans NUL-terminator)
// 'minval' can be used to limit the range of converted chars
int cp437_to_utf8_str(const char* src, char* dest, size_t maxlen, unsigned char minval);
// Decode a UTF-8 sequence to a UNICODE code point
int utf8_getc(const char* str, size_t len, uint32_t* codepoint);
int utf8_getc(const char* str, size_t len, enum unicode_codepoint* codepoint);
// Encode a UNICODE code point into a UTF-8 sequence (str)
int utf8_putc(char* str, size_t len, uint32_t codepoint);
int utf8_putc(char* str, size_t len, enum unicode_codepoint codepoint);
#if defined(__cplusplus)
}
......
......@@ -47,7 +47,7 @@
// CP437 character to/from UNICODE code point conversion
// The CP437 character value is the index into the table.
// If the value at that index is 0, no translation is needed (1:1 mapping).
uint32_t cp437_unicode_tbl[] =
enum unicode_codepoint cp437_unicode_tbl[] =
{
/* 0x00 */ 0,
/* 0x01 */ 0x263A,
......@@ -308,7 +308,7 @@ uint32_t cp437_unicode_tbl[] =
/* 0xFF */ 0x00A0
};
bool unicode_is_zerowidth(uint32_t u)
size_t unicode_width(enum unicode_codepoint u)
{
switch(u) {
case UNICODE_ZERO_WIDTH_SPACE:
......@@ -331,12 +331,13 @@ bool unicode_is_zerowidth(uint32_t u)
case UNICODE_VARIATION_SELECTOR_15:
case UNICODE_VARIATION_SELECTOR_16:
case UNICODE_ZERO_WIDTH_NO_BREAK_SPACE:
return true;
return 0;
/* TODO: return 2 for "fullwidth" chars */
}
return false;
return 1;
}
char unicode_to_cp437(uint32_t codepoint)
char unicode_to_cp437(enum unicode_codepoint codepoint)
{
switch(codepoint) {
case 0: return '\0';
......@@ -444,6 +445,15 @@ char unicode_to_cp437(uint32_t codepoint)
case UNICODE_EM_SPACE:
return ' ';
case UNICODE_SQUARE_ROOT: return CP437_CHAR_SQUARE_ROOT;
case UNICODE_CHECK_MARK:
case UNICODE_HEAVY_CHECK_MARK: return CP437_CHAR_CHECK_MARK;
case UNICODE_MULTIPLICATION_X:
case UNICODE_HEAVY_MULTIPLICATION_X:
case UNICODE_BALLOT_X:
case UNICODE_HEAVY_BALLOT_X: return 'x';
case UNICODE_OVERLINE:
case 0x2500: // Box Drawings Light Horizontal
case 0x2501: // Box Drawings Heavy Horizontal
......
......@@ -36,16 +36,16 @@
#ifndef UNICODE_H_
#define UNICODE_H_
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include "unicode_defs.h"
#if defined(__cplusplus)
extern "C" {
#endif
extern uint32_t cp437_unicode_tbl[];
bool unicode_is_zerowidth(uint32_t);
char unicode_to_cp437(uint32_t);
extern enum unicode_codepoint cp437_unicode_tbl[];
size_t unicode_width(enum unicode_codepoint);
char unicode_to_cp437(enum unicode_codepoint);
#if defined(__cplusplus)
}
......
......@@ -37,8 +37,9 @@
#define UNICODE_DEFS_H_
enum unicode_codepoint {
UNICODE_NO_BREAK_SPACE = 0x00A0,
UNICODE_UNDEFINED = 0x0000, // Not named UNICODE_NULL: that name is already defined (at least in MSVC)
UNICODE_NO_BREAK_SPACE = 0x00A0,
UNICODE_INVERTED_EXCLAMATION_MARK = 0x00A1,
UNICODE_CENT_SIGN = 0x00A2,
UNICODE_POUND_SIGN = 0x00A3,
......@@ -121,18 +122,6 @@ enum unicode_codepoint {
UNICODE_LATIN_SMALL_LETTER_Y_WITH_DIAERESIS = 0x00FF,
UNICODE_LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS = 0x0178,
UNICODE_EN_QUAD = 0x2000,
UNICODE_EM_QUAD = 0x2001,
UNICODE_EN_SPACE = 0x2002,
UNICODE_EM_SPACE = 0x2003,
UNICODE_ZERO_WIDTH_SPACE = 0x200B,
UNICODE_ZERO_WIDTH_NON_JOINER = 0x200C,
UNICODE_ZERO_WIDTH_JOINER = 0x200D,
UNICODE_EM_DASH = 0x2014,
UNICODE_BULLET = 0x2022,
UNICODE_DOUBLE_EXCLAMATION_MARK = 0x203c,
UNICODE_OVERLINE = 0x203E,
UNICODE_GREEK_CAPITAL_LETTER_HETA = 0x0370,
UNICODE_GREEK_SMALL_LETTER_HETA = 0x0371,
UNICODE_GREEK_CAPITAL_LETTER_ARCHAIC_SAMPI = 0x0372,
......@@ -239,8 +228,30 @@ enum unicode_codepoint {
UNICODE_GREEK_LETTER_SAMPI = 0x03E0,
UNICODE_GREEK_SMALL_LETTER_SAMPI = 0x03E1,
UNICODE_EN_QUAD = 0x2000,
UNICODE_EM_QUAD = 0x2001,
UNICODE_EN_SPACE = 0x2002,
UNICODE_EM_SPACE = 0x2003,
UNICODE_ZERO_WIDTH_SPACE = 0x200B,
UNICODE_ZERO_WIDTH_NON_JOINER = 0x200C,
UNICODE_ZERO_WIDTH_JOINER = 0x200D,
UNICODE_EM_DASH = 0x2014,
UNICODE_BULLET = 0x2022,
UNICODE_DOUBLE_EXCLAMATION_MARK = 0x203c,
UNICODE_OVERLINE = 0x203E,
UNICODE_SQUARE_ROOT = 0x221A,
UNICODE_BLACK_SQUARE = 0x25A0,
UNICODE_CHECK_MARK = 0x2713,
UNICODE_HEAVY_CHECK_MARK = 0x2714,
UNICODE_MULTIPLICATION_X = 0x2715,
UNICODE_HEAVY_MULTIPLICATION_X = 0x2716,
UNICODE_BALLOT_X = 0x2717,
UNICODE_HEAVY_BALLOT_X = 0x2718,
UNICODE_VARIATION_SELECTOR_1 = 0xFE00,
UNICODE_VARIATION_SELECTOR_2 = 0xFE01,
UNICODE_VARIATION_SELECTOR_3 = 0xFE02,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment