From 1d23efcd8480b1021ca00fbb14db41b113a2f2b1 Mon Sep 17 00:00:00 2001
From: rswindell <>
Date: Mon, 8 Jul 2019 04:23:48 +0000
Subject: [PATCH] Add function cp437_to_utf8_str().

---
 src/encode/utf8.c | 38 ++++++++++++++++++++++++++++++++++++++
 src/encode/utf8.h | 15 +++++++++++++--
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/src/encode/utf8.c b/src/encode/utf8.c
index a210e6d0c9..94daa78948 100644
--- a/src/encode/utf8.c
+++ b/src/encode/utf8.c
@@ -34,6 +34,7 @@
  ****************************************************************************/
 
 #include "utf8.h"
+#include "unicode.h"
 #include <stdbool.h>
 #include <string.h>
 
@@ -218,6 +219,43 @@ bool utf8_str_is_valid(const char* str)
 	return true;
 }
 
+// Convert the NUL-terminated CP437 string 'str' to UTF-8, storing the result in 'dest'
+// At most 'maxlen' bytes are stored, followed by a NUL-terminator ('dest' must hold maxlen + 1 bytes)
+// Chars below 'minval' (and chars whose table entry is 0) are copied to 'dest' unconverted
+// Returns the number of bytes stored (sans NUL-terminator), or a negative value
+// when the output was truncated or utf8_putc() failed; 'dest' is NUL-terminated either way
+int cp437_to_utf8_str(const char* str, char* dest, size_t maxlen, unsigned char minval)
+{
+	int retval = 0;
+	size_t outlen = 0;
+	// Walk the input as unsigned bytes: plain 'char' may be signed and each byte indexes the table
+	for(const unsigned char* p = (const unsigned char*)str; *p != 0; p++) {
+		if(outlen >= maxlen) {	// Input remains, but 'dest' is full
+			retval = -1;
+			break;
+		}
+		uint32_t codepoint = 0;
+		if(*p >= minval)
+			codepoint = cp437_unicode_tbl[*p];
+		if(codepoint) {
+			int len = utf8_putc(dest + outlen, maxlen - outlen, codepoint);
+			if(len < 1) {	// utf8_putc() failed: never report this as success
+				retval = (len < 0) ? len : -1;
+				break;
+			}
+			outlen += len;
+		} else {
+			*(dest + outlen) = *p;
+			outlen++;
+		}
+	}
+	*(dest + outlen) = 0;
+	if(retval < 0)
+		return retval;
+	return (int)outlen;
+}
+
+
 // From openssl/crypto/asn1/a_utf8.c:
 /*
  * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
diff --git a/src/encode/utf8.h b/src/encode/utf8.h
index c959b4457e..2d3293cb17 100644
--- a/src/encode/utf8.h
+++ b/src/encode/utf8.h
@@ -48,13 +48,24 @@ extern "C" {
 
 // Returns true if the string is valid UTF-8
 bool utf8_str_is_valid(const char*);
+
 // Normalizes (to ASCII) chars in UTF-8 string 'str', in-place, resulting in string <= original in length
 char* utf8_normalize_str(char* str);
-// Replace or strip UTF-8 sequences in str
-// If table ('tbl') of unicode codepoints if non-NULL is an array of 256 codepoints to map to 8-bit chars
+
+// Replace or strip UTF-8 sequences in str (in-place)
+// 'lookup' is a Unicode codepoint look-up function (optional)
+// 'unsupported_ch' is the character used to replace unsupported Unicode codepoints (optional)
+// 'unsupported_zwch' is the character used to replace unsupported zero-width Unicode codepoints (optional)
+// 'error_ch' is the character used to replace invalid UTF-8 sequence bytes (optional)
 char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_ch, char unsupported_zwch, char error_ch);
+
+// Convert a CP437 char string (src) to a UTF-8 string (dest) of up to 'maxlen' bytes (sans NUL-terminator),
+// so 'dest' must hold at least maxlen + 1 bytes; chars with values below 'minval' are copied unconverted
+int cp437_to_utf8_str(const char* src, char* dest, size_t maxlen, unsigned char minval);
+
 // Decode a UTF-8 sequence to a UNICODE code point
 int utf8_getc(const char* str, size_t len, uint32_t* codepoint);
+
 // Encode a UNICODE code point into a UTF-8 sequence (str)
 int utf8_putc(char* str, size_t len, uint32_t codepoint);
 
-- 
GitLab