Commit 550bdffc authored by rswindell's avatar rswindell
Browse files

Moved cp437_unicode_tbl and unicode_is_zerowidth() to (new file) unicode.c.

New function (derived from sbbs_t::utf8_to_cp437()): unicode_to_cp437()
New utf8 functions: utf8_replace_chars(), utf8_str_is_valid().
utf8_getc() enhancement: val arg may be NULL (for length/validation uses).
parent 2d0f0753
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
# OBJODIR, DIRSEP, and OFILE must be pre-defined # OBJODIR, DIRSEP, and OFILE must be pre-defined
OBJS = $(OBJODIR)$(DIRSEP)cp437_unicode_tbl$(OFILE) \ OBJS = $(OBJODIR)$(DIRSEP)unicode$(OFILE) \
$(OBJODIR)$(DIRSEP)utf8$(OFILE) \ $(OBJODIR)$(DIRSEP)utf8$(OFILE) \
$(OBJODIR)$(DIRSEP)uucode$(OFILE) \ $(OBJODIR)$(DIRSEP)uucode$(OFILE) \
$(OBJODIR)$(DIRSEP)yenc$(OFILE) \ $(OBJODIR)$(DIRSEP)yenc$(OFILE) \
......
...@@ -34,6 +34,8 @@ ...@@ -34,6 +34,8 @@
****************************************************************************/ ****************************************************************************/
#include "utf8.h" #include "utf8.h"
#include <stdbool.h>
#include <string.h>
char* utf8_normalize_str(char* str) char* utf8_normalize_str(char* str)
{ {
...@@ -136,6 +138,86 @@ char* utf8_normalize_str(char* str) ...@@ -136,6 +138,86 @@ char* utf8_normalize_str(char* str)
return str; return str;
} }
/*
 * Returns true when codepoint 'u' is one of the zero-width characters
 * recognized here: U+200B..U+200D, U+FE00..U+FE0F or U+FEFF.
 * Every other value yields false.
 */
static bool unicode_is_zerowidth(uint32_t u)
{
	if(u >= 0x200B && u <= 0x200D)	// ZERO WIDTH SPACE / NON-JOINER / JOINER
		return true;
	if(u >= 0xFE00 && u <= 0xFE0F)	// VARIATION SELECTOR-1 through SELECTOR-16
		return true;
	return u == 0xFEFF;				// ZERO WIDTH NO-BREAK SPACE
}
/*
 * Rewrites 'str' in place so that it contains no multi-byte UTF-8 sequences.
 * Bytes without the high bit set are copied through unchanged.
 *
 * For each byte with the high bit set, a sequence is decoded with utf8_getc():
 *  - if decoding yields fewer than 2 bytes (including an error return), that
 *    single byte is replaced by 'error_ch' (or dropped when 'error_ch' is 0);
 *  - otherwise, if 'lookup' is non-NULL and returns non-zero for the decoded
 *    codepoint, the returned char replaces the whole sequence;
 *  - otherwise the sequence is replaced by 'unsupported_zwch' when the
 *    codepoint is zero-width, or by 'unsupported_ch' when it is not
 *    (a replacement of 0 drops the sequence).
 *
 * The output is never longer than the input. Returns 'str'.
 */
char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_ch, char unsupported_zwch, char error_ch)
{
	char* const	limit = str + strlen(str);
	char*		in = str;
	char*		out = str;

	while(in < limit) {
		if((*in & 0x80) == 0) {		// plain 7-bit char: keep as-is
			*out++ = *in++;
			continue;
		}

		uint32_t	cp = 0;
		int			seqlen = utf8_getc(in, limit - in, &cp);

		if(seqlen < 2) {			// undecodable: consume just this one byte
			if(error_ch != 0)
				*out++ = error_ch;
			in++;
			continue;
		}
		in += seqlen;

		char replacement = 0;
		if(lookup != NULL)
			replacement = lookup(cp);
		if(replacement == 0)		// no mapping: fall back to the caller's placeholders
			replacement = unicode_is_zerowidth(cp) ? unsupported_zwch : unsupported_ch;
		if(replacement != 0)
			*out++ = replacement;
	}
	*out = 0;
	return str;
}
/*
 * Returns true when every byte of the NUL-terminated string 'str' belongs to
 * a sequence that utf8_getc() decodes successfully (an empty string is valid).
 * Returns false as soon as utf8_getc() reports a length below 1.
 */
bool utf8_str_is_valid(const char* str)
{
	const char* const limit = str + strlen(str);

	for(const char* cursor = str; cursor < limit; ) {
		// NULL codepoint pointer: only the sequence length is needed
		const int seqlen = utf8_getc(cursor, limit - cursor, NULL);
		if(seqlen <= 0)
			return false;
		cursor += seqlen;
	}
	return true;
}
// From openssl/crypto/asn1/a_utf8.c: // From openssl/crypto/asn1/a_utf8.c:
/* /*
* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
...@@ -244,7 +326,8 @@ int utf8_getc(const char *str, size_t len, uint32_t* val) ...@@ -244,7 +326,8 @@ int utf8_getc(const char *str, size_t len, uint32_t* val)
ret = 6; ret = 6;
} else } else
return -2; return -2;
*val = value; if(val != NULL)
*val = value;
return ret; return ret;
} }
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#define UTF8_H_ #define UTF8_H_
#include <stdint.h> #include <stdint.h>
#include <stdbool.h>
#include <stdlib.h> #include <stdlib.h>
#define UTF8_MAX_LEN 6 // Longest possible UTF-8 sequence #define UTF8_MAX_LEN 6 // Longest possible UTF-8 sequence
...@@ -45,8 +46,13 @@ ...@@ -45,8 +46,13 @@
extern "C" { extern "C" {
#endif #endif
// Returns true if the string is valid UTF-8
bool utf8_str_is_valid(const char*);
// Normalizes (to ASCII) chars in UTF-8 string 'str', in-place, resulting in string <= original in length // Normalizes (to ASCII) chars in UTF-8 string 'str', in-place, resulting in string <= original in length
char* utf8_normalize_str(char* str); char* utf8_normalize_str(char* str);
// Replace or strip multi-byte UTF-8 sequences in str (in-place)
// If 'lookup' is non-NULL, it is called with each decoded codepoint; a non-zero return value replaces the sequence
char* utf8_replace_chars(char* str, char (*lookup)(uint32_t), char unsupported_ch, char unsupported_zwch, char error_ch);
// Decode a UTF-8 sequence to a UNICODE code point // Decode a UTF-8 sequence to a UNICODE code point
int utf8_getc(const char* str, size_t len, uint32_t* codepoint); int utf8_getc(const char* str, size_t len, uint32_t* codepoint);
// Encode a UNICODE code point into a UTF-8 sequence (str) // Encode a UNICODE code point into a UTF-8 sequence (str)
......
/* Synchronet CP437 <-> UNICODE translation table */ /* Synchronet Unicode encode/decode/translate functions */
/* $Id$ */ /* $Id$ */
...@@ -33,7 +33,7 @@ ...@@ -33,7 +33,7 @@
* Note: If this box doesn't appear square, then you need to fix your tabs. * * Note: If this box doesn't appear square, then you need to fix your tabs. *
****************************************************************************/ ****************************************************************************/
#include "cp437_unicode_tbl.h" #include "unicode.h"
// Want UNICODE encodings of terminal control characters? // Want UNICODE encodings of terminal control characters?
#if defined USE_UNICODE_FOR_TERM_CTRL_CHARS #if defined USE_UNICODE_FOR_TERM_CTRL_CHARS
...@@ -305,3 +305,220 @@ uint32_t cp437_unicode_tbl[] = ...@@ -305,3 +305,220 @@ uint32_t cp437_unicode_tbl[] =
/* 0xFE */ 0xFFED, //0x25A0, /* 0xFE */ 0xFFED, //0x25A0,
/* 0xFF */ 0x00A0 /* 0xFF */ 0x00A0
}; };
/*
 * Tells whether codepoint 'u' is treated as zero-width.
 * The recognized set is U+200B..U+200D (zero width space, non-joiner, joiner),
 * U+FE00..U+FE0F (variation selectors 1-16) and U+FEFF (zero width no-break
 * space). Returns false for anything else.
 */
bool unicode_is_zerowidth(uint32_t u)
{
	const bool zw_space_or_joiner = (u >= 0x200B && u <= 0x200D);
	const bool variation_selector = (u >= 0xFE00 && u <= 0xFE0F);
	const bool zw_no_break_space  = (u == 0xFEFF);

	return zw_space_or_joiner || variation_selector || zw_no_break_space;
}
/*
 * Maps a Unicode codepoint to an approximate single-byte CP437 (or plain
 * ASCII) stand-in. Covers a handful of punctuation marks plus the box-drawing
 * and block-element codepoints listed below.
 * Returns '\0' when the codepoint has no stand-in in this function.
 */
char unicode_to_cp437(uint32_t codepoint)
{
	/* Inclusive codepoint runs that all share one replacement char */
	static const struct {
		uint32_t	first;
		uint32_t	last;
		char		ch;
	} runs[] = {
		{ 0x00B4, 0x00B4, '\''   },	// ACUTE ACCENT
		{ 0x00CD, 0x00CD, '\xA1' },	// LATIN CAPITAL LETTER I WITH ACUTE (shown as lower-case i with acute)
		{ 0x2014, 0x2014, '\xC4' },	// EM DASH
		{ 0x2022, 0x2022, '\xF9' },	// BULLET
		{ 0x203E, 0x203E, '\xC4' },	// OVERLINE
		{ 0x2500, 0x2501, '\xC4' },	// Box Drawings Light/Heavy Horizontal
		{ 0x2502, 0x2503, '\xB3' },	// Box Drawings Light/Heavy Vertical
		{ 0x2504, 0x2505, '\xC4' },	// Box Drawings Light/Heavy Triple Dash Horizontal
		{ 0x2506, 0x2507, '\xB3' },	// Box Drawings Light/Heavy Triple Dash Vertical
		{ 0x2508, 0x2509, '\xC4' },	// Box Drawings Light/Heavy Quadruple Dash Horizontal
		{ 0x250A, 0x250B, '\xB3' },	// Box Drawings Light/Heavy Quadruple Dash Vertical
		{ 0x250C, 0x250F, '\xDA' },	// Box Drawings Down and Right (light..heavy)
		{ 0x2510, 0x2513, '\xBF' },	// Box Drawings Down and Left (light..heavy)
		{ 0x2514, 0x2517, '\xC0' },	// Box Drawings Up and Right (light..heavy)
		{ 0x2518, 0x251B, '\xD9' },	// Box Drawings Up and Left (light..heavy)
		{ 0x251C, 0x2523, '\xC3' },	// Box Drawings Vertical and Right (light..heavy)
		{ 0x2524, 0x252B, '\xB4' },	// Box Drawings Vertical and Left (light..heavy)
		{ 0x252C, 0x2533, '\xC2' },	// Box Drawings Down and Horizontal (light..heavy)
		{ 0x2534, 0x253B, '\xC1' },	// Box Drawings Up and Horizontal (light..heavy)
		{ 0x253C, 0x254B, '\xC5' },	// Box Drawings Vertical and Horizontal (light..heavy)
		{ 0x254C, 0x254D, '\xC4' },	// Box Drawings Light/Heavy Double Dash Horizontal
		{ 0x254E, 0x254F, '|'    },	// Box Drawings Light/Heavy Double Dash Vertical
		{ 0x256D, 0x256D, '\xDA' },	// Box Drawings Light Arc Down and Right
		{ 0x256E, 0x256E, '\xBF' },	// Box Drawings Light Arc Down and Left
		{ 0x256F, 0x256F, '\xD9' },	// Box Drawings Light Arc Up and Left
		{ 0x2570, 0x2570, '\xC0' },	// Box Drawings Light Arc Up and Right
		{ 0x2571, 0x2571, '/'    },	// Box Drawings Light Diagonal Upper Right to Lower Left
		{ 0x2572, 0x2572, '\\'   },	// Box Drawings Light Diagonal Upper Left to Lower Right
		{ 0x2573, 0x2573, 'X'    },	// Box Drawings Light Diagonal Cross
		{ 0x2581, 0x2581, '_'    },	// Lower One Eighth Block
		{ 0x2582, 0x2583, '\x16' },	// Lower One Quarter / Three Eighths Block
		{ 0x2585, 0x2587, '\xDC' },	// Lower Five Eighths .. Seven Eighths Block
		{ 0x2588, 0x2589, '\xDB' },	// Full Block, Left Seven Eighths Block
		{ 0x258A, 0x258F, '\xDD' },	// Left Three Quarters .. Left One Eighth Block
		{ 0x2590, 0x2590, '\xDE' },	// Right Half Block
		{ 0x2594, 0x2594, '\xDF' },	// Upper One Eighth Block
		{ 0x2595, 0x2595, '\xDE' },	// Right One Eighth Block
	};

	/*
	 * U+2574..U+257F (half-line segments) alternate: even codepoints are
	 * horizontal pieces (left/right), odd codepoints are vertical (up/down).
	 */
	if(codepoint >= 0x2574 && codepoint <= 0x257F)
		return (codepoint & 1) ? '\xB3' : '\xC4';

	for(unsigned i = 0; i < sizeof runs / sizeof runs[0]; i++) {
		if(codepoint >= runs[i].first && codepoint <= runs[i].last)
			return runs[i].ch;
	}
	return '\0';	// Not-mapped
}
/* Synchronet CP437 <-> UNICODE translation table */ /* Synchronet Unicode encode/decode/translate functions */
/* $Id$ */ /* $Id$ */
/**************************************************************************** /****************************************************************************
* @format.tab-size 4 (Plain Text/Source Code File Header) * * @format.tab-size 4 (Plain Text/Source Code File Header) *
* @format.use-tabs true (see http://www.synchro.net/ptsc_hdr.html) * * @format.use-tabs true (see http://www.synchro.net/ptsc_hdr.html) *
* * * *
* Copyright Rob Swindell - http://www.synchro.net/copyright.html * * Copyright Rob Swindell - http://www.synchro.net/copyright.html *
* * * *
* This library is free software; you can redistribute it and/or * * This library is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public License * * modify it under the terms of the GNU Lesser General Public License *
* as published by the Free Software Foundation; either version 2 * * as published by the Free Software Foundation; either version 2 *
* of the License, or (at your option) any later version. * * of the License, or (at your option) any later version. *
* See the GNU Lesser General Public License for more details: lgpl.txt or * * See the GNU Lesser General Public License for more details: lgpl.txt or *
* http://www.fsf.org/copyleft/lesser.html * * http://www.fsf.org/copyleft/lesser.html *
* * * *
* Anonymous FTP access to the most recent released source is available at * * Anonymous FTP access to the most recent released source is available at *
* ftp://vert.synchro.net, ftp://cvs.synchro.net and ftp://ftp.synchro.net * * ftp://vert.synchro.net, ftp://cvs.synchro.net and ftp://ftp.synchro.net *
* * * *
* Anonymous CVS access to the development source and modification history * * Anonymous CVS access to the development source and modification history *
* is available at cvs.synchro.net:/cvsroot/sbbs, example: * * is available at cvs.synchro.net:/cvsroot/sbbs, example: *
* cvs -d :pserver:anonymous@cvs.synchro.net:/cvsroot/sbbs login * * cvs -d :pserver:anonymous@cvs.synchro.net:/cvsroot/sbbs login *
* (just hit return, no password is necessary) * * (just hit return, no password is necessary) *
* cvs -d :pserver:anonymous@cvs.synchro.net:/cvsroot/sbbs checkout src * * cvs -d :pserver:anonymous@cvs.synchro.net:/cvsroot/sbbs checkout src *
* * * *
* For Synchronet coding style and modification guidelines, see * * For Synchronet coding style and modification guidelines, see *
* http://www.synchro.net/source.html * * http://www.synchro.net/source.html *
* * * *
* You are encouraged to submit any modifications (preferably in Unix diff * * You are encouraged to submit any modifications (preferably in Unix diff *
* format) via e-mail to mods@synchro.net * * format) via e-mail to mods@synchro.net *
* * * *
* Note: If this box doesn't appear square, then you need to fix your tabs. * * Note: If this box doesn't appear square, then you need to fix your tabs. *
****************************************************************************/ ****************************************************************************/
#ifndef CP437_UNICODE_TBL_H_ #ifndef UNICODE_H_
#define CP437_UNICODE_TBL_H_ #define UNICODE_H_
#include <stdint.h> #include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus #if defined(__cplusplus)
extern "C" { extern "C" {
#endif #endif
// CP437 -> Unicode codepoint table. This header only declares it; the single
// definition lives in the accompanying .c file. 'extern' is required: without
// it, each including file gets its own (tentative) definition of an array of
// unknown size, which is rejected by C++ compilers and produces duplicate
// symbols at link time when common symbols are disabled (-fno-common).
extern uint32_t cp437_unicode_tbl[];
bool unicode_is_zerowidth(uint32_t);
char unicode_to_cp437(uint32_t);
#ifdef __cplusplus #if defined(__cplusplus)
} }
#endif #endif
#endif // Don't add anything after this line #endif // Don't add anything after this line
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment