Commit 9c7f945a authored by rswindell's avatar rswindell
Browse files

Decided to use UNICODE code points rather than UTF-8 sequences for faster
UNICODE -> CP437 conversions (no string/memory compares needed), so
cp437_utf8_tbl.* is being replaced by cp437_unicode_tbl.*.

Added utf8 module which includes:
- utf8_normalize_str() - normalize US-ASCIIZ string, from mailsrvr.c
- utf8_putc() - encoder / length calculator
- utf8_getc() - decoder

The latter 2 were swiped from OpenSSL's a_utf8.c.
parent d2df590f
/* Synchronet CP437 <-> UNICODE translation table */
/* $Id$ */
/****************************************************************************
* @format.tab-size 4 (Plain Text/Source Code File Header) *
* @format.use-tabs true (see http://www.synchro.net/ptsc_hdr.html) *
* *
* Copyright Rob Swindell - http://www.synchro.net/copyright.html *
* *
* This library is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public License *
* as published by the Free Software Foundation; either version 2 *
* of the License, or (at your option) any later version. *
* See the GNU Lesser General Public License for more details: lgpl.txt or *
* http://www.fsf.org/copyleft/lesser.html *
* *
* Anonymous FTP access to the most recent released source is available at *
* ftp://vert.synchro.net, ftp://cvs.synchro.net and ftp://ftp.synchro.net *
* *
* Anonymous CVS access to the development source and modification history *
* is available at cvs.synchro.net:/cvsroot/sbbs, example: *
* cvs -d :pserver:anonymous@cvs.synchro.net:/cvsroot/sbbs login *
* (just hit return, no password is necessary) *
* cvs -d :pserver:anonymous@cvs.synchro.net:/cvsroot/sbbs checkout src *
* *
* For Synchronet coding style and modification guidelines, see *
* http://www.synchro.net/source.html *
* *
* You are encouraged to submit any modifications (preferably in Unix diff *
* format) via e-mail to mods@synchro.net *
* *
* Note: If this box doesn't appear square, then you need to fix your tabs. *
****************************************************************************/
#include "cp437_unicode_tbl.h"
// Want UNICODE encodings of terminal control characters?
// When USE_UNICODE_FOR_TERM_CTRL_CHARS is defined, the entries wrapped in
// UNICODE_TERM_CTRL_CHAR_CODE() hold the code point of the CP437 glyph;
// otherwise those entries are 0.
#if defined USE_UNICODE_FOR_TERM_CTRL_CHARS
# define UNICODE_TERM_CTRL_CHAR_CODE(x) x
#else
# define UNICODE_TERM_CTRL_CHAR_CODE(x) 0
#endif
// CP437 character to/from UNICODE code point conversion
// The CP437 character value is the index into the table (256 entries).
// If the value at that index is 0, no translation is needed (1:1 mapping).
uint32_t cp437_unicode_tbl[] =
{
	/* 0x00 */ 0,
	/* 0x01 */ 0x263A,
	/* 0x02 */ 0x263B,
	/* 0x03 */ 0x2665,
	/* 0x04 */ 0x2666,
	/* 0x05 */ 0x2663,
	/* 0x06 */ 0x2660,
	/* 0x07 '\a' */ UNICODE_TERM_CTRL_CHAR_CODE(0x2022),
	/* 0x08 '\b' */ UNICODE_TERM_CTRL_CHAR_CODE(0x25D8),
	/* 0x09 '\t' */ UNICODE_TERM_CTRL_CHAR_CODE(0x25CB),
	/* 0x0A '\n' */ UNICODE_TERM_CTRL_CHAR_CODE(0x25D9),
	/* 0x0B */ 0x2642,
	/* 0x0C '\f' */ UNICODE_TERM_CTRL_CHAR_CODE(0x2640),
	/* 0x0D '\r' */ UNICODE_TERM_CTRL_CHAR_CODE(0x266A),
	/* 0x0E */ 0x266B,
	/* 0x0F */ 0x263C,
	/* 0x10 */ 0x25BA,
	/* 0x11 */ 0x25C4,
	/* 0x12 */ 0x2195,
	/* 0x13 */ 0x203C,
	/* 0x14 */ 0x00B6,
	/* 0x15 */ 0x00A7,
	/* 0x16 */ 0x25AC,
	/* 0x17 */ 0x21A8,
	/* 0x18 */ 0x2191,
	/* 0x19 */ 0x2193,
	/* 0x1A */ 0x2192,
	/* 0x1B */ 0x2190,
	/* 0x1C */ 0x221F,
	/* 0x1D */ 0x2194,
	/* 0x1E */ 0x25B2,
	/* 0x1F */ 0x25BC,
	/* 0x20-0x7F (1:1 with US-ASCII and CP437), 16 entries per row */
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 */ 0x00C7,
	/* 0x81 */ 0x00FC,
	/* 0x82 */ 0x00E9,
	/* 0x83 */ 0x00E2,
	/* 0x84 */ 0x00E4,
	/* 0x85 */ 0x00E0,
	/* 0x86 */ 0x00E5,
	/* 0x87 */ 0x00E7,
	/* 0x88 */ 0x00EA,
	/* 0x89 */ 0x00EB,
	/* 0x8A */ 0x00E8,
	/* 0x8B */ 0x00EF,
	/* 0x8C */ 0x00EE,
	/* 0x8D */ 0x00EC,
	/* 0x8E */ 0x00C4,
	/* 0x8F */ 0x00C5,
	/* 0x90 */ 0x00C9,
	/* 0x91 */ 0x00E6,
	/* 0x92 */ 0x00C6,
	/* 0x93 */ 0x00F4,
	/* 0x94 */ 0x00F6,
	/* 0x95 */ 0x00F2,
	/* 0x96 */ 0x00FB,	// LATIN SMALL LETTER U WITH CIRCUMFLEX (was 0x00F8, which is not a CP437 character)
	/* 0x97 */ 0x00F9,
	/* 0x98 */ 0x00FF,
	/* 0x99 */ 0x00D6,
	/* 0x9A */ 0x00DC,
	/* 0x9B */ 0x00A2,
	/* 0x9C */ 0x00A3,
	/* 0x9D */ 0x00A5,
	/* 0x9E */ 0x20A7,
	/* 0x9F */ 0x0192,
	/* 0xA0 */ 0x00E1,
	/* 0xA1 */ 0x00ED,
	/* 0xA2 */ 0x00F3,
	/* 0xA3 */ 0x00FA,
	/* 0xA4 */ 0x00F1,
	/* 0xA5 */ 0x00D1,
	/* 0xA6 */ 0x00AA,
	/* 0xA7 */ 0x00BA,
	/* 0xA8 */ 0x00BF,
	/* 0xA9 */ 0x2310,
	/* 0xAA */ 0x00AC,
	/* 0xAB */ 0x00BD,
	/* 0xAC */ 0x00BC,
	/* 0xAD */ 0x00A1,
	/* 0xAE */ 0x00AB,
	/* 0xAF */ 0x00BB,
	/* 0xB0 */ 0x2591,
	/* 0xB1 */ 0x2592,
	/* 0xB2 */ 0x2593,
	/* 0xB3 */ 0x2502,
	/* 0xB4 */ 0x2524,
	/* 0xB5 */ 0x2561,
	/* 0xB6 */ 0x2562,
	/* 0xB7 */ 0x2556,
	/* 0xB8 */ 0x2555,
	/* 0xB9 */ 0x2563,
	/* 0xBA */ 0x2551,
	/* 0xBB */ 0x2557,
	/* 0xBC */ 0x255D,
	/* 0xBD */ 0x255C,
	/* 0xBE */ 0x255B,
	/* 0xBF */ 0x2510,
	/* 0xC0 */ 0x2514,
	/* 0xC1 */ 0x2534,
	/* 0xC2 */ 0x252C,
	/* 0xC3 */ 0x251C,
	/* 0xC4 */ 0x2500,
	/* 0xC5 */ 0x253C,
	/* 0xC6 */ 0x255E,
	/* 0xC7 */ 0x255F,
	/* 0xC8 */ 0x255A,
	/* 0xC9 */ 0x2554,
	/* 0xCA */ 0x2569,
	/* 0xCB */ 0x2566,
	/* 0xCC */ 0x2560,
	/* 0xCD */ 0x2550,
	/* 0xCE */ 0x256C,
	/* 0xCF */ 0x2567,
	/* 0xD0 */ 0x2568,
	/* 0xD1 */ 0x2564,
	/* 0xD2 */ 0x2565,
	/* 0xD3 */ 0x2559,
	/* 0xD4 */ 0x2558,
	/* 0xD5 */ 0x2552,
	/* 0xD6 */ 0x2553,
	/* 0xD7 */ 0x256B,
	/* 0xD8 */ 0x256A,
	/* 0xD9 */ 0x2518,
	/* 0xDA */ 0x250C,
	/* 0xDB */ 0x2588,
	/* 0xDC */ 0x2584,
	/* 0xDD */ 0x258C,
	/* 0xDE */ 0x2590,
	/* 0xDF */ 0x2580,
	/* 0xE0 */ 0x03B1,
	/* 0xE1 */ 0x00DF,
	/* 0xE2 */ 0x0393,
	/* 0xE3 */ 0x03C0,
	/* 0xE4 */ 0x03A3,
	/* 0xE5 */ 0x03C3,
	/* 0xE6 */ 0x00B5,
	/* 0xE7 */ 0x03C4,
	/* 0xE8 */ 0x03A6,
	/* 0xE9 */ 0x0398,
	/* 0xEA */ 0x03A9,
	/* 0xEB */ 0x03B4,
	/* 0xEC */ 0x221E,
	/* 0xED */ 0x03C6,
	/* 0xEE */ 0x03B5,
	/* 0xEF */ 0x2229,
	/* 0xF0 */ 0x2261,
	/* 0xF1 */ 0x00B1,
	/* 0xF2 */ 0x2265,
	/* 0xF3 */ 0x2264,
	/* 0xF4 */ 0x2320,
	/* 0xF5 */ 0x2321,
	/* 0xF6 */ 0x00F7,
	/* 0xF7 */ 0x2248,
	/* 0xF8 */ 0x00B0,
	/* 0xF9 */ 0x2219,
	/* 0xFA */ 0x00B7,
	/* 0xFB */ 0x221A,
	/* 0xFC */ 0x207F,
	/* 0xFD */ 0x00B2,
	/* 0xFE */ 0xFFED, //0x25A0, (HALFWIDTH BLACK SQUARE used here; the conventional CP437 mapping is BLACK SQUARE)
	/* 0xFF */ 0x00A0
};
/* Synchronet CP437 <-> UNICODE translation table */
/* $Id$ */
/****************************************************************************
* @format.tab-size 4 (Plain Text/Source Code File Header) *
* @format.use-tabs true (see http://www.synchro.net/ptsc_hdr.html) *
* *
* Copyright Rob Swindell - http://www.synchro.net/copyright.html *
* *
* This library is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public License *
* as published by the Free Software Foundation; either version 2 *
* of the License, or (at your option) any later version. *
* See the GNU Lesser General Public License for more details: lgpl.txt or *
* http://www.fsf.org/copyleft/lesser.html *
* *
* Anonymous FTP access to the most recent released source is available at *
* ftp://vert.synchro.net, ftp://cvs.synchro.net and ftp://ftp.synchro.net *
* *
* Anonymous CVS access to the development source and modification history *
* is available at cvs.synchro.net:/cvsroot/sbbs, example: *
* cvs -d :pserver:anonymous@cvs.synchro.net:/cvsroot/sbbs login *
* (just hit return, no password is necessary) *
* cvs -d :pserver:anonymous@cvs.synchro.net:/cvsroot/sbbs checkout src *
* *
* For Synchronet coding style and modification guidelines, see *
* http://www.synchro.net/source.html *
* *
* You are encouraged to submit any modifications (preferably in Unix diff *
* format) via e-mail to mods@synchro.net *
* *
* Note: If this box doesn't appear square, then you need to fix your tabs. *
****************************************************************************/
#ifndef CP437_UNICODE_TBL_H_
#define CP437_UNICODE_TBL_H_
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
// CP437 -> UNICODE code point table, indexed by CP437 character value.
// A value of 0 means no translation is needed (1:1 mapping).
// Declared 'extern' so that including this header only declares the table;
// without it each including C source file would get its own tentative
// definition of an incomplete array.
extern uint32_t cp437_unicode_tbl[];
#ifdef __cplusplus
}
#endif
#endif // Don't add anything after this line
......@@ -6,7 +6,8 @@
# OBJODIR, DIRSEP, and OFILE must be pre-defined
OBJS = $(OBJODIR)$(DIRSEP)cp437_utf8_tbl$(OFILE) \
OBJS = $(OBJODIR)$(DIRSEP)cp437_unicode_tbl$(OFILE) \
$(OBJODIR)$(DIRSEP)utf8$(OFILE) \
$(OBJODIR)$(DIRSEP)uucode$(OFILE) \
$(OBJODIR)$(DIRSEP)yenc$(OFILE) \
$(OBJODIR)$(DIRSEP)lzh$(OFILE) \
......
/* Synchronet UTF-8 encode/decode/translate functions */
/* $Id$ */
/****************************************************************************
* @format.tab-size 4 (Plain Text/Source Code File Header) *
* @format.use-tabs true (see http://www.synchro.net/ptsc_hdr.html) *
* *
* Copyright Rob Swindell - http://www.synchro.net/copyright.html *
* *
* This library is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public License *
* as published by the Free Software Foundation; either version 2 *
* of the License, or (at your option) any later version. *
* See the GNU Lesser General Public License for more details: lgpl.txt or *
* http://www.fsf.org/copyleft/lesser.html *
* *
* Anonymous FTP access to the most recent released source is available at *
* ftp://vert.synchro.net, ftp://cvs.synchro.net and ftp://ftp.synchro.net *
* *
* Anonymous CVS access to the development source and modification history *
* is available at cvs.synchro.net:/cvsroot/sbbs, example: *
* cvs -d :pserver:anonymous@cvs.synchro.net:/cvsroot/sbbs login *
* (just hit return, no password is necessary) *
* cvs -d :pserver:anonymous@cvs.synchro.net:/cvsroot/sbbs checkout src *
* *
* For Synchronet coding style and modification guidelines, see *
* http://www.synchro.net/source.html *
* *
* You are encouraged to submit any modifications (preferably in Unix diff *
* format) via e-mail to mods@synchro.net *
* *
* Note: If this box doesn't appear square, then you need to fix your tabs. *
****************************************************************************/
#include "utf8.h"
// Rewrites 'str' in place, replacing selected UTF-8 encoded punctuation,
// spaces and fullwidth forms with US-ASCII equivalents and dropping
// zero-width characters. Byte sequences that are not recognized are copied
// through unchanged. The result is NUL-terminated and 'str' is returned.
char* utf8_normalize_str(char* str)
{
	char* out = str;	// write cursor (never ahead of the read cursor)
	char* in = str;		// read cursor

	while(*in != 0) {
		char ch = *in;	// byte to emit; defaults to a plain copy
		int count = 1;	// number of times to emit 'ch' (0 = drop the sequence)
		int extra = 0;	// trailing bytes consumed by a recognized sequence

		if(in[0] == '\xC2') {
			if(in[1] == '\xA0') {	// NO-BREAK SPACE
				ch = ' ';
				extra = 1;
			}
		}
		else if(in[0] == '\xE2') {
			switch(in[1]) {
				case '\x80':
					switch(in[2]) {
						case '\x82':	// EN SPACE
						case '\x83':	// EM SPACE
							ch = ' ';
							extra = 2;
							break;
						case '\x8B':	// ZERO WIDTH SPACE
						case '\x8C':	// ZERO WIDTH NON-JOINER
						case '\x8D':	// ZERO WIDTH JOINER
							count = 0;
							extra = 2;
							break;
						case '\x90':	// HYPHEN
						case '\x91':	// NON-BREAKING HYPHEN
						case '\x92':	// FIGURE DASH
						case '\x93':	// EN DASH
							ch = '-';
							extra = 2;
							break;
						case '\x98':	// LEFT SINGLE QUOTATION MARK
							ch = '`';
							extra = 2;
							break;
						case '\x99':	// RIGHT SINGLE QUOTATION MARK
						case '\xB2':	// PRIME
							ch = '\'';
							extra = 2;
							break;
						case '\x9C':	// LEFT DOUBLE QUOTATION MARK
						case '\x9D':	// RIGHT DOUBLE QUOTATION MARK
							ch = '"';
							extra = 2;
							break;
						case '\xA6':	// HORIZONTAL ELLIPSIS -> ASCII periods (3)
							ch = '.';
							count = 3;
							extra = 2;
							break;
					}
					break;
				case '\x81':
					switch(in[2]) {
						case '\x83':	// HYPHEN BULLET
							ch = '-';
							extra = 2;
							break;
						case '\x84':	// FRACTION SLASH
							ch = '/';
							extra = 2;
							break;
					}
					break;
				case '\x88':
					if(in[2] == '\x92') {	// MINUS SIGN
						ch = '-';
						extra = 2;
					}
					break;
			}
		}
		else if(in[0] == '\xEF') {
			switch(in[1]) {
				case '\xBB':
					if(in[2] == '\xBF') {	// Zero Width No-Break Space (BOM, ZWNBSP)
						count = 0;
						extra = 2;
					}
					break;
				case '\xBC':
					// FULLWIDTH EXCLAMATION MARK through FULLWIDTH LOW LINE
					if(in[2] >= '\x81' && in[2] <= '\xBF') {
						// The (already consumed) third byte is overwritten with its offset
						in[2] -= '\x81';
						ch = '!' + in[2];
						extra = 2;
					}
					break;
				case '\xBD':
					// FULLWIDTH GRAVE ACCENT through FULLWIDTH TILDE
					if(in[2] >= '\x80' && in[2] <= '\x9E') {
						// The (already consumed) third byte is overwritten with its offset
						in[2] -= '\x80';
						ch = '`' + in[2];
						extra = 2;
					}
					break;
			}
		}

		for(int n = 0; n < count; n++)
			*out++ = ch;
		in += extra + 1;
	}
	*out = 0;
	return str;
}
// From openssl/crypto/asn1/a_utf8.c:
/*
* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
/* UTF8 utilities */
/*-
 * This parses a UTF8 string one character at a time. It is passed a pointer
 * to the string ('str') and the length of the string in bytes ('len'). On
 * success it stores the decoded character value in '*val' and returns the
 * number of bytes read (1-6). '*val' is not written on any other return.
 *
 * Returns 0 if 'len' is 0, or a negative error code:
 * -1 = string too short
 * -2 = illegal character (first byte is not a valid lead byte)
 * -3 = subsequent characters not of the form 10xxxxxx
 * -4 = character encoded incorrectly (not minimal length).
 *
 * Note: the legacy 5 and 6 byte forms are accepted and no check is made for
 * surrogate code points.
 */
int utf8_getc(const char *str, size_t len, uint32_t* val)
{
	const unsigned char *p;
	unsigned long value;
	int ret;

	if (len == 0)
		return 0;
	/* Bytes are examined as unsigned; the cast is required because pointers
	 * to 'char' and 'unsigned char' do not convert implicitly. */
	p = (const unsigned char *)str;

	/* Check syntax and work out the encoded value (if correct) */
	if ((*p & 0x80) == 0) {
		value = *p++ & 0x7f;
		ret = 1;
	} else if ((*p & 0xe0) == 0xc0) {
		if (len < 2)
			return -1;
		if ((p[1] & 0xc0) != 0x80)
			return -3;
		value = (*p++ & 0x1f) << 6;
		value |= *p++ & 0x3f;
		if (value < 0x80)
			return -4;
		ret = 2;
	} else if ((*p & 0xf0) == 0xe0) {
		if (len < 3)
			return -1;
		if (((p[1] & 0xc0) != 0x80)
			|| ((p[2] & 0xc0) != 0x80))
			return -3;
		value = (*p++ & 0xf) << 12;
		value |= (*p++ & 0x3f) << 6;
		value |= *p++ & 0x3f;
		if (value < 0x800)
			return -4;
		ret = 3;
	} else if ((*p & 0xf8) == 0xf0) {
		if (len < 4)
			return -1;
		if (((p[1] & 0xc0) != 0x80)
			|| ((p[2] & 0xc0) != 0x80)
			|| ((p[3] & 0xc0) != 0x80))
			return -3;
		value = ((unsigned long)(*p++ & 0x7)) << 18;
		value |= (*p++ & 0x3f) << 12;
		value |= (*p++ & 0x3f) << 6;
		value |= *p++ & 0x3f;
		if (value < 0x10000)
			return -4;
		ret = 4;
	} else if ((*p & 0xfc) == 0xf8) {
		if (len < 5)
			return -1;
		if (((p[1] & 0xc0) != 0x80)
			|| ((p[2] & 0xc0) != 0x80)
			|| ((p[3] & 0xc0) != 0x80)
			|| ((p[4] & 0xc0) != 0x80))
			return -3;
		value = ((unsigned long)(*p++ & 0x3)) << 24;
		value |= ((unsigned long)(*p++ & 0x3f)) << 18;
		value |= ((unsigned long)(*p++ & 0x3f)) << 12;
		value |= (*p++ & 0x3f) << 6;
		value |= *p++ & 0x3f;
		if (value < 0x200000)
			return -4;
		ret = 5;
	} else if ((*p & 0xfe) == 0xfc) {
		if (len < 6)
			return -1;
		if (((p[1] & 0xc0) != 0x80)
			|| ((p[2] & 0xc0) != 0x80)
			|| ((p[3] & 0xc0) != 0x80)
			|| ((p[4] & 0xc0) != 0x80)
			|| ((p[5] & 0xc0) != 0x80))
			return -3;
		value = ((unsigned long)(*p++ & 0x1)) << 30;
		value |= ((unsigned long)(*p++ & 0x3f)) << 24;
		value |= ((unsigned long)(*p++ & 0x3f)) << 18;
		value |= ((unsigned long)(*p++ & 0x3f)) << 12;
		value |= (*p++ & 0x3f) << 6;
		value |= *p++ & 0x3f;
		if (value < 0x4000000)
			return -4;
		ret = 6;
	} else
		return -2;
	/* The largest decodable value is 31 bits wide, so it fits in uint32_t */
	*val = (uint32_t)value;
	return ret;
}
/*
* This takes a character 'value' and writes the UTF8 encoded value in 'str'
* where 'str' is a buffer containing 'len' characters. Returns the number of
* characters written or -1 if 'len' is too small. 'str' can be set to NULL
* in which case it just returns the number of characters. It will need at