Skip to content
Snippets Groups Projects
Commit 2bca5489 authored by Rob Swindell's avatar Rob Swindell :speech_balloon:
Browse files

UTF-8 improvements

Don't corrupt UTF-8 strings with SAFECOPY() (use new SAFECOPY_UTF8).

Some terminals (notably Windows Terminal) display zero-width Unicode characters
as a single-column-wide space. <sigh> Auto-detect the terminal's zero-width "width"
(1 or 0) during connection and UTF-8 auto-detection.

getstr() now works a lot better with UTF-8 strings containing wide characters
(e.g. emojis), but there is likely much more to do.
parent 5cc3767e
No related branches found
No related tags found
No related merge requests found
Pipeline #5812 failed
......@@ -167,7 +167,7 @@ char* utf8_replace_chars(char* str, char (*lookup)(enum unicode_codepoint), char
continue;
}
}
if(unicode_width(codepoint) == 0) {
if(unicode_is_zerowidth(codepoint)) {
if(unsupported_zwch)
*dest++ = unsupported_zwch;
}
......@@ -191,7 +191,7 @@ bool utf8_str_is_valid(const char* str)
}
// Return the total printed-width of UTF-8 string (str) accounting for zero/half/full-width codepoints
size_t utf8_str_total_width(const char* str)
size_t utf8_str_total_width(const char* str, size_t zerowidth)
{
size_t count = 0;
const char* end = str + strlen(str);
......@@ -200,14 +200,14 @@ size_t utf8_str_total_width(const char* str)
int len = utf8_getc(str, end - str, &codepoint);
if (len < 1)
break;
count += unicode_width(codepoint);
count += unicode_width(codepoint, zerowidth);
str += len;
}
return count;
}
// Return the count of chars within the specified width range in UTF-8 string (str)
size_t utf8_str_count_width(const char* str, size_t min_width, size_t max_width)
size_t utf8_str_count_width(const char* str, size_t min_width, size_t max_width, size_t zerowidth)
{
size_t count = 0;
const char* end = str + strlen(str);
......@@ -216,7 +216,7 @@ size_t utf8_str_count_width(const char* str, size_t min_width, size_t max_width)
int len = utf8_getc(str, end - str, &codepoint);
if (len < 1)
break;
size_t width = unicode_width(codepoint);
size_t width = unicode_width(codepoint, zerowidth);
if(width >= min_width && width <= max_width)
count++;
str += len;
......
......@@ -39,10 +39,10 @@ int utf8_decode_firstbyte(char ch);
bool utf8_str_is_valid(const char*);
// Returns the fixed printed-width of the UTF-8 string
size_t utf8_str_total_width(const char*);
size_t utf8_str_total_width(const char*, size_t zerowidth);
// Return the count of chars within the specified width range in UTF-8 string (str)
size_t utf8_str_count_width(const char*, size_t min_width, size_t max_width);
size_t utf8_str_count_width(const char*, size_t min_width, size_t max_width, size_t zerowidth);
// Like strlcpy(), but doesn't leave a partial UTF-8 sequence at the end of dst
size_t utf8_strlcpy(char* dst, const char* src, size_t size);
......
......@@ -501,8 +501,10 @@ bool sbbs_t::answer()
if(x >= TERM_COLS_MIN && x <= TERM_COLS_MAX) cols=x;
if(y >= TERM_ROWS_MIN && y <= TERM_ROWS_MAX) rows=y;
} else { // second report
if(x < 3) // ZWNBSP didn't move cursor (more than one column)
if(x < 3) { // ZWNBSP didn't move cursor (more than one column)
autoterm |= UTF8;
unicode_zerowidth = x - 1;
}
}
} else if(sscanf(p, "[=67;84;101;114;109;%u;%u", &x, &y) == 2 && *lastchar(p) == 'c') {
lprintf(LOG_INFO,"received CTerm version report: %u.%u", x, y);
......
......@@ -201,9 +201,9 @@ int sbbs_t::show_atcode(const char *instr, JSObject* obj)
}
if(pmode & P_UTF8) {
if(term_supports(UTF8))
fmt.disp_len += strlen(cp) - utf8_str_total_width(cp);
fmt.disp_len += strlen(cp) - utf8_str_total_width(cp, unicode_zerowidth);
else
fmt.disp_len += strlen(cp) - utf8_str_count_width(cp, /* min: */1, /* max: */2);
fmt.disp_len += strlen(cp) - utf8_str_count_width(cp, /* min: */1, /* max: */2, unicode_zerowidth);
}
if(fmt.align == fmt.left)
bprintf(pmode, "%-*.*s",fmt.disp_len,fmt.disp_len,cp);
......
......@@ -40,8 +40,9 @@ void sbbs_t::redrwstr(char *strin, int i, int l, int mode)
column+=rprintf("%-*.*s",l,l,strin);
cleartoeol();
if(i<l) {
auto_utf8(strin, mode);
if(mode&P_UTF8)
l = utf8_str_total_width(strin);
l = utf8_str_total_width(strin, unicode_zerowidth);
cursor_left(l-i);
}
}
......
......@@ -29,7 +29,7 @@ char* sbbs_t::auto_utf8(const char* str, int& mode)
{
if(strncmp(str, "\xEF\xBB\xBF", 3) == 0) {
mode |= P_UTF8;
return (char*)(str + 3);
return (char*)str;
}
if(mode & P_AUTO_UTF8) {
if(!str_is_ascii(str) && utf8_str_is_valid(str))
......@@ -155,7 +155,7 @@ size_t sbbs_t::bstrlen(const char *str, int mode)
len = utf8_getc(str, end - str, &codepoint);
if(len < 1)
break;
count += unicode_width(codepoint);;
count += unicode_width(codepoint, unicode_zerowidth);
} else
count++;
str += len;
......@@ -347,7 +347,7 @@ size_t sbbs_t::print_utf8_as_cp437(const char* str, size_t len)
char ch = unicode_to_cp437(codepoint);
if(ch)
outchar(ch);
else if(unicode_width(codepoint) > 0) {
else if(unicode_width(codepoint, unicode_zerowidth) > 0) {
outchar(CP437_INVERTED_QUESTION_MARK);
char seq[32] = "";
for(size_t i = 0; i < len; i++)
......@@ -796,7 +796,7 @@ int sbbs_t::outchar(enum unicode_codepoint codepoint, const char* cp437_fallback
if(len < 1)
return len;
putcom(str, len);
inc_column(unicode_width(codepoint));
inc_column(unicode_width(codepoint, unicode_zerowidth));
return 0;
}
if(cp437_fallback == NULL)
......
......@@ -21,6 +21,7 @@
#include "sbbs.h"
#include "cmdshell.h"
#include "utf8.h"
/****************************************************************************/
/* Mails a message to usernumber. 'top' is a buffer to place at beginning */
......@@ -47,9 +48,9 @@ bool sbbs_t::email(int usernumber, const char *top, const char *subj, int mode,
smbmsg_t msg;
if(subj != NULL)
SAFECOPY(title, subj);
SAFECOPY_UTF8(title, subj);
if(remsg != NULL && title[0] == 0)
SAFECOPY(title, remsg->subj);
SAFECOPY_UTF8(title, remsg->subj);
if(useron.etoday>=cfg.level_emailperday[useron.level] && !SYSOP && !(useron.exempt&FLAG('M'))) {
bputs(text[TooManyEmailsToday]);
......
......@@ -80,7 +80,7 @@ size_t sbbs_t::getstr(char *strout, size_t maxlen, int mode, const str_list_t hi
}
SAFECOPY(undo,str1);
i=l=strlen(str1);
i=l=bstrlen(str1, P_AUTO_UTF8);
if(mode&K_AUTODEL && str1[0] && !(mode&K_NOECHO)) {
ch=getkey(mode|K_GETSTR);
attr(atr);
......@@ -92,7 +92,7 @@ size_t sbbs_t::getstr(char *strout, size_t maxlen, int mode, const str_list_t hi
else {
for(i=0;i<l;i++)
outchar(BS);
column+=rputs(str1);
column+=bputs(str1, P_AUTO_UTF8);
i=l;
}
if(ch!=' ' && ch!=TAB)
......@@ -353,7 +353,7 @@ size_t sbbs_t::getstr(char *strout, size_t maxlen, int mode, const str_list_t hi
break;
case CTRL_R: /* Ctrl-R Redraw Line */
if(!(mode&K_NOECHO))
redrwstr(str1,i,l,K_MSG);
redrwstr(str1,i,l, P_AUTO_UTF8);
break;
case TERM_KEY_INSERT: /* Ctrl-V Toggles Insert/Overwrite */
if(mode&K_NOECHO)
......
......@@ -4668,6 +4668,7 @@ js_utf8_get_width(JSContext *cx, uintN argc, jsval *arglist)
jsval *argv=JS_ARGV(cx, arglist);
char* str = NULL;
jsrefcount rc;
int zerowidth = 1;
JS_SET_RVAL(cx, arglist, JSVAL_VOID);
......@@ -4680,7 +4681,7 @@ js_utf8_get_width(JSContext *cx, uintN argc, jsval *arglist)
return JS_TRUE;
rc=JS_SUSPENDREQUEST(cx);
size_t width = utf8_str_total_width(str);
size_t width = utf8_str_total_width(str, zerowidth);
JS_RESUMEREQUEST(cx, rc);
free(str);
......
......@@ -92,18 +92,18 @@ bool sbbs_t::postmsg(int subnum, int wm_mode, smb_t* resmb, smbmsg_t* remsg)
}
if(remsg) {
SAFECOPY(title, msghdr_field(remsg, remsg->subj, NULL, term_supports(UTF8)));
SAFECOPY_UTF8(title, msghdr_field(remsg, remsg->subj, NULL, term_supports(UTF8)));
if(remsg->hdr.attr&MSG_ANONYMOUS)
SAFECOPY(from,text[Anonymous]);
else
SAFECOPY(from, msghdr_field(remsg, remsg->from, NULL, term_supports(UTF8)));
SAFECOPY_UTF8(from, msghdr_field(remsg, remsg->from, NULL, term_supports(UTF8)));
// If user posted this message, reply to the original recipient again
if(remsg->to != NULL
&& ((remsg->from_ext != NULL && atoi(remsg->from_ext)==useron.number)
|| stricmp(useron.alias,remsg->from) == 0 || stricmp(useron.name,remsg->from) == 0))
SAFECOPY(touser, msghdr_field(remsg, remsg->to, NULL, term_supports(UTF8)));
SAFECOPY_UTF8(touser, msghdr_field(remsg, remsg->to, NULL, term_supports(UTF8)));
else
SAFECOPY(touser,from);
SAFECOPY_UTF8(touser,from);
if(remsg->to != NULL)
strListPush(&names, remsg->to);
msgattr=(ushort)(remsg->hdr.attr&MSG_PRIVATE);
......
......@@ -1013,7 +1013,7 @@ int sbbs_t::scanposts(int subnum, int mode, const char *find)
,msghdr_field(&msg, msg.subj)
,timestr(msg.hdr.when_written.time)));
if(msg.from_net.addr==NULL)
SAFECOPY(str,msg.from);
SAFECOPY_UTF8(str,msg.from);
else if(msg.from_net.type==NET_FIDO)
SAFEPRINTF2(str,"%s@%s",msg.from
,smb_faddrtoa((faddr_t *)msg.from_net.addr,tmp));
......
......@@ -266,6 +266,8 @@ extern int thread_suid_broken; /* NPTL is no longer broken */
#include "xpdatetime.h"
#include "unicode_defs.h"
/* Copy the string src into the char array dst without overflowing it.
 * When src is a valid UTF-8 string, utf8_strlcpy() is used so that the copy
 * is not left ending in a partial UTF-8 sequence; otherwise this falls back
 * to a plain SAFECOPY().
 *
 * dst must be an array (not a pointer): its capacity is taken with sizeof.
 * src is evaluated exactly once, so passing a function call is safe.
 * The do/while(0) wrapper makes the expansion a single statement, so the
 * macro can be used as the body of an if/else without surprises.
 */
#define SAFECOPY_UTF8(dst, src) \
	do { \
		const char* const safecopy_utf8_src_ = (src); \
		if(utf8_str_is_valid(safecopy_utf8_src_)) \
			utf8_strlcpy((dst), safecopy_utf8_src_, sizeof(dst)); \
		else \
			SAFECOPY(dst, safecopy_utf8_src_); \
	} while(0)
/***********************/
/* Synchronet-specific */
/***********************/
......@@ -577,6 +579,7 @@ public:
int tabstop = 8; /* Current symmetric-tabstop (size) */
int lastlinelen = 0; /* The previously displayed line length */
int autoterm=0; /* Auto-detected terminal type */
size_t unicode_zerowidth=0;
char terminal[TELNET_TERM_MAXLEN+1]{}; // <- answer() writes to this
int cterm_version=0;/* (MajorVer*1000) + MinorVer */
link_list_t savedlines{};
......
......@@ -294,10 +294,9 @@ enum unicode_codepoint cp437_unicode_tbl[] =
/* 0xFF */ UNICODE_NO_BREAK_SPACE
};
size_t unicode_width(enum unicode_codepoint u)
bool unicode_is_zerowidth(enum unicode_codepoint u)
{
switch(u) {
case UNICODE_UNDEFINED:
case UNICODE_ZERO_WIDTH_SPACE:
case UNICODE_ZERO_WIDTH_NON_JOINER:
case UNICODE_ZERO_WIDTH_JOINER:
......@@ -318,6 +317,15 @@ size_t unicode_width(enum unicode_codepoint u)
case UNICODE_VARIATION_SELECTOR_15:
case UNICODE_VARIATION_SELECTOR_16:
case UNICODE_ZERO_WIDTH_NO_BREAK_SPACE:
return true;
}
return false;
}
size_t unicode_width(enum unicode_codepoint u, size_t zerowidth)
{
switch(u) {
case UNICODE_UNDEFINED:
return 0;
// Exceptions to the ranges (blocks/sub-blocks) in the default case
case UNICODE_CIRCLED_NUMBER_TEN_ON_BLACK_SQUARE:
......@@ -383,6 +391,8 @@ size_t unicode_width(enum unicode_codepoint u)
|| (u >= UNICODE_BLOCK_EXTA_SYMBOLS_AND_PICTOGRAPHS_BEGIN && u <= UNICODE_BLOCK_EXTA_SYMBOLS_AND_PICTOGRAPHS_END)
)
return 2;
if(unicode_is_zerowidth(u))
return zerowidth;
return 1;
}
}
......
......@@ -23,6 +23,7 @@
#define UNICODE_H_
#include <stdlib.h>
#include "gen_defs.h"
#include "unicode_defs.h"
#if defined(__cplusplus)
......@@ -30,7 +31,8 @@ extern "C" {
#endif
extern enum unicode_codepoint cp437_unicode_tbl[];
size_t unicode_width(enum unicode_codepoint);
bool unicode_is_zerowidth(enum unicode_codepoint);
size_t unicode_width(enum unicode_codepoint, size_t zerowidth);
char unicode_to_cp437(enum unicode_codepoint);
char unicode_to_latin1(enum unicode_codepoint);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment