From b4bce3b8fabb122cec07769e5fbf022aa8d51535 Mon Sep 17 00:00:00 2001 From: Jim Warner Date: Sat, 6 Jan 2018 00:00:00 -0600 Subject: [PATCH] top: adapt utf8 logic to support extra wide characters Back when top was refactored to support UTF-8 encoding it was acknowledged that languages like zh_CN were not supported. That was because a single 'character' might require more than a single 'column' when it's printed. Well I've now figured out how to accommodate languages like that. My adaptation is represented in this patch. [ and just in case someone wishes to avoid the extra ] [ runtime costs, a #define OFF_XTRAWIDE is included. ] Along the way, I've cleaned up some miscellaneous code supporting the 'Inspect' feature so that the rightmost screen column was always used rather than being blank. [ interestingly, my xterm & urxvt terminal emulators ] [ are able to split extra wide characters then print ] [ 1/2 of such graphics in the last column. the gnome ] [ terminal emulator does not duplicate such behavior ] [ but prints 1 extra character in same width window. ] Reference(s): . Sep, 2017 - original utf8 support commit 7ef38420a4ef69380ad467a2f49737f2e84d5c89 Signed-off-by: Jim Warner --- top/top.c | 57 +++++++++++++++++++++++++++++++++++++------------------ top/top.h | 2 ++ 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/top/top.c b/top/top.c index b398bf37..e27c696e 100644 --- a/top/top.c +++ b/top/top.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -680,6 +681,24 @@ static char UTF8_tab[] = { }; // ( 0xF5 & beyond invalid ) + /* + * Accommodate any potential differences between some multibyte + * character sequence and the screen columns needed to print it */ +static inline int utf8_cols (const unsigned char *p, int n) { +#ifndef OFF_XTRAWIDE + wchar_t wc; + int wlen; + + (void)mbtowc(&wc, (const char *)p, n); + if ((wlen = wcwidth(wc)) < 1) wlen = 1; + return wlen; +#else + (void)p; (void)n; + return 1; +#endif +} // end: utf8_cols + + /* * Determine difference between total bytes versus printable * characters in that passed, potentially multi-byte, string */ @@ -690,8 +709,8 @@ static int utf8_delta (const char *str) { while (*p) { // -1 represents a decoding error, pretend it's untranslated ... if (0 > (clen = UTF8_tab[*p])) return 0; + cnum += utf8_cols(p, clen); p += clen; - ++cnum; } return (int)((const char *)p - str) - cnum; } // end: utf8_delta @@ -708,8 +727,8 @@ static int utf8_embody (const char *str, int width) { while (*p) { // -1 represents a decoding error, pretend it's untranslated ... if (0 > (clen = UTF8_tab[*p])) return width; + if (width < (cnum += utf8_cols(p, clen))) break; p += clen; - if (++cnum >= width) break; } } return (int)((const char *)p - str); @@ -3108,15 +3127,15 @@ static void insp_find_str (int ch, int *col, int *row) { * while visible search matches display with capclr_hdr for emphasis. * ( we hide ugly plumbing in macros to concentrate on the algorithm ) */ static void insp_mkrow_raw (int col, int row) { - #define maxSZ ( Screen_cols - (to + 1) ) + #define maxSZ ( Screen_cols - to ) #define capNO { if (hicap) { putp(Caps_off); hicap = 0; } } #define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \ fr += Insp_sel->flen -1; to += Insp_sel->flen; hicap = 0; } #ifndef INSP_JUSTNOT - #define mkCTL { int x = maxSZ; const char *p = fmtmk("^%c", uch + '@'); \ - PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", x, p); to += 2; hicap = 1; } - #define mkUNP { int x = maxSZ; const char *p = fmtmk("<%02X>", uch); \ - PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", x, p); to += 4; hicap = 1; } + #define mkCTL { const char *p = fmtmk("^%c", uch + '@'); \ + PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", maxSZ, p); to += 2; hicap = 1; } + #define mkUNP { const char *p = fmtmk("<%02X>", uch); \ + PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", maxSZ, p); to += 4; hicap = 1; } #else #define mkCTL { if ((to += 2) <= Screen_cols) \ PUTT("%s^%c", (!hicap) ? Curwin->capclr_msg : "", uch + '@'); hicap = 1; } @@ -3125,7 +3144,7 @@ static void insp_mkrow_raw (int col, int row) { #endif #define mkSTD { capNO; if (++to <= Screen_cols) { static char _str[2]; \ _str[0] = uch; putp(_str); } } - char tline[SCREENMAX]; + unsigned char tline[SCREENMAX]; int fr, to, ofs; int hicap = 0; @@ -3133,7 +3152,7 @@ static void insp_mkrow_raw (int col, int row) { memcpy(tline, Insp_p[row] + col, sizeof(tline)); else tline[0] = '\n'; - for (fr = 0, to = 0, ofs = 0; to < Screen_cols -1; fr++) { + for (fr = 0, to = 0, ofs = 0; to < Screen_cols; fr++) { if (!ofs) ofs = insp_find_ofs(col + fr, row); if (col + fr < ofs) { @@ -3166,20 +3185,20 @@ static void insp_mkrow_raw (int col, int row) { * characters will then be displayed in two positions like '^A'. * ( assuming they can even get past those 'gettext' utilities ) */ static void insp_mkrow_utf8 (int col, int row) { - #define maxSZ ( Screen_cols - (to + 1) ) + #define maxSZ ( Screen_cols - to ) #define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \ fr += Insp_sel->flen; to += Insp_sel->flen; } #ifndef INSP_JUSTNOT - #define mkCTL { int x = maxSZ; const char *p = fmtmk("^%c", uch + '@'); \ - PUTT("%s%.*s%s", Curwin->capclr_msg, x, p, Caps_off); to += 2; } + #define mkCTL { const char *p = fmtmk("^%c", uch + '@'); \ + PUTT("%s%.*s%s", Curwin->capclr_msg, maxSZ, p, Caps_off); to += 2; } #else #define mkCTL { if ((to += 2) <= Screen_cols) \ PUTT("%s^%c%s", Curwin->capclr_msg, uch + '@', Caps_off); } #endif #define mkNUL { buf1[0] = ' '; doPUT(buf1) } - #define doPUT(buf) if (++to <= Screen_cols) putp(buf); + #define doPUT(buf) if ((to += cno) <= Screen_cols) putp(buf); static char buf1[2], buf2[3], buf3[4], buf4[5]; - char tline[BIGBUFSIZ]; + unsigned char tline[BIGBUFSIZ]; int fr, to, ofs; col = utf8_proper_col(Insp_p[row], col, 1); @@ -3187,15 +3206,17 @@ static void insp_mkrow_utf8 (int col, int row) { memcpy(tline, Insp_p[row] + col, sizeof(tline)); else tline[0] = '\n'; - for (fr = 0, to = 0, ofs = 0; to < Screen_cols -1; ) { + for (fr = 0, to = 0, ofs = 0; to < Screen_cols; ) { if (!ofs) ofs = insp_find_ofs(col + fr, row); if (col + fr < ofs) { - unsigned char uch = tline[fr++]; - switch (UTF8_tab[(int)uch]) { + unsigned char uch = tline[fr]; + int bno = UTF8_tab[uch]; + int cno = utf8_cols(&tline[fr++], bno); + switch (bno) { case 1: if (uch == '\n') break; - else if (uch < 32) mkCTL + if (uch < 32) mkCTL else if (uch == 127) mkNUL else { buf1[0] = uch; doPUT(buf1) } break; diff --git a/top/top.h b/top/top.h index 37ba4bf0..67106509 100644 --- a/top/top.h +++ b/top/top.h @@ -45,6 +45,7 @@ //#define OFF_SCROLLBK /* disable tty emulators scrollback buffer */ //#define OFF_STDERROR /* disable our stderr buffering (redirect) */ //#define OFF_STDIOLBF /* disable our own stdout _IOFBF override */ +//#define OFF_XTRAWIDE /* disable our extra wide multi-byte logic */ //#define PRETEND2_5_X /* pretend we're linux 2.5.x (for IO-wait) */ //#define PRETEND8CPUS /* pretend we're smp with 8 ticsers (sic) */ //#define PRETENDNOCAP /* use a terminal without essential caps */ @@ -685,6 +686,7 @@ typedef struct WIN_t { //atic void xalloc_our_handler (const char *fmts, ...); /*------ Special UTF-8 Multi-Byte support ------------------------------*/ /*atic char UTF8_tab[] = { ... } */ +//atic inline int utf8_cols (const unsigned char *p, int n); //atic int utf8_delta (const char *str); //atic int utf8_embody (const char *str, int width); //atic const char *utf8_justify (const char *str, int width, int justr);