From 9773c56add6446d418c0677f306c8771356f0c01 Mon Sep 17 00:00:00 2001 From: Jim Warner Date: Thu, 28 Sep 2017 00:22:22 -0500 Subject: [PATCH] top: refactored for correct multi-byte string handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When this project first began implementing translation support nearly 6 years ago, we overcame many 'gettext' obstacles and limitations. And, of course, there were not any actual translations at the time so our testing was quite limited plus, in many cases, only simulated. None of that, however, can justify or excuse the total lack of attention to top's approach to NLS, especially since some actual translations have existed for years. When the issue referenced below was raised, I suffered immediate feelings of anxiety, doubt and pending doom. This was mostly because top strives to avoid line wrap at all costs and that did not bode well for multi-byte translated strings, using several bytes per character. I was also concerned over possible performance impact, assuming it was even possible to properly handle utf8. But, after wrestling with the problem for several days those initial feelings have now been replaced by guilt over any trouble I initially caused those translators. One can only imagine how frustrating it must have been after the translation effort to then see top display a misaligned column header and fields management page or truncated screens like those of help or color mapping. ------------------------------------------------------ Ok, with that off my chest let's review these changes, now that top properly handles UTF8 multi-byte strings. . Performance - virtually all of this newly added cost for multi-byte support is incurred during interactions with the user. So, performance is not really an issue. The one occasion when performance is impacted is found during 'summary_show()' processing, due to an addition of one new call to 'utf8_delta()' in 'show_special()'. . 
Extra Wide Characters - I have not yet and may never figure out a way to support languages like zh_CN where the characters can be wider than most other languages. . Translated User Name - at some future point we could implement translation of user names. But as the author of the issue acknowledged such names are non-standard. Thus task display still incurs no new multi-byte costs beyond those already incurred in that escape.c module. For raising the issue I extend my sincerest thanks to: Göran Uddeborg Reference(s): https://gitlab.com/procps-ng/procps/issues/68 Signed-off-by: Jim Warner --- top/top.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++----- top/top.h | 5 +++ 2 files changed, 95 insertions(+), 9 deletions(-) diff --git a/top/top.c b/top/top.c index cacee284..3b354074 100644 --- a/top/top.c +++ b/top/top.c @@ -480,6 +480,76 @@ static void sig_resize (int dont_care_sig) { (void)dont_care_sig; } // end: sig_resize +/*###### Special UTF-8 Multi-Byte support ##############################*/ + + /* Support for NLS translated 'string' length */ +static char UTF8_tab[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 - 0x0F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 - 0x1F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 - 0x2F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 - 0x3F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 - 0x4F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 - 0x5F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 - 0x6F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 - 0x7F + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0x80 - 0x8F + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0x90 - 0x9F + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0xA0 - 0xAF + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0xB0 - 0xBF + -1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 - 0xCF, 0xC2 = begins 2 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, // 0xD0 - 0xDF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0 - 0xEF, 0xE0 = begins 3 + 4, 4, 4, 4, 4,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0xF0 - 0xFF, 0xF0 = begins 4 +}; // ( 0xF5 & beyond invalid ) + + + /* + * Determine difference between total bytes versus printable + * characters in that passed, potentially multi-byte, string */ +static int utf8_delta (const char *str) { + const unsigned char *p = (const unsigned char *)str; + int clen, cnum = 0; + + while (*p) { + // -1 represents a decoding error, pretend it's untranslated ... + if (0 > (clen = UTF8_tab[*p])) return 0; + p += clen; + ++cnum; + } + return (int)((const char *)p - str) - cnum; +} // end: utf8_delta + + + /* + * Determine a logical end within a potential multi-byte string + * where maximum printable chars could be accommodated in width */ +static int utf8_embody (const char *str, int width) { + const unsigned char *p = (const unsigned char *)str; + int clen, cnum = 0; + + while (*p) { + // -1 represents a decoding error, pretend it's untranslated ... + if (0 > (clen = UTF8_tab[*p])) return width; + if (cnum + 1 >= width) break; + p += clen; + ++cnum; + } + return (int)((const char *)p - str); +} // end: utf8_embody + + + /* + * Like the regular justify_pad routine but this guy + * can accommodate the multi-byte translated strings */ +static const char *utf8_justify (const char *str, int width, int justr) { + static char l_fmt[] = "%-*.*s%s", r_fmt[] = "%*.*s%s"; + static char buf[SCREENMAX]; + + width += utf8_delta(str); + snprintf(buf, sizeof(buf), justr ? 
r_fmt : l_fmt, width, width, str, COLPADSTR); + return buf; +} // end: utf8_justify + /*###### Misc Color/Display support ####################################*/ /* @@ -571,7 +641,7 @@ static void show_msg (const char *str) { PUTT("%s%s %.*s %s%s%s" , tg2(0, Msg_row) , Curwin->capclr_msg - , Screen_cols - 2 + , utf8_embody(str, Screen_cols - 2) , str , Cap_curs_hide , Caps_off @@ -584,8 +654,11 @@ static void show_msg (const char *str) { /* * Show an input prompt + larger cursor (if possible) */ static int show_pmt (const char *str) { - int rc; + char buf[MEDBUFSIZ]; + int len; + snprintf(buf, sizeof(buf), "%.*s", utf8_embody(str, Screen_cols - 2), str); + len = utf8_delta(buf); #ifdef PRETENDNOCAP PUTT("\n%s%s%.*s %s%s%s" #else @@ -593,14 +666,15 @@ static int show_pmt (const char *str) { #endif , tg2(0, Msg_row) , Curwin->capclr_pmt - , Screen_cols - 2 - , str + , (Screen_cols - 2) + len + , buf , Cap_curs_huge , Caps_off , Cap_clr_eol); fflush(stdout); + len = strlen(buf) - len; // +1 for the space we added or -1 for the cursor... - return ((rc = (int)strlen(str)+1) < Screen_cols) ? rc : Screen_cols-1; + return (len + 1 < Screen_cols) ? len + 1 : Screen_cols - 1; } // end: show_pmt @@ -679,6 +753,7 @@ static void show_special (int interact, const char *glob) { , Curwin->captab[ch], room, sub_beg, Caps_off); rp = scat(rp, tmp); room -= (sub_end - sub_beg); + room += utf8_delta(sub_beg); sub_beg = (sub_end += 2); break; default: // nothin' special, just text @@ -1719,7 +1794,7 @@ static void build_headers (void) { if (EU_CMD == f) ckCMDS(w); else ckITEM(f); - s = scat(s, justify_pad(N_col(f) + s = scat(s, utf8_justify(N_col(f) , VARcol(f) ? w->varcolsz : Fieldstab[f].width , CHKw(w, Fieldstab[f].align))); #ifdef USE_X_COLHDR @@ -1933,21 +2008,27 @@ static void display_fields (int focus, int extend) { const char *e = (i == focus && extend) ? 
w->capclr_hdr : ""; FLG_t f = FLDget(w, i); char sbuf[xSUFX+1]; + int xcol, xfld; - // prep sacrificial suffix + // obtain translated deltas (if any) ... + xcol = utf8_delta(fmtmk("%.*s", utf8_embody(N_col(f), 7), N_col(f))); + xfld = utf8_delta(fmtmk("%.*s", utf8_embody(N_fld(f), smax), N_fld(f))); + + // prep sacrificial suffix ... snprintf(sbuf, sizeof(sbuf), "= %s", N_fld(f)); - PUTT("%s%c%s%s %s%-7.7s%s%s%s %-*.*s%s" + PUTT("%s%c%s%s %s%-*.*s%s%s%s %-*.*s%s" , tg2(x, y) , b ? '*' : ' ' , b ? w->cap_bold : Cap_norm , e , i == focus ? w->capclr_hdr : "" + , 7 + xcol, 7 + xcol , N_col(f) , Cap_norm , b ? w->cap_bold : "" , e - , smax, smax + , smax + xfld, smax + xfld , sbuf , Cap_norm); } diff --git a/top/top.h b/top/top.h index 45a57778..6d3f5fb7 100644 --- a/top/top.h +++ b/top/top.h @@ -547,6 +547,11 @@ typedef struct WIN_t { //atic void sig_endpgm (int dont_care_sig); //atic void sig_paused (int dont_care_sig); //atic void sig_resize (int dont_care_sig); +/*------ Special UTF-8 Multi-Byte support ------------------------------*/ +/*atic char UTF8_tab[] = { ... } */ +//atic int utf8_delta (const char *str); +//atic int utf8_embody (const char *str, int width); +//atic const char *utf8_justify (const char *str, int width, int justr); /*------ Misc Color/Display support ------------------------------------*/ //atic void capsmk (WIN_t *q); //atic void show_msg (const char *str);