From 7ef38420a4ef69380ad467a2f49737f2e84d5c89 Mon Sep 17 00:00:00 2001
From: Jim Warner <james.warner@comcast.net>
Date: Wed, 27 Sep 2017 00:02:22 -0500
Subject: [PATCH] top: refactored for correct multi-byte string handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When this project first began implementing translation
support nearly 6 years ago, we overcame many 'gettext'
obstacles and limitations.  And, of course, there were
not any actual translations at the time so our testing
was quite limited plus, in many cases, only simulated.

None of that, however, can justify or excuse the total
lack of attention to top's approach to NLS, especially
since some actual translations have existed for years.

When the issue referenced below was raised, I suffered
immediate feelings of anxiety, doubt and pending doom.
This was mostly because top strives to avoid line wrap
at all costs and that did not bode well for multi-byte
translated strings, using several bytes per character.

I was also concerned over possible performance impact,
assuming it was even possible to properly handle utf8.

But, after wrestling with the problem for several days
those initial feelings have now been replaced by guilt
over any trouble I initially caused those translators.

One can only imagine how frustrating it must have been
after the translation effort to then see top display a
misaligned column header and fields management page or
truncated screens like those of help or color mapping.
------------------------------------------------------

Ok, with that off my chest let's review these changes,
now that top properly handles UTF8 multi-byte strings.

. Performance - virtually all of this newly added cost
for multi-byte support is incurred during interactions
with the user. So, performance is not really an issue.

The one occasion when performance is impacted is found
during 'summary_show()' processing, due to an addition
of one new call to 'utf8_delta()' in 'show_special()'.

. Extra Wide Characters - I have not yet and may never
figure out a way to support languages like zh_CN where
the characters can be wider than most other languages.

. Translated User Name - at some future point we could
implement translation of user names. But as the author
of the issue acknowledged such names are non-standard.
Thus task display still incurs no new multi-byte costs
beyond those already incurred in that escape.c module.

For raising the issue I extend my sincerest thanks to:
Göran Uddeborg

Reference(s):
https://gitlab.com/procps-ng/procps/issues/68

Signed-off-by: Jim Warner <james.warner@comcast.net>
---
 top/top.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 top/top.h |  5 +++
 2 files changed, 95 insertions(+), 9 deletions(-)

diff --git a/top/top.c b/top/top.c
index 6c0b2ca1..5389ef78 100644
--- a/top/top.c
+++ b/top/top.c
@@ -641,6 +641,76 @@ static void sig_resize (int dont_care_sig) {
    (void)dont_care_sig;
 } // end: sig_resize
 
+/*######  Special UTF-8 Multi-Byte support  ##############################*/
+
+        /* UTF-8 sequence length by lead byte, -1 = invalid lead (so must be a signed type) */
+static const signed char UTF8_tab[] = {
+   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 - 0x0F
+   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 - 0x1F
+   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 - 0x2F
+   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 - 0x3F
+   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 - 0x4F
+   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 - 0x5F
+   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 - 0x6F
+   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 - 0x7F
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0x80 - 0x8F
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0x90 - 0x9F
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0xA0 - 0xAF
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0xB0 - 0xBF
+  -1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 - 0xCF, 0xC2 = begins 2
+   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0 - 0xDF
+   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0 - 0xEF, 0xE0 = begins 3
+   4, 4, 4, 4, 4,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0xF0 - 0xFF, 0xF0 = begins 4
+};                                                 //            ( 0xF5 & beyond invalid )
+
+
+        /*
+         * Return the surplus of total bytes over printable characters in the
+         * passed, potentially multi-byte, string (0 if it is not sound UTF-8) */
+static int utf8_delta (const char *str) {
+    const unsigned char *p = (const unsigned char *)str;
+    int clen, i, cnum = 0;
+
+    for (; *p; p += clen, ++cnum) {
+        // a bad lead or short sequence is a decoding error, pretend it's untranslated ...
+        if (0 > (clen = UTF8_tab[*p])) return 0;
+        for (i = 1; i < clen; i++)       // never step over the string's terminating NUL
+            if (0x80 != (p[i] & 0xC0)) return 0;
+    }
+    return (int)((const char *)p - str) - cnum;
+} // end: utf8_delta
+
+
+        /*
+         * Return the byte length of the longest leading part of a potentially
+         * multi-byte string that holds at most width printable characters
+         * (or width itself, meaning bytes, if the string is not sound UTF-8) */
+static int utf8_embody (const char *str, int width) {
+    const unsigned char *p = (const unsigned char *)str;
+    int clen, i, cnum = 0;
+    for (; *p; p += clen, ++cnum) {
+        // a bad lead or short sequence is a decoding error, pretend it's untranslated ...
+        if (0 > (clen = UTF8_tab[*p])) return width;
+        for (i = 1; i < clen; i++)       // never step over the string's terminating NUL
+            if (0x80 != (p[i] & 0xC0)) return width;
+        if (cnum >= width) break;        // 'cnum + 1 >= width' gave up one character early
+    }
+    return (int)((const char *)p - str);
+} // end: utf8_embody
+
+
+        /*
+         * A justify_pad alike for multi-byte translated strings; the result is a
+         * static buffer. NOTE(review): delta spans all of str, not just width chars */
+static const char *utf8_justify (const char *str, int width, int justr) {
+   static char buf[SCREENMAX];
+   const int span = width + utf8_delta(str);   // field size in bytes, not characters
+
+   if (justr) snprintf(buf, sizeof(buf), "%*.*s%s", span, span, str, COLPADSTR);
+   else snprintf(buf, sizeof(buf), "%-*.*s%s", span, span, str, COLPADSTR);
+   return buf;
+} // end: utf8_justify
+
 /*######  Misc Color/Display support  ####################################*/
 
         /*
@@ -732,7 +802,7 @@ static void show_msg (const char *str) {
    PUTT("%s%s %.*s %s%s%s"
       , tg2(0, Msg_row)
       , Curwin->capclr_msg
-      , Screen_cols - 2
+      , utf8_embody(str, Screen_cols - 2)
       , str
       , Cap_curs_hide
       , Caps_off
@@ -745,8 +815,11 @@ static void show_msg (const char *str) {
         /*
          * Show an input prompt + larger cursor (if possible) */
 static int show_pmt (const char *str) {
-   int rc;
+   char buf[MEDBUFSIZ];
+   int len;
 
+   snprintf(buf, sizeof(buf), "%.*s", utf8_embody(str, Screen_cols - 2), str);
+   len = utf8_delta(buf);
 #ifdef PRETENDNOCAP
    PUTT("\n%s%s%.*s %s%s%s"
 #else
@@ -754,14 +827,15 @@ static int show_pmt (const char *str) {
 #endif
       , tg2(0, Msg_row)
       , Curwin->capclr_pmt
-      , Screen_cols - 2
-      , str
+      , (Screen_cols - 2) + len
+      , buf
       , Cap_curs_huge
       , Caps_off
       , Cap_clr_eol);
    fflush(stdout);
+   len = strlen(buf) - len;
    // +1 for the space we added or -1 for the cursor...
-   return ((rc = (int)strlen(str)+1) < Screen_cols) ? rc : Screen_cols-1;
+   return (len + 1 < Screen_cols) ? len + 1 : Screen_cols - 1;
 } // end: show_pmt
 
 
@@ -840,6 +914,7 @@ static void show_special (int interact, const char *glob) {
                   , Curwin->captab[ch], room, sub_beg, Caps_off);
                rp = scat(rp, tmp);
                room -= (sub_end - sub_beg);
+               room += utf8_delta(sub_beg);
                sub_beg = (sub_end += 2);
                break;
             default:                   // nothin' special, just text
@@ -1906,7 +1981,7 @@ static void build_headers (void) {
 #endif
             if (EU_CMD == f && CHKw(w, Show_CMDLIN)) Frames_libflags |= L_CMDLINE;
             Frames_libflags |= Fieldstab[f].lflg;
-            s = scat(s, justify_pad(N_col(f)
+            s = scat(s, utf8_justify(N_col(f)
                , VARcol(f) ? w->varcolsz : Fieldstab[f].width
                , CHKw(w, Fieldstab[f].align)));
 #ifdef USE_X_COLHDR
@@ -2116,21 +2191,27 @@ static void display_fields (int focus, int extend) {
       const char *e = (i == focus && extend) ? w->capclr_hdr : "";
       FLG_t f = FLDget(w, i);
       char sbuf[xSUFX+1];
+      int xcol, xfld;
 
-      // prep sacrificial suffix
+      // obtain translated deltas (if any) ...
+      xcol = utf8_delta(fmtmk("%.*s", utf8_embody(N_col(f), 7), N_col(f)));
+      xfld = utf8_delta(fmtmk("%.*s", utf8_embody(N_fld(f), smax), N_fld(f)));
+
+      // prep sacrificial suffix ...
       snprintf(sbuf, sizeof(sbuf), "= %s", N_fld(f));
 
-      PUTT("%s%c%s%s %s%-7.7s%s%s%s %-*.*s%s"
+      PUTT("%s%c%s%s %s%-*.*s%s%s%s %-*.*s%s"
          , tg2(x, y)
          , b ? '*' : ' '
          , b ? w->cap_bold : Cap_norm
          , e
          , i == focus ? w->capclr_hdr : ""
+         , 7 + xcol, 7 + xcol
          , N_col(f)
          , Cap_norm
          , b ? w->cap_bold : ""
          , e
-         , smax, smax
+         , smax + xfld, smax + xfld
          , sbuf
          , Cap_norm);
    }
diff --git a/top/top.h b/top/top.h
index 8141b0bd..6b6e8645 100644
--- a/top/top.h
+++ b/top/top.h
@@ -678,6 +678,11 @@ typedef struct WIN_t {
 //atic void          sig_endpgm (int dont_care_sig);
 //atic void          sig_paused (int dont_care_sig);
 //atic void          sig_resize (int dont_care_sig);
+/*------  Special UTF-8 Multi-Byte support  ------------------------------*/
+/*atic char          UTF8_tab[] = { ... }                                 */
+//atic int           utf8_delta (const char *str);
+//atic int           utf8_embody (const char *str, int width);
+//atic const char   *utf8_justify (const char *str, int width, int justr);
 /*------  Misc Color/Display support  ------------------------------------*/
 //atic void          capsmk (WIN_t *q);
 //atic void          show_msg (const char *str);