From 3d5b60693109dc5bdaa977b9a25d715a24a33133 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 31 Jan 2010 05:55:55 +0100 Subject: [PATCH] ls: fix handling of broken unicode sequences Signed-off-by: Denys Vlasenko --- libbb/unicode.c | 47 ++++++++++++++++++++++++---------------------- testsuite/ls.tests | 46 +++++++++++++++++++++++---------------------- 2 files changed, 49 insertions(+), 44 deletions(-) diff --git a/libbb/unicode.c b/libbb/unicode.c index 4e7e3a96a..7c41ef30b 100644 --- a/libbb/unicode.c +++ b/libbb/unicode.c @@ -139,6 +139,8 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n) return org_n - n; } +#define ERROR_WCHAR (~(wchar_t)0) + static const char *mbstowc_internal(wchar_t *res, const char *src) { int bytes; @@ -159,16 +161,22 @@ static const char *mbstowc_internal(wchar_t *res, const char *src) c <<= 1; bytes++; } while ((c & 0x80) && bytes < 6); - if (bytes == 1) - return NULL; + if (bytes == 1) { + /* A bare "continuation" byte. Say, 80 */ + *res = ERROR_WCHAR; + return src; + } c = (uint8_t)(c) >> bytes; while (--bytes) { - unsigned ch = (unsigned char) *src++; + unsigned ch = (unsigned char) *src; if ((ch & 0xc0) != 0x80) { - return NULL; + /* Missing "continuation" byte. Example: e0 80 */ + *res = ERROR_WCHAR; + return src; } c = (c << 6) + (ch & 0x3f); + src++; } /* TODO */ @@ -177,8 +185,8 @@ static const char *mbstowc_internal(wchar_t *res, const char *src) /* 11110000 10000000 10000100 10000000 converts to 0x100 */ /* correct encoding: 11000100 10000000 */ if (c <= 0x7f) { /* crude check */ - return NULL; - //or maybe 0xfffd; /* replacement character */ + *res = ERROR_WCHAR; + return src; } *res = c; @@ -204,7 +212,7 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n) while (n) { wchar_t wc; src = mbstowc_internal(&wc, src); - if (src == NULL) /* error */ + if (wc == ERROR_WCHAR) /* error */ return (size_t) -1L; if (dest) *dest++ = wc; @@ -312,20 +320,15 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char goto subst; } #else - { - const char *src1 = mbstowc_internal(&wc, src); - /* src = NULL: invalid sequence is seen, - * else: wc is set, src is advanced to next mb char - */ - if (src1) { /* no error */ - if (wc == 0) /* end-of-string */ - break; - src = src1; - } else { /* error */ - src++; - goto subst; - } - } + src = mbstowc_internal(&wc, src); + /* src is advanced to next mb char + * wc == ERROR_WCHAR: invalid sequence is seen + * else: wc is set + */ + if (wc == ERROR_WCHAR) /* error */ + goto subst; + if (wc == 0) /* end-of-string */ + break; #endif if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR) goto subst; @@ -411,7 +414,7 @@ unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src) } #else src = mbstowc_internal(&wc, src); - if (!src || wc == 0) /* error, or end-of-string */ + if (wc == ERROR_WCHAR || wc == 0) /* error, or end-of-string */ return width; #endif w = wcwidth(wc); diff --git a/testsuite/ls.tests b/testsuite/ls.tests index b0c5da7f9..8d0f2c2ea 100755 --- a/testsuite/ls.tests +++ b/testsuite/ls.tests @@ -11,9 +11,11 @@ mkdir ls.testdir || exit 1 # testing "test name" "command" "expected result" "file input" "stdin" -# The test isn't passing correctly now - all | chars should line up -# perfectly in the correctly passed test. +# With Unicode provided by libc locale, I'm not sure this test can pass. +# I suspect we might fail to skip exactly correct number of bytes +# over broked unicode sequences. test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ +&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ && test x"$CONFIG_SUBST_WCHAR" = x"63" \ && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ && testing "ls unicode test" \ @@ -73,40 +75,40 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ 0053____"?_?_"____________________________________________________________| 0054_3.3__Sequences_with_last_continuation_byte_missing___________________| 0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| -0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"??"______| -0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"???"______| -0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"????"______| -0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?????"______| +0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| +0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| +0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| +0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| 0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______| -0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"??"______| -0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"???"______| -0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"????"______| -0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?????"______| +0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"?"______| +0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"?"______| +0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"?"______| +0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?"______| 0065_3.4__Concatenation_of_incomplete_sequences___________________________| -0066____"??????????????????????????????"______________________________________________________| +0066____"??????????"______________________________________________________| 0067_3.5__Impossible_bytes________________________________________________| 0068_3.5.1__fe_=_"?"______________________________________________________| 0069_3.5.2__ff_=_"?"______________________________________________________| 0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________| 0071_4__Overlong_sequences________________________________________________| 0072_4.1__Examples_of_an_overlong_ASCII_character_________________________| -0073_4.1.1_U+002F_=_c0_af_____________=_"??"_______________________________| -0074_4.1.2_U+002F_=_e0_80_af__________=_"???"_______________________________| -0075_4.1.3_U+002F_=_f0_80_80_af_______=_"????"_______________________________| -0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?????"_______________________________| -0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"??????"_______________________________| +0073_4.1.1_U+002F_=_c0_af_____________=_"?"_______________________________| +0074_4.1.2_U+002F_=_e0_80_af__________=_"?"_______________________________| +0075_4.1.3_U+002F_=_f0_80_80_af_______=_"?"_______________________________| +0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?"_______________________________| +0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"?"_______________________________| 0078_4.2__Maximum_overlong_sequences______________________________________| -0079_4.2.1__U-0000007F_=_c1_bf_____________=_"??"__________________________| +0079_4.2.1__U-0000007F_=_c1_bf_____________=_"?"__________________________| 0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________| 0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________| 0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________| 0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________| 0084_4.3__Overlong_representation_of_the_NUL_character____________________| -0085_4.3.1__U+0000_=_c0_80_____________=_"??"______________________________| -0086_4.3.2__U+0000_=_e0_80_80__________=_"???"______________________________| -0087_4.3.3__U+0000_=_f0_80_80_80_______=_"????"______________________________| -0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?????"______________________________| -0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"??????"______________________________| +0085_4.3.1__U+0000_=_c0_80_____________=_"?"______________________________| +0086_4.3.2__U+0000_=_e0_80_80__________=_"?"______________________________| +0087_4.3.3__U+0000_=_f0_80_80_80_______=_"?"______________________________| +0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?"______________________________| +0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"?"______________________________| 0090_5__Illegal_code_positions____________________________________________| 0091_5.1_Single_UTF-16_surrogates_________________________________________| 0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________|