From: Bonita.Montero@gmail.com   
      
   Am 03.12.2025 um 19:33 schrieb Michael Sanders:   
   > On Wed, 3 Dec 2025 06:24:23 +0100, Bonita Montero wrote:   
   >   
   >>> Here I'm running any mixture of: Windows/BSD/Linux Mint LMDE.
   >> Windows has the ...W() APIs along with codepage-based APIs with   
   >> the ...A() Suffix. The W()-APIs support UTF-16, so no need for   
   > Hi Bonita.   
   >   
   > Yes that's correct, but...   
   >   
   > - that assumes we know in advance what the character is   
   >   
   > - it would only work under Windows   
   >   
   > We want portability across diverse OSs. In my case, the program   
   > does NOT care what the character is, it simply needs to be able   
   > to find it when searching data & displaying it in an ordered way.   
   VC++ supports both the C and the C++ locale facilities, if you want to
   keep it portable. The locale support in C++, with its facets, is
   especially pleasant to work with: https://en.cppreference.com/w/cpp/locale.html
      
   >   
   > The code below works perfectly:   
   >   
   > #include <stdio.h>
   > #include <string.h>
   >   
   > int utf8_display_width(const char *s) {   
   > int w = 0;   
   >   
   > while (*s) {   
   > unsigned char b = *s;   
   > unsigned cp;   
   > int n;   
   >   
   > // UTF-8 decoder   
   > if (b <= 0x7F) { // 1-byte ASCII   
   > cp = b;   
   > n = 1;   
   > } else if (b >= 0xC0 && b <= 0xDF) { // 2-byte   
   > cp = ((b & 0x1F) << 6) |   
   > (s[1] & 0x3F);   
   > n = 2;   
   > } else if (b >= 0xE0 && b <= 0xEF) { // 3-byte   
   > cp = ((b & 0x0F) << 12) |   
   > ((s[1] & 0x3F) << 6) |   
   > (s[2] & 0x3F);   
   > n = 3;   
   > } else if (b >= 0xF0 && b <= 0xF7) { // 4-byte   
   > cp = ((b & 0x07) << 18) |   
   > ((s[1] & 0x3F) << 12) |   
   > ((s[2] & 0x3F) << 6) |   
   > (s[3] & 0x3F);   
   > n = 4;   
   > } else { // invalid, treat as 1-byte   
   > cp = b;   
   > n = 1;   
   > }   
   >   
   > // display width   
   > if (cp >= 0x0300 && cp <= 0x036F) {} // combining marks like é (zero width)
   > else if ( // double-width characters...   
   > (cp >= 0x1100 && cp <= 0x115F) || // hangul jamo   
   > (cp >= 0x2E80 && cp <= 0xA4CF) || // cjk radicals & unified ideographs
   > (cp >= 0xAC00 && cp <= 0xD7A3) || // hangul syllables   
   > (cp >= 0xF900 && cp <= 0xFAFF) || // cjk compatibility ideographs
   > (cp >= 0x1F300 && cp <= 0x1FAFF) // emoji + symbols   
   > ) { w += 2; }   
   > // exceptional wide characters (unicode requirement I've read elsewhere)
   > else if (cp == 0x2329 || cp == 0x232A) { w += 2; }   
   > else { w += 1; } // normal width for everything else   
   >   
   > s += n;   
   > }   
   >   
   > return w;   
   > }   
   >   
   > int main(void) {   
   > const char *tests[] = {   
   > "hello",   
   > "Café",   
   > "漢字",   
   > "✓",   
   > "🙂",   
   > NULL   
   > };   
   >   
   > // find maximum display width in 1st column   
   > int maxw = 0;   
   > for (int i = 0; tests[i]; i++) {   
   > int w = utf8_display_width(tests[i]);   
   > if (w > maxw) maxw = w;   
   > }   
   >   
   > // total padding after each 1st column + 3 spaces   
   > int total_pad = maxw + 3;   
   >   
   > for (int i = 0; tests[i]; i++) {   
   > int w = utf8_display_width(tests[i]);   
   > int sl = strlen(tests[i]);   
   > printf("%s", tests[i]);   
   > int pad = total_pad - w;   
   > while (pad-- > 0) putchar(' ');   
   > printf("strlen: %d utf8 display width: %d\n", sl, w);   
   > }   
   >   
   > return 0;   
   > }   
   >   
   > // eof   
   >   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|