From: porkchop@invalid.foo   
      
   On Wed, 3 Dec 2025 06:24:23 +0100, Bonita Montero wrote:   
      
   >> Here I'm running any mixture of: Windows/BSD/Linux Mint LMDE.
   >   
   > Windows has the ...W() APIs along with codepage-based APIs with   
   > the ...A() Suffix. The W()-APIs support UTF-16, so no need for   
      
   Hi Bonita.   
      
   Yes that's correct, but...   
      
   - that assumes we know in advance what the character is   
      
   - it would only work under Windows   
      
   We want portability across diverse OSs. In my case, the program   
   does NOT care what the character is, it simply needs to be able   
   to find it when searching data & displaying it in an ordered way.   
      
   The code below works perfectly:   
      
   #include <stdio.h>
   #include <string.h>
      
   int utf8_display_width(const char *s) {   
    int w = 0;   
      
    while (*s) {   
    unsigned char b = *s;   
    unsigned cp;   
    int n;   
      
    // UTF-8 decoder   
    if (b <= 0x7F) { // 1-byte ASCII   
    cp = b;   
    n = 1;   
    } else if (b >= 0xC0 && b <= 0xDF) { // 2-byte   
    cp = ((b & 0x1F) << 6) |   
    (s[1] & 0x3F);   
    n = 2;   
    } else if (b >= 0xE0 && b <= 0xEF) { // 3-byte   
    cp = ((b & 0x0F) << 12) |   
    ((s[1] & 0x3F) << 6) |   
    (s[2] & 0x3F);   
    n = 3;   
    } else if (b >= 0xF0 && b <= 0xF7) { // 4-byte   
    cp = ((b & 0x07) << 18) |   
    ((s[1] & 0x3F) << 12) |   
    ((s[2] & 0x3F) << 6) |   
    (s[3] & 0x3F);   
    n = 4;   
    } else { // invalid, treat as 1-byte   
    cp = b;   
    n = 1;   
    }   
      
    // display width   
    if (cp >= 0x0300 && cp <= 0x036F) {} // combining marks like é   
   (zero width)   
    else if ( // double-width characters...   
    (cp >= 0x1100 && cp <= 0x115F) || // hangul jamo   
    (cp >= 0x2E80 && cp <= 0xA4CF) || // cjk radicals & unified   
   ideographs   
    (cp >= 0xAC00 && cp <= 0xD7A3) || // hangul syllables   
    (cp >= 0xF900 && cp <= 0xFAFF) || // cjk compatibility ideographs   
    (cp >= 0x1F300 && cp <= 0x1FAFF) // emoji + symbols   
    ) { w += 2; }   
    // exceptional wide characters (unicode requirement I've read   
   elsewhere)   
    else if (cp == 0x2329 || cp == 0x232A) { w += 2; }   
    else { w += 1; } // normal width for everything else   
      
    s += n;   
    }   
      
    return w;   
   }   
      
   int main(void) {   
    const char *tests[] = {   
    "hello",   
    "Café",   
    "漢字",   
    "✓",   
    "🙂",   
    NULL   
    };   
      
    // find maximum display width in 1st column   
    int maxw = 0;   
    for (int i = 0; tests[i]; i++) {   
    int w = utf8_display_width(tests[i]);   
    if (w > maxw) maxw = w;   
    }   
      
    // total padding after each 1st column + 3 spaces   
    int total_pad = maxw + 3;   
      
    for (int i = 0; tests[i]; i++) {   
    int w = utf8_display_width(tests[i]);   
    int sl = strlen(tests[i]);   
    printf("%s", tests[i]);   
    int pad = total_pad - w;   
    while (pad-- > 0) putchar(' ');   
    printf("strlen: %d utf8 display width: %d\n", sl, w);   
    }   
      
    return 0;   
   }   
      
   // eof   
      
   --   
   :wq   
   Mike Sanders   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|