home bbs files messages ]

Forums before death by AOL, social media and spammers... "We can't have nice things"

   comp.lang.c      Meh, in C you gotta define EVERYTHING      243,242 messages   

[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]

   Message 242,329 of 243,242   
   Michael Sanders to Bonita Montero   
   Re: Unicode...   
   03 Dec 25 18:33:05   
   
   From: porkchop@invalid.foo   
      
   On Wed, 3 Dec 2025 06:24:23 +0100, Bonita Montero wrote:   
      
   >> Here I'm running any mixture of: Windows/BSD/Linux Mint LMDE.   
   >   
   > Windows has the ...W() APIs along with codepage-based APIs with   
   > the ...A() Suffix. The W()-APIs support UTF-16, so no need for   
      
   Hi Bonita.   
      
   Yes that's correct, but...   
      
   - that assumes we know in advance what the character is   
      
   - it would only work under Windows   
      
   We want portability across diverse OSs. In my case, the program   
   does NOT care what the character is, it simply needs to be able   
   to find it when searching data & displaying it in an ordered way.   
      
   The code below works perfectly:   
      
   #include <stdio.h>
   #include <string.h>
      
   int utf8_display_width(const char *s) {   
       int w = 0;   
      
       while (*s) {   
           unsigned char b = *s;   
           unsigned cp;   
           int n;   
      
           // UTF-8 decoder   
           if (b <= 0x7F) { // 1-byte ASCII   
               cp = b;   
               n  = 1;   
           } else if (b >= 0xC0 && b <= 0xDF) { // 2-byte   
               cp = ((b & 0x1F) << 6) |   
                    (s[1] & 0x3F);   
               n  = 2;   
           } else if (b >= 0xE0 && b <= 0xEF) { // 3-byte   
               cp = ((b & 0x0F) << 12)   |   
                    ((s[1] & 0x3F) << 6) |   
                    (s[2] & 0x3F);   
                n = 3;   
           } else if (b >= 0xF0 && b <= 0xF7) { // 4-byte   
               cp = ((b & 0x07) << 18)    |   
                    ((s[1] & 0x3F) << 12) |   
                    ((s[2] & 0x3F) << 6)  |   
                    (s[3] & 0x3F);   
                n = 4;   
           } else { // invalid, treat as 1-byte   
               cp = b;   
               n  = 1;   
           }   
      
           // display width   
           if (cp >= 0x0300 && cp <= 0x036F) {}   // combining marks like é   
   (zero width)   
           else if (                              // double-width characters...   
               (cp >= 0x1100  && cp <= 0x115F) || // hangul jamo   
               (cp >= 0x2E80  && cp <= 0xA4CF) || // cjk radicals & unified   
   ideographs   
               (cp >= 0xAC00  && cp <= 0xD7A3) || // hangul syllables   
               (cp >= 0xF900  && cp <= 0xFAFF) || // cjk compatibility ideographs   
               (cp >= 0x1F300 && cp <= 0x1FAFF)   // emoji + symbols   
           ) { w += 2; }   
           // exceptional wide characters (unicode requirement I've read   
   elsewhere)   
           else if (cp == 0x2329 || cp == 0x232A) { w += 2; }   
           else { w += 1; } // normal width for everything else   
      
           s += n;   
       }   
      
       return w;   
   }   
      
   int main(void) {   
       const char *tests[] = {   
           "hello",   
           "Café",   
           "漢字",   
           "✓",   
           "🙂",   
           NULL   
       };   
      
       // find maximum display width in 1st column   
       int maxw = 0;   
       for (int i = 0; tests[i]; i++) {   
           int w = utf8_display_width(tests[i]);   
           if (w > maxw) maxw = w;   
       }   
      
       // total padding after each 1st column + 3 spaces   
       int total_pad = maxw + 3;   
      
       for (int i = 0; tests[i]; i++) {   
           int w = utf8_display_width(tests[i]);   
           int sl = strlen(tests[i]);   
           printf("%s", tests[i]);   
           int pad = total_pad - w;   
           while (pad-- > 0) putchar(' ');   
           printf("strlen: %d  utf8 display width: %d\n", sl, w);   
       }   
      
       return 0;   
   }   
      
   // eof   
      
   --   
   :wq   
   Mike Sanders   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   

[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]


(c) 1994,  bbs@darkrealms.ca