... darkrealms ...

Forums before death by AOL, social media and spammers... "We can't have nice things"
comp.lang.c
Meh, in C you gotta define EVERYTHING
243,242 messages
[ << oldest | < older | list | newer > | newest >> ]
Message 242,341 of 243,242
Bonita Montero to All
Re: Unicode...
04 Dec 25 14:03:54
   From: Bonita.Montero@gmail.com   
      
   Am 03.12.2025 um 19:33 schrieb Michael Sanders:   
   > On Wed, 3 Dec 2025 06:24:23 +0100, Bonita Montero wrote:   
   >   
   >>> Here I'm running any mixture of: Windows/BSD/Linux Mint LMDE.   
   >> Windows has the ...W() APIs along with codepage-based APIs with   
   >> the ...A() Suffix. The W()-APIs support UTF-16, so no need for   
   > Hi Bonita.   
   >   
   > Yes that's correct, but...   
   >   
   > - that assumes we know in advance what the character is   
   >   
   > - it would only work under Windows   
   >   
   > We want portability across diverse OSs. In my case, the program   
   > does NOT care what the character is, it simply needs to be able   
   > to find it when searching data & displaying it in an ordered way.   
   VC++ supports both the C and the C++ locale facilities if you would   
   like to keep it portable. The C++ locale support, with its facets,   
   is especially pleasant to work with:   
   https://en.cppreference.com/w/cpp/locale.html   
      
   >   
   > The code below works perfectly:   
   >   
   > #include <stdio.h>   
   > #include <string.h>   
   >   
   > int utf8_display_width(const char *s) {   
   >      int w = 0;   
   >   
   >      while (*s) {   
   >          unsigned char b = *s;   
   >          unsigned cp;   
   >          int n;   
   >   
   >          // UTF-8 decoder   
   >          if (b <= 0x7F) { // 1-byte ASCII   
   >              cp = b;   
   >              n  = 1;   
   >          } else if (b >= 0xC0 && b <= 0xDF) { // 2-byte   
   >              cp = ((b & 0x1F) << 6) |   
   >                   (s[1] & 0x3F);   
   >              n  = 2;   
   >          } else if (b >= 0xE0 && b <= 0xEF) { // 3-byte   
   >              cp = ((b & 0x0F) << 12)   |   
   >                   ((s[1] & 0x3F) << 6) |   
   >                   (s[2] & 0x3F);   
   >               n = 3;   
   >          } else if (b >= 0xF0 && b <= 0xF7) { // 4-byte   
   >              cp = ((b & 0x07) << 18)    |   
   >                   ((s[1] & 0x3F) << 12) |   
   >                   ((s[2] & 0x3F) << 6)  |   
   >                   (s[3] & 0x3F);   
   >               n = 4;   
   >          } else { // invalid, treat as 1-byte   
   >              cp = b;   
   >              n  = 1;   
   >          }   
   >   
   >          // display width   
   >          if (cp >= 0x0300 && cp <= 0x036F) {}   // combining marks like é (zero width)   
   >          else if (                              // double-width characters...   
   >              (cp >= 0x1100  && cp <= 0x115F) || // hangul jamo   
   >              (cp >= 0x2E80  && cp <= 0xA4CF) || // cjk radicals & unified ideographs   
   >              (cp >= 0xAC00  && cp <= 0xD7A3) || // hangul syllables   
   >              (cp >= 0xF900  && cp <= 0xFAFF) || // cjk compatibility ideographs   
   >              (cp >= 0x1F300 && cp <= 0x1FAFF)   // emoji + symbols   
   >          ) { w += 2; }   
   >          // exceptional wide characters (unicode requirement I've read elsewhere)   
   >          else if (cp == 0x2329 || cp == 0x232A) { w += 2; }   
   >          else { w += 1; } // normal width for everything else   
   >   
   >          s += n;   
   >      }   
   >   
   >      return w;   
   > }   
   >   
   > int main(void) {   
   >      const char *tests[] = {   
   >          "hello",   
   >          "Café",   
   >          "漢字",   
   >          "✓",   
   >          "🙂",   
   >          NULL   
   >      };   
   >   
   >      // find maximum display width in 1st column   
   >      int maxw = 0;   
   >      for (int i = 0; tests[i]; i++) {   
   >          int w = utf8_display_width(tests[i]);   
   >          if (w > maxw) maxw = w;   
   >      }   
   >   
   >      // total padding after each 1st column + 3 spaces   
   >      int total_pad = maxw + 3;   
   >   
   >      for (int i = 0; tests[i]; i++) {   
   >          int w = utf8_display_width(tests[i]);   
   >          int sl = strlen(tests[i]);   
   >          printf("%s", tests[i]);   
   >          int pad = total_pad - w;   
   >          while (pad-- > 0) putchar(' ');   
   >          printf("strlen: %d  utf8 display width: %d\n", sl, w);   
   >      }   
   >   
   >      return 0;   
   > }   
   >   
   > // eof   
   >   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)
[ << oldest | < older | list | newer > | newest >> ]