From: Keith.S.Thompson+u@gmail.com   
      
   Michael Sanders writes:   
   > Well, I finally got bitten by Unicode.   
   >   
   > Managed a work around, but I don't have enough experience   
   > with Unicode to know just exactly what I'm doing...   
   >   
   > #include <stdio.h>
   > #include <string.h>
   >   
   > static int utf8_width(const char *s) {   
   > int w = 0;   
   > const unsigned char *p = (const unsigned char *)s;   
   >   
   > while (*p) {   
   > if (*p < 0x80) { w++; p++; } // ASCII 1-byte   
   > else if ((*p & 0xE0) == 0xC0) { w++; p += 2; } // 2-byte UTF-8   
   > else if ((*p & 0xF0) == 0xE0) { w++; p += 3; } // 3-byte UTF-8   
   > else if ((*p & 0xF8) == 0xF0) { w++; p += 4; } // 4-byte UTF-8   
   > else { w++; p++; } // fallback   
   > }   
   >   
   > return w;   
   > }   
   >   
   > int main(void) {   
   > const char *s = "élan";   
   > printf("string: %s\n", s);   
   > printf("strlen: %d\n", strlen(s)); // 4   
   > printf("utf8_width: %d\n", utf8_width(s)); //5   
   >   
   > return 0;   
   > }   
      
   I haven't really looked at the algorithm, but strlen returns a result   
   of type size_t, so the correct format in the second printf call is   
   "%zu", not "%d".   
      
   It would make sense for utf8_width to return size_t, which would   
   mean that the format in the third printf call would also be "%zu".   
      
   --   
   Keith Thompson (The_Other_Keith) Keith.S.Thompson+u@gmail.com   
   void Void(void) { Void(); } /* The recursive call of the void */   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|