home bbs files messages ]

Forums before death by AOL, social media and spammers... "We can't have nice things"

   comp.lang.c      Meh, in C you gotta define EVERYTHING      243,242 messages   

[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]

   Message 241,984 of 243,242   
   Michael Sanders to All   
   Unicode Sorting (Was Re: Unicode...)   
   16 Nov 25 20:30:46   
   
   From: porkchop@invalid.foo   
      
   Because someone reached out & helped me:   
      
      
      
   (earnest thanks I've hit the ground running), I'll gladly pay it forward...   
      
   Hybrid Unicode/ASCII sorting:   
      
   #include <stdio.h>
   #include <stdlib.h>
   #include <string.h>
      
   // Sort direction consulted by cmp_wrap(); main() sets it before each qsort() call.
   static int sort_order = 0; // 0 = sort ascending A-Z, 1 = sort descending Z-A
      
   int utf8_decode(const char *s, unsigned int *cp_out) {   
       const unsigned char *p = (const unsigned char *)s;   
       unsigned char c0 = p[0];   
      
       if (c0 < 0x80) { *cp_out = c0; return 1; }   
      
       if ((c0 & 0xE0) == 0xC0) {   
           unsigned char c1 = p[1];   
           if ((c1 & 0xC0) != 0x80) { *cp_out = c0; return 1; }   
           *cp_out = ((c0 & 0x1F) << 6) | (c1 & 0x3F);   
           return 2;   
       }   
      
       if ((c0 & 0xF0) == 0xE0) {   
           unsigned char c1 = p[1], c2 = p[2];   
           if ((c1 & 0xC0) != 0x80 || (c2 & 0xC0) != 0x80) {   
               *cp_out = c0; return 1;   
           }   
           *cp_out = ((c0 & 0x0F) << 12) |   
                     ((c1 & 0x3F) << 6)  |   
                     (c2 & 0x3F);   
           return 3;   
       }   
      
       if ((c0 & 0xF8) == 0xF0) {   
           unsigned char c1 = p[1], c2 = p[2], c3 = p[3];   
           if ((c1 & 0xC0) != 0x80 ||   
               (c2 & 0xC0) != 0x80 ||   
               (c3 & 0xC0) != 0x80) {   
               *cp_out = c0; return 1;   
           }   
           *cp_out = ((c0 & 0x07) << 18) |   
                     ((c1 & 0x3F) << 12) |   
                     ((c2 & 0x3F) << 6)  |   
                     (c3 & 0x3F);   
           return 4;   
       }   
      
       *cp_out = c0;   
       return 1;   
   }   
      
   int utf8_cmp(const char *sa, const char *sb) {   
       while (*sa && *sb) {   
           unsigned int ca, cb;   
           int la = utf8_decode(sa, &ca);   
           int lb = utf8_decode(sb, &cb);   
           if (ca != cb) return (ca < cb) ? -1 : 1;   
           sa += la;   
           sb += lb;   
       }   
       if (*sa == '\0' && *sb == '\0') return 0;   
       return (*sa == '\0') ? -1 : 1;   
   }   
      
   // qsort comparator wrapper   
   static int cmp_wrap(const void *A, const void *B) {   
       const char *a = *(const char * const *)A;   
       const char *b = *(const char * const *)B;   
       int r = utf8_cmp(a, b);   
       return sort_order ? -r : r;   
   }   
      
   int main(void) {   
       /* Test set with ASCII, accented chars, Chinese, emoji */   
       const char *items[] = {   
           "Apple",   
           "Banana",   
           "Árbol",   
           "世界",   
           "你",   
           "😀 Emoji",   
           "Zebra",   
           "Österreich",   
           "ábaco",   
           "Ωμέγα",   
           NULL   
       };   
      
       int count = 0;   
       while (items[count]) count++;   
      
       printf("UTF-8 SORT (ASCENDING):\n\n");   
       sort_order = 0;   
       qsort(items, count, sizeof(char *), cmp_wrap);   
       for (int i = 0; i < count; i++) printf(" %s\n", items[i]);   
      
       printf("\nUTF-8 SORT (DESCENDING):\n\n");   
       sort_order = 1;   
       qsort(items, count, sizeof(char *), cmp_wrap);   
       for (int i = 0; i < count; i++) printf(" %s\n", items[i]);   
      
       return 0;   
   }   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   

[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]


(c) 1994,  bbs@darkrealms.ca