Forums before death by AOL, social media and spammers... "We can't have nice things"
|    comp.lang.c    |    Meh, in C you gotta define EVERYTHING    |    243,242 messages    |
[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]
|    Message 242,037 of 243,242    |
|    Bonita Montero to All    |
|    Re: Unicode...    |
|    22 Nov 25 15:08:16    |
   
   From: Bonita.Montero@gmail.com   
      
   Am 22.11.2025 um 14:38 schrieb bart:   
   > Doesn't compile, even after I add suitable *intrin headers.   
   > I took out L_FORCEINLINE (not recognised); added std:: to countr_one,   
   > but it still gave me errors like this:   
   > C:/tdm/lib/gcc/x86_64-w64-mingw32/14.1.0/include/popcntintrin.h: In   
   > lambda function:   
   > C:/tdm/lib/gcc/x86_64-w64-mingw32/14.1.0/include/popcntintrin.h:42:1:   
   > error: inlining failed in call to 'always_inline' 'long long int   
   > _mm_popcnt_u64(long long unsigned int)': target specific option mismatch   
   > 42 | _mm_popcnt_u64 (unsigned long long __X)   
   > | ^~~~~~~~~~~~~~   
   > You have to give complete compilable code or have only simple   
   > dependencies like stdio.h.   
      
   Try __attribute__((always_inline)) instead. The code requires g++ with
   AVX-512 code generation enabled and an AVX-512-capable CPU (Intel since
   the Skylake-X Xeons, AMD since Zen 4).
   If you want to test on an older CPU you can stick with the code below,
   which uses AVX2.
   On my CPU this is only seven times faster than yours. AVX-512 really
   rocks the house.
      
   size_t utf8Width256( const char *s )   
   {   
    __m256i const   
    ZERO = _mm256_setzero_si256(),   
    ONE_MASK = _mm256_set1_epi8( (char)0x80 ),   
    ONE_HEAD = ZERO,   
    TWO_MASK = _mm256_set1_epi8( (char)0xE0 ),   
    TWO_HEAD = _mm256_set1_epi8( (char)0xC0 ),   
    THREE_MASK = _mm256_set1_epi8( (char)0xF0 ),   
    THREE_HEAD = _mm256_set1_epi8( (char)0xE0 ),   
    FOUR_MASK = _mm256_set1_epi8( (char)0xF8 ),   
    FOUR_HEAD = _mm256_set1_epi8( (char)0xF0 );   
    uintptr_t   
    begin = (uintptr_t)s,   
    base = begin & -32;   
    s = (char *)base;   
    size_t count = 0;   
    __m256i chunk;   
    uint32_t nzMask;   
    auto doChunk = [&]() L_FORCEINLINE   
    {   
    uint32_t   
    one = _mm256_movemask_epi8( _mm256_cmpeq_epi8(   
   _mm256_and_si256( chunk, ONE_MASK ), ONE_HEAD ) ) & nzMask,   
    two = _mm256_movemask_epi8( _mm256_cmpeq_epi8(   
   _mm256_and_si256( chunk, TWO_MASK ), TWO_HEAD ) ) & nzMask,   
    three = _mm256_movemask_epi8( _mm256_cmpeq_epi8(   
   _mm256_and_si256( chunk, THREE_MASK ), THREE_HEAD ) ) & nzMask,   
    four = _mm256_movemask_epi8( _mm256_cmpeq_epi8(   
   _mm256_and_si256( chunk, FOUR_MASK ), FOUR_HEAD ) ) & nzMask;   
    count += _mm_popcnt_u64( one ) + _mm_popcnt_u64( two ) +   
   _mm_popcnt_u64( three ) + _mm_popcnt_u64( four );   
    };   
      
    chunk = _mm256_loadu_si256( (__m256i *)s );   
    unsigned head = (unsigned)(begin - base);   
    nzMask = ~_mm256_movemask_epi8( _mm256_cmpeq_epi8( chunk, ZERO ) )   
    >> head;   
    unsigned ones = countr_one( nzMask );   
    nzMask &= ones < 32 ? (1ull << ones) - 1 : -1;   
    nzMask <<= head;   
    doChunk();   
    if( (int32_t)nzMask >= 0 )   
    return count;   
    for( ; ; )   
    {   
    s += 32;   
    chunk = _mm256_loadu_si256( (__m256i *)s );   
    nzMask = ~_mm256_movemask_epi8( _mm256_cmpeq_epi8( chunk, ZERO )   
   );   
    ones = countr_one( nzMask );   
    nzMask = ones < 32 ? (1ull << ones) - 1 : -1;   
    if( !nzMask )   
    break;   
    doChunk();   
    }   
    return count;   
   }   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|
[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]
(c) 1994, bbs@darkrealms.ca