Forums before death by AOL, social media and spammers... "We can't have nice things"
|    comp.lang.c    |    Meh, in C you gotta define EVERYTHING    |    243,242 messages    |
[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]
|    Message 242,038 of 243,242    |
|    bart to Bonita Montero    |
|    Re: Unicode...    |
|    22 Nov 25 13:38:27    |
   
   From: bc@freeuk.com   
      
   On 22/11/2025 13:10, Bonita Montero wrote:   
   > This code with AVX512BW and BMI1 is 13.5 times faster than yours on my   
   > Zen4-PC.   
   >   
   > size_t utf8Width2( const char *s )   
   >   
   > {   
   > __m512i const   
   > ZERO = _mm512_setzero_si512(),   
   > ONE_MASK = _mm512_set1_epi8( (char)0x80 ),   
   > ONE_HEAD = ZERO,   
   > TWO_MASK = _mm512_set1_epi8( (char)0xE0 ),   
   > TWO_HEAD = _mm512_set1_epi8( (char)0xC0 ),   
   > THREE_MASK = _mm512_set1_epi8( (char)0xF0 ),   
   > THREE_HEAD = _mm512_set1_epi8( (char)0xE0 ),   
   > FOUR_MASK = _mm512_set1_epi8( (char)0xF8 ),   
   > FOUR_HEAD = _mm512_set1_epi8( (char)0xF0 );   
   > uintptr_t   
   > begin = (uintptr_t)s,   
   > base = begin & -64;   
   > s = (char *)base;   
   > size_t count = 0;   
   > __m512i chunk;   
   > uint64_t nzMask;   
   > auto doChunk = [&]() L_FORCEINLINE   
   > {   
   > uint64_t   
   > one = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk,   
   > ONE_MASK ), ONE_HEAD ) & nzMask,   
   > two = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk,   
   > TWO_MASK ), TWO_HEAD ) & nzMask,   
   > three = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk,   
   > THREE_MASK ), THREE_HEAD ) & nzMask,   
   > four = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk,   
   > FOUR_MASK ), FOUR_HEAD ) & nzMask;   
   > count += _mm_popcnt_u64( one ) + _mm_popcnt_u64( two ) +   
   > _mm_popcnt_u64( three ) + _mm_popcnt_u64( four );   
   > };   
   > chunk = _mm512_loadu_si512( s );   
   > unsigned head = (unsigned)(begin - base);   
   > nzMask = ~_mm512_cmpeq_epi8_mask( chunk, ZERO ) >> head;   
   > unsigned ones = countr_one( nzMask );   
   > nzMask &= ones < 64 ? (1ull << ones) - 1 : -1;   
   > nzMask <<= head;   
   > doChunk();   
   > if( (int64_t)nzMask >= 0 )   
   > return count;   
   > for( ; ; )   
   > {   
   > s += 64;   
   > chunk = _mm512_loadu_si512( s );   
   > nzMask = ~_mm512_cmpeq_epi8_mask( chunk, ZERO );   
   > ones = countr_one( nzMask );   
   > nzMask = ones < 64 ? (1ull << ones) - 1 : -1;   
   > if( !nzMask )   
   > break;   
   > doChunk();   
   > }   
   > return count;   
   > }   
      
      
   Doesn't compile, even after I add suitable *intrin headers.   
      
   I took out L_FORCEINLINE (not recognised); added std:: to countr_one,   
   but it still gave me errors like this:   
      
   C:/tdm/lib/gcc/x86_64-w64-mingw32/14.1.0/include/popcntintrin.h: In   
   lambda function:   
   C:/tdm/lib/gcc/x86_64-w64-mingw32/14.1.0/include/popcntintrin.h:42:1:   
   error: inlining failed in call to 'always_inline' 'long long int   
   _mm_popcnt_u64(long long unsigned int)': target specific option mismatch   
    42 | _mm_popcnt_u64 (unsigned long long __X)   
    | ^~~~~~~~~~~~~~   
      
      
   You have to give complete, compilable code, or code that has only simple   
   dependencies like stdio.h.   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|
[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]
(c) 1994, bbs@darkrealms.ca