From: Bonita.Montero@gmail.com   
      
   Compile this with -mavx512bw and -std=c++23.
      
   #include    
   #include    
   #include    
   #include    
   #include    
   #include    
   #include    
   #include    
   #if defined(_MSC_VER)   
    #include    
   #elif defined(__GNUC__) || defined(__clang__)   
    #include    
   #endif   
   #include "inline.h"   
      
   #if defined(_MSC_VER) && !defined(__clang__)   
    #pragma warning(disable: 26815) // dangling pointer   
   #endif   
      
   using namespace std;   
   using namespace chrono;   
      
   // Counts the UTF-8 sequences in `str`, one per loop iteration, and returns
   // that count. When `Validate` is true, malformed input makes the function
   // return -1 converted to size_t (i.e. the maximum size_t value).
   //
   // NOTE(review): the template parameter list and the arguments of the two
   // std::same_as constraints are missing here - they look like they were
   // stripped together with the #include targets when the post was relayed.
   // The body needs a compile-time bool `Validate` and a view type `View`;
   // presumably View is constrained to two string-view types - confirm
   // against the original source before compiling.
   template
    requires std::same_as || std::same_as
   NOINLINE size_t utf8Width( View str )
   {
    // rem: elements still to be consumed; w: sequences counted so far;
    // width: length of the sequence currently being consumed.
    ptrdiff_t rem = str.end() - str.begin(), w = 0, width;
    for( auto it = str.begin(); rem > 0; rem -= width, ++w ) [[likely]]
    {
    // The number of leading one bits of the first byte is the sequence length
    // (110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4) ...
    width = countl_one( (unsigned char)*it );
    // ... except for 0xxxxxxx, which has no leading ones and is one byte long.
    width += (size_t)!width;
    // Validating build: reject a continuation byte (10xxxxxx) in lead position,
    // a sequence longer than 4 bytes, or one longer than the remaining input.
    if constexpr( Validate )
    if( (*it & 0xC0) == 0x80 || width > min( 4Z, rem ) )
   [[unlikely]]
    return -1;
    auto end = it + width;
    // Non-validating build: skip the whole sequence without looking at it; no
    // bounds check is made here, so a truncated final sequence moves `it`
    // past str.end().
    if constexpr( !Validate )
    it = end;
    else
    // Validating build: every byte after the lead must be 10xxxxxx.
    while( ++it != end )
    if( (*it & 0xC0) != 0x80 )
    return -1;
    }
    // With the bound check inside the loop, rem cannot drop below zero in the
    // validating build, so this is a defensive check that the input was
    // consumed exactly.
    if constexpr( Validate )
    if( rem )
    return -1;
    return w;
   }
      
   NOINLINE size_t utf8widthC( char const *str )   
   {   
    size_t length = 0, n;   
    for( char8_t c; (c = *str); ++length )   
    {   
    if( (c & 0x80) == 0 )   
    n = 1;   
    else if( (c & 0xE0) == 0xC0 )   
    n = 2;   
    else if( (c & 0xF0) == 0xE0 )   
    n = 3;   
    else   
    n = 4;   
    n += (size_t)!n;   
    str += n;   
    }   
    return length;   
   }   
      
   NOINLINE size_t utf8Width512( const char *s )   
   {   
    __m512i const   
    ZERO = _mm512_setzero_si512(),   
    ONE_MASK = _mm512_set1_epi8( (char)0x80 ),   
    ONE_HEAD = ZERO,   
    TWO_MASK = _mm512_set1_epi8( (char)0xE0 ),   
    TWO_HEAD = _mm512_set1_epi8( (char)0xC0 ),   
    THREE_MASK = _mm512_set1_epi8( (char)0xF0 ),   
    THREE_HEAD = _mm512_set1_epi8( (char)0xE0 ),   
    FOUR_MASK = _mm512_set1_epi8( (char)0xF8 ),   
    FOUR_HEAD = _mm512_set1_epi8( (char)0xF0 );   
    uintptr_t   
    begin = (uintptr_t)s,   
    base = begin & -64;   
    s = (char *)base;   
    size_t count = 0;   
    __m512i chunk;   
    uint64_t nzMask;   
    auto doChunk = [&]() L_FORCEINLINE   
    {   
    uint64_t   
    one = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk,   
   ONE_MASK ), ONE_HEAD ) & nzMask,   
    two = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk,   
   TWO_MASK ), TWO_HEAD ) & nzMask,   
    three = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk,   
   THREE_MASK ), THREE_HEAD ) & nzMask,   
    four = _mm512_cmpeq_epi8_mask( _mm512_and_si512( chunk,   
   FOUR_MASK ), FOUR_HEAD ) & nzMask;   
    count += _mm_popcnt_u64( one ) + _mm_popcnt_u64( two ) +   
   _mm_popcnt_u64( three ) + _mm_popcnt_u64( four );   
    };   
    chunk = _mm512_loadu_si512( s );   
    unsigned head = (unsigned)(begin - base);   
    nzMask = ~_mm512_cmpeq_epi8_mask( chunk, ZERO ) >> head;   
    unsigned ones = countr_one( nzMask );   
    nzMask &= ones < 64 ? (1ull << ones) - 1 : -1;   
    nzMask <<= head;   
    doChunk();   
    if( (int64_t)nzMask >= 0 )   
    return count;   
    for( ; ; )   
    {   
    s += 64;   
    chunk = _mm512_loadu_si512( s );   
    nzMask = ~_mm512_cmpeq_epi8_mask( chunk, ZERO );   
    ones = countr_one( nzMask );   
    nzMask = ones < 64 ? (1ull << ones) - 1 : -1;   
    if( !nzMask )   
    break;   
    doChunk();   
    }   
    return count;   
   }   
      
   NOINLINE size_t utf8Width256( const char *s )   
   {   
    __m256i const   
    ZERO = _mm256_setzero_si256(),   
    ONE_MASK = _mm256_set1_epi8( (char)0x80 ),   
    ONE_HEAD = ZERO,   
    TWO_MASK = _mm256_set1_epi8( (char)0xE0 ),   
    TWO_HEAD = _mm256_set1_epi8( (char)0xC0 ),   
    THREE_MASK = _mm256_set1_epi8( (char)0xF0 ),   
    THREE_HEAD = _mm256_set1_epi8( (char)0xE0 ),   
    FOUR_MASK = _mm256_set1_epi8( (char)0xF8 ),   
    FOUR_HEAD = _mm256_set1_epi8( (char)0xF0 );   
    uintptr_t   
    begin = (uintptr_t)s,   
    base = begin & -32;   
    s = (char *)base;   
    size_t count = 0;   
    __m256i chunk;   
    uint32_t nzMask;   
    auto doChunk = [&]() L_FORCEINLINE   
    {   
    uint32_t   
    one = _mm256_movemask_epi8( _mm256_cmpeq_epi8(   
   _mm256_and_si256( chunk, ONE_MASK ), ONE_HEAD ) ) & nzMask,   
    two = _mm256_movemask_epi8( _mm256_cmpeq_epi8(   
   _mm256_and_si256( chunk, TWO_MASK ), TWO_HEAD ) ) & nzMask,   
    three = _mm256_movemask_epi8( _mm256_cmpeq_epi8(   
   _mm256_and_si256( chunk, THREE_MASK ), THREE_HEAD ) ) & nzMask,   
    four = _mm256_movemask_epi8( _mm256_cmpeq_epi8(   
   _mm256_and_si256( chunk, FOUR_MASK ), FOUR_HEAD ) ) & nzMask;   
    count += _mm_popcnt_u64( one ) + _mm_popcnt_u64( two ) +   
   _mm_popcnt_u64( three ) + _mm_popcnt_u64( four );   
    };   
    chunk = _mm256_loadu_si256( (__m256i *)s );   
    unsigned head = (unsigned)(begin - base);   
    nzMask = ~_mm256_movemask_epi8( _mm256_cmpeq_epi8( chunk, ZERO ) )   
    >> head;   
    unsigned ones = countr_one( nzMask );   
    nzMask &= ones < 32 ? (1ull << ones) - 1 : -1;   
    nzMask <<= head;   
    doChunk();   
    if( (int32_t)nzMask >= 0 )   
    return count;   
    for( ; ; )   
    {   
    s += 32;   
    chunk = _mm256_loadu_si256( (__m256i *)s );   
    nzMask = ~_mm256_movemask_epi8( _mm256_cmpeq_epi8( chunk, ZERO )   
   );   
    ones = countr_one( nzMask );   
    nzMask = ones < 32 ? (1ull << ones) - 1 : -1;   
    if( !nzMask )   
    break;   
    doChunk();   
    }   
    return count;   
   }   
      
   int main()   
   {   
    constexpr unsigned   
    TYPE1_BITS = 7,   
    TYPE2_BITS = 11,   
    TYPE3_BITS = 16,   
    TYPE4_BITS = 21;   
    constexpr char32_t   
    TYPE1_END = 1 << TYPE1_BITS,   
    TYPE2_END = 1 << TYPE2_BITS,   
    TYPE3_END = 1 << TYPE3_BITS,   
    TYPE4_END = 1 << TYPE4_BITS;   
      
   [continued in next message]   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|