From: Bonita.Montero@gmail.com   
      
   Now I've developed a benchmark which tests the static comparison approach
   vs. the table approach vs. an AVX2 approach vs. an AVX-512 approach. These
   are the results with clang 20:
      
    check: 2.17442   
    table: 2.00056 (109%)   
    AVX-256: 0.183048 (1093%, 1188%)   
    AVX-512: 0.0639528 (286%, 3128%, 3400%)   
      
   The numbers in brackets are the speedups relative to each of the preceding
   results (nearest first). So the AVX-512 solution is 30+ times faster than
   the byte-wise solutions.
      
   This is the code:   
      
   #include    
   #include    
   #include    
   #include    
   #include    
   #include    
   #include    
   #include    
   #include    
   #include "inline.h"   
      
   using namespace std;   
   using namespace filesystem;   
   using namespace chrono;   
      
   template   
   bool binary( string const &buf );   
   template   
   bool binaryAvx( string const &buf );   
      
   int main()   
   {   
    ifstream ifs;   
    ifs.exceptions( ios_base::failbit | ios_base::badbit );   
    ifs.open( "main.cpp", ios_base::binary | ios_base::ate );   
    streampos pos = ifs.tellg();   
    if( pos > (size_t)-1 )   
    throw ios_base::failure( "file too large", error_code(   
   (int)errc::file_too_large, generic_category() ) );   
    string buf( (size_t)pos, 0 );   
    ifs.seekg( 0 );   
    ifs.read( buf.data(), buf.size() );   
    array results;   
    using test_fn = function;   
    auto bench = [&]( size_t i, char const *what, test_fn const &test )   
   L_FORCEINLINE -> int   
    {   
    int ret = 0;   
    auto start = high_resolution_clock::now();   
   #if defined(NDEBUG)   
    constexpr size_t N = 1'000'000;   
   #else   
    constexpr size_t N = 1'000;   
   #endif   
    for( size_t r = N; r; --r )   
    ret += test( buf );   
    double secs = (double)duration_cast(   
   high_resolution_clock::now() - start ).count() / 1.0e9;   
    cout << what << ": " << secs;   
    results[i] = secs;   
    if( i )   
    {   
    cout << " (";   
    do   
    {   
    cout << (int)(100.0 * results[--i] / secs + 0.5) <<   
   "%";   
    if( i )   
    cout << ", ";   
    } while( i );   
    cout << ")";   
    }   
    cout << endl;   
    return ret;   
    };   
    struct test { char const *descr; test_fn fn; };   
    array tests =   
    {   
    test( "check", +[]( string const &str ) -> int { return   
   binary( str ); } ),   
    test( "table", +[]( string const &str ) -> int { return   
   binary( str ); } ),   
    test( "AVX-256", +[]( string const &str ) -> int { return   
   binaryAvx( str ); } ),   
    test( "AVX-512", +[]( string const &str ) -> int { return   
   binaryAvx( str ); } )   
    };   
    int ret = 0;   
    for( size_t t = 0; test const &test : tests )   
    ret += bench( t++, test.descr, test.fn );   
    return ret;   
   }   
      
   template   
   bool binary( string const &buf )   
   {   
    static auto invalid = []( unsigned char c ) static { return c <   
   0x20 && c != '\r' && c != '\n' && c != '\t'; };   
    if constexpr( Table )   
    {   
    static vector invalidTbl = Table ? []()   
    {   
    vector ret( numeric_limits::max() );   
    for( size_t c = ret.size(); c--; )   
    ret[c] = invalid( (unsigned char)c );   
    return ret;   
    }() : vector();   
    return find_if( buf.begin(), buf.end(), [&]( unsigned char c )   
   { return invalidTbl[c]; } ) == buf.end();   
    }   
    else   
    return find_if( buf.begin(), buf.end(), invalid ) == buf.end();   
   }   
      
   template   
   bool binaryAvx( string const &buf )   
   {   
    char const   
    *pBegin = buf.data(),   
    *pEnd = pBegin + buf.size();   
    if constexpr( Avx512 )   
    {   
    size_t   
    head = (size_t)pBegin & 63,   
    tail = (size_t)pEnd & 63;   
    span<__m512i const> range( (__m512i *)(pBegin - head), (__m512i   
   *)(pEnd - tail + (tail ? 64 : 0)) );   
    __m512i const   
    printable = _mm512_set1_epi8( (char)0x20 ),   
    cr = _mm512_set1_epi8( (char)'\r' ),   
    lf = _mm512_set1_epi8( (char)'\n' ),   
    tab = _mm512_set1_epi8( (char)'\t' );   
    uint64_t mask = (uint64_t)-1ll << head;   
    auto cur = range.begin(), end = range.end();   
    auto doChunk = [&]() -> bool   
    {   
    __m512i chunk = _mm512_loadu_epi8( (void *)to_address( cur   
   ) );   
    uint64_t   
    spaMask = _mm512_cmpge_epu8_mask( chunk, printable ),   
    crMask = _mm512_cmpeq_epi8_mask( chunk, cr ),   
    lfMask = _mm512_cmpeq_epi8_mask( chunk, lf ),   
    tabMask = _mm512_cmpeq_epi8_mask( chunk, tab );   
    return ((spaMask | crMask | lfMask | tabMask) & mask) ==   
   mask;   
    };   
    for( ; cur != end - (bool)tail; ++cur, mask = -1ll )   
    if( !doChunk() )   
    return false;   
    if( tail )   
    {   
    mask = ~((uint64_t)-1ll << tail);   
    if( !doChunk() )   
    return false;   
    }   
    }   
    else   
    {   
    size_t   
    head = (size_t)pBegin & 31,   
    tail = (size_t)pEnd & 31;   
    span<__m256i const> range( (__m256i *)(pBegin - head), (__m256i   
   *)(pEnd - tail + (tail ? 32 : 0)) );   
    __m256i const   
    zero = _mm256_setzero_si256(),   
    printable = _mm256_set1_epi8( (char)0xE0 ),   
    cr = _mm256_set1_epi8( (char)'\r' ),   
    lf = _mm256_set1_epi8( (char)'\n' ),   
    tab = _mm256_set1_epi8( (char)'\t' );   
    uint32_t mask = (uint32_t)-1 << head;   
    auto cur = range.begin(), end = range.end();   
    auto doChunk = [&]() -> bool   
    {   
    __m256i chunk = _mm256_loadu_epi8( (void *)to_address( cur   
   ) );   
    uint32_t   
    spaMask = ~_mm256_movemask_epi8( _mm256_cmpeq_epi8(   
   _mm256_and_si256( chunk, printable ), zero ) ),   
      
   [continued in next message]   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|