From: Bonita.Montero@nospicedham.gmail.com   
      
   This isn't really an asm question, but it is about TSX/RTM, so it is
   x86-architecture-related. I want to test whether TSX/RTM can be faster
   than LOCK XADD or LOCK CMPXCHG for an atomic operation on a
   size_t-sized operand.
      
   So here's the test-code:   
      
   #if defined(_MSC_VER)
    #include <Windows.h>
    #include <intrin.h>
   #elif defined(__unix__)
    #include <immintrin.h>
    #include <cpuid.h>
    #include <pthread.h>
    #include <sched.h>
   #endif
   #include <array>
   #include <atomic>
   #include <chrono>
   #include <cstddef>
   #include <cstdint>
   #include <cstdlib>
   #include <functional>
   #include <iostream>
   #include <thread>
   #include <vector>
      
   bool hasTSX();   
      
   using namespace std;   
   using namespace chrono;   
      
   inline   
   size_t fetchAdd( size_t volatile &v, size_t a )   
   {   
   #if defined(_MSC_VER)   
    #if defined(_M_X64)   
    return (size_t)_InterlockedExchangeAdd64( &(__int64 &)v, (__int64)a );   
    #elif defined(_M_IX86)   
    return (size_t)_InterlockedExchangeAdd( &(long &)v, (long)a );   
    #else   
    #error unsupported architecture   
    #endif   
   #elif defined(__GNUC__) || defined(__clang__)   
    return __sync_fetch_and_add( &v, a );   
   #else   
    #error unsupported architecture   
   #endif   
   }   
      
   inline   
   bool rtmFetchAdd( size_t volatile &v, size_t a )   
   {   
    if( _xbegin() == _XBEGIN_STARTED )   
    {   
    v += a;   
    _xend();   
    return true;   
    }   
    else   
    return false;   
   }   
      
   inline   
   size_t compareExchange( size_t volatile &v, size_t c, size_t x )   
   {   
   #if defined(_MSC_VER)   
    #if defined(_M_X64)   
    return (size_t)_InterlockedCompareExchange64( &(__int64 &)v,   
   (__int64)x, (__int64)c );   
    #elif defined(_M_IX86)   
    return (size_t)_InterlockedCompareExchange( &(long &)v, (long)x,   
   (long)c );   
    #else   
    #error unsupported architecture   
    #endif   
   #elif defined(__GNUC__) || defined(__clang__)   
    return __sync_val_compare_and_swap( &v, c, x );   
   #else   
    #error unsupported architecture   
   #endif   
   }   
      
   int main( int argc, char **argv )   
   {   
    if( argc < 2 )   
    return -1;   
    double nsPerClockCycle = 1.0 / (atof( argv[1] ) * 1.0e9);   
      
    auto thrXadd = []( uint8_t volatile &run, size_t adds, size_t   
   volatile &atm, atomic &misses )   
    {   
    while( !run );   
    for( size_t i = adds; i; --i )   
    fetchAdd( atm, 1 );   
    };   
    auto thrXchg = []( uint8_t volatile &run, size_t adds, size_t   
   volatile &atm, atomic &misses )   
    {   
    while( !run );   
    size_t missed = 0;   
    for( size_t i = adds, cmp = atm; i; --i )   
    {   
    for( size_t res; ; )   
    if( (res = compareExchange( atm, cmp, cmp + 1 )) == cmp )   
    {   
    cmp = cmp + 1;   
    break;   
    }   
    else   
    cmp = res,   
    ++missed;   
    }   
    misses.fetch_add( missed );   
    };   
    auto rtmAdd = []( uint8_t volatile &run, size_t adds, size_t   
   volatile &atm, atomic &misses )   
    {   
    while( !run );   
    size_t missed = 0;   
    for( size_t i = adds; i; --i )   
    while( !rtmFetchAdd( atm, 1 ) )   
    ++missed;   
    misses.fetch_add( missed );   
    };   
    using threadfunc = void (*)( uint8_t volatile &, size_t, size_t   
   volatile &, atomic & );   
    array atf;   
    array threadDescr;   
    size_t nTests;   
    size_t const ADDS = 10'000'000;   
    unsigned nProcessors = thread::hardware_concurrency();   
      
    atf[0] = thrXadd;   
    atf[1] = thrXchg;   
    atf[2] = rtmAdd;   
    threadDescr[0] = "xadd-thread";   
    threadDescr[1] = "cmpxchge-thread";   
    threadDescr[2] = "rtm-thread";   
    nTests = hasTSX() ? atf.size() : atf.size() - 1;   
      
    for( size_t m = 0; m != nTests; ++m )   
    {   
    cout << threadDescr[m] << ":" << endl;   
    for( unsigned nThreads = 1; nThreads <= nProcessors; ++nThreads )   
    {   
    atomic misses( 0 );   
    uint8_t run = false;   
    size_t atm;   
    vector threads;   
    for( unsigned i = 0; i != nThreads; ++i )   
    {   
    threads.emplace_back( atf[m], ref( run ), ADDS, ref(   
   atm ), ref( misses ) );   
   #if defined(_MSC_VER)   
    SetThreadAffinityMask( threads[i].native_handle(),   
   (DWORD_PTR)1 << i );   
   #elif defined(__unix__)   
    cpu_set_t cpuset;   
    CPU_ZERO(&cpuset);   
    CPU_SET(i, &cpuset);   
    pthread_setaffinity_np( threads[i].native_handle(),   
   sizeof cpuset, &cpuset );   
   #endif   
    }   
    time_point start =   
   high_resolution_clock::now();   
    run = true;   
    for( unsigned i = 0; i != nThreads; ++i )   
    threads[i].join();   
    uint64_t ns = (uint64_t)duration_cast(   
   high_resolution_clock::now() - start ).count();;   
      
    double nsPerAdd = (double)ns / nThreads / ADDS / 1.0e9;   
    cout << "threads: " << nThreads << " cycles: " << nsPerAdd   
   / nsPerClockCycle << " misses-ratio: " << (int)(100.0 * (size_t)misses /   
   nThreads / ADDS) << "%" << endl;   
    }   
    cout << endl;   
    }   
   }   
      
   bool hasTSX()   
   {   
   #if defined(_MSC_VER)   
    int regs[4];   
    __cpuidex( regs, 7, 0 );   
    return regs[1] & (1 << 11);   
   #else   
    return true;   
   #endif   
   }   
      
   So can anyone here compile this with MSVC++ or gcc / clang, run it on a
   Skylake or newer CPU with TSX, and give me the output? With gcc / clang
   you need the compiler option "-mrtm" to enable RTM.
   When running the program you need to pass the base clock of the CPU in
   GHz as the first argument. The program gives an estimate (because the
   real clock may vary due to boosting) of the clock cycles spent on each
   successful increment.
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|