... darkrealms ...

Forums before death by AOL, social media and spammers... "We can't have nice things"
comp.lang.asm.x86
Ahh, the lost art of x86 assembly
4,675 messages
[ << oldest | < older | list | newer > | newest >> ]
Message 3,960 of 4,675
Bonita Montero to All
RTM-question
27 Sep 19 22:01:33
   From: Bonita.Montero@nospicedham.gmail.com   
      
   This isn't strictly an asm question, but it is about TSX/RTM, so it is
   x86-architecture-related. I want to test whether TSX/RTM can perform an
   atomic operation on a size_t-sized operand faster than LOCK XADD or
   LOCK CMPXCHG.
      
   So here's the test-code:   
      
   // NOTE(review): the header names were stripped when this post was archived
   // (everything in angle brackets was lost). They are restored here from what
   // the code below actually uses - confirm against the original post.
   #if defined(_MSC_VER)
        #include <Windows.h>
        #include <intrin.h>
   #elif defined(__unix__)
        #include <pthread.h>
        #include <sched.h>
        #include <immintrin.h>
        #include <cpuid.h>
   #endif
   #include <array>
   #include <atomic>
   #include <chrono>
   #include <cstddef>
   #include <cstdint>
   #include <cstdlib>
   #include <functional>
   #include <iostream>
   #include <thread>
   #include <vector>
      
   bool hasTSX();   
      
   using namespace std;   
   using namespace chrono;   
      
   inline   
   size_t fetchAdd( size_t volatile &v, size_t a )   
   {   
   #if defined(_MSC_VER)   
        #if defined(_M_X64)   
        return (size_t)_InterlockedExchangeAdd64( &(__int64 &)v, (__int64)a );   
        #elif defined(_M_IX86)   
        return (size_t)_InterlockedExchangeAdd( &(long &)v, (long)a );   
        #else   
            #error unsupported architecture   
        #endif   
   #elif defined(__GNUC__) || defined(__clang__)   
        return __sync_fetch_and_add( &v, a );   
   #else   
            #error unsupported architecture   
   #endif   
   }   
      
   inline   
   bool rtmFetchAdd( size_t volatile &v, size_t a )   
   {   
        if( _xbegin() == _XBEGIN_STARTED )   
        {   
            v += a;   
            _xend();   
            return true;   
        }   
        else   
            return false;   
   }   
      
   inline   
   size_t compareExchange( size_t volatile &v, size_t c, size_t x )   
   {   
   #if defined(_MSC_VER)   
        #if defined(_M_X64)   
        return (size_t)_InterlockedCompareExchange64( &(__int64 &)v,   
   (__int64)x, (__int64)c );   
        #elif defined(_M_IX86)   
        return (size_t)_InterlockedCompareExchange( &(long &)v, (long)x,   
   (long)c );   
        #else   
            #error unsupported architecture   
        #endif   
   #elif defined(__GNUC__) || defined(__clang__)   
        return __sync_val_compare_and_swap( &v, c, x );   
   #else   
            #error unsupported architecture   
   #endif   
   }   
      
   int main( int argc, char **argv )   
   {   
        if( argc < 2 )   
            return -1;   
        double nsPerClockCycle = 1.0 / (atof( argv[1] ) * 1.0e9);   
      
        auto thrXadd = []( uint8_t volatile &run, size_t adds, size_t   
   volatile &atm, atomic &misses )   
        {   
            while( !run );   
            for( size_t i = adds; i; --i )   
                fetchAdd( atm, 1 );   
        };   
        auto thrXchg = []( uint8_t volatile &run, size_t adds, size_t   
   volatile &atm, atomic &misses )   
        {   
            while( !run );   
            size_t missed = 0;   
            for( size_t i = adds, cmp = atm; i; --i )   
            {   
                for( size_t res; ; )   
                    if( (res = compareExchange( atm, cmp, cmp + 1 )) == cmp )   
                    {   
                        cmp = cmp + 1;   
                        break;   
                    }   
                    else   
                        cmp = res,   
                        ++missed;   
            }   
            misses.fetch_add( missed );   
        };   
        auto rtmAdd = []( uint8_t volatile &run, size_t adds, size_t   
   volatile &atm, atomic &misses )   
        {   
            while( !run );   
            size_t missed = 0;   
            for( size_t i = adds; i; --i )   
                while( !rtmFetchAdd( atm, 1 ) )   
                    ++missed;   
            misses.fetch_add( missed );   
        };   
        using threadfunc = void (*)( uint8_t volatile &, size_t, size_t   
   volatile &, atomic & );   
        array   atf;   
        array threadDescr;   
        size_t                 nTests;   
        size_t const           ADDS = 10'000'000;   
        unsigned               nProcessors = thread::hardware_concurrency();   
      
        atf[0]         = thrXadd;   
        atf[1]         = thrXchg;   
        atf[2]         = rtmAdd;   
        threadDescr[0] = "xadd-thread";   
        threadDescr[1] = "cmpxchge-thread";   
        threadDescr[2] = "rtm-thread";   
        nTests         = hasTSX() ? atf.size() : atf.size() - 1;   
      
        for( size_t m = 0; m != nTests; ++m )   
        {   
            cout << threadDescr[m] << ":" << endl;   
            for( unsigned nThreads = 1; nThreads <= nProcessors; ++nThreads )   
            {   
                atomic misses( 0 );   
                uint8_t        run = false;   
                size_t         atm;   
                vector threads;   
                for( unsigned i = 0; i != nThreads; ++i )   
                {   
                    threads.emplace_back( atf[m], ref( run ), ADDS, ref(   
   atm ), ref( misses ) );   
   #if defined(_MSC_VER)   
                    SetThreadAffinityMask( threads[i].native_handle(),   
   (DWORD_PTR)1 << i );   
   #elif defined(__unix__)   
                    cpu_set_t cpuset;   
                    CPU_ZERO(&cpuset);   
                    CPU_SET(i, &cpuset);   
                    pthread_setaffinity_np( threads[i].native_handle(),   
   sizeof cpuset, &cpuset );   
   #endif   
                }   
                time_point start =   
   high_resolution_clock::now();   
                run = true;   
                for( unsigned i = 0; i != nThreads; ++i )   
                    threads[i].join();   
                uint64_t ns = (uint64_t)duration_cast(   
   high_resolution_clock::now() - start ).count();;   
      
                double nsPerAdd = (double)ns / nThreads / ADDS / 1.0e9;   
                cout << "threads: " << nThreads << " cycles: " << nsPerAdd   
   / nsPerClockCycle << " misses-ratio: " << (int)(100.0 * (size_t)misses /   
   nThreads / ADDS) << "%" << endl;   
            }   
            cout << endl;   
        }   
   }   
      
   bool hasTSX()   
   {   
   #if defined(_MSC_VER)   
        int regs[4];   
        __cpuidex( regs, 7, 0 );   
        return regs[1] & (1 << 11);   
   #else   
        return true;   
   #endif   
   }   
      
   So can anyone here compile this with MSVC++ or gcc / clang on a Skylake   
   or newer CPU with TSX and give me the output? With gcc / clang you need   
   the compiler-option "-mrtm" to enable RTM.   
   When running the program you need to give the base clock of the CPU (in
   GHz) as its first argument. The program prints an estimate of the clock
   cycles spent on each successful increment; it is only an estimate because
   the real clock may vary due to boosting.
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)
[ << oldest | < older | list | newer > | newest >> ]