From: already5chosen@yahoo.com   
      
   On Tue, 16 Dec 2025 17:51:28 -0000 (UTC)   
   Thomas Koenig wrote:   
      
   > Michael S schrieb:   
   >   
   > > Today I tested speed of gcc implementation of Decimal128   
   > > (BID-encoded, of course) on Intel Core i7-14700.   
   > > Average time in nsec:   
   > > op Add Sub Mul Div   
   > > P-Core 33 33 86 76   
   > > E-Core 46 48 121 108   
   > >   
   > > Counter-intuitively, division is faster than multiplication.
   > > And both appear much slower than necessary.   
   >   
   > Interesting. Could you provide the benchmark used?   
      
   // tb.cpp   
   #include <stdint.h>
   #include <stdio.h>
   #include <time.h>
   #include <algorithm>
   #include <random>
   #include <vector>
      
   extern "C" {   
   void uut(void*, const void*, const void*);   
   };   
      
   static inline   
   uint64_t umulh(uint64_t a,uint64_t b) {   
    return uint64_t(((unsigned __int128)a * b)>>64);   
   }   
      
   int main(int , char** )   
   {   
    const int N_PAIRS = 1000000;   
    const int N_ITER = 17;   
    typedef unsigned __int128 u128;   
    std::vector src(N_PAIRS*2);   
    std::mt19937_64 prng(1);   
    const unsigned EXP_BIAS = 6143;   
    const unsigned EXP_SHIFT = 113;   
    for (int i = 0; i < N_PAIRS*2; ++i) {   
    // generate pseudo-random number in range [1e33:1e34-1]   
    const uint64_t RNG_LO = (long long)1e17;   
    const uint64_t RNG_HI = (long long)9e16;   
    const uint64_t BASE_HI = (long long)1e16;   
    uint64_t lo = umulh(prng(), RNG_LO); // [0:1e17-1]   
    uint64_t hi = umulh(prng(), RNG_HI) + BASE_HI; // [1e16:1e17-1]   
    u128 val = (u128)hi*RNG_LO + lo;   
    unsigned exp = EXP_BIAS + umulh(prng(), 50) - 25;   
    const u128 exp_val = (u128)exp << EXP_SHIFT;   
    src[i] = val | exp_val;   
    }   
      
    std::vector dt(N_ITER);   
    for (int it = 0; it < N_ITER; ++it) {   
    struct timespec t0;   
    clock_gettime(CLOCK_MONOTONIC, &t0);   
    u128 dummy1 = 0;   
    const u128* pSrc = src.data();   
    for (int i = 0; i < N_PAIRS; ++i) {   
    u128 rat;   
    uut(&rat, &pSrc[i*2+0], &pSrc[i*2+1]);   
    dummy1 ^= rat;   
    }   
    if (dummy1 == 42)   
    printf("Blue Moon\n");   
    struct timespec t1;   
    clock_gettime(CLOCK_MONOTONIC, &t1);   
    dt[it] = (t1.tv_sec - t0.tv_sec)*(long long)(1e9) + (long   
    long)t1.tv_nsec - (long long)t0.tv_nsec; }   
    // find median   
    std::nth_element(&dt[0], &dt[N_ITER/2], &dt[N_ITER]);   
    long long dt_med = dt[N_ITER/2];   
    printf("%.1f nsec\n", (double)dt_med / N_PAIRS);   
      
    return 0;   
   }   
   // end tb.cpp   
      
      
   // gcc_dec128add.c   
   #include    
      
   void uut(void* pRes, const void* pA, const void* pB)   
   {   
    _Decimal128 a, b, res;   
    memcpy(&a, pA, sizeof(a));   
    memcpy(&b, pB, sizeof(b));   
    res = a + b;   
    memcpy(pRes, &res, sizeof(res));   
   }   
   // end gcc_dec128add.c   
      
   // gcc_dec128sub.c   
   #include    
      
   void uut(void* pRes, const void* pA, const void* pB)   
   {   
    _Decimal128 a, b, res;   
    memcpy(&a, pA, sizeof(a));   
    memcpy(&b, pB, sizeof(b));   
    res = a - b;   
    memcpy(pRes, &res, sizeof(res));   
   }   
   // end gcc_dec128sub.c   
      
   // gcc_dec128mul.c   
   #include    
      
   void uut(void* pRes, const void* pA, const void* pB)   
   {   
    _Decimal128 a, b, res;   
    memcpy(&a, pA, sizeof(a));   
    memcpy(&b, pB, sizeof(b));   
    res = a * b;   
    memcpy(pRes, &res, sizeof(res));   
   }   
   // end gcc_dec128mul.c   
      
   // gcc_dec128div.c   
   #include    
      
   void uut(void* pRat, const void* pNum, const void* pDen)   
   {   
    _Decimal128 den, num, rat;   
    memcpy(&num, pNum, sizeof(num));   
    memcpy(&den, pDen, sizeof(den));   
    rat = num / den;   
    memcpy(pRat, &rat, sizeof(rat));   
   }   
   // end gcc_dec128div.c   
      
      
   Build script   
   COPT="-O2 -Wall -march=haswell -mtune=skylake"   
   mkdir -p obj   
   mkdir -p out   
   g++ -c $COPT tb.cpp -o obj/tb.o   
   gcc -c $COPT gcc_dec128add.c -o obj/gcc_dec128add.o   
   gcc -c $COPT gcc_dec128sub.c -o obj/gcc_dec128sub.o   
   gcc -c $COPT gcc_dec128mul.c -o obj/gcc_dec128mul.o   
   gcc -c $COPT gcc_dec128div.c -o obj/gcc_dec128div.o   
   g++ -s obj/tb.o obj/gcc_dec128add.o -o out/tst_add.exe   
   g++ -s obj/tb.o obj/gcc_dec128sub.o -o out/tst_sub.exe   
   g++ -s obj/tb.o obj/gcc_dec128mul.o -o out/tst_mul.exe   
   g++ -s obj/tb.o obj/gcc_dec128div.o -o out/tst_div.exe   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|