From: already5chosen@yahoo.com   
      
   On Wed, 20 Aug 2025 10:50:39 +0200   
   Terje Mathisen wrote:   
      
      
   >   
   > Very impressive Michael!   
   >   
   > I particularly like how you are interleaving ADOX and ADCX to gain   
   > two carry bits without having to save them off to an additional   
   > register.   
   >   
   > Terje   
   >   
      
    It is interesting as an exercise in ADX extension programming, but in   
    practice it is only 0-10% faster than the much simpler and smaller code   
    presented in the other post, which uses no ISA extensions and so runs on   
    every AMD64 CPU since K8.   
   I suspect that this result is quite representative of the gains that   
    can be achieved with ADX. Maybe, if there is a crypto requirement   
    that execution time be independent of the inputs, the gain would be   
    somewhat bigger, but even there I would be very surprised to find a 1.5x   
    gain.   
   Overall, I think that time spent by Intel engineers on invention of ADX   
   could have been spent much better.   
      
      
   Going back to the task of 3-way addition, another approach that can   
   utilize the same idea of breaking data dependency is using SIMD.   
    For the four cores that I tested, SIMD means AVX2.   
    These are the results of an AVX2 implementation that unrolls by two,   
    i.e. 512 output bits per iteration of the inner loop.   
      
   Platform RC GM SK Z3   
   add3_avxq_u2 226.7 823.3 321.1 309.5   
      
    The speed is about equal to the more unrolled ADX variant on RC, faster on   
    Z3, much faster on SK, and much slower on GM. Unlike ADX, it runs on   
    Intel Haswell and on a few pre-Zen AMD CPUs.   
      
    .file "add3_avxq_u2.s"   
    .text   
    .p2align 4   
    .globl add3   
    .def add3; .scl 2; .type 32; .endef   
    .seh_proc add3   
   add3:   
    subq $56, %rsp   
    .seh_stackalloc 56   
    vmovups %xmm6, 32(%rsp)   
    .seh_savexmm %xmm6, 32   
    .seh_endprologue   
    # %rcx - dst   
    # %rdx - a   
    # %r8 - b   
    # %r9 - c   
    sub %rcx, %rdx # %rdx - a-dst   
    sub %rcx, %r8 # %r8 - b-dst   
    sub %rcx, %r9 # %r9 - c-dst   
    vpcmpeqq %ymm6, %ymm6, %ymm6   
    vpsllq $63, %ymm6, %ymm6 # ymm6[0:3] = msbit = 2**63   
    vpxor %xmm5, %xmm5, %xmm5 # ymm5[0] = carry = 0   
    mov $127, %eax   
    .loop:   
    vpxor (%rdx,%rcx), %ymm6, %ymm0   
    # ymm0[0:3] = iA[0:3] = a[0:3] - msbit   
    vpxor 32(%rdx,%rcx), %ymm6, %ymm1   
    # ymm1[0:3] = iA[4:7] = a[4:7] - msbit   
    vpaddq (%r8, %rcx), %ymm0, %ymm2   
    # ymm2[0:3] = iSum1[0:3] = iA[0:3]+b[0:3]   
    vpaddq 32(%r8, %rcx), %ymm1, %ymm3   
    # ymm3[0:3] = iSum1[4:7] = iA[4:7] + b[4:7]   
    vpcmpgtq %ymm2, %ymm0, %ymm4   
    # ymm4[0:3] = c1[0:3] = iA[0:3] > iSum1[0:3]   
    vpaddq (%r9, %rcx), %ymm2, %ymm0   
    # ymm0[0:3] = iSum2[0:3] = iSum1[0:3]+c[0:3]   
    vpcmpgtq %ymm0, %ymm2, %ymm2   
    # ymm2[0:3] = c2[0:3] = iSum1[0:3] > iSum2[0:3]   
    vpaddq %ymm4, %ymm2, %ymm2   
    # ymm2[0:3] = cSum0[0:3] = c1[0:3]+c2[0:3]   
    vpcmpgtq %ymm3, %ymm1, %ymm4   
    # ymm4[0:3] = c1[4:7] = iA[4:7] > iSum1[4:7]   
    vpaddq 32(%r9, %rcx), %ymm3, %ymm1   
    # ymm1[0:3] = iSum2[4:7] = iSum1[4:7] + c[4:7]   
    vpcmpgtq %ymm1, %ymm3, %ymm3   
    # ymm3[0:3] = c2[4:7] = iSum1[4:7] > iSum2[4:7]   
    vpaddq %ymm4, %ymm3, %ymm3   
    # ymm3[0:3] = cSum0[4:7] = c1[4:7] + c2[4:7]   
    vpermq $0x93, %ymm2, %ymm4   
    # ymm4[0:3] = cSum0[3,0:2]   
    vpblendd $3, %ymm5, %ymm4, %ymm2   
    # ymm1[0:3] = cSum[0:3] = { carry[0], cSum0[0,1,2] }   
    vpermq $0x93, %ymm3, %ymm5   
    # ymm5[0:3] = cSum0[7,4:6] == carry   
    vpblendd $3, %ymm4, %ymm5, %ymm3   
    # ymm3[0:3] = cSum[4:7] = { cSum0[3], cSum0[4:6] }   
    .add_carry:   
    vpsubq %ymm2, %ymm0, %ymm2   
    # ymm2[0:3] = iSum3[0:3] = iSum2[0:3] - cSum[0:3]   
    vpsubq %ymm3, %ymm1, %ymm3   
    # ymm3[0:3] = iSum3[4:7] = iSum2[4:7] - cSum[4:7]   
    vpcmpgtq %ymm2, %ymm0, %ymm0   
    # ymm0[0:3] = c3[0:3] = iSum2[0:3] > iSum3[0:3]   
    vpcmpgtq %ymm3, %ymm1, %ymm1   
    # ymm3[0:3] = c3[4:7] = iSum2[4:7] > iSum3[4:7]   
    vpor %ymm0, %ymm1, %ymm4   
    vptest %ymm4, %ymm4   
    jne .prop_carry   
    vpxor %ymm2, %ymm6, %ymm0   
    # ymm0[0:3] = uSum3[0:3] = iSum3[0:3] + msbit   
    vpxor %ymm3, %ymm6, %ymm1   
    # ymm1[4:7] = uSum3[4:7] = iSum3[4:7] + msbit   
    vmovdqu %ymm0, (%rcx)   
    vmovdqu %ymm1, 32(%rcx)   
    addq $64, %rcx   
    dec %eax   
    jnz .loop   
      
    # last 7   
    vpxor (%rdx,%rcx), %ymm6, %ymm0   
    # ymm0[0:3] = iA[0:3] = a[0:3] - msbit   
    vpxor 24(%rdx,%rcx), %ymm6, %ymm1   
    # ymm1[0:3] = iA[3:6] = a[3:6] - msbit   
    vpaddq (%r8, %rcx), %ymm0, %ymm2   
    # ymm2[0:3] = iSum1[0:3] = iA[0:3]+b[0:3]   
    vpaddq 24(%r8, %rcx), %ymm1, %ymm3   
    # ymm3[0:3] = iSum1[3:6] = iA[3:6] + b[3:6]   
    vpcmpgtq %ymm2, %ymm0, %ymm4   
    # ymm4[0:3] = c1[0:3] = iA[0:3] > iSum1[0:3]   
    vpaddq (%r9, %rcx), %ymm2, %ymm0   
    # ymm0[0:3] = iSum2[0:3] = iSum1[0:3]+c[0:3]   
    vpcmpgtq %ymm0, %ymm2, %ymm2   
    # ymm2[0:3] = c2[0:3] = iSum1[0:3] > iSum2[0:3]   
    vpaddq %ymm4, %ymm2, %ymm2   
    # ymm2[0:3] = cSum0[0:3] = c1[0:3]+c2[0:3]   
    vpcmpgtq %ymm3, %ymm1, %ymm4   
    # ymm4[0:3] = c1[3:6] = iA[3:6] > iSum1[3:6]   
    vpaddq 24(%r9, %rcx), %ymm3, %ymm1   
    # ymm1[0:3] = iSum2[3:6] = iSum1[3:6] + c[3:6]   
    vpcmpgtq %ymm1, %ymm3, %ymm3   
    # ymm3[0:3] = c2[3:6] = iSum1[3:6] > iSum2[3:6]   
    vpaddq %ymm4, %ymm3, %ymm3   
    # ymm3[0:3] = cSum[4:7] = cSum0[3:6] = c1[3:6] + c2[367]   
    vpermq $0x93, %ymm2, %ymm4   
    # ymm2[0:3] = cSum0[3,0,1,2]   
    vpblendd $3, %ymm5, %ymm4, %ymm2   
    # ymm1[0:3] = cSum[0:3] = { carry[0], cSum0[0,1,2] }   
    vpermq $0xF9, %ymm1, %ymm1   
    # ymm3[0:3] = iSum2[4:6,6]   
    .add_carry2:   
    vpsubq %ymm2, %ymm0, %ymm2   
    # ymm2[0:3] = iSum3[0:3] = iSum2[0:3] - cSum[0:3]   
    vpsubq %ymm3, %ymm1, %ymm3   
    # ymm3[0:3] = iSum3[4:7] = iSum2[4:7] - cSum[4:7]   
    vpcmpgtq %ymm2, %ymm0, %ymm0   
    # ymm0[0:3] = c3[0:3] = iSum2[0:3] > iSum3[0:3]   
    vpcmpgtq %ymm3, %ymm1, %ymm1   
    # ymm1[0:3] = c3[4:7] = iSum2[4:7] > iSum3[4:7]   
    vptest %ymm0, %ymm0   
    jne .prop_carry2   
    vptest %ymm1, %ymm1   
    jne .prop_carry2   
    vpxor %ymm2, %ymm6, %ymm0   
    # ymm0[0:3] = uSum3[0:3] = iSum3[0:3] + msbit   
    vpxor %ymm3, %ymm6, %ymm1   
    # ymm1[4:7] = uSum3[4:7] = iSum3[4:7] + msbit   
    vmovdqu %ymm0, (%rcx)   
    vmovdqu %xmm1, 32(%rcx)   
    vextractf128 $1, %ymm1, %xmm1   
    vmovq %xmm1, 48(%rcx)   
      
    lea -(127*64)(%rcx), %rax   
    vzeroupper   
    vmovups 32(%rsp), %xmm6   
    addq $56, %rsp   
    ret   
      
   .prop_carry:   
    # input:   
    # ymm0[0:3] = c3[0:3]   
      
   [continued in next message]   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|