home bbs files messages ]

Forums before death by AOL, social media and spammers... "We can't have nice things"

   comp.arch      Apparently more than just beeps & boops      131,241 messages   

[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]

   Message 129,447 of 131,241   
   Michael S to Terje Mathisen   
   Re: 3-way long addition (1/2)   
   20 Aug 25 14:16:55   
   
   From: already5chosen@yahoo.com   
      
   On Wed, 20 Aug 2025 10:50:39 +0200   
   Terje Mathisen  wrote:   
      
      
   >   
   > Very impressive Michael!   
   >   
   > I particularly like how you are interleaving ADOX and ADCX to gain   
   > two carry bits without having to save them off to an additional   
   > register.   
   >   
   > Terje   
   >   
      
   It is interesting as an exercise in ADX extension programming, but in   
   practice it is only 0-10% faster than the much simpler and smaller code
   presented in the other post, which uses no ISA extensions and so runs on
   every iAMD64 CPU since K8.
   I suspect that this result is quite representative of the gains that   
   can be achieved with ADX. Maybe, if there is a crypto requirement
   that execution time be independent of the inputs, then the gain would be
   somewhat bigger, but even there I would be very surprised to find a 1.5x
   gain.
   Overall, I think that time spent by Intel engineers on invention of ADX   
   could have been spent much better.   
      
      
   Going back to the task of 3-way addition, another approach that can   
   utilize the same idea of breaking data dependency is using SIMD.   
   In the case of the 4 cores that I tested, SIMD means AVX2.
   Here are the results of an AVX2 implementation that unrolls by two, i.e. 512
   output bits per iteration of the inner loop.
      
   Platform         RC     GM     SK     Z3   
   add3_avxq_u2     226.7  823.3  321.1  309.5   
      
   The speed is about equal to the more unrolled ADX variant on RC, faster on
   Z3, much faster on SK and much slower on GM. Unlike ADX, it runs on
   Intel Haswell and on a few pre-Zen AMD CPUs.
      
       .file   "add3_avxq_u2.s"   
       .text   
       .p2align 4   
       .globl  add3   
       .def    add3;   .scl    2;  .type   32; .endef   
       .seh_proc   add3   
   add3:   
       subq    $56, %rsp   
       .seh_stackalloc 56   
       vmovups %xmm6, 32(%rsp)   
       .seh_savexmm    %xmm6, 32   
       .seh_endprologue   
       # %rcx - dst   
       # %rdx - a   
       # %r8  - b   
       # %r9  - c   
       sub %rcx, %rdx   # %rdx - a-dst   
       sub %rcx, %r8    # %r8  - b-dst   
       sub %rcx, %r9    # %r9  - c-dst   
       vpcmpeqq    %ymm6, %ymm6, %ymm6   
     vpsllq    $63,   %ymm6, %ymm6        # ymm6[0:3] = msbit = 2**63   
       vpxor   %xmm5, %xmm5, %xmm5        # ymm5[0]   = carry = 0   
       mov $127, %eax   
       .loop:   
         vpxor    (%rdx,%rcx), %ymm6, %ymm0   
           # ymm0[0:3] = iA[0:3]    = a[0:3] - msbit   
         vpxor  32(%rdx,%rcx), %ymm6, %ymm1   
           # ymm1[0:3] = iA[4:7]    = a[4:7]     -  msbit   
         vpaddq     (%r8, %rcx), %ymm0, %ymm2   
           # ymm2[0:3] = iSum1[0:3] = iA[0:3]+b[0:3]   
         vpaddq 32(%r8, %rcx), %ymm1, %ymm3   
           # ymm3[0:3] = iSum1[4:7] = iA[4:7]    + b[4:7]   
         vpcmpgtq  %ymm2,      %ymm0, %ymm4   
           # ymm4[0:3] = c1[0:3] = iA[0:3] > iSum1[0:3]   
         vpaddq     (%r9, %rcx), %ymm2, %ymm0   
           # ymm0[0:3] = iSum2[0:3] = iSum1[0:3]+c[0:3]   
         vpcmpgtq  %ymm0,      %ymm2, %ymm2   
   	    # ymm2[0:3] = c2[0:3] = iSum1[0:3] > iSum2[0:3]   
         vpaddq  %ymm4,      %ymm2, %ymm2   
   	    # ymm2[0:3] = cSum0[0:3] = c1[0:3]+c2[0:3]   
         vpcmpgtq  %ymm3,      %ymm1, %ymm4   
   	    # ymm4[0:3] = c1[4:7]    = iA[4:7]    > iSum1[4:7]   
         vpaddq 32(%r9, %rcx), %ymm3, %ymm1   
   	    # ymm1[0:3] = iSum2[4:7] = iSum1[4:7] + c[4:7]   
         vpcmpgtq  %ymm1,      %ymm3, %ymm3   
   	    # ymm3[0:3] = c2[4:7]    = iSum1[4:7] > iSum2[4:7]   
         vpaddq    %ymm4,      %ymm3, %ymm3   
   	   # ymm3[0:3] = cSum0[4:7] = c1[4:7]    + c2[4:7]   
         vpermq    $0x93,      %ymm2, %ymm4   
   	   # ymm4[0:3] = cSum0[3,0:2]   
         vpblendd  $3, %ymm5,  %ymm4, %ymm2   
   	   # ymm1[0:3] = cSum[0:3] = { carry[0], cSum0[0,1,2] }   
         vpermq    $0x93,      %ymm3, %ymm5   
   	   # ymm5[0:3] = cSum0[7,4:6] == carry   
         vpblendd  $3, %ymm4,  %ymm5, %ymm3   
   	   # ymm3[0:3] = cSum[4:7] = { cSum0[3], cSum0[4:6] }   
       .add_carry:   
         vpsubq    %ymm2,      %ymm0, %ymm2   
   	   # ymm2[0:3] = iSum3[0:3] = iSum2[0:3] - cSum[0:3]   
         vpsubq    %ymm3,      %ymm1, %ymm3   
   	   # ymm3[0:3] = iSum3[4:7] = iSum2[4:7] - cSum[4:7]   
         vpcmpgtq  %ymm2,      %ymm0, %ymm0   
   	   # ymm0[0:3] = c3[0:3] = iSum2[0:3] > iSum3[0:3]   
         vpcmpgtq  %ymm3,      %ymm1, %ymm1   
   	   # ymm3[0:3] = c3[4:7] = iSum2[4:7] > iSum3[4:7]   
         vpor      %ymm0,      %ymm1, %ymm4   
         vptest    %ymm4,      %ymm4   
       jne .prop_carry   
         vpxor     %ymm2,      %ymm6, %ymm0   
   	   # ymm0[0:3] = uSum3[0:3] = iSum3[0:3] + msbit   
         vpxor     %ymm3,      %ymm6, %ymm1   
   	   # ymm1[4:7] = uSum3[4:7] = iSum3[4:7] + msbit   
         vmovdqu     %ymm0,     (%rcx)   
         vmovdqu     %ymm1,   32(%rcx)   
         addq      $64,        %rcx   
         dec       %eax   
       jnz .loop   
      
     # last 7   
       vpxor    (%rdx,%rcx), %ymm6, %ymm0   
   	 # ymm0[0:3] = iA[0:3]    = a[0:3] - msbit   
       vpxor  24(%rdx,%rcx), %ymm6, %ymm1   
   	 # ymm1[0:3] = iA[3:6]    = a[3:6]     -  msbit   
       vpaddq     (%r8, %rcx), %ymm0, %ymm2   
   	 # ymm2[0:3] = iSum1[0:3] = iA[0:3]+b[0:3]   
       vpaddq 24(%r8, %rcx), %ymm1, %ymm3   
   	 # ymm3[0:3] = iSum1[3:6] = iA[3:6]    + b[3:6]   
       vpcmpgtq  %ymm2,      %ymm0, %ymm4   
   	 # ymm4[0:3] = c1[0:3] = iA[0:3] > iSum1[0:3]   
       vpaddq     (%r9, %rcx), %ymm2, %ymm0   
   	 # ymm0[0:3] = iSum2[0:3] = iSum1[0:3]+c[0:3]   
       vpcmpgtq  %ymm0,      %ymm2, %ymm2   
   	 # ymm2[0:3] = c2[0:3] = iSum1[0:3] > iSum2[0:3]   
       vpaddq    %ymm4,      %ymm2, %ymm2   
   	 # ymm2[0:3] = cSum0[0:3] = c1[0:3]+c2[0:3]   
       vpcmpgtq  %ymm3,      %ymm1, %ymm4   
   	 # ymm4[0:3] = c1[3:6]    = iA[3:6]    > iSum1[3:6]   
       vpaddq 24(%r9, %rcx), %ymm3, %ymm1   
   	 # ymm1[0:3] = iSum2[3:6] = iSum1[3:6] + c[3:6]   
       vpcmpgtq  %ymm1,      %ymm3, %ymm3   
   	 # ymm3[0:3] = c2[3:6]    = iSum1[3:6] > iSum2[3:6]   
       vpaddq      %ymm4,    %ymm3, %ymm3   
   	 # ymm3[0:3] = cSum[4:7]  = cSum0[3:6] = c1[3:6] + c2[367]   
       vpermq    $0x93,      %ymm2, %ymm4   
   	 # ymm2[0:3] = cSum0[3,0,1,2]   
       vpblendd  $3, %ymm5,  %ymm4, %ymm2   
   	 # ymm1[0:3] = cSum[0:3] = { carry[0], cSum0[0,1,2] }   
       vpermq    $0xF9,      %ymm1, %ymm1   
   	 # ymm3[0:3] = iSum2[4:6,6]   
       .add_carry2:   
       vpsubq    %ymm2,      %ymm0, %ymm2   
   	 # ymm2[0:3] = iSum3[0:3] = iSum2[0:3] - cSum[0:3]   
       vpsubq    %ymm3,      %ymm1, %ymm3   
   	 # ymm3[0:3] = iSum3[4:7] = iSum2[4:7] - cSum[4:7]   
       vpcmpgtq  %ymm2,      %ymm0, %ymm0   
   	 # ymm0[0:3] = c3[0:3] = iSum2[0:3] > iSum3[0:3]   
       vpcmpgtq  %ymm3,      %ymm1, %ymm1   
   	 # ymm1[0:3] = c3[4:7] = iSum2[4:7] > iSum3[4:7]   
       vptest    %ymm0,      %ymm0   
       jne .prop_carry2   
       vptest    %ymm1,      %ymm1   
       jne .prop_carry2   
       vpxor     %ymm2,      %ymm6, %ymm0   
   	 # ymm0[0:3] = uSum3[0:3] = iSum3[0:3] + msbit   
       vpxor     %ymm3,      %ymm6, %ymm1   
   	 # ymm1[4:7] = uSum3[4:7] = iSum3[4:7] + msbit   
       vmovdqu   %ymm0,     (%rcx)   
       vmovdqu   %xmm1,   32(%rcx)   
       vextractf128 $1, %ymm1, %xmm1   
       vmovq     %xmm1,   48(%rcx)   
      
       lea -(127*64)(%rcx), %rax   
       vzeroupper   
       vmovups 32(%rsp), %xmm6   
       addq    $56, %rsp   
       ret   
      
   .prop_carry:   
     # input:   
     # ymm0[0:3] = c3[0:3]   
      
   [continued in next message]   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   

[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]


(c) 1994,  bbs@darkrealms.ca