... darkrealms ...

Forums before death by AOL, social media and spammers... "We can't have nice things"
comp.arch
Apparently more than just beeps & boops
131,241 messages
[ << oldest | < older | list | newer > | newest >> ]
Message 129,442 of 131,241
Michael S to Anton Ertl
Re: 3-way long addition (was: VAX)
19 Aug 25 23:03:01
   From: already5chosen@yahoo.com   
      
   On Tue, 19 Aug 2025 05:47:01 GMT   
   anton@mips.complang.tuwien.ac.at (Anton Ertl) wrote:   
      
   >   
   > One other problem is that according to Agner Fog's instruction tables,   
   > even the latest and greatest CPUs from AMD and Intel that he measured   
   > (Zen5 and Tiger Lake) can only execute one adc/adcx/adox per cycle,   
      
   I didn't measure on either TGL or Zen5, but both Raptor Cove and Zen3   
   are certainly capable of more than 1 adcx|adox per cycle.   
      
   Below are execution times of very heavily unrolled adcx/adox code with   
   the dependency broken by a trick similar to the one above:   
      
   Platform         RC     GM     SK     Z3   
   add3_my_adx_u17  244.5  471.1  482.4  407.0   
      
   Considering that there are 2166 adcx/adox/adc instructions, we have the   
   following number of adcx/adox/adc instructions per clock:   
   Platform         RC     GM     SK    Z3   
                   1.67   1.10   1.05   1.44   
      
   For Gracemont and Skylake there is a possibility of a small   
   measurement error, but Raptor Cove appears to be capable of at least 2   
   instructions of this type per clock, while Zen3 is capable of at least   
   1.5 but more likely also 2.   
   It looks to me that the bottleneck on both RC and Z3 is either the rename   
   phase or, more likely, L1$ access. It seems that while Golden/Raptor Cove   
   can occasionally issue 3 loads + 2 stores per clock, it cannot sustain   
   more than 3 load-or-store accesses per clock.   
      
      
   Code:   
      
     .file "add3_my_adx_u17.s"   
     .text   
     # add3: three-way multi-word addition, dst = a + b + c, built on the   
     # two independent carry chains of ADCX (uses CF) and ADOX (uses OF).   
     #   
     # Inputs (as listed in the register comments below):   
     #   %rcx = dst, %rdx = a, %r8 = b, %r9 = c   
     # Operand length is fixed by the code: the loop runs 60 times over   
     # 17 64-bit words, followed by a 3-word tail (1023 words in total).   
     # Returns dst in %rax. The carry out of the topmost word is discarded.   
     # %rsi and %rbx are saved and restored. %rax, %rcx, %rdx, %r8-%r11 and   
     # the flags are overwritten.   
     #   
     # Carry handling: within a 17-word block, CF carries the a+b chain and   
     # OF carries the +c chain. At the end of the block both flags are   
     # summed into %eax and handed to the next block through %rcx, and the   
     # flags are cleared again at the top of the loop, so one block's flag   
     # chains do not feed the next block's. The carry-in is added to word 0   
     # only after the block is finished. If that addition itself carries,   
     # .prop_carry ripples it through the words already stored to dst.   
     #   
     # NOTE(review): the .seh_* directives and the rcx/rdx/r8/r9 argument   
     # registers suggest the Microsoft x64 convention - confirm the target.   
     # NOTE(review): .loop, .carry_done and .prop_carry do not use the .L   
     # prefix, so they are presumably kept as ordinary symbols - confirm   
     # whether that is intended.   
     .p2align 4   
     .globl  add3   
     .def  add3; .scl  2;  .type 32; .endef   
     .seh_proc add3   
   add3:   
     pushq %rsi   
     .seh_pushreg  %rsi   
     pushq %rbx   
     .seh_pushreg  %rbx   
     .seh_endprologue   
     # %rcx - dst   
     # %rdx - a   
     # %r8  - b   
     # %r9  - c   
     # Turn dst, b and c into offsets from a, so that one running pointer   
     # (%r11) addresses all four arrays.   
     sub %rdx, %rcx  # rcx = dst - a   
     mov %rcx, %r10  # r10 = dst - a   
     sub %rdx, %r8   # r8  = b - a   
     sub %rdx, %r9   # r9  = c - a   
     mov %rdx, %r11  # r11 = a (running pointer)   
     mov $60,  %edx  # edx = number of 17-word blocks left   
     xor %ecx, %ecx  # rcx = carry into the first block = 0   
     .p2align 4   
     .loop:   
       xor   %ebx,       %ebx # CF <= 0, OF <= 0, EBX <= 0   
       # word 0: the sum stays in %rsi and is stored at .carry_done, after   
       # the previous block's carry has been added to it   
       mov  (%r11),      %rsi   
       adcx (%r11,%r8),  %rsi # CF chain: + b   
       adox (%r11,%r9),  %rsi # OF chain: + c   
      
       # words 1..15: same two-chain pattern, each result stored to dst   
       mov  8(%r11),     %rax   
       adcx 8(%r11,%r8), %rax   
       adox 8(%r11,%r9), %rax   
       mov    %rax, 8(%r10,%r11)   
      
       mov  16(%r11),     %rax   
       adcx 16(%r11,%r8), %rax   
       adox 16(%r11,%r9), %rax   
       mov   %rax, 16(%r10,%r11)   
      
       mov  24(%r11),     %rax   
       adcx 24(%r11,%r8), %rax   
       adox 24(%r11,%r9), %rax   
       mov   %rax, 24(%r10,%r11)   
      
       mov  32(%r11),     %rax   
       adcx 32(%r11,%r8), %rax   
       adox 32(%r11,%r9), %rax   
       mov   %rax, 32(%r10,%r11)   
      
       mov  40(%r11),     %rax   
       adcx 40(%r11,%r8), %rax   
       adox 40(%r11,%r9), %rax   
       mov   %rax, 40(%r10,%r11)   
      
       mov  48(%r11),     %rax   
       adcx 48(%r11,%r8), %rax   
       adox 48(%r11,%r9), %rax   
       mov   %rax, 48(%r10,%r11)   
      
       mov  56(%r11),     %rax   
       adcx 56(%r11,%r8), %rax   
       adox 56(%r11,%r9), %rax   
       mov   %rax, 56(%r10,%r11)   
      
       mov  64(%r11),     %rax   
       adcx 64(%r11,%r8), %rax   
       adox 64(%r11,%r9), %rax   
       mov   %rax, 64(%r10,%r11)   
      
       mov  72(%r11),     %rax   
       adcx 72(%r11,%r8), %rax   
       adox 72(%r11,%r9), %rax   
       mov   %rax, 72(%r10,%r11)   
      
       mov  80(%r11),     %rax   
       adcx 80(%r11,%r8), %rax   
       adox 80(%r11,%r9), %rax   
       mov   %rax, 80(%r10,%r11)   
      
       mov  88(%r11),     %rax   
       adcx 88(%r11,%r8), %rax   
       adox 88(%r11,%r9), %rax   
       mov   %rax, 88(%r10,%r11)   
      
       mov  96(%r11),     %rax   
       adcx 96(%r11,%r8), %rax   
       adox 96(%r11,%r9), %rax   
       mov   %rax, 96(%r10,%r11)   
      
       mov  104(%r11),     %rax   
       adcx 104(%r11,%r8), %rax   
       adox 104(%r11,%r9), %rax   
       mov      %rax,  104(%r10,%r11)   
      
       mov  112(%r11),     %rax   
       adcx 112(%r11,%r8), %rax   
       adox 112(%r11,%r9), %rax   
       mov   %rax, 112(%r10,%r11)   
      
       mov  120(%r11),     %rax   
       adcx 120(%r11,%r8), %rax   
       adox 120(%r11,%r9), %rax   
       mov   %rax, 120(%r10,%r11)   
      
       # advance the running pointer by 17 words (lea leaves CF and OF   
       # untouched, so both carry chains stay live)   
       lea 136(%r11),      %r11   
      
       # word 16, addressed relative to the already advanced pointer   
       mov  -8(%r11),     %rax   
       adcx -8(%r11,%r8), %rax   
       adox -8(%r11,%r9), %rax   
       mov   %rax, -8(%r10,%r11)   
      
       # collect both chain carries into EAX (EBX is still 0 here)   
       mov   %ebx,       %eax  # EAX <= 0   
       adcx  %ebx,       %eax  # EAX <= CF, CF <= 0   
       adox  %ebx,       %eax  # EAX <= EAX + OF, OF <= 0   
      
       add   %rcx,       %rsi  # word 0 += carry from the previous block   
       jc .prop_carry          # taken only when that addition wraps word 0   
     .carry_done:   
       mov   %rsi, -136(%r10,%r11)  # store word 0 of this block to dst   
       mov   %eax,       %ecx       # block carry-out is the next carry-in   
       dec   %edx   
     jnz .loop   
      
     # last 3   
     # words: plain add/adc chains, first + b, then + c, then + the carry   
     # left in %rcx by the loop. The carry out of each chain's top word is   
     # dropped.   
     mov   (%r11),      %rax   
     mov  8(%r11),      %rdx   
     mov 16(%r11),      %rbx   
     add   (%r11,%r8),  %rax   
     adc  8(%r11,%r8),  %rdx   
     adc 16(%r11,%r8),  %rbx   
     add   (%r11,%r9),  %rax   
     adc  8(%r11,%r9),  %rdx   
     adc 16(%r11,%r9),  %rbx   
     add    %rcx,       %rax   
     adc    $0,         %rdx   
     adc    $0,         %rbx   
     mov   %rax,   (%r10,%r11)   
     mov   %rdx,  8(%r10,%r11)   
     mov   %rbx, 16(%r10,%r11)   
      
     # return value: r10 + r11 is dst + 1020*8 here, so rax = dst   
     lea (-1020*8)(%r10,%r11), %rax   
     popq  %rbx   
     popq  %rsi   
     ret   
      
   # Out-of-line path: adding the carry-in to word 0 produced a carry.   
   # Ripple it through dst words 1..16 of the block just written, then fold   
   # any carry out of word 16 into the block carry-out held in EAX.   
   .prop_carry:   
     lea -128(%r10,%r11), %rbx  # rbx = address of dst word 1 of this block   
     xor %ecx, %ecx             # rcx = 0, the addend for the adc chain   
     addq $1,   (%rbx)   
     adc %rcx,  8(%rbx)   
     adc %rcx, 16(%rbx)   
     adc %rcx, 24(%rbx)   
     adc %rcx, 32(%rbx)   
     adc %rcx, 40(%rbx)   
     adc %rcx, 48(%rbx)   
     adc %rcx, 56(%rbx)   
     adc %rcx, 64(%rbx)   
     adc %rcx, 72(%rbx)   
     adc %rcx, 80(%rbx)   
     adc %rcx, 88(%rbx)   
     adc %rcx, 96(%rbx)   
     adc %rcx,104(%rbx)   
     adc %rcx,112(%rbx)   
     adc %rcx,120(%rbx)   
     adc %ecx,    %eax          # EAX += carry out of word 16   
     jmp .carry_done   
   .seh_endproc   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)
[ << oldest | < older | list | newer > | newest >> ]