... darkrealms ...

Forums before death by AOL, social media and spammers... "We can't have nice things"
comp.arch
Apparently more than just beeps & boops
131,241 messages
[ << oldest | < older | list | newer > | newest >> ]
Message 129,441 of 131,241
Michael S to Anton Ertl
Re: 3-way long addition (was: VAX)
19 Aug 25 17:20:54
   From: already5chosen@yahoo.com   
      
   On Tue, 19 Aug 2025 07:09:51 GMT   
   anton@mips.complang.tuwien.ac.at (Anton Ertl) wrote:   
      
   > anton@mips.complang.tuwien.ac.at (Anton Ertl) writes:   
   > >The idea is interesting, but I don't understand the code.  The   
   > >following looks funny to me:   
   > >   
   > >1) You increment edx in increment_edx, then jump back to edx_ready   
   > >and   
   > >   immediately overwrite edx with ebx.  Then you do nothing with it,   
   > >   and then you clear edx in the next iteration.  So both the "inc   
   > >   edx" and the "mov edx, ebx" look like dead code to me that can be   
   > >   optimized away.   
   > >   
   > >2) There is a loop-carried dependency through ebx, and the number   
   > >   accumulating in ebx and the carry check makes no sense with that.   
   > >   
   > >Could it be that you wanted to do "mov ebx, edx" at edx_ready?  It   
   > >all makes more sense with that.  ebx then contains the carry from   
   > >the last cycle on entry.  The carry dependency chain starts at   
   > >clearing edx, then gets to additional carries, then is copied to   
   > >ebx, transferred into the next iteration, and is ended there by   
   > >overwriting ebx.  No dependency cycles (except the loop counter and   
   > >addresses, which can be dealt with by hardware or by unrolling), and   
   > >ebx contains the carry from the last iteration   
   > >   
   > >One other problem is that according to Agner Fog's instruction   
   > >tables, even the latest and greatest CPUs from AMD and Intel that he   
   > >measured (Zen5 and Tiger Lake) can only execute one adc/adcx/adox   
   > >per cycle, and adc has a latency of 1, so breaking the dependency   
   > >chain in a beneficial way should avoid the use of adc.  For our   
   > >three-summand add, it's not clear if adcx and adox can run in the   
   > >same cycle, but looking at your measurements, it is unlikely.   
   > >   
   > >So we would need something other than "adc edx, edx" to set the carry   
   > >register.  According to Agner Fog Zen3 can perform 2 cmovc per cycle   
   > >(and Zen5 can do 4/cycle), so that might be the way to do it.  E.g.,   
   > >have 1 in edi, and then do, for two-summand addition:   
   > >   
   > >   mov edi,1   
   > >   xor ebx,ebx   
   > >next:   
   > >   xor edx, edx   
   > >   mov rax,[rsi+rcx*8]   
   > >   add rax,[r8+rcx*8]   
   > >   cmovc edx, edi   
   > >   add rbx,rax   
   > >   jc  incremen_edx   
   > >   ; eliminate data dependency between loop iteration   
   > >   ; replace it by very predictable control dependency   
   > >edx_ready:   
   > >   mov edx, ebx   
   > >   mov [rdi+rcx*8],rax   
   > >   inc rcx   
   > >   cmp rcx,r10   
   > >   jb next   
   > >   ...   
   > >   ret   
   > >; that code is placed after return   
   > >; it is executed extremely rarely. For random inputs - approximately never
   > >incremen_edx:
   > >  inc edx   
   > >  jmp edx_ready   
   >   
   > Forgot to fix the "mov edx, ebx" here.  One other thing: I think that   
   > the "add rbx, rax" should be "add rax, rbx".  You want to add the   
   > carry to rax before storing the result.  So the version with just one   
   > iteration would be:   
      
   Too many back-and-forth mental switches between Intel and AT&T syntax.
   The real code that I measured was for the Windows platform, but in AT&T
   (GNU) syntax.
   Below is the full function with the loop unrolled by 3. The rest I may
   answer later; right now I don't have time.
      
   	.file	"add3_my_u3.s"   
   	.text   
   	.p2align 4   
   	.globl	add3   
   	.def	add3;	.scl	2;	.type   
   	32;	.endef   
   	.seh_proc	add3   
   add3:   
   	pushq	%r13   
   	.seh_pushreg	%r13   
   	pushq	%r12   
   	.seh_pushreg	%r12   
   	pushq	%rbp   
   	.seh_pushreg	%rbp   
   	pushq	%rdi   
   	.seh_pushreg	%rdi   
   	pushq	%rsi   
   	.seh_pushreg	%rsi   
   	pushq	%rbx   
   	.seh_pushreg	%rbx   
   	.seh_endprologue   
   	# %rcx - dst   
   	# %rdx - a   
   	# %r8  - b   
   	# %r9  - c   
   	sub %rcx, %rdx   
   	sub %rcx, %r8   
   	sub %rcx, %r9   
   	mov $341, %ebx   
   	xor %eax, %eax   
   	.loop:   
   	  xor %esi, %esi   
   	  mov   (%rcx,%rdx), %rdi   
   	  mov  8(%rcx,%rdx), %rbp   
   	  mov 16(%rcx,%rdx), %r10   
   	  add   (%rcx,%r8),  %rdi   
   	  adc  8(%rcx,%r8),  %rbp   
   	  adc 16(%rcx,%r8),  %r10   
   	  adc    %esi,       %esi   
   	  add   (%rcx,%r9),  %rdi   
   	  adc  8(%rcx,%r9),  %rbp   
   	  adc 16(%rcx,%r9),  %r10   
   	  adc    $0,         %esi   
   	  add    %rax,       %rdi # add carry from the previous   
   	iteration   
   	  jc .prop_carry   
   	.carry_done:   
   	  mov %esi,    %eax   
   	  mov %rdi,   (%rcx)   
   	  mov %rbp,  8(%rcx)   
   	  mov %r10, 16(%rcx)   
   	  lea 24(%rcx), %rcx   
   	  dec %ebx   
   	jnz .loop   
      
   	sub $(1023*8), %rcx   
   	mov %rcx, %rax   
      
   	popq	%rbx   
   	popq	%rsi   
   	popq	%rdi   
   	popq	%rbp   
   	popq	%r12   
   	popq	%r13   
   	ret   
      
   .prop_carry:   
     add $1, %rbp   
     adc $0, %r10   
     adc $0, %esi   
     jmp .carry_done   
      
   .seh_endproc   
      
      
      
      
      
      
      
   >   
   >    mov edi,1   
   >    xor ebx,ebx   
   > next:   
   >    xor edx, edx   
   >    mov rax,[rsi+rcx*8]   
   >    add rax,[r8+rcx*8]   
   >    cmovc edx, edi   
   >    add rax,rbx   
   >    jc  incremen_edx   
   >    ; eliminate data dependency between loop iteration   
   >    ; replace it by very predictable control dependency   
   > edx_ready:   
   >    mov ebx, edx   
   >    mov [rdi+rcx*8],rax   
   >    inc rcx   
   >    cmp rcx,r10   
   >    jb next   
   >    ...   
   >    ret   
   > ; that code is placed after return   
   > ; it is executed extremely rarely. For random inputs - approximately never
   > incremen_edx:
   >   inc edx   
   >   jmp edx_ready   
   >   
   > And the version with the two additional adc-using iterations would be   
   > (with an additional correction):   
   >   
   >    mov edi,1   
   >    xor ebx,ebx   
   > next:   
   >    mov rax,[rsi+rcx*8]   
   >    add [r8+rcx*8], rax   
   >    mov rax,[rsi+rcx*8+8]   
   >    adc [r8+rcx*8+8], rax   
   >    xor edx, edx   
   >    mov rax,[rsi+rcx*8+16]   
   >    adc rax,[r8+rcx*8+16]   
   >    cmovc edx, edi   
   >    add rax,rbx   
   >    jc  incremen_edx   
   >    ; eliminate data dependency between loop iteration   
   >    ; replace it by very predictable control dependency   
   > edx_ready:   
   >    mov ebx, edx   
   >    mov [rdi+rcx*8+16],rax   
   >    add rcx,3   
   >    cmp rcx,r10   
   >    jb next   
   >    ...   
   >    ret   
   > ; that code is placed after return   
   > ; it is executed extremely rarely. For random inputs - approximately never
   > incremen_edx:
   >   inc edx   
   >   jmp edx_ready   
   >   
   > - anton   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)
[ << oldest | < older | list | newer > | newest >> ]