Forums before death by AOL, social media and spammers... "We can't have nice things"
|    comp.arch    |    Apparently more than just beeps & boops    |    131,241 messages    |
[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]
|    Message 129,442 of 131,241    |
|    Michael S to Anton Ertl    |
|    Re: 3-way long addition (was: VAX)    |
|    19 Aug 25 23:03:01    |
      From: already5chosen@yahoo.com

      On Tue, 19 Aug 2025 05:47:01 GMT
      anton@mips.complang.tuwien.ac.at (Anton Ertl) wrote:

      >
      > One other problem is that according to Agner Fog's instruction tables,
      > even the latest and greatest CPUs from AMD and Intel that he measured
      > (Zen5 and Tiger Lake) can only execute one adc/adcx/adox per cycle,

      I didn't measure on either TGL or Zen5, but both Raptor Cove and Zen3
      are certainly capable of more than 1 adcx|adox per cycle.

      Below are the execution times of very heavily unrolled adcx/adox code
      with the dependency broken by a trick similar to the one above:

      Platform           RC     GM     SK     Z3
      add3_my_adx_u17    244.5  471.1  482.4  407.0

      Considering that there are 2166 adcx/adox/adc instructions, we have the
      following number of adcx/adox/adc instructions per clock:
      Platform           RC     GM     SK     Z3
                         1.67   1.10   1.05   1.44

      For Gracemont and Skylake there exists a possibility of a small
      measurement mistake, but Raptor Cove appears to be capable of at least 2
      instructions of this type per clock, while Zen3 is capable of at least
      1.5 but more likely also 2.
      It looks to me that the bottleneck on both RC and Z3 is either the
      rename phase or, more likely, L1$ access. It seems that while
      Golden/Raptor Cove can occasionally issue 3 loads + 2 stores per clock,
      it cannot sustain more than 3 load-or-store accesses per clock.

Code:
        .file   "add3_my_adx_u17.s"

#-----------------------------------------------------------------------
# add3 -- three-operand multi-word addition: dst = a + b + c
#
# C-equivalent signature (inferred from register use -- confirm against
# the caller):
#     uint64_t *add3(uint64_t *dst, const uint64_t *a,
#                    const uint64_t *b, const uint64_t *c);
#
# Syntax: GNU as, AT&T operand order.
# Target: x86-64 with ADX (adcx/adox), Microsoft x64 ABI, COFF object
#         with SEH unwind directives.
#
# In:     %rcx = dst, %rdx = a, %r8 = b, %r9 = c
#         Every operand is 1023 64-bit words, least-significant word
#         first (60 blocks of 17 words followed by a 3-word tail).
# Out:    %rax = dst.  The carry out of the top word is discarded.
# Saved:  %rsi, %rbx (pushed in the prologue, popped before ret).
# Clobb:  %rax, %rcx, %rdx, %r8, %r9, %r10, %r11, flags.
#
# Method: inside a block the a+b carries travel through CF (adcx) and
#         the +c carries travel through OF (adox).  Both flags are
#         cleared at the top of every block, so a block does not wait
#         for the previous block's carry.  The block's first word is
#         held back in %rsi; the previous block's carry-out (%rcx) is
#         added to it after the block is done.  Only if that addition
#         wraps does the slow path ripple +1 through the 16 words of
#         the block that were already stored.
#-----------------------------------------------------------------------

        .equ    ADD3_BLOCKS, 60                 # full blocks per call
        .equ    ADD3_BLOCK_BYTES, 136           # 17 words * 8 bytes

        .text
        .p2align 4
        .globl  add3
        .def    add3;   .scl    2;      .type   32;     .endef
        .seh_proc       add3
add3:
        pushq   %rsi
        .seh_pushreg    %rsi
        pushq   %rbx
        .seh_pushreg    %rbx
        .seh_endprologue

        # Turn dst/b/c into offsets relative to a, so that a single
        # moving pointer (%r11) addresses all four arrays.
        sub     %rdx, %rcx
        mov     %rcx, %r10                      # r10 = dst - a
        sub     %rdx, %r8                       # r8  = b - a
        sub     %rdx, %r9                       # r9  = c - a
        mov     %rdx, %r11                      # r11 = a (current position)
        mov     $ADD3_BLOCKS, %edx              # edx = blocks remaining
        xor     %ecx, %ecx                      # rcx = carry into first block = 0

        .p2align 4
.Ladd3_loop:
        xor     %ebx, %ebx                      # CF <= 0, OF <= 0, EBX <= 0

        # Word 0 of the block: kept in %rsi, stored at .Ladd3_carry_done
        # once the incoming carry has been added.
        mov     (%r11), %rsi
        adcx    (%r11,%r8), %rsi
        adox    (%r11,%r9), %rsi

        # Words 1..15: load a, add b through CF, add c through OF, store.
        .irp    off, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120
        mov     \off(%r11), %rax
        adcx    \off(%r11,%r8), %rax
        adox    \off(%r11,%r9), %rax
        mov     %rax, \off(%r10,%r11)
        .endr

        # Advance to the next block; lea leaves CF and OF untouched.
        lea     ADD3_BLOCK_BYTES(%r11), %r11

        # Word 16, addressed relative to the already advanced pointer.
        mov     -8(%r11), %rax
        adcx    -8(%r11,%r8), %rax
        adox    -8(%r11,%r9), %rax
        mov     %rax, -8(%r10,%r11)

        # Collect the block's carry-out from both flag chains.
        mov     %ebx, %eax                      # EAX <= 0
        adcx    %ebx, %eax                      # EAX <= CF,      CF <= 0
        adox    %ebx, %eax                      # EAX <= CF + OF, OF <= 0

        # Fold the previous block's carry into word 0.
        add     %rcx, %rsi
        jc      .Ladd3_prop_carry               # rare: word 0 wrapped
.Ladd3_carry_done:
        mov     %rsi, (-ADD3_BLOCK_BYTES)(%r10,%r11)    # store word 0
        mov     %eax, %ecx                      # carry into the next block
        dec     %edx
        jnz     .Ladd3_loop

        # Tail: last 3 words, plain add/adc chains.
        mov     (%r11), %rax
        mov     8(%r11), %rdx
        mov     16(%r11), %rbx
        add     (%r11,%r8), %rax                # + b
        adc     8(%r11,%r8), %rdx
        adc     16(%r11,%r8), %rbx
        add     (%r11,%r9), %rax                # + c
        adc     8(%r11,%r9), %rdx
        adc     16(%r11,%r9), %rbx
        add     %rcx, %rax                      # + carry from the last block
        adc     $0, %rdx
        adc     $0, %rbx
        mov     %rax, (%r10,%r11)
        mov     %rdx, 8(%r10,%r11)
        mov     %rbx, 16(%r10,%r11)

        # r10 + r11 = dst + 60 * 136, so this recovers the original dst.
        lea     (-ADD3_BLOCKS*ADD3_BLOCK_BYTES)(%r10,%r11), %rax
        popq    %rbx
        popq    %rsi
        ret

        # Slow path: adding the incoming carry wrapped word 0, so add 1
        # to words 1..16 of the block just stored in dst and fold any
        # carry that ripples out of word 16 into the block's carry-out.
        # %rbx is reused as a pointer here; it is zeroed again at the
        # top of the loop and reloaded before use in the tail.
.Ladd3_prop_carry:
        lea     (8-ADD3_BLOCK_BYTES)(%r10,%r11), %rbx   # rbx = &dst word 1
        xor     %ecx, %ecx                      # rcx = 0, used as adc operand
        addq    $1, (%rbx)
        .irp    off, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120
        adc     %rcx, \off(%rbx)
        .endr
        adc     %ecx, %eax                      # EAX += CF out of word 16
        jmp     .Ladd3_carry_done
        .seh_endproc

--- SoupGate-Win32 v1.05
 * Origin: you cannot sedate... all the things you hate (1:229/2)    |
[   << oldest   |   < older   |   list   |   newer >   |   newest >>   ]
(c) 1994, bbs@darkrealms.ca