From: peter.noreply@tin.it   
      
   On Fri, 16 Jan 2026 23:10:24 -0800   
   Paul Rubin wrote:   
      
   > Hans Bezemer writes:   
   > > 5. I added GCC extension support to 4tH in version 3.62.0. At the   
   > > time, it improved performance by about 25%. By accident I found out   
   > > that was no longer true. switch() based was faster. I didn't know   
   > > there had been changes in that regard to GCC.   
   >   
   > If you mean the goto *a feature, these days you might try using tail   
   > calls instead. GCC and LLVM both now support a musttail attribute that   
   > ensures this optimization, or signals a compile-time error if it can't.   
   >   
   > https://lwn.net/Articles/1033373/   
      
   I got interested in understanding how tail calls could improve compared
   to computed gotos. So I took the first five "opcodes" from the VM in
   NTF64/LXF64 to compare the generated asm.
   The VM was written from the beginning in X64 assembler (13 years ago).
   4 years ago I also implemented the VM in C to simplify porting to ARM64.
   At that time the asm version was about 10% faster than the generated
   C code; today the speed is about the same. C compilers have improved.
   It was implemented using computed gotos, using the following macro
   as the nesting code ending each "opcode":
      
   #define RELOAD() code=*ip++; goto *jmp_table[code]    
      
   for the tail call version it was changed to   
      
   RELOAD() opcode func=(opcode)tbl[*ip++]; __attribute__((musttail))   
    return func(ip, tbl, TOP, FTOP, sp, rp, fp, lp)   
      
   (line broken to be readable)
      
   The noop "opcode" has just the nesting and produces the following code
      
    movzx r9d, byte ptr [rcx]   
    inc rcx   
    jmp qword ptr [rax + 8*r9]   
      
   and for the tailcall version   
      
    movzx eax, byte ptr [r12]   
    inc r12   
    mov rax, qword ptr [r13 + 8*rax]   
    rex64 jmp rax   
      
   both compiled with   
   clang -S -Wall -O2 -masm=intel -o vm8test3.asm vm8tail.c   
      
   As I suspected the code is practically identical!   
      
   It also turns out that the musttail attribute is not necessary.
   It will generate a tail call anyway. The difference is that with
   musttail it will report an error if it cannot do the tail call.
      
   Much more important is the __attribute__((preserve_none)) before
   each function. This indicates that more registers will be used to pass
   parameters. As seen above I pass 8 parameters to each function and
   they need to be in registers to match the assembler-written code.
   This is done automatically in the goto version as everything is in
   one function there.
      
   In the end it is more how you like to write your VM, as one function   
   or one for each "opcode".   
      
   Unfortunately GCC does not recognize preserve_none and uses the stack   
   for some parameters   
      
   Here is my test code   
      
   // VM8 C variant using computed goto   
      
   #include    
      
   // Shorthand type names used by the VM code below. The names suggest 8- and
   // 64-bit widths; the actual widths are whatever the platform ABI gives
   // unsigned char and long long.
   #define UNS8 unsigned char              // one bytecode (element type of ip)
   #define INT64 long long int             // signed cell; not referenced in the code shown here
   #define UNS64 unsigned long long int    // unsigned cell (element type of sp, rp, lp)
      
   // RELOAD(): ends every opcode. Fetches the next bytecode through ip
   // (post-incrementing it) into code and jumps to the matching label address in
   // jmp_table (GNU "labels as values" computed goto). The expansion is two
   // statements without a trailing semicolon, so use it only as a complete
   // statement "RELOAD();", never as the unbraced body of an if/else.
   #define RELOAD() code=*ip++; goto *jmp_table[code]
      
   // VM8 (computed-goto variant): runs the bytecode stream that starts at ip.
   // All opcodes live in this one function and are reached through
   // "goto *jmp_table[code]" (GNU "labels as values" extension).
   //
   //   ip - pointer to the next bytecode; each bytecode indexes jmp_table
   //   sp - stack pointer; its first element is loaded into TOP on entry
   //   rp, fp, lp - not used by the five opcodes in this excerpt
   //
   // No opcode shown here leaves the dispatch chain, and code is not
   // range-checked against the five table entries.
   void VM8(UNS8 *ip, UNS64 *sp, UNS64 *rp, double *fp, UNS64 *lp ) {

   // Label addresses indexed by bytecode value: 0=noop, 1=swap, 2=rot,
   // 3=eqzero, 4=negate.
   const static void* jmp_table[] = {
    &&noop,
    &&swap,
    &&rot,
    &&eqzero,
    &&negate,
   };

    UNS8 code=*ip;       // initial value is overwritten by the first RELOAD()
    UNS64 tmp;           // scratch cell used by swap and rot
    UNS64 TOP=*sp++;     // keep the first stack element in a local; sp[0] is now the element after it
   // double FTOP=*fp++;

    RELOAD();            // dispatch the first bytecode


    noop: // do nothing
    RELOAD();
    swap: // swap: exchange TOP and sp[0]
    tmp=sp[0];
    sp[0]=TOP;
    TOP=tmp;
    RELOAD();
    rot: // rot: sp[1] moves into TOP, sp[0] into sp[1], old TOP into sp[0]
    tmp=TOP;
    TOP=sp[1];
    sp[1]=sp[0];
    sp[0]=tmp;
    RELOAD();
    eqzero: // 0=: TOP becomes all ones (unsigned -1) if it was 0, otherwise 0
    TOP=-(TOP==0);
    RELOAD();
    negate: // negate: unsigned (wrap-around) negation of TOP
    TOP=-TOP;
    RELOAD();


   } //vm8
      
      
   And here is the tail call version. Sorry for the long lines!   
      
   // VM8 C variant using tailcalls   
      
   #include    
      
   // Shorthand type names used by the VM code below. The names suggest 8- and
   // 64-bit widths; the actual widths are whatever the platform ABI gives
   // unsigned char and long long.
   #define UNS8 unsigned char              // one bytecode (element type of ip)
   #define INT64 long long int             // signed cell; not referenced in the code shown here
   #define UNS64 unsigned long long int    // unsigned cell (stack elements, handler table view)
      
      
   // opcode: pointer to an opcode handler. Every handler receives the complete
   // VM state as arguments, in this order: ip, tbl, TOP, FTOP, sp, rp, fp, lp.
   // preserve_none is a Clang calling-convention attribute; the post above
   // notes that GCC does not recognize it.
   typedef __attribute__((preserve_none)) void (*opcode) (UNS8*, UNS64*, UNS64,
   double, UNS64*, UNS64*, double*, UNS64*);
      
   #define RELOAD() opcode func=(opcode)tbl[*ip++]; __attribute__((musttail))   
   return func(ip, tbl, TOP, FTOP, sp, rp, fp, lp)    
      
   // FUNC: declaration prefix for every opcode handler - return type void plus
   // the preserve_none calling convention, matching the opcode pointer type.
   #define FUNC __attribute__((preserve_none)) void
      
   // noop: performs no stack operation; it only passes control, with the VM
   // state unchanged apart from the advanced ip, to the next bytecode's handler.
   FUNC noop(UNS8 *ip, UNS64 *tbl, UNS64 TOP, double FTOP, UNS64 *sp, UNS64 *rp,
   double *fp, UNS64 *lp ) // do nothing
    {
    RELOAD(); // fetch the next bytecode and hand all state to its handler
    }
      
   // swap: exchanges TOP with the stack element at sp[0], then dispatches the
   // next bytecode via RELOAD().
   FUNC swap(UNS8 *ip, UNS64 *tbl, UNS64 TOP, double FTOP, UNS64 *sp, UNS64 *rp,
   double *fp, UNS64 *lp ) // swap
    {UNS64 tmp;
    tmp=sp[0];   // remember the element at sp[0]
    sp[0]=TOP;   // old TOP goes to sp[0]
    TOP=tmp;     // former sp[0] becomes the new TOP
    RELOAD();}
      
   // rot: rotates the three values TOP, sp[0], sp[1]: sp[1] moves into TOP,
   // sp[0] into sp[1] and the old TOP into sp[0]; then dispatches the next
   // bytecode via RELOAD().
   FUNC rot(UNS8 *ip, UNS64 *tbl, UNS64 TOP, double FTOP, UNS64 *sp, UNS64 *rp,
   double *fp, UNS64 *lp ) // rot
    {UNS64 tmp=TOP;   // save the old TOP
    TOP=sp[1];        // element at sp[1] becomes the new TOP
    sp[1]=sp[0];      // element at sp[0] moves to sp[1]
    sp[0]=tmp;        // old TOP ends up at sp[0]
    RELOAD();}
      
   // eqzero ("0="): replaces TOP with all ones (unsigned -1) if TOP was 0 and
   // with 0 otherwise, then dispatches the next bytecode via RELOAD().
   FUNC eqzero(UNS8 *ip, UNS64 *tbl, UNS64 TOP, double FTOP, UNS64 *sp, UNS64
   *rp, double *fp, UNS64 *lp ) // 0=
    {TOP=-(TOP==0);   // (TOP==0) is 1 or 0; negating gives -1 or 0, converted to UNS64
    RELOAD();}
      
   // negate: replaces TOP with its unsigned (wrap-around) negation, then
   // dispatches the next bytecode via RELOAD().
   FUNC negate(UNS8 *ip, UNS64 *tbl, UNS64 TOP, double FTOP, UNS64 *sp, UNS64
   *rp, double *fp, UNS64 *lp ) // negate
    {TOP=-TOP;
    RELOAD();}
      
   // Handler table indexed by bytecode value: 0=noop, 1=swap, 2=rot, 3=eqzero,
   // 4=negate. VM8 passes its address to the handlers as tbl.
   opcode jmp_table[]={
    noop,
    swap,
    rot,
    eqzero,
    negate,
   };
      
      
      
   // VM8 (tail-call variant): entry point. Loads the first elements of the sp
   // and fp stacks into TOP and FTOP, fetches the first bytecode and calls its
   // handler; each handler then dispatches the following bytecode itself.
   //
   //   ip - pointer to the next bytecode; each bytecode indexes jmp_table
   //   sp - stack whose first element is loaded into TOP
   //   fp - stack of doubles whose first element is loaded into FTOP
   //   rp, lp - passed through to the handlers unchanged
   //
   // Bytecodes are not range-checked against the five entries of jmp_table.
   void VM8(UNS8 *ip, UNS64 *sp, UNS64 *rp, double *fp, UNS64 *lp ) {


    UNS64 *tbl=(UNS64*)&jmp_table;   // view the handler table as integers; entries are cast back to opcode on use
    UNS64 TOP=*sp++;                 // after this, sp[0] is the element following TOP
    double FTOP=*fp++;


    opcode func=(opcode)tbl[*ip++];  // handler for the first bytecode
    func( ip, tbl, TOP, FTOP, sp, rp, fp, lp);   // ordinary call (no musttail here)

   }
       
    //vm8   
      
   BR   
   Peter   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)   
|