... darkrealms ...

Forums before death by AOL, social media and spammers... "We can't have nice things"
comp.lang.asm.x86
Ahh, the lost art of x86 assembly
4,675 messages
[ << oldest | < older | list | newer > | newest >> ]
Message 2,743 of 4,675
Rod Pemberton to James Harris
Re: Optimize stricmp() algorithm (casele
27 Jun 17 17:48:41
   From: NeedNotReplyHere@nospicedham.xrsevnneqk.cem   
      
   On Tue, 27 Jun 2017 09:51:06 +0100   
   James Harris  wrote:   
      
   > It's interesting that for lower() DJGPP uses a lookup table.   
      
   No, that's not DJGPP.  The Terje/Ben used example used an array as a   
   lookup table.  Was that posted on the CLAX thread?  I don't think it   
   was.  It might be in the comp.lang.c thread.  Anyway, lower[] is a 256   
   char ASCII table that I filled in with tolower()'s values with a loop   
   in main().  It was used to complete the routine.  (Technically, the   
   "array" should be unsigned char, but I didn't do that.)   
      
   > > 	je	L10   
   > > L3:   
   > > 	sub	ecx, esi   
   > > 	pop	ebx   
   > > 	mov	eax, ecx   
   > > 	pop	esi   
   > > 	pop	ebp   
   > > 	ret   
   >   
   > That's good code but it occurred to me that because the offset is the   
   > same into each string, one offset could be incremented instead of two   
   > string pointers. Then, rather than the following (if ESI and EDI are   
   > the string pointers)   
   >  [...]   
   > if the offset is in EDX then the equivalent would be a bit shorter.   
   > That could end up being faster. And it saves a register.   
      
   That might shorten the code.  Usually though, using an offset instead   
   of a pointer requires an addition of the offset to the pointer.   
      
   Did you mean something like this?  I kept the temporaries (local   
   variables) to encourage the compiler to place them in registers.   
      
   int rp_jh_stricmp(void *p0, void *p1)   
   {   
     int a,b,i;   
     char *x,*y;   
      
     i=0;   
     a=!0;   
     x=p0;   
     y=p1;   
      
     while(1)   
     {   
       if(!a)   
         break;   
       a=x[i];   
       b=y[i];   
       i++;   
       if(a==b)   
         continue;   
       if(lower[a]==lower[b])   
         continue;   
       break;   
     }   
     return(a-b);   
   }   
      
   With -O2, 32-bit GCC (DJGPP for DOS) generates:   
      
   .globl _rp_jh_stricmp   
   _rp_jh_stricmp:   
   	push	ebp   
   	mov	edx, 1   
   	mov	ebp, esp   
   	push	edi   
   	push	esi   
   	push	ebx   
   	mov	edi, DWORD PTR [ebp+8]   
   	mov	esi, DWORD PTR [ebp+12]   
   	xor	ebx, ebx   
   L10:   
   	test	edx, edx   
   	je	L3   
   	movsx	edx, BYTE PTR [edi+ebx]   
   	movsx	ecx, BYTE PTR [esi+ebx]   
   	inc	ebx   
   	cmp	edx, ecx   
   	je	L10   
   	mov	al, BYTE PTR _lower[ecx]   
   	cmp	BYTE PTR _lower[edx], al   
   	je	L10   
   L3:   
   	sub	edx, ecx   
   	pop	ebx   
   	mov	eax, edx   
   	pop	esi   
   	pop	edi   
   	pop	ebp   
   	ret   
      
      
   With -O2, 64-bit GCC (for Linux) generates:   
      
   	.globl	rp_jh_stricmp   
   	.type	rp_jh_stricmp, @function   
   rp_jh_stricmp:   
   .LFB15:   
   	.cfi_startproc   
   	push	rbx   
   	.cfi_def_cfa_offset 16   
   	.cfi_offset 3, -16   
   	xor	edx, edx   
   .L2:   
   	movsx	eax, BYTE PTR [rdi+rdx]   
   	movsx	ecx, BYTE PTR [rsi+rdx]   
   	cmp	eax, ecx   
   	je	.L4   
   	movsx	r8, ecx   
   	movsx	r9, eax   
   	movzx	ebx, BYTE PTR lower[r8]   
   	cmp	BYTE PTR lower[r9], bl   
   	je	.L4   
   .L3:   
   	sub	eax, ecx   
   	pop	rbx   
   	.cfi_remember_state   
   	.cfi_def_cfa_offset 8   
   	ret   
   	.p2align 4,,10   
   	.p2align 3   
   .L4:   
   	.cfi_restore_state   
   	add	rdx, 1   
   	test	eax, eax   
   	jne	.L2   
   	.p2align 4,,3   
   	jmp	.L3   
   	.cfi_endproc   
      
   For 32-bit, this cuts out one inc from the main loop, but places an   
   addition into both movs instructions.   
      
   For 64-bit, this cuts out two "add ,1"s from the main loop, places one   
   of them further below, but places an addition into both movs   
   instructions.   
      
   Is this faster?  I can't tell from comparing the code.   
      
   > I saw you (Rod) make a good point in another thread that if lower()   
   > is a function call then its overhead can be avoided in many cases by   
   > XOR of the two bytes to see if they /might/ match. If the XOR is 0   
   > then they match. If it is 0x20 then the might match. Otherwise, they   
   > cannot match and there's no need to lower-case either of them.   
      
   That is how I'd approach it for assembly.  It might turn out good for C   
   as well ...  I haven't tried that.  Maybe, this:   
      
   int rp_xor_stricmp(void *p0, void *p1)   
   {   
     int t;   
     char *x,*y;   
      
     x=p0;   
     y=p1;   
      
     while(1)   
     {   
       if(!((*x)&(*y)))   
         break;   
       t=(*x)^(*y);   
       x++;   
       y++;   
       if(!t)   
         continue;   
       if(t==0x20)   
         continue;   
       break;   
     }   
     x--;   
     y--;   
     return((*x)-(*y));   
   }   
      
      
   Rod Pemberton   
   --   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)
[ << oldest | < older | list | newer > | newest >> ]