... darkrealms ...

Forums before death by AOL, social media and spammers... "We can't have nice things"
comp.lang.asm.x86
Ahh, the lost art of x86 assembly
4,675 messages
[ << oldest | < older | list | newer > | newest >> ]
Message 3,266 of 4,675
Benjamin David Lunt to All
yuy2 to rgb
05 Feb 18 16:06:08
   From: zfysz@nospicedham.fysnet.net   
      
   Hi guys,   
      
   Here is something for you.  If you are so inclined.   
      
   I have been working on my USB Camera code and have a routine   
   to convert the stream of data from yuy2 to RGB.  Here is my   
   (generic) C routine:   
      
   void yuy2_to_rgb565(void *targ, void *src, int cnt) {   
     bit8u *s = (bit8u *) src;   
     bit16u *t = (bit16u *) targ;   
      
     while (cnt > 0) {   
       int y0 = *s++;   
       int u0 = *s++;   
       int y1 = *s++;   
       int v0 = *s++;   
       cnt -= 4;   
      
       int c = y0 - 16;   // luma   
       int d = u0 - 128;  // cr   
       int e = v0 - 128;  // cb   
      
       *t++ =   
     ((298 * c + 409 * e + 128) & 0xF800)       |    // R   
    (((298 * c - 100 * d - 208 * e + 128) & 0xFC00) >> 5) | //G   
    (((298 * c + 516 * d + 128) & 0xF800) >> 11);    // B   
      
       c = y1 - 16;   
      
       *t++ =   
      ((298 * c + 409 * e + 128) & 0xF800)       |   // R   
     (((298 * c - 100 * d - 208 * e + 128) & 0xFC00) >> 5) | // G   
     (((298 * c + 516 * d + 128) & 0xF800) >> 11);   // B   
     }   
   }   
      
   (I tried to make it narrow enough not to wrap in the post, but   
    please watch for wrap)   
      
   The compiler creates fairly quick assembly code out of it:   
      
   ; Compiler-generated 32-bit x86 code for yuy2_to_rgb565.
   ; Lines reading "..." are instructions left out of the listing,
   ; so stack-slot roles set up around them are only presumed.
   0084DB90  51                push ecx   
   0084DB91  8B442408          mov eax,[esp+0x8]   
   0084DB95  53                push ebx   
   0084DB96  56                push esi   
   ; esi = pointer the loop reads the source bytes through
   0084DB97  8B742414          mov esi,[esp+0x14]   
   ...   
   0084DB9D  89442418          mov [esp+0x18],eax   
   ...   
   0084DBA6  83C404            add esp,byte +0x4   
   ; xchg of ecx with itself leaves ecx unchanged
   0084DBA9  87C9              xchg ecx,ecx   
   ; Loop guard: if the value (presumably cnt) is <= 0, skip the
   ; loop; otherwise iterations = ((value - 1) >> 2) + 1.
   0084DBAB  8B442418          mov eax,[esp+0x18]   
   0084DBAF  85C0              test eax,eax   
   0084DBB1  0F8E04010000      jng dword 0x84dcbb   
   0084DBB7  48                dec eax   
   0084DBB8  C1E802            shr eax,byte 0x2   
   0084DBBB  40                inc eax   
   0084DBBC  55                push ebp   
   ; iteration count is stored here; after the push edi below the
   ; same slot is addressed as [esp+0x20]
   0084DBBD  8944241C          mov [esp+0x1c],eax   
   0084DBC1  57                push edi   
   ; ---- loop top (0x84dbc2): 4 source bytes -> 2 pixels ----
   ; eax = byte 0, ecx = byte 1, edx = byte 2.  esi is bumped
   ; after each load, so [esi+0x1] walks consecutive bytes.
   0084DBC2  0FB606            movzx eax,byte [esi]   
   0084DBC5  0FB64E01          movzx ecx,byte [esi+0x1]   
   0084DBC9  46                inc esi   
   0084DBCA  0FB65601          movzx edx,byte [esi+0x1]   
   0084DBCE  46                inc esi   
   ; eax = ecx = 298 * (byte0 - 16)   (0x12a = 298)
   ; ebx = byte1 - 128; byte 2 is parked in [esp+0x10]
   0084DBCF  83C0F0            add eax,byte -0x10   
   0084DBD2  8D5980            lea ebx,[ecx-0x80]   
   0084DBD5  8BC8              mov ecx,eax   
   0084DBD7  69C02A010000      imul eax,eax,dword 0x12a   
   0084DBDD  89542410          mov [esp+0x10],edx   
   0084DBE1  69C92A010000      imul ecx,ecx,dword 0x12a   
   ; byte 3: [esp+0x18] = 409 * (byte3 - 128)   (0x199 = 409)
   ;         edx        = 208 * (byte3 - 128)   (0xd0  = 208)
   0084DBE7  0FB65601          movzx edx,byte [esi+0x1]   
   0084DBEB  46                inc esi   
   0084DBEC  8D6A80            lea ebp,[edx-0x80]   
   0084DBEF  8BD5              mov edx,ebp   
   0084DBF1  69ED99010000      imul ebp,ebp,dword 0x199   
   0084DBF7  896C2418          mov [esp+0x18],ebp   
   0084DBFB  69D2D0000000      imul edx,edx,dword 0xd0   
   ; ebx = 516 * (byte1 - 128), edi = 100 * (byte1 - 128)
   0084DC01  8BFB              mov edi,ebx   
   0084DC03  69DB04020000      imul ebx,ebx,dword 0x204   
   0084DC09  6BFF64            imul edi,edi,byte +0x64   
   ; green and blue, interleaved:
   ;   ebp = ((298c - 100d - 208e + 128) >> 5) & 0x7e0
   ;   ecx = ((298c + 516d + 128) >> 11) & 0x1f
   0084DC0C  8BE9              mov ebp,ecx   
   0084DC0E  2BEF              sub ebp,edi   
   0084DC10  2BEA              sub ebp,edx   
   0084DC12  81C580000000      add ebp,0x80   
   0084DC18  C1FD05            sar ebp,byte 0x5   
   0084DC1B  8D8C0B80000000    lea ecx,[ebx+ecx+0x80]   
   0084DC22  81E5E0070000      and ebp,0x7e0   
   0084DC28  C1F90B            sar ecx,byte 0xb   
   0084DC2B  83E11F            and ecx,byte +0x1f   
   0084DC2E  0BE9              or ebp,ecx   
   ; red: (298c + 409e + 128) & 0xf800, OR-ed into ebp
   0084DC30  8B4C2418          mov ecx,[esp+0x18]   
   0084DC34  8D840880000000    lea eax,[eax+ecx+0x80]   
   0084DC3B  2500F80000        and eax,0xf800   
   0084DC40  0BE8              or ebp,eax   
   ; store the 16-bit pixel through the pointer kept in
   ; [esp+0x1c], then advance that pointer by 2
   0084DC42  8B44241C          mov eax,[esp+0x1c]   
   0084DC46  668928            mov [eax],bp   
   0084DC49  83C002            add eax,byte +0x2   
   0084DC4C  8944241C          mov [esp+0x1c],eax   
   ; ---- second pixel: reload byte 2, subtract 16, times 298 ----
   ; edi, edx, ebx and [esp+0x18] are reused from the first pixel
   0084DC50  8B442410          mov eax,[esp+0x10]   
   0084DC54  83C0F0            add eax,byte -0x10   
   0084DC57  8BC8              mov ecx,eax   
   0084DC59  69C02A010000      imul eax,eax,dword 0x12a   
   0084DC5F  69C92A010000      imul ecx,ecx,dword 0x12a   
   0084DC65  8BE9              mov ebp,ecx   
   0084DC67  2BEF              sub ebp,edi   
   0084DC69  2BEA              sub ebp,edx   
   0084DC6B  8B542418          mov edx,[esp+0x18]   
   0084DC6F  81C580000000      add ebp,0x80   
   0084DC75  C1FD05            sar ebp,byte 0x5   
   0084DC78  8D8C0B80000000    lea ecx,[ebx+ecx+0x80]   
   0084DC7F  8D841080000000    lea eax,[eax+edx+0x80]   
   0084DC86  C1F90B            sar ecx,byte 0xb   
   0084DC89  81E5E0070000      and ebp,0x7e0   
   0084DC8F  2500F80000        and eax,0xf800   
   0084DC94  83E11F            and ecx,byte +0x1f   
   0084DC97  0BE9              or ebp,ecx   
   0084DC99  0BE8              or ebp,eax   
   0084DC9B  8B44241C          mov eax,[esp+0x1c]   
   0084DC9F  668928            mov [eax],bp   
   0084DCA2  83C002            add eax,byte +0x2   
   0084DCA5  8944241C          mov [esp+0x1c],eax   
   ; step esi past the 4th byte, decrement the counter kept in
   ; [esp+0x20], and loop while it is non-zero
   0084DCA9  8B442420          mov eax,[esp+0x20]   
   0084DCAD  46                inc esi   
   0084DCAE  48                dec eax   
   0084DCAF  89442420          mov [esp+0x20],eax   
   0084DCB3  0F8509FFFFFF      jnz dword 0x84dbc2   
   ; epilogue: restore the registers pushed on entry
   0084DCB9  5F                pop edi   
   0084DCBA  5D                pop ebp   
   ...   
   0084DCC4  83C404            add esp,byte +0x4   
   0084DCC7  5E                pop esi   
   0084DCC8  5B                pop ebx   
   0084DCC9  59                pop ecx   
   0084DCCA  C3                ret   
      
   (The ... is where I called a timing call.)   
      
   Since you have to convert two pixels at a time, the two   
   sets of calculations can be somewhat combined, since they   
   do almost the exact same thing (as the compiler's optimizer   
   figured out).   
      
   On top of that, everything is 16-bit so I even wrote a   
   16-bit version that used less memory access and more   
   register access.   
      
   With all the tries I did, I couldn't beat the compiler.   
   Surprise. Surprise.   
      
   However, and this is where I am at a loss.  I don't know   
   or have not worked with SSE2 instructions, or whichever   
   instruction set will allow you to do multiple calculations   
   at a time.  I have only read comments stating that I (the   
   reader) should use SSE2 (or AVX or whatever it is) to speed   
   up the code (generic post of someone else's code not even   
   remotely close to this routine).   
      
   So, this is why I came here.  I know that some of you   
   are quite fluent in these matters.  How would you write   
   a routine, in Intel x86 assembly, that would beat the   
   compiler's code above?   
      
   As you probably know, the conversion needs to be extremely   
   fast...   
      
   The sky is the limit, as long as this sky is 32-bit, not   
   64-bit.  Any 32-bit instruction set is okay.   
      
   Ready?  Go.   
      
   Thanks,   
   Ben   
      
   --   
   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-   
   Forever Young Software   
   http://www.fysnet.net/index.htm   
   http://www.fysnet.net/osdesign_book_series.htm   
   To reply by email, please remove the zzzzzz's   
      
   Batteries not included, some Assembly required.   
      
   --- SoupGate-Win32 v1.05   
    * Origin: you cannot sedate... all the things you hate (1:229/2)
[ << oldest | < older | list | newer > | newest >> ]