/*
 * fmemcpy - copy n bytes from src to dest.
 *
 * dest: destination buffer, at least n bytes long.
 * src:  source buffer, at least n bytes long (only read).
 * n:    number of bytes to copy; values <= 0 copy nothing.
 *
 * The copy always proceeds forward from the lowest address.  Overlapping
 * buffers are not given any special treatment.
 *
 * On 32-bit x86 MSVC builds the copy is done with inline assembly: the
 * n % 64 leading bytes go through `rep movsb`, and the rest is moved in
 * 64-byte blocks through the MMX registers mm0-mm3.  Every other target
 * uses a plain byte loop.
 */
static void fmemcpy(void *dest, void *src, int n)
{
    /* A negative count would turn into a huge unsigned block count below. */
    if (n <= 0) {
        return;
    }

#if defined(_MSC_VER) && defined(_M_IX86)
    __asm {
        mov     edi, dest
        mov     esi, src

        // Copy the n % 64 odd bytes first so that what remains is a whole
        // number of 64-byte blocks.  rep movsb advances esi/edi itself and
        // relies on the direction flag being clear.
        mov     ecx, n
        and     ecx, 63
        rep movsb

        // ecx = number of 64-byte blocks; shr sets ZF when that is zero.
        mov     ecx, n
        shr     ecx, 6
        jz      mmx_copy_done

    mmx_copy_block:
        movq    mm0, [esi+0]
        movq    mm1, [esi+8]
        movq    [edi+0], mm0
        movq    [edi+8], mm1
        movq    mm2, [esi+16]
        movq    mm3, [esi+24]
        movq    [edi+16], mm2
        movq    [edi+24], mm3
        movq    mm0, [esi+32]
        movq    mm1, [esi+40]
        movq    [edi+32], mm0
        movq    [edi+40], mm1
        movq    mm2, [esi+48]
        movq    mm3, [esi+56]
        movq    [edi+48], mm2
        movq    [edi+56], mm3
        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     mmx_copy_block

    mmx_copy_done:
        // Leave MMX state so that later x87 floating-point code works.
        emms
    }
#else
    {
        /* Portable fallback: same forward, byte-exact copy without MMX. */
        unsigned char *out = (unsigned char *)dest;
        const unsigned char *in = (const unsigned char *)src;
        int i;

        for (i = 0; i < n; i++) {
            out[i] = in[i];
        }
    }
#endif
}

|