SSE2 can copy data considerably faster when it operates on aligned memory. By copying the first and last bytes of an unaligned destination with the conventional unaligned instructions, and copying everything in between as aligned, it is possible to get this performance improvement on large unaligned memory blocks as well.
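In plain C with SSE2 intrinsics, the idea looks roughly like the sketch below. This is only a simplified illustration, assuming blocks of at least 32 bytes and working in 16-byte steps; the assembly version further down works in 128-byte strides and adds prefetching and non-temporal stores on top of this. The function name and the cutoff value are placeholders for this example.

#include <emmintrin.h>
#include <string.h>

/* Simplified sketch of the head/tail-unaligned, middle-aligned copy idea. */
static void *sse2_copy_sketch(void *dst, const void *src, size_t n)
{
    unsigned char *d = (unsigned char *)dst;
    const unsigned char *s = (const unsigned char *)src;
    size_t head, left;

    if (n < 32) /* too small to bother aligning, example cutoff */
    {
        memcpy(d, s, n);
        return dst;
    }

    /* 1. Copy the first 16 bytes with an unaligned load/store. */
    _mm_storeu_si128((__m128i *)d, _mm_loadu_si128((const __m128i *)s));

    /* 2. Advance to the next 16-byte boundary of the destination. */
    head = 16 - ((size_t)d & 15);
    d += head;
    s += head;
    left = n - head;

    /* 3. Copy the middle with aligned stores (loads stay unaligned). */
    while (left >= 16)
    {
        _mm_store_si128((__m128i *)d, _mm_loadu_si128((const __m128i *)s));
        d += 16;
        s += 16;
        left -= 16;
    }

    /* 4. Copy the last 16 bytes unaligned; this may rewrite a few bytes
          that were already copied, which is harmless. */
    _mm_storeu_si128((__m128i *)((unsigned char *)dst + n - 16),
                     _mm_loadu_si128((const __m128i *)((const unsigned char *)src + n - 16)));
    return dst;
}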
In this graph, the green lines are the conventional memcpy that ships with Microsoft Visual Studio 2008, the red lines are the SSE memcpy function available in Nevrax NeL, and the blue lines are the custom SSE2 function. The bright colored lines show the performance on aligned memory blocks, while the dark colored lines were measured on memory blocks with various misalignments. The horizontal axis varies the size of the copied block; the vertical axis shows the copy speed in MB/s.
As you can see, NeL’s SSE memcpy performs very well on aligned memory, but gives horrible performance on unaligned memory, as it does not take the alignment of the memory blocks into account. The built-in memcpy is the fastest of all at copying blocks below 128 bytes, but it also reaches its speed limit there. The SSE2 memcpy needs larger sizes to reach its maximum performance, but it peaks above NeL’s aligned SSE memcpy even for unaligned memory blocks.
The code is available below; ask before using it.
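If you want both behaviours at once, a thin wrapper along the lines of the sketch below could hand small copies to the compiler's memcpy and larger blocks to the SSE2 routine listed underneath. The wrapper name and the 128-byte threshold are only examples taken from the graph above; the right cutoff depends on your hardware.

#include <string.h>

void *memcpy_kaetemi_sse2(void *dst, void *src, int nBytes); /* defined below */

/* Example dispatcher: the built-in memcpy wins below roughly 128 bytes,
   the SSE2 routine wins on larger blocks. The threshold is illustrative. */
static void *memcpy_dispatch(void *dst, void *src, int nBytes)
{
    if (nBytes < 128)
        return memcpy(dst, src, nBytes);
    return memcpy_kaetemi_sse2(dst, src, nBytes);
}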
void *memcpy_kaetemi_sse2(void *dst, void *src, int nBytes)
{
	__asm
	{
		// Copyright (C) 2009 Jan Boon (Kaetemi)
		// optimized on Intel Core 2 Duo T7500

		mov ecx, nBytes
		mov edi, dst
		mov esi, src
		add ecx, edi
		prefetchnta [esi]
		prefetchnta [esi+32]
		prefetchnta [esi+64]
		prefetchnta [esi+96]

		// copy blocks smaller than 512 bytes one byte at a time
		cmp nBytes, 512
		jge fast

slow:
		mov bl, [esi]
		mov [edi], bl
		inc edi
		inc esi
		cmp ecx, edi
		jnz slow
		jmp end

fast:
		// align dstEnd to 128 bytes
		and ecx, 0xFFFFFF80

		// compute the srcEnd that corresponds to the 128-byte aligned dstEnd
		mov ebx, esi
		sub ebx, edi
		add ebx, ecx

		// skip the unaligned head copy if dst is already aligned
		mov eax, edi
		and edi, 0xFFFFFF80
		cmp eax, edi
		jne first
		jmp more

first:
		// copy the first 128 bytes unaligned
		movdqu xmm0, [esi]
		movdqu xmm1, [esi+16]
		movdqu xmm2, [esi+32]
		movdqu xmm3, [esi+48]
		movdqu xmm4, [esi+64]
		movdqu xmm5, [esi+80]
		movdqu xmm6, [esi+96]
		movdqu xmm7, [esi+112]
		movdqu [eax], xmm0
		movdqu [eax+16], xmm1
		movdqu [eax+32], xmm2
		movdqu [eax+48], xmm3
		movdqu [eax+64], xmm4
		movdqu [eax+80], xmm5
		movdqu [eax+96], xmm6
		movdqu [eax+112], xmm7

		// add 128 bytes to edi aligned earlier
		add edi, 128

		// offset esi by the same value
		sub eax, edi
		sub esi, eax

		// last bytes if dst at dstEnd
		cmp ecx, edi
		jnz more
		jmp last

more:
		// handle equally aligned arrays
		mov eax, esi
		and eax, 0xFFFFFF80
		cmp eax, esi
		jne unaligned4k

aligned4k:
		mov eax, esi
		add eax, 4096
		cmp eax, ebx
		jle aligned4kin
		cmp ecx, edi
		jne alignedlast
		jmp last

aligned4kin:
		prefetchnta [esi]
		prefetchnta [esi+32]
		prefetchnta [esi+64]
		prefetchnta [esi+96]
		add esi, 128
		cmp eax, esi
		jne aligned4kin
		sub esi, 4096

aligned4kout:
		movdqa xmm0, [esi]
		movdqa xmm1, [esi+16]
		movdqa xmm2, [esi+32]
		movdqa xmm3, [esi+48]
		movdqa xmm4, [esi+64]
		movdqa xmm5, [esi+80]
		movdqa xmm6, [esi+96]
		movdqa xmm7, [esi+112]
		movntdq [edi], xmm0
		movntdq [edi+16], xmm1
		movntdq [edi+32], xmm2
		movntdq [edi+48], xmm3
		movntdq [edi+64], xmm4
		movntdq [edi+80], xmm5
		movntdq [edi+96], xmm6
		movntdq [edi+112], xmm7
		add esi, 128
		add edi, 128
		cmp eax, esi
		jne aligned4kout
		jmp aligned4k

alignedlast:
		mov eax, esi

alignedlastin:
		prefetchnta [esi]
		prefetchnta [esi+32]
		prefetchnta [esi+64]
		prefetchnta [esi+96]
		add esi, 128
		cmp ebx, esi
		jne alignedlastin
		mov esi, eax

alignedlastout:
		movdqa xmm0, [esi]
		movdqa xmm1, [esi+16]
		movdqa xmm2, [esi+32]
		movdqa xmm3, [esi+48]
		movdqa xmm4, [esi+64]
		movdqa xmm5, [esi+80]
		movdqa xmm6, [esi+96]
		movdqa xmm7, [esi+112]
		movntdq [edi], xmm0
		movntdq [edi+16], xmm1
		movntdq [edi+32], xmm2
		movntdq [edi+48], xmm3
		movntdq [edi+64], xmm4
		movntdq [edi+80], xmm5
		movntdq [edi+96], xmm6
		movntdq [edi+112], xmm7
		add esi, 128
		add edi, 128
		cmp ecx, edi
		jne alignedlastout
		jmp last

unaligned4k:
		mov eax, esi
		add eax, 4096
		cmp eax, ebx
		jle unaligned4kin
		cmp ecx, edi
		jne unalignedlast
		jmp last

unaligned4kin:
		prefetchnta [esi]
		prefetchnta [esi+32]
		prefetchnta [esi+64]
		prefetchnta [esi+96]
		add esi, 128
		cmp eax, esi
		jne unaligned4kin
		sub esi, 4096

unaligned4kout:
		movdqu xmm0, [esi]
		movdqu xmm1, [esi+16]
		movdqu xmm2, [esi+32]
		movdqu xmm3, [esi+48]
		movdqu xmm4, [esi+64]
		movdqu xmm5, [esi+80]
		movdqu xmm6, [esi+96]
		movdqu xmm7, [esi+112]
		movntdq [edi], xmm0
		movntdq [edi+16], xmm1
		movntdq [edi+32], xmm2
		movntdq [edi+48], xmm3
		movntdq [edi+64], xmm4
		movntdq [edi+80], xmm5
		movntdq [edi+96], xmm6
		movntdq [edi+112], xmm7
		add esi, 128
		add edi, 128
		cmp eax, esi
		jne unaligned4kout
		jmp unaligned4k

unalignedlast:
		mov eax, esi

unalignedlastin:
		prefetchnta [esi]
		prefetchnta [esi+32]
		prefetchnta [esi+64]
		prefetchnta [esi+96]
		add esi, 128
		cmp ebx, esi
		jne unalignedlastin
		mov esi, eax

unalignedlastout:
		movdqu xmm0, [esi]
		movdqu xmm1, [esi+16]
		movdqu xmm2, [esi+32]
		movdqu xmm3, [esi+48]
		movdqu xmm4, [esi+64]
		movdqu xmm5, [esi+80]
		movdqu xmm6, [esi+96]
		movdqu xmm7, [esi+112]
		movntdq [edi], xmm0
		movntdq [edi+16], xmm1
		movntdq [edi+32], xmm2
		movntdq [edi+48], xmm3
		movntdq [edi+64], xmm4
		movntdq [edi+80], xmm5
		movntdq [edi+96], xmm6
		movntdq [edi+112], xmm7
		add esi, 128
		add edi, 128
		cmp ecx, edi
		jne unalignedlastout
		jmp last

last:
		// get the last 128 bytes
		mov ecx, nBytes
		mov edi, dst
		mov esi, src
		add edi, ecx
		add esi, ecx
		sub edi, 128
		sub esi, 128

		// copy the last 128 bytes unaligned
		movdqu xmm0, [esi]
		movdqu xmm1, [esi+16]
		movdqu xmm2, [esi+32]
		movdqu xmm3, [esi+48]
		movdqu xmm4, [esi+64]
		movdqu xmm5, [esi+80]
		movdqu xmm6, [esi+96]
		movdqu xmm7, [esi+112]
		movdqu [edi], xmm0
		movdqu [edi+16], xmm1
		movdqu [edi+32], xmm2
		movdqu [edi+48], xmm3
		movdqu [edi+64], xmm4
		movdqu [edi+80], xmm5
		movdqu [edi+96], xmm6
		movdqu [edi+112], xmm7

end:
	}
	return dst;
}
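For reference, a minimal test along these lines could be used to exercise the routine on a deliberately misaligned destination (assuming a 32-bit MSVC build, since the inline assembly uses 32-bit registers). The buffer size and the one-byte offset are arbitrary example values.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    const int size = 1024 * 1024;         /* 1 MB example block */
    char *src = (char *)malloc(size);
    char *dst = (char *)malloc(size + 1); /* +1 so we can misalign it */
    int i;

    for (i = 0; i < size; ++i)
        src[i] = (char)(i & 0xFF);

    /* Offset the destination by one byte to force the unaligned path. */
    memcpy_kaetemi_sse2(dst + 1, src, size);
    printf("copies match: %s\n",
           memcmp(dst + 1, src, size) == 0 ? "yes" : "no");

    free(src);
    free(dst);
    return 0;
}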