Fastest way to zero out memory - stream past cache with movntdq/_mm_stream_si128
As a rule of thumb, this technique is only beneficial if the buffer is larger than half the largest level cache.
Visual C++ (2010):
Intel C++ (for vs 2010)
rep stos: Fill (E)CX doublewords at ES:[(E)DI--] with EAX, while ECX-- != 0
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <emmintrin.h>
#if defined(_MSC_VER)
#include <intrin.h>     // __cpuid, __rdtsc (MSVC / Intel C++ on Windows)
#else
#include <x86intrin.h>  // __rdtsc (GCC / Clang)
#endif

typedef unsigned long long ull;

// Shared timer state: written by starttimer(), turned into elapsed values by
// stoptimer().
ull tsc;      // time-stamp counter
clock_t clk;  // clock() ticks

// Stream 64 bytes to DRAM, bypassing the caches. _p must be 16-byte aligned.
//
// _p: destination; 64 bytes starting here are overwritten.
// i:  16-byte pattern that is stored four times in a row.
//
// Non-temporal stores are weakly ordered: issue _mm_sfence() before handing
// the buffer to another thread.
template<typename T>
inline void memstream(T *_p, const __m128i& i)
{
    char* p = reinterpret_cast<char*>(_p);
    _mm_stream_si128(reinterpret_cast<__m128i*>(p +  0), i);
    _mm_stream_si128(reinterpret_cast<__m128i*>(p + 16), i);
    _mm_stream_si128(reinterpret_cast<__m128i*>(p + 32), i);
    _mm_stream_si128(reinterpret_cast<__m128i*>(p + 48), i);
}

// Execute CPUID (leaf 0) purely for its serializing side effect, so that the
// rdtsc readings around it are not reordered with the code being measured.
inline void serialize()
{
#if defined(_MSC_VER)
    int regs[4];
    __cpuid(regs, 0);
#else
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    __asm__ __volatile__("cpuid"
                         : "+a"(eax), "=b"(ebx), "+c"(ecx), "=d"(edx)
                         :
                         : "memory");
    (void)eax; (void)ebx; (void)ecx; (void)edx;
#endif
}

// Record the start of a measurement in the globals clk and tsc.
inline void starttimer()
{
    clk = clock();
    serialize();
    tsc = __rdtsc();
}

// Finish the measurement begun by starttimer() and print one result line.
//
// n: label printed in the first column.
//
// Afterwards tsc and clk hold the elapsed values, not absolute readings.
inline void stoptimer(const char* n)
{
    serialize();
    const ull tsc2 = __rdtsc();
    tsc = tsc2 - tsc;
    clk = clock() - clk;
    // clock() counts in units of 1/CLOCKS_PER_SEC; scale so the "msec" label
    // is right even where CLOCKS_PER_SEC is not 1000.
    const int msec = static_cast<int>(clk * 1000.0 / CLOCKS_PER_SEC);
    printf("%-10s %6i msec, %10llu clocks, \n", n, msec, tsc);
}

// Benchmark three ways of zeroing a large buffer: a plain store loop,
// non-temporal streaming stores (memstream) and memset. Each pass prints one
// line per method; a pass is repeated for every character read from stdin and
// the program exits on end-of-file.
int main()
{
    // Cost of one serialize() + rdtsc pair, for reference.
    serialize();
    const ull overhead_begin = __rdtsc();
    serialize();
    const ull overhead_end = __rdtsc();
    printf("rdtsc overhead: %llu ticks\n", overhead_end - overhead_begin);

    // Allocate 1600 MiB (~1.6 GB) of RAM, plus slack for manual alignment.
    const int cnt = 1024*1024*400;
    const size_t sz = cnt * sizeof(int);
    void* raw = malloc(sz + 64);
    if (raw == NULL) {
        fprintf(stderr, "malloc of %llu bytes failed\n",
                static_cast<ull>(sz + 64));
        return 1;
    }

    // Align on 64 bytes. uintptr_t keeps the full pointer width, so this is
    // also correct on 64-bit targets (a cast to int would truncate there).
    const uintptr_t aligned =
        (reinterpret_cast<uintptr_t>(raw) + 63) & ~static_cast<uintptr_t>(63);
    int* ary = reinterpret_cast<int*>(aligned);

    // Can set it to anything, limitation is memory bandwidth anyways.
    const __m128i zero = _mm_setzero_si128();

    // Run a few times to get accurate results
    do {
        starttimer();
        for (int i = 0; i < cnt; i++)
            ary[i] = 0;
        stoptimer("ary[i] = 0");  // 900 with msvc, 358 with intel

        starttimer();
        // ary is int*, so 16 elements == 64 bytes == one memstream() call.
        for (int i = 0; i < cnt; i += 16)
            memstream(&ary[i], zero);
        stoptimer("memstream");   // 358 with both

        starttimer();
        memset(ary, 0, sz);
        stoptimer("memset");      // 940
    } while (getchar() != EOF);   // EOF is non-zero: test for it explicitly

    // Ensure array not optimized away. The index is reduced modulo cnt
    // because RAND_MAX may be larger than the array.
    srand(static_cast<unsigned>(time(0)));
    printf("%i\n", ary[rand() % cnt]);

    free(raw);
    return 0;
}
Visual C++ (2010):
ary[i] = 0 962 msec, 2309504493 clocks, memstream 362 msec, 869966541 clocks, memset 949 msec, 2276475597 clocks, ary[i] = 0 958 msec, 2298528630 clocks, memstream 365 msec, 876925611 clocks, memset 960 msec, 2303561970 clocks, ary[i] = 0 943 msec, 2263462995 clocks, memstream 363 msec, 870981457 clocks, memset 968 msec, 2321940605 clocks,Generated assembly:
00C71076 xor eax,eax 00C71078 mov ecx,19000000h 00C7107D mov edi,esi 00C7107F rep stos dword ptr es:[edi] 00C71081 lea eax,[esi+20h] 00C71084 mov ecx,1900000h 00C71089 lea esp,[esp] 00C71090 movntdq xmmword ptr [eax-20h],xmm0 00C71095 movntdq xmmword ptr [eax-10h],xmm0 00C7109A movntdq xmmword ptr [eax],xmm0 00C7109E movntdq xmmword ptr [eax+10h],xmm0 00C710A3 add eax,40h 00C710A6 dec ecx 00C710A7 jne main+90h (0C71090h) 00C710A9 push 64000000h 00C710AE push ecx 00C710AF push esi 00C710B0 call memset (0C75910h) Note that MSVC compiles the ary[i] = 0 loop to an inline rep stos (the same instruction the Intel compiler emits for memset, see below), while the explicit memset remains a call to the library memset.
Intel C++ (for vs 2010)
ary[i] = 0 435 msec, 1044279793 clocks, memstream 357 msec, 855606474 clocks, memset 928 msec, 2226664378 clocks, ary[i] = 0 361 msec, 866360328 clocks, memstream 371 msec, 890675064 clocks, memset 933 msec, 2238792639 clocks, ary[i] = 0 368 msec, 882105732 clocks, memstream 358 msec, 858516585 clocks, memset 959 msec, 2301478323 clocks,Generated assembly:
00071088 movntdq xmmword ptr [esi+eax*4],xmm0 0007108D add eax,4 00071090 cmp eax,19000000h 00071095 jb main+88h (71088h) 00071097 movdqa xmm0,xmmword ptr [___xt_z+24h (79160h)] 0007109F mov eax,ebx 000710A1 mov edx,eax 000710A3 inc eax 000710A4 shl edx,6 000710A7 cmp eax,1900000h 000710AC movntdq xmmword ptr [edx+esi],xmm0 000710B1 movntdq xmmword ptr [edx+esi+10h],xmm0 000710B7 movntdq xmmword ptr [edx+esi+20h],xmm0 000710BD movntdq xmmword ptr [edx+esi+30h],xmm0 000710C3 jb main+0A1h (710A1h) 000710C5 mov edi,esi 000710C7 xor eax,eax 000710C9 mov ecx,19000000h // = 1024*1024*400 000710CE rep stos dword ptr es:[edi] Note that there is no call to memset: the Intel compiler inlines it as rep stos (which the library memset also uses, so this makes no big difference). The Intel compiler is smart enough to use movntdq (= _mm_stream_si128) by itself for the ary[i] = 0 loop.
rep stos: Fill (E)CX doublewords at ES:[(E)DI--] with EAX, while ECX-- != 0
2 all (& paul frischknecht): just FYI, these results are invalid AND the code that you've posted is pretty horrible. For example 'align on 64 bytes' code is invalid on x64. Code "for (int i = 0; i < cnt; i += 16)" should use 64 bytes as per the memstream.
Well, this was only intended for 32 bits; casting a pointer to int is indeed a bad idea in 64 bits, you're right xD. The i += 16 is correct because ary is an int*, so each step skips 16 * 4 = 64 bytes.
ReplyDelete