Fastest way to zero out memory - stream past cache with movntdq/_mm_stream_si128

As a rule of thumb, this technique is only beneficial if the buffer is larger than half the size of the largest cache level.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <emmintrin.h>
#include <intrin.h>

// Shorthand for unsigned long long, used for time-stamp counter values.
typedef unsigned long long ull;

// Timer state shared by starttimer() and stoptimer().
// tsc: __rdtsc() reading taken by starttimer(); stoptimer() overwrites it
//      with the elapsed tick count.
ull tsc;
// clk: clock() reading taken by starttimer(); stoptimer() overwrites it
//      with the elapsed clock() ticks.
int clk;

// Fills the 64 bytes starting at _p with four copies of the 16-byte value i,
// using non-temporal stores (_mm_stream_si128) that hint the CPU to bypass
// the caches. _p must be 16-byte aligned.
template<typename T>
inline void memstream(T *_p, const __m128i& i)
{
    __m128i *dst = (__m128i *)_p;   // view the destination as 16-byte lanes
    _mm_stream_si128(dst + 0, i);
    _mm_stream_si128(dst + 1, i);
    _mm_stream_si128(dst + 2, i);
    _mm_stream_si128(dst + 3, i);
}

// Executes CPUID (leaf 0) through the __cpuid intrinsic and discards the
// returned register values. CPUID is a serializing instruction, so calling
// this right before __rdtsc() keeps earlier work from overlapping the read.
inline void serialize()
{
    int regs[4];
    __cpuid(regs, 0);
}

// Starts a measurement: stores the current clock() value in the global clk,
// then serializes and stores the current time-stamp counter in the global tsc.
inline void starttimer()
{
    clk = clock();
    serialize();
    tsc = __rdtsc();
}

inline void stoptimer(char* n) {
    serialize(); ull tsc2 = __rdtsc(); tsc = tsc2 - tsc;  clk = clock() - clk;
    printf("%-10s %6i msec, %10I64i clocks, \n", n, clk, tsc); 
}

int main()
{
    serialize();
    ull tsc = __rdtsc();

    serialize();
    ull tsc2 = __rdtsc();

    printf("rdtsc overhead: %i ticks\n", tsc2 - tsc);

    // Allocate 1.2 GB of RAM
    const int cnt = 1024*1024*400; 
    size_t sz = cnt * sizeof(int);
    int *ary = (int*)malloc(sz+64);
    int p = (int)ary; p += 64; p = p & ~63; ary = (int*)p; // align on 64 bytes

    // Can set it to anything, limitation is memory bandwidth anyways.
    __m128i zero = {0};/*{1,2,3,4,
                    5,6,7,8,
                    9,10,11,12,
                    13,14,15,16};*/

    // Run a few times to get accurate results
    do {
        starttimer();
        for (int i = 0; i < cnt; i++)
            ary[i] = 0;
        stoptimer("ary[i] = 0"); // 900 with msvc, 358 with intel

        starttimer();
        for (int i = 0; i < cnt; i += 16)
            memstream(&ary[i], zero);
        stoptimer("memstream"); // 358 with both

        starttimer();
        memset(ary, 0, sz);
        stoptimer("memset"); // 940

    } while(getchar());

    // Ensure array not optimized away
    srand(time(0)); 
    printf("%i\n", ary[rand()]);
}

Visual C++ (2010):
ary[i] = 0    962 msec, 2309504493 clocks,
memstream     362 msec,  869966541 clocks,
memset        949 msec, 2276475597 clocks,

ary[i] = 0    958 msec, 2298528630 clocks,
memstream     365 msec,  876925611 clocks,
memset        960 msec, 2303561970 clocks,

ary[i] = 0    943 msec, 2263462995 clocks,
memstream     363 msec,  870981457 clocks,
memset        968 msec, 2321940605 clocks,

Generated assembly:
00C71076  xor         eax,eax  
00C71078  mov         ecx,19000000h  
00C7107D  mov         edi,esi  
00C7107F  rep stos    dword ptr es:[edi]  

00C71081  lea         eax,[esi+20h]  
00C71084  mov         ecx,1900000h  
00C71089  lea         esp,[esp]  

00C71090  movntdq     xmmword ptr [eax-20h],xmm0  
00C71095  movntdq     xmmword ptr [eax-10h],xmm0  
00C7109A  movntdq     xmmword ptr [eax],xmm0  
00C7109E  movntdq     xmmword ptr [eax+10h],xmm0  
00C710A3  add         eax,40h  
00C710A6  dec         ecx  
00C710A7  jne         main+90h (0C71090h)  

00C710A9  push        64000000h  
00C710AE  push        ecx  
00C710AF  push        esi  
00C710B0  call        memset (0C75910h) 
Note that MSVC compiles the ary[i] = 0 loop into the same inline rep stos that the Intel compiler generates for memset, whereas for memset itself it emits an actual call to the library function.

Intel C++ (for vs 2010)
ary[i] = 0    435 msec, 1044279793 clocks,
memstream     357 msec,  855606474 clocks,
memset        928 msec, 2226664378 clocks,

ary[i] = 0    361 msec,  866360328 clocks,
memstream     371 msec,  890675064 clocks,
memset        933 msec, 2238792639 clocks,

ary[i] = 0    368 msec,  882105732 clocks,
memstream     358 msec,  858516585 clocks,
memset        959 msec, 2301478323 clocks,

Generated assembly:
00071088  movntdq     xmmword ptr [esi+eax*4],xmm0  
0007108D  add         eax,4  
00071090  cmp         eax,19000000h  
00071095  jb          main+88h (71088h)  

00071097  movdqa      xmm0,xmmword ptr [___xt_z+24h (79160h)]  
0007109F  mov         eax,ebx 
 
000710A1  mov         edx,eax  
000710A3  inc         eax  
000710A4  shl         edx,6  
000710A7  cmp         eax,1900000h  
000710AC  movntdq     xmmword ptr [edx+esi],xmm0  
000710B1  movntdq     xmmword ptr [edx+esi+10h],xmm0  
000710B7  movntdq     xmmword ptr [edx+esi+20h],xmm0  
000710BD  movntdq     xmmword ptr [edx+esi+30h],xmm0  
000710C3  jb          main+0A1h (710A1h)  

000710C5  mov         edi,esi  
000710C7  xor         eax,eax  
000710C9  mov         ecx,19000000h  // = 1024*1024*400
000710CE  rep stos    dword ptr es:[edi]  

Note that there is no call to memset here (the library version also uses rep stos, so this makes no big difference). The Intel compiler is smart enough to use movntdq (= _mm_stream_si128) by itself.

 rep stos: Fill (E)CX doublewords at ES:[(E)DI--] with EAX, while ECX-- != 0

Comments

  1. 2 all (& paul frischknecht): just FYI, these results are invalid AND the code that you've posted is pretty horrible. For example 'align on 64 bytes' code is invalid on x64. Code "for (int i = 0; i < cnt; i += 16)" should use 64 bytes as per the memstream.

    ReplyDelete
  2. Well this was only intended for 32 bits, casting a pointer to int is indeed a bad idea in 64 bits, you're right xD. The + 16 is correct because ary is int*, so we skip 16 * 4 = 64 bytes.

    ReplyDelete

Post a Comment

Popular posts from this blog

Coins - Pay to Win

How to monitor a directory for changes with ReadDirectoryChangesW