Thursday, May 31, 2012

Fastest way to zero out memory - stream past cache with movntdq/_mm_stream_si128

As a rule of thumb, this technique is only beneficial if the buffer is larger than half the size of the largest cache level.
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <emmintrin.h>
#include <intrin.h>

// Shorthand for the unsigned 64-bit type that holds __rdtsc() readings.
typedef unsigned long long ull;

// Timer state shared by starttimer() and stoptimer().
// starttimer() stores the starting __rdtsc() reading in tsc and the starting
// clock() value in clk; stoptimer() overwrites both with the elapsed amounts.
ull tsc;
int clk;

// Stream 64 bytes to DRAM, bypassing the caches.
//
// Writes the 16-byte value i four times, at byte offsets 0, 16, 32 and 48
// from _p, using _mm_stream_si128 (movntdq).
//
// _p must be 16-byte aligned and must point to at least 64 writable bytes.
template<typename T>
inline void memstream(T *_p, const __m128i& i)
{
    // Address the destination in bytes so the offsets below are independent of T.
    char* p = (char*)_p;
    _mm_stream_si128((__m128i *)&p[0], i);
    _mm_stream_si128((__m128i *)&p[16], i);
    _mm_stream_si128((__m128i *)&p[32], i);
    _mm_stream_si128((__m128i *)&p[48], i);
}

// Execute CPUID (function 0) and throw the result away.
// Called right before each __rdtsc() read; CPUID is used here as a
// serializing instruction so earlier work is finished before the counter
// is sampled.
inline void serialize()
{
    int regs[4];
    __cpuid(regs, 0);
}

// Begin a measurement: store the current clock() value in clk, serialize,
// then store the current time-stamp counter in tsc.
inline void starttimer()
{
    clk = clock();
    serialize();
    tsc = __rdtsc();
}

// End the measurement begun by starttimer() and print one result line.
//
// n is the label printed in the first column.
//
// On return the globals tsc and clk hold the ELAPSED values, not start values,
// so starttimer() has to be called again before the next measurement.
inline void stoptimer(const char* n) {
    serialize();
    ull tsc2 = __rdtsc();
    tsc = tsc2 - tsc;
    clk = clock() - clk;
    // NOTE(review): clk is printed as "msec", which presumes
    // CLOCKS_PER_SEC == 1000 -- confirm for the target runtime.
    // tsc is unsigned 64-bit, hence %llu.
    printf("%-10s %6i msec, %10llu clocks, \n", n, clk, tsc);
}

int main()
    ull tsc = __rdtsc();

    ull tsc2 = __rdtsc();

    printf("rdtsc overhead: %i ticks\n", tsc2 - tsc);

    // Allocate 1.2 GB of RAM
    const int cnt = 1024*1024*400; 
    size_t sz = cnt * sizeof(int);
    int *ary = (int*)malloc(sz+64);
    int p = (int)ary; p += 64; p = p & ~63; ary = (int*)p; // align on 64 bytes

    // Can set it to anything, limitation is memory bandwidth anyways.
    __m128i zero = {0};/*{1,2,3,4,

    // Run a few times to get accurate results
    do {
        for (int i = 0; i < cnt; i++)
            ary[i] = 0;
        stoptimer("ary[i] = 0"); // 900 with msvc, 358 with intel

        for (int i = 0; i < cnt; i += 16)
            memstream(&ary[i], zero);
        stoptimer("memstream"); // 358 with both

        memset(ary, 0, sz);
        stoptimer("memset"); // 940

    } while(getchar());

    // Ensure array not optimized away
    printf("%i\n", ary[rand()]);

Visual C++ (2010):
ary[i] = 0    962 msec, 2309504493 clocks,
memstream     362 msec,  869966541 clocks,
memset        949 msec, 2276475597 clocks,

ary[i] = 0    958 msec, 2298528630 clocks,
memstream     365 msec,  876925611 clocks,
memset        960 msec, 2303561970 clocks,

ary[i] = 0    943 msec, 2263462995 clocks,
memstream     363 msec,  870981457 clocks,
memset        968 msec, 2321940605 clocks,

Generated assembly:
00C71076  xor         eax,eax  
00C71078  mov         ecx,19000000h  
00C7107D  mov         edi,esi  
00C7107F  rep stos    dword ptr es:[edi]  

00C71081  lea         eax,[esi+20h]  
00C71084  mov         ecx,1900000h  
00C71089  lea         esp,[esp]  

00C71090  movntdq     xmmword ptr [eax-20h],xmm0  
00C71095  movntdq     xmmword ptr [eax-10h],xmm0  
00C7109A  movntdq     xmmword ptr [eax],xmm0  
00C7109E  movntdq     xmmword ptr [eax+10h],xmm0  
00C710A3  add         eax,40h  
00C710A6  dec         ecx  
00C710A7  jne         main+90h (0C71090h)  

00C710A9  push        64000000h  
00C710AE  push        ecx  
00C710AF  push        esi  
00C710B0  call        memset (0C75910h) 
Note that MSVC converts the ary[i] = 0 loop into an inline rep stos (the same code the Intel compiler generates for memset), while the explicit memset remains a call to the library memset.

Intel C++ (for vs 2010)
ary[i] = 0    435 msec, 1044279793 clocks,
memstream     357 msec,  855606474 clocks,
memset        928 msec, 2226664378 clocks,

ary[i] = 0    361 msec,  866360328 clocks,
memstream     371 msec,  890675064 clocks,
memset        933 msec, 2238792639 clocks,

ary[i] = 0    368 msec,  882105732 clocks,
memstream     358 msec,  858516585 clocks,
memset        959 msec, 2301478323 clocks,

Generated assembly:
00071088  movntdq     xmmword ptr [esi+eax*4],xmm0  
0007108D  add         eax,4  
00071090  cmp         eax,19000000h  
00071095  jb          main+88h (71088h)  

00071097  movdqa      xmm0,xmmword ptr [___xt_z+24h (79160h)]  
0007109F  mov         eax,ebx 
000710A1  mov         edx,eax  
000710A3  inc         eax  
000710A4  shl         edx,6  
000710A7  cmp         eax,1900000h  
000710AC  movntdq     xmmword ptr [edx+esi],xmm0  
000710B1  movntdq     xmmword ptr [edx+esi+10h],xmm0  
000710B7  movntdq     xmmword ptr [edx+esi+20h],xmm0  
000710BD  movntdq     xmmword ptr [edx+esi+30h],xmm0  
000710C3  jb          main+0A1h (710A1h)  

000710C5  mov         edi,esi  
000710C7  xor         eax,eax  
000710C9  mov         ecx,19000000h  // = 1024*1024*400
000710CE  rep stos    dword ptr es:[edi]  

Note that there's no call to memset here; it is inlined as rep stos (which memset itself also uses, so this makes no big difference). The Intel compiler is also smart enough to use movntdq (= _mm_stream_si128) by itself for the ary[i] = 0 loop.

 rep stos: Fill (E)CX doublewords at ES:[(E)DI--] with EAX, while ECX-- != 0

Tuesday, May 29, 2012

No installers

When publishing software, make it installerless and portable (meaning it shouldn't write its settings to the registry but to a user-specifiable folder (a subfolder of itself by default)).

Also, when zipping things up, don't put the files directly in the root of the archive, which forces people to use the "extract to folder..." option instead of "extract here". It also stops people from downloading your software to a temporary folder, opening it in a zip viewing program and dragging and dropping it somewhere else, because they first have to create a folder wherever they want to put it.

Instead, create a single folder in the zip and put the files there. Download some of my software to see what I mean.

GnuCalc - A good commandline calculator

Download: Get
(place readline5.dll in GnuCalc bin folder).

For integer only, "set /a 2+2" is good enough.


Monday, May 14, 2012

highgui.h, highgui.dll download

fatal error C1083: Cannot open include file: 'highgui.h': No such file or directory

Just came across some OpenGL Tutorials which depend on OpenCV Lib's Image loading. OpenCV is a huge library with tons of other stuff you won't need for compiling these samples, so I went ahead and repacked just everything you need for compiling and running these samples (only tested simpleGLUT-Texturing). (only vs10x86 binaries and required headers included)

When compiling these samples, make sure to set Linker > General > Output File back to default and remove any input libraries, then add the ones included here (e.g. by just drag and dropping them into the source file area of vs10) and copy the dlls in bin to your project dir.