Fastest way to zero out memory - stream past cache with movntdq/_mm_stream_si128
As a rule of thumb, this technique is only beneficial if the buffer is larger than half the largest level cache. #include <time.h> #include <stdio.h> #include <stdlib.h> #include <stdlib.h> #include <emmintrin.h> #include <intrin.h> typedef unsigned long long ull ; ull tsc ; int clk ; // Stream 64 Bytes to DRAM, bypass caches. _p must be 16-byte aligned. template < typename T > inline void memstream ( T * _p , const __m128i & i ) { char * p = ( char *) _p ; _mm_stream_si128 (( __m128i *)& p [ 0 ], i ); _mm_stream_si128 (( __m128i *)& p [ 16 ], i ); _mm_stream_si128 (( __m128i *)& p [ 32 ], i ); _mm_stream_si128 (( __m128i *)& p [ 48 ], i ); } inline void serialize () { int a [ 4 ]; __cpuid ( a , 0 );} inline void starttimer () { clk = clock (); serialize (); tsc = __rdtsc ();} inline void stoptimer ( char * n ) { serialize (); ull tsc2 = __rdtsc