SpeedTest.cpp 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. #include "SpeedTest.h"
  2. #include "Random.h"
  3. #include <stdio.h> // for printf
  4. #include <memory.h> // for memset
  5. #include <math.h> // for sqrt
  6. #include <algorithm> // for sort
  7. //-----------------------------------------------------------------------------
  8. // We view our timing values as a series of random variables V that has been
  9. // contaminated with occasional outliers due to cache misses, thread
  10. // preemption, etcetera. To filter out the outliers, we search for the largest
  11. // subset of V such that all its values are within three standard deviations
  12. // of the mean.
  13. double CalcMean ( std::vector<double> & v )
  14. {
  15. double mean = 0;
  16. for(int i = 0; i < (int)v.size(); i++)
  17. {
  18. mean += v[i];
  19. }
  20. mean /= double(v.size());
  21. return mean;
  22. }
  23. double CalcMean ( std::vector<double> & v, int a, int b )
  24. {
  25. double mean = 0;
  26. for(int i = a; i <= b; i++)
  27. {
  28. mean += v[i];
  29. }
  30. mean /= (b-a+1);
  31. return mean;
  32. }
  33. double CalcStdv ( std::vector<double> & v, int a, int b )
  34. {
  35. double mean = CalcMean(v,a,b);
  36. double stdv = 0;
  37. for(int i = a; i <= b; i++)
  38. {
  39. double x = v[i] - mean;
  40. stdv += x*x;
  41. }
  42. stdv = sqrt(stdv / (b-a+1));
  43. return stdv;
  44. }
  45. // Return true if the largest value in v[0,len) is more than three
  46. // standard deviations from the mean
  47. bool ContainsOutlier ( std::vector<double> & v, size_t len )
  48. {
  49. double mean = 0;
  50. for(size_t i = 0; i < len; i++)
  51. {
  52. mean += v[i];
  53. }
  54. mean /= double(len);
  55. double stdv = 0;
  56. for(size_t i = 0; i < len; i++)
  57. {
  58. double x = v[i] - mean;
  59. stdv += x*x;
  60. }
  61. stdv = sqrt(stdv / double(len));
  62. double cutoff = mean + stdv*3;
  63. return v[len-1] > cutoff;
  64. }
  65. // Do a binary search to find the largest subset of v that does not contain
  66. // outliers.
  67. void FilterOutliers ( std::vector<double> & v )
  68. {
  69. std::sort(v.begin(),v.end());
  70. size_t len = 0;
  71. for(size_t x = 0x40000000; x; x = x >> 1 )
  72. {
  73. if((len | x) >= v.size()) continue;
  74. if(!ContainsOutlier(v,len | x))
  75. {
  76. len |= x;
  77. }
  78. }
  79. v.resize(len);
  80. }
  81. // Iteratively tighten the set to find a subset that does not contain
  82. // outliers. I'm not positive this works correctly in all cases.
  83. void FilterOutliers2 ( std::vector<double> & v )
  84. {
  85. std::sort(v.begin(),v.end());
  86. int a = 0;
  87. int b = (int)(v.size() - 1);
  88. for(int i = 0; i < 10; i++)
  89. {
  90. //printf("%d %d\n",a,b);
  91. double mean = CalcMean(v,a,b);
  92. double stdv = CalcStdv(v,a,b);
  93. double cutA = mean - stdv*3;
  94. double cutB = mean + stdv*3;
  95. while((a < b) && (v[a] < cutA)) a++;
  96. while((b > a) && (v[b] > cutB)) b--;
  97. }
  98. std::vector<double> v2;
  99. v2.insert(v2.begin(),v.begin()+a,v.begin()+b+1);
  100. v.swap(v2);
  101. }
  102. //-----------------------------------------------------------------------------
  103. // We really want the rdtsc() calls to bracket the function call as tightly
  104. // as possible, but that's hard to do portably. We'll try and get as close as
  105. // possible by marking the function as NEVER_INLINE (to keep the optimizer from
  106. // moving it) and marking the timing variables as "volatile register".
  107. NEVER_INLINE int64_t timehash ( pfHash hash, const void * key, int len, int seed )
  108. {
  109. volatile register int64_t begin,end;
  110. uint32_t temp[16];
  111. begin = rdtsc();
  112. hash(key,len,seed,temp);
  113. end = rdtsc();
  114. return end-begin;
  115. }
  116. //-----------------------------------------------------------------------------
  117. // Specialized procedure for small lengths. Serialize invocations of the hash
  118. // function, make sure they would not be computed in parallel on an out-of-order CPU.
  119. NEVER_INLINE int64_t timehash_small ( pfHash hash, const void * key, int len, int seed )
  120. {
  121. const int NUM_TRIALS = 200;
  122. volatile unsigned long long int begin, end;
  123. uint32_t hash_temp[16] = {};
  124. uint32_t *buf = new uint32_t[(len + 3) / 4];
  125. memcpy(buf,key,len);
  126. begin = rdtsc();
  127. for(int i = 0; i < NUM_TRIALS; i++) {
  128. hash(buf,len,seed,hash_temp);
  129. // XXX Add dependency between invocations of hash-function to prevent parallel
  130. // evaluation of them. However this way the invocations still would not be
  131. // fully serialized. Another option is to use lfence instruction (load-from-memory
  132. // serialization instruction) or mfence (load-from-memory AND store-to-memory
  133. // serialization instruction):
  134. // __asm volatile ("lfence");
  135. // It's hard to say which one is the most realistic and sensible approach.
  136. seed += hash_temp[0];
  137. buf[0] ^= hash_temp[0];
  138. }
  139. end = rdtsc();
  140. delete[] buf;
  141. return (int64_t)((end - begin) / (double)NUM_TRIALS);
  142. }
  143. //-----------------------------------------------------------------------------
  144. double SpeedTest ( pfHash hash, uint32_t seed, const int trials, const int blocksize, const int align )
  145. {
  146. Rand r(seed);
  147. uint8_t * buf = new uint8_t[blocksize + 512];
  148. uint64_t t1 = reinterpret_cast<uint64_t>(buf);
  149. t1 = (t1 + 255) & BIG_CONSTANT(0xFFFFFFFFFFFFFF00);
  150. t1 += align;
  151. uint8_t * block = reinterpret_cast<uint8_t*>(t1);
  152. r.rand_p(block,blocksize);
  153. //----------
  154. std::vector<double> times;
  155. times.reserve(trials);
  156. for(int itrial = 0; itrial < trials; itrial++)
  157. {
  158. r.rand_p(block,blocksize);
  159. double t;
  160. if(blocksize < 100)
  161. {
  162. t = (double)timehash_small(hash,block,blocksize,itrial);
  163. }
  164. else
  165. {
  166. t = (double)timehash(hash,block,blocksize,itrial);
  167. }
  168. if(t > 0) times.push_back(t);
  169. }
  170. //----------
  171. std::sort(times.begin(),times.end());
  172. FilterOutliers(times);
  173. delete [] buf;
  174. return CalcMean(times);
  175. }
  176. //-----------------------------------------------------------------------------
  177. // 256k blocks seem to give the best results.
  178. void BulkSpeedTest ( pfHash hash, uint32_t seed )
  179. {
  180. const int trials = 2999;
  181. const int blocksize = 256 * 1024;
  182. printf("Bulk speed test - %d-byte keys\n",blocksize);
  183. double sumbpc = 0.0;
  184. volatile double warmup_cycles = SpeedTest(hash,seed,trials,blocksize,0);
  185. for(int align = 7; align >= 0; align--)
  186. {
  187. double cycles = SpeedTest(hash,seed,trials,blocksize,align);
  188. double bestbpc = double(blocksize)/cycles;
  189. double bestbps = (bestbpc * 3000000000.0 / 1048576.0);
  190. printf("Alignment %2d - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",align,bestbpc,bestbps);
  191. sumbpc += bestbpc;
  192. }
  193. sumbpc = sumbpc / 8.0;
  194. printf("Average - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",sumbpc,(sumbpc * 3000000000.0 / 1048576.0));
  195. }
  196. //-----------------------------------------------------------------------------
  197. double TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose )
  198. {
  199. const int trials = 99999;
  200. if(verbose) printf("Small key speed test - %4d-byte keys - ",keysize);
  201. double cycles = SpeedTest(hash,seed,trials,keysize,0);
  202. printf("%8.2f cycles/hash\n",cycles);
  203. return cycles;
  204. }
  205. //-----------------------------------------------------------------------------