// bench-stencil3.cc (5.1 KB)
  1. // -*- mode: c++; coding: utf-8 -*-
  2. /// @file bench-stencil3.cc
  3. /// @brief Stencil-as-view.
  4. // (c) Daniel Llorens - 2016-2017
  5. // This library is free software; you can redistribute it and/or modify it under
  6. // the terms of the GNU Lesser General Public License as published by the Free
  7. // Software Foundation; either version 3 of the License, or (at your option) any
  8. // later version.
  // TODO: performance is bad here; see also bench-stencil1.cc and bench-stencil2.cc.
  10. #include <iostream>
  11. #include <iomanip>
  12. #include <random>
  13. #include "ra/ra.hh"
  14. #include "ra/test.hh"
  15. #include "ra/bench.hh"
  16. using std::cout, std::endl, std::flush, ra::TestRecorder;
  17. using real = double;
  18. int nx = 100;
  19. int ny = 100;
  20. int nz = 100;
  21. int ts = 10;
  22. auto I = ra::iota(nx-2, 1);
  23. auto J = ra::iota(ny-2, 1);
  24. auto K = ra::iota(nz-2, 1);
  25. constexpr ra::Small<real, 3, 3, 3> mask = { 0, 0, 0, 0, 1, 0, 0, 0, 0,
  26. 0, 1, 0, 1, -6, 1, 0, 1, 0,
  27. 0, 0, 0, 0, 1, 0, 0, 0, 0 };
  28. #define THEOP template <class A_, class Anext_, class Astencil_> __attribute__((noinline)) \
  29. auto operator()(A_ & A, Anext_ & Anext, Astencil_ & Astencil)
  30. // sensitive to RA_DO_CHECK.
  31. struct f_raw
  32. {
  33. THEOP
  34. {
  35. for (int i=1; i+1<nx; ++i) {
  36. for (int j=1; j+1<ny; ++j) {
  37. for (int k=1; k+1<nz; ++k) {
  38. Anext(i, j, k) = -6*A(i, j, k)
  39. + A(i+1, j, k) + A(i, j+1, k) + A(i, j, k+1)
  40. + A(i-1, j, k) + A(i, j-1, k) + A(i, j, k-1);
  41. }
  42. }
  43. }
  44. std::swap(A.p, Anext.p);
  45. };
  46. };
  47. // about as fast as f_raw, but no stencil. Insensitive to RA_DO_CHECK.
  48. struct f_slices
  49. {
  50. THEOP
  51. {
  52. Anext(I, J, K) = -6*A(I, J, K)
  53. + A(I+1, J, K) + A(I, J+1, K) + A(I, J, K+1)
  54. + A(I-1, J, K) + A(I, J-1, K) + A(I, J, K-1);
  55. std::swap(A.p, Anext.p);
  56. };
  57. };
  58. // with stencil, about as fast as f_raw. Sensitive to RA_DO_CHECK.
  59. struct f_stencil_explicit
  60. {
  61. THEOP
  62. {
  63. Astencil.p = A.data();
  64. Anext(I, J, K) = map([](auto && A) { return -6*A(1, 1, 1)
  65. + A(2, 1, 1) + A(1, 2, 1) + A(1, 1, 2)
  66. + A(0, 1, 1) + A(1, 0, 1) + A(1, 1, 0); },
  67. iter<3>(Astencil));
  68. std::swap(A.p, Anext.p);
  69. };
  70. };
  71. // sum() inside uses run time sizes and 3-dim ply_ravel loop which is much (10x w/gcc) slower. TODO
  72. struct f_stencil_arrayop
  73. {
  74. THEOP
  75. {
  76. Astencil.p = A.data();
  77. Anext(I, J, K) = map([](auto && s) { return sum(s*mask); }, iter<3>(Astencil));
  78. std::swap(A.p, Anext.p);
  79. };
  80. };
  81. // allows traversal order to be chosen between all 6 axes in ply_ravel. 30x slower. TODO
  82. struct f_sumprod
  83. {
  84. THEOP
  85. {
  86. Astencil.p = A.data();
  87. Anext(I, J, K) = 0; // TODO miss notation for sum-of-axes without preparing destination...
  88. Anext(I, J, K) += map(ra::wrank<3, 3>(ra::times()), Astencil, mask);
  89. std::swap(A.p, Anext.p);
  90. };
  91. };
  92. // variant of the above, much faster somehow (TODO).
  93. struct f_sumprod2
  94. {
  95. THEOP
  96. {
  97. Astencil.p = A.data();
  98. Anext(I, J, K) = 0;
  99. plyf(map(ra::wrank<0, 3, 3>([](auto && A, auto && B, auto && C) { A += B*C; }), Anext(I, J, K), Astencil, mask));
  100. std::swap(A.p, Anext.p);
  101. };
  102. };
  103. int main()
  104. {
  105. TestRecorder tr(std::cout);
  106. std::random_device rand;
  107. real value = rand();
  108. auto bench = [&](auto & A, auto & Anext, auto & Astencil, auto && ref, auto && tag, auto && f)
  109. {
  110. auto bv = Benchmark().repeats(ts).runs(3)
  111. .once_f([&](auto && repeat)
  112. {
  113. Anext = 0.;
  114. A = value;
  115. repeat([&]() { f(A, Anext, Astencil); });
  116. });
  117. tr.info(std::setw(5), std::fixed, Benchmark::avg(bv)/A.size()/1e-9, " ns [",
  118. Benchmark::stddev(bv)/A.size()/1e-9 ,"] ", tag)
  119. .test_rel_error(ref, A, 1e-11);
  120. };
  121. ra::Big<real, 3> Aref;
  122. tr.section("static rank");
  123. {
  124. ra::Big<real, 3> A({nx, ny, nz}, 1.);
  125. ra::Big<real, 3> Anext({nx, ny, nz}, 0.);
  126. auto Astencil = stencil(A, 1, 1);
  127. cout << "Astencil " << format_array(Astencil(0, 0, 0, ra::dots<3>), "|", " ") << endl;
  128. #define BENCH(ref, op) bench(A, Anext, Astencil, ref, STRINGIZE(op), op {});
  129. BENCH(A, f_raw);
  130. Aref = ra::Big<real, 3>(A);
  131. BENCH(Aref, f_slices);
  132. BENCH(Aref, f_stencil_explicit);
  133. BENCH(Aref, f_stencil_arrayop);
  134. BENCH(Aref, f_sumprod);
  135. BENCH(Aref, f_sumprod2);
  136. #undef BENCH
  137. }
  138. tr.section("dynamic rank");
  139. {
  140. ra::Big<real> B({nx, ny, nz}, 1.);
  141. ra::Big<real> Bnext({nx, ny, nz}, 0.);
  142. auto Bstencil = stencil(B, 1, 1);
  143. cout << "Bstencil " << format_array(Bstencil(0, 0, 0, ra::dots<3>), "|", " ") << endl;
  144. #define BENCH(ref, op) bench(B, Bnext, Bstencil, ref, STRINGIZE(op), op {});
  145. // BENCH(Aref, f_raw); // TODO very slow
  146. BENCH(Aref, f_slices);
  147. BENCH(Aref, f_stencil_explicit);
  148. BENCH(Aref, f_stencil_arrayop);
  149. #undef BENCH
  150. }
  151. return tr.summary();
  152. }