bench-stencil2.cc 4.7 KB

  // -*- mode: c++; coding: utf-8 -*-
  // ra-ra/bench - Stencil-as-view (rank 2).
  // (c) Daniel Llorens - 2016-2023
  // This library is free software; you can redistribute it and/or modify it under
  // the terms of the GNU Lesser General Public License as published by the Free
  // Software Foundation; either version 3 of the License, or (at your option) any
  // later version.
  8. #include <iostream>
  9. #include <iomanip>
  10. #include <random>
  11. #include "ra/bench.hh"
  12. using std::cout, std::endl, std::flush, ra::TestRecorder;
  13. using real = double;
  14. int nx = 1000;
  15. int ny = 1000;
  16. int ts = 10;
  17. auto I = ra::iota(nx-2, 1);
  18. auto J = ra::iota(ny-2, 1);
  19. constexpr ra::Small<real, 3, 3> mask = { 0, 1, 0,
  20. 1, -4, 1,
  21. 0, 1, 0 };
  // Common header for the call operator of every kernel struct below: a template
  // over the three array types, marked noinline, taking the current array (A),
  // the destination array (Anext) and the stencil view (Astencil) by reference.
  #define THEOP \
      template <class A_, class Anext_, class Astencil_> __attribute__((noinline)) \
      auto operator()(A_ & A, Anext_ & Anext, Astencil_ & Astencil)
  24. // sensitive to RA_DO_CHECK.
  25. struct f_raw
  26. {
  27. THEOP
  28. {
  29. for (int i=1; i+1<nx; ++i) {
  30. for (int j=1; j+1<ny; ++j) {
  31. Anext(i, j) = -4*A(i, j)
  32. + A(i+1, j) + A(i, j+1)
  33. + A(i-1, j) + A(i, j-1);
  34. }
  35. }
  36. std::swap(A.cp, Anext.cp);
  37. };
  38. };
  39. // about as fast as f_raw, but no stencil. Insensitive to RA_DO_CHECK.
  40. struct f_slices
  41. {
  42. THEOP
  43. {
  44. Anext(I, J) = -4*A(I, J)
  45. + A(I+1, J) + A(I, J+1)
  46. + A(I-1, J) + A(I, J-1);
  47. std::swap(A.cp, Anext.cp);
  48. };
  49. };
  50. // with stencil, about as fast as f_raw. Sensitive to RA_DO_CHECK.
  51. struct f_stencil_explicit
  52. {
  53. THEOP
  54. {
  55. Astencil.cp = A.data();
  56. Anext(I, J) = map([](auto && A) { return -4*A(1, 1)
  57. + A(2, 1) + A(1, 2)
  58. + A(0, 1) + A(1, 0); },
  59. iter<2>(Astencil));
  60. std::swap(A.cp, Anext.cp);
  61. };
  62. };
  63. // sum() inside uses run time sizes and 2-dim ply_ravel loop which is slower (2x w/gcc). TODO
  64. struct f_stencil_arrayop
  65. {
  66. THEOP
  67. {
  68. Astencil.cp = A.data();
  69. Anext(I, J) = map([](auto && s) { return sum(s*mask); }, iter<2>(Astencil));
  70. std::swap(A.cp, Anext.cp);
  71. };
  72. };
  73. // allows traversal order to be chosen between all 6 axes in ply_ravel. 30x slower. TODO
  74. struct f_sumprod
  75. {
  76. THEOP
  77. {
  78. Astencil.cp = A.data();
  79. Anext(I, J) = 0; // TODO miss notation for sum-of-axes without preparing destination...
  80. Anext(I, J) += map(ra::wrank<2, 2>(std::multiplies<>()), Astencil, mask);
  81. std::swap(A.cp, Anext.cp);
  82. };
  83. };
  84. // variant of the above, much faster (TODO).
  85. struct f_sumprod2
  86. {
  87. THEOP
  88. {
  89. Astencil.cp = A.data();
  90. Anext(I, J) = 0;
  91. ply_fixed(map(ra::wrank<0, 2, 2>([](auto && A, auto && B, auto && C) { A += B*C; }), Anext(I, J), Astencil, mask));
  92. std::swap(A.cp, Anext.cp);
  93. };
  94. };
  95. int main()
  96. {
  97. TestRecorder tr(std::cout);
  98. std::random_device rand;
  99. real value = rand();
  100. auto bench = [&](auto & A, auto & Anext, auto & Astencil, auto && ref, auto && tag, auto && f)
  101. {
  102. auto bv = Benchmark().repeats(ts).runs(3)
  103. .once_f([&](auto && repeat)
  104. {
  105. Anext = 0.;
  106. A = value;
  107. repeat([&]() { f(A, Anext, Astencil); });
  108. });
  109. tr.info(std::setw(5), std::fixed, Benchmark::avg(bv)/A.size()/1e-9, " ns [",
  110. Benchmark::stddev(bv)/A.size()/1e-9 ,"] ", tag)
  111. .test_rel(ref, A, 1e-10);
  112. };
  113. ra::Big<real, 2> Aref;
  114. tr.section("static rank");
  115. {
  116. ra::Big<real, 2> A({nx, ny}, 1.);
  117. ra::Big<real, 2> Anext({nx, ny}, 0.);
  118. auto Astencil = stencil(A, 1, 1);
  119. cout << "Astencil " << format_array(Astencil(0, 0, ra::dots<2>), "|", " ") << endl;
  120. #define BENCH(ref, op) bench(A, Anext, Astencil, ref, STRINGIZE(op), op {});
  121. BENCH(A, f_raw);
  122. Aref = ra::Big<real, 2>(A);
  123. BENCH(Aref, f_slices);
  124. BENCH(Aref, f_stencil_explicit);
  125. BENCH(Aref, f_stencil_arrayop);
  126. BENCH(Aref, f_sumprod);
  127. BENCH(Aref, f_sumprod2);
  128. #undef BENCH
  129. }
  130. tr.section("dynamic rank");
  131. {
  132. ra::Big<real> B({nx, ny}, 1.);
  133. ra::Big<real> Bnext({nx, ny}, 0.);
  134. auto Bstencil = stencil(B, 1, 1);
  135. cout << "Bstencil " << format_array(Bstencil(0, 0, ra::dots<2>), "|", " ") << endl;
  136. #define BENCH(ref, op) bench(B, Bnext, Bstencil, ref, STRINGIZE(op), op {});
  137. // BENCH(Aref, f_raw); // TODO very slow
  138. BENCH(Aref, f_slices);
  139. BENCH(Aref, f_stencil_explicit);
  140. BENCH(Aref, f_stencil_arrayop);
  141. #undef BENCH
  142. }
  143. return tr.summary();
  144. }