bench-stencil1.C 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. // (c) Daniel Llorens - 2016-2017
  2. // This library is free software; you can redistribute it and/or modify it under
  3. // the terms of the GNU Lesser General Public License as published by the Free
  4. // Software Foundation; either version 3 of the License, or (at your option) any
  5. // later version.
  6. /// @file bench-stencil1.C
  7. /// @brief Stencil-as-view.
  8. // TODO Bad performance, see also bench-stencil[23].C.
  9. #include <iostream>
  10. #include <iomanip>
  11. #include <random>
  12. #include "ra/operators.H"
  13. #include "ra/io.H"
  14. #include "ra/test.H"
  15. #include "ra/bench.H"
  16. using std::cout, std::endl, std::flush;
  17. using real = double;
  18. int nx = 1000000;
  19. int ts = 10;
  20. auto I = ra::iota(nx-2, 1);
  21. constexpr ra::Small<real, 3> mask = { 1, -2, 1 };
  22. #define THEOP template <class A_, class Anext_, class Astencil_> __attribute__((noinline)) \
  23. auto operator()(A_ & A, Anext_ & Anext, Astencil_ & Astencil)
  24. // sensitive to RA_CHECK_BOUNDS.
  25. struct f_raw
  26. {
  27. THEOP
  28. {
  29. for (int i=1; i+1<nx; ++i) {
  30. Anext(i) = -2*A(i) + A(i+1) + A(i-1);
  31. }
  32. std::swap(A.p, Anext.p);
  33. };
  34. };
  35. // about as fast as f_raw, but no stencil. Insensitive to RA_CHECK_BOUNDS.
  36. struct f_slices
  37. {
  38. THEOP
  39. {
  40. Anext(I) = -2*A(I) + A(I+1) + A(I-1);
  41. std::swap(A.p, Anext.p);
  42. };
  43. };
  44. // with stencil, about as fast as f_raw. Sensitive to RA_CHECK_BOUNDS.
  45. struct f_stencil_explicit
  46. {
  47. THEOP
  48. {
  49. Astencil.p = A.data();
  50. Anext(I) = map([](auto && A) { return -2*A(1) + A(2) + A(0); },
  51. iter<1>(Astencil));
  52. std::swap(A.p, Anext.p);
  53. };
  54. };
  55. // sum() inside uses run time sizes and 1-dim ply_ravel loop which is a bit slower. TODO
  56. struct f_stencil_arrayop
  57. {
  58. THEOP
  59. {
  60. Astencil.p = A.data();
  61. Anext(I) = map([](auto && s) { return sum(s*mask); }, iter<1>(Astencil));
  62. std::swap(A.p, Anext.p);
  63. };
  64. };
  65. // allows traversal order to be chosen between all 6 axes in ply_ravel. 30x slower. TODO
  66. struct f_sumprod
  67. {
  68. THEOP
  69. {
  70. Astencil.p = A.data();
  71. Anext(I) = 0; // TODO miss notation for sum-of-axes without preparing destination...
  72. Anext(I) += map(ra::wrank<1, 1>(ra::times()), Astencil, mask);
  73. std::swap(A.p, Anext.p);
  74. };
  75. };
  76. // variant of the above, much faster (TODO).
  77. struct f_sumprod2
  78. {
  79. THEOP
  80. {
  81. Astencil.p = A.data();
  82. Anext(I) = 0;
  83. plyf(map(ra::wrank<0, 1, 1>([](auto && A, auto && B, auto && C) { A += B*C; }), Anext(I), Astencil, mask));
  84. std::swap(A.p, Anext.p);
  85. };
  86. };
  87. int main()
  88. {
  89. TestRecorder tr(std::cout);
  90. std::random_device rand;
  91. real value = rand();
  92. auto bench = [&](auto & A, auto & Anext, auto & Astencil, auto && ref, auto && tag, auto && f)
  93. {
  94. auto bv = Benchmark().repeats(ts).runs(3)
  95. .once_f([&](auto && repeat)
  96. {
  97. Anext = 0.;
  98. A = value;
  99. repeat([&]() { f(A, Anext, Astencil); });
  100. });
  101. tr.info(std::setw(5), std::fixed, Benchmark::avg(bv)/A.size()/1e-9, " ns [",
  102. Benchmark::stddev(bv)/A.size()/1e-9 ,"] ", tag)
  103. .test_rel_error(ref, A, 1e-11);
  104. };
  105. ra::Big<real, 1> Aref;
  106. tr.section("static rank");
  107. {
  108. ra::Big<real, 1> A({nx}, 1.);
  109. ra::Big<real, 1> Anext({nx}, 0.);
  110. auto Astencil = stencil(A, 1, 1);
  111. cout << "Astencil " << format_array(Astencil(0, ra::dots<1>), "|", " ") << endl;
  112. #define BENCH(ref, op) bench(A, Anext, Astencil, ref, STRINGIZE(op), op {});
  113. BENCH(A, f_raw);
  114. Aref = ra::Big<real, 1>(A);
  115. BENCH(Aref, f_slices);
  116. BENCH(Aref, f_stencil_explicit);
  117. BENCH(Aref, f_stencil_arrayop);
  118. BENCH(Aref, f_sumprod);
  119. BENCH(Aref, f_sumprod2);
  120. #undef BENCH
  121. }
  122. tr.section("dynamic rank");
  123. {
  124. ra::Big<real> B({nx}, 1.);
  125. ra::Big<real> Bnext({nx}, 0.);
  126. auto Bstencil = stencil(B, 1, 1);
  127. cout << "Bstencil " << format_array(Bstencil(0, ra::dots<1>), "|", " ") << endl;
  128. #define BENCH(ref, op) bench(B, Bnext, Bstencil, ref, STRINGIZE(op), op {});
  129. // BENCH(Aref, f_raw); // TODO very slow
  130. BENCH(Aref, f_slices);
  131. BENCH(Aref, f_stencil_explicit);
  132. BENCH(Aref, f_stencil_arrayop);
  133. #undef BENCH
  134. }
  135. return tr.summary();
  136. }