bench-stencil1.cc 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. // -*- mode: c++; coding: utf-8 -*-
  2. // ra-ra/bench - Stencil-as-view (rank 1).
  3. // (c) Daniel Llorens - 2016-2023
  4. // This library is free software; you can redistribute it and/or modify it under
  5. // the terms of the GNU Lesser General Public License as published by the Free
  6. // Software Foundation; either version 3 of the License, or (at your option) any
  7. // later version.
  8. #include <iostream>
  9. #include <iomanip>
  10. #include <random>
  11. #include "ra/test.hh"
  12. using std::cout, std::endl, std::flush, ra::TestRecorder, ra::Benchmark;
  13. using real = double;
  14. int nx = 1000000;
  15. int ts = 10;
  16. auto I = ra::iota(nx-2, 1);
  17. constexpr ra::Small<real, 3> mask = { 1, -2, 1 };
  18. #define THEOP template <class A_, class Anext_, class Astencil_> __attribute__((noinline)) \
  19. auto operator()(A_ & A, Anext_ & Anext, Astencil_ & Astencil)
  20. // sensitive to RA_DO_CHECK.
  21. struct f_raw
  22. {
  23. THEOP
  24. {
  25. for (int i=1; i+1<nx; ++i) {
  26. Anext(i) = -2*A(i) + A(i+1) + A(i-1);
  27. }
  28. std::swap(A.cp, Anext.cp);
  29. };
  30. };
  31. // about as fast as f_raw, but no stencil. Insensitive to RA_DO_CHECK.
  32. struct f_slices
  33. {
  34. THEOP
  35. {
  36. Anext(I) = -2*A(I) + A(I+1) + A(I-1);
  37. std::swap(A.cp, Anext.cp);
  38. };
  39. };
  40. // with stencil, about as fast as f_raw. Sensitive to RA_DO_CHECK.
  41. struct f_stencil_explicit
  42. {
  43. THEOP
  44. {
  45. Astencil.cp = A.data();
  46. Anext(I) = map([](auto && A) { return -2*A(1) + A(2) + A(0); },
  47. iter<1>(Astencil));
  48. std::swap(A.cp, Anext.cp);
  49. };
  50. };
  51. // sum() inside uses run time sizes and 1-dim ply_ravel loop which is a bit slower. TODO
  52. struct f_stencil_arrayop
  53. {
  54. THEOP
  55. {
  56. Astencil.cp = A.data();
  57. Anext(I) = map([](auto && s) { return sum(s*mask); }, iter<1>(Astencil));
  58. std::swap(A.cp, Anext.cp);
  59. };
  60. };
  61. // allows traversal order to be chosen between all 6 axes in ply_ravel. 30x slower. TODO
  62. struct f_sumprod
  63. {
  64. THEOP
  65. {
  66. Astencil.cp = A.data();
  67. Anext(I) = 0; // TODO miss notation for sum-of-axes without preparing destination...
  68. Anext(I) += map(ra::wrank<1, 1>(std::multiplies<>()), Astencil, mask);
  69. std::swap(A.cp, Anext.cp);
  70. };
  71. };
  72. // variant of the above, much faster (TODO).
  73. struct f_sumprod2
  74. {
  75. THEOP
  76. {
  77. Astencil.cp = A.data();
  78. Anext(I) = 0;
  79. ply_fixed(map(ra::wrank<0, 1, 1>([](auto && A, auto && B, auto && C) { A += B*C; }), Anext(I), Astencil, mask));
  80. std::swap(A.cp, Anext.cp);
  81. };
  82. };
  83. int main()
  84. {
  85. TestRecorder tr(std::cout);
  86. std::random_device rand;
  87. real value = rand();
  88. auto bench = [&](auto & A, auto & Anext, auto & Astencil, auto && ref, auto && tag, auto && f)
  89. {
  90. auto bv = Benchmark().repeats(ts).runs(3)
  91. .once_f([&](auto && repeat)
  92. {
  93. Anext = 0.;
  94. A = value;
  95. repeat([&]() { f(A, Anext, Astencil); });
  96. });
  97. tr.info(std::setw(5), std::fixed, Benchmark::avg(bv)/A.size()/1e-9, " ns [",
  98. Benchmark::stddev(bv)/A.size()/1e-9 ,"] ", tag)
  99. .test_rel(ref, A, 1e-11);
  100. };
  101. ra::Big<real, 1> Aref;
  102. tr.section("static rank");
  103. {
  104. ra::Big<real, 1> A({nx}, 1.);
  105. ra::Big<real, 1> Anext({nx}, 0.);
  106. auto Astencil = stencil(A, 1, 1);
  107. cout << "Astencil " << format_array(Astencil(0, ra::dots<1>), { .sep0 = "|" }) << endl;
  108. #define BENCH(ref, op) bench(A, Anext, Astencil, ref, STRINGIZE(op), op {});
  109. BENCH(A, f_raw);
  110. Aref = ra::Big<real, 1>(A);
  111. BENCH(Aref, f_slices);
  112. BENCH(Aref, f_stencil_explicit);
  113. BENCH(Aref, f_stencil_arrayop);
  114. BENCH(Aref, f_sumprod);
  115. BENCH(Aref, f_sumprod2);
  116. #undef BENCH
  117. }
  118. tr.section("dynamic rank");
  119. {
  120. ra::Big<real> B({nx}, 1.);
  121. ra::Big<real> Bnext({nx}, 0.);
  122. auto Bstencil = stencil(B, 1, 1);
  123. cout << "Bstencil " << format_array(Bstencil(0, ra::dots<1>), { .sep0 = "|" }) << endl;
  124. #define BENCH(ref, op) bench(B, Bnext, Bstencil, ref, STRINGIZE(op), op {});
  125. // BENCH(Aref, f_raw); // TODO very slow
  126. BENCH(Aref, f_slices);
  127. BENCH(Aref, f_stencil_explicit);
  128. BENCH(Aref, f_stencil_arrayop);
  129. #undef BENCH
  130. }
  131. return tr.summary();
  132. }