PAIRWISE_SUM.h 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. #ifndef _PAIRWISE_SUM_h_
  2. #define _PAIRWISE_SUM_h_
  3. /* PAIRWISE_SUM.h
  4. *
  5. * Copyright (C) 2017,2018 Paul Boersma <paul.boersma@uva.nl>
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions are met:
  9. *
  10. * 1. Redistributions must retain the above copyright notice,
  11. * this list of conditions and the following disclaimer.
  12. *
  13. * 2. Redistributions in binary form must reproduce the above copyright notice,
  14. * this list of conditions and the following disclaimer in the documentation
  15. * and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  19. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  20. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
  21. * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  22. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  23. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  24. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  26. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. *
  28. * (This is the two-clause BSD license, which is approximately identical to CC-BY.)
  29. */
  30. /*
  31. # PAIRWISE SUMMATION FOR C AND C++
  32. The pairwise summation algorithm as implemented here is approximately 1.5 times
  33. faster than using the usual naive sequential summation loop,
  34. and is also hundreds of times more precise than sequential summation.
  35. Using pairwise summation instead of sequential summation will improve all your
  36. code for sums, means, inner products, norms, matrix multiplications,
  37. and CPU-based neural networks.
  38. This is not necessarily a small thing in terms of CO2.
  39. The macro you will call is
  40. PAIRWISE_SUM (
  41. AccumulatorType, // the type of the accumulator: long double, double, float;
  42. sumVariableName, // the variable name of the accumulator, e.g. "sum";
  43. CounterType, // the type of the term counters: long, int, int64_t, intptr_t;
  44. sizeExpression, // the total number of terms to sum;
  45. initializeStatement, // the statement that declares and initializes the loop pointer(s),
  46. // which is/are typically a pointer or pointers to the data;
  47. termExpression // the expression that computes a "term" (thing to sum)
  48. // from a piece or pieces of data.
  49. incrementStatement, // the statement that increments the loop pointer(s)
  50. // before the next piece(s) of data is/are fetched;
  51. )
  52. We explain this in detail below, pedagogically starting with an analogous macro
  53. for the more usual "sequential" summation algorithm.
  54. ## 1. SEQUENTIAL SUMMATION: A SINGLE ACCUMULATOR
  55. The simplest use of summation of multiple values is if you have a single array of
  56. floating-point data to sum. If this array is x [1..size],
  57. then the usual straightforward algorithm for adding the terms is
  58. long double sum = 0.0;
  59. for (long i = 1; i <= size; i ++)
  60. sum += x [i];
  61. printf ("%.17g", (double) sum);
  62. This is *sequential summation*: a single *accumulator*, namely the variable `sum`,
  63. accumulates every element sequentially. Note that even though the array `x` will
  64. typically be of type `double`, the accumulator is of type `long double` in this example:
  65. using 80 bits for summing is much better than using 64 bits (1000 times more precise),
  66. and on modern computers it is usually equally fast.
  67. The formulation above, which uses array indexing, is equivalent to a formulation
  68. in terms of a "looping pointer":
  69. long double sum = 0.0;
  70. const double *xx = & x [1]; // the looping pointer
  71. for (long i = 1; i <= size; i ++) {
  72. sum += *xx;
  73. xx += 1;
  74. }
  75. printf ("%.17g", (double) sum);
  76. The looping-pointer formulation consists of defining (declaring and initializing)
  77. the pointer before the loop, and then inside the loop
  78. first dereferencing the pointer and then incrementing it.
  79. In this little algorithm we can discern the following seven ingredients:
  80. - AccumulatorType: long double
  81. - sumVariableName: "sum"
  82. - CounterType: long
  83. - sizeExpression: "size"
  84. - initializeStatement: "const double *xx = x"
  85. - termExpression: "*xx"
  86. - incrementStatement: "xx += 1"
  87. The algorithm can therefore be replaced with
  88. SEQUENTIAL_SUM (long double, sum, long, size, const double *xx = & x [1], *xx, xx += 1)
  89. printf ("%.17g", (double) sum);
  90. where the SEQUENTIAL_SUM macro is defined as:
  91. */
  92. #define SEQUENTIAL_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \
  93. initializeStatement, termExpression, incrementStatement) \
  94. \
  95. AccumulatorType sumVariableName = 0.0; \
  96. {/* scope */ \
  97. initializeStatement; \
  98. CounterType _n = sizeExpression; /* to ensure that the size expression is evaluated only once */ \
  99. for (CounterType _i = 1; _i <= _n; _i ++) { \
  100. sumVariableName += termExpression; \
  101. incrementStatement; \
  102. } \
  103. }
  104. /*
  105. ## 2. SEQUENTIAL SUMMATION IN MACHINE CODE
  106. How would you sum the four numbers a, b, c and d,
  107. if all that your computer can do is adding two numbers at a time?
  108. You could write the sum in one stroke in C, without parentheses:
  109. sum = a + b + c + d;
  110. Evaluation of pluses in C proceeds from left to right, so this formulation means the same as
  111. sum = ((a + b) + c) + d;
  112. In machine-like C, using the "registers" r1, r2, r3 and r4, you could implement this as
  113. r1 = a;
  114. r2 = b;
  115. r1 += r2;
  116. r3 = c;
  117. r1 += r3;
  118. r4 = d;
  119. r1 += r4;
  120. sum = r1;
  121. All these three formulations lead to identical true machine code,
  122. at least with Clang or GCC, and with optimization option -O3.
  123. ## 3. SPEED OF SEQUENTIAL SUMMATION
  124. If your processor has good prefetching, then the four loads from RAM (moving a, b, c,
  125. and d into the registers r1, r2, r3 and r4) will take up no time at all.
  126. The three additions, however, will require three clock cycles in total:
  127. as all additions are performed on r1, each addition will have to wait
  128. until the previous addition has finished. Adding up 64 numbers will require 63 cycles.
  129. ## 4. PRECISION OF SEQUENTIAL SUMMATION
  130. The worst case floating-point rounding error for sequential summation is N * epsilon,
  131. where N is the number of numbers to add, and epsilon is the precision of one number.
  132. The expected random-walk rounding error is epsilon * sqrt (N).
  133. ## 5. PAIRWISE SUMMATION: MULTIPLE ACCUMULATORS
  134. An alternative way to add the four numbers, next to sequential summation,
  135. is to proceed in a *pairwise* manner, dividing the four numbers into two groups,
  136. then summing these separately, then summing the two results. In short C this would be:
  137. sum = (a + b) + (c + d);
  138. This pairwise summation continues recursively. For instance, you add 8 numbers this way:
  139. sum = ((a + b) + (c + d)) + ((e + f) + (g + h));
  140. ## 6. PAIRWISE SUMMATION IN MACHINE CODE
  141. In machine-like C, using the "registers" r1, r2, r3 and r4, you can implement this as
  142. r1 = a;
  143. r2 = b;
  144. r1 += r2;
  145. r3 = c;
  146. r4 = d;
  147. r3 += r4;
  148. r1 += r3;
  149. sum = r1;
  150. The number of loads, stores and additions is identical to those in the sequential case.
  151. ## 7. SPEED OF PAIRWISE SUMMATION
  152. Pairwise summation can be faster than sequential summation,
  153. because the addition of r2 to r1 can be performed in parallel to the addition of r4 to r3.
  154. The three additions in section 5 could therefore be performed in two clock cycles.
  155. Indeed, both compilers that I am using in 2017 (Clang on the Mac, and GCC on Windows and Linux)
  156. take only 0.30 nanoseconds per addition, which is just over two-thirds of the clock period
  157. of the 2.3 GHz processor (which is 0.435 nanoseconds) of my 2014 Macbook Pro.
  158. On a processor far, far away, which has more than 64 registers, perfect prefetching,
  159. and perfectly parallel operations on independent registers, 64 terms may be added
  160. within 6 clock cycles: six is the number of times we need to add something to r1,
  161. and each of these additions has to wait for the result of the previous addition.
  162. Theoretically, then, but not yet in practice, the execution time has fallen from order N
  163. to order log(N), at least for low N (and for high N it will be of order N,
  164. but with a much lower multiplication factor than sequential summation has).
  165. ## 8. PRECISION OF PAIRWISE SUMMATION
  166. The worst-case rounding error is only epsilon * log (N),
  167. and the expected random-walk rounding error is only epsilon * sqrt (log (N)).
  168. Both are much better than in the sequential summation case.
  169. ## 9. HOW TO USE PAIRWISE SUMMATION
  170. For pairwise summation we use the exact same macro arguments as for sequential summation:
  171. PAIRWISE_SUM (long double, sum, long, size, const double *xx = & x [1], *xx, xx += 1)
  172. printf ("%.17g", (double) sum);
  173. The macro starts by declaring a variable of type `AccumulatorType` and name `sumVariableName`.
  174. Thus, the expansion of the macro effectively starts with the declaration `long double sum;`
  175. and the variable `sum` will be available after the macro ends,
  176. so that you can use it in your subsequent code, as is done here by using `(double) sum`.
  177. This variable is used to accumulate the resulting sum and should for best results
  178. therefore be capable of accumulating partial sums with little rounding error.
  179. For this reason, you will usually want AccumulatorType to be `long double`.
  180. If long doubles are not available on your platform, or if they are slow on your platform,
  181. you can use `double` instead, or even `float`.
  182. The size of the array to sum comes in in the argument `sizeExpression`, and in the expansion
  183. of the macro this number and all macro-internal counters are of type `CounterType`.
  184. In the example above I used `long` for simplicity, but you should note that on 64-bit Windows
  185. a `long` is only 32 bits, whereas the natural type for array indexing on a 64-bit machine
  186. would be `int64_t`; I therefore tend to use a CounterType of `intptr_t` myself,
  187. because this is 32 bits on 32-bit machines and 64 bits on 64-bit machines.
  188. If your `size` is an `int` (always 32 bits nowadays), you would use a CounterType of `int`;
  189. it is not beneficial to use a wider CounterType than the width of your `size`.
  190. The fifth argument of the macro declares and initializes the loop pointer,
  191. as in `const double *xx = & x [1]` above. This pointer starts out pointing at the first
  192. element of the array.
  193. The sixth argument of the macro is an expression that should evaluate to the value of a term,
  194. given the current value of the loop pointer(s), as in `*xx` above.
  195. The seventh argument is the formula you use for incrementing the loop pointer(s),
  196. as in `xx += 1` above. The macro uses this formula to prepare for the retrieval of
  197. the next term in the summation.
  198. More complicated use cases than a single array also exist. For instance, if you have to compute
  199. the inner product of the two arrays x [1..n] and y [1..n], you can do
  200. PAIRWISE_SUM (long double, inner, long, n,
  201. const double *xx = & x [1]; // the semicolon ensures that this line and the next form a single argument
  202. const double *yy = & y [1],
  203. (long double) *xx * (long double) *yy,
  204. (xx += 1, yy += 1)
  205. )
  206. printf ("%.17g", (double) inner);
  207. Note for the sixth argument: you can see here that you can do the two increments simultaneously
  208. by using parentheses and a comma; fortunately, the C macro preprocessor understands enough
  209. about parentheses to see that you mean the sixth argument to be a single argument.
  210. Note for the seventh argument: as 64-bit multiplication loses a lot of the precision of its
  211. two 64-bit operands, it is advisable to convert both operands to `long double`
  212. *before* the multiplication, as is done here. This usually costs no extra computation
  213. time (it can actually be faster). If you do
  214. PAIRWISE_SUM (long double, inner, long, n,
  215. const double *xx = & x [1]; const double *yy = & y [1], *xx * *yy, (++ xx, ++ yy))
  216. printf ("%.17g", (double) inner);
  217. instead, the conversion to `long double` is done (by the macro) *after* the multiplication,
  218. which is less precise.
  219. Other use cases include array multiplication with strides...
  220. PAIRWISE_SUM (long double, inner, long, n,
  221. const double *xx = & x [1]; // note the funny semicolon again
  222. const double *yy = & y [1],
  223. (long double) *xx * (long double) *yy,
  224. (xx += xstride, yy += ystride)
  225. )
  226. printf ("%.17g", (double) inner);
  227. ... and small-lag convolution...
  228. for (long i = 1; i <= xSize - kernelSize + 1; i ++) {
  229. PAIRWISE_SUM (long double, conv, long, kernelSize,
  230. const double *xx = & x [i];
  231. const double *filter = & kernel [kernelSize],
  232. (long double) *xx * (long double) *filter,
  233. (xx += 1, filter -= 1)
  234. )
  235. result [i] = conv;
  236. }
  237. ... and matrix multiplication, and computing norms.
  238. ## 10. IMPLEMENTATION OF PAIRWISE SUMMATION: LOW POWERS OF 2
  239. We have to implement pairwise summation with a macro,
  240. because the sixth and seventh arguments to PAIRWISE_SUM() have to be formulas for getting
  241. the next element to add. We use fixed formulas for adding 2, 4, 8, 16 or 32 terms,
  242. and a stack for the next 57 higher powers of 2, up to 2^62,
  243. so that our summation will work for N up to 2^63 - 1.
  244. The fixed formulas for the low powers of 2 are recursively defined macros:
  245. */
  246. #define PAIRWISE_SUM_1_TERM(AccumulatorType, accumulator, termExpression, incrementStatement) \
  247. AccumulatorType accumulator = termExpression; \
  248. incrementStatement;
  249. #define PAIRWISE_SUM_2_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
  250. PAIRWISE_SUM_1_TERM (AccumulatorType, accumulator, termExpression, incrementStatement) \
  251. { \
  252. PAIRWISE_SUM_1_TERM (AccumulatorType, _r2, termExpression, incrementStatement) \
  253. accumulator += _r2; \
  254. }
  255. #define PAIRWISE_SUM_4_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
  256. PAIRWISE_SUM_2_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
  257. { \
  258. PAIRWISE_SUM_2_TERMS (AccumulatorType, _r3, termExpression, incrementStatement) \
  259. accumulator += _r3; \
  260. }
  261. #define PAIRWISE_SUM_8_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
  262. PAIRWISE_SUM_4_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
  263. { \
  264. PAIRWISE_SUM_4_TERMS (AccumulatorType, _r4, termExpression, incrementStatement) \
  265. accumulator += _r4; \
  266. }
  267. #define PAIRWISE_SUM_16_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
  268. PAIRWISE_SUM_8_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
  269. { \
  270. PAIRWISE_SUM_8_TERMS (AccumulatorType, _r5, termExpression, incrementStatement) \
  271. accumulator += _r5; \
  272. }
  273. #define PAIRWISE_SUM_32_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
  274. PAIRWISE_SUM_16_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
  275. { \
  276. PAIRWISE_SUM_16_TERMS (AccumulatorType, _r6, termExpression, incrementStatement) \
  277. accumulator += _r6; \
  278. }
  279. /*
  280. (The difference between the variable names "_r2", "_r3" and so on is not strictly needed,
  281. but making them the same would lead the compiler to issue warnings about shadowed variables.)
  282. */
  283. /*
  284. ## 11. IMPLEMENTATION OF PAIRWISE ADDITION: HIGH POWERS OF 2
  285. Higher powers of 2 than 2^5 (= 32) go on a stack. We sum 64 values at each stroke,
  286. and this requires a fixed formula for these 64 terms:
  287. */
  288. #define PAIRWISE_SUM_64_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
  289. PAIRWISE_SUM_32_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
  290. { \
  291. PAIRWISE_SUM_32_TERMS (AccumulatorType, _r7, termExpression, incrementStatement) \
  292. accumulator += _r7; \
  293. }
  294. #define PAIRWISE_SUM_128_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
  295. PAIRWISE_SUM_64_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
  296. { \
  297. PAIRWISE_SUM_64_TERMS (AccumulatorType, _r8, termExpression, incrementStatement) \
  298. accumulator += _r8; \
  299. }
  300. #define PAIRWISE_SUM_256_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
  301. PAIRWISE_SUM_128_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
  302. { \
  303. PAIRWISE_SUM_128_TERMS (AccumulatorType, _r9, termExpression, incrementStatement) \
  304. accumulator += _r9; \
  305. }
  306. /*
  307. A generalization about the timing of the summations in all the above macros
  308. is that r(i+1) is added to r(i) precisely when r(i+1) contains the same
  309. number of terms as r(i). This criterion for collapsing the partial sums
  310. is also used in the stack logic below.
  311. */
  312. #define PAIRWISE_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \
  313. initializeStatement, termExpression, incrementStatement) \
  314. \
  315. AccumulatorType sumVariableName = 0.0; \
  316. {/* scope */ \
  317. initializeStatement; \
  318. CounterType _n = sizeExpression; \
  319. if (_n & 1) { \
  320. PAIRWISE_SUM_1_TERM (AccumulatorType, _partialSum, termExpression, incrementStatement) \
  321. sumVariableName += _partialSum; \
  322. } \
  323. if (_n & 2) { \
  324. PAIRWISE_SUM_2_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
  325. sumVariableName += _partialSum; \
  326. } \
  327. if (_n & 4) { \
  328. PAIRWISE_SUM_4_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
  329. sumVariableName += _partialSum; \
  330. } \
  331. if (_n & 8) { \
  332. PAIRWISE_SUM_8_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
  333. sumVariableName += _partialSum; \
  334. } \
  335. if (_n & 16) { \
  336. PAIRWISE_SUM_16_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
  337. sumVariableName += _partialSum; \
  338. } \
  339. if (_n & 32) { \
  340. PAIRWISE_SUM_32_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
  341. sumVariableName += _partialSum; \
  342. } \
  343. const int _baseCasePower = 6; /* because the base case is 64 = 2^6 terms */ \
  344. CounterType _numberOfBaseCases = _n >> _baseCasePower; \
  345. if (_numberOfBaseCases != 0) { \
  346. /* */ \
  347. /* The value of _powers [0] stays at 0, to denote the bottom of the stack. */ \
  348. /* The maximum value of _powers [1] should be 62, which denotes that 2^62 terms */ \
  349. /* have been accumulated into _partialSumStack [1]. This the maximum, */ \
  350. /* because _n can be at most 2^63-1 (assuming CounterType is 64 bits). */ \
  351. /* The maximum value of _powers [2] should then be 61. */ \
  352. /* The maximum value of _powers [3] should be 60. */ \
  353. /* ... */ \
  354. /* The maximum value of _powers [57] should be 6. It ends there, because */ \
  355. /* 2^6 is the granularity with which base case sums are put on the stack. */ \
  356. /* The maximum value of _powers [58] should also be 6, */ \
  357. /* because this can be the situation just before collapsing the top of the stack. */ \
  358. /* However, if the whole stack is filled up like this, the actual number of */ \
  359. /* terms is already 2^63 (i.e. 2^62 + 2^61 + 2^60 + ... 2^6 + 2^6). Therefore, we */ \
  360. /* need one element less, so the highest index of _powers [] should be 57. */ \
  361. /* For 32-bit counters, this highest index is 25. */ \
  362. /* */ \
  363. const int _numberOfBitsInCounterType = 8 * sizeof (CounterType); /* 64 or 32 */ \
  364. const int _highestIndex = _numberOfBitsInCounterType - 1 - _baseCasePower; \
  365. AccumulatorType _partialSumStack [1 + _highestIndex]; /* 8 bytes too many, but better code */ \
  366. unsigned char _powers [1 + _highestIndex]; \
  367. _powers [0] = 0; \
  368. int _stackPointer = 0; \
  369. for (CounterType _ipart = 1; _ipart <= _numberOfBaseCases; _ipart ++) { \
  370. /* */ \
  371. /* Compute the sum of the next 64 data points. */ \
  372. /* */ \
  373. PAIRWISE_SUM_64_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
  374. /* */ \
  375. /* Put this sum on top of the stack. */ \
  376. /* */ \
  377. _partialSumStack [++ _stackPointer] = _partialSum; \
  378. _powers [_stackPointer] = _baseCasePower; \
  379. /* */ \
  380. /* The collapse criterion: */ \
  381. /* */ \
  382. while (_powers [_stackPointer] == _powers [_stackPointer - 1]) { \
  383. _partialSumStack [_stackPointer - 1] += _partialSumStack [_stackPointer]; \
  384. _powers [-- _stackPointer] += 1; \
  385. } \
  386. } \
  387. /* */ \
  388. /* Add all the elements of the stack, starting at the top. */ \
  389. /* */ \
  390. while (_stackPointer > 0) { \
  391. sumVariableName += _partialSumStack [_stackPointer --]; \
  392. } \
  393. } \
  394. }
  395. /*
  396. Note that we don't do the usual trick with `do {...} while (0)` that would allow you to add
  397. a semicolon after the `PAIRWISE_SUM()` call. This is to prevent the suggestion that the macro
  398. constitutes a single statement. The macro contains a sequence of two things: the definition
  399. of `sumVariableName` and a long block that usually changes the value of `sumVariableName`.
  400. Hence, the macro cannot be used as a single statement
  401. and e.g. has to be bracketed if used in an `else` clause.
  402. You are therefore advised to call `PAIRWISE_SUM()` without appending the misleading semicolon.
  403. */
  404. /*
  405. ## 12. SOME LESS GOOD SUMMATION ALGORITHMS
  406. The summation algorithm (at least for computing the mean) in the statistics software R
  407. is two-loop summation. This is fairly precise, but very slow:
  408. */
  409. #define TWO_LOOP_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \
  410. initializeStatement, termExpression, incrementStatement) \
  411. \
  412. AccumulatorType sumVariableName = 0.0; \
  413. {/* scope */ \
  414. CounterType _n = sizeExpression; \
  415. {/* scope */ \
  416. initializeStatement; \
  417. for (CounterType _i = 1; _i <= _n; _i ++) { \
  418. sumVariableName += termExpression; \
  419. incrementStatement; \
  420. } \
  421. } \
  422. AccumulatorType _mean = sumVariableName / _n; \
  423. {/* scope */ \
  424. sumVariableName = 0.0; \
  425. initializeStatement; \
  426. for (CounterType _i = 1; _i <= _n; _i ++) { \
  427. sumVariableName += (termExpression) - _mean; \
  428. incrementStatement; \
  429. } \
  430. sumVariableName += _mean * _n; \
  431. } \
  432. }
  433. /*
  434. Another one is the Kahan algorithm. Its precision is similar to that of pairwise summation,
  435. but it is extremely slow:
  436. */
  437. #define KAHAN_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \
  438. initializeStatement, termExpression, incrementStatement) \
  439. \
  440. AccumulatorType sumVariableName = 0.0; \
  441. {/* scope */ \
  442. initializeStatement; \
  443. CounterType _n = sizeExpression; \
  444. AccumulatorType _correction = 0.0; \
  445. for (CounterType _i = 1; _i <= _n; _i ++) { \
  446. AccumulatorType _correctedTerm = (termExpression) - _correction; \
  447. AccumulatorType _newSum = sumVariableName + _correctedTerm; \
  448. _correction = (_newSum - sumVariableName) - _correctedTerm; \
  449. sumVariableName = _newSum; \
  450. incrementStatement; \
  451. } \
  452. }
  453. /* End of file PAIRWISE_SUM.h */
  454. #endif