/* MAT.cpp
 *
 * Copyright (C) 2017,2018 Paul Boersma
 *
 * This code is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This code is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this work. If not, see <http://www.gnu.org/licenses/>.
 */
  18. #define USE_CBLAS_GEMM 0
  19. #define USE_GSL_GEMM 0
  20. #ifdef macintosh
  21. #define USE_APPLE_GEMM 0
  22. #else
  23. #define USE_APPLE_GEMM 0
  24. #endif
  25. #include "melder.h"
  26. #include "../dwsys/NUM2.h"
  27. #if USE_CBLAS_GEMM
  28. #include "../dwsys/NUMcblas.h"
  29. #endif
  30. #if USE_GSL_GEMM
  31. #include "../external/gsl/gsl_blas.h"
  32. #endif
  33. #if USE_APPLE_GEMM
  34. #include <Accelerate/Accelerate.h>
  35. #endif
  36. void MATcentreEachColumn_inplace (const MAT& x) noexcept {
  37. for (integer icol = 1; icol <= x.ncol; icol ++) {
  38. double columnMean = NUMcolumnMean (x, icol);
  39. for (integer irow = 1; irow <= x.nrow; irow ++)
  40. x [irow] [icol] -= columnMean;
  41. }
  42. }
  43. void MATcentreEachRow_inplace (const MAT& x) noexcept {
  44. for (integer irow = 1; irow <= x.nrow; irow ++)
  45. VECcentre_inplace (x.row (irow));
  46. }
  47. void MATdoubleCentre_inplace (const MAT& x) noexcept {
  48. MATcentreEachRow_inplace (x);
  49. MATcentreEachColumn_inplace (x);
  50. }
  51. void MATmtm_preallocated (const MAT& target, const constMAT& x) noexcept {
  52. Melder_assert (target.nrow == x.ncol);
  53. Melder_assert (target.ncol == x.ncol);
  54. #if 0
  55. for (integer irow = 1; irow <= target.nrow; irow ++) {
  56. for (integer icol = irow; icol <= target.ncol; icol ++) {
  57. longdouble t = 0.0;
  58. for (integer k = 1; k <= x.nrow; k ++)
  59. t += x [k] [irow] * x [k] [icol];
  60. target [irow] [icol] = target [icol] [irow] = double (t);
  61. }
  62. }
  63. #elif 0
  64. for (integer irow = 1; irow <= target.nrow; irow ++) {
  65. for (integer icol = irow; icol <= target.ncol; icol ++) {
  66. PAIRWISE_SUM (longdouble, sum, integer, x.nrow,
  67. const double *px1 = & x [1] [irow];
  68. const double *px2 = & x [1] [icol],
  69. (longdouble) *px1 * (longdouble) *px2,
  70. (px1 += x.ncol, px2 += x.ncol)
  71. )
  72. target [irow] [icol] = target [icol] [irow] = double (sum);
  73. }
  74. }
  75. #else
  76. for (integer irow = 1; irow <= target.nrow; irow ++)
  77. for (integer icol = irow; icol <= target.ncol; icol ++)
  78. target [irow] [icol] = 0.0;
  79. for (integer k = 1; k <= x.nrow; k ++)
  80. for (integer irow = 1; irow <= target.nrow; irow ++)
  81. for (integer icol = irow; icol <= target.ncol; icol ++)
  82. target [irow] [icol] += x [k] [irow] * x [k] [icol];
  83. for (integer irow = 2; irow <= target.nrow; irow ++)
  84. for (integer icol = 1; icol < irow; icol ++)
  85. target [irow] [icol] = target [icol] [irow];
  86. #endif
  87. }
  88. void MATmul_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
  89. for (integer irow = 1; irow <= target.nrow; irow ++) {
  90. for (integer icol = 1; icol <= target.ncol; icol ++) {
  91. PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
  92. const double *px = & x [irow] [1];
  93. const double *py = & y [1] [icol],
  94. (longdouble) *px * (longdouble) *py,
  95. (px += 1, py += y.ncol)
  96. )
  97. target [irow] [icol] = double (sum);
  98. }
  99. }
  100. }
  101. inline constVEC VECrow_nocheck (const constMAT& mat, integer rowNumber) {
  102. return constVEC (mat.at [rowNumber], mat.ncol);
  103. }
  104. void MATmul_fast_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
  105. #if USE_CBLAS_GEMM
  106. /*
  107. This version is 49,0.75,0.32,0.33 ns per multiply-add for size = 1,10,100,1000.
  108. */
  109. double alpha = 1.0, beta = 0.0;
  110. NUMblas_dgemm ("N", "N", & target.nrow, & target.ncol, & x.ncol, & alpha,
  111. (double *) & x [1] [1], & x.nrow, (double *) & y [1] [1], & y.nrow, & beta, & target [1] [1], & target.nrow);
  112. #elif USE_GSL_GEMM
  113. /*
  114. This version is 34,0.72,0.31,0.41(NoTrans;1.34Trans) ns per multiply-add for size = 1,10,100,1000.
  115. */
  116. gsl_matrix gslx { (size_t) x.nrow, (size_t) x.ncol, (size_t) x.nrow, (double *) & x [1] [1], nullptr, false };
  117. gsl_matrix gsly { (size_t) y.nrow, (size_t) y.ncol, (size_t) y.nrow, (double *) & y [1] [1], nullptr, false };
  118. gsl_matrix gsltarget { (size_t) target.nrow, (size_t) target.ncol, (size_t) target.nrow, (double *) & target [1] [1], nullptr, false };
  119. gsl_blas_dgemm (CblasTrans, CblasTrans, 1.0, & gslx, & gsly, 0.0, & gsltarget);
  120. #elif USE_APPLE_GEMM
  121. cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans, target.nrow, target.ncol, x.ncol,
  122. 1.0, & x [1] [1], x.nrow, & y [1] [1], y.nrow, 0.0, & target [1] [1], target.nrow); // 24,0.71,0.31,0.45
  123. #elif 0
  124. /*
  125. This version is 10,0.76,0.32,0.34 ns per multiply-add for size = 1,10,100,1000.
  126. The trick is to have the inner loop run along two final indices.
  127. Note that the multiplication factor within the inner loop is constant,
  128. so it will be moved out of the loop by the compiler.
  129. */
  130. for (integer irow = 1; irow <= target.nrow; irow ++) {
  131. for (integer icol = 1; icol <= target.ncol; icol ++)
  132. target [irow] [icol] = 0.0;
  133. for (integer i = 1; i <= x.ncol; i ++)
  134. for (integer icol = 1; icol <= target.ncol; icol ++)
  135. target [irow] [icol] += x [irow] [i] * y [i] [icol];
  136. }
  137. #elif 0
  138. /*
  139. This version is 20,0.80,0.32,0.33 ns per multiply-add for size = 1,10,100,1000.
  140. */
  141. double *ptarget = & asvector (target) [1];
  142. const double *px = & asvector (x) [1], *py = & asvector (y) [1];
  143. for (integer irow = 0; irow < target.nrow; irow ++) {
  144. for (integer icol = 0; icol < target.ncol; icol ++)
  145. ptarget [irow * target.ncol + icol] = 0.0;
  146. for (integer i = 0; i < x.ncol; i ++)
  147. for (integer icol = 0; icol < target.ncol; icol ++)
  148. ptarget [irow * target.ncol + icol] += px [irow * x.ncol + i] * py [i * y.ncol + icol];
  149. }
  150. #elif 0
  151. /*
  152. Naive slow implementation, via stored row pointers.
  153. This version is 7.5,0.69,0.87,1.87 ns per multiply-add for size = 1,10,100,1000.
  154. */
  155. for (integer irow = 1; irow <= target.nrow; irow ++) {
  156. for (integer icol = 1; icol <= target.ncol; icol ++) {
  157. target [irow] [icol] = 0.0;
  158. for (integer i = 1; i <= x.ncol; i ++)
  159. target [irow] [icol] += x [irow] [i] * y [i] [icol];
  160. }
  161. }
  162. #elif 0
  163. /*
  164. Naive slow implementation, via computed row pointers.
  165. This version is not slower than the version with stored pointers,
  166. although the inner loop contains the multiplication i * y.ncol,
  167. which the compiler cannot get rid of
  168. (some compiler may replace it with a y.ncol stride addition).
  169. It anything, this version is slightly faster than the one with stored pointers:
  170. the speed is 9.1,0.63,0.83,1.83 ns per multiply-add for size = 1,10,100,1000.
  171. */
  172. double *ptarget = & asvector (target) [1];
  173. const double *px = & asvector (x) [1], *py = & asvector (y) [1];
  174. for (integer irow = 0; irow < target.nrow; irow ++) {
  175. for (integer icol = 0; icol < target.ncol; icol ++) {
  176. ptarget [irow * target.ncol + icol] = 0.0;
  177. for (integer i = 0; i < x.ncol; i ++)
  178. ptarget [irow * target.ncol + icol] += px [irow * x.ncol + i] * py [i * y.ncol + icol];
  179. }
  180. }
  181. #elif 0
  182. /*
  183. Another attempt to slow down the computation,
  184. namely by making matrix indexing compute a vector, with size information and all.
  185. This version is 8.4,1.00,0.95,2.38 ns per multiply-add for size = 1,10,100,1000.
  186. That is really somewhat slower, but is that because of the size computation
  187. or because of the range check!
  188. */
  189. for (integer irow = 1; irow <= target.nrow; irow ++) {
  190. for (integer icol = 1; icol <= target.ncol; icol ++) {
  191. target [irow] [icol] = 0.0;
  192. for (integer i = 1; i <= x.ncol; i ++)
  193. target [irow] [icol] += x.row (irow) [i] * y.row (i) [icol];
  194. }
  195. }
  196. #elif 0
  197. /*
  198. Here we get rid of the row number check, but we still compute
  199. a whole vector with size information. Programmingwise, this would be our ideal.
  200. This version is 7.5,0.70,0.87,2.04 ns per multiply-add for size = 1,10,100,1000.
  201. It seems to be slightly slower for large matrices than the first stored-row-pointer version.
  202. */
  203. for (integer irow = 1; irow <= target.nrow; irow ++) {
  204. for (integer icol = 1; icol <= target.ncol; icol ++) {
  205. target [irow] [icol] = 0.0;
  206. for (integer i = 1; i <= x.ncol; i ++)
  207. target [irow] [icol] += VECrow_nocheck (x, irow) [i] * VECrow_nocheck (y, i) [icol];
  208. }
  209. }
  210. #else
  211. /*
  212. The smart version, with whole-vector computation. Still programmatically ideal.
  213. This version is 14.6,0.66,0.31,0.36 ns per multiply-add for size = 1,10,100,1000.
  214. */
  215. for (integer irow = 1; irow <= target.nrow; irow ++) {
  216. for (integer icol = 1; icol <= target.ncol; icol ++)
  217. target [irow] [icol] = 0.0;
  218. for (integer i = 1; i <= x.ncol; i ++)
  219. for (integer icol = 1; icol <= target.ncol; icol ++)
  220. target [irow] [icol] += VECrow_nocheck (x, irow) [i] * VECrow_nocheck (y, i) [icol];
  221. }
  222. #endif
  223. }
  224. void MATmul_nt_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
  225. for (integer irow = 1; irow <= target.nrow; irow ++) {
  226. for (integer icol = 1; icol <= target.ncol; icol ++) {
  227. PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
  228. const double *px = & x [irow] [1];
  229. const double *py = & y [icol] [1],
  230. (longdouble) *px * (longdouble) *py,
  231. (px += 1, py += 1)
  232. )
  233. target [irow] [icol] = double (sum);
  234. }
  235. }
  236. }
  237. void MATmul_tn_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
  238. for (integer irow = 1; irow <= target.nrow; irow ++) {
  239. for (integer icol = 1; icol <= target.ncol; icol ++) {
  240. PAIRWISE_SUM (longdouble, sum, integer, x.nrow,
  241. const double *px = & x [1] [irow];
  242. const double *py = & y [1] [icol],
  243. (longdouble) *px * (longdouble) *py,
  244. (px += x.ncol, py += y.ncol)
  245. )
  246. target [irow] [icol] = double (sum);
  247. }
  248. }
  249. }
  250. void MATmul_tn_fast_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
  251. for (integer irow = 1; irow <= target.nrow; irow ++) {
  252. for (integer icol = 1; icol <= target.ncol; icol ++)
  253. target [irow] [icol] = 0.0;
  254. for (integer i = 1; i <= x.nrow; i ++)
  255. for (integer icol = 1; icol <= target.ncol; icol ++)
  256. target [irow] [icol] += x [i] [irow] * y [i] [icol];
  257. }
  258. }
  259. void MATmul_tt_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
  260. for (integer irow = 1; irow <= target.nrow; irow ++) {
  261. for (integer icol = 1; icol <= target.ncol; icol ++) {
  262. PAIRWISE_SUM (longdouble, sum, integer, x.nrow,
  263. const double *px = & x [1] [irow];
  264. const double *py = & y [icol] [1],
  265. (longdouble) *px * (longdouble) *py,
  266. (px += x.ncol, py += 1)
  267. )
  268. target [irow] [icol] = double (sum);
  269. }
  270. }
  271. }
  272. void MATmul_tt_fast_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept(false) {
  273. integer n = x.nrow;
  274. if (n > 100) {
  275. autoMAT xt = MATtranspose (x), yt = MATtranspose (y);
  276. MATmul_fast_preallocated (target, xt.get(), yt.get());
  277. return;
  278. }
  279. MATmul_tt_preallocated_ (target, x, y);
  280. }
  281. autoMAT MATouter (const constVEC& x, const constVEC& y) {
  282. autoMAT result = MATraw (x.size, y.size);
  283. for (integer irow = 1; irow <= x.size; irow ++)
  284. for (integer icol = 1; icol <= y.size; icol ++)
  285. result [irow] [icol] = x [irow] * y [icol];
  286. return result;
  287. }
  288. autoMAT MATpeaks (const constVEC& x, bool includeEdges, int interpolate, bool sortByHeight) {
  289. if (x.size < 2) includeEdges = false;
  290. integer numberOfPeaks = 0;
  291. for (integer i = 2; i < x.size; i ++)
  292. if (x [i] > x [i - 1] && x [i] >= x [i + 1])
  293. numberOfPeaks ++;
  294. if (includeEdges) {
  295. if (x [1] > x [2]) numberOfPeaks ++;
  296. if (x [x.size] > x [x.size - 1]) numberOfPeaks ++;
  297. }
  298. autoMAT result = MATraw (2, numberOfPeaks);
  299. integer peakNumber = 0;
  300. if (includeEdges && x [1] > x [2]) {
  301. result [1] [++ peakNumber] = 1;
  302. result [2] [peakNumber] = x [1];
  303. }
  304. for (integer i = 2; i < x.size; i ++) {
  305. if (x [i] > x [i - 1] && x [i] >= x [i + 1]) {
  306. ++ peakNumber;
  307. if (interpolate != 0) { // this is not a boolean; there could follow more options
  308. /*
  309. Parabolic interpolation.
  310. */
  311. double dy = 0.5 * (x [i + 1] - x [i - 1]);
  312. double d2y = (x [i] - x [i - 1]) + (x [i] - x [i + 1]);
  313. Melder_assert (d2y > 0.0);
  314. result [1] [peakNumber] = (double) i + dy / d2y;
  315. result [2] [peakNumber] = x [i] + 0.5 * dy * (dy / d2y);
  316. } else {
  317. /*
  318. Don't interpolate: choose the nearest index.
  319. */
  320. result [1] [peakNumber] = i;
  321. result [2] [peakNumber] = x [i];
  322. }
  323. }
  324. }
  325. if (includeEdges && x [x.size] > x [x.size - 1]) {
  326. result [1] [++ peakNumber] = x.size;
  327. result [2] [peakNumber] = x [x.size];
  328. }
  329. Melder_assert (peakNumber == numberOfPeaks);
  330. if (sortByHeight) {
  331. for (integer i = 1; i <= numberOfPeaks; i ++)
  332. result [2] [i] *= -1.0;
  333. NUMsort2 (result.ncol, result [2], result [1]);
  334. for (integer i = 1; i <= numberOfPeaks; i ++)
  335. result [2] [i] *= -1.0;
  336. }
  337. return result;
  338. }
/* End of file MAT.cpp */