123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357 |
- /* MAT.cpp
- *
- * Copyright (C) 2017,2018 Paul Boersma
- *
- * This code is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This code is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- * See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this work. If not, see <http://www.gnu.org/licenses/>.
- */
- #define USE_CBLAS_GEMM 0
- #define USE_GSL_GEMM 0
- #ifdef macintosh
- #define USE_APPLE_GEMM 0
- #else
- #define USE_APPLE_GEMM 0
- #endif
- #include "melder.h"
- #include "../dwsys/NUM2.h"
- #if USE_CBLAS_GEMM
- #include "../dwsys/NUMcblas.h"
- #endif
- #if USE_GSL_GEMM
- #include "../external/gsl/gsl_blas.h"
- #endif
- #if USE_APPLE_GEMM
- #include <Accelerate/Accelerate.h>
- #endif
- void MATcentreEachColumn_inplace (const MAT& x) noexcept {
- for (integer icol = 1; icol <= x.ncol; icol ++) {
- double columnMean = NUMcolumnMean (x, icol);
- for (integer irow = 1; irow <= x.nrow; irow ++)
- x [irow] [icol] -= columnMean;
- }
- }
- void MATcentreEachRow_inplace (const MAT& x) noexcept {
- for (integer irow = 1; irow <= x.nrow; irow ++)
- VECcentre_inplace (x.row (irow));
- }
- void MATdoubleCentre_inplace (const MAT& x) noexcept {
- MATcentreEachRow_inplace (x);
- MATcentreEachColumn_inplace (x);
- }
- void MATmtm_preallocated (const MAT& target, const constMAT& x) noexcept {
- Melder_assert (target.nrow == x.ncol);
- Melder_assert (target.ncol == x.ncol);
- #if 0
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = irow; icol <= target.ncol; icol ++) {
- longdouble t = 0.0;
- for (integer k = 1; k <= x.nrow; k ++)
- t += x [k] [irow] * x [k] [icol];
- target [irow] [icol] = target [icol] [irow] = double (t);
- }
- }
- #elif 0
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = irow; icol <= target.ncol; icol ++) {
- PAIRWISE_SUM (longdouble, sum, integer, x.nrow,
- const double *px1 = & x [1] [irow];
- const double *px2 = & x [1] [icol],
- (longdouble) *px1 * (longdouble) *px2,
- (px1 += x.ncol, px2 += x.ncol)
- )
- target [irow] [icol] = target [icol] [irow] = double (sum);
- }
- }
- #else
- for (integer irow = 1; irow <= target.nrow; irow ++)
- for (integer icol = irow; icol <= target.ncol; icol ++)
- target [irow] [icol] = 0.0;
- for (integer k = 1; k <= x.nrow; k ++)
- for (integer irow = 1; irow <= target.nrow; irow ++)
- for (integer icol = irow; icol <= target.ncol; icol ++)
- target [irow] [icol] += x [k] [irow] * x [k] [icol];
- for (integer irow = 2; irow <= target.nrow; irow ++)
- for (integer icol = 1; icol < irow; icol ++)
- target [irow] [icol] = target [icol] [irow];
- #endif
- }
- void MATmul_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = 1; icol <= target.ncol; icol ++) {
- PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
- const double *px = & x [irow] [1];
- const double *py = & y [1] [icol],
- (longdouble) *px * (longdouble) *py,
- (px += 1, py += y.ncol)
- )
- target [irow] [icol] = double (sum);
- }
- }
- }
- inline constVEC VECrow_nocheck (const constMAT& mat, integer rowNumber) {
- return constVEC (mat.at [rowNumber], mat.ncol);
- }
- void MATmul_fast_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
- #if USE_CBLAS_GEMM
- /*
- This version is 49,0.75,0.32,0.33 ns per multiply-add for size = 1,10,100,1000.
- */
- double alpha = 1.0, beta = 0.0;
- NUMblas_dgemm ("N", "N", & target.nrow, & target.ncol, & x.ncol, & alpha,
- (double *) & x [1] [1], & x.nrow, (double *) & y [1] [1], & y.nrow, & beta, & target [1] [1], & target.nrow);
- #elif USE_GSL_GEMM
- /*
- This version is 34,0.72,0.31,0.41(NoTrans;1.34Trans) ns per multiply-add for size = 1,10,100,1000.
- */
- gsl_matrix gslx { (size_t) x.nrow, (size_t) x.ncol, (size_t) x.nrow, (double *) & x [1] [1], nullptr, false };
- gsl_matrix gsly { (size_t) y.nrow, (size_t) y.ncol, (size_t) y.nrow, (double *) & y [1] [1], nullptr, false };
- gsl_matrix gsltarget { (size_t) target.nrow, (size_t) target.ncol, (size_t) target.nrow, (double *) & target [1] [1], nullptr, false };
- gsl_blas_dgemm (CblasTrans, CblasTrans, 1.0, & gslx, & gsly, 0.0, & gsltarget);
- #elif USE_APPLE_GEMM
- cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans, target.nrow, target.ncol, x.ncol,
- 1.0, & x [1] [1], x.nrow, & y [1] [1], y.nrow, 0.0, & target [1] [1], target.nrow); // 24,0.71,0.31,0.45
- #elif 0
- /*
- This version is 10,0.76,0.32,0.34 ns per multiply-add for size = 1,10,100,1000.
- The trick is to have the inner loop run along two final indices.
- Note that the multiplication factor within the inner loop is constant,
- so it will be moved out of the loop by the compiler.
- */
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = 1; icol <= target.ncol; icol ++)
- target [irow] [icol] = 0.0;
- for (integer i = 1; i <= x.ncol; i ++)
- for (integer icol = 1; icol <= target.ncol; icol ++)
- target [irow] [icol] += x [irow] [i] * y [i] [icol];
- }
- #elif 0
- /*
- This version is 20,0.80,0.32,0.33 ns per multiply-add for size = 1,10,100,1000.
- */
- double *ptarget = & asvector (target) [1];
- const double *px = & asvector (x) [1], *py = & asvector (y) [1];
- for (integer irow = 0; irow < target.nrow; irow ++) {
- for (integer icol = 0; icol < target.ncol; icol ++)
- ptarget [irow * target.ncol + icol] = 0.0;
- for (integer i = 0; i < x.ncol; i ++)
- for (integer icol = 0; icol < target.ncol; icol ++)
- ptarget [irow * target.ncol + icol] += px [irow * x.ncol + i] * py [i * y.ncol + icol];
- }
- #elif 0
- /*
- Naive slow implementation, via stored row pointers.
- This version is 7.5,0.69,0.87,1.87 ns per multiply-add for size = 1,10,100,1000.
- */
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = 1; icol <= target.ncol; icol ++) {
- target [irow] [icol] = 0.0;
- for (integer i = 1; i <= x.ncol; i ++)
- target [irow] [icol] += x [irow] [i] * y [i] [icol];
- }
- }
- #elif 0
- /*
- Naive slow implementation, via computed row pointers.
- This version is not slower than the version with stored pointers,
- although the inner loop contains the multiplication i * y.ncol,
- which the compiler cannot get rid of
- (some compiler may replace it with a y.ncol stride addition).
- It anything, this version is slightly faster than the one with stored pointers:
- the speed is 9.1,0.63,0.83,1.83 ns per multiply-add for size = 1,10,100,1000.
- */
- double *ptarget = & asvector (target) [1];
- const double *px = & asvector (x) [1], *py = & asvector (y) [1];
- for (integer irow = 0; irow < target.nrow; irow ++) {
- for (integer icol = 0; icol < target.ncol; icol ++) {
- ptarget [irow * target.ncol + icol] = 0.0;
- for (integer i = 0; i < x.ncol; i ++)
- ptarget [irow * target.ncol + icol] += px [irow * x.ncol + i] * py [i * y.ncol + icol];
- }
- }
- #elif 0
- /*
- Another attempt to slow down the computation,
- namely by making matrix indexing compute a vector, with size information and all.
- This version is 8.4,1.00,0.95,2.38 ns per multiply-add for size = 1,10,100,1000.
- That is really somewhat slower, but is that because of the size computation
- or because of the range check!
- */
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = 1; icol <= target.ncol; icol ++) {
- target [irow] [icol] = 0.0;
- for (integer i = 1; i <= x.ncol; i ++)
- target [irow] [icol] += x.row (irow) [i] * y.row (i) [icol];
- }
- }
- #elif 0
- /*
- Here we get rid of the row number check, but we still compute
- a whole vector with size information. Programmingwise, this would be our ideal.
- This version is 7.5,0.70,0.87,2.04 ns per multiply-add for size = 1,10,100,1000.
- It seems to be slightly slower for large matrices than the first stored-row-pointer version.
- */
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = 1; icol <= target.ncol; icol ++) {
- target [irow] [icol] = 0.0;
- for (integer i = 1; i <= x.ncol; i ++)
- target [irow] [icol] += VECrow_nocheck (x, irow) [i] * VECrow_nocheck (y, i) [icol];
- }
- }
- #else
- /*
- The smart version, with whole-vector computation. Still programmatically ideal.
- This version is 14.6,0.66,0.31,0.36 ns per multiply-add for size = 1,10,100,1000.
- */
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = 1; icol <= target.ncol; icol ++)
- target [irow] [icol] = 0.0;
- for (integer i = 1; i <= x.ncol; i ++)
- for (integer icol = 1; icol <= target.ncol; icol ++)
- target [irow] [icol] += VECrow_nocheck (x, irow) [i] * VECrow_nocheck (y, i) [icol];
- }
- #endif
- }
- void MATmul_nt_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = 1; icol <= target.ncol; icol ++) {
- PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
- const double *px = & x [irow] [1];
- const double *py = & y [icol] [1],
- (longdouble) *px * (longdouble) *py,
- (px += 1, py += 1)
- )
- target [irow] [icol] = double (sum);
- }
- }
- }
- void MATmul_tn_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = 1; icol <= target.ncol; icol ++) {
- PAIRWISE_SUM (longdouble, sum, integer, x.nrow,
- const double *px = & x [1] [irow];
- const double *py = & y [1] [icol],
- (longdouble) *px * (longdouble) *py,
- (px += x.ncol, py += y.ncol)
- )
- target [irow] [icol] = double (sum);
- }
- }
- }
- void MATmul_tn_fast_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = 1; icol <= target.ncol; icol ++)
- target [irow] [icol] = 0.0;
- for (integer i = 1; i <= x.nrow; i ++)
- for (integer icol = 1; icol <= target.ncol; icol ++)
- target [irow] [icol] += x [i] [irow] * y [i] [icol];
- }
- }
- void MATmul_tt_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept {
- for (integer irow = 1; irow <= target.nrow; irow ++) {
- for (integer icol = 1; icol <= target.ncol; icol ++) {
- PAIRWISE_SUM (longdouble, sum, integer, x.nrow,
- const double *px = & x [1] [irow];
- const double *py = & y [icol] [1],
- (longdouble) *px * (longdouble) *py,
- (px += x.ncol, py += 1)
- )
- target [irow] [icol] = double (sum);
- }
- }
- }
- void MATmul_tt_fast_preallocated_ (const MAT& target, const constMAT& x, const constMAT& y) noexcept(false) {
- integer n = x.nrow;
- if (n > 100) {
- autoMAT xt = MATtranspose (x), yt = MATtranspose (y);
- MATmul_fast_preallocated (target, xt.get(), yt.get());
- return;
- }
- MATmul_tt_preallocated_ (target, x, y);
- }
- autoMAT MATouter (const constVEC& x, const constVEC& y) {
- autoMAT result = MATraw (x.size, y.size);
- for (integer irow = 1; irow <= x.size; irow ++)
- for (integer icol = 1; icol <= y.size; icol ++)
- result [irow] [icol] = x [irow] * y [icol];
- return result;
- }
- autoMAT MATpeaks (const constVEC& x, bool includeEdges, int interpolate, bool sortByHeight) {
- if (x.size < 2) includeEdges = false;
- integer numberOfPeaks = 0;
- for (integer i = 2; i < x.size; i ++)
- if (x [i] > x [i - 1] && x [i] >= x [i + 1])
- numberOfPeaks ++;
- if (includeEdges) {
- if (x [1] > x [2]) numberOfPeaks ++;
- if (x [x.size] > x [x.size - 1]) numberOfPeaks ++;
- }
- autoMAT result = MATraw (2, numberOfPeaks);
- integer peakNumber = 0;
- if (includeEdges && x [1] > x [2]) {
- result [1] [++ peakNumber] = 1;
- result [2] [peakNumber] = x [1];
- }
- for (integer i = 2; i < x.size; i ++) {
- if (x [i] > x [i - 1] && x [i] >= x [i + 1]) {
- ++ peakNumber;
- if (interpolate != 0) { // this is not a boolean; there could follow more options
- /*
- Parabolic interpolation.
- */
- double dy = 0.5 * (x [i + 1] - x [i - 1]);
- double d2y = (x [i] - x [i - 1]) + (x [i] - x [i + 1]);
- Melder_assert (d2y > 0.0);
- result [1] [peakNumber] = (double) i + dy / d2y;
- result [2] [peakNumber] = x [i] + 0.5 * dy * (dy / d2y);
- } else {
- /*
- Don't interpolate: choose the nearest index.
- */
- result [1] [peakNumber] = i;
- result [2] [peakNumber] = x [i];
- }
- }
- }
- if (includeEdges && x [x.size] > x [x.size - 1]) {
- result [1] [++ peakNumber] = x.size;
- result [2] [peakNumber] = x [x.size];
- }
- Melder_assert (peakNumber == numberOfPeaks);
- if (sortByHeight) {
- for (integer i = 1; i <= numberOfPeaks; i ++)
- result [2] [i] *= -1.0;
- NUMsort2 (result.ncol, result [2], result [1]);
- for (integer i = 1; i <= numberOfPeaks; i ++)
- result [2] [i] *= -1.0;
- }
- return result;
- }
- /* End of file MAT.cpp */
|