123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293 |
- #include "stdlib.h"
- #include "util.h"
- #include "dataset.h"
- void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
- {
- int i, j, k, ii, jj, kk;
- if(coreid > 1) return;
- if (coreid == 0) {
- // for ( ii = 0; ii < 32; ii+=IC )
- for ( kk = 0; kk < 32; kk+=16 )
- for ( j = 0; j < 16; j++ )
- // for ( j = 0; j < 16; j++ )
- {
- for ( i = 0; i < 32; i+=8 )
- // for ( i = ii; i < ii + IC && i < 32; i+=8 )
- {
- data_t temp0 = C[i+j*32];
- data_t temp1 = C[i+j*32+1];
- data_t temp2 = C[i+j*32+2];
- data_t temp3 = C[i+j*32+3];
- data_t temp4 = C[i+j*32+4];
- data_t temp5 = C[i+j*32+5];
- data_t temp6 = C[i+j*32+6];
- data_t temp7 = C[i+j*32+7];
- for ( k = kk; k < kk+16 && k < 32; k++ )
- // for ( k = 0; k < 32; k++ )
- {
- data_t tempA = A[j*32+k];
- temp0 += tempA * B[k*32 + i];
- temp1 += tempA * B[k*32 + i+1];
- temp2 += tempA * B[k*32 + i+2];
- temp3 += tempA * B[k*32 + i+3];
- temp4 += tempA * B[k*32 + i+4];
- temp5 += tempA * B[k*32 + i+5];
- temp6 += tempA * B[k*32 + i+6];
- temp7 += tempA * B[k*32 + i+7];
- }
- C[i+j*32] = temp0;
- C[i+j*32+1] = temp1;
- C[i+j*32+2] = temp2;
- C[i+j*32+3] = temp3;
- C[i+j*32+4] = temp4;
- C[i+j*32+5] = temp5;
- C[i+j*32+6] = temp6;
- C[i+j*32+7] = temp7;
- }
- }
- }
- if(coreid == 1 || ncores == 1) {
- // for ( ii = 0; ii < 32; ii+=IC )
- for ( kk = 0; kk < 32; kk+=16 )
- for ( j = 16; j < 32; j++ )
- // for ( j = 16; j < 32; j++ )
- {
- for ( i = 0; i < 32; i+=8 )
- // for ( i = ii; i < ii + IC && i < 32; i+=8 )
- {
- data_t temp0 = C[i+j*32];
- data_t temp1 = C[i+j*32+1];
- data_t temp2 = C[i+j*32+2];
- data_t temp3 = C[i+j*32+3];
- data_t temp4 = C[i+j*32+4];
- data_t temp5 = C[i+j*32+5];
- data_t temp6 = C[i+j*32+6];
- data_t temp7 = C[i+j*32+7];
- for ( k = kk; k < kk+16 && k < 32; k++ )
- {
- data_t tempA = A[j*32+k];
- temp0 += tempA * B[k*32 + i];
- temp1 += tempA * B[k*32 + i+1];
- temp2 += tempA * B[k*32 + i+2];
- temp3 += tempA * B[k*32 + i+3];
- temp4 += tempA * B[k*32 + i+4];
- temp5 += tempA * B[k*32 + i+5];
- temp6 += tempA * B[k*32 + i+6];
- temp7 += tempA * B[k*32 + i+7];
- }
- C[i+j*32] = temp0;
- C[i+j*32+1] = temp1;
- C[i+j*32+2] = temp2;
- C[i+j*32+3] = temp3;
- C[i+j*32+4] = temp4;
- C[i+j*32+5] = temp5;
- C[i+j*32+6] = temp6;
- C[i+j*32+7] = temp7;
- }
- }
- }
- }
|