/* bk_matmul.c */
  1. #include "stdlib.h"
  2. #include "util.h"
  3. #include "dataset.h"
  4. void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
  5. {
  6. int i, j, k, ii, jj, kk;
  7. if(coreid > 1) return;
  8. if (coreid == 0) {
  9. // for ( ii = 0; ii < 32; ii+=IC )
  10. for ( kk = 0; kk < 32; kk+=16 )
  11. for ( j = 0; j < 16; j++ )
  12. // for ( j = 0; j < 16; j++ )
  13. {
  14. for ( i = 0; i < 32; i+=8 )
  15. // for ( i = ii; i < ii + IC && i < 32; i+=8 )
  16. {
  17. data_t temp0 = C[i+j*32];
  18. data_t temp1 = C[i+j*32+1];
  19. data_t temp2 = C[i+j*32+2];
  20. data_t temp3 = C[i+j*32+3];
  21. data_t temp4 = C[i+j*32+4];
  22. data_t temp5 = C[i+j*32+5];
  23. data_t temp6 = C[i+j*32+6];
  24. data_t temp7 = C[i+j*32+7];
  25. for ( k = kk; k < kk+16 && k < 32; k++ )
  26. // for ( k = 0; k < 32; k++ )
  27. {
  28. data_t tempA = A[j*32+k];
  29. temp0 += tempA * B[k*32 + i];
  30. temp1 += tempA * B[k*32 + i+1];
  31. temp2 += tempA * B[k*32 + i+2];
  32. temp3 += tempA * B[k*32 + i+3];
  33. temp4 += tempA * B[k*32 + i+4];
  34. temp5 += tempA * B[k*32 + i+5];
  35. temp6 += tempA * B[k*32 + i+6];
  36. temp7 += tempA * B[k*32 + i+7];
  37. }
  38. C[i+j*32] = temp0;
  39. C[i+j*32+1] = temp1;
  40. C[i+j*32+2] = temp2;
  41. C[i+j*32+3] = temp3;
  42. C[i+j*32+4] = temp4;
  43. C[i+j*32+5] = temp5;
  44. C[i+j*32+6] = temp6;
  45. C[i+j*32+7] = temp7;
  46. }
  47. }
  48. }
  49. if(coreid == 1 || ncores == 1) {
  50. // for ( ii = 0; ii < 32; ii+=IC )
  51. for ( kk = 0; kk < 32; kk+=16 )
  52. for ( j = 16; j < 32; j++ )
  53. // for ( j = 16; j < 32; j++ )
  54. {
  55. for ( i = 0; i < 32; i+=8 )
  56. // for ( i = ii; i < ii + IC && i < 32; i+=8 )
  57. {
  58. data_t temp0 = C[i+j*32];
  59. data_t temp1 = C[i+j*32+1];
  60. data_t temp2 = C[i+j*32+2];
  61. data_t temp3 = C[i+j*32+3];
  62. data_t temp4 = C[i+j*32+4];
  63. data_t temp5 = C[i+j*32+5];
  64. data_t temp6 = C[i+j*32+6];
  65. data_t temp7 = C[i+j*32+7];
  66. for ( k = kk; k < kk+16 && k < 32; k++ )
  67. {
  68. data_t tempA = A[j*32+k];
  69. temp0 += tempA * B[k*32 + i];
  70. temp1 += tempA * B[k*32 + i+1];
  71. temp2 += tempA * B[k*32 + i+2];
  72. temp3 += tempA * B[k*32 + i+3];
  73. temp4 += tempA * B[k*32 + i+4];
  74. temp5 += tempA * B[k*32 + i+5];
  75. temp6 += tempA * B[k*32 + i+6];
  76. temp7 += tempA * B[k*32 + i+7];
  77. }
  78. C[i+j*32] = temp0;
  79. C[i+j*32+1] = temp1;
  80. C[i+j*32+2] = temp2;
  81. C[i+j*32+3] = temp3;
  82. C[i+j*32+4] = temp4;
  83. C[i+j*32+5] = temp5;
  84. C[i+j*32+6] = temp6;
  85. C[i+j*32+7] = temp7;
  86. }
  87. }
  88. }
  89. }