// bo_matmul.c (3.6 KB)
  1. #include "stdlib.h"
  2. #include "util.h"
  3. #include "dataset.h"
  4. void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
  5. {
  6. int i, j, k;
  7. data_t B_trans[32*32];
  8. data_t acc_temp0, acc_temp1;
  9. data_t *A_j, *B_i;
  10. data_t *A_j_k, *B_i_k;
  11. int z;
  12. //for (i = 0; i < 32; i++) {
  13. // for (j = 0; j < 32; j++) {
  14. // B_trans[i*lda+j] = B[i+j*lda];
  15. // }
  16. //}
  17. if (coreid == 0) {
  18. for (i = 0; i < 32; i++) {
  19. B_i = B_trans+i*32;
  20. for (z = 0; z < 32; z++) {
  21. *(B_i+z) = B[i+z*32];
  22. }
  23. for (j = 0; j < 16; j+=2) {
  24. A_j = A+j*lda;
  25. acc_temp0 = 0;
  26. for (k = 0; k < 32; k+=8) {
  27. A_j_k = A_j+k;
  28. B_i_k = B_i+k;
  29. acc_temp0 += *(A_j_k) * *(B_i_k);
  30. acc_temp0 += *(A_j_k + 1) * *(B_i_k + 1);
  31. acc_temp0 += *(A_j_k + 2) * *(B_i_k + 2);
  32. acc_temp0 += *(A_j_k + 3) * *(B_i_k + 3);
  33. acc_temp0 += *(A_j_k + 4) * *(B_i_k + 4);
  34. acc_temp0 += *(A_j_k + 5) * *(B_i_k + 5);
  35. acc_temp0 += *(A_j_k + 6) * *(B_i_k + 6);
  36. acc_temp0 += *(A_j_k + 7) * *(B_i_k + 7);
  37. }
  38. A_j += 32;
  39. acc_temp1 = 0;
  40. for (k = 0; k < 32; k+=8) {
  41. acc_temp1 += *(A_j+k) * *(B_i+k);
  42. acc_temp1 += *(A_j+k + 1) * *(B_i+k + 1);
  43. acc_temp1 += *(A_j+k + 2) * *(B_i+k + 2);
  44. acc_temp1 += *(A_j+k + 3) * *(B_i+k + 3);
  45. acc_temp1 += *(A_j+k + 4) * *(B_i+k + 4);
  46. acc_temp1 += *(A_j+k + 5) * *(B_i+k + 5);
  47. acc_temp1 += *(A_j+k + 6) * *(B_i+k + 6);
  48. acc_temp1 += *(A_j+k + 7) * *(B_i+k + 7);
  49. }
  50. C[i + j*lda] = acc_temp0;
  51. C[i + (j+1)*lda] = acc_temp1;
  52. }
  53. }
  54. }
  55. if (coreid == 1 || ncores == 1) {
  56. for (i = 0; i < 32; i++) {
  57. B_i = B_trans+i*32;
  58. for (z = 0; z < 32; z++) {
  59. *(B_i+z) = B[i+z*32];
  60. }
  61. for (j = 16; j < 32; j+=2) {
  62. A_j = A+j*lda;
  63. acc_temp0 = 0;
  64. for (k = 0; k < 32; k+=8) {
  65. acc_temp0 += *(A_j+k) * *(B_i+k);
  66. acc_temp0 += *(A_j+k + 1) * *(B_i+k + 1);
  67. acc_temp0 += *(A_j+k + 2) * *(B_i+k + 2);
  68. acc_temp0 += *(A_j+k + 3) * *(B_i+k + 3);
  69. acc_temp0 += *(A_j+k + 4) * *(B_i+k + 4);
  70. acc_temp0 += *(A_j+k + 5) * *(B_i+k + 5);
  71. acc_temp0 += *(A_j+k + 6) * *(B_i+k + 6);
  72. acc_temp0 += *(A_j+k + 7) * *(B_i+k + 7);
  73. }
  74. A_j += 32;
  75. acc_temp1 = 0;
  76. for (k = 0; k < 32; k+=8) {
  77. acc_temp1 += *(A_j+k) * *(B_i+k);
  78. acc_temp1 += *(A_j+k + 1) * *(B_i+k + 1);
  79. acc_temp1 += *(A_j+k + 2) * *(B_i+k + 2);
  80. acc_temp1 += *(A_j+k + 3) * *(B_i+k + 3);
  81. acc_temp1 += *(A_j+k + 4) * *(B_i+k + 4);
  82. acc_temp1 += *(A_j+k + 5) * *(B_i+k + 5);
  83. acc_temp1 += *(A_j+k + 6) * *(B_i+k + 6);
  84. acc_temp1 += *(A_j+k + 7) * *(B_i+k + 7);
  85. }
  86. C[i + j*lda] = acc_temp0;
  87. C[i + (j+1)*lda] = acc_temp1;
  88. }
  89. }
  90. }
  91. }