al_matmul.c 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. #include "stdlib.h"
  2. #include "util.h"
  3. #include "dataset.h"
  4. void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
  5. {
  6. int i, j, k, x;
  7. data_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  8. data_t temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15;
  9. //complete Q1
  10. if(coreid > 1) return;
  11. if(coreid == 0) {
  12. for(j = 0; j < 32; j++) {
  13. temp0 = C[j*lda];
  14. temp1 = C[1 + j*lda];
  15. temp2 = C[2 + j*lda];
  16. temp3 = C[3 + j*lda];
  17. temp4 = C[4 + j*lda];
  18. temp5 = C[5 + j*lda];
  19. temp6 = C[6 + j*lda];
  20. temp7 = C[7 + j*lda];
  21. temp8 = C[8 + j*lda];
  22. temp9 = C[9 + j*lda];
  23. temp10 = C[10 + j*lda];
  24. temp11 = C[11 + j*lda];
  25. temp12 = C[12 + j*lda];
  26. temp13 = C[13 + j*lda];
  27. temp14 = C[14 + j*lda];
  28. temp15 = C[15 + j*lda];
  29. for(k = 0; k < 32; k++) {
  30. temp0 += A[j*lda + k] * B[k*lda];
  31. temp1 += A[j*lda + k] * B[1+k*lda];
  32. temp2 += A[j*lda + k] * B[2+k*lda];
  33. temp3 += A[j*lda + k] * B[3+k*lda];
  34. temp4 += A[j*lda + k] * B[4+k*lda];
  35. temp5 += A[j*lda + k] * B[5+k*lda];
  36. temp6 += A[j*lda + k] * B[6+k*lda];
  37. temp7 += A[j*lda + k] * B[7+k*lda];
  38. temp8 += A[j*lda + k] * B[8+k*lda];
  39. temp9 += A[j*lda + k] * B[9+k*lda];
  40. temp10 += A[j*lda + k] * B[10+k*lda];
  41. temp11 += A[j*lda + k] * B[11+k*lda];
  42. temp12 += A[j*lda + k] * B[12+k*lda];
  43. temp13 += A[j*lda + k] * B[13+k*lda];
  44. temp14 += A[j*lda + k] * B[14+k*lda];
  45. temp15 += A[j*lda + k] * B[15+k*lda];
  46. }
  47. C[j*lda] = temp0;
  48. C[1 + j*lda] = temp1;
  49. C[2 + j*lda] = temp2;
  50. C[3 + j*lda] = temp3;
  51. C[4 + j*lda] = temp4;
  52. C[5 + j*lda] = temp5;
  53. C[6 + j*lda] = temp6;
  54. C[7 + j*lda] = temp7;
  55. C[8 + j*lda] = temp8;
  56. C[9 + j*lda] = temp9;
  57. C[10 + j*lda] = temp10;
  58. C[11 + j*lda] = temp11;
  59. C[12 + j*lda] = temp12;
  60. C[13 + j*lda] = temp13;
  61. C[14 + j*lda] = temp14;
  62. C[15 + j*lda] = temp15;
  63. }
  64. }
  65. if( coreid == 1 || ncores == 1) {
  66. for(j = 0; j < 32; j++) {
  67. temp0 = C[16 + j*lda];
  68. temp1 = C[17 + j*lda];
  69. temp2 = C[18 + j*lda];
  70. temp3 = C[19 + j*lda];
  71. temp4 = C[20 + j*lda];
  72. temp5 = C[21 + j*lda];
  73. temp6 = C[22 + j*lda];
  74. temp7 = C[23 + j*lda];
  75. temp8 = C[24 + j*lda];
  76. temp9 = C[25 + j*lda];
  77. temp10 = C[26 + j*lda];
  78. temp11 = C[27 + j*lda];
  79. temp12 = C[28 + j*lda];
  80. temp13 = C[29 + j*lda];
  81. temp14 = C[30 + j*lda];
  82. temp15 = C[31 + j*lda];
  83. for(k = 0; k < 32; k++) {
  84. temp0 += A[j*lda + k] * B[16 + k*lda];
  85. temp1 += A[j*lda + k] * B[17 + k*lda];
  86. temp2 += A[j*lda + k] * B[18 + k*lda];
  87. temp3 += A[j*lda + k] * B[19 + k*lda];
  88. temp4 += A[j*lda + k] * B[20 + k*lda];
  89. temp5 += A[j*lda + k] * B[21 + k*lda];
  90. temp6 += A[j*lda + k] * B[22 + k*lda];
  91. temp7 += A[j*lda + k] * B[23 + k*lda];
  92. temp8 += A[j*lda + k] * B[24 + k*lda];
  93. temp9 += A[j*lda + k] * B[25 + k*lda];
  94. temp10 += A[j*lda + k] * B[26 + k*lda];
  95. temp11 += A[j*lda + k] * B[27 + k*lda];
  96. temp12 += A[j*lda + k] * B[28 + k*lda];
  97. temp13 += A[j*lda + k] * B[29 + k*lda];
  98. temp14 += A[j*lda + k] * B[30 + k*lda];
  99. temp15 += A[j*lda + k] * B[31 + k*lda];
  100. }
  101. C[16 + j*lda] = temp0;
  102. C[17 + j*lda] = temp1;
  103. C[18 + j*lda] = temp2;
  104. C[19 + j*lda] = temp3;
  105. C[20 + j*lda] = temp4;
  106. C[21 + j*lda] = temp5;
  107. C[22 + j*lda] = temp6;
  108. C[23 + j*lda] = temp7;
  109. C[24 + j*lda] = temp8;
  110. C[25 + j*lda] = temp9;
  111. C[26 + j*lda] = temp10;
  112. C[27 + j*lda] = temp11;
  113. C[28 + j*lda] = temp12;
  114. C[29 + j*lda] = temp13;
  115. C[30 + j*lda] = temp14;
  116. C[31 + j*lda] = temp15;
  117. }
  118. }
  119. }