VertexLoaderARM64.cpp

// Copyright 2015 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include "VideoCommon/VertexLoaderARM64.h"
#include "VideoCommon/VertexLoaderManager.h"

using namespace Arm64Gen;

ARM64Reg src_reg = X0;
ARM64Reg dst_reg = X1;
ARM64Reg count_reg = W2;
ARM64Reg skipped_reg = W17;
ARM64Reg scratch1_reg = W16;
ARM64Reg scratch2_reg = W15;
ARM64Reg scratch3_reg = W14;
ARM64Reg scratch4_reg = W13;
ARM64Reg saved_count = W12;
ARM64Reg stride_reg = X11;
ARM64Reg arraybase_reg = X10;
ARM64Reg scale_reg = X9;
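
// Dequantization table: entry N holds 1.0f / 2^N. ReadVertex() indexes this
// table with the attribute's fractional-bit count (via scale_reg) and
// multiplies the converted integer coordinates by it, turning fixed-point
// input into floats with a single FMUL.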
static const float GC_ALIGNED16(scale_factors[]) =
{
  1.0 / (1ULL << 0),  1.0 / (1ULL << 1),  1.0 / (1ULL << 2),  1.0 / (1ULL << 3),
  1.0 / (1ULL << 4),  1.0 / (1ULL << 5),  1.0 / (1ULL << 6),  1.0 / (1ULL << 7),
  1.0 / (1ULL << 8),  1.0 / (1ULL << 9),  1.0 / (1ULL << 10), 1.0 / (1ULL << 11),
  1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15),
  1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19),
  1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23),
  1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27),
  1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31),
};

VertexLoaderARM64::VertexLoaderARM64(const TVtxDesc& vtx_desc, const VAT& vtx_att)
    : VertexLoaderBase(vtx_desc, vtx_att), m_float_emit(this)
{
  if (!IsInitialized())
    return;

  AllocCodeSpace(4096);
  ClearCodeSpace();
  GenerateVertexLoader();
  WriteProtect();
}
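
// Computes the source address of an attribute into 'reg'. Direct attributes
// simply sit at src + m_src_ofs. Indexed attributes load an 8- or 16-bit index
// from the stream (byteswapping the 16-bit case), multiply it by the array
// stride and add the cached array base. An all-ones position index marks a
// vertex that must be skipped, which is what m_skip_vertex branches on.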
void VertexLoaderARM64::GetVertexAddr(int array, u64 attribute, ARM64Reg reg)
{
  ADD(reg, src_reg, m_src_ofs);
  if (attribute & MASK_INDEXED)
  {
    if (attribute == INDEX8)
    {
      LDRB(INDEX_UNSIGNED, scratch1_reg, reg, 0);
      m_src_ofs += 1;
    }
    else
    {
      LDRH(INDEX_UNSIGNED, scratch1_reg, reg, 0);
      m_src_ofs += 2;
      REV16(scratch1_reg, scratch1_reg);
    }

    if (array == ARRAY_POSITION)
    {
      EOR(scratch2_reg, scratch1_reg, 0, attribute == INDEX8 ? 7 : 15); // 0xFF : 0xFFFF
      m_skip_vertex = CBZ(scratch2_reg);
    }

    LDR(INDEX_UNSIGNED, scratch2_reg, stride_reg, array * 4);
    MUL(scratch1_reg, scratch1_reg, scratch2_reg);

    LDR(INDEX_UNSIGNED, EncodeRegTo64(scratch2_reg), arraybase_reg, array * 8);
    ADD(EncodeRegTo64(reg), EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg));
  }
}
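
// Returns m_src_ofs as an immediate when the attribute is direct and the
// current source offset can be encoded in a load directly (small enough for an
// unscaled access, or aligned for a scaled one); otherwise falls back to
// GetVertexAddr() and returns -1 so the caller knows the address is in 'reg'.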
s32 VertexLoaderARM64::GetAddressImm(int array, u64 attribute, Arm64Gen::ARM64Reg reg, u32 align)
{
  if (attribute & MASK_INDEXED ||
      (m_src_ofs > 255 && (m_src_ofs & (align - 1))))
    GetVertexAddr(array, attribute, reg);
  else
    return m_src_ofs;
  return -1;
}
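
// Emits code that loads count_in elements of 'format', widens them, converts
// them to float (dequantizing by 1/2^scaling_exponent when requested), stores
// count_out floats at the current destination offset, and fills in the native
// attribute description. 'offset' is either an immediate source offset or -1,
// meaning the address was already computed into scratch1_reg.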
int VertexLoaderARM64::ReadVertex(u64 attribute, int format, int count_in, int count_out,
                                  bool dequantize, u8 scaling_exponent,
                                  AttributeFormat* native_format, s32 offset)
{
  ARM64Reg coords = count_in == 3 ? Q31 : D31;
  ARM64Reg scale = count_in == 3 ? Q30 : D30;

  int elem_size = 1 << (format / 2);
  int load_bytes = elem_size * count_in;
  int load_size = load_bytes == 1 ? 1 : load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 :
                  load_bytes <= 8 ? 8 : 16;
  load_size <<= 3;
  elem_size <<= 3;

  if (offset == -1)
  {
    if (count_in == 1)
      m_float_emit.LDR(elem_size, INDEX_UNSIGNED, coords, EncodeRegTo64(scratch1_reg), 0);
    else
      m_float_emit.LD1(elem_size, 1, coords, EncodeRegTo64(scratch1_reg));
  }
  else if (offset & (load_size - 1)) // Not aligned - unscaled
  {
    m_float_emit.LDUR(load_size, coords, src_reg, offset);
  }
  else
  {
    m_float_emit.LDR(load_size, INDEX_UNSIGNED, coords, src_reg, offset);
  }

  if (format != FORMAT_FLOAT)
  {
    // Extend and convert to float
    switch (format)
    {
    case FORMAT_UBYTE:
      m_float_emit.UXTL(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.UXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    case FORMAT_BYTE:
      m_float_emit.SXTL(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.SXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    case FORMAT_USHORT:
      m_float_emit.REV16(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.UXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    case FORMAT_SHORT:
      m_float_emit.REV16(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      m_float_emit.SXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords));
      break;
    }

    m_float_emit.SCVTF(32, coords, coords);

    if (dequantize && scaling_exponent)
    {
      m_float_emit.LDR(32, INDEX_UNSIGNED, scale, scale_reg, scaling_exponent * 4);
      m_float_emit.FMUL(32, coords, coords, scale, 0);
    }
  }
  else
  {
    m_float_emit.REV32(8, coords, coords);
  }
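
  // Write out the converted floats. STUR covers small destination offsets,
  // STR (unsigned immediate) covers larger offsets that happen to be aligned
  // for this store width, and otherwise the address is materialized so ST1 can
  // store through it.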
  const u32 write_size = count_out == 3 ? 128 : count_out * 32;
  const u32 mask = count_out == 3 ? 0xF : count_out == 2 ? 0x7 : 0x3;
  if (m_dst_ofs < 256)
  {
    m_float_emit.STUR(write_size, coords, dst_reg, m_dst_ofs);
  }
  else if (!(m_dst_ofs & mask))
  {
    m_float_emit.STR(write_size, INDEX_UNSIGNED, coords, dst_reg, m_dst_ofs);
  }
  else
  {
    ADD(EncodeRegTo64(scratch2_reg), dst_reg, m_dst_ofs);
    m_float_emit.ST1(32, 1, coords, EncodeRegTo64(scratch2_reg));
  }

  // Z-Freeze
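  // While three or fewer vertices remain, also copy this position into
  // VertexLoaderManager::position_cache (the slot is derived from count_reg:
  // count_reg * 16 bytes, stored 16 bytes back), where the z-freeze code can
  // pick it up later.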
  if (native_format == &m_native_vtx_decl.position)
  {
    CMP(count_reg, 3);
    FixupBranch dont_store = B(CC_GT);
    MOVI2R(EncodeRegTo64(scratch2_reg), (u64)VertexLoaderManager::position_cache);
    ORR(scratch1_reg, WSP, count_reg, ArithOption(count_reg, ST_LSL, 4));
    ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg));
    m_float_emit.STUR(write_size, coords, EncodeRegTo64(scratch1_reg), -16);
    SetJumpTarget(dont_store);
  }

  native_format->components = count_out;
  native_format->enable = true;
  native_format->offset = m_dst_ofs;
  native_format->type = VAR_FLOAT;
  native_format->integer = false;
  m_dst_ofs += sizeof(float) * count_out;
  if (attribute == DIRECT)
    m_src_ofs += load_bytes;

  return load_bytes;
}
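
// Converts one color attribute to RGBA8888 at the current destination offset.
// The 888/888x/8888 formats are copied (with alpha forced to 0xFF when the
// source has none); the packed 565, 4444 and 6666 formats are unpacked and
// each channel is widened to 8 bits by replicating its top bits.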
void VertexLoaderARM64::ReadColor(u64 attribute, int format, s32 offset)
{
  int load_bytes = 0;
  switch (format)
  {
  case FORMAT_24B_888:
  case FORMAT_32B_888x:
  case FORMAT_32B_8888:
    if (offset == -1)
      LDR(INDEX_UNSIGNED, scratch2_reg, EncodeRegTo64(scratch1_reg), 0);
    else if (offset & 3) // Not aligned - unscaled
      LDUR(scratch2_reg, src_reg, offset);
    else
      LDR(INDEX_UNSIGNED, scratch2_reg, src_reg, offset);

    if (format != FORMAT_32B_8888)
      ORR(scratch2_reg, scratch2_reg, 8, 7); // 0xFF000000
    STR(INDEX_UNSIGNED, scratch2_reg, dst_reg, m_dst_ofs);
    load_bytes = 3 + (format != FORMAT_24B_888);
    break;

  case FORMAT_16B_565:
    // RRRRRGGG GGGBBBBB
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
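    // Each channel is widened by shifting it into the top of its byte and
    // OR-ing the high bits back into the low end, e.g. a 5-bit value b becomes
    // (b << 3) | (b >> 2), mapping 0x00..0x1F onto the full 0x00..0xFF range.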
    if (offset == -1)
      LDRH(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0);
    else if (offset & 1) // Not aligned - unscaled
      LDURH(scratch3_reg, src_reg, offset);
    else
      LDRH(INDEX_UNSIGNED, scratch3_reg, src_reg, offset);

    REV16(scratch3_reg, scratch3_reg);

    // B
    AND(scratch2_reg, scratch3_reg, 32, 4);
    ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 3));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 5));
    ORR(scratch1_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 16));

    // G
    UBFM(scratch2_reg, scratch3_reg, 5, 10);
    ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 8));

    // R
    UBFM(scratch2_reg, scratch3_reg, 11, 15);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 3));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 2));

    // A
    ORR(scratch1_reg, scratch1_reg, 8, 7); // 0xFF000000

    STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs);
    load_bytes = 2;
    break;

  case FORMAT_16B_4444:
    // BBBBAAAA RRRRGGGG
    // REV16 - RRRRGGGG BBBBAAAA
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
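    // The four nibbles are shuffled into RGBA byte order first; the final
    // "ORR ... LSL 4" then duplicates every nibble into the upper half of its
    // byte, so 0xN expands to 0xNN.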
    if (offset == -1)
      LDRH(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0);
    else if (offset & 1) // Not aligned - unscaled
      LDURH(scratch3_reg, src_reg, offset);
    else
      LDRH(INDEX_UNSIGNED, scratch3_reg, src_reg, offset);

    // R
    UBFM(scratch1_reg, scratch3_reg, 4, 7);

    // G
    AND(scratch2_reg, scratch3_reg, 32, 3);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 8));

    // B
    UBFM(scratch2_reg, scratch3_reg, 12, 15);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 16));

    // A
    UBFM(scratch2_reg, scratch3_reg, 8, 11);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 24));

    // Final duplication
    ORR(scratch1_reg, scratch1_reg, scratch1_reg, ArithOption(scratch1_reg, ST_LSL, 4));

    STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs);
    load_bytes = 2;
    break;

  case FORMAT_24B_6666:
    // RRRRRRGG GGGGBBBB BBAAAAAA
    // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
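    // After the REV32 byteswap, each 6-bit field x is widened to 8 bits as
    // (x << 2) | (x >> 4).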
    if (offset == -1)
      LDR(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0);
    else if (offset & 3) // Not aligned - unscaled
      LDUR(scratch3_reg, src_reg, offset);
    else
      LDR(INDEX_UNSIGNED, scratch3_reg, src_reg, offset);

    REV32(scratch3_reg, scratch3_reg);

    // A
    AND(scratch2_reg, scratch3_reg, 32, 5);
    ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6));
    ORR(scratch1_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 24));

    // B
    UBFM(scratch2_reg, scratch3_reg, 6, 11);
    ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 16));

    // G
    UBFM(scratch2_reg, scratch3_reg, 12, 17);
    ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2));
    ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 8));

    // R
    UBFM(scratch2_reg, scratch3_reg, 18, 23);
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2));
    ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 4));

    STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs);
    load_bytes = 3;
    break;
  }

  if (attribute == DIRECT)
    m_src_ofs += load_bytes;
}

void VertexLoaderARM64::GenerateVertexLoader()
{
  // R0 - Source pointer
  // R1 - Destination pointer
  // R2 - Count
  // R30 - LR
  //
  // R0 returns how many vertices were written
  //
  // Registers we don't have to worry about saving
  // R9-R17 are caller saved temporaries
  // R18 is a temporary or platform specific register (iOS)
  //
  // VFP registers
  // We can touch all except v8-v15
  // If we need to use those, we need to retain the lower 64 bits (!) of the register
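
  // Prologue: zero the skipped-vertex counter, save the vertex count for the
  // return value, and load pointers to the CP array strides, the cached array
  // bases and the dequantization table.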
  MOV(skipped_reg, WSP);
  MOV(saved_count, count_reg);

  MOVI2R(stride_reg, (u64)&g_main_cp_state.array_strides);
  MOVI2R(arraybase_reg, (u64)&VertexLoaderManager::cached_arraybases);
  MOVI2R(scale_reg, (u64)&scale_factors);

  const u8* loop_start = GetCodePtr();

  if (m_VtxDesc.PosMatIdx)
  {
    LDRB(INDEX_UNSIGNED, scratch1_reg, src_reg, m_src_ofs);
    AND(scratch1_reg, scratch1_reg, 0, 5);
    STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs);

    // Z-Freeze
    CMP(count_reg, 3);
    FixupBranch dont_store = B(CC_GT);
    MOVI2R(EncodeRegTo64(scratch2_reg),
           (u64)VertexLoaderManager::position_matrix_index - sizeof(u32));
    STR(INDEX_UNSIGNED, scratch1_reg, EncodeRegTo64(scratch2_reg), 0);
    SetJumpTarget(dont_store);

    m_native_components |= VB_HAS_POSMTXIDX;
    m_native_vtx_decl.posmtx.components = 4;
    m_native_vtx_decl.posmtx.enable = true;
    m_native_vtx_decl.posmtx.offset = m_dst_ofs;
    m_native_vtx_decl.posmtx.type = VAR_UNSIGNED_BYTE;
    m_native_vtx_decl.posmtx.integer = true;
    m_src_ofs += sizeof(u8);
    m_dst_ofs += sizeof(u32);
  }

  u32 texmatidx_ofs[8];
  const u64 tm[8] = {
      m_VtxDesc.Tex0MatIdx, m_VtxDesc.Tex1MatIdx, m_VtxDesc.Tex2MatIdx, m_VtxDesc.Tex3MatIdx,
      m_VtxDesc.Tex4MatIdx, m_VtxDesc.Tex5MatIdx, m_VtxDesc.Tex6MatIdx, m_VtxDesc.Tex7MatIdx,
  };
  for (int i = 0; i < 8; i++)
  {
    if (tm[i])
      texmatidx_ofs[i] = m_src_ofs++;
  }

  // Position
  {
    int elem_size = 1 << (m_VtxAttr.PosFormat / 2);
    int load_bytes = elem_size * (m_VtxAttr.PosElements + 2);
    int load_size = load_bytes == 1 ? 1 : load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 :
                    load_bytes <= 8 ? 8 : 16;
    load_size <<= 3;

    s32 offset = GetAddressImm(ARRAY_POSITION, m_VtxDesc.Position,
                               EncodeRegTo64(scratch1_reg), load_size);
    int pos_elements = m_VtxAttr.PosElements + 2;
    ReadVertex(m_VtxDesc.Position, m_VtxAttr.PosFormat, pos_elements, pos_elements,
               m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position, offset);
  }
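
  // Normals: either a single normal or a normal/binormal/tangent triple
  // (NormalElements). With NormalIndex3 every element of the triple has its
  // own index, so the source address is re-fetched per element; the scaling
  // exponent is fixed per format rather than taken from the VAT.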
  if (m_VtxDesc.Normal)
  {
    static const u8 map[8] = {7, 6, 15, 14};
    u8 scaling_exponent = map[m_VtxAttr.NormalFormat];

    s32 offset = -1;
    for (int i = 0; i < (m_VtxAttr.NormalElements ? 3 : 1); i++)
    {
      if (!i || m_VtxAttr.NormalIndex3)
      {
        int elem_size = 1 << (m_VtxAttr.NormalFormat / 2);
        int load_bytes = elem_size * 3;
        int load_size = load_bytes == 1 ? 1 : load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 :
                        load_bytes <= 8 ? 8 : 16;

        offset = GetAddressImm(ARRAY_NORMAL, m_VtxDesc.Normal, EncodeRegTo64(scratch1_reg),
                               load_size << 3);

        if (offset == -1)
          ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), i * elem_size * 3);
        else
          offset += i * elem_size * 3;
      }
      int bytes_read = ReadVertex(m_VtxDesc.Normal, m_VtxAttr.NormalFormat, 3, 3, true,
                                  scaling_exponent, &m_native_vtx_decl.normals[i], offset);

      if (offset == -1)
        ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), bytes_read);
      else
        offset += bytes_read;
    }

    m_native_components |= VB_HAS_NRM0;
    if (m_VtxAttr.NormalElements)
      m_native_components |= VB_HAS_NRM1 | VB_HAS_NRM2;
  }
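
  // Colors: each enabled color attribute is unpacked to RGBA8888 by ReadColor.
  // The 16-bit formats only need 2-byte alignment for the immediate-offset path.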
  const u64 col[2] = {m_VtxDesc.Color0, m_VtxDesc.Color1};
  for (int i = 0; i < 2; i++)
  {
    m_native_vtx_decl.colors[i].components = 4;
    m_native_vtx_decl.colors[i].type = VAR_UNSIGNED_BYTE;
    m_native_vtx_decl.colors[i].integer = false;

    if (col[i])
    {
      u32 align = 4;
      if (m_VtxAttr.color[i].Comp == FORMAT_16B_565 ||
          m_VtxAttr.color[i].Comp == FORMAT_16B_4444)
        align = 2;

      s32 offset = GetAddressImm(ARRAY_COLOR + i, col[i], EncodeRegTo64(scratch1_reg), align);
      ReadColor(col[i], m_VtxAttr.color[i].Comp, offset);
      m_native_components |= VB_HAS_COL0 << i;
      m_native_vtx_decl.colors[i].components = 4;
      m_native_vtx_decl.colors[i].enable = true;
      m_native_vtx_decl.colors[i].offset = m_dst_ofs;
      m_native_vtx_decl.colors[i].type = VAR_UNSIGNED_BYTE;
      m_native_vtx_decl.colors[i].integer = false;
      m_dst_ofs += 4;
    }
  }

  const u64 tc[8] = {
      m_VtxDesc.Tex0Coord, m_VtxDesc.Tex1Coord, m_VtxDesc.Tex2Coord, m_VtxDesc.Tex3Coord,
      m_VtxDesc.Tex4Coord, m_VtxDesc.Tex5Coord, m_VtxDesc.Tex6Coord, m_VtxDesc.Tex7Coord,
  };
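
  // Texture coordinates: each enabled coordinate is read like a position.
  // When the matching texture-matrix-index bit is set, the index byte that was
  // skipped over earlier is converted to float and appended as an extra
  // component (with zeroed S/T if there is no coordinate at all).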
  for (int i = 0; i < 8; i++)
  {
    m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
    m_native_vtx_decl.texcoords[i].type = VAR_FLOAT;
    m_native_vtx_decl.texcoords[i].integer = false;

    int elements = m_VtxAttr.texCoord[i].Elements + 1;
    if (tc[i])
    {
      m_native_components |= VB_HAS_UV0 << i;

      int elem_size = 1 << (m_VtxAttr.texCoord[i].Format / 2);
      int load_bytes = elem_size * (elements + 2);
      int load_size = load_bytes == 1 ? 1 : load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 :
                      load_bytes <= 8 ? 8 : 16;
      load_size <<= 3;

      s32 offset = GetAddressImm(ARRAY_TEXCOORD0 + i, tc[i], EncodeRegTo64(scratch1_reg),
                                 load_size);
      u8 scaling_exponent = m_VtxAttr.texCoord[i].Frac;
      ReadVertex(tc[i], m_VtxAttr.texCoord[i].Format, elements, tm[i] ? 2 : elements,
                 m_VtxAttr.ByteDequant, scaling_exponent, &m_native_vtx_decl.texcoords[i],
                 offset);
    }
    if (tm[i])
    {
      m_native_components |= VB_HAS_TEXMTXIDX0 << i;
      m_native_vtx_decl.texcoords[i].components = 3;
      m_native_vtx_decl.texcoords[i].enable = true;
      m_native_vtx_decl.texcoords[i].type = VAR_FLOAT;
      m_native_vtx_decl.texcoords[i].integer = false;

      LDRB(INDEX_UNSIGNED, scratch2_reg, src_reg, texmatidx_ofs[i]);
      m_float_emit.UCVTF(S31, scratch2_reg);

      if (tc[i])
      {
        m_float_emit.STR(32, INDEX_UNSIGNED, D31, dst_reg, m_dst_ofs);
        m_dst_ofs += sizeof(float);
      }
      else
      {
        m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;

        if (m_dst_ofs < 256)
        {
          STUR(SP, dst_reg, m_dst_ofs);
        }
        else if (m_dst_ofs & 7)
        {
          // If m_dst_ofs isn't 8 byte aligned we can't store an 8 byte zero register,
          // so store two 4 byte zero registers instead.
          // The destination is always 4 byte aligned.
          STR(INDEX_UNSIGNED, WSP, dst_reg, m_dst_ofs);
          STR(INDEX_UNSIGNED, WSP, dst_reg, m_dst_ofs + 4);
        }
        else
        {
          STR(INDEX_UNSIGNED, SP, dst_reg, m_dst_ofs);
        }
        m_float_emit.STR(32, INDEX_UNSIGNED, D31, dst_reg, m_dst_ofs + 8);

        m_dst_ofs += sizeof(float) * 3;
      }
    }
  }

  // Prepare for the next vertex.
  ADD(dst_reg, dst_reg, m_dst_ofs);
  const u8* cont = GetCodePtr();
  ADD(src_reg, src_reg, m_src_ofs);

  SUB(count_reg, count_reg, 1);
  CBNZ(count_reg, loop_start);
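
  // Epilogue: for indexed positions a vertex whose index is all ones is
  // skipped; m_skip_vertex lands here, bumps skipped_reg and rejoins the loop
  // at 'cont' without advancing the destination pointer. The return value is
  // the number of vertices actually written.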
  if (m_VtxDesc.Position & MASK_INDEXED)
  {
    SUB(W0, saved_count, skipped_reg);
    RET(X30);

    SetJumpTarget(m_skip_vertex);
    ADD(skipped_reg, skipped_reg, 1);
    B(cont);
  }
  else
  {
    MOV(W0, saved_count);
    RET(X30);
  }

  FlushIcache();

  m_VertexSize = m_src_ofs;
  m_native_vtx_decl.stride = m_dst_ofs;
}
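
// The generated code is called directly: 'region' (the emitted code buffer) is
// cast to a function with the int(u8* src, u8* dst, int count) signature that
// the prologue/epilogue above implement, and its return value is the number of
// vertices written.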
int VertexLoaderARM64::RunVertices(DataReader src, DataReader dst, int count)
{
  m_numLoadedVertices += count;
  return ((int (*)(u8* src, u8* dst, int count))region)(src.GetPointer(), dst.GetPointer(), count);
}