armfrag.s 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656
;********************************************************************
;*                                                                  *
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
;*                                                                  *
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
;*                                                                  *
;********************************************************************
; Original implementation:
;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
; last mod: $Id$
;********************************************************************
  16. AREA |.text|, CODE, READONLY
  17. GET armopts.s
  18. ; Vanilla ARM v4 versions
  19. EXPORT oc_frag_copy_list_arm
  20. EXPORT oc_frag_recon_intra_arm
  21. EXPORT oc_frag_recon_inter_arm
  22. EXPORT oc_frag_recon_inter2_arm
  ;--------------------------------------------------------------------
  ; oc_frag_copy_list_arm
  ;   Copies a list of 8x8-byte fragments from one frame to another.
  ;
  ; In:   r0      = _dst_frame
  ;       r1      = _src_frame
  ;       r2      = _ystride
  ;       r3      = _fragis
  ;       [sp]    = _nfragis
  ;       [sp,#4] = _frag_buf_offs
  ;
  ; For every one of the _nfragis indices read from _fragis, the word
  ; _frag_buf_offs[index] is added to both frame pointers and 8 rows of
  ; two words each are copied, advancing _ystride bytes per row.
  ; When _nfragis <= 0 neither _fragis nor _frag_buf_offs is touched.
  ;
  ; r12 is used as scratch; r4-r6, r11 and lr are saved and restored.
  ; NOTE(review): all pixel traffic uses word LDR/STR, so the frame
  ; pointers plus offsets are presumably 4-byte aligned - confirm
  ; against the callers.
  ;--------------------------------------------------------------------
oc_frag_copy_list_arm PROC
        LDR     r12, [sp]               ; r12 = _nfragis
        STMFD   sp!, {r4-r6,r11,lr}     ; 5 words pushed: stack args now at sp+20
        SUBS    r12, r12, #1            ; negative => empty list
        LDRGE   r4,  [r3], #4           ; r4  = _fragis[fragii]; conditional so an
                                        ;  empty list is never dereferenced
        LDRGE   lr,  [sp, #4*6]         ; lr  = _frag_buf_offs
        BLT     ofcl_arm_end
        SUB     r2,  r2, #4             ; first word of a row post-increments by 4,
                                        ;  the second by _ystride-4 => next row
ofcl_arm_lp
        LDR     r11, [lr, r4, LSL #2]   ; r11 = _frag_buf_offs[_fragis[fragii]]
        SUBS    r12, r12, #1            ; flags consumed by LDRGE/BGE at loop end
        ; Stall (on XScale)
        ADD     r4,  r1, r11            ; r4  = _src_frame+frag_buf_off
        LDR     r6,  [r4], #4           ; row 0, first word
        ADD     r11, r0, r11            ; r11 = _dst_frame+frag_buf_off
        LDR     r5,  [r4], r2           ; row 0, second word
        STR     r6,  [r11], #4
        LDR     r6,  [r4], #4           ; row 1
        STR     r5,  [r11], r2
        LDR     r5,  [r4], r2
        STR     r6,  [r11], #4
        LDR     r6,  [r4], #4           ; row 2
        STR     r5,  [r11], r2
        LDR     r5,  [r4], r2
        STR     r6,  [r11], #4
        LDR     r6,  [r4], #4           ; row 3
        STR     r5,  [r11], r2
        LDR     r5,  [r4], r2
        STR     r6,  [r11], #4
        LDR     r6,  [r4], #4           ; row 4
        STR     r5,  [r11], r2
        LDR     r5,  [r4], r2
        STR     r6,  [r11], #4
        LDR     r6,  [r4], #4           ; row 5
        STR     r5,  [r11], r2
        LDR     r5,  [r4], r2
        STR     r6,  [r11], #4
        LDR     r6,  [r4], #4           ; row 6
        STR     r5,  [r11], r2
        LDR     r5,  [r4], r2
        STR     r6,  [r11], #4
        LDR     r6,  [r4], #4           ; row 7
        STR     r5,  [r11], r2
        LDR     r5,  [r4]               ; last word: no pointer update needed
        LDRGE   r4,  [r3], #4           ; r4  = _fragis[fragii] for the next pass
        STR     r6,  [r11], #4
        STR     r5,  [r11]
        BGE     ofcl_arm_lp
ofcl_arm_end
        LDMFD   sp!, {r4-r6,r11,pc}
        ENDP

  ;--------------------------------------------------------------------
  ; oc_frag_recon_intra_arm
  ;   Writes an 8x8 block of bytes: each output is the matching residue
  ;   value plus 128, clamped to the range 0..255.
  ;
  ; In:   r0 = unsigned char     *_dst
  ;       r1 = int                _ystride
  ;       r2 = const ogg_int16_t  _residue[64]
  ;
  ; r3 and r12 are used as scratch; r4, r5 and lr are saved and restored.
  ;
  ; Clamp idiom used for every pixel (r5 holds 255):
  ;       ADDS   x, x, #128       ; N set if the sum is negative
  ;       CMPGT  r5, x            ; only when sum > 0: compare 255 with it
  ;       EORLT  x, r5, x, ASR #32; sum < 0   -> 255^0xFFFFFFFF, low byte 0
  ;                               ; sum > 255 -> 255^0 = 255
  ; Only the low byte is stored, so the upper bits never matter.
  ;--------------------------------------------------------------------
oc_frag_recon_intra_arm PROC
        STMFD   sp!, {r4,r5,lr}
        MOV     lr,  #8                 ; lr = rows remaining
        MOV     r5,  #255               ; r5 = upper clamp bound
        SUB     r1,  r1, #7             ; 7 stores step by 1, the 8th by _ystride-7
ofrintra_lp_arm
        LDRSH   r3,  [r2], #2           ; residue for pixel 0
        LDRSH   r4,  [r2], #2           ; residue for pixel 1
        LDRSH   r12, [r2], #2           ; residue for pixel 2
        ADDS    r3,  r3, #128           ; pixel 0
        CMPGT   r5,  r3
        EORLT   r3,  r5, r3, ASR #32
        STRB    r3,  [r0], #1
        ADDS    r4,  r4, #128           ; pixel 1
        CMPGT   r5,  r4
        EORLT   r4,  r5, r4, ASR #32
        LDRSH   r3,  [r2], #2           ; residue for pixel 3
        STRB    r4,  [r0], #1
        ADDS    r12, r12, #128          ; pixel 2
        CMPGT   r5,  r12
        EORLT   r12, r5, r12, ASR #32
        LDRSH   r4,  [r2], #2           ; residue for pixel 4
        STRB    r12, [r0], #1
        ADDS    r3,  r3, #128           ; pixel 3
        CMPGT   r5,  r3
        EORLT   r3,  r5, r3, ASR #32
        LDRSH   r12, [r2], #2           ; residue for pixel 5
        STRB    r3,  [r0], #1
        ADDS    r4,  r4, #128           ; pixel 4
        CMPGT   r5,  r4
        EORLT   r4,  r5, r4, ASR #32
        LDRSH   r3,  [r2], #2           ; residue for pixel 6
        STRB    r4,  [r0], #1
        ADDS    r12, r12, #128          ; pixel 5
        CMPGT   r5,  r12
        EORLT   r12, r5, r12, ASR #32
        LDRSH   r4,  [r2], #2           ; residue for pixel 7
        STRB    r12, [r0], #1
        ADDS    r3,  r3, #128           ; pixel 6
        CMPGT   r5,  r3
        EORLT   r3,  r5, r3, ASR #32
        STRB    r3,  [r0], #1
        ADDS    r4,  r4, #128           ; pixel 7
        CMPGT   r5,  r4
        EORLT   r4,  r5, r4, ASR #32
        STRB    r4,  [r0], r1           ; advance _dst to the start of the next row
        SUBS    lr,  lr, #1
        BGT     ofrintra_lp_arm
        LDMFD   sp!, {r4,r5,pc}
        ENDP
  ;--------------------------------------------------------------------
  ; oc_frag_recon_inter_arm
  ;   Writes an 8x8 block of bytes: each output is the source byte plus
  ;   the matching residue value, clamped to the range 0..255.
  ;
  ; In:   r0 = unsigned char       *dst
  ;       r1 = const unsigned char *src
  ;       r2 = int                  ystride (applied to both dst and src)
  ;       r3 = const ogg_int16_t    residue[64]
  ;
  ; r12 is scratch; r5, r9-r11 and lr are saved and restored.
  ; Pixels alternate between the register pairs (r12,lr) and (r11,r10)
  ; so the loads for the next pixel overlap the clamp of the current one.
  ; Clamp idiom (r5 = 255): ADDS sets N for a negative sum; CMPGT then
  ; compares 255 with a positive sum; EORLT writes a value whose low
  ; byte is 0 (negative sum) or 255 (sum above 255).
  ;--------------------------------------------------------------------
oc_frag_recon_inter_arm PROC
        STMDB   sp!, {r5,r9-r11,lr}
        SUB     r2,  r2, #7             ; 7 accesses step by 1, the 8th by ystride-7
        MOV     r5,  #255               ; r5 = upper clamp bound
        MOV     r9,  #8                 ; r9 = rows remaining
ofrinter_arm_row
        LDRSH   r12, [r3], #2           ; residue 0
        LDRB    lr,  [r1], #1           ; src 0
        LDRSH   r11, [r3], #2           ; residue 1
        LDRB    r10, [r1], #1           ; src 1
        ADDS    r12, r12, lr            ; pixel 0
        CMPGT   r5,  r12
        EORLT   r12, r5, r12, ASR #32
        STRB    r12, [r0], #1
        ADDS    r11, r11, r10           ; pixel 1
        CMPGT   r5,  r11
        LDRSH   r12, [r3], #2           ; residue 2 (loads leave the flags alone)
        LDRB    lr,  [r1], #1           ; src 2
        EORLT   r11, r5, r11, ASR #32
        STRB    r11, [r0], #1
        ADDS    r12, r12, lr            ; pixel 2
        CMPGT   r5,  r12
        LDRSH   r11, [r3], #2           ; residue 3
        LDRB    r10, [r1], #1           ; src 3
        EORLT   r12, r5, r12, ASR #32
        STRB    r12, [r0], #1
        ADDS    r11, r11, r10           ; pixel 3
        CMPGT   r5,  r11
        LDRSH   r12, [r3], #2           ; residue 4
        LDRB    lr,  [r1], #1           ; src 4
        EORLT   r11, r5, r11, ASR #32
        STRB    r11, [r0], #1
        ADDS    r12, r12, lr            ; pixel 4
        CMPGT   r5,  r12
        LDRSH   r11, [r3], #2           ; residue 5
        LDRB    r10, [r1], #1           ; src 5
        EORLT   r12, r5, r12, ASR #32
        STRB    r12, [r0], #1
        ADDS    r11, r11, r10           ; pixel 5
        CMPGT   r5,  r11
        LDRSH   r12, [r3], #2           ; residue 6
        LDRB    lr,  [r1], #1           ; src 6
        EORLT   r11, r5, r11, ASR #32
        STRB    r11, [r0], #1
        ADDS    r12, r12, lr            ; pixel 6
        CMPGT   r5,  r12
        LDRSH   r11, [r3], #2           ; residue 7
        LDRB    r10, [r1], r2           ; src 7; src moves on to the next row
        EORLT   r12, r5, r12, ASR #32
        STRB    r12, [r0], #1
        ADDS    r11, r11, r10           ; pixel 7
        CMPGT   r5,  r11
        EORLT   r11, r5, r11, ASR #32
        STRB    r11, [r0], r2           ; dst moves on to the next row
        SUBS    r9,  r9, #1
        BGT     ofrinter_arm_row
        LDMIA   sp!, {r5,r9-r11,pc}
        ENDP
  ;--------------------------------------------------------------------
  ; oc_frag_recon_inter2_arm
  ;   Writes an 8x8 block of bytes: each output is
  ;   ((src1 + src2) >> 1) + residue, clamped to the range 0..255.
  ;
  ; In:   r0   = unsigned char       *dst
  ;       r1   = const unsigned char *src1
  ;       r2   = const unsigned char *src2
  ;       r3   = int                  ystride (applied to dst, src1, src2)
  ;       [sp] = const ogg_int16_t    residue[64]
  ;
  ; r12 holds the residue pointer; r4-r8 and lr are saved and restored.
  ; Pixels alternate between accumulators r5 and r7; r6 always carries
  ; the src2 byte and r4 the residue.
  ; Clamp idiom (r8 = 255): ADDS sets N for a negative sum; CMPGT then
  ; compares 255 with a positive sum; EORLT writes a value whose low
  ; byte is 0 (negative sum) or 255 (sum above 255).
  ;--------------------------------------------------------------------
oc_frag_recon_inter2_arm PROC
        LDR     r12, [sp]               ; r12 = residue (read before the push)
        STMDB   sp!, {r4-r8,lr}
        SUB     r3,  r3, #7             ; 7 accesses step by 1, the 8th by ystride-7
        MOV     r8,  #255               ; r8  = upper clamp bound
        MOV     lr,  #8                 ; lr  = rows remaining
ofrinter2_arm_row
        LDRB    r5,  [r1], #1           ; src1 0
        LDRB    r6,  [r2], #1           ; src2 0
        LDRSH   r4,  [r12], #2          ; residue 0
        LDRB    r7,  [r1], #1           ; src1 1
        ADD     r5,  r5, r6             ; pixel 0: src1+src2
        ADDS    r5,  r4, r5, LSR #1     ;          residue + average
        CMPGT   r8,  r5
        LDRB    r6,  [r2], #1           ; src2 1
        LDRSH   r4,  [r12], #2          ; residue 1
        EORLT   r5,  r8, r5, ASR #32
        STRB    r5,  [r0], #1
        ADD     r7,  r7, r6             ; pixel 1
        ADDS    r7,  r4, r7, LSR #1
        CMPGT   r8,  r7
        LDRB    r5,  [r1], #1           ; src1 2
        LDRB    r6,  [r2], #1           ; src2 2
        LDRSH   r4,  [r12], #2          ; residue 2
        EORLT   r7,  r8, r7, ASR #32
        STRB    r7,  [r0], #1
        ADD     r5,  r5, r6             ; pixel 2
        ADDS    r5,  r4, r5, LSR #1
        CMPGT   r8,  r5
        LDRB    r7,  [r1], #1           ; src1 3
        LDRB    r6,  [r2], #1           ; src2 3
        LDRSH   r4,  [r12], #2          ; residue 3
        EORLT   r5,  r8, r5, ASR #32
        STRB    r5,  [r0], #1
        ADD     r7,  r7, r6             ; pixel 3
        ADDS    r7,  r4, r7, LSR #1
        CMPGT   r8,  r7
        LDRB    r5,  [r1], #1           ; src1 4
        LDRB    r6,  [r2], #1           ; src2 4
        LDRSH   r4,  [r12], #2          ; residue 4
        EORLT   r7,  r8, r7, ASR #32
        STRB    r7,  [r0], #1
        ADD     r5,  r5, r6             ; pixel 4
        ADDS    r5,  r4, r5, LSR #1
        CMPGT   r8,  r5
        LDRB    r7,  [r1], #1           ; src1 5
        LDRB    r6,  [r2], #1           ; src2 5
        LDRSH   r4,  [r12], #2          ; residue 5
        EORLT   r5,  r8, r5, ASR #32
        STRB    r5,  [r0], #1
        ADD     r7,  r7, r6             ; pixel 5
        ADDS    r7,  r4, r7, LSR #1
        CMPGT   r8,  r7
        LDRB    r5,  [r1], #1           ; src1 6
        LDRB    r6,  [r2], #1           ; src2 6
        LDRSH   r4,  [r12], #2          ; residue 6
        EORLT   r7,  r8, r7, ASR #32
        STRB    r7,  [r0], #1
        ADD     r5,  r5, r6             ; pixel 6
        ADDS    r5,  r4, r5, LSR #1
        CMPGT   r8,  r5
        LDRB    r7,  [r1], r3           ; src1 7; src1 moves on to the next row
        LDRB    r6,  [r2], r3           ; src2 7; src2 moves on to the next row
        LDRSH   r4,  [r12], #2          ; residue 7
        EORLT   r5,  r8, r5, ASR #32
        STRB    r5,  [r0], #1
        ADD     r7,  r7, r6             ; pixel 7
        ADDS    r7,  r4, r7, LSR #1
        CMPGT   r8,  r7
        EORLT   r7,  r8, r7, ASR #32
        STRB    r7,  [r0], r3           ; dst moves on to the next row
        SUBS    lr,  lr, #1
        BGT     ofrinter2_arm_row
        LDMIA   sp!, {r4-r8,pc}
        ENDP
  274. [ OC_ARM_ASM_EDSP
  275. EXPORT oc_frag_copy_list_edsp
  ;--------------------------------------------------------------------
  ; oc_frag_copy_list_edsp
  ;   Copies a list of 8x8-byte fragments from one frame to another,
  ;   moving each 8-byte row with a single LDRD/STRD pair.
  ;
  ; In:   r0      = _dst_frame
  ;       r1      = _src_frame
  ;       r2      = _ystride
  ;       r3      = _fragis
  ;       [sp]    = _nfragis
  ;       [sp,#4] = _frag_buf_offs
  ;
  ; When _nfragis <= 0 neither _fragis nor _frag_buf_offs is touched.
  ; r12 is scratch; r4-r11 and lr are saved and restored.
  ; Each LDRD/STRD moves a register pair (r6:r7, r8:r9 or r10:r11).
  ; NOTE(review): LDRD/STRD presumably need 8-byte aligned addresses on
  ; the cores this targets - confirm against the callers.
  ;--------------------------------------------------------------------
oc_frag_copy_list_edsp PROC
        LDR     r12, [sp]               ; r12 = _nfragis
        STMDB   sp!, {r4-r11,lr}        ; 9 words pushed: stack args now at sp+36
        SUBS    r12, r12, #1            ; negative => empty list
        LDRGE   r5,  [r3], #4           ; r5  = _fragis[fragii]
        LDRGE   lr,  [sp, #4*10]        ; lr  = _frag_buf_offs
        BLT     ofcl_edsp_done
ofcl_edsp_next
        MOV     r4,  r1                 ; r4  = _src_frame (offset added below)
        LDR     r5,  [lr, r5, LSL #2]   ; r5  = _frag_buf_offs[_fragis[fragii]]
        SUBS    r12, r12, #1            ; flags consumed by LDRGE/BGE at loop end
        ; Stall (on XScale)
        LDRD    r6,  [r4, r5]!          ; row 0; r4 = _src_frame+frag_buf_off
        LDRD    r8,  [r4, r2]!          ; row 1
        ; Stall
        STRD    r6,  [r5, r0]!          ; row 0; r5 = _dst_frame+frag_buf_off
        STRD    r8,  [r5, r2]!          ; row 1
        ; Stall
        ; On XScale at least, three consecutive loads cause a stall, but
        ; that is no worse than doing only two and then needing another
        ; LDRD/STRD pair later on.
        LDRD    r6,  [r4, r2]!          ; row 2
        LDRD    r8,  [r4, r2]!          ; row 3
        LDRD    r10, [r4, r2]!          ; row 4
        ; Stall
        STRD    r6,  [r5, r2]!          ; row 2
        STRD    r8,  [r5, r2]!          ; row 3
        STRD    r10, [r5, r2]!          ; row 4
        LDRD    r6,  [r4, r2]!          ; row 5
        LDRD    r8,  [r4, r2]!          ; row 6
        LDRD    r10, [r4, r2]!          ; row 7
        STRD    r6,  [r5, r2]!          ; row 5
        STRD    r8,  [r5, r2]!          ; row 6
        STRD    r10, [r5, r2]!          ; row 7
        LDRGE   r5,  [r3], #4           ; r5  = _fragis[fragii] for the next pass
        BGE     ofcl_edsp_next
ofcl_edsp_done
        LDMIA   sp!, {r4-r11,pc}
        ENDP
  319. ]
  320. [ OC_ARM_ASM_MEDIA
  321. EXPORT oc_frag_recon_intra_v6
  322. EXPORT oc_frag_recon_inter_v6
  323. EXPORT oc_frag_recon_inter2_v6
  ;--------------------------------------------------------------------
  ; oc_frag_recon_intra_v6
  ;   Writes an 8x8 block of bytes: each output is the matching residue
  ;   value plus 128, clamped to 0..255, handling two pixels per
  ;   register with the ARMv6 packed-halfword instructions.
  ;
  ; In:   r0 = unsigned char     *_dst
  ;       r1 = int                _ystride
  ;       r2 = const ogg_int16_t  _residue[64]
  ;
  ; r2, r3 and r12 are used as scratch; r4-r6 and lr are saved and
  ; restored.  In the lane diagrams below each digit pair is one byte,
  ; most significant first, and "__" is a zero byte.
  ;--------------------------------------------------------------------
oc_frag_recon_intra_v6 PROC
        STMDB   sp!, {r4-r6,lr}
        LDR     r6,  =0x00800080        ; r6  = +128 in both halfword lanes
        MOV     r12, r2                 ; r12 = residue pointer (frees r2/r3)
        MOV     lr,  #8                 ; lr  = rows remaining
ofrintra_v6_row
        LDRD    r2,  [r12], #8          ; r2 = 11110000 r3 = 33332222
        LDRD    r4,  [r12], #8          ; r4 = 55554444 r5 = 77776666
        SUBS    lr,  lr, #1             ; nothing below alters NZCV before BGT
        QADD16  r2,  r2, r6             ; saturating +128 per halfword
        QADD16  r3,  r3, r6
        QADD16  r4,  r4, r6
        QADD16  r5,  r5, r6
        USAT16  r2,  #8, r2             ; r2 = __11__00  (each lane now 0..255)
        USAT16  r3,  #8, r3             ; r3 = __33__22
        USAT16  r4,  #8, r4             ; r4 = __55__44
        USAT16  r5,  #8, r5             ; r5 = __77__66
        ORR     r2,  r2, r2, LSR #8     ; r2 = __111100  (low halfword = 1100)
        ORR     r3,  r3, r3, LSR #8     ; r3 = __333322
        ORR     r4,  r4, r4, LSR #8     ; r4 = __555544
        ORR     r5,  r5, r5, LSR #8     ; r5 = __777766
        PKHBT   r2,  r2, r3, LSL #16    ; r2 = 33221100
        PKHBT   r3,  r4, r5, LSL #16    ; r3 = 77665544
        STRD    r2,  [r0], r1           ; store 8 pixels, step to the next row
        BGT     ofrintra_v6_row
        LDMIA   sp!, {r4-r6,pc}
        ENDP
  ;--------------------------------------------------------------------
  ; oc_frag_recon_inter_v6
  ;   Writes an 8x8 block of bytes: each output is the source byte plus
  ;   the matching residue value, clamped to 0..255, handling two
  ;   pixels per register with the ARMv6 packed-halfword instructions.
  ;
  ; In:   r0 = unsigned char       *_dst
  ;       r1 = const unsigned char *_src
  ;       r2 = int                  _ystride (applied to _dst and _src)
  ;       r3 = const ogg_int16_t    _residue[64]
  ;
  ; r12 is scratch; r4-r7 and lr are saved and restored.
  ; The source row is fetched with one LDRD when OC_ARM_CAN_UNALIGN_LDRD
  ; is set, otherwise with two LDRs.
  ; In the lane diagrams below each digit pair is one byte, most
  ; significant first; "__" is a zero byte, "xx" an unclamped high byte.
  ;--------------------------------------------------------------------
oc_frag_recon_inter_v6 PROC
        STMDB   sp!, {r4-r7,lr}
        MOV     lr,  #8                 ; lr  = rows remaining
ofrinter_v6_row
        LDRD    r6,  [r3], #8           ; r6 = 11110000 r7 = 33332222
        SUBS    lr,  lr, #1             ; nothing below alters NZCV before BGT
 [ OC_ARM_CAN_UNALIGN_LDRD
        LDRD    r4,  [r1], r2           ; Unaligned ; r4 = 33221100 r5 = 77665544
 |
        LDR     r5,  [r1, #4]           ; r5 = 77665544
        LDR     r4,  [r1], r2           ; r4 = 33221100; _src to the next row
 ]
        PKHBT   r12, r6, r7, LSL #16    ; r12= 22220000  (even residues)
        PKHTB   r7,  r7, r6, ASR #16    ; r7 = 33331111  (odd residues)
        UXTB16  r6,  r4                 ; r6 = __22__00  (even source bytes)
        UXTB16  r4,  r4, ROR #8         ; r4 = __33__11  (odd source bytes)
        QADD16  r12, r12, r6            ; r12= xx22xx00
        QADD16  r4,  r7, r4             ; r4 = xx33xx11
        LDRD    r6,  [r3], #8           ; r6 = 55554444 r7 = 77776666
        USAT16  r4,  #8, r4             ; r4 = __33__11
        USAT16  r12, #8, r12            ; r12= __22__00
        ORR     r4,  r12, r4, LSL #8    ; r4 = 33221100
        PKHBT   r12, r6, r7, LSL #16    ; r12= 66664444
        PKHTB   r7,  r7, r6, ASR #16    ; r7 = 77775555
        UXTB16  r6,  r5                 ; r6 = __66__44
        UXTB16  r5,  r5, ROR #8         ; r5 = __77__55
        QADD16  r12, r12, r6            ; r12= xx66xx44
        QADD16  r5,  r7, r5             ; r5 = xx77xx55
        USAT16  r12, #8, r12            ; r12= __66__44
        USAT16  r5,  #8, r5             ; r5 = __77__55
        ORR     r5,  r12, r5, LSL #8    ; r5 = 77665544
        STRD    r4,  [r0], r2           ; store 8 pixels, step to the next row
        BGT     ofrinter_v6_row
        LDMIA   sp!, {r4-r7,pc}
        ENDP
  ;--------------------------------------------------------------------
  ; oc_frag_recon_inter2_v6
  ;   Writes an 8x8 block of bytes: each output is the halved sum of the
  ;   two source bytes (UHADD8) plus the matching residue value, clamped
  ;   to 0..255.
  ;
  ; In:   r0   = unsigned char       *_dst
  ;       r1   = const unsigned char *_src1
  ;       r2   = const unsigned char *_src2
  ;       r3   = int                  _ystride (applied to all three)
  ;       [sp] = const ogg_int16_t    _residue[64]
  ;
  ; r12 holds the residue pointer; r4-r9 and lr are saved and restored.
  ; The upper four pixels of each row are processed first, then the
  ; lower four, and the row is stored as the pair r8:r9.
  ; In the lane diagrams below each digit pair is one byte, most
  ; significant first; "__" is a zero byte, "xx" an unclamped high byte.
  ;--------------------------------------------------------------------
oc_frag_recon_inter2_v6 PROC
        LDR     r12, [sp]               ; r12 = _residue (read before the push)
        STMDB   sp!, {r4-r9,lr}
        MOV     lr,  #8                 ; lr  = rows remaining
ofrinter2_v6_row
        LDRD    r6,  [r12, #8]          ; r6 = 55554444 r7 = 77776666
        SUBS    lr,  lr, #1             ; nothing below alters NZCV before BGT
        LDR     r4,  [r1, #4]           ; Unaligned ; r4 = src1[1] = 77665544
        LDR     r5,  [r2, #4]           ; Unaligned ; r5 = src2[1] = 77665544
        PKHBT   r8,  r6, r7, LSL #16    ; r8 = 66664444
        PKHTB   r9,  r7, r6, ASR #16    ; r9 = 77775555
        UHADD8  r4,  r4, r5             ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
        UXTB16  r5,  r4                 ; r5 = __66__44
        UXTB16  r4,  r4, ROR #8         ; r4 = __77__55
        QADD16  r8,  r8, r5             ; r8 = xx66xx44
        QADD16  r9,  r9, r4             ; r9 = xx77xx55
        LDRD    r6,  [r12], #16         ; r6 = 11110000 r7 = 33332222
                                        ;  (and step past the whole row)
        USAT16  r8,  #8, r8             ; r8 = __66__44
        LDR     r4,  [r1], r3           ; Unaligned ; r4 = src1[0] = 33221100
        USAT16  r9,  #8, r9             ; r9 = __77__55
        LDR     r5,  [r2], r3           ; Unaligned ; r5 = src2[0] = 33221100
        ORR     r9,  r8, r9, LSL #8     ; r9 = 77665544
        PKHBT   r8,  r6, r7, LSL #16    ; r8 = 22220000
        UHADD8  r4,  r4, r5             ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
        PKHTB   r7,  r7, r6, ASR #16    ; r7 = 33331111
        UXTB16  r5,  r4                 ; r5 = __22__00
        UXTB16  r4,  r4, ROR #8         ; r4 = __33__11
        QADD16  r8,  r8, r5             ; r8 = xx22xx00
        QADD16  r7,  r7, r4             ; r7 = xx33xx11
        USAT16  r8,  #8, r8             ; r8 = __22__00
        USAT16  r7,  #8, r7             ; r7 = __33__11
        ORR     r8,  r8, r7, LSL #8     ; r8 = 33221100
        STRD    r8,  [r0], r3           ; store 8 pixels, step to the next row
        BGT     ofrinter2_v6_row
        LDMIA   sp!, {r4-r9,pc}
        ENDP
  434. ]
  435. [ OC_ARM_ASM_NEON
  436. EXPORT oc_frag_copy_list_neon
  437. EXPORT oc_frag_recon_intra_neon
  438. EXPORT oc_frag_recon_inter_neon
  439. EXPORT oc_frag_recon_inter2_neon
  ;--------------------------------------------------------------------
  ; oc_frag_copy_list_neon
  ;   Copies a list of 8x8-byte fragments from one frame to another,
  ;   holding a whole fragment in D0-D7 and prefetching the next one.
  ;
  ; In:   r0      = _dst_frame
  ;       r1      = _src_frame
  ;       r2      = _ystride
  ;       r3      = _fragis
  ;       [sp]    = _nfragis
  ;       [sp,#4] = _frag_buf_offs
  ;
  ; When _nfragis < 1 neither _fragis nor _frag_buf_offs is touched.
  ; r12 is scratch; r4-r7 and lr are saved and restored; D0-D7 are
  ; overwritten.  Every row address carries the @64 alignment hint, so
  ; source and destination rows must be 8-byte aligned.
  ;
  ; r7 is the prefetch cursor.  It only moves while another fragment
  ; follows (the GT-conditional instructions), so it must start out as a
  ; real address: it is seeded with the first fragment's source address.
  ; (It used to be seeded with the bare buffer offset, which made every
  ; PLD of a one-fragment list point outside the frame.)
  ;--------------------------------------------------------------------
oc_frag_copy_list_neon PROC
        LDR     r12, [sp]               ; r12 = _nfragis
        STMFD   sp!, {r4-r7,lr}         ; 5 words pushed: stack args now at sp+20
        CMP     r12, #1
        LDRGE   r6,  [r3]               ; r6  = _fragis[fragii]
        LDRGE   lr,  [sp, #4*6]         ; lr  = _frag_buf_offs
        BLT     ofcl_neon_end
        ; Stall (2 on Xscale)
        LDR     r6,  [lr, r6, LSL #2]   ; r6  = _frag_buf_offs[_fragis[fragii]]
        ; Stall (on XScale)
        ADD     r7,  r1, r6             ; Guarantee PLD points somewhere valid.
ofcl_neon_lp
        ADD     r4,  r1, r6             ; r4  = _src_frame+frag_buf_off
        VLD1.64 {D0}, [r4@64], r2
        ADD     r5,  r0, r6             ; r5  = _dst_frame+frag_buf_off
        VLD1.64 {D1}, [r4@64], r2
        SUBS    r12, r12, #1            ; GT => another fragment follows
        VLD1.64 {D2}, [r4@64], r2
        LDRGT   r6,  [r3, #4]!          ; r6  = _fragis[fragii] (next)
        VLD1.64 {D3}, [r4@64], r2
        LDRGT   r6,  [lr, r6, LSL #2]   ; r6  = _frag_buf_offs[_fragis[fragii]]
        VLD1.64 {D4}, [r4@64], r2
        ADDGT   r7,  r1, r6             ; r7  = next fragment's source, row 0
        VLD1.64 {D5}, [r4@64], r2
        PLD     [r7]                    ; next fragment, row 0
        VLD1.64 {D6}, [r4@64], r2
        PLD     [r7, r2]                ; row 1
        VLD1.64 {D7}, [r4@64]
        PLD     [r7, r2, LSL #1]        ; row 2
        VST1.64 {D0}, [r5@64], r2
        ADDGT   r7,  r7, r2, LSL #2     ; r7 += 4 rows
        VST1.64 {D1}, [r5@64], r2
        PLD     [r7, -r2]               ; row 3
        VST1.64 {D2}, [r5@64], r2
        PLD     [r7]                    ; row 4
        VST1.64 {D3}, [r5@64], r2
        PLD     [r7, r2]                ; row 5
        VST1.64 {D4}, [r5@64], r2
        PLD     [r7, r2, LSL #1]        ; row 6
        VST1.64 {D5}, [r5@64], r2
        ADDGT   r7,  r7, r2, LSL #2     ; r7 += 4 rows
        VST1.64 {D6}, [r5@64], r2
        PLD     [r7, -r2]               ; row 7
        VST1.64 {D7}, [r5@64]
        BGT     ofcl_neon_lp
ofcl_neon_end
        LDMFD   sp!, {r4-r7,pc}
        ENDP
  ;--------------------------------------------------------------------
  ; oc_frag_recon_intra_neon
  ;   Writes an 8x8 block of bytes: each output is the matching residue
  ;   value plus 128, clamped to 0..255.  All 64 residues are loaded at
  ;   once; one Q register holds one row.
  ;
  ; In:   r0 = unsigned char     *_dst
  ;       r1 = int                _ystride
  ;       r2 = const ogg_int16_t  _residue[64]
  ;
  ; Leaf routine: nothing is pushed.  Overwrites Q0 and D16-D31; r0 is
  ; advanced past the block.  The stores carry the @64 hint, so every
  ; destination row must be 8-byte aligned.
  ; Each narrowing result lands in a D register whose previous contents
  ; were already consumed by an earlier VQMOVUN.
  ;--------------------------------------------------------------------
oc_frag_recon_intra_neon PROC
        VLDMIA          r2, {D16-D31}   ; D16= 3333222211110000 etc ; 9(8) cycles
        VMOV.I16        Q0, #128        ; +128 bias in every 16-bit lane
        VQADD.S16       Q8,  Q8,  Q0    ; row 0
        VQADD.S16       Q9,  Q9,  Q0    ; row 1
        VQADD.S16       Q10, Q10, Q0    ; row 2
        VQADD.S16       Q11, Q11, Q0    ; row 3
        VQADD.S16       Q12, Q12, Q0    ; row 4
        VQADD.S16       Q13, Q13, Q0    ; row 5
        VQADD.S16       Q14, Q14, Q0    ; row 6
        VQADD.S16       Q15, Q15, Q0    ; row 7
        VQMOVUN.S16     D16, Q8         ; row 0 saturated to bytes ; 1 cycle
        VQMOVUN.S16     D17, Q9         ; row 1 ; 1 cycle
        VQMOVUN.S16     D18, Q10        ; row 2 ; 1 cycle
        VST1.64         {D16}, [r0@64], r1
        VQMOVUN.S16     D19, Q11        ; row 3 ; 1 cycle
        VST1.64         {D17}, [r0@64], r1
        VQMOVUN.S16     D20, Q12        ; row 4 ; 1 cycle
        VST1.64         {D18}, [r0@64], r1
        VQMOVUN.S16     D21, Q13        ; row 5 ; 1 cycle
        VST1.64         {D19}, [r0@64], r1
        VQMOVUN.S16     D22, Q14        ; row 6 ; 1 cycle
        VST1.64         {D20}, [r0@64], r1
        VQMOVUN.S16     D23, Q15        ; row 7 ; 1 cycle
        VST1.64         {D21}, [r0@64], r1
        VST1.64         {D22}, [r0@64], r1
        VST1.64         {D23}, [r0@64], r1
        MOV             pc, lr
        ENDP
  ;--------------------------------------------------------------------
  ; oc_frag_recon_inter_neon
  ;   Writes an 8x8 block of bytes: each output is the source byte plus
  ;   the matching residue value, clamped to 0..255.  All 64 residues
  ;   are loaded at once; one Q register holds one row.
  ;
  ; In:   r0 = unsigned char       *_dst
  ;       r1 = const unsigned char *_src
  ;       r2 = int                  _ystride (applied to _dst and _src)
  ;       r3 = const ogg_int16_t    _residue[64]
  ;
  ; Leaf routine: nothing is pushed.  Overwrites Q0-Q3 and D16-D31; r0
  ; and r1 are advanced past the block.  The stores carry the @64 hint
  ; (destination rows must be 8-byte aligned); the source loads carry
  ; no alignment hint.
  ; Source rows are processed in two batches of four: each is loaded
  ; into the low half of Q0..Q3 and widened in place to 16 bits.
  ;--------------------------------------------------------------------
oc_frag_recon_inter_neon PROC
        VLDMIA          r3, {D16-D31}   ; D16= 3333222211110000 etc ; 9(8) cycles
        VLD1.64         {D0}, [r1], r2  ; src row 0
        VLD1.64         {D2}, [r1], r2  ; src row 1
        VMOVL.U8        Q0, D0          ; Q0 = __77__66__55__44__33__22__11__00
        VLD1.64         {D4}, [r1], r2  ; src row 2
        VMOVL.U8        Q1, D2          ; etc
        VLD1.64         {D6}, [r1], r2  ; src row 3
        VMOVL.U8        Q2, D4
        VMOVL.U8        Q3, D6
        VQADD.S16       Q8,  Q8,  Q0    ; row 0: residue + src
        VLD1.64         {D0}, [r1], r2  ; src row 4
        VQADD.S16       Q9,  Q9,  Q1    ; row 1
        VLD1.64         {D2}, [r1], r2  ; src row 5
        VQADD.S16       Q10, Q10, Q2    ; row 2
        VLD1.64         {D4}, [r1], r2  ; src row 6
        VQADD.S16       Q11, Q11, Q3    ; row 3
        VLD1.64         {D6}, [r1], r2  ; src row 7
        VMOVL.U8        Q0, D0
        VMOVL.U8        Q1, D2
        VMOVL.U8        Q2, D4
        VMOVL.U8        Q3, D6
        VQADD.S16       Q12, Q12, Q0    ; row 4
        VQADD.S16       Q13, Q13, Q1    ; row 5
        VQADD.S16       Q14, Q14, Q2    ; row 6
        VQADD.S16       Q15, Q15, Q3    ; row 7
        VQMOVUN.S16     D16, Q8         ; row 0 saturated to bytes
        VQMOVUN.S16     D17, Q9         ; row 1
        VQMOVUN.S16     D18, Q10        ; row 2
        VST1.64         {D16}, [r0@64], r2
        VQMOVUN.S16     D19, Q11        ; row 3
        VST1.64         {D17}, [r0@64], r2
        VQMOVUN.S16     D20, Q12        ; row 4
        VST1.64         {D18}, [r0@64], r2
        VQMOVUN.S16     D21, Q13        ; row 5
        VST1.64         {D19}, [r0@64], r2
        VQMOVUN.S16     D22, Q14        ; row 6
        VST1.64         {D20}, [r0@64], r2
        VQMOVUN.S16     D23, Q15        ; row 7
        VST1.64         {D21}, [r0@64], r2
        VST1.64         {D22}, [r0@64], r2
        VST1.64         {D23}, [r0@64], r2
        MOV             pc, lr
        ENDP
  ;--------------------------------------------------------------------
  ; oc_frag_recon_inter2_neon
  ;   Writes an 8x8 block of bytes: each output is the halved sum of the
  ;   two source bytes (VHADD.U8) plus the matching residue value,
  ;   clamped to 0..255.  All 64 residues are loaded at once; one Q
  ;   register holds one row.
  ;
  ; In:   r0   = unsigned char       *_dst
  ;       r1   = const unsigned char *_src1
  ;       r2   = const unsigned char *_src2
  ;       r3   = int                  _ystride (applied to all three)
  ;       [sp] = const ogg_int16_t    _residue[64]
  ;
  ; Leaf routine: nothing is pushed.  Overwrites r12, Q0-Q3 and D16-D31;
  ; r0-r2 are advanced past the block.  The stores carry the @64 hint
  ; (destination rows must be 8-byte aligned); the source loads carry
  ; no alignment hint.
  ; Rows are averaged two at a time: Q0/Q1 collect pairs of _src1 rows,
  ; Q2/Q3 the matching _src2 rows, and each VHADD result is then widened
  ; to 16 bits one row (D register) at a time.
  ;--------------------------------------------------------------------
oc_frag_recon_inter2_neon PROC
        LDR             r12, [sp]       ; r12 = _residue
        VLDMIA          r12, {D16-D31}  ; Q8..Q15 = residue rows 0..7
        VLD1.64         {D0}, [r1], r3  ; src1 row 0
        VLD1.64         {D4}, [r2], r3  ; src2 row 0
        VLD1.64         {D1}, [r1], r3  ; src1 row 1
        VLD1.64         {D5}, [r2], r3  ; src2 row 1
        VHADD.U8        Q2, Q0, Q2      ; Q2 = FFEEDDCCBBAA99887766554433221100
        VLD1.64         {D2}, [r1], r3  ; src1 row 2
        VLD1.64         {D6}, [r2], r3  ; src2 row 2
        VMOVL.U8        Q0, D4          ; Q0 = __77__66__55__44__33__22__11__00
        VLD1.64         {D3}, [r1], r3  ; src1 row 3
        VMOVL.U8        Q2, D5          ; etc
        VLD1.64         {D7}, [r2], r3  ; src2 row 3
        VHADD.U8        Q3, Q1, Q3      ; averages for rows 2 and 3
        VQADD.S16       Q8,  Q8,  Q0    ; row 0: residue + average
        VQADD.S16       Q9,  Q9,  Q2    ; row 1
        VLD1.64         {D0}, [r1], r3  ; src1 row 4
        VMOVL.U8        Q1, D6
        VLD1.64         {D4}, [r2], r3  ; src2 row 4
        VMOVL.U8        Q3, D7
        VLD1.64         {D1}, [r1], r3  ; src1 row 5
        VQADD.S16       Q10, Q10, Q1    ; row 2
        VLD1.64         {D5}, [r2], r3  ; src2 row 5
        VQADD.S16       Q11, Q11, Q3    ; row 3
        VLD1.64         {D2}, [r1], r3  ; src1 row 6
        VHADD.U8        Q2, Q0, Q2      ; averages for rows 4 and 5
        VLD1.64         {D6}, [r2], r3  ; src2 row 6
        VLD1.64         {D3}, [r1], r3  ; src1 row 7
        VMOVL.U8        Q0, D4
        VLD1.64         {D7}, [r2], r3  ; src2 row 7
        VMOVL.U8        Q2, D5
        VHADD.U8        Q3, Q1, Q3      ; averages for rows 6 and 7
        VQADD.S16       Q12, Q12, Q0    ; row 4
        VQADD.S16       Q13, Q13, Q2    ; row 5
        VMOVL.U8        Q1, D6
        VMOVL.U8        Q3, D7
        VQADD.S16       Q14, Q14, Q1    ; row 6
        VQADD.S16       Q15, Q15, Q3    ; row 7
        VQMOVUN.S16     D16, Q8         ; row 0 saturated to bytes
        VQMOVUN.S16     D17, Q9         ; row 1
        VQMOVUN.S16     D18, Q10        ; row 2
        VST1.64         {D16}, [r0@64], r3
        VQMOVUN.S16     D19, Q11        ; row 3
        VST1.64         {D17}, [r0@64], r3
        VQMOVUN.S16     D20, Q12        ; row 4
        VST1.64         {D18}, [r0@64], r3
        VQMOVUN.S16     D21, Q13        ; row 5
        VST1.64         {D19}, [r0@64], r3
        VQMOVUN.S16     D22, Q14        ; row 6
        VST1.64         {D20}, [r0@64], r3
        VQMOVUN.S16     D23, Q15        ; row 7
        VST1.64         {D21}, [r0@64], r3
        VST1.64         {D22}, [r0@64], r3
        VST1.64         {D23}, [r0@64], r3
        MOV             pc, lr
        ENDP
  636. ]
  637. END