armloop.s 23 KB

; (extraction artifact removed: this line was the page's line-number gutter,
;  the digits 1-677 run together, and is not part of the assembly source)
  1. ;********************************************************************
  2. ;* *
  3. ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
  4. ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
  5. ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  6. ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
  7. ;* *
  8. ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
  9. ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  10. ;* *
  11. ;********************************************************************
  12. ; Original implementation:
  13. ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
  14. ; last mod: $Id$
  15. ;********************************************************************
  16. AREA |.text|, CODE, READONLY
  17. GET armopts.s
  18. EXPORT oc_loop_filter_frag_rows_arm
  19. ; Which bit this is depends on the order of packing within a bitfield.
  20. ; Hopefully that doesn't change among any of the relevant compilers.
  21. OC_FRAG_CODED_FLAG * 1
  22. ; Vanilla ARM v4 version
  23. loop_filter_h_arm PROC
; Scalar ARMv4 horizontal loop filter: filters one 8-row vertical block
; edge, one row per loop iteration.
  24. ; r0 = unsigned char *_pix
  25. ; r1 = int _ystride
  26. ; r2 = int *_bv
  27. ; preserves r0-r3
; _pix points at the pixel just right of the edge (_pix[2] in the row's
; 4-pixel window).  _bv is the lflim() lookup table, already biased by
; +127 by the caller (see oc_loop_filter_frag_rows_arm) so it can be
; indexed directly by the signed filter value.
  28. STMFD r13!,{r3-r6,r14}
  29. MOV r14,#8 ; r14= row counter (8 rows)
  30. MOV r6, #255 ; r6 = clamp mask constant
  31. lfh_arm_lp
  32. LDRB r3, [r0, #-2] ; r3 = _pix[0]
  33. LDRB r12,[r0, #1] ; r12= _pix[3]
  34. LDRB r4, [r0, #-1] ; r4 = _pix[1]
  35. LDRB r5, [r0] ; r5 = _pix[2]
  36. SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4
  37. ADD r3, r3, #4
  38. SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
  39. ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
  40. ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
  41. MOV r12,r12,ASR #3 ; r12= raw filter value f
  42. LDRSB r12,[r2, r12] ; r12= lflim(f,L) from the biased table
  43. ; Stall (2 on Xscale)
; Clamp the adjusted pixels to [0,255] without branches: if the ADDS/SUBS
; result is negative, r4 ASR #32 is all-ones and the EOR leaves a value
; whose low byte is 0; if CMPGT finds it above 255, the shift mask is 0
; and the EOR yields 255.  Only the low byte is stored, so the high bits
; are irrelevant.
  44. ADDS r4, r4, r12
  45. CMPGT r6, r4
  46. EORLT r4, r6, r4, ASR #32
  47. SUBS r5, r5, r12
  48. CMPGT r6, r5
  49. EORLT r5, r6, r5, ASR #32
  50. STRB r4, [r0, #-1]
  51. STRB r5, [r0], r1 ; store _pix[2], advance to the next row
  52. SUBS r14,r14,#1
  53. BGT lfh_arm_lp
  54. SUB r0, r0, r1, LSL #3 ; undo the 8 strides so r0 is preserved
  55. LDMFD r13!,{r3-r6,PC}
  56. ENDP
  57. loop_filter_v_arm PROC
; Scalar ARMv4 vertical loop filter: filters one 8-column horizontal
; block edge, one column per loop iteration.
  58. ; r0 = unsigned char *_pix
  59. ; r1 = int _ystride
  60. ; r2 = int *_bv
  61. ; preserves r0-r3
; _pix points at the row just below the edge; the 4-pixel window for each
; column is at offsets -2*_ystride, -_ystride, 0, +_ystride.  _bv is the
; +127-biased lflim() table, as in loop_filter_h_arm.
  62. STMFD r13!,{r3-r6,r14}
  63. MOV r14,#8 ; r14= column counter (8 columns)
  64. MOV r6, #255 ; r6 = clamp mask constant
  65. lfv_arm_lp
  66. LDRB r3, [r0, -r1, LSL #1] ; r3 = _pix[0]
  67. LDRB r12,[r0, r1] ; r12= _pix[3]
  68. LDRB r4, [r0, -r1] ; r4 = _pix[1]
  69. LDRB r5, [r0] ; r5 = _pix[2]
  70. SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4
  71. ADD r3, r3, #4
  72. SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
  73. ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
  74. ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
  75. MOV r12,r12,ASR #3 ; r12= raw filter value f
  76. LDRSB r12,[r2, r12] ; r12= lflim(f,L) from the biased table
  77. ; Stall (2 on Xscale)
; Branchless clamp to [0,255]; see loop_filter_h_arm for how the
; CMPGT/EORLT pair forces the stored byte to 0 or 255 on overflow.
  78. ADDS r4, r4, r12
  79. CMPGT r6, r4
  80. EORLT r4, r6, r4, ASR #32
  81. SUBS r5, r5, r12
  82. CMPGT r6, r5
  83. EORLT r5, r6, r5, ASR #32
  84. STRB r4, [r0, -r1]
  85. STRB r5, [r0], #1 ; store _pix[2], advance to the next column
  86. SUBS r14,r14,#1
  87. BGT lfv_arm_lp
  88. SUB r0, r0, #8 ; undo the 8 column steps so r0 is preserved
  89. LDMFD r13!,{r3-r6,PC}
  90. ENDP
  91. oc_loop_filter_frag_rows_arm PROC
; Applies the loop filter to every coded fragment in the rows
; [_fragi0,_fragi0_end), filtering each fragment's left and top edges,
; plus its right/bottom edge when the neighbor there is not coded.
; Args 1-4 arrive in r0-r3; args 5-10 are loaded from the caller's stack
; (via r12, captured before the push) into r4-r9:
  92. ; r0 = _ref_frame_data
  93. ; r1 = _ystride
  94. ; r2 = _bv
  95. ; r3 = _frags
  96. ; r4 = _fragi0
  97. ; r5 = _fragi0_end
  98. ; r6 = _fragi_top
  99. ; r7 = _fragi_bot
  100. ; r8 = _frag_buf_offs
  101. ; r9 = _nhfrags
  102. MOV r12,r13
  103. STMFD r13!,{r0,r4-r11,r14} ; [r13] keeps _ref_frame_data for the loop
  104. LDMFD r12,{r4-r9}
  105. ADD r2, r2, #127 ; _bv += 127
; (biases the table so the filter routines can index it with a signed
; value directly)
  106. CMP r4, r5 ; if(_fragi0>=_fragi0_end)
  107. BGE oslffri_arm_end ; bail
  108. SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
  109. BLE oslffri_arm_end ; bail
  110. ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
  111. ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
  112. SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
  113. oslffri_arm_lp1
  114. MOV r10,r4 ; r10= fragi = _fragi0
  115. ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
  116. oslffri_arm_lp2
  117. LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
  118. LDR r0, [r13] ; r0 = _ref_frame_data
  119. LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
  120. TST r14,#OC_FRAG_CODED_FLAG
  121. BEQ oslffri_arm_uncoded
  122. CMP r10,r4 ; if (fragi>_fragi0)
  123. ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
  124. BLGT loop_filter_h_arm ; left edge (not the leftmost fragment)
  125. CMP r4, r6 ; if (_fragi0>_fragi_top)
  126. BLGT loop_filter_v_arm ; top edge (not the topmost row)
  127. CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
  128. LDRLT r12,[r3] ; r12 = _frags[fragi+1]
  129. ADD r0, r0, #8 ; advance to the right edge
  130. ADD r10,r10,#1 ; r10 = fragi+1;
  131. ANDLT r12,r12,#OC_FRAG_CODED_FLAG
  132. CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
  133. BLLT loop_filter_h_arm ; right edge, only if neighbor is uncoded
  134. CMP r10,r7 ; if (fragi<_fragi_bot)
  135. LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
  136. SUB r0, r0, #8
  137. ADD r0, r0, r1, LSL #3 ; advance to the bottom edge
  138. ANDLT r12,r12,#OC_FRAG_CODED_FLAG
  139. CMPLT r12,#OC_FRAG_CODED_FLAG
  140. BLLT loop_filter_v_arm ; bottom edge, only if neighbor is uncoded
  141. CMP r10,r11 ; while(fragi<=fragi_end-1)
  142. BLE oslffri_arm_lp2
  143. MOV r4, r10 ; r4 = fragi0 += _nhfrags
  144. CMP r4, r5
  145. BLT oslffri_arm_lp1
  146. oslffri_arm_end
  147. LDMFD r13!,{r0,r4-r11,PC}
  148. oslffri_arm_uncoded
; Skipped (uncoded) fragment: just advance the counter and re-test the
; same loop conditions as the coded path.
  149. ADD r10,r10,#1
  150. CMP r10,r11
  151. BLE oslffri_arm_lp2
  152. MOV r4, r10 ; r4 = _fragi0 += _nhfrags
  153. CMP r4, r5
  154. BLT oslffri_arm_lp1
  155. LDMFD r13!,{r0,r4-r11,PC}
  156. ENDP
  157. [ OC_ARM_ASM_MEDIA
  158. EXPORT oc_loop_filter_init_v6
  159. EXPORT oc_loop_filter_frag_rows_v6
  160. oc_loop_filter_init_v6 PROC
; Precomputes the per-byte limit constant used by the v6 filters:
; stores ll=(255-2*L)&0xFF replicated into all four bytes of *(int*)_bv,
; which the UQADD8/UQSUB8 sequences below use to evaluate lflim().
  161. ; r0 = _bv
  162. ; r1 = _flimit (=L from the spec)
  163. MVN r1, r1, LSL #1 ; r1 = <0xFFFFFF|255-2*L>
  164. AND r1, r1, #255 ; r1 = ll=r1&0xFF
  165. ORR r1, r1, r1, LSL #8 ; r1 = <ll|ll>
  166. PKHBT r1, r1, r1, LSL #16 ; r1 = <ll|ll|ll|ll>
  167. STR r1, [r0]
  168. MOV PC,r14
  169. ENDP
  170. ; We could use the same strategy as the v filter below, but that would require
  171. ; 40 instructions to load the data and transpose it into columns and another
  172. ; 32 to write out the results at the end, plus the 52 instructions to do the
  173. ; filtering itself.
  174. ; This is slightly less, and less code, even assuming we could have shared the
  175. ; 52 instructions in the middle with the other function.
  176. ; It executes slightly fewer instructions than the ARMv6 approach David Conrad
  177. ; proposed for FFmpeg, but not by much:
  178. ; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
  179. ; His is a lot less code, though, because it only does two rows at once instead
  180. ; of four.
  181. loop_filter_h_v6 PROC
; ARMv6 horizontal filter for a full 8-row edge: runs the 4-row core
; twice, stepping down 4*_ystride in between and restoring r0 after.
  182. ; r0 = unsigned char *_pix
  183. ; r1 = int _ystride
  184. ; r2 = int _ll
  185. ; preserves r0-r3
  186. STMFD r13!,{r4-r11,r14}
  187. LDR r12,=0x10003 ; SMLAD multiplier <1|3> shared by both core calls
  188. BL loop_filter_h_core_v6 ; rows 0-3
  189. ADD r0, r0, r1, LSL #2
  190. BL loop_filter_h_core_v6 ; rows 4-7
  191. SUB r0, r0, r1, LSL #2 ; restore r0
  192. LDMFD r13!,{r4-r11,PC}
  193. ENDP
  194. loop_filter_h_core_v6 PROC
; Filters 4 rows (p,q,r,s) across a vertical edge using the ARMv6 SIMD
; media instructions.  Loads each row as a word, splits it into 16-bit
; lanes, computes all four filter values at once, applies lflim() with
; saturating byte ops, and stores the two modified bytes of each row.
  195. ; r0 = unsigned char *_pix
  196. ; r1 = int _ystride
  197. ; r2 = int _ll
  198. ; r12= 0x10003
  199. ; Preserves r0-r3, r12; Clobbers r4-r11.
  200. LDR r4,[r0, #-2]! ; r4 = <p3|p2|p1|p0>
  201. ; Single issue
  202. LDR r5,[r0, r1]! ; r5 = <q3|q2|q1|q0>
  203. UXTB16 r6, r4, ROR #16 ; r6 = <p0|p2>
  204. UXTB16 r4, r4, ROR #8 ; r4 = <p3|p1>
  205. UXTB16 r7, r5, ROR #16 ; r7 = <q0|q2>
  206. UXTB16 r5, r5, ROR #8 ; r5 = <q3|q1>
  207. PKHBT r8, r4, r5, LSL #16 ; r8 = <__|q1|__|p1>
  208. PKHBT r9, r6, r7, LSL #16 ; r9 = <__|q2|__|p2>
  209. SSUB16 r6, r4, r6 ; r6 = <p3-p0|p1-p2>
  210. SMLAD r6, r6, r12,r12 ; r6 = <????|(p3-p0)+3*(p1-p2)+3>
  211. SSUB16 r7, r5, r7 ; r7 = <q3-q0|q1-q2>
  212. SMLAD r7, r7, r12,r12 ; r7 = <????|(q0-q3)+3*(q2-q1)+4>
  213. LDR r4,[r0, r1]! ; r4 = <r3|r2|r1|r0>
  214. MOV r6, r6, ASR #3 ; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
  215. LDR r5,[r0, r1]! ; r5 = <s3|s2|s1|s0>
  216. PKHBT r11,r6, r7, LSL #13 ; r11= <??|-R_q|??|-R_p>
; (LSL #13 = LSL #16 combined with the >>3 of the q filter value)
  217. UXTB16 r6, r4, ROR #16 ; r6 = <r0|r2>
  218. UXTB16 r11,r11 ; r11= <__|-R_q|__|-R_p>
  219. UXTB16 r4, r4, ROR #8 ; r4 = <r3|r1>
  220. UXTB16 r7, r5, ROR #16 ; r7 = <s0|s2>
  221. PKHBT r10,r6, r7, LSL #16 ; r10= <__|s2|__|r2>
  222. SSUB16 r6, r4, r6 ; r6 = <r3-r0|r1-r2>
  223. UXTB16 r5, r5, ROR #8 ; r5 = <s3|s1>
  224. SMLAD r6, r6, r12,r12 ; r6 = <????|(r3-r0)+3*(r2-r1)+3>
  225. SSUB16 r7, r5, r7 ; r7 = <r3-r0|r1-r2>
  226. SMLAD r7, r7, r12,r12 ; r7 = <????|(s0-s3)+3*(s2-s1)+4>
  227. ORR r9, r9, r10, LSL #8 ; r9 = <s2|q2|r2|p2>
  228. MOV r6, r6, ASR #3 ; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
  229. PKHBT r10,r4, r5, LSL #16 ; r10= <__|s1|__|r1>
  230. PKHBT r6, r6, r7, LSL #13 ; r6 = <??|-R_s|??|-R_r>
  231. ORR r8, r8, r10, LSL #8 ; r8 = <s1|q1|r1|p1>
  232. UXTB16 r6, r6 ; r6 = <__|-R_s|__|-R_r>
  233. MOV r10,#0
  234. ORR r6, r11,r6, LSL #8 ; r6 = <-R_s|-R_q|-R_r|-R_p>
  235. ; Single issue
  236. ; There's no min, max or abs instruction.
  237. ; SSUB8 and SEL will work for abs, and we can do all the rest with
  238. ; unsigned saturated adds, which means the GE flags are still all
  239. ; set when we're done computing lflim(abs(R_i),L).
  240. ; This allows us to both add and subtract, and split the results by
  241. ; the original sign of R_i.
  242. SSUB8 r7, r10,r6
  243. ; Single issue
  244. SEL r7, r7, r6 ; r7 = abs(R_i)
  245. ; Single issue
  246. UQADD8 r4, r7, r2 ; r4 = 255-max(2*L-abs(R_i),0)
  247. ; Single issue
  248. UQADD8 r7, r7, r4
  249. ; Single issue
  250. UQSUB8 r7, r7, r4 ; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
  251. ; Single issue
; Apply +/- lflim(R_i,L) both ways and use SEL (GE flags from the SSUB8
; above) to keep the one matching the original sign of R_i.
  252. UQSUB8 r4, r8, r7
  253. UQADD8 r5, r9, r7
  254. UQADD8 r8, r8, r7
  255. UQSUB8 r9, r9, r7
  256. SEL r8, r8, r4 ; r8 = p1+lflim(R_i,L)
  257. SEL r9, r9, r5 ; r9 = p2-lflim(R_i,L)
; Scatter the filtered bytes back, one row at a time, walking r0 back up
; from the last row loaded; r0 ends where it started (preserved).
  258. MOV r5, r9, LSR #24 ; r5 = s2
  259. STRB r5, [r0,#2]!
  260. MOV r4, r8, LSR #24 ; r4 = s1
  261. STRB r4, [r0,#-1]
  262. MOV r5, r9, LSR #8 ; r5 = r2
  263. STRB r5, [r0,-r1]!
  264. MOV r4, r8, LSR #8 ; r4 = r1
  265. STRB r4, [r0,#-1]
  266. MOV r5, r9, LSR #16 ; r5 = q2
  267. STRB r5, [r0,-r1]!
  268. MOV r4, r8, LSR #16 ; r4 = q1
  269. STRB r4, [r0,#-1]
  270. ; Single issue
  271. STRB r9, [r0,-r1]!
  272. ; Single issue
  273. STRB r8, [r0,#-1]
  274. MOV PC,r14
  275. ENDP
  276. ; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
  277. ; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
  278. ; This works just as well, with the following procedure for computing the
  279. ; filter value, f:
  280. ; u = ~UHADD8(p1,~p2);
  281. ; v = UHADD8(~p1,p2);
  282. ; m = v-u;
  283. ; a = m^UHADD8(m^p0,m^~p3);
  284. ; f = UHADD8(UHADD8(a,u1),v1);
  285. ; where f = 127+R, with R in [-127,128] defined as in the spec.
  286. ; This is exactly the same amount of arithmetic as the version that uses PAVGB
  287. ; as the basic operator.
  288. ; It executes about 2/3 the number of instructions of David Conrad's approach,
  289. ; but requires more code, because it does all eight columns at once, instead
  290. ; of four at a time.
  291. loop_filter_v_v6 PROC
; ARMv6 vertical filter: processes all 8 columns of a horizontal edge at
; once, four columns per register half, using the UHADD8-based averaging
; trick described in the comment block above.
  292. ; r0 = unsigned char *_pix
  293. ; r1 = int _ystride
  294. ; r2 = int _ll
  295. ; preserves r0-r11
  296. STMFD r13!,{r4-r11,r14}
  297. LDRD r6, [r0, -r1]! ; r7, r6 = <p5|p1>
  298. LDRD r4, [r0, -r1] ; r5, r4 = <p4|p0>
  299. LDRD r8, [r0, r1]! ; r9, r8 = <p6|p2>
  300. MVN r14,r6 ; r14= ~p1
  301. LDRD r10,[r0, r1] ; r11,r10= <p7|p3>
  302. ; Filter the first four columns.
  303. MVN r12,r8 ; r12= ~p2
  304. UHADD8 r14,r14,r8 ; r14= v1=~p1+p2>>1
  305. UHADD8 r12,r12,r6 ; r12= p1+~p2>>1
  306. MVN r10, r10 ; r10=~p3
  307. MVN r12,r12 ; r12= u1=~p1+p2+1>>1
  308. SSUB8 r14,r14,r12 ; r14= m1=v1-u1
  309. ; Single issue
  310. EOR r4, r4, r14 ; r4 = m1^p0
  311. EOR r10,r10,r14 ; r10= m1^~p3
  312. UHADD8 r4, r4, r10 ; r4 = (m1^p0)+(m1^~p3)>>1
  313. ; Single issue
  314. EOR r4, r4, r14 ; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
  315. SADD8 r14,r14,r12 ; r14= v1=m1+u1
  316. UHADD8 r4, r4, r12 ; r4 = a1+u1>>1
  317. MVN r12,r9 ; r12= ~p6
  318. UHADD8 r4, r4, r14 ; r4 = f1=(a1+u1>>1)+v1>>1
  319. ; Filter the second four columns.
  320. MVN r14,r7 ; r14= ~p5
  321. UHADD8 r12,r12,r7 ; r12= p5+~p6>>1
  322. UHADD8 r14,r14,r9 ; r14= v2=~p5+p6>>1
  323. MVN r12,r12 ; r12= u2=~p5+p6+1>>1
  324. MVN r11,r11 ; r11=~p7
  325. SSUB8 r10,r14,r12 ; r10= m2=v2-u2
  326. ; Single issue
  327. EOR r5, r5, r10 ; r5 = m2^p4
  328. EOR r11,r11,r10 ; r11= m2^~p7
  329. UHADD8 r5, r5, r11 ; r5 = (m2^p4)+(m2^~p7)>>1
  330. ; Single issue
  331. EOR r5, r5, r10 ; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
  332. ; Single issue
  333. UHADD8 r5, r5, r12 ; r5 = a2+u2>>1
  334. LDR r12,=0x7F7F7F7F ; r12 = {127}x4
  335. UHADD8 r5, r5, r14 ; r5 = f2=(a2+u2>>1)+v2>>1
  336. ; Now split f[i] by sign.
  337. ; There's no min or max instruction.
  338. ; We could use SSUB8 and SEL, but this is just as many instructions and
  339. ; dual issues more (for v7 without NEON).
; (f is biased by +127 here, so subtracting/comparing against 127 splits
; the positive and negative parts of R_i.)
  340. UQSUB8 r10,r4, r12 ; r10= R_i>0?R_i:0
  341. UQSUB8 r4, r12,r4 ; r4 = R_i<0?-R_i:0
  342. UQADD8 r11,r10,r2 ; r11= 255-max(2*L-abs(R_i<0),0)
  343. UQADD8 r14,r4, r2 ; r14= 255-max(2*L-abs(R_i>0),0)
  344. UQADD8 r10,r10,r11
  345. UQADD8 r4, r4, r14
  346. UQSUB8 r10,r10,r11 ; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
  347. UQSUB8 r4, r4, r14 ; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
  348. UQSUB8 r11,r5, r12 ; r11= R_i>0?R_i:0
  349. UQADD8 r6, r6, r10
  350. UQSUB8 r8, r8, r10
  351. UQSUB8 r5, r12,r5 ; r5 = R_i<0?-R_i:0
  352. UQSUB8 r6, r6, r4 ; r6 = p1+lflim(R_i,L)
  353. UQADD8 r8, r8, r4 ; r8 = p2-lflim(R_i,L)
  354. UQADD8 r10,r11,r2 ; r10= 255-max(2*L-abs(R_i<0),0)
  355. UQADD8 r14,r5, r2 ; r14= 255-max(2*L-abs(R_i>0),0)
  356. UQADD8 r11,r11,r10
  357. UQADD8 r5, r5, r14
  358. UQSUB8 r11,r11,r10 ; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
  359. UQSUB8 r5, r5, r14 ; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
  360. UQADD8 r7, r7, r11
  361. UQSUB8 r9, r9, r11
  362. UQSUB8 r7, r7, r5 ; r7 = p5+lflim(R_i,L)
  363. STRD r6, [r0, -r1] ; [p5:p1] = [r7: r6]
  364. UQADD8 r9, r9, r5 ; r9 = p6-lflim(R_i,L)
  365. STRD r8, [r0] ; [p6:p2] = [r9: r8]
  366. LDMFD r13!,{r4-r11,PC}
  367. ENDP
  368. oc_loop_filter_frag_rows_v6 PROC
; ARMv6 driver: identical fragment-walking logic to the ARMv4 version
; (oc_loop_filter_frag_rows_arm), but loads the replicated ll constant
; from *_bv (written by oc_loop_filter_init_v6) and calls the v6 filters.
  369. ; r0 = _ref_frame_data
  370. ; r1 = _ystride
  371. ; r2 = _bv
  372. ; r3 = _frags
  373. ; r4 = _fragi0
  374. ; r5 = _fragi0_end
  375. ; r6 = _fragi_top
  376. ; r7 = _fragi_bot
  377. ; r8 = _frag_buf_offs
  378. ; r9 = _nhfrags
  379. MOV r12,r13
  380. STMFD r13!,{r0,r4-r11,r14} ; [r13] keeps _ref_frame_data for the loop
  381. LDMFD r12,{r4-r9}
  382. LDR r2, [r2] ; ll = *(int *)_bv
  383. CMP r4, r5 ; if(_fragi0>=_fragi0_end)
  384. BGE oslffri_v6_end ; bail
  385. SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
  386. BLE oslffri_v6_end ; bail
  387. ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
  388. ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
  389. SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
  390. oslffri_v6_lp1
  391. MOV r10,r4 ; r10= fragi = _fragi0
  392. ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
  393. oslffri_v6_lp2
  394. LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
  395. LDR r0, [r13] ; r0 = _ref_frame_data
  396. LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
  397. TST r14,#OC_FRAG_CODED_FLAG
  398. BEQ oslffri_v6_uncoded
  399. CMP r10,r4 ; if (fragi>_fragi0)
  400. ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
  401. BLGT loop_filter_h_v6 ; left edge (not the leftmost fragment)
  402. CMP r4, r6 ; if (fragi0>_fragi_top)
  403. BLGT loop_filter_v_v6 ; top edge (not the topmost row)
  404. CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
  405. LDRLT r12,[r3] ; r12 = _frags[fragi+1]
  406. ADD r0, r0, #8
  407. ADD r10,r10,#1 ; r10 = fragi+1;
  408. ANDLT r12,r12,#OC_FRAG_CODED_FLAG
  409. CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
  410. BLLT loop_filter_h_v6 ; right edge, only if neighbor is uncoded
  411. CMP r10,r7 ; if (fragi<_fragi_bot)
  412. LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
  413. SUB r0, r0, #8
  414. ADD r0, r0, r1, LSL #3
  415. ANDLT r12,r12,#OC_FRAG_CODED_FLAG
  416. CMPLT r12,#OC_FRAG_CODED_FLAG
  417. BLLT loop_filter_v_v6 ; bottom edge, only if neighbor is uncoded
  418. CMP r10,r11 ; while(fragi<=fragi_end-1)
  419. BLE oslffri_v6_lp2
  420. MOV r4, r10 ; r4 = fragi0 += nhfrags
  421. CMP r4, r5
  422. BLT oslffri_v6_lp1
  423. oslffri_v6_end
  424. LDMFD r13!,{r0,r4-r11,PC}
  425. oslffri_v6_uncoded
; Skipped (uncoded) fragment: advance and re-test the loop conditions.
  426. ADD r10,r10,#1
  427. CMP r10,r11
  428. BLE oslffri_v6_lp2
  429. MOV r4, r10 ; r4 = fragi0 += nhfrags
  430. CMP r4, r5
  431. BLT oslffri_v6_lp1
  432. LDMFD r13!,{r0,r4-r11,PC}
  433. ENDP
  434. ]
  435. [ OC_ARM_ASM_NEON
  436. EXPORT oc_loop_filter_init_neon
  437. EXPORT oc_loop_filter_frag_rows_neon
  438. oc_loop_filter_init_neon PROC
; Precomputes the NEON limit constant: stores 2*L replicated across all
; eight U16 lanes of a quadword at *_bv (the driver reloads it into Q15).
  439. ; r0 = _bv
  440. ; r1 = _flimit (=L from the spec)
  441. MOV r1, r1, LSL #1 ; r1 = 2*L
  442. VDUP.S16 Q15, r1 ; Q15= 2L in U16s
  443. VST1.64 {D30,D31}, [r0@128] ; requires _bv to be 16-byte aligned
  444. MOV PC,r14
  445. ENDP
  446. loop_filter_h_neon PROC
; NEON horizontal filter for a full 8-row vertical edge.  Gathers the
; 8x4 pixel window with interleaved 2-element lane loads (leaving D2/D4
; swapped, as noted below), filters all 8 rows at once, and scatters the
; two modified columns back with lane stores.  PLDs prefetch the next
; fragment's rows between arithmetic ops.
  447. ; r0 = unsigned char *_pix
  448. ; r1 = int _ystride
  449. ; r2 = int *_bv
  450. ; preserves r0-r3
  451. ; We assume Q15= 2*L in U16s
  452. ; My best guesses at cycle counts (and latency)--vvv
  453. SUB r12,r0, #2
  454. ; Doing a 2-element structure load saves doing two VTRN's below, at the
  455. ; cost of using two more slower single-lane loads vs. the faster
  456. ; all-lane loads.
  457. ; It's less code this way, though, and benches a hair faster, but it
  458. ; leaves D2 and D4 swapped.
  459. VLD2.16 {D0[],D2[]}, [r12], r1 ; D0 = ____________1100 2,1
  460. ; D2 = ____________3322
  461. VLD2.16 {D4[],D6[]}, [r12], r1 ; D4 = ____________5544 2,1
  462. ; D6 = ____________7766
  463. VLD2.16 {D0[1],D2[1]},[r12], r1 ; D0 = ________99881100 3,1
  464. ; D2 = ________BBAA3322
  465. VLD2.16 {D4[1],D6[1]},[r12], r1 ; D4 = ________DDCC5544 3,1
  466. ; D6 = ________FFEE7766
  467. VLD2.16 {D0[2],D2[2]},[r12], r1 ; D0 = ____GGHH99881100 3,1
  468. ; D2 = ____JJIIBBAA3322
  469. VLD2.16 {D4[2],D6[2]},[r12], r1 ; D4 = ____KKLLDDCC5544 3,1
  470. ; D6 = ____NNMMFFEE7766
  471. VLD2.16 {D0[3],D2[3]},[r12], r1 ; D0 = PPOOGGHH99881100 3,1
  472. ; D2 = RRQQJJIIBBAA3322
  473. VLD2.16 {D4[3],D6[3]},[r12], r1 ; D4 = TTSSKKLLDDCC5544 3,1
  474. ; D6 = VVUUNNMMFFEE7766
  475. VTRN.8 D0, D4 ; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511 1,1
  476. VTRN.8 D2, D6 ; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733 1,1
  477. VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
  478. VSUBL.U8 Q8, D2, D4 ; Q8 = 22 - 11 in S16s 1,3
  479. ADD r12,r0, #8
  480. VADD.S16 Q0, Q0, Q8 ; 1,3
  481. PLD [r12]
  482. VADD.S16 Q0, Q0, Q8 ; 1,3
  483. PLD [r12,r1]
  484. VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
  485. PLD [r12,r1, LSL #1]
  486. VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
  487. ADD r12,r12,r1, LSL #2
  488. ; We want to do
  489. ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
  490. ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
  491. ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
  492. ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
  493. ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
  494. ; So we've reduced the left and right hand terms to be the same, except
  495. ; for a negation.
  496. ; Stall x3
  497. VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
  498. PLD [r12,-r1]
  499. VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
  500. PLD [r12]
  501. VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
  502. PLD [r12,r1]
  503. VMOVL.U8 Q1, D2 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
  504. PLD [r12,r1,LSL #1]
  505. VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
  506. ADD r12,r12,r1, LSL #2
  507. ; Now we need to correct for the sign of f.
  508. ; For negative elements of Q0, we want to subtract the appropriate
  509. ; element of Q9. For positive elements we want to add them. No NEON
  510. ; instruction exists to do this, so we need to negate the negative
  511. ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
  512. VADD.S16 Q9, Q9, Q0 ; 1,3
  513. PLD [r12,-r1]
  514. VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
  515. ; Bah. No VRSBW.U8
  516. ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
  517. VADDW.U8 Q2, Q9, D4 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
  518. VSUB.S16 Q1, Q1, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
  519. VQMOVUN.S16 D4, Q2 ; D4 = TTPPLLHHDD995511 1,1
  520. VQMOVUN.S16 D2, Q1 ; D2 = UUQQMMIIEEAA6622 1,1
  521. SUB r12,r0, #1
  522. VTRN.8 D4, D2 ; D4 = QQPPIIHHAA992211 D2 = MMLLEEDD6655 1,1
; Scatter the two filtered bytes of each of the 8 rows back.
  523. VST1.16 {D4[0]}, [r12], r1
  524. VST1.16 {D2[0]}, [r12], r1
  525. VST1.16 {D4[1]}, [r12], r1
  526. VST1.16 {D2[1]}, [r12], r1
  527. VST1.16 {D4[2]}, [r12], r1
  528. VST1.16 {D2[2]}, [r12], r1
  529. VST1.16 {D4[3]}, [r12], r1
  530. VST1.16 {D2[3]}, [r12], r1
  531. MOV PC,r14
  532. ENDP
  533. loop_filter_v_neon PROC
; NEON vertical filter for a full 8-column horizontal edge: loads the
; four 8-byte rows around the edge, computes all 8 filter values in one
; pass, and stores back the two modified rows.
  534. ; r0 = unsigned char *_pix
  535. ; r1 = int _ystride
  536. ; r2 = int *_bv
  537. ; preserves r0-r3
  538. ; We assume Q15= 2*L in U16s
  539. ; My best guesses at cycle counts (and latency)--vvv
  540. SUB r12,r0, r1, LSL #1
  541. VLD1.64 {D0}, [r12@64], r1 ; D0 = SSOOKKGGCC884400 2,1
  542. VLD1.64 {D2}, [r12@64], r1 ; D2 = TTPPLLHHDD995511 2,1
  543. VLD1.64 {D4}, [r12@64], r1 ; D4 = UUQQMMIIEEAA6622 2,1
  544. VLD1.64 {D6}, [r12@64] ; D6 = VVRRNNJJFFBB7733 2,1
  545. VSUBL.U8 Q8, D4, D2 ; Q8 = 22 - 11 in S16s 1,3
  546. VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
  547. ADD r12, #8
  548. VADD.S16 Q0, Q0, Q8 ; 1,3
  549. PLD [r12]
  550. VADD.S16 Q0, Q0, Q8 ; 1,3
  551. PLD [r12,r1]
  552. VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
  553. SUB r12, r0, r1 ; r12= address of the row above the edge (store ptr)
  554. VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
  555. ; We want to do
  556. ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
  557. ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
  558. ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
  559. ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
  560. ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
  561. ; So we've reduced the left and right hand terms to be the same, except
  562. ; for a negation.
  563. ; Stall x3
  564. VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
  565. VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
  566. ; Stall x2
  567. VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
  568. VMOVL.U8 Q2, D4 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
  569. ; Stall x2
  570. VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
  571. ; Now we need to correct for the sign of f.
  572. ; For negative elements of Q0, we want to subtract the appropriate
  573. ; element of Q9. For positive elements we want to add them. No NEON
  574. ; instruction exists to do this, so we need to negate the negative
  575. ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
  576. ; Stall x3
  577. VADD.S16 Q9, Q9, Q0 ; 1,3
  578. ; Stall x2
  579. VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
  580. ; Bah. No VRSBW.U8
  581. ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
  582. VADDW.U8 Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
  583. VSUB.S16 Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
  584. VQMOVUN.S16 D2, Q1 ; D2 = TTPPLLHHDD995511 1,1
  585. VQMOVUN.S16 D4, Q2 ; D4 = UUQQMMIIEEAA6622 1,1
  586. VST1.64 {D2}, [r12@64], r1
  587. VST1.64 {D4}, [r12@64], r1
  588. MOV PC,r14
  589. ENDP
  590. oc_loop_filter_frag_rows_neon PROC
; NEON driver: identical fragment-walking logic to the ARMv4 version
; (oc_loop_filter_frag_rows_arm), but preloads Q15 with the 2*L limit
; written by oc_loop_filter_init_neon and calls the NEON filters.
  591. ; r0 = _ref_frame_data
  592. ; r1 = _ystride
  593. ; r2 = _bv
  594. ; r3 = _frags
  595. ; r4 = _fragi0
  596. ; r5 = _fragi0_end
  597. ; r6 = _fragi_top
  598. ; r7 = _fragi_bot
  599. ; r8 = _frag_buf_offs
  600. ; r9 = _nhfrags
  601. MOV r12,r13
  602. STMFD r13!,{r0,r4-r11,r14} ; [r13] keeps _ref_frame_data for the loop
  603. LDMFD r12,{r4-r9}
  604. CMP r4, r5 ; if(_fragi0>=_fragi0_end)
  605. BGE oslffri_neon_end; bail
  606. SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
  607. BLE oslffri_neon_end ; bail
  608. VLD1.64 {D30,D31}, [r2@128] ; Q15= 2L in U16s
  609. ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
  610. ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
  611. SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
  612. oslffri_neon_lp1
  613. MOV r10,r4 ; r10= fragi = _fragi0
  614. ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
  615. oslffri_neon_lp2
  616. LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
  617. LDR r0, [r13] ; r0 = _ref_frame_data
  618. LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
  619. TST r14,#OC_FRAG_CODED_FLAG
  620. BEQ oslffri_neon_uncoded
  621. CMP r10,r4 ; if (fragi>_fragi0)
  622. ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
  623. BLGT loop_filter_h_neon ; left edge (not the leftmost fragment)
  624. CMP r4, r6 ; if (_fragi0>_fragi_top)
  625. BLGT loop_filter_v_neon ; top edge (not the topmost row)
  626. CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
  627. LDRLT r12,[r3] ; r12 = _frags[fragi+1]
  628. ADD r0, r0, #8
  629. ADD r10,r10,#1 ; r10 = fragi+1;
  630. ANDLT r12,r12,#OC_FRAG_CODED_FLAG
  631. CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
  632. BLLT loop_filter_h_neon ; right edge, only if neighbor is uncoded
  633. CMP r10,r7 ; if (fragi<_fragi_bot)
  634. LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
  635. SUB r0, r0, #8
  636. ADD r0, r0, r1, LSL #3
  637. ANDLT r12,r12,#OC_FRAG_CODED_FLAG
  638. CMPLT r12,#OC_FRAG_CODED_FLAG
  639. BLLT loop_filter_v_neon ; bottom edge, only if neighbor is uncoded
  640. CMP r10,r11 ; while(fragi<=fragi_end-1)
  641. BLE oslffri_neon_lp2
  642. MOV r4, r10 ; r4 = _fragi0 += _nhfrags
  643. CMP r4, r5
  644. BLT oslffri_neon_lp1
  645. oslffri_neon_end
  646. LDMFD r13!,{r0,r4-r11,PC}
  647. oslffri_neon_uncoded
; Skipped (uncoded) fragment: advance and re-test the loop conditions.
  648. ADD r10,r10,#1
  649. CMP r10,r11
  650. BLE oslffri_neon_lp2
  651. MOV r4, r10 ; r4 = _fragi0 += _nhfrags
  652. CMP r4, r5
  653. BLT oslffri_neon_lp1
  654. LDMFD r13!,{r0,r4-r11,PC}
  655. ENDP
  656. ]
  657. END