celt_pitch_xcorr_arm.s

; Copyright (c) 2007-2008 CSIRO
; Copyright (c) 2007-2009 Xiph.Org Foundation
; Copyright (c) 2013 Parrot
; Written by Aurélien Zanelli
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  AREA |.text|, CODE, READONLY

  GET celt/arm/armopts.s

  IF OPUS_ARM_MAY_HAVE_EDSP
  EXPORT celt_pitch_xcorr_edsp
  ENDIF

  IF OPUS_ARM_MAY_HAVE_NEON
  EXPORT celt_pitch_xcorr_neon
  ENDIF

  IF OPUS_ARM_MAY_HAVE_NEON

; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
xcorr_kernel_neon_start
; input:
; r3 = int len
; r4 = opus_val16 *x
; r5 = opus_val16 *y
; q0 = opus_val32 sum[4]
; output:
; q0 = opus_val32 sum[4]
; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
; internal usage:
; r12 = int j
; d3 = y_3|y_2|y_1|y_0
; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
; q8 = scratch
;
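; For reference, this kernel computes the same sums as the following C
; sketch (compare the generic C kernel in celt/pitch.c; xcorr_kernel_ref is
; a hypothetical name, and opus_val16/opus_val32 are the usual Opus types):
;
;   static void xcorr_kernel_ref(const opus_val16 *x, const opus_val16 *y,
;                                opus_val32 sum[4], int len)
;   {
;      int j, k;
;      for (j = 0; j < len; j++)
;         for (k = 0; k < 4; k++)
;            sum[k] += (opus_val32)x[j] * (opus_val32)y[j + k];
;   }
;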
; Load y[0...3]
; This requires len>0 to always be valid (which we assert in the C code).
  VLD1.16 {d5}, [r5]!
  SUBS r12, r3, #8
  BLE xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need. Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
; This loop has 19 total instructions (10 cycles to issue, minimum), with
; - 2 cycles of ARM instructions,
; - 10 cycles of load/store/byte permute instructions, and
; - 9 cycles of data processing instructions.
; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
; latter two categories, meaning the whole loop should run in 10 cycles per
; iteration, barring cache misses.
;
; Load x[0...7]
  VLD1.16 {d6, d7}, [r4]!
; Unlike VMOV, VAND is a data processing instruction (and doesn't get
; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  VAND d3, d5, d5
  SUBS r12, r12, #8
; Load y[4...11]
  VLD1.16 {d4, d5}, [r5]!
  VMLAL.S16 q0, d3, d6[0]
  VEXT.16 d16, d3, d4, #1
  VMLAL.S16 q0, d4, d7[0]
  VEXT.16 d17, d4, d5, #1
  VMLAL.S16 q0, d16, d6[1]
  VEXT.16 d16, d3, d4, #2
  VMLAL.S16 q0, d17, d7[1]
  VEXT.16 d17, d4, d5, #2
  VMLAL.S16 q0, d16, d6[2]
  VEXT.16 d16, d3, d4, #3
  VMLAL.S16 q0, d17, d7[2]
  VEXT.16 d17, d4, d5, #3
  VMLAL.S16 q0, d16, d6[3]
  VMLAL.S16 q0, d17, d7[3]
  BGT xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
  ADDS r12, r12, #4
  BLE xcorr_kernel_neon_process2
; Load x[0...3]
  VLD1.16 d6, [r4]!
; Use VAND since it's a data processing instruction again.
  VAND d4, d5, d5
  SUB r12, r12, #4
; Load y[4...7]
  VLD1.16 d5, [r5]!
  VMLAL.S16 q0, d4, d6[0]
  VEXT.16 d16, d4, d5, #1
  VMLAL.S16 q0, d16, d6[1]
  VEXT.16 d16, d4, d5, #2
  VMLAL.S16 q0, d16, d6[2]
  VEXT.16 d16, d4, d5, #3
  VMLAL.S16 q0, d16, d6[3]
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
  ADDS r12, r12, #2
  BLE xcorr_kernel_neon_process1
; Load x[0...1]
  VLD2.16 {d6[],d7[]}, [r4]!
; Use VAND since it's a data processing instruction again.
  VAND d4, d5, d5
  SUB r12, r12, #2
; Load y[4...5]
  VLD1.32 {d5[]}, [r5]!
  VMLAL.S16 q0, d4, d6
  VEXT.16 d16, d4, d5, #1
; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
; instead of VEXT, since it's a data-processing instruction.
  VSRI.64 d5, d4, #32
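; d5 now holds y_5|y_4|y_3|y_2, which becomes the y[0...3] that
; xcorr_kernel_neon_process1 below expects to find left in d5.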
  VMLAL.S16 q0, d16, d7
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
; Load next *x
  VLD1.16 {d6[]}, [r4]!
  ADDS r12, r12, #1
; y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16 q0, d5, d6
  MOVLE pc, lr
; Now process 1 last sample, not reading ahead.
; Load last *y
  VLD1.16 {d4[]}, [r5]!
  VSRI.64 d4, d5, #16
; Load last *x
  VLD1.16 {d6[]}, [r4]!
  VMLAL.S16 q0, d4, d6
  MOV pc, lr
  ENDP

; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
;                                  opus_val32 *xcorr, int len, int max_pitch, int arch)
celt_pitch_xcorr_neon PROC
; input:
; r0 = opus_val16 *_x
; r1 = opus_val16 *_y
; r2 = opus_val32 *xcorr
; r3 = int len
; output:
; r0 = int maxcorr
; internal usage:
; r4 = opus_val16 *x (for xcorr_kernel_neon())
; r5 = opus_val16 *y (for xcorr_kernel_neon())
; r6 = int max_pitch
; r12 = int j
; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
; ignored:
; int arch
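; In C terms, this routine behaves like the following sketch (compare
; celt_pitch_xcorr_c() in celt/pitch.c; MAX32() and celt_inner_prod() are
; the usual CELT helpers):
;
;   opus_val32 maxcorr = 1;
;   int i, k;
;   for (i = 0; i < max_pitch - 3; i += 4) {
;      opus_val32 sum[4] = {0, 0, 0, 0};
;      xcorr_kernel(_x, _y + i, sum, len);   /* the NEON kernel above */
;      for (k = 0; k < 4; k++) {
;         xcorr[i + k] = sum[k];
;         maxcorr = MAX32(maxcorr, sum[k]);
;      }
;   }
;   for (; i < max_pitch; i++) {             /* remaining sums, one by one */
;      opus_val32 sum = celt_inner_prod(_x, _y + i, len);
;      xcorr[i] = sum;
;      maxcorr = MAX32(maxcorr, sum);
;   }
;   return maxcorr;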
  STMFD sp!, {r4-r6, lr}
  LDR r6, [sp, #16]
  VMOV.S32 q15, #1
; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  SUBS r6, r6, #4
  BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
; xcorr_kernel_neon parameters:
; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV r4, r0
  MOV r5, r1
  VEOR q0, q0, q0
; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
; So we don't save/restore any other registers.
  BL xcorr_kernel_neon_start
  SUBS r6, r6, #4
  VST1.32 {q0}, [r2]!
; _y += 4
  ADD r1, r1, #8
  VMAX.S32 q15, q15, q0
; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  BGE celt_pitch_xcorr_neon_process4
; We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
  ADDS r6, r6, #4
; Reduce maxcorr to a single value
  VMAX.S32 d30, d30, d31
  VPMAX.S32 d30, d30, d30
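; Both lanes of d30 now hold max(maxcorr[0...3]); the remaining single-sum
; results below are folded into it one at a time.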
; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
  MOV r4, r0
  MOV r5, r1
  VMOV.I32 q0, #0
  SUBS r12, r3, #8
  BLT celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
; Load x[0...7]
  VLD1.16 {q1}, [r4]!
; Load y[0...7]
  VLD1.16 {q2}, [r5]!
  SUBS r12, r12, #8
  VMLAL.S16 q0, d4, d2
  VMLAL.S16 q0, d5, d3
  BGE celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
  ADDS r12, r12, #4
  BLT celt_pitch_xcorr_neon_process_remaining4_done
; Load x[0...3]
  VLD1.16 {d2}, [r4]!
; Load y[0...3]
  VLD1.16 {d3}, [r5]!
  SUB r12, r12, #4
  VMLAL.S16 q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done
; Reduce the sum to a single value.
  VADD.S32 d0, d0, d1
  VPADDL.S32 d0, d0
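; VPADDL sums d0's two 32-bit lanes into one 64-bit lane; the low word d0[0]
; is the 32-bit sum that gets stored and compared below.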
  ADDS r12, r12, #4
  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
  VLD1.16 {d2[]}, [r4]!
  VLD1.16 {d3[]}, [r5]!
  SUBS r12, r12, #1
  VMLAL.S16 q0, d2, d3
  BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
  VST1.32 {d0[0]}, [r2]!
  VMAX.S32 d30, d30, d0
  SUBS r6, r6, #1
; _y++
  ADD r1, r1, #2
; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
  VMOV.32 r0, d30[0]
  LDMFD sp!, {r4-r6, pc}
  ENDP

  ENDIF

  IF OPUS_ARM_MAY_HAVE_EDSP

; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
xcorr_kernel_edsp PROC
xcorr_kernel_edsp_start
; input:
; r3 = int len
; r4 = opus_val16 *_x (must be 32-bit aligned)
; r5 = opus_val16 *_y (must be 32-bit aligned)
; r6...r9 = opus_val32 sum[4]
; output:
; r6...r9 = opus_val32 sum[4]
; preserved: r0-r5
; internal usage
; r2 = int j
; r12,r14 = opus_val16 x[4]
; r10,r11 = opus_val16 y[4]
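; Each SMLA<x><y> below is a 16x16+32 multiply-accumulate: <x> selects the
; bottom (B) or top (T) halfword of the first operand, <y> that of the
; second. As a C sketch, SMLABT ra, rn, rm, ra computes:
;
;   ra += (opus_int32)(opus_int16)(rn & 0xffff)
;       * (opus_int16)((rm >> 16) & 0xffff);
;
; so with two 16-bit samples packed per 32-bit load, each SMLA performs one
; of the MAC16_16() operations named in the comments.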
  STMFD sp!, {r2,r4,r5,lr}
  LDR r10, [r5], #4 ; Load y[0...1]
  SUBS r2, r3, #4 ; j = len-4
  LDR r11, [r5], #4 ; Load y[2...3]
  BLE xcorr_kernel_edsp_process4_done
  LDR r12, [r4], #4 ; Load x[0...1]
; Stall
xcorr_kernel_edsp_process4
; The multiplies must issue from pipeline 0, and can't dual-issue with each
; other. Every other instruction here dual-issues with a multiply, and is
; thus "free". There should be no stalls in the body of the loop.
  SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR r14, [r4], #4 ; Load x[2...3]
  SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS r2, r2, #4 ; j-=4
  SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR r10, [r5], #4 ; Load y[4...5]
  SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT r12, [r4], #4 ; Load x[0...1]
  SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR r11, [r5], #4 ; Load y[6...7]
  SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done
  ADDS r2, r2, #4
  BLE xcorr_kernel_edsp_done
  LDRH r12, [r4], #2 ; r12 = *x++
  SUBS r2, r2, #1 ; j--
; Stall
  SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0)
  LDRHGT r14, [r4], #2 ; r14 = *x++
  SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3)
  BLE xcorr_kernel_edsp_done
  SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS r2, r2, #1 ; j--
  SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH r10, [r5], #2 ; r10 = y_4 = *y++
  SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3)
  LDRHGT r12, [r4], #2 ; r12 = *x++
  SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4)
  BLE xcorr_kernel_edsp_done
  SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2)
  CMP r2, #1 ; j--
  SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH r2, [r5], #2 ; r2 = y_5 = *y++
  SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRHGT r14, [r4] ; r14 = *x
  SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE xcorr_kernel_edsp_done
  SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH r11, [r5] ; r11 = y_6 = *y
  SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
  LDMFD sp!, {r2,r4,r5,pc}
  ENDP

celt_pitch_xcorr_edsp PROC
; input:
; r0 = opus_val16 *_x (must be 32-bit aligned)
; r1 = opus_val16 *_y (only needs to be 16-bit aligned)
; r2 = opus_val32 *xcorr
; r3 = int len
; output:
; r0 = maxcorr
; internal usage
; r4 = opus_val16 *x
; r5 = opus_val16 *y
; r6 = opus_val32 sum0
; r7 = opus_val32 sum1
; r8 = opus_val32 sum2
; r9 = opus_val32 sum3
; r1 = int max_pitch
; r12 = int j
; ignored:
; int arch
  STMFD sp!, {r4-r11, lr}
  MOV r5, r1
  LDR r1, [sp, #36]
  MOV r4, r0
  TST r5, #3
; maxcorr = 1
  MOV r0, #1
  BEQ celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
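; In C terms (a sketch, with celt_inner_prod() as the usual CELT helper):
; when _y is only 16-bit aligned, compute
;   xcorr[0] = celt_inner_prod(_x, _y, len); _y++; max_pitch--;
; so that every remaining 32-bit load of y below is aligned.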
  SUBS r12, r3, #4
; r14 = sum = 0
  MOV r14, #0
  LDRH r8, [r5], #2
  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR r6, [r4], #4
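; Shift y_0 into the top half of r8 so the SMLABT at the top of the loop
; multiplies it against x_0 in the bottom half of r6.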
  MOV r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4
  LDR r9, [r5], #4
  SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
  LDR r7, [r4], #4
  SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1)
  LDR r8, [r5], #4
  SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
  SUBS r12, r12, #4 ; j-=4
  SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3)
  LDRGT r6, [r4], #4
  BGT celt_pitch_xcorr_edsp_process1u_loop4
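; The loop exits with the next unconsumed y sample in the top half of r8;
; move it to the bottom half so the one-at-a-time loop below can use SMLABB.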
  MOV r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done
  ADDS r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
  LDRHGE r6, [r4], #2
; Stall
  SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
  SUBSGE r12, r12, #1
  LDRHGT r8, [r5], #2
  BGT celt_pitch_xcorr_edsp_process1u_loop1
; Restore _x
  SUB r4, r4, r3, LSL #1
; Restore and advance _y
  SUB r5, r5, r3, LSL #1
; maxcorr = max(maxcorr, sum)
  CMP r0, r14
  ADD r5, r5, #2
  MOVLT r0, r14
  SUBS r1, r1, #1
; xcorr[i] = sum
  STR r14, [r2], #4
  BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  SUBS r1, r1, #4
  BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
; xcorr_kernel_edsp parameters:
; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV r6, #0
  MOV r7, #0
  MOV r8, #0
  MOV r9, #0
  BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP r0, r6
; _y+=4
  ADD r5, r5, #8
  MOVLT r0, r6
  CMP r0, r7
  MOVLT r0, r7
  CMP r0, r8
  MOVLT r0, r8
  CMP r0, r9
  MOVLT r0, r9
  STMIA r2!, {r6-r9}
  SUBS r1, r1, #4
  BGE celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2
  ADDS r1, r1, #2
  BLT celt_pitch_xcorr_edsp_process1a
  SUBS r12, r3, #4
; {r10, r11} = {sum0, sum1} = {0, 0}
  MOV r10, #0
  MOV r11, #0
  LDR r8, [r5], #4
  BLE celt_pitch_xcorr_edsp_process2_loop_done
  LDR r6, [r4], #4
  LDR r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4
  SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR r7, [r4], #4
  SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS r12, r12, #4 ; j-=4
  SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
  LDR r8, [r5], #4
  SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT r6, [r4], #4
  SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT r9, [r5], #4
  SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4)
  BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
  ADDS r12, r12, #2
  BLE celt_pitch_xcorr_edsp_process2_1
  LDR r6, [r4], #4
; Stall
  SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR r9, [r5], #4
  SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUB r12, r12, #2
  SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
  MOV r8, r9
  SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1
  LDRH r6, [r4], #2
  ADDS r12, r12, #1
; Stall
  SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDRHGT r7, [r4], #2
  SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
  BLE celt_pitch_xcorr_edsp_process2_done
  LDRH r9, [r5], #2
  SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1)
  SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done
; Restore _x
  SUB r4, r4, r3, LSL #1
; Restore and advance _y
  SUB r5, r5, r3, LSL #1
; maxcorr = max(maxcorr, sum0)
  CMP r0, r10
  ADD r5, r5, #2
  MOVLT r0, r10
  SUB r1, r1, #2
; maxcorr = max(maxcorr, sum1)
  CMP r0, r11
; xcorr[i] = sum
  STR r10, [r2], #4
  MOVLT r0, r11
  STR r11, [r2], #4
celt_pitch_xcorr_edsp_process1a
  ADDS r1, r1, #1
  BLT celt_pitch_xcorr_edsp_done
  SUBS r12, r3, #4
; r14 = sum = 0
  MOV r14, #0
  BLT celt_pitch_xcorr_edsp_process1a_loop_done
  LDR r6, [r4], #4
  LDR r8, [r5], #4
  LDR r7, [r4], #4
  LDR r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4
  SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
  SUBS r12, r12, #4 ; j-=4
  SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
  LDRGE r6, [r4], #4
  SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
  LDRGE r8, [r5], #4
  SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3)
  LDRGE r7, [r4], #4
  LDRGE r9, [r5], #4
  BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
  ADDS r12, r12, #2
  LDRGE r6, [r4], #4
  LDRGE r8, [r5], #4
; Stall
  SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
  SUBGE r12, r12, #2
  SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
  ADDS r12, r12, #1
  LDRHGE r6, [r4], #2
  LDRHGE r8, [r5], #2
; Stall
  SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
; maxcorr = max(maxcorr, sum)
  CMP r0, r14
; xcorr[i] = sum
  STR r14, [r2], #4
  MOVLT r0, r14
celt_pitch_xcorr_edsp_done
  LDMFD sp!, {r4-r11, pc}
  ENDP

  ENDIF

  END