  1. ;********************************************************************
  2. ;* *
  3. ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
  4. ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
  5. ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  6. ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
  7. ;* *
  8. ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
  9. ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  10. ;* *
  11. ;********************************************************************
  12. ; Original implementation:
  13. ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
  14. ; last mod: $Id$
  15. ;********************************************************************
  16. AREA |.text|, CODE, READONLY
  17. GET armopts.s
  18. EXPORT oc_idct8x8_1_arm
  19. EXPORT oc_idct8x8_arm
  20. oc_idct8x8_1_arm PROC
  21. ; r0 = ogg_int16_t *_y
  22. ; r1 = ogg_uint16_t _dc
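; This routine just replicates the 16-bit value _dc into all 64 output
; coefficients; the caller is presumably expected to have already run the DC
; coefficient through the transform and final rounding, so no arithmetic is
; needed here.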
  23. ORR r1, r1, r1, LSL #16
  24. MOV r2, r1
  25. MOV r3, r1
  26. MOV r12,r1
  27. STMIA r0!,{r1,r2,r3,r12}
  28. STMIA r0!,{r1,r2,r3,r12}
  29. STMIA r0!,{r1,r2,r3,r12}
  30. STMIA r0!,{r1,r2,r3,r12}
  31. STMIA r0!,{r1,r2,r3,r12}
  32. STMIA r0!,{r1,r2,r3,r12}
  33. STMIA r0!,{r1,r2,r3,r12}
  34. STMIA r0!,{r1,r2,r3,r12}
  35. MOV PC, r14
  36. ENDP
  37. oc_idct8x8_arm PROC
  38. ; r0 = ogg_int16_t *_y
  39. ; r1 = ogg_int16_t *_x
  40. ; r2 = int _last_zzi
  41. CMP r2, #3
  42. BLE oc_idct8x8_3_arm
  43. CMP r2, #6
  44. BLE oc_idct8x8_6_arm
  45. CMP r2, #10
  46. BLE oc_idct8x8_10_arm
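; _last_zzi is (roughly) the 1-based index of the last potentially nonzero
; coefficient in zig-zag order, so a small value means the nonzero inputs are
; confined to the top-left corner of the block and a pruned transform can be
; used: <=3 covers rows 0-1 with 2/1 coefficients, <=6 rows 0-2 with 3/2/1,
; and <=10 rows 0-3 with 4/3/2/1. Everything else takes the full transform.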
  47. oc_idct8x8_slow_arm
  48. STMFD r13!,{r4-r11,r14}
  49. SUB r13,r13,#64*2
  50. ; Row transforms
  51. STR r0, [r13,#-4]!
  52. ADD r0, r13, #4 ; Write to temp storage.
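; Each idct8core_arm call consumes one row of the input (r1 advances by 16
; bytes per call) and writes its eight results down one column of the temp
; buffer (16-byte store stride, with r0 advancing by only 2), so the data
; lands transposed and the column pass below can again read it row by row.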
  53. BL idct8core_arm
  54. BL idct8core_arm
  55. BL idct8core_arm
  56. BL idct8core_arm
  57. BL idct8core_arm
  58. BL idct8core_arm
  59. BL idct8core_arm
  60. BL idct8core_arm
  61. LDR r0, [r13], #4 ; Write to the final destination.
  62. SUB r2, r1, #8*16
  63. ; Clear input data for next block.
  64. MOV r4, #0
  65. MOV r5, #0
  66. MOV r6, #0
  67. MOV r7, #0
  68. STMIA r2!,{r4,r5,r6,r7}
  69. STMIA r2!,{r4,r5,r6,r7}
  70. STMIA r2!,{r4,r5,r6,r7}
  71. STMIA r2!,{r4,r5,r6,r7}
  72. STMIA r2!,{r4,r5,r6,r7}
  73. STMIA r2!,{r4,r5,r6,r7}
  74. STMIA r2!,{r4,r5,r6,r7}
  75. STMIA r2!,{r4,r5,r6,r7}
  76. MOV r1, r13 ; And read from temp storage.
  77. ; Column transforms
  78. BL idct8core_down_arm
  79. BL idct8core_down_arm
  80. BL idct8core_down_arm
  81. BL idct8core_down_arm
  82. BL idct8core_down_arm
  83. BL idct8core_down_arm
  84. BL idct8core_down_arm
  85. BL idct8core_down_arm
  86. ADD r13,r13,#64*2
  87. LDMFD r13!,{r4-r11,PC}
  88. ENDP
  89. oc_idct8x8_10_arm PROC
  90. STMFD r13!,{r4-r11,r14}
  91. SUB r13,r13,#64*2
  92. ; Row transforms
  93. MOV r2, r0
  94. MOV r0, r13 ; Write to temp storage.
  95. BL idct4core_arm
  96. BL idct3core_arm
  97. BL idct2core_arm
  98. BL idct1core_arm
  99. ; Clear input data for next block.
  100. MOV r4, #0
  101. STR r4, [r1,#-4*16]!
  102. STR r4, [r1,#4]
  103. STR r4, [r1,#16]
  104. STR r4, [r1,#20]
  105. STR r4, [r1,#32]
  106. STR r4, [r1,#48]
  107. MOV r1, r13 ; Read from temp storage.
  108. MOV r0, r2 ; Write to the final destination
  109. oc_idct8x8_10_arm_cols
  110. ; Column transforms
  111. BL idct4core_down_arm
  112. BL idct4core_down_arm
  113. BL idct4core_down_arm
  114. BL idct4core_down_arm
  115. BL idct4core_down_arm
  116. BL idct4core_down_arm
  117. BL idct4core_down_arm
  118. BL idct4core_down_arm
  119. ADD r13,r13,#64*2
  120. LDMFD r13!,{r4-r11,PC}
  121. ENDP
  122. oc_idct8x8_6_arm PROC
  123. STMFD r13!,{r4-r7,r9-r11,r14}
  124. SUB r13,r13,#64*2
  125. ; Row transforms
  126. MOV r2, r0
  127. MOV r0, r13 ; Write to temp storage.
  128. BL idct3core_arm
  129. BL idct2core_arm
  130. BL idct1core_arm
  131. ; Clear input data for next block.
  132. MOV r4, #0
  133. STR r4, [r1,#-3*16]!
  134. STR r4, [r1,#4]
  135. STR r4, [r1,#16]
  136. STR r4, [r1,#32]
  137. MOV r1, r13 ; Read from temp storage.
  138. MOV r0, r2 ; Write to the final destination
  139. ; Column transforms
  140. BL idct3core_down_arm
  141. BL idct3core_down_arm
  142. BL idct3core_down_arm
  143. BL idct3core_down_arm
  144. BL idct3core_down_arm
  145. BL idct3core_down_arm
  146. BL idct3core_down_arm
  147. BL idct3core_down_arm
  148. ADD r13,r13,#64*2
  149. LDMFD r13!,{r4-r7,r9-r11,PC}
  150. ENDP
  151. oc_idct8x8_3_arm PROC
  152. STMFD r13!,{r4-r7,r9-r11,r14}
  153. SUB r13,r13,#64*2
  154. ; Row transforms
  155. MOV r2, r0
  156. MOV r0, r13 ; Write to temp storage.
  157. BL idct2core_arm
  158. BL idct1core_arm
  159. ; Clear input data for next block.
  160. MOV r4, #0
  161. STR r4, [r1,#-2*16]!
  162. STR r4, [r1,#16]
  163. MOV r1, r13 ; Read from temp storage.
  164. MOV r0, r2 ; Write to the final destination
  165. ; Column transforms
  166. BL idct2core_down_arm
  167. BL idct2core_down_arm
  168. BL idct2core_down_arm
  169. BL idct2core_down_arm
  170. BL idct2core_down_arm
  171. BL idct2core_down_arm
  172. BL idct2core_down_arm
  173. BL idct2core_down_arm
  174. ADD r13,r13,#64*2
  175. LDMFD r13!,{r4-r7,r9-r11,PC}
  176. ENDP
  177. idct1core_arm PROC
  178. ; r0 = ogg_int16_t *_y (destination)
  179. ; r1 = const ogg_int16_t *_x (source)
  180. LDRSH r3, [r1], #16
  181. MOV r12,#0x05
  182. ORR r12,r12,#0xB500
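; r12= 0xB505 = OC_C4S4; the constant cannot be encoded as a single ARM
; immediate, so it is assembled from two 8-bit pieces.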
  183. MUL r3, r12, r3
  184. ; Stall ?
  185. MOV r3, r3, ASR #16
  186. STRH r3, [r0], #2
  187. STRH r3, [r0, #14]
  188. STRH r3, [r0, #30]
  189. STRH r3, [r0, #46]
  190. STRH r3, [r0, #62]
  191. STRH r3, [r0, #78]
  192. STRH r3, [r0, #94]
  193. STRH r3, [r0, #110]
  194. MOV PC,R14
  195. ENDP
  196. idct2core_arm PROC
  197. ; r0 = ogg_int16_t *_y (destination)
  198. ; r1 = const ogg_int16_t *_x (source)
  199. LDRSH r9, [r1], #16 ; r9 = x[0]
  200. LDR r12,OC_C4S4
  201. LDRSH r11,[r1, #-14] ; r11= x[1]
  202. LDR r3, OC_C7S1
  203. MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
  204. LDR r10,OC_C1S7
  205. MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
  206. MOV r9, r9, ASR #16 ; r9 = t[0]
  207. MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
  208. MOV r3, r3, ASR #16 ; r3 = t[4]
  209. MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
  210. MOV r11,r11,ASR #16 ; r11= t[7]
  211. MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
  212. MOV r10,r10,ASR #16 ; r10= t[5]
  213. ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]
  214. ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
  215. SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
  216. ADD r3, r3, r9 ; r3 = t[0]+t[4]
  217. ADD r11,r11,r9 ; r11= t[0]+t[7]
  218. STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
  219. STRH r12,[r0, #14] ; y[1] = t[0]+t[6]
  220. STRH r10,[r0, #30] ; y[2] = t[0]+t[5]
  221. STRH r3, [r0, #46] ; y[3] = t[0]+t[4]
  222. RSB r3, r3, r9, LSL #1 ; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
  223. RSB r10,r10,r9, LSL #1 ; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
  224. RSB r12,r12,r9, LSL #1 ; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
  225. RSB r11,r11,r9, LSL #1 ; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
  226. STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
  227. STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
  228. STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
  229. STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
  230. MOV PC,r14
  231. ENDP
  232. idct2core_down_arm PROC
  233. ; r0 = ogg_int16_t *_y (destination)
  234. ; r1 = const ogg_int16_t *_x (source)
  235. LDRSH r9, [r1], #16 ; r9 = x[0]
  236. LDR r12,OC_C4S4
  237. LDRSH r11,[r1, #-14] ; r11= x[1]
  238. LDR r3, OC_C7S1
  239. MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
  240. LDR r10,OC_C1S7
  241. MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
  242. MOV r9, r9, ASR #16 ; r9 = t[0]
  243. MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
  244. ADD r9, r9, #8 ; r9 = t[0]+8
  245. MOV r3, r3, ASR #16 ; r3 = t[4]
  246. MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
  247. MOV r11,r11,ASR #16 ; r11= t[7]
  248. MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
  249. MOV r10,r10,ASR #16 ; r10= t[5]
  250. ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]+8
  251. ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
  252. SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
  253. ADD r3, r3, r9 ; r3 = t[0]+t[4]+8
  254. ADD r11,r11,r9 ; r11= t[0]+t[7]+8
  255. ; TODO: This is wrong.
  256. ; The C code truncates to 16 bits by storing to RAM and doing the
  257. ; shifts later; we've got an extra 4 bits here.
  258. MOV r4, r11,ASR #4
  259. MOV r5, r12,ASR #4
  260. MOV r6, r10,ASR #4
  261. MOV r7, r3, ASR #4
  262. RSB r3, r3, r9, LSL #1 ;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
  263. RSB r10,r10,r9, LSL #1 ;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
  264. RSB r12,r12,r9, LSL #1 ;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
  265. RSB r11,r11,r9, LSL #1 ;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
  266. MOV r3, r3, ASR #4
  267. MOV r10,r10,ASR #4
  268. MOV r12,r12,ASR #4
  269. MOV r11,r11,ASR #4
  270. STRH r4, [r0], #2 ; y[0] = t[0]+t[7]
  271. STRH r5, [r0, #14] ; y[1] = t[0]+t[6]
  272. STRH r6, [r0, #30] ; y[2] = t[0]+t[5]
  273. STRH r7, [r0, #46] ; y[3] = t[0]+t[4]
  274. STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
  275. STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
  276. STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
  277. STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
  278. MOV PC,r14
  279. ENDP
  280. idct3core_arm PROC
  281. LDRSH r9, [r1], #16 ; r9 = x[0]
  282. LDR r12,OC_C4S4 ; r12= OC_C4S4
  283. LDRSH r3, [r1, #-12] ; r3 = x[2]
  284. LDR r10,OC_C6S2 ; r10= OC_C6S2
  285. MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
  286. LDR r4, OC_C2S6 ; r4 = OC_C2S6
  287. MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
  288. LDRSH r11,[r1, #-14] ; r11= x[1]
  289. MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
  290. LDR r4, OC_C7S1 ; r4 = OC_C7S1
  291. LDR r5, OC_C1S7 ; r5 = OC_C1S7
  292. MOV r9, r9, ASR #16 ; r9 = t[0]
  293. MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
  294. ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]
  295. MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
  296. MOV r4, r4, ASR #16 ; r4 = t[4]
  297. MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
  298. MOV r11,r11,ASR #16 ; r11= t[7]
  299. MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
  300. ADD r10,r9, r10,ASR #16 ; r10= t[1] = t[0]+t[2]
  301. RSB r6, r10,r9, LSL #1 ; r6 = t[2] = t[0]-t[2]
  302. ; r3 = t2[0] = t[0]+t[3]
  303. RSB r9, r3, r9, LSL #1 ; r9 = t2[3] = t[0]-t[3]
  304. MOV r12,r12,ASR #16 ; r12= t[6]
  305. ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
  306. RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
  307. ADD r11,r3, r11 ; r11= t2[0]+t[7]
  308. ADD r5, r10,r5 ; r5 = t[1]+t2[6]
  309. ADD r12,r6, r12 ; r12= t[2]+t2[5]
  310. ADD r4, r9, r4 ; r4 = t2[3]+t[4]
  311. STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
  312. STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
  313. STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
  314. STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
  315. RSB r11,r11,r3, LSL #1 ; r11= t2[0] - t[7]
  316. RSB r5, r5, r10,LSL #1 ; r5 = t[1] - t2[6]
  317. RSB r12,r12,r6, LSL #1 ; r12= t[2] - t2[5]
  318. RSB r4, r4, r9, LSL #1 ; r4 = t2[3] - t[4]
  319. STRH r4, [r0, #62] ; y[4] = t2[3]-t[4]
  320. STRH r12,[r0, #78] ; y[5] = t[2]-t2[5]
  321. STRH r5, [r0, #94] ; y[6] = t[1]-t2[6]
  322. STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
  323. MOV PC,R14
  324. ENDP
  325. idct3core_down_arm PROC
  326. LDRSH r9, [r1], #16 ; r9 = x[0]
  327. LDR r12,OC_C4S4 ; r12= OC_C4S4
  328. LDRSH r3, [r1, #-12] ; r3 = x[2]
  329. LDR r10,OC_C6S2 ; r10= OC_C6S2
  330. MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
  331. LDR r4, OC_C2S6 ; r4 = OC_C2S6
  332. MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
  333. LDRSH r11,[r1, #-14] ; r11= x[1]
  334. MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
  335. LDR r4, OC_C7S1 ; r4 = OC_C7S1
  336. LDR r5, OC_C1S7 ; r5 = OC_C1S7
  337. MOV r9, r9, ASR #16 ; r9 = t[0]
  338. MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
  339. ADD r9, r9, #8 ; r9 = t[0]+8
  340. MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
  341. ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]+8
  342. MOV r4, r4, ASR #16 ; r4 = t[4]
  343. MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
  344. MOV r11,r11,ASR #16 ; r11= t[7]
  345. MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
  346. ADD r10,r9, r10,ASR #16 ; r10= t[1]+8 = t[0]+t[2]+8
  347. RSB r6, r10,r9, LSL #1 ; r6 = t[2]+8 = t[0]-t[2]+8
  348. ; r3 = t2[0]+8 = t[0]+t[3]+8
  349. RSB r9, r3, r9, LSL #1 ; r9 = t2[3]+8 = t[0]-t[3]+8
  350. MOV r12,r12,ASR #16 ; r12= t[6]
  351. ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
  352. RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
  353. ADD r11,r3, r11 ; r11= t2[0]+t[7] +8
  354. ADD r5, r10,r5 ; r5 = t[1] +t2[6]+8
  355. ADD r12,r6, r12 ; r12= t[2] +t2[5]+8
  356. ADD r4, r9, r4 ; r4 = t2[3]+t[4] +8
  357. RSB r3, r11,r3, LSL #1 ; r3 = t2[0] - t[7] + 8
  358. RSB r10,r5, r10,LSL #1 ; r10= t[1] - t2[6] + 8
  359. RSB r6, r12,r6, LSL #1 ; r6 = t[2] - t2[5] + 8
  360. RSB r9, r4, r9, LSL #1 ; r9 = t2[3] - t[4] + 8
  361. ; TODO: This is wrong.
  362. ; The C code truncates to 16 bits by storing to RAM and doing the
  363. ; shifts later; we've got an extra 4 bits here.
  364. MOV r11,r11,ASR #4
  365. MOV r5, r5, ASR #4
  366. MOV r12,r12,ASR #4
  367. MOV r4, r4, ASR #4
  368. MOV r9, r9, ASR #4
  369. MOV r6, r6, ASR #4
  370. MOV r10,r10,ASR #4
  371. MOV r3, r3, ASR #4
  372. STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
  373. STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
  374. STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
  375. STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
  376. STRH r9, [r0, #62] ; y[4] = t2[3]-t[4]
  377. STRH r6, [r0, #78] ; y[5] = t[2]-t2[5]
  378. STRH r10,[r0, #94] ; y[6] = t[1]-t2[6]
  379. STRH r3, [r0, #110] ; y[7] = t2[0]-t[7]
  380. MOV PC,R14
  381. ENDP
  382. idct4core_arm PROC
  383. ; r0 = ogg_int16_t *_y (destination)
  384. ; r1 = const ogg_int16_t *_x (source)
  385. LDRSH r9, [r1], #16 ; r9 = x[0]
  386. LDR r10,OC_C4S4 ; r10= OC_C4S4
  387. LDRSH r12,[r1, #-12] ; r12= x[2]
  388. LDR r4, OC_C6S2 ; r4 = OC_C6S2
  389. MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
  390. LDR r5, OC_C2S6 ; r5 = OC_C2S6
  391. MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
  392. LDRSH r3, [r1, #-14] ; r3 = x[1]
  393. MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
  394. LDR r6, OC_C7S1 ; r6 = OC_C7S1
  395. LDR r12,OC_C1S7 ; r12= OC_C1S7
  396. LDRSH r11,[r1, #-10] ; r11= x[3]
  397. MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
  398. LDR r7, OC_C5S3 ; r7 = OC_C5S3
  399. MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
  400. LDR r8, OC_C3S5 ; r8 = OC_C3S5
  401. MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
  402. MOV r9, r9, ASR #16 ; r9 = t[0]
  403. MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
  404. MOV r6, r6, ASR #16 ; r6 = t[4]
  405. ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
  406. ; before multiplying, not after (this is not equivalent)
  407. SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
  408. RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
  409. MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
  410. MOV r3, r3, ASR #16 ; r3 = t[7]
  411. ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
  412. RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
  413. MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
  414. ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2]
  415. RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2]
  416. ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3]
  417. RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3]
  418. MOV r3, r3, ASR #16 ; r3 = t2[6]
  419. ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
  420. RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
  421. ADD r11,r5, r11 ; r11= t[0]+t2[7]
  422. ADD r6, r4, r6 ; r6 = t[1]+t3[6]
  423. ADD r3, r10,r3 ; r3 = t[2]+t3[5]
  424. ADD r7, r9, r7 ; r7 = t[3]+t2[4]
  425. STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
  426. STRH r6, [r0, #14] ; y[1] = t[1]+t2[6]
  427. STRH r3, [r0, #30] ; y[2] = t[2]+t2[5]
  428. STRH r7, [r0, #46] ; y[3] = t2[3]+t[4]
  429. RSB r11,r11,r5, LSL #1 ; r11= t[0]-t2[7]
  430. RSB r6, r6, r4, LSL #1 ; r6 = t[1]-t3[6]
  431. RSB r3, r3, r10,LSL #1 ; r3 = t[2]-t3[5]
  432. RSB r7, r7, r9, LSL #1 ; r7 = t[3]-t2[4]
  433. STRH r7, [r0, #62] ; y[4] = t2[3]-t[4]
  434. STRH r3, [r0, #78] ; y[5] = t[2]-t2[5]
  435. STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
  436. STRH r11, [r0, #110] ; y[7] = t2[0]-t[7]
  437. MOV PC,r14
  438. ENDP
  439. idct4core_down_arm PROC
  440. ; r0 = ogg_int16_t *_y (destination)
  441. ; r1 = const ogg_int16_t *_x (source)
  442. LDRSH r9, [r1], #16 ; r9 = x[0]
  443. LDR r10,OC_C4S4 ; r10= OC_C4S4
  444. LDRSH r12,[r1, #-12] ; r12= x[2]
  445. LDR r4, OC_C6S2 ; r4 = OC_C6S2
  446. MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
  447. LDR r5, OC_C2S6 ; r5 = OC_C2S6
  448. MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
  449. LDRSH r3, [r1, #-14] ; r3 = x[1]
  450. MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
  451. LDR r6, OC_C7S1 ; r6 = OC_C7S1
  452. LDR r12,OC_C1S7 ; r12= OC_C1S7
  453. LDRSH r11,[r1, #-10] ; r11= x[3]
  454. MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
  455. LDR r7, OC_C5S3 ; r7 = OC_C5S3
  456. MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
  457. LDR r8, OC_C3S5 ; r8 = OC_C3S5
  458. MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
  459. MOV r9, r9, ASR #16 ; r9 = t[0]
  460. MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
  461. MOV r6, r6, ASR #16 ; r6 = t[4]
  462. ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
  463. ; before multiplying, not after (this is not equivalent)
  464. SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
  465. RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
  466. MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
  467. MOV r3, r3, ASR #16 ; r3 = t[7]
  468. ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
  469. RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
  470. ADD r9, r9, #8 ; r9 = t[0]+8
  471. MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
  472. ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] + 8
  473. RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] + 8
  474. ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] + 8
  475. RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] + 8
  476. MOV r3, r3, ASR #16 ; r3 = t2[6]
  477. ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
  478. RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
  479. ADD r5, r5, r11 ; r5 = t[0]+t2[7]+8
  480. ADD r4, r4, r6 ; r4 = t[1]+t3[6]+8
  481. ADD r10,r10,r3 ; r10= t[2]+t3[5]+8
  482. ADD r9, r9, r7 ; r9 = t[3]+t2[4]+8
  483. SUB r11,r5, r11,LSL #1 ; r11= t[0]-t2[7]+8
  484. SUB r6, r4, r6, LSL #1 ; r6 = t[1]-t3[6]+8
  485. SUB r3, r10,r3, LSL #1 ; r3 = t[2]-t3[5]+8
  486. SUB r7, r9, r7, LSL #1 ; r7 = t[3]-t2[4]+8
  487. ; TODO: This is wrong.
  488. ; The C code truncates to 16 bits by storing to RAM and doing the
  489. ; shifts later; we've got an extra 4 bits here.
  490. MOV r11,r11,ASR #4
  491. MOV r6, r6, ASR #4
  492. MOV r3, r3, ASR #4
  493. MOV r7, r7, ASR #4
  494. MOV r9, r9, ASR #4
  495. MOV r10,r10,ASR #4
  496. MOV r4, r4, ASR #4
  497. MOV r5, r5, ASR #4
  498. STRH r5,[r0], #2 ; y[0] = t[0]+t[7]
  499. STRH r4, [r0, #14] ; y[1] = t[1]+t2[6]
  500. STRH r10,[r0, #30] ; y[2] = t[2]+t2[5]
  501. STRH r9, [r0, #46] ; y[3] = t2[3]+t[4]
  502. STRH r7, [r0, #62] ; y[4] = t2[3]-t[4]
  503. STRH r3, [r0, #78] ; y[5] = t[2]-t2[5]
  504. STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
  505. STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
  506. MOV PC,r14
  507. ENDP
  508. idct8core_arm PROC
  509. ; r0 = ogg_int16_t *_y (destination)
  510. ; r1 = const ogg_int16_t *_x (source)
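; One 8-point 1-D transform, done in four butterfly stages that mirror the
; staged structure of the reference C code the comments below compare
; against; reads one row of *_x and writes the result (transposed) into *_y.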
  511. LDRSH r2, [r1],#16 ; r2 = x[0]
  512. STMFD r13!,{r1,r14}
  513. LDRSH r6, [r1, #-8] ; r6 = x[4]
  514. LDR r12,OC_C4S4 ; r12= C4S4
  515. LDRSH r4, [r1, #-12] ; r4 = x[2]
  516. ADD r2, r2, r6 ; r2 = x[0] + x[4]
  517. SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
  518. ; For spec compliance, these sums must be truncated to 16-bit precision
  519. ; _before_ the multiply (not after).
  520. ; Sadly, ARMv4 provides no simple way to do that.
  521. MOV r2, r2, LSL #16
  522. MOV r6, r6, LSL #16
  523. MOV r2, r2, ASR #16
  524. MOV r6, r6, ASR #16
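; The LSL #16/ASR #16 pairs above do perform that truncation, just at the
; cost of two extra instructions per value: e.g. a 17-bit sum 0x00012345
; becomes 0x23450000 and then sign-extends back down to 0x00002345.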
  525. MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
  526. LDRSH r8, [r1, #-4] ; r8 = x[6]
  527. LDR r7, OC_C6S2 ; r7 = OC_C6S2
  528. MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
  529. LDR r14,OC_C2S6 ; r14= OC_C2S6
  530. MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
  531. LDR r5, OC_C7S1 ; r5 = OC_C7S1
  532. MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
  533. MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
  534. MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
  535. MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
  536. MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
  537. LDR r7, OC_C1S7 ; r7 = OC_C1S7
  538. SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
  539. LDRSH r14,[r1, #-14] ; r14= x[1]
  540. ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
  541. LDRSH r8, [r1, #-2] ; r8 = x[7]
  542. MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
  543. LDRSH r10,[r1, #-6] ; r10= x[5]
  544. MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
  545. MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
  546. MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
  547. MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
  548. MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
  549. LDRSH r1, [r1, #-10] ; r1 = x[3]
  550. LDR r5, OC_C3S5 ; r5 = OC_C3S5
  551. LDR r11,OC_C5S3 ; r11= OC_C5S3
  552. ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
  553. MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
  554. SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
  555. MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
  556. MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
  557. MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
  558. MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
  559. MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
  560. SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
  561. ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
  562. ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
  563. ; r10=t[6] r12=C4S4 r14=t[5]
  564. ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
  565. ; before multiplying, not after (this is not equivalent)
  566. ; Stage 2
  567. ; 4-5 butterfly
  568. ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
  569. SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
  570. MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
  571. ; 7-6 butterfly
  572. ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
  573. SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
  574. MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
  575. ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
  576. ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
  577. ; Stage 3
  578. ; 0-3 butterfly
  579. ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3]
  580. SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3]
  581. ; 1-2 butterfly
  582. ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2]
  583. SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2]
  584. ; 6-5 butterfly
  585. MOV r14,r14,ASR #16 ; r14= t2[5]
  586. ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
  587. SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
  588. ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
  589. ; r10=t3[6] r14=t3[5]
  590. ; Stage 4
  591. ADD r2, r2, r8 ; r2 = t[0] + t[7]
  592. ADD r6, r6, r10 ; r6 = t[1] + t[6]
  593. ADD r3, r3, r14 ; r3 = t[2] + t[5]
  594. ADD r4, r4, r9 ; r4 = t[3] + t[4]
  595. SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7]
  596. SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6]
  597. SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5]
  598. SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4]
  599. STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
  600. STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
  601. STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
  602. STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
  603. STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
  604. STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
  605. STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
  606. STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
  607. LDMFD r13!,{r1,PC}
  608. ENDP
  609. idct8core_down_arm PROC
  610. ; r0 = ogg_int16_t *_y (destination)
  611. ; r1 = const ogg_int16_t *_x (source)
  612. LDRSH r2, [r1],#16 ; r2 = x[0]
  613. STMFD r13!,{r1,r14}
  614. LDRSH r6, [r1, #-8] ; r6 = x[4]
  615. LDR r12,OC_C4S4 ; r12= C4S4
  616. LDRSH r4, [r1, #-12] ; r4 = x[2]
  617. ADD r2, r2, r6 ; r2 = x[0] + x[4]
  618. SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
  619. ; For spec compliance, these sums must be truncated to 16-bit precision
  620. ; _before_ the multiply (not after).
  621. ; Sadly, ARMv4 provides no simple way to do that.
  622. MOV r2, r2, LSL #16
  623. MOV r6, r6, LSL #16
  624. MOV r2, r2, ASR #16
  625. MOV r6, r6, ASR #16
  626. MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
  627. LDRSH r8, [r1, #-4] ; r8 = x[6]
  628. LDR r7, OC_C6S2 ; r7 = OC_C6S2
  629. MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
  630. LDR r14,OC_C2S6 ; r14= OC_C2S6
  631. MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
  632. LDR r5, OC_C7S1 ; r5 = OC_C7S1
  633. MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
  634. MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
  635. MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
  636. MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
  637. MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
  638. LDR r7, OC_C1S7 ; r7 = OC_C1S7
  639. SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
  640. LDRSH r14,[r1, #-14] ; r14= x[1]
  641. ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
  642. LDRSH r8, [r1, #-2] ; r8 = x[7]
  643. MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
  644. LDRSH r10,[r1, #-6] ; r10= x[5]
  645. MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
  646. MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
  647. MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
  648. MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
  649. MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
  650. LDRSH r1, [r1, #-10] ; r1 = x[3]
  651. LDR r5, OC_C3S5 ; r5 = OC_C3S5
  652. LDR r11,OC_C5S3 ; r11= OC_C5S3
  653. ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
  654. MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
  655. SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
  656. MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
  657. MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
  658. MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
  659. MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
  660. MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
  661. SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
  662. ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
  663. ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
  664. ; r10=t[6] r12=C4S4 r14=t[5]
  665. ; Stage 2
  666. ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
  667. ; before multiplying, not after (this is not equivalent)
  668. ; 4-5 butterfly
  669. ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
  670. SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
  671. MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
  672. ; 7-6 butterfly
  673. ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
  674. SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
  675. MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
  676. ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
  677. ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
  678. ; Stage 3
  679. ADD r2, r2, #8<<16 ; r2 = t[0]+8<<16
  680. ADD r6, r6, #8<<16 ; r6 = t[1]+8<<16
  681. ; 0-3 butterfly
  682. ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] + 8
  683. SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] + 8
  684. ; 1-2 butterfly
  685. ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] + 8
  686. SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] + 8
  687. ; 6-5 butterfly
  688. MOV r14,r14,ASR #16 ; r14= t2[5]
  689. ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
  690. SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
  691. ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
  692. ; r10=t3[6] r14=t3[5]
  693. ; Stage 4
  694. ADD r2, r2, r8 ; r2 = t[0] + t[7] + 8
  695. ADD r6, r6, r10 ; r6 = t[1] + t[6] + 8
  696. ADD r3, r3, r14 ; r3 = t[2] + t[5] + 8
  697. ADD r4, r4, r9 ; r4 = t[3] + t[4] + 8
  698. SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] + 8
  699. SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] + 8
  700. SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] + 8
  701. SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] + 8
  702. ; TODO: This is wrong.
  703. ; The C code truncates to 16 bits by storing to RAM and doing the
  704. ; shifts later; we've got an extra 4 bits here.
  705. MOV r2, r2, ASR #4
  706. MOV r6, r6, ASR #4
  707. MOV r3, r3, ASR #4
  708. MOV r4, r4, ASR #4
  709. MOV r8, r8, ASR #4
  710. MOV r10,r10,ASR #4
  711. MOV r14,r14,ASR #4
  712. MOV r9, r9, ASR #4
  713. STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
  714. STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
  715. STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
  716. STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
  717. STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
  718. STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
  719. STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
  720. STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
  721. LDMFD r13!,{r1,PC}
  722. ENDP
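; The routines below are only assembled when the ARMv6 media extensions are
; available (OC_ARM_ASM_MEDIA). They pack two 16-bit coefficients per
; register and use SMULW*/PKHBT/SADD16/SSUB16 to process two rows (or
; columns) of the block per pass.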
  723. [ OC_ARM_ASM_MEDIA
  724. EXPORT oc_idct8x8_1_v6
  725. EXPORT oc_idct8x8_v6
  726. oc_idct8x8_1_v6 PROC
  727. ; r0 = ogg_int16_t *_y
  728. ; r1 = ogg_uint16_t _dc
  729. ORR r2, r1, r1, LSL #16
  730. ORR r3, r1, r1, LSL #16
  731. STRD r2, [r0], #8
  732. STRD r2, [r0], #8
  733. STRD r2, [r0], #8
  734. STRD r2, [r0], #8
  735. STRD r2, [r0], #8
  736. STRD r2, [r0], #8
  737. STRD r2, [r0], #8
  738. STRD r2, [r0], #8
  739. STRD r2, [r0], #8
  740. STRD r2, [r0], #8
  741. STRD r2, [r0], #8
  742. STRD r2, [r0], #8
  743. STRD r2, [r0], #8
  744. STRD r2, [r0], #8
  745. STRD r2, [r0], #8
  746. STRD r2, [r0], #8
  747. MOV PC, r14
  748. ENDP
  749. oc_idct8x8_v6 PROC
  750. ; r0 = ogg_int16_t *_y
  751. ; r1 = ogg_int16_t *_x
  752. ; r2 = int _last_zzi
  753. CMP r2, #3
  754. BLE oc_idct8x8_3_v6
  755. ;CMP r2, #6
  756. ;BLE oc_idct8x8_6_v6
  757. CMP r2, #10
  758. BLE oc_idct8x8_10_v6
  759. oc_idct8x8_slow_v6
  760. STMFD r13!,{r4-r11,r14}
  761. SUB r13,r13,#64*2
  762. ; Row transforms
  763. STR r0, [r13,#-4]!
  764. ADD r0, r13, #4 ; Write to temp storage.
  765. BL idct8_8core_v6
  766. BL idct8_8core_v6
  767. BL idct8_8core_v6
  768. BL idct8_8core_v6
  769. LDR r0, [r13], #4 ; Write to the final destination.
  770. ; Clear input data for next block.
  771. MOV r4, #0
  772. MOV r5, #0
  773. STRD r4, [r1,#-8*16]!
  774. STRD r4, [r1,#8]
  775. STRD r4, [r1,#16]
  776. STRD r4, [r1,#24]
  777. STRD r4, [r1,#32]
  778. STRD r4, [r1,#40]
  779. STRD r4, [r1,#48]
  780. STRD r4, [r1,#56]
  781. STRD r4, [r1,#64]
  782. STRD r4, [r1,#72]
  783. STRD r4, [r1,#80]
  784. STRD r4, [r1,#88]
  785. STRD r4, [r1,#96]
  786. STRD r4, [r1,#104]
  787. STRD r4, [r1,#112]
  788. STRD r4, [r1,#120]
  789. MOV r1, r13 ; And read from temp storage.
  790. ; Column transforms
  791. BL idct8_8core_down_v6
  792. BL idct8_8core_down_v6
  793. BL idct8_8core_down_v6
  794. BL idct8_8core_down_v6
  795. ADD r13,r13,#64*2
  796. LDMFD r13!,{r4-r11,PC}
  797. ENDP
  798. oc_idct8x8_10_v6 PROC
  799. STMFD r13!,{r4-r11,r14}
  800. SUB r13,r13,#64*2+4
  801. ; Row transforms
  802. MOV r2, r13
  803. STR r0, [r13,#-4]!
  804. AND r0, r2, #4 ; Align the stack.
  805. ADD r0, r0, r2 ; Write to temp storage.
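; The AND/ADD pair above rounds the (4-byte aligned) stack pointer up to an
; 8-byte boundary so the LDRD/STRD accesses on the temp buffer are
; doubleword aligned.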
  806. BL idct4_3core_v6
  807. BL idct2_1core_v6
  808. LDR r0, [r13], #4 ; Write to the final destination.
  809. ; Clear input data for next block.
  810. MOV r4, #0
  811. MOV r5, #0
  812. STRD r4, [r1,#-4*16]!
  813. STRD r4, [r1,#16]
  814. STR r4, [r1,#32]
  815. STR r4, [r1,#48]
  816. AND r1, r13,#4 ; Align the stack.
  817. ADD r1, r1, r13 ; And read from temp storage.
  818. ; Column transforms
  819. BL idct4_4core_down_v6
  820. BL idct4_4core_down_v6
  821. BL idct4_4core_down_v6
  822. BL idct4_4core_down_v6
  823. ADD r13,r13,#64*2+4
  824. LDMFD r13!,{r4-r11,PC}
  825. ENDP
  826. oc_idct8x8_3_v6 PROC
  827. STMFD r13!,{r4-r8,r14}
  828. SUB r13,r13,#64*2
  829. ; Row transforms
  830. MOV r8, r0
  831. MOV r0, r13 ; Write to temp storage.
  832. BL idct2_1core_v6
  833. ; Clear input data for next block.
  834. MOV r4, #0
  835. STR r4, [r1,#-2*16]!
  836. STR r4, [r1,#16]
  837. MOV r1, r13 ; Read from temp storage.
  838. MOV r0, r8 ; Write to the final destination.
  839. ; Column transforms
  840. BL idct2_2core_down_v6
  841. BL idct2_2core_down_v6
  842. BL idct2_2core_down_v6
  843. BL idct2_2core_down_v6
  844. ADD r13,r13,#64*2
  845. LDMFD r13!,{r4-r8,PC}
  846. ENDP
  847. idct2_1core_v6 PROC
  848. ; r0 = ogg_int16_t *_y (destination)
  849. ; r1 = const ogg_int16_t *_x (source)
  850. ; Stage 1:
  851. LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]>
  852. LDR r3, OC_C4S4
  853. LDRSH r6, [r1], #16 ; r6 = x[1,0]
  854. SMULWB r12,r3, r2 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16
  855. LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7
  856. SMULWB r6, r3, r6 ; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
  857. SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
  858. SMULWT r7, r5, r2 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  859. ; Stage 2:
  860. SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  861. PKHBT r12,r12,r6, LSL #16 ; r12= <t[1,0]|t[0,0]>
  862. SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  863. PKHBT r7, r7, r3 ; r7 = <0|t[0,7]>
  864. ; Stage 3:
  865. PKHBT r5, r6, r5, LSL #16 ; r5 = <t[0,5]|t[0,6]>
  866. PKHBT r4, r4, r3 ; r4 = <0|t[0,4]>
  867. SASX r5, r5, r5 ; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
  868. ; Stage 4:
  869. PKHTB r6, r3, r5, ASR #16 ; r6 = <0|t[0,6]>
  870. PKHBT r5, r5, r3 ; r5 = <0|t[0,5]>
  871. SADD16 r3, r12,r7 ; r3 = t[0]+t[7]
  872. STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7]
  873. SADD16 r3, r12,r6 ; r3 = t[0]+t[6]
  874. STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6]
  875. SADD16 r3, r12,r5 ; r3 = t[0]+t[5]
  876. STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5]
  877. SADD16 r3, r12,r4 ; r3 = t[0]+t[4]
  878. STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4]
  879. SSUB16 r4, r12,r4 ; r4 = t[0]-t[4]
  880. STR r4, [r0, #60] ; y[4<<3] = t[0]-t[4]
  881. SSUB16 r5, r12,r5 ; r5 = t[0]-t[5]
  882. STR r5, [r0, #76] ; y[5<<3] = t[0]-t[5]
  883. SSUB16 r6, r12,r6 ; r6 = t[0]-t[6]
  884. STR r6, [r0, #92] ; y[6<<3] = t[0]-t[6]
  885. SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]
  886. STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7]
  887. MOV PC,r14
  888. ENDP
  889. ]
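; The constants below are the transform cosines in Q16 fixed point:
; OC_CxSy = round(65536*cos(x*pi/16)) = round(65536*sin(y*pi/16)), e.g.
; OC_C4S4 = round(65536*cos(pi/4)) = 46341 = 0xB505 and
; OC_C7S1 = round(65536*cos(7*pi/16)) = 12785 = 0x31F1.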
  890. ALIGN 8
  891. OC_C7S1
  892. DCD 12785 ; 31F1
  893. OC_C1S7
  894. DCD 64277 ; FB15
  895. OC_C6S2
  896. DCD 25080 ; 61F8
  897. OC_C2S6
  898. DCD 60547 ; EC83
  899. OC_C5S3
  900. DCD 36410 ; 8E3A
  901. OC_C3S5
  902. DCD 54491 ; D4DB
  903. OC_C4S4
  904. DCD 46341 ; B505
  905. [ OC_ARM_ASM_MEDIA
  906. idct2_2core_down_v6 PROC
  907. ; r0 = ogg_int16_t *_y (destination)
  908. ; r1 = const ogg_int16_t *_x (source)
  909. ; Stage 1:
  910. LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]>
  911. LDR r3, OC_C4S4
  912. MOV r7 ,#8 ; r7 = 8
  913. LDR r6, [r1], #16 ; r6 = <x[1,1]|x[1,0]>
  914. SMLAWB r12,r3, r2, r7 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
  915. LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7
  916. SMLAWB r7, r3, r6, r7 ; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
  917. SMULWT r5, r5, r2 ; r5 = t[0,7]=OC_C1S7*x[0,1]>>16
  918. PKHBT r12,r12,r7, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
  919. SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
  920. ; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
  921. PKHBT r7, r5, r5, LSL #16 ; r7 = <t[0,7]|t[0,7]>
  922. ; Stage 2:
  923. SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  924. PKHBT r4, r4, r4, LSL #16 ; r4 = <t[0,4]|t[0,4]>
  925. SMULWT r2, r3, r7 ; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
  926. SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  927. PKHBT r6, r6, r2, LSL #16 ; r6 = <t[1,6]|t[0,6]>
  928. SMULWT r2, r3, r4 ; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
  929. PKHBT r2, r5, r2, LSL #16 ; r2 = <t[1,5]|t[0,5]>
  930. ; Stage 3:
  931. SSUB16 r5, r6, r2 ; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
  932. SADD16 r6, r6, r2 ; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
  933. ; Stage 4:
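; Each packed SADD16/SSUB16 result below holds two 16-bit lanes that both
; need the final >>4: the high lane is shifted with the ASR #4, the low lane
; is moved to the top half (LSL #16) and shifted by the PKHTB's ASR #20, and
; the two halves are then repacked.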
  934. SADD16 r2, r12,r7 ; r2 = t[0]+t[7]+8
  935. MOV r3, r2, ASR #4
  936. MOV r2, r2, LSL #16
  937. PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[7]+8>>4
  938. STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4
  939. SADD16 r2, r12,r6 ; r2 = t[0]+t[6]+8
  940. MOV r3, r2, ASR #4
  941. MOV r2, r2, LSL #16
  942. PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[6]+8>>4
  943. STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6]+8>>4
  944. SADD16 r2, r12,r5 ; r2 = t[0]+t[5]+8
  945. MOV r3, r2, ASR #4
  946. MOV r2, r2, LSL #16
  947. PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[5]+8>>4
  948. STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5]+8>>4
  949. SADD16 r2, r12,r4 ; r2 = t[0]+t[4]+8
  950. MOV r3, r2, ASR #4
  951. MOV r2, r2, LSL #16
  952. PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[4]+8>>4
  953. STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4]+8>>4
  954. SSUB16 r4, r12,r4 ; r4 = t[0]-t[4]+8
  955. MOV r3, r4, ASR #4
  956. MOV r4, r4, LSL #16
  957. PKHTB r3, r3, r4, ASR #20 ; r3 = t[0]-t[4]+8>>4
  958. STR r3, [r0, #60] ; y[4<<3] = t[0]-t[4]+8>>4
  959. SSUB16 r5, r12,r5 ; r5 = t[0]-t[5]+8
  960. MOV r3, r5, ASR #4
  961. MOV r5, r5, LSL #16
  962. PKHTB r3, r3, r5, ASR #20 ; r3 = t[0]-t[5]+8>>4
  963. STR r3, [r0, #76] ; y[5<<3] = t[0]-t[5]+8>>4
  964. SSUB16 r6, r12,r6 ; r6 = t[0]-t[6]+8
  965. MOV r3, r6, ASR #4
  966. MOV r6, r6, LSL #16
  967. PKHTB r3, r3, r6, ASR #20 ; r3 = t[0]-t[6]+8>>4
  968. STR r3, [r0, #92] ; y[6<<3] = t[0]-t[6]+8>>4
  969. SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]+8
  970. MOV r3, r7, ASR #4
  971. MOV r7, r7, LSL #16
  972. PKHTB r3, r3, r7, ASR #20 ; r3 = t[0]-t[7]+8>>4
  973. STR r3, [r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4
  974. MOV PC,r14
  975. ENDP
  976. ; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
  977. ; pay for the increased branch mis-prediction of getting here, but in practice
  978. ; taking it out doesn't seem to slow anything down, and it's less code this
  979. ; way.
  980. [ 0
  981. oc_idct8x8_6_v6 PROC
  982. STMFD r13!,{r4-r8,r10,r11,r14}
  983. SUB r13,r13,#64*2+4
  984. ; Row transforms
  985. MOV r8, r0
  986. AND r0, r13,#4 ; Align the stack.
  987. ADD r0, r0, r13 ; Write to temp storage.
  988. BL idct3_2core_v6
  989. BL idct1core_v6
  990. ; Clear input data for next block.
  991. MOV r4, #0
  992. MOV r5, #0
  993. STRD r4, [r1,#-3*16]!
  994. STR r4, [r1,#16]
  995. STR r4, [r1,#32]
  996. AND r1, r13,#4 ; Align the stack.
  997. MOV r0, r8 ; Write to the final destination.
  998. ADD r1, r1, r13 ; And read from temp storage.
  999. ; Column transforms
  1000. BL idct3_3core_down_v6
  1001. BL idct3_3core_down_v6
  1002. BL idct3_3core_down_v6
  1003. BL idct3_3core_down_v6
  1004. ADD r13,r13,#64*2+4
  1005. LDMFD r13!,{r4-r8,r10,r11,PC}
  1006. ENDP
  1007. idct1core_v6 PROC
  1008. ; r0 = ogg_int16_t *_y (destination)
  1009. ; r1 = const ogg_int16_t *_x (source)
  1010. LDRSH r3, [r1], #16
  1011. MOV r12,#0x05
  1012. ORR r12,r12,#0xB500
  1013. MUL r3, r12, r3
  1014. ; Stall ?
  1015. MOV r3, r3, ASR #16
  1016. ; Don't need to actually store the odd lines; they won't be read.
  1017. STRH r3, [r0], #2
  1018. STRH r3, [r0, #30]
  1019. STRH r3, [r0, #62]
  1020. STRH r3, [r0, #94]
  1021. MOV PC,R14
  1022. ENDP
  1023. idct3_2core_v6 PROC
  1024. ; r0 = ogg_int16_t *_y (destination)
  1025. ; r1 = const ogg_int16_t *_x (source)
  1026. ; Stage 1:
  1027. LDRD r4, [r1], #16 ; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
  1028. LDRD r10,OC_C6S2_3_v6 ; r10= OC_C6S2; r11= OC_C2S6
  1029. ; Stall
  1030. SMULWB r3, r11,r5 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1031. LDR r11,OC_C4S4
  1032. SMULWB r2, r10,r5 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1033. LDR r5, [r1], #16 ; r5 = <x[1,1]|x[1,0]>
  1034. SMULWB r12,r11,r4 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
  1035. LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
  1036. SMULWB r10,r11,r5 ; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
  1037. PKHBT r12,r12,r10,LSL #16 ; r12= <t[1,0]|t[0,0]>
  1038. SMULWT r10,r7, r5 ; r10= t[1,7]=OC_C1S7*x[1,1]>>16
  1039. PKHBT r2, r2, r11 ; r2 = <0|t[0,2]>
  1040. SMULWT r7, r7, r4 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1041. PKHBT r3, r3, r11 ; r3 = <0|t[0,3]>
  1042. SMULWT r5, r6, r5 ; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
  1043. PKHBT r7, r7, r10,LSL #16 ; r7 = <t[1,7]|t[0,7]>
  1044. SMULWT r4, r6, r4 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
  1045. ; Stage 2:
  1046. SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  1047. PKHBT r4, r4, r5, LSL #16 ; r4 = <t[1,4]|t[0,4]>
  1048. SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16
  1049. SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  1050. PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]>
  1051. SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16
  1052. ; Stage 3:
  1053. B idct4_3core_stage3_v6
  1054. ENDP
  1055. ; Another copy so the LDRD offsets are less than +/- 255.
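; (An LDRD literal load only has an 8-bit immediate offset, so each group of
; routines keeps its own copy of the pool within 255 bytes.)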
  1056. ALIGN 8
  1057. OC_C7S1_3_v6
  1058. DCD 12785 ; 31F1
  1059. OC_C1S7_3_v6
  1060. DCD 64277 ; FB15
  1061. OC_C6S2_3_v6
  1062. DCD 25080 ; 61F8
  1063. OC_C2S6_3_v6
  1064. DCD 60547 ; EC83
  1065. idct3_3core_down_v6 PROC
  1066. ; r0 = ogg_int16_t *_y (destination)
  1067. ; r1 = const ogg_int16_t *_x (source)
  1068. ; Stage 1:
  1069. LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
  1070. LDRD r6, OC_C6S2_3_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
  1071. LDR r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>
  1072. SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1073. MOV r7,#8
  1074. SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1075. LDR r11,OC_C4S4
  1076. SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
  1077. ; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
  1078. PKHBT r3, r3, r3, LSL #16 ; r3 = <t[0,3]|t[0,3]>
  1079. SMLAWB r5, r11,r4, r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
  1080. PKHBT r2, r2, r2, LSL #16 ; r2 = <t[0,2]|t[0,2]>
  1081. LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
  1082. PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
  1083. SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
  1084. SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1085. SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
  1086. PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
  1087. SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
  1088. ; Stage 2:
  1089. SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  1090. PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
  1091. SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16
  1092. SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  1093. PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]>
  1094. SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16
  1095. ; Stage 3:
  1096. B idct4_4core_down_stage3_v6
  1097. ENDP
  1098. ]
  1099. idct4_3core_v6 PROC
  1100. ; r0 = ogg_int16_t *_y (destination)
  1101. ; r1 = const ogg_int16_t *_x (source)
  1102. ; Stage 1:
  1103. LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
  1104. LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5
  1105. LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
  1106. SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
  1107. SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
  1108. PKHBT r9, r9, r2 ; r9 = <0|t[0,6]>
  1109. LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
  1110. PKHBT r8, r8, r2 ; r8 = <0|-t[0,5]>
  1111. SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1112. SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1113. LDR r11,OC_C4S4
  1114. SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16
  1115. SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
  1116. PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]>
  1117. SMULWB r12,r11,r10 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16
  1118. PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]>
  1119. SMULWB r5, r11,r4 ; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
  1120. LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
  1121. PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]|t[0,0]>
  1122. SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
  1123. SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1124. SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
  1125. PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
  1126. SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
  1127. ; Stage 2:
  1128. SSUB16 r6, r7, r9 ; r6 = t[7]-t[6]
  1129. PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
  1130. SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6]
  1131. SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16
  1132. SADD16 r5, r4, r8 ; r5 = t[4]-t[5]
  1133. SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16
  1134. SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5]
  1135. SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16
  1136. PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]>
  1137. SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16
  1138. ; Stage 3:
  1139. idct4_3core_stage3_v6
  1140. SADD16 r11,r12,r2 ; r11= t[1]=t[0]+t[2]
  1141. PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]>
  1142. SSUB16 r2, r12,r2 ; r2 = t[2]=t[0]-t[2]
  1143. idct4_3core_stage3_5_v6
  1144. SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5]
  1145. SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5]
  1146. SADD16 r10,r12,r3 ; r10= t[0]'=t[0]+t[3]
  1147. SSUB16 r3, r12,r3 ; r3 = t[3]=t[0]-t[3]
  1148. ; Stage 4:
  1149. SADD16 r12,r10,r7 ; r12= t[0]+t[7]
  1150. STR r12,[r0], #4 ; y[0<<3] = t[0]+t[7]
  1151. SADD16 r12,r11,r6 ; r12= t[1]+t[6]
  1152. STR r12,[r0, #12] ; y[1<<3] = t[1]+t[6]
  1153. SADD16 r12,r2, r5 ; r12= t[2]+t[5]
  1154. STR r12,[r0, #28] ; y[2<<3] = t[2]+t[5]
  1155. SADD16 r12,r3, r4 ; r12= t[3]+t[4]
  1156. STR r12,[r0, #44] ; y[3<<3] = t[3]+t[4]
  1157. SSUB16 r4, r3, r4 ; r4 = t[3]-t[4]
  1158. STR r4, [r0, #60] ; y[4<<3] = t[3]-t[4]
  1159. SSUB16 r5, r2, r5 ; r5 = t[2]-t[5]
  1160. STR r5, [r0, #76] ; y[5<<3] = t[2]-t[5]
  1161. SSUB16 r6, r11,r6 ; r6 = t[1]-t[6]
  1162. STR r6, [r0, #92] ; y[6<<3] = t[1]-t[6]
  1163. SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]
  1164. STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7]
  1165. MOV PC,r14
  1166. ENDP
  1167. ; Another copy so the LDRD offsets are less than +/- 255.
  1168. ALIGN 8
  1169. OC_C7S1_4_v6
  1170. DCD 12785 ; 31F1
  1171. OC_C1S7_4_v6
  1172. DCD 64277 ; FB15
  1173. OC_C6S2_4_v6
  1174. DCD 25080 ; 61F8
  1175. OC_C2S6_4_v6
  1176. DCD 60547 ; EC83
  1177. OC_C5S3_4_v6
  1178. DCD 36410 ; 8E3A
  1179. OC_C3S5_4_v6
  1180. DCD 54491 ; D4DB
  1181. idct4_4core_down_v6 PROC
  1182. ; r0 = ogg_int16_t *_y (destination)
  1183. ; r1 = const ogg_int16_t *_x (source)
  1184. ; Stage 1:
  1185. LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
  1186. LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5
  1187. LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
  1188. SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
  1189. LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
  1190. SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
  1191. ; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
  1192. PKHBT r9, r9, r9, LSL #16 ; r9 = <t[0,6]|t[0,6]>
  1193. SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1194. PKHBT r8, r8, r8, LSL #16 ; r8 = <-t[0,5]|-t[0,5]>
  1195. SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1196. LDR r11,OC_C4S4
  1197. SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16
  1198. MOV r7,#8
  1199. SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
  1200. PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]>
  1201. SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
  1202. PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]>
  1203. SMLAWB r5, r11,r4 ,r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
  1204. LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
  1205. PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
  1206. SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
  1207. SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1208. SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
  1209. PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
  1210. SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
  1211. ; Stage 2:
  1212. SSUB16 r6, r7, r9 ; r6 = t[7]-t[6]
  1213. PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
  1214. SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6]
  1215. SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16
  1216. SADD16 r5, r4, r8 ; r5 = t[4]-t[5]
  1217. SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16
  1218. SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5]
  1219. SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16
  1220. PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]>
  1221. SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16
  1222. ; Stage 3:
  1223. idct4_4core_down_stage3_v6
  1224. SADD16 r11,r12,r2 ; r11= t[1]+8=t[0]+t[2]+8
  1225. PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]>
  1226. SSUB16 r2, r12,r2 ; r2 = t[2]+8=t[0]-t[2]+8
  1227. B idct8_8core_down_stage3_5_v6
  1228. ENDP
  1229. idct8_8core_v6 PROC
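; Full 8-point core for the ARMv6 path: two rows are processed per call, with
; each 32-bit register holding a <row1|row0> pair of 16-bit values, so four
; calls cover all eight rows.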
  1230. STMFD r13!,{r0,r14}
  1231. ; Stage 1:
  1232. ;5-6 rotation by 3pi/16
  1233. LDRD r10,OC_C5S3_4_v6 ; r10= OC_C5S3, r11= OC_C3S5
  1234. LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]>
  1235. LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]>
  1236. SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16
  1237. LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]>
  1238. SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16
  1239. LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]>
  1240. SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16
  1241. SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16
  1242. SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
  1243. PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5>
  1244. SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
  1245. PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]>
  1246. SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16
  1247. PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]>
  1248. SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16
  1249. ;2-3 rotation by 6pi/16
  1250. LDRD r10,OC_C6S2_4_v6 ; r10= OC_C6S2, r11= OC_C2S6
  1251. PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3>
  1252. LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]>
  1253. SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16
  1254. SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]>
  1255. SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16
  1256. LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]>
  1257. SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16
  1258. SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16
  1259. PKHBT r2, r2, r9, LSL #16 ; r2 = <r9|r2>
  1260. SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
  1261. SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
  1262. SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16
  1263. PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,3]|t[0,3]>
  1264. SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16
  1265. ;4-7 rotation by 7pi/16
  1266. LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7
  1267. PKHBT r9, r9, r12,LSL #16 ; r9 = <r9|r12>
  1268. LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]>
  1269. PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]>
  1270. SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]>
  1271. SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16
  1272. LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]>
  1273. SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16
  1274. SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16
  1275. SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16
  1276. SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
  1277. PKHBT r8, r8, r10,LSL #16 ; r8 = <r12|r8>
  1278. SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
  1279. PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]>
  1280. SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,6]>>16
  1281. PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]>
  1282. SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,6]>>16
  1283. ;0-1 butterfly
  1284. LDR r11,OC_C4S4
  1285. PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10>
  1286. SADD16 r7, r0, r4 ; r7 = x[0]+x[4]
  1287. SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]>
  1288. SSUB16 r4, r0, r4 ; r4 = x[0]-x[4]
  1289. SMULWB r8, r11,r7 ; r8 = t[0,0]=OC_C4S4*r7B>>16
  1290. SMULWT r12,r11,r7 ; r12= t[1,0]=OC_C4S4*r7T>>16
  1291. SMULWB r7, r11,r4 ; r7 = t[0,1]=OC_C4S4*r4B>>16
  1292. PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]|t[0,0]>
  1293. SMULWT r8, r11,r4 ; r8 = t[1,1]=OC_C4S4*r4T>>16
  1294. ; Stage 2:
  1295. SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5]
  1296. PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,0]|t[0,0]>
    SSUB16 r5, r10,r5 ; r5 = t[4]-t[5]
    SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16
    SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6]
    SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16
    SSUB16 r6, r9, r6 ; r6 = t[7]-t[6]
    SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16
    PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]>
    SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16
    ; Stage 3:
    SADD16 r11,r8, r2 ; r11= t[1]'=t[1]+t[2]
    PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]>
    SSUB16 r2, r8, r2 ; r2 = t[2]=t[1]-t[2]
    LDMFD r13!,{r0,r14}
    B idct4_3core_stage3_5_v6
    ENDP
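; For reference, a compact C sketch of the Stage 1 rotations that
; idct8_8core_v6 performs on two sets of eight coefficients at once, packed
; as halfword pairs (an editor's illustration, not part of the original
; file; the numeric constants are the Q16 OC_CxSy values and the 64-bit
; product mimics SMULWB/SMULWT):
;   #define OC_M(_c,_x) ((int)((long long)(_c)*(_x)>>16))
;   static void idct_stage1(const short x[8],int t[8]){
;     t[0]=OC_M(46341,x[0]+x[4]);             /* OC_C4S4 */
;     t[1]=OC_M(46341,x[0]-x[4]);
;     t[2]=OC_M(25080,x[2])-OC_M(60547,x[6]); /* OC_C6S2, OC_C2S6 */
;     t[3]=OC_M(60547,x[2])+OC_M(25080,x[6]);
;     t[4]=OC_M(12785,x[1])-OC_M(64277,x[7]); /* OC_C7S1, OC_C1S7 */
;     t[7]=OC_M(64277,x[1])+OC_M(12785,x[7]);
;     t[5]=OC_M(54491,x[5])-OC_M(36410,x[3]); /* OC_C3S5, OC_C5S3 */
;     t[6]=OC_M(36410,x[5])+OC_M(54491,x[3]);
;   }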
    ; Another copy so the LDRD offsets are less than +/- 255.
    ALIGN 8
OC_C7S1_8_v6
    DCD 12785 ; 31F1
OC_C1S7_8_v6
    DCD 64277 ; FB15
OC_C6S2_8_v6
    DCD 25080 ; 61F8
OC_C2S6_8_v6
    DCD 60547 ; EC83
OC_C5S3_8_v6
    DCD 36410 ; 8E3A
OC_C3S5_8_v6
    DCD 54491 ; D4DB
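; These Q16 constants are the scaled cosines used throughout the transform:
; OC_CiS(8-i) == round(65536*cos(i*pi/16)), e.g. OC_C4S4 == 46341, which is
; 65536/sqrt(2). A sketch of the derivation (an editor's illustration, not
; part of the original file; M_PI may need a feature macro on some hosts):
;   #include <math.h>
;   static unsigned oc_q16_cos(int _i){
;     return (unsigned)floor(65536*cos(_i*M_PI/16)+0.5);
;   }
; oc_q16_cos(1)..oc_q16_cos(7) reproduce 64277, 60547, 54491, 46341, 36410,
; 25080 and 12785 above.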
idct8_8core_down_v6 PROC
    STMFD r13!,{r0,r14}
    ; Stage 1:
    ;5-6 rotation by 3pi/16
    LDRD r10,OC_C5S3_8_v6 ; r10= OC_C5S3, r11= OC_C3S5
    LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]>
    LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]>
    SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16
    LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]>
    SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16
    LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]>
    SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16
    SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16
    SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
    PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5>
    SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
    PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]>
    SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16
    PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]>
    SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16
    ;2-3 rotation by 6pi/16
    LDRD r10,OC_C6S2_8_v6 ; r10= OC_C6S2, r11= OC_C2S6
    PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3>
    LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]>
    SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16
    SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]>
    SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16
    LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]>
    SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16
    SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16
    PKHBT r2, r2, r9, LSL #16 ; r2 = <r9|r2>
    SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
    SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
    SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16
    PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,3]|t[0,3]>
    SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16
    ;4-7 rotation by 7pi/16
    LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7
    PKHBT r9, r9, r12,LSL #16 ; r9 = <r12|r9>
    LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]>
    PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]>
    SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]>
    SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16
    LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]>
    SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16
    SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16
    SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16
    SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
    PKHBT r8, r8, r10,LSL #16 ; r8 = <r10|r8>
    SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
    PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]>
    SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,7]>>16
    PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]>
    SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,7]>>16
    ;0-1 butterfly
    LDR r11,OC_C4S4
    MOV r14,#8
    PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10>
    SADD16 r7, r0, r4 ; r7 = x[0]+x[4]
    SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]>
    SMLAWB r8, r11,r7, r14 ; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
    SSUB16 r4, r0, r4 ; r4 = x[0]-x[4]
    SMLAWT r12,r11,r7, r14 ; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
    SMLAWB r7, r11,r4, r14 ; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
    PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
    SMLAWT r8, r11,r4, r14 ; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
    ; Stage 2:
    SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5]
    PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,1]+8|t[0,1]+8>
    SSUB16 r5, r10,r5 ; r5 = t[4]-t[5]
    SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16
    SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6]
    SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16
    SSUB16 r6, r9, r6 ; r6 = t[7]-t[6]
    SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16
    PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]>
    SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16
    ; Stage 3:
    SADD16 r11,r8, r2 ; r11= t[1]'+8=t[1]+t[2]+8
    PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]>
    SSUB16 r2, r8, r2 ; r2 = t[2]+8=t[1]-t[2]+8
    LDMFD r13!,{r0,r14}
idct8_8core_down_stage3_5_v6
    SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5]
    SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5]
    SADD16 r10,r12,r3 ; r10= t[0]'+8=t[0]+t[3]+8
    SSUB16 r3, r12,r3 ; r3 = t[3]+8=t[0]-t[3]+8
    ; Stage 4:
    SADD16 r12,r10,r7 ; r12= t[0]+t[7]+8
    SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]+8
    MOV r10,r12,ASR #4
    MOV r12,r12,LSL #16
    PKHTB r10,r10,r12,ASR #20 ; r10= t[0]+t[7]+8>>4
    STR r10,[r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4
    SADD16 r12,r11,r6 ; r12= t[1]+t[6]+8
    SSUB16 r6, r11,r6 ; r6 = t[1]-t[6]+8
    MOV r10,r12,ASR #4
    MOV r12,r12,LSL #16
    PKHTB r10,r10,r12,ASR #20 ; r10= t[1]+t[6]+8>>4
    STR r10,[r0, #12] ; y[1<<3] = t[1]+t[6]+8>>4
    SADD16 r12,r2, r5 ; r12= t[2]+t[5]+8
    SSUB16 r5, r2, r5 ; r5 = t[2]-t[5]+8
    MOV r10,r12,ASR #4
    MOV r12,r12,LSL #16
    PKHTB r10,r10,r12,ASR #20 ; r10= t[2]+t[5]+8>>4
    STR r10,[r0, #28] ; y[2<<3] = t[2]+t[5]+8>>4
    SADD16 r12,r3, r4 ; r12= t[3]+t[4]+8
    SSUB16 r4, r3, r4 ; r4 = t[3]-t[4]+8
    MOV r10,r12,ASR #4
    MOV r12,r12,LSL #16
    PKHTB r10,r10,r12,ASR #20 ; r10= t[3]+t[4]+8>>4
    STR r10,[r0, #44] ; y[3<<3] = t[3]+t[4]+8>>4
    MOV r10,r4, ASR #4
    MOV r4, r4, LSL #16
    PKHTB r10,r10,r4, ASR #20 ; r10= t[3]-t[4]+8>>4
    STR r10,[r0, #60] ; y[4<<3] = t[3]-t[4]+8>>4
    MOV r10,r5, ASR #4
    MOV r5, r5, LSL #16
    PKHTB r10,r10,r5, ASR #20 ; r10= t[2]-t[5]+8>>4
    STR r10,[r0, #76] ; y[5<<3] = t[2]-t[5]+8>>4
    MOV r10,r6, ASR #4
    MOV r6, r6, LSL #16
    PKHTB r10,r10,r6, ASR #20 ; r10= t[1]-t[6]+8>>4
    STR r10,[r0, #92] ; y[6<<3] = t[1]-t[6]+8>>4
    MOV r10,r7, ASR #4
    MOV r7, r7, LSL #16
    PKHTB r10,r10,r7, ASR #20 ; r10= t[0]-t[7]+8>>4
    STR r10,[r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4
    MOV PC,r14
    ENDP
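; The output loop above shifts two packed 16-bit results right by four with a
; MOV/MOV/PKHTB triple. A C sketch of what that computes for one packed word
; (an editor's illustration, not part of the original; the +8 rounding bias
; was already folded in by this "down" variant):
;   static unsigned oc_pack_asr4(unsigned _v){
;     unsigned hi=(unsigned)((int)_v>>4);        /* MOV r10,r12,ASR #4          */
;     unsigned lo=(unsigned)((int)(_v<<16)>>20); /* MOV LSL #16 + PKHTB ASR #20 */
;     return (hi&0xFFFF0000U)|(lo&0xFFFFU);      /* PKHTB merges <top|bottom>   */
;   }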
    ]
    [ OC_ARM_ASM_NEON
    EXPORT oc_idct8x8_1_neon
    EXPORT oc_idct8x8_neon
    ALIGN 16
OC_IDCT_CONSTS_NEON
    DCW 8
    DCW 64277 ; FB15 (C1S7)
    DCW 60547 ; EC83 (C2S6)
    DCW 54491 ; D4DB (C3S5)
    DCW 46341 ; B505 (C4S4)
    DCW 36410 ; 8E3A (C5S3)
    DCW 25080 ; 61F8 (C6S2)
    DCW 12785 ; 31F1 (C7S1)
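; Several of these constants are >=0x8000, so when VMULL.S16 reads them as
; signed 16-bit lanes of D0/D1 the product comes out as OC_CxSy*x-(x<<16);
; after VSHRN #16 that is (OC_CxSy*x>>16)-x, which the code below compensates
; for with one extra VADD/VSUB of x. A one-lane C sketch (an editor's
; illustration, not part of the original):
;   static int oc_vmull_shrn_lane(short _c_as_s16,short _x){
;     return (int)_c_as_s16*_x>>16; /* == (c*x>>16)-x when c>=0x8000 */
;   }
; e.g. OC_C1S7==64277 is read as 64277-65536==-1259, so adding x once more
; recovers OC_C1S7*x>>16 exactly.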
oc_idct8x8_1_neon PROC
    ; r0 = ogg_int16_t *_y
    ; r1 = ogg_uint16_t _dc
    VDUP.S16 Q0, r1
    VMOV Q1, Q0
    VST1.64 {D0, D1, D2, D3}, [r0@128]!
    VST1.64 {D0, D1, D2, D3}, [r0@128]!
    VST1.64 {D0, D1, D2, D3}, [r0@128]!
    VST1.64 {D0, D1, D2, D3}, [r0@128]
    MOV PC, r14
    ENDP
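; A C equivalent of the DC-only path above (a sketch added for illustration,
; not the original implementation; _dc is assumed to be fully scaled by the
; caller, exactly as the VDUP/VST1 sequence assumes):
;   static void oc_idct8x8_1_sketch(short *_y,unsigned short _dc){
;     int i;
;     for(i=0;i<64;i++)_y[i]=(short)_dc;
;   }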
oc_idct8x8_neon PROC
    ; r0 = ogg_int16_t *_y
    ; r1 = ogg_int16_t *_x
    ; r2 = int _last_zzi
    CMP r2, #10
    BLE oc_idct8x8_10_neon
oc_idct8x8_slow_neon
    VPUSH {D8-D15}
    MOV r2, r1
    ADR r3, OC_IDCT_CONSTS_NEON
    ; Row transforms (input is pre-transposed)
    VLD1.64 {D16,D17,D18,D19}, [r2@128]!
    VLD1.64 {D20,D21,D22,D23}, [r2@128]!
    VLD1.64 {D24,D25,D26,D27}, [r2@128]!
    VSUB.S16 Q1, Q8, Q12 ; Q1 = x[0]-x[4]
    VLD1.64 {D28,D29,D30,D31}, [r2@128]
    VADD.S16 Q8, Q8, Q12 ; Q8 = x[0]+x[4]
    VLD1.64 {D0,D1}, [r3@128]
    MOV r12, r14
    BL oc_idct8x8_stage123_neon
    ; Stage 4
    VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]'
    VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]'
    VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]''
    VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]''
    VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]''
    VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]''
    VTRN.16 Q14,Q15
    VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]'
    VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]'
    ; 8x8 Transpose
    VTRN.16 Q8, Q9
    VTRN.16 Q10,Q11
    VTRN.16 Q12,Q13
    VTRN.32 Q8, Q10
    VTRN.32 Q9, Q11
    VTRN.32 Q12,Q14
    VTRN.32 Q13,Q15
    VSWP D17,D24
    VSUB.S16 Q1, Q8, Q12 ; Q1 = x[0]-x[4]
    VSWP D19,D26
    VADD.S16 Q8, Q8, Q12 ; Q8 = x[0]+x[4]
    VSWP D21,D28
    VSWP D23,D30
    ; Column transforms
    BL oc_idct8x8_stage123_neon
    ; We have to put the return address back in the LR, or the branch
    ; predictor will not recognize the function return and mis-predict the
    ; entire call stack.
    MOV r14, r12
    ; Stage 4
    VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]'
    VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]'
    VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]''
    VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]''
    VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]''
    VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]''
    VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]'
    VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]'
    VMOV.I8 Q2,#0
    VPOP {D8-D15}
    VMOV.I8 Q3,#0
    VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
    VST1.64 {D4, D5, D6, D7}, [r1@128]!
    VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
    VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
    VST1.64 {D4, D5, D6, D7}, [r1@128]!
    VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
    VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
    VST1.64 {D4, D5, D6, D7}, [r1@128]!
    VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
    VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
    VST1.64 {D4, D5, D6, D7}, [r1@128]
    VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
    VSTMIA r0, {D16-D31}
    MOV PC, r14
    ENDP
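; The tail of the slow path above rounds each result with VRSHR.S16 #4, i.e.
; (y+8)>>4, stores it to _y and zeroes the input block through r1. A C sketch
; of that final step (an editor's illustration; oc_idct8x8_finish is a
; hypothetical helper, not a function in this file):
;   static void oc_idct8x8_finish(short *_y,const short *_w,short *_x){
;     int i;
;     for(i=0;i<64;i++)_y[i]=(short)(_w[i]+8>>4); /* VRSHR.S16 Qn,Qn,#4      */
;     for(i=0;i<64;i++)_x[i]=0;                   /* VST1.64 of zeroed Q2/Q3 */
;   }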
oc_idct8x8_stage123_neon PROC
    ; Stages 1 & 2
    VMULL.S16 Q4, D18,D1[3]
    VMULL.S16 Q5, D19,D1[3]
    VMULL.S16 Q7, D30,D1[3]
    VMULL.S16 Q6, D31,D1[3]
    VMULL.S16 Q2, D30,D0[1]
    VMULL.S16 Q3, D31,D0[1]
    VSHRN.S32 D8, Q4, #16
    VSHRN.S32 D9, Q5, #16 ; Q4 = (OC_C7S1*x[1]>>16)
    VSHRN.S32 D14,Q7, #16
    VSHRN.S32 D15,Q6, #16 ; Q7 = (OC_C7S1*x[7]>>16)
    VSHRN.S32 D4, Q2, #16
    VSHRN.S32 D5, Q3, #16 ; Q2 = (OC_C1S7*x[7]>>16)-x[7]
    VSUB.S16 Q4, Q4, Q15
    VADD.S16 Q7, Q7, Q9
    VSUB.S16 Q4, Q4, Q2 ; Q4 = t[4]
    VMULL.S16 Q2, D18,D0[1]
    VMULL.S16 Q9, D19,D0[1]
    VMULL.S16 Q5, D26,D0[3]
    VMULL.S16 Q3, D27,D0[3]
    VMULL.S16 Q6, D22,D0[3]
    VMULL.S16 Q12,D23,D0[3]
    VSHRN.S32 D4, Q2, #16
    VSHRN.S32 D5, Q9, #16 ; Q2 = (OC_C1S7*x[1]>>16)-x[1]
    VSHRN.S32 D10,Q5, #16
    VSHRN.S32 D11,Q3, #16 ; Q5 = (OC_C3S5*x[5]>>16)-x[5]
    VSHRN.S32 D12,Q6, #16
    VSHRN.S32 D13,Q12,#16 ; Q6 = (OC_C3S5*x[3]>>16)-x[3]
    VADD.S16 Q7, Q7, Q2 ; Q7 = t[7]
    VSUB.S16 Q5, Q5, Q11
    VADD.S16 Q6, Q6, Q11
    VADD.S16 Q5, Q5, Q13
    VADD.S16 Q6, Q6, Q13
    VMULL.S16 Q9, D22,D1[1]
    VMULL.S16 Q11,D23,D1[1]
    VMULL.S16 Q15,D26,D1[1]
    VMULL.S16 Q13,D27,D1[1]
    VMULL.S16 Q2, D20,D1[2]
    VMULL.S16 Q12,D21,D1[2]
    VSHRN.S32 D18,Q9, #16
    VSHRN.S32 D19,Q11,#16 ; Q9 = (OC_C5S3*x[3]>>16)-x[3]
    VSHRN.S32 D30,Q15,#16
    VSHRN.S32 D31,Q13,#16 ; Q15= (OC_C5S3*x[5]>>16)-x[5]
    VSHRN.S32 D4, Q2, #16
    VSHRN.S32 D5, Q12,#16 ; Q2 = (OC_C6S2*x[2]>>16)
    VSUB.S16 Q5, Q5, Q9 ; Q5 = t[5]
    VADD.S16 Q6, Q6, Q15 ; Q6 = t[6]
    VSUB.S16 Q2, Q2, Q14
    VMULL.S16 Q3, D28,D1[2]
    VMULL.S16 Q11,D29,D1[2]
    VMULL.S16 Q12,D28,D0[2]
    VMULL.S16 Q9, D29,D0[2]
    VMULL.S16 Q13,D20,D0[2]
    VMULL.S16 Q15,D21,D0[2]
    VSHRN.S32 D6, Q3, #16
    VSHRN.S32 D7, Q11,#16 ; Q3 = (OC_C6S2*x[6]>>16)
    VSHRN.S32 D24,Q12,#16
    VSHRN.S32 D25,Q9, #16 ; Q12= (OC_C2S6*x[6]>>16)-x[6]
    VSHRN.S32 D26,Q13,#16
    VSHRN.S32 D27,Q15,#16 ; Q13= (OC_C2S6*x[2]>>16)-x[2]
    VSUB.S16 Q9, Q4, Q5 ; Q9 = t[4]-t[5]
    VSUB.S16 Q11,Q7, Q6 ; Q11= t[7]-t[6]
    VADD.S16 Q3, Q3, Q10
    VADD.S16 Q4, Q4, Q5 ; Q4 = t[4]'=t[4]+t[5]
    VADD.S16 Q7, Q7, Q6 ; Q7 = t[7]'=t[7]+t[6]
    VSUB.S16 Q2, Q2, Q12 ; Q2 = t[2]
    VADD.S16 Q3, Q3, Q13 ; Q3 = t[3]
    VMULL.S16 Q12,D16,D1[0]
    VMULL.S16 Q13,D17,D1[0]
    VMULL.S16 Q14,D2, D1[0]
    VMULL.S16 Q15,D3, D1[0]
    VMULL.S16 Q5, D18,D1[0]
    VMULL.S16 Q6, D22,D1[0]
    VSHRN.S32 D24,Q12,#16
    VSHRN.S32 D25,Q13,#16
    VSHRN.S32 D28,Q14,#16
    VSHRN.S32 D29,Q15,#16
    VMULL.S16 Q13,D19,D1[0]
    VMULL.S16 Q15,D23,D1[0]
    VADD.S16 Q8, Q8, Q12 ; Q8 = t[0]
    VADD.S16 Q1, Q1, Q14 ; Q1 = t[1]
    VSHRN.S32 D10,Q5, #16
    VSHRN.S32 D12,Q6, #16
    VSHRN.S32 D11,Q13,#16
    VSHRN.S32 D13,Q15,#16
    VADD.S16 Q5, Q5, Q9 ; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
    VADD.S16 Q6, Q6, Q11 ; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
    ; Stage 3
    VSUB.S16 Q11,Q8, Q3 ; Q11 = t[3]''=t[0]-t[3]
    VADD.S16 Q8, Q8, Q3 ; Q8 = t[0]''=t[0]+t[3]
    VADD.S16 Q9, Q1, Q2 ; Q9 = t[1]''=t[1]+t[2]
    VADD.S16 Q3, Q6, Q5 ; Q3 = t[6]''=t[6]'+t[5]'
    VSUB.S16 Q10,Q1, Q2 ; Q10 = t[2]''=t[1]-t[2]
    VSUB.S16 Q5, Q6, Q5 ; Q5 = t[5]''=t[6]'-t[5]'
    MOV PC, r14
    ENDP
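; Stage 3 above is a pair of plain butterflies on t[0..3] plus the
; t[5]''/t[6]'' butterfly on the stage-2 outputs. One-column C sketch (an
; editor's illustration, not part of the original):
;   static void idct_stage3(int t[8]){
;     int u;
;     u=t[0]-t[3]; t[0]+=t[3]; t[3]=u;   /* Q8/Q11                */
;     u=t[1]-t[2]; t[1]+=t[2]; t[2]=u;   /* Q9/Q10                */
;     u=t[6]-t[5]; t[6]+=t[5]; t[5]=u;   /* Q3/Q5 (t[6]'',t[5]'') */
;   }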
oc_idct8x8_10_neon PROC
    ADR r3, OC_IDCT_CONSTS_NEON
    VLD1.64 {D0,D1}, [r3@128]
    MOV r2, r1
    ; Row transforms (input is pre-transposed)
    ; Stage 1
    VLD1.64 {D16,D17,D18,D19},[r2@128]!
    MOV r12, #16
    VMULL.S16 Q15,D16,D1[0] ; Q15= OC_C4S4*x[0]-(x[0]<<16)
    VLD1.64 {D17}, [r2@64], r12
    VMULL.S16 Q2, D18,D0[1] ; Q2 = OC_C1S7*x[1]-(x[1]<<16)
    VLD1.64 {D19}, [r2@64]
    VMULL.S16 Q14,D17,D0[2] ; Q14= OC_C2S6*x[2]-(x[2]<<16)
    VMULL.S16 Q3, D19,D0[3] ; Q3 = OC_C3S5*x[3]-(x[3]<<16)
    VMULL.S16 Q13,D19,D1[1] ; Q13= OC_C5S3*x[3]-(x[3]<<16)
    VMULL.S16 Q12,D18,D1[3] ; Q12= OC_C7S1*x[1]
    VMULL.S16 Q1, D17,D1[2] ; Q1 = OC_C6S2*x[2]
    VSHRN.S32 D30,Q15,#16 ; D30= t[0]-x[0]
    VSHRN.S32 D4, Q2, #16 ; D4 = t[7]-x[1]
    VSHRN.S32 D31,Q14,#16 ; D31= t[3]-x[2]
    VSHRN.S32 D6, Q3, #16 ; D6 = t[6]-x[3]
    VSHRN.S32 D7, Q13,#16 ; D7 = -t[5]-x[3]
    VSHRN.S32 D5, Q12,#16 ; D5 = t[4]
    VSHRN.S32 D2, Q1, #16 ; D2 = t[2]
    VADD.S16 D4, D4, D18 ; D4 = t[7]
    VADD.S16 D6, D6, D19 ; D6 = t[6]
    VADD.S16 D7, D7, D19 ; D7 = -t[5]
    VADD.S16 Q15,Q15,Q8 ; D30= t[0]
    ; D31= t[3]
    ; Stages 2 & 3
    VSUB.S16 Q12,Q2, Q3 ; D24= t[7]-t[6]
    ; D25= t[4]'=t[4]+t[5]
    VADD.S16 Q13,Q2, Q3 ; D26= t[7]'=t[7]+t[6]
    ; D27= t[4]-t[5]
    VMULL.S16 Q11,D24,D1[0] ; Q11= OC_C4S4*(t[7]-t[6])
    ; -(t[7]-t[6]<<16)
    VMULL.S16 Q14,D27,D1[0] ; Q14= OC_C4S4*(t[4]-t[5])
    ; -(t[4]-t[5]<<16)
    VADD.S16 D16,D30,D31 ; D16= t[0]'=t[0]+t[3]
    VSUB.S16 D17,D30,D2 ; D17= t[2]'=t[0]-t[2]
    VADD.S16 D18,D30,D2 ; D18= t[1]'=t[0]+t[2]
    VSHRN.S32 D22,Q11,#16 ; D22= (OC_C4S4*(t[7]-t[6])>>16)
    ; -(t[7]-t[6])
    VSHRN.S32 D23,Q14,#16 ; D23= (OC_C4S4*(t[4]-t[5])>>16)
    ; -(t[4]-t[5])
    VSUB.S16 D19,D30,D31 ; D19= t[3]'=t[0]-t[3]
    VADD.S16 D22,D22,D24 ; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
    VADD.S16 D23,D23,D27 ; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
    VSUB.S16 D27,D22,D23 ; D27= t[5]''=t[6]'-t[5]'
    VADD.S16 D24,D22,D23 ; D24= t[6]''=t[6]'+t[5]'
    ; Stage 4
    VSUB.S16 Q11,Q8, Q13 ; D22= y[7]=t[0]'-t[7]'
    ; D23= y[5]=t[2]'-t[5]''
    VSUB.S16 Q10,Q9, Q12 ; D20= y[6]=t[1]'-t[6]''
    ; D21= y[4]=t[3]'-t[4]'
    VADD.S16 Q8, Q8, Q13 ; D16= y[0]=t[0]'+t[7]'
    ; D17= y[2]=t[2]'+t[5]''
    VADD.S16 Q9, Q9, Q12 ; D18= y[1]=t[1]'+t[6]''
    ; D19= y[3]=t[3]'+t[4]'
    ; 8x4 transpose
    VTRN.16 Q10,Q11 ; Q10= c5c4a5a4 c7c6a7a6
    ; Q11= d5d4b5b4 d7d6b7b6
    VTRN.16 Q8, Q9 ; Q8 = c3c2a3a2 c1c0a1a0
    ; Q9 = d3d2b3b2 d1d0b1b0
    VSWP D20,D21 ; Q10= c7c6a7a6 c5c4a5a4
    VSWP D22,D23 ; Q11= d7d6b7b6 d5d4b5b4
    VUZP.32 Q9, Q11 ; Q9 = b7b6b5b4 b3b2b1b0
    ; Q11= d7d6d5d4 d3d2d1d0
    VMULL.S16 Q15,D18,D0[1]
    VMULL.S16 Q13,D22,D1[1]
    VUZP.32 Q8, Q10 ; Q8 = a7a6a5a4 a3a2a1a0
    ; Q10= c7c6c5c4 c3c2c1c0
    ; Column transforms
    ; Stages 1, 2, & 3
    VMULL.S16 Q14,D19,D0[1] ; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
    VMULL.S16 Q12,D23,D1[1] ; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
    VMULL.S16 Q3, D22,D0[3]
    VMULL.S16 Q2, D23,D0[3] ; Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
    VSHRN.S32 D30,Q15,#16
    VSHRN.S32 D31,Q14,#16 ; Q15= (OC_C1S7*x[1]>>16)-x[1]
    VSHRN.S32 D26,Q13,#16
    VSHRN.S32 D27,Q12,#16 ; Q13= (OC_C5S3*x[3]>>16)-x[3]
    VSHRN.S32 D28,Q3, #16
    VSHRN.S32 D29,Q2, #16 ; Q14= (OC_C3S5*x[3]>>16)-x[3]
    VADD.S16 Q15,Q15,Q9 ; Q15= t[7]
    VADD.S16 Q13,Q13,Q11 ; Q13= -t[5]
    VADD.S16 Q14,Q14,Q11 ; Q14= t[6]
    VMULL.S16 Q12,D18,D1[3]
    VMULL.S16 Q2, D19,D1[3] ; Q2:Q12= OC_C7S1*x[1]
    VMULL.S16 Q1, D16,D1[0]
    VMULL.S16 Q11,D17,D1[0] ; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
    VMULL.S16 Q3, D20,D0[2]
    VMULL.S16 Q9, D21,D0[2] ; Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
    VSHRN.S32 D24,Q12,#16
    VSHRN.S32 D25,Q2, #16 ; Q12= t[4]
    VMULL.S16 Q2, D20,D1[2]
    VSHRN.S32 D2, Q1, #16
    VSHRN.S32 D3, Q11,#16 ; Q1 = (OC_C4S4*x[0]>>16)-x[0]
    VMULL.S16 Q11,D21,D1[2] ; Q2:Q11= OC_C6S2*x[2]
    VSHRN.S32 D6, Q3, #16
    VSHRN.S32 D7, Q9, #16 ; Q3 = (OC_C2S6*x[2]>>16)-x[2]
    VSUB.S16 Q9, Q15,Q14 ; Q9 = t[7]-t[6]
    VADD.S16 Q15,Q15,Q14 ; Q15= t[7]'=t[7]+t[6]
    VSHRN.S32 D4, Q2, #16
    VSHRN.S32 D5, Q11,#16 ; Q2 = t[2]
    VADD.S16 Q1, Q1, Q8 ; Q1 = t[0]
    VADD.S16 Q8, Q12,Q13 ; Q8 = t[4]-t[5]
    VADD.S16 Q3, Q3, Q10 ; Q3 = t[3]
    VMULL.S16 Q10,D16,D1[0]
    VMULL.S16 Q11,D17,D1[0] ; Q11:Q10= OC_C4S4*(t[4]-t[5])
    ; -(t[4]-t[5]<<16)
    VSUB.S16 Q12,Q12,Q13 ; Q12= t[4]'=t[4]+t[5]
    VMULL.S16 Q14,D18,D1[0]
    VMULL.S16 Q13,D19,D1[0] ; Q13:Q14= OC_C4S4*(t[7]-t[6])
    ; -(t[7]-t[6]<<16)
    VSHRN.S32 D20,Q10,#16
    VSHRN.S32 D21,Q11,#16 ; Q10= (OC_C4S4*(t[4]-t[5])>>16)
    ; -(t[4]-t[5])
    VADD.S16 Q11,Q1, Q3 ; Q11= t[0]'=t[0]+t[3]
    VSUB.S16 Q3, Q1, Q3 ; Q3 = t[3]'=t[0]-t[3]
    VSHRN.S32 D28,Q14,#16
    VSHRN.S32 D29,Q13,#16 ; Q14= (OC_C4S4*(t[7]-t[6])>>16)
    ; -(t[7]-t[6])
    VADD.S16 Q10,Q10,Q8 ; Q10=t[5]'
    VADD.S16 Q14,Q14,Q9 ; Q14=t[6]'
    VSUB.S16 Q13,Q14,Q10 ; Q13=t[5]''=t[6]'-t[5]'
    VADD.S16 Q14,Q14,Q10 ; Q14=t[6]''=t[6]'+t[5]'
    VADD.S16 Q10,Q1, Q2 ; Q10= t[1]'=t[0]+t[2]
    VSUB.S16 Q2, Q1, Q2 ; Q2 = t[2]'=t[0]-t[2]
    ; Stage 4
    VADD.S16 Q8, Q11,Q15 ; Q8 = y[0]=t[0]'+t[7]'
    VADD.S16 Q9, Q10,Q14 ; Q9 = y[1]=t[1]'+t[6]''
    VSUB.S16 Q15,Q11,Q15 ; Q15 = y[7]=t[0]'-t[7]'
    VSUB.S16 Q14,Q10,Q14 ; Q14 = y[6]=t[1]'-t[6]''
    VADD.S16 Q10,Q2, Q13 ; Q10 = y[2]=t[2]'+t[5]''
    VADD.S16 Q11,Q3, Q12 ; Q11 = y[3]=t[3]'+t[4]'
    VSUB.S16 Q12,Q3, Q12 ; Q12 = y[4]=t[3]'-t[4]'
    VSUB.S16 Q13,Q2, Q13 ; Q13 = y[5]=t[2]'-t[5]''
    VMOV.I8 D2, #0
    VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
    VST1.64 {D2}, [r1@64], r12
    VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
    VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
    VST1.64 {D2}, [r1@64], r12
    VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
    VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
    VST1.64 {D2}, [r1@64], r12
    VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
    VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
    VST1.64 {D2}, [r1@64]
    VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
    VSTMIA r0, {D16-D31}
    MOV PC, r14
    ENDP
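; When _last_zzi is at most 10, every nonzero coefficient lies in the
; top-left 4x4 corner of the pre-transposed block, which is why the path
; above only uses (and later clears) the first four halfwords of each of the
; first four rows. Stage 1 then degenerates to single multiplies; one-column
; C sketch (an editor's illustration, not part of the original):
;   #define OC_M(_c,_x) ((int)((long long)(_c)*(_x)>>16))
;   static void idct_stage1_sparse(const short x[4],int t[8]){
;     t[0]=t[1]=OC_M(46341,x[0]); /* OC_C4S4 */
;     t[2]=OC_M(25080,x[2]);      /* OC_C6S2 */
;     t[3]=OC_M(60547,x[2]);      /* OC_C2S6 */
;     t[4]=OC_M(12785,x[1]);      /* OC_C7S1 */
;     t[7]=OC_M(64277,x[1]);      /* OC_C1S7 */
;     t[5]=-OC_M(36410,x[3]);     /* -OC_C5S3*x[3], tracked as -t[5] above */
;     t[6]=OC_M(54491,x[3]);      /* OC_C3S5 */
;   }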
    ]
    END