/*File: mmxfdct.c (20 KB)*/
  /********************************************************************
   *                                                                  *
   * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
   * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
   * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
   * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
   *                                                                  *
   * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
   * by the Xiph.Org Foundation http://www.xiph.org/                  *
   *                                                                  *
   ********************************************************************/
  /*MMX fDCT implementation for x86_32*/
  /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
  14. #include "x86enc.h"
  15. #if defined(OC_X86_ASM)
  /*Stage 1 of the 8-point fDCT (the first butterfly layer), operating on the
     four 16-bit lanes of each MMX register.
    On entry, mm0...mm7 hold t0...t7.
    On exit, mm7, mm6, mm5, mm4 hold the sums t0'=t0+t7, t1'=t1+t6, t2'=t2+t5,
     t3'=t3+t4, and mm0, mm1, mm2, mm3 hold the differences t7'=t0-t7,
     t6'=t1-t6, t5'=t2-t5, t4'=t3-t4.
    Each sum is formed as (difference)+2*(subtrahend), so no scratch register
     and no memory is touched.
    This expands to a sequence of adjacent string literals for use inside an
     __asm__ statement.
    The final line deliberately has no line-continuation backslash: a trailing
     backslash would splice whatever follows this macro into its replacement
     list.*/
  # define OC_FDCT_STAGE1_8x4 \
   "#OC_FDCT_STAGE1_8x4\n\t" \
   /*Stage 1:*/ \
   /*mm0=t7'=t0-t7*/ \
   "psubw %%mm7,%%mm0\n\t" \
   "paddw %%mm7,%%mm7\n\t" \
   /*mm1=t6'=t1-t6*/ \
   "psubw %%mm6,%%mm1\n\t" \
   "paddw %%mm6,%%mm6\n\t" \
   /*mm2=t5'=t2-t5*/ \
   "psubw %%mm5,%%mm2\n\t" \
   "paddw %%mm5,%%mm5\n\t" \
   /*mm3=t4'=t3-t4*/ \
   "psubw %%mm4,%%mm3\n\t" \
   "paddw %%mm4,%%mm4\n\t" \
   /*mm7=t0'=t0+t7*/ \
   "paddw %%mm0,%%mm7\n\t" \
   /*mm6=t1'=t1+t6*/ \
   "paddw %%mm1,%%mm6\n\t" \
   /*mm5=t2'=t2+t5*/ \
   "paddw %%mm2,%%mm5\n\t" \
   /*mm4=t3'=t3+t4*/ \
   "paddw %%mm3,%%mm4\n\t"

  /*Stages 2 through 4 of the 8-point fDCT on the four 16-bit lanes of each MMX
     register.
    On entry, the registers hold the output of stage 1: mm0=t7', mm1=t6',
     mm2=t5', mm3=t4', mm4=t3', mm5=t2', mm6=t1', mm7=t0'.
    _r0..._r7 are string literals that are pasted in as displacements from
     %[y]; all eight quadwords they address are written (they are used as
     spill slots), so their previous contents are lost.
    %[a] is used as a scratch general-purpose register to build the packed
     multiplier constants (mov immediate, movd, punpckldq).
    On exit, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7],
     while _y[1] and _y[3] are in memory at _r1(%[y]) and _r3(%[y]); this is
     the layout OC_TRANSPOSE8x4 expects.
    The comments in front of each group describe the register contents once
     that group has executed.
    The final line has no line-continuation backslash so that the macro ends
     here regardless of what follows it.*/
  # define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
   "#OC_FDCT8x4\n\t" \
   /*Stage 2:*/ \
   /*mm7=t3''=t0'-t3'*/ \
   "psubw %%mm4,%%mm7\n\t" \
   "paddw %%mm4,%%mm4\n\t" \
   /*mm6=t2''=t1'-t2'*/ \
   "psubw %%mm5,%%mm6\n\t" \
   "movq %%mm7,"_r6"(%[y])\n\t" \
   "paddw %%mm5,%%mm5\n\t" \
   /*mm1=t5''=t6'-t5'*/ \
   "psubw %%mm2,%%mm1\n\t" \
   "movq %%mm6,"_r2"(%[y])\n\t" \
   /*mm4=t0''=t0'+t3'*/ \
   "paddw %%mm7,%%mm4\n\t" \
   "paddw %%mm2,%%mm2\n\t" \
   /*mm5=t1''=t1'+t2'*/ \
   "movq %%mm4,"_r0"(%[y])\n\t" \
   "paddw %%mm6,%%mm5\n\t" \
   /*mm2=t6''=t6'+t5'*/ \
   "paddw %%mm1,%%mm2\n\t" \
   "movq %%mm5,"_r4"(%[y])\n\t" \
   /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
   /*mm4, mm5, mm6, mm7 are free.*/ \
   /*Stage 3:*/ \
   /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
   "mov $0x5A806A0A,%[a]\n\t" \
   "pcmpeqb %%mm6,%%mm6\n\t" \
   "movd %[a],%%mm7\n\t" \
   "psrlw $15,%%mm6\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   "paddw %%mm6,%%mm6\n\t" \
   /*mm0=0, mm2={-1}x4 \
     mm5:mm4=t5''*27146+0xB500*/ \
   "movq %%mm1,%%mm4\n\t" \
   "movq %%mm1,%%mm5\n\t" \
   "punpcklwd %%mm6,%%mm4\n\t" \
   "movq %%mm2,"_r3"(%[y])\n\t" \
   "pmaddwd %%mm7,%%mm4\n\t" \
   "movq %%mm0,"_r7"(%[y])\n\t" \
   "punpckhwd %%mm6,%%mm5\n\t" \
   "pxor %%mm0,%%mm0\n\t" \
   "pmaddwd %%mm7,%%mm5\n\t" \
   "pcmpeqb %%mm2,%%mm2\n\t" \
   /*mm2=t6'', mm1=t5''+(t5''!=0) \
     mm4=(t5''*27146+0xB500>>16)*/ \
   "pcmpeqw %%mm1,%%mm0\n\t" \
   "psrad $16,%%mm4\n\t" \
   "psubw %%mm2,%%mm0\n\t" \
   "movq "_r3"(%[y]),%%mm2\n\t" \
   "psrad $16,%%mm5\n\t" \
   "paddw %%mm0,%%mm1\n\t" \
   "packssdw %%mm5,%%mm4\n\t" \
   /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
   "paddw %%mm1,%%mm4\n\t" \
   "movq "_r7"(%[y]),%%mm0\n\t" \
   "psraw $1,%%mm4\n\t" \
   "movq %%mm3,%%mm1\n\t" \
   /*mm3=t4''=t4'+s*/ \
   "paddw %%mm4,%%mm3\n\t" \
   /*mm1=t5'''=t4'-s*/ \
   "psubw %%mm4,%%mm1\n\t" \
   /*mm1=0, mm3={-1}x4 \
     mm5:mm4=t6''*27146+0xB500*/ \
   "movq %%mm2,%%mm4\n\t" \
   "movq %%mm2,%%mm5\n\t" \
   "punpcklwd %%mm6,%%mm4\n\t" \
   "movq %%mm1,"_r5"(%[y])\n\t" \
   "pmaddwd %%mm7,%%mm4\n\t" \
   "movq %%mm3,"_r1"(%[y])\n\t" \
   "punpckhwd %%mm6,%%mm5\n\t" \
   "pxor %%mm1,%%mm1\n\t" \
   "pmaddwd %%mm7,%%mm5\n\t" \
   "pcmpeqb %%mm3,%%mm3\n\t" \
   /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
   "psrad $16,%%mm4\n\t" \
   "pcmpeqw %%mm2,%%mm1\n\t" \
   "psrad $16,%%mm5\n\t" \
   "psubw %%mm3,%%mm1\n\t" \
   "packssdw %%mm5,%%mm4\n\t" \
   "paddw %%mm1,%%mm2\n\t" \
   /*mm1=t1'' \
     mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
   "paddw %%mm2,%%mm4\n\t" \
   "movq "_r4"(%[y]),%%mm1\n\t" \
   "psraw $1,%%mm4\n\t" \
   "movq %%mm0,%%mm2\n\t" \
   /*mm7={54491-0x7FFF,0x7FFF}x2 \
     mm0=t7''=t7'+s*/ \
   "paddw %%mm4,%%mm0\n\t" \
   /*mm2=t6'''=t7'-s*/ \
   "psubw %%mm4,%%mm2\n\t" \
   /*Stage 4:*/ \
   /*mm0=0, mm2=t0'' \
     mm5:mm4=t1''*27146+0xB500*/ \
   "movq %%mm1,%%mm4\n\t" \
   "movq %%mm1,%%mm5\n\t" \
   "punpcklwd %%mm6,%%mm4\n\t" \
   "movq %%mm2,"_r3"(%[y])\n\t" \
   "pmaddwd %%mm7,%%mm4\n\t" \
   "movq "_r0"(%[y]),%%mm2\n\t" \
   "punpckhwd %%mm6,%%mm5\n\t" \
   "movq %%mm0,"_r7"(%[y])\n\t" \
   "pmaddwd %%mm7,%%mm5\n\t" \
   "pxor %%mm0,%%mm0\n\t" \
   /*mm7={27146,0x4000>>1}x2 \
     mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
   "psrad $16,%%mm4\n\t" \
   "mov $0x20006A0A,%[a]\n\t" \
   "pcmpeqw %%mm1,%%mm0\n\t" \
   "movd %[a],%%mm7\n\t" \
   "psrad $16,%%mm5\n\t" \
   "psubw %%mm3,%%mm0\n\t" \
   "packssdw %%mm5,%%mm4\n\t" \
   "paddw %%mm1,%%mm0\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   "paddw %%mm4,%%mm0\n\t" \
   /*mm6={0x00000E3D}x2 \
     mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
   "movq %%mm2,%%mm4\n\t" \
   "movq %%mm2,%%mm5\n\t" \
   "punpcklwd %%mm6,%%mm4\n\t" \
   "mov $0x0E3D,%[a]\n\t" \
   "pmaddwd %%mm7,%%mm4\n\t" \
   "punpckhwd %%mm6,%%mm5\n\t" \
   "movd %[a],%%mm6\n\t" \
   "pmaddwd %%mm7,%%mm5\n\t" \
   "pxor %%mm1,%%mm1\n\t" \
   "punpckldq %%mm6,%%mm6\n\t" \
   "pcmpeqw %%mm2,%%mm1\n\t" \
   /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
   "psrad $16,%%mm4\n\t" \
   "psubw %%mm3,%%mm1\n\t" \
   "psrad $16,%%mm5\n\t" \
   "paddw %%mm1,%%mm2\n\t" \
   "packssdw %%mm5,%%mm4\n\t" \
   "movq "_r5"(%[y]),%%mm1\n\t" \
   "paddw %%mm2,%%mm4\n\t" \
   /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
     The naive implementation could cause overflow, so we use \
      u=(r&s)+((r^s)>>1).*/ \
   "movq "_r3"(%[y]),%%mm2\n\t" \
   "movq %%mm0,%%mm7\n\t" \
   "pxor %%mm4,%%mm0\n\t" \
   "pand %%mm4,%%mm7\n\t" \
   "psraw $1,%%mm0\n\t" \
   "mov $0x7FFF54DC,%[a]\n\t" \
   "paddw %%mm7,%%mm0\n\t" \
   "movd %[a],%%mm7\n\t" \
   /*mm7={54491-0x7FFF,0x7FFF}x2 \
     mm4=_y[4]=v=r-u*/ \
   "psubw %%mm0,%%mm4\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   "movq %%mm4,"_r4"(%[y])\n\t" \
   /*mm0=0, mm7={36410}x4 \
     mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
   "movq %%mm1,%%mm4\n\t" \
   "movq %%mm1,%%mm5\n\t" \
   "punpcklwd %%mm1,%%mm4\n\t" \
   "mov $0x8E3A8E3A,%[a]\n\t" \
   "pmaddwd %%mm7,%%mm4\n\t" \
   "movq %%mm0,"_r0"(%[y])\n\t" \
   "punpckhwd %%mm1,%%mm5\n\t" \
   "pxor %%mm0,%%mm0\n\t" \
   "pmaddwd %%mm7,%%mm5\n\t" \
   "pcmpeqw %%mm0,%%mm1\n\t" \
   "movd %[a],%%mm7\n\t" \
   "psubw %%mm3,%%mm1\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   "paddd %%mm6,%%mm4\n\t" \
   "paddd %%mm6,%%mm5\n\t" \
   /*mm0=0 \
     mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
   "movq %%mm2,%%mm6\n\t" \
   "movq %%mm2,%%mm3\n\t" \
   "pmulhw %%mm7,%%mm6\n\t" \
   "paddw %%mm2,%%mm1\n\t" \
   "pmullw %%mm7,%%mm3\n\t" \
   "pxor %%mm0,%%mm0\n\t" \
   "paddw %%mm1,%%mm6\n\t" \
   "movq %%mm3,%%mm1\n\t" \
   "punpckhwd %%mm6,%%mm3\n\t" \
   "punpcklwd %%mm6,%%mm1\n\t" \
   /*mm3={-1}x4, mm6={1}x4 \
     mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
   "paddd %%mm3,%%mm5\n\t" \
   "paddd %%mm1,%%mm4\n\t" \
   "psrad $16,%%mm5\n\t" \
   "pxor %%mm6,%%mm6\n\t" \
   "psrad $16,%%mm4\n\t" \
   "pcmpeqb %%mm3,%%mm3\n\t" \
   "packssdw %%mm5,%%mm4\n\t" \
   "psubw %%mm3,%%mm6\n\t" \
   /*mm1=t7'', mm7={26568,0x3400}x2 \
     mm2=s=t6'''-(36410*u>>16)*/ \
   "movq %%mm4,%%mm1\n\t" \
   "mov $0x340067C8,%[a]\n\t" \
   "pmulhw %%mm7,%%mm4\n\t" \
   "movd %[a],%%mm7\n\t" \
   "movq %%mm1,"_r5"(%[y])\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   "paddw %%mm1,%%mm4\n\t" \
   "movq "_r7"(%[y]),%%mm1\n\t" \
   "psubw %%mm4,%%mm2\n\t" \
   /*mm6={0x00007B1B}x2 \
     mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
   "movq %%mm2,%%mm4\n\t" \
   "movq %%mm2,%%mm5\n\t" \
   "punpcklwd %%mm6,%%mm4\n\t" \
   "pcmpeqw %%mm2,%%mm0\n\t" \
   "pmaddwd %%mm7,%%mm4\n\t" \
   "mov $0x7B1B,%[a]\n\t" \
   "punpckhwd %%mm6,%%mm5\n\t" \
   "movd %[a],%%mm6\n\t" \
   "pmaddwd %%mm7,%%mm5\n\t" \
   "psubw %%mm3,%%mm0\n\t" \
   "punpckldq %%mm6,%%mm6\n\t" \
   /*mm7={64277-0x7FFF,0x7FFF}x2 \
     mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
   "psrad $17,%%mm4\n\t" \
   "paddw %%mm0,%%mm2\n\t" \
   "psrad $17,%%mm5\n\t" \
   "mov $0x7FFF7B16,%[a]\n\t" \
   "packssdw %%mm5,%%mm4\n\t" \
   "movd %[a],%%mm7\n\t" \
   "paddw %%mm4,%%mm2\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   /*mm0=0, mm7={12785}x4 \
     mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
   "movq %%mm1,%%mm4\n\t" \
   "movq %%mm1,%%mm5\n\t" \
   "movq %%mm2,"_r3"(%[y])\n\t" \
   "punpcklwd %%mm1,%%mm4\n\t" \
   "movq "_r1"(%[y]),%%mm2\n\t" \
   "pmaddwd %%mm7,%%mm4\n\t" \
   "mov $0x31F131F1,%[a]\n\t" \
   "punpckhwd %%mm1,%%mm5\n\t" \
   "pxor %%mm0,%%mm0\n\t" \
   "pmaddwd %%mm7,%%mm5\n\t" \
   "pcmpeqw %%mm0,%%mm1\n\t" \
   "movd %[a],%%mm7\n\t" \
   "psubw %%mm3,%%mm1\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   "paddd %%mm6,%%mm4\n\t" \
   "paddd %%mm6,%%mm5\n\t" \
   /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
   "movq %%mm2,%%mm6\n\t" \
   "movq %%mm2,%%mm3\n\t" \
   "pmulhw %%mm7,%%mm6\n\t" \
   "pmullw %%mm7,%%mm3\n\t" \
   "paddw %%mm1,%%mm6\n\t" \
   "movq %%mm3,%%mm1\n\t" \
   "punpckhwd %%mm6,%%mm3\n\t" \
   "punpcklwd %%mm6,%%mm1\n\t" \
   /*mm3={-1}x4, mm6={1}x4 \
     mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
   "paddd %%mm3,%%mm5\n\t" \
   "paddd %%mm1,%%mm4\n\t" \
   "psrad $16,%%mm5\n\t" \
   "pxor %%mm6,%%mm6\n\t" \
   "psrad $16,%%mm4\n\t" \
   "pcmpeqb %%mm3,%%mm3\n\t" \
   "packssdw %%mm5,%%mm4\n\t" \
   "psubw %%mm3,%%mm6\n\t" \
   /*mm1=t3'', mm7={20539,0x3000}x2 \
     mm4=s=(12785*u>>16)-t4''*/ \
   "movq %%mm4,"_r1"(%[y])\n\t" \
   "pmulhw %%mm7,%%mm4\n\t" \
   "mov $0x3000503B,%[a]\n\t" \
   "movq "_r6"(%[y]),%%mm1\n\t" \
   "movd %[a],%%mm7\n\t" \
   "psubw %%mm2,%%mm4\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   /*mm6={0x00006CB7}x2 \
     mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
   "movq %%mm4,%%mm5\n\t" \
   "movq %%mm4,%%mm2\n\t" \
   "punpcklwd %%mm6,%%mm4\n\t" \
   "pcmpeqw %%mm2,%%mm0\n\t" \
   "pmaddwd %%mm7,%%mm4\n\t" \
   "mov $0x6CB7,%[a]\n\t" \
   "punpckhwd %%mm6,%%mm5\n\t" \
   "movd %[a],%%mm6\n\t" \
   "pmaddwd %%mm7,%%mm5\n\t" \
   "psubw %%mm3,%%mm0\n\t" \
   "punpckldq %%mm6,%%mm6\n\t" \
   /*mm7={60547-0x7FFF,0x7FFF}x2 \
     mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
   "psrad $20,%%mm4\n\t" \
   "paddw %%mm0,%%mm2\n\t" \
   "psrad $20,%%mm5\n\t" \
   "mov $0x7FFF6C84,%[a]\n\t" \
   "packssdw %%mm5,%%mm4\n\t" \
   "movd %[a],%%mm7\n\t" \
   "paddw %%mm4,%%mm2\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   /*mm0=0, mm7={25080}x4 \
     mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
   "movq %%mm1,%%mm4\n\t" \
   "movq %%mm1,%%mm5\n\t" \
   "movq %%mm2,"_r7"(%[y])\n\t" \
   "punpcklwd %%mm1,%%mm4\n\t" \
   "movq "_r2"(%[y]),%%mm2\n\t" \
   "pmaddwd %%mm7,%%mm4\n\t" \
   "mov $0x61F861F8,%[a]\n\t" \
   "punpckhwd %%mm1,%%mm5\n\t" \
   "pxor %%mm0,%%mm0\n\t" \
   "pmaddwd %%mm7,%%mm5\n\t" \
   "movd %[a],%%mm7\n\t" \
   "pcmpeqw %%mm0,%%mm1\n\t" \
   "psubw %%mm3,%%mm1\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   "paddd %%mm6,%%mm4\n\t" \
   "paddd %%mm6,%%mm5\n\t" \
   /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
   "movq %%mm2,%%mm6\n\t" \
   "movq %%mm2,%%mm3\n\t" \
   "pmulhw %%mm7,%%mm6\n\t" \
   "pmullw %%mm7,%%mm3\n\t" \
   "paddw %%mm1,%%mm6\n\t" \
   "movq %%mm3,%%mm1\n\t" \
   "punpckhwd %%mm6,%%mm3\n\t" \
   "punpcklwd %%mm6,%%mm1\n\t" \
   /*mm1={-1}x4 \
     mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
   "paddd %%mm3,%%mm5\n\t" \
   "paddd %%mm1,%%mm4\n\t" \
   "psrad $16,%%mm5\n\t" \
   "mov $0x28005460,%[a]\n\t" \
   "psrad $16,%%mm4\n\t" \
   "pcmpeqb %%mm1,%%mm1\n\t" \
   "packssdw %%mm5,%%mm4\n\t" \
   /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
     mm4=s=(25080*u>>16)-t2''*/ \
   "movq %%mm4,%%mm6\n\t" \
   "pmulhw %%mm7,%%mm4\n\t" \
   "pxor %%mm5,%%mm5\n\t" \
   "movd %[a],%%mm7\n\t" \
   "psubw %%mm1,%%mm5\n\t" \
   "punpckldq %%mm7,%%mm7\n\t" \
   "psubw %%mm2,%%mm4\n\t" \
   /*mm2=s+(s!=0) \
     mm4:mm3=s*21600+0x2800*/ \
   "movq %%mm4,%%mm3\n\t" \
   "movq %%mm4,%%mm2\n\t" \
   "punpckhwd %%mm5,%%mm4\n\t" \
   "pcmpeqw %%mm2,%%mm0\n\t" \
   "pmaddwd %%mm7,%%mm4\n\t" \
   "psubw %%mm1,%%mm0\n\t" \
   "punpcklwd %%mm5,%%mm3\n\t" \
   "paddw %%mm0,%%mm2\n\t" \
   "pmaddwd %%mm7,%%mm3\n\t" \
   /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
     mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
   "movq "_r4"(%[y]),%%mm0\n\t" \
   "psrad $18,%%mm4\n\t" \
   "movq "_r5"(%[y]),%%mm5\n\t" \
   "psrad $18,%%mm3\n\t" \
   "movq "_r7"(%[y]),%%mm1\n\t" \
   "packssdw %%mm4,%%mm3\n\t" \
   "movq "_r0"(%[y]),%%mm4\n\t" \
   "paddw %%mm2,%%mm3\n\t"

  /*Transposes the 8x4 block of 16-bit values left behind by OC_FDCT8x4 as two
     independent 4x4 transposes.
    On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7],
     and _y[1] and _y[3] are read from _r1(%[y]) and _r3(%[y]).
    On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and
     {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].
    Here "_y[4]" on the output side means the quadword at _r4(%[y]): the first
     row of that transpose is stored to memory because mm0 is needed to hold
     _y[1] for the second transpose.
    Only _r1, _r3 and _r4 are referenced; the remaining parameters exist so the
     macro takes the same argument list as OC_FDCT8x4.
    The final line has no line-continuation backslash so that the macro ends
     here regardless of what follows it.*/
  # define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
   "#OC_TRANSPOSE8x4\n\t" \
   /*First 4x4 transpose:*/ \
   /*mm0 = e3 e2 e1 e0 \
     mm5 = f3 f2 f1 f0 \
     mm3 = g3 g2 g1 g0 \
     mm1 = h3 h2 h1 h0*/ \
   "movq %%mm0,%%mm2\n\t" \
   "punpcklwd %%mm5,%%mm0\n\t" \
   "punpckhwd %%mm5,%%mm2\n\t" \
   "movq %%mm3,%%mm5\n\t" \
   "punpcklwd %%mm1,%%mm3\n\t" \
   "punpckhwd %%mm1,%%mm5\n\t" \
   /*mm0 = f1 e1 f0 e0 \
     mm2 = f3 e3 f2 e2 \
     mm3 = h1 g1 h0 g0 \
     mm5 = h3 g3 h2 g2*/ \
   "movq %%mm0,%%mm1\n\t" \
   "punpckldq %%mm3,%%mm0\n\t" \
   "movq %%mm0,"_r4"(%[y])\n\t" \
   "punpckhdq %%mm3,%%mm1\n\t" \
   "movq "_r1"(%[y]),%%mm0\n\t" \
   "movq %%mm2,%%mm3\n\t" \
   "punpckldq %%mm5,%%mm2\n\t" \
   "punpckhdq %%mm5,%%mm3\n\t" \
   "movq "_r3"(%[y]),%%mm5\n\t" \
   /*_y[4] = h0 g0 f0 e0 \
     mm1 = h1 g1 f1 e1 \
     mm2 = h2 g2 f2 e2 \
     mm3 = h3 g3 f3 e3*/ \
   /*Second 4x4 transpose:*/ \
   /*mm4 = a3 a2 a1 a0 \
     mm0 = b3 b2 b1 b0 \
     mm6 = c3 c2 c1 c0 \
     mm5 = d3 d2 d1 d0*/ \
   "movq %%mm4,%%mm7\n\t" \
   "punpcklwd %%mm0,%%mm4\n\t" \
   "punpckhwd %%mm0,%%mm7\n\t" \
   "movq %%mm6,%%mm0\n\t" \
   "punpcklwd %%mm5,%%mm6\n\t" \
   "punpckhwd %%mm5,%%mm0\n\t" \
   /*mm4 = b1 a1 b0 a0 \
     mm7 = b3 a3 b2 a2 \
     mm6 = d1 c1 d0 c0 \
     mm0 = d3 c3 d2 c2*/ \
   "movq %%mm4,%%mm5\n\t" \
   "punpckldq %%mm6,%%mm4\n\t" \
   "punpckhdq %%mm6,%%mm5\n\t" \
   "movq %%mm7,%%mm6\n\t" \
   "punpckhdq %%mm0,%%mm7\n\t" \
   "punpckldq %%mm0,%%mm6\n\t" \
   /*mm4 = d0 c0 b0 a0 \
     mm5 = d1 c1 b1 a1 \
     mm6 = d2 c2 b2 a2 \
     mm7 = d3 c3 b3 a3*/

  459. /*MMX implementation of the fDCT.*/
  460. void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
  461. ptrdiff_t a;
  462. __asm__ __volatile__(
  463. /*Add two extra bits of working precision to improve accuracy; any more and
  464. we could overflow.*/
  465. /*We also add biases to correct for some systematic error that remains in
  466. the full fDCT->iDCT round trip.*/
  467. "movq 0x00(%[x]),%%mm0\n\t"
  468. "movq 0x10(%[x]),%%mm1\n\t"
  469. "movq 0x20(%[x]),%%mm2\n\t"
  470. "movq 0x30(%[x]),%%mm3\n\t"
  471. "pcmpeqb %%mm4,%%mm4\n\t"
  472. "pxor %%mm7,%%mm7\n\t"
  473. "movq %%mm0,%%mm5\n\t"
  474. "psllw $2,%%mm0\n\t"
  475. "pcmpeqw %%mm7,%%mm5\n\t"
  476. "movq 0x70(%[x]),%%mm7\n\t"
  477. "psllw $2,%%mm1\n\t"
  478. "psubw %%mm4,%%mm5\n\t"
  479. "psllw $2,%%mm2\n\t"
  480. "mov $1,%[a]\n\t"
  481. "pslld $16,%%mm5\n\t"
  482. "movd %[a],%%mm6\n\t"
  483. "psllq $16,%%mm5\n\t"
  484. "mov $0x10001,%[a]\n\t"
  485. "psllw $2,%%mm3\n\t"
  486. "movd %[a],%%mm4\n\t"
  487. "punpckhwd %%mm6,%%mm5\n\t"
  488. "psubw %%mm6,%%mm1\n\t"
  489. "movq 0x60(%[x]),%%mm6\n\t"
  490. "paddw %%mm5,%%mm0\n\t"
  491. "movq 0x50(%[x]),%%mm5\n\t"
  492. "paddw %%mm4,%%mm0\n\t"
  493. "movq 0x40(%[x]),%%mm4\n\t"
  494. /*We inline stage1 of the transform here so we can get better instruction
  495. scheduling with the shifts.*/
  496. /*mm0=t7'=t0-t7*/
  497. "psllw $2,%%mm7\n\t"
  498. "psubw %%mm7,%%mm0\n\t"
  499. "psllw $2,%%mm6\n\t"
  500. "paddw %%mm7,%%mm7\n\t"
  501. /*mm1=t6'=t1-t6*/
  502. "psllw $2,%%mm5\n\t"
  503. "psubw %%mm6,%%mm1\n\t"
  504. "psllw $2,%%mm4\n\t"
  505. "paddw %%mm6,%%mm6\n\t"
  506. /*mm2=t5'=t2-t5*/
  507. "psubw %%mm5,%%mm2\n\t"
  508. "paddw %%mm5,%%mm5\n\t"
  509. /*mm3=t4'=t3-t4*/
  510. "psubw %%mm4,%%mm3\n\t"
  511. "paddw %%mm4,%%mm4\n\t"
  512. /*mm7=t0'=t0+t7*/
  513. "paddw %%mm0,%%mm7\n\t"
  514. /*mm6=t1'=t1+t6*/
  515. "paddw %%mm1,%%mm6\n\t"
  516. /*mm5=t2'=t2+t5*/
  517. "paddw %%mm2,%%mm5\n\t"
  518. /*mm4=t3'=t3+t4*/
  519. "paddw %%mm3,%%mm4\n\t"
  520. OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
  521. OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
  522. /*Swap out this 8x4 block for the next one.*/
  523. "movq 0x08(%[x]),%%mm0\n\t"
  524. "movq %%mm7,0x30(%[y])\n\t"
  525. "movq 0x78(%[x]),%%mm7\n\t"
  526. "movq %%mm1,0x50(%[y])\n\t"
  527. "movq 0x18(%[x]),%%mm1\n\t"
  528. "movq %%mm6,0x20(%[y])\n\t"
  529. "movq 0x68(%[x]),%%mm6\n\t"
  530. "movq %%mm2,0x60(%[y])\n\t"
  531. "movq 0x28(%[x]),%%mm2\n\t"
  532. "movq %%mm5,0x10(%[y])\n\t"
  533. "movq 0x58(%[x]),%%mm5\n\t"
  534. "movq %%mm3,0x70(%[y])\n\t"
  535. "movq 0x38(%[x]),%%mm3\n\t"
  536. /*And increase its working precision, too.*/
  537. "psllw $2,%%mm0\n\t"
  538. "movq %%mm4,0x00(%[y])\n\t"
  539. "psllw $2,%%mm7\n\t"
  540. "movq 0x48(%[x]),%%mm4\n\t"
  541. /*We inline stage1 of the transform here so we can get better instruction
  542. scheduling with the shifts.*/
  543. /*mm0=t7'=t0-t7*/
  544. "psubw %%mm7,%%mm0\n\t"
  545. "psllw $2,%%mm1\n\t"
  546. "paddw %%mm7,%%mm7\n\t"
  547. "psllw $2,%%mm6\n\t"
  548. /*mm1=t6'=t1-t6*/
  549. "psubw %%mm6,%%mm1\n\t"
  550. "psllw $2,%%mm2\n\t"
  551. "paddw %%mm6,%%mm6\n\t"
  552. "psllw $2,%%mm5\n\t"
  553. /*mm2=t5'=t2-t5*/
  554. "psubw %%mm5,%%mm2\n\t"
  555. "psllw $2,%%mm3\n\t"
  556. "paddw %%mm5,%%mm5\n\t"
  557. "psllw $2,%%mm4\n\t"
  558. /*mm3=t4'=t3-t4*/
  559. "psubw %%mm4,%%mm3\n\t"
  560. "paddw %%mm4,%%mm4\n\t"
  561. /*mm7=t0'=t0+t7*/
  562. "paddw %%mm0,%%mm7\n\t"
  563. /*mm6=t1'=t1+t6*/
  564. "paddw %%mm1,%%mm6\n\t"
  565. /*mm5=t2'=t2+t5*/
  566. "paddw %%mm2,%%mm5\n\t"
  567. /*mm4=t3'=t3+t4*/
  568. "paddw %%mm3,%%mm4\n\t"
  569. OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
  570. OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
  571. /*Here the first 4x4 block of output from the last transpose is the second
  572. 4x4 block of input for the next transform.
  573. We have cleverly arranged that it already be in the appropriate place,
  574. so we only have to do half the stores and loads.*/
  575. "movq 0x00(%[y]),%%mm0\n\t"
  576. "movq %%mm1,0x58(%[y])\n\t"
  577. "movq 0x10(%[y]),%%mm1\n\t"
  578. "movq %%mm2,0x68(%[y])\n\t"
  579. "movq 0x20(%[y]),%%mm2\n\t"
  580. "movq %%mm3,0x78(%[y])\n\t"
  581. "movq 0x30(%[y]),%%mm3\n\t"
  582. OC_FDCT_STAGE1_8x4
  583. OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
  584. OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
  585. /*mm0={-2}x4*/
  586. "pcmpeqw %%mm0,%%mm0\n\t"
  587. "paddw %%mm0,%%mm0\n\t"
  588. /*Round the results.*/
  589. "psubw %%mm0,%%mm1\n\t"
  590. "psubw %%mm0,%%mm2\n\t"
  591. "psraw $2,%%mm1\n\t"
  592. "psubw %%mm0,%%mm3\n\t"
  593. "movq %%mm1,0x18(%[y])\n\t"
  594. "psraw $2,%%mm2\n\t"
  595. "psubw %%mm0,%%mm4\n\t"
  596. "movq 0x08(%[y]),%%mm1\n\t"
  597. "psraw $2,%%mm3\n\t"
  598. "psubw %%mm0,%%mm5\n\t"
  599. "psraw $2,%%mm4\n\t"
  600. "psubw %%mm0,%%mm6\n\t"
  601. "psraw $2,%%mm5\n\t"
  602. "psubw %%mm0,%%mm7\n\t"
  603. "psraw $2,%%mm6\n\t"
  604. "psubw %%mm0,%%mm1\n\t"
  605. "psraw $2,%%mm7\n\t"
  606. "movq 0x40(%[y]),%%mm0\n\t"
  607. "psraw $2,%%mm1\n\t"
  608. "movq %%mm7,0x30(%[y])\n\t"
  609. "movq 0x78(%[y]),%%mm7\n\t"
  610. "movq %%mm1,0x08(%[y])\n\t"
  611. "movq 0x50(%[y]),%%mm1\n\t"
  612. "movq %%mm6,0x20(%[y])\n\t"
  613. "movq 0x68(%[y]),%%mm6\n\t"
  614. "movq %%mm2,0x28(%[y])\n\t"
  615. "movq 0x60(%[y]),%%mm2\n\t"
  616. "movq %%mm5,0x10(%[y])\n\t"
  617. "movq 0x58(%[y]),%%mm5\n\t"
  618. "movq %%mm3,0x38(%[y])\n\t"
  619. "movq 0x70(%[y]),%%mm3\n\t"
  620. "movq %%mm4,0x00(%[y])\n\t"
  621. "movq 0x48(%[y]),%%mm4\n\t"
  622. OC_FDCT_STAGE1_8x4
  623. OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
  624. OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
  625. /*mm0={-2}x4*/
  626. "pcmpeqw %%mm0,%%mm0\n\t"
  627. "paddw %%mm0,%%mm0\n\t"
  628. /*Round the results.*/
  629. "psubw %%mm0,%%mm1\n\t"
  630. "psubw %%mm0,%%mm2\n\t"
  631. "psraw $2,%%mm1\n\t"
  632. "psubw %%mm0,%%mm3\n\t"
  633. "movq %%mm1,0x58(%[y])\n\t"
  634. "psraw $2,%%mm2\n\t"
  635. "psubw %%mm0,%%mm4\n\t"
  636. "movq 0x48(%[y]),%%mm1\n\t"
  637. "psraw $2,%%mm3\n\t"
  638. "psubw %%mm0,%%mm5\n\t"
  639. "movq %%mm2,0x68(%[y])\n\t"
  640. "psraw $2,%%mm4\n\t"
  641. "psubw %%mm0,%%mm6\n\t"
  642. "movq %%mm3,0x78(%[y])\n\t"
  643. "psraw $2,%%mm5\n\t"
  644. "psubw %%mm0,%%mm7\n\t"
  645. "movq %%mm4,0x40(%[y])\n\t"
  646. "psraw $2,%%mm6\n\t"
  647. "psubw %%mm0,%%mm1\n\t"
  648. "movq %%mm5,0x50(%[y])\n\t"
  649. "psraw $2,%%mm7\n\t"
  650. "movq %%mm6,0x60(%[y])\n\t"
  651. "psraw $2,%%mm1\n\t"
  652. "movq %%mm7,0x70(%[y])\n\t"
  653. "movq %%mm1,0x48(%[y])\n\t"
  654. :[a]"=&r"(a)
  655. :[y]"r"(_y),[x]"r"(_x)
  656. :"memory"
  657. );
  658. }
  659. #endif