/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $

 ********************************************************************/
  #include <stddef.h>
  #include "x86enc.h"

  #if defined(OC_X86_ASM)
  18. unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
  19. const unsigned char *_ref,int _ystride){
  20. ptrdiff_t ret;
  21. __asm{
  22. #define SRC esi
  23. #define REF edx
  24. #define YSTRIDE ecx
  25. #define YSTRIDE3 edi
  26. mov YSTRIDE,_ystride
  27. mov SRC,_src
  28. mov REF,_ref
  29. /*Load the first 4 rows of each block.*/
  30. movq mm0,[SRC]
  31. movq mm1,[REF]
  32. movq mm2,[SRC][YSTRIDE]
  33. movq mm3,[REF][YSTRIDE]
  34. lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
  35. movq mm4,[SRC+YSTRIDE*2]
  36. movq mm5,[REF+YSTRIDE*2]
  37. movq mm6,[SRC+YSTRIDE3]
  38. movq mm7,[REF+YSTRIDE3]
  39. /*Compute their SADs and add them in mm0*/
  40. psadbw mm0,mm1
  41. psadbw mm2,mm3
  42. lea SRC,[SRC+YSTRIDE*4]
  43. paddw mm0,mm2
  44. lea REF,[REF+YSTRIDE*4]
  45. /*Load the next 3 rows as registers become available.*/
  46. movq mm2,[SRC]
  47. movq mm3,[REF]
  48. psadbw mm4,mm5
  49. psadbw mm6,mm7
  50. paddw mm0,mm4
  51. movq mm5,[REF+YSTRIDE]
  52. movq mm4,[SRC+YSTRIDE]
  53. paddw mm0,mm6
  54. movq mm7,[REF+YSTRIDE*2]
  55. movq mm6,[SRC+YSTRIDE*2]
  56. /*Start adding their SADs to mm0*/
  57. psadbw mm2,mm3
  58. psadbw mm4,mm5
  59. paddw mm0,mm2
  60. psadbw mm6,mm7
  61. /*Load last row as registers become available.*/
  62. movq mm2,[SRC+YSTRIDE3]
  63. movq mm3,[REF+YSTRIDE3]
  64. /*And finish adding up their SADs.*/
  65. paddw mm0,mm4
  66. psadbw mm2,mm3
  67. paddw mm0,mm6
  68. paddw mm0,mm2
  69. movd [ret],mm0
  70. #undef SRC
  71. #undef REF
  72. #undef YSTRIDE
  73. #undef YSTRIDE3
  74. }
  75. return (unsigned)ret;
  76. }
  /*SAD between two 8x8 blocks, exposed with the signature of the thresholded
     variant.
    _src:     Pointer to the first row of the first block.
    _ref:     Pointer to the first row of the second block.
    _ystride: Byte offset between consecutive rows of both blocks.
    _thresh:  Accepted but never read: no early termination is performed.
    Return: Whatever oc_enc_frag_sad_mmxext() returns for the same blocks.*/
  unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
   const unsigned char *_ref,int _ystride,unsigned _thresh){
    (void)_thresh;
    return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
  }
  /*Handles two rows of a two-reference SAD and pre-loads the two rows after
     them.
    SRC, REF1, REF2, YSTRIDE and RET are register aliases that the expanding
     function must #define.
    On entry:
     mm0,mm1: current row of REF1 and REF2.
     mm2,mm3: following row of REF1 and REF2.
     mm4,mm5: the same two rows of SRC.
     mm7:     {1}x8.
     RET:     Running total.
    On exit, SRC, REF1 and REF2 have each advanced by two rows, mm0...mm5 hold
     the next two rows in the same layout, RET has had the SADs of the two
     processed rows added to it, and mm6 is clobbered.*/
  #define OC_SAD2_LOOP __asm{ \
    /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
       pavgb computes (mm0+mm1+1>>1). \
      The latter is exactly 1 too large when the low bit of two corresponding \
       bytes is only set in one of them. \
      Therefore we pxor the operands, pand to mask out the low bits, and psubb \
       to correct the output of pavgb.*/ \
    __asm movq mm6,mm0 \
    __asm lea REF1,[REF1+YSTRIDE*2] \
    __asm pxor mm0,mm1 \
    __asm pavgb mm6,mm1 \
    __asm lea REF2,[REF2+YSTRIDE*2] \
    __asm movq mm1,mm2 \
    __asm pand mm0,mm7 \
    __asm pavgb mm2,mm3 \
    __asm pxor mm1,mm3 \
    __asm movq mm3,[REF2+YSTRIDE] \
    __asm psubb mm6,mm0 \
    __asm movq mm0,[REF1] \
    __asm pand mm1,mm7 \
    __asm psadbw mm4,mm6 \
    __asm movd mm6,RET \
    __asm psubb mm2,mm1 \
    __asm movq mm1,[REF2] \
    __asm lea SRC,[SRC+YSTRIDE*2] \
    __asm psadbw mm5,mm2 \
    __asm movq mm2,[REF1+YSTRIDE] \
    __asm paddw mm5,mm4 \
    __asm movq mm4,[SRC] \
    __asm paddw mm6,mm5 \
    __asm movq mm5,[SRC+YSTRIDE] \
    __asm movd RET,mm6 \
  }
  /*Handles the final two rows of a two-reference SAD.
    This performs the same averaging and accumulation as OC_SAD2_LOOP, but does
     not advance any pointer and does not pre-load further rows.
    On entry:
     mm0,mm1: current row of REF1 and REF2.
     mm2,mm3: following row of REF1 and REF2.
     mm4,mm5: the same two rows of SRC.
     mm7:     {1}x8.
     RET:     Running total (register alias #defined by the expanding function).
    On exit, RET has had the SADs of both rows added to it.*/
  #define OC_SAD2_TAIL __asm{ \
    __asm movq mm6,mm0 \
    __asm pavgb mm0,mm1 \
    __asm pxor mm6,mm1 \
    __asm movq mm1,mm2 \
    __asm pand mm6,mm7 \
    __asm pavgb mm2,mm3 \
    __asm pxor mm1,mm3 \
    __asm psubb mm0,mm6 \
    __asm pand mm1,mm7 \
    __asm psadbw mm4,mm0 \
    __asm psubb mm2,mm1 \
    __asm movd mm6,RET \
    __asm psadbw mm5,mm2 \
    __asm paddw mm5,mm4 \
    __asm paddw mm6,mm5 \
    __asm movd RET,mm6 \
  }
  134. unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  135. const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  136. unsigned _thresh){
  137. ptrdiff_t ret;
  138. __asm{
  139. #define REF1 ecx
  140. #define REF2 edi
  141. #define YSTRIDE esi
  142. #define SRC edx
  143. #define RET eax
  144. mov YSTRIDE,_ystride
  145. mov SRC,_src
  146. mov REF1,_ref1
  147. mov REF2,_ref2
  148. movq mm0,[REF1]
  149. movq mm1,[REF2]
  150. movq mm2,[REF1+YSTRIDE]
  151. movq mm3,[REF2+YSTRIDE]
  152. xor RET,RET
  153. movq mm4,[SRC]
  154. pxor mm7,mm7
  155. pcmpeqb mm6,mm6
  156. movq mm5,[SRC+YSTRIDE]
  157. psubb mm7,mm6
  158. OC_SAD2_LOOP
  159. OC_SAD2_LOOP
  160. OC_SAD2_LOOP
  161. OC_SAD2_TAIL
  162. mov [ret],RET
  163. #undef REF1
  164. #undef REF2
  165. #undef YSTRIDE
  166. #undef SRC
  167. #undef RET
  168. }
  169. return (unsigned)ret;
  170. }
  /*Loads 8 rows of 4 pixels each from SRC and from REF, starting _off bytes
     into each row, and leaves their 16-bit differences (SRC-REF) in
     mm0...mm7, one row per register.
    SRC, REF, SRC_YSTRIDE, REF_YSTRIDE and BUF are register aliases that the
     expanding function must #define.
    The 8 bytes at [_off*2+BUF] are used to spill row 0 while mm0 serves as a
     temporary; it is reloaded by the last instruction.
    SRC and REF are stepped forward 8 rows in total, then rewound by adding 8
     times the negated stride, and both strides are negated back, so all four
     registers hold their original values on exit.*/
  #define OC_LOAD_SUB_8x4(_off) __asm{ \
    __asm movd mm0,[_off+SRC] \
    __asm movd mm4,[_off+REF] \
    __asm movd mm1,[_off+SRC+SRC_YSTRIDE] \
    __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
    __asm movd mm5,[_off+REF+REF_YSTRIDE] \
    __asm lea REF,[REF+REF_YSTRIDE*2] \
    __asm movd mm2,[_off+SRC] \
    __asm movd mm7,[_off+REF] \
    __asm movd mm3,[_off+SRC+SRC_YSTRIDE] \
    __asm movd mm6,[_off+REF+REF_YSTRIDE] \
    __asm punpcklbw mm0,mm4 \
    __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
    __asm punpcklbw mm4,mm4 \
    __asm lea REF,[REF+REF_YSTRIDE*2] \
    __asm psubw mm0,mm4 \
    __asm movd mm4,[_off+SRC] \
    __asm movq [_off*2+BUF],mm0 \
    __asm movd mm0,[_off+REF] \
    __asm punpcklbw mm1,mm5 \
    __asm punpcklbw mm5,mm5 \
    __asm psubw mm1,mm5 \
    __asm movd mm5,[_off+SRC+SRC_YSTRIDE] \
    __asm punpcklbw mm2,mm7 \
    __asm punpcklbw mm7,mm7 \
    __asm psubw mm2,mm7 \
    __asm movd mm7,[_off+REF+REF_YSTRIDE] \
    __asm punpcklbw mm3,mm6 \
    __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
    __asm punpcklbw mm6,mm6 \
    __asm psubw mm3,mm6 \
    __asm movd mm6,[_off+SRC] \
    __asm punpcklbw mm4,mm0 \
    __asm lea REF,[REF+REF_YSTRIDE*2] \
    __asm punpcklbw mm0,mm0 \
    __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
    __asm psubw mm4,mm0 \
    __asm movd mm0,[_off+REF] \
    __asm punpcklbw mm5,mm7 \
    __asm neg SRC_YSTRIDE \
    __asm punpcklbw mm7,mm7 \
    __asm psubw mm5,mm7 \
    __asm movd mm7,[_off+SRC+SRC_YSTRIDE] \
    __asm punpcklbw mm6,mm0 \
    __asm lea REF,[REF+REF_YSTRIDE*2] \
    __asm punpcklbw mm0,mm0 \
    __asm neg REF_YSTRIDE \
    __asm psubw mm6,mm0 \
    __asm movd mm0,[_off+REF+REF_YSTRIDE] \
    __asm lea SRC,[SRC+SRC_YSTRIDE*8] \
    __asm punpcklbw mm7,mm0 \
    __asm neg SRC_YSTRIDE \
    __asm punpcklbw mm0,mm0 \
    __asm lea REF,[REF+REF_YSTRIDE*8] \
    __asm psubw mm7,mm0 \
    __asm neg REF_YSTRIDE \
    __asm movq mm0,[_off*2+BUF] \
  }
  /*Loads 8 rows of 4 pixels each, starting _off bytes into each row, and
     zero-extends them to 16-bit words in mm0...mm7, one row per register.
    Rows 0...3 are addressed from SRC and rows 4...7 from SRC4, using YSTRIDE
     and YSTRIDE3; all four are register aliases that the expanding function
     must #define.
    Rows 0...3 are widened by interleaving with a zeroed mm7.
    Rows 4...7 are widened by interleaving each register with itself and
     shifting every word right by 8, because mm7 is needed for row 7.*/
  #define OC_LOAD_8x4(_off) __asm{ \
    __asm movd mm0,[_off+SRC] \
    __asm movd mm1,[_off+SRC+YSTRIDE] \
    __asm movd mm2,[_off+SRC+YSTRIDE*2] \
    __asm pxor mm7,mm7 \
    __asm movd mm3,[_off+SRC+YSTRIDE3] \
    __asm punpcklbw mm0,mm7 \
    __asm movd mm4,[_off+SRC4] \
    __asm punpcklbw mm1,mm7 \
    __asm movd mm5,[_off+SRC4+YSTRIDE] \
    __asm punpcklbw mm2,mm7 \
    __asm movd mm6,[_off+SRC4+YSTRIDE*2] \
    __asm punpcklbw mm3,mm7 \
    __asm movd mm7,[_off+SRC4+YSTRIDE3] \
    __asm punpcklbw mm4,mm4 \
    __asm punpcklbw mm5,mm5 \
    __asm psrlw mm4,8 \
    __asm psrlw mm5,8 \
    __asm punpcklbw mm6,mm6 \
    __asm punpcklbw mm7,mm7 \
    __asm psrlw mm6,8 \
    __asm psrlw mm7,8 \
  }
  /*Performs the first two stages of an 8-point 1-D Hadamard transform on the
     eight word vectors in mm0...mm7.
    The transform is performed in place, except that outputs 0-3 are swapped
     with outputs 4-7.
    Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us
     to perform this stage in place with no temporary registers).
    Each butterfly (x,y) -> (x+y, y-x) is formed as y+=x; x+=x; x-=y, so no
     register outside mm0...mm7 is touched.*/
  #define OC_HADAMARD_AB_8x4 __asm{ \
    /*Stage A: \
      Outputs 0-3 are swapped with 4-7 here.*/ \
    __asm paddw mm5,mm1 \
    __asm paddw mm6,mm2 \
    __asm paddw mm1,mm1 \
    __asm paddw mm2,mm2 \
    __asm psubw mm1,mm5 \
    __asm psubw mm2,mm6 \
    __asm paddw mm7,mm3 \
    __asm paddw mm4,mm0 \
    __asm paddw mm3,mm3 \
    __asm paddw mm0,mm0 \
    __asm psubw mm3,mm7 \
    __asm psubw mm0,mm4 \
    /*Stage B:*/ \
    __asm paddw mm0,mm2 \
    __asm paddw mm1,mm3 \
    __asm paddw mm4,mm6 \
    __asm paddw mm5,mm7 \
    __asm paddw mm2,mm2 \
    __asm paddw mm3,mm3 \
    __asm paddw mm6,mm6 \
    __asm paddw mm7,mm7 \
    __asm psubw mm2,mm0 \
    __asm psubw mm3,mm1 \
    __asm psubw mm6,mm4 \
    __asm psubw mm7,mm5 \
  }
  /*Performs the last stage of an 8-point 1-D Hadamard transform in place on
     mm0...mm7.
    Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage
     in place with no temporary registers).
    The register pairs combined here are (mm0,mm1), (mm2,mm3), (mm4,mm5) and
     (mm6,mm7).*/
  #define OC_HADAMARD_C_8x4 __asm{ \
    /*Stage C:*/ \
    __asm paddw mm0,mm1 \
    __asm paddw mm2,mm3 \
    __asm paddw mm4,mm5 \
    __asm paddw mm6,mm7 \
    __asm paddw mm1,mm1 \
    __asm paddw mm3,mm3 \
    __asm paddw mm5,mm5 \
    __asm paddw mm7,mm7 \
    __asm psubw mm1,mm0 \
    __asm psubw mm3,mm2 \
    __asm psubw mm5,mm4 \
    __asm psubw mm7,mm6 \
  }
  /*Performs a complete 8-point 1-D Hadamard transform on mm0...mm7 by
     expanding OC_HADAMARD_AB_8x4 followed by OC_HADAMARD_C_8x4.
    The transform is performed in place, except that outputs 0-3 are swapped
     with outputs 4-7.
    Outputs 1, 2, 5 and 6 are negated (which allows us to perform the
     transform in place with no temporary registers).*/
  #define OC_HADAMARD_8x4 __asm{ \
    OC_HADAMARD_AB_8x4 \
    OC_HADAMARD_C_8x4 \
  }
  /*Performs the first part of the final stage of the Hadamard transform and
     summing of absolute values.
    _r6 and _r7 are byte offsets from BUF (a register alias #defined by the
     expanding function) of two 8-byte slots used to spill mm6 and mm7.
    At the end of this part, mm1 will contain the DC coefficient of the
     transform, mm7 holds {0x7FFF}x4, mm3 holds the value that was in mm7 on
     entry, and the value that was in mm6 on entry is still at [_r6+BUF].*/
  #define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
    /*We use the fact that \
        (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
       to merge the final butterfly with the abs and the first stage of \
       accumulation. \
      Thus we can avoid using pabsw, which is not available until SSSE3. \
      Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
       implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
       registers). \
      Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
      This implementation is only 26 (+4 for spilling registers).*/ \
    __asm movq [_r7+BUF],mm7 \
    __asm movq [_r6+BUF],mm6 \
    /*mm7={0x7FFF}x4 \
      mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
    __asm pcmpeqb mm7,mm7 \
    __asm movq mm6,mm0 \
    __asm psrlw mm7,1 \
    __asm paddw mm6,mm1 \
    __asm pmaxsw mm0,mm1 \
    __asm paddsw mm6,mm7 \
    __asm psubw mm0,mm6 \
    /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
      mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
    __asm movq mm6,mm2 \
    __asm movq mm1,mm4 \
    __asm pmaxsw mm2,mm3 \
    __asm pmaxsw mm4,mm5 \
    __asm paddw mm6,mm3 \
    __asm paddw mm1,mm5 \
    __asm movq mm3,[_r7+BUF] \
  }
  /*Performs the second part of the final stage of the Hadamard transform and
     summing of absolute values.
    This must follow OC_HADAMARD_C_ABS_ACCUM_A_8x4 expanded with the same _r6
     and _r7: it expects mm7={0x7FFF}x4, the partial sums that part left in
     mm1 and mm6, and the spilled mm6 at [_r6+BUF].
    _r7 is not referenced here.
    On exit mm0 holds the accumulated words and mm7 holds {1}x4.*/
  #define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
    __asm paddsw mm6,mm7 \
    __asm movq mm5,[_r6+BUF] \
    __asm paddsw mm1,mm7 \
    __asm psubw mm2,mm6 \
    __asm psubw mm4,mm1 \
    /*mm7={1}x4 (needed for the horizontal add that follows) \
      mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
    __asm movq mm6,mm3 \
    __asm pmaxsw mm3,mm5 \
    __asm paddw mm0,mm2 \
    __asm paddw mm6,mm5 \
    __asm paddw mm0,mm4 \
    __asm paddsw mm6,mm7 \
    __asm paddw mm0,mm3 \
    __asm psrlw mm7,14 \
    __asm psubw mm0,mm6 \
  }
  /*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
     absolute value of each component, and accumulates everything into mm0.
    This simply expands OC_HADAMARD_C_ABS_ACCUM_A_8x4 followed by
     OC_HADAMARD_C_ABS_ACCUM_B_8x4 with the same spill offsets _r6 and _r7.
    This is the only portion of SATD which requires MMXEXT (we could use plain
     MMX, but it takes 4 instructions and an extra register to work around the
     lack of a pmaxsw, which is a pretty serious penalty).*/
  #define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
  }
  /*Performs an 8-point 1-D Hadamard transform, takes the absolute value of
     each component, and accumulates everything into mm0.
    This expands OC_HADAMARD_AB_8x4 followed by OC_HADAMARD_C_ABS_ACCUM_8x4;
     _r6 and _r7 are the spill offsets passed through to the latter.
    Note that mm0 will have an extra 4 added to each column, and that after
     removing this value, the remainder will be half the conventional value.*/
  #define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
    OC_HADAMARD_AB_8x4 \
    OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
  }
  /*Performs two 4x4 transposes (mostly) in place.
    On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
     contains rows {a,b,c,d}.
    On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
     {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.
    BUF is a register alias that the expanding function must #define.
    The 8 bytes at 0x10+_off+BUF are used as scratch space to hold mm5 during
     the first transpose, and mm0...mm3 are clobbered.*/
  #define OC_TRANSPOSE_4x4x2(_off) __asm{ \
    /*First 4x4 transpose:*/ \
    __asm movq [0x10+_off+BUF],mm5 \
    /*mm0 = e3 e2 e1 e0 \
      mm1 = f3 f2 f1 f0 \
      mm2 = g3 g2 g1 g0 \
      mm3 = h3 h2 h1 h0*/ \
    __asm movq mm5,mm2 \
    __asm punpcklwd mm2,mm3 \
    __asm punpckhwd mm5,mm3 \
    __asm movq mm3,mm0 \
    __asm punpcklwd mm0,mm1 \
    __asm punpckhwd mm3,mm1 \
    /*mm0 = f1 e1 f0 e0 \
      mm3 = f3 e3 f2 e2 \
      mm2 = h1 g1 h0 g0 \
      mm5 = h3 g3 h2 g2*/ \
    __asm movq mm1,mm0 \
    __asm punpckldq mm0,mm2 \
    __asm punpckhdq mm1,mm2 \
    __asm movq mm2,mm3 \
    __asm punpckhdq mm3,mm5 \
    __asm movq [0x40+_off+BUF],mm0 \
    __asm punpckldq mm2,mm5 \
    /*mm0 = h0 g0 f0 e0 \
      mm1 = h1 g1 f1 e1 \
      mm2 = h2 g2 f2 e2 \
      mm3 = h3 g3 f3 e3*/ \
    __asm movq mm5,[0x10+_off+BUF] \
    /*Second 4x4 transpose:*/ \
    /*mm4 = a3 a2 a1 a0 \
      mm5 = b3 b2 b1 b0 \
      mm6 = c3 c2 c1 c0 \
      mm7 = d3 d2 d1 d0*/ \
    __asm movq mm0,mm6 \
    __asm punpcklwd mm6,mm7 \
    __asm movq [0x50+_off+BUF],mm1 \
    __asm punpckhwd mm0,mm7 \
    __asm movq mm7,mm4 \
    __asm punpcklwd mm4,mm5 \
    __asm movq [0x60+_off+BUF],mm2 \
    __asm punpckhwd mm7,mm5 \
    /*mm4 = b1 a1 b0 a0 \
      mm7 = b3 a3 b2 a2 \
      mm6 = d1 c1 d0 c0 \
      mm0 = d3 c3 d2 c2*/ \
    __asm movq mm5,mm4 \
    __asm punpckldq mm4,mm6 \
    __asm movq [0x70+_off+BUF],mm3 \
    __asm punpckhdq mm5,mm6 \
    __asm movq mm6,mm7 \
    __asm punpckhdq mm7,mm0 \
    __asm punpckldq mm6,mm0 \
    /*mm4 = d0 c0 b0 a0 \
      mm5 = d1 c1 b1 a1 \
      mm6 = d2 c2 b2 a2 \
      mm7 = d3 c3 b3 a3*/ \
  }
  452. static unsigned oc_int_frag_satd_mmxext(int *_dc,
  453. const unsigned char *_src,int _src_ystride,
  454. const unsigned char *_ref,int _ref_ystride){
  455. OC_ALIGN8(ogg_int16_t buf[64]);
  456. ogg_int16_t *bufp;
  457. unsigned ret;
  458. unsigned ret2;
  459. int dc;
  460. bufp=buf;
  461. __asm{
  462. #define SRC esi
  463. #define REF eax
  464. #define SRC_YSTRIDE ecx
  465. #define REF_YSTRIDE edx
  466. #define BUF edi
  467. #define RET edx
  468. #define RET2 ecx
  469. #define DC eax
  470. #define DC_WORD ax
  471. mov SRC,_src
  472. mov SRC_YSTRIDE,_src_ystride
  473. mov REF,_ref
  474. mov REF_YSTRIDE,_ref_ystride
  475. mov BUF,bufp
  476. OC_LOAD_SUB_8x4(0x00)
  477. OC_HADAMARD_8x4
  478. OC_TRANSPOSE_4x4x2(0x00)
  479. /*Finish swapping out this 8x4 block to make room for the next one.
  480. mm0...mm3 have been swapped out already.*/
  481. movq [0x00+BUF],mm4
  482. movq [0x10+BUF],mm5
  483. movq [0x20+BUF],mm6
  484. movq [0x30+BUF],mm7
  485. OC_LOAD_SUB_8x4(0x04)
  486. OC_HADAMARD_8x4
  487. OC_TRANSPOSE_4x4x2(0x08)
  488. /*Here the first 4x4 block of output from the last transpose is the second
  489. 4x4 block of input for the next transform.
  490. We have cleverly arranged that it already be in the appropriate place, so
  491. we only have to do half the loads.*/
  492. movq mm1,[0x10+BUF]
  493. movq mm2,[0x20+BUF]
  494. movq mm3,[0x30+BUF]
  495. movq mm0,[0x00+BUF]
  496. /*We split out the stages here so we can save the DC coefficient in the
  497. middle.*/
  498. OC_HADAMARD_AB_8x4
  499. OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
  500. movd DC,mm1
  501. OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
  502. /*Up to this point, everything fit in 16 bits (8 input + 1 for the
  503. difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
  504. for the factor of two we dropped + 3 for the vertical accumulation).
  505. Now we finally have to promote things to dwords.
  506. We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
  507. latency of pmaddwd by starting the next series of loads now.*/
  508. pmaddwd mm0,mm7
  509. movq mm1,[0x50+BUF]
  510. movq mm5,[0x58+BUF]
  511. movq mm4,mm0
  512. movq mm2,[0x60+BUF]
  513. punpckhdq mm0,mm0
  514. movq mm6,[0x68+BUF]
  515. paddd mm4,mm0
  516. movq mm3,[0x70+BUF]
  517. movd RET2,mm4
  518. movq mm7,[0x78+BUF]
  519. movq mm0,[0x40+BUF]
  520. movq mm4,[0x48+BUF]
  521. OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
  522. pmaddwd mm0,mm7
  523. /*Subtract abs(dc) from 2*ret2.*/
  524. movsx DC,DC_WORD
  525. cdq
  526. lea RET2,[RET+RET2*2]
  527. movq mm4,mm0
  528. punpckhdq mm0,mm0
  529. xor RET,DC
  530. paddd mm4,mm0
  531. /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
  532. added to them, a factor of two removed, and the DC value included;
  533. correct the final sum here.*/
  534. sub RET2,RET
  535. movd RET,mm4
  536. lea RET,[RET2+RET*2-64]
  537. mov ret,RET
  538. mov dc,DC
  539. #undef SRC
  540. #undef REF
  541. #undef SRC_YSTRIDE
  542. #undef REF_YSTRIDE
  543. #undef BUF
  544. #undef RET
  545. #undef RET2
  546. #undef DC
  547. #undef DC_WORD
  548. }
  549. *_dc=dc;
  550. return ret;
  551. }
  /*Public SATD entry point for two 8x8 blocks that share one row stride.
    _dc:      Receives the DC coefficient written by oc_int_frag_satd_mmxext().
    _src:     Pointer to the first row of the source block.
    _ref:     Pointer to the first row of the reference block.
    _ystride: Byte offset between consecutive rows, used for both blocks.
    Return: The value returned by oc_int_frag_satd_mmxext().*/
  unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
   const unsigned char *_ref,int _ystride){
    return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
  }
  556. /*Our internal implementation of frag_copy2 takes an extra stride parameter so
  557. we can share code with oc_enc_frag_satd2_mmxext().*/
  558. static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
  559. const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  560. __asm{
  561. /*Load the first 3 rows.*/
  562. #define DST_YSTRIDE edi
  563. #define SRC_YSTRIDE esi
  564. #define DST eax
  565. #define SRC1 edx
  566. #define SRC2 ecx
  567. mov DST_YSTRIDE,_dst_ystride
  568. mov SRC_YSTRIDE,_src_ystride
  569. mov DST,_dst
  570. mov SRC1,_src1
  571. mov SRC2,_src2
  572. movq mm0,[SRC1]
  573. movq mm1,[SRC2]
  574. movq mm2,[SRC1+SRC_YSTRIDE]
  575. lea SRC1,[SRC1+SRC_YSTRIDE*2]
  576. movq mm3,[SRC2+SRC_YSTRIDE]
  577. lea SRC2,[SRC2+SRC_YSTRIDE*2]
  578. pxor mm7,mm7
  579. movq mm4,[SRC1]
  580. pcmpeqb mm6,mm6
  581. movq mm5,[SRC2]
  582. /*mm7={1}x8.*/
  583. psubb mm7,mm6
  584. /*Start averaging mm0 and mm1 into mm6.*/
  585. movq mm6,mm0
  586. pxor mm0,mm1
  587. pavgb mm6,mm1
  588. /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
  589. movq mm1,mm2
  590. pand mm0,mm7
  591. pavgb mm2,mm3
  592. pxor mm1,mm3
  593. /*mm3 is free.*/
  594. psubb mm6,mm0
  595. /*mm0 is free, start loading the next row.*/
  596. movq mm0,[SRC1+SRC_YSTRIDE]
  597. /*Start averaging mm5 and mm4 using mm3.*/
  598. movq mm3,mm4
  599. /*mm6 [row 0] is done; write it out.*/
  600. movq [DST],mm6
  601. pand mm1,mm7
  602. pavgb mm4,mm5
  603. psubb mm2,mm1
  604. /*mm1 is free, continue loading the next row.*/
  605. movq mm1,[SRC2+SRC_YSTRIDE]
  606. pxor mm3,mm5
  607. lea SRC1,[SRC1+SRC_YSTRIDE*2]
  608. /*mm2 [row 1] is done; write it out.*/
  609. movq [DST+DST_YSTRIDE],mm2
  610. pand mm3,mm7
  611. /*Start loading the next row.*/
  612. movq mm2,[SRC1]
  613. lea DST,[DST+DST_YSTRIDE*2]
  614. psubb mm4,mm3
  615. lea SRC2,[SRC2+SRC_YSTRIDE*2]
  616. /*mm4 [row 2] is done; write it out.*/
  617. movq [DST],mm4
  618. /*Continue loading the next row.*/
  619. movq mm3,[SRC2]
  620. /*Start averaging mm0 and mm1 into mm6.*/
  621. movq mm6,mm0
  622. pxor mm0,mm1
  623. /*Start loading the next row.*/
  624. movq mm4,[SRC1+SRC_YSTRIDE]
  625. pavgb mm6,mm1
  626. /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
  627. movq mm1,mm2
  628. pand mm0,mm7
  629. /*Continue loading the next row.*/
  630. movq mm5,[SRC2+SRC_YSTRIDE]
  631. pavgb mm2,mm3
  632. lea SRC1,[SRC1+SRC_YSTRIDE*2]
  633. pxor mm1,mm3
  634. /*mm3 is free.*/
  635. psubb mm6,mm0
  636. /*mm0 is free, start loading the next row.*/
  637. movq mm0,[SRC1]
  638. /*Start averaging mm5 into mm4 using mm3.*/
  639. movq mm3,mm4
  640. /*mm6 [row 3] is done; write it out.*/
  641. movq [DST+DST_YSTRIDE],mm6
  642. pand mm1,mm7
  643. lea SRC2,[SRC2+SRC_YSTRIDE*2]
  644. pavgb mm4,mm5
  645. lea DST,[DST+DST_YSTRIDE*2]
  646. psubb mm2,mm1
  647. /*mm1 is free; continue loading the next row.*/
  648. movq mm1,[SRC2]
  649. pxor mm3,mm5
  650. /*mm2 [row 4] is done; write it out.*/
  651. movq [DST],mm2
  652. pand mm3,mm7
  653. /*Start loading the next row.*/
  654. movq mm2,[SRC1+SRC_YSTRIDE]
  655. psubb mm4,mm3
  656. /*Start averaging mm0 and mm1 into mm6.*/
  657. movq mm6,mm0
  658. /*Continue loading the next row.*/
  659. movq mm3,[SRC2+SRC_YSTRIDE]
  660. /*mm4 [row 5] is done; write it out.*/
  661. movq [DST+DST_YSTRIDE],mm4
  662. pxor mm0,mm1
  663. pavgb mm6,mm1
  664. /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
  665. movq mm4,mm2
  666. pand mm0,mm7
  667. pavgb mm2,mm3
  668. pxor mm4,mm3
  669. lea DST,[DST+DST_YSTRIDE*2]
  670. psubb mm6,mm0
  671. pand mm4,mm7
  672. /*mm6 [row 6] is done, write it out.*/
  673. movq [DST],mm6
  674. psubb mm2,mm4
  675. /*mm2 [row 7] is done, write it out.*/
  676. movq [DST+DST_YSTRIDE],mm2
  677. #undef SRC1
  678. #undef SRC2
  679. #undef SRC_YSTRIDE
  680. #undef DST_YSTRIDE
  681. #undef DST
  682. }
  683. }
  684. unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
  685. const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  686. OC_ALIGN8(unsigned char ref[64]);
  687. oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  688. return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
  689. }
  690. unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
  691. int _ystride){
  692. OC_ALIGN8(ogg_int16_t buf[64]);
  693. ogg_int16_t *bufp;
  694. unsigned ret1;
  695. unsigned ret2;
  696. int dc;
  697. bufp=buf;
  698. __asm{
  699. #define SRC eax
  700. #define SRC4 esi
  701. #define BUF edi
  702. #define YSTRIDE edx
  703. #define YSTRIDE3 ecx
  704. #define RET eax
  705. #define RET2 ecx
  706. #define DC edx
  707. #define DC_WORD dx
  708. mov SRC,_src
  709. mov BUF,bufp
  710. mov YSTRIDE,_ystride
  711. /* src4 = src+4*ystride */
  712. lea SRC4,[SRC+YSTRIDE*4]
  713. /* ystride3 = 3*ystride */
  714. lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
  715. OC_LOAD_8x4(0x00)
  716. OC_HADAMARD_8x4
  717. OC_TRANSPOSE_4x4x2(0x00)
  718. /*Finish swapping out this 8x4 block to make room for the next one.
  719. mm0...mm3 have been swapped out already.*/
  720. movq [0x00+BUF],mm4
  721. movq [0x10+BUF],mm5
  722. movq [0x20+BUF],mm6
  723. movq [0x30+BUF],mm7
  724. OC_LOAD_8x4(0x04)
  725. OC_HADAMARD_8x4
  726. OC_TRANSPOSE_4x4x2(0x08)
  727. /*Here the first 4x4 block of output from the last transpose is the second
  728. 4x4 block of input for the next transform.
  729. We have cleverly arranged that it already be in the appropriate place, so
  730. we only have to do half the loads.*/
  731. movq mm1,[0x10+BUF]
  732. movq mm2,[0x20+BUF]
  733. movq mm3,[0x30+BUF]
  734. movq mm0,[0x00+BUF]
  735. /*We split out the stages here so we can save the DC coefficient in the
  736. middle.*/
  737. OC_HADAMARD_AB_8x4
  738. OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
  739. movd DC,mm1
  740. OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
  741. /*Up to this point, everything fit in 16 bits (8 input + 1 for the
  742. difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
  743. for the factor of two we dropped + 3 for the vertical accumulation).
  744. Now we finally have to promote things to dwords.
  745. We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
  746. latency of pmaddwd by starting the next series of loads now.*/
  747. pmaddwd mm0,mm7
  748. movq mm1,[0x50+BUF]
  749. movq mm5,[0x58+BUF]
  750. movq mm2,[0x60+BUF]
  751. movq mm4,mm0
  752. movq mm6,[0x68+BUF]
  753. punpckhdq mm0,mm0
  754. movq mm3,[0x70+BUF]
  755. paddd mm4,mm0
  756. movq mm7,[0x78+BUF]
  757. movd RET,mm4
  758. movq mm0,[0x40+BUF]
  759. movq mm4,[0x48+BUF]
  760. OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
  761. pmaddwd mm0,mm7
  762. /*We assume that the DC coefficient is always positive (which is true,
  763. because the input to the INTRA transform was not a difference).*/
  764. movzx DC,DC_WORD
  765. add RET,RET
  766. sub RET,DC
  767. movq mm4,mm0
  768. punpckhdq mm0,mm0
  769. paddd mm4,mm0
  770. movd RET2,mm4
  771. lea RET,[-64+RET+RET2*2]
  772. mov [dc],DC
  773. mov [ret1],RET
  774. #undef SRC
  775. #undef SRC4
  776. #undef BUF
  777. #undef YSTRIDE
  778. #undef YSTRIDE3
  779. #undef RET
  780. #undef RET2
  781. #undef DC
  782. #undef DC_WORD
  783. }
  784. *_dc=dc;
  785. return ret1;
  786. }
  787. void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
  788. const unsigned char *_src, const unsigned char *_ref,int _ystride){
  789. int i;
  790. __asm pxor mm7,mm7
  791. for(i=4;i-->0;){
  792. __asm{
  793. #define SRC edx
  794. #define YSTRIDE esi
  795. #define RESIDUE eax
  796. #define REF ecx
  797. mov YSTRIDE,_ystride
  798. mov RESIDUE,_residue
  799. mov SRC,_src
  800. mov REF,_ref
  801. /*mm0=[src]*/
  802. movq mm0,[SRC]
  803. /*mm1=[ref]*/
  804. movq mm1,[REF]
  805. /*mm4=[src+ystride]*/
  806. movq mm4,[SRC+YSTRIDE]
  807. /*mm5=[ref+ystride]*/
  808. movq mm5,[REF+YSTRIDE]
  809. /*Compute [src]-[ref].*/
  810. movq mm2,mm0
  811. punpcklbw mm0,mm7
  812. movq mm3,mm1
  813. punpckhbw mm2,mm7
  814. punpcklbw mm1,mm7
  815. punpckhbw mm3,mm7
  816. psubw mm0,mm1
  817. psubw mm2,mm3
  818. /*Compute [src+ystride]-[ref+ystride].*/
  819. movq mm1,mm4
  820. punpcklbw mm4,mm7
  821. movq mm3,mm5
  822. punpckhbw mm1,mm7
  823. lea SRC,[SRC+YSTRIDE*2]
  824. punpcklbw mm5,mm7
  825. lea REF,[REF+YSTRIDE*2]
  826. punpckhbw mm3,mm7
  827. psubw mm4,mm5
  828. psubw mm1,mm3
  829. /*Write the answer out.*/
  830. movq [RESIDUE+0x00],mm0
  831. movq [RESIDUE+0x08],mm2
  832. movq [RESIDUE+0x10],mm4
  833. movq [RESIDUE+0x18],mm1
  834. lea RESIDUE,[RESIDUE+0x20]
  835. mov _residue,RESIDUE
  836. mov _src,SRC
  837. mov _ref,REF
  838. #undef SRC
  839. #undef YSTRIDE
  840. #undef RESIDUE
  841. #undef REF
  842. }
  843. }
  844. }
  845. void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
  846. const unsigned char *_src,int _ystride){
  847. __asm{
  848. #define YSTRIDE edx
  849. #define YSTRIDE3 edi
  850. #define RESIDUE ecx
  851. #define SRC eax
  852. mov YSTRIDE,_ystride
  853. mov RESIDUE,_residue
  854. mov SRC,_src
  855. /*mm0=[src]*/
  856. movq mm0,[SRC]
  857. /*mm1=[src+ystride]*/
  858. movq mm1,[SRC+YSTRIDE]
  859. /*mm6={-1}x4*/
  860. pcmpeqw mm6,mm6
  861. /*mm2=[src+2*ystride]*/
  862. movq mm2,[SRC+YSTRIDE*2]
  863. /*[ystride3]=3*[ystride]*/
  864. lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
  865. /*mm6={1}x4*/
  866. psllw mm6,15
  867. /*mm3=[src+3*ystride]*/
  868. movq mm3,[SRC+YSTRIDE3]
  869. /*mm6={128}x4*/
  870. psrlw mm6,8
  871. /*mm7=0*/
  872. pxor mm7,mm7
  873. /*[src]=[src]+4*[ystride]*/
  874. lea SRC,[SRC+YSTRIDE*4]
  875. /*Compute [src]-128 and [src+ystride]-128*/
  876. movq mm4,mm0
  877. punpcklbw mm0,mm7
  878. movq mm5,mm1
  879. punpckhbw mm4,mm7
  880. psubw mm0,mm6
  881. punpcklbw mm1,mm7
  882. psubw mm4,mm6
  883. punpckhbw mm5,mm7
  884. psubw mm1,mm6
  885. psubw mm5,mm6
  886. /*Write the answer out.*/
  887. movq [RESIDUE+0x00],mm0
  888. movq [RESIDUE+0x08],mm4
  889. movq [RESIDUE+0x10],mm1
  890. movq [RESIDUE+0x18],mm5
  891. /*mm0=[src+4*ystride]*/
  892. movq mm0,[SRC]
  893. /*mm1=[src+5*ystride]*/
  894. movq mm1,[SRC+YSTRIDE]
  895. /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
  896. movq mm4,mm2
  897. punpcklbw mm2,mm7
  898. movq mm5,mm3
  899. punpckhbw mm4,mm7
  900. psubw mm2,mm6
  901. punpcklbw mm3,mm7
  902. psubw mm4,mm6
  903. punpckhbw mm5,mm7
  904. psubw mm3,mm6
  905. psubw mm5,mm6
  906. /*Write the answer out.*/
  907. movq [RESIDUE+0x20],mm2
  908. movq [RESIDUE+0x28],mm4
  909. movq [RESIDUE+0x30],mm3
  910. movq [RESIDUE+0x38],mm5
  911. /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
  912. movq mm2,[SRC+YSTRIDE*2]
  913. movq mm3,[SRC+YSTRIDE3]
  914. movq mm4,mm0
  915. punpcklbw mm0,mm7
  916. movq mm5,mm1
  917. punpckhbw mm4,mm7
  918. psubw mm0,mm6
  919. punpcklbw mm1,mm7
  920. psubw mm4,mm6
  921. punpckhbw mm5,mm7
  922. psubw mm1,mm6
  923. psubw mm5,mm6
  924. /*Write the answer out.*/
  925. movq [RESIDUE+0x40],mm0
  926. movq [RESIDUE+0x48],mm4
  927. movq [RESIDUE+0x50],mm1
  928. movq [RESIDUE+0x58],mm5
  929. /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
  930. movq mm4,mm2
  931. punpcklbw mm2,mm7
  932. movq mm5,mm3
  933. punpckhbw mm4,mm7
  934. psubw mm2,mm6
  935. punpcklbw mm3,mm7
  936. psubw mm4,mm6
  937. punpckhbw mm5,mm7
  938. psubw mm3,mm6
  939. psubw mm5,mm6
  940. /*Write the answer out.*/
  941. movq [RESIDUE+0x60],mm2
  942. movq [RESIDUE+0x68],mm4
  943. movq [RESIDUE+0x70],mm3
  944. movq [RESIDUE+0x78],mm5
  945. #undef YSTRIDE
  946. #undef YSTRIDE3
  947. #undef RESIDUE
  948. #undef SRC
  949. }
  950. }
  /*Public entry point that writes the average of two 8x8 blocks to _dst, with
     one row stride shared by all three blocks.
    _dst:     Pointer to the first row of the destination block.
    _src1:    Pointer to the first row of the first source block.
    _src2:    Pointer to the first row of the second source block.
    _ystride: Byte offset between consecutive rows of _dst, _src1 and _src2.
    All of the work is done by oc_int_frag_copy2_mmxext().*/
  void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
   const unsigned char *_src1,const unsigned char *_src2,int _ystride){
    oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
  }

  #endif