mmxencfrag.c

/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"
#if defined(OC_X86_ASM)
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  ptrdiff_t ret;
  __asm{
#define SRC esi
#define REF edx
#define YSTRIDE ecx
#define YSTRIDE3 edi
    mov YSTRIDE,_ystride
    mov SRC,_src
    mov REF,_ref
    /*Load the first 4 rows of each block.*/
    movq mm0,[SRC]
    movq mm1,[REF]
    movq mm2,[SRC+YSTRIDE]
    movq mm3,[REF+YSTRIDE]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    movq mm4,[SRC+YSTRIDE*2]
    movq mm5,[REF+YSTRIDE*2]
    movq mm6,[SRC+YSTRIDE3]
    movq mm7,[REF+YSTRIDE3]
    /*Compute their SADs and add them in mm0*/
    psadbw mm0,mm1
    psadbw mm2,mm3
    lea SRC,[SRC+YSTRIDE*4]
    paddw mm0,mm2
    lea REF,[REF+YSTRIDE*4]
    /*Load the next 3 rows as registers become available.*/
    movq mm2,[SRC]
    movq mm3,[REF]
    psadbw mm4,mm5
    psadbw mm6,mm7
    paddw mm0,mm4
    movq mm5,[REF+YSTRIDE]
    movq mm4,[SRC+YSTRIDE]
    paddw mm0,mm6
    movq mm7,[REF+YSTRIDE*2]
    movq mm6,[SRC+YSTRIDE*2]
    /*Start adding their SADs to mm0*/
    psadbw mm2,mm3
    psadbw mm4,mm5
    paddw mm0,mm2
    psadbw mm6,mm7
    /*Load last row as registers become available.*/
    movq mm2,[SRC+YSTRIDE3]
    movq mm3,[REF+YSTRIDE3]
    /*And finish adding up their SADs.*/
    paddw mm0,mm4
    psadbw mm2,mm3
    paddw mm0,mm6
    paddw mm0,mm2
    movd [ret],mm0
#undef SRC
#undef REF
#undef YSTRIDE
#undef YSTRIDE3
  }
  return (unsigned)ret;
}
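/*For reference, a plain C sketch of the 8x8 SAD the psadbw loop above
   computes: the sum of absolute differences between corresponding pixels of
   the source and reference fragments.
  The function name is illustrative only (it is not part of the codec), and
   the block is kept out of the build:*/
#if 0
static unsigned oc_sad8x8_c_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      sad+=d<0?-d:d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}
#endif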
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  /*Early termination is for suckers.*/
  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
}
#define OC_SAD2_LOOP __asm{ \
  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
     pavgb computes (mm0+mm1+1>>1). \
    The latter is exactly 1 too large when the low bit of two corresponding \
     bytes is only set in one of them. \
    Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
     correct the output of pavgb.*/ \
  __asm movq mm6,mm0 \
  __asm lea REF1,[REF1+YSTRIDE*2] \
  __asm pxor mm0,mm1 \
  __asm pavgb mm6,mm1 \
  __asm lea REF2,[REF2+YSTRIDE*2] \
  __asm movq mm1,mm2 \
  __asm pand mm0,mm7 \
  __asm pavgb mm2,mm3 \
  __asm pxor mm1,mm3 \
  __asm movq mm3,[REF2+YSTRIDE] \
  __asm psubb mm6,mm0 \
  __asm movq mm0,[REF1] \
  __asm pand mm1,mm7 \
  __asm psadbw mm4,mm6 \
  __asm movd mm6,RET \
  __asm psubb mm2,mm1 \
  __asm movq mm1,[REF2] \
  __asm lea SRC,[SRC+YSTRIDE*2] \
  __asm psadbw mm5,mm2 \
  __asm movq mm2,[REF1+YSTRIDE] \
  __asm paddw mm5,mm4 \
  __asm movq mm4,[SRC] \
  __asm paddw mm6,mm5 \
  __asm movq mm5,[SRC+YSTRIDE] \
  __asm movd RET,mm6 \
}
/*Same as above, but does not pre-load the next two rows.*/
#define OC_SAD2_TAIL __asm{ \
  __asm movq mm6,mm0 \
  __asm pavgb mm0,mm1 \
  __asm pxor mm6,mm1 \
  __asm movq mm1,mm2 \
  __asm pand mm6,mm7 \
  __asm pavgb mm2,mm3 \
  __asm pxor mm1,mm3 \
  __asm psubb mm0,mm6 \
  __asm pand mm1,mm7 \
  __asm psadbw mm4,mm0 \
  __asm psubb mm2,mm1 \
  __asm movd mm6,RET \
  __asm psadbw mm5,mm2 \
  __asm paddw mm5,mm4 \
  __asm paddw mm6,mm5 \
  __asm movd RET,mm6 \
}
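/*A scalar sketch of the rounding correction used by OC_SAD2_LOOP and
   OC_SAD2_TAIL above: pavgb computes (a+b+1)>>1, which is one too large
   exactly when the low bits of a and b differ, so subtracting (a^b)&1 yields
   the truncating average (a+b)>>1.
  The function name is illustrative only and the block is kept out of the
   build:*/
#if 0
static unsigned char oc_avg_trunc_sketch(unsigned char _a,unsigned char _b){
  unsigned char avg_round;
  /*This is what pavgb computes for each byte lane.*/
  avg_round=(unsigned char)((_a+_b+1)>>1);
  /*Undo the rounding in the lanes where it occurred.*/
  return (unsigned char)(avg_round-((_a^_b)&1));
}
#endif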
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh){
  ptrdiff_t ret;
  __asm{
#define REF1 ecx
#define REF2 edi
#define YSTRIDE esi
#define SRC edx
#define RET eax
    mov YSTRIDE,_ystride
    mov SRC,_src
    mov REF1,_ref1
    mov REF2,_ref2
    movq mm0,[REF1]
    movq mm1,[REF2]
    movq mm2,[REF1+YSTRIDE]
    movq mm3,[REF2+YSTRIDE]
    xor RET,RET
    movq mm4,[SRC]
    pxor mm7,mm7
    pcmpeqb mm6,mm6
    movq mm5,[SRC+YSTRIDE]
    psubb mm7,mm6
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_TAIL
    mov [ret],RET
#undef REF1
#undef REF2
#undef YSTRIDE
#undef SRC
#undef RET
  }
  return (unsigned)ret;
}
/*Load an 8x4 array of pixel values from _src and _ref and compute their
   16-bit difference in mm0...mm7.*/
#define OC_LOAD_SUB_8x4(_off) __asm{ \
  __asm movd mm0,[_off+SRC] \
  __asm movd mm4,[_off+REF] \
  __asm movd mm1,[_off+SRC+SRC_YSTRIDE] \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm movd mm5,[_off+REF+REF_YSTRIDE] \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm movd mm2,[_off+SRC] \
  __asm movd mm7,[_off+REF] \
  __asm movd mm3,[_off+SRC+SRC_YSTRIDE] \
  __asm movd mm6,[_off+REF+REF_YSTRIDE] \
  __asm punpcklbw mm0,mm4 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm punpcklbw mm4,mm4 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm psubw mm0,mm4 \
  __asm movd mm4,[_off+SRC] \
  __asm movq [_off*2+BUF],mm0 \
  __asm movd mm0,[_off+REF] \
  __asm punpcklbw mm1,mm5 \
  __asm punpcklbw mm5,mm5 \
  __asm psubw mm1,mm5 \
  __asm movd mm5,[_off+SRC+SRC_YSTRIDE] \
  __asm punpcklbw mm2,mm7 \
  __asm punpcklbw mm7,mm7 \
  __asm psubw mm2,mm7 \
  __asm movd mm7,[_off+REF+REF_YSTRIDE] \
  __asm punpcklbw mm3,mm6 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm punpcklbw mm6,mm6 \
  __asm psubw mm3,mm6 \
  __asm movd mm6,[_off+SRC] \
  __asm punpcklbw mm4,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm punpcklbw mm0,mm0 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm psubw mm4,mm0 \
  __asm movd mm0,[_off+REF] \
  __asm punpcklbw mm5,mm7 \
  __asm neg SRC_YSTRIDE \
  __asm punpcklbw mm7,mm7 \
  __asm psubw mm5,mm7 \
  __asm movd mm7,[_off+SRC+SRC_YSTRIDE] \
  __asm punpcklbw mm6,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm punpcklbw mm0,mm0 \
  __asm neg REF_YSTRIDE \
  __asm psubw mm6,mm0 \
  __asm movd mm0,[_off+REF+REF_YSTRIDE] \
  __asm lea SRC,[SRC+SRC_YSTRIDE*8] \
  __asm punpcklbw mm7,mm0 \
  __asm neg SRC_YSTRIDE \
  __asm punpcklbw mm0,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*8] \
  __asm psubw mm7,mm0 \
  __asm neg REF_YSTRIDE \
  __asm movq mm0,[_off*2+BUF] \
}
/*Load an 8x4 array of pixel values from _src into mm0...mm7.*/
#define OC_LOAD_8x4(_off) __asm{ \
  __asm movd mm0,[_off+SRC] \
  __asm movd mm1,[_off+SRC+YSTRIDE] \
  __asm movd mm2,[_off+SRC+YSTRIDE*2] \
  __asm pxor mm7,mm7 \
  __asm movd mm3,[_off+SRC+YSTRIDE3] \
  __asm punpcklbw mm0,mm7 \
  __asm movd mm4,[_off+SRC4] \
  __asm punpcklbw mm1,mm7 \
  __asm movd mm5,[_off+SRC4+YSTRIDE] \
  __asm punpcklbw mm2,mm7 \
  __asm movd mm6,[_off+SRC4+YSTRIDE*2] \
  __asm punpcklbw mm3,mm7 \
  __asm movd mm7,[_off+SRC4+YSTRIDE3] \
  __asm punpcklbw mm4,mm4 \
  __asm punpcklbw mm5,mm5 \
  __asm psrlw mm4,8 \
  __asm psrlw mm5,8 \
  __asm punpcklbw mm6,mm6 \
  __asm punpcklbw mm7,mm7 \
  __asm psrlw mm6,8 \
  __asm psrlw mm7,8 \
}
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 __asm{ \
  /*Stage A: \
     Outputs 0-3 are swapped with 4-7 here.*/ \
  __asm paddw mm5,mm1 \
  __asm paddw mm6,mm2 \
  __asm paddw mm1,mm1 \
  __asm paddw mm2,mm2 \
  __asm psubw mm1,mm5 \
  __asm psubw mm2,mm6 \
  __asm paddw mm7,mm3 \
  __asm paddw mm4,mm0 \
  __asm paddw mm3,mm3 \
  __asm paddw mm0,mm0 \
  __asm psubw mm3,mm7 \
  __asm psubw mm0,mm4 \
  /*Stage B:*/ \
  __asm paddw mm0,mm2 \
  __asm paddw mm1,mm3 \
  __asm paddw mm4,mm6 \
  __asm paddw mm5,mm7 \
  __asm paddw mm2,mm2 \
  __asm paddw mm3,mm3 \
  __asm paddw mm6,mm6 \
  __asm paddw mm7,mm7 \
  __asm psubw mm2,mm0 \
  __asm psubw mm3,mm1 \
  __asm psubw mm6,mm4 \
  __asm psubw mm7,mm5 \
}
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 __asm{ \
  /*Stage C:*/ \
  __asm paddw mm0,mm1 \
  __asm paddw mm2,mm3 \
  __asm paddw mm4,mm5 \
  __asm paddw mm6,mm7 \
  __asm paddw mm1,mm1 \
  __asm paddw mm3,mm3 \
  __asm paddw mm5,mm5 \
  __asm paddw mm7,mm7 \
  __asm psubw mm1,mm0 \
  __asm psubw mm3,mm2 \
  __asm psubw mm5,mm4 \
  __asm psubw mm7,mm6 \
}
/*Performs an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x4 __asm{ \
  OC_HADAMARD_AB_8x4 \
  OC_HADAMARD_C_8x4 \
}
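/*For reference, the conventional three-stage butterfly structure of an
   8-point 1-D Hadamard transform, without the output swapping and negation
   tricks the macros above use to stay in place (SATD only sums absolute
   values, so those reorderings and sign flips do not matter).
  The function name is illustrative only and the block is kept out of the
   build:*/
#if 0
static void oc_hadamard8_c_sketch(ogg_int16_t _x[8]){
  ogg_int16_t t[8];
  int         i;
  /*Stage A: butterflies at distance 4.*/
  for(i=0;i<4;i++){
    t[i]=_x[i]+_x[i+4];
    t[i+4]=_x[i]-_x[i+4];
  }
  /*Stage B: butterflies at distance 2.*/
  for(i=0;i<8;i+=4){
    ogg_int16_t a;
    ogg_int16_t b;
    a=t[i]+t[i+2];
    b=t[i+1]+t[i+3];
    t[i+2]=t[i]-t[i+2];
    t[i+3]=t[i+1]-t[i+3];
    t[i]=a;
    t[i+1]=b;
  }
  /*Stage C: butterflies at distance 1.*/
  for(i=0;i<8;i+=2){
    _x[i]=t[i]+t[i+1];
    _x[i+1]=t[i]-t[i+1];
  }
}
#endif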
/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  At the end of this part, mm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
  /*We use the fact that \
     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
     to merge the final butterfly with the abs and the first stage of \
     accumulation. \
    Thus we can avoid using pabsw, which is not available until SSSE3. \
    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
     registers). \
    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
    This implementation is only 26 (+4 for spilling registers).*/ \
  __asm movq [_r7+BUF],mm7 \
  __asm movq [_r6+BUF],mm6 \
  /*mm7={0x7FFF}x4 \
    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
  __asm pcmpeqb mm7,mm7 \
  __asm movq mm6,mm0 \
  __asm psrlw mm7,1 \
  __asm paddw mm6,mm1 \
  __asm pmaxsw mm0,mm1 \
  __asm paddsw mm6,mm7 \
  __asm psubw mm0,mm6 \
  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
  __asm movq mm6,mm2 \
  __asm movq mm1,mm4 \
  __asm pmaxsw mm2,mm3 \
  __asm pmaxsw mm4,mm5 \
  __asm paddw mm6,mm3 \
  __asm paddw mm1,mm5 \
  __asm movq mm3,[_r7+BUF] \
}
/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
  __asm paddsw mm6,mm7 \
  __asm movq mm5,[_r6+BUF] \
  __asm paddsw mm1,mm7 \
  __asm psubw mm2,mm6 \
  __asm psubw mm4,mm1 \
  /*mm7={1}x4 (needed for the horizontal add that follows) \
    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
  __asm movq mm6,mm3 \
  __asm pmaxsw mm3,mm5 \
  __asm paddw mm0,mm2 \
  __asm paddw mm6,mm5 \
  __asm paddw mm0,mm4 \
  __asm paddsw mm6,mm7 \
  __asm paddw mm0,mm3 \
  __asm psrlw mm7,14 \
  __asm psubw mm0,mm6 \
}
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into mm0.
  This is the only portion of SATD which requires MMXEXT (we could use plain
   MMX, but it takes 4 instructions and an extra register to work around the
   lack of a pmaxsw, which is a pretty serious penalty).*/
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
}
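/*A scalar illustration of the identity the macros above rely on:
   abs(a+b)+abs(a-b)==2*max(abs(a),abs(b)), which lets the final butterfly,
   the absolute values, and the first accumulation step collapse into a single
   pmaxsw, at the cost of producing half the conventional sum.
  The function name is illustrative only and the block is kept out of the
   build:*/
#if 0
static int oc_max_abs_sketch(int _a,int _b){
  int abs_sum;
  int abs_diff;
  abs_sum=_a+_b<0?-(_a+_b):_a+_b;
  abs_diff=_a-_b<0?-(_a-_b):_a-_b;
  /*(|a+b|+|a-b|)/2 is always equal to max(|a|,|b|).*/
  return (abs_sum+abs_diff)>>1;
}
#endif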
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into mm0.
  Note that mm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
  OC_HADAMARD_AB_8x4 \
  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
}
/*Performs two 4x4 transposes (mostly) in place.
  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
   contains rows {a,b,c,d}.
  On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
  /*First 4x4 transpose:*/ \
  __asm movq [0x10+_off+BUF],mm5 \
  /*mm0 = e3 e2 e1 e0 \
    mm1 = f3 f2 f1 f0 \
    mm2 = g3 g2 g1 g0 \
    mm3 = h3 h2 h1 h0*/ \
  __asm movq mm5,mm2 \
  __asm punpcklwd mm2,mm3 \
  __asm punpckhwd mm5,mm3 \
  __asm movq mm3,mm0 \
  __asm punpcklwd mm0,mm1 \
  __asm punpckhwd mm3,mm1 \
  /*mm0 = f1 e1 f0 e0 \
    mm3 = f3 e3 f2 e2 \
    mm2 = h1 g1 h0 g0 \
    mm5 = h3 g3 h2 g2*/ \
  __asm movq mm1,mm0 \
  __asm punpckldq mm0,mm2 \
  __asm punpckhdq mm1,mm2 \
  __asm movq mm2,mm3 \
  __asm punpckhdq mm3,mm5 \
  __asm movq [0x40+_off+BUF],mm0 \
  __asm punpckldq mm2,mm5 \
  /*mm0 = h0 g0 f0 e0 \
    mm1 = h1 g1 f1 e1 \
    mm2 = h2 g2 f2 e2 \
    mm3 = h3 g3 f3 e3*/ \
  __asm movq mm5,[0x10+_off+BUF] \
  /*Second 4x4 transpose:*/ \
  /*mm4 = a3 a2 a1 a0 \
    mm5 = b3 b2 b1 b0 \
    mm6 = c3 c2 c1 c0 \
    mm7 = d3 d2 d1 d0*/ \
  __asm movq mm0,mm6 \
  __asm punpcklwd mm6,mm7 \
  __asm movq [0x50+_off+BUF],mm1 \
  __asm punpckhwd mm0,mm7 \
  __asm movq mm7,mm4 \
  __asm punpcklwd mm4,mm5 \
  __asm movq [0x60+_off+BUF],mm2 \
  __asm punpckhwd mm7,mm5 \
  /*mm4 = b1 a1 b0 a0 \
    mm7 = b3 a3 b2 a2 \
    mm6 = d1 c1 d0 c0 \
    mm0 = d3 c3 d2 c2*/ \
  __asm movq mm5,mm4 \
  __asm punpckldq mm4,mm6 \
  __asm movq [0x70+_off+BUF],mm3 \
  __asm punpckhdq mm5,mm6 \
  __asm movq mm6,mm7 \
  __asm punpckhdq mm7,mm0 \
  __asm punpckldq mm6,mm0 \
  /*mm4 = d0 c0 b0 a0 \
    mm5 = d1 c1 b1 a1 \
    mm6 = d2 c2 b2 a2 \
    mm7 = d3 c3 b3 a3*/ \
}
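/*What the unpack sequence above amounts to: a 4x4 transpose of 16-bit
   elements, performed twice (once per 4x4 half of the 8x4 block).
  A plain C sketch of a single transpose; the function name is illustrative
   only and the block is kept out of the build:*/
#if 0
static void oc_transpose4x4_c_sketch(ogg_int16_t _m[4][4]){
  int i;
  int j;
  for(i=0;i<4;i++)for(j=i+1;j<4;j++){
    ogg_int16_t t;
    t=_m[i][j];
    _m[i][j]=_m[j][i];
    _m[j][i]=t;
  }
}
#endif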
static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
 int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
  OC_ALIGN8(ogg_int16_t buf[64]);
  ogg_int16_t *bufp;
  unsigned     ret1;
  unsigned     ret2;
  bufp=buf;
  __asm{
#define SRC esi
#define REF eax
#define SRC_YSTRIDE ecx
#define REF_YSTRIDE edx
#define BUF edi
#define RET eax
#define RET2 edx
    mov SRC,_src
    mov SRC_YSTRIDE,_src_ystride
    mov REF,_ref
    mov REF_YSTRIDE,_ref_ystride
    mov BUF,bufp
    OC_LOAD_SUB_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    movq [0x00+BUF],mm4
    movq [0x10+BUF],mm5
    movq [0x20+BUF],mm6
    movq [0x30+BUF],mm7
    OC_LOAD_SUB_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    movq mm1,[0x10+BUF]
    movq mm2,[0x20+BUF]
    movq mm3,[0x30+BUF]
    movq mm0,[0x00+BUF]
    OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    mov RET2,_thresh
    pmaddwd mm0,mm7
    movq mm1,[0x50+BUF]
    movq mm5,[0x58+BUF]
    movq mm4,mm0
    movq mm2,[0x60+BUF]
    punpckhdq mm0,mm0
    movq mm6,[0x68+BUF]
    paddd mm4,mm0
    movq mm3,[0x70+BUF]
    movd RET,mm4
    movq mm7,[0x78+BUF]
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
       added to them, and a factor of two removed; correct the final sum here.*/
    lea RET,[RET+RET-32]
    movq mm0,[0x40+BUF]
    cmp RET,RET2
    movq mm4,[0x48+BUF]
    jae at_end
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    pmaddwd mm0,mm7
    /*There isn't much to stick in here to hide the latency this time, but the
       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
       latency is even worse.*/
    sub RET,32
    movq mm4,mm0
    punpckhdq mm0,mm0
    paddd mm4,mm0
    movd RET2,mm4
    lea RET,[RET+RET2*2]
    align 16
at_end:
    mov ret1,RET
#undef SRC
#undef REF
#undef SRC_YSTRIDE
#undef REF_YSTRIDE
#undef BUF
#undef RET
#undef RET2
  }
  return ret1;
}
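/*Conceptually, the routine above computes an 8x8 SATD: the difference block
   src-ref is run through 1-D Hadamard transforms along rows and then columns,
   and the absolute values of the resulting coefficients are summed.
  A scalar sketch of that idea follows; it ignores the half-value bookkeeping
   and the bias corrections the assembly applies at the end, so it is not
   claimed to be bit-exact with the routine above.
  The function name is illustrative only and the block is kept out of the
   build:*/
#if 0
static unsigned oc_satd8x8_c_sketch(const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  int      d[8][8];
  unsigned satd;
  int      i;
  int      j;
  int      k;
  /*Compute the difference block.*/
  for(i=0;i<8;i++)for(j=0;j<8;j++){
    d[i][j]=_src[i*_src_ystride+j]-_ref[i*_ref_ystride+j];
  }
  /*Unnormalized 8-point Hadamard butterflies along each row...*/
  for(i=0;i<8;i++)for(k=1;k<8;k<<=1)for(j=0;j<8;j++)if(!(j&k)){
    int a;
    a=d[i][j];
    d[i][j]=a+d[i][j+k];
    d[i][j+k]=a-d[i][j+k];
  }
  /*...and then along each column.*/
  for(j=0;j<8;j++)for(k=1;k<8;k<<=1)for(i=0;i<8;i++)if(!(i&k)){
    int a;
    a=d[i][j];
    d[i][j]=a+d[i+k][j];
    d[i+k][j]=a-d[i+k][j];
  }
  /*Sum the absolute values of the transformed coefficients.*/
  satd=0;
  for(i=0;i<8;i++)for(j=0;j<8;j++)satd+=d[i][j]<0?-d[i][j]:d[i][j];
  return satd;
}
#endif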
unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
}
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
   we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  __asm{
    /*Load the first 3 rows.*/
#define DST_YSTRIDE edi
#define SRC_YSTRIDE esi
#define DST eax
#define SRC1 edx
#define SRC2 ecx
    mov DST_YSTRIDE,_dst_ystride
    mov SRC_YSTRIDE,_src_ystride
    mov DST,_dst
    mov SRC1,_src1
    mov SRC2,_src2
    movq mm0,[SRC1]
    movq mm1,[SRC2]
    movq mm2,[SRC1+SRC_YSTRIDE]
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    movq mm3,[SRC2+SRC_YSTRIDE]
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    pxor mm7,mm7
    movq mm4,[SRC1]
    pcmpeqb mm6,mm6
    movq mm5,[SRC2]
    /*mm7={1}x8.*/
    psubb mm7,mm6
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    pxor mm0,mm1
    pavgb mm6,mm1
    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
    movq mm1,mm2
    pand mm0,mm7
    pavgb mm2,mm3
    pxor mm1,mm3
    /*mm3 is free.*/
    psubb mm6,mm0
    /*mm0 is free, start loading the next row.*/
    movq mm0,[SRC1+SRC_YSTRIDE]
    /*Start averaging mm5 and mm4 using mm3.*/
    movq mm3,mm4
    /*mm6 [row 0] is done; write it out.*/
    movq [DST],mm6
    pand mm1,mm7
    pavgb mm4,mm5
    psubb mm2,mm1
    /*mm1 is free, continue loading the next row.*/
    movq mm1,[SRC2+SRC_YSTRIDE]
    pxor mm3,mm5
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    /*mm2 [row 1] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm2
    pand mm3,mm7
    /*Start loading the next row.*/
    movq mm2,[SRC1]
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm4,mm3
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    /*mm4 [row 2] is done; write it out.*/
    movq [DST],mm4
    /*Continue loading the next row.*/
    movq mm3,[SRC2]
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    pxor mm0,mm1
    /*Start loading the next row.*/
    movq mm4,[SRC1+SRC_YSTRIDE]
    pavgb mm6,mm1
    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
    movq mm1,mm2
    pand mm0,mm7
    /*Continue loading the next row.*/
    movq mm5,[SRC2+SRC_YSTRIDE]
    pavgb mm2,mm3
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    pxor mm1,mm3
    /*mm3 is free.*/
    psubb mm6,mm0
    /*mm0 is free, start loading the next row.*/
    movq mm0,[SRC1]
    /*Start averaging mm5 into mm4 using mm3.*/
    movq mm3,mm4
    /*mm6 [row 3] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm6
    pand mm1,mm7
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    pavgb mm4,mm5
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm2,mm1
    /*mm1 is free; continue loading the next row.*/
    movq mm1,[SRC2]
    pxor mm3,mm5
    /*mm2 [row 4] is done; write it out.*/
    movq [DST],mm2
    pand mm3,mm7
    /*Start loading the next row.*/
    movq mm2,[SRC1+SRC_YSTRIDE]
    psubb mm4,mm3
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    /*Continue loading the next row.*/
    movq mm3,[SRC2+SRC_YSTRIDE]
    /*mm4 [row 5] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm4
    pxor mm0,mm1
    pavgb mm6,mm1
    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
    movq mm4,mm2
    pand mm0,mm7
    pavgb mm2,mm3
    pxor mm4,mm3
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm6,mm0
    pand mm4,mm7
    /*mm6 [row 6] is done, write it out.*/
    movq [DST],mm6
    psubb mm2,mm4
    /*mm2 [row 7] is done, write it out.*/
    movq [DST+DST_YSTRIDE],mm2
#undef SRC1
#undef SRC2
#undef SRC_YSTRIDE
#undef DST_YSTRIDE
#undef DST
  }
}
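/*A plain C sketch of what the routine above computes: each output pixel is
   the truncating average of the two source predictors, (src1+src2)>>1, for an
   8-pixel-wide, 8-row fragment.
  The function name is illustrative only and the block is kept out of the
   build:*/
#if 0
static void oc_frag_copy2_c_sketch(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=(unsigned char)((_src1[j]+_src2[j])>>1);
    _dst+=_dst_ystride;
    _src1+=_src_ystride;
    _src2+=_src_ystride;
  }
}
#endif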
unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
}
unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
 int _ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  ogg_int16_t *bufp;
  unsigned     ret1;
  unsigned     ret2;
  bufp=buf;
  __asm{
#define SRC eax
#define SRC4 esi
#define BUF edi
#define RET eax
#define RET_WORD ax
#define RET2 ecx
#define YSTRIDE edx
#define YSTRIDE3 ecx
    mov SRC,_src
    mov BUF,bufp
    mov YSTRIDE,_ystride
    /* src4 = src+4*ystride */
    lea SRC4,[SRC+YSTRIDE*4]
    /* ystride3 = 3*ystride */
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    OC_LOAD_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    movq [0x00+BUF],mm4
    movq [0x10+BUF],mm5
    movq [0x20+BUF],mm6
    movq [0x30+BUF],mm7
    OC_LOAD_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    movq mm1,[0x10+BUF]
    movq mm2,[0x20+BUF]
    movq mm3,[0x30+BUF]
    movq mm0,[0x00+BUF]
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    movd RET,mm1
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    pmaddwd mm0,mm7
    movq mm1,[0x50+BUF]
    movq mm5,[0x58+BUF]
    movq mm2,[0x60+BUF]
    movq mm4,mm0
    movq mm6,[0x68+BUF]
    punpckhdq mm0,mm0
    movq mm3,[0x70+BUF]
    paddd mm4,mm0
    movq mm7,[0x78+BUF]
    movd RET2,mm4
    movq mm0,[0x40+BUF]
    movq mm4,[0x48+BUF]
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    pmaddwd mm0,mm7
    /*We assume that the DC coefficient is always positive (which is true,
       because the input to the INTRA transform was not a difference).*/
    movzx RET,RET_WORD
    add RET2,RET2
    sub RET2,RET
    movq mm4,mm0
    punpckhdq mm0,mm0
    paddd mm4,mm0
    movd RET,mm4
    lea RET,[-64+RET2+RET*2]
    mov [ret1],RET
#undef SRC
#undef SRC4
#undef BUF
#undef RET
#undef RET_WORD
#undef RET2
#undef YSTRIDE
#undef YSTRIDE3
  }
  return ret1;
}
void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  __asm pxor mm7,mm7
  for(i=4;i-->0;){
    __asm{
#define SRC edx
#define YSTRIDE esi
#define RESIDUE eax
#define REF ecx
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      mov SRC,_src
      mov REF,_ref
      /*mm0=[src]*/
      movq mm0,[SRC]
      /*mm1=[ref]*/
      movq mm1,[REF]
      /*mm4=[src+ystride]*/
      movq mm4,[SRC+YSTRIDE]
      /*mm5=[ref+ystride]*/
      movq mm5,[REF+YSTRIDE]
      /*Compute [src]-[ref].*/
      movq mm2,mm0
      punpcklbw mm0,mm7
      movq mm3,mm1
      punpckhbw mm2,mm7
      punpcklbw mm1,mm7
      punpckhbw mm3,mm7
      psubw mm0,mm1
      psubw mm2,mm3
      /*Compute [src+ystride]-[ref+ystride].*/
      movq mm1,mm4
      punpcklbw mm4,mm7
      movq mm3,mm5
      punpckhbw mm1,mm7
      lea SRC,[SRC+YSTRIDE*2]
      punpcklbw mm5,mm7
      lea REF,[REF+YSTRIDE*2]
      punpckhbw mm3,mm7
      psubw mm4,mm5
      psubw mm1,mm3
      /*Write the answer out.*/
      movq [RESIDUE+0x00],mm0
      movq [RESIDUE+0x08],mm2
      movq [RESIDUE+0x10],mm4
      movq [RESIDUE+0x18],mm1
      lea RESIDUE,[RESIDUE+0x20]
      mov _residue,RESIDUE
      mov _src,SRC
      mov _ref,REF
#undef SRC
#undef YSTRIDE
#undef RESIDUE
#undef REF
    }
  }
}
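/*A plain C sketch of the subtraction above: the residue is simply the 16-bit
   per-pixel difference between an 8x8 source fragment and its reference.
  The function name is illustrative only and the block is kept out of the
   build:*/
#if 0
static void oc_frag_sub_c_sketch(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
}
#endif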
void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,int _ystride){
  __asm{
#define YSTRIDE edx
#define YSTRIDE3 edi
#define RESIDUE ecx
#define SRC eax
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    mov SRC,_src
    /*mm0=[src]*/
    movq mm0,[SRC]
    /*mm1=[src+ystride]*/
    movq mm1,[SRC+YSTRIDE]
    /*mm6={-1}x4*/
    pcmpeqw mm6,mm6
    /*mm2=[src+2*ystride]*/
    movq mm2,[SRC+YSTRIDE*2]
    /*[ystride3]=3*[ystride]*/
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*mm6={32768}x4*/
    psllw mm6,15
    /*mm3=[src+3*ystride]*/
    movq mm3,[SRC+YSTRIDE3]
    /*mm6={128}x4*/
    psrlw mm6,8
    /*mm7=0*/
    pxor mm7,mm7
    /*[src]=[src]+4*[ystride]*/
    lea SRC,[SRC+YSTRIDE*4]
    /*Compute [src]-128 and [src+ystride]-128*/
    movq mm4,mm0
    punpcklbw mm0,mm7
    movq mm5,mm1
    punpckhbw mm4,mm7
    psubw mm0,mm6
    punpcklbw mm1,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm1,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x00],mm0
    movq [RESIDUE+0x08],mm4
    movq [RESIDUE+0x10],mm1
    movq [RESIDUE+0x18],mm5
    /*mm0=[src+4*ystride]*/
    movq mm0,[SRC]
    /*mm1=[src+5*ystride]*/
    movq mm1,[SRC+YSTRIDE]
    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
    movq mm4,mm2
    punpcklbw mm2,mm7
    movq mm5,mm3
    punpckhbw mm4,mm7
    psubw mm2,mm6
    punpcklbw mm3,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm3,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x20],mm2
    movq [RESIDUE+0x28],mm4
    movq [RESIDUE+0x30],mm3
    movq [RESIDUE+0x38],mm5
    /*mm2=[src+6*ystride]*/
    movq mm2,[SRC+YSTRIDE*2]
    /*mm3=[src+7*ystride]*/
    movq mm3,[SRC+YSTRIDE3]
    /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
    movq mm4,mm0
    punpcklbw mm0,mm7
    movq mm5,mm1
    punpckhbw mm4,mm7
    psubw mm0,mm6
    punpcklbw mm1,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm1,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x40],mm0
    movq [RESIDUE+0x48],mm4
    movq [RESIDUE+0x50],mm1
    movq [RESIDUE+0x58],mm5
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
    movq mm4,mm2
    punpcklbw mm2,mm7
    movq mm5,mm3
    punpckhbw mm4,mm7
    psubw mm2,mm6
    punpcklbw mm3,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm3,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x60],mm2
    movq [RESIDUE+0x68],mm4
    movq [RESIDUE+0x70],mm3
    movq [RESIDUE+0x78],mm5
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
#undef SRC
  }
}
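/*A plain C sketch of the routine above: subtract the bias 128 from every
   pixel of an 8x8 source fragment (the assembly builds the {128}x4 constant
   with pcmpeqw/psllw/psrlw instead of loading it from memory).
  The function name is illustrative only and the block is kept out of the
   build:*/
#if 0
static void oc_frag_sub_128_c_sketch(ogg_int16_t _residue[64],
 const unsigned char *_src,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-128);
    _src+=_ystride;
  }
}
#endif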
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
}
#endif