c64xfrag.c
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id$

 ********************************************************************/
#include <string.h>
#include "c64xint.h"
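/*The routines below are built on the TI C6000 compiler intrinsics (_amem8,
   _mem8, _sadd2, _spacku4, _avgu4, etc.), which operate on packed bytes or
   halfwords eight pixels at a time.
  _amem8() and _amem8_const() assert 8-byte alignment; _mem8() and
   _mem8_const() allow unaligned accesses.*/
/*Copies a single 8x8 fragment.
  Two row pointers advance in parallel, so each OC_ITER() copies two rows and
   the four invocations cover all eight.
  The destination is written with _amem8() and is therefore assumed to be
   8-byte aligned; the source is read with _mem8() and need not be.*/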
/*14 cycles.*/
void oc_frag_copy_c64x(unsigned char *restrict _dst,
 const unsigned char *restrict _src,int _ystride){
  unsigned char *restrict d2;
  const unsigned char *restrict s2;
  d2=_dst+_ystride;
  s2=_src+_ystride;
#define OC_ITER() \
  do{ \
    _amem8(_dst)=_mem8(_src); \
    _dst+=2*_ystride; \
    _src+=2*_ystride; \
    _amem8(d2)=_mem8(s2); \
    d2+=2*_ystride; \
    s2+=2*_ystride; \
  } \
  while(0)
  OC_ITER();
  OC_ITER();
  OC_ITER();
  OC_ITER();
#undef OC_ITER
}
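/*Copies the 8x8 fragments whose indices are listed in the _nfragis entries
   of _fragis, using the per-fragment buffer offsets in _frag_buf_offs.
  Source and destination share the same offset within their frames, so both
   can use aligned 8-byte accesses here.*/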
void oc_frag_copy_list_c64x(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  /*9 cycles per iteration.*/
  for(fragii=0;fragii<_nfragis;fragii++){
    const unsigned char *restrict src;
    const unsigned char *restrict s2;
    unsigned char *restrict dst;
    unsigned char *restrict d2;
    ptrdiff_t frag_buf_off;
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
    dst=_dst_frame+frag_buf_off;
    src=_src_frame+frag_buf_off;
    d2=dst+_ystride;
    s2=src+_ystride;
#define OC_ITER() \
    do{ \
      _amem8(dst)=_amem8_const(src); \
      dst+=2*_ystride; \
      src+=2*_ystride; \
      _amem8(d2)=_amem8_const(s2); \
      d2+=2*_ystride; \
      s2+=2*_ystride; \
    } \
    while(0)
    OC_ITER();
    OC_ITER();
    OC_ITER();
    OC_ITER();
#undef OC_ITER
  }
}
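/*Reconstructs an intra-coded fragment.
  _sadd2() with 0x00800080 adds the +128 bias to each 16-bit residue value
   with saturation, and _spacku4() packs the results to unsigned bytes with
   saturation, producing eight clamped output pixels per row.*/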
/*34 cycles.*/
void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
 const ogg_int16_t _residue[64]){
  int i;
  for(i=0;i<8;i++){
    long long ll;
    int x1;
    int y1;
    int x2;
    int y2;
    ll=_amem8_const(_residue+i*8+0);
    x1=_sadd2(_loll(ll),0x00800080);
    y1=_sadd2(_hill(ll),0x00800080);
    ll=_amem8_const(_residue+i*8+4);
    x2=_sadd2(_loll(ll),0x00800080);
    y2=_sadd2(_hill(ll),0x00800080);
    _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
    _dst+=_ystride;
  }
}
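/*Reconstructs an inter-coded fragment from a single predictor.
  _unpklu4() and _unpkhu4() zero-extend the predictor bytes to 16 bits so the
   residue can be added with _sadd2() before repacking with saturation.*/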
/*41 cycles.*/
void oc_frag_recon_inter_c64x(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t _residue[64]){
  int i;
  for(i=0;i<8;i++){
    long long ll;
    int x1;
    int y1;
    int z1;
    int x2;
    int y2;
    int z2;
    ll=_mem8_const(_src);
    z1=_loll(ll);
    z2=_hill(ll);
    ll=_amem8_const(_residue+i*8+0);
    x1=_sadd2(_unpklu4(z1),_loll(ll));
    y1=_sadd2(_unpkhu4(z1),_hill(ll));
    ll=_amem8_const(_residue+i*8+4);
    x2=_sadd2(_unpklu4(z2),_loll(ll));
    y2=_sadd2(_unpkhu4(z2),_hill(ll));
    _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
    _dst+=_ystride;
    _src+=_ystride;
  }
}
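/*Reconstructs an inter-coded fragment from the average of two predictors.
  ~_avgu4(~a,~c) gives the per-byte average rounded down, (a+c>>1); _avgu4()
   by itself would round up.*/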
/*56 cycles.*/
void oc_frag_recon_inter2_c64x(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride,
 const ogg_int16_t _residue[64]){
  int i;
  for(i=0;i<8;i++){
    long long ll;
    int a;
    int b;
    int c;
    int d;
    int x1;
    int y1;
    int z1;
    int x2;
    int y2;
    int z2;
    ll=_mem8_const(_src1);
    a=_loll(ll);
    b=_hill(ll);
    ll=_mem8_const(_src2);
    c=_loll(ll);
    d=_hill(ll);
    ll=_amem8_const(_residue+i*8+0);
    z1=~_avgu4(~a,~c);
    x1=_sadd2(_unpklu4(z1),_loll(ll));
    y1=_sadd2(_unpkhu4(z1),_hill(ll));
    ll=_amem8_const(_residue+i*8+4);
    z2=~_avgu4(~b,~d);
    x2=_sadd2(_unpklu4(z2),_loll(ll));
    y2=_sadd2(_unpkhu4(z2),_hill(ll));
    _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
    _dst+=_ystride;
    _src1+=_ystride;
    _src2+=_ystride;
  }
}
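/*Dequantizes the DC coefficient and applies the inverse transform (or simply
   replicates the DC value when _last_zzi indicates a DC-only block), then
   reconstructs the fragment with the intra, inter, or two-reference inter
   routine above, according to the fragment's reference and motion vectors.*/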
void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  unsigned char *dst;
  ptrdiff_t frag_buf_off;
  int ystride;
  int refi;
  /*Apply the inverse transform.*/
  /*Special case only having a DC component.*/
  if(_last_zzi<2){
    int p;
    long long ll;
    int ci;
    /*We round this dequant product (and not any of the others) because
       there's no iDCT rounding.*/
    p=_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5;
    ll=_itoll(_pack2(p,p),_pack2(p,p));
    for(ci=0;ci<64;ci+=4)_amem8(_dct_coeffs+64+ci)=ll;
  }
  else{
    /*First, dequantize the DC coefficient.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8_c64x(_dct_coeffs+64,_dct_coeffs,_last_zzi);
  }
  /*Fill in the target buffer.*/
  frag_buf_off=_state->frag_buf_offs[_fragi];
  refi=_state->frags[_fragi].refi;
  ystride=_state->ref_ystride[_pli];
  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_c64x(dst,ystride,_dct_coeffs+64);
  else{
    const unsigned char *ref;
    int mvoffsets[2];
    ref=_state->ref_frame_data[refi]+frag_buf_off;
    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
     _state->frag_mvs[_fragi])>1){
      oc_frag_recon_inter2_c64x(dst,ref+mvoffsets[0],ref+mvoffsets[1],
       ystride,_dct_coeffs+64);
    }
    else oc_frag_recon_inter_c64x(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
  }
}
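/*Filters across a vertical block edge (horizontal filtering).
  Each row's filter value is computed with a single _dotpsu4() dot product of
   the four pixels straddling the edge against the signed taps packed into
   0x01FD03FF; four rows are then packed together and updated with the same
   saturating byte arithmetic used in loop_filter_v() below.*/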
/*46 cycles.*/
static void loop_filter_h(unsigned char *restrict _pix,int _ystride,int _ll){
  int p0;
  int p1;
  int p2;
  int p3;
  int p4;
  int p5;
  int p6;
  int p7;
  int y;
  _pix-=2;
  /*Do all the loads now to avoid the compiler's inability to prove they're
     not dependent on the stores later.*/
  p0=_mem4(_pix+_ystride*0);
  p1=_mem4(_pix+_ystride*1);
  p2=_mem4(_pix+_ystride*2);
  p3=_mem4(_pix+_ystride*3);
  p4=_mem4(_pix+_ystride*4);
  p5=_mem4(_pix+_ystride*5);
  p6=_mem4(_pix+_ystride*6);
  p7=_mem4(_pix+_ystride*7);
  for(y=0;y<8;y+=4){
    int f;
    int a;
    int b;
    int u;
    int v;
    /*We could pack things right after the dot product, but delaying it
       actually saves three cycles due to better instruction scheduling.*/
    a=_dotpsu4(0x01FD03FF,p0)+3>>3;
    b=_dotpsu4(0x01FD03FF,p1)+3>>3;
    u=_dotpsu4(0x01FD03FF,p2)+3>>3;
    v=_dotpsu4(0x01FD03FF,p3)+3>>3;
    f=_packl4(_pack2(v,u),_pack2(b,a));
    /*We split the results by sign and work with abs(f) here, since the C64x
       signed-unsigned addition with unsigned saturation is only available
       for 16-bit operands.
      For 8-bit operands, we have to emulate it with a saturated addition and
       a saturated subtraction using separate unsigned values.
      There's no direct support for 8-bit saturated subtraction, either, so
       we have to emulate that as well, using either x-_minu4(x,y) or
       ~_saddu4(~x,y), depending on which one schedules better.*/
    f=_add4(0x80808080,f);
    b=_minu4(0x80808080,f);
    a=0x80808080-b;
    b=f-b;
    /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
    u=_saddu4(a,_ll);
    v=_saddu4(b,_ll);
    a=_saddu4(a,u);
    b=_saddu4(b,v);
    a=a-_minu4(a,u);
    b=b-_minu4(b,v);
    /*Apply the changes to the original pixels.*/
    u=_pack2(p1>>8,p0>>8);
    v=_pack2(p3>>8,p2>>8);
    p1=_packl4(v,u);
    p2=_packh4(v,u);
    p1=_saddu4(~_saddu4(~p1,b),a);
    p2=_saddu4(p2-_minu4(p2,a),b);
    /*For unaligned short stores, we have to store byte by byte.
      It's faster to do it explicitly than to use _mem2().*/
    _pix[_ystride*0+1]=(unsigned char)p1;
    _pix[_ystride*0+2]=(unsigned char)p2;
    _pix[_ystride*1+1]=(unsigned char)(p1>>8);
    _pix[_ystride*1+2]=(unsigned char)(p2>>8);
    _pix[_ystride*2+1]=(unsigned char)(p1>>16);
    _pix[_ystride*2+2]=(unsigned char)(p2>>16);
    _pix[_ystride*3+1]=(unsigned char)(p1>>24);
    _pix[_ystride*3+2]=(unsigned char)(p2>>24);
    p0=p4;
    p1=p5;
    p2=p6;
    p3=p7;
    _pix+=4*_ystride;
  }
}
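/*Filters across a horizontal block edge (vertical filtering).
  The two rows above and the two rows below the edge are each loaded as a
   whole 8-byte word, so all eight columns are filtered in parallel.*/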
/*38 cycles.*/
static void loop_filter_v(unsigned char *restrict _pix,int _ystride,int _ll){
  long long ll;
  int p0;
  int p1;
  int p2;
  int p3;
  int p4;
  int p5;
  int p6;
  int p7;
  int a1;
  int b1;
  int f1;
  int m1;
  int u1;
  int v1;
  int a2;
  int b2;
  int f2;
  int m2;
  int u2;
  int v2;
  /*Do all the loads now to avoid the compiler's inability to prove they're
     not dependent on the stores later.*/
  ll=_amem8(_pix-_ystride*2);
  p0=_loll(ll);
  p4=_hill(ll);
  ll=_amem8(_pix-_ystride*1);
  p1=_loll(ll);
  p5=_hill(ll);
  ll=_amem8(_pix+_ystride*0);
  p2=_loll(ll);
  p6=_hill(ll);
  ll=_amem8(_pix+_ystride*1);
  p3=_loll(ll);
  p7=_hill(ll);
  /*I can't find a way to put the rest in a loop that the compiler thinks is
     unrollable, so instead it's unrolled manually.*/
  /*This first part is based on the transformation
      f = -(3*(p2-p1)+p0-p3+4>>3)
        = -(3*(p2+255-p1)+(p0+255-p3)+4-1020>>3)
        = -(3*(p2+~p1)+(p0+~p3)-1016>>3)
        = 127-(3*(p2+~p1)+(p0+~p3)>>3)
        = 128+~(3*(p2+~p1)+(p0+~p3)>>3) (mod 256).
    Although _avgu4(a,b) = (a+b+1>>1) (biased up), we rely heavily on the
     fact that ~_avgu4(~a,~b) = (a+b>>1) (biased down).*/
  /*We need this first average both biased up and biased down.*/
  u1=~_avgu4(~p1,p2);
  v1=_avgu4(p1,~p2);
  /*The difference controls whether (p3+255-p0>>1) is biased up or down.*/
  m1=_sub4(u1,v1);
  a1=m1^_avgu4(m1^~p0,m1^p3);
  f1=_avgu4(_avgu4(a1,u1),v1);
  /*Instead of removing the bias by 128, we use it to split f by sign, since
     the C64x signed-unsigned addition with unsigned saturation is only
     available for 16-bit operands.
    For 8-bit operands, we have to emulate it with a saturated addition and a
     saturated subtraction using separate unsigned values.
    There's no direct support for 8-bit saturated subtraction, either, so we
     have to emulate that as well, using either x-_minu4(x,y) or
     ~_saddu4(~x,y), depending on which one schedules better.*/
  b1=_minu4(0x80808080,f1);
  a1=0x80808080-b1;
  b1=f1-b1;
  /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
  u1=_saddu4(a1,_ll);
  v1=_saddu4(b1,_ll);
  a1=_saddu4(a1,u1);
  b1=_saddu4(b1,v1);
  a1=a1-_minu4(a1,u1);
  b1=b1-_minu4(b1,v1);
  /*Apply the changes to the original pixels.*/
  p1=_saddu4(p1-_minu4(p1,b1),a1);
  p2=_saddu4(p2-_minu4(p2,a1),b1);
  /*We need this first average both biased up and biased down.*/
  u2=~_avgu4(~p5,p6);
  v2=_avgu4(p5,~p6);
  /*The difference controls whether (p3+255-p0>>1) is biased up or down.*/
  m2=_sub4(u2,v2);
  a2=m2^_avgu4(m2^~p4,m2^p7);
  f2=_avgu4(_avgu4(a2,u2),v2);
  /*Instead of removing the bias by 128, we use it to split f by sign.*/
  b2=_minu4(0x80808080,f2);
  a2=0x80808080-b2;
  b2=f2-b2;
  /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
  u2=_saddu4(a2,_ll);
  v2=_saddu4(b2,_ll);
  a2=_saddu4(a2,u2);
  b2=_saddu4(b2,v2);
  a2=a2-_minu4(a2,u2);
  b2=b2-_minu4(b2,v2);
  /*Apply the changes to the original pixels.*/
  p5=_saddu4(p5-_minu4(p5,b2),a2);
  p6=_saddu4(p6-_minu4(p6,a2),b2);
  /*Write out the results.*/
  _amem8(_pix-_ystride)=_itoll(p5,p1);
  _amem8(_pix)=_itoll(p6,p2);
}
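/*Precomputes the loop filter limit value.
  The first word of _bv is filled with ~(2*_flimit) replicated into each
   byte; loading it as _ll lets the saturating add/subtract sequences above
   compute clamp(0,2*L-abs(f),abs(f)) without branches.*/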
void oc_loop_filter_init_c64x(signed char _bv[256],int _flimit){
  int ll;
  ll=_flimit<<1;
  ll=_pack2(ll,ll);
  ll=~_spacku4(ll,ll);
  *((int *)_bv)=ll;
}
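/*Applies the loop filter to all coded fragments in rows
   [_fragy0,_fragy_end) of plane _pli, using the limit value prepared by
   oc_loop_filter_init_c64x() and the edge ordering described below.*/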
void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
 signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
  const oc_fragment_plane *fplane;
  const oc_fragment *frags;
  const ptrdiff_t *frag_buf_offs;
  unsigned char *ref_frame_data;
  ptrdiff_t fragi_top;
  ptrdiff_t fragi_bot;
  ptrdiff_t fragi0;
  ptrdiff_t fragi0_end;
  int ystride;
  int nhfrags;
  int ll;
  fplane=_state->fplanes+_pli;
  nhfrags=fplane->nhfrags;
  fragi_top=fplane->froffset;
  fragi_bot=fragi_top+fplane->nfrags;
  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
  ystride=_state->ref_ystride[_pli];
  frags=_state->frags;
  frag_buf_offs=_state->frag_buf_offs;
  ref_frame_data=_state->ref_frame_data[_refi];
  ll=*((int *)_bv);
  /*The following loops are constructed somewhat non-intuitively on purpose.
    The main idea is: if a block boundary has at least one coded fragment on
     it, the filter is applied to it.
    However, the order that the filters are applied in matters, and VP3 chose
     the somewhat strange ordering used below.*/
  while(fragi0<fragi0_end){
    ptrdiff_t fragi;
    ptrdiff_t fragi_end;
    fragi=fragi0;
    fragi_end=fragi+nhfrags;
    while(fragi<fragi_end){
      if(frags[fragi].coded){
        unsigned char *ref;
        ref=ref_frame_data+frag_buf_offs[fragi];
        if(fragi>fragi0)loop_filter_h(ref,ystride,ll);
        if(fragi0>fragi_top)loop_filter_v(ref,ystride,ll);
        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
          loop_filter_h(ref+8,ystride,ll);
        }
        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
          loop_filter_v(ref+(ystride<<3),ystride,ll);
        }
      }
      fragi++;
    }
    fragi0+=nhfrags;
  }
}