c64xidct.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id$

 ********************************************************************/
  15. #include <string.h>
  16. #include "c64xint.h"
  17. #include "dct.h"
  18. #define OC_C1S7D ((OC_C1S7<<16)|(OC_C1S7&0xFFFF))
  19. #define OC_C2S6D ((OC_C2S6<<16)|(OC_C2S6&0xFFFF))
  20. #define OC_C3S5D ((OC_C3S5<<16)|(OC_C3S5&0xFFFF))
  21. #define OC_C4S4D ((OC_C4S4<<16)|(OC_C4S4&0xFFFF))
  22. #define OC_C5S3D ((OC_C5S3<<16)|(OC_C5S3&0xFFFF))
  23. #define OC_C6S2D ((OC_C6S2<<16)|(OC_C6S2&0xFFFF))
  24. #define OC_C7S1D ((OC_C7S1<<16)|(OC_C7S1&0xFFFF))
/*Various building blocks for the iDCT implementations.
  These are done in macros instead of functions so that we can use all local
   variables, which avoids leaving the compiler to try to sort out memory
   reference dependencies.*/
  29. /*Load two rows into x0...x7.*/
  30. #define OC_IDCT8x2_LOAD8(_x) \
  31. do{ \
  32. long long ll; \
  33. ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
  34. x0=_loll(ll); \
  35. x1=_hill(ll); \
  36. ll=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
  37. x2=_loll(ll); \
  38. x3=_hill(ll); \
  39. ll=_dpack2(_amem4_const((_x)+12),_amem4_const((_x)+4)); \
  40. x4=_loll(ll); \
  41. x5=_hill(ll); \
  42. ll=_dpack2(_amem4_const((_x)+14),_amem4_const((_x)+6)); \
  43. x6=_loll(ll); \
  44. x7=_hill(ll); \
  45. } \
  46. while(0)
  47. /*Load two rows into x0...x3.
  48. Uses ll as a temporary.*/
  49. #define OC_IDCT8x2_LOAD4(_x) \
  50. do{ \
  51. long long ll; \
  52. ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
  53. x0=_loll(ll); \
  54. x1=_hill(ll); \
  55. ll=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
  56. x2=_loll(ll); \
  57. x3=_hill(ll); \
  58. } \
  59. while(0)
  60. /*Load two rows into x0...x1.*/
  61. #define OC_IDCT8x2_LOAD2(_x) \
  62. do{ \
  63. long long ll; \
  64. ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
  65. x0=_loll(ll); \
  66. x1=_hill(ll); \
  67. } \
  68. while(0)
  69. /*Load two columns into x0...x1.*/
  70. #define OC_IDCT8x2_LOAD2T(_x) \
  71. do{ \
  72. x0=_amem4_const((_x)+(0<<3)); \
  73. x1=_amem4_const((_x)+(1<<3)); \
  74. } \
  75. while(0)
  76. /*Transform x0...x7 into t0...t7.*/
  77. #define OC_IDCT8x2() \
  78. do{ \
  79. long long ll; \
  80. int a; \
  81. int b; \
  82. /*Stage 1:*/ \
  83. ll=_addsub2(x0,x4); \
  84. a=_hill(ll); \
  85. b=_loll(ll); \
  86. t0=_packh2(_mpyhus(OC_C4S4D,a),_mpyus(OC_C4S4D,a)); \
  87. t1=_packh2(_mpyhus(OC_C4S4D,b),_mpyus(OC_C4S4D,b)); \
  88. ll=_mpy2ll(OC_C6S2D,x2); \
  89. a=_packh2(_hill(ll),_loll(ll)); \
  90. ll=_mpy2ll(OC_C2S6D,x6); \
  91. b=_add2(_packh2(_hill(ll),_loll(ll)),x6); \
  92. t2=_sub2(a,b); \
  93. ll=_mpy2ll(OC_C2S6D,x2); \
  94. a=_add2(_packh2(_hill(ll),_loll(ll)),x2); \
  95. ll=_mpy2ll(OC_C6S2D,x6); \
  96. b=_packh2(_hill(ll),_loll(ll)); \
  97. t3=_add2(a,b); \
  98. ll=_mpy2ll(OC_C7S1D,x1); \
  99. a=_packh2(_hill(ll),_loll(ll)); \
  100. ll=_mpy2ll(OC_C1S7D,x7); \
  101. b=_add2(_packh2(_hill(ll),_loll(ll)),x7); \
  102. t4=_sub2(a,b); \
  103. ll=_mpy2ll(OC_C3S5D,x5); \
  104. a=_add2(_packh2(_hill(ll),_loll(ll)),x5); \
  105. ll=_mpy2ll(OC_C5S3D,x3); \
  106. b=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
  107. t5=_sub2(a,b); \
  108. ll=_mpy2ll(OC_C5S3D,x5); \
  109. a=_add2(_packh2(_hill(ll),_loll(ll)),x5); \
  110. ll=_mpy2ll(OC_C3S5D,x3); \
  111. b=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
  112. t6=_add2(a,b); \
  113. ll=_mpy2ll(OC_C1S7D,x1); \
  114. a=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
  115. ll=_mpy2ll(OC_C7S1D,x7); \
  116. b=_packh2(_hill(ll),_loll(ll)); \
  117. t7=_add2(a,b); \
  118. /*Stage 2:*/ \
  119. ll=_addsub2(t4,t5); \
  120. t4=_hill(ll); \
  121. b=_loll(ll); \
  122. ll=_mpy2ll(OC_C4S4D,b); \
  123. t5=_add2(_packh2(_hill(ll),_loll(ll)),b); \
  124. ll=_addsub2(t7,t6); \
  125. t7=_hill(ll); \
  126. b=_loll(ll); \
  127. ll=_mpy2ll(OC_C4S4D,b); \
  128. t6=_add2(_packh2(_hill(ll),_loll(ll)),b); \
  129. /*Stage 3:*/ \
  130. ll=_addsub2(t0,t3); \
  131. t0=_hill(ll); \
  132. t3=_loll(ll); \
  133. ll=_addsub2(t1,t2); \
  134. t1=_hill(ll); \
  135. t2=_loll(ll); \
  136. ll=_addsub2(t6,t5); \
  137. t6=_hill(ll); \
  138. t5=_loll(ll); \
  139. } \
  140. while(0)
  141. /*Transform x0...x3 into t0...t7, assuming x4...x7 are zero.*/
  142. #define OC_IDCT8x2_4() \
  143. do{ \
  144. long long ll; \
  145. int a; \
  146. /*Stage 1:*/ \
  147. ll=_mpy2ll(OC_C4S4D,x0); \
  148. t0=_add2(_packh2(_hill(ll),_loll(ll)),x0); \
  149. t1=t0; \
  150. ll=_mpy2ll(OC_C6S2D,x2); \
  151. t2=_packh2(_hill(ll),_loll(ll)); \
  152. ll=_mpy2ll(OC_C2S6D,x2); \
  153. t3=_add2(_packh2(_hill(ll),_loll(ll)),x2); \
  154. ll=_mpy2ll(OC_C7S1D,x1); \
  155. t4=_packh2(_hill(ll),_loll(ll)); \
  156. ll=_mpy2ll(OC_C5S3D,x3); \
  157. t5=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
  158. ll=_mpy2ll(OC_C3S5D,x3); \
  159. t6=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
  160. ll=_mpy2ll(OC_C1S7D,x1); \
  161. t7=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
  162. /*Stage 2:*/ \
  163. ll=_addsub2(t4,t5); \
  164. t4=_loll(ll); \
  165. a=_hill(ll); \
  166. ll=_mpy2ll(OC_C4S4D,a); \
  167. t5=_add2(_packh2(_hill(ll),_loll(ll)),a); \
  168. ll=_addsub2(t7,t6); \
  169. t7=_hill(ll); \
  170. a=_loll(ll); \
  171. ll=_mpy2ll(OC_C4S4D,a); \
  172. t6=_add2(_packh2(_hill(ll),_loll(ll)),a); \
  173. /*Stage 3:*/ \
  174. ll=_addsub2(t0,t3); \
  175. t0=_hill(ll); \
  176. t3=_loll(ll); \
  177. ll=_addsub2(t1,t2); \
  178. t1=_hill(ll); \
  179. t2=_loll(ll); \
  180. ll=_addsub2(t6,t5); \
  181. t6=_hill(ll); \
  182. t5=_loll(ll); \
  183. } \
  184. while(0)
  185. /*Transform x0...x1 into t0...t7, assuming x2...x7 are zero.*/
  186. #define OC_IDCT8x2_2() \
  187. do{ \
  188. long long ll; \
  189. /*Stage 1:*/ \
  190. ll=_mpy2ll(OC_C4S4D,x0); \
  191. t0=_add2(_packh2(_hill(ll),_loll(ll)),x0); \
  192. t1=t0; \
  193. ll=_mpy2ll(OC_C7S1D,x1); \
  194. t4=_packh2(_hill(ll),_loll(ll)); \
  195. ll=_mpy2ll(OC_C1S7D,x1); \
  196. t7=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
  197. /*Stage 2:*/ \
  198. ll=_mpy2ll(OC_C4S4D,t4); \
  199. t5=_add2(_packh2(_hill(ll),_loll(ll)),t4); \
  200. ll=_mpy2ll(OC_C4S4D,t7); \
  201. t6=_add2(_packh2(_hill(ll),_loll(ll)),t7); \
  202. /*Stage 3:*/ \
  203. t3=t0; \
  204. t2=t1; \
  205. ll=_addsub2(t6,t5); \
  206. t6=_hill(ll); \
  207. t5=_loll(ll); \
  208. } \
  209. while(0)
  210. /*Finish transforming t0...t7 and store two rows.*/
  211. #define OC_IDCT8x2_STORE(_y) \
  212. do{ \
  213. long long ll; \
  214. int a; \
  215. int b; \
  216. int c; \
  217. int d; \
  218. /*Stage 4:*/ \
  219. ll=_addsub2(t0,t7); \
  220. a=_hill(ll); \
  221. c=_loll(ll); \
  222. ll=_addsub2(t1,t6); \
  223. b=_hill(ll); \
  224. d=_loll(ll); \
  225. ll=_dpack2(b,a); \
  226. _amem4((_y)+0)=_loll(ll); \
  227. _amem4((_y)+8)=_hill(ll); \
  228. ll=_dpack2(c,d); \
  229. _amem4((_y)+6)=_loll(ll); \
  230. _amem4((_y)+14)=_hill(ll); \
  231. ll=_addsub2(t2,t5); \
  232. a=_hill(ll); \
  233. c=_loll(ll); \
  234. ll=_addsub2(t3,t4); \
  235. b=_hill(ll); \
  236. d=_loll(ll); \
  237. ll=_dpack2(b,a); \
  238. _amem4((_y)+2)=_loll(ll); \
  239. _amem4((_y)+10)=_hill(ll); \
  240. ll=_dpack2(c,d); \
  241. _amem4((_y)+4)=_loll(ll); \
  242. _amem4((_y)+12)=_hill(ll); \
  243. } \
  244. while(0)
  245. /*Finish transforming t0...t7 and store two columns.*/
  246. #define OC_IDCT8x2_STORET(_y) \
  247. do{ \
  248. long long ll; \
  249. /*Stage 4:*/ \
  250. ll=_addsub2(t0,t7); \
  251. _amem4((_y)+(0<<3))=_hill(ll); \
  252. _amem4((_y)+(7<<3))=_loll(ll); \
  253. ll=_addsub2(t1,t6); \
  254. _amem4((_y)+(1<<3))=_hill(ll); \
  255. _amem4((_y)+(6<<3))=_loll(ll); \
  256. ll=_addsub2(t2,t5); \
  257. _amem4((_y)+(2<<3))=_hill(ll); \
  258. _amem4((_y)+(5<<3))=_loll(ll); \
  259. ll=_addsub2(t3,t4); \
  260. _amem4((_y)+(3<<3))=_hill(ll); \
  261. _amem4((_y)+(4<<3))=_loll(ll); \
  262. } \
  263. while(0)
  264. /*Finish transforming t0...t7, round and scale, and store two columns.*/
  265. #define OC_IDCT8x2_ROUND_STORET(_y) \
  266. do{ \
  267. long long ll; \
  268. /*Stage 4:*/ \
  269. /*Adjust for the scale factor.*/ \
  270. ll=_addsub2(t0,t7); \
  271. _amem4((_y)+(0<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
  272. _amem4((_y)+(7<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
  273. ll=_addsub2(t1,t6); \
  274. _amem4((_y)+(1<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
  275. _amem4((_y)+(6<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
  276. ll=_addsub2(t2,t5); \
  277. _amem4((_y)+(2<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
  278. _amem4((_y)+(5<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
  279. ll=_addsub2(t3,t4); \
  280. _amem4((_y)+(3<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
  281. _amem4((_y)+(4<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
  282. } \
  283. while(0)
  284. /*196 cycles.*/
  285. static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  286. ogg_int16_t w[64];
  287. int x0;
  288. int x1;
  289. int x2;
  290. int x3;
  291. int x4;
  292. int x5;
  293. int x6;
  294. int x7;
  295. int t0;
  296. int t1;
  297. int t2;
  298. int t3;
  299. int t4;
  300. int t5;
  301. int t6;
  302. int t7;
  303. int i;
  304. /*Transform rows of x into columns of w.*/
  305. for(i=0;i<8;i+=2){
  306. OC_IDCT8x2_LOAD8(_x+i*8);
  307. _amem8(_x+i*8)=0LL;
  308. _amem8(_x+i*8+4)=0LL;
  309. _amem8(_x+i*8+8)=0LL;
  310. _amem8(_x+i*8+12)=0LL;
  311. OC_IDCT8x2();
  312. OC_IDCT8x2_STORET(w+i);
  313. }
  314. /*Transform rows of w into columns of y.*/
  315. for(i=0;i<8;i+=2){
  316. OC_IDCT8x2_LOAD8(w+i*8);
  317. OC_IDCT8x2();
  318. OC_IDCT8x2_ROUND_STORET(_y+i);
  319. }
  320. }
  321. /*106 cycles.*/
  322. static void oc_idct8x8_10_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  323. ogg_int16_t w[64];
  324. int t0;
  325. int t1;
  326. int t2;
  327. int t3;
  328. int t4;
  329. int t5;
  330. int t6;
  331. int t7;
  332. int x0;
  333. int x1;
  334. int x2;
  335. int x3;
  336. int i;
  337. /*Transform rows of x into columns of w.*/
  338. OC_IDCT8x2_LOAD4(_x);
  339. OC_IDCT8x2_4();
  340. OC_IDCT8x2_STORET(w);
  341. OC_IDCT8x2_LOAD2(_x+16);
  342. _amem8(_x)=0LL;
  343. _amem8(_x+8)=0LL;
  344. _amem4(_x+16)=0;
  345. _amem4(_x+24)=0;
  346. OC_IDCT8x2_2();
  347. OC_IDCT8x2_STORET(w+2);
  348. /*Transform rows of w into columns of y.*/
  349. for(i=0;i<8;i+=2){
  350. OC_IDCT8x2_LOAD4(w+i*8);
  351. OC_IDCT8x2_4();
  352. OC_IDCT8x2_ROUND_STORET(_y+i);
  353. }
  354. }
  355. #if 0
  356. /*This used to compile to something faster (88 cycles), but no longer, and I'm
  357. not sure what changed to cause this.
  358. In any case, it's barely an advantage over the 10-coefficient version, and is
  359. now hardly worth the icache space.*/
  360. /*95 cycles.*/
  361. static inline void oc_idct8x8_3_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  362. ogg_int16_t w[64];
  363. int t0;
  364. int t1;
  365. int t2;
  366. int t3;
  367. int t4;
  368. int t5;
  369. int t6;
  370. int t7;
  371. int x0;
  372. int x1;
  373. int i;
  374. /*Transform rows of x into rows of w.*/
  375. for(i=0;i<2;i+=2){
  376. OC_IDCT8x2_LOAD2(_x+i*8);
  377. OC_IDCT8x2_2();
  378. OC_IDCT8x2_STORE(w+i*8);
  379. }
  380. _amem4(_x)=0;
  381. _amem4(_x+8)=0;
  382. /*Transform columns of w into columns of y.*/
  383. for(i=0;i<8;i+=2){
  384. OC_IDCT8x2_LOAD2T(w+i);
  385. OC_IDCT8x2_2();
  386. OC_IDCT8x2_ROUND_STORET(_y+i);
  387. }
  388. }
  389. #endif
  390. /*Performs an inverse 8x8 Type-II DCT transform.
  391. The input is assumed to be scaled by a factor of 4 relative to orthonormal
  392. version of the transform.*/
  393. void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
  394. /*if(_last_zzi<=3)oc_idct8x8_3_c64x(_y,_x);
  395. else*/ if(_last_zzi<=10)oc_idct8x8_10_c64x(_y,_x);
  396. else oc_idct8x8_slow_c64x(_y,_x);
  397. }