123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416 |
- /********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- * *
- ********************************************************************
- function:
- last mod: $Id$
- ********************************************************************/
- #include <string.h>
- #include "c64xint.h"
- #include "dct.h"
/*Replicates the low 16 bits of a DCT constant into both halves of a 32-bit
   word, so that one packed multiply can apply the same constant to two
   16-bit lanes.
  The shift is performed on an unsigned value: shifting a signed int left by
   16 overflows for any constant >= 0x8000, which is undefined behavior in
   ISO C.
  The result is converted back to int, so the bit pattern and the type seen by
   the users of these macros are the same as before.*/
#define OC_C64X_DUP16(_c) \
 ((int)(((unsigned)(_c)<<16)|((unsigned)(_c)&0xFFFFU)))

#define OC_C1S7D OC_C64X_DUP16(OC_C1S7)
#define OC_C2S6D OC_C64X_DUP16(OC_C2S6)
#define OC_C3S5D OC_C64X_DUP16(OC_C3S5)
#define OC_C4S4D OC_C64X_DUP16(OC_C4S4)
#define OC_C5S3D OC_C64X_DUP16(OC_C5S3)
#define OC_C6S2D OC_C64X_DUP16(OC_C6S2)
#define OC_C7S1D OC_C64X_DUP16(OC_C7S1)
- /*Various building blocks for the iDCT implementations.
- These are done in macros instead of functions so that we can use all local
- variables, which avoids leaving the compiler to try to sort out memory
- reference dependencies.*/
/*Load two rows into x0...x7.
  _x points at the first of two consecutive rows of 8 16-bit values; the
   second row starts at (_x)+8.
  Each step fetches one 32-bit word from each row at the same offset and
   regroups the pair with _dpack2(): the low word of the result goes to the
   even-numbered x register and the high word to the odd-numbered one.
  x0...x7 must be int variables in the enclosing scope.
  NOTE(review): _amem4_const() is TI's aligned 4-byte load, so _x is presumably
   required to be at least 4-byte aligned; confirm against c64xint.h.*/
#define OC_IDCT8x2_LOAD8(_x) \
  do{ \
    long long pr; \
    /*Words at offsets 0 and 8.*/ \
    pr=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
    x0=_loll(pr); \
    x1=_hill(pr); \
    /*Words at offsets 2 and 10.*/ \
    pr=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
    x2=_loll(pr); \
    x3=_hill(pr); \
    /*Words at offsets 4 and 12.*/ \
    pr=_dpack2(_amem4_const((_x)+12),_amem4_const((_x)+4)); \
    x4=_loll(pr); \
    x5=_hill(pr); \
    /*Words at offsets 6 and 14.*/ \
    pr=_dpack2(_amem4_const((_x)+14),_amem4_const((_x)+6)); \
    x6=_loll(pr); \
    x7=_hill(pr); \
  } \
  while(0)
/*Load two rows into x0...x3.
  Only the first four 16-bit values of each row are read: the words at
   offsets 0 and 2 of the first row and 8 and 10 of the second.
  The 64-bit temporary is declared inside the macro, so the caller only needs
   to provide the int variables x0...x3.*/
#define OC_IDCT8x2_LOAD4(_x) \
  do{ \
    long long pr; \
    /*Words at offsets 0 and 8.*/ \
    pr=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
    x0=_loll(pr); \
    x1=_hill(pr); \
    /*Words at offsets 2 and 10.*/ \
    pr=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
    x2=_loll(pr); \
    x3=_hill(pr); \
  } \
  while(0)
/*Load two rows into x0...x1.
  Only the first 32-bit word of each row is read (offsets 0 and 8); the two
   words are regrouped with _dpack2() and split into x0 (low word) and x1
   (high word).
  x0 and x1 must be int variables in the enclosing scope.*/
#define OC_IDCT8x2_LOAD2(_x) \
  do{ \
    long long pr; \
    pr=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
    x0=_loll(pr); \
    x1=_hill(pr); \
  } \
  while(0)
/*Load two columns into x0...x1.
  No repacking is needed here: x0 is the 32-bit word at _x and x1 is the word
   one row (8 elements) further on.
  x0 and x1 must be int variables in the enclosing scope.*/
#define OC_IDCT8x2_LOAD2T(_x) \
  do{ \
    x0=_amem4_const((_x)+0*8); \
    x1=_amem4_const((_x)+1*8); \
  } \
  while(0)
/*Transform x0...x7 into t0...t7.
  Runs the first three stages of the 8-point inverse DCT on two 16-bit lanes
   at once; one of the store macros performs the fourth stage.
  x0...x7 and t0...t7 must be int variables in the enclosing scope.
  Every constant multiply keeps only the high 16 bits of each 32-bit product
   (_packh2() of the two halves of the _mpy2ll() result).
  Several products are followed by an add of the multiplicand itself.
  NOTE(review): presumably this compensates for constants >= 0x8000 being
   interpreted as (C-0x10000) by the signed 16x16 multiply; confirm against
   the constant values in dct.h.
  NOTE(review): the comments below assume TI's definition of _addsub2(), with
   the sum in the high word and the difference in the low word.*/
#define OC_IDCT8x2() \
  do{ \
    long long pr; \
    int       u; \
    int       v; \
    /*Stage 1:*/ \
    /*t0 and t1: (x0+x4) and (x0-x4), each multiplied by C4S4 using the \
       unsigned-by-signed multiplies.*/ \
    pr=_addsub2(x0,x4); \
    u=_hill(pr); \
    v=_loll(pr); \
    t0=_packh2(_mpyhus(OC_C4S4D,u),_mpyus(OC_C4S4D,u)); \
    t1=_packh2(_mpyhus(OC_C4S4D,v),_mpyus(OC_C4S4D,v)); \
    /*t2: C6S2 term of x2 minus C2S6 term of x6.*/ \
    pr=_mpy2ll(OC_C6S2D,x2); \
    u=_packh2(_hill(pr),_loll(pr)); \
    pr=_mpy2ll(OC_C2S6D,x6); \
    v=_add2(_packh2(_hill(pr),_loll(pr)),x6); \
    t2=_sub2(u,v); \
    /*t3: C2S6 term of x2 plus C6S2 term of x6.*/ \
    pr=_mpy2ll(OC_C2S6D,x2); \
    u=_add2(_packh2(_hill(pr),_loll(pr)),x2); \
    pr=_mpy2ll(OC_C6S2D,x6); \
    v=_packh2(_hill(pr),_loll(pr)); \
    t3=_add2(u,v); \
    /*t4: C7S1 term of x1 minus C1S7 term of x7.*/ \
    pr=_mpy2ll(OC_C7S1D,x1); \
    u=_packh2(_hill(pr),_loll(pr)); \
    pr=_mpy2ll(OC_C1S7D,x7); \
    v=_add2(_packh2(_hill(pr),_loll(pr)),x7); \
    t4=_sub2(u,v); \
    /*t5: C3S5 term of x5 minus C5S3 term of x3.*/ \
    pr=_mpy2ll(OC_C3S5D,x5); \
    u=_add2(_packh2(_hill(pr),_loll(pr)),x5); \
    pr=_mpy2ll(OC_C5S3D,x3); \
    v=_add2(_packh2(_hill(pr),_loll(pr)),x3); \
    t5=_sub2(u,v); \
    /*t6: C5S3 term of x5 plus C3S5 term of x3.*/ \
    pr=_mpy2ll(OC_C5S3D,x5); \
    u=_add2(_packh2(_hill(pr),_loll(pr)),x5); \
    pr=_mpy2ll(OC_C3S5D,x3); \
    v=_add2(_packh2(_hill(pr),_loll(pr)),x3); \
    t6=_add2(u,v); \
    /*t7: C1S7 term of x1 plus C7S1 term of x7.*/ \
    pr=_mpy2ll(OC_C1S7D,x1); \
    u=_add2(_packh2(_hill(pr),_loll(pr)),x1); \
    pr=_mpy2ll(OC_C7S1D,x7); \
    v=_packh2(_hill(pr),_loll(pr)); \
    t7=_add2(u,v); \
    /*Stage 2:*/ \
    /*t4 takes the high word of addsub(t4,t5); the low word is multiplied by \
       C4S4 to form t5.*/ \
    pr=_addsub2(t4,t5); \
    t4=_hill(pr); \
    v=_loll(pr); \
    pr=_mpy2ll(OC_C4S4D,v); \
    t5=_add2(_packh2(_hill(pr),_loll(pr)),v); \
    /*Likewise for t7 and t6.*/ \
    pr=_addsub2(t7,t6); \
    t7=_hill(pr); \
    v=_loll(pr); \
    pr=_mpy2ll(OC_C4S4D,v); \
    t6=_add2(_packh2(_hill(pr),_loll(pr)),v); \
    /*Stage 3:*/ \
    /*Butterflies on the pairs (t0,t3), (t1,t2) and (t6,t5): the first of \
       each pair receives the high word, the second the low word.*/ \
    pr=_addsub2(t0,t3); \
    t0=_hill(pr); \
    t3=_loll(pr); \
    pr=_addsub2(t1,t2); \
    t1=_hill(pr); \
    t2=_loll(pr); \
    pr=_addsub2(t6,t5); \
    t6=_hill(pr); \
    t5=_loll(pr); \
  } \
  while(0)
/*Transform x0...x3 into t0...t7, assuming x4...x7 are zero.
  Reduced form of the first three inverse DCT stages: every term that would
   involve x4...x7 is dropped, so x4...x7 need not even be declared.
  x0...x3 and t0...t7 must be int variables in the enclosing scope.
  Here t5 holds the (positive) C5S3 term of x3, which is why the first Stage 2
   butterfly takes t4 from the low word and the value to rescale from the high
   word of the _addsub2() result.*/
#define OC_IDCT8x2_4() \
  do{ \
    long long pr; \
    int       u; \
    /*Stage 1:*/ \
    /*t0 and t1 are both the C4S4 term of x0.*/ \
    pr=_mpy2ll(OC_C4S4D,x0); \
    t0=_add2(_packh2(_hill(pr),_loll(pr)),x0); \
    t1=t0; \
    /*t2 and t3: C6S2 and C2S6 terms of x2.*/ \
    pr=_mpy2ll(OC_C6S2D,x2); \
    t2=_packh2(_hill(pr),_loll(pr)); \
    pr=_mpy2ll(OC_C2S6D,x2); \
    t3=_add2(_packh2(_hill(pr),_loll(pr)),x2); \
    /*t4: C7S1 term of x1.*/ \
    pr=_mpy2ll(OC_C7S1D,x1); \
    t4=_packh2(_hill(pr),_loll(pr)); \
    /*t5 and t6: C5S3 and C3S5 terms of x3.*/ \
    pr=_mpy2ll(OC_C5S3D,x3); \
    t5=_add2(_packh2(_hill(pr),_loll(pr)),x3); \
    pr=_mpy2ll(OC_C3S5D,x3); \
    t6=_add2(_packh2(_hill(pr),_loll(pr)),x3); \
    /*t7: C1S7 term of x1.*/ \
    pr=_mpy2ll(OC_C1S7D,x1); \
    t7=_add2(_packh2(_hill(pr),_loll(pr)),x1); \
    /*Stage 2:*/ \
    /*t4 takes the low word of addsub(t4,t5); the high word is multiplied \
       by C4S4 to form t5.*/ \
    pr=_addsub2(t4,t5); \
    t4=_loll(pr); \
    u=_hill(pr); \
    pr=_mpy2ll(OC_C4S4D,u); \
    t5=_add2(_packh2(_hill(pr),_loll(pr)),u); \
    /*t7 takes the high word of addsub(t7,t6); the low word is multiplied \
       by C4S4 to form t6.*/ \
    pr=_addsub2(t7,t6); \
    t7=_hill(pr); \
    u=_loll(pr); \
    pr=_mpy2ll(OC_C4S4D,u); \
    t6=_add2(_packh2(_hill(pr),_loll(pr)),u); \
    /*Stage 3:*/ \
    /*Butterflies on the pairs (t0,t3), (t1,t2) and (t6,t5).*/ \
    pr=_addsub2(t0,t3); \
    t0=_hill(pr); \
    t3=_loll(pr); \
    pr=_addsub2(t1,t2); \
    t1=_hill(pr); \
    t2=_loll(pr); \
    pr=_addsub2(t6,t5); \
    t6=_hill(pr); \
    t5=_loll(pr); \
  } \
  while(0)
/*Transform x0...x1 into t0...t7, assuming x2...x7 are zero.
  Smallest form of the first three inverse DCT stages: only x0 and x1 are
   read, and t0...t3 all end up equal to the C4S4 term of x0.
  x0, x1 and t0...t7 must be int variables in the enclosing scope.*/
#define OC_IDCT8x2_2() \
  do{ \
    long long pr; \
    /*Stage 1:*/ \
    /*t0 and t1 are both the C4S4 term of x0.*/ \
    pr=_mpy2ll(OC_C4S4D,x0); \
    t0=_add2(_packh2(_hill(pr),_loll(pr)),x0); \
    t1=t0; \
    /*t4 and t7: C7S1 and C1S7 terms of x1.*/ \
    pr=_mpy2ll(OC_C7S1D,x1); \
    t4=_packh2(_hill(pr),_loll(pr)); \
    pr=_mpy2ll(OC_C1S7D,x1); \
    t7=_add2(_packh2(_hill(pr),_loll(pr)),x1); \
    /*Stage 2:*/ \
    /*t5 and t6 are t4 and t7 multiplied by C4S4; t4 and t7 are unchanged.*/ \
    pr=_mpy2ll(OC_C4S4D,t4); \
    t5=_add2(_packh2(_hill(pr),_loll(pr)),t4); \
    pr=_mpy2ll(OC_C4S4D,t7); \
    t6=_add2(_packh2(_hill(pr),_loll(pr)),t7); \
    /*Stage 3:*/ \
    /*With nothing to combine them with, t2 and t3 are plain copies.*/ \
    t2=t1; \
    t3=t0; \
    pr=_addsub2(t6,t5); \
    t6=_hill(pr); \
    t5=_loll(pr); \
  } \
  while(0)
/*Finish transforming t0...t7 and store two rows.
  Performs the last butterfly stage on the pairs (t0,t7), (t1,t6), (t2,t5) and
   (t3,t4), then regroups the results with _dpack2() so that each 32-bit store
   writes two neighbouring values of one row.
  The low word of each regrouped pair goes to the row at _y and the high word
   to the row at (_y)+8.
  t0...t7 must be int variables in the enclosing scope; they are not
   modified.*/
#define OC_IDCT8x2_STORE(_y) \
  do{ \
    long long pr; \
    int       sa; \
    int       sb; \
    int       da; \
    int       db; \
    /*Stage 4:*/ \
    /*High words (sa, sb) and low words (da, db) of the outer butterflies.*/ \
    pr=_addsub2(t0,t7); \
    sa=_hill(pr); \
    da=_loll(pr); \
    pr=_addsub2(t1,t6); \
    sb=_hill(pr); \
    db=_loll(pr); \
    /*Offsets 0..1 of both rows.*/ \
    pr=_dpack2(sb,sa); \
    _amem4((_y)+0)=_loll(pr); \
    _amem4((_y)+8)=_hill(pr); \
    /*Offsets 6..7 of both rows.*/ \
    pr=_dpack2(da,db); \
    _amem4((_y)+6)=_loll(pr); \
    _amem4((_y)+14)=_hill(pr); \
    /*Same again for the inner butterflies.*/ \
    pr=_addsub2(t2,t5); \
    sa=_hill(pr); \
    da=_loll(pr); \
    pr=_addsub2(t3,t4); \
    sb=_hill(pr); \
    db=_loll(pr); \
    /*Offsets 2..3 of both rows.*/ \
    pr=_dpack2(sb,sa); \
    _amem4((_y)+2)=_loll(pr); \
    _amem4((_y)+10)=_hill(pr); \
    /*Offsets 4..5 of both rows.*/ \
    pr=_dpack2(da,db); \
    _amem4((_y)+4)=_loll(pr); \
    _amem4((_y)+12)=_hill(pr); \
  } \
  while(0)
/*Finish transforming t0...t7 and store two columns.
  Performs the last butterfly stage and writes each result as one 32-bit word
   (two adjacent 16-bit values) into rows 0...7 of the destination, where the
   rows are 8 elements apart.
  The high word of each _addsub2() result goes to rows 0...3 and the low word
   to the mirrored rows 7...4.
  t0...t7 must be int variables in the enclosing scope; they are not
   modified.*/
#define OC_IDCT8x2_STORET(_y) \
  do{ \
    long long pr; \
    /*Stage 4:*/ \
    /*Rows 0 and 7.*/ \
    pr=_addsub2(t0,t7); \
    _amem4((_y)+0*8)=_hill(pr); \
    _amem4((_y)+7*8)=_loll(pr); \
    /*Rows 1 and 6.*/ \
    pr=_addsub2(t1,t6); \
    _amem4((_y)+1*8)=_hill(pr); \
    _amem4((_y)+6*8)=_loll(pr); \
    /*Rows 2 and 5.*/ \
    pr=_addsub2(t2,t5); \
    _amem4((_y)+2*8)=_hill(pr); \
    _amem4((_y)+5*8)=_loll(pr); \
    /*Rows 3 and 4.*/ \
    pr=_addsub2(t3,t4); \
    _amem4((_y)+3*8)=_hill(pr); \
    _amem4((_y)+4*8)=_loll(pr); \
  } \
  while(0)
/*Finish transforming t0...t7, round and scale, and store two columns.
  Same layout as the unrounded column store, except that every 16-bit lane has
   8 added to it and is then shifted right by 4 bits before being written,
   which removes the scale factor carried through the transform.
  t0...t7 must be int variables in the enclosing scope; they are not
   modified.*/
#define OC_IDCT8x2_ROUND_STORET(_y) \
  do{ \
    long long pr; \
    /*Rounding offset of 8 in each 16-bit lane.*/ \
    int       bias; \
    bias=0x00080008; \
    /*Stage 4:*/ \
    /*Adjust for the scale factor.*/ \
    /*Rows 0 and 7.*/ \
    pr=_addsub2(t0,t7); \
    _amem4((_y)+0*8)=_shr2(_add2(_hill(pr),bias),4); \
    _amem4((_y)+7*8)=_shr2(_add2(_loll(pr),bias),4); \
    /*Rows 1 and 6.*/ \
    pr=_addsub2(t1,t6); \
    _amem4((_y)+1*8)=_shr2(_add2(_hill(pr),bias),4); \
    _amem4((_y)+6*8)=_shr2(_add2(_loll(pr),bias),4); \
    /*Rows 2 and 5.*/ \
    pr=_addsub2(t2,t5); \
    _amem4((_y)+2*8)=_shr2(_add2(_hill(pr),bias),4); \
    _amem4((_y)+5*8)=_shr2(_add2(_loll(pr),bias),4); \
    /*Rows 3 and 4.*/ \
    pr=_addsub2(t3,t4); \
    _amem4((_y)+3*8)=_shr2(_add2(_hill(pr),bias),4); \
    _amem4((_y)+4*8)=_shr2(_add2(_loll(pr),bias),4); \
  } \
  while(0)
- /*196 cycles.*/
- static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
- ogg_int16_t w[64];
- int x0;
- int x1;
- int x2;
- int x3;
- int x4;
- int x5;
- int x6;
- int x7;
- int t0;
- int t1;
- int t2;
- int t3;
- int t4;
- int t5;
- int t6;
- int t7;
- int i;
- /*Transform rows of x into columns of w.*/
- for(i=0;i<8;i+=2){
- OC_IDCT8x2_LOAD8(_x+i*8);
- _amem8(_x+i*8)=0LL;
- _amem8(_x+i*8+4)=0LL;
- _amem8(_x+i*8+8)=0LL;
- _amem8(_x+i*8+12)=0LL;
- OC_IDCT8x2();
- OC_IDCT8x2_STORET(w+i);
- }
- /*Transform rows of w into columns of y.*/
- for(i=0;i<8;i+=2){
- OC_IDCT8x2_LOAD8(w+i*8);
- OC_IDCT8x2();
- OC_IDCT8x2_ROUND_STORET(_y+i);
- }
- }
- /*106 cycles.*/
- static void oc_idct8x8_10_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
- ogg_int16_t w[64];
- int t0;
- int t1;
- int t2;
- int t3;
- int t4;
- int t5;
- int t6;
- int t7;
- int x0;
- int x1;
- int x2;
- int x3;
- int i;
- /*Transform rows of x into columns of w.*/
- OC_IDCT8x2_LOAD4(_x);
- OC_IDCT8x2_4();
- OC_IDCT8x2_STORET(w);
- OC_IDCT8x2_LOAD2(_x+16);
- _amem8(_x)=0LL;
- _amem8(_x+8)=0LL;
- _amem4(_x+16)=0;
- _amem4(_x+24)=0;
- OC_IDCT8x2_2();
- OC_IDCT8x2_STORET(w+2);
- /*Transform rows of w into columns of y.*/
- for(i=0;i<8;i+=2){
- OC_IDCT8x2_LOAD4(w+i*8);
- OC_IDCT8x2_4();
- OC_IDCT8x2_ROUND_STORET(_y+i);
- }
- }
#if 0
/*This used to compile to something faster (88 cycles), but no longer, and I'm
   not sure what changed to cause this.
  In any case, it's barely an advantage over the 10-coefficient version, and is
   now hardly worth the icache space.*/
/*95 cycles.*/
/*8x8 inverse DCT that reads only _x[0..1] and _x[8..9].
  _y: Receives the 64 output values.
  _x: The input coefficients; the four entries that are read are cleared to
       zero afterwards.
  Unlike the other variants, the first pass stores untransposed rows, and the
   second pass loads columns directly.*/
static inline void oc_idct8x8_3_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  ogg_int16_t tmp[64];
  /*t0...t7, x0 and x1 are read and written by name by the OC_IDCT8x2*
     macros.*/
  int         t0,t1,t2,t3,t4,t5,t6,t7;
  int         x0,x1;
  int         col;
  /*Transform rows of x into rows of tmp.
    Only the first pair of rows has any input, so there is nothing to loop
     over.*/
  OC_IDCT8x2_LOAD2(_x);
  OC_IDCT8x2_2();
  OC_IDCT8x2_STORE(tmp);
  _amem4(_x)=0;
  _amem4(_x+8)=0;
  /*Transform columns of tmp into columns of y.*/
  for(col=0;col<8;col+=2){
    OC_IDCT8x2_LOAD2T(tmp+col);
    OC_IDCT8x2_2();
    OC_IDCT8x2_ROUND_STORET(_y+col);
  }
}
#endif
- /*Performs an inverse 8x8 Type-II DCT transform.
- The input is assumed to be scaled by a factor of 4 relative to orthonormal
- version of the transform.*/
- void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
- /*if(_last_zzi<=3)oc_idct8x8_3_c64x(_y,_x);
- else*/ if(_last_zzi<=10)oc_idct8x8_10_c64x(_y,_x);
- else oc_idct8x8_slow_c64x(_y,_x);
- }
|