*=============================================================================== * * From: https://www-a.ti.com/apps/c6000/xt_download.asp?sku=C67x_cfftr2 * * TEXAS INSTRUMENTS, INC. * * Copyright © Texas Instruments Incorporated 1998 * * TI retains all right, title and interest in this code and authorizes its * use solely and exclusively with digital signal processing devices * manufactured by or for TI. This code is intended to provide an * understanding of the benefits of using TI digital signal processing devices. * It is provided "AS IS". TI disclaims all warranties and representations, * including but not limited to, any warranty of merchantability or fitness * for a particular purpose. This code may contain irregularities not found * in commercial software and is not intended to be used in production * applications. You agree that prior to using or incorporating this code * into any commercial product you will thoroughly test that product and the * functionality of the code in that product and will be solely responsible * for any problems or failures. * * TI retains all rights not granted herein. * * * RADIX-2 FFT (DIT) * * Revision Date: 5/21/98 * * USAGE * * This routine is C Callable and can be called as: * * void cfftr2_dit( float *x, const float *w, short N) * * x Pointer to Array of Dimension 2*N elements holding * Input to and Outputs from function cfftr2_dit() * w Pointer to an array holding the coefficient (Dimension * n/2 complex numbers) * N Number of complex points in x * * If routine is not to be used as a C callable function then * you need to initialize values for all of the values passed * as these are assumed to be in registers as defined by the * calling convention of the compiler, (refer to the C compiler * reference guide). * * ARGUMENTS PASSED -> REGISTER * --------------------------------- * x -> A4 * w -> B4 * N -> A6 * * C CODE * * This is the C equivalent of the assembly code. Note that * the assembly code is hand optimized and restrictions may * apply. * * void cfftr2_dit(float* x, float* w, short n) * { * short n2, ie, ia, i, j, k, m; * float rtemp, itemp, c, s; * * n2 = n; * ie = 1; * * for(k=n; k > 1; k >>= 1) * { * n2 >>= 1; * ia = 0; * for(j=0; j < ie; j++) * { * c = w[2*j]; * s = w[2*j+1]; * for(i=0; i < n2; i++) * { * m = ia + n2; * rtemp = c * x[2*m] + s * x[2*m+1]; * itemp = c * x[2*m+1] - s * x[2*m]; * x[2*m] = x[2*ia] - rtemp; * x[2*m+1] = x[2*ia+1] - itemp; * x[2*ia] = x[2*ia] + rtemp; * x[2*ia+1] = x[2*ia+1] + itemp; * ia++; * } * ia += n2; * } * ie <<= 1; * } * } * * * DESCRIPTION * * This routine performs the Decimation-in-Time (DIT) Radix-2 FFT * of the input array x. * x has N complex floating point numbers arranged as successive * real and imaginary number pairs. Input array x contains N * complex points (N*2 elements). The coefficients for the * FFT are passed to the function in array w which contains * N/2 complex numbers (N elements) as successive real and * imaginary number pairs. * The FFT Coefficients w are in N/2 bit-reversed order * The elements of input array x are in normal order * The assembly routine performs 4 output samples (2 real and 2 * imaginary) for a pass through inner loop. * * Note that (bit-reversed) coefficients for higher order FFT (1024 * point) can be used unchanged as coefficients for a lower order * FFT (512, 256, 128 ... ,2) * * The routine can be used to implement Inverse-FFT by any ONE of * the following methods: * * 1.Inputs (x) are replaced by their Complex-conjugate values * Output values are divided by N * 2.FFT Coefficients (w) are replaced by their Complex-conjugates * Output values are divided by N * 3.Swap Real and Imaginary values of input * 4.Swap Real and Imaginary values of output * * TECHNIQUES * * 1. The inner two loops are combined into one inner loop whose * loop count is N/2. * 2. The first 4 cycles of inner loop prolog are scheduled in * parallel with the outer loop. * 3. Load counter is not used, so extreneous loads are performed * 4. Variables n and c share the register A6 and variables w and * nsave share the register B4. * * ASSUMPTIONS * * N is a integral power of 2 (4, 8,16,32 ...) and the FFT * dimension (N) must atleast be 4. * The FFT Coefficients w are in bit-reversed order * The elements of input array x are in normal order * The imaginary coefficients of w are negated as {cos(d*0), * sin(d*0), cos(d*1), sin(d*1) ...} as opposed to the normal * sequence of {cos(d*0), -sin(d*0), cos(d*1), -sin(d*1) ...} * where d = 2*PI/N. * * MEMORY NOTE * * Arrays x (data) and w (coefficients) must reside in * different memory banks to avoid memory conflicts. If Data and * Coefficents do reside in the same memory bank, add (N/2) + log2(N) + 1 * cycles to the cycles equation below. The memory bank hits are due to * scheduling of assembly code and also due to extreneous loads * causing bank hits. * * Data and Coefficents must be aligned on an 8 byte boundary. * * CYCLES * * ((2*N) + 23)*log2(N) + 6 * * For N=1024, Cycles = 20716 * * NOTATIONS * * f = Function Prolog or Epilog * o = Outer Loop * p = Inner Loop Prolog * *=============================================================================== .global _cfftr2_dit .text _cfftr2_dit: STW .D2T2 B14,*B15--(48) STW .D2T2 B3,*+B15(4) STW .D2T1 A10,*+B15(8) STW .D2T1 A11,*+B15(12) STW .D2T1 A12,*+B15(16) STW .D2T1 A13,*+B15(20) STW .D2T1 A14,*+B15(24) STW .D2T1 A15,*+B15(28) STW .D2T2 B10,*+B15(32) STW .D2T2 B11,*+B15(36) STW .D2T2 B12,*+B15(40) STW .D2T2 B13,*+B15(44) * Begin Benchmark Timing ADDAW .D1 A4,A6,A3 ; f xx2 = x + n*4 || MV .L2 B4,B12 ; f wsave = w || SHRU .S2X A6,1,B4 ; f nsave = n>>1 || MV .L1X B4,A5 ; f w1 = w || MVK .S1 1,A14 ; f onea = 1 MV .S2X A3,B8 ; f xx1 = xx2 || MV .L2 B4,B0 ; f i = nsave || SHL .S1 A6,2,A10 ; f k1 = n<<2 LDDW .D2 *B8++,B7:B6 ; p @ t2_1:t2_0 = *xx1++ || LDDW .D1 *A5++,A7:A6 ; p @ s:c = *w1 || MPY .M2 B0,1,B13 ; f ireset = i*1 || ADD .L2X A6,1,B9 ; f bk = n+1 || MV .S2 B8,B5 ; f xx3 = xx1 MV .L1 A4,A11 ; f xx4 = x || SHL .S2X A6,2,B1 ; f k = n * 4 MV .L1X B9,A0 ; f bk1 = bk ||[B0] SUB .L2 B0,1,B0 ; p if (i) i = i-1 || MV .S1 A4,A3 ; f xx2 = x || MVK .S2 1,B14 ; f oneb = 1 [!B0] ADD .L2 B8,B1,B8 ; p if (!i) xx1 = xx1 + k || MV .S2 B13,B2 ; f t3_ctr = ireset || MV .L1X B13,A1 ; f st_ctr = ireset oloop: LDDW .D2 *B8++,B7:B6 ; p @@ t2_1:t2_0 = *xx1++ ||[!B0] LDDW .D1 *A5++,A7:A6 ; p @@ if (!i) s:c = *w1 ||[!B0] MPY .M2 B14,B13,B0 ; p if (!i) i = ireset*1 MPYSP .M1X A6,B6,A13 ; p rtemp2 = c*t2_0 || MPYSP .M2X A6,B7,B11 ; p itemp2 = c*t2_1 [B0] SUB .S2 B0,1,B0 ; p if (i) i = i-1 || SUB .L1X B4,1,A2 ; f l = nsave - 1 MPYSP .M1X A7,B7,A15 ; p rtemp3 = s*t2_1 || MPYSP .M2X A7,B6,B3 ; p itemp3 = s*t2_0 ||[!B0] ADD .L2 B8,B1,B8 ; p if (!i) xx1 = xx1 + k LDDW .D2 *B8++,B7:B6 ; p @@@ t2_1:t2_0 = *xx1++ ||[!B0] LDDW .D1 *A5++,A7:A6 ; p @@@ if (!i) s:c = *w1 ||[!B0] MPY .M2 B14,B13,B0 ; p if (!i) i = ireset*1 MPYSP .M1X A6,B6,A13 ; p rtemp2 = c*t2_0 || MPYSP .M2X A6,B7,B11 ; p itemp2 = c*t2_1 [B0] SUB .S2 B0,1,B0 ; p if (i) i = i-1 LDDW .D1 *A3++,A9:A8 ; p @ t3_1:t3_0 = *xx2++ || MPYSP .M1X A7,B7,A15 ; p rtemp3 = s*t2_1 || MPYSP .M2X A7,B6,B3 ; p itemp3 = s*t2_0 ||[!B0] ADD .D2 B8,B1,B8 ; p if (!i) xx1 = xx1 + k || ADDSP .L1 A13,A15,A12 ; p rtemp1 = rtemp2 + rtemp3 || SUBSP .L2 B11,B3,B10 ; p itemp1 = itemp2 - itemp3 LDDW .D2 *B8++,B7:B6 ; p @@@@ t2_1:t2_0 = *xx1++ ||[!B0] LDDW .D1 *A5++,A7:A6 ; p @@@@ if (!i) s:c = *w1 ||[!B0] MPY .M2 B14,B13,B0 ; p if (!i) i = ireset*1 ||[B2] SUB .S2 B2,1,B2 ; p if (t3_ctr) t3_ctr -= 1 MPYSP .M1X A6,B6,A13 ; p rtemp2 = c*t2_0 || MPYSP .M2X A6,B7,B11 ; p itemp2 = c*t2_1 ||[!B2] ADD .S1 A3,A10,A3 ; p if (!t3_ctr) xx2 = xx2 + k1 [B0] SUB .S2 B0,1,B0 ; p if (i) i = i-1 ||[!B2] MPY .M2 B14,B13,B2 ; p if (!t3_ctr) t3_ctr = ireset*1 LDDW .D1 *A3++,A9:A8 ; p @@ t3_1:t3_0 = *xx2++ || MPYSP .M1X A7,B7,A15 ; p rtemp3 = s*t2_1 || MPYSP .M2X A7,B6,B3 ; p itemp3 = s*t2_0 ||[!B0] ADD .D2 B8,B1,B8 ; p if (!i) xx1 = xx1 + k || ADDSP .L1 A13,A15,A12 ; p rtemp1 = rtemp2 + rtemp3 || SUBSP .L2 B11,B3,B10 ; p itemp1 = itemp2 - itemp3 LDDW .D2 *B8++,B7:B6 ; p @@@@@ t2_1:t2_0 = *xx1++ ||[!B0] LDDW .D1 *A5++,A7:A6 ; p @@@@@ if (!i) s:c = *w1 ||[!B0] MPY .M2 B14,B13,B0 ; p if (!i) i = ireset*1 ||[B2] SUB .S2 B2,1,B2 ; p if (t3_ctr) t3_ctr -= 1 || SUBSP .L1 A8,A12,A15 ; p rtemp3 = t3_0 - rtemp1 || SUBSP .L2X A9,B10,B3 ; p itemp3 = t3_1 - itemp1 MPYSP .M1X A6,B6,A13 ; p rtemp2 = c*t2_0 || MPYSP .M2X A6,B7,B11 ; p itemp2 = c*t2_1 ||[!B2] ADD .S1 A3,A10,A3 ; p if (!t3_ctr) xx2 = xx2 + k1 || B .S2 iloop [B0] SUB .S2 B0,1,B0 ; p if (i) i = i-1 ||[!B2] MPY .M2 B14,B13,B2 ; p if (!t3_ctr) t3_ctr =ireset*1 || ADDSP .L1 A8,A12,A15 ; p rtemp3 = t3_0 + rtemp1 || ADDSP .L2X A9,B10,B3 ; p itemp3 = t3_1 + itemp1 ; Kernel Loop Begins iloop: LDDW .D1 *A3++,A9:A8 ; @@@ t3_1:t3_0 = *xx2++ || MPYSP .M1X A7,B7,A15 ; rtemp3 = s*t2_1 || MPYSP .M2X A7,B6,B3 ; itemp3 = s*t2_0 ||[!B0] ADD .D2 B8,B1,B8 ; if (!i) xx1 = xx1 + k || ADDSP .L1 A13,A15,A12 ; rtemp1 = rtemp2 + rtemp3 || SUBSP .L2 B11,B3,B10 ; itemp1 = itemp2 - itemp3 ||[!A1] ADD .S2 B5,B1,B5 ; if (!st_ctr) xx3 = xx3 + k ||[!A1] ADD .S1 A11,A10,A11 ; if (!st_ctr) xx4 = xx4 + k1 LDDW .D2 *B8++,B7:B6 ; @@@@@@ t2_1:t2_0 = *xx1++ ||[!B0] LDDW .D1 *A5++,A7:A6 ; @@@@@@ if (!i) s:c = *w1 ||[!B0] MPY .M2 B14,B13,B0 ; if (!i) i = ireset*1 ||[B2] SUB .S2 B2,1,B2 ; if (t3_ctr) t3_ctr -= 1 || SUBSP .L1 A8,A12,A15 ; rtemp3 = t3_0 - rtemp1 || SUBSP .L2X A9,B10,B3 ; itemp3 = t3_1 - itemp1 ||[A2] SUB .S1 A2,1,A2 ; if (l) l = l-1 ||[!A1] MPY .M1X A14,B13,A1 ; if (!st_ctr) st_ctr = ireset*1 MPYSP .M1X A6,B6,A13 ; rtemp2 = c*t2_0 || MPYSP .M2X A6,B7,B11 ; itemp2 = c*t2_1 ||[!B2] ADD .S1 A3,A10,A3 ; if (!t3_ctr) xx2 = xx2 + k1 ||[A2] B .S2 iloop ; Branch iloop || STW .D2T1 A15,*B5++[2] ; *xx3++[2] = rtemp3 || STW .D1T2 B3,*+A11[A0] ; *+xx4[bk1] = itemp3 [B0] SUB .S2 B0,1,B0 ; if (i) i = i-1 ||[!B2] MPY .M2 B14,B13,B2 ; if (!t3_ctr) t3_ctr = ireset*1 || ADDSP .L1 A8,A12,A15 ; rtemp3 = t3_0 + rtemp1 || ADDSP .L2X A9,B10,B3 ; itemp3 = t3_1 + itemp1 || STW .D1 A15,*A11++[2] ; *xx4++[2] = rtemp3 || STW .D2 B3,*-B5[B9] ; *-xx3[bk] = itemp3 ||[A1] SUB .S1 A1,1,A1 ; if (st_ctr) st_ctr=st_ctr-1 ; Kernel Loop Ends MV .S2 B13,B1 ; o k = ireset || MV .S1X B13,A2 ; o l = ireset SUB .S1X B1,1,A2 ; o l = k - 1 || SHRU .S2 B13,1,B0 ; o i = ireset>>1 || ADDAW .D1 A4,A2,A3 ; o xx2 = x + 4*l [A2] B .S1 oloop ||[!A2] LDW .D2 *+B15(4),B3 ; o if (!l) pop B3 || MV .S2X A3,B8 ; o xx1 = xx2 MV .L1X B12,A5 ; o w1 = wsave ||[!A2] LDDW .D2T1 *+B15(8),A11:A10; o if (!l) pop A11:A10 [A2] LDDW .D2 *B8++,B7:B6 ; p @ if (l) t2_1:t2_0 = *xx1++ ||[A2] LDDW .D1 *A5++,A7:A6 ; p @ if (l) s:c = *w1 || SHL .S1X B1,2,A10 ; f k1 = k<<2 || MPY .M2 B0,1,B13 ; f ireset = i*1 || ADD .L2 B1,1,B9 ; f bk = k + 1 MV .L1 A4,A11 ; f xx4 = x || SHL .S2 B1,2,B1 ; f k = k<<2 || MV .L2X A3,B5 ; f xx3 = xx2 ||[!A2] LDDW .D2T1 *+B15(16),A13:A12; o if (!l) pop A13:A12 MV .L1X B9,A0 ; f bk1 = bk ||[B0] SUB .L2 B0,1,B0 ; p if (i) i = i - 1 || MV .S1 A4,A3 ; f xx2 = x ||[!A2] LDDW .D2T1 *+B15(24),A15:A14; o if (!l) pop A15:A14 [!B0] ADD .L2 B8,B1,B8 ; p if (!i) xx1 = xx1 + k || MV .S2 B13,B2 ; f t3_ctr = ireset || MV .L1X B13,A1 ; f st_ctr = ireset ||[!A2] LDDW .D2T2 *+B15(32),B11:B10; o if (!l) pop B11:B10 ;----------------------------------------------- * End Benchmark Timing LDDW .D2T2 *+B15(40),B13:B12 || B .S2 B3 LDW .D2T2 *++B15(48),B14 NOP 4