*=============================================================================== * * From: https://www-a.ti.com/apps/c6000/xt_download.asp?sku=C67x_icfftr2 * * TEXAS INSTRUMENTS, INC. * * Copyright © Texas Instruments Incorporated 1998 * * TI retains all right, title and interest in this code and authorizes its * use solely and exclusively with digital signal processing devices * manufactured by or for TI. This code is intended to provide an * understanding of the benefits of using TI digital signal processing devices. * It is provided "AS IS". TI disclaims all warranties and representations, * including but not limited to, any warranty of merchantability or fitness * for a particular purpose. This code may contain irregularities not found * in commercial software and is not intended to be used in production * applications. You agree that prior to using or incorporating this code * into any commercial product you will thoroughly test that product and the * functionality of the code in that product and will be solely responsible * for any problems or failures. * * TI retains all rights not granted herein. * * * INVERSE COMPLEX RADIX-2 DECIMATION-IN-FREQUENCY FFT * * Revision Date: 07/06/98 * * USAGE * This routine is C-callable and can be called as: * * void icfftr2_dif(float* x, float* w, short n) * * x[] --- input and output sequences (dim-n) (input/output) * x has n complex numbers (2*n SP values). * The real and imaginary values are interleaved * in memory: re0:im0, re1:im1, ..... * w[] --- FFT coefficients (dim-n/2) (input) * w has n/2 complex numbers (n SP values). * FFT coeficients must be in bt-reversed order * The real and imaginary values are interleaved * in memory: re0:im0, re1:im1, ..... * n --- FFT size (input) * * If the routine is not to be used as a C-callable function, * then all instructions relating to the stack should be removed. * See comments of individual instructions to determine if they are * related to the stack. You also need to initialize all passed * parameters since these are assumed to be in registers as defined by * the calling convention of the compiler, (See the C compiler * reference guide.) * * C CODE * This is the C equivalent of the assembly code without restrictions: * Note that the assembly code is hand optimized and restrictions may * apply. * * void icfftr2_dif(float* x, float* w, short n) * { * short n2, ie, ia, i, j, k, m; * float rtemp, itemp, c, s; * * n2 = 1; * ie = n; * for(k=n; k > 1; k >>= 1) * { * ie >>= 1; * ia = 0; * for(j=0; j < ie; j++) * { * c = w[2*j]; * s = w[2*j+1]; * for(i=0; i < n2; i++) * { * m = ia + n2; * rtemp = x[2*ia] - x[2*m]; * x[2*ia] = x[2*ia] + x[2*m]; * itemp = x[2*ia+1] - x[2*m+1]; * x[2*ia+1] = x[2*ia+1] + x[2*m+1]; * x[2*m] = c*rtemp - s*itemp; * x[2*m+1] = c*itemp + s*rtemp; * ia++; * } * ia += n2; * } * n2 <<= 1; * } * } * * DESCRIPTION * This routine is used to compute the Inverse, Complex, Radix-2, * Decimation-in-Frequency Fast Fourier Transform of a single * precision complex sequence of size n, and a power of 2. * The routine requires bit-reversed input and bit-reversed * coefficents (twiddle factors) and produces results that are * in normal order. Final scaling is not done in this function. * * TECHNIQUES * 1. Loading input x as well as coefficient w in double word. * 2. Both loops j and i shown in the C code are placed in the * INNERLOOP of the assembly code. * 3. mpy was used to perform a mv. EX. mpy x, 1, y <=> mv x, y * 4. Because the data loads are 1 itteration ahead of the * coefficent loads, counter i was copied to counter m so that * the actual count could live longer for the coefficent loads. * 5. Two output pointers/counters are maintained to remove the * dependency between the X'a and Y's - the Y's have a much longer * latency path than the X's. * 6. Inner loop prolog and epilog are done in parallel with the * outer loop. * * ASSUMPTIONS * n >= 8 * * Both input x and coefficient w should be aligned on double word * (8 byte) boundary. * * The follwoing C code is used to generate the coefficient table * (non-bit reversed). * * #include * /* generate real and imaginary twiddle * table of size n/2 complex numbers */ * * gen_w_r2(float* w, int n) * { * int i; * float pi = 4.0*atan(1.0); * float e = pi*2.0/n; * * for(i=0; i < ( n>>1 ); i++) * { * w[2*i] = cos(i*e); * w[2*i+1] = sin(i*e); * } * } * * * The follwoing C code is used to bit-reverse the coefficents. * * bit_rev(float* x, int n) * { * int i, j, k; * float rtemp, itemp; * * j = 0; * for(i=1; i < (n-1); i++) * { * k = n >> 1; * while(k <= j) * { * j -= k; * k >>= 1; * } * j += k; * if(i < j) * { * rtemp = x[j*2]; * x[j*2] = x[i*2]; * x[i*2] = rtemp; * itemp = x[j*2+1]; * x[j*2+1] = x[i*2+1]; * x[i*2+1] = itemp; * } * } * } * * The follwoing C code is used to perform the final scaling * of the IFFT. * * /* divide each element of x by n */ * divide(float* x, int n) * { * int i; * float inv = 1.0 / n; * * for(i=0; i < n; i++) * { * x[2*i] = inv * x[2*i]; * x[2*i+1] = inv * x[2*i+1]; * } * } * * * MEMORY NOTE * * Data (x) 8*N bytes * Coefficients (w) 8*N/2 bytes * Stack 4*10 bytes * Program 800 bytes * * Note 1: Data and Coefficents must reside in different memory * blocks to avoid memory conflicts. * * Note 2: Data and Coefficents must be aligned to an 8 byte * boundary. * * CYCLES * * # of cycles = 21 + 4 + M*((N/2-2)*4 + 24) * / \ \ \ * C preservation ___/ \ \ \___ Loop L Prolog/Epilog * \ \ + * \ \ Loop K * \ \ * \ \___ Loop L * \ * \___ Loop K Prolog * * where: N is the number of point in the IFFT * M = log(base 2)N, is the number of stages in the IFFT * * Example: 1024 Point FFT Performance * N = 1024, M = 10, assume a 167MHz CPU clock * * # of cycles = 21 + 4 + 10*((1024/2-2)*4 + 24) = 20665 * * time = # of cycles * CPU clock = 20665/167*10^6 = 124.0 usec * * * EXAMPLE USAGE * * void main(void) * { * gen_w_r2(w, N); // Generate coefficient table * bit_rev(w, N>>1); // Bit-reverse coefficient table * cfftr2_dit(x, w, N); // This is the radix 2 FFT benchmark available * // from TI * // input in normal order, output in bit-reversed * // order * // coefficient table in bit-reversed order * icfftr2_dif(x, w, N); // Inverse radix 2 FFT * // input in bit-reversed order, output in normal * // order * // coefficient table in bit-reversed order * divide(x, N); // scale inverse FFT output * // result is the same as original input to FFT * } * * Since the twiddle table is in bit-reversed order, it turns out that * the same twiddle table will also work for smaller IFFTs. This * means that if you need to do both 512 and 1024 point IFFTs in the * same application, you only need to have the 1024 point twiddle * table. The 512 point FFT will use the first half of the 1024 * point twiddle table. * * * IMPLEMENTATION * * The above C implemetation of the IFFT has been modified to better fit * the 'C67xx architecture thus allowing the translation from C to hand * coded assembly easier. The modified function is listed below and is * functionally equivelent to the above function. Note, the C statements * in this function are used as comment for the equivelent assembly * statements (see the optimized assembly listeing). * * void icfftr2_dif(float* x, float* w, short n) * { * short n2, i, k, l, p, m, j, n2A; * float rtemp, itemp, s, c, xr, xi, yr, yi, Xr, Xi, Yr, Yi; * float *wptrB, *xinptrA, *xoutptrB, *xoutptrA, p1r, p2r, p1i, p2i; * * n2 = 1; * wptrB = w; * xinptrA = x; * xoutptrB = x; * c=*wptrB++; * s=*wptrB++; * xoutptrA = xoutptrB + 2*n2; * xoutptrB = 1 + xoutptrB; * i = n2; * j = n2; * p = n2; * * for(k=n; k > 1; k >>= 1) * { * for (l=0; l 1; k >>= 1) ; { ;------------------ outer loop - loopk ------------------------- ; for (l=0; l> 1; || sub l, 2, l ; o l = l - 2; || mv .L2 n2, i ; o i = n2; || lddw .D2 *xinptrA++, B5:B4 ; p xr = *xinptrA++; ; xi = *xinptrA++; c26: stw .D2 Yr, *xoutptrA++ ; e *xoutptrA++ = Yr; || mv .L2 n2, j ; o j = n2; || mv n2, p ; o p = n2 ||[i] sub .S2 i, 1, i ; p i = i - 1; c27: stw .D2 Yi, *xoutptrA ; e *xoutptrA++ = Yi; || add .S2X xoutptrB, n2As, xoutptrA; o xoutptrA = xoutptrB + n2; || add xoutptrB, 4, xoutptrB ; o xoutptrB = xoutptrB + 4; ||[!i] add .L2 xinptrA, n2As, xinptrA ; p if (i==0) ; xinptrA = xinptrA + n2A; loopk_end: ; ----------------- end of epilog for inner loop - loopl --------------- ; ----------------- end of outer loop - loopk -------------------------- ; ---------------------------- function epilog ------------------------ mvc .S2 CSR, B13 ; restore preserved by call registers sub B15, 4, A0 ldw .D1 *++A0[2], B3 ; f || ldw .D2 *++B15[2], A15 ; f || mvc .S2 CSR, B13 ; f ldw .D1 *++A0[2], B14 ; f || ldw .D2 *++B15[2], A14 ; f || or .L2 B13, 1, B13 ; f ldw .D1 *++A0[2], B13 ; f || ldw .D2 *++B15[2], A13 ; f || mvc .S2 B13,CSR ; f enable global interrupts ldw .D1 *++A0[2], B12 ; f || ldw .D2 *++B15[2], A12 ; f ldw .D1 *++A0[2], B11 ; f || ldw .D2 *++B15[2], A11 ; f ldw .D2 *++B15[2], A10 ; f || ldw .D1 *++A0[2], B10 ; f || b .S2 B3 ; f return(); nop 5 ; f