;* ======================================================================== *;
;*  TEXAS INSTRUMENTS, INC.                                                 *;
;*                                                                          *;
;*  DSPLIB  DSP Signal Processing Library                                   *;
;*                                                                          *;
;*      Release:        Version 1.02                                        *;
;*      CVS Revision:   1.8     Fri Mar 22 02:06:40 2002 (UTC)              *;
;*      Snapshot date:  18-Apr-2002                                         *;
;*                                                                          *;
;*  This library contains proprietary intellectual property of Texas        *;
;*  Instruments, Inc.  The library and its source code are protected by     *;
;*  various copyrights, and portions may also be protected by patents or    *;
;*  other legal protections.                                                *;
;*                                                                          *;
;*  This software is licensed for use with Texas Instruments TMS320         *;
;*  family DSPs.  This license was provided to you prior to installing      *;
;*  the software.  You may review this license by consulting the file       *;
;*  TI_license.PDF which accompanies the files in this library.             *;
;* ------------------------------------------------------------------------ *;
;*          Copyright (C) 2002 Texas Instruments, Incorporated.             *;
;*                          All Rights Reserved.                            *;
;* ======================================================================== *;
* ========================================================================= *
*                                                                           *
*  TEXAS INSTRUMENTS, INC.                                                  *
*                                                                           *
*  NAME                                                                     *
*        DSP_fft16x16r                                                      *
*                                                                           *
*                                                                           *
*  REVISION DATE                                                            *
*      12-Sep-2000                                                          *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C-callable and can be called as:                    *
*                                                                           *
*       void DSP_fft16x16r                                                  *
*       (                                                                   *
*           int             N,                                              *
*           short           *x,                                             *
*           short           *w,                                             *
*           unsigned char   *brev,                                          *
*           short           *y,                                             *
*           int             n_min,                                          *
*           int             offset,                                         *
*           int             nmax                                            *
*       );                                                                  *
*                                                                           *
*       N      : Length of fft in complex samples, power of 2 <=16384       *
*       x      : Pointer to complex data input                              *
*       w      : Pointer to complex twiddle factor (see below)              *
*       brev   : Pointer to bit reverse table containing 64 entries         *
*       y      : Pointer to complex data output                             *
*       n_min  : Smallest fft butterfly used in computation                 *
*                used for decomposing fft into subffts, see notes           *
*       offset : Index in complex samples of sub-fft from start of main     *
*                fft                                                        *
*       nmax   : Size of main fft in complex samples                        *
*                                                                           *
*   DESCRIPTION                                                             *
*       The benchmark performs a mixed radix forward FFT using a special    *
*       sequence of coefficients generated in the following way:            *
*                                                                           *
*        void tw_gen(short *w, int N)                                       *
*        {                                                                  *
*          int j, k;                                                        *
*          double x_t, y_t, theta1, theta2, theta3;                         *
*          const double PI = 3.141592654, M = 32767.0;                      *
*                      /* M is 16383 for scale by 4 */                      *
*                                                                           *
*          for (j=1, k=0; j <= N>>2; j = j<<2)                              *
*          {                                                                *
*              for (i=0; i < N>>2; i+=j)                                    *
*              {                                                            *
*                  theta1 = 2*PI*i/N;                                       *
*                  x_t = M*cos(theta1);                                     *
*                  y_t = M*sin(theta1);                                     *
*                  w[k]   =  (short)x_t;                                    *
*                  w[k+1] =  (short)y_t;                                    *
*                                                                           *
*                  theta2 = 4*PI*i/N;                                       *
*                  x_t = M*cos(theta2);                                     *
*                  y_t = M*sin(theta2);                                     *
*                  w[k+2] =  (short)x_t;                                    *
*                  w[k+3] =  (short)y_t;                                    *
*                                                                           *
*                  theta3 = 6*PI*i/N;                                       *
*                  x_t = M*cos(theta3);                                     *
*                  y_t = M*sin(theta3);                                     *
*                  w[k+4] =  (short)x_t;                                    *
*                  w[k+5] =  (short)y_t;                                    *
*                  k+=6;                                                    *
*              }                                                            *
*           }                                                               *
*        }                                                                  *
*                                                                           *
*       This redundent set of twiddle factors is size 2*N short samples.    *
*       As pointed out later dividing these twiddle factors by 2 will give  *
*       an effective divide by 4 at each stage to guarentee no overflow.    *
*       The function is accurate to about 68dB of signal to noise ratio to  *
*       the DFT function below:                                             *
*                                                                           *
*       void dft(int n, short x[], short y[])                               *
*       {                                                                   *
*          int k,i, index;                                                  *
*          const double PI = 3.14159654;                                    *
*          short *p_x;                                                      *
*          double arg, fx_0, fx_1, fy_0, fy_1, co, si;                      *
*                                                                           *
*          for(k = 0; k<n; k++)                                             *
*          {                                                                *
*            p_x = x;                                                       *
*            fy_0 = 0;                                                      *
*            fy_1 = 0;                                                      *
*            for(i=0; i<n; i++)                                             *
*            {                                                              *
*              fx_0 = (double)p_x[0];                                       *
*              fx_1 = (double)p_x[1];                                       *
*              p_x += 2;                                                    *
*              index = (i*k) % n;                                           *
*              arg = 2*PI*index/n;                                          *
*              co = cos(arg);                                               *
*              si = -sin(arg);                                              *
*              fy_0 += ((fx_0 * co) - (fx_1 * si));                         *
*              fy_1 += ((fx_1 * co) + (fx_0 * si));                         *
*            }                                                              *
*            y[2*k] = (short)2*fy_0/sqrt(N);                                *
*            y[2*k+1] = (short)2*fy_1/sqrt(N);                              *
*          }                                                                *
*       }                                                                   *
*                                                                           *
*       Scaling takes place at each stage except the last one.  This is     *
*       a divide by 2 to prevent overflow. All shifts are rounded to        *
*       reduce truncation noise power by 3dB.  The function takes the       *
*       table and input data and calculates the fft producing the           *
*       frequency domain data in the Y array.  As the fft allows every      *
*       input point to effect every output point in a cache based           *
*       system such as the TMS320C6211, this causes cache thrashing.        *
*       This is mitigated by allowing the main fft of size N to be          *
*       divided into several steps, allowing as much data reuse as          *
*       possible.                                                           *
*                                                                           *
*       For example the following function:                                 *
*                                                                           *
*       DSP_fft16x16r  (1024, &x_asm[0],&w[0],y_asm,brev,4,  0,1024);       *
*                                                                           *
*       is equvalent to:                                                    *
*                                                                           *
*       DSP_fft16x16r(1024,&x_asm[2*0],  &w[0]    ,y_asm,brev,256, 0,1024); *
*       DSP_fft16x16r(256, &x_asm[2*0],  &w[2*768],y_asm,brev,4,   0,1024); *
*       DSP_fft16x16r(256, &x_asm[2*256],&w[2*768],y_asm,brev,4, 256,1024); *
*       DSP_fft16x16r(256, &x_asm[2*512],&w[2*768],y_asm,brev,4, 512,1024); *
*       DSP_fft16x16r(256, &x_asm[2*768],&w[2*768],y_asm,brev,4, 768,1024); *
*                                                                           *
*       Notice how the 1st fft function is called on the entire 1K data     *
*       set it covers the 1st pass of the fft until the butterfly size      *
*       is 256.  The following 4 ffts do 256 pt ffts 25% of the size.       *
*       These continue down to the end when the buttefly is of size 4.      *
*       The use an index to the main twiddle factor array of 0.75*2*N.      *
*       This is because the twiddle factor array is composed of             *
*       successively decimated versions of the main array.                  *
*                                                                           *
*       N not equal to a power of 4 can be used, i.e. 512.  In this         *
*       case to decompose the fft the following would be needed:            *
*                                                                           *
*       DSP_fft16x16r   (512, &x_asm[0],&w[0],y_asm,brev,2,  0,512);        *
*                                                                           *
*       is equvalent to:                                                    *
*                                                                           *
*       DSP_fft16x16r(512, &x_asm[0],    &w[0],    y_asm,brev,128, 0,512);  *
*       DSP_fft16x16r(128, &x_asm[2*0],  &w[2*384],y_asm,brev,4,   0,512);  *
*       DSP_fft16x16r(128, &x_asm[2*128],&w[2*384],y_asm,brev,4, 128,512);  *
*       DSP_fft16x16r(128, &x_asm[2*256],&w[2*384],y_asm,brev,4, 256,512);  *
*       DSP_fft16x16r(128, &x_asm[2*384],&w[2*384],y_asm,brev,4, 384,512);  *
*                                                                           *
*       The twiddle factor array is composed of log4(N) sets of twiddle     *
*       factors, (3/4)*N, (3/16)*N, (3/64)*N, etc.  The index into this     *
*       array for each stage of the fft is calculated by summing these      *
*       indices up appropriately.  For multiple ffts they can share the     *
*       same table by calling the small ffts from further down in the       *
*       twiddle factor array. In the same way as the decomposition works    *
*       for more data reuse.                                                *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       N must be a power of 2 and 8 <=  N <= 16384.  Complex time data     *
*       x[] and twiddle facotrs w[] are aligned on double word              *
*       boundaries.  Real values are stored in even word positions and      *
*       imaginary values in odd positions.                                  *
*                                                                           *
*       All data is in short precision integer fixed point form.  The       *
*       complex frequency data will be returned in linear order.            *
*                                                                           *
*       This code is interrupt tolerant, interrupts are disabled on         *
*       entry to the function and reenabled on exit.                        *
*                                                                           *
*       If Interruption is required the decomposition can be used to        *
*       allow interrupts to occur in between function calls.  In this       *
*       way interrupts can occur roughly every 20% of the time through      *
*       the overall FFT.                                                    *
*                                                                           *
*   MEMORY NOTE                                                             *
*       Configuration is LITTLE ENDIAN.  This code will not function if     *
*       the -me flag is enabled.  It can, however, be modified for big      *
*       endian usage.                                                       *
*                                                                           *
*       No memory bank hits occur in this code.                             *
*                                                                           *
*   TECHNIQUES                                                              *
*       A special sequence of coefficients used as generated above          *
*       produces the fft.  This collapses the inner 2 loops in the          *
*       traditional Burrus and Parks implementation Fortran Code.           *
*                                                                           *
*       The revised FFT uses a redundant sequence of twiddle factors to     *
*       allow a linear access through the data.  This linear access         *
*       enables data and instruction level parallelism.  The data           *
*       produced by the DSP_fft16x16r fft is in normal form, the whole      *
*       data array is written into a new output buffer.                     *
*                                                                           *
*       The DSP_fft16x16r butterfly is bit reversed, i.e. the inner 2       *
*       points of the butterfly are crossed over, this has the effect       *
*       of making the data come out in bit reversed rather than in          *
*       radix 4 digit reversed order.  This simplifies the last pass of     *
*       the loop.  A simple table is used to do the bit reversal out of     *
*       place.                                                              *
*                                                                           *
*          unsigned char brev[64] = {                                       *
*                0x0, 0x20, 0x10, 0x30, 0x8, 0x28, 0x18, 0x38,              *
*                0x4, 0x24, 0x14, 0x34, 0xc, 0x2c, 0x1c, 0x3c,              *
*                0x2, 0x22, 0x12, 0x32, 0xa, 0x2a, 0x1a, 0x3a,              *
*                0x6, 0x26, 0x16, 0x36, 0xe, 0x2e, 0x1e, 0x3e,              *
*                0x1, 0x21, 0x11, 0x31, 0x9, 0x29, 0x19, 0x39,              *
*                0x5, 0x25, 0x15, 0x35, 0xd, 0x2d, 0x1d, 0x3d,              *
*                0x3, 0x23, 0x13, 0x33, 0xb, 0x2b, 0x1b, 0x3b,              *
*                0x7, 0x27, 0x17, 0x37, 0xf, 0x2f, 0x1f, 0x3f               *
*          };                                                               *
*                                                                           *
*   NOTES                                                                   *
*       For more aggressive overflow control the shift in the DC term       *
*       can be adjusted to 2 and the twiddle factors shifted right by       *
*       1.  This gives a divide by 4 at each stage.  For better             *
*       accuracy the data can be pre asserted left by so many bits so       *
*       that as it builds in magnitude the divide by 2 prevents too         *
*       much growth.  An optimal point for example with an 8192pt fft       *
*       with input data precision of 8 bits is to asert the input 4         *
*       bits left to make it 12 bits.  This gives an SNR of 68dB at the     *
*       output.  By trying combinations the optimal can be found.  If       *
*       scaling is not required it is possible to replace the MPY by        *
*       SMPY this will give a shift left by 1 so a shift right by 16        *
*       gives a total 15 bit shift right.  The DC term must be adjusted     *
*       to give a zero shift.                                               *
*                                                                           *
*   C CODE                                                                  *
*       This is the C equivalent of the assembly code without               *
*       restrictions.  Note that the assembly code is hand optimized        *
*       and restrictions may apply.                                         *
*                                                                           *
*       void DSP_fft16x16r                                                  *
*       (                                                                   *
*           int             n,                                              *
*           short           *ptr_x,                                         *
*           short           *ptr_w,                                         *
*           unsigned char   *brev,                                          *
*           short           *y,                                             *
*           int             radix,                                          *
*           int             offset,                                         *
*           int             nmax                                            *
*       )                                                                   *
*       {                                                                   *
*           int   i, l0, l1, l2, h2, predj;                                 *
*           int   l1p1,l2p1,h2p1, tw_offset, stride, fft_jmp;               *
*           short xt0, yt0, xt1, yt1, xt2, yt2;                             *
*           short si1,si2,si3,co1,co2,co3;                                  *
*           short xh0,xh1,xh20,xh21,xl0,xl1,xl20,xl21;                      *
*           short x_0, x_1, x_l1, x_l1p1, x_h2 , x_h2p1, x_l2, x_l2p1;      *
*           short *x,*w;                                                    *
*           short *ptr_x0, *ptr_x2, *y0;                                    *
*           unsigned int j, k, j0, j1, k0, k1;                              *
*           short x0, x1, x2, x3, x4, x5, x6, x7;                           *
*           short xh0_0, xh1_0, xh0_1, xh1_1;                               *
*           short xl0_0, xl1_0, xl0_1, xl1_1;                               *
*           short yt3, yt4, yt5, yt6, yt7;                                  *
*           unsigned a, num;                                                *
*                                                                           *
*           stride = n;         /* n is the number of complex samples */    *
*           tw_offset = 0;                                                  *
*           while (stride > radix)                                          *
*           {                                                               *
*               j = 0;                                                      *
*               fft_jmp = stride + (stride>>1);                             *
*               h2 = stride>>1;                                             *
*               l1 = stride;                                                *
*               l2 = stride + (stride>>1);                                  *
*               x = ptr_x;                                                  *
*               w = ptr_w + tw_offset;                                      *
*                                                                           *
*               for (i = 0; i < n>>1; i += 2)                               *
*               {                                                           *
*                   co1 = w[j+0];                                           *
*                   si1 = w[j+1];                                           *
*                   co2 = w[j+2];                                           *
*                   si2 = w[j+3];                                           *
*                   co3 = w[j+4];                                           *
*                   si3 = w[j+5];                                           *
*                   j += 6;                                                 *
*                                                                           *
*                   x_0    = x[0];                                          *
*                   x_1    = x[1];                                          *
*                   x_h2   = x[h2];                                         *
*                   x_h2p1 = x[h2+1];                                       *
*                   x_l1   = x[l1];                                         *
*                   x_l1p1 = x[l1+1];                                       *
*                   x_l2   = x[l2];                                         *
*                   x_l2p1 = x[l2+1];                                       *
*                                                                           *
*                   xh0  = x_0    + x_l1;                                   *
*                   xh1  = x_1    + x_l1p1;                                 *
*                   xl0  = x_0    - x_l1;                                   *
*                   xl1  = x_1    - x_l1p1;                                 *
*                                                                           *
*                   xh20 = x_h2   + x_l2;                                   *
*                   xh21 = x_h2p1 + x_l2p1;                                 *
*                   xl20 = x_h2   - x_l2;                                   *
*                   xl21 = x_h2p1 - x_l2p1;                                 *
*                                                                           *
*                   ptr_x0 = x;                                             *
*                   ptr_x0[0] = ((short)(xh0 + xh20))>>1;                   *
*                   ptr_x0[1] = ((short)(xh1 + xh21))>>1;                   *
*                                                                           *
*                   ptr_x2 = ptr_x0;                                        *
*                   x += 2;                                                 *
*                   predj = (j - fft_jmp);                                  *
*                   if (!predj) x += fft_jmp;                               *
*                   if (!predj) j = 0;                                      *
*                                                                           *
*                   xt0  = xh0 - xh20;                                      *
*                   yt0  = xh1 - xh21;                                      *
*                   xt1  = xl0 + xl21;                                      *
*                   yt2  = xl1 + xl20;                                      *
*                   xt2  = xl0 - xl21;                                      *
*                   yt1  = xl1 - xl20;                                      *
*                                                                           *
*                   l1p1 = l1+1;                                            *
*                   h2p1 = h2+1;                                            *
*                   l2p1 = l2+1;                                            *
*                                                                           *
*                   ptr_x2[l1  ] = (xt1 * co1 + yt1 * si1                   *
*                                   + 0x00008000) >> 16;                    *
*                   ptr_x2[l1p1] = (yt1 * co1 - xt1 * si1                   *
*                                   + 0x00008000) >> 16;                    *
*                   ptr_x2[h2  ] = (xt0 * co2 + yt0 * si2                   *
*                                   + 0x00008000) >> 16;                    *
*                   ptr_x2[h2p1] = (yt0 * co2 - xt0 * si2                   *
*                                   + 0x00008000) >> 16;                    *
*                   ptr_x2[l2  ] = (xt2 * co3 + yt2 * si3                   *
*                                   + 0x00008000) >> 16;                    *
*                   ptr_x2[l2p1] = (yt2 * co3 - xt2 * si3                   *
*                                   + 0x00008000) >> 16;                    *
*               }                                                           *
*                                                                           *
*               tw_offset += fft_jmp;                                       *
*               stride = stride>>2;                                         *
*           } /* end while */                                               *
*                                                                           *
*           j = offset>>2;                                                  *
*           ptr_x0 = ptr_x;                                                 *
*           y0 = y;                                                         *
*                                                                           *
*           /* determine l0 = _norm(nmax) - 17 */                           *
*           l0 = 31;                                                        *
*           if (((nmax>>31)&1)==1)                                          *
*               num = ~nmax;                                                *
*           else                                                            *
*               num = nmax;                                                 *
*           if (!num)                                                       *
*               l0 = 32;                                                    *
*           else                                                            *
*           {                                                               *
*               a=num&0xFFFF0000; if (a) { l0-=16; num=a; }                 *
*               a=num&0xFF00FF00; if (a) { l0-= 8; num=a; }                 *
*               a=num&0xF0F0F0F0; if (a) { l0-= 4; num=a; }                 *
*               a=num&0xCCCCCCCC; if (a) { l0-= 2; num=a; }                 *
*               a=num&0xAAAAAAAA; if (a) { l0-= 1; }                        *
*           }                                                               *
*           l0 -= 1;                                                        *
*                                                                           *
*           l0 -= 17;                                                       *
*                                                                           *
*           if(radix == 2 || radix  == 4)                                   *
*               for (i = 0; i < n; i += 4)                                  *
*               {                                                           *
*                       /* reversal computation */                          *
*                                                                           *
*                       j0 = (j     ) & 0x3F;                               *
*                       j1 = (j >> 6) & 0x3F;                               *
*                       k0 = brev[j0];                                      *
*                       k1 = brev[j1];                                      *
*                       k = (k0 << 6) |  k1;                                *
*                       if (l0 < 0)                                         *
*                         k = k << -l0;                                     *
*                       else                                                *
*                         k = k >> l0;                                      *
*                       j++;        /* multiple of 4 index */               *
*                                                                           *
*                       x0   = ptr_x0[0];  x1 = ptr_x0[1];                  *
*                       x2   = ptr_x0[2];  x3 = ptr_x0[3];                  *
*                       x4   = ptr_x0[4];  x5 = ptr_x0[5];                  *
*                       x6   = ptr_x0[6];  x7 = ptr_x0[7];                  *
*                       ptr_x0 += 8;                                        *
*                                                                           *
*                       xh0_0  = x0 + x4;                                   *
*                       xh1_0  = x1 + x5;                                   *
*                       xh0_1  = x2 + x6;                                   *
*                       xh1_1  = x3 + x7;                                   *
*                                                                           *
*                       if (radix == 2)                                     *
*                       {                                                   *
*                         xh0_0 = x0;                                       *
*                         xh1_0 = x1;                                       *
*                         xh0_1 = x2;                                       *
*                         xh1_1 = x3;                                       *
*                       }                                                   *
*                                                                           *
*                       yt0  = xh0_0 + xh0_1;                               *
*                       yt1  = xh1_0 + xh1_1;                               *
*                       yt4  = xh0_0 - xh0_1;                               *
*                       yt5  = xh1_0 - xh1_1;                               *
*                                                                           *
*                       xl0_0  = x0 - x4;                                   *
*                       xl1_0  = x1 - x5;                                   *
*                       xl0_1  = x2 - x6;                                   *
*                       xl1_1  = x3 - x7;                                   *
*                                                                           *
*                       if (radix == 2)                                     *
*                       {                                                   *
*                         xl0_0 = x4;                                       *
*                         xl1_0 = x5;                                       *
*                         xl1_1 = x6;                                       *
*                         xl0_1 = x7;                                       *
*                       }                                                   *
*                                                                           *
*                       yt2  = xl0_0 + xl1_1;                               *
*                       yt3  = xl1_0 - xl0_1;                               *
*                                                                           *
*                       yt6  = xl0_0 - xl1_1;                               *
*                       yt7  = xl1_0 + xl0_1;                               *
*                                                                           *
*                       if (radix == 2)                                     *
*                       {                                                   *
*                         yt7  = xl1_0 - xl0_1;                             *
*                         yt3  = xl1_0 + xl0_1;                             *
*                       }                                                   *
*                                                                           *
*                       y0[k] = yt0; y0[k+1] = yt1;                         *
*                       k += n>>1;                                          *
*                       y0[k] = yt2; y0[k+1] = yt3;                         *
*                       k += n>>1;                                          *
*                       y0[k] = yt4; y0[k+1] = yt5;                         *
*                       k += n>>1;                                          *
*                       y0[k] = yt6; y0[k+1] = yt7;                         *
*               }                                                           *
*       }                                                                   *
*                                                                           *
*   CYCLES                                                                  *
*       2.5 * N * ceil(log4(N)) - N/2 + 164                                 *
*                                                                           *
*       For N = 1024:  cycles = 12452                                       *
*       For N = 512:   cycles = 6308                                        *
*       For N = 256:   cycles = 2568                                        *
*                                                                           *
*   CODESIZE                                                                *
*       1344 bytes                                                          *
*                                                                           *
*   REFERENCES                                                              *
*       [1] C. S. Burrus and T.W. Parks (1985) "DFT/FFT and Convolution     *
*           Algos - Theory and Implementation", J. Wiley.                   *
*       [2] Implementation of Various Precision Fast Fourier Transforms on  *
*           the TMS320C6400 processor - DJH, ESC 2000                       *
*       [3] Burrus - Rice University and Papamichalis - TI (1988) - Paper   *
*           on the convertion of radix4 to radix2 digit reversal.           *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2002 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *

        .global _DSP_fft16x16r

* ========================================================================= *
*   End of file:  dsp_fft16x16r.h62                                         *
* ------------------------------------------------------------------------- *
*             Copyright (c) 2002 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *