*=============================================================================== * * TEXAS INSTRUMENTS, INC. * * Copyright © Texas Instruments Incorporated 1998 * * TI retains all right, title and interest in this code and authorizes its * use solely and exclusively with digital signal processing devices * manufactured by or for TI. This code is intended to provide an * understanding of the benefits of using TI digital signal processing devices. * It is provided "AS IS". TI disclaims all warranties and representations, * including but not limited to, any warranty of merchantability or fitness * for a particular purpose. This code may contain irregularities not found * in commercial software and is not intended to be used in production * applications. You agree that prior to using or incorporating this code * into any commercial product you will thoroughly test that product and the * functionality of the code in that product and will be solely responsible * for any problems or failures. * * TI retains all rights not granted herein. * * * Linear Time, Small Lookup Table: Bit Reversal * * Revision Date: 5/12/98 * * USAGE This routine is C Callable and can be called as: * * void bitrev(float *x, short *index, int n){ * * x = Input Array to be Bit-Reversed * n = Number of points in array must be a power of 2) * index = Array of ~sqrt(n) created by the routine * digitrev_index found below to allow the fast * implementation of the bit-reversal * * If routine is not to be used as a C callable function * then all instructions relating to stack should be removed. * Refer to comments of individual instructions. You will also * need to initialize values for all of the values passed as these * are assumed to be in registers as defined by the calling * convention of the compiler, (refer to the C compiler reference * guide). * * C Code This is the C equivalent of the Assembly Code without * restrictions. Note that the assembly code is hand optimized * and restrictions may apply * * TI retains all rights, title and interest in this code and only * authorizes the use of this code on TI TMS320 DSPs manufactured by TI. * * void bitrev(float *xs, short *index, int n){ * int i; * short i0, i1, i2, i3; * short j0, j1, j2, j3; * double xi0, xi1, xi2, xi3; * double xj0, xj1, xj2, xj3; * short t; * int a, b, ia, ib, ibs; * int mask; * int nbits, nbot, ntop, ndiff, n2, halfn; * double *x ; * x = (double *)xs ; * * nbits = 0; * i = n; * while (i > 1) * { * i = i >> 1; * nbits++; * } * * nbot = nbits >> 1; * ndiff = nbits & 1; * ntop = nbot + ndiff; * n2 = 1 << ntop; * mask = n2 - 1; * halfn = n >> 1; * * for (i0 = 0; i0 < halfn; i0 += 2) * { * b = i0 & mask; * a = i0 >> nbot; * if (!b) ia = index[a]; * ib = index[b]; * ibs = ib << nbot; * * j0 = ibs + ia; * t = i0 < j0; * xi0 = x[i0]; * xj0 = x[j0]; * * if (t) * { * x[i0] = xj0; * x[j0] = xi0; * } * * i1 = i0 + 1; * j1 = j0 + halfn; * xi1 = x[i1]; * xj1 = x[j1]; * x[i1] = xj1; * x[j1] = xi1; * * i3 = i1 + halfn; * j3 = j1 + 1; * xi3 = x[i3]; * xj3 = x[j3]; * if (t) * { * x[i3] = xj3; * x[j3] = xi3; * } * } * } * * DESCRIPTION * This routine performs the bit-reversal of the input array x[]. * where x[] is an array of length n 32-bit complex pairs of data. * This requires the index array provided by the program below. * This index should be generated at compile time not by the DSP. * * ASSUMPTIONS * n is a power of 2 * * NOTE: If n <= 4K one can use the char (8-bit) data type for * the "index" variable. This would require changing the LDH when * loading index values in the assembly routine to LDB. This would * further reduce the size of the Index Table by half its size. * * CYCLES (n/4)*11 + 9 * * MEMORY NOTE * There are NO memory bank hits regarless of array alignment * ******************************************************************************* * Use This Routine To Generate the Index Table for * Bit/Digit Reversing of Radix-2 and Radix-4 Routines ******************************************************************************* * This routine calculates the index for digitrev of length n * (length of index is 2^(radix*ceil(k/radix)) where n = 2^k * in otherwords * Either:sqrt(n) when n=2^even# Or: sqrt(2)*sqrt(n) when n=2^odd# [radix 2] * sqrt(n) when n=4^even# Or: sqrt(4)*sqrt(n) when n=4^odd# [radix 4] * Note: the variable "radix" is 2 for radix-2 and 4 for radix-4 ******************************************************************************* * * void digitrev_index(short *index, int n, int radix){ * * int i,j,k; * short nbits, nbot, ntop, ndiff, n2, raddiv2; * * nbits = 0; * i = n; * while (i > 1){ * i = i >> 1; * nbits++; * } * * raddiv2 = radix >> 1; * nbot = nbits >> raddiv2; * nbot = nbot << raddiv2 - 1; * ndiff = nbits & raddiv2; * ntop = nbot + ndiff; * n2 = 1 << ntop; * * index[0] = 0; * for ( i = 1, j = n2/radix + 1; i < n2 - 1; i++){ * index[i] = j - 1; * for (k = n2/radix; k*(radix-1) < j; k /= radix) * j -= k*(radix-1); * j += k; * } * index[n2 - 1] = n2 - 1; * } * * TECHNIQUES * * 1. The following registers are sharead to reduce register pressure * A8 (between) xi2, xi4, n2, nbits * A9 (between) xj2, xj4, tmp, ndiff * A10 (between) xi0, cnst * A11 (between) xj0, ntop * B4 (between) index, ptr_i1 * B6 (between) xi1, xi3 * B7 (between) xj1, xj3 * 2. The first set of indices ia and ib are loaded before * loop kernel * 3. Three load pointers are used to decouple 3 load/stores * 4. Memory bank hits are eliminated by loading odd/even * index pair in one cycle * ******************************************************************************* .global _bitrev .text _bitrev: STW .D2T2 B14,*B15--(48) STW .D2T2 B3,*+B15(4) STW .D2T1 A10,*+B15(8) STW .D2T1 A11,*+B15(12) STW .D2T1 A12,*+B15(16) STW .D2T1 A13,*+B15(20) STW .D2T1 A14,*+B15(24) STW .D2T1 A15,*+B15(28) STW .D2T2 B10,*+B15(32) STW .D2T2 B11,*+B15(36) STW .D2T2 B12,*+B15(40) STW .D2T2 B13,*+B15(44) ; Begin Benchmark Timing LDH .D2T1 *B4, A13 ; ib = *index || SHR .S2X A6, 2, B1 ; icntr = n >> 2 || MVK .S1 31, A10 ; cnst = 31 || LMBD .L1 1, A6, A9 ; tmp = lmbd(1, n) || ZERO .L2 B13 ; i0 = 0 SHR .S2X A6, 1, B5 ; halfn = n >> 1 || SHL .S1 A6, 2, A12 ; halfnbytes = n << 2 || LDH .D2 *B4, B3 ; ia = *index || SUB .L1 A10, A9, A8 ; nbits = cnst - tmp SHR .S1 A8, 1, A14 ; nbot1 = nbits >> 1 || AND .L1 A8, 1, A9 ; ndiff = nbits & 1 ADD .L1 A14, A9, A11 ; ntop = nbot1 + ndiff || MVK .S1 1, A9 ; tmp = 1 SHL .S1 A9, A11, A8 ; n2 = tmp << ntop || ADD .S2 B13,2,B2 ; aa = i0 + 2 || ADD .L1X B13,2,A1 ; bb = i0 + 2 || MV .L2X A14, B14 ; nbot = nbot1 SHL .S1 A13, A14, A13 ; ib = ib << nbot1 || SUB .D1 A8, 1, A15 ; mask = n2 - 1 ADD .L1X A13, B3, A0 ; j0 = ib + ia || SHR .S1 A6, 1, A6 ; n = n >> 1 SHL .S1 A0,3,A3 ; ptr_y0 = j0 << 3 || MV .L1X B4, A7 ; ptr_i0 = index MV .L2X A4,B11 ; ptr_y1 = x || ADD .L1 A4,A3,A3 ; ptr_y0 = x + ptr_y0 || AND .S1 A1, A15, A1 ; bb = bb & mask || SHR .S2 B2, B14, B2 ; aa = aa >> nbot LOOP: LDDW .D2T1 *++B11[1], A9:A8 ; xj2:xi2 = *++ptr_y1[1] || LDDW .D1T2 *++A3[A6], B7:B6 ; xj3:xi3 = *++ptr_y0[n] || ADD .S2 B11,8,B12 ; ptr_z1 = ptr_y1 + 8 || ADD .L1 A3,A12,A5 ; ptr_z0 = ptr_y0 + halfnbytes || CMPLT .L2X B13, A0, B0 ; if (i0 < j0) {t=1} else {t=0} LDH .D1 *+A7[A1], A13 ; ib = *+ptr_i0[bb] || MV .L1 A4, A2 ; ptr_x0 = x || MV .L2X A4, B10 ; ptr_x1 = x [B0] LDDW .D2T1 *++B10[B13], A11:A10 ; if (t) xj0:xi0 = *++ptr_x1[i0] ||[B0] LDDW .D1T2 *++A5[1], B9:B8 ; if (t) xj5:xi5 = *++ptr_z0[1] || ADD .L2 B13, 2, B13 ; i0 = i0 + 2 [!A1] LDH .D2 *+B4[B2], B3 ; if (!bb) ia = *+ptr_i1[aa] [B0] LDDW .D1T2 *++A2[A0], B7:B6 ; if (t) xj1:xi1 = *++ptr_x0[j0] ||[B0] LDDW .D2T1 *++B12[B5], A9:A8 ; if (t)xj4:xi4=*++ptr_z1[halfn] ||[B1] SUB .S2 B1, 1, B1 ; if (icntr) icntr -=1 || ADD .L2 B13,2,B2 ; aa = i0 + 2 || ADD .L1X B13,2,A1 ; bb = i0 + 2 STW .D1T1 A8, *A3++[1] ; *ptr_y0++[1] = xi2 || STW .D2T2 B7, *++B11[1] ; *++ptr_y1[1] = xj3 ||[B1] B .S2 LOOP STW .D1T1 A9, *A3--[1] ; *ptr_y0--[1] = xj2 || STW .D2T2 B6, *--B11[1] ; *--ptr_y1[1] = xi3 || SHL .S1 A13, A14, A13 ; ib = ib << nbot1 [B0] STW .D1T1 A10, *A2++[1] ; if (t) *ptr_x0++[1] = xi0 ||[B0] STW .D2T2 B9, *++B12[1] ; if (t) *++ptr_z1[1] = xj5 || AND .S1 A1, A15, A1 ; bb = bb & mask || SHR .S2 B2, B14, B2 ; aa = aa >> nbot [B0] STW .D1T1 A11, *A2--[1] ; if (t) *ptr_x0--[1] = xj0 ||[B0] STW .D2T2 B8, *--B12[1] ; if (t) *--ptr_z1[1] = xi5 || ADD .L1X A13, B3, A0 ; j0 = ib + ia [B0] STW .D2T2 B7, *++B10[1] ; if (t) *++ptr_x1[1] = xj1 ||[B0] STW .D1T1 A8, *A5++[1] ; if (t) *ptr_z0++[1] = xi4 || SHL .S1 A0,3,A3 ; ptr_y0 = j0 << 3 || SHL .S2 B13,3,B11 ; ptr_y1 = i0 << 3 [B0] STW .D2T2 B6, *--B10[1] ; if (t) *--ptr_x1[1] = xi1 ||[B0] STW .D1T1 A9, *A5--[1] ; if (t) *ptr_z0--[1] = xj4 || ADD .L2X A4,B11,B11 ; ptr_y1 = x + ptr_y1 || ADD .L1 A4,A3,A3 ; ptr_y0 = x + ptr_y0 ;--------------------------------------------------------------------------- ; End Benchmark Timing LDW .D2 *+B15(4),B3 ; pop return address LDDW .D2T1 *+B15(8),A11:A10 LDDW .D2T1 *+B15(16),A13:A12 LDDW .D2T1 *+B15(24),A15:A14 LDDW .D2T2 *+B15(32),B11:B10 LDDW .D2T2 *+B15(40),B13:B12 || B .S2 B3 ; Return to calling function LDW .D2T2 *++B15(48),B14 NOP 4