*=============================================================================== 
*
* From: https://www-a.ti.com/apps/c6000/xt_download.asp?sku=C67x_icfftr2
* 
*	TEXAS INSTRUMENTS, INC.
*
*	Copyright (c) Texas Instruments Incorporated 1998
*
*	TI retains all right, title and interest in this code and authorizes its
*	use solely and exclusively with digital signal processing devices
*	manufactured by or for TI.  This code is intended to provide an
*	understanding of the benefits of using TI digital signal processing devices.
*	It is provided "AS IS".  TI disclaims all warranties and representations,
*	including but not limited to, any warranty of merchantability or fitness
*	for a particular purpose.  This code may contain irregularities not found
*	in commercial software and is not intended to be used in production
*	applications.  You agree that prior to using or incorporating this code
*	into any commercial product you will thoroughly test that product and the
*	functionality of the code in that product and will be solely responsible
*	for any problems or failures.  
*
*	TI retains all rights not granted herein.
*
* 
*     INVERSE COMPLEX RADIX-2 DECIMATION-IN-FREQUENCY FFT 
* 
*     Revision Date: 07/06/98
* 
*     USAGE 
*           This routine is C-callable and can be called as: 
* 
*           void icfftr2_dif(float* x, float* w, short n) 
* 
*		x[] --- input and output sequences (dim-n)      (input/output)
*			x has n complex numbers (2*n SP values).  
*			The real and imaginary values are interleaved
*			in memory: re0:im0, re1:im1, .....
*               w[]  --- FFT coefficients (dim-n/2)             (input)
*			w has n/2 complex numbers (n SP values). 
*			FFT coefficients must be in bit-reversed order
*			The real and imaginary values are interleaved
*			in memory: re0:im0, re1:im1, .....
*               n    --- FFT size                               (input)       
* 
*           If the routine is not to be used as a C-callable function, 
*           then all instructions relating to the stack should be removed. 
*           See comments of individual instructions to determine if they are 
*           related to the stack. You also need to initialize all passed 
*           parameters since these are assumed to be in registers as defined by 
*           the calling convention of the compiler, (See the C compiler 
*           reference guide.) 
*
*     C CODE 
*           This is the C equivalent of the assembly code without restrictions: 
*           Note that the assembly code is hand optimized and restrictions may 
*           apply. 
* 
*		void icfftr2_dif(float* x, float* w, short n)
*		{
*		   short n2, ie, ia, i, j, k, m;
*		   float rtemp, itemp, c, s;
*		
*		   n2 = 1;
*		   ie = n;
*		   for(k=n; k > 1; k >>= 1)
*		   {
*		      ie >>= 1;
*		      ia = 0;
*		      for(j=0; j < ie; j++)
*		      {
*		         c = w[2*j];
*		         s = w[2*j+1];
*		         for(i=0; i < n2; i++)
*		         {
*		            m = ia + n2;
*		            rtemp     = x[2*ia]   - x[2*m];
*		            x[2*ia]   = x[2*ia]   + x[2*m];
*		            itemp     = x[2*ia+1] - x[2*m+1];
*		            x[2*ia+1] = x[2*ia+1] + x[2*m+1];
*		            x[2*m]    = c*rtemp   - s*itemp;
*		            x[2*m+1]  = c*itemp   + s*rtemp;
*			    ia++;
*		         }
*		         ia += n2;
*		      }
*		      n2 <<= 1;
*		   }
*		}
* 
*       DESCRIPTION
*		This routine is used to compute the Inverse, Complex, Radix-2, 
*		Decimation-in-Frequency Fast Fourier Transform of a single
*		precision complex sequence of size n, where n is a power of 2.
* 		The routine requires bit-reversed input and bit-reversed 
*		coefficients (twiddle factors) and produces results that are
* 		in normal order.  Final scaling is not done in this function.
*
*       TECHNIQUES
*            1. Loading input x as well as coefficient w in double word.
*            2. Both loops j and i shown in the C code are placed in the
*               INNERLOOP of the assembly code.
*            3. mpy was used to perform a mv.  EX. mpy x, 1, y <=> mv x, y
*	     4. Because the data loads are 1 iteration ahead of the 
*		coefficient loads, counter i was copied to counter m so that
*	 	the actual count could live longer for the coefficient loads.
*            5. Two output pointers/counters are maintained to remove the
*		dependency between the X's and Y's - the Y's have a much longer
*		latency path than the X's.
*	     6.	Inner loop prolog and epilog are done in parallel with the 
*		outer loop.
*
*       ASSUMPTIONS
*		n >= 8
*
*               Both input x and coefficient w should be aligned on double word
*               (8 byte) boundary.
* 	
*		The following C code is used to generate the coefficient table 
*		(non-bit reversed).
*
*		#include <math.h>
*		/* generate real and imaginary twiddle 
*			table of size n/2 complex numbers */
*
*		gen_w_r2(float* w, int n)
*		{
*	   	int i;
*	   	float pi = 4.0*atan(1.0);
*	   	float e = pi*2.0/n;
*
*	   		for(i=0; i < ( n>>1 ); i++)
*	   		{
*	     			w[2*i]   = cos(i*e);
*	     			w[2*i+1] = sin(i*e);
*	   		}
*		}
*
*
* 		The following C code is used to bit-reverse the coefficients.
*
*		bit_rev(float* x, int n)
*		{
*		  int i, j, k;
*		  float rtemp, itemp;
*
*		  j = 0;
*		  for(i=1; i < (n-1); i++)
*		  {
*		     k = n >> 1;
*		     while(k <= j)
*		     {
*		        j -= k;
*		        k >>= 1;
*		     }
*		     j += k;
*		     if(i < j)
*		     {
*		        rtemp    = x[j*2];
*		        x[j*2]   = x[i*2];
*		        x[i*2]   = rtemp;
*		        itemp    = x[j*2+1];
*		        x[j*2+1] = x[i*2+1];
*		        x[i*2+1] = itemp;
*		     }
*		  }
*		}
*
* 		The following C code is used to perform the final scaling 
*		of the IFFT.
*
*		/* divide each element of x by n */
*		divide(float* x, int n)
*		{
*		   int i;
*		   float inv = 1.0 / n;
*
*		   for(i=0; i < n; i++)
*		   {
*		     x[2*i]   = inv * x[2*i];
*		     x[2*i+1] = inv * x[2*i+1];
*		   }
*		}
*
*
* 	MEMORY NOTE
*
*		Data (x)		8*N    bytes
*		Coefficients (w)	8*N/2  bytes
*		Stack			4*12   bytes
*		Program			800    bytes
*	
*		Note 1:  Data and Coefficients must reside in different memory
*		blocks to avoid memory conflicts.
*
*		Note 2:	 Data and Coefficients must be aligned to an 8 byte 
*		boundary.
*
*       CYCLES 
*
*	   # of cycles = 21 + 4 + M*((N/2-2)*4 + 24)
*                        /    \         \         \
*    C preservation  ___/      \         \         \___ Loop L Prolog/Epilog
*                               \         \                     +
*                                \         \                  Loop K
*                                 \         \
*                                  \         \___ Loop L
*                                   \
*                                    \___ Loop K Prolog
*
*		where:  N is the number of point in the IFFT
*			M = log(base 2)N, is the number of stages in the IFFT
*
*	Example:  1024 Point FFT Performance
*		N = 1024, M = 10, assume a 167MHz CPU clock
*
*		# of cycles = 21 + 4 + 10*((1024/2-2)*4 + 24) = 20665
*
*		time = # of cycles / CPU clock = 20665/(167*10^6) sec = 123.7 usec
*
*
* 	EXAMPLE USAGE
*
* void main(void)
* {
*    gen_w_r2(w, N);         // Generate coefficient table
*    bit_rev(w, N>>1);       // Bit-reverse coefficient table
*    cfftr2_dit(x, w, N);    // This is the radix 2 FFT benchmark available 
*			     // from TI
*                            // input in normal order, output in bit-reversed 
*			     // order
*                            // coefficient table in bit-reversed order
*    icfftr2_dif(x, w, N);   // Inverse radix 2 FFT
*                            // input in bit-reversed order, output in normal 
*			     // order
*                            // coefficient table in bit-reversed order
*    divide(x, N);           // scale inverse FFT output
*                            // result is the same as original input to FFT
* }
*
*	Since the twiddle table is in bit-reversed order, it turns out that
*	the same twiddle table will also work for smaller IFFTs.  This
*	means that if you need to do both 512 and 1024 point IFFTs in the
*	same application, you only need to have the 1024 point twiddle
*	table.  The 512 point FFT will use the first half of the 1024
*	point twiddle table.
*
*
* 	IMPLEMENTATION
*
*	The above C implementation of the IFFT has been modified to better fit
*	the 'C67xx architecture, making the translation from C to hand
*	coded assembly easier.  The modified function is listed below and is 
*	functionally equivalent to the above function.  Note, the C statements
*	in this function are used as comments for the equivalent assembly
*	statements (see the optimized assembly listing).
*
* void icfftr2_dif(float* x, float* w, short n)
* {
*	short n2, i, k, l, p, m, j, n2A;
*	float rtemp, itemp, s, c, xr, xi, yr, yi, Xr, Xi, Yr, Yi;
*	float *wptrB, *xinptrA, *xoutptrB, *xoutptrA, p1r, p2r, p1i, p2i;
*	
*	n2 = 1;
*	wptrB = w;
*	xinptrA = x; 
*	xoutptrB = x; 	
*	c=*wptrB++; 
*	s=*wptrB++;
*	xoutptrA = xoutptrB + 2*n2;
*	xoutptrB = 1 + xoutptrB;
*	i = n2;
*	j = n2;
*	p = n2;
*	
*	for(k=n; k > 1; k >>= 1)
*	{
*		for (l=0; l<n/2; l++)
*		{
*			yr = xinptrA[2*n2];
*			yi = xinptrA[2*n2 + 1];
*			xr = *xinptrA++; 
*			xi = *xinptrA++;
*			j = j - 1;
*			i = i - 1;	
*			p = p -1;  
*			itemp = xi - yi;
*			rtemp = xr - yr;
*			Xi = xi + yi;
*			Xr = xr + yr;
*			if (i==0) xinptrA = xinptrA + 2*n2; 
*			p1i = c*itemp;
*			p2i = s*rtemp;
* 			p1r = c*rtemp;
* 			p2r = s*itemp;
*			*xoutptrB = Xi;
*			xoutptrB = xoutptrB + 2;
*			*(xoutptrB - 3) = Xr;		
*			Yi = p1i + p2i;
*			Yr = p1r - p2r;
*			if (p==0) xoutptrB = xoutptrB + 2*n2;
*			m = i; 	
*			if (i==0) i = n2;
*			*xoutptrA++ = Yr;
*			*xoutptrA++ = Yi;
*			if (j==0) xoutptrA = xoutptrA + 2*n2;
*			if (m==0) {
*				c=*wptrB++; 
*				s=*wptrB++;
*			}
*			if (p==0) p = n2;	
* 			if (j==0) j = n2;
*		}
*	
*		n2 = n2 << 1;
*		xinptrA = x;
*		xoutptrB = x;
*		wptrB = w;  	
*		c=*wptrB++; 
*		s=*wptrB++;
*		i = n2;
*		j = n2;
*		p = n2;
*		xoutptrA = xoutptrB + 2*n2;
*		xoutptrB = xoutptrB + 1;	
*	}
* }
*
*
*       NOTATIONS
*
*               f = Function Prolog or Epilog
*               o = Outer Loop
*               p = Inner Loop Prolog
*		e = Inner Loop Epilog
*
*===============================================================================
	;void icfftr2_dif(float* x, float* w, short n)
	; {
	;
	; Entry point of the inverse radix-2 DIF FFT.  On entry the arguments
	; are read from A4 (x), B4 (w) and A6 (n); the prolog copies them into
	; xinptrA/x, w and n before those registers are reused under the
	; aliases c (A4), xr (B4) and yr (A6) defined below.

	.def _icfftr2_dif

_icfftr2_dif:	; .cproc	x, w, n

   	; short n2, i, k, l, nd2, n2A;
   	; float rtemp, itemp, s, c, xr, xi, yr, yi, Xr, Xi, Yr, Yi;
   	; float *wptrB, *xinptrA, *xoutptrB, *xoutptrA, p1r, p2r, p1i;
   	; float p2i;
   	; short n2p1;

	; ---- A-side register aliases ----
p1r		.set	A0	; c*rtemp; A0 is also the 2nd stack pointer in prolog/epilog
p		.set	A1	; down-counter gating the xoutptrB block skip
k		.set	A2	; outer-loop continue flag (copy of tmpk); shares A2 with l
l		.set	A2	; inner-loop branch counter; shares A2 with k
p2r		.set	A3	; s*itemp
c		.set 	A4	; twiddle value c (low register of the A5:A4 load)
s		.set	A5	; twiddle value s (high register of the A5:A4 load)
yr		.set 	A6	; xinptrA[2*n2]     (low register of the A7:A6 load)
yi		.set	A7	; xinptrA[2*n2 + 1] (high register of the A7:A6 load)
rtemp		.set	A8	; xr - yr
itemp		.set	A9	; xi - yi
Yr		.set	A10	; p1r - p2r
Yi		.set 	A11	; p1i + p2i
wptrB		.set	A12	; twiddle read pointer
xoutptrB	.set	A13	; store pointer for the Xr/Xi outputs
p1i		.set	A14	; c*itemp
p2i		.set	A15	; s*rtemp

	; ---- B-side register aliases ----
j		.set	B0	; down-counter gating the xoutptrA block skip
i		.set	B1	; down-counter gating the xinptrA block skip
m		.set	B2	; copy of i that gates the twiddle load
n2		.set	B3	; butterfly span, doubled each stage; B3 also holds the
				; return address, so it is saved/restored on the stack
xr		.set	B4	; *xinptrA     (low register of the B5:B4 load)
xi		.set	B5	; *(xinptrA+1) (high register of the B5:B4 load)
x		.set	B6	; base address of the data array
w		.set	B7	; base address of the twiddle table
n		.set	B8	; FFT size
Xr		.set	B9	; xr + yr
Xi		.set	B10	; xi + yi
xinptrA		.set 	B11	; data read pointer
xoutptrA	.set	B12	; store pointer for the Yr/Yi outputs
n2As		.set	B13	; n2 << 3 (byte offset of n2 complex values);
				; B13 is also the CSR scratch register in prolog/epilog
tmpk		.set	B14	; stage counter, shifted right once per stage
	; ----------------- function prolog  --------------------

 ; Save the registers that are restored in the function epilog:
 ; A10-A15 through B15, and B10-B14 plus B3 through A0 (= B15 - 4), so
 ; two words are stored per cycle (12 words in total).  CSR is read into
 ; B13 after B13 itself has been queued for saving, bit 0 is cleared,
 ; and the result is written back to disable global interrupts.

	sub		B15, 4, A0		; f A0 = second stack pointer

	stw	.D2 	A10, *B15--[2]		; f
 ||	stw	.D1 	B10, *A0--[2]		; f

	stw	.D2 	A11, *B15--[2]		; f
 ||	stw	.D1 	B11, *A0--[2]		; f

	stw	.D2 	A12, *B15--[2]		; f
 ||	stw	.D1 	B12, *A0--[2]		; f

	stw	.D2 	A13, *B15--[2]		; f
 ||	stw	.D1 	B13, *A0--[2]		; f
 ||     mvc     .S2     CSR,B13			; f B13 = CSR

	stw	.D2 	A14, *B15--[2]		; f
 ||	stw	.D1 	B14, *A0--[2]		; f
 ||     and     .L2     -2,B13,B13		; f clear bit 0 of the CSR copy

	stw	.D2 	A15, *B15--[2]		; f
 ||	stw	.D1	B3, *A0--[2]		; f save B3 (reused below as n2)
 ||     mvc     .S2     B13,CSR			; f disable global interrupts

	mv	.L2x	A4, xinptrA		; f xinptrA = arg1 (x)
 ||	mv	.D2	B4, w			; f move arg2 to w
 ||	mvk	.S2 	1, n2			; o n2 = 1;

	; ----------------- prolog for loopl --------------------   			
	; First-stage setup: initialise pointers and counters and issue the
	; first data loads of the inner loop.

   	mv	.L1x	w, wptrB		; o wptrB = w;
 ||  	mv	.L2	xinptrA, x		; o x = xinptrA (keep the data base
						;   address for later stages)
 ||	lddw	.D2	*+xinptrA[n2], A7:A6	; p yr = xinptrA[2*n2]; 
						;   yi = xinptrA[2*n2 + 1]; 
 ||	mpy	.M2	n2, 1, i		; o i = n2; (mpy used as a move)
 ||	mv	.S2x	A6, n			; f move arg3 to n

  	mv	.S1X	x, xoutptrB		; o xoutptrB = x; 
 ||	lddw	.D2	*xinptrA++, B5:B4	; p xr = *xinptrA++
						;   xi = *xinptrA++
 ||	shr	.S2	n, 2, tmpk		; o tmpk = n >> 2 (i.e. n/4)
	
	shr	.S1x	n, 1, l			; o l = n/2;
 || 	shl	.S2	n2, 3, n2As		; o n2As = n2<<3;
 ||[i]	sub	.L2	i, 1, i			; p i = i - 1; 

  	add	.L2X	xoutptrB, n2As, xoutptrA; o xoutptrA = xoutptrB + n2As;
 ||	add	.D1	xoutptrB, 4, xoutptrB	; o xoutptrB = 4 + xoutptrB
 ||	mv	.D2	n2, j			; o j = n2;
 ||	mv	.S1X	n2, p			; o p = n2
 ||	sub	.L1	l, 2, l			; o l = l - 2
 ||[!i]	add	.S2	xinptrA, n2As, xinptrA	; p if(i==0)xinptrA=xinptrA+n2A 

						; for(k=n; k > 1; k >>= 1)
   						; {

 	;------------------ outer loop - loopk -------------------------
						; for (l=0; l<n/2; l++)
	;
	; Packets c0-c15 are the inner-loop prolog: they fill the software
	; pipeline before the steady-state kernel at loopl.  Each cN label
	; marks one execute packet.  In the comments, every additional '@'
	; marks an instruction that belongs to the next-later loop iteration
	; in flight (no '@' = oldest iteration).
loopk:
c0:
	lddw	.D2	*+xinptrA[n2], A7:A6	; @ yr = xinptrA[2*n2]; 
						;   yi = xinptrA[2*n2 + 1]; 
||	mpy	.M2	i, 1, m			; m = i; (mpy used as a move)

c1:
	lddw	.D2	*xinptrA++, B5:B4	; @ xr = *xinptrA++; 
						;   xi = *xinptrA++; 
||	lddw	.D1	*wptrB++, A5:A4		; c = *wptrB++; s = *wptrB++; 
||[!i]	mv	.S2	n2, i			; @ if (i==0) i = n2; 

c2:
	subsp	.L1x	xr, yr, rtemp		; rtemp = xr - yr; 
||[i]	sub	.S2	i, 1, i			; @ i = i - 1; 	

c3:
        subsp	.L1x	xi, yi, itemp		; itemp = xi - yi; 
||      addsp	.L2x	xi, yi, Xi		; Xi = xi + yi; 
||[!i]	add	.S2	xinptrA, n2As, xinptrA	; @ if (i==0) 
						;     xinptrA = xinptrA + n2A; 

c4:
	lddw	.D2	*+xinptrA[n2], A7:A6	; @@ yr = xinptrA[2*n2];
						;    yi = xinptrA[2*n2 + 1]; 
||      addsp	.L2x	xr, yr, Xr		; Xr = xr + yr; 
||	mpy	.M2	i, 1, m			; @ m = i; 	

c5:
	lddw	.D2	*xinptrA++, B5:B4	; @@ xr = *xinptrA++; 
						;    xi = *xinptrA++; 
||[!m]	lddw	.D1	*wptrB++, A5:A4		; @ if (m==0) {c=*wptrB++;
                                                ;              s=*wptrB++;} 
||[!i]	mv	.S2	n2, i			; @@ if (i==0) i = n2; 

c6:
	subsp	.L1x	xr, yr, rtemp		; @ rtemp = xr - yr; 
||	mpysp	.M1	c, rtemp, p1r		; p1r = c*rtemp; 
||[i]	sub	.S2	i, 1, i			; @@ i = i - 1; 	

c7:
        subsp	.L1x	xi, yi, itemp		; @ itemp = xi - yi; 
||      addsp	.L2x	xi, yi, Xi		; @ Xi = xi + yi; 
||	mpysp	.M1	s, itemp, p2r		; p2r = s*itemp; 
||[!i]	add	.S2	xinptrA, n2As, xinptrA	; @@ if (i==0) 
						;      xinptrA = xinptrA + n2A; 

c8:
	lddw	.D2	*+xinptrA[n2], A7:A6	; @@@ yr = xinptrA[2*n2]
						;     yi = xinptrA[2*n2 + 1]; 
||      addsp	.L2x	xr, yr, Xr		; @ Xr = xr + yr; 
||	mpysp	.M1	s, rtemp, p2i		; p2i = s*rtemp; 
||	mpy	.M2	i, 1, m			; @@ m = i; 	


c9:
	lddw	.D2	*xinptrA++, B5:B4	; @@@ xr = *xinptrA++; 
						;     xi = *xinptrA++; 
||	mpysp	.M1	c, itemp, p1i		; p1i = c*itemp; 
||[!m]	lddw	.D1	*wptrB++, A5:A4		; @@ if (m==0) {c=*wptrB++; 
						;		s=*wptrB++;} 
||[!i]	mv	.S2	n2, i			; @@@ if (i==0) i = n2; 

c10:
	subsp	.L1x	xr, yr, rtemp		; @@ rtemp = xr - yr; 
||	mpysp	.M1	c, rtemp, p1r		; @ p1r = c*rtemp; 
||	stw	.D1	Xi, *xoutptrB++[2]	; *xoutptrB = Xi;
						;  xoutptrB=xoutptrB+2; 
||[i]	sub	.S2	i, 1, i			; @@@ i = i - 1; 	

c11:
        subsp	.L1x	xi, yi, itemp		; @@ itemp = xi - yi; 
||      addsp	.L2x	xi, yi, Xi		; @@ Xi = xi + yi; 
||	mpysp	.M1	s, itemp, p2r		; @ p2r = s*itemp; 
||	stw	.D1	Xr, *-xoutptrB[3]	; *(xoutptrB-3) = Xr; 
||[!i]	add	.S2	xinptrA, n2As, xinptrA	; @@@ if (i==0)
						;       xinptrA = xinptrA + n2A; 
||[p]	sub	.S1	p, 1, p			; p = p - 1;  

c12:
	lddw	.D2	*+xinptrA[n2], A7:A6	; @@@@ yr = xinptrA[2*n2];
						;      yi = xinptrA[2*n2 + 1]; 
||      addsp	.L2x	xr, yr, Xr		; @@ Xr = xr + yr; 
||	mpysp	.M1	s, rtemp, p2i		; @ p2i = s*rtemp; 
||      subsp	.L1	p1r, p2r, Yr		; Yr = p1r - p2r; 
||[!p]	add	.S1x	xoutptrB, n2As, xoutptrB; if (p==0) 
						;    xoutptrB = xoutptrB + n2A; 
||	mpy	.M2	i, 1, m			; @@@ m = i; 	
||[l]	sub	.D1	l, 1, l			; if (l!=0) l = l -1; 

c13:
	lddw	.D2	*xinptrA++, B5:B4	; @@@@ xr = *xinptrA++; 
						;      xi = *xinptrA++; 
||	mpysp	.M1	c, itemp, p1i		; @ p1i = c*itemp; 
||      addsp	.L1	p1i, p2i, Yi		; Yi = p1i + p2i; 
||[!m]	lddw	.D1	*wptrB++, A5:A4		; @@@ if (m==0) {c=*wptrB++; 
						;		 s=*wptrB++;} 
||[!i]	mv	.S2	n2, i			; @@@@ if (i==0), i = n2; 
||[!p]	mv	.S1x	n2, p			; if (p==0), p = n2; 	

c14:
	subsp	.L1x	xr, yr, rtemp		; @@@ rtemp = xr - yr; 
||	mpysp	.M1	c, rtemp, p1r		; @@ p1r = c*rtemp; 
||	stw	.D1	Xi, *xoutptrB++[2]	; @ *xoutptrB=Xi;
						;    xoutptrB=xoutptrB+2; 
||[i]	sub	.S2	i, 1, i			; @@@@ i = i - 1; 	
||[l]	b	.S1	loopl			; if (l!=0) branch to loopl

c15:
        subsp	.L1x	xi, yi, itemp		; @@@ itemp = xi - yi; 
||      addsp	.L2x	xi, yi, Xi		; @@@ Xi = xi + yi; 
||	mpysp	.M1	s, itemp, p2r		; @@ p2r = s*itemp; 
||	stw	.D1	Xr, *-xoutptrB[3]	; @ *(xoutptrB-3) = Xr; 
||[!i]	add	.S2	xinptrA, n2As, xinptrA	; @@@@ if (i==0)
						;      xinptrA = xinptrA + n2A; 
||[p]	sub	.S1	p, 1, p 		; @ p = p - 1; 

	; ----------------- end prolog for inner loop - loopl ------------------   		

 	;------------------ inner loop - loopl loop code -----------------------
	; Steady-state kernel: four execute packets (c16-c19).  Each pass
	; issues the loads for the newest iteration and the Yr/Yi stores for
	; the oldest one, with the intermediate add/sub/multiply steps of
	; the iterations in between.  The branch back to loopl is issued in
	; c18 and is predicated on l.
loopl:	

c16:
	lddw	.D2	*+xinptrA[n2], A7:A6	; @@@@@ yr = xinptrA[2*n2]; 
						;       yi = xinptrA[2*n2 + 1]; 
||      addsp	.L2x	xr, yr, Xr		; @@@ Xr = xr + yr; 
||	mpysp	.M1	s, rtemp, p2i		; @@ p2i = s*rtemp; 
||      subsp	.L1	p1r, p2r, Yr		; @ Yr = p1r - p2r; 
||[!p]	add	.S1x	xoutptrB, n2As, xoutptrB; @ if (p==0) 
						;    xoutptrB = xoutptrB + n2A; 
||[l]	sub	.D1	l, 1, l			; @ if (l!=0) l = l -1; 
||	mpy	.M2	i, 1, m			; @@@@ m = i; 	
||[!j]	add	.S2	xoutptrA, n2As, xoutptrA; if (j==0) 
						;    xoutptrA = xoutptrA + n2A; 

c17:
	lddw	.D2	*xinptrA++, B5:B4	; @@@@@ xr = *xinptrA++; 
						;       xi = *xinptrA++; 
||	mpysp	.M1	c, itemp, p1i		; @@ p1i = c*itemp; 
||      addsp	.L1	p1i, p2i, Yi		; @ Yi = p1i + p2i; 
||[!m]	lddw	.D1	*wptrB++, A5:A4		; @@@@ if (m==0) {c=*wptrB++; 
						;		  s=*wptrB++;}
||[!i]	mv	.S2	n2, i			; @@@@@ if (i==0) i = n2; 
||[!p]	mv	.S1x	n2, p			; @ if (p==0) p = n2; 	
||[j]	sub	.L2	j, 1, j			; j = j - 1;

c18:
	subsp	.L1x	xr, yr, rtemp		; @@@@ rtemp = xr - yr; 
||	mpysp	.M1	c, rtemp, p1r		; @@@ p1r = c*rtemp; 
||	stw	.D2	Yr, *xoutptrA++		; *xoutptrA++ = Yr; 
||	stw	.D1	Xi, *xoutptrB++[2]	; @@ *xoutptrB=Xi; 
						;    xoutptrB=xoutptrB+2; 
||[i]	sub	.S2	i, 1, i			; @@@@@ i = i - 1; 	
||[l]	b	.S1	loopl			; @ if (l!=0) branch to loopl

c19:
        subsp	.L1x	xi, yi, itemp		; @@@@ itemp = xi - yi; 
||      addsp	.L2x	xi, yi, Xi		; @@@@ Xi = xi + yi; 
||	mpysp	.M1	s, itemp, p2r		; @@@ p2r = s*itemp; 
||	stw	.D2	Yi, *xoutptrA++		; *xoutptrA++ = Yi; 
||	stw	.D1	Xr, *-xoutptrB[3]	; @@ *(xoutptrB-3) = Xr; 
||[!i]	add	.S2	xinptrA, n2As, xinptrA	; @@@@@ if (i==0) 
						;     xinptrA = xinptrA + n2A; 
||[p]	sub	.S1	p, 1, p			; @@ p = p - 1;  
||[!j]	mpy	.M2	n2, 1, j		; if (j==0) j = n2; 

loopl_end:

 	;------------------ end of inner loop - loopl loop code ----------------
	
	; ----------------- epilog for inner loop - loopl ----------------------   	
	; Packets c20-c27 drain the pipeline ('e': the remaining Yr/Yi
	; computations and stores), run the outer-loop bookkeeping ('o':
	; double n2, reset the pointers and counters, branch to loopk) and
	; issue the first loads of the next stage ('p').  The branch to
	; loopk is issued in c22; c23-c27 follow it before loopk is reached.
 	
c20:
      	subsp	.L1	p1r, p2r, Yr		; e Yr = p1r - p2r;
 ||[!j]	add	.D2	xoutptrA, n2As, xoutptrA; e if (j==0) 
						;      xoutptrA = xoutptrA + n2A;
 ||	mv	.S1x	tmpk, k			; o k = tmpk
 ||	mv	.L2	x, xinptrA		; o xinptrA = x;
 ||	shl	.S2	n2, 1, n2		; o n2 = n2 << 1;

c21:
      	addsp	.L1	p1i, p2i, Yi		; e Yi = p1i + p2i;
 ||[j]	sub	.L2	j, 1, j			; e j = j - 1;
 ||	mv	.S1x	x, xoutptrB		; o xoutptrB = x;

c22:
	stw	.D2	Yr, *xoutptrA++		; e *xoutptrA++ = Yr;
 ||[k]	b	.S1	loopk			; o if (k!=0) branch to loopk

c23:
	stw	.D2	Yi, *xoutptrA++		; e *xoutptrA++ = Yi;
 ||[!j]	mpy	.M2	n2, 1, j		; e if (j==0) j = n2;
 ||	mv	.S1x	w, wptrB		; o wptrB = w;  	

c24:
   [!j]	add	.L2	xoutptrA, n2As, xoutptrA; e if (j==0) 
						;      xoutptrA = xoutptrA + n2A;
 ||	shr	.S1x	n, 1, l			; o l = n/2 (overwrites k: both
						;   names alias A2)
 ||	shl	.S2	n2, 3, n2As		; o n2As = n2<<3;	 
 ||	lddw	.D2	*+xinptrA[n2], A7:A6	; p yr = xinptrA[2*n2]; 
						;   yi = xinptrA[2*n2 + 1];
c25:
	; NOTE(review): k aliases l (A2), which c24 has just reloaded with
	; n/2, so the [k] predicate below looks like it is always true here
	; - confirm.
   [k]	shr	.S2	tmpk, 1, tmpk		; o tmpk = tmpk >> 1;
 ||	sub		l, 2, l			; o l = l - 2;
 ||  	mv	.L2	n2, i			; o i = n2;
 ||	lddw	.D2	*xinptrA++, B5:B4	; p xr = *xinptrA++; 
						;   xi = *xinptrA++;
c26:
 	stw	.D2	Yr, *xoutptrA++		; e *xoutptrA++ = Yr;
 ||  	mv	.L2	n2, j			; o j = n2;
 ||	mv		n2, p			; o p = n2
 ||[i]	sub	.S2	i, 1, i			; p i = i - 1;

c27: 
	stw	.D2	Yi, *xoutptrA		; e *xoutptrA = Yi; (no post-
						;   increment: xoutptrA is
						;   reloaded in this packet)
 ||  	add	.S2X	xoutptrB, n2As, xoutptrA; o xoutptrA = xoutptrB + n2As;
 ||	add		xoutptrB, 4, xoutptrB	; o xoutptrB = xoutptrB + 4;	 
 ||[!i]	add	.L2	xinptrA, n2As, xinptrA	; p if (i==0) 
						;      xinptrA = xinptrA + n2A; 
loopk_end:

	; ----------------- end of epilog for inner loop - loopl ---------------   	

	; ----------------- end of outer loop - loopk --------------------------

	; ---------------------------- function epilog  ------------------------
	; Reloads the twelve registers stored by the function prolog, sets
	; bit 0 of CSR to re-enable global interrupts, and returns through
	; B3.
	; NOTE(review): bit 0 of CSR is set unconditionally; the value it had
	; on entry is not kept anywhere, so interrupts are enabled on return
	; even if the caller had them disabled.

	mvc	.S2	CSR, B13		; f B13 = CSR (overwritten by the
						;   mvc in the next packet)

	; restore the registers saved by the function prolog

	sub		B15, 4, A0		; f A0 = second stack pointer

	ldw	.D1 	*++A0[2], B3		; f reload the return address
 ||	ldw	.D2 	*++B15[2], A15		; f
 ||	mvc	.S2	CSR, B13		; f B13 = CSR

	ldw	.D1 	*++A0[2], B14		; f
 ||	ldw	.D2 	*++B15[2], A14		; f
 ||	or	.L2	B13, 1, B13		; f set bit 0 of the CSR copy

	ldw	.D1 	*++A0[2], B13		; f
 ||	ldw	.D2 	*++B15[2], A13		; f
 ||	mvc     .S2     B13,CSR			; f enable global interrupts

	ldw	.D1 	*++A0[2], B12		; f
 ||	ldw	.D2 	*++B15[2], A12		; f

	ldw	.D1 	*++A0[2], B11		; f
 ||	ldw	.D2 	*++B15[2], A11		; f

	ldw	.D2 	*++B15[2], A10		; f
 ||	ldw	.D1 	*++A0[2], B10		; f
 ||	b	.S2	B3			; f return();
	nop		5			; f fill the branch delay slots