/*===================================================================

'Mult_16bit_signed' -- signed 16-bit sequential multiplier

Overview
========

This module implements a 16-bit sequential multiplier using Booth's algorithm
(see p. 259 of "Computer Organization and Design" by Hennessy and Patterson,
Morgan Kaufmann Publishers (San Francisco), 1998). The multipler requires
thirty-three clock cycles to produce a result. While this number is about
six times as long as the 16-bit multiplier produced by Xilinx Core Generator,
it is also about 3.5 times smaller (60 CLBs vs. 220 CLBs).

16-bit values in 2's complement format are applied to the X and Y inputs.
The inputs are also considered to represent fractional values in the range
-1 to +1 (minus a tad). More specifically, the 16-bit value 8000 (-32,768
decimal) denotes -1.00000 (= -32,768 / 2^15) and the value 7FFF (+32,767
decimal) denotes +0.99997 (= +32,767 / 2^15). The decimal point is effectively
shifted 15 places to the right, so the multiplication looks like this: 

			x.xxx_xxxx_xxxx_xxxx
  		      X y.yyy_yyyy_yyyy_yyyy
			--------------------
    zz.zz_zzzz_zzzz_zzzz_zzzz_zzzz_zzzz_zzzz
       ^^^^^^^^^^^^^^^^^^^^
        multiplier output

The decimal point is shifted one more place to the right in the product, so
the final 16-bit output Z is taken from the product as shown above.

The multiplier conforms to the "distributed control" timing model for cascaded
computation:

	1. If either input data "freshness" signal deasserts (goes low),
		then the multiplier will deassert its output data
		freshness signal and wait for fresh input data.
	2. Once fresh input data is indicated (both input freshness signals
		asserted), the multiplier will compute the product. The
		Z output will change during this time.
	3. Once the product is ready, the output freshness signal will
		be asserted and the multiplier will return to its idle state.
	4. Input data freshness signals are internally synchronized to the falling
		edge of the clock. Therefore, it would be best if the input
		freshness signals transition on the positive edge of the clock.
	

Instantiation Examples
======================

Example 1 -- Use multiplier as a sign inverter, i.e., Z = -X:

Mult_16bit_signed U1
   (.I$Clock(...),
    .I$Reset(...),	// Active high
    
    .I$X(...),		// X-input data bus
    .I$Y(16'h8000),	// Y-input data bus
    .O$Z(...),		// Z-output data bus

    .I$XValid(...),	// X-input data is current; active high
    .I$YValid(1'b1),	// Y-input data is current; active high
    
    .O$ZValid(...)	// Z-output data is current; active high
    );


Verification Results
====================

The multiplier has been verified for the following extremes of the input data range:

	 X	 Y	 Z	Comments
	----	----	----	-------
	7FFF	7FFF	7FFE	+1 X +1 = +1 
	7FFF	8000	8001	+1 X -1 = -1
	8000	7FFF	8001	-1 X +1 = -1
	8000	8001	7FFF	-1 X -1 = +1


===================================================================*/

module Mult_16bit_signed
   (I$Clock,
    I$Reset,	// Active high
    
    I$X,		// X-input data bus
    I$Y,		// Y-input data bus
    O$Z,		// Z-output data bus

    I$XValid,		// X-input data is current; active high
    I$YValid,		// Y-input data is current; active high
    
    O$ZValid 		// Z-output data is current; active high
    );

parameter	p$N = 16;	// Data bus width (must be even)
parameter	p$Log2ofN = 4;	// ceil(log2(N))
//NOTE: Some portions of code are hardwired to 16 bits

// Port mode declarations
input	I$Clock;
input	I$Reset;
input	[p$N-1:0] I$X;
input	[p$N-1:0] I$Y;
output	[p$N-1:0] O$Z;

input	I$XValid;
input	I$YValid;
output	O$ZValid;

// State definitions
parameter	p$NumStateBits	= 5;
parameter	p$Idle			= 5'b0_0001;
parameter	p$Wait			= 5'b0_0010;
parameter	p$Init			= 5'b0_0100;
parameter	p$TestBits		= 5'b0_1000;
parameter	p$ShiftBits		= 5'b1_0000;

parameter	p$Idle_case		= 5'b?_???1;
parameter	p$Wait_case		= 5'b?_??1?;
parameter	p$Init_case		= 5'b?_?1??;
parameter	p$TestBits_case		= 5'b?_1???;
parameter	p$ShiftBits_case	= 5'b1_????;

// Registered variable declarations
reg	[p$NumStateBits-1:0] r$State;	// State register
reg	[p$NumStateBits-1:0] r$NextState; // Next-state decoder output
reg	r$XValid;	
reg	r$YValid;	
reg	O$ZValid;
reg	r$Initialize;
reg	r$AddMultiplicand;
reg	r$SubtractMultiplicand;
reg	r$ShiftProduct;
reg	r$DecrementCount;

reg	[p$Log2ofN:0] r$Count;		// Keeps track of the number of shift
					// operations.

reg	[2*p$N+1:0] r$Product;		// 2N bits wide, plus an additional bit on
					// either side of the product register.
					// The LSB serves as the rightmost test bit
					// (see details of Booth algorithm), and the
					// MSB provides enough dynamic range to
					// avoid overflow problems during the add/
					// subtract operations.

// Next-state decoder
always @ (r$State or r$XValid or r$YValid or r$Product or r$Count) begin

   // Nominal outputs
   O$ZValid <= 0;
   r$Initialize <= 0;
   r$AddMultiplicand <= 0;
   r$SubtractMultiplicand <= 0;
   r$ShiftProduct <= 0;
   r$DecrementCount <= 0;
   r$NextState <= p$Idle;
       
   // Decoder outputs
   casez (r$State)
   
      		// Wait here as long as input data is fresh on both inputs
      p$Idle_case:
      		begin
      		   O$ZValid <= 1;
      		   r$NextState <= (r$XValid & r$YValid) ? p$Idle : p$Wait;
      		end

      		// Wait for data on both inputs to be updated
      p$Wait_case:
      		begin
      		   r$NextState <= !(r$XValid & r$YValid) ? p$Wait : p$Init;
      		end

      		// Initialize registers
      p$Init_case:
      		begin
      		   r$Initialize <= 1;
      		   r$NextState <= p$TestBits;
      		end

      		// Test lower two bits of product register
      p$TestBits_case:
      		begin
      		   case (r$Product[1:0])
      		      2'b00:
      		         begin
      		         end
      		      2'b01:
      		         begin
      		            r$AddMultiplicand <= 1;
      		         end
      		      2'b10:
      		         begin
      		            r$SubtractMultiplicand <= 1;
      		         end
      		      2'b11:
      		         begin
      		         end
      		   endcase
      		   r$NextState <= p$ShiftBits;
      		end

      		// Arithmetic right shift on the product register, and 
      		// decrement count
      p$ShiftBits_case:
      		begin
      		   r$ShiftProduct <= 1;
      		   r$DecrementCount <= 1;
      		   r$NextState <= (r$Count==0) ? p$Idle : p$TestBits;
      		end

      default:	r$NextState <= p$Idle;

   endcase
end


// Next state register
always @ (posedge I$Clock or posedge I$Reset) begin
   if (I$Reset == 1)
      r$State <= p$Idle;
   else
      r$State <= r$NextState;
end


// Data path elements controlled by state machine
always @ (negedge I$Clock or posedge I$Reset) begin
   if (I$Reset == 1) begin
      r$XValid <= 1;
      r$YValid <= 1;
      r$Count <= 5'h00;
      r$Product <= 34'h000000000;
   end
   else begin
      if (r$Initialize) begin
         r$Count <= p$N;
         r$Product[2*p$N+1:p$N+1] <= 17'h0000;
         r$Product[p$N:1] <= I$Y;
         r$Product[0] <= 1'b0;
      end
      r$XValid <= I$XValid;	// sync asynchronous inputs
      r$YValid <= I$YValid;
      if (r$AddMultiplicand) r$Product[2*p$N+1:p$N+1] <=
            r$Product[2*p$N+1:p$N+1] + {I$X[p$N-1], I$X};
      if (r$SubtractMultiplicand) r$Product[2*p$N+1:p$N+1] <= 
            r$Product[2*p$N+1:p$N+1] - {I$X[p$N-1], I$X};
      if (r$ShiftProduct) r$Product[2*p$N:0] <= r$Product[2*p$N+1:1];
      if (r$DecrementCount) r$Count <= r$Count - 1;
   end
end

// Connect most-significant word of product register to output
assign O$Z = r$Product[2*p$N-1:p$N];

endmodule