/* ======================================================================== */
/*  TEXAS INSTRUMENTS, INC.                                                 */
/*                                                                          */
/*  DSPLIB  DSP Signal Processing Library                                   */
/*                                                                          */
/*      Release:        Version 1.02                                        */
/*      CVS Revision:   1.6     Fri Mar 22 01:53:17 2002 (UTC)              */
/*      Snapshot date:  18-Apr-2002                                         */
/*                                                                          */
/*  This library contains proprietary intellectual property of Texas        */
/*  Instruments, Inc.  The library and its source code are protected by     */
/*  various copyrights, and portions may also be protected by patents or    */
/*  other legal protections.                                                */
/*                                                                          */
/*  This software is licensed for use with Texas Instruments TMS320         */
/*  family DSPs.  This license was provided to you prior to installing      */
/*  the software.  You may review this license by consulting the file       */
/*  TI_license.PDF which accompanies the files in this library.             */
/* ------------------------------------------------------------------------ */
/*          Copyright (C) 2002 Texas Instruments, Incorporated.             */
/*                          All Rights Reserved.                            */
/* ======================================================================== */
/* ======================================================================== */
/*  TEXAS INSTRUMENTS, INC.                                                 */
/*                                                                          */
/*  NAME                                                                    */
/*      DSP_mat_mul -- Matrix Multiply                                      */
/*                                                                          */
/*  REVISION DATE                                                           */
/*      15-Jan-2002                                                         */
/*                                                                          */
/*  USAGE                                                                   */
/*      This routine is C-callable and can be called as:                    */
/*                                                                          */
/*          void DSP_mat_mul                                                */
/*          (                                                               */
/*              const short *restrict x, int r1, int c1,                    */
/*              const short *restrict y,         int c2,                    */
/*              short       *restrict r,                                    */
/*              int                   qs                                    */
/*          );                                                              */
/*                                                                          */
/*      x  == Pointer to r1 by c1 input matrix.                             */
/*      y  == Pointer to c1 by c2 input matrix.                             */
/*      r  == Pointer to r1 by c2 output matrix.                            */
/*                                                                          */
/*      r1 == Number of rows in x.                                          */
/*      c1 == Number of columns in x.  Also number of rows in y.            */
/*      c2 == Number of columns in y.                                       */
/*                                                                          */
/*      qs == Final right-shift to apply to the result.                     */
/*                                                                          */
/*  DESCRIPTION                                                             */
/*      This function computes the expression "r = x * y" for the           */
/*      matrices x and y.  The columnar dimension of x must match           */
/*      the row dimension of y.  The resulting matrix has the same          */
/*      number of rows as x and the same number of columns as y.            */
/*                                                                          */
/*      The values stored in the matrices are assumed to be fixed-point     */
/*      or integer values.  All intermediate sums are retained to 32-bit    */
/*      precision, and no overflow checking is performed.  The results      */
/*      are right-shifted by a user-specified amount, and then truncated    */
/*      to 16 bits.                                                         */
/*                                                                          */
/*      This code is suitable for dense matrices.  No optimizations are     */
/*      made for sparse matrices.                                           */
/*                                                                          */
/*      The following is a C description of the algorithm.  The assembly    */
/*      code may place restrictions on the inputs that the C code version   */
/*      does not.  These restrictions are noted under ASSUMPTIONS below.    */
/*                                                                          */
/*      void DSP_mat_mul                                                    */
/*      (                                                                   */
/*          const short *restrict x, int r1, int c1,                        */
/*          const short *restrict y,         int c2,                        */
/*          short       *restrict r,                                        */
/*          int                   qs                                        */
/*      )                                                                   */
/*      {                                                                   */
/*          int i, j, k;                                                    */
/*          int sum;                                                        */
/*                                                                          */
/*          // ---------------------------------------------------- //      */
/*          //  Multiply each row in x by each column in y.  The    //      */
/*          //  product of row m in x and column n in y is placed   //      */
/*          //  in position (m,n) in the result.                    //      */
/*          // ---------------------------------------------------- //      */
/*          for (i = 0; i < r1; i++)                                        */
/*              for (j = 0; j < c2; j++)                                    */
/*              {                                                           */
/*                  sum = 0;                                                */
/*                                                                          */
/*                  for (k = 0; k < c1; k++)                                */
/*                      sum += x[k + i*c1] * y[j + k*c2];                   */
/*                                                                          */
/*                  r[j + i*c2] = sum >> qs;                                */
/*              }                                                           */
/*      }                                                                   */
/*                                                                          */
/*  ASSUMPTIONS                                                             */
/*      The arrays 'x', 'y', and 'r' are stored in distinct arrays.  That   */
/*      is, in-place processing is not allowed.                             */
/*                                                                          */
/*      The input matrices have minimum dimensions of at least 1 row and    */
/*      1 column, and maximum dimensions of 32767 rows and 32767 columns.   */
/*                                                                          */
/*  TECHNIQUES                                                              */
/*      The outer two loops are unrolled 2x.  For odd-sized dimensions,     */
/*      we end up doing extra multiplies along the edges.  This offsets     */
/*      the overhead of the nested loop structure, though.                  */
/*                                                                          */
/*      The outer two levels of loop nest are collapsed, further reducing   */
/*      the overhead of the looping structure.                              */
/*                                                                          */
/*  NOTES                                                                   */
/*      This code is ENDIAN NEUTRAL.                                        */
/*                                                                          */
/*      This code is interrupt tolerant, but not interruptible.             */
/*      Interrupts are locked out in the body of this function by branch    */
/*      delay slots.                                                        */
/*                                                                          */
/*      For odd values of r1 or c2, this code rounds the dimension up to    */
/*      the next larger even dimension when calculating loop trip counts.   */
/*      Array addressing is not affected by this rounding.                  */
/*                                                                          */
/*  MEMORY NOTE                                                             */
/*      The load instructions in the inner loop are predicated to avoid     */
/*      significant over-fetching on the matrices.  However, since the      */
/*      outer loops are unrolled, this code may fetch approximately one     */
/*      full row beyond the end of the 'x' matrix and approximately one     */
/*      word beyond the end of the 'y' matrix.  The values read are         */
/*      discarded and do not affect the results of the computation.         */
/*                                                                          */
/*      The code is organized so that accesses to the 'x' and 'y' matrices  */
/*      occur in parallel.  On C620x and C670x devices, this will result    */
/*      in memory bank conflicts unless both 'x' and 'y' are placed in      */
/*      separate memory spaces.  When both 'x' and 'y' are in separate      */
/*      memory spaces, no conflicts occur.  On C621x and C671x, there are   */
/*      no bank conflicts as the devices use dual-ported memory.            */
/*                                                                          */
/*      When 'x' and 'y' are placed in the same memory space (eg. the       */
/*      same half of data memory on C620x and C670x devices), bank          */
/*      conflicts will occur in a pattern that depends on the dimensions    */
/*      of matrices and the number and size of the memory banks on the      */
/*      specific device being used.  The code places the following          */
/*      accesses in parallel:  (Refer to C code above for the meaning of    */
/*      the indices 'i', 'j', and 'k'.)                                     */
/*                                                                          */
/*          x[k + (i + 0)*c1]  in parallel with  y[(j + 0) + k*c2]          */
/*          x[k + (i + 1)*c1]  in parallel with  y[(j + 1) + k*c2]          */
/*                                                                          */
/*      The exact analysis of bank conflicts requires the dimensions of     */
/*      the matrices being multiplied, and this calculation is left to      */
/*      the user.  Bank conflicts cause between 3% and 40% degradation      */
/*      on a 4-bank device such as C6201 when x and y are in the same       */
/*      memory space.                                                       */
/*                                                                          */
/*  CYCLES                                                                  */
/*      Assuming no bank conflicts, the following formula applies:          */
/*                                                                          */
/*      cycles = 0.5 * (r1'*c2'*c1) + 3 * (r1'*c2') + 24, where:            */
/*      r1' = r1 + (r1&1);   // r1 rounded up to next even                  */
/*      c2' = c2 + (c2&1);   // c2 rounded up to next even                  */
/*                                                                          */
/*      For r1= 1, c1= 1, c2= 1,  cycles =    38.                           */
/*      For r1= 8, c1=20, c2= 8,  cycles =   856.                           */
/*      For r1=12, c1=14, c2=18,  cycles =  2184.                           */
/*      For r1=32, c1=32, c2=32,  cycles = 19480.                           */
/*                                                                          */
/*      The cycle count includes 6 cycles of function call overhead.  The   */
/*      exact overhead seen by a given application will depend on the       */
/*      compiler options used.                                              */
/*                                                                          */
/*  CODESIZE                                                                */
/*      448 bytes.                                                          */
/*                                                                          */
/* ------------------------------------------------------------------------ */
/*            Copyright (c) 2002 Texas Instruments, Incorporated.           */
/*                           All Rights Reserved.                           */
/* ======================================================================== */
#ifndef DSP_MAT_MUL_H_
#define DSP_MAT_MUL_H_ 1

void DSP_mat_mul
(
    const short *restrict x, int r1, int c1,
    const short *restrict y,         int c2,
    short       *restrict r,
    int                   qs
);

#endif
/* ======================================================================== */
/*  End of file:  dsp_mat_mul.h                                             */
/* ------------------------------------------------------------------------ */
/*            Copyright (c) 2002 Texas Instruments, Incorporated.           */
/*                           All Rights Reserved.                           */
/* ======================================================================== */