/*
 *             Automatically Tuned Linear Algebra Software v3.0Beta
 *                      (C) Copyright 1998 Jeff Horner
 *
 * Code contributers : Jeff Horner, R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University, the ATLAS group, or the names of its
 *      contributers may not be used to endorse or promote products derived
 *      from this software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
/*
 * =====================================================================
 * Include files
 * =====================================================================
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

#include "atlas_misc.h"
#include "atlas_tst.h"
/*
 * =====================================================================
 * #define macro constants
 * =====================================================================
 */
#define    MEGA                     1000000.0
#if defined( SREAL ) || defined( SCPLX )
#define    THRESH                        50.0f
#else
#define    THRESH                        50.0
#endif

#define    ATLAS_DEBUG
/*
 * =====================================================================
 * # macro functions
 * =====================================================================
 *
 * The following and mutually exclusive  macros  allow to select various
 * BLAS implementations to test the ATLAS implementation against:
 *
 *    USE_F77_BLAS     : Fortran 77 BLAS interface,
 *    USE_L3_REFERENCE : C ATLAS reference implementation,
 *    USE_L3_CBLAS     : C BLAS interface,
 *    USE_L3_PTHREADS  : multi-threaded ATLAS implementation.
 *
 * If none of these macros is defined at compile time, the  ATLAS imple-
 * mentation is to be tested against itself,  after all this is the only
 * version we are sure to have available.
 */
#define USE_F77_BLAS

#if   defined( USE_F77_BLAS ) /* Trusted BLAS version to test against */
#define  TP3      Mjoin( PATL,   f77 )
#define  CMJ
#elif defined( USE_L3_REFERENCE )
#include "atlas_reflevel3.h"
#define  TP3      Mjoin( PATL,   ref )
#define  CMJ
#elif defined( USE_L3_CBLAS )
#include "cblas.h"
#define  TP3      Mjoin( cblas_, PRE )
#define  CMJ      CblasColMajor,
#elif defined( USE_L3_PTHREADS )
#include "atlas_ptlevel3.h"
#define  TP3      Mjoin( PATL,   pt  )
#define  CMJ
#else /* defined( USE_L3_ATLAS ) */  /* use ATLAS itself !! (default) */
#include "atlas_level3.h"
#define  TP3      PATL
#define  CMJ
#endif

#define  trusted_gemm(    TA, TB,     M, N, K, al, A, lA, B, lB, be, C, lC) \
Mjoin(TP3,gemm)  (CMJ     TA, TB,     M, N, K, al, A, lA, B, lB, be, C, lC)
#define  trusted_hemm(SI, UP,         M, N,    al, A, lA, B, lB, be, C, lC) \
Mjoin(TP3,hemm)  (CMJ SI, UP,         M, N,    al, A, lA, B, lB, be, C, lC)
#define  trusted_herk(    UP, TA,        N, K, al, A, lA,        be, C, lC) \
Mjoin(TP3,herk)  (CMJ     UP, TA,        N, K, al, A, lA,        be, C, lC)
#define trusted_her2k(    UP, TA,        N, K, al, A, lA, B, lB, be, C, lC) \
Mjoin(TP3,her2k) (CMJ     UP, TA,        N, K, al, A, lA, B, lB, be, C, lC)
#define  trusted_symm(SI, UP,         M, N,    al, A, lA, B, lB, be, C, lC) \
Mjoin(TP3,symm)  (CMJ SI, UP,         M, N,    al, A, lA, B, lB, be, C, lC)
#define  trusted_syrk(    UP, TA,        N, K, al, A, lA,        be, C, lC) \
Mjoin(TP3,syrk)  (CMJ     UP, TA,        N, K, al, A, lA,        be, C, lC)
#define trusted_syr2k(    UP, TA,        N, K, al, A, lA, B, lB, be, C, lC) \
Mjoin(TP3,syr2k) (CMJ     UP, TA,        N, K, al, A, lA, B, lB, be, C, lC)
#define  trusted_trmm(SI, UP, TA, DI, M, N,    al, A, lA,            B, lB) \
Mjoin(TP3,trmm)  (CMJ SI, UP, TA, DI, M, N,    al, A, lA,            B, lB)
#define  trusted_trsm(SI, UP, TA, DI, M, N,    al, A, lA,            B, lB) \
Mjoin(TP3,trsm)  (CMJ SI, UP, TA, DI, M, N,    al, A, lA,            B, lB)

/*
 * ATLAS version of the BLAS to test.
 */
#include "atlas_level3.h"

#define  test_gemm(    TA, TB,     M, N, K, al, A, lA, B, lB, be, C, lC) \
Mjoin(PATL,gemm)  (    TA, TB,     M, N, K, al, A, lA, B, lB, be, C, lC)
#define  test_hemm(SI, UP,         M, N,    al, A, lA, B, lB, be, C, lC) \
Mjoin(PATL,hemm)  (SI, UP,         M, N,    al, A, lA, B, lB, be, C, lC)
#define  test_herk(    UP, TA,        N, K, al, A, lA,        be, C, lC) \
Mjoin(PATL,herk)  (    UP, TA,        N, K, al, A, lA,        be, C, lC)
#define test_her2k(    UP, TA,        N, K, al, A, lA, B, lB, be, C, lC) \
Mjoin(PATL,her2k) (    UP, TA,        N, K, al, A, lA, B, lB, be, C, lC)
#define  test_symm(SI, UP,         M, N,    al, A, lA, B, lB, be, C, lC) \
Mjoin(PATL,symm)  (SI, UP,         M, N,    al, A, lA, B, lB, be, C, lC)
#define  test_syrk(    UP, TA,        N, K, al, A, lA,        be, C, lC) \
Mjoin(PATL,syrk)  (    UP, TA,        N, K, al, A, lA,        be, C, lC)
#define test_syr2k(    UP, TA,        N, K, al, A, lA, B, lB, be, C, lC) \
Mjoin(PATL,syr2k) (    UP, TA,        N, K, al, A, lA, B, lB, be, C, lC)
#define  test_trmm(SI, UP, TA, DI, M, N,    al, A, lA,            B, lB) \
Mjoin(PATL,trmm)  (SI, UP, TA, DI, M, N,    al, A, lA,            B, lB)
#define  test_trsm(SI, UP, TA, DI, M, N,    al, A, lA,            B, lB) \
Mjoin(PATL,trsm)  (SI, UP, TA, DI, M, N,    al, A, lA,            B, lB)

#ifdef TCPLX
   #define SCALAR_ABS(X) (Mabs(*X) + Mabs(*(X+1)))
#else
   #define SCALAR_ABS(X) (Mabs(X))
#endif

void Mjoin(PRE,symd)(enum ATLAS_UPLO UL, int M, TYPE *A, int lda, TYPE *B, int ldb, TYPE *C, int ldc)
{
   int i,j;
   if (UL == AtlasLower)
   {
      for (i=0; i<M; i++)
      {
         for (j=0; j<i; j++)
         {
#ifdef TCPLX
            C[(i+j*ldc)*2] = A[(i+j*lda)*2] - B[(i+j*ldb)*2];
            C[(i+j*ldc)*2+1] = A[(i+j*lda)*2+1] - B[(i+j*ldb)*2+1];
#else
            C[i+j*ldc] = A[i+j*lda] - B[i+j*ldb];
#endif
         }
         for (j=0; j<M-i; j++)
         {
#ifdef TCPLX
            C[(j+i+i*ldc)*2] = A[(j+i+i*lda)*2] - B[(j+i+i*ldb)*2];
            C[(j+i+i*ldc)*2+1] = A[(j+i+i*lda)*2+1] - B[(j+i+i*ldb)*2+1];
#else
            C[j+i+i*ldc] = A[j+i+i*lda] - B[j+i+i*ldb];
#endif
         }
      }
   }
   else
   {
      for (i=0; i<M; i++)
      {
         for (j=0; j<i; j++)
         {
#ifdef TCPLX
            C[(j+i*lda)*2] = A[(j+i*lda)*2] - B[(j+i*ldb)*2];
            C[(j+i*lda)*2+1] = A[(j+i*lda)*2+1] - B[(j+i*ldb)*2+1];
#else
            C[j+i*lda] = A[j+i*lda] - B[j+i*ldb];
#endif
         }
         for (j=0; j<M-i; j++)
         {
#ifdef TCPLX
            C[(i+(i+j)*lda)*2] = A[(i+(i+j)*lda)*2] - B[(i+(i+j)*ldb)*2];
            C[(i+(i+j)*lda)*2+1] = A[(i+(i+j)*lda)*2+1] - B[(i+(i+j)*ldb)*2+1];
#else
            C[i+(i+j)*lda] = A[i+(i+j)*lda] - B[i+(i+j)*ldb];
#endif
         }
      }
   }
}

TYPE  Mjoin(PRE,genorm)(int M, int N, TYPE *A, int lda)
{
   int i,j;
   TYPE *v, maxval=0.0;

   v = malloc(sizeof(TYPE) * N);
   ATL_assert(v);

   for (j=0; j<N; j++)
   {
      v[j] = 0.0;
      for (i=0; i< (M SHIFT); i++)
         v[j] += Mabs(A[i]);
      A += lda SHIFT;
   }

   for (i=0; i<N; i++){
      if (maxval < v[i]) maxval = v[i];
   }
   free(v);
   return maxval;
}

TYPE Mjoin(PRE,synorm)(enum ATLAS_UPLO UL, int M, TYPE *A, int lda)
{
   int i,j;
   TYPE *v, maxval=0.0;
#ifdef TCPLX
   TYPE *tmp;
#endif

   v = malloc(sizeof(TYPE) * M);
   ATL_assert(v);

   if (UL == AtlasLower){
      for (i=0; i<M; i++){
         v[i] = 0.0;
         for (j=0; j<i; j++){
#ifdef TCPLX
            tmp = A+(i+j*lda)*2;
            v[i] += SCALAR_ABS(tmp);
#else
            v[i] += Mabs(A[i+j*lda]);
#endif
         }
         for (j=0; j<M-i; j++){
#ifdef TCPLX
            tmp = A+(j+i+i*lda)*2;
            v[i] += SCALAR_ABS(tmp);
#else
            v[i] += Mabs(A[j+i+i*lda]);
#endif
         }
      }
   } else {
      for (i=0; i<M; i++){
         v[i] = 0.0;
         for (j=0; j<i; j++){
#ifdef TCPLX
            tmp = A+(j+i*lda)*2;
            v[i] += SCALAR_ABS(tmp);
#else
            v[i] += Mabs(A[j+i*lda]);
#endif
         }
         for (j=0; j<M-i; j++){
#ifdef TCPLX
            tmp = A+(i+(i+j)*lda)*2;
            v[i] += SCALAR_ABS(tmp);
#else
            v[i] += Mabs(A[i+(i+j)*lda]);
#endif
         }
      }
   }

   for (i=0; i<M; i++){
      if (maxval < v[i]) maxval = v[i];
   }
   free(v);
   return maxval;
}
#ifdef TCPLX
TYPE Mjoin(PRE,henorm)(enum ATLAS_UPLO UL, int M, TYPE *A, int lda)
{
   int i,j;
   TYPE *v, maxval=0.0;
   TYPE *tmp;

   v = malloc(sizeof(TYPE) * M);
   ATL_assert(v);

   if (UL == AtlasLower){
      for (i=0; i<M; i++){
         v[i] = 0.0;
         for (j=0; j<i; j++){
            tmp = A+(i+j*lda)*2;
            v[i] += SCALAR_ABS(tmp);
         }
         v[i] += Mabs(A[(i+i*lda)*2]);
         for (j=1; j<M-i; j++){
            tmp = A+(j+i+i*lda)*2;
            v[i] += SCALAR_ABS(tmp);
         }
      }
   } else {
      for (i=0; i<M; i++){
         v[i] = 0.0;
         for (j=0; j<i; j++){
            tmp = A+(j+i*lda)*2;
            v[i] += SCALAR_ABS(tmp);
         }
         v[i] += Mabs(A[(i+i*lda)*2]);
         for (j=1; j<M-i; j++){
            tmp = A+(i+(i+j)*lda)*2;
            v[i] += SCALAR_ABS(tmp);
         }
      }
   }

   for (i=0; i<M; i++){
      if (maxval < v[i]) maxval = v[i];
   }
   free(v);
   return maxval;
}
#endif

TYPE Mjoin(PRE,trnorm)(enum ATLAS_UPLO UL, enum ATLAS_DIAG DI, int M, TYPE *A, int lda)
{
   int i,j;
   TYPE *v, maxval=0.0;
#ifdef TCPLX
   TYPE *tmp;
#endif

   v = malloc(ATL_sizeof * M);
   ATL_assert(v);

   if (UL == AtlasLower){
      for (i=0; i<M; i++){
         if (DI == AtlasUnit)
            v[i] = 1.0;
         else
         {
#ifdef TCPLX
            tmp = A+(i+i*lda)*2;
            v[i] = SCALAR_ABS(tmp);
#else
            v[i] = Mabs(A[i+i*lda]);
#endif
         }
         for (j=i+1; j<M; j++)
         {
#ifdef TCPLX
            tmp = A+(j+(i*lda))*2;
            v[i] += SCALAR_ABS(tmp);
#else
            v[i] += Mabs(A[j+(i*lda)]);
#endif
         }
      }
   } else {
      for (i=0; i<M; i++){
         v[i] = 0.0;
         for (j=0; j<i; j++)
         {
#ifdef TCPLX
            tmp = A+(j+i*lda)*2;
            v[i] += SCALAR_ABS(tmp);
#else
            v[i] += Mabs(A[j+i*lda]);
#endif
         }
         if (DI == AtlasUnit)
            v[i] += 1.0;
         else
         {
#ifdef TCPLX
            tmp = A+(i+i*lda)*2;
            v[i] += SCALAR_ABS(tmp);
#else
            v[i] += Mabs(A[i+i*lda]);
#endif
         }
      }
   }

   for (i=0; i<M; i++){
      if (maxval < v[i]) maxval = v[i];
   }
   free(v);
   return maxval;
}

void Mjoin(PRE,gemscal)(int M, int N, TYPE *A, int lda, SCALAR alpha)
{
   int i,j;
#ifdef TCPLX
   TYPE tmp, *Ar, *Ai;
#endif

   for (i=0; i<N; i++)
      for (j=0; j<M; j++){
#ifdef TCPLX
         Ar = A+(j+i*lda)*2;
         Ai = Ar+1;
         tmp = *alpha * *Ar - *(alpha+1) * *Ai;
         *Ai = *alpha * *Ai + *(alpha+1) * *Ar;
         *Ar = tmp;
#else
         A[j+i*lda] *= alpha;
#endif
      }
}


int Maxval3(int M, int N, int K)
{
   int maxval=0;
   if (maxval < M) maxval = M;
   if (maxval < N) maxval = N;
   if (maxval < K) maxval = K;
   return maxval;
}

int Maxval2(int M, int N)
{
   int maxval=0;
   if (maxval < M) maxval = M;
   if (maxval < N) maxval = N;
   return maxval;
}
#ifdef TCPLX
#define L3_nrouts 9
#define L3_GEMM 0
#define L3_SYMM 1
#define L3_HEMM 2
#define L3_SYRK 3
#define L3_HERK 4
#define L3_SYR2K 5
#define L3_HER2K 6
#define L3_TRMM 7
#define L3_TRSM 8
static char *blas[L3_nrouts] = { "gemm" , "symm"  , "hemm" , "syrk" ,
                            "herk" , "syr2k" , "her2k", "trmm" , "trsm" };

#define L3_copyrouts(X) (X)[0] = blas[0]; (X)[1] = blas[1]; \
                        (X)[2] = blas[2]; (X)[3] = blas[3]; \
                        (X)[4] = blas[4]; (X)[5] = blas[5]; \
                        (X)[6] = blas[6]; (X)[7] = blas[7]; \
                        (X)[8] = blas[8];

#else
#define L3_nrouts 6
#define L3_GEMM 0
#define L3_SYMM 1
#define L3_SYRK 2
#define L3_SYR2K 3
#define L3_TRMM 4
#define L3_TRSM 5
static char *blas[L3_nrouts] = { "gemm" , "symm"  , "syrk" , "syr2k" ,
                            "trmm" , "trsm" };
#define L3_copyrouts(X) (X)[0] = blas[0]; (X)[1] = blas[1]; \
                        (X)[2] = blas[2]; (X)[3] = blas[3]; \
                        (X)[4] = blas[4]; (X)[5] = blas[5];
#endif

/*
 * GLOBAL VARIABLES
 */
TYPE eps=0.0;
char *pre=Mstr(PRE);

/*
 * FUNCTION PROTOTYPES FOR TESTING ROUTINES
 */
double time00(void);
void Mjoin(PRE,symd)(enum ATLAS_UPLO, int, TYPE *, int, TYPE *, int, TYPE *, int);
TYPE  Mjoin(PRE,genorm)(int, int, TYPE *, int);
TYPE  Mjoin(PRE,synorm)(enum ATLAS_UPLO, int, TYPE *, int);
TYPE  Mjoin(PRE,henorm)(enum ATLAS_UPLO, int, TYPE *, int);
TYPE  Mjoin(PRE,trnorm)(enum ATLAS_UPLO, enum ATLAS_DIAG, int, TYPE *, int);
int Maxval3(int, int, int);
int Maxval2(int, int);

#ifndef L2SIZE
   #define L2SIZE 4194304
#endif

#ifdef DEBUG
void printmat(char *mat, int M, int N, TYPE *A, int lda)
{
   int i, j;

#ifdef TCPLX
   lda *= 2;
#endif
   printf("\n%s = \n",mat);
   for (i=0; i != M; i++)
   {
#ifdef TREAL
#ifdef DEBUG
      for (j=0; j != N; j++) printf("%.1f  ",A[i+j*lda]);
#else
      for (j=0; j != N; j++) printf("%f  ",A[i+j*lda]);
#endif
#else
#ifdef DEBUG
      for (j=0; j != N; j++) printf("(%.1f,%.1f)  ",A[2*i+j*lda], A[1+2*i+j*lda]);
#else
      for (j=0; j != N; j++) printf("(%f,%f)  ",A[2*i+j*lda], A[1+2*i+j*lda]);
#endif
#endif
      printf("\n");
   }
}
#endif

void matgen(int M, int N, TYPE *A, int lda, int seed)
{
   int i, j;
#ifdef DEBUG
   int k = 1;
#endif

#ifdef TCPLX
   M *= 2;
   lda *= 2;
#endif
   dumb_seed(seed);
   for (j=N; j; j--)
   {
#ifdef DEBUG
      for (i=0; i != M; i++) A[i] = k++;
#else
      for (i=0; i != M; i++) A[i] = dumb_rand();
#endif
      A += lda;
   }
}

int gemmcase
(
   char TA,
   char TB,
   int M,
   int N,
   int K,
   SCALAR alpha,
   TYPE * A,
   int lda,
   TYPE * B,
   int ldb,
   SCALAR beta,
   TYPE * C,
   int ldc,
   TYPE *D,
   int ldd,
   int TEST
)
{
   char *pc = "---";
#ifdef TREAL
   char *form="%4d   %c   %c %4d %4d %4d  %5.1f  %5.1f  %6.2f  %5.1f %5.2f   %3s\n";
   #define MALPH alpha
   #define MBETA beta
#else
   #define MALPH *alpha, alpha[1]
   #define MBETA *beta, beta[1]
   char *form="%4d   %c   %c %4d %4d %4d  %5.1f %5.1f  %5.1f %5.1f  %6.2f %6.1f  %4.2f   %3s\n";
#endif
   int i, j=0, passed=0;
   double t0, t1, t2, t3, mflop;
   TYPE ferr;
   extern TYPE eps;
   static int itst=1;
   int *L2=NULL, nL2=(1.3*L2SIZE)/sizeof(int);
   enum ATLAS_TRANS TAc, TBc;
   TYPE normA, normB, normC;
   extern unsigned int _mcount;

   if (nL2) L2 = malloc(nL2*sizeof(int));
   if (TA == 'n' || TA == 'N')
   {
      matgen(M, K, A, lda, K*1112);
      TAc = AtlasNoTrans;
   }
   else
   {
      matgen(K, M, A, lda, K*1112);
      if (TA == 'c' || TA == 'C') TAc = AtlasConjTrans;
      else TAc = AtlasTrans;
   }
   if (TB == 'n' || TB == 'N')
   {
      matgen(K, N, B, ldb, N*2238);
      TBc = AtlasNoTrans;
   }
   else
   {
      matgen(N, K, B, ldb, N*2238);
      if (TB == 'c' || TB == 'C') TBc = AtlasConjTrans;
      else TBc = AtlasTrans;
   }
   matgen(M, N, C, ldc, M*N);
   matgen(M, N, D, ldd, M*N);

#ifdef DEBUG
   printmat("A0", M, K, A, lda);
   printmat("B0", K, N, B, ldb);
   printmat("C0", M, N, C, ldc);
#endif

   if (L2) /* invalidate L2 cache */
   {
      for (i=0; i != nL2; i++) L2[i] = 0.0;
      for (i=0; i != nL2; i++) j += L2[i];
   }

   t0 = time00();
   trusted_gemm(TAc, TBc, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
   t1 = time00() - t0;
   if (t1 <= 0.0) mflop = t1 = 0.0;
   else   /* flop rate's actually 8MNK+12MN & 2MNK + 2MN, resp */
      #ifdef TCPLX
         mflop = ( ((8.0*M)*N)*K ) / (t1*MEGA);
      #else
         mflop = ( ((2.0*M)*N)*K ) / (t1*MEGA);
      #endif
   printf(form, itst, TA, TB, M, N, K, MALPH, MBETA, t1, mflop, 1.0, "---");

#ifdef DEBUG
   printmat("C", M, N, C, ldc);
#endif

#ifndef TIMEONLY

   if (L2)
   {
      for (i=0; i != nL2; i++) L2[i] = 0.0;  /* invalidate L2 cache */
      for (i=0; i != nL2; i++) j += L2[i];  /* invalidate L2 cache */
   }

   t0 = time00();
   test_gemm(TAc, TBc, M, N, K, alpha, A, lda, B, ldb, beta, D, ldd);
   t2 = time00() - t0;
   if (t2 <= 0.0) t2 = mflop = 0.0;
   else
      #ifdef TCPLX
         mflop = ( ((8.0*M)*N)*K ) / (t2*MEGA);
      #else
         mflop = ( ((2.0*M)*N)*K ) / (t2*MEGA);
      #endif
#ifdef DEBUG
   printmat("D", M, N, D, ldd);
#endif
   if (TEST)
   {
      Mjoin(PATL,gediff)(M,N,C,ldc,D,ldd,C,ldc);
      normC = Mjoin(PRE,genorm)(M,N,C,ldc);
      normA = Mjoin(PRE,genorm)(M,K,A,lda);
      normB = Mjoin(PRE,genorm)(K,N,B,ldb);
      if (normA > 0.0 && normB > 0.0 && SCALAR_ABS(alpha) > 0.0 )
         ferr = normC / ( normA * normB * eps * Maxval3(M,N,K) * SCALAR_ABS(alpha) );
      else
         ferr = 0.0;
      if (ferr > 1.0 ){
         passed = 0;
         pc = "NO";
         printf("ERROR: ferr is %f\n",ferr);
      }
      else
      {
         passed = 1;
         pc = "YES";
      }
   }
   if (t1 == t2) t3 = 1.0;
   else if (t2 != 0.0) t3 = t1/t2;
   else t3 = 0.0;
   printf(form, itst++, TA, TB, M, N, K, MALPH, MBETA, t2, mflop, t3, pc);
#else
   itst++;
   passed = 1;
#endif
   if (L2) free(L2);
   return(passed);
}

void gemmloop(int nTA, enum ATLAS_TRANS *TransA,
              int nTB, enum ATLAS_TRANS *TransB,
              int M0, int MN, int incM,
              int N0, int NN, int incN,
              int K0, int KN, int incK,
              int nalph, TYPE *alph,
              int nbeta, TYPE *beta,
              int TEST, int LDA_IS_M, int *gtst, int *gpass)
{
   int lda, ldb, ldc, k, m, n, im, ik, ita, itb, ia, ib;
   int itst=0, ipass=0, MSAME=0, KSAME=0;
   TYPE *A, *B, *C, *D;
   char TA, TB;
   extern char *pre;

   if (M0 == -1)
   {
      MSAME = 1;
      M0 = MN = incM = NN;
   }
   if (K0 == -1)
   {
      KSAME = 1;
      K0 = KN = incK = NN;
   }

   A = malloc(MN*KN*ATL_sizeof);
   B = malloc(NN*KN*ATL_sizeof);
   C = malloc(MN*NN*ATL_sizeof);
   D = malloc(MN*NN*ATL_sizeof);
   if (!A || !B || !C || !D)
   {
      fprintf(stderr, "Not enough memory to run tests!!\n");
      exit(-1);
   }

   printf("\n%cGEMM\n",Mupcase(pre[0]));
#ifdef TREAL
   printf("TEST  TA  TB    M    N    K  alpha   beta    Time  Mflop  SpUp  PASS\n");
   printf("====  ==  ==  ===  ===  ===  =====  =====  ======  =====  ====  ====\n\n");
#else
printf("\nTEST  TA  TB    M    N    K        alpha         beta    Time  Mflop  SpUp  PASS\n");
  printf("====  ==  ==  ===  ===  ===  ===== =====  ===== =====  ======  =====  ====  ====\n\n");
#endif
   for (im=M0; im <= MN; im += incM)
   {
      for (n=N0; n <= NN; n += incN)
      {
         if (MSAME) m = n;
         else m = im;
         for (ik=K0; ik <= KN; ik += incK)
         {
            if (KSAME) k = n;
            else k = ik;
            for (ita=0; ita != nTA; ita++)
            {
               if (TransA[ita] == AtlasNoTrans) TA = 'N';
               else if (TransA[ita] == AtlasTrans) TA = 'T';
               else if (TransA[ita] == AtlasConjTrans) TA = 'C';

               for (itb=0; itb != nTB; itb++)
               {
                  if (TransB[itb] == AtlasNoTrans) TB = 'N';
                  else if (TransB[itb] == AtlasTrans) TB = 'T';
                  else if (TransB[itb] == AtlasConjTrans) TB = 'C';
                  for (ia=0; ia != nalph; ia++)
                  {
                     for (ib=0; ib != nbeta; ib++)
                     {
                        itst++;
                        if (LDA_IS_M)
                        {
                           if (TransA[ita] == AtlasNoTrans) lda = m;
                           else lda = k;
                           if (TransB[itb] == AtlasNoTrans) ldb = k;
                           else ldb = n;
                           ldc = m;
                        }
                        else
                        {
                           if (TransA[ita] == AtlasNoTrans) lda = MN;
                           else lda = KN;
                           if (TransB[itb] == AtlasNoTrans) ldb = KN;
                           else ldb = NN;
                           ldc = MN;
                        }

#ifdef TREAL
                        ipass += gemmcase(TA, TB, m, n, k,
                                          alph[ia], A, lda,
                                          B, ldb, beta[ib],
                                          C, ldc, D, ldc,
                                          TEST);
#else
                        ipass += gemmcase(TA, TB, m, n, k,
                                          &alph[2*ia], A, lda,
                                          B, ldb, &beta[2*ib],
                                          C, ldc, D, ldc,
                                          TEST);
#endif
                     }
                  }
               }
            }
         }
      }
   }
   if (TEST) printf("\nNTEST=%d, NUMBER passed=%d, NUMBER FAILURES=%d\n",
                    itst, ipass, itst-ipass);
   else printf("\nDone with %d timing runs\n",itst);
   *gtst += itst; *gpass += ipass;
   free(A);
   free(B);
   free(C);
   if (D) free(D);
}

int syhemmcase(char *rout, char SI, char UL,
             int M, int N,
             SCALAR alpha, TYPE *A, int lda,
             TYPE *B, int ldb,  SCALAR beta,
             TYPE *C, int ldc,
             TYPE *D, int ldd,
             int TEST)
{
   char *pc = "---";
#ifdef TREAL
   char *form="%4d   %c   %c %4d %4d       %5.1f  %5.1f  %6.2f  %5.1f  %4.2f   %3s\n";
   #define MALPH alpha
   #define MBETA beta
#else
   #define MALPH *alpha, alpha[1]
   #define MBETA *beta, beta[1]
   char *form="%4d   %c   %c %4d %4d       %5.1f %5.1f  %5.1f %5.1f  %6.2f %6.1f  %4.2f   %3s\n";
   static int hetst=1;
#endif
   int i, j=0, passed=0;
   double t0, t1, t2, t3, mflop;
   TYPE ferr;
   extern TYPE eps;
   static int sytst=1;
   int *itst = NULL;
   int *pL2 = NULL, *L2 = NULL, nL2=(1.3*L2SIZE)/sizeof(int);
   enum ATLAS_SIDE SIc;
   enum ATLAS_UPLO ULc;
   TYPE normA, normB, normC;

   if (nL2) {
      pL2 = malloc(nL2*sizeof(int));
      L2 = pL2;
   }
   if (SI == 'l' || SI == 'L'){
      matgen(M, M, A, lda, M*M);
      SIc = AtlasLeft;
   } else {
      matgen(N, N, A, lda, N*N);
      SIc = AtlasRight;
   }
   if (UL == 'U' || UL == 'u') ULc = AtlasUpper;
   else ULc = AtlasLower;

   matgen(M, N, B, ldb, M*N);
   matgen(M, N, C, ldc, M*N);
   matgen(M, N, D, ldd, M*N);

#ifdef DEBUG
   if (SI == 'l' || SI == 'L') printmat("A0", M, M, A, lda);
   else printmat("A0", N, N, A, lda);
   printmat("B0", M, N, B, ldb);
   printmat("C0", M, N, C, ldc);
#endif

   if (L2) /* invalidate L2 cache */
   {
      for (i=0; i != nL2; i++) L2[i] = 0.0;
      for (i=0; i != nL2; i++) j += L2[i];
   }

#ifdef TCPLX
   if (strcasecmp(rout,blas[L3_SYMM]) == 0)
   {
#endif
      itst = &sytst;
      t0 = time00();
      trusted_symm(SIc, ULc, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
      t1 = time00() - t0;
#ifdef TCPLX
   }
   else
   {
      itst = &hetst;
      t0 = time00();
      trusted_hemm(SIc, ULc, M, N, alpha, A, lda, B, ldb, beta, C, ldc);
      t1 = time00() - t0;
   }
#endif
   if (t1 <= 0.0) mflop = t1 = 0.0;
   else if (SI == 'L')
#ifdef TCPLX
      mflop = ( ((8.0*M)*M)*N ) / (t1*MEGA);
#else
      mflop = ( ((2.0*M)*M)*N ) / (t1*MEGA);
#endif
   else
#ifdef TCPLX
      mflop = ( ((8.0*M)*N)*N ) / (t1*MEGA);
#else
      mflop = ( ((2.0*M)*N)*N ) / (t1*MEGA);
#endif
   printf(form, *itst, SI, UL, M, N, MALPH, MBETA, t1, mflop, 1.0, "---");

#ifdef DEBUG
   printmat("C", M, N, C, ldc);
#endif

#ifndef TIMEONLY

      if (L2)
      {
         for (i=0; i != nL2; i++) L2[i] = 0.0;  /* invalidate L2 cache */
         for (i=0; i != nL2; i++) j += L2[i];  /* invalidate L2 cache */
      }

#ifdef TCPLX
   if (strcasecmp(rout,blas[L3_SYMM]) == 0)
   {
#endif
      t0 = time00();
      test_symm(SIc, ULc, M, N, alpha, A, lda, B, ldb, beta, D, ldc);
      t2 = time00() - t0;
#ifdef TCPLX
   }
   else
   {
      t0 = time00();
      test_hemm(SIc, ULc, M, N, alpha, A, lda, B, ldb, beta, D, ldc);
      t2 = time00() - t0;
   }
#endif
      if (t2 <= 0.0) t2 = mflop = 0.0;
      else if (SI == 'L')
#ifdef TCPLX
         mflop = ( ((8.0*M)*M)*N ) / (t2*MEGA);
#else
         mflop = ( ((2.0*M)*M)*N ) / (t2*MEGA);
#endif
      else
#ifdef TCPLX
         mflop = ( ((8.0*M)*N)*N ) / (t2*MEGA);
#else
         mflop = ( ((2.0*M)*N)*N ) / (t2*MEGA);
#endif
#ifdef DEBUG
      printmat("D", M, N, D, ldd);
#endif
   if (TEST)
   {

      Mjoin(PATL,gediff)(M,N,C,ldc,D,ldd,C,ldc);
      normC = Mjoin(PRE,genorm)(M,N,C,ldc);
      normB = Mjoin(PRE,genorm)(M,N,B,ldb);
#ifdef TCPLX
      if (strcasecmp(rout,blas[L3_SYMM]) == 0)
      {
#endif
         if (ULc == AtlasLower)
            normA = Mjoin(PRE,synorm)(ULc, M,A,lda);
         else
            normA = Mjoin(PRE,synorm)(ULc, N,A,lda);
#ifdef TCPLX
      }
      else
      {
         if (ULc == AtlasLower)
            normA = Mjoin(PRE,henorm)(ULc, M,A,lda);
         else
            normA = Mjoin(PRE,henorm)(ULc, N,A,lda);
      }
#endif
      if (normA > 0.0 && normB > 0.0 && SCALAR_ABS(alpha) > 0.0 )
         ferr = normC / ( normA * normB * eps * Maxval2(M,N) * SCALAR_ABS(alpha) );
      else
         ferr = 0.0;
      if (ferr > 1.0 ){
         passed = 0;
         pc = "NO";
         printf("ERROR: ferr is %f\n",ferr);
      }
      else
      {
         passed = 1;
         pc = "YES";
      }
   }
   if (t1 == t2) t3 = 1.0;
   else if (t2 != 0.0) t3 = t1/t2;
   else t3 = 0.0;
   printf(form, (*itst)++, SI, UL, M, N, MALPH, MBETA, t2, mflop, t3, pc);
#else
   (*itst)++;
   passed = 1;
#endif
   if (pL2) free(pL2);
   return(passed);
}

void syhemmloop(char *rout, int nside, enum ATLAS_SIDE *Side,
              int nuplo, enum ATLAS_UPLO *Uplo,
              int M0, int MN, int incM,
              int N0, int NN, int incN,
              int nalph, TYPE *alph,
              int nbeta, TYPE *beta,
              int TEST, int LDA_IS_M, int *gtst, int *gpass)
{
   int m, n, is, iu, ia, ib, lda, ldb, ldc;
   int itst=0, ipass=0, SAME=0;
   TYPE *A, *B, *C, *D;
   char SI, UL;
   extern char *pre;

   if (M0 == -1)
   {
      SAME = 1;
      M0 = MN = incM = NN;
   }

   if (MN >= NN)
   {
      A = malloc(MN*MN*ATL_sizeof);
      B = malloc(MN*MN*ATL_sizeof);
   }
   else
   {
      A = malloc(NN*NN*ATL_sizeof);
      B = malloc(NN*NN*ATL_sizeof);
   }
   C = malloc(MN*NN*ATL_sizeof);
   D = malloc(MN*NN*ATL_sizeof);
   if (!A || !B || !C || !D)
   {
      fprintf(stderr, "Not enough memory to run tests!!\n");
      exit(-1);
   }

   if (strcasecmp(rout,blas[L3_SYMM]) == 0)
      printf("\n%cSYMM\n",Mupcase(pre[0]));
   else
      printf("\n%cHEMM\n",Mupcase(pre[0]));
#ifdef TREAL
   printf("TEST  SI  UL    M    N       alpha   beta    Time  Mflop  SpUp  PASS\n");
   printf("====  ==  ==  ===  ===       =====  =====  ======  =====  ====  ====\n\n");
#else
printf("\nTEST  SI  UP    M    N             alpha         beta    Time  Mflop  SpUp  PASS\n");
  printf("====  ==  ==  ===  ===       ===== =====  ===== =====  ======  =====  ====  ====\n\n");
#endif
   for (m=M0; m <= MN; m += incM)
   {
      for (n=N0; n <= NN; n += incN)
      {
         if (SAME) m = n;
         for (is=0; is != nside; is++)
         {
            if (Side[is] == AtlasRight) SI = 'R';
            else SI = 'L';
            for (iu=0; iu != nuplo; iu++)
            {
               if (Uplo[iu] == AtlasUpper) UL = 'U';
               else UL = 'L';
               for (ia=0; ia != nalph; ia++)
               {
                  for (ib=0; ib != nbeta; ib++)
                  {
                     itst++;
                     if (LDA_IS_M)
                     {
                        if (Side[is] == AtlasLeft) lda = m;
                        else lda = n;
                        ldb = m;
                        ldc = m;
                     }
                     else
                     {
                        if (Side[is] == AtlasLeft) lda = MN;
                        else lda = NN;
                        ldb = MN;
                        ldc = MN;
                     }
#ifdef TREAL
                     ipass += syhemmcase(rout, SI, UL, m, n,
                                       alph[ia], A, lda,
                                       B, ldb, beta[ib],
                                       C, ldc, D, ldc,
                                       TEST);
#else
                     ipass += syhemmcase(rout, SI, UL, m, n,
                                       &alph[2*ia], A, lda,
                                       B, ldb, &beta[2*ib],
                                       C, ldc, D, ldc,
                                       TEST);
#endif
                  }
               }
            }
         }
      }
   }
   if (TEST) printf("\nNTEST=%d, NUMBER passed=%d, NUMBER FAILURES=%d\n",
         itst, ipass, itst-ipass);
   else printf("\nDone with %d timing runs\n",itst);
   *gtst += itst; *gpass += ipass;
   free(A);
   free(B);
   free(C);
   if (D) free(D);
}

int syherkcase( char *rout, char UL, char TR,
              int N, int K,
              SCALAR alpha, TYPE *A, int lda,
              SCALAR beta,  TYPE *C, int ldc,
              TYPE *D, int ldd,
              int TEST)
{
   char *pc = "---";
#ifdef TREAL
   char *form="%4d   %c   %c      %4d %4d  %5.1f  %5.1f  %6.2f  %5.1f  %4.2f   %3s\n";
   #define MALPH alpha
   #define MBETA beta
#else
   #define MALPH *alpha, alpha[1]
   #define MBETA *beta, beta[1]
   char *form="%4d   %c   %c      %4d %4d  %5.1f %5.1f  %5.1f %5.1f  %6.2f %6.1f  %4.2f   %3s\n";
   static int hetst=1;
#endif
   int i, j=0, passed=0;
   double t0, t1, t2, t3, mflop;
   extern TYPE eps;
   TYPE ferr;
   static int sytst=1;
   int *itst=NULL;
   int *L2=NULL, nL2=(1.3*L2SIZE)/sizeof(int);
   enum ATLAS_UPLO ULc;
   enum ATLAS_TRANS TRc;
   TYPE *mdiff, normA, normC;

   if (nL2) L2 = malloc(nL2*sizeof(int));
   if (TR == 'N')
   {
      matgen(N, K, A, lda, K*1112);
      TRc = AtlasNoTrans;
   }
   else
   {
      matgen(K, N, A, lda, K*1112);
      if (TR == 'c' || TR == 'C') TRc = AtlasConjTrans;
      else TRc = AtlasTrans;
   }
   if (UL == 'U' || UL == 'u') ULc = AtlasUpper;
   else ULc = AtlasLower;

   matgen(N, N, C, ldc, N*N);
   matgen(N, N, D, ldc, N*N);
#ifdef DEBUG
   if (TR == 'N') printmat("A0", N, K, A, lda);
   else printmat("A0", K, N, A, lda);
   printmat("B0", N, N, C, ldc);
#endif

   if (L2) /* invalidate L2 cache */
   {
      for (i=0; i != nL2; i++) L2[i] = 0.0;
      for (i=0; i != nL2; i++) j += L2[i];
   }

#ifdef TCPLX
   if (strcasecmp(rout,blas[L3_SYRK]) == 0)
   {
#endif
      itst = &sytst;
      t0 = time00();
      trusted_syrk(ULc, TRc, N, K, alpha, A, lda, beta, C, ldc);
      t1 = time00() - t0;
#ifdef TCPLX
   }
   else
   {
      itst = &hetst;
      t0 = time00();
      trusted_herk(ULc, TRc, N, K, *alpha, A, lda, *beta, C, ldc);
      t1 = time00() - t0;
   }
#endif
   if (t1 <= 0.0) mflop = t1 = 0.0;
   else
      #ifdef TCPLX
         mflop = ( ((4.0*N)*N)*K ) / (t1*MEGA);
      #else
         mflop = ( ((1.0*N)*N)*K ) / (t1*MEGA);
      #endif
   printf(form, *itst, UL, TR, N, K, MALPH, MBETA, t1, mflop, 1.0, "---");

#ifdef DEBUG
   printmat("C",N,N,C,ldc);
#endif

#ifndef TIMEONLY

   if (L2)
   {
      for (i=0; i != nL2; i++) L2[i] = 0.0;  /* invalidate L2 cache */
      for (i=0; i != nL2; i++) j += L2[i];  /* invalidate L2 cache */
   }

#ifdef TCPLX
   if (strcasecmp(rout,blas[L3_SYRK]) == 0)
   {
#endif
      t0 = time00();
      test_syrk(ULc, TRc, N, K, alpha, A, lda, beta, D, ldd);
      t2 = time00() - t0;
#ifdef TCPLX
   }
   else
   {
      t0 = time00();
      test_herk(ULc, TRc, N, K, *alpha, A, lda, *beta, D, ldd);
      t2 = time00() - t0;
   }
#endif

   if (t2 <= 0.0) t2 = mflop = 0.0;
   else
      #ifdef TCPLX
         mflop = ( ((4.0*N)*N)*K ) / (t2*MEGA);
      #else
         mflop = ( ((1.0*N)*N)*K ) / (t2*MEGA);
      #endif
#ifdef DEBUG
   printmat("D", N, N, D, ldd);
#endif
   if (TEST)
   {

      mdiff = malloc(ATL_sizeof * N * ldc);
      assert(mdiff);
      Mjoin(PRE,symd)(ULc,N,C,ldc,D,ldd,mdiff,ldc);
      normC = Mjoin(PRE,synorm)(ULc,N,mdiff,ldc);
      free(mdiff);
      if ( TRc == AtlasNoTrans)
         normA = Mjoin(PRE,genorm)(N,K,A,lda);
      else
         normA = Mjoin(PRE,genorm)(K,N,A,lda);

      #ifdef TCPLX
         if (strcasecmp(rout,blas[L3_SYRK]) == 0)
         {
            if (normA > 0.0 && SCALAR_ABS(alpha) > 0.0 )
               ferr = normC/(normA*normA*eps*Maxval2(N,K)*SCALAR_ABS(alpha));
            else ferr = 0.0;
         }
         else
         {
            if (normA > 0.0 && Mabs(*alpha) > 0.0 )
               ferr = normC/(normA * normA * eps * Maxval2(N,K) * Mabs(*alpha));
            else ferr = 0.0;
         }
      #else
         if (normA > 0.0 && Mabs(alpha) > 0.0 )
            ferr = normC/(normA * normA * eps * Maxval2(N,K) * Mabs(alpha));
         else ferr = 0.0;
      #endif
      if (ferr > 1.0 ){
         passed = 0;
         pc = "NO";
         printf("ERROR: ferr is %f\n",ferr);
      }
      else
      {
         passed = 1;
         pc = "YES";
      }
   }
   if (t1 == t2) t3 = 1.0;
   else if (t2 != 0.0) t3 = t1/t2;
   else t3 = 0.0;
   printf(form, (*itst)++, UL, TR, N, K, MALPH, MBETA, t2, mflop, t3, pc);
#else
   (*itst)++;
   passed = 1;
#endif
   if (L2) free(L2);
   return(passed);
}

void syherkloop(char *rout, int nuplo, enum ATLAS_UPLO *Uplo,
              int ntrans, enum ATLAS_TRANS *Trans,
              int N0, int NN, int incN,
              int K0, int KN, int incK,
              int nalph, TYPE *alph,
              int nbeta, TYPE *beta,
              int TEST, int LDA_IS_M, int *gtst, int *gpass)
{
   int n, k, ik, iu, it, ia, ib, lda, ldc;
   int itst=0, ipass=0, SAME=0;
   TYPE *A, *C, *D;
   char UL, TR;
   extern char *pre;

   if (K0 == -1)
   {
      SAME = 1;
      K0 = KN = incK = NN;
   }
   if (NN >=KN)
      A = malloc(NN*NN*ATL_sizeof);
   else
      A = malloc(KN*KN*ATL_sizeof);
   C = malloc(NN*NN*ATL_sizeof);
   D = malloc(NN*NN*ATL_sizeof);
   if (!A || !C || !D)
   {
      fprintf(stderr, "Not enough memory to run tests!!\n");
      exit(-1);
   }
   if (strcasecmp(rout,blas[L3_SYRK]) == 0)
      printf("\n%cSYRK\n",Mupcase(pre[0]));
   else
      printf("\n%cHERK\n",Mupcase(pre[0]));
#ifdef TREAL
   printf("TEST  UL  TR         N    K  alpha   beta    Time  Mflop  SpUp  PASS\n");
   printf("====  ==  ==       ===  ===  =====  =====  ======  =====  ====  ====\n\n");
#else
printf("\nTEST  UL  TR         N    K        alpha         beta    Time  Mflop  SpUp  PASS\n");
  printf("====  ==  ==       ===  ===  ===== =====  ===== =====  ======  =====  ====  ====\n\n");
#endif
   for (n=N0; n <= NN; n += incN)
   {
      for (ik=K0; ik <= KN; ik += incK)
      {
         if (SAME) k = n;
         else k = ik;
         for (iu=0; iu != nuplo; iu++)
         {
            if (Uplo[iu] == AtlasUpper) UL = 'U';
            else UL = 'L';
            for (it=0; it != ntrans; it++)
            {
               if (Trans[it] == AtlasNoTrans) TR = 'N';
               else if (Trans[it] == AtlasTrans)
               {
                  if (strcasecmp(rout,blas[L3_SYRK]) == 0) TR = 'T';
                  else continue;
               }
#ifdef TCPLX
               else if (Trans[it] == AtlasConjTrans)
               {
                  if (strcasecmp(rout,blas[L3_HERK]) == 0) TR = 'C';
                  else continue;
               }
#else
               else continue;
#endif
               for (ia=0; ia != nalph; ia++)
               {
                  for (ib=0; ib != nbeta; ib++)
                  {
                     itst++;
                     if (LDA_IS_M)
                     {
                        if ( Trans[it] == AtlasNoTrans) lda = n;
                        else lda = k;
                        ldc = n;
                     }
                     else
                     {
                        if ( Trans[it] == AtlasNoTrans) lda = NN;
                        else lda = KN;
                        ldc = NN;
                     }
#ifdef TREAL
                     ipass += syherkcase(rout, UL, TR, n, k, alph[ia],
                                       A, lda, beta[ib], C, ldc, D, ldc,
                                       TEST);
#else
                     ipass += syherkcase(rout, UL, TR, n, k, &alph[2*ia],
                                       A, lda, &beta[2*ib], C, ldc, D, ldc,
                                       TEST);
#endif
                  }
               }
            }
         }
      }
   }
   if (TEST) printf("\nNTEST=%d, NUMBER passed=%d, NUMBER FAILURES=%d\n",
         itst, ipass, itst-ipass);
   else printf("\nDone with %d timing runs\n",itst);
   *gtst += itst; *gpass += ipass;
   free(A);
   free(C);
   if (D) free(D);
}
int syher2kcase( char *rout, char UL, char TR,
              int N, int K,
              SCALAR alpha, TYPE *A, int lda,
              TYPE *B, int ldb, SCALAR beta,
              TYPE *C, int ldc,
              TYPE *D, int ldd,
              int TEST)
{
   char *pc = "---";
#ifdef TREAL
   char *form="%4d   %c   %c      %4d %4d  %5.1f  %5.1f  %6.2f  %5.1f  %4.2f   %3s\n";
   #define MALPH alpha
   #define MBETA beta
#else
   #define MALPH *alpha, alpha[1]
   #define MBETA *beta, beta[1]
   char *form="%4d   %c   %c      %4d %4d  %5.1f %5.1f  %5.1f %5.1f  %6.2f %6.1f  %4.2f   %3s\n";
   static int hetst=1;
#endif
   int i, j=0, passed=0;
   double t0, t1, t2, t3, mflop;
   extern TYPE eps;
   TYPE ferr;
   static int sytst=1;
   int *itst;
   int *L2=NULL, nL2=(1.3*L2SIZE)/sizeof(int);
   enum ATLAS_TRANS TRc;
   enum ATLAS_UPLO ULc;
   TYPE normA, normB, normC, *mdiff;

   if (nL2) L2 = malloc(nL2*sizeof(int));
   if (TR == 'N')
   {
      matgen(N, K, A, lda, K*1112);
      matgen(N, K, B, ldb, K*1112);
      TRc = AtlasNoTrans;
   }
   else
   {
      matgen(K, N, A, lda, K*1112);
      matgen(K, N, B, ldb, K*1112);
      if (TR == 'c' || TR == 'C') TRc = AtlasConjTrans;
      else TRc = AtlasTrans;
   }
   if (UL == 'U' || UL == 'u') ULc = AtlasUpper;
   else ULc = AtlasLower;
   matgen(N, N, C, ldc, N*N);
   matgen(N, N, D, ldd, N*N);

#ifdef DEBUG
   if (TR == 'N'){ printmat("A0", N, K, A, lda); printmat("B0",N,K,B,ldb); }
   else { printmat("A0", K, N, A, lda); printmat("B0",K,N,B,ldb); }
   printmat("C0", N, N, C, ldc);
#endif

   if (L2) /* invalidate L2 cache */
   {
      for (i=0; i != nL2; i++) L2[i] = 0.0;
      for (i=0; i != nL2; i++) j += L2[i];
   }

#ifdef TCPLX
   if (strcasecmp(rout,blas[L3_SYR2K]) == 0)
   {
#endif
      itst = &sytst;
      t0 = time00();
      trusted_syr2k(ULc, TRc, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
      t1 = time00() - t0;
#ifdef TCPLX
   }
   else
   {
      itst = &hetst;
      t0 = time00();
      trusted_her2k(ULc, TRc, N, K, alpha, A, lda, B, ldb, *beta, C, ldc);
      t1 = time00() - t0;
   }
#endif
   if (t1 <= 0.0) mflop = t1 = 0.0;
   else
      #ifdef TCPLX
         mflop = ( ((8.0*N)*N)*K ) / (t1*MEGA);
      #else
         mflop = ( ((2.0*N)*N)*K ) / (t1*MEGA);
      #endif
   printf(form, *itst, UL, TR, N, K, MALPH, MBETA, t1, mflop, 1.0, "---");
#ifdef DEBUG
   printmat("C",N,N,C,ldc);
#endif

#ifndef TIMEONLY

   if (L2)
   {
      for (i=0; i != nL2; i++) L2[i] = 0.0;  /* invalidate L2 cache */
      for (i=0; i != nL2; i++) j += L2[i];  /* invalidate L2 cache */
   }

#ifdef TCPLX
   if (strcasecmp(rout,blas[L3_SYR2K]) == 0)
   {
#endif
      t0 = time00();
      test_syr2k(ULc, TRc, N, K, alpha, A, lda, B, ldb, beta, D, ldd);
      t2 = time00() - t0;
#ifdef TCPLX
   }
   else
   {
      t0 = time00();
      test_her2k(ULc, TRc, N, K, alpha, A, lda, B, ldb, *beta, D, ldd);
      t2 = time00() - t0;
   }
#endif

   if (t2 <= 0.0) t2 = mflop = 0.0;
   else
      #ifdef TCPLX
         mflop = ( ((8.0*N)*N)*K ) / (t2*MEGA);
      #else
         mflop = ( ((2.0*N)*N)*K ) / (t2*MEGA);
      #endif
#ifdef DEBUG
   printmat("D", N, N, D, ldd);
#endif
   if (TEST)
   {
      mdiff = malloc(ATL_sizeof * N * ldc);
      assert(mdiff);
      Mjoin(PRE,symd)(ULc,N,C,ldc,D,ldd,mdiff,ldc);
      normC = Mjoin(PRE,synorm)(ULc,N,mdiff,ldc);
      free(mdiff);
      if ( TRc == AtlasNoTrans){
         normA = Mjoin(PRE,genorm)(N,K,A,lda);
         normB = Mjoin(PRE,genorm)(N,K,B,ldb);
      }
      else
      {
         normA = Mjoin(PRE,genorm)(K,N,A,lda);
         normB = Mjoin(PRE,genorm)(K,N,B,ldb);
      }

      if (normA > 0.0 && normB > 0.0 && SCALAR_ABS(alpha) > 0.0 )
         ferr = normC / ( normA * normB * eps * Maxval2(N,K) * SCALAR_ABS(alpha) );
      else
         ferr = 0.0;
      if (ferr > 1.0 ){
         passed = 0;
         pc = "NO";
         printf("ERROR: ferr is %f\n",ferr);
      }
      else
      {
         passed = 1;
         pc = "YES";
      }
   }
   if (t1 == t2) t3 = 1.0;
   else if (t2 != 0.0) t3 = t1/t2;
   else t3 = 0.0;
   printf(form, (*itst)++, UL, TR, N, K, MALPH, MBETA, t2, mflop, t3, pc);
#else
   (*itst)++;
   passed = 1;
#endif
   if (L2) free(L2);
   return(passed);
}
void syher2kloop(char *rout, int nuplo, enum ATLAS_UPLO *Uplo,
              int ntrans, enum ATLAS_TRANS *Trans,
              int N0, int NN, int incN,
              int K0, int KN, int incK,
              int nalph, TYPE *alph,
              int nbeta, TYPE *beta,
              int TEST, int LDA_IS_M, int *gtst, int *gpass)
{
   int n, k, ik, iu, it, ia, ib, lda, ldb, ldc;
   int itst=0, ipass=0, SAME=0;
   TYPE *A, *B, *C, *D;
   char UL, TR;
   extern char *pre;

   if (K0 == -1)
   {
      SAME = 1;
      K0 = KN = incK = NN;
   }
   if (NN >= KN)
   {
      A = malloc(NN*NN*ATL_sizeof);
      B = malloc(NN*NN*ATL_sizeof);
   }
   else
   {
      A = malloc(KN*KN*ATL_sizeof);
      B = malloc(KN*KN*ATL_sizeof);
   }
   C = malloc(NN*NN*ATL_sizeof);
   D = malloc(NN*NN*ATL_sizeof);
   if (!A || !B || !C || !D)
   {
      fprintf(stderr, "Not enough memory to run tests!!\n");
      exit(-1);
   }
   #ifdef TCPLX
      if (strcasecmp(rout,blas[L3_SYR2K]) == 0) printf("\n%cSYR2K\n",Mupcase(pre[0]));
      else printf("\n%cHER2K\n",Mupcase(pre[0]));
   #else
      printf("\n%cSYR2K\n",Mupcase(pre[0]));
   #endif
#ifdef TREAL
   printf("TEST  UL  TR         N    K  alpha   beta    Time  Mflop  SpUp  PASS\n");
   printf("====  ==  ==       ===  ===  =====  =====  ======  =====  ====  ====\n\n");
#else
printf("\nTEST  UL  TR         N    K        alpha         beta    Time  Mflop  SpUp  PASS\n");
  printf("====  ==  ==       ===  ===  ===== =====  ===== =====  ======  =====  ====  ====\n\n");
#endif
   for (n=N0; n <= NN; n += incN)
   {
      for (ik=K0; ik <= KN; ik += incK)
      {
         if (SAME) k = n;
         else k = ik;
         for (iu=0; iu != nuplo; iu++)
         {
            if (Uplo[iu] == AtlasUpper) UL = 'U';
            else UL = 'L';
            for (it=0; it != ntrans; it++)
            {
               if (Trans[it] == AtlasNoTrans) TR = 'N';
               else if (Trans[it] == AtlasTrans)
               {
                  if (strcasecmp(rout,blas[L3_SYR2K]) == 0) TR = 'T';
                  else continue;
               }
#ifdef TCPLX
               else if (Trans[it] == AtlasConjTrans)
               {
                  if (strcasecmp(rout,blas[L3_HER2K]) == 0) TR = 'C';
                  else continue;
               }
#else
               else continue;
#endif
               for (ia=0; ia != nalph; ia++)
               {
                  for (ib=0; ib != nbeta; ib++)
                  {
                     itst++;
                     if (LDA_IS_M)
                     {
                        if ( Trans[it] == AtlasNoTrans)
                        {
                           lda = n;
                           ldb = n;
                        }
                        else
                        {
                           lda = k;
                           ldb = k;
                        }
                        ldc = n;
                     }
                     else
                     {
                        if ( Trans[it] == AtlasNoTrans)
                        {
                           lda = NN;
                           ldb = NN;
                        }
                        else
                        {
                           lda = KN;
                           ldb = KN;
                        }
                        ldc = NN;
                     }
#ifdef TREAL
                     ipass += syher2kcase(rout, UL, TR, n, k, alph[ia],
                                       A, lda, B,ldb, beta[ib],
                                       C, ldc, D, ldc,
                                       TEST);
#else
                     ipass += syher2kcase(rout, UL, TR, n, k, &alph[2*ia],
                                       A, lda, B, ldb, &beta[2*ib],
                                       C, ldc, D, ldc,
                                       TEST);
#endif
                  }
               }
            }
         }
      }
   }
   if (TEST) printf("\nNTEST=%d, NUMBER passed=%d, NUMBER FAILURES=%d\n",
         itst, ipass, itst-ipass);
   else printf("\nDone with %d timing runs\n",itst);
   *gtst += itst; *gpass += ipass;
   free(A);
   free(B);
   free(C);
   if (D) free(D);
}
int trcase(char *rout,
             char SI, char UL, char TR, char DI,
             int M, int N,
             SCALAR alpha, TYPE *A, int lda,
             TYPE *B, int ldb,
             TYPE *C, int ldc,
             int TEST)
{
   char *pc = "---";
#ifdef TREAL
   char *form="%4d   %c  %c  %c  %c %4d %4d  %5.1f         %6.2f %6.1f  %4.2f   %3s\n";
   #define MALPH alpha
   #define MBETA beta
#else
   #define MALPH *alpha, alpha[1]
   #define MBETA *beta, beta[1]
   char *form="%4d   %c  %c  %c  %c %4d %4d  %5.1f %5.1f               %6.2f %6.1f  %4.2f   %3s\n";
#endif
   int i, j=0, passed=0;
   double t0, t1, t2, t3, mflop;
   TYPE ferr;
   extern TYPE eps;
   static int mmtst=1;
   static int smtst=1;
   int *itst;
   int *pL2 = NULL, *L2=NULL, nL2=(1.3*L2SIZE)/sizeof(int);
   enum ATLAS_SIDE SIc;
   enum ATLAS_UPLO ULc;
   enum ATLAS_TRANS TRc;
   enum ATLAS_DIAG DIc;
   TYPE normA, normB, normC, *cB;
#ifdef TREAL
   SCALAR alpha1 = 1.0;
#else
   TYPE alpha1[2] = {1.0,0.0};
#endif

   cB = malloc(ATL_sizeof * N * ldb);
   assert(cB);

   if (nL2) {
      pL2 = malloc(nL2*sizeof(int));
      L2 = pL2;
   }
   if (SI == 'l' || SI == 'L'){
      matgen(M, M, A, lda, M*M);
      if (strcasecmp(rout,blas[L3_TRSM]) == 0)
         for (i=0; i != M; i++)
            #ifdef TREAL
               A[i*(lda+1)] += M;
            #else
               {  A[2*i*(lda+1)] += M; A[2*i*(lda+1)+1] += M; }
            #endif
      SIc = AtlasLeft;
   } else {
      matgen(N, N, A, lda, N*N);
      if (strcasecmp(rout,blas[L3_TRSM]) == 0)
         for (i=0; i != N; i++)
            #ifdef TREAL
               A[i*(lda+1)] += N;
            #else
               {  A[2*i*(lda+1)] += N; A[2*i*(lda+1)+1] += N; }
            #endif
      SIc = AtlasRight;
   }

   if (UL == 'U' || UL == 'u') ULc = AtlasUpper;
   else ULc = AtlasLower;

   if (TR == 'N' || TR == 'n') TRc = AtlasNoTrans;
   else if (TR == 'T' || TR == 't') TRc = AtlasTrans;
   else TRc = AtlasConjTrans;

   if (DI == 'U' || DI == 'u') DIc = AtlasUnit;
   else DIc = AtlasNonUnit;

   matgen(M, N, B, ldb, M*N);
   bcopy(B,C,ATL_sizeof*N*ldb);
   bcopy(B,cB,ATL_sizeof*N*ldb);

#ifdef DEBUG
   if (SI == 'l' || SI == 'L') printmat("A0", M, M, A, lda);
   else printmat("A0", N, N, A, lda);
   printmat("B0", M, N, B, ldb);
#endif

   if (L2) /* invalidate L2 cache */
   {
      for (i=0; i != nL2; i++) L2[i] = 0.0;
      for (i=0; i != nL2; i++) j += L2[i];
   }

   if (strcasecmp(rout,blas[L3_TRMM]) == 0)
   {
      itst = &mmtst;
      t0 = time00();
      trusted_trmm(SIc, ULc, TRc, DIc, M, N, alpha, A, lda, B, ldb);
      t1 = time00() - t0;
   }
   else
   {
      itst = &smtst;
      t0 = time00();
      trusted_trsm(SIc, ULc, TRc, DIc, M, N, alpha, A, lda, B, ldb);
      t1 = time00() - t0;
   }

   if (t1 <= 0.0) mflop = t1 = 0.0;
   else if (SI == 'L')
   {
      #ifdef TCPLX
         mflop = ( ((4.0*M)*M)*N ) / (t1*MEGA);
      #else
         mflop = ( ((1.0*M)*M)*N ) / (t1*MEGA);
      #endif
   }
   else
   {
      #ifdef TCPLX
         mflop = ( ((4.0*M)*N)*N ) / (t1*MEGA);
      #else
         mflop = ( ((1.0*M)*N)*N ) / (t1*MEGA);
      #endif
   }
   printf(form, *itst, SI, UL, TR, DI, M, N, MALPH, t1, mflop, 1.0, "---");

#ifdef DEBUG
   printmat("B", M, N, B, ldb);
#endif

#ifndef TIMEONLY
   if (L2)
   {
      for (i=0; i != nL2; i++) L2[i] = 0.0;  /* invalidate L2 cache */
      for (i=0; i != nL2; i++) j += L2[i];  /* invalidate L2 cache */
   }

   if (strcasecmp(rout,blas[L3_TRMM]) == 0)
   {
      t0 = time00();
      test_trmm(SIc, ULc, TRc, DIc, M, N, alpha, A, lda, C, ldc);
      t2 = time00() - t0;
   }
   else
   {
      t0 = time00();
      test_trsm(SIc, ULc, TRc, DIc, M, N, alpha, A, lda, C, ldc);
      t2 = time00() - t0;
   }

   if (t2 <= 0.0) t2 = mflop = 0.0;
   else if (SI == 'L')
   {
      #ifdef TCPLX
         mflop = ( ((4.0*M)*M)*N ) / (t2*MEGA);
      #else
         mflop = ( ((1.0*M)*M)*N ) / (t2*MEGA);
      #endif
   }
   else
   {
      #ifdef TCPLX
         mflop = ( ((4.0*M)*N)*N ) / (t2*MEGA);
      #else
         mflop = ( ((1.0*M)*N)*N ) / (t2*MEGA);
      #endif
   }
#ifdef DEBUG
   printmat("C", M, N, C, ldc);
#endif
   if (TEST)
   {
      if (strcasecmp(rout,blas[L3_TRMM]) == 0)
      {
         Mjoin(PATL,gediff)(M,N,B,ldb,C,ldc,B,ldb);
         normC = Mjoin(PRE,genorm)(M,N,B,ldb);
         if (SIc == AtlasLeft)
            normA = Mjoin(PRE,trnorm)(ULc, DIc, M,A,lda);
         else
            normA = Mjoin(PRE,trnorm)(ULc, DIc, N,A,lda);
         normB = Mjoin(PRE,genorm)(M,N,cB,ldb);
      }
      else
      {
         normB = Mjoin(PRE,genorm)(M,N,B,ldb);
         trusted_trmm(SIc, ULc, TRc, DIc, M, N, alpha1, A, lda, C, ldc);
         if (SIc == AtlasLeft)
            normA = Mjoin(PRE,trnorm)(ULc,DIc,M,A,lda);
         else
            normA = Mjoin(PRE,trnorm)(ULc,DIc,N,A,lda);
         Mjoin(PRE,gemscal)(M,N,cB,ldb,alpha);
         Mjoin(PATL,gediff)(M,N,cB,ldb,C,ldc,cB,ldb);
         normC = Mjoin(PRE,genorm)(M,N,cB,ldb);
      }
      if (normA > 0.0 && normB > 0.0 && SCALAR_ABS(alpha) > 0.0 )
         ferr = normC / ( normA * normB * eps * Maxval2(M,N) * SCALAR_ABS(alpha) );
      else
         ferr = 0.0;
      if (ferr > 1.0 ){
         passed = 0;
         pc = "NO";
         printf("ERROR: ferr is %f\n",ferr);
      }
      else
      {
         passed = 1;
         pc = "YES";
      }
   }
   if (t1 == t2) t3 = 1.0;
   else if (t2 != 0.0) t3 = t1/t2;
   else t3 = 0.0;
   printf(form, (*itst)++, SI, UL, TR, DI, M, N, MALPH, t2, mflop, t3, pc);
#else
   (*itst)++;
   passed = 1;
#endif
   free(cB);
   if (pL2) free(pL2);
   return(passed);
}
void trloop(char *rout,
            int nside, enum ATLAS_SIDE *Side,
            int nuplo, enum ATLAS_UPLO *Uplo,
            int ntrans, enum ATLAS_TRANS *Trans,
            int ndiag, enum ATLAS_DIAG *Diag,
            int M0, int MN, int incM,
            int N0, int NN, int incN,
            int nalph, TYPE *alph,
            int TEST, int LDA_IS_M, int *gtst, int *gpass)
{
   int m, n, is, iu, it, id, ia, lda, ldb;
   int itst=0, ipass=0, SAME=0;
   TYPE *A, *B, *C;
   char SI, UL, TR, DI;
   extern char *pre;

   if (M0 == -1)
   {
      SAME = 1;
      M0 = MN = incM = NN;
   }

   if (MN >= NN)
   {
      A = malloc(MN*MN*ATL_sizeof);
      B = malloc(MN*MN*ATL_sizeof);
      C = malloc(MN*MN*ATL_sizeof);
   }
   else
   {
      A = malloc(NN*NN*ATL_sizeof);
      B = malloc(NN*NN*ATL_sizeof);
      C = malloc(NN*MN*ATL_sizeof);
   }
   if (!A || !B || !C)
   {
      fprintf(stderr, "Not enough memory to run tests!!\n");
      exit(-1);
   }

   if (strcasecmp(rout,blas[L3_TRMM]) == 0)
      printf("\n%cTRMM\n",Mupcase(pre[0]));
   else
      printf("\n%cTRSM\n",Mupcase(pre[0]));
#ifdef TREAL
   printf("TEST  SI UL TA DI    M    N  alpha           Time  Mflop  SpUp  PASS\n");
   printf("====  == == == ==  ===  ===  =====         ======  =====  ====  ====\n\n");
#else
printf("\nTEST  SI UL TA DI    M    N        alpha                 Time  Mflop  SpUp  PASS\n");
  printf("====  == == == ==  ===  ===  ===== =====               ======  =====  ====  ====\n\n");
#endif
   for (m=M0; m <= MN; m += incM)
   {
      for (n=N0; n <= NN; n += incN)
      {
         if (SAME) m = n;
         for (is=0; is != nside; is++)
         {
            if (Side[is] == AtlasRight) SI = 'R';
            else SI = 'L';
            for (iu=0; iu != nuplo; iu++)
            {
               if (Uplo[iu] == AtlasUpper) UL = 'U';
               else UL = 'L';
               for (it=0; it != ntrans; it++)
               {
                  if (Trans[it] == AtlasNoTrans) TR = 'N';
                  else if (Trans[it] == AtlasTrans) TR = 'T';
                  else if (Trans[it] == AtlasConjTrans) TR = 'C';
                  for (id=0; id != ndiag; id++)
                  {
                     if (Diag[id] == AtlasNonUnit) DI = 'N';
                     else DI = 'U';
                     for (ia=0; ia != nalph; ia++)
                     {
                        itst++;
                        if (LDA_IS_M)
                        {
                           if (Side[is] == AtlasLeft) lda = m;
                           else lda = n;
                           ldb = m;
                        }
                        else
                        {
                           if (Side[is] == AtlasLeft) lda = MN;
                           else lda = NN;
                           ldb = MN;
                        }
#ifdef TREAL
                        ipass += trcase(rout, SI, UL, TR, DI,
                              m, n, alph[ia], A, lda,
                              B, ldb, C, ldb,
                              TEST);
#else
                        ipass += trcase(rout, SI, UL, TR, DI,
                              m, n, &alph[2*ia], A, lda,
                              B, ldb, C, ldb,
                              TEST);
#endif
                     }
                  }
               }
            }
         }
      }
   }
   if (TEST) printf("\nNTEST=%d, NUMBER passed=%d, NUMBER FAILURES=%d\n",
         itst, ipass, itst-ipass);
   else printf("\nDone with %d timing runs\n",itst);
   *gtst += itst; *gpass += ipass;
   free(A);
   free(B);
   free(C);
}

void PrintUsage(char *nam)
{
   fprintf(stderr, "USAGE: %s -R <rout> -Side <nsides> L/R -Uplo <nuplo> L/U -Atrans <ntrans> n/t/c -Btrans <ntrans> n/t/c -Diag <ndiags> N/U -M <m1> <mN> <minc> -N <n1> <nN> <ninc> -K <k1> <kN> <kinc> -n <n> -m <m> -k <k> -a <nalphas> <alpha1> ... <alphaN> -b <nbetas> <beta1> ... <betaN> -Test <0/1>\n", nam);
   exit(-1);
}

void GetFlags(int nargs, char *args[], int *TEST, int *nside,
              enum ATLAS_SIDE **Side, int *nuplo, enum ATLAS_UPLO **Uplo,
              int *nta, enum ATLAS_TRANS **TransA,
              int *ntb, enum ATLAS_TRANS **TransB, int *ndiag,
              enum ATLAS_DIAG **Diag, int *M0, int *MN, int *Minc,
              int *N0, int *NN, int *Ninc, int *K0, int *KN, int *Kinc,
              int *nalphas, TYPE **alphas, int *nbetas, TYPE **betas,
              int *nrouts, char ***routs, int *LDA_IS_M)

{
   char ch;
   int i=1, j, k,l=0;
/*
 * Set up defaults
 */
   *TEST = 1;
   *M0 = *N0 = *K0 = -1;
   *nrouts = *nuplo = *nta = *ntb = *nside = *ndiag = *nalphas = *nbetas = -1;
   *LDA_IS_M = 0;

   for (i=1; i < nargs ; )
   {
      if ( args[i][0] != '-' ) PrintUsage(args[0]);
      switch(args[i++][1])
      {
      case 'S':
         if (args[i] == NULL) PrintUsage(args[0]);
         *nside = atoi(args[i++]);
         if (*nside <= 0) PrintUsage(args[0]);
         *Side = malloc(*nside * sizeof(int));
         assert(*Side);
         for (j=0; j != *nside; j++)
         {
            if (args[i] == NULL) PrintUsage(args[0]);
            ch = *args[i++];
            if (ch == 'r' || ch == 'R') (*Side)[j] = AtlasRight;
            else if (ch == 'l' || ch == 'L') (*Side)[j] = AtlasLeft;
            else PrintUsage(args[0]);
         }
         break;
      case 'U':
         if (args[i] == NULL) PrintUsage(args[0]);
         *nuplo = atoi(args[i++]);
         if (*nuplo <= 0) PrintUsage(args[0]);
         *Uplo = malloc(*nuplo * sizeof(int));
         assert(*Uplo);
         for (j=0; j != *nuplo; j++)
         {
            if (args[i] == NULL) PrintUsage(args[0]);
            ch = *args[i++];
            if (ch == 'u' || ch == 'U') (*Uplo)[j] = AtlasUpper;
            else if (ch == 'l' || ch == 'L') (*Uplo)[j] = AtlasLower;
            else PrintUsage(args[0]);
         }
         break;
      case 'D':
         if (args[i] == NULL) PrintUsage(args[0]);
         *ndiag = atoi(args[i++]);
         if (*ndiag <= 0) PrintUsage(args[0]);
         *Diag = malloc(*ndiag * sizeof(int));
         assert(*Diag);
         for (j=0; j != *ndiag; j++)
         {
            if (args[i] == NULL) PrintUsage(args[0]);
            ch = *args[i++];
            if (ch == 'u' || ch == 'U') (*Diag)[j] = AtlasUnit;
            else if (ch == 'n' || ch == 'N') (*Diag)[j] = AtlasNonUnit;
            else PrintUsage(args[0]);
         }
         break;
      case 'A':
         if (args[i] == NULL) PrintUsage(args[0]);
         *nta   = atoi(args[i++]);
         if (*nta <= 0) PrintUsage(args[0]);
         *TransA = malloc(*nta * sizeof(int));
         assert(*TransA);
         for (j=0; j != *nta; j++)
         {
            if (args[i] == NULL) PrintUsage(args[0]);
            ch = *args[i++];
            if (ch == 'n' || ch == 'N') (*TransA)[j] = AtlasNoTrans;
            else if (ch == 't' || ch == 'T') (*TransA)[j] = AtlasTrans;
            else if (ch == 'c' || ch == 'C') (*TransA)[j] = AtlasConjTrans;
            else PrintUsage(args[0]);
         }
         break;
      case 'B':
         if (args[i] == NULL) PrintUsage(args[0]);
         *ntb   = atoi(args[i++]);
         if (*ntb <= 0) PrintUsage(args[0]);
         *TransB = malloc(*ntb * sizeof(int));
         assert(*TransB);
         for (j=0; j != *ntb; j++)
         {
            if (args[i] == NULL) PrintUsage(args[0]);
            ch = *args[i++];
            if (ch == 'n' || ch == 'N') (*TransB)[j] = AtlasNoTrans;
            else if (ch == 't' || ch == 'T') (*TransB)[j] = AtlasTrans;
            else if (ch == 'c' || ch == 'C') (*TransB)[j] = AtlasConjTrans;
            else PrintUsage(args[0]);
         }
         break;
      case 'M':
         if (args[i] == NULL) PrintUsage(args[0]);
         *M0 = atoi(args[i++]);
         if (*M0 < 0) PrintUsage(args[0]);
         if (args[i] == NULL) PrintUsage(args[0]);
         *MN = atoi(args[i++]);
         if (*MN < 0) PrintUsage(args[0]);
         if (args[i] == NULL) PrintUsage(args[0]);
         *Minc = atoi(args[i++]);
         if (*Minc <= 0) PrintUsage(args[0]);
         break;
      case 'N':
         if (args[i] == NULL) PrintUsage(args[0]);
         *N0 = atoi(args[i++]);
         if (*N0 < 0) PrintUsage(args[0]);
         if (args[i] == NULL) PrintUsage(args[0]);
         *NN = atoi(args[i++]);
         if (*NN < 0) PrintUsage(args[0]);
         if (args[i] == NULL) PrintUsage(args[0]);
         *Ninc = atoi(args[i++]);
         if (*Ninc <= 0) PrintUsage(args[0]);
         break;
      case 'K':
         if (args[i] == NULL) PrintUsage(args[0]);
         *K0 = atoi(args[i++]);
         if (*K0 < 0) PrintUsage(args[0]);
         if (args[i] == NULL) PrintUsage(args[0]);
         *KN = atoi(args[i++]);
         if (*KN < 0) PrintUsage(args[0]);
         if (args[i] == NULL) PrintUsage(args[0]);
         *Kinc = atoi(args[i++]);
         if (*Kinc <= 0) PrintUsage(args[0]);
         break;
      case 'T':
         if (args[i] == NULL) PrintUsage(args[0]);
         *TEST = atoi(args[i++]);
         break;
      case 'R':
         if (args[i] == NULL) PrintUsage(args[0]);
         if (isdigit(*args[i]))
         {
            *nrouts = atoi(args[i++]);
            *routs = malloc( (*nrouts) * sizeof(char *) );
            assert(*routs);
         }
         else
         {
            *nrouts = 1;
            *routs = malloc( sizeof(char *) );
            assert(*routs);
         }
         for (j=0; j < *nrouts; j++)
         {
            if (args[i] == NULL) PrintUsage(args[0]);
            for (k=0; k < L3_nrouts; k++)
            {
               if (strcasecmp(args[i],blas[k]) == 0)
               {
                  l = 1;
                  (*routs)[j] = blas[k];
                  break;
               }
            }
            if (!l)
            {
               if (strcasecmp(args[i],"all") == 0 && *nrouts == 1)
               {
                  free(*routs);
                  *nrouts = L3_nrouts;
                  *routs = malloc( (*nrouts) * sizeof(char *) );
                  L3_copyrouts(*routs);
                  i++;
                  break;
               }
               else PrintUsage(args[0]);
            }
            l = 0;
            i++;
         }
         break;
      case 'm':
         if (args[i] == NULL) PrintUsage(args[0]);
         *M0 = *MN = atoi(args[i++]);
         *Minc = 1;
         if (*M0 <= 0) PrintUsage(args[0]);
         break;
      case 'n':
         if (args[i] == NULL) PrintUsage(args[0]);
         *N0 = *NN = atoi(args[i++]);
         *Ninc = 1;
         if (*N0 < 0) PrintUsage(args[0]);
         break;
      case 'k':
         if (args[i] == NULL) PrintUsage(args[0]);
         *K0 = *KN = atoi(args[i++]);
         *Kinc = 1;
         if (*K0 <= 0) PrintUsage(args[0]);
         break;
      case 'a':
         if (args[i] == NULL) PrintUsage(args[0]);
         *nalphas = atoi(args[i++]);
         if (*nalphas <= 0)  PrintUsage(args[0]);
         *alphas = malloc(ATL_MulBySize(*nalphas ));
         assert(*alphas);
         for (j=0; j < *nalphas SHIFT; j++)
         {
            if (args[i] == NULL) PrintUsage(args[0]);
            (*alphas)[j] = atof(args[i++]);
         }
         break;
      case 'b':
         if (args[i] == NULL) PrintUsage(args[0]);
         *nbetas  = atoi(args[i++]);
         if (*nbetas <= 0)  PrintUsage(args[0]);
         *betas  = malloc(ATL_MulBySize(*nbetas ));
         assert(*betas );
         for (j=0; j < *nbetas SHIFT; j++)
         {
            if (args[i] == NULL) PrintUsage(args[0]);
            (*betas)[j] = atof(args[i++]);
         }
         break;
      case 'd':
         *LDA_IS_M  = atoi(args[i++]);
         break;
      default:
         PrintUsage(args[0]);
         break;
      }
   }
/*
 * Finish setting up defaults if the user has not selected
 */
   if (*N0 == -1)
   {
      *N0 = 100;
      *NN = 1000;
      *Ninc = 100;
   }
   if (*nside == -1)
   {
      *nside = 1;
      *Side = malloc(sizeof(int));
      assert(*Side);
      **Side = AtlasLeft;
   }
   if (*nuplo == -1)
   {
      *nuplo = 1;
      *Uplo = malloc(sizeof(int));
      assert(*Uplo);
      **Uplo = AtlasLower;
   }
   if (*nta == -1)
   {
      *nta = 1;
      *TransA = malloc(sizeof(int));
      assert(*TransA);
      **TransA = AtlasNoTrans;
   }
   if (*ntb == -1)
   {
      *ntb = 1;
      *TransB = malloc(sizeof(int));
      assert(*TransB);
      **TransB = AtlasNoTrans;
   }
   if (*ndiag == -1)
   {
      *ndiag = 1;
      *Diag = malloc(sizeof(int));
      assert(*Diag);
      **Diag = AtlasNonUnit;
   }
   if (*nalphas == -1)
   {
      *nalphas = 1;
      *alphas = malloc(ATL_MulBySize(1));
      assert(*alphas);
      #ifdef TREAL
         **alphas = 1.0;
      #else
         **alphas = 1.0;
         (*alphas)[1] = 0.0;
      #endif
   }
   if (*nbetas  == -1)
   {
      *nbetas  = 1;
      *betas  = malloc(ATL_MulBySize(1));
      assert(*betas );
      #ifdef TREAL
         **betas  = 1.0;
      #else
         **betas  = 1.0;
         (*betas)[1] = 0.0;
      #endif
   }
   if (*nrouts == -1)
   {
     *nrouts = 1;
     *routs = malloc( sizeof(char *) );
     assert(*routs);
     (*routs)[0] = blas[L3_GEMM];
   }
}

int main(int nargs, char *args[])
/*
 *  tst <tst> <# TA> <TA's> <# TB's> <TB's> <M0> <MN> <incM> <N0> <NN> <incN>
 *      <K0> <KN> <incK> <# alphas> <alphas> <# betas> <betas>
 *
 */
{
   int i=0, M0=-1, MN=-1, incM=-1, N0=-1, NN=-1, incN=-1, K0=-1, KN=-1;
   int incK=-1, gtst=0, gpass=0, nalph=-1, nbeta=-1, ndiag=-1, nuplo=-1;
   int nside=-1, nrouts=-1, nTA=-1, nTB=-1, TEST=1, LDA_IS_M=1;
   TYPE *alph, *beta;
   char **rout;
   enum ATLAS_SIDE *Side;
   enum ATLAS_UPLO *Uplo;
   enum ATLAS_TRANS *TransA, *TransB;
   enum ATLAS_DIAG *Diag;
   extern TYPE eps;
#if 1
   eps = Mjoin(PATL,epsilon)();
#else
   eps = EPS;
#endif
#ifdef DEBUG
   printf("epsilon=%e\n",eps);
#endif

   GetFlags(nargs, args, &TEST, &nside, &Side, &nuplo, &Uplo,
            &nTA, &TransA, &nTB, &TransB, &ndiag, &Diag,
            &M0, &MN, &incM, &N0, &NN, &incN, &K0, &KN, &incK,
            &nalph, &alph, &nbeta, &beta, &nrouts, &rout, &LDA_IS_M);

   for (i=0; i < nrouts; i++)
   {
      if (strcasecmp(rout[i],blas[L3_GEMM]) == 0)      /* GEMM */
         gemmloop(nTA,TransA, nTB,TransB,
                  M0,MN,incM, N0,NN,incN, K0,KN,incK,
                  nalph,alph, nbeta,beta,
                  TEST,LDA_IS_M,&gtst,&gpass);
      else if (strcasecmp(rout[i],blas[L3_SYMM]) == 0) /* SYMM */
         syhemmloop(blas[L3_SYMM],nside,Side, nuplo,Uplo,
                  M0,MN,incM, N0,NN,incN,
                  nalph,alph, nbeta,beta,
                  TEST,LDA_IS_M,&gtst,&gpass);
     #ifdef TCPLX
      else if (strcasecmp(rout[i],blas[L3_HEMM]) == 0) /* HEMM */
         syhemmloop(blas[L3_HEMM],nside,Side, nuplo,Uplo,
                  M0,MN,incM, N0,NN,incN,
                  nalph,alph, nbeta,beta,
                  TEST,LDA_IS_M,&gtst,&gpass);
     #endif
      else if (strcasecmp(rout[i],blas[L3_SYRK]) == 0) /* SYRK */
         syherkloop(blas[L3_SYRK],nuplo,Uplo, nTA,TransA,
                  N0,NN,incN, K0,KN,incK,
                  nalph,alph, nbeta,beta,
                  TEST,LDA_IS_M,&gtst,&gpass);
     #ifdef TCPLX
      else if (strcasecmp(rout[i],blas[L3_HERK]) == 0) /* HERK */
         syherkloop(blas[L3_HERK],nuplo,Uplo, nTA,TransA,
                  N0,NN,incN, K0,KN,incK,
                  nalph,alph, nbeta,beta,
                  TEST,LDA_IS_M,&gtst,&gpass);
     #endif
      else if (strcasecmp(rout[i],blas[L3_SYR2K]) == 0) /* SYR2K */
         syher2kloop(blas[L3_SYR2K],nuplo,Uplo, nTA,TransA,
                  N0,NN,incN, K0,KN,incK,
                  nalph,alph, nbeta,beta,
                  TEST,LDA_IS_M,&gtst,&gpass);
     #ifdef TCPLX
      else if (strcasecmp(rout[i],blas[L3_HER2K]) == 0) /* HER2K */
         syher2kloop(blas[L3_HER2K],nuplo,Uplo, nTA,TransA,
                  N0,NN,incN, K0,KN,incK,
                  nalph,alph, nbeta,beta,
                  TEST,LDA_IS_M,&gtst,&gpass);
     #endif
      else if (strcasecmp(rout[i],blas[L3_TRMM]) == 0) /* TRMM */
         trloop(blas[L3_TRMM],
                  nside,Side, nuplo,Uplo, nTA,TransA, ndiag,Diag,
                  M0,MN,incM, N0,NN,incN,
                  nalph,alph,
                  TEST,LDA_IS_M,&gtst,&gpass);
      else if (strcasecmp(rout[i],blas[L3_TRSM]) == 0) /* TRSM */
         trloop(blas[L3_TRSM],
                  nside,Side, nuplo,Uplo, nTA,TransA, ndiag,Diag,
                  M0,MN,incM, N0,NN,incN,
                  nalph,alph,
                  TEST,LDA_IS_M,&gtst,&gpass);
   }

   if (nrouts > 1)
   {
   if (TEST)
      printf("\nGRAND TOTAL: NTEST=%d, NUMBER passed=%d, NUMBER FAILURES=%d\n",
             gtst, gpass,gtst-gpass);
   else
      printf("\nGRAND TOTAL: NTEST=%d\n",gtst);
   }

   free( Side   );
   free( Uplo   );
   free( TransA );
   free( TransB );
   free( Diag   );
   free( alph   );
   free( beta   );
   free( rout   );

   return 0;
}
