/*
 *             Automatically Tuned Linear Algebra Software v3.0Beta
 *                    (C) Copyright 1997 R. Clint Whaley                     
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University, the ATLAS group, or the names of its 
 *      contributers may not be used to endorse or promote products derived
 *      from this software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include "atlas_misc.h"
#include "atlas_lvl3.h"
#include Mstr(Mjoin(Mjoin(atlas_,PRE),fc.h))

/*
 * Timer defined outside this file.  smcase() subtracts two of its return
 * values and treats the difference as elapsed seconds when computing MFLOPS.
 */
double time00();

/*
 * When ATLAS_NKFLOP is not defined, smcase() repeats each kernel until the
 * work is roughly that of a matrix multiply of order DENMAT (2*DENMAT^3 flops)
 */
#ifndef DENMAT
   #define DENMAT 200
#endif

/*
 * Compile-time description of the TRSM variant being timed.  Each character
 * is fixed by whether Right_, Upper_, Transpose_ or UnitDiag_ is defined;
 * they are echoed in the report lines and in the generated macro name.
 */
#ifdef Right_
   char Side = 'R';
#else
   char Side = 'L';
#endif
#ifdef Upper_
   char Uplo = 'U';
#else
   char Uplo = 'L';
#endif
#ifdef Transpose_
   char Tran = 'T';
#else
   char Tran = 'N';
#endif
#ifdef UnitDiag_
   char Diag = 'U';
#else
   char Diag = 'N';
#endif

/*
 * The two competing TRSM implementations whose crossover point is sought.
 * Prototypes are supplied here only when TREAL is defined.
 */
#ifdef TREAL
void ATL_big_trsm(const int M, const int N, const void *valpha, const void *A,
                  const int lda, void *C, const int ldc);
void ATL_small_trsm(const int M, const int N, const void *valpha, const void *A,
                  const int lda, void *C, const int ldc);
#endif

/*
 * Stores alpha into N entries of A spaced lda+1 elements apart, starting at
 * A[0]; for a matrix with leading dimension lda these are its first N
 * diagonal entries.  No other element of A is touched.
 */
void SetDiag(int N, SCALAR alpha, TYPE *A, int lda)
{
   const int diagStride = lda + 1;  /* one column over, one row down */
   TYPE *d = A;
   int k;

   for (k=0; k != N; k++, d += diagStride) *d = alpha;
}

/*
 * Fills N columns of A with values from dumb_rand(), after seeding the
 * generator with seed.  Each column receives (M SHIFT) values and successive
 * columns start (lda SHIFT) elements apart.
 * NOTE(review): SHIFT comes from the ATLAS headers; presumably it scales the
 * counts for complex data and is empty for real data -- confirm.
 */
void matgen(int M, int N, TYPE *A, int lda, int seed)
{
   const int colLen = M SHIFT;
   const int colInc = lda SHIFT;
   TYPE *col = A;
   int row, colsLeft = N;

   dumb_seed(seed);
   while (colsLeft)
   {
      for (row=0; row != colLen; row++) col[row] = dumb_rand();
      col += colInc;
      colsLeft--;
   }
}


/*
 * Times ATL_small_trsm and ATL_big_trsm on an M x N problem and prints one
 * report line per kernel to stdout (ATL_small_trsm first, with a speedup
 * column of 1.0; then ATL_big_trsm, with speedup = small time / big time).
 *
 * M, N  : dimensions handed to both kernels; both must be positive.
 * alpha : scalar handed to both kernels (through SADD) and echoed in the
 *         report.
 *
 * Each kernel is called reps times, with reps chosen so the total flop count
 * is roughly 1000*ATLAS_NKFLOP when that macro is defined and 2*DENMAT^3
 * otherwise (always at least one call).  Successive calls step through a
 * ring of operand copies whose total size is derived from L2SIZE --
 * presumably so that operands are not simply reused from cache; confirm.
 *
 * RETURNS: (ATL_big_trsm time) - (ATL_small_trsm time); a negative value
 *          means ATL_big_trsm was the faster kernel.
 */
double smcase(int M, int N, SCALAR alpha)
{
   const char *form="   %c     %c     %c %5d %5d  %7.2f  %14.4f  %11.2f  %7.2f\n";
   const int lda=2*M, ldb=2*M;
   const int incA=lda*M, incB=ldb*N;
   int i, la, lb, reps;
   TYPE *a, *A, *b, *B, *ast, *bst;
   double t0, t1, t2, mf, mflop;

/*
 * Non-positive dimensions would make the divisors below zero (or the buffer
 * lengths negative), so reject them explicitly instead of faulting later
 */
   assert(M > 0 && N > 0);

/*
 * Allocate enough copies of A (each incA elements, of which M*M are used)
 * that the used part totals at least ATL_DivBySize((2*L2SIZE)/3) elements
 * NOTE(review): A is always laid out as M x M blocks, even when Right_ is
 * defined and the flop count below uses N*N; confirm this matches what the
 * kernels expect for the right-side case.
 */
   i = M*M;
   la = ((ATL_DivBySize((2*L2SIZE)/3) + i-1) / i)*incA;
   a = A = malloc(ATL_MulBySize(la));
   assert(a != NULL);
   ast = A + la;

/*
 * Same sizing rule for the copies of B (each incB elements, M*N used)
 */
   i = M*N;
   lb = ((ATL_DivBySize((2*L2SIZE)/3) + i-1) / i)*incB;
   b = B = malloc(ATL_MulBySize(lb));
   assert(b != NULL);
   bst = B + lb;
/*
 * Set A to identity, so we can solve multiple times without overflowing
 */
   for (i=0; i != la; i++) a[i] = 0.0;
   while (a != ast)
   {
      SetDiag(M, 1.0, a, lda);
      a += incA;
   }
   a = A;
   matgen(lb, 1, B, lb, M*1245);
/*
 * preload instructions from disk
 */
   ATL_big_trsm(M, N, SADD alpha, A, lda, B, ldb);
   ATL_small_trsm(M, N, SADD alpha, A, lda, B, ldb);
/*
 * Regenerate B so the timed runs start from the same data as the warm-up
 */
   matgen(lb, 1, B, lb, M*1245);
/*
 * Insist we have as many flops as doing a matmul of order DENMAT
 */
   #ifdef Right_
      mf = (((double)M)*N)*N;
   #else
      mf = (((double)M)*M)*N;
   #endif
   #ifdef ATLAS_NKFLOP
      t0 = 1000.0 * ATLAS_NKFLOP;
   #else
      t0 = DENMAT;
      t0 *= t0*t0;
      t0 *= 2.0;
   #endif
   t0 /= mf;
   reps = t0 + 0.99;
   if (!reps) reps = 1;

/*
 * Time ATL_small_trsm; a and b wrap back to the start of their rings when
 * they reach the end (la and lb are exact multiples of incA and incB)
 */
   i = reps;
   t0 = time00();
   do
   {
      ATL_small_trsm(M, N, SADD alpha, a, lda, b, ldb);
      a += incA;
      b += incB;
      if (a == ast) a = A;
      if (b == bst) b = B;
   }
   while(--i);
   t1 = time00() - t0;
   if (t1 <= 0.0) mflop = t1 = 0.0;
   else mflop = (mf * reps) / (t1*1000000.0);
   fprintf(stdout, form, Side, Uplo, Tran, M, N, alpha, t1, mflop, 1.0);

/*
 * Time ATL_big_trsm the same way; the ring positions carry on from wherever
 * the previous loop left them
 */
   i = reps;
   t0 = time00();
   do
   {
      ATL_big_trsm(M, N, SADD alpha, a, lda, b, ldb);
      a += incA;
      b += incB;
      if (a == ast) a = A;
      if (b == bst) b = B;
   }
   while(--i);
   t2 = time00() - t0;
   if (t2 <= 0.0) mflop = t2 = 0.0;
   else mflop = (mf * reps) / (t2*1000000.0);
/*
 * Speedup of big over small; reported as 0 when the big time is unusable
 */
   if (t1 == t2) t0 = 1.0;
   else if (t2 != 0.0) t0 = t1/t2;
   else t0 = 0.0;
   fprintf(stdout, form, Side, Uplo, Tran, M, N, alpha, t2, mflop, t0);
   free(A);
   free(B);
   return(t2-t1);
}

/*
 * Prints the report header, then runs smcase(M, n, alpha) for
 * n = N0, N0+incN, ... while n <= NN.
 *
 * The first n for which smcase() returns a negative value (ATL_big_trsm
 * faster) is remembered as the crossover.  If JSTOP is nonzero the sweep
 * ends as soon as that crossover is found; otherwise all sizes are timed.
 *
 * RETURNS: the crossover n, or NN if no size produced a negative result.
 */
int tloop(int JSTOP, int M, int N0, int NN, int incN, SCALAR alpha)
{
   int xover = NN;   /* still NN means "no crossover recorded yet" */
   int n = N0;
   double diff;

   fputs(
"\n\nSIDE  UPLO  TRAN     M     N    alpha            Time        Mflop  SpeedUp\n",
         stdout);
   fputs(
"====  ====  ====  ====  ====  =======  ==============  ===========  =======\n",
         stdout);

   while (n <= NN)
   {
      diff = smcase(M, n, alpha);
      if (xover == NN && diff < 0.0)
      {
         xover = n;
         if (JSTOP) break;
      }
      n += incN;
   }
   return(xover);
}

/*
 * Writes the command-line synopsis for program name nam to stderr and
 * terminates the process with exit status -1.  Does not return.
 */
void PrintUsage(char *nam)
{
   static const char usageFmt[] =
           "usage: %s -m <M> -N <N0> <NN> <incN> -a <alpha> -f <filename>\n";

   fprintf(stderr, usageFmt, nam);
   exit(-1);
}
/*
 * Parses the command line into the timing parameters.
 *
 * nargs, args : argument count and vector as received by main.
 * M           : set from "-m <M>"; defaults to NB.
 * N0, NN, incN: set from "-N <N0> <NN> <incN>"; default to 100, 2000, 100.
 *               N0 and incN must be positive.
 * alpha       : set from "-a <alpha>"; defaults to 1.0.
 * file        : receives the name given by "-f <filename>", or the empty
 *               string if the flag is absent.  If -f is repeated the last
 *               name wins.
 *               NOTE(review): the copy is not length-limited because no
 *               buffer size is passed in; the caller must supply a buffer
 *               large enough for any name given on the command line.
 *
 * Any unrecognised flag, missing value or out-of-range -N value ends the
 * program through PrintUsage().
 */
void GetFlags(int nargs, char **args, int *M, int *N0, int *NN, int *incN, 
              TYPE *alpha,  char *file)
{
   int i;
   char *in, *out;

   file[0] = '\0';
   *M = NB;
   *N0 = 100;
   *NN = 2000;
   *incN = 100;
   *alpha = 1.0;
   for (i=1; i < nargs; i++)
   {
      if (args[i][0] != '-') PrintUsage(args[0]);
      switch(args[i][1])
      {
      case 'f':
         if (i+1 >= nargs) PrintUsage(args[0]);
         in = args[++i];
         out = file;   /* restart at the buffer base so a later -f replaces */
         while ((*out++ = *in++) != '\0');
         break;
      case 'm':
         if (i+1 >= nargs) PrintUsage(args[0]);
         *M = atoi(args[++i]);
         break;
      case 'a':
         if (i+1 >= nargs) PrintUsage(args[0]);
         *alpha = atof(args[++i]);
         break;
      case 'N':
         if (i+3 >= nargs) PrintUsage(args[0]);
         *N0 = atoi(args[++i]);
         *NN = atoi(args[++i]);
         *incN = atoi(args[++i]);
/*
 *       A non-positive start size divides by zero when buffers are sized,
 *       and a non-positive step never reaches NN
 */
         if (*N0 <= 0 || *incN <= 0) PrintUsage(args[0]);
         break;
      default:
         PrintUsage(args[0]);
      }
   }
}

/*
 * Finds the problem size N at which ATL_big_trsm overtakes ATL_small_trsm
 * for blocks of order NB, and reports it on stdout.
 *
 * Without -f, every size from N0 to NN in steps of incN is timed.  With
 * -f <filename>, the sweep stops at the first crossover, the interval just
 * before it is re-swept with a step of incN/10 (at least 1), and a line
 * "#define TRSM_<Side><Uplo><Tran><Diag>_Xover <n>" is appended to the file.
 * If the coarse sweep returns NN (no earlier crossover), 10000 is recorded.
 *
 * NOTE(review): the value parsed for -m is stored in M but never used; all
 * timings are run with NB.  Confirm whether -m is meant to be honoured.
 */
int main(int nargs, char *args[])
{
   char fnam[256];
   TYPE alpha;
   int M, N0, NN, incN, n, nn, k;
   FILE *fpout;

   GetFlags(nargs, args, &M, &N0, &NN, &incN, &alpha, fnam);
   if (fnam[0] == '\0') nn = tloop(0, NB, N0, NN, incN, alpha);
   else
   {
/*
 *    Check the open explicitly: an assert would vanish under NDEBUG
 */
      fpout = fopen(fnam, "a");
      if (fpout == NULL)
      {
         fprintf(stderr, "%s: unable to open '%s' for append\n", 
                 args[0], fnam);
         exit(-1);
      }
      k = incN / 10;
      if (!k) k = 1;
      nn = n = tloop(1, NB, N0, NN, incN, alpha);
      if (n != NN) nn = tloop(1, NB, Mmax(n-incN,0)+k, n, k, alpha);
      else nn = 10000;
      fprintf(fpout, "#define TRSM_%c%c%c%c_Xover %d\n", 
              Side, Uplo, Tran, Diag, nn);
/*
 *    fclose flushes the buffered line, so a failure here means it was lost
 */
      if (fclose(fpout) != 0)
      {
         fprintf(stderr, "%s: error writing '%s'\n", args[0], fnam);
         exit(-1);
      }
   }
   fprintf(stdout, "\n\nXover point at NB=%d, N=%d\n\n", NB, nn);
   exit(0);
}

