// Check the result is as expected
 static bool check(const MatrixType matrix,
                   Teuchos::FancyOStream& out) {
   typedef typename MatrixType::values_type matrix_values_type;
   typename matrix_values_type::HostMirror host_matrix_values =
     Kokkos::create_mirror_view(matrix.values);
   Kokkos::deep_copy(host_matrix_values, matrix.values);
   const ordinal_type nrow = matrix.numRows();
   const ordinal_type vec_size = host_matrix_values.sacado_size();
   bool success = true;
   for (ordinal_type row=0; row<nrow; ++row) {
     bool s = compareVecs(host_matrix_values(row),
                          "matrix_values(row)",
                          value_type(vec_size, row),
                          "value_type(row)",
                          0.0, 0.0, out);
     success = success && s;
   }
   return success;
 }
int main(int argc, const char *argv[])
{
    // Seed the random number generator using time
    srand48((unsigned int) time(NULL));

    // Dimension of the operation with defaul value
    int N = PROBSIZE;

    // Specify operation: 0 MatMult; 1 MatVecMult
    int opr = 0;

    // Whether to verify the result or not
    int verif = 0;

    // Whether to display the result or not
    int disp = 0;

    // Whether to call the naive implementation
    int execNaive = 1;

    // Whether to call the optimized implementation
    int execOPT = 1;

    // Parse command line
    {
        int arg_index = 1;
        int print_usage = 0;

        while (arg_index < argc)
        {
            if ( strcmp(argv[arg_index], "-N") == 0 )
            {
                arg_index++;
                N = atoi(argv[arg_index++]);
            }
            else if ( strcmp(argv[arg_index], "-operation") == 0 )
            {
                arg_index++;
                opr = atoi(argv[arg_index++]);
            }
            else if ( strcmp(argv[arg_index], "-help") == 0 )
            {
                print_usage = 1;
                break;
            }
            else if( strcmp(argv[arg_index], "-verif") == 0 )
            {
                arg_index++;
                verif = 1;
                if(execNaive==0 || execOPT==0) {
                  printf("***Must call both naive and optimized when running verification\n");
                  print_usage = 1;
                  break;
                }
            }
            else if( strcmp(argv[arg_index], "-disp") == 0 )
            {
                arg_index++;
                disp = 1;
            }
            else if( strcmp(argv[arg_index], "-naive") == 0 )
            {
                arg_index++;
                execNaive = 1;
                execOPT   = 0;
                if(verif==1) {
                  printf("***Must call both naive and optimized when running verification\n");
                  print_usage = 1;
                  break;                  
                }
            }
            else if( strcmp(argv[arg_index], "-OPT") == 0 )
            {
                arg_index++;
                execOPT   = 1;
                execNaive = 0;
                if(verif==1) {
                  printf("***Must call both naive and optimized when running verification\n");
                  print_usage = 1;
                  break;                  
                }
            }
            else
            {
                printf("***Invalid argument: %s\n", argv[arg_index]);
                print_usage = 1;
                break;
            }
        }

        if (print_usage)
        {
            printf("\n");
            printf("Usage: %s [<options>]\n", argv[0]);
            printf("\n");
            printf("  -N <N>          : problem size (default: %d)\n", PROBSIZE);
            printf("  -operation <ID> : Operation ID = 0 for MatMult or ID = 1 for MatVecMult\n");
            printf("  -verif          : Activate verification\n");
            printf("  -disp           : Display result (use only for small N!)\n");
            printf("  -naive          : Run only naive implementation\n");
            printf("  -OPT            : Run only optimized implementation\n");
            printf("  -help           : Display this message\n");
            printf("\n");
        }

        if (print_usage)
            return 0;
    }

    // Perform operation
    switch(opr)
    {
        case 0: /* Matrix-matrix multiply */
            {
                printf("Performing matrix-matrix multiply operation\n");
                double *matA, *matB, *matC1, *matC2;

                // Allocate memory
                matA = (double *) malloc(N*N * sizeof(double));
                matB = (double *) malloc(N*N * sizeof(double));
                if(execNaive) matC1 = (double *) malloc(N*N * sizeof(double));
                if(execOPT)   matC2 = (double *) malloc(N*N * sizeof(double));

                // Initialize matrix values
                randInitialize(N*N,matA);
                randInitialize(N*N,matB);

                clock_t tic, toc;
                double tm;

                if(execNaive) {
                  // Perform naive matA x matB = matC1
                  tic = clock();
                  matMult(N,matA,matB,matC1);
                  toc = clock();
                  tm = (double)(toc - tic) / CLOCKS_PER_SEC;
                  printf("Elapsed time for naive mat-mat mult.: %f seconds\n",tm);
                }

                if(execOPT) {
                  // Perform optimized matA x matB = matC2
                  tic = clock();
                  //matMult_opt(N,matA,matB,matC2);
                  toc = clock();
                  tm = (double)(toc - tic) / CLOCKS_PER_SEC;
                  printf("Elapsed time for optimized mat-mat mult.: %f seconds\n",tm);
                }

                // Verify results (compare the two matrices)
                if(verif)
                    compareVecs(N*N,matC2,matC1);

                // Display results (don't use for large matrices)
                if(disp)
                {
                    displayMat(N,N,matA);
                    printf("\n");
                    displayMat(N,N,matB);
                    printf("\n");
                    displayMat(N,N,matC1);
                    printf("\n");
		    displayMat(N,N,matC2);
                }

                // Free memory
                free(matA);
                free(matB);
                if(execNaive) free(matC1);
                if(execOPT)   free(matC2);
            }
            break;

        case 1: /* Matrix-vector multiply */
            {
                printf("Performing matrix-vector multiply operation\n");
                double *matA, *vecB, *vecC1,*vecC2;

                // Allocate memory
                matA = (double *) malloc(N*N * sizeof(double));
                vecB = (double *) malloc(N*N * sizeof(double));
                if(execNaive) vecC1 = (double *) malloc(N*N * sizeof(double));
                if(execOPT)   vecC2 = (double *) malloc(N*N * sizeof(double));

                // Initialize values
                randInitialize(N*N,matA);
                randInitialize(N,vecB);

                clock_t tic, toc;
                double tm;

                if(execNaive) {
                  // Perform naive matA x vecB = vecC1
                  tic = clock();
                  matVecMult(N,matA,vecB,vecC1);
                  toc = clock();
                  tm = (double)(toc - tic) / CLOCKS_PER_SEC;
                  printf("Elapsed time for naive mat-vec mult.: %f seconds\n",tm);
                }

                if(execOPT) {
                  // Perform optimized matA x vecB = vecC2
                  tic = clock();
                  matVecMult_opt(N,matA,vecB,vecC2);
                  toc = clock();
                  tm = (double)(toc - tic) / CLOCKS_PER_SEC;
                  printf("Elapsed time for optimized mat-vec mult.: %f seconds\n",tm);
                }

                // Verify results
                if(verif)
                    compareVecs(N,vecC2,vecC1);

                // Display results (don't use for large matrices)
                if(disp)
                {
                    displayMat(N,N,matA);
                    printf("\n");
                    displayVec(N,vecB);
                    printf("\n");
                    displayVec(N,vecC1);
                    printf("\n");
                }

                // Free memory
                free(matA);
                free(vecB);
                if(execNaive) free(vecC1);
                if(execOPT)   free(vecC2);
            }
            break;

        default:
            printf(" Invalid operation ID\n");
            return 0;
    }


    return 0;
}