int main(int argc, char **argv)
{
    int dimX = 4;
    int dimY = 64;
    int dimZ = 64;
    int numIters = 64;

    // Process args from command line*/
    int argi = 1;
    int errFlag = 0;
    while (argi < argc) {
        char *one = argv[argi];
        if (!strcmp(one, "-d") && argc > argi + 3) {
            dimX = atoi(argv[argi + 1]);
            dimY = atoi(argv[argi + 2]);
            dimZ = atoi(argv[argi + 3]);
            argi += 4;
        } else if (!strcmp(one, "-n") && argc > argi + 1) {
            numIters = atoi(argv[argi + 1]);
            /* Make numIters a multiple of 16 */
            numIters = numIters & (~0xf);
            argi += 2;
        }
        // Also take tile_size from command line*/
        else if (!strcmp(one, "-t") && argc > argi + 1) {
            tile_size = atoi(argv[argi + 1]);
            if (dimY % tile_size != 0) {
                printf("The given tile_size does not divide dimY evenly!\n");
                return -1;
            }
            argi += 2;
        } else {
            errFlag = 1;
            break;
        }
    }
    if (errFlag) {
        printf("Usage: %s [-d [the size of each dimension]] "
               "[-n [the number of iterations in the kernel]] "
               "[-t tile_size]\n", argv[0]);
        return -1;
    }

    if (dimY != dimZ) {
        printf("DimY is not the same as DimZ, "
               "changing dimZ to make them alike!\n");
        dimZ = dimY;
    }

    if (dimX * dimY * dimZ * numIters % 64 != 0) {
        printf("The production of DimX, DimY, DimZ and "
               "#iters must be multiples of 64!\n");
        return -1;
    }

    printf("DimX=%d, DimY=%d, DimZ=%d, #Loop_Iterations=%d, tile_size=%d\n",
           dimX, dimY, dimZ, numIters, tile_size);

    REAL *out = NULL;

    // 64 bytes aligned allocation.
    posix_memalign((void **) &out, 64,
                   dimX * dimY * dimZ * numIters * sizeof(REAL) * 2);

    if (out == NULL) {
        printf("Memory allocation failed!\n");
        return -1;
    }

    double iterTimes[NUM_TESTS_ITERS];
    double mintime = 1e6;

    // Notice how the out1 and out2 pointers are calculated.
    REAL *out1 = out;
    REAL *out2 = out + dimX * dimY * dimZ * numIters;

    //--------------------------------------------------------------

    // Initialize hStreams with the given StreamsPerDomain and
    // over-subscription level.
    CHECK_HSTR_RESULT(hStreams_app_init(streams_per_domain, oversubscription));

    //--------------------------------------------------------------

    //--------------------------------------------------------------
    // Saving buffer length in a temporary variable to save computation.
    long long buffer_len = dimX * dimY * dimZ * numIters
                           * sizeof(REAL) * 2;

    // Set up buffers.
    // Wrap out with a buffer.
    hStreams_app_create_buf((void *) out, buffer_len);

    register long int buff_length = dimY * dimZ * numIters * sizeof(REAL);

    //!!a Creating two dimX buffer addresses pointing inside out buffer.
    // out1_addr[i] points to the position of out1 buffer where
    // iteration i of the outer loop of the compute kernel writes.
    // out2_addr[i] points to the position of out2 buffer where
    // iteration i of the outer loop of the compute kernel writes.
    REAL *out1_addr[dimX];
    REAL *out2_addr[dimX];

    // Separation between out1 and out2.
    long int out_length = dimX * dimY * dimZ * numIters;

    for (int i = 0; i < dimX; i++) {
        //%% EXERCISE: go back to compute kernel and find out what is the value for
        //   out1_addr[i] and out2_addr[i].
        out1_addr[i] = out + i * dimY * dimZ * numIters;
        out2_addr[i] = out1_addr[i] + out_length;
        //  You can also create buffer in this way instead of creating
        //   out buffer as a whole
        //  hStreams_app_create_buf((void *)out1_addr[i], buff_length);
        //  hStreams_app_create_buf((void *)out2_addr[i], buff_length);
    }

    // Event pointers to support asynchronous function calls.
    HSTR_EVENT eout1[numIters][dimX], eout2[numIters][dimX],
               eout3[numIters][dimX], eout4[numIters][dimX], eout5[numIters][dimX];

    //--------------------------------------------------------------

    for (int i = 0; i < NUM_TESTS_ITERS; i++) {
        iterTimes[i] = GetTime();
        //--------------------------------------------------------------
        // Call device side API.
        // Prepare to perform computation on the sink-side.
        // Outer loop.
        // The body of the original i loop is enqueued on the streams in a round
        // robin fashion.
        for (int ii = 0; ii < dimX; ii++) {


            int stream = ii % streams_per_domain;
            //--------------------------------------------------------------
            // Initialize data at the sink-side

            //!!b  Initializes intermediate data at sink, only
            // the portion  being worked on
            hStreams_app_memset(stream, out1_addr[ii], // source proxy address to write
                                0.5, buff_length, // number of bytes to send
                                &eout4[i][ii]); // completion event

            //%% EXERCISE: Initialize out2_addr[ii] buffer similarly on the sink-side.
            hStreams_app_memset(stream, out2_addr[ii], // source proxy address to write
                                0.5, buff_length, // number of bytes to send
                                &eout5[i][ii]); // completion event


            uint64_t args[8];

            // Pack scalar arguments first, then heap args.
            //%% EXERCISE: Setup the heap args properly.
            args[0] = (uint64_t)(ii);
            args[1] = (uint64_t)(dimX);
            args[2] = (uint64_t)(dimY);
            args[3] = (uint64_t)(dimZ);
            args[4] = (uint64_t)(numIters);
            args[5] = (uint64_t)(tile_size);
            args[6] = (uint64_t)(out1_addr[ii]);
            args[7] = (uint64_t)(out2_addr[ii]);
            //--------------------------------------------------------------

            hStreams_app_invoke(stream, // same idea
                                "compute", // remote function name
                                6, // scalar arg
                                2, // heap args
                                args, // array of args
                                &eout1[i][ii], NULL, // return variable
                                0);

            //-----------------------------------------------------------------
            // Collect result.
            hStreams_app_xfer_memory(stream, out1_addr[ii], // source proxy address to write
                                     out1_addr[ii], // source proxy address to read
                                     buff_length, // number of bytes to send
                                     HSTR_SINK_TO_SRC, // transfer direction
                                     &eout2[i][ii]); // completion event

            //--------------------------------------------------------------
            //!!c Transfer output data from sink to source
            //%% EXERCISE: Transfer data back for out2_addr[ii].
            hStreams_app_xfer_memory(stream, out2_addr[ii], // source proxy address to write
                                     out2_addr[ii], // source proxy address to read
                                     buff_length, // number of bytes to send
                                     HSTR_SINK_TO_SRC, // transfer direction
                                     &eout3[i][ii]); // completion event
        }
        //--------------------------------------------------------------

        // Synchronize.
        CHECK_HSTR_RESULT(hStreams_app_thread_sync());
        //--------------------------------------------------------------

        iterTimes[i] = GetTime() - iterTimes[i];

        double result = out1[numIters / 2] + out2[numIters / 2];
        printf("Test %d takes %.3lf ms with result %.3lf\n", i, iterTimes[i],
               result);
        if (iterTimes[i] < mintime) {
            mintime = iterTimes[i];
        }
    }
    printf("Test's min time is %.3lf ms\n", mintime);

    //--------------------------------------------------------------
    // Cleanup before exiting.
    CHECK_HSTR_RESULT(hStreams_app_fini());
    //--------------------------------------------------------------
    free(out);
    return 0;
}
// Runs a tiled Cholesky factorization niter times and times it against a
// full-matrix LAPACKE_dpotrf call on a second copy of the same input.
//
// Per iteration: mat is copied into A_my and A_MKL, A_my is split into
// num_tiles x num_tiles tiles, and the tiled factorization is run with
// dpotrf on the host for diagonal tiles and trsm/syrk/gemm enqueued through
// hStreams calls. The tiles are then assembled back into A_my, the
// full-matrix dpotrf runs on A_MKL, and (if verify == 1) the two results are
// compared. Mean/stddev timings and a Gflops figure are printed at the end.
//
// Parameters:
//   mat         input matrix, copied (never written) each iteration;
//               presumably mat_size x mat_size doubles - confirm with caller
//   tile_size   edge length of one square tile
//   num_tiles   tiles per matrix dimension; tile [r][c] is Asplit[r*num_tiles+c]
//   mat_size    edge length of the full matrix
//   niter       number of timed iterations
//   max_log_str modulus applied to the queue counters to pick a stream index
//   layRow      true selects row-major BLAS/LAPACK layout, false column-major
//   verify      1 enables the comparison of A_my against A_MKL
//
// loc_verbose (read below) is defined outside this function.
void cholesky_tiled(double *mat, int tile_size, int num_tiles, int mat_size,
                    int niter, int max_log_str, bool layRow, int verify)
{
    //total number of tiles
    int tot_tiles = num_tiles * num_tiles;

    //memory allocation for matrix for tiled-Cholesky
    //NOTE(review): neither malloc result below is checked before use.
    double *A_my = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for matrix for MKL cholesky (for comparison)
    double *A_MKL = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for tiled matrix
    double **Asplit = new double* [tot_tiles];
    //size of one tile in bytes
    int mem_size_tile = tile_size * tile_size * sizeof(double);

    for (int i = 0; i < tot_tiles; ++i) {
        //Buffer per tile, host allocation (64-byte aligned)
        Asplit[i] = (double *)_mm_malloc(mem_size_tile, 64);

        //Wrap the host tile in an hStreams buffer.
        //NOTE(review): the return value is ignored here.
        hStreams_app_create_buf((void *)Asplit[i], mem_size_tile);
    }

    double tbegin, tend;

    int iter;
    int info;

    //Events are needed for various synchronizations to enforce
    //data dependence between and among data-transfers/computes.
    //One event per tile for each kind of operation.
    //NOTE(review): eventpotrf is allocated and freed but not otherwise used
    //in this function.
    HSTR_EVENT *eventcpyto = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventcpyfr = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventpotrf = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventtrsm = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventsyrk = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventgemm = new HSTR_EVENT[tot_tiles];

    //for timing tiled cholesky
    double *totTimeMsec = new double [niter];

    //for timing MKL cholesky
    double *totTimeMsecMKL = new double [niter];

    //NOTE(review): res receives the result of the trsm/syrk/gemm calls below
    //but is never inspected.
    HSTR_RESULT res;

    //These counters are used for queuing up compute on the card and
    //data transfers to/from the card.
    //q_trsm for dtrsm, q_potrf for dpotrf, q_syrk_gemm for both dsyrk and dgemm.
    //Each counter is incremented by one for every compute queued and is taken
    //modulo max_log_str when used, so the work is spread round-robin.
    int q_trsm, q_potrf, q_syrk_gemm;

    CBLAS_ORDER blasLay;
    int lapackLay;

    //Select matching BLAS and LAPACK layout constants.
    if (layRow) {
        blasLay = CblasRowMajor;
        lapackLay = LAPACK_ROW_MAJOR;
    } else {
        blasLay = CblasColMajor;
        lapackLay = LAPACK_COL_MAJOR;
    }

    for (iter = 0; iter < niter; ++iter) {

        //copying matrices into separate variables for tiled cholesky (A_my)
        //and MKL cholesky (A_MKL)
        //The output overwrites the matrices and hence the need to copy
        //for each iteration
        copy_mat(mat, A_my, mat_size);
        copy_mat(mat, A_MKL, mat_size);

        //tile indices; note these are unsigned while num_tiles is int and
        //they are printed with %d below
        unsigned int m, n, k;

        printf("\nIteration = %d\n", iter);

        //This splits the input matrix into tiles (or blocks).
        //It runs before the timer starts, so splitting time is NOT part of
        //the measured time.
        split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow);
        //beginning of timing
        tbegin = dtimeGet();

        //To include splitting in the measured time instead, the
        //split_into_blocks call above would have to move to this point,
        //after tbegin is taken.


        q_potrf = 0;
        for (k = 0; k < num_tiles; ++k) {
            //POTRF
            //dpotrf is executed on the host on the diagonal tile
            //the results are then sent to the card
            if (k > 0) {
                //Wait for the last syrk update of tile[k][k], then bring the
                //tile back to the host and wait for that transfer.
                hStreams_app_event_wait(1, &eventsyrk[k * num_tiles + k]);
                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] to host in queue %d\n",
                           k, k, (int)(q_potrf % max_log_str)) ;

                hStreams_app_xfer_memory(Asplit[k * num_tiles + k],
                                         Asplit[k * num_tiles + k], mem_size_tile,
                                         (int)(q_potrf % max_log_str), HSTR_SINK_TO_SRC,
                                         &eventcpyfr[k * num_tiles + k]);

                hStreams_app_event_wait(1, &eventcpyfr[k * num_tiles + k]);
            }

            if (loc_verbose > 0) {
                printf("Executing potrf on host for tile[%d][%d]\n", k, k);
            }

            //NOTE(review): this info value is overwritten by the full-matrix
            //dpotrf call later in the iteration before it is ever checked.
            info = LAPACKE_dpotrf(lapackLay, 'L', tile_size,
                                  Asplit[k * num_tiles + k], tile_size);

            //The factored diagonal tile is only sent to the card when there
            //are tiles below it that still need it (not for the last k).
            if (k < num_tiles - 1) {
                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] to card in queue %d\n",
                           k, k, (int)(q_potrf % max_log_str));

                hStreams_app_xfer_memory(Asplit[k * num_tiles + k],
                                         Asplit[k * num_tiles + k], mem_size_tile,
                                         (int)(q_potrf % max_log_str), HSTR_SRC_TO_SINK,
                                         &eventcpyto[k * num_tiles + k]);
            }
            q_potrf++;

            //q_trsm restarts from 0 for every k
            q_trsm = 0;
            for (m = k + 1; m < num_tiles; ++m) {
                //On the first pass the column-0 tiles have not been sent to
                //the card yet.
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               m, k, (int)(q_trsm % max_log_str));

                    hStreams_app_xfer_memory(Asplit[m * num_tiles + k],
                                             Asplit[m * num_tiles + k], mem_size_tile,
                                             (int)(q_trsm % max_log_str), HSTR_SRC_TO_SINK,
                                             &eventcpyto[m * num_tiles + k]);
                }

                //DTRSM
                //Wait for the diagonal tile's transfer to the card.
                hStreams_app_event_wait(1, &eventcpyto[k * num_tiles + k]);

                //For k > 0, also wait for the gemm that last updated tile[m][k].
                if (k > 0) {
                    hStreams_app_event_wait(1, &eventgemm[m * num_tiles + k]);
                }

                //dtrsm is executed on the card
                if (loc_verbose > 0)
                    printf("Executing trsm for tile[%d][%d] on card in queue %d\n",
                           m, k, (int)(q_trsm % max_log_str));

                res = hStreams_custom_dtrsm(blasLay, CblasRight, CblasLower,
                                            CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
                                            Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k],
                                            tile_size, (int)(q_trsm % max_log_str),
                                            &eventtrsm[m * num_tiles + k]);

                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] back to host in queue %d\n",
                           m, k, (int)(q_trsm % max_log_str));

                //Enqueue the copy of the solved tile back to the host on the
                //same queue index as the trsm.
                hStreams_app_xfer_memory(Asplit[m * num_tiles + k],
                                         Asplit[m * num_tiles + k], mem_size_tile,
                                         (int)(q_trsm % max_log_str), HSTR_SINK_TO_SRC,
                                         &eventcpyfr[m * num_tiles + k]);

                q_trsm++;
            }

            //q_syrk_gemm restarts from 0 for every k and is shared by the
            //syrk and gemm calls below
            q_syrk_gemm = 0;
            for (n = k + 1; n < num_tiles; ++n) {
                //On the first pass the remaining diagonal tiles have not been
                //sent to the card yet.
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               n, n, (int)(q_syrk_gemm % max_log_str));

                    hStreams_app_xfer_memory(Asplit[n * num_tiles + n],
                                             Asplit[n * num_tiles + n], mem_size_tile,
                                             (int)(q_syrk_gemm % max_log_str), HSTR_SRC_TO_SINK,
                                             &eventcpyto[n * num_tiles + n]);
                }

                //DSYRK
                //Wait for the trsm of tile[n][k]; for k > 0 also wait for the
                //previous syrk on tile[n][n].
                hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]);
                if (k > 0) {
                    hStreams_app_event_wait(1, &eventsyrk[n * num_tiles + n]);
                }

                //dsyrk is executed on the card
                if (loc_verbose > 0)
                    printf("Executing syrk for tile[%d][%d] on card in queue %d\n",
                           n, n, (int)(q_syrk_gemm % max_log_str));

                res = hStreams_custom_dsyrk(blasLay, CblasLower, CblasNoTrans,
                                            tile_size, tile_size, -1.0, Asplit[n * num_tiles + k],
                                            tile_size, 1.0, Asplit[n * num_tiles + n], tile_size,
                                            (int)(q_syrk_gemm % max_log_str), &eventsyrk[n * num_tiles + n]);

                q_syrk_gemm++;

                for (m = n + 1; m < num_tiles; ++m) {
                    //On the first pass the off-diagonal tiles below row n
                    //have not been sent to the card yet.
                    if (k == 0) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] to card in queue %d\n",
                                   m, n, (int)(q_syrk_gemm % max_log_str));

                        hStreams_app_xfer_memory(Asplit[m * num_tiles + n],
                                                 Asplit[m * num_tiles + n], mem_size_tile,
                                                 (int)(q_syrk_gemm % max_log_str),
                                                 HSTR_SRC_TO_SINK,
                                                 &eventcpyto[m * num_tiles + n]);
                    }

                    //DGEMM
                    //Wait for the trsm results of both input tiles.
                    hStreams_app_event_wait(1, &eventtrsm[m * num_tiles + k]);
                    hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]);

                    //For k > 0, also wait for the previous gemm on tile[m][n].
                    if (k > 0) {
                        hStreams_app_event_wait(1, &eventgemm[m * num_tiles + n]);
                    }

                    //dgemm is executed on the card
                    if (loc_verbose > 0)
                        printf("Executing gemm for tile[%d][%d] on card in queue %d\n",
                               m, n, (int)(q_syrk_gemm % max_log_str));

                    res = hStreams_app_dgemm(blasLay, CblasNoTrans, CblasTrans,
                                             tile_size, tile_size, tile_size, -1.0, Asplit[m * num_tiles + k],
                                             tile_size, Asplit[n * num_tiles + k], tile_size, 1.0,
                                             Asplit[m * num_tiles + n], tile_size,
                                             (int)(q_syrk_gemm % max_log_str), &eventgemm[m * num_tiles + n]);

                    q_syrk_gemm++;
                }
            }
        }

        //synchronizing all the streams
        hStreams_app_thread_sync();

        //end of timing
        tend = dtimeGet();

        //the 1e3 factor converts the dtimeGet() difference to the msec
        //reported below (presumably dtimeGet returns seconds - confirm)
        totTimeMsec[iter] = 1e3 * (tend - tbegin);
        printf("time for Tiled hstreams Cholesky for iteration %d = %.2f msec\n",
               iter, totTimeMsec[iter]);

        //assembling of tiles back into full matrix
        assemble(Asplit, A_my, num_tiles, tile_size, mat_size, layRow);

        //calling mkl cholesky for verification and timing comparison.
        //Using auto-offload feature of MKL
#ifndef _WIN32
        //FIXME: calling this function causes a crash on Windows
        mkl_mic_enable();
#endif
        tbegin = dtimeGet();

        //calling MKL dpotrf on the full matrix
        info = LAPACKE_dpotrf(lapackLay, 'L', mat_size, A_MKL, mat_size);

        tend = dtimeGet();
        totTimeMsecMKL[iter] = 1e3 * (tend - tbegin);
        printf("time for MKL Cholesky (AO) for iteration %d = %.2f msec\n",
               iter, totTimeMsecMKL[iter]);

        //only the full-matrix dpotrf status is checked here
        if (info != 0) {
            printf("error with dpotrf\n");
        }
        //NOTE(review): unlike mkl_mic_enable above, this call is not inside
        //the #ifndef _WIN32 guard - confirm that is intended.
        mkl_mic_disable();

        if (verify == 1) {
            bool result = verify_results(A_my, A_MKL, mat_size * mat_size);
            if (result == true) {
                printf("Tiled Cholesky successful\n");
            } else {
                printf("Tiled Chloesky failed\n");
            }
        }
    }

    double meanTimeMsec, stdDevMsec;
    double meanTimeMsecMKL, stdDevMsecMKL;
    //mean_and_stdev fills the mean/stddev arguments; the messages below say
    //the first iteration is ignored - presumably done inside mean_and_stdev,
    //confirm against its definition.
    mean_and_stdev(totTimeMsec, meanTimeMsec, stdDevMsec, niter);
    mean_and_stdev(totTimeMsecMKL, meanTimeMsecMKL, stdDevMsecMKL, niter);

    //operation count used for the rate figures: mat_size^3 / 3, scaled by 1e-9
    double gflops = pow(mat_size, 3.0) / 3.0 * 1e-9;

    printf("\nMatrix size = %d\n", mat_size);

    printf("Tiled hStreams Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using mean Time) = %.2f\n",
           niter - 1, meanTimeMsec, stdDevMsec, gflops / (meanTimeMsec * 1e-3));

    printf("\nMKL AO Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using meanTime) = %.2f\n\n",
           niter - 1, meanTimeMsecMKL, stdDevMsecMKL, gflops / (meanTimeMsecMKL * 1e-3));

    //Free host memory and the event/timing arrays.
    //NOTE(review): no call releasing the hStreams buffers created at the top
    //appears in this function - confirm whether one is needed before the
    //tiles are freed.
    free(A_my);
    free(A_MKL);
    for (int i = 0; i < tot_tiles; ++i) {
        _mm_free(Asplit[i]);
    }
    delete [] Asplit;
    delete [] eventcpyto;
    delete [] eventcpyfr;
    delete [] eventpotrf;
    delete [] eventtrsm;
    delete [] eventsyrk;
    delete [] eventgemm;
    delete [] totTimeMsec;
    delete [] totTimeMsecMKL;

}