Пример #1
0
/*
 * Returns flops per cycle.
 */
double time_mult(long dim, enum mult_flag_enum flag){
	int count = 10;

	double *a = new double[dim*dim];
	double *b = new double[dim*dim];
	double *c = new double[dim*dim];

#pragma omp parallel for
	for(long i=0; i < dim*dim; i++)
		a[i] = b[i] = c[i] = 1;

	TimeStamp clk;
	StatVector stats(count);

	if(flag == AUTO)
		mkl_mic_enable();

	for(int i=0; i < count; i++){
		clk.tic();
		switch(flag){
		case HOST: 
			mmult(a, b, c, dim);
			break;
		case MIC:
#pragma offload target(mic)						\
	in(a:length(dim*dim) align(64) alloc_if(1) free_if(1))		\
	in(b:length(dim*dim) align(64) alloc_if(1) free_if(1))		\
	inout(c: length(dim*dim) align(64) alloc_if(1) free_if(1))
			mmult(a, b, c, dim);
			break;
		case AUTO:
			mmult(a, b, c, dim);
			break;
		}
		double cycles = clk.toc();
		stats.insert(cycles);
	}

	if(flag == AUTO)
		mkl_mic_disable();

	delete[] a;
	delete[] b;
	delete[] c;

	return 2.0*dim*dim*dim/stats.median();
}
int cholesky_tiled(double *mat, int tile_size, int num_tiles, int mat_size,
                   int niter, int max_log_str, bool layRow, int verify, int num_doms, int use_host, int num_mics,
                   int host_ht_offset)
{
    //verification result
    bool result;
    //total number of tiles
    int tot_tiles = num_tiles * num_tiles;

    //memory allocation for matrix for tiled-Cholesky
    double *A_my = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for matrix for MKL cholesky (for comparison)
    double *A_MKL = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for tiled matrix
    double **Asplit = new double* [tot_tiles];
    int mem_size_tile = tile_size * tile_size * sizeof(double);

#define HSTR_BUFFER_PROPS_VALUES {        \
        HSTR_MEM_TYPE_NORMAL,             \
        HSTR_MEM_ALLOC_PREFERRED,         \
        HSTR_BUF_PROP_ALIASED}

    HSTR_BUFFER_PROPS buffer_props = HSTR_BUFFER_PROPS_VALUES;
    for (int i = 0; i < tot_tiles; ++i) {
        //Buffer per tile, host allocation
        Asplit[i] = (double *)_mm_malloc(mem_size_tile, 64);

        //Buffer creation and allocation on the card
        //hStreams_app_create_buf((void *)Asplit[i], mem_size_tile);
        CHECK_HSTR_RESULT(hStreams_Alloc1DEx(
                              (void *)Asplit[i],
                              mem_size_tile,
                              &buffer_props,
                              -1,
                              NULL));
    }

    double tbegin, tend;

    int iter;
    int info;

    //Events are needed for various synchronizations to enforce
    //data dependence between and among data-transfers/computes
    HSTR_EVENT *eventcpyto = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventcpyto_trsm = new HSTR_EVENT[tot_tiles * num_doms];
    HSTR_EVENT *eventcpyfr = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventpotrf = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventtrsm = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventsyrk = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventgemm = new HSTR_EVENT[tot_tiles];

    //for timing tiled cholesky
    double *totTimeMsec = new double [niter];

    //for timing MKL cholesky
    double *totTimeMsecMKL = new double [niter];


    mkl_mic_disable();

    //these queues are used for queining up compute on the card and
    //data transfers to/from the card.
    //q_trsm for dtrsm, q_potrf for dportf, q_syrk_gemm for both dsyrk and dgemm.
    //The queues are incremented by one for every compute queued and wrap
    //around the max_log_str available. This ensures good load-balancing.
    int q_trsm, q_potrf;
    int q_syrk_gemm[10];

    CBLAS_ORDER blasLay;
    int lapackLay;

    if (layRow) {
        blasLay = CblasRowMajor;
        lapackLay = LAPACK_ROW_MAJOR;
    } else {
        blasLay = CblasColMajor;
        lapackLay = LAPACK_COL_MAJOR;
    }

    for (iter = 0; iter < niter; ++iter) {

        //copying matrices into separate variables for tiled cholesky (A_my)
        //and MKL cholesky (A_MKL)
        //The output overwrites the matrices and hence the need to copy
        //for each iteration
        copy_mat(mat, A_my, mat_size);
        copy_mat(mat, A_MKL, mat_size);

        unsigned int m, n, k;

        printf("\nIteration = %d\n", iter);

        //splitting time included in the timing
        //This splits the input matrix into tiles (or blocks)
        split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow);

        //beginning of timing
        tbegin = dtimeGet();

        int ic;
        int is_mic;
        for (ic = 0; ic < num_doms; ++ic) {
            q_syrk_gemm[ic] = 0;
        }
        q_potrf = 0;
        q_trsm = 0;
        for (k = 0; k < num_tiles; ++k) {
            //POTRF
            //dpotrf is executed on the host on the diagonal tile
            if (mach_wide_league) {
                q_potrf = 0;
            } else {
                q_potrf = q_syrk_gemm[0];
            }

            int qindex = (int)q_potrf % max_log_str;
            if (use_host) {
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to host in queue %d, triggering event eventcpyto[%d][%d]\n",
                               k, k, (int)(qindex), k, k);

                    hStreams_app_xfer_memory((int)(qindex),
                                             Asplit[k * num_tiles + k],
                                             Asplit[k * num_tiles + k], mem_size_tile,
                                             HSTR_SRC_TO_SINK,
                                             &eventcpyto[k * num_tiles + k]);
                }
            }

            if (k > 0) {
                if (use_host) {
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyfr[k * num_tiles + k], 0, NULL, NULL);
                } else {
                    hStreams_app_event_wait(1, &eventcpyfr[k * num_tiles + k]);
                }

                if (loc_verbose > 0) {
                    printf("Waiting on eventcpyfr[%d]\n", k * num_tiles + k);
                }
            }

            if (loc_verbose > 0)
                printf("Executing potrf on host for tile[%d][%d], in queue (if use_host) %d, triggerring eventpotrf[%d][%d]\n",
                       k, k, qindex, k, k);

            if (use_host) {
                CHECK_HSTR_RESULT(hStreams_custom_dpotrf(lapackLay, 'L', tile_size,
                                  Asplit[k * num_tiles + k], tile_size, qindex, &eventpotrf[k * num_tiles + k]));
            } else {
                info = LAPACKE_dpotrf(lapackLay, 'L', tile_size,
                                      Asplit[k * num_tiles + k], tile_size);
            }


            if (mach_wide_league) {
                q_trsm = q_syrk_gemm[0];
            } else {
                q_potrf++;
                q_trsm = q_potrf;
            }

            for (m = k + 1; m < num_tiles; ++m) {

                if (mach_wide_league) {
                    qindex = (int)(q_trsm % max_log_str + 1);
                } else {
                    qindex = (int)(q_trsm % max_log_str);
                }

                if (use_host) {
                    if (k == 0) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] to host in queue %d, triggering event eventcpyto[%d][%d]\n",
                                   m, k, (int)(qindex), m, k);

                        hStreams_app_xfer_memory((int)(qindex),
                                                 Asplit[m * num_tiles + k],
                                                 Asplit[m * num_tiles + k], mem_size_tile,
                                                 HSTR_SRC_TO_SINK,
                                                 &eventcpyto[m * num_tiles + k]);
                    }
                }

                if (k > 0) {
                    if (use_host) {
                        hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyfr[m * num_tiles + k], 0, NULL, NULL);
                    } else {
                        hStreams_app_event_wait(1, &eventcpyfr[m * num_tiles + k]);
                    }

                    if (loc_verbose > 0) {
                        printf("Waiting on eventcpyfr[%d]\n", m * num_tiles + k);
                    }
                }

                if (use_host)
                    //hStreams_app_event_wait(1, &eventpotrf[k*num_tiles + k]);
                {
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventpotrf[k * num_tiles + k], 0, NULL, NULL);
                }

                //dtrsm is executed on the host
                if (loc_verbose > 0)
                    printf("Executing trsm for tile[%d][%d] on host, in queue (if use_host) %d, triggering eventtrsm[%d][%d]\n",
                           m, k, qindex, m, k);

                if (use_host) {
                    CHECK_HSTR_RESULT(hStreams_custom_dtrsm(blasLay, CblasRight, CblasLower,
                                                            CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
                                                            Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k],
                                                            tile_size, qindex,
                                                            &eventtrsm[m * num_tiles + k]));
                } else {
                    cblas_dtrsm(blasLay, CblasRight, CblasLower,
                                CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
                                Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k],
                                tile_size);
                }

                //transfer to all cards
                for (ic = 0; ic < num_doms; ++ic) {
                    if ((use_host == 1) && (num_mics >= 1)) {
                        if (ic == 0) {
                            is_mic = 0;    //this is host
                        } else {
                            is_mic = 1;
                        }
                    } else {
                        is_mic = 0;
                    }

                    if (mach_wide_league) {
                        qindex = (int)q_trsm % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset;
                    } else {
                        qindex = (int)q_trsm % max_log_str + ic * max_log_str + is_mic * host_ht_offset;
                    }

                    if (use_host)
                        //hStreams_app_event_wait(1, &eventtrsm[m*num_tiles + k]);
                    {
                        hStreams_app_event_wait_in_stream(qindex, 1, &eventtrsm[m * num_tiles + k], 0, NULL, NULL);
                    }

                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card %d in queue %d, triggering event eventcpyto_trsm[%d]\n",
                               m, k, ic, (int)(qindex), m * num_tiles + k + ic * tot_tiles);

                    hStreams_app_xfer_memory((int)(qindex),
                                             Asplit[m * num_tiles + k],
                                             Asplit[m * num_tiles + k], mem_size_tile,
                                             HSTR_SRC_TO_SINK,
                                             &eventcpyto_trsm[m * num_tiles + k + ic * tot_tiles]);
                }

                q_trsm++;
            }

            if (use_host) {
                q_syrk_gemm[0] = q_trsm;
                for (ic = 1; ic < num_doms; ++ic) {
                    q_syrk_gemm[ic] = 0;
                }
            } else {
                for (ic = 0; ic < num_doms; ++ic) {
                    q_syrk_gemm[ic] = 0;
                }
            }

            for (n = k + 1; n < num_tiles; ++n) {
                ic = n % num_doms; //round-robin rows across num_doms

                if ((use_host == 1) && (num_mics >= 1)) {
                    if (ic == 0) {
                        is_mic = 0;    //this is host
                    } else {
                        is_mic = 1;
                    }
                } else {
                    is_mic = 0;
                }

                if (mach_wide_league) {
                    qindex  = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset;
                } else {
                    qindex  = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + is_mic * host_ht_offset;
                }

                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               n, n, (int)(qindex));

                    hStreams_app_xfer_memory((int)(qindex),
                                             Asplit[n * num_tiles + n],
                                             Asplit[n * num_tiles + n], mem_size_tile,
                                             HSTR_SRC_TO_SINK,
                                             &eventcpyto[n * num_tiles + n]);
                }

                //DSYRK
                //hStreams_app_event_wait(1, &eventcpyto_trsm[n*num_tiles + k + ic*tot_tiles]);
                hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[n * num_tiles + k + ic * tot_tiles], 0, NULL, NULL);
                if (loc_verbose > 0) {
                    printf("Waiting on eventcpyto_trsm[%d]\n", n * num_tiles + k + ic * tot_tiles);
                }

                if (k > 0) {
                    //hStreams_app_event_wait(1, &eventsyrk[n*num_tiles + n]);
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventsyrk[n * num_tiles + n], 0, NULL, NULL);
                    if (loc_verbose > 0) {
                        printf("Waiting on eventsyrk[%d]\n", n * num_tiles + n);
                    }
                }

                //dsyrk is executed on the card
                if (loc_verbose > 0)
                    printf("Executing syrk for tile[%d][%d] on card in queue %d, triggering event eventsyrk[%d]\n",
                           n, n, (int)(qindex), n * num_tiles + n);


                CHECK_HSTR_RESULT(hStreams_custom_dsyrk(blasLay, CblasLower, CblasNoTrans,
                                                        tile_size, tile_size, -1.0, Asplit[n * num_tiles + k],
                                                        tile_size, 1.0, Asplit[n * num_tiles + n], tile_size,
                                                        (int)(qindex), &eventsyrk[n * num_tiles + n]));

                //send tile to host (only if n = k+1)
                if (n == k + 1) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] from card  to host in queue %d, triggering event eventcpyfr[%d]\n",
                               n, n, (int)(qindex), n * num_tiles + n);

                    hStreams_app_xfer_memory((int)(qindex),
                                             Asplit[n * num_tiles + n],
                                             Asplit[n * num_tiles + n], mem_size_tile,
                                             HSTR_SINK_TO_SRC,
                                             &eventcpyfr[n * num_tiles + n]);

                }

                q_syrk_gemm[ic]++;


                for (m = n + 1; m < num_tiles; ++m) {
                    ic = m % num_doms; //round-robin rows across num_doms

                    if ((use_host == 1) && (num_mics >= 1)) {
                        if (ic == 0) {
                            is_mic = 0;    //this is host
                        } else {
                            is_mic = 1;
                        }
                    } else {
                        is_mic = 0;
                    }

                    if (mach_wide_league) {
                        qindex = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset;
                    } else {
                        qindex = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + is_mic * host_ht_offset;
                    }

                    if (k == 0) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] to card in queue %d\n",
                                   m, n, (int)(qindex));

                        hStreams_app_xfer_memory((int)(qindex),
                                                 Asplit[m * num_tiles + n],
                                                 Asplit[m * num_tiles + n], mem_size_tile,
                                                 HSTR_SRC_TO_SINK,
                                                 &eventcpyto[m * num_tiles + n]);
                    }

                    //DGEMM
                    if (loc_verbose > 0) {
                        printf("Waiting on eventcpyto_trsm[%d]\n", m * num_tiles + k + ic * tot_tiles);
                    }
                    //hStreams_app_event_wait(1, &eventcpyto_trsm[m*num_tiles + k + ic*tot_tiles]);
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[m * num_tiles + k + ic * tot_tiles], 0, NULL, NULL);

                    if (loc_verbose > 0) {
                        printf("Waiting on eventcpyto_trsm[%d]\n", n * num_tiles + k + ic * tot_tiles);
                    }
                    //hStreams_app_event_wait(1, &eventcpyto_trsm[n*num_tiles + k + ic*tot_tiles]);
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[n * num_tiles + k + ic * tot_tiles], 0, NULL, NULL);

                    if (k > 0) {
                        //hStreams_app_event_wait(1, &eventgemm[m*num_tiles + n]);
                        hStreams_app_event_wait_in_stream(qindex, 1, &eventgemm[m * num_tiles + n], 0, NULL, NULL);
                        if (loc_verbose > 0) {
                            printf("Waiting on eventgemm[%d]\n", m * num_tiles + n);
                        }
                    }

                    //dgemm is executed on the card
                    if (loc_verbose > 0)
                        printf("Executing gemm for tile[%d][%d] on card in queue %d, triggering event eventgemm[%d]\n",
                               m, n, (int)(qindex), m * num_tiles + n);

                    CHECK_HSTR_RESULT(hStreams_app_dgemm((int)(qindex),
                                                         blasLay, CblasNoTrans, CblasTrans,
                                                         tile_size, tile_size, tile_size, -1.0, Asplit[m * num_tiles + k],
                                                         tile_size, Asplit[n * num_tiles + k], tile_size, 1.0,
                                                         Asplit[m * num_tiles + n], tile_size,
                                                         &eventgemm[m * num_tiles + n]));

                    //send tile to host (only if n = k+1)
                    if (n == k + 1) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] from card to host in queue %d, triggering event eventcpyfr[%d]\n",
                                   m, n, (int)(qindex), m * num_tiles + n);

                        hStreams_app_xfer_memory(
                            (int)(qindex),
                            Asplit[m * num_tiles + n],
                            Asplit[m * num_tiles + n], mem_size_tile,
                            HSTR_SINK_TO_SRC,
                            &eventcpyfr[m * num_tiles + n]);
                    }


                    q_syrk_gemm[ic]++;
                }
            }
        }


        //syncrhonizing all the streams
        hStreams_app_thread_sync();

        //end of timing
        tend = dtimeGet();

        totTimeMsec[iter] = 1e3 * (tend - tbegin);
        printf("time for Tiled hstreams Cholesky for iteration %d = %.2f msec\n",
               iter, totTimeMsec[iter]);

        //assembling of tiles back into full matrix
        assemble(Asplit, A_my, num_tiles, tile_size, mat_size, layRow);

        //calling mkl cholesky for verification and timing comparison.
        //Using auto-offload feature of MKL
        tbegin = dtimeGet();

        //calling MKL dpotrf on the full matrix
        info = LAPACKE_dpotrf(lapackLay, 'L', mat_size, A_MKL, mat_size);

        tend = dtimeGet();
        totTimeMsecMKL[iter] = 1e3 * (tend - tbegin);
        printf("time for MKL Cholesky (AO) for iteration %d = %.2f msec\n",
               iter, totTimeMsecMKL[iter]);

        if (info != 0) {
            printf("error with dpotrf\n");
        }
        mkl_mic_disable();

        if (verify == 1) {
            result = verify_results(A_my, A_MKL, mat_size * mat_size);
            if (result == true) {
                printf("Tiled Cholesky successful\n");
            } else {
                printf("Tiled Chloesky failed\n");
            }
        }
    }

    double meanTimeMsec, stdDevMsec;
    double meanTimeMsecMKL, stdDevMsecMKL;
    mean_and_stdev(totTimeMsec, meanTimeMsec, stdDevMsec, niter);
    mean_and_stdev(totTimeMsecMKL, meanTimeMsecMKL, stdDevMsecMKL, niter);

    double gflops = pow(mat_size, 3.0) / 3.0 * 1e-9;

    printf("\nMatrix size = %d\n", mat_size);

    printf("Tiled hStreams Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using mean Time) = %.2f\n",
           niter - 1, meanTimeMsec, stdDevMsec, gflops / (meanTimeMsec * 1e-3));

    printf("\nMKL AO Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using meanTime) = %.2f\n\n",
           niter - 1, meanTimeMsecMKL, stdDevMsecMKL, gflops / (meanTimeMsecMKL * 1e-3));

    //Free
    free(A_my);
    free(A_MKL);
    for (int i = 0; i < tot_tiles; ++i) {
        _mm_free(Asplit[i]);
    }
    delete [] Asplit;
    delete [] eventcpyto;
    delete [] eventcpyto_trsm;
    delete [] eventcpyfr;
    delete [] eventpotrf;
    delete [] eventtrsm;
    delete [] eventsyrk;
    delete [] eventgemm;
    delete [] totTimeMsec;
    delete [] totTimeMsecMKL;

    // true result indicates all OK
    if (result) {
        return 0;
    }
    return 1;

}
int main(int argc, char **argv)
{
    HSTR_OPTIONS hstreams_options;
    CHECK_HSTR_RESULT(hStreams_GetCurrentOptions(&hstreams_options, sizeof(HSTR_OPTIONS)));

    char *libNames[200] = {NULL, NULL};

    //Library to be loaded for sink-side code
    libNames[0] = "cholesky_sink_1.so";
    hstreams_options.libNameCnt = 1;
    hstreams_options.libNames = libNames;
    hstreams_options.libFlags = NULL;

    hstreams_options.libNameCntHost = 0;
    hstreams_options.libNamesHost = NULL;

    int mat_size_m, num_tiles, niter, tile_size;
    niter = 5;
    num_tiles = 1;
    mat_size_m = 0; //must be an input
    bool layRow = true;

    //max_log_str defines the no. of physical partitions on the card
    int use_host = 1, num_mics = 1;
    int nstreams_host = 4, nstreams_mic = 4;

    int verify = 1;

    CHECK_HSTR_RESULT(hStreams_SetOptions(&hstreams_options));
    for (int i = 1; i < argc; i++) {
        if (*argv[i] == SWITCH_CHAR) {
            switch (*(argv[i] + 1)) {
            case 'm':
                mat_size_m = (int)atol(argv[i] + 3);
                break;

            case 't':
                num_tiles = (int)atol(argv[i] + 3);
                break;

            case 's':
                nstreams_mic = (int)atol(argv[i] + 3);
                break;

            case 'l':
                if ((strcmp("row", argv[i] + 3) == 0) ||
                        (strcmp("ROW", argv[i] + 3) == 0)) {
                    layRow = true;
                    printf("matrix is in Row major format\n");
                } else {
                    layRow = false;
                    printf("matrix is in Col major format\n");
                }
                break;

            case 'i':
                niter = (int)atol(argv[i] + 3);
                //if( niter < 3 ) niter=3;
                break;

            case 'h':
                use_host = (int)atol(argv[i] + 3);
                break;

            case 'c':
                num_mics = (int)atol(argv[i] + 3);
                break;

            case 'v':
                verify = (int)atol(argv[i] + 3);
                break;

            default:
                break;
            }
        }
    }
    dtimeInit();

    //Check that mat_size is divisible by num_tiles
    if (mat_size_m % num_tiles != 0) {
        printf("matrix size MUST be divisible by num_tiles.. aborting\n");
        exit(0);
    }

    if (mat_size_m == 0) {
        printf("mat_size_m is not defined\n");
        exit(0);
    }

    tile_size = mat_size_m / num_tiles;

    //This allocates memory for the full input matrix
    double *A = (double *)malloc(mat_size_m * mat_size_m * sizeof(double));

    //Generate a symmetric positve-definite matrix
    A = dpo_generate(mat_size_m);

    int num_doms = use_host + num_mics;
    int max_log_str;

    if (use_host == 0 && num_mics == 0) {
        printf("Cannot run if not using either host or MIC cards\n");
        exit(-1);
    }

    if (use_host == 1) {
        printf("Using the host CPU for compute.. and\n");
    }
    printf("Using %d MIC cards for compute..\n", num_mics);

    if (use_host == 1 && num_mics >= 1) {
#ifdef HOST_HT_ON
        nstreams_host = 2 * nstreams_mic;
#else
        nstreams_host = nstreams_mic;
#endif
        max_log_str = nstreams_mic;
    } else if (num_mics == 0) {
        nstreams_host = nstreams_mic;
        max_log_str = nstreams_host;
#ifdef HOST_HT_ON
        nstreams_host = 2 * nstreams_host;
#endif
    } else if (use_host == 0) {
        max_log_str = nstreams_mic;
    }

    int host_ht_offset = 0;

#ifdef HOST_HT_ON
    host_ht_offset = nstreams_host - max_log_str;
#endif

    if (use_host) {
        printf("number of streams used on host = %d\n", nstreams_host);
        if (loc_verbose) {
            printf("if HT is enabled on host, only top half streams will be used\n");
            printf("if number of streams on host do not evenly divide with number of cores, performance can suffer\n");
        }
    }
    if (num_mics >= 1) {
        printf("number of streams used on mic = %d\n", nstreams_mic);
    }

    if (use_host == 1) {
        resv_cpu_master = 1;
        mach_wide_league = 1;
    } else {
        resv_cpu_master = 0;
        mach_wide_league = 0;
    }

    HSTR_PHYS_DOM *physDomID = new HSTR_PHYS_DOM[num_doms];
    HSTR_LOG_DOM *logDomID = new HSTR_LOG_DOM[num_doms + 1]; //+1 for creating a machine wide stream

    HSTR_CPU_MASK out_CPUmask, src_hstr_cpu_mask;
    HSTR_PHYS_DOM *out_pPhysDomainID = new HSTR_PHYS_DOM;
    HSTR_OVERLAP_TYPE *out_pOverlap = new HSTR_OVERLAP_TYPE;

    uint32_t *places = new uint32_t[num_doms];

    for (int i = 0; i < num_doms; ++i) {
        if (i == 0) {
            if (use_host == 1) {
                places[i] = nstreams_host;
                physDomID[i] = -1;
            } else {
                places[i] = nstreams_mic;
                physDomID[i] = i;
            }
        } else {
            places[i] = nstreams_mic;
            if (use_host == 1) {
                physDomID[i] = i - 1;
            } else {
                physDomID[i] = i;
            }
        }
    }

    if (resv_cpu_master) {
        HostCPUMask host_cpu_mask;
        host_cpu_mask.cpu_zero();

        for (int i = 0; i < num_resv_cpus; ++i) {
            host_cpu_mask.cpu_set(resv_cpus[i]);
        }

        int ret;
        HSTR_CPU_MASK_ZERO(src_hstr_cpu_mask);
        setCurrentProcessAffinityMask(host_cpu_mask);
        getCurrentProcessAffinityMask(host_cpu_mask);
        int first, last, num_set;
        last = 0;
        first = HSTR_MAX_THREADS;
        num_set = 0;
        for (int i = 0; i < HSTR_MAX_THREADS; i++) {
            if (host_cpu_mask.cpu_isset(i)) {
                if (i < first) {
                    first = i;
                }
                last = i;
                num_set++;
                HSTR_CPU_MASK_SET(i, src_hstr_cpu_mask);
            }
        }
        if (loc_verbose) {
            printf("Reserving the following cpu_set for master on CPU\n");
            ShowLimitCPUmask(src_hstr_cpu_mask);
        }
    }

    uint32_t str_offset = 0;
    uint32_t places_mach_wide = 1;
    int iret;

    //create a machine wide stream on host for potrf
    if (mach_wide_league) {
        if (resv_cpu_master) {
            iret = hStreams_custom_init_selected_domains(
                       1,
                       physDomID,
                       1,
                       &places_mach_wide,
                       1,
                       str_offset,
                       src_hstr_cpu_mask);
        } else {
            iret = hStreams_app_init_selected_domains(
                       1,
                       physDomID,
                       1,
                       &places_mach_wide,
                       1,
                       str_offset);
        }
        str_offset = 1;
    }

    //create rest of the streams
    if (resv_cpu_master) {
        iret = hStreams_custom_init_selected_domains(
                   num_doms,
                   physDomID,
                   num_doms,
                   places,
                   1,
                   str_offset,
                   src_hstr_cpu_mask);
    } else {
        iret = hStreams_app_init_selected_domains(
                   num_doms,
                   physDomID,
                   num_doms,
                   places,
                   1,
                   str_offset);
    }
    if (iret != 0) {
        printf("hstreams_app_init failed!\r\n");
        exit(-1);
    }

    mkl_mic_disable();
    //10 max streams for printout
    HSTR_LOG_STR *out_pLogStreamIDs = new HSTR_LOG_STR[10];

    if (loc_verbose) {
        if (mach_wide_league) {
            //host
            CHECK_HSTR_RESULT(hStreams_GetLogDomainIDList(physDomID[0], 2, &logDomID[0]));
            for (int idom = 0; idom < 2; ++idom) {
                CHECK_HSTR_RESULT(hStreams_GetLogDomainDetails(logDomID[idom], out_pPhysDomainID, out_CPUmask));
                //ShowLimitCPUmask(out_CPUmask);
                if (idom == 0) {
                    CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], 1, out_pLogStreamIDs));
                } else {
                    CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], places[0], out_pLogStreamIDs));
                }
                if (idom > 0) {
                    for (int i = 0; i < places[0]; ++i) {
                        CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[i], logDomID[idom], out_CPUmask));
                        printf("streamId = %d\n", (int)out_pLogStreamIDs[i]);
                        ShowLimitCPUmask(out_CPUmask);
                    }
                } else {
                    CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[0], logDomID[idom], out_CPUmask));
                    printf("streamId = %d\n", (int)out_pLogStreamIDs[0]);
                    ShowLimitCPUmask(out_CPUmask);
                }
            }

            for (int idom = 1; idom < num_doms; ++idom) {
                CHECK_HSTR_RESULT(hStreams_GetLogDomainIDList(physDomID[idom], 1, &logDomID[idom]));
                CHECK_HSTR_RESULT(hStreams_GetLogDomainDetails(logDomID[idom], out_pPhysDomainID, out_CPUmask));
                //ShowLimitCPUmask(out_CPUmask);
                CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], places[idom], out_pLogStreamIDs));
                for (int i = 0; i < places[idom]; ++i) {
                    CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[i], logDomID[idom], out_CPUmask));
                    printf("streamId = %d\n", (int)out_pLogStreamIDs[i]);
                    ShowLimitCPUmask(out_CPUmask);
                }
            }
        } else {
            for (int idom = 0; idom < num_doms; ++idom) {
                CHECK_HSTR_RESULT(hStreams_GetLogDomainIDList(physDomID[idom], 1, &logDomID[idom]));
                CHECK_HSTR_RESULT(hStreams_GetLogDomainDetails(logDomID[idom], out_pPhysDomainID, out_CPUmask));
                //ShowLimitCPUmask(out_CPUmask);
                CHECK_HSTR_RESULT(hStreams_GetLogStreamIDList(logDomID[idom], places[idom], out_pLogStreamIDs));
                for (int i = 0; i < places[idom]; ++i) {
                    CHECK_HSTR_RESULT(hStreams_GetLogStreamDetails(out_pLogStreamIDs[i], logDomID[idom], out_CPUmask));
                    printf("streamId = %d\n", (int)out_pLogStreamIDs[i]);
                    ShowLimitCPUmask(out_CPUmask);
                }
            }
        }
    }

    //Calling the tiled Cholesky function. This does the factorization of the full matrix using a tiled implementation.
    int cholesky_code = cholesky_tiled(A, tile_size, num_tiles, mat_size_m, niter,
                                       max_log_str, layRow, verify, num_doms, use_host, num_mics, host_ht_offset);

    CHECK_HSTR_RESULT(hStreams_app_fini());

    free(A);

    return cholesky_code;
}
void cholesky_tiled(double *mat, int tile_size, int num_tiles, int mat_size,
                    int niter, int max_log_str, bool layRow, int verify)
{
    //total number of tiles
    int tot_tiles = num_tiles * num_tiles;

    //memory allocation for matrix for tiled-Cholesky
    double *A_my = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for matrix for MKL cholesky (for comparison)
    double *A_MKL = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for tiled matrix
    double **Asplit = new double* [tot_tiles];
    int mem_size_tile = tile_size * tile_size * sizeof(double);

    for (int i = 0; i < tot_tiles; ++i) {
        //Buffer per tile, host allocation
        Asplit[i] = (double *)_mm_malloc(mem_size_tile, 64);

        //Buffer creation and allocation on the card
        hStreams_app_create_buf((void *)Asplit[i], mem_size_tile);
    }

    double tbegin, tend;

    int iter;
    int info;

    //Events are needed for various synchronizations to enforce
    //data dependence between and among data-transfers/computes
    HSTR_EVENT *eventcpyto = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventcpyfr = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventpotrf = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventtrsm = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventsyrk = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventgemm = new HSTR_EVENT[tot_tiles];

    //for timing tiled cholesky
    double *totTimeMsec = new double [niter];

    //for timing MKL cholesky
    double *totTimeMsecMKL = new double [niter];

    HSTR_RESULT res;

    //these queues are used for queining up compute on the card and
    //data transfers to/from the card.
    //q_trsm for dtrsm, q_potrf for dportf, q_syrk_gemm for both dsyrk and dgemm.
    //The queues are incremented by one for every compute queued and wrap
    //around the max_log_str available. This ensures good load-balancing.
    int q_trsm, q_potrf, q_syrk_gemm;

    CBLAS_ORDER blasLay;
    int lapackLay;

    if (layRow) {
        blasLay = CblasRowMajor;
        lapackLay = LAPACK_ROW_MAJOR;
    } else {
        blasLay = CblasColMajor;
        lapackLay = LAPACK_COL_MAJOR;
    }

    for (iter = 0; iter < niter; ++iter) {

        //copying matrices into separate variables for tiled cholesky (A_my)
        //and MKL cholesky (A_MKL)
        //The output overwrites the matrices and hence the need to copy
        //for each iteration
        copy_mat(mat, A_my, mat_size);
        copy_mat(mat, A_MKL, mat_size);

        unsigned int m, n, k;

        printf("\nIteration = %d\n", iter);

        split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow);
        //beginning of timing
        tbegin = dtimeGet();

        //splitting time included in the timing
        //This splits the input matrix into tiles (or blocks)
        //split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow);

        q_potrf = 0;
        for (k = 0; k < num_tiles; ++k) {
            //POTRF
            //dpotrf is executed on the host on the diagonal tile
            //the results are then sent to the card
            if (k > 0) {
                hStreams_app_event_wait(1, &eventsyrk[k * num_tiles + k]);
                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] to host in queue %d\n",
                           k, k, (int)(q_potrf % max_log_str)) ;

                hStreams_app_xfer_memory(Asplit[k * num_tiles + k],
                                         Asplit[k * num_tiles + k], mem_size_tile,
                                         (int)(q_potrf % max_log_str), HSTR_SINK_TO_SRC,
                                         &eventcpyfr[k * num_tiles + k]);

                hStreams_app_event_wait(1, &eventcpyfr[k * num_tiles + k]);
            }

            if (loc_verbose > 0) {
                printf("Executing potrf on host for tile[%d][%d]\n", k, k);
            }

            info = LAPACKE_dpotrf(lapackLay, 'L', tile_size,
                                  Asplit[k * num_tiles + k], tile_size);

            if (k < num_tiles - 1) {
                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] to card in queue %d\n",
                           k, k, (int)(q_potrf % max_log_str));

                hStreams_app_xfer_memory(Asplit[k * num_tiles + k],
                                         Asplit[k * num_tiles + k], mem_size_tile,
                                         (int)(q_potrf % max_log_str), HSTR_SRC_TO_SINK,
                                         &eventcpyto[k * num_tiles + k]);
            }
            q_potrf++;

            q_trsm = 0;
            for (m = k + 1; m < num_tiles; ++m) {
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               m, k, (int)(q_trsm % max_log_str));

                    hStreams_app_xfer_memory(Asplit[m * num_tiles + k],
                                             Asplit[m * num_tiles + k], mem_size_tile,
                                             (int)(q_trsm % max_log_str), HSTR_SRC_TO_SINK,
                                             &eventcpyto[m * num_tiles + k]);
                }

                //DTRSM
                hStreams_app_event_wait(1, &eventcpyto[k * num_tiles + k]);

                if (k > 0) {
                    hStreams_app_event_wait(1, &eventgemm[m * num_tiles + k]);
                }

                //dtrsm is executed on the card
                if (loc_verbose > 0)
                    printf("Executing trsm for tile[%d][%d] on card in queue %d\n",
                           m, k, (int)(q_trsm % max_log_str));

                res = hStreams_custom_dtrsm(blasLay, CblasRight, CblasLower,
                                            CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
                                            Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k],
                                            tile_size, (int)(q_trsm % max_log_str),
                                            &eventtrsm[m * num_tiles + k]);

                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] back to host in queue %d\n",
                           m, k, (int)(q_trsm % max_log_str));

                hStreams_app_xfer_memory(Asplit[m * num_tiles + k],
                                         Asplit[m * num_tiles + k], mem_size_tile,
                                         (int)(q_trsm % max_log_str), HSTR_SINK_TO_SRC,
                                         &eventcpyfr[m * num_tiles + k]);

                q_trsm++;
            }

            q_syrk_gemm = 0;
            for (n = k + 1; n < num_tiles; ++n) {
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               n, n, (int)(q_syrk_gemm % max_log_str));

                    hStreams_app_xfer_memory(Asplit[n * num_tiles + n],
                                             Asplit[n * num_tiles + n], mem_size_tile,
                                             (int)(q_syrk_gemm % max_log_str), HSTR_SRC_TO_SINK,
                                             &eventcpyto[n * num_tiles + n]);
                }

                //DSYRK
                hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]);
                if (k > 0) {
                    hStreams_app_event_wait(1, &eventsyrk[n * num_tiles + n]);
                }

                //dsyrk is executed on the card
                if (loc_verbose > 0)
                    printf("Executing syrk for tile[%d][%d] on card in queue %d\n",
                           n, n, (int)(q_syrk_gemm % max_log_str));

                res = hStreams_custom_dsyrk(blasLay, CblasLower, CblasNoTrans,
                                            tile_size, tile_size, -1.0, Asplit[n * num_tiles + k],
                                            tile_size, 1.0, Asplit[n * num_tiles + n], tile_size,
                                            (int)(q_syrk_gemm % max_log_str), &eventsyrk[n * num_tiles + n]);

                q_syrk_gemm++;

                for (m = n + 1; m < num_tiles; ++m) {
                    if (k == 0) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] to card in queue %d\n",
                                   m, n, (int)(q_syrk_gemm % max_log_str));

                        hStreams_app_xfer_memory(Asplit[m * num_tiles + n],
                                                 Asplit[m * num_tiles + n], mem_size_tile,
                                                 (int)(q_syrk_gemm % max_log_str),
                                                 HSTR_SRC_TO_SINK,
                                                 &eventcpyto[m * num_tiles + n]);
                    }

                    //DGEMM
                    hStreams_app_event_wait(1, &eventtrsm[m * num_tiles + k]);
                    hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]);

                    if (k > 0) {
                        hStreams_app_event_wait(1, &eventgemm[m * num_tiles + n]);
                    }

                    //dgemm is executed on the card
                    if (loc_verbose > 0)
                        printf("Executing gemm for tile[%d][%d] on card in queue %d\n",
                               m, n, (int)(q_syrk_gemm % max_log_str));

                    res = hStreams_app_dgemm(blasLay, CblasNoTrans, CblasTrans,
                                             tile_size, tile_size, tile_size, -1.0, Asplit[m * num_tiles + k],
                                             tile_size, Asplit[n * num_tiles + k], tile_size, 1.0,
                                             Asplit[m * num_tiles + n], tile_size,
                                             (int)(q_syrk_gemm % max_log_str), &eventgemm[m * num_tiles + n]);

                    q_syrk_gemm++;
                }
            }
        }

        //syncrhonizing all the streams
        hStreams_app_thread_sync();

        //end of timing
        tend = dtimeGet();

        totTimeMsec[iter] = 1e3 * (tend - tbegin);
        printf("time for Tiled hstreams Cholesky for iteration %d = %.2f msec\n",
               iter, totTimeMsec[iter]);

        //assembling of tiles back into full matrix
        assemble(Asplit, A_my, num_tiles, tile_size, mat_size, layRow);

        //calling mkl cholesky for verification and timing comparison.
        //Using auto-offload feature of MKL
#ifndef _WIN32
        //FIXME: calling this function causes a crash on Windows
        mkl_mic_enable();
#endif
        tbegin = dtimeGet();

        //calling MKL dpotrf on the full matrix
        info = LAPACKE_dpotrf(lapackLay, 'L', mat_size, A_MKL, mat_size);

        tend = dtimeGet();
        totTimeMsecMKL[iter] = 1e3 * (tend - tbegin);
        printf("time for MKL Cholesky (AO) for iteration %d = %.2f msec\n",
               iter, totTimeMsecMKL[iter]);

        if (info != 0) {
            printf("error with dpotrf\n");
        }
        mkl_mic_disable();

        if (verify == 1) {
            bool result = verify_results(A_my, A_MKL, mat_size * mat_size);
            if (result == true) {
                printf("Tiled Cholesky successful\n");
            } else {
                printf("Tiled Chloesky failed\n");
            }
        }
    }

    double meanTimeMsec, stdDevMsec;
    double meanTimeMsecMKL, stdDevMsecMKL;
    mean_and_stdev(totTimeMsec, meanTimeMsec, stdDevMsec, niter);
    mean_and_stdev(totTimeMsecMKL, meanTimeMsecMKL, stdDevMsecMKL, niter);

    double gflops = pow(mat_size, 3.0) / 3.0 * 1e-9;

    printf("\nMatrix size = %d\n", mat_size);

    printf("Tiled hStreams Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using mean Time) = %.2f\n",
           niter - 1, meanTimeMsec, stdDevMsec, gflops / (meanTimeMsec * 1e-3));

    printf("\nMKL AO Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using meanTime) = %.2f\n\n",
           niter - 1, meanTimeMsecMKL, stdDevMsecMKL, gflops / (meanTimeMsecMKL * 1e-3));

    //Free
    free(A_my);
    free(A_MKL);
    for (int i = 0; i < tot_tiles; ++i) {
        _mm_free(Asplit[i]);
    }
    delete [] Asplit;
    delete [] eventcpyto;
    delete [] eventcpyfr;
    delete [] eventpotrf;
    delete [] eventtrsm;
    delete [] eventsyrk;
    delete [] eventgemm;
    delete [] totTimeMsec;
    delete [] totTimeMsecMKL;

}