Esempio n. 1
int main(int argc, char ** argv)
  if (argc != 3) {
    std::cerr << "Usage: chol <num_threads> <matrix_size>" << std::endl;
    return 1;


  double L = 1.0;
  unsigned int N = atoi(argv[2]);
  double dx = (double) L / (N - 1);

  double * A = (double *)malloc(N * N * sizeof(double));
  for (unsigned int i = 0; i < N; i++) {
    for (unsigned int j = 0; j < N; j++) {
      A[i*N+j] = cov(i*dx, j*dx);

  double start = omp_get_wtime();

  // We're letting MKL choose the workspace array at runtime
  // Assume unsigned int N converts to lapack_int
  int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'U', N, A, N);

  double time = omp_get_wtime() - start;

  std::cout << "LAPACKE_dpotrf executed in " << time << " secs." << std::endl;

  std::cout << "LAPACKE_dpotrf return value is: " << info << std::endl;

  return info;
Esempio n. 2
// This task computes the Cholesky factorization of a symmetric positive definite
// tile in place (LAPACKE_dpotrf, row-major, lower triangle).
// This is the first step in the tile Cholesky factorization.
//
// paramv[0]: k (step index; only used for debugging output)
// paramv[1]: tileSize, the order of the square tile
// paramv[2]: GUID of the event to satisfy once the tile has been factorized
// depv[0]:   data block holding the tile
//
// Always returns NULL_GUID. The output event is satisfied with depv[0].guid
// only when the factorization succeeds; on failure a diagnostic is printed
// and the event is left unsatisfied.
ocrGuid_t lapacke_dpotrf_task ( u32 paramc, u64* paramv, u32 depc, ocrEdtDep_t depv[]) {
    u64 *func_args = paramv;
    u32 tileSize = (u32) func_args[1];
    ocrGuid_t out_lkji_kkkp1_event_guid = (ocrGuid_t) func_args[2];

    double* aBlock = (double*) (depv[0].ptr);

    // The status must be signed: LAPACKE reports an illegal argument as a
    // negative value, which an unsigned variable could never expose.
    int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'L', tileSize, aBlock, tileSize );

    if (info != 0) {
        if (info > 0) PRINTF("Matrix A is not Symmetric Positive Definite (SPD)");
        else PRINTF("i-th parameter had an illegal value.");
        return NULL_GUID;
    }

    ocrEventSatisfy(out_lkji_kkkp1_event_guid, depv[0].guid);

    return NULL_GUID;
}
Esempio n. 3
/*
 * Copies X into L, writing 0.0 to the entries at flat index j*n+i where
 * i > j, and then factorizes L in place with LAPACKE_dpotrf (column-major,
 * triangle selector 'U').
 *
 * X      : input n x n matrix (flat array, read only here)
 * L      : output n x n matrix (flat array), overwritten
 * n      : matrix order
 * lapack : selects between two code paths; only the LAPACKE call is visible
 *
 * Returns error_code, which in the visible code is assigned only by the
 * LAPACKE_dpotrf call.
 *
 * NOTE(review): this excerpt looks truncated - closing braces, the end of the
 * `#if 0` section and the body of the `!lapack` branch are not present.
 * Confirm against the complete file before relying on this description.
 */
static int Choleski_decompose(double *X, double *L, int n, int lapack){
	int i,j,error_code;
	char upper = 'U';

	/* Copy X into L, zeroing the entries with i > j. */
	for (i=0; i < n; i++){
		for (j=0; j < n; j++){
			if (i > j)
				L[j*n+i] = 0.0;
			else {
				L[j*n+i] = X[j*n + i];
#if 0
	if (!lapack){
	} else {
	error_code = LAPACKE_dpotrf( LAPACK_COL_MAJOR, upper, n, L, n );

	return error_code;
/**
 * Computes a Cholesky factorization of m_matrix, dispatching on m_matrix_type.
 *
 * - MATRIX_SPARSE: runs cholmod_analyze followed by cholmod_factorize and
 *   returns ForBESUtils::STATUS_OK when m_factor->minor equals
 *   m_matrix->m_nrows, otherwise ForBESUtils::STATUS_NUMERICAL_PROBLEMS.
 * - Any other type: copies the matrix data into m_L, then calls
 *   LAPACKE_dpotrf (MATRIX_DENSE) or LAPACKE_dpptrf (MATRIX_SYMMETRIC),
 *   both column-major with 'L', and returns the resulting info value
 *   (STATUS_OK if neither branch runs).
 *
 * NOTE(review): this excerpt looks truncated - the body of the
 * `m_sparse == NULL` check and several closing braces are not present, and
 * `L` in `L.set(...)` is not declared in the visible lines. Confirm against
 * the complete file.
 */
int CholeskyFactorization::factorize() {
    if (m_matrix_type == Matrix::MATRIX_SPARSE) {
        /* Cholesky decomposition of a SPARSE matrix: */
        if (m_matrix->m_sparse == NULL) {
        /* analyze */
        m_factor = cholmod_analyze(m_matrix->m_sparse, Matrix::cholmod_handle());
        /* factorize */
        cholmod_factorize(m_matrix->m_sparse, m_factor, Matrix::cholmod_handle());
        /* Success: status = 0, else 1*/
        return (m_factor->minor == m_matrix->m_nrows) ? ForBESUtils::STATUS_OK : ForBESUtils::STATUS_NUMERICAL_PROBLEMS;
    } else { /* If this is any non-sparse matrix: */
        memcpy(m_L, m_matrix->getData(), m_matrix->length() * sizeof (double)); /* m_L := m_matrix.m_data */
        int info = ForBESUtils::STATUS_OK;
        if (m_matrix_type == Matrix::MATRIX_DENSE) { /* This is a dense matrix */
            info = LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', m_matrix_nrows, m_L, m_matrix_nrows);
            /* Zero the entries with column index j > row index i. */
            for (size_t i = 0; i < m_matrix_nrows; i++) {
                for (size_t j = i + 1; j < m_matrix_nrows; j++) {
                    L.set(i, j, 0.0);
        } else if (m_matrix_type == Matrix::MATRIX_SYMMETRIC) { /* This is a symmetric matrix */
            info = LAPACKE_dpptrf(LAPACK_COL_MAJOR, 'L', m_matrix_nrows, m_L);
        return info;
Esempio n. 5
// Builds the N x N kernel matrix Kxx from the inputs Xin/Din using kern[K],
// Cholesky-factorizes it with LAPACKE_dpotrf (row-major, 'L') and accumulates
// a log-likelihood value into R[0].
//
// If the factorization fails, the code prints the LAPACKE status and the
// hyperparameters, stores -1e22 in R[0] and returns early.
//
// NOTE(review): N, D and K are not assigned in the visible lines - presumably
// they are set from n, d and kindex in lines missing from this excerpt;
// verify. Sin is not referenced in the visible lines.
// NOTE(review): this excerpt looks truncated - closing braces, the solve step
// and the body of the final loop are not present.
GP_LKonly::GP_LKonly(int d, int n, double* Xin, double* Yin, double* Sin, int* Din, int kindex, double* hyp, double* R){
		//lk = 0.;
		//hyp process
		std::vector<double> ih = std::vector<double>(numhyp(kindex,d));
		hypconvert(&hyp[0], K, D, &ih[0]);
		std::vector<double>Kxx = std::vector<double>(N*N);
                double smodel = 0;
		// Fill the diagonal, then mirror each off-diagonal value into both triangles.
		for (int i=0; i<N; i++){
			Kxx[i*N+i] = kern[K](&Xin[i*D], &Xin[i*D],Din[i],Din[i],D,&ih[0],&smodel);
			for (int j=0; j<i; j++){
				Kxx[i*N+j] = Kxx[i+N*j] = kern[K](&Xin[i*D], &Xin[j*D],Din[i],Din[j],D,&ih[0],&smodel);
				//if (i<10){printf("%f %f %d %d %d %f %f %f %f _ ",Xin[i*D], Xin[j*D],Din[i],Din[j],D,ih[0],ih[1],ih[2],k(&Xin[i*D], &Xin[j*D],Din[i],Din[j],D,&ih[0]));}

		//Cholesky factorization of Kxx (in place)
		int c = LAPACKE_dpotrf(LAPACK_ROW_MAJOR,'L',N,&Kxx[0],N);
                if (c!=0){
                    printf("failed to cho fac Kxx %d with hyp [",c);
                    for (int i=0; i<numhyp(kindex,d);i++){printf("%f ",hyp[i]);}
                    R[0]=-1e22; return;
		// Yd starts as a copy of Yin.
		std::vector<double>Yd = std::vector<double>(N);
		for (int i=0; i<N; i++){
				Yd[i]= Yin[i];
		//solve against Y
		//calculate the log-likelihood

		R[0] = - 0.5*N*L2PI;
		//printf("\n1 %f\n",R[0]);
		for (int i=0; i<N; i++){
		//printf("2 %f\n",R[0]);
		R[0] -= 0.5*cblas_ddot(N,&Yin[0],1,&Yd[0],1);
		//printf("3 %f\n",R[0]);
// Initialises the member matrices: computes _emat via calcEmat, runs
// LAPACKE_dpotrf (row-major, 'U') on eight sub-blocks of _emat, and then
// computes _gradientCalcs via calcGradient.
//
// The start offset of sub-block i within _emat is the running sum of
// _basisPop[j] * (_R + 1) over the preceding blocks j < i.
//
// NOTE(review): the return value ch0 of LAPACKE_dpotrf is not inspected in the
// visible code, so a failed factorization would pass unnoticed - confirm.
// NOTE(review): this excerpt looks truncated - closing braces are missing and
// the indentation suggests an enclosing statement was dropped.
void GeneticAlgorithm::initialiseMatrices(){
	_emat = calcEmat(_R, _basis);

	int address = 0;
	int addresses[8];
		// Record the start offset of each of the eight blocks.
		for(int i = 0; i < 8; i ++){
			addresses[i] = address;
			address += (_basisPop[i])*(_R+1);
		// Factorize each block in place; _R is passed as the leading dimension.
		for(int i = 0; i < 8; i++){
			int ch0 = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'U', _basisPop[i], &_emat[addresses[i]], _R);


	_gradientCalcs = calcGradient(_R,_basis);
// Benchmarks a tiled Cholesky factorization scheduled through hStreams and
// compares it with a single LAPACKE_dpotrf call on the full matrix.
//
// For each of niter iterations the function copies mat into A_my and A_MKL,
// splits A_my into num_tiles x num_tiles tiles, runs the potrf / trsm /
// syrk / gemm tile sweep over k, reassembles the tiles into A_my, times
// LAPACKE_dpotrf on A_MKL and, when verify == 1, compares both results with
// verify_results. Mean and standard deviation of the timings are printed at
// the end.
//
// Parameters (as used in the visible code):
//   mat            input matrix, copied at the start of every iteration
//   tile_size      order of one square tile
//   num_tiles      number of tiles per matrix dimension
//   mat_size       order of the full matrix
//   niter          number of timed iterations
//   max_log_str    modulus used when wrapping the queue (stream) indices
//   layRow         true selects row-major BLAS/LAPACK layouts, false column-major
//   verify         1 enables the comparison against the LAPACKE result
//   num_doms       number of domains the tiles are distributed over
//   use_host       non-zero routes potrf/trsm through hStreams_custom_* calls
//   num_mics       together with use_host decides the is_mic flag
//   host_ht_offset offset added to the queue index when is_mic is 1
//
// Returns 0 when `result` is true and 1 otherwise.
//
// NOTE(review): `result` is assigned only when verify == 1, so with any other
// verify value the final check reads an uninitialized bool - confirm intent.
// NOTE(review): q_syrk_gemm has 10 entries and is indexed by ic < num_doms;
// this assumes num_doms <= 10 - confirm against the callers.
// NOTE(review): mach_wide_league and loc_verbose are not declared in this
// block; they are presumably defined elsewhere in the file.
// NOTE(review): this excerpt looks truncated - closing braces and the heads of
// several multi-line calls (only their trailing arguments remain) are missing.
int cholesky_tiled(double *mat, int tile_size, int num_tiles, int mat_size,
                   int niter, int max_log_str, bool layRow, int verify, int num_doms, int use_host, int num_mics,
                   int host_ht_offset)
    //verification result
    bool result;
    //total number of tiles
    int tot_tiles = num_tiles * num_tiles;

    //memory allocation for matrix for tiled-Cholesky
    double *A_my = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for matrix for MKL cholesky (for comparison)
    double *A_MKL = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for tiled matrix
    double **Asplit = new double* [tot_tiles];
    int mem_size_tile = tile_size * tile_size * sizeof(double);

#define HSTR_BUFFER_PROPS_VALUES {        \
        HSTR_MEM_TYPE_NORMAL,             \
        HSTR_MEM_ALLOC_PREFERRED,         \

    for (int i = 0; i < tot_tiles; ++i) {
        //Buffer per tile, host allocation
        Asplit[i] = (double *)_mm_malloc(mem_size_tile, 64);

        //Buffer creation and allocation on the card
        //hStreams_app_create_buf((void *)Asplit[i], mem_size_tile);
                              (void *)Asplit[i],

    double tbegin, tend;

    int iter;
    int info;

    //Events are needed for various synchronizations to enforce
    //data dependence between and among data-transfers/computes
    HSTR_EVENT *eventcpyto = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventcpyto_trsm = new HSTR_EVENT[tot_tiles * num_doms];
    HSTR_EVENT *eventcpyfr = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventpotrf = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventtrsm = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventsyrk = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventgemm = new HSTR_EVENT[tot_tiles];

    //for timing tiled cholesky
    double *totTimeMsec = new double [niter];

    //for timing MKL cholesky
    double *totTimeMsecMKL = new double [niter];


    //these queues are used for queuing up compute on the card and
    //data transfers to/from the card.
    //q_trsm for dtrsm, q_potrf for dpotrf, q_syrk_gemm for both dsyrk and dgemm.
    //The queues are incremented by one for every compute queued and wrap
    //around the max_log_str available. This ensures good load-balancing.
    int q_trsm, q_potrf;
    int q_syrk_gemm[10];

    CBLAS_ORDER blasLay;
    int lapackLay;

    if (layRow) {
        blasLay = CblasRowMajor;
        lapackLay = LAPACK_ROW_MAJOR;
    } else {
        blasLay = CblasColMajor;
        lapackLay = LAPACK_COL_MAJOR;

    for (iter = 0; iter < niter; ++iter) {

        //copying matrices into separate variables for tiled cholesky (A_my)
        //and MKL cholesky (A_MKL)
        //The output overwrites the matrices and hence the need to copy
        //for each iteration
        copy_mat(mat, A_my, mat_size);
        copy_mat(mat, A_MKL, mat_size);

        unsigned int m, n, k;

        printf("\nIteration = %d\n", iter);

        //splitting time included in the timing
        //This splits the input matrix into tiles (or blocks)
        split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow);

        //beginning of timing
        tbegin = dtimeGet();

        int ic;
        int is_mic;
        for (ic = 0; ic < num_doms; ++ic) {
            q_syrk_gemm[ic] = 0;
        q_potrf = 0;
        q_trsm = 0;
        for (k = 0; k < num_tiles; ++k) {
            //dpotrf is executed on the host on the diagonal tile
            if (mach_wide_league) {
                q_potrf = 0;
            } else {
                q_potrf = q_syrk_gemm[0];

            int qindex = (int)q_potrf % max_log_str;
            if (use_host) {
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to host in queue %d, triggering event eventcpyto[%d][%d]\n",
                               k, k, (int)(qindex), k, k);

                                             Asplit[k * num_tiles + k],
                                             Asplit[k * num_tiles + k], mem_size_tile,
                                             &eventcpyto[k * num_tiles + k]);

            if (k > 0) {
                if (use_host) {
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyfr[k * num_tiles + k], 0, NULL, NULL);
                } else {
                    hStreams_app_event_wait(1, &eventcpyfr[k * num_tiles + k]);

                if (loc_verbose > 0) {
                    printf("Waiting on eventcpyfr[%d]\n", k * num_tiles + k);

            if (loc_verbose > 0)
                printf("Executing potrf on host for tile[%d][%d], in queue (if use_host) %d, triggerring eventpotrf[%d][%d]\n",
                       k, k, qindex, k, k);

            if (use_host) {
                CHECK_HSTR_RESULT(hStreams_custom_dpotrf(lapackLay, 'L', tile_size,
                                  Asplit[k * num_tiles + k], tile_size, qindex, &eventpotrf[k * num_tiles + k]));
            } else {
                info = LAPACKE_dpotrf(lapackLay, 'L', tile_size,
                                      Asplit[k * num_tiles + k], tile_size);

            if (mach_wide_league) {
                q_trsm = q_syrk_gemm[0];
            } else {
                q_trsm = q_potrf;

            for (m = k + 1; m < num_tiles; ++m) {

                if (mach_wide_league) {
                    qindex = (int)(q_trsm % max_log_str + 1);
                } else {
                    qindex = (int)(q_trsm % max_log_str);

                if (use_host) {
                    if (k == 0) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] to host in queue %d, triggering event eventcpyto[%d][%d]\n",
                                   m, k, (int)(qindex), m, k);

                                                 Asplit[m * num_tiles + k],
                                                 Asplit[m * num_tiles + k], mem_size_tile,
                                                 &eventcpyto[m * num_tiles + k]);

                if (k > 0) {
                    if (use_host) {
                        hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyfr[m * num_tiles + k], 0, NULL, NULL);
                    } else {
                        hStreams_app_event_wait(1, &eventcpyfr[m * num_tiles + k]);

                    if (loc_verbose > 0) {
                        printf("Waiting on eventcpyfr[%d]\n", m * num_tiles + k);

                if (use_host)
                    //hStreams_app_event_wait(1, &eventpotrf[k*num_tiles + k]);
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventpotrf[k * num_tiles + k], 0, NULL, NULL);

                //dtrsm is executed on the host
                if (loc_verbose > 0)
                    printf("Executing trsm for tile[%d][%d] on host, in queue (if use_host) %d, triggering eventtrsm[%d][%d]\n",
                           m, k, qindex, m, k);

                if (use_host) {
                    CHECK_HSTR_RESULT(hStreams_custom_dtrsm(blasLay, CblasRight, CblasLower,
                                                            CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
                                                            Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k],
                                                            tile_size, qindex,
                                                            &eventtrsm[m * num_tiles + k]));
                } else {
                    cblas_dtrsm(blasLay, CblasRight, CblasLower,
                                CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
                                Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k],

                //transfer to all cards
                for (ic = 0; ic < num_doms; ++ic) {
                    if ((use_host == 1) && (num_mics >= 1)) {
                        if (ic == 0) {
                            is_mic = 0;    //this is host
                        } else {
                            is_mic = 1;
                    } else {
                        is_mic = 0;

                    if (mach_wide_league) {
                        qindex = (int)q_trsm % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset;
                    } else {
                        qindex = (int)q_trsm % max_log_str + ic * max_log_str + is_mic * host_ht_offset;

                    if (use_host)
                        //hStreams_app_event_wait(1, &eventtrsm[m*num_tiles + k]);
                        hStreams_app_event_wait_in_stream(qindex, 1, &eventtrsm[m * num_tiles + k], 0, NULL, NULL);

                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card %d in queue %d, triggering event eventcpyto_trsm[%d]\n",
                               m, k, ic, (int)(qindex), m * num_tiles + k + ic * tot_tiles);

                                             Asplit[m * num_tiles + k],
                                             Asplit[m * num_tiles + k], mem_size_tile,
                                             &eventcpyto_trsm[m * num_tiles + k + ic * tot_tiles]);


            if (use_host) {
                q_syrk_gemm[0] = q_trsm;
                for (ic = 1; ic < num_doms; ++ic) {
                    q_syrk_gemm[ic] = 0;
            } else {
                for (ic = 0; ic < num_doms; ++ic) {
                    q_syrk_gemm[ic] = 0;

            for (n = k + 1; n < num_tiles; ++n) {
                ic = n % num_doms; //round-robin rows across num_doms

                if ((use_host == 1) && (num_mics >= 1)) {
                    if (ic == 0) {
                        is_mic = 0;    //this is host
                    } else {
                        is_mic = 1;
                } else {
                    is_mic = 0;

                if (mach_wide_league) {
                    qindex  = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset;
                } else {
                    qindex  = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + is_mic * host_ht_offset;

                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               n, n, (int)(qindex));

                                             Asplit[n * num_tiles + n],
                                             Asplit[n * num_tiles + n], mem_size_tile,
                                             &eventcpyto[n * num_tiles + n]);

                //hStreams_app_event_wait(1, &eventcpyto_trsm[n*num_tiles + k + ic*tot_tiles]);
                hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[n * num_tiles + k + ic * tot_tiles], 0, NULL, NULL);
                if (loc_verbose > 0) {
                    printf("Waiting on eventcpyto_trsm[%d]\n", n * num_tiles + k + ic * tot_tiles);

                if (k > 0) {
                    //hStreams_app_event_wait(1, &eventsyrk[n*num_tiles + n]);
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventsyrk[n * num_tiles + n], 0, NULL, NULL);
                    if (loc_verbose > 0) {
                        printf("Waiting on eventsyrk[%d]\n", n * num_tiles + n);

                //dsyrk is executed on the card
                if (loc_verbose > 0)
                    printf("Executing syrk for tile[%d][%d] on card in queue %d, triggering event eventsyrk[%d]\n",
                           n, n, (int)(qindex), n * num_tiles + n);

                CHECK_HSTR_RESULT(hStreams_custom_dsyrk(blasLay, CblasLower, CblasNoTrans,
                                                        tile_size, tile_size, -1.0, Asplit[n * num_tiles + k],
                                                        tile_size, 1.0, Asplit[n * num_tiles + n], tile_size,
                                                        (int)(qindex), &eventsyrk[n * num_tiles + n]));

                //send tile to host (only if n = k+1)
                if (n == k + 1) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] from card  to host in queue %d, triggering event eventcpyfr[%d]\n",
                               n, n, (int)(qindex), n * num_tiles + n);

                                             Asplit[n * num_tiles + n],
                                             Asplit[n * num_tiles + n], mem_size_tile,
                                             &eventcpyfr[n * num_tiles + n]);



                for (m = n + 1; m < num_tiles; ++m) {
                    ic = m % num_doms; //round-robin rows across num_doms

                    if ((use_host == 1) && (num_mics >= 1)) {
                        if (ic == 0) {
                            is_mic = 0;    //this is host
                        } else {
                            is_mic = 1;
                    } else {
                        is_mic = 0;

                    if (mach_wide_league) {
                        qindex = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset;
                    } else {
                        qindex = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + is_mic * host_ht_offset;

                    if (k == 0) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] to card in queue %d\n",
                                   m, n, (int)(qindex));

                                                 Asplit[m * num_tiles + n],
                                                 Asplit[m * num_tiles + n], mem_size_tile,
                                                 &eventcpyto[m * num_tiles + n]);

                    if (loc_verbose > 0) {
                        printf("Waiting on eventcpyto_trsm[%d]\n", m * num_tiles + k + ic * tot_tiles);
                    //hStreams_app_event_wait(1, &eventcpyto_trsm[m*num_tiles + k + ic*tot_tiles]);
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[m * num_tiles + k + ic * tot_tiles], 0, NULL, NULL);

                    if (loc_verbose > 0) {
                        printf("Waiting on eventcpyto_trsm[%d]\n", n * num_tiles + k + ic * tot_tiles);
                    //hStreams_app_event_wait(1, &eventcpyto_trsm[n*num_tiles + k + ic*tot_tiles]);
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[n * num_tiles + k + ic * tot_tiles], 0, NULL, NULL);

                    if (k > 0) {
                        //hStreams_app_event_wait(1, &eventgemm[m*num_tiles + n]);
                        hStreams_app_event_wait_in_stream(qindex, 1, &eventgemm[m * num_tiles + n], 0, NULL, NULL);
                        if (loc_verbose > 0) {
                            printf("Waiting on eventgemm[%d]\n", m * num_tiles + n);

                    //dgemm is executed on the card
                    if (loc_verbose > 0)
                        printf("Executing gemm for tile[%d][%d] on card in queue %d, triggering event eventgemm[%d]\n",
                               m, n, (int)(qindex), m * num_tiles + n);

                                                         blasLay, CblasNoTrans, CblasTrans,
                                                         tile_size, tile_size, tile_size, -1.0, Asplit[m * num_tiles + k],
                                                         tile_size, Asplit[n * num_tiles + k], tile_size, 1.0,
                                                         Asplit[m * num_tiles + n], tile_size,
                                                         &eventgemm[m * num_tiles + n]));

                    //send tile to host (only if n = k+1)
                    if (n == k + 1) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] from card to host in queue %d, triggering event eventcpyfr[%d]\n",
                                   m, n, (int)(qindex), m * num_tiles + n);

                            Asplit[m * num_tiles + n],
                            Asplit[m * num_tiles + n], mem_size_tile,
                            &eventcpyfr[m * num_tiles + n]);


        //synchronizing all the streams

        //end of timing
        tend = dtimeGet();

        totTimeMsec[iter] = 1e3 * (tend - tbegin);
        printf("time for Tiled hstreams Cholesky for iteration %d = %.2f msec\n",
               iter, totTimeMsec[iter]);

        //assembling of tiles back into full matrix
        assemble(Asplit, A_my, num_tiles, tile_size, mat_size, layRow);

        //calling mkl cholesky for verification and timing comparison.
        //Using auto-offload feature of MKL
        tbegin = dtimeGet();

        //calling MKL dpotrf on the full matrix
        info = LAPACKE_dpotrf(lapackLay, 'L', mat_size, A_MKL, mat_size);

        tend = dtimeGet();
        totTimeMsecMKL[iter] = 1e3 * (tend - tbegin);
        printf("time for MKL Cholesky (AO) for iteration %d = %.2f msec\n",
               iter, totTimeMsecMKL[iter]);

        if (info != 0) {
            printf("error with dpotrf\n");

        if (verify == 1) {
            result = verify_results(A_my, A_MKL, mat_size * mat_size);
            if (result == true) {
                printf("Tiled Cholesky successful\n");
            } else {
                printf("Tiled Chloesky failed\n");

    double meanTimeMsec, stdDevMsec;
    double meanTimeMsecMKL, stdDevMsecMKL;
    mean_and_stdev(totTimeMsec, meanTimeMsec, stdDevMsec, niter);
    mean_and_stdev(totTimeMsecMKL, meanTimeMsecMKL, stdDevMsecMKL, niter);

    //operation count used for the Gflops figures: mat_size^3 / 3, scaled by 1e-9
    double gflops = pow(mat_size, 3.0) / 3.0 * 1e-9;

    printf("\nMatrix size = %d\n", mat_size);

    printf("Tiled hStreams Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using mean Time) = %.2f\n",
           niter - 1, meanTimeMsec, stdDevMsec, gflops / (meanTimeMsec * 1e-3));

    printf("\nMKL AO Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using meanTime) = %.2f\n\n",
           niter - 1, meanTimeMsecMKL, stdDevMsecMKL, gflops / (meanTimeMsecMKL * 1e-3));

    //NOTE(review): the body of this loop is missing from the excerpt; A_my and
    //A_MKL are also not released in the visible lines - confirm in the full file.
    for (int i = 0; i < tot_tiles; ++i) {
    delete [] Asplit;
    delete [] eventcpyto;
    delete [] eventcpyto_trsm;
    delete [] eventcpyfr;
    delete [] eventpotrf;
    delete [] eventtrsm;
    delete [] eventsyrk;
    delete [] eventgemm;
    delete [] totTimeMsec;
    delete [] totTimeMsecMKL;

    // true result indicates all OK
    if (result) {
        return 0;
    return 1;

Esempio n. 8
// Cholesky-factorizes the N x N matrix stored in Kxx in place
// (LAPACKE_dpotrf, row-major, 'L') and returns the LAPACKE status code.
// NOTE(review): the closing brace is missing from this excerpt.
int GP::fac(){

	return LAPACKE_dpotrf(LAPACK_ROW_MAJOR,'L',N,&Kxx[0],N);
// Benchmarks a tiled Cholesky factorization scheduled through hStreams and
// compares it with a single LAPACKE_dpotrf call on the full matrix.
//
// For each of niter iterations the function copies mat into A_my and A_MKL,
// splits A_my into num_tiles x num_tiles tiles, and sweeps over k:
// LAPACKE_dpotrf on the diagonal tile, hStreams_custom_dtrsm on the tiles
// below it, then hStreams_custom_dsyrk / hStreams_app_dgemm on the trailing
// tiles, with hStreams_app_xfer_memory moving tiles in both directions.
// Afterwards the tiles are reassembled, LAPACKE_dpotrf is timed on A_MKL and,
// when verify == 1, both results are compared with verify_results.
//
// Parameters (as used in the visible code):
//   mat          input matrix, copied at the start of every iteration
//   tile_size    order of one square tile
//   num_tiles    number of tiles per matrix dimension
//   mat_size     order of the full matrix
//   niter        number of timed iterations
//   max_log_str  modulus used when wrapping the queue (stream) indices
//   layRow       true selects row-major BLAS/LAPACK layouts, false column-major
//   verify       1 enables the comparison against the LAPACKE result
//
// NOTE(review): `res` receives the hStreams return codes but is never checked
// in the visible code - confirm whether errors are meant to be ignored.
// NOTE(review): loc_verbose is not declared in this block; it is presumably
// defined elsewhere in the file.
// NOTE(review): this excerpt looks truncated - closing braces, the `#endif`
// matching `#ifndef _WIN32`, and at least one call argument are missing.
void cholesky_tiled(double *mat, int tile_size, int num_tiles, int mat_size,
                    int niter, int max_log_str, bool layRow, int verify)
    //total number of tiles
    int tot_tiles = num_tiles * num_tiles;

    //memory allocation for matrix for tiled-Cholesky
    double *A_my = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for matrix for MKL cholesky (for comparison)
    double *A_MKL = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for tiled matrix
    double **Asplit = new double* [tot_tiles];
    int mem_size_tile = tile_size * tile_size * sizeof(double);

    for (int i = 0; i < tot_tiles; ++i) {
        //Buffer per tile, host allocation
        Asplit[i] = (double *)_mm_malloc(mem_size_tile, 64);

        //Buffer creation and allocation on the card
        hStreams_app_create_buf((void *)Asplit[i], mem_size_tile);

    double tbegin, tend;

    int iter;
    int info;

    //Events are needed for various synchronizations to enforce
    //data dependence between and among data-transfers/computes
    HSTR_EVENT *eventcpyto = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventcpyfr = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventpotrf = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventtrsm = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventsyrk = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventgemm = new HSTR_EVENT[tot_tiles];

    //for timing tiled cholesky
    double *totTimeMsec = new double [niter];

    //for timing MKL cholesky
    double *totTimeMsecMKL = new double [niter];

    HSTR_RESULT res;

    //these queues are used for queuing up compute on the card and
    //data transfers to/from the card.
    //q_trsm for dtrsm, q_potrf for dpotrf, q_syrk_gemm for both dsyrk and dgemm.
    //The queues are incremented by one for every compute queued and wrap
    //around the max_log_str available. This ensures good load-balancing.
    int q_trsm, q_potrf, q_syrk_gemm;

    CBLAS_ORDER blasLay;
    int lapackLay;

    if (layRow) {
        blasLay = CblasRowMajor;
        lapackLay = LAPACK_ROW_MAJOR;
    } else {
        blasLay = CblasColMajor;
        lapackLay = LAPACK_COL_MAJOR;

    for (iter = 0; iter < niter; ++iter) {

        //copying matrices into separate variables for tiled cholesky (A_my)
        //and MKL cholesky (A_MKL)
        //The output overwrites the matrices and hence the need to copy
        //for each iteration
        copy_mat(mat, A_my, mat_size);
        copy_mat(mat, A_MKL, mat_size);

        unsigned int m, n, k;

        printf("\nIteration = %d\n", iter);

        split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow);
        //beginning of timing
        tbegin = dtimeGet();

        //splitting time included in the timing
        //This splits the input matrix into tiles (or blocks)
        //split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow);

        q_potrf = 0;
        for (k = 0; k < num_tiles; ++k) {
            //dpotrf is executed on the host on the diagonal tile
            //the results are then sent to the card
            if (k > 0) {
                hStreams_app_event_wait(1, &eventsyrk[k * num_tiles + k]);
                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] to host in queue %d\n",
                           k, k, (int)(q_potrf % max_log_str)) ;

                hStreams_app_xfer_memory(Asplit[k * num_tiles + k],
                                         Asplit[k * num_tiles + k], mem_size_tile,
                                         (int)(q_potrf % max_log_str), HSTR_SINK_TO_SRC,
                                         &eventcpyfr[k * num_tiles + k]);

                hStreams_app_event_wait(1, &eventcpyfr[k * num_tiles + k]);

            if (loc_verbose > 0) {
                printf("Executing potrf on host for tile[%d][%d]\n", k, k);

            info = LAPACKE_dpotrf(lapackLay, 'L', tile_size,
                                  Asplit[k * num_tiles + k], tile_size);

            if (k < num_tiles - 1) {
                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] to card in queue %d\n",
                           k, k, (int)(q_potrf % max_log_str));

                hStreams_app_xfer_memory(Asplit[k * num_tiles + k],
                                         Asplit[k * num_tiles + k], mem_size_tile,
                                         (int)(q_potrf % max_log_str), HSTR_SRC_TO_SINK,
                                         &eventcpyto[k * num_tiles + k]);

            q_trsm = 0;
            for (m = k + 1; m < num_tiles; ++m) {
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               m, k, (int)(q_trsm % max_log_str));

                    hStreams_app_xfer_memory(Asplit[m * num_tiles + k],
                                             Asplit[m * num_tiles + k], mem_size_tile,
                                             (int)(q_trsm % max_log_str), HSTR_SRC_TO_SINK,
                                             &eventcpyto[m * num_tiles + k]);

                hStreams_app_event_wait(1, &eventcpyto[k * num_tiles + k]);

                if (k > 0) {
                    hStreams_app_event_wait(1, &eventgemm[m * num_tiles + k]);

                //dtrsm is executed on the card
                if (loc_verbose > 0)
                    printf("Executing trsm for tile[%d][%d] on card in queue %d\n",
                           m, k, (int)(q_trsm % max_log_str));

                res = hStreams_custom_dtrsm(blasLay, CblasRight, CblasLower,
                                            CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
                                            Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k],
                                            tile_size, (int)(q_trsm % max_log_str),
                                            &eventtrsm[m * num_tiles + k]);

                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] back to host in queue %d\n",
                           m, k, (int)(q_trsm % max_log_str));

                hStreams_app_xfer_memory(Asplit[m * num_tiles + k],
                                         Asplit[m * num_tiles + k], mem_size_tile,
                                         (int)(q_trsm % max_log_str), HSTR_SINK_TO_SRC,
                                         &eventcpyfr[m * num_tiles + k]);


            q_syrk_gemm = 0;
            for (n = k + 1; n < num_tiles; ++n) {
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               n, n, (int)(q_syrk_gemm % max_log_str));

                    hStreams_app_xfer_memory(Asplit[n * num_tiles + n],
                                             Asplit[n * num_tiles + n], mem_size_tile,
                                             (int)(q_syrk_gemm % max_log_str), HSTR_SRC_TO_SINK,
                                             &eventcpyto[n * num_tiles + n]);

                hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]);
                if (k > 0) {
                    hStreams_app_event_wait(1, &eventsyrk[n * num_tiles + n]);

                //dsyrk is executed on the card
                if (loc_verbose > 0)
                    printf("Executing syrk for tile[%d][%d] on card in queue %d\n",
                           n, n, (int)(q_syrk_gemm % max_log_str));

                res = hStreams_custom_dsyrk(blasLay, CblasLower, CblasNoTrans,
                                            tile_size, tile_size, -1.0, Asplit[n * num_tiles + k],
                                            tile_size, 1.0, Asplit[n * num_tiles + n], tile_size,
                                            (int)(q_syrk_gemm % max_log_str), &eventsyrk[n * num_tiles + n]);


                for (m = n + 1; m < num_tiles; ++m) {
                    if (k == 0) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] to card in queue %d\n",
                                   m, n, (int)(q_syrk_gemm % max_log_str));

                        //NOTE(review): unlike the other transfers, no direction
                        //argument is visible in this call - likely lost in extraction.
                        hStreams_app_xfer_memory(Asplit[m * num_tiles + n],
                                                 Asplit[m * num_tiles + n], mem_size_tile,
                                                 (int)(q_syrk_gemm % max_log_str),
                                                 &eventcpyto[m * num_tiles + n]);

                    hStreams_app_event_wait(1, &eventtrsm[m * num_tiles + k]);
                    hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]);

                    if (k > 0) {
                        hStreams_app_event_wait(1, &eventgemm[m * num_tiles + n]);

                    //dgemm is executed on the card
                    if (loc_verbose > 0)
                        printf("Executing gemm for tile[%d][%d] on card in queue %d\n",
                               m, n, (int)(q_syrk_gemm % max_log_str));

                    res = hStreams_app_dgemm(blasLay, CblasNoTrans, CblasTrans,
                                             tile_size, tile_size, tile_size, -1.0, Asplit[m * num_tiles + k],
                                             tile_size, Asplit[n * num_tiles + k], tile_size, 1.0,
                                             Asplit[m * num_tiles + n], tile_size,
                                             (int)(q_syrk_gemm % max_log_str), &eventgemm[m * num_tiles + n]);


        //synchronizing all the streams

        //end of timing
        tend = dtimeGet();

        totTimeMsec[iter] = 1e3 * (tend - tbegin);
        printf("time for Tiled hstreams Cholesky for iteration %d = %.2f msec\n",
               iter, totTimeMsec[iter]);

        //assembling of tiles back into full matrix
        assemble(Asplit, A_my, num_tiles, tile_size, mat_size, layRow);

        //calling mkl cholesky for verification and timing comparison.
        //Using auto-offload feature of MKL
#ifndef _WIN32
        //FIXME: calling this function causes a crash on Windows
        tbegin = dtimeGet();

        //calling MKL dpotrf on the full matrix
        info = LAPACKE_dpotrf(lapackLay, 'L', mat_size, A_MKL, mat_size);

        tend = dtimeGet();
        totTimeMsecMKL[iter] = 1e3 * (tend - tbegin);
        printf("time for MKL Cholesky (AO) for iteration %d = %.2f msec\n",
               iter, totTimeMsecMKL[iter]);

        if (info != 0) {
            printf("error with dpotrf\n");

        if (verify == 1) {
            bool result = verify_results(A_my, A_MKL, mat_size * mat_size);
            if (result == true) {
                printf("Tiled Cholesky successful\n");
            } else {
                printf("Tiled Chloesky failed\n");

    double meanTimeMsec, stdDevMsec;
    double meanTimeMsecMKL, stdDevMsecMKL;
    mean_and_stdev(totTimeMsec, meanTimeMsec, stdDevMsec, niter);
    mean_and_stdev(totTimeMsecMKL, meanTimeMsecMKL, stdDevMsecMKL, niter);

    //operation count used for the Gflops figures: mat_size^3 / 3, scaled by 1e-9
    double gflops = pow(mat_size, 3.0) / 3.0 * 1e-9;

    printf("\nMatrix size = %d\n", mat_size);

    printf("Tiled hStreams Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using mean Time) = %.2f\n",
           niter - 1, meanTimeMsec, stdDevMsec, gflops / (meanTimeMsec * 1e-3));

    printf("\nMKL AO Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using meanTime) = %.2f\n\n",
           niter - 1, meanTimeMsecMKL, stdDevMsecMKL, gflops / (meanTimeMsecMKL * 1e-3));

    //NOTE(review): the body of this loop is missing from the excerpt; A_my and
    //A_MKL are also not released in the visible lines - confirm in the full file.
    for (int i = 0; i < tot_tiles; ++i) {
    delete [] Asplit;
    delete [] eventcpyto;
    delete [] eventcpyfr;
    delete [] eventpotrf;
    delete [] eventtrsm;
    delete [] eventsyrk;
    delete [] eventgemm;
    delete [] totTimeMsec;
    delete [] totTimeMsecMKL;
