Ejemplo n.º 1
0
int main( int argc, char* argv[] )
{
  // Number of timed filter invocations; overridable via argv[1].
  int iterations = 1;
  if (argc > 1)
    iterations = atoi( argv[1] );

  // 10x10 image: one uint per pixel, zeroed before the kernel runs.
  int size = 10;
  uint dest[size*size];

  for ( int idx = 0; idx < size*size; idx++ )
    dest[idx] = 0;

  int temp = 0;

  // Stats are collected only around the filter kernel itself.
  for ( int iter = 0; iter < iterations; iter++ ) {
    test_stats_on( temp );
    masked_filter_scalar( dest, mask, src, size, size, g_coeff );
    test_stats_off( temp );
  }

  verify_results( dest, ref, size );

  return 0;
}
Ejemplo n.º 2
0
microseconds MatrixMul::blitz(const Args & args, std::mt19937 & gen)
{
	std::cout << "Test: blitz++ ";

	// rows x inner (A) times inner x cols (B) into rows x cols (C).
	const MatrixMulArgs & mul_args = dynamic_cast<const MatrixMulArgs&>(args);
	uint32_t rows, inner, cols;
	get_matrix_sizes(mul_args, rows, inner, cols);

	blitz::Array<double, 2> C(rows, cols), B(inner, cols), A(rows, inner);
	initialize_matrices(A.begin(), A.end(), B.begin(), B.end(), gen, mul_args);

	// Time only the multiplication; blitz evaluates the tensor
	// expression C(i,j) = sum_n A(i,n) * B(n,j) on assignment.
	const auto t0 = std::chrono::high_resolution_clock::now();
	blitz::firstIndex i;
	blitz::secondIndex j;
	blitz::thirdIndex n;
	C = blitz::sum(A(i,n) * B(n,j), n);
	const auto t1 = std::chrono::high_resolution_clock::now();

	if (args.test) {
		verify_results(C.begin(), C.end());
	}

	const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0);
	std::cout << elapsed.count() << std::endl;
	return elapsed;
}
Ejemplo n.º 3
0
microseconds MatrixMul::mult_blas(const Args & args, std::mt19937 & gen)
{
	std::cout << "Test: BLAS ";
	const MatrixMulArgs & mul_args = dynamic_cast<const MatrixMulArgs&>(args);

	uint32_t rows, inner, cols;
	get_matrix_sizes(mul_args, rows, inner, cols);

	// Row-major buffers: A is rows x inner, B is inner x cols, C is rows x cols.
	double * A = new double[rows * inner];
	double * B = new double[inner * cols];
	double * C = new double[rows * cols];

	initialize_matrices(A, A + rows*inner, B, B + inner*cols, gen, mul_args);

	// Timed region: C = 1.0 * A * B + 0.0 * C (leading dims = row widths).
	const auto t0 = std::chrono::high_resolution_clock::now();
	cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, rows, cols, inner,
				1.0, A, inner, B, cols, 0.0, C, cols);
	const auto t1 = std::chrono::high_resolution_clock::now();

	if (args.test) {
		verify_results(C, C + rows*cols);
	}

	const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0);
	std::cout << elapsed.count() << std::endl;

	delete[] A;
	delete[] B;
	delete[] C;

	return elapsed;
}
Ejemplo n.º 4
0
microseconds MatrixMul::plain_call(const Args & args, std::mt19937 & gen)
{
	std::cout << "Test: plain call ";

	const MatrixMulArgs & mul_args = dynamic_cast<const MatrixMulArgs&>(args);
	uint32_t m, k, l;
	get_matrix_sizes(mul_args, m, k, l);
	double * A = new double[m*k];
	double * B = new double[k*l];
	double * C = new double[m*l];

	// Fill A and B with random data; C is written from scratch below.
	initialize_matrices(A, A + m*k, B, B + k*l, gen, mul_args);

	// Naive triple loop ordered i-j-n so the innermost pass walks
	// contiguous rows of B and C (row-major friendly access).
	auto start = std::chrono::high_resolution_clock::now();

	for (uint32_t i = 0; i < m; ++i) {
		const double * a_row = A + i*k;
		double * c_row = C + i*l;

		// The j == 0 pass doubles as initialization of the C row,
		// so no separate zeroing sweep is needed.
		for (uint32_t n = 0; n < l; ++n) {
			c_row[n] = a_row[0] * B[n];
		}

		for (uint32_t j = 1; j < k; ++j) {
			const double * b_row = B + j*l;
			for (uint32_t n = 0; n < l; ++n) {
				c_row[n] += a_row[j] * b_row[n];
			}
		}
	}

	auto end = std::chrono::high_resolution_clock::now();

	if (args.test) {
		verify_results(C, C + m*l);
	}

	delete[] A;
	delete[] B;
	delete[] C;

	auto time = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
	std::cout << time.count() << std::endl;
	return time;
}
int main( int argc, char* argv[] )
{
    // Clear the output vector before the kernel writes it.
    int size = 100;
    int dest[size];

    for ( int idx = 0; idx < size; idx++ )
      dest[idx] = 0;

    // Collect stats around a single scalar vector-vector add.
    test_stats_on();
    vvadd_scalar( dest, src0, src1, size );
    test_stats_off();

    verify_results( dest, ref, size );

    return 0;
}
Ejemplo n.º 6
0
int
main(int argc, char *argv[])
{
    Elem *test_memory     = test_memory_data;
    Elem *expected_memory = expected_memory_data;


    if (shuffle_memory(test_memory) == 0) {
	printf("ERROR: shuffle_memory failed. not verifying results \n");
	return 1;
    }

    if (verify_results(test_memory, expected_memory) == 0) {
	printf("ERROR: verify_results failed. \n");
	return 1;
    }

    return 0;
}
Ejemplo n.º 7
0
int main( int argc, char* argv[] )
{
    // 10x10 image: one uint per pixel, zeroed before filtering.
    int size = 10;
    uint dest[size*size];

    for ( int idx = 0; idx < size*size; idx++ )
      dest[idx] = 0;

    int temp = 0;

    // Stats are collected only around the filter kernel itself.
    test_stats_on( temp );
    masked_filter_scalar( dest, mask, src, size, size, g_coeff );
    test_stats_off( temp );

    verify_results( dest, ref, size );

    return 0;

}
Ejemplo n.º 8
0
microseconds MatrixMul::boost_ublas(const Args & args, std::mt19937 & gen)
{
	std::cout << "Test: boost uBLAS ";
	const MatrixMulArgs & mul_args = dynamic_cast<const MatrixMulArgs&>(args);
	uint32_t rows, inner, cols;
	get_matrix_sizes(mul_args, rows, inner, cols);

	boost::numeric::ublas::matrix<double> A(rows, inner), B(inner, cols), C(rows, cols);
	initialize_matrices(A.data().begin(), A.data().end(), B.data().begin(), B.data().end(), gen, mul_args);

	// noalias() tells uBLAS the target does not alias the operands,
	// so the product is written straight into C without a temporary.
	const auto t0 = std::chrono::high_resolution_clock::now();
	noalias(C) = prod( A, B );
	const auto t1 = std::chrono::high_resolution_clock::now();

	if (args.test) {
		verify_results(C.data().begin(), C.data().end());
	}

	const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0);
	std::cout << elapsed.count() << std::endl;
	return elapsed;
}
Ejemplo n.º 9
0
microseconds MatrixMul::mult_blaze(const Args & args, std::mt19937 & gen)
{
	std::cout << "Test: Blaze ";
	const MatrixMulArgs & mul_args = dynamic_cast<const MatrixMulArgs&>(args);
	uint32_t rows, inner, cols;
	get_matrix_sizes(mul_args, rows, inner, cols);

	blaze::DynamicMatrix<double, blaze::rowMajor> A(rows, inner), B(inner, cols), C(rows, cols);
	initialize_matrices(A.data(), A.data() + rows*inner, B.data(), B.data() + inner*cols, gen, mul_args);

	// Blaze evaluates the whole product on assignment; time just that.
	const auto t0 = std::chrono::high_resolution_clock::now();
	C = A * B;
	const auto t1 = std::chrono::high_resolution_clock::now();

	if (args.test) {
		verify_results(C.data(), C.data() + rows*cols);
	}

	const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0);
	std::cout << elapsed.count() << std::endl;
	return elapsed;
}
Ejemplo n.º 10
0
int main( int argc, char* argv[] )
{
    // Output vector sized by the global constant C; zeroed up front.
    int size = C;
    int dest[size];

    for ( int idx = 0; idx < size; idx++ )
      dest[idx] = 0;

    int temp = 0;

    // warmup pass so the timed run below is not skewed by cold state
    mvmult_scalar( dest, (int*) matrix, vector, R, C );

    // Single timed matrix-vector multiply inside the stat region.
    test_stats_on( temp );
    for ( int run = 0; run < 1; run++ )
      mvmult_scalar( dest, (int*) matrix, vector, R, C );
    test_stats_off( temp );

    verify_results( dest, ref, size );

    return 0;
}
Ejemplo n.º 11
0
Archivo: data.c Proyecto: amumu/nokuno
/*
 * Fortran-callable wrapper: Fortran compilers typically append a trailing
 * underscore to external names, so this symbol lets Fortran code reach the
 * C verify_results(), tagging the call with the "f90" label.
 */
void
verify_results_(vector_t x[], vector_t v[])
{
    verify_results(x, v, "f90");
}
/*
 * Tiled Cholesky factorization driven through hStreams queues, with an
 * MKL LAPACKE_dpotrf run on the full matrix for timing comparison and
 * (optionally) verification.
 *
 * Returns 0 when verification passed or was skipped, 1 on verification
 * failure.
 *
 * Fix: `result` was previously left uninitialized, so when verify != 1 the
 * final `if (result)` read an indeterminate value (undefined behavior and a
 * nondeterministic exit code). It now defaults to true ("all OK") and is
 * only overwritten by an actual verification run.
 */
int cholesky_tiled(double *mat, int tile_size, int num_tiles, int mat_size,
                   int niter, int max_log_str, bool layRow, int verify, int num_doms, int use_host, int num_mics,
                   int host_ht_offset)
{
    //verification result; defaults to true so the return value is
    //well-defined when verification is skipped (verify != 1)
    bool result = true;
    //total number of tiles
    int tot_tiles = num_tiles * num_tiles;

    //memory allocation for matrix for tiled-Cholesky
    double *A_my = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for matrix for MKL cholesky (for comparison)
    double *A_MKL = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for tiled matrix
    double **Asplit = new double* [tot_tiles];
    int mem_size_tile = tile_size * tile_size * sizeof(double);

#define HSTR_BUFFER_PROPS_VALUES {        \
        HSTR_MEM_TYPE_NORMAL,             \
        HSTR_MEM_ALLOC_PREFERRED,         \
        HSTR_BUF_PROP_ALIASED}

    HSTR_BUFFER_PROPS buffer_props = HSTR_BUFFER_PROPS_VALUES;
    for (int i = 0; i < tot_tiles; ++i) {
        //Buffer per tile, host allocation (64-byte aligned for vector loads)
        Asplit[i] = (double *)_mm_malloc(mem_size_tile, 64);

        //Buffer creation and allocation on the card
        //hStreams_app_create_buf((void *)Asplit[i], mem_size_tile);
        CHECK_HSTR_RESULT(hStreams_Alloc1DEx(
                              (void *)Asplit[i],
                              mem_size_tile,
                              &buffer_props,
                              -1,
                              NULL));
    }

    double tbegin, tend;

    int iter;
    int info;

    //Events are needed for various synchronizations to enforce
    //data dependence between and among data-transfers/computes
    HSTR_EVENT *eventcpyto = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventcpyto_trsm = new HSTR_EVENT[tot_tiles * num_doms];
    HSTR_EVENT *eventcpyfr = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventpotrf = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventtrsm = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventsyrk = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventgemm = new HSTR_EVENT[tot_tiles];

    //for timing tiled cholesky
    double *totTimeMsec = new double [niter];

    //for timing MKL cholesky
    double *totTimeMsecMKL = new double [niter];


    mkl_mic_disable();

    //these queues are used for queining up compute on the card and
    //data transfers to/from the card.
    //q_trsm for dtrsm, q_potrf for dportf, q_syrk_gemm for both dsyrk and dgemm.
    //The queues are incremented by one for every compute queued and wrap
    //around the max_log_str available. This ensures good load-balancing.
    int q_trsm, q_potrf;
    //NOTE(review): fixed capacity of 10 — assumes num_doms <= 10; confirm
    //against callers before raising the domain count.
    int q_syrk_gemm[10];

    CBLAS_ORDER blasLay;
    int lapackLay;

    if (layRow) {
        blasLay = CblasRowMajor;
        lapackLay = LAPACK_ROW_MAJOR;
    } else {
        blasLay = CblasColMajor;
        lapackLay = LAPACK_COL_MAJOR;
    }

    for (iter = 0; iter < niter; ++iter) {

        //copying matrices into separate variables for tiled cholesky (A_my)
        //and MKL cholesky (A_MKL)
        //The output overwrites the matrices and hence the need to copy
        //for each iteration
        copy_mat(mat, A_my, mat_size);
        copy_mat(mat, A_MKL, mat_size);

        unsigned int m, n, k;

        printf("\nIteration = %d\n", iter);

        //splitting time included in the timing
        //This splits the input matrix into tiles (or blocks)
        split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow);

        //beginning of timing
        tbegin = dtimeGet();

        int ic;
        int is_mic;
        for (ic = 0; ic < num_doms; ++ic) {
            q_syrk_gemm[ic] = 0;
        }
        q_potrf = 0;
        q_trsm = 0;
        for (k = 0; k < num_tiles; ++k) {
            //POTRF
            //dpotrf is executed on the host on the diagonal tile
            if (mach_wide_league) {
                q_potrf = 0;
            } else {
                q_potrf = q_syrk_gemm[0];
            }

            int qindex = (int)q_potrf % max_log_str;
            if (use_host) {
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to host in queue %d, triggering event eventcpyto[%d][%d]\n",
                               k, k, (int)(qindex), k, k);

                    hStreams_app_xfer_memory((int)(qindex),
                                             Asplit[k * num_tiles + k],
                                             Asplit[k * num_tiles + k], mem_size_tile,
                                             HSTR_SRC_TO_SINK,
                                             &eventcpyto[k * num_tiles + k]);
                }
            }

            if (k > 0) {
                if (use_host) {
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyfr[k * num_tiles + k], 0, NULL, NULL);
                } else {
                    hStreams_app_event_wait(1, &eventcpyfr[k * num_tiles + k]);
                }

                if (loc_verbose > 0) {
                    printf("Waiting on eventcpyfr[%d]\n", k * num_tiles + k);
                }
            }

            if (loc_verbose > 0)
                printf("Executing potrf on host for tile[%d][%d], in queue (if use_host) %d, triggerring eventpotrf[%d][%d]\n",
                       k, k, qindex, k, k);

            if (use_host) {
                CHECK_HSTR_RESULT(hStreams_custom_dpotrf(lapackLay, 'L', tile_size,
                                  Asplit[k * num_tiles + k], tile_size, qindex, &eventpotrf[k * num_tiles + k]));
            } else {
                info = LAPACKE_dpotrf(lapackLay, 'L', tile_size,
                                      Asplit[k * num_tiles + k], tile_size);
            }


            if (mach_wide_league) {
                q_trsm = q_syrk_gemm[0];
            } else {
                q_potrf++;
                q_trsm = q_potrf;
            }

            for (m = k + 1; m < num_tiles; ++m) {

                if (mach_wide_league) {
                    qindex = (int)(q_trsm % max_log_str + 1);
                } else {
                    qindex = (int)(q_trsm % max_log_str);
                }

                if (use_host) {
                    if (k == 0) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] to host in queue %d, triggering event eventcpyto[%d][%d]\n",
                                   m, k, (int)(qindex), m, k);

                        hStreams_app_xfer_memory((int)(qindex),
                                                 Asplit[m * num_tiles + k],
                                                 Asplit[m * num_tiles + k], mem_size_tile,
                                                 HSTR_SRC_TO_SINK,
                                                 &eventcpyto[m * num_tiles + k]);
                    }
                }

                if (k > 0) {
                    if (use_host) {
                        hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyfr[m * num_tiles + k], 0, NULL, NULL);
                    } else {
                        hStreams_app_event_wait(1, &eventcpyfr[m * num_tiles + k]);
                    }

                    if (loc_verbose > 0) {
                        printf("Waiting on eventcpyfr[%d]\n", m * num_tiles + k);
                    }
                }

                if (use_host)
                    //hStreams_app_event_wait(1, &eventpotrf[k*num_tiles + k]);
                {
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventpotrf[k * num_tiles + k], 0, NULL, NULL);
                }

                //dtrsm is executed on the host
                if (loc_verbose > 0)
                    printf("Executing trsm for tile[%d][%d] on host, in queue (if use_host) %d, triggering eventtrsm[%d][%d]\n",
                           m, k, qindex, m, k);

                if (use_host) {
                    CHECK_HSTR_RESULT(hStreams_custom_dtrsm(blasLay, CblasRight, CblasLower,
                                                            CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
                                                            Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k],
                                                            tile_size, qindex,
                                                            &eventtrsm[m * num_tiles + k]));
                } else {
                    cblas_dtrsm(blasLay, CblasRight, CblasLower,
                                CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
                                Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k],
                                tile_size);
                }

                //transfer to all cards
                for (ic = 0; ic < num_doms; ++ic) {
                    if ((use_host == 1) && (num_mics >= 1)) {
                        if (ic == 0) {
                            is_mic = 0;    //this is host
                        } else {
                            is_mic = 1;
                        }
                    } else {
                        is_mic = 0;
                    }

                    if (mach_wide_league) {
                        qindex = (int)q_trsm % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset;
                    } else {
                        qindex = (int)q_trsm % max_log_str + ic * max_log_str + is_mic * host_ht_offset;
                    }

                    if (use_host)
                        //hStreams_app_event_wait(1, &eventtrsm[m*num_tiles + k]);
                    {
                        hStreams_app_event_wait_in_stream(qindex, 1, &eventtrsm[m * num_tiles + k], 0, NULL, NULL);
                    }

                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card %d in queue %d, triggering event eventcpyto_trsm[%d]\n",
                               m, k, ic, (int)(qindex), m * num_tiles + k + ic * tot_tiles);

                    hStreams_app_xfer_memory((int)(qindex),
                                             Asplit[m * num_tiles + k],
                                             Asplit[m * num_tiles + k], mem_size_tile,
                                             HSTR_SRC_TO_SINK,
                                             &eventcpyto_trsm[m * num_tiles + k + ic * tot_tiles]);
                }

                q_trsm++;
            }

            if (use_host) {
                q_syrk_gemm[0] = q_trsm;
                for (ic = 1; ic < num_doms; ++ic) {
                    q_syrk_gemm[ic] = 0;
                }
            } else {
                for (ic = 0; ic < num_doms; ++ic) {
                    q_syrk_gemm[ic] = 0;
                }
            }

            for (n = k + 1; n < num_tiles; ++n) {
                ic = n % num_doms; //round-robin rows across num_doms

                if ((use_host == 1) && (num_mics >= 1)) {
                    if (ic == 0) {
                        is_mic = 0;    //this is host
                    } else {
                        is_mic = 1;
                    }
                } else {
                    is_mic = 0;
                }

                if (mach_wide_league) {
                    qindex  = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset;
                } else {
                    qindex  = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + is_mic * host_ht_offset;
                }

                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               n, n, (int)(qindex));

                    hStreams_app_xfer_memory((int)(qindex),
                                             Asplit[n * num_tiles + n],
                                             Asplit[n * num_tiles + n], mem_size_tile,
                                             HSTR_SRC_TO_SINK,
                                             &eventcpyto[n * num_tiles + n]);
                }

                //DSYRK
                //hStreams_app_event_wait(1, &eventcpyto_trsm[n*num_tiles + k + ic*tot_tiles]);
                hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[n * num_tiles + k + ic * tot_tiles], 0, NULL, NULL);
                if (loc_verbose > 0) {
                    printf("Waiting on eventcpyto_trsm[%d]\n", n * num_tiles + k + ic * tot_tiles);
                }

                if (k > 0) {
                    //hStreams_app_event_wait(1, &eventsyrk[n*num_tiles + n]);
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventsyrk[n * num_tiles + n], 0, NULL, NULL);
                    if (loc_verbose > 0) {
                        printf("Waiting on eventsyrk[%d]\n", n * num_tiles + n);
                    }
                }

                //dsyrk is executed on the card
                if (loc_verbose > 0)
                    printf("Executing syrk for tile[%d][%d] on card in queue %d, triggering event eventsyrk[%d]\n",
                           n, n, (int)(qindex), n * num_tiles + n);


                CHECK_HSTR_RESULT(hStreams_custom_dsyrk(blasLay, CblasLower, CblasNoTrans,
                                                        tile_size, tile_size, -1.0, Asplit[n * num_tiles + k],
                                                        tile_size, 1.0, Asplit[n * num_tiles + n], tile_size,
                                                        (int)(qindex), &eventsyrk[n * num_tiles + n]));

                //send tile to host (only if n = k+1)
                if (n == k + 1) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] from card  to host in queue %d, triggering event eventcpyfr[%d]\n",
                               n, n, (int)(qindex), n * num_tiles + n);

                    hStreams_app_xfer_memory((int)(qindex),
                                             Asplit[n * num_tiles + n],
                                             Asplit[n * num_tiles + n], mem_size_tile,
                                             HSTR_SINK_TO_SRC,
                                             &eventcpyfr[n * num_tiles + n]);

                }

                q_syrk_gemm[ic]++;


                for (m = n + 1; m < num_tiles; ++m) {
                    ic = m % num_doms; //round-robin rows across num_doms

                    if ((use_host == 1) && (num_mics >= 1)) {
                        if (ic == 0) {
                            is_mic = 0;    //this is host
                        } else {
                            is_mic = 1;
                        }
                    } else {
                        is_mic = 0;
                    }

                    if (mach_wide_league) {
                        qindex = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset;
                    } else {
                        qindex = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + is_mic * host_ht_offset;
                    }

                    if (k == 0) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] to card in queue %d\n",
                                   m, n, (int)(qindex));

                        hStreams_app_xfer_memory((int)(qindex),
                                                 Asplit[m * num_tiles + n],
                                                 Asplit[m * num_tiles + n], mem_size_tile,
                                                 HSTR_SRC_TO_SINK,
                                                 &eventcpyto[m * num_tiles + n]);
                    }

                    //DGEMM
                    if (loc_verbose > 0) {
                        printf("Waiting on eventcpyto_trsm[%d]\n", m * num_tiles + k + ic * tot_tiles);
                    }
                    //hStreams_app_event_wait(1, &eventcpyto_trsm[m*num_tiles + k + ic*tot_tiles]);
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[m * num_tiles + k + ic * tot_tiles], 0, NULL, NULL);

                    if (loc_verbose > 0) {
                        printf("Waiting on eventcpyto_trsm[%d]\n", n * num_tiles + k + ic * tot_tiles);
                    }
                    //hStreams_app_event_wait(1, &eventcpyto_trsm[n*num_tiles + k + ic*tot_tiles]);
                    hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[n * num_tiles + k + ic * tot_tiles], 0, NULL, NULL);

                    if (k > 0) {
                        //hStreams_app_event_wait(1, &eventgemm[m*num_tiles + n]);
                        hStreams_app_event_wait_in_stream(qindex, 1, &eventgemm[m * num_tiles + n], 0, NULL, NULL);
                        if (loc_verbose > 0) {
                            printf("Waiting on eventgemm[%d]\n", m * num_tiles + n);
                        }
                    }

                    //dgemm is executed on the card
                    if (loc_verbose > 0)
                        printf("Executing gemm for tile[%d][%d] on card in queue %d, triggering event eventgemm[%d]\n",
                               m, n, (int)(qindex), m * num_tiles + n);

                    CHECK_HSTR_RESULT(hStreams_app_dgemm((int)(qindex),
                                                         blasLay, CblasNoTrans, CblasTrans,
                                                         tile_size, tile_size, tile_size, -1.0, Asplit[m * num_tiles + k],
                                                         tile_size, Asplit[n * num_tiles + k], tile_size, 1.0,
                                                         Asplit[m * num_tiles + n], tile_size,
                                                         &eventgemm[m * num_tiles + n]));

                    //send tile to host (only if n = k+1)
                    if (n == k + 1) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] from card to host in queue %d, triggering event eventcpyfr[%d]\n",
                                   m, n, (int)(qindex), m * num_tiles + n);

                        hStreams_app_xfer_memory(
                            (int)(qindex),
                            Asplit[m * num_tiles + n],
                            Asplit[m * num_tiles + n], mem_size_tile,
                            HSTR_SINK_TO_SRC,
                            &eventcpyfr[m * num_tiles + n]);
                    }


                    q_syrk_gemm[ic]++;
                }
            }
        }


        //syncrhonizing all the streams
        hStreams_app_thread_sync();

        //end of timing
        tend = dtimeGet();

        totTimeMsec[iter] = 1e3 * (tend - tbegin);
        printf("time for Tiled hstreams Cholesky for iteration %d = %.2f msec\n",
               iter, totTimeMsec[iter]);

        //assembling of tiles back into full matrix
        assemble(Asplit, A_my, num_tiles, tile_size, mat_size, layRow);

        //calling mkl cholesky for verification and timing comparison.
        //Using auto-offload feature of MKL
        tbegin = dtimeGet();

        //calling MKL dpotrf on the full matrix
        info = LAPACKE_dpotrf(lapackLay, 'L', mat_size, A_MKL, mat_size);

        tend = dtimeGet();
        totTimeMsecMKL[iter] = 1e3 * (tend - tbegin);
        printf("time for MKL Cholesky (AO) for iteration %d = %.2f msec\n",
               iter, totTimeMsecMKL[iter]);

        if (info != 0) {
            printf("error with dpotrf\n");
        }
        mkl_mic_disable();

        if (verify == 1) {
            result = verify_results(A_my, A_MKL, mat_size * mat_size);
            if (result == true) {
                printf("Tiled Cholesky successful\n");
            } else {
                //fixed typo: was "Chloesky"
                printf("Tiled Cholesky failed\n");
            }
        }
    }

    double meanTimeMsec, stdDevMsec;
    double meanTimeMsecMKL, stdDevMsecMKL;
    mean_and_stdev(totTimeMsec, meanTimeMsec, stdDevMsec, niter);
    mean_and_stdev(totTimeMsecMKL, meanTimeMsecMKL, stdDevMsecMKL, niter);

    //Cholesky costs n^3/3 flops
    double gflops = pow(mat_size, 3.0) / 3.0 * 1e-9;

    printf("\nMatrix size = %d\n", mat_size);

    printf("Tiled hStreams Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using mean Time) = %.2f\n",
           niter - 1, meanTimeMsec, stdDevMsec, gflops / (meanTimeMsec * 1e-3));

    printf("\nMKL AO Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using meanTime) = %.2f\n\n",
           niter - 1, meanTimeMsecMKL, stdDevMsecMKL, gflops / (meanTimeMsecMKL * 1e-3));

    //Free
    free(A_my);
    free(A_MKL);
    for (int i = 0; i < tot_tiles; ++i) {
        _mm_free(Asplit[i]);
    }
    delete [] Asplit;
    delete [] eventcpyto;
    delete [] eventcpyto_trsm;
    delete [] eventcpyfr;
    delete [] eventpotrf;
    delete [] eventtrsm;
    delete [] eventsyrk;
    delete [] eventgemm;
    delete [] totTimeMsec;
    delete [] totTimeMsecMKL;

    // true result indicates all OK
    if (result) {
        return 0;
    }
    return 1;

}
Ejemplo n.º 13
0
/*
 * The primary compute function for the bucket sort
 * Executes the sum of NUM_ITERATIONS + BURN_IN iterations, as defined in params.h
 * Only iterations after the BURN_IN iterations are timed
 * Only the final iteration calls the verification function
 */
static int bucket_sort(void)
{
  // Error code from the final verification pass (0 = success).
  int err = 0;

  init_timers(NUM_ITERATIONS);

#ifdef PERMUTE
  create_permutation_array();
#endif

  // BURN_IN warm-up iterations run first; their timings are discarded when
  // the timers are re-initialized below.
  for(uint64_t i = 0; i < (NUM_ITERATIONS + BURN_IN); ++i)
  {

    // Reset timers after burn in 
    if(i == BURN_IN){ init_timers(NUM_ITERATIONS); } 

    SHMEM_BARRIER_AT_START;

    timer_start(&timers[TIMER_TOTAL]);

    // Sort pipeline: generate keys, size the local buckets, turn sizes into
    // offsets, scatter keys into buckets, exchange buckets across PEs, then
    // count the keys this PE received.
    KEY_TYPE * my_keys = make_input();

    int * local_bucket_sizes = count_local_bucket_sizes(my_keys);

    int * send_offsets;
    int * local_bucket_offsets = compute_local_bucket_offsets(local_bucket_sizes,
                                                                   &send_offsets);

    KEY_TYPE * my_local_bucketed_keys =  bucketize_local_keys(my_keys, local_bucket_offsets);

    KEY_TYPE * my_bucket_keys = exchange_keys(send_offsets, 
                                              local_bucket_sizes,
                                              my_local_bucketed_keys);

    // receive_offset is set as a side effect of exchange_keys.
    my_bucket_size = receive_offset;

    int * my_local_key_counts = count_local_keys(my_bucket_keys);

    SHMEM_BARRIER_AT_END;

    timer_stop(&timers[TIMER_TOTAL]);

    // Only the last iteration is verified
    if(i == NUM_ITERATIONS) { 
      err = verify_results(my_local_key_counts, my_bucket_keys);
    }

    // Reset receive_offset used in exchange_keys
    receive_offset = 0;

    // Per-iteration buffers; freed before the barrier so all PEs re-enter
    // the next iteration together.
    free(my_local_bucketed_keys);
    free(my_keys);
    free(local_bucket_sizes);
    free(local_bucket_offsets);
    free(send_offsets);
    free(my_local_key_counts);

    shmem_barrier_all();
  }

  return err;
}
Ejemplo n.º 14
0
void test_perf_nb(int dry_run) {
  
    int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
    int stride, k=0, ntimes;
    double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    double *dsrc[MAXPROC], scale=1.0;
    armci_hdl_t hdl_get, hdl_put, hdl_acc;
        
    create_array((void**)ddst, sizeof(double),2, elems);
    create_array((void**)dsrc, sizeof(double),1, &elems[1]);

    if(!dry_run)if(me == 0) {
      printf("\n\t\t\tRemote 1-D Array Section\n");
      printf("section    get      nbget    wait     put     nbput  ");
      printf("   wait     acc     nbacc     wait\n");
      printf("-------  -------- -------- -------- -------- --------");
      printf(" -------- -------- -------- --------\n");
      fflush(stdout);
    }

    for(loop=1; loop<=MAXELEMS; loop*=2, k++) {

      elems[1] = loop;
      ntimes = (int)sqrt((double)(MAXELEMS/elems[1]));
      if(ntimes <1) ntimes=1;

      /* -------------------------- SETUP --------------------------- */
      /*initializing non-blocking handles,time,src & dst buffers*/
      ARMCI_INIT_HANDLE(&hdl_put);
      ARMCI_INIT_HANDLE(&hdl_get);
      ARMCI_INIT_HANDLE(&hdl_acc);
      t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0;
      for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;    
      MP_BARRIER();
      
      /* bytes transfered */
      bytes = sizeof(double)*elems[1]; 
      MP_BARRIER();
      
      /* -------------------------- PUT/GET -------------------------- */    
      if(me == 0) {
	for(i=1; i<nproc; i++) {
	  stime=MP_TIMER();
	  for(j=0; j<ntimes; j++)
	    if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i)))
	      ARMCI_Error("armci_nbput failed\n",rc);
	  t1 += MP_TIMER()-stime;
	}
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(PUT, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();
      
      if(me == 0) { 
	for(i=1; i<nproc; i++) {
	  stime=MP_TIMER();    
	  for(j=0; j<ntimes; j++)
	    if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i)))
	      ARMCI_Error("armci_nbget failed\n",rc);
	  t4 += MP_TIMER()-stime;	
	}
      }    
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(GET, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      /* ------------------------ nb PUT/GET ------------------------- */    
      if(me == 0) {
	for(i=1; i<nproc; i++) {
	  for(j=0; j<ntimes; j++) {
	    stime=MP_TIMER();    
	    if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,
			       i, &hdl_put)))
	      ARMCI_Error("armci_nbput failed\n",rc);
	    t2 += MP_TIMER()-stime;	stime=MP_TIMER();
	    ARMCI_Wait(&hdl_put);
	    t3 += MP_TIMER()-stime;
	  } 
	}
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(PUT, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();

      if(me == 0) {
	for(i=1; i<nproc; i++) {
	  for(j=0; j<ntimes; j++) {
	    stime=MP_TIMER();    
	    if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,
			       i, &hdl_get)))
	      ARMCI_Error("armci_nbget failed\n",rc);
	    t5 += MP_TIMER()-stime;	stime=MP_TIMER();
	    ARMCI_Wait(&hdl_get);
	    t6 += MP_TIMER()-stime;
	  }
	}
      }
      MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
      if(VERIFY) verify_results(GET, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; 
      MP_BARRIER();


      /* ------------------------ Accumulate ------------------------- */    
      for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;  MP_BARRIER();
      stride = elems[1]*sizeof(double); scale  = 1.0;
      for(j=0; j<ntimes; j++) {
	stime=MP_TIMER();
	if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, 
			  &ddst[0][0], &stride, &bytes, 0, 0)))
	  ARMCI_Error("armci_acc failed\n",rc);
	t7 += MP_TIMER()-stime;
	
	MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
	if(VERIFY) verify_results(ACC, elems);
	for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
	MP_BARRIER();
      }

#if 1
      /* NOTE: this section is currently enabled ("#if 1"); change to "#if 0" to disable it */
      /* ---------------------- nb-Accumulate ------------------------ */    
      for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;  MP_BARRIER();
      stride = elems[1]*sizeof(double); scale  = 1.0;
      for(j=0; j<ntimes; j++) {
	stime=MP_TIMER();    
	if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, 
			    &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc)))
	  ARMCI_Error("armci_nbacc failed\n",rc);
	t8 += MP_TIMER()-stime; stime=MP_TIMER();
	ARMCI_Wait(&hdl_acc);
	t9 += MP_TIMER()-stime;
      
	MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER();
	if(VERIFY) verify_results(ACC, elems);
	for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
	MP_BARRIER();
      }
#endif

      /* print timings */
     if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", 
		       bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, 
		       t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes);
    }

    ARMCI_AllFence();
    MP_BARRIER();
    
    if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);}
    destroy_array((void **)ddst);
    destroy_array((void **)dsrc);
}
//Tiled Cholesky factorization (lower triangular) using Intel hStreams.
//The dpotrf of each diagonal tile runs on the host; the tile-level
//dtrsm/dsyrk/dgemm updates are queued on the card, with HSTR_EVENTs
//enforcing the data dependences between transfers and computes.
//Each iteration is timed and compared against a full-matrix MKL dpotrf
//(automatic offload) for performance reference and, optionally,
//correctness verification.
//
// mat         : input symmetric positive-definite matrix
//               (mat_size x mat_size); left unmodified (copied into work
//               buffers each iteration)
// tile_size   : dimension of one square tile
// num_tiles   : tiles per matrix dimension
//               (assumes tile_size * num_tiles == mat_size)
// mat_size    : dimension of the full square matrix
// niter       : number of timing iterations (summary stats ignore the first)
// max_log_str : number of logical streams; queue indices wrap modulo this
// layRow      : true => row-major layout, false => column-major
// verify      : if 1, compare the tiled result against the MKL result
void cholesky_tiled(double *mat, int tile_size, int num_tiles, int mat_size,
                    int niter, int max_log_str, bool layRow, int verify)
{
    //total number of tiles
    int tot_tiles = num_tiles * num_tiles;

    //memory allocation for matrix for tiled-Cholesky
    //NOTE(review): allocation results are not checked; assumes sizes are sane
    double *A_my = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for matrix for MKL cholesky (for comparison)
    double *A_MKL = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for tiled matrix
    double **Asplit = new double* [tot_tiles];
    int mem_size_tile = tile_size * tile_size * sizeof(double);

    for (int i = 0; i < tot_tiles; ++i) {
        //Buffer per tile, host allocation (64-byte aligned)
        Asplit[i] = (double *)_mm_malloc(mem_size_tile, 64);

        //Buffer creation and allocation on the card
        hStreams_app_create_buf((void *)Asplit[i], mem_size_tile);
    }

    double tbegin, tend;

    int iter;
    int info;

    //Events are needed for various synchronizations to enforce
    //data dependence between and among data-transfers/computes
    HSTR_EVENT *eventcpyto = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventcpyfr = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventpotrf = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventtrsm = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventsyrk = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventgemm = new HSTR_EVENT[tot_tiles];

    //for timing tiled cholesky
    double *totTimeMsec = new double [niter];

    //for timing MKL cholesky
    double *totTimeMsecMKL = new double [niter];

    HSTR_RESULT res;

    //these queues are used for queuing up compute on the card and
    //data transfers to/from the card.
    //q_trsm for dtrsm, q_potrf for dpotrf, q_syrk_gemm for both dsyrk and dgemm.
    //The queues are incremented by one for every compute queued and wrap
    //around the max_log_str available. This ensures good load-balancing.
    int q_trsm, q_potrf, q_syrk_gemm;

    CBLAS_ORDER blasLay;
    int lapackLay;

    if (layRow) {
        blasLay = CblasRowMajor;
        lapackLay = LAPACK_ROW_MAJOR;
    } else {
        blasLay = CblasColMajor;
        lapackLay = LAPACK_COL_MAJOR;
    }

    for (iter = 0; iter < niter; ++iter) {

        //copying matrices into separate variables for tiled cholesky (A_my)
        //and MKL cholesky (A_MKL)
        //The output overwrites the matrices and hence the need to copy
        //for each iteration
        copy_mat(mat, A_my, mat_size);
        copy_mat(mat, A_MKL, mat_size);

        unsigned int m, n, k;

        printf("\nIteration = %d\n", iter);

        //This splits the input matrix into tiles (or blocks).
        //It is done before tbegin, so splitting time is NOT included
        //in the timing (the in-timing call below is commented out).
        split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow);
        //beginning of timing
        tbegin = dtimeGet();

        //split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow);

        q_potrf = 0;
        for (k = 0; k < num_tiles; ++k) {
            //POTRF
            //dpotrf is executed on the host on the diagonal tile;
            //for k > 0 the tile is first pulled back from the card once
            //its pending syrk update has completed
            if (k > 0) {
                hStreams_app_event_wait(1, &eventsyrk[k * num_tiles + k]);
                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] to host in queue %d\n",
                           k, k, (int)(q_potrf % max_log_str)) ;

                hStreams_app_xfer_memory(Asplit[k * num_tiles + k],
                                         Asplit[k * num_tiles + k], mem_size_tile,
                                         (int)(q_potrf % max_log_str), HSTR_SINK_TO_SRC,
                                         &eventcpyfr[k * num_tiles + k]);

                hStreams_app_event_wait(1, &eventcpyfr[k * num_tiles + k]);
            }

            if (loc_verbose > 0) {
                printf("Executing potrf on host for tile[%d][%d]\n", k, k);
            }

            info = LAPACKE_dpotrf(lapackLay, 'L', tile_size,
                                  Asplit[k * num_tiles + k], tile_size);

            //a non-zero info means the diagonal tile is not positive
            //definite (or an argument was bad); report it instead of
            //silently continuing with a bogus factorization
            if (info != 0) {
                printf("error with dpotrf of tile[%d][%d], info = %d\n",
                       k, k, info);
            }

            //push the factorized diagonal tile back to the card; the last
            //diagonal tile is not needed by any further card compute
            if (k < num_tiles - 1) {
                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] to card in queue %d\n",
                           k, k, (int)(q_potrf % max_log_str));

                hStreams_app_xfer_memory(Asplit[k * num_tiles + k],
                                         Asplit[k * num_tiles + k], mem_size_tile,
                                         (int)(q_potrf % max_log_str), HSTR_SRC_TO_SINK,
                                         &eventcpyto[k * num_tiles + k]);
            }
            q_potrf++;

            q_trsm = 0;
            for (m = k + 1; m < num_tiles; ++m) {
                //on the first panel the sub-diagonal tiles still live on
                //the host; send them to the card once
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               m, k, (int)(q_trsm % max_log_str));

                    hStreams_app_xfer_memory(Asplit[m * num_tiles + k],
                                             Asplit[m * num_tiles + k], mem_size_tile,
                                             (int)(q_trsm % max_log_str), HSTR_SRC_TO_SINK,
                                             &eventcpyto[m * num_tiles + k]);
                }

                //DTRSM
                hStreams_app_event_wait(1, &eventcpyto[k * num_tiles + k]);

                //for k > 0 the tile must first be updated by the gemm of
                //the previous panel
                if (k > 0) {
                    hStreams_app_event_wait(1, &eventgemm[m * num_tiles + k]);
                }

                //dtrsm is executed on the card
                if (loc_verbose > 0)
                    printf("Executing trsm for tile[%d][%d] on card in queue %d\n",
                           m, k, (int)(q_trsm % max_log_str));

                res = hStreams_custom_dtrsm(blasLay, CblasRight, CblasLower,
                                            CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
                                            Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k],
                                            tile_size, (int)(q_trsm % max_log_str),
                                            &eventtrsm[m * num_tiles + k]);

                //the finished panel tile is part of the result; stream it
                //back to the host while the card keeps computing
                if (loc_verbose > 0)
                    printf("Sending tile[%d][%d] back to host in queue %d\n",
                           m, k, (int)(q_trsm % max_log_str));

                hStreams_app_xfer_memory(Asplit[m * num_tiles + k],
                                         Asplit[m * num_tiles + k], mem_size_tile,
                                         (int)(q_trsm % max_log_str), HSTR_SINK_TO_SRC,
                                         &eventcpyfr[m * num_tiles + k]);

                q_trsm++;
            }

            q_syrk_gemm = 0;
            for (n = k + 1; n < num_tiles; ++n) {
                //first panel: the trailing diagonal tiles still live on
                //the host; send them to the card once
                if (k == 0) {
                    if (loc_verbose > 0)
                        printf("Sending tile[%d][%d] to card in queue %d\n",
                               n, n, (int)(q_syrk_gemm % max_log_str));

                    hStreams_app_xfer_memory(Asplit[n * num_tiles + n],
                                             Asplit[n * num_tiles + n], mem_size_tile,
                                             (int)(q_syrk_gemm % max_log_str), HSTR_SRC_TO_SINK,
                                             &eventcpyto[n * num_tiles + n]);
                }

                //DSYRK
                hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]);
                if (k > 0) {
                    hStreams_app_event_wait(1, &eventsyrk[n * num_tiles + n]);
                }

                //dsyrk is executed on the card
                if (loc_verbose > 0)
                    printf("Executing syrk for tile[%d][%d] on card in queue %d\n",
                           n, n, (int)(q_syrk_gemm % max_log_str));

                res = hStreams_custom_dsyrk(blasLay, CblasLower, CblasNoTrans,
                                            tile_size, tile_size, -1.0, Asplit[n * num_tiles + k],
                                            tile_size, 1.0, Asplit[n * num_tiles + n], tile_size,
                                            (int)(q_syrk_gemm % max_log_str), &eventsyrk[n * num_tiles + n]);

                q_syrk_gemm++;

                for (m = n + 1; m < num_tiles; ++m) {
                    //first panel: send the off-diagonal trailing tiles
                    //to the card once
                    if (k == 0) {
                        if (loc_verbose > 0)
                            printf("Sending tile[%d][%d] to card in queue %d\n",
                                   m, n, (int)(q_syrk_gemm % max_log_str));

                        hStreams_app_xfer_memory(Asplit[m * num_tiles + n],
                                                 Asplit[m * num_tiles + n], mem_size_tile,
                                                 (int)(q_syrk_gemm % max_log_str),
                                                 HSTR_SRC_TO_SINK,
                                                 &eventcpyto[m * num_tiles + n]);
                    }

                    //DGEMM
                    //depends on the two trsm panel tiles of this step and,
                    //for k > 0, on the previous panel's gemm of this tile
                    hStreams_app_event_wait(1, &eventtrsm[m * num_tiles + k]);
                    hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]);

                    if (k > 0) {
                        hStreams_app_event_wait(1, &eventgemm[m * num_tiles + n]);
                    }

                    //dgemm is executed on the card
                    if (loc_verbose > 0)
                        printf("Executing gemm for tile[%d][%d] on card in queue %d\n",
                               m, n, (int)(q_syrk_gemm % max_log_str));

                    res = hStreams_app_dgemm(blasLay, CblasNoTrans, CblasTrans,
                                             tile_size, tile_size, tile_size, -1.0, Asplit[m * num_tiles + k],
                                             tile_size, Asplit[n * num_tiles + k], tile_size, 1.0,
                                             Asplit[m * num_tiles + n], tile_size,
                                             (int)(q_syrk_gemm % max_log_str), &eventgemm[m * num_tiles + n]);

                    q_syrk_gemm++;
                }
            }
        }

        //synchronizing all the streams
        hStreams_app_thread_sync();

        //end of timing
        tend = dtimeGet();

        totTimeMsec[iter] = 1e3 * (tend - tbegin);
        printf("time for Tiled hstreams Cholesky for iteration %d = %.2f msec\n",
               iter, totTimeMsec[iter]);

        //assembling of tiles back into full matrix
        assemble(Asplit, A_my, num_tiles, tile_size, mat_size, layRow);

        //calling mkl cholesky for verification and timing comparison.
        //Using auto-offload feature of MKL
#ifndef _WIN32
        //FIXME: calling this function causes a crash on Windows
        mkl_mic_enable();
#endif
        tbegin = dtimeGet();

        //calling MKL dpotrf on the full matrix
        info = LAPACKE_dpotrf(lapackLay, 'L', mat_size, A_MKL, mat_size);

        tend = dtimeGet();
        totTimeMsecMKL[iter] = 1e3 * (tend - tbegin);
        printf("time for MKL Cholesky (AO) for iteration %d = %.2f msec\n",
               iter, totTimeMsecMKL[iter]);

        if (info != 0) {
            printf("error with dpotrf\n");
        }
#ifndef _WIN32
        //guarded like mkl_mic_enable() above: the MKL MIC API is not
        //called on Windows at all
        mkl_mic_disable();
#endif

        if (verify == 1) {
            bool result = verify_results(A_my, A_MKL, mat_size * mat_size);
            if (result == true) {
                printf("Tiled Cholesky successful\n");
            } else {
                printf("Tiled Cholesky failed\n");
            }
        }
    }

    double meanTimeMsec, stdDevMsec;
    double meanTimeMsecMKL, stdDevMsecMKL;
    mean_and_stdev(totTimeMsec, meanTimeMsec, stdDevMsec, niter);
    mean_and_stdev(totTimeMsecMKL, meanTimeMsecMKL, stdDevMsecMKL, niter);

    //Cholesky costs ~n^3/3 floating point operations
    double gflops = pow(mat_size, 3.0) / 3.0 * 1e-9;

    printf("\nMatrix size = %d\n", mat_size);

    printf("Tiled hStreams Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using mean Time) = %.2f\n",
           niter - 1, meanTimeMsec, stdDevMsec, gflops / (meanTimeMsec * 1e-3));

    printf("\nMKL AO Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using meanTime) = %.2f\n\n",
           niter - 1, meanTimeMsecMKL, stdDevMsecMKL, gflops / (meanTimeMsecMKL * 1e-3));

    //Free
    free(A_my);
    free(A_MKL);
    for (int i = 0; i < tot_tiles; ++i) {
        _mm_free(Asplit[i]);
    }
    delete [] Asplit;
    delete [] eventcpyto;
    delete [] eventcpyfr;
    delete [] eventpotrf;
    delete [] eventtrsm;
    delete [] eventsyrk;
    delete [] eventgemm;
    delete [] totTimeMsec;
    delete [] totTimeMsecMKL;

}