int main()
    double *A, *B, *C;
    int i,j,r,max_threads,size;
    double alpha, beta;
    double s_initial, s_elapsed;
    printf("Intializing data for matrix multiplication C=A*B for matrix\n\n"
            " A(%i*%i) and matrix B(%i*%i)\n",M,P,P,N);
    alpha = 1.0;
    beta = 0.0;

    printf("Allocating memory for matrices aligned on 64-byte boundary for better performance \n\n");
    A = ( double *)mkl_malloc(M*P*sizeof( double ),64);
    B = ( double *)mkl_malloc(N*P*sizeof( double ),64);
    C = ( double *)mkl_malloc(M*N*sizeof( double ),64);
    if (A == NULL || B == NULL || C == NULL)
        printf("Error: can`t allocate memory for matrices.\n\n");
        return 1;

    printf("Intializing matrix data\n\n");
    size = M*P;
    for (i = 0; i < size; ++i)
        A[i] = ( double )(i+1);
    size = N*P;
    for (i = 0; i < size; ++i)
        B[i] = ( double )(i-1);

    printf("Finding max number of threads can use for parallel runs \n\n");
    max_threads = mkl_get_max_threads();

    printf("Running from 1 to %i threads \n\n",max_threads);
    for (i = 1; i <= max_threads; ++i)
        size = M*N;
        for (j = 0; j < size; ++j)
            C[j] = 0.0;

	    printf("Requesting to use %i threads \n\n",i); 

	    printf("Measuring performance of matrix product using dgemm function\n"
		    " via CBLAS interface on %i threads \n\n",i);
	    s_initial = dsecnd();
	    for (r = 0; r < LOOP_COUNT; ++r)
    		cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, P, alpha, A, P, B, N, beta, C, N);
            // multiply matrices with cblas_dgemm;
	    s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

	    printf("Matrix multiplication using dgemm completed \n"
		    " at %.5f milliseconds using %d threads \n\n",
		    (s_elapsed * 1000),i);
        printf("Output the result: \n");
        size = M*N;
        for (i = 0; i < size; ++i)
            if (i % N == N - 1)

    printf("Dellocating memory\n");

    return 0;
Ejemplo n.º 2
// Creates a mesh with mshio_create_mesh(fin_base), creates a v1 solver on it
// with sv1_create_solver(q, ipar, dpar), and allocates three complex vectors
// (x0, b1, x1) of s->Ng elements each, aligned to 64 bytes.
// NOTE(review): the function's braces and every statement that would use
// x0, b1, x1, nitr and eps are absent from this view; fin_base, ipar and dpar
// are not declared here and are presumably file-scope — confirm.
void solve1()
	struct st_mesh *q=mshio_create_mesh(fin_base);

	struct st_solver_v1 *s=sv1_create_solver(q,ipar,dpar);

	// b0 is disabled; only x0, b1 and x1 are allocated below.
	//double _Complex *b0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	double _Complex *x0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	double _Complex *b1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	double _Complex *x1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	int nitr;
	double eps;



void GeneticAlgorithm::resetParameters(int nPopulation, double scaleFactor, double crossingProbability){
	_scaleFactor = scaleFactor;
	_crossingProbability = crossingProbability;
	_nPopulation = nPopulation;
	delete [] _populationParametersOld;
	delete [] _populationParametersNew;

	_populationParametersOld = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16);
	_populationParametersNew = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16);

	for(int i  = 0; i < _nPopulation; i++){
		_populationParametersOld[i].c11 = (randomDouble(0.0,1.0))*pow(10,9);
		_populationParametersOld[i].c22 = _populationParametersOld[i].c11;
		_populationParametersOld[i].c33 = _populationParametersOld[i].c11;
		_populationParametersOld[i].c44 = (randomDouble(0.0,1.0))*pow(10,9);
		_populationParametersOld[i].c55 = _populationParametersOld[i].c44;
		_populationParametersOld[i].c66 = _populationParametersOld[i].c44;
		_populationParametersOld[i].c12 = (randomDouble(0.0,1.0))*pow(10,9);
		_populationParametersOld[i].c13 = _populationParametersOld[i].c12;
		_populationParametersOld[i].c23 = _populationParametersOld[i].c12;
		_populationParametersOld[i].chiSq = 1;
		_populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);

	//delete _integerDistribution;
	delete [] ints1;
	delete [] ints2;
	delete [] ints3;

	ints1 = new int[_nPopulation];
    ints2 = new int[_nPopulation];
    ints3 = new int[_nPopulation];
Ejemplo n.º 4
void mexFunction(int nlhs, mxArray *plhs[],
	int nrhs, const mxArray *prhs[]){
	/*Declarar las variables locales*/
	mexPrintf("hios\n"); //Tarea1 termina aqui
	double *A, *B, determinante;
	int *pivot, info, Nfilas, Ncolumnas;
	/*Insertar el código */
	if (nrhs != 1){ // nº args diferente de 1
		mexErrMsgTxt("Error. myla, Debe tener un arg de entrada");
	if (!mxIsNumeric(prhs[0])){
		mexErrMsgTxt("Error. El argumento de entrada debe ser una matriz");
	Nfilas = mxGetM(prhs[0]);
	Ncolumnas = mxGetN(prhs[0]);
	if (Nfilas != Ncolumnas){
		mexErrMsgTxt("Error. La matriz debe ser cuadrada");
	if (Nfilas == 0){
		mexErrMsgTxt("Error. La matriz debe no ser vacía");
	if (nlhs > 2){
		mexErrMsgTxt("Error. Debe haber uno o dos args de salida");
	// copia de las variables
	A = mxGetPr(prhs[0]);
	B = (double *)mkl_malloc(Nfilas*Ncolumnas*sizeof(double), 64);
	memcpy(B, A, Nfilas*Ncolumnas*sizeof(double));
	pivot = (int *)mkl_malloc(Nfilas*sizeof(int), 32);
	//procesos computacionales
	info = LAPACKE_dgetrf(LAPACK_COL_MAJOR, Nfilas, Ncolumnas, B, Ncolumnas, pivot);
	determinante = 1.0;
	for (int i = 0; i < Nfilas; i++){
		if (pivot[i] != (i+1)){
			determinante *= -B[i*Ncolumnas + i];
			determinante *= B[i*Ncolumnas + i];
	// crear los resultados de salida
	plhs[0] = mxCreateDoubleScalar(determinante);
	if (nlhs == 2){
		if (fabs(determinante) < 1.0e-8){
			mexWarnMsgTxt("Matriz singular o casi singular");
		LAPACKE_dgetri(LAPACK_COL_MAJOR, Nfilas, B, Ncolumnas, pivot);
		plhs[1] = mxCreateDoubleMatrix(Nfilas, Ncolumnas, mxREAL);
		double *C = mxGetPr(plhs[1]);
		memcpy(C, B, Nfilas*Ncolumnas*sizeof(double));
Ejemplo n.º 5
/**
 * Sums `potential` over its innermost (z) index and writes the resulting
 * num_x-by-num_y array of doubles to a new FITS image.
 *
 * `potential` is indexed as [i*num_y*num_z + j*num_z + k]. Any CFITSIO error
 * accumulated in `status` is reported on stderr at the end.
 *
 * @param sim_data        supplies get_num_x(), get_num_y(), get_num_z()
 * @param potential       input array of num_x*num_y*num_z doubles
 * @param fits_file_name  name passed to fits_create_file
 */
void save_2d_image_potential(SimulationData &sim_data, double *potential, const char * fits_file_name) {
	// Read the extents once; long arithmetic also avoids int overflow in the index products.
	const long num_x = sim_data.get_num_x();
	const long num_y = sim_data.get_num_y();
	const long num_z = sim_data.get_num_z();

	double *save_data = (double*)mkl_malloc(num_x * num_y * sizeof(double), 64);
	if (save_data == NULL) {
		fprintf(stderr, "save_2d_image_potential: cannot allocate projection buffer\n");
		return;
	}

	fitsfile *fptr = NULL;
	int status = 0;
	long fpixel = 1, naxis = 2, nelements;
	long naxes[2] = {num_y, num_x};

	for (long i = 0; i < num_x; ++i) {
		for (long j = 0; j < num_y; ++j) {
			double column_sum = 0;
			for (long k = 0; k < num_z; ++k) {
				column_sum += potential[(i * num_y + j) * num_z + k];
			}
			save_data[i * num_y + j] = column_sum;
		}
	}

	fits_create_file(&fptr, fits_file_name, &status);
	fits_create_img(fptr, DOUBLE_IMG, naxis, naxes, &status);
	nelements = naxes[0] * naxes[1];
	fits_write_img(fptr, TDOUBLE, fpixel, nelements, save_data, &status);
	fits_close_file(fptr, &status);
	fits_report_error(stderr, status);

	// The projection buffer is local to this function and was previously leaked.
	mkl_free(save_data);
}


Ejemplo n.º 6
/*
 * Verifies C against a reference product computed with cblas_dgemm.
 *
 * The reference is C_ref = A * BT^T, where A is m-by-c and BT is n-by-c, both
 * row-major, giving an m-by-n row-major result.
 *
 *   transposed != 0 : C is compared as an n-by-m array, i.e. C[j*m + i]
 *                     against C_ref[i*n + j].
 *   transposed == 0 : C is compared element by element against C_ref.
 *
 * Returns the number of elements whose absolute difference is not below
 * 0.0001, or -1 if the reference buffer cannot be allocated.
 */
int check_result(double* A, double* BT, double* C, int m, int n, int c, int transposed) {
    int err_c = 0; /* how many errors found */
    int i, j;
    double* C_ref = (double*)mkl_malloc((size_t)m * n * sizeof(double), 16);

    if (C_ref == NULL) {
        return -1;
    }

    /* beta == 0, so C_ref does not need to be zeroed beforehand. */
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
            m, n, c,
            1, A, c, BT, c, 0, C_ref, n);

    if (transposed) {
        for (i = 0; i < m; i++) {
            for (j = 0; j < n; j++) {
                err_c += (fabs(C[j*m + i] - C_ref[i*n+j]) < 0.0001 ? 0 : 1);
            }
        }
    } else {
        /* do compare */
        for (i = 0; i < m * n ; i++) {
            err_c += (fabs(C[i] - C_ref[i]) < 0.0001 ? 0 : 1);
        }
    }

    mkl_free(C_ref);
    return err_c;
}
Ejemplo n.º 7
// If CUDA is available and in GPU mode, host memory will be allocated pinned,
// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA).
// The improvement in performance seems negligible in the single GPU case,
// but might be more significant for parallel training. Most importantly,
// it improved stability for large models on many GPUs.
inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaMallocHost(ptr, size));
    *use_cuda = true;

#ifdef USE_MLSL
  if (mn::is_multinode()) {
    *ptr = mn::alloc(size ? size : 1, 64);
  } else {
#endif /* !USE_MLSL */

#ifdef USE_MKL
    *ptr = mkl_malloc(size ? size : 1, 64);
   *ptr = malloc(size);

#ifdef USE_MLSL
#endif /* USE_MLSL */

  *use_cuda = false;
  CHECK(*ptr) << "host allocation of size " << size << " failed";
void GeneticAlgorithm2::resetParameters(int nPopulation, double scaleFactor, double crossingProbability){
	_scaleFactor = scaleFactor;
	_crossingProbability = crossingProbability;
	_nPopulation = nPopulation;
	delete [] _populationParametersOld;
	delete [] _populationParametersNew;
//	_populationParametersOld = new Parameters::fitParameters[nPopulation];
//	_populationParametersNew = new Parameters::fitParameters[nPopulation];
	_populationParametersOld = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16);
	_populationParametersNew = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16);
	for(int i  = 0; i < _nPopulation; i++){
		_populationParametersOld[i].A1 = randomDouble(0.0,5.0);
		_populationParametersOld[i].A2 = randomDouble(0.0,5.0);
		_populationParametersOld[i].F1 = randomDouble(440.0,500.0);
		_populationParametersOld[i].F2 = randomDouble(500.0,560.0);
		_populationParametersOld[i].dF1 = randomDouble(20.0,60.0);
		_populationParametersOld[i].dF2 =0;
		_populationParametersOld[i].phi1 = randomDouble(0.0,1.0);
		_populationParametersOld[i].phi2 = randomDouble(0.0,1.0);
		_populationParametersOld[i].Td1 = randomDouble(2.0,10.0);
		_populationParametersOld[i].Td2 = randomDouble(2.0,10.0);
		_populationParametersOld[i].ms1 = randomDouble(1,2.6);
		_populationParametersOld[i].ms2 = randomDouble(1,2.6);
	  //  _populationParametersOld[i].m1 =  randomDouble(1.3,1.9);
	//	_populationParametersOld[i].m2 =  randomDouble(1.3,1.9);
		_populationParametersOld[i].m1 = 1.7;
		_populationParametersOld[i].m2 =  1.7;
		_populationParametersOld[i].dF12 =  randomDouble(-15,5);
	//	_populationParametersOld[i].ms11 = randomDouble(0.0,.2);
	//	_populationParametersOld[i].ms22 = randomDouble(0.0,.2);
		_populationParametersOld[i].T = 4.2;
	for(int i  = 0; i < _nPopulation; i++){
		_populationParametersOld[i].chiSq = calculateResidual2(&_populationParametersOld[i],0);

	//delete _integerDistribution;
	delete [] ints1;
	delete [] ints2;
	delete [] ints3;
//	_integerDistribution = new boost::random::uniform_int_distribution<>(0, _nPopulation-1);
	ints1 = new int[_nPopulation];
    ints2 = new int[_nPopulation];
    ints3 = new int[_nPopulation];
Ejemplo n.º 9
/**
 * Allocates harmonic_trap (num_points doubles, 64-byte aligned) and fills it
 * with 0.5 * x[i]^2 for every grid point, in parallel via OpenMP.
 *
 * @param sim_data  supplies num_points and the coordinate array x
 */
PotentialData::PotentialData(SimulationData &sim_data) {
	this->harmonic_trap = (double*)mkl_malloc(sim_data.num_points * sizeof(double), 64);

	#pragma omp parallel for
	for (int i = 0; i < sim_data.num_points; ++i) {
		harmonic_trap[i] = 0.5 * pow(sim_data.x[i], 2.0);
	}
}
Ejemplo n.º 10
Archivo: dgemm_3.c Proyecto: yoyz/mpi
int bench_stream_triad()
    double *A, *B, *C;
    double t;
    int64_t m, n, k, i, j;
    m = SIZE, k = SIZE, n = SIZE;
    double scalar=3.14;
    A = (double *)mkl_malloc( m*k*sizeof( double ), 64 );
    B = (double *)mkl_malloc( k*n*sizeof( double ), 64 );
    C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );

#pragma omp parallel for 
    for (i = 0; i < (m*k); i++) {
        A[i] = (double)(i+1);
#pragma omp parallel for 
    for (i = 0; i < (k*n); i++) {
        B[i] = (double)(-i-1);

#pragma omp parallel for 
    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;

    if (A == NULL || B == NULL || C == NULL) {
      printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
      return 1;
    for (i=0;i<NTIME;i++)
#pragma omp parallel for    
      for (j=0; j<(m*k); j++)
	A[j] = B[j]+scalar*C[j];
    printf("GB/s         : %f\n",(((((m*k)*3)*8)*NTIME)/t)*1E-9);
    DPRINTF("\n Deallocating memory \n\n");
    return 0;
Ejemplo n.º 11
/**
 * Constructs a contiguous vector (inc = 1) of `_len` doubles in 64-byte
 * aligned MKL memory, with every element set to `init`.
 *
 * The buffer is owned by `data`, whose deleter is mkl_free. Allocation
 * failure and a zero length are both rejected through stack_assert.
 */
vector_t::vector_t(size_t _len, double init)
	: data(static_cast<double*>(mkl_malloc(_len * sizeof(double), 64)), std::ptr_fun(mkl_free)),
	len(_len), inc(1)
{
	stack_assert(data.get() != nullptr);
	stack_assert(len > 0);
	std::fill(data.get(), data.get() + len, init);
}
Ejemplo n.º 12
/**
 * Constructs a contiguous vector (inc = 1) of `_len` doubles in 64-byte
 * aligned MKL memory, filled with samples from a normal distribution with
 * mean 0 and standard deviation 0.1.
 *
 * @param gen  random engine to draw from; its state is advanced by the call
 */
vector_t::vector_t(size_t _len, std::mt19937 & gen)
	: data(static_cast<double*>(mkl_malloc(_len * sizeof(double), 64)), std::ptr_fun(mkl_free)),
	len(_len), inc(1)
{
	stack_assert(data.get() != nullptr);
	stack_assert(len > 0);
	std::normal_distribution<> d(0,0.1);
	// std::bind stores its arguments by value, so binding `gen` directly would
	// draw from a private copy: the caller's engine would never advance and
	// every vector built from it would receive the same numbers.
	std::generate(data.get(), data.get() + len, std::bind(d, std::ref(gen)));
}
Ejemplo n.º 13
// Allocates `s` bytes and throws std::bad_alloc when the allocator returns a
// null pointer; otherwise returns the buffer.
// NOTE(review): `r` is declared twice below (mkl_malloc with 64-byte alignment,
// then plain malloc). The two lines look like alternative branches of a
// preprocessor conditional whose directives are missing from this view, and
// the function's braces are missing as well — confirm against the original.
inline void* znn_malloc(size_t s)
    void* r = mkl_malloc(s,64);
    void* r = malloc(s);
    if ( !r ) throw std::bad_alloc();
    return r;
Ejemplo n.º 14
int create_array(const MKL_INT length, T*& x)
	if (x == nullptr)
		x = (T*)mkl_malloc(length*sizeof(T),64);
		if (x == nullptr)
			return OUTOFMEMORY;
	return 0;
Ejemplo n.º 15
/**
 * Allocates all wavefunction buffers (num_points elements each, 64-byte
 * aligned) and sets up the initial state.
 *
 * psi and psi_old start as exp(-0.05 * x^2) with zero imaginary part; every
 * other buffer is zeroed. psi is then passed through calc_norm and
 * normalize_wf, psi_abs2 is set to |psi|^2, and that array is handed to
 * save_data with the name "init_state.bin".
 *
 * @param sim_data  supplies num_points and the coordinate array x
 */
WavefunctionData::WavefunctionData(SimulationData &sim_data) {
    this->psi = (MKL_Complex16*)mkl_malloc(sim_data.num_points * sizeof(MKL_Complex16), 64);
    this->psi_old = (MKL_Complex16*)mkl_malloc(sim_data.num_points * sizeof(MKL_Complex16), 64);
    this->psi_new = (MKL_Complex16*)mkl_malloc(sim_data.num_points * sizeof(MKL_Complex16), 64);
    this->conj_psi = (MKL_Complex16*)mkl_malloc(sim_data.num_points * sizeof(MKL_Complex16), 64);
    this->psi_tf = (double*)mkl_malloc(sim_data.num_points * sizeof(double), 64);
    this->psi_abs2 = (double*)mkl_malloc(sim_data.num_points * sizeof(double), 64);
    this->wavefunction_norm = 1;

    #pragma omp parallel for
    for (int i = 0; i < sim_data.num_points; ++i) {
        // Declared inside the loop, so each iteration has its own copy and
        // no private() clause is needed.
        const double expval = exp(-0.05 * pow(sim_data.x[i], 2.0));
        this->psi[i].real = expval;
        this->psi[i].imag = 0;
        this->psi_old[i].real = expval;
        this->psi_old[i].imag = 0;
        this->psi_new[i].real = 0;
        this->psi_new[i].imag = 0;
        this->conj_psi[i].real = 0;
        this->conj_psi[i].imag = 0;
        this->psi_abs2[i] = 0;
        this->psi_tf[i] = 0;
    }

    calc_norm(sim_data, this->psi);
    normalize_wf(sim_data, this->psi);
    // |psi| first, then squared element-wise in place.
    vzAbs(sim_data.num_points, this->psi, this->psi_abs2);
    vdMul(sim_data.num_points, this->psi_abs2, this->psi_abs2, this->psi_abs2);
    save_data(this->psi_abs2, sim_data, "init_state.bin");
}

Ejemplo n.º 16
/*
 * Allocates the three matrices used by the multiplication test and fills the
 * two inputs with pseudo-random values in [0, 1] from rand().
 *
 *   *A_addr   m-by-c input, row-major
 *   *BT_addr  n-by-c input, row-major
 *   *C_addr   m-by-n output buffer, left uninitialised
 *
 * The three pointers are always stored through the out-parameters. If any
 * allocation fails, the inputs are not filled and the caller must check the
 * returned pointers for NULL.
 */
void initial_matrix(double** A_addr, double** BT_addr, double** C_addr, int m, int n, int c) {
    double* A = (double*)mkl_malloc(m * c * sizeof(double) , 16);
    double* BT = (double*)mkl_malloc(n * c * sizeof(double), 16);
    double* C = (double*)mkl_malloc(m * n * sizeof(double), 16);
    int i, j, k;

    *A_addr = A;
    *BT_addr = BT;
    *C_addr = C;

    /* Never write through a failed allocation. */
    if (A == NULL || BT == NULL || C == NULL) {
        return;
    }

    /* use random number for input */
    for (i = 0; i < m; i++) {
        for (k = 0; k < c; k++) {
            A[i*c+k] = ((double)rand()/(double)RAND_MAX);
        }
    }
    for (j = 0; j < n; j++) {
        for (k = 0; k < c; k++) {
            BT[j*c+k] = ((double)rand()/(double)RAND_MAX);
        }
    }
}
Ejemplo n.º 17
int main(int argc, char *argv[]){
	double inicio, fin = dsecnd();
	double *A = (double *)mkl_malloc(N*N*sizeof(double), 64);
	double *B = (double *)mkl_malloc(N*sizeof(double), 64);
	int *pivot = (int *)mkl_malloc(N*sizeof(int), 32);
	// distribucion normal de media 0 y varianza 1 
	std::default_random_engine generador;
	std::normal_distribution<double> aleatorio(0.0, 1.0);
	for (int i = 0; i < N*N; i++) A[i] = aleatorio(generador);
	for (int i = 0; i < N; i++) B[i] = aleatorio(generador);
	// matriz A marcadamente diagonal para evitar riesgo de singularidad 
	for (int i = 0; i < N; i++) A[i*N + i] += 10.0;
	int result;
	inicio = dsecnd();
	for (int i = 0; i < NTEST; i++)
		result = LAPACKE_dgesv(LAPACK_ROW_MAJOR, N, 1, A, N, pivot, B, 1);
	fin = dsecnd();
	double tiempo = (fin - inicio) / (double)NTEST;
	printf("Tiempo: %lf msec\n", tiempo*1.0e3);
	std::getchar(); return 0;
Ejemplo n.º 18
 *struct st_rmsm {
 *        int status; // internal status
 *        int size; // dimension, i.e., size of this matrix
 *        int *pos; // i-th row starts from n[pos[i]] and a[pos[i]]
 *        int *rsz; // i-th row has rsz[i] non-zero elements, 'rsz' stands for row size
 *        int **col
 *        double **data;
 *        std::vector<intdbl_t> *tmp; // used only when unpacked
struct st_rmsm *rmsm_create(const int size)

	struct st_rmsm *m=(struct st_rmsm*)mkl_malloc(sizeof(struct st_rmsm),64);


	m->size  = size;
	m->pos   = (int*)mkl_malloc(sizeof(int)*size,64);
	m->rsz   = (int*)mkl_malloc(sizeof(int)*size,64);
	m->col   = NULL;
	m->data  = NULL;
	m->tmp   = new std::vector<intdbl_t>[size];


	return m;
Ejemplo n.º 19
/**
 * Writes the real parts of `data` to `filename` as raw binary doubles.
 *
 * @param data      array of sim_data.num_points complex values
 * @param sim_data  supplies num_points
 * @param filename  output path, opened with mode "wb"
 * @return 0 on success; 1 if the scratch buffer cannot be allocated, the file
 *         cannot be opened, or fewer than num_points values could be written
 */
int save_data_real(MKL_Complex16 *data, SimulationData &sim_data, const char * filename) {
	double *data2;
	data2 = (double*)mkl_malloc(sim_data.num_points * sizeof(double), 64);
	if (data2 == NULL) {
		return 1;
	}

	for (int i = 0; i < sim_data.num_points; ++i) {
		data2[i] = data[i].real;
	}

	int rc = 0;
	FILE* pFile;
	pFile = fopen(filename, "wb");
	if (pFile == NULL) {
		rc = 1;
	} else {
		size_t written = fwrite(data2, sizeof(double), sim_data.num_points, pFile);
		if (written != (size_t)sim_data.num_points) {
			rc = 1;
		}
		// fclose flushes buffered data, so its result matters for a write stream.
		if (fclose(pFile) != 0) {
			rc = 1;
		}
	}

	mkl_free(data2);
	return rc;
}
Ejemplo n.º 20
// Constructs a 3D matrix as an array of `z` CMatrix slices.
// Allocates raw storage for z CMatrix objects (64-byte aligned) and records z
// as the depth.
// NOTE(review): the constructor's braces and the body of the final loop are
// missing from this view, so how r and c are used, and how each slice is
// initialised inside the mkl_malloc'ed storage, cannot be seen here.
CMatrix3D::CMatrix3D(MKL_INT r,MKL_INT c, MKL_INT z)
//Since we are planning on very large non-sparse arrays therefore we will make this a list of CMatrix pointers
//this will enable the creation of large matrix arrays in different memory regions thus avoid wasting memory at the cost of performance

	pMats = (CMatrix*) mkl_malloc(z*sizeof(CMatrix),64);

	depth = z;

	// One iteration per slice; the loop body is not visible here.
	for(int i =0; i < z;i++)

Ejemplo n.º 21
int main()
    double *A, *B, *C;
    int m, n, p, i, j;
    double alpha, beta;

    printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
            " Intel® MKL function dgemm, where A, B, and  C are matrices and \n"
            " alpha and beta are double precision scalars\n\n");

    m = 2000, p = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A = (double *)mkl_malloc( m*p*sizeof( double ), 64 );
    B = (double *)mkl_malloc( p*n*sizeof( double ), 64 );
    C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
      printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
      return 1;

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);

    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);

    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;

    printf (" Computing matrix product using Intel® MKL dgemm function via CBLAS interface \n\n");
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 
                m, n, p, alpha, A, p, B, n, beta, C, n);
    printf ("\n Computations completed.\n\n");

    printf (" Top left corner of matrix A: \n");
    for (i=0; i<min(m,6); i++) {
      for (j=0; j<min(p,6); j++) {
        printf ("%12.0f", A[i+j*p]);
      printf ("\n");

    printf ("\n Top left corner of matrix B: \n");
    for (i=0; i<min(p,6); i++) {
      for (j=0; j<min(n,6); j++) {
        printf ("%12.0f", B[j+i*n]);
      printf ("\n");
    printf ("\n Top left corner of matrix C: \n");
    for (i=0; i<min(m,6); i++) {
      for (j=0; j<min(n,6); j++) {
        printf ("%12.5G", C[j+i*n]);
      printf ("\n");

    printf ("\n Deallocating memory \n\n");

    printf (" Example completed. \n\n");
    return 0;
Ejemplo n.º 22
		// Constructs an n-by-n matrix: stores n in `num` and allocates n*n
		// doubles, 64-byte aligned, into `data`.
		// NOTE(review): the allocation result is not checked, and the rest of
		// the constructor (including its closing brace) is not visible here.
		Matrix(const int n): num(n) {
			data = (double*)mkl_malloc(sizeof(double)*n*n,64);
Ejemplo n.º 23
// X: a MxD matrix, Y: a M vector, W: a M vector
// W0: a M vector
//
// Driver for the parallel symbolic SGD benchmark.
//
// Reads M, D, T, C, lamda and r from the command line (defaults 32, 4, 10, 4,
// 0.01, 1), splits the M points into panels of C*r points, and for each
// OpenMP thread count nt = 1, 2, 4, ... (bounded by the MKL maximum and by
// the panel size) repeats REPEATS times: preprocess, run T passes of gd()
// over all panels, compute the error and print the timings in milliseconds.
//
// T, panelSz, panels, nt and prev_err are not declared in this function and
// must be file-scope variables; REPEATS and PAGESIZE are defined elsewhere.
//
// Returns 0 on success, 1 for the usage message or invalid C/r, 2 when an
// allocation fails.
int main(int argc, char ** argv){
    if (argc>1 && argv[1][0]=='h') {
        printf ("Usage: parSymSGD M D T C lamda r\n");
        printf ("  M: number of data points, D: dimensions, T: time iterations, C: cores;\n");
        printf ("  lamda: learning rate, r: panel size in unit of C.\n");
        return 1;
    }
    // read in the arguments: M, D, I (time iterations), C (cores), r (each panel contains r*C points)
    int M = argc>1?atoi(argv[1]):32;
    int D = argc>2?atoi(argv[2]):4;
    T = argc>3?atoi(argv[3]):10;
    int C = argc>4?atoi(argv[4]):4;
    float lamda = argc>5?atof(argv[5]):0.01;
    int r = argc>6?atoi(argv[6]):1;
    ///printf("M=%d, D=%d, T=%d, C=%d, lamda=%8.6f, r=%d\n",M,D,T,C,lamda,r);

    // panelSz = C*r is used as a divisor below; reject values that make it
    // zero or negative instead of dividing by zero.
    if (C <= 0 || r <= 0) {
        printf("Invalid arguments: C and r must be positive.\n");
        return 1;
    }

    int max_threads = mkl_get_max_threads(); // get the max number of threads
    int rep;
    mkl_set_num_threads(1); // set the number of threads to use by mkl
    panelSz = C*r;
    panels = M/panelSz;

    int p,t;
    float *Y, *Wreal, *W, *X;
    Y = (float *) mkl_malloc(M*sizeof(float),PAGESIZE);
    Wreal = (float *) mkl_malloc(D*sizeof(float),PAGESIZE);
    W = (float *) mkl_malloc(D*sizeof(float),PAGESIZE);
    X = (float *) mkl_malloc(M*D*sizeof(float),PAGESIZE);
    float *Ypred = (float*)mkl_malloc(M*sizeof(float),PAGESIZE);
    float *Ytmp = (float*)mkl_malloc(M*sizeof(float),PAGESIZE);
    float *I = (float*)mkl_malloc(D*D*sizeof(float),PAGESIZE);
    float *Z = (float*)mkl_malloc(M*D*sizeof(float),PAGESIZE);
    float *B = (float*)mkl_malloc(panels*D*sizeof(float),PAGESIZE);

    if (Y==NULL || Wreal==NULL || W==NULL || X==NULL || Ypred==NULL || Ytmp==NULL || Z==NULL || B==NULL || I==NULL){
        printf("Memory allocation error.\n");
        // Release whatever was allocated before bailing out.
        mkl_free(B);
        mkl_free(Z);
        mkl_free(Ytmp);
        mkl_free(Ypred);
        mkl_free(Y);
        mkl_free(Wreal);
        mkl_free(W);
        mkl_free(X);
        mkl_free(I);
        return 2;
    }

    initData(Wreal,W,X,Y, M, D,I);

    ///printf("panelSz=%d, panels=%d\n", panelSz, panels);

    for (nt=1; nt<=max_threads && nt<=panelSz; nt*=2){
        omp_set_num_threads(nt);// set the number of openMP threads

        for (rep=0; rep<REPEATS; rep++){//repeat measurements
            double prepTime, gdTime, sInit;

            // preprocessing
            // sInit was previously read without ever being assigned.
            sInit = dsecnd();
            //preprocessSeq(X, Y, Z, B, panelSz, panels, M, D, lamda);
            preprocessPar(X, Y, Z, B, panelSz, panels, M, D, lamda);
            prepTime = (dsecnd() - sInit);

            // GD
            ///dump1("W (initial)", W, D);
            float err;
            float fixpoint = 0.0;
            sInit = dsecnd();
            for (t=0;t<T;t++){
                for (p=0;p<panels;p++){
                    gd(&(X[p*panelSz*D]),&(Z[p*panelSz*D]), &(B[p*D]), panelSz, D, lamda, W, I);
                    ///printf("(t=%d, p=%d) ",t,p);
                    ///dump1("W", W, D);
                    ///err=calErr(X, Ypred, Ytmp, Y, W, M, D);
                    // NOTE(review): the original braces are lost; placing this
                    // per-panel message inside the panel loop is inferred
                    // from its wording — confirm.
                    printf("finish  one  panels     ............................  \n");
                }
            }
            gdTime = (dsecnd() - sInit);

            err=calErr(X, Ypred, Ytmp, Y, W, M, D);
            fixpoint = err - prev_err;

            // print final err. time is in milliseconds
            printf("nt=%d\t ttlTime=%.5f\t prepTime=%.5f\t gdTime=%.5f\t error=%.5f\n", nt, (gdTime+prepTime)*1000, prepTime*1000, gdTime*1000, err);
        }
    }

    mkl_free(B);
    mkl_free(Z);
    mkl_free(Ytmp);
    mkl_free(Ypred);
    mkl_free(Y);
    mkl_free(Wreal);
    mkl_free(W);
    mkl_free(X);
    mkl_free(I);
    return 0;
}
Ejemplo n.º 24
int main()
    double *A, *B, *C;
    int m, n, p, i, j, k, r;
    double alpha, beta;
    double sum;
    double s_initial, s_elapsed;

    printf ("\n This example measures performance of rcomputing the real matrix product \n"
            " C=alpha*A*B+beta*C using a triple nested loop, where A, B, and C are \n"
            " matrices and alpha and beta are double precision scalars \n\n");

    m = 2000, p = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0; beta = 0.0;
    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A = (double *)mkl_malloc( m*p*sizeof( double ), 64 );
    B = (double *)mkl_malloc( p*n*sizeof( double ), 64 );
    C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        return 1;

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);

    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);

    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;

    printf (" Making the first run of matrix product using triple nested loop\n"
            " to get stable run time measurements \n\n");
    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sum = 0.0;
            for (k = 0; k < p; k++)
                sum += A[p*i+k] * B[n*k+j];
            C[n*i+j] = sum;

    printf (" Measuring performance of matrix product using triple nested loop \n\n");
    s_initial = dsecnd();
    for (r = 0; r < LOOP_COUNT; r++) {
        for (i = 0; i < m; i++) {
            for (j = 0; j < n; j++) {
                sum = 0.0;
                for (k = 0; k < p; k++)
                    sum += A[p*i+k] * B[n*k+j];
                C[n*i+j] = sum;
    s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;
    printf (" == Matrix multiplication using triple nested loop completed == \n"
            " == at %.5f milliseconds == \n\n", (s_elapsed * 1000));
    printf (" Deallocating memory \n\n");
    if (s_elapsed < 0.9/LOOP_COUNT) {
        printf(" It is highly recommended to define LOOP_COUNT for this example on your \n"
               " computer as %i to have total execution time about 1 second for reliability \n"
               " of measurements\n\n", i);
    printf (" Example completed. \n\n");
    return 0;
Ejemplo n.º 25
/*
 * Exercises the solver_v2 set-up path: loads the mesh "msh/0616/0616",
 * creates a v2 solver on it, allocates four complex work vectors of s->Ng
 * elements and sets every element of x0 to 1.0.
 *
 * Returns 0 on success, 1 if a work vector cannot be allocated.
 *
 * NOTE(review): the statements that would use b0, b1, x1, nitr and eps, and
 * any release of q and s, are not present in this view.
 */
int test01(void)
{
	int err=0; 
        printf("	|Test solver_v2 workflow\n");

	//struct st_mesh *q=mshio_create_mesh("msh/0344/0344");
	struct st_mesh *q=mshio_create_mesh("msh/0616/0616");

	/*
	 * ipar[0] = M
	 * ipar[1] = Nd
	 * ipar[2] = pad
	 * ipar[3] = rule1
	 * ipar[4] = rule2
	 * ipar[5] = nu
	 * ipar[6] = nv
	 * ipar[7] = num_threads in omp
	 */
	const int ipar[128]={1,3,1, 1,1,5,3, 1};
	/*
	 * dpar[0] = g factor
	 * dpar[1] = mua (absorption coefficient)
	 * dpar[2] = mus (scattering coefficient)
	 */
	const double dpar[128]={0.7,1.0,2.0};

	struct st_solver_v2 *s=sv2_create_solver(q,ipar,dpar);
	//for (int i = 0; i < s->Nt; i++)

	double _Complex *b0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	double _Complex *x0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	double _Complex *b1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	double _Complex *x1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	int nitr;
	double eps;

	if (b0 == NULL || x0 == NULL || b1 == NULL || x1 == NULL) {
		/* Never write through a failed allocation. */
		err = 1;
	} else {
		for (int i = 0; i < s->Ng; i++)
			x0[i] = 1.0;
	}

	/* The work vectors are local to this test and were previously leaked. */
	mkl_free(b0);
	mkl_free(x0);
	mkl_free(b1);
	mkl_free(x1);

        printf("END OF TEST01\n");
	return err;
}
Ejemplo n.º 26
/*
 * Exercises the solver_v1 set-up path: loads the mesh "msh/0344/0344",
 * creates a v1 solver on it and allocates four complex work vectors of
 * s->Ng elements.
 *
 * Returns 0 on success, 1 if a work vector cannot be allocated.
 *
 * NOTE(review): the statements that would use the work vectors, nitr, eps
 * and dir, and any release of q and s, are not present in this view.
 */
int test01(void)
{
	int err=0; 
        printf("	|Test solver_v1 workflow\n");

	struct st_mesh *q=mshio_create_mesh("msh/0344/0344");

	/*
	 * ipar[0] = M
	 * ipar[1] = Nd
	 * ipar[2] = pad
	 * ipar[3] = rule1
	 * ipar[4] = rule2
	 * ipar[5] = nu
	 * ipar[6] = nv
	 * ipar[7] = num_threads in omp
	 */
	//const int ipar[128]={1,3,1, 1,1,5,3, 1};
	const int ipar[128]={1,3,1, 1,1,5,3, 8};
	//const int ipar[128]={1,30,1, 2,5,5,3, 8};
	/*
	 * dpar[0] = g factor
	 * dpar[1] = mua (absorption coefficient)
	 * dpar[2] = mus (scattering coefficient)
	 */
	const double dpar[128]={0.7,1.0,2.0};

	struct st_solver_v1 *s=sv1_create_solver(q,ipar,dpar);
	//for (int i = 0; i < s->Ns; i++)
		//printf("[%5d] %.5E\n",i,s->E[i]);

	double _Complex *b0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	double _Complex *x0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	double _Complex *b1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	double _Complex *x1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64);
	int nitr;
	double eps;

	if (b0 == NULL || x0 == NULL || b1 == NULL || x1 == NULL) {
		err = 1;
	}

	char dir[FILENAME_MAX]="SOL";

	/* The work vectors are local to this test and were previously leaked. */
	mkl_free(b0);
	mkl_free(x0);
	mkl_free(b1);
	mkl_free(x1);

        printf("END OF TEST01\n");
	return err;
}
Ejemplo n.º 27
// Converts a matrix from its unpacked form (one temporary vector of
// (column, value) entries per row in m->tmp) to packed arrays: entries with
// equal column index within a row are summed, m->pos[i]/m->rsz[i] record where
// row i starts and how many entries it has, and m->col/m->data receive the
// column indices and values. m->tmp is deleted along the way.
//
// NOTE(review): several statements appear to be missing from this view, so
// the code below is not complete as shown:
//   - the function's braces and several closing braces of loops;
//   - the body of the "sort each row" loop;
//   - whatever appends to vi[n]/vd[n] before vi[n].at(0) and vd[n].back()
//     are read, and in the else-branch of the merge;
//   - the assignment of m->length and the allocation of m->data.
// Confirm against the original before relying on any of it.
void rmsm_pack(struct st_rmsm *m)

	//for (int i = 0; i < m->size; i++)
		//printf("[%5d] %lu\n",i,m->tmp[i].size());

	//const int row=73; // examine this row

	//printf("row %d (raw)\n",row);
	//for (int i = 0; i < m->tmp[row].size(); i++)
		//printf("[%5d] %f\n",m->tmp[row].at(i).i,m->tmp[row].at(i).d); 

	// sort each row
	for (int n = 0; n < m->size; n++)

	//for (int i = 0; i < m->tmp[row].size(); i++)
		//printf("[%5d] %f\n",m->tmp[row].at(i).i,m->tmp[row].at(i).d); 

	// merge
	// vi[n] / vd[n] collect the distinct column indices and summed values of row n.
	std::vector<int>    *vi = new std::vector<int>[m->size];
	std::vector<double> *vd = new std::vector<double>[m->size];
	for (int n = 0; n < m->size; n++) {
		// Empty rows contribute nothing.
		if (m->tmp[n].size()==0) continue;
		int curr_col=vi[n].at(0);
		for (int j = 1; j < m->tmp[n].size(); j++) {
			int    col=m->tmp[n].at(j).i;
			double val=m->tmp[n].at(j).d;
			// Same column as the previous entry: accumulate into it.
			if (col==curr_col)
				vd[n].back() += val;
			else {
				curr_col = col;
	// The temporary per-row vectors are no longer needed after merging.
	delete [] m->tmp;

	//for (int i = 0; i < vi[row].size(); i++)
		//printf("[%5d] %f\n",vi[row].at(i),vd[row].at(i));

	// pack
	// ptr is the running offset of the current row in the packed arrays.
	{ int ptr=0;
	for (int i = 0; i < m->size; i++) {
		m->pos[i] = ptr;
		ptr      += vi[i].size();
		m->rsz[i] = vi[i].size();
	m->col =(int*)   mkl_malloc(sizeof(int)   *(m->length),64);
	// Copy each row's entries to its slot starting at m->pos[i].
	for (int i = 0; i < m->size; i++)
		for (int j = 0; j < vi[i].size(); j++) {
			m->col [j+m->pos[i]] = vi[i].at(j);
			m->data[j+m->pos[i]] = vd[i].at(j);
	delete [] vi;
	delete [] vd;

	//for (int i = 0; i < m->rsz[row]; i++)
		//printf("[%5d] %f\n",m->col[i+m->pos[row]],m->data[i+m->pos[row]]);

// Stores the fit configuration in the object and allocates the working
// buffers: per-thread residual and parameter arrays, the old and new
// population arrays, and the missFreq arrays. It then sets
// _minimumParameters to its starting state (all elastic constants 1,
// chiSq = +infinity).
//
// dataSet:             pointer stored as-is in _dataSet (not copied here).
// dataSetLength:       stored in _dataSetLength; length of each
//                      _residualArray[i].
// nPopulation:         element count for both population allocations.
// scaleFactor:         stored in _scaleFactor.
// crossingProbability: stored in _crossingProbability.
//
// NOTE(review): this text has lost the closing braces of its loops and of
// the function itself, so it is not possible to tell from here which
// statements sit inside which loop. Restore them from the original source.
void GeneticAlgorithm::initializeParameters(double* dataSet, int dataSetLength, int nPopulation, double scaleFactor, double crossingProbability){

	_scaleFactor = scaleFactor;
	_crossingProbability = crossingProbability;
	_dataSetLength = dataSetLength;
	_dataSet = dataSet;

	// One residual buffer and one parameter buffer per thread.
	_residualArray = new double*[nThreads];
	_paramArray = new double*[nThreads];
	for(int i = 0; i < nThreads; i++){
	_residualArray[i] = new double[_dataSetLength];
	_paramArray[i] = new double[nParams];

	// Population storage comes from mkl_malloc with 16-byte alignment.
	// NOTE(review): the results are not checked for NULL in the visible code.
	_populationParametersOld = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16);
	_populationParametersNew = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16);
	// NOTE(review): the buffers above are sized with the parameter
	// nPopulation, but this loop is bounded by the member _nPopulation, and
	// no assignment to _nPopulation is visible in this function. Confirm the
	// two always agree.
	for(int i  = 0; i < _nPopulation; i++){
		_populationParametersOld[i].missFreq = new long[_nMissing];
		_populationParametersNew[i].missFreq = new long[_nMissing];

	/*	_populationParametersOld[i].c11 = (randomDouble(196,196.1))*pow(10,9);
		_populationParametersOld[i].c22 = _populationParametersOld[i].c11;
		_populationParametersOld[i].c33 = (randomDouble(187,187.1))*pow(10,9);

		_populationParametersOld[i].c44 = (randomDouble(63.5,63.6))*pow(10,9);
		_populationParametersOld[i].c55 = _populationParametersOld[i].c44;
		_populationParametersOld[i].c66 = (randomDouble(55.7,55.8))*pow(10,9);

		_populationParametersOld[i].c12 = (randomDouble(62.5,62.6))*pow(10,9);
		_populationParametersOld[i].c13 = (randomDouble(69.8,69.9))*pow(10,9);
		_populationParametersOld[i].c23 = _populationParametersOld[i].c13; 

		_populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);/// ***Tetragonal PuCoGa5*/
	// NOTE(review): the block comment opened on the next line has no
	// terminator anywhere in the visible text, so everything after it is
	// commented out as written. The terminator was presumably on a line that
	// is not shown here - confirm which preset is meant to be live.
	/*	_populationParametersOld[i].c11 = (randomDouble(260,300))*pow(10,9);
		_populationParametersOld[i].c22 = _populationParametersOld[i].c11;
		_populationParametersOld[i].c33 = (randomDouble(290,320))*pow(10,9);

		_populationParametersOld[i].c44 = (randomDouble(90,110))*pow(10,9);
		_populationParametersOld[i].c55 = _populationParametersOld[i].c44;
		_populationParametersOld[i].c66 = (randomDouble(130,150))*pow(10,9);

		_populationParametersOld[i].c12 = (randomDouble(140,165))*pow(10,9);
		_populationParametersOld[i].c13 = (randomDouble(100,130))*pow(10,9);
		_populationParametersOld[i].c23 = _populationParametersOld[i].c13; 

		_populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);/// ***Tetragonal URu2Si2

		_populationParametersOld[i].c11 = (randomDouble(220,260))*pow(10,9);
		_populationParametersOld[i].c22 = (randomDouble(210,250))*pow(10,9);
		_populationParametersOld[i].c33 = (randomDouble(100,150))*pow(10,9);

		_populationParametersOld[i].c44 = (randomDouble(32,38))*pow(10,9);
		_populationParametersOld[i].c55 = (randomDouble(48,52))*pow(10,9);
		_populationParametersOld[i].c66 = (randomDouble(94,98))*pow(10,9);

		_populationParametersOld[i].c12 = (randomDouble(100,150))*pow(10,9);
		_populationParametersOld[i].c13 = (randomDouble(25,60))*pow(10,9);
		_populationParametersOld[i].c23 = (randomDouble(20,70))*pow(10,9);

		_populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);/// ***Orthorhombic YBCO67

		//_populationParametersOld[i].c11 = (randomDouble(1,400))*pow(10,9);
		//_populationParametersOld[i].c22 = _populationParametersOld[i].c11;
		//_populationParametersOld[i].c33 = _populationParametersOld[i].c11;
		//_populationParametersOld[i].c44 = (randomDouble(1,400))*pow(10,9);
		//_populationParametersOld[i].c55 = _populationParametersOld[i].c44;		
		//_populationParametersOld[i].c66 = _populationParametersOld[i].c44;

		//_populationParametersOld[i].c12 = (randomDouble(1,400))*pow(10,9);	
		//_populationParametersOld[i].c13 = _populationParametersOld[i].c12; 
		//_populationParametersOld[i].c23 = _populationParametersOld[i].c12; 

		//_populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);/// ***Cubic Nb

		//_populationParametersOld[i].c11 = (randomDouble(120,200))*pow(10,9);
		//_populationParametersOld[i].c22 = _populationParametersOld[i].c11;
		//_populationParametersOld[i].c33 = (randomDouble(120,200))*pow(10,9);

		//_populationParametersOld[i].c44 = (randomDouble(10,100))*pow(10,9);
		//_populationParametersOld[i].c55 = _populationParametersOld[i].c44;
		//_populationParametersOld[i].c66 = (randomDouble(10,100))*pow(10,9);

		//_populationParametersOld[i].c12 = (randomDouble(10,100))*pow(10,9);
		//_populationParametersOld[i].c13 = (randomDouble(10,100))*pow(10,9);
		//_populationParametersOld[i].c23 = _populationParametersOld[i].c13; 

		//_populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);/// ***Tetragonal CeCoIn5


		// Starting state of the best-so-far record: every elastic constant is
		// 1 and chiSq is +infinity.
		_minimumParameters.c11 = 1;
		_minimumParameters.c22 = 1;
		_minimumParameters.c33 = 1;
		_minimumParameters.c44 = 1;
		_minimumParameters.c55 = 1;
		_minimumParameters.c66 = 1;
		_minimumParameters.c12 = 1;
		_minimumParameters.c13 = 1;
		_minimumParameters.c23 = 1;

		_minimumParameters.chiSq = std::numeric_limits<double>::infinity();

		_minimumParameters.missFreq = new long[_nMissing];
// Stores the fit configuration in the object, allocates the per-thread
// work buffers and the old and new population arrays, fills the old
// population with starting values (randomDouble draws plus a few fixed
// constants) and their residuals, clears T / m1 / m2 of the new population,
// and resets _minimumParameters.
//
// dataSet:             pointer stored as-is in _dataSet (not copied here).
// dataSetLength:       stored in _dataSetLength; length of each
//                      _residualArray[i].
// nPopulation:         element count for both population allocations.
// scaleFactor:         stored in _scaleFactor.
// crossingProbability: stored in _crossingProbability.
//
// NOTE(review): this text has lost the closing braces of its loops and of
// the function itself, so it is not possible to tell from here which
// statements sit inside which loop. Restore them from the original source.
void GeneticAlgorithm2::initializeParameters(double** dataSet, int dataSetLength, int nPopulation, double scaleFactor, double crossingProbability){

	_scaleFactor = scaleFactor;
	_crossingProbability = crossingProbability;
	_dataSetLength = dataSetLength;
	_dataSet = dataSet;

	// One residual buffer and one parameter buffer per thread.
	_residualArray = new double*[nThreads];
	_paramArray = new double*[nThreads];
	for(int i = 0; i < nThreads; i++){
	_residualArray[i] = new double[_dataSetLength];
	_paramArray[i] = new double[nParams];
	// xVals[i] = new double[4];

	// Population storage comes from mkl_malloc with 16-byte alignment.
	// NOTE(review): the results are not checked for NULL in the visible code.
	_populationParametersOld = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16);
	_populationParametersNew = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16);
	// NOTE(review): the buffers above are sized with the parameter
	// nPopulation, but both loops below are bounded by the member
	// _nPopulation, and no assignment to _nPopulation is visible in this
	// function. Confirm the two always agree.
	for(int i  = 0; i < _nPopulation; i++){
		// Starting values for member i of the old population. dF2, m1, m2 and
		// T are fixed constants; the rest are randomDouble(lo, hi) draws.
		_populationParametersOld[i].A1 = randomDouble(0.0,1.0);
		_populationParametersOld[i].A2 = randomDouble(0.0,1.0);
		_populationParametersOld[i].F1 = randomDouble(460,480);
		_populationParametersOld[i].F2 = randomDouble(520,540);
		_populationParametersOld[i].dF1 = randomDouble(25.,45.0);
		_populationParametersOld[i].dF2 = 0;
		_populationParametersOld[i].phi1 = randomDouble(0.0,1.0);
		_populationParametersOld[i].phi2 = randomDouble(0.0,1.0);
		_populationParametersOld[i].Td1 = randomDouble(4., 8.);
		_populationParametersOld[i].Td2 = randomDouble(4., 8.);
	//	_populationParametersOld[i].ms1 = randomDouble(1,2.2);
	//	_populationParametersOld[i].ms2 = randomDouble(1.5,1.7);
		_populationParametersOld[i].ms1 = randomDouble(1.0,1.4);
		_populationParametersOld[i].ms2 = randomDouble(1.5,1.7);
	//	_populationParametersOld[i].m1 =  randomDouble(1.0,3);
	//	_populationParametersOld[i].m2 =  randomDouble(1.0,3);
		_populationParametersOld[i].m1 = 1.7;
		_populationParametersOld[i].m2 =  1.7;
	//	_populationParametersOld[i].dF12 = randomDouble(-15,5);
	//	_populationParametersOld[i].ms11 = randomDouble(0.0,.2);
	//	_populationParametersOld[i].ms22 = randomDouble(0.0,.2);
		_populationParametersOld[i].T = 4.2;
		// Residual of the freshly initialised member; the second argument is 0.
		_populationParametersOld[i].chiSq = calculateResidual2(&_populationParametersOld[i],0);

	// Only T, m1 and m2 of the new population are cleared in the visible code.
	for(int i  = 0; i < _nPopulation; i++){
		_populationParametersNew[i].T = 0;
		_populationParametersNew[i].m1 = 0;
		_populationParametersNew[i].m2 = 0;
		// Reset of the best-so-far record: A1 = A2 = 1, every other field 0.
		_minimumParameters.A1 = 1;
		_minimumParameters.A2 = 1;
		_minimumParameters.F1 = 0;
		_minimumParameters.F2 = 0;
		_minimumParameters.dF1 = 0;
		_minimumParameters.dF2 = 0;
		_minimumParameters.phi1 = 0;
		_minimumParameters.phi2 = 0;
		_minimumParameters.Td1 = 0;
		_minimumParameters.Td2 = 0;
		_minimumParameters.ms1 = 0;
		_minimumParameters.ms2 = 0;
		_minimumParameters.m1 = 0;
		_minimumParameters.m2 = 0;
	//	_minimumParameters.dF12 = 0;
	//	_minimumParameters.ms11 = 0;
	//	_minimumParameters.ms22 = 0;
		_minimumParameters.T = 0;
		// NOTE(review): INFINITE is not defined in this function. If it is the
		// Win32 wait-timeout macro, its value is 0xFFFFFFFF and not a
		// floating-point infinity; GeneticAlgorithm::initializeParameters uses
		// std::numeric_limits<double>::infinity() for the same field. Confirm
		// the intended value.
		_minimumParameters.chiSq = INFINITE;
Ejemplo n.º 30
Archivo: dgemm_3.c Proyecto: yoyz/mpi
// Runs one SIZE x SIZE x SIZE double-precision matrix product C = A*B
// through cblas_dgemm and prints the time and the derived GFLOP rate.
//
// A is filled with 1, 2, 3, ..., B with -1, -2, -3, ..., and C with zeros.
// All three are stored row-major in 64-byte-aligned mkl_malloc buffers.
// Up to the top-left 6x6 corner of each matrix is printed through DPRINTF.
//
// Returns 1 when any of the three allocations fails, 0 otherwise.
//
// NOTE(review): this text has lost the closing braces of its loops, of the
// allocation check and of the function itself, and some statements appear
// to be missing (see the notes below). Restore them from the original
// source before building.
int bench_dgemm()
    double *A, *B, *C;
    int m, n, k, i, j;
    double alpha, beta;
    double t;

    // A is m x k, B is k x n, C is m x n; all three dimensions equal SIZE.
    m = SIZE, k = SIZE, n = SIZE;
    DPRINTF(" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
    // alpha = 1 and beta = 0 make dgemm compute C = A*B.
    alpha = 1.0; beta = 0.0;

    DPRINTF(" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    // NOTE(review): m*k, k*n and m*n are evaluated in int before the
    // multiplication by sizeof; SIZE is not visible here, so confirm these
    // products cannot overflow.
    A = (double *)mkl_malloc( m*k*sizeof( double ), 64 );
    B = (double *)mkl_malloc( k*n*sizeof( double ), 64 );
    C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
    // NOTE(review): on this failure path any buffer that was allocated
    // successfully is not released in the visible code.
    if (A == NULL || B == NULL || C == NULL) {
      printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
      return 1;

    DPRINTF(" Intializing matrix data \n\n");
    // A[i] = i + 1
#pragma omp parallel for 
    for (i = 0; i < (m*k); i++) {
        A[i] = (double)(i+1);
    // B[i] = -(i + 1)
#pragma omp parallel for 
    for (i = 0; i < (k*n); i++) {
        B[i] = (double)(-i-1);

    // C starts at zero
#pragma omp parallel for 
    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;

    DPRINTF(" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
    // Row-major, no transposes; leading dimensions are the row lengths
    // (k for A, n for B and C).
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 
                m, n, k, alpha, A, k, B, n, beta, C, n);
    // NOTE(review): no assignment to t is visible anywhere in this function,
    // so as written t is read uninitialised here and used as a divisor.
    // Presumably the timing statements around the dgemm call are on lines
    // not shown - confirm.
    printf("calculation time : %f\n",t);
    // 2*m*n*k floating-point operations, scaled to 1e9 operations per second.
    printf("gflops/s         : %f\n",((2.0*m*n*k)*1E-9)/t);

    DPRINTF("\n Computations completed.\n\n");

    // Row-major indexing: element (i, j) of A is A[j + i*k].
    DPRINTF(" Top left corner of matrix A: \n");
    for (i=0; i<min(m,6); i++) {
      for (j=0; j<min(k,6); j++) {
        DPRINTF("%12.0f", A[j+i*k]);

    DPRINTF("\n Top left corner of matrix B: \n");
    for (i=0; i<min(k,6); i++) {
      for (j=0; j<min(n,6); j++) {
        DPRINTF("%12.0f", B[j+i*n]);
    DPRINTF("\n Top left corner of matrix C: \n");
    for (i=0; i<min(m,6); i++) {
      for (j=0; j<min(n,6); j++) {
        DPRINTF("%12.5G", C[j+i*n]);

    // NOTE(review): the message below announces deallocation, but no call
    // releasing A, B or C is visible in this function.
    DPRINTF("\n Deallocating memory \n\n");

    DPRINTF(" Example completed. \n\n");
    return 0;