int main() { double *A, *B, *C; int i,j,r,max_threads,size; double alpha, beta; double s_initial, s_elapsed; printf("Intializing data for matrix multiplication C=A*B for matrix\n\n" " A(%i*%i) and matrix B(%i*%i)\n",M,P,P,N); alpha = 1.0; beta = 0.0; printf("Allocating memory for matrices aligned on 64-byte boundary for better performance \n\n"); A = ( double *)mkl_malloc(M*P*sizeof( double ),64); B = ( double *)mkl_malloc(N*P*sizeof( double ),64); C = ( double *)mkl_malloc(M*N*sizeof( double ),64); if (A == NULL || B == NULL || C == NULL) { printf("Error: can`t allocate memory for matrices.\n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf("Intializing matrix data\n\n"); size = M*P; for (i = 0; i < size; ++i) { A[i] = ( double )(i+1); } size = N*P; for (i = 0; i < size; ++i) { B[i] = ( double )(i-1); } printf("Finding max number of threads can use for parallel runs \n\n"); max_threads = mkl_get_max_threads(); printf("Running from 1 to %i threads \n\n",max_threads); for (i = 1; i <= max_threads; ++i) { size = M*N; for (j = 0; j < size; ++j) { C[j] = 0.0; } printf("Requesting to use %i threads \n\n",i); mkl_set_num_threads(i); printf("Measuring performance of matrix product using dgemm function\n" " via CBLAS interface on %i threads \n\n",i); s_initial = dsecnd(); for (r = 0; r < LOOP_COUNT; ++r) { cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, P, alpha, A, P, B, N, beta, C, N); // multiply matrices with cblas_dgemm; } s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT; printf("Matrix multiplication using dgemm completed \n" " at %.5f milliseconds using %d threads \n\n", (s_elapsed * 1000),i); printf("Output the result: \n"); size = M*N; for (i = 0; i < size; ++i) { printf("%i\t",(int)C[i]); if (i % N == N - 1) printf("\n"); } } printf("Dellocating memory\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 0; }
void solve1() { struct st_mesh *q=mshio_create_mesh(fin_base); //mshio_print_mesh(q,PRINT_INFO_VERBOSE); struct st_solver_v1 *s=sv1_create_solver(q,ipar,dpar); sv1_print_solverinfo(s); //double _Complex *b0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *x0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *b1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *x1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); //assert(b0); assert(x0); assert(b1); assert(x1); int nitr; double eps; //sv1_gen_b0(s,0.0,b0); //sv1_solve(s,b0,x0,200,12,1.0E-13,&nitr,&eps); //sv1_save_solution(s,x0,output_dir); sv1_gen_b1x0(s,0.0,b1,x0); sv1_solve(s,b1,x1,200,30,1.0E-13,&nitr,&eps); sv1_save_solution(s,x1,output_dir); //mkl_free(b0); mkl_free(x0); mkl_free(b1); mkl_free(x1); sv1_destroy_solver(s); mshio_destroy_mesh(q); }
void GeneticAlgorithm::resetParameters(int nPopulation, double scaleFactor, double crossingProbability){ _scaleFactor = scaleFactor; _crossingProbability = crossingProbability; _nPopulation = nPopulation; delete [] _populationParametersOld; delete [] _populationParametersNew; _populationParametersOld = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16); _populationParametersNew = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16); for(int i = 0; i < _nPopulation; i++){ _populationParametersOld[i].c11 = (randomDouble(0.0,1.0))*pow(10,9); _populationParametersOld[i].c22 = _populationParametersOld[i].c11; _populationParametersOld[i].c33 = _populationParametersOld[i].c11; _populationParametersOld[i].c44 = (randomDouble(0.0,1.0))*pow(10,9); _populationParametersOld[i].c55 = _populationParametersOld[i].c44; _populationParametersOld[i].c66 = _populationParametersOld[i].c44; _populationParametersOld[i].c12 = (randomDouble(0.0,1.0))*pow(10,9); _populationParametersOld[i].c13 = _populationParametersOld[i].c12; _populationParametersOld[i].c23 = _populationParametersOld[i].c12; _populationParametersOld[i].chiSq = 1; _populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0); } //delete _integerDistribution; delete [] ints1; delete [] ints2; delete [] ints3; ints1 = new int[_nPopulation]; ints2 = new int[_nPopulation]; ints3 = new int[_nPopulation]; }
/*
 * MEX entry point: determinant (and optionally inverse) of a square matrix.
 *
 *   d       = f(A)   determinant from the LU factorisation of A
 *   [d, Ai] = f(A)   also returns the inverse computed by LAPACKE_dgetri
 *
 * The input is copied so the caller's matrix is not modified. Errors are
 * reported with mexErrMsgTxt; MKL buffers are released before any error that
 * is raised after they were allocated.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){
    double *A, *B, determinante;
    int *pivot, info, Nfilas, Ncolumnas;
    size_t nelem, nbytes;

    mexPrintf("hios\n");

    /* ---- argument validation ---- */
    if (nrhs != 1){ /* number of inputs different from 1 */
        mexErrMsgTxt("Error. myla, Debe tener un arg de entrada");
    }
    if (!mxIsNumeric(prhs[0])){
        mexErrMsgTxt("Error. El argumento de entrada debe ser una matriz");
    }
    /* mxGetPr only yields usable data for real double arrays */
    if (!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0])){
        mexErrMsgTxt("Error. El argumento de entrada debe ser una matriz real de tipo double");
    }
    Nfilas = mxGetM(prhs[0]);
    Ncolumnas = mxGetN(prhs[0]);
    if (Nfilas != Ncolumnas){
        mexErrMsgTxt("Error. La matriz debe ser cuadrada");
    }
    if (Nfilas == 0){
        mexErrMsgTxt("Error. La matriz debe no ser vacía");
    }
    if (nlhs > 2){
        mexErrMsgTxt("Error. Debe haber uno o dos args de salida");
    }

    /* ---- working copies ---- */
    nelem = (size_t)Nfilas * (size_t)Ncolumnas;
    nbytes = nelem * sizeof(double);

    A = mxGetPr(prhs[0]);
    B = (double *)mkl_malloc(nbytes, 64);
    if (B == NULL){
        mexErrMsgTxt("Error. No se pudo reservar memoria");
    }
    memcpy(B, A, nbytes);

    pivot = (int *)mkl_malloc((size_t)Nfilas * sizeof(int), 32);
    if (pivot == NULL){
        mkl_free(B);
        mexErrMsgTxt("Error. No se pudo reservar memoria");
    }

    /* ---- LU factorisation ---- */
    info = LAPACKE_dgetrf(LAPACK_COL_MAJOR, Nfilas, Ncolumnas, B, Ncolumnas, pivot);
    if (info < 0){
        /* negative status: an argument was rejected, B holds no valid factors */
        mkl_free(pivot);
        mkl_free(B);
        mexErrMsgTxt("Error. Fallo en LAPACKE_dgetrf");
    }

    /* det = product of U's diagonal, with one sign flip per row exchange */
    determinante = 1.0;
    for (int i = 0; i < Nfilas; i++){
        if (pivot[i] != (i+1)){
            determinante *= -B[i*Ncolumnas + i];
        }
        else{
            determinante *= B[i*Ncolumnas + i];
        }
    }

    /* ---- outputs ---- */
    plhs[0] = mxCreateDoubleScalar(determinante);
    if (nlhs == 2){
        int info_inv;
        double *C;

        if (fabs(determinante) < 1.0e-8){
            mexWarnMsgTxt("Matriz singular o casi singular");
        }
        info_inv = LAPACKE_dgetri(LAPACK_COL_MAJOR, Nfilas, B, Ncolumnas, pivot);
        plhs[1] = mxCreateDoubleMatrix(Nfilas, Ncolumnas, mxREAL);
        C = mxGetPr(plhs[1]);
        if (info_inv == 0){
            memcpy(C, B, nbytes);
        }
        else{
            /* no inverse was produced: return Inf instead of the LU factors */
            for (size_t k = 0; k < nelem; k++){
                C[k] = mxGetInf();
            }
        }
    }

    mkl_free(pivot);
    mkl_free(B);
}
/**
 * Sum `potential` over its last (z) index and write the resulting 2-D image
 * to a FITS file as a DOUBLE_IMG.
 *
 * `potential` is read as a flat array indexed (i*num_y + j)*num_z + k.
 * The image buffer is indexed i*num_y + j, so the FITS axes are
 * {num_y, num_x} (first axis varies fastest).
 *
 * CFITSIO errors accumulate in `status` and are reported once at the end via
 * fits_report_error. If the scratch buffer cannot be allocated, a message is
 * printed to stderr and nothing is written.
 */
void save_2d_image_potential(SimulationData &sim_data, double *potential, const char * fits_file_name) {
    // read the grid extents once instead of on every loop test
    const long nx = sim_data.get_num_x();
    const long ny = sim_data.get_num_y();
    const long nz = sim_data.get_num_z();

    double *save_data = (double*)mkl_malloc((size_t)nx * (size_t)ny * sizeof(double), 64);
    if (save_data == NULL) {
        fprintf(stderr, "save_2d_image_potential: cannot allocate image buffer\n");
        return;
    }

    fitsfile *fptr;
    int status = 0;
    long fpixel = 1, naxis = 2, nelements;
    long naxes[2] = {ny, nx};

    for (long i = 0; i < nx; ++i) {
        for (long j = 0; j < ny; ++j) {
            const double *column = potential + (i * ny + j) * nz;
            double *pixel = save_data + (i * ny + j);
            // accumulate directly into the pixel, as the original did
            *pixel = 0;
            for (long k = 0; k < nz; ++k) {
                *pixel += column[k];
            }
        }
    }

    fits_create_file(&fptr, fits_file_name, &status);
    fits_create_img(fptr, DOUBLE_IMG, naxis, naxes, &status);
    nelements = naxes[0] * naxes[1];
    fits_write_img(fptr, TDOUBLE, fpixel, nelements, save_data, &status);
    fits_close_file(fptr, &status);
    fits_report_error(stderr, status);

    mkl_free(save_data);
}
int check_result(double* A, double* BT, double* C, int m, int n, int c, int transposed) { int err_c = 0; //how many errors found int i, j, k; double* C_ref = (double*)mkl_malloc(m * n * sizeof(double), 16); //with zeroed // for(i = 0; i < m; i ++) { // for(j = 0; j < n; j++) { // C_ref[i*n+j] = 0; // for(k = 0; k < c; k++) { // C_ref[i*n+j] += A[i*c+k] * BT[j*c+k]; // } // } // } cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, c, 1, A, c, BT, c, 0, C_ref, n); if(transposed){ for(i = 0; i < m; i++) { for(j = 0; j < n; j++) { err_c += (fabs(C[j*m + i] - C_ref[i*n+j]) < 0.0001 ? 0 : 1); } } } else { //do compare for(i = 0; i < m * n ; i++) { err_c += (fabs(C[i] - C_ref[i]) < 0.0001 ? 0 : 1); } } mkl_free(C_ref); return err_c; }
// If CUDA is available and in GPU mode, host memory will be allocated pinned, // using cudaMallocHost. It avoids dynamic pinning for transfers (DMA). // The improvement in performance seems negligible in the single GPU case, // but might be more significant for parallel training. Most importantly, // it improved stability for large models on many GPUs. inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) { #ifndef CPU_ONLY if (Caffe::mode() == Caffe::GPU) { CUDA_CHECK(cudaMallocHost(ptr, size)); *use_cuda = true; return; } #endif #ifdef USE_MLSL if (mn::is_multinode()) { *ptr = mn::alloc(size ? size : 1, 64); } else { #endif /* !USE_MLSL */ #ifdef USE_MKL *ptr = mkl_malloc(size ? size : 1, 64); #else *ptr = malloc(size); #endif #ifdef USE_MLSL } #endif /* USE_MLSL */ *use_cuda = false; CHECK(*ptr) << "host allocation of size " << size << " failed"; }
void GeneticAlgorithm2::resetParameters(int nPopulation, double scaleFactor, double crossingProbability){ _scaleFactor = scaleFactor; _crossingProbability = crossingProbability; _nPopulation = nPopulation; delete [] _populationParametersOld; delete [] _populationParametersNew; // _populationParametersOld = new Parameters::fitParameters[nPopulation]; // _populationParametersNew = new Parameters::fitParameters[nPopulation]; _populationParametersOld = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16); _populationParametersNew = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16); for(int i = 0; i < _nPopulation; i++){ _populationParametersOld[i].A1 = randomDouble(0.0,5.0); _populationParametersOld[i].A2 = randomDouble(0.0,5.0); _populationParametersOld[i].F1 = randomDouble(440.0,500.0); _populationParametersOld[i].F2 = randomDouble(500.0,560.0); _populationParametersOld[i].dF1 = randomDouble(20.0,60.0); _populationParametersOld[i].dF2 =0; _populationParametersOld[i].phi1 = randomDouble(0.0,1.0); _populationParametersOld[i].phi2 = randomDouble(0.0,1.0); _populationParametersOld[i].Td1 = randomDouble(2.0,10.0); _populationParametersOld[i].Td2 = randomDouble(2.0,10.0); _populationParametersOld[i].ms1 = randomDouble(1,2.6); _populationParametersOld[i].ms2 = randomDouble(1,2.6); // _populationParametersOld[i].m1 = randomDouble(1.3,1.9); // _populationParametersOld[i].m2 = randomDouble(1.3,1.9); _populationParametersOld[i].m1 = 1.7; _populationParametersOld[i].m2 = 1.7; _populationParametersOld[i].dF12 = randomDouble(-15,5); // _populationParametersOld[i].ms11 = randomDouble(0.0,.2); // _populationParametersOld[i].ms22 = randomDouble(0.0,.2); _populationParametersOld[i].T = 4.2; } for(int i = 0; i < _nPopulation; i++){ _populationParametersOld[i].chiSq = calculateResidual2(&_populationParametersOld[i],0); } //delete _integerDistribution; delete [] ints1; delete [] ints2; delete [] ints3; // 
_integerDistribution = new boost::random::uniform_int_distribution<>(0, _nPopulation-1); ints1 = new int[_nPopulation]; ints2 = new int[_nPopulation]; ints3 = new int[_nPopulation]; }
/**
 * Allocate the harmonic trap array (64-byte aligned, one double per grid
 * point) and fill it with 0.5 * x^2 for each entry of sim_data.x.
 */
PotentialData::PotentialData(SimulationData &sim_data)
{
    harmonic_trap = static_cast<double*>(
        mkl_malloc(sim_data.num_points * sizeof(double), 64));

    #pragma omp parallel for
    for (int point = 0; point < sim_data.num_points; ++point)
    {
        this->harmonic_trap[point] = 0.5 * pow(sim_data.x[point], 2.0);
    }
}
int bench_stream_triad() { double *A, *B, *C; double t; int64_t m, n, k, i, j; m = SIZE, k = SIZE, n = SIZE; double scalar=3.14; A = (double *)mkl_malloc( m*k*sizeof( double ), 64 ); B = (double *)mkl_malloc( k*n*sizeof( double ), 64 ); C = (double *)mkl_malloc( m*n*sizeof( double ), 64 ); #pragma omp parallel for for (i = 0; i < (m*k); i++) { A[i] = (double)(i+1); } #pragma omp parallel for for (i = 0; i < (k*n); i++) { B[i] = (double)(-i-1); } #pragma omp parallel for for (i = 0; i < (m*n); i++) { C[i] = 0.0; } if (A == NULL || B == NULL || C == NULL) { printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } t=stoptime(); for (i=0;i<NTIME;i++) #pragma omp parallel for for (j=0; j<(m*k); j++) A[j] = B[j]+scalar*C[j]; t=stoptime()-t; printf("GB/s : %f\n",(((((m*k)*3)*8)*NTIME)/t)*1E-9); DPRINTF("\n Deallocating memory \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 0; }
/**
 * Construct a vector of `_len` doubles in a 64-byte aligned MKL buffer
 * (released through mkl_free) with unit increment, every element set to
 * `init`.
 */
vector_t::vector_t(size_t _len, double init)
    : data(static_cast<double*>(mkl_malloc(_len * sizeof(double), 64)),
           std::ptr_fun(mkl_free)),
      len(_len),
      inc(1)
{
    stack_assert(data.get() != nullptr);
    stack_assert(len > 0);

    // write `init` into each of the len slots
    std::fill_n(data.get(), len, init);
}
/**
 * Construct a vector of `_len` doubles in a 64-byte aligned MKL buffer
 * (released through mkl_free) with unit increment, filled with samples from
 * a normal distribution with mean 0 and standard deviation 0.1.
 *
 * @param _len  number of elements
 * @param gen   random engine to draw from; it is advanced by this call
 */
vector_t::vector_t(size_t _len, std::mt19937 & gen)
    : data(static_cast<double*>(mkl_malloc(_len * sizeof(double), 64)),
           std::ptr_fun(mkl_free)),
      len(_len),
      inc(1)
{
    stack_assert(data.get() != nullptr);
    stack_assert(len > 0);

    std::normal_distribution<> d(0,0.1);
    // Capture the engine by reference. std::bind(d, gen) stored a copy, so
    // the caller's engine never advanced and every vector built from the
    // same engine received identical values.
    std::generate(data.get(), data.get() + len, [&d, &gen]() { return d(gen); });
}
/**
 * Allocate `s` bytes, using a 64-byte aligned mkl_malloc when ZNN_XEON_PHI
 * is defined and plain malloc otherwise.
 *
 * @throws std::bad_alloc when the allocator returns a null pointer.
 */
inline void* znn_malloc(size_t s)
{
#ifdef ZNN_XEON_PHI
    void* const block = mkl_malloc(s, 64);
#else
    void* const block = malloc(s);
#endif

    if ( block )
    {
        return block;
    }
    throw std::bad_alloc();
}
// Allocate a 64-byte aligned array of `length` elements into `x`, but only
// when `x` is currently null; a non-null `x` is left untouched.
// Returns 0 on success (or when nothing had to be done) and OUTOFMEMORY when
// the allocation fails.
int create_array(const MKL_INT length, T*& x)
{
    if (x != nullptr)
    {
        return 0;  // already allocated by the caller
    }

    x = (T*)mkl_malloc(length*sizeof(T),64);
    return (x == nullptr) ? OUTOFMEMORY : 0;
}
/**
 * Allocate all wavefunction buffers (64-byte aligned, one entry per grid
 * point), initialise psi and psi_old to exp(-0.05 * x^2) with zero imaginary
 * part, zero the remaining buffers, then normalise psi, compute |psi|^2 into
 * psi_abs2 and save it to "init_state.bin".
 */
WavefunctionData::WavefunctionData(SimulationData &sim_data)
{
    // four complex buffers followed by two real ones, in the original order
    psi      = (MKL_Complex16*)mkl_malloc(sim_data.num_points * sizeof(MKL_Complex16), 64);
    psi_old  = (MKL_Complex16*)mkl_malloc(sim_data.num_points * sizeof(MKL_Complex16), 64);
    psi_new  = (MKL_Complex16*)mkl_malloc(sim_data.num_points * sizeof(MKL_Complex16), 64);
    conj_psi = (MKL_Complex16*)mkl_malloc(sim_data.num_points * sizeof(MKL_Complex16), 64);
    psi_tf   = (double*)mkl_malloc(sim_data.num_points * sizeof(double), 64);
    psi_abs2 = (double*)mkl_malloc(sim_data.num_points * sizeof(double), 64);

    wavefunction_norm = 1;

    #pragma omp parallel for
    for (int idx = 0; idx < sim_data.num_points; ++idx)
    {
        // declared inside the loop, so each thread has its own copy
        double expval = exp(-0.05 * pow(sim_data.x[idx], 2.0));

        psi[idx].real = expval;
        psi[idx].imag = 0;
        psi_old[idx].real = expval;
        psi_old[idx].imag = 0;

        psi_new[idx].real = 0;
        psi_new[idx].imag = 0;
        conj_psi[idx].real = 0;
        conj_psi[idx].imag = 0;

        psi_abs2[idx] = 0;
        psi_tf[idx] = 0;
    }

    calc_norm(sim_data, psi);
    normalize_wf(sim_data, psi);

    // |psi| first, then squared in place
    vzAbs(sim_data.num_points, psi, psi_abs2);
    vdMul(sim_data.num_points, psi_abs2, psi_abs2, psi_abs2);

    save_data(psi_abs2, sim_data, "init_state.bin");
}
/*
 * Allocate the three matrices used by the multiplication test and fill the
 * inputs with random values in [0, 1].
 *
 *   *A_addr   m x c, row-major, random
 *   *BT_addr  n x c, row-major, random
 *   *C_addr   m x n, allocated but left uninitialised
 *
 * All buffers are 16-byte aligned and must be released with mkl_free by the
 * caller. If any allocation fails, whatever was allocated is released and
 * all three outputs are set to NULL.
 *
 * Side effect: reseeds rand() from the current time.
 */
void initial_matrix(double** A_addr, double** BT_addr, double** C_addr, int m, int n, int c)
{
    int i, j, k;
    double* A  = (double*)mkl_malloc((size_t)m * c * sizeof(double), 16);
    double* BT = (double*)mkl_malloc((size_t)n * c * sizeof(double), 16);
    double* C  = (double*)mkl_malloc((size_t)m * n * sizeof(double), 16);

    if (A == NULL || BT == NULL || C == NULL) {
        /* do not write through a NULL pointer in the fill loops below */
        mkl_free(A);
        mkl_free(BT);
        mkl_free(C);
        *A_addr = NULL;
        *BT_addr = NULL;
        *C_addr = NULL;
        return;
    }

    *A_addr = A;
    *BT_addr = BT;
    *C_addr = C;

    /* use random numbers for the inputs */
    srand((unsigned)time(NULL));
    for (i = 0; i < m; i++) {
        for (k = 0; k < c; k++) {
            A[i*c+k] = ((double)rand()/(double)RAND_MAX);
        }
    }
    for (j = 0; j < n; j++) {
        for (k = 0; k < c; k++) {
            BT[j*c+k] = ((double)rand()/(double)RAND_MAX);
        }
    }
}
int main(int argc, char *argv[]){ double inicio, fin = dsecnd(); double *A = (double *)mkl_malloc(N*N*sizeof(double), 64); double *B = (double *)mkl_malloc(N*sizeof(double), 64); int *pivot = (int *)mkl_malloc(N*sizeof(int), 32); // distribucion normal de media 0 y varianza 1 std::default_random_engine generador; std::normal_distribution<double> aleatorio(0.0, 1.0); for (int i = 0; i < N*N; i++) A[i] = aleatorio(generador); for (int i = 0; i < N; i++) B[i] = aleatorio(generador); // matriz A marcadamente diagonal para evitar riesgo de singularidad for (int i = 0; i < N; i++) A[i*N + i] += 10.0; int result; inicio = dsecnd(); for (int i = 0; i < NTEST; i++) result = LAPACKE_dgesv(LAPACK_ROW_MAJOR, N, 1, A, N, pivot, B, 1); fin = dsecnd(); double tiempo = (fin - inicio) / (double)NTEST; printf("Tiempo: %lf msec\n", tiempo*1.0e3); mkl_free(A); mkl_free(B); std::getchar(); return 0; }
/* *struct st_rmsm { * int status; // internal status * int size; // dimension, i.e., size of this matrix * int *pos; // i-th row starts from n[pos[i]] and a[pos[i]] * int *rsz; // i-th row has rsz[i] non-zero elements, 'rsz' stands for row size * int **col * double **data; * std::vector<intdbl_t> *tmp; // used only when unpacked *}; */ struct st_rmsm *rmsm_create(const int size) { //fprintf(stderr,"rmsm_create()\n"); struct st_rmsm *m=(struct st_rmsm*)mkl_malloc(sizeof(struct st_rmsm),64); assert(m); m->status=0; m->size = size; m->pos = (int*)mkl_malloc(sizeof(int)*size,64); m->rsz = (int*)mkl_malloc(sizeof(int)*size,64); m->col = NULL; m->data = NULL; m->tmp = new std::vector<intdbl_t>[size]; assert(m->pos); assert(m->rsz); assert(m->tmp); m->status=1; return m; }
/**
 * Write the real parts of `data` (sim_data.num_points entries) to `filename`
 * as raw binary doubles.
 *
 * @return 0 on success; 1 if the scratch buffer cannot be allocated, the file
 *         cannot be opened, or the data cannot be completely written.
 */
int save_data_real(MKL_Complex16 *data, SimulationData &sim_data, const char * filename) {
    double *data2 = (double*)mkl_malloc(sim_data.num_points * sizeof(double), 64);
    if (data2 == NULL) {
        return 1;
    }

    for (int i = 0; i < sim_data.num_points; ++i) {
        data2[i] = data[i].real;
    }

    int rc = 0;
    FILE* pFile = fopen(filename, "wb");
    if (pFile == NULL) {
        rc = 1;
    } else {
        const size_t wanted = (size_t)sim_data.num_points;
        if (fwrite(data2, sizeof(double), wanted, pFile) != wanted) {
            rc = 1;
        }
        // fclose flushes buffered output, so its failure is a write failure
        if (fclose(pFile) != 0) {
            rc = 1;
        }
    }

    mkl_free(data2);   // was leaked before
    return rc;
}
// Build a stack of `z` matrices, each created as r x c.
//
// The design goal is to support very large non-sparse arrays: every slice is
// a separate CMatrix created by its own Create() call rather than one huge
// block, trading some performance for less wasted memory.
//
// NOTE(review): the slice array is raw mkl_malloc storage and no CMatrix
// constructor call is visible here before Create() - confirm that is safe.
CMatrix3D::CMatrix3D(MKL_INT r,MKL_INT c, MKL_INT z)
{
    depth = z;
    pMats = static_cast<CMatrix*>(mkl_malloc(z*sizeof(CMatrix), 64));

    for (int slice = 0; slice < z; ++slice)
    {
        CMatrix &layer = pMats[slice];
        layer.Create(r, c);
    }
}
int main() { double *A, *B, *C; int m, n, p, i, j; double alpha, beta; printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n" " Intel® MKL function dgemm, where A, B, and C are matrices and \n" " alpha and beta are double precision scalars\n\n"); m = 2000, p = 200, n = 1000; printf (" Initializing data for matrix multiplication C=A*B for matrix \n" " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n); alpha = 1.0; beta = 0.0; printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n" " performance \n\n"); A = (double *)mkl_malloc( m*p*sizeof( double ), 64 ); B = (double *)mkl_malloc( p*n*sizeof( double ), 64 ); C = (double *)mkl_malloc( m*n*sizeof( double ), 64 ); if (A == NULL || B == NULL || C == NULL) { printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf (" Intializing matrix data \n\n"); for (i = 0; i < (m*p); i++) { A[i] = (double)(i+1); } for (i = 0; i < (p*n); i++) { B[i] = (double)(-i-1); } for (i = 0; i < (m*n); i++) { C[i] = 0.0; } printf (" Computing matrix product using Intel® MKL dgemm function via CBLAS interface \n\n"); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, p, alpha, A, p, B, n, beta, C, n); printf ("\n Computations completed.\n\n"); printf (" Top left corner of matrix A: \n"); for (i=0; i<min(m,6); i++) { for (j=0; j<min(p,6); j++) { printf ("%12.0f", A[i+j*p]); } printf ("\n"); } printf ("\n Top left corner of matrix B: \n"); for (i=0; i<min(p,6); i++) { for (j=0; j<min(n,6); j++) { printf ("%12.0f", B[j+i*n]); } printf ("\n"); } printf ("\n Top left corner of matrix C: \n"); for (i=0; i<min(m,6); i++) { for (j=0; j<min(n,6); j++) { printf ("%12.5G", C[j+i*n]); } printf ("\n"); } printf ("\n Deallocating memory \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); printf (" Example completed. \n\n"); return 0; }
// Create an n x n matrix: stores n in `num` and allocates n*n doubles
// (64-byte aligned, uninitialised). Aborts through assert when the
// allocation fails.
Matrix(const int n)
    : num(n)
{
    data = static_cast<double*>(mkl_malloc(sizeof(double)*n*n, 64));
    assert(data);
}
// X: a MxD matrix, Y: a M vector, W: a M vector // W0: a M vector int main(int argc, char ** argv){ if (argc>1 && argv[1][0]=='h') { printf ("Usage: parSymSGD M D T C lamda r\n"); printf (" M: number of data points, D: dimensions, T: time iterations, C: cores;\n"); printf (" lamda: learning rate, r: panel size in unit of C.\n"); return 1; }u // read in the arguments: M, D, I (time iterations), C (cores), r (each panel contains r*C points) int M = argc>1?atoi(argv[1]):32; int D = argc>2?atoi(argv[2]):4; T = argc>3?atoi(argv[3]):10; int C = argc>4?atoi(argv[4]):4; float lamda = argc>5?atof(argv[5]):0.01; int r = argc>6?atoi(argv[6]):1; ///printf("M=%d, D=%d, T=%d, C=%d, lamda=%8.6f, r=%d\n",M,D,T,C,lamda,r); int max_threads = mkl_get_max_threads(); // get the max number of threads int rep; mkl_set_num_threads(1); // set the number of threads to use by mkl panelSz = C*r; panels = M/panelSz; int i,j,k,p,t; float *Y, *Wreal, *W, *X; Y = (float *) mkl_malloc(M*sizeof(float),PAGESIZE); Wreal = (float *) mkl_malloc(D*sizeof(float),PAGESIZE); W = (float *) mkl_malloc(D*sizeof(float),PAGESIZE); X = (float *) mkl_malloc(M*D*sizeof(float),PAGESIZE); float *Ypred = (float*)mkl_malloc(M*sizeof(float),PAGESIZE); float *Ytmp = (float*)mkl_malloc(M*sizeof(float),PAGESIZE); float *I = (float*)mkl_malloc(D*D*sizeof(float),PAGESIZE); float *Z = (float*)mkl_malloc(M*D*sizeof(float),PAGESIZE); float *B = (float*)mkl_malloc(panels*D*sizeof(float),PAGESIZE); if (Y==NULL | Wreal==NULL | W==NULL | X==NULL | Ypred==NULL || Ytmp==NULL || Z==NULL || B==NULL || I== NULL){ printf("Memory allocation error.\n"); return 2; } initData(Wreal,W,X,Y, M, D,I); ///printf("panelSz=%d, panels=%d\n", panelSz, panels); for (nt=1; nt<=max_threads && nt<=panelSz; nt*=2){ omp_set_num_threads(nt);// set the number of openMP threads for (rep=0; rep<REPEATS; rep++){//repeat measurements double prepTime, gdTime, sInit; // preprocessing sInit=dsecnd(); //preprocessSeq(X, Y, Z, B, panelSz, panels, M, D, lamda); 
preprocessPar(X, Y, Z, B, panelSz, panels, M, D, lamda); prepTime = (dsecnd() - sInit); ///dump2("Z",Z,M,D); ///dump2("B",B,panels,D); // GD initW(W,D); ///dump1("W (initial)", W, D); sInit=dsecnd(); float err; float fixpoint = 0.0; for (t=0;t<T;t++){ for (p=0;p<panels;p++){ gd(&(X[p*panelSz*D]),&(Z[p*panelSz*D]), &(B[p*D]), panelSz, D, lamda, W, I); ///printf("(t=%d, p=%d) ",t,p); ///dump1("W", W, D); ///err=calErr(X, Ypred, Ytmp, Y, W, M, D); printf("finish one panels ............................ \n"); } } gdTime = (dsecnd() - sInit); err=calErr(X, Ypred, Ytmp, Y, W, M, D); fixpoint = err - prev_err; // print final err. time is in milliseconds printf("nt=%d\t ttlTime=%.5f\t prepTime=%.5f\t gdTime=%.5f\t error=%.5f\n", nt, (gdTime+prepTime)*1000, prepTime*1000, gdTime*1000, err); } } if (B) mkl_free(B); if (Z) mkl_free(Z); if (Ytmp) mkl_free(Ytmp); if (Ypred) mkl_free(Ypred); if (Y) mkl_free(Y); if (Wreal) mkl_free(Wreal); if (W) mkl_free(W); if (X) mkl_free(X); if (I) mkl_free(I); return 0; }
int main() { double *A, *B, *C; int m, n, p, i, j, k, r; double alpha, beta; double sum; double s_initial, s_elapsed; printf ("\n This example measures performance of rcomputing the real matrix product \n" " C=alpha*A*B+beta*C using a triple nested loop, where A, B, and C are \n" " matrices and alpha and beta are double precision scalars \n\n"); m = 2000, p = 200, n = 1000; printf (" Initializing data for matrix multiplication C=A*B for matrix \n" " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n); alpha = 1.0; beta = 0.0; printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n" " performance \n\n"); A = (double *)mkl_malloc( m*p*sizeof( double ), 64 ); B = (double *)mkl_malloc( p*n*sizeof( double ), 64 ); C = (double *)mkl_malloc( m*n*sizeof( double ), 64 ); if (A == NULL || B == NULL || C == NULL) { printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf (" Intializing matrix data \n\n"); for (i = 0; i < (m*p); i++) { A[i] = (double)(i+1); } for (i = 0; i < (p*n); i++) { B[i] = (double)(-i-1); } for (i = 0; i < (m*n); i++) { C[i] = 0.0; } printf (" Making the first run of matrix product using triple nested loop\n" " to get stable run time measurements \n\n"); for (i = 0; i < m; i++) { for (j = 0; j < n; j++) { sum = 0.0; for (k = 0; k < p; k++) sum += A[p*i+k] * B[n*k+j]; C[n*i+j] = sum; } } printf (" Measuring performance of matrix product using triple nested loop \n\n"); s_initial = dsecnd(); for (r = 0; r < LOOP_COUNT; r++) { for (i = 0; i < m; i++) { for (j = 0; j < n; j++) { sum = 0.0; for (k = 0; k < p; k++) sum += A[p*i+k] * B[n*k+j]; C[n*i+j] = sum; } } } s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT; printf (" == Matrix multiplication using triple nested loop completed == \n" " == at %.5f milliseconds == \n\n", (s_elapsed * 1000)); printf (" Deallocating memory \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); if (s_elapsed < 0.9/LOOP_COUNT) { 
s_elapsed=1.0/LOOP_COUNT/s_elapsed; i=(int)(s_elapsed*LOOP_COUNT)+1; printf(" It is highly recommended to define LOOP_COUNT for this example on your \n" " computer as %i to have total execution time about 1 second for reliability \n" " of measurements\n\n", i); } printf (" Example completed. \n\n"); return 0; }
int test01(void) { int err=0; printf("TEST01\n"); printf(" |Test solver_v2 workflow\n"); //struct st_mesh *q=mshio_create_mesh("msh/0344/0344"); struct st_mesh *q=mshio_create_mesh("msh/0616/0616"); //mshio_print_mesh(q,PRINT_INFO_VERBOSE); /* * ipar[0] = M * ipar[1] = Nd * ipar[2] = pad * ipar[3] = rule1 * ipar[4] = rule2 * ipar[5] = nu * ipar[6] = nv * ipar[7] = num_threads in omp */ const int ipar[128]={1,3,1, 1,1,5,3, 1}; /* * dpar[0] = g factor * dpar[1] = mua (absorption coefficient) * dpar[2] = mus (scattering coefficient) */ const double dpar[128]={0.7,1.0,2.0}; struct st_solver_v2 *s=sv2_create_solver(q,ipar,dpar); sv2_print_solverinfo(s); //for (int i = 0; i < s->Nt; i++) //rmsm_print_row(s->E,i); double _Complex *b0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *x0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *b1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *x1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); assert(b0); assert(x0); assert(b1); assert(x1); int nitr; double eps; //sv2_gen_b0(s,0.0,b0); for (int i = 0; i < s->Ng; i++) x0[i] = 1.0; print_vector("x=",5,x0); sv2_mul(s,x0,b0); print_vector("b=A.x=",5,b0); sv2_solve(s,b0,x0,200,12,1.0E-13,&nitr,&eps); //sv2_gen_b1x0(s,0.0,b1,x0); //sv2_solve(s,b1,x1,200,12,1.0E-13,&nitr,&eps); sv2_destroy_solver(s); mshio_destroy_mesh(q); printf("END OF TEST01\n"); printf("\n"); return err; }
int test01(void) { int err=0; printf("TEST01\n"); printf(" |Test solver_v1 workflow\n"); struct st_mesh *q=mshio_create_mesh("msh/0344/0344"); //mshio_print_mesh(q,PRINT_INFO_VERBOSE); /* * ipar[0] = M * ipar[1] = Nd * ipar[2] = pad * ipar[3] = rule1 * ipar[4] = rule2 * ipar[5] = nu * ipar[6] = nv * ipar[7] = num_threads in omp */ //const int ipar[128]={1,3,1, 1,1,5,3, 1}; const int ipar[128]={1,3,1, 1,1,5,3, 8}; //const int ipar[128]={1,30,1, 2,5,5,3, 8}; /* * dpar[0] = g factor * dpar[1] = mua (absorption coefficient) * dpar[2] = mus (scattering coefficient) */ const double dpar[128]={0.7,1.0,2.0}; struct st_solver_v1 *s=sv1_create_solver(q,ipar,dpar); sv1_print_solverinfo(s); //for (int i = 0; i < s->Ns; i++) //printf("[%5d] %.5E\n",i,s->E[i]); double _Complex *b0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *x0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *b1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *x1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); assert(b0); assert(x0); assert(b1); assert(x1); int nitr; double eps; //sv1_gen_b0(s,0.0,b0); //sv1_solve(s,b0,x0,200,12,1.0E-13,&nitr,&eps); sv1_gen_b1x0(s,0.0,b1,x0); sv1_solve(s,b1,x1,200,12,1.0E-13,&nitr,&eps); char dir[FILENAME_MAX]="SOL"; sv1_save_solution(s,x1,dir); sv1_destroy_solver(s); mshio_destroy_mesh(q); printf("END OF TEST01\n"); printf("\n"); return err; }
void rmsm_pack(struct st_rmsm *m) { //fprintf(stderr,"rmsm_pack()\n"); assert(m->status==1); //for (int i = 0; i < m->size; i++) //printf("[%5d] %lu\n",i,m->tmp[i].size()); //const int row=73; // examine this row //printf("row %d (raw)\n",row); //for (int i = 0; i < m->tmp[row].size(); i++) //printf("[%5d] %f\n",m->tmp[row].at(i).i,m->tmp[row].at(i).d); // sort each row for (int n = 0; n < m->size; n++) std::sort(m->tmp[n].begin(),m->tmp[n].end(),cmp); //printf("(sorted)\n"); //for (int i = 0; i < m->tmp[row].size(); i++) //printf("[%5d] %f\n",m->tmp[row].at(i).i,m->tmp[row].at(i).d); // merge std::vector<int> *vi = new std::vector<int>[m->size]; std::vector<double> *vd = new std::vector<double>[m->size]; for (int n = 0; n < m->size; n++) { if (m->tmp[n].size()==0) continue; vi[n].push_back(m->tmp[n].at(0).i); vd[n].push_back(m->tmp[n].at(0).d); int curr_col=vi[n].at(0); for (int j = 1; j < m->tmp[n].size(); j++) { int col=m->tmp[n].at(j).i; double val=m->tmp[n].at(j).d; if (col==curr_col) vd[n].back() += val; else { vi[n].push_back(col); vd[n].push_back(val); curr_col = col; } } } delete [] m->tmp; //printf("(merged)\n"); //for (int i = 0; i < vi[row].size(); i++) //printf("[%5d] %f\n",vi[row].at(i),vd[row].at(i)); // pack { int ptr=0; for (int i = 0; i < m->size; i++) { m->pos[i] = ptr; ptr += vi[i].size(); m->rsz[i] = vi[i].size(); } m->length=ptr; } m->col =(int*) mkl_malloc(sizeof(int) *(m->length),64); m->data=(double*)mkl_malloc(sizeof(double)*(m->length),64); for (int i = 0; i < m->size; i++) for (int j = 0; j < vi[i].size(); j++) { m->col [j+m->pos[i]] = vi[i].at(j); m->data[j+m->pos[i]] = vd[i].at(j); } delete [] vi; delete [] vd; //printf("(packed)\n"); //for (int i = 0; i < m->rsz[row]; i++) //printf("[%5d] %f\n",m->col[i+m->pos[row]],m->data[i+m->pos[row]]); m->status=2; }
void GeneticAlgorithm::initializeParameters(double* dataSet, int dataSetLength, int nPopulation, double scaleFactor, double crossingProbability){ _scaleFactor = scaleFactor; _crossingProbability = crossingProbability; _dataSetLength = dataSetLength; _dataSet = dataSet; _residualArray = new double*[nThreads]; _paramArray = new double*[nThreads]; for(int i = 0; i < nThreads; i++){ _residualArray[i] = new double[_dataSetLength]; _paramArray[i] = new double[nParams]; } _populationParametersOld = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16); _populationParametersNew = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16); for(int i = 0; i < _nPopulation; i++){ _populationParametersOld[i].missFreq = new long[_nMissing]; _populationParametersNew[i].missFreq = new long[_nMissing]; /* _populationParametersOld[i].c11 = (randomDouble(196,196.1))*pow(10,9); _populationParametersOld[i].c22 = _populationParametersOld[i].c11; _populationParametersOld[i].c33 = (randomDouble(187,187.1))*pow(10,9); _populationParametersOld[i].c44 = (randomDouble(63.5,63.6))*pow(10,9); _populationParametersOld[i].c55 = _populationParametersOld[i].c44; _populationParametersOld[i].c66 = (randomDouble(55.7,55.8))*pow(10,9); _populationParametersOld[i].c12 = (randomDouble(62.5,62.6))*pow(10,9); _populationParametersOld[i].c13 = (randomDouble(69.8,69.9))*pow(10,9); _populationParametersOld[i].c23 = _populationParametersOld[i].c13; _populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);/// ***Tetragonal PuCoGa5*/ /* _populationParametersOld[i].c11 = (randomDouble(260,300))*pow(10,9); _populationParametersOld[i].c22 = _populationParametersOld[i].c11; _populationParametersOld[i].c33 = (randomDouble(290,320))*pow(10,9); _populationParametersOld[i].c44 = (randomDouble(90,110))*pow(10,9); _populationParametersOld[i].c55 = _populationParametersOld[i].c44; _populationParametersOld[i].c66 = 
(randomDouble(130,150))*pow(10,9); _populationParametersOld[i].c12 = (randomDouble(140,165))*pow(10,9); _populationParametersOld[i].c13 = (randomDouble(100,130))*pow(10,9); _populationParametersOld[i].c23 = _populationParametersOld[i].c13; _populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);/// ***Tetragonal URu2Si2 */ _populationParametersOld[i].c11 = (randomDouble(220,260))*pow(10,9); _populationParametersOld[i].c22 = (randomDouble(210,250))*pow(10,9); _populationParametersOld[i].c33 = (randomDouble(100,150))*pow(10,9); _populationParametersOld[i].c44 = (randomDouble(32,38))*pow(10,9); _populationParametersOld[i].c55 = (randomDouble(48,52))*pow(10,9); _populationParametersOld[i].c66 = (randomDouble(94,98))*pow(10,9); _populationParametersOld[i].c12 = (randomDouble(100,150))*pow(10,9); _populationParametersOld[i].c13 = (randomDouble(25,60))*pow(10,9); _populationParametersOld[i].c23 = (randomDouble(20,70))*pow(10,9); _populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);/// ***Orthorhombic YBCO67 //_populationParametersOld[i].c11 = (randomDouble(1,400))*pow(10,9); //_populationParametersOld[i].c22 = _populationParametersOld[i].c11; //_populationParametersOld[i].c33 = _populationParametersOld[i].c11; // //_populationParametersOld[i].c44 = (randomDouble(1,400))*pow(10,9); //_populationParametersOld[i].c55 = _populationParametersOld[i].c44; //_populationParametersOld[i].c66 = _populationParametersOld[i].c44; //_populationParametersOld[i].c12 = (randomDouble(1,400))*pow(10,9); //_populationParametersOld[i].c13 = _populationParametersOld[i].c12; //_populationParametersOld[i].c23 = _populationParametersOld[i].c12; //_populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);/// ***Cubic Nb //_populationParametersOld[i].c11 = (randomDouble(120,200))*pow(10,9); //_populationParametersOld[i].c22 = _populationParametersOld[i].c11; // //_populationParametersOld[i].c33 = 
(randomDouble(120,200))*pow(10,9); //_populationParametersOld[i].c44 = (randomDouble(10,100))*pow(10,9); //_populationParametersOld[i].c55 = _populationParametersOld[i].c44; // //_populationParametersOld[i].c66 = (randomDouble(10,100))*pow(10,9); //_populationParametersOld[i].c12 = (randomDouble(10,100))*pow(10,9); //_populationParametersOld[i].c13 = (randomDouble(10,100))*pow(10,9); //_populationParametersOld[i].c23 = _populationParametersOld[i].c13; //_populationParametersOld[i].chiSq = calculateResidual(&_populationParametersOld[i],0);/// ***Tetragonal CeCoIn5 } _minimumParameters.c11 = 1; _minimumParameters.c22 = 1; _minimumParameters.c33 = 1; _minimumParameters.c44 = 1; _minimumParameters.c55 = 1; _minimumParameters.c66 = 1; _minimumParameters.c12 = 1; _minimumParameters.c13 = 1; _minimumParameters.c23 = 1; _minimumParameters.chiSq = std::numeric_limits<double>::infinity(); _minimumParameters.missFreq = new long[_nMissing]; }
void GeneticAlgorithm2::initializeParameters(double** dataSet, int dataSetLength, int nPopulation, double scaleFactor, double crossingProbability){ _scaleFactor = scaleFactor; _crossingProbability = crossingProbability; _dataSetLength = dataSetLength; _dataSet = dataSet; _residualArray = new double*[nThreads]; _paramArray = new double*[nThreads]; for(int i = 0; i < nThreads; i++){ _residualArray[i] = new double[_dataSetLength]; _paramArray[i] = new double[nParams]; // xVals[i] = new double[4]; } _populationParametersOld = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16); _populationParametersNew = (Parameters::fitParameters *) mkl_malloc(sizeof(Parameters::fitParameters)*nPopulation,16); for(int i = 0; i < _nPopulation; i++){ _populationParametersOld[i].A1 = randomDouble(0.0,1.0); _populationParametersOld[i].A2 = randomDouble(0.0,1.0); _populationParametersOld[i].F1 = randomDouble(460,480); _populationParametersOld[i].F2 = randomDouble(520,540); _populationParametersOld[i].dF1 = randomDouble(25.,45.0); _populationParametersOld[i].dF2 = 0; _populationParametersOld[i].phi1 = randomDouble(0.0,1.0); _populationParametersOld[i].phi2 = randomDouble(0.0,1.0); _populationParametersOld[i].Td1 = randomDouble(4., 8.); _populationParametersOld[i].Td2 = randomDouble(4., 8.); // _populationParametersOld[i].ms1 = randomDouble(1,2.2); // _populationParametersOld[i].ms2 = randomDouble(1.5,1.7); _populationParametersOld[i].ms1 = randomDouble(1.0,1.4); _populationParametersOld[i].ms2 = randomDouble(1.5,1.7); // _populationParametersOld[i].m1 = randomDouble(1.0,3); // _populationParametersOld[i].m2 = randomDouble(1.0,3); _populationParametersOld[i].m1 = 1.7; _populationParametersOld[i].m2 = 1.7; // _populationParametersOld[i].dF12 = randomDouble(-15,5); // _populationParametersOld[i].ms11 = randomDouble(0.0,.2); // _populationParametersOld[i].ms22 = randomDouble(0.0,.2); _populationParametersOld[i].T = 4.2; _populationParametersOld[i].chiSq = 
calculateResidual2(&_populationParametersOld[i],0); } for(int i = 0; i < _nPopulation; i++){ _populationParametersNew[i].T = 0; _populationParametersNew[i].m1 = 0; _populationParametersNew[i].m2 = 0; } _minimumParameters.A1 = 1; _minimumParameters.A2 = 1; _minimumParameters.F1 = 0; _minimumParameters.F2 = 0; _minimumParameters.dF1 = 0; _minimumParameters.dF2 = 0; _minimumParameters.phi1 = 0; _minimumParameters.phi2 = 0; _minimumParameters.Td1 = 0; _minimumParameters.Td2 = 0; _minimumParameters.ms1 = 0; _minimumParameters.ms2 = 0; _minimumParameters.m1 = 0; _minimumParameters.m2 = 0; // _minimumParameters.dF12 = 0; // _minimumParameters.ms11 = 0; // _minimumParameters.ms22 = 0; _minimumParameters.T = 0; _minimumParameters.chiSq = INFINITE; }
int bench_dgemm() { double *A, *B, *C; int m, n, k, i, j; double alpha, beta; double t; m = SIZE, k = SIZE, n = SIZE; DPRINTF(" Initializing data for matrix multiplication C=A*B for matrix \n" " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n); alpha = 1.0; beta = 0.0; DPRINTF(" Allocating memory for matrices aligned on 64-byte boundary for better \n" " performance \n\n"); A = (double *)mkl_malloc( m*k*sizeof( double ), 64 ); B = (double *)mkl_malloc( k*n*sizeof( double ), 64 ); C = (double *)mkl_malloc( m*n*sizeof( double ), 64 ); if (A == NULL || B == NULL || C == NULL) { printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } DPRINTF(" Intializing matrix data \n\n"); #pragma omp parallel for for (i = 0; i < (m*k); i++) { A[i] = (double)(i+1); } #pragma omp parallel for for (i = 0; i < (k*n); i++) { B[i] = (double)(-i-1); } #pragma omp parallel for for (i = 0; i < (m*n); i++) { C[i] = 0.0; } DPRINTF(" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n"); t=stoptime(); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, k, B, n, beta, C, n); t=stoptime()-t; printf("calculation time : %f\n",t); printf("gflops/s : %f\n",((2.0*m*n*k)*1E-9)/t); DPRINTF("\n Computations completed.\n\n"); DPRINTF(" Top left corner of matrix A: \n"); for (i=0; i<min(m,6); i++) { for (j=0; j<min(k,6); j++) { DPRINTF("%12.0f", A[j+i*k]); } DPRINTF("\n"); } DPRINTF("\n Top left corner of matrix B: \n"); for (i=0; i<min(k,6); i++) { for (j=0; j<min(n,6); j++) { DPRINTF("%12.0f", B[j+i*n]); } DPRINTF("\n"); } DPRINTF("\n Top left corner of matrix C: \n"); for (i=0; i<min(m,6); i++) { for (j=0; j<min(n,6); j++) { DPRINTF("%12.5G", C[j+i*n]); } DPRINTF("\n"); } DPRINTF("\n Deallocating memory \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); DPRINTF(" Example completed. \n\n"); return 0; }