int main() { double *A, *B, *C; int i,j,r,max_threads,size; double alpha, beta; double s_initial, s_elapsed; printf("Intializing data for matrix multiplication C=A*B for matrix\n\n" " A(%i*%i) and matrix B(%i*%i)\n",M,P,P,N); alpha = 1.0; beta = 0.0; printf("Allocating memory for matrices aligned on 64-byte boundary for better performance \n\n"); A = ( double *)mkl_malloc(M*P*sizeof( double ),64); B = ( double *)mkl_malloc(N*P*sizeof( double ),64); C = ( double *)mkl_malloc(M*N*sizeof( double ),64); if (A == NULL || B == NULL || C == NULL) { printf("Error: can`t allocate memory for matrices.\n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf("Intializing matrix data\n\n"); size = M*P; for (i = 0; i < size; ++i) { A[i] = ( double )(i+1); } size = N*P; for (i = 0; i < size; ++i) { B[i] = ( double )(i-1); } printf("Finding max number of threads can use for parallel runs \n\n"); max_threads = mkl_get_max_threads(); printf("Running from 1 to %i threads \n\n",max_threads); for (i = 1; i <= max_threads; ++i) { size = M*N; for (j = 0; j < size; ++j) { C[j] = 0.0; } printf("Requesting to use %i threads \n\n",i); mkl_set_num_threads(i); printf("Measuring performance of matrix product using dgemm function\n" " via CBLAS interface on %i threads \n\n",i); s_initial = dsecnd(); for (r = 0; r < LOOP_COUNT; ++r) { cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, P, alpha, A, P, B, N, beta, C, N); // multiply matrices with cblas_dgemm; } s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT; printf("Matrix multiplication using dgemm completed \n" " at %.5f milliseconds using %d threads \n\n", (s_elapsed * 1000),i); printf("Output the result: \n"); size = M*N; for (i = 0; i < size; ++i) { printf("%i\t",(int)C[i]); if (i % N == N - 1) printf("\n"); } } printf("Dellocating memory\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 0; }
void solve1() { struct st_mesh *q=mshio_create_mesh(fin_base); //mshio_print_mesh(q,PRINT_INFO_VERBOSE); struct st_solver_v1 *s=sv1_create_solver(q,ipar,dpar); sv1_print_solverinfo(s); //double _Complex *b0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *x0=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *b1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); double _Complex *x1=(double _Complex*)mkl_malloc(sizeof(double _Complex)*s->Ng,64); //assert(b0); assert(x0); assert(b1); assert(x1); int nitr; double eps; //sv1_gen_b0(s,0.0,b0); //sv1_solve(s,b0,x0,200,12,1.0E-13,&nitr,&eps); //sv1_save_solution(s,x0,output_dir); sv1_gen_b1x0(s,0.0,b1,x0); sv1_solve(s,b1,x1,200,30,1.0E-13,&nitr,&eps); sv1_save_solution(s,x1,output_dir); //mkl_free(b0); mkl_free(x0); mkl_free(b1); mkl_free(x1); sv1_destroy_solver(s); mshio_destroy_mesh(q); }
/*
 * Records the current state of the model as its "best" snapshot.
 *
 * Stores numit in best_numit and M in best_m, releases any previous
 * best_alpha_avg / best_kernel_matrix buffers, allocates fresh ones and
 * copies M floats from alpha_avg and N*M floats from kernel_matrix into them.
 * Allocation results are not checked.
 */
void mark_best_KernelPerceptronModel(KernelPerceptron kmodel, int numit)
{
    kmodel->best_numit = numit;
    kmodel->best_m = kmodel->M;

    /* Throw away the previous snapshot, if there is one. */
    if (kmodel->best_alpha_avg != NULL) {
        mkl_free(kmodel->best_alpha_avg);
    }
    if (kmodel->best_kernel_matrix != NULL) {
        mkl_free(kmodel->best_kernel_matrix);
    }

    kmodel->best_alpha_avg =
        (float*) mkl_64bytes_malloc((kmodel->M) * sizeof (float));
    kmodel->best_kernel_matrix =
        (float*) mkl_64bytes_malloc((kmodel->N) * (kmodel->M) * sizeof (float));

    /* Copy the averaged alpha vector. */
    size_t alpha_idx = 0;
    while (alpha_idx < (kmodel->M)) {
        (kmodel->best_alpha_avg)[alpha_idx] = (kmodel->alpha_avg)[alpha_idx];
        ++alpha_idx;
    }

    /* Copy the kernel matrix as one flat run of N*M floats. */
    size_t total = (kmodel->N) * (kmodel->M);
    float *dst = kmodel->best_kernel_matrix;
#pragma ivdep
#pragma loop_count min(102400)
    for (size_t cell = 0; cell < total; cell++) {
        dst[cell] = (kmodel->kernel_matrix)[cell];
    }
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){ /*Declarar las variables locales*/ mexPrintf("hios\n"); //Tarea1 termina aqui double *A, *B, determinante; int *pivot, info, Nfilas, Ncolumnas; /*Insertar el código */ if (nrhs != 1){ // nº args diferente de 1 mexErrMsgTxt("Error. myla, Debe tener un arg de entrada"); } if (!mxIsNumeric(prhs[0])){ mexErrMsgTxt("Error. El argumento de entrada debe ser una matriz"); } Nfilas = mxGetM(prhs[0]); Ncolumnas = mxGetN(prhs[0]); if (Nfilas != Ncolumnas){ mexErrMsgTxt("Error. La matriz debe ser cuadrada"); } if (Nfilas == 0){ mexErrMsgTxt("Error. La matriz debe no ser vacía"); } if (nlhs > 2){ mexErrMsgTxt("Error. Debe haber uno o dos args de salida"); } // copia de las variables A = mxGetPr(prhs[0]); B = (double *)mkl_malloc(Nfilas*Ncolumnas*sizeof(double), 64); memcpy(B, A, Nfilas*Ncolumnas*sizeof(double)); pivot = (int *)mkl_malloc(Nfilas*sizeof(int), 32); //procesos computacionales info = LAPACKE_dgetrf(LAPACK_COL_MAJOR, Nfilas, Ncolumnas, B, Ncolumnas, pivot); determinante = 1.0; for (int i = 0; i < Nfilas; i++){ if (pivot[i] != (i+1)){ determinante *= -B[i*Ncolumnas + i]; } else{ determinante *= B[i*Ncolumnas + i]; } } // crear los resultados de salida plhs[0] = mxCreateDoubleScalar(determinante); if (nlhs == 2){ if (fabs(determinante) < 1.0e-8){ mexWarnMsgTxt("Matriz singular o casi singular"); } LAPACKE_dgetri(LAPACK_COL_MAJOR, Nfilas, B, Ncolumnas, pivot); plhs[1] = mxCreateDoubleMatrix(Nfilas, Ncolumnas, mxREAL); double *C = mxGetPr(plhs[1]); memcpy(C, B, Nfilas*Ncolumnas*sizeof(double)); } mkl_free(pivot); mkl_free(B); }
inline void CaffeFreeHost(void* ptr, bool use_cuda) { #ifndef CPU_ONLY if (use_cuda) { CUDA_CHECK(cudaFreeHost(ptr)); return; } #endif #ifdef USE_MLSL if (mn::is_multinode()) { mn::free(ptr); } else { #endif /* !USE_MLSL */ #ifdef USE_MKL mkl_free(ptr); #else free(ptr); #endif #ifdef USE_MLSL } #endif /* USE_MLSL */ }
/**
 * Sums `potential` over its last (z) index and writes the resulting
 * num_x * num_y plane to a new FITS image of type double.
 *
 * `potential` is indexed as [i * num_y * num_z + j * num_z + k].
 * The FITS axes are declared as {num_y, num_x}.
 *
 * If the projection buffer cannot be allocated, a message is written to
 * stderr and no file is created. CFITSIO errors are reported through
 * fits_report_error, as before.
 *
 * @param sim_data        source of the grid dimensions
 * @param potential       input samples, num_x * num_y * num_z doubles
 * @param fits_file_name  name handed to fits_create_file
 */
void save_2d_image_potential(SimulationData &sim_data, double *potential, const char * fits_file_name) {
    // Read the dimensions once.
    // NOTE(review): assumes the getters are plain accessors - confirm.
    const long nx = static_cast<long>(sim_data.get_num_x());
    const long ny = static_cast<long>(sim_data.get_num_y());
    const long nz = static_cast<long>(sim_data.get_num_z());

    const size_t plane_bytes =
        static_cast<size_t>(nx) * static_cast<size_t>(ny) * sizeof(double);
    double *save_data = (double*)mkl_malloc(plane_bytes, 64);
    if (save_data == NULL) {
        fprintf(stderr,
                "save_2d_image_potential: cannot allocate projection buffer for %s\n",
                fits_file_name);
        return;
    }

    // Offsets are formed in size_t so that i*ny*nz cannot overflow int on
    // large grids.
    for (long i = 0; i < nx; ++i) {
        for (long j = 0; j < ny; ++j) {
            const size_t cell = static_cast<size_t>(i) * ny + j;
            const double *column = potential + cell * nz;
            double sum = 0;
            for (long k = 0; k < nz; ++k) {
                sum += column[k];
            }
            save_data[cell] = sum;
        }
    }

    fitsfile *fptr = NULL;
    int status = 0;
    long fpixel = 1, naxis = 2;
    long naxes[2] = {ny, nx};
    long nelements = naxes[0] * naxes[1];

    // Each call below receives the running status from the previous one.
    fits_create_file(&fptr, fits_file_name, &status);
    fits_create_img(fptr, DOUBLE_IMG, naxis, naxes, &status);
    fits_write_img(fptr, TDOUBLE, fpixel, nelements, save_data, &status);
    fits_close_file(fptr, &status);
    fits_report_error(stderr, status);

    mkl_free(save_data);
}
int check_result(double* A, double* BT, double* C, int m, int n, int c, int transposed) { int err_c = 0; //how many errors found int i, j, k; double* C_ref = (double*)mkl_malloc(m * n * sizeof(double), 16); //with zeroed // for(i = 0; i < m; i ++) { // for(j = 0; j < n; j++) { // C_ref[i*n+j] = 0; // for(k = 0; k < c; k++) { // C_ref[i*n+j] += A[i*c+k] * BT[j*c+k]; // } // } // } cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, c, 1, A, c, BT, c, 0, C_ref, n); if(transposed){ for(i = 0; i < m; i++) { for(j = 0; j < n; j++) { err_c += (fabs(C[j*m + i] - C_ref[i*n+j]) < 0.0001 ? 0 : 1); } } } else { //do compare for(i = 0; i < m * n ; i++) { err_c += (fabs(C[i] - C_ref[i]) < 0.0001 ? 0 : 1); } } mkl_free(C_ref); return err_c; }
/// Releases `ptr` with mkl_free when ZNN_XEON_PHI is defined and with
/// free() otherwise.
inline void znn_free(void* ptr)
{
#if defined(ZNN_XEON_PHI)
    mkl_free(ptr);
#else
    free(ptr);
#endif
}
/// Destroys the `depth` matrices held in pMats and then releases the pMats
/// array itself with mkl_free.
CMatrix3D::~CMatrix3D()
{
    // The element destructors are invoked by hand, one per slice, so that
    // each matrix can release its own storage before the array goes away.
    int slice = 0;
    while (slice < depth) {
        pMats[slice].~CMatrix();
        ++slice;
    }

    mkl_free(pMats);
}
/// Releases the six wavefunction buffers with mkl_free.
WavefunctionData::~WavefunctionData()
{
    // Same release order as before: psi, psi_new, psi_old, psi_tf,
    // psi_abs2, conj_psi.
    void* const buffers[] = { psi, psi_new, psi_old, psi_tf, psi_abs2, conj_psi };
    const unsigned count = sizeof(buffers) / sizeof(buffers[0]);

    for (unsigned idx = 0; idx < count; ++idx) {
        mkl_free(buffers[idx]);
    }
}
int main(int argc, char *argv[]){ double inicio, fin = dsecnd(); double *A = (double *)mkl_malloc(N*N*sizeof(double), 64); double *B = (double *)mkl_malloc(N*sizeof(double), 64); int *pivot = (int *)mkl_malloc(N*sizeof(int), 32); // distribucion normal de media 0 y varianza 1 std::default_random_engine generador; std::normal_distribution<double> aleatorio(0.0, 1.0); for (int i = 0; i < N*N; i++) A[i] = aleatorio(generador); for (int i = 0; i < N; i++) B[i] = aleatorio(generador); // matriz A marcadamente diagonal para evitar riesgo de singularidad for (int i = 0; i < N; i++) A[i*N + i] += 10.0; int result; inicio = dsecnd(); for (int i = 0; i < NTEST; i++) result = LAPACKE_dgesv(LAPACK_ROW_MAJOR, N, 1, A, N, pivot, B, 1); fin = dsecnd(); double tiempo = (fin - inicio) / (double)NTEST; printf("Tiempo: %lf msec\n", tiempo*1.0e3); mkl_free(A); mkl_free(B); std::getchar(); return 0; }
/*
 * Rebuilds kp->alpha_avg from scratch.
 *
 * Any existing alpha_avg buffer is released, a new one of M floats is
 * allocated, and each entry is set to alpha[i] - beta[i] / c.
 * The allocation result is not checked.
 */
void update_average_alpha(KernelPerceptron kp)
{
    if (kp->alpha_avg != NULL) {
        mkl_free(kp->alpha_avg);
    }
    kp->alpha_avg = (float*) mkl_64bytes_malloc((kp->M) * sizeof (float));

    float *averaged = kp->alpha_avg;
    size_t pos = 0;
    while (pos < (kp->M)) {
        averaged[pos] = (kp->alpha)[pos] - (kp->beta)[pos] / (kp->c);
        ++pos;
    }
}
int bench_stream_triad() { double *A, *B, *C; double t; int64_t m, n, k, i, j; m = SIZE, k = SIZE, n = SIZE; double scalar=3.14; A = (double *)mkl_malloc( m*k*sizeof( double ), 64 ); B = (double *)mkl_malloc( k*n*sizeof( double ), 64 ); C = (double *)mkl_malloc( m*n*sizeof( double ), 64 ); #pragma omp parallel for for (i = 0; i < (m*k); i++) { A[i] = (double)(i+1); } #pragma omp parallel for for (i = 0; i < (k*n); i++) { B[i] = (double)(-i-1); } #pragma omp parallel for for (i = 0; i < (m*n); i++) { C[i] = 0.0; } if (A == NULL || B == NULL || C == NULL) { printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } t=stoptime(); for (i=0;i<NTIME;i++) #pragma omp parallel for for (j=0; j<(m*k); j++) A[j] = B[j]+scalar*C[j]; t=stoptime()-t; printf("GB/s : %f\n",(((((m*k)*3)*8)*NTIME)/t)*1E-9); DPRINTF("\n Deallocating memory \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 0; }
/// Releases the `data` buffer with mkl_free.
~Vector()
{
    mkl_free(data);
}
// X: a MxD matrix, Y: a M vector, W: a M vector // W0: a M vector int main(int argc, char ** argv){ if (argc>1 && argv[1][0]=='h') { printf ("Usage: parSymSGD M D T C lamda r\n"); printf (" M: number of data points, D: dimensions, T: time iterations, C: cores;\n"); printf (" lamda: learning rate, r: panel size in unit of C.\n"); return 1; }u // read in the arguments: M, D, I (time iterations), C (cores), r (each panel contains r*C points) int M = argc>1?atoi(argv[1]):32; int D = argc>2?atoi(argv[2]):4; T = argc>3?atoi(argv[3]):10; int C = argc>4?atoi(argv[4]):4; float lamda = argc>5?atof(argv[5]):0.01; int r = argc>6?atoi(argv[6]):1; ///printf("M=%d, D=%d, T=%d, C=%d, lamda=%8.6f, r=%d\n",M,D,T,C,lamda,r); int max_threads = mkl_get_max_threads(); // get the max number of threads int rep; mkl_set_num_threads(1); // set the number of threads to use by mkl panelSz = C*r; panels = M/panelSz; int i,j,k,p,t; float *Y, *Wreal, *W, *X; Y = (float *) mkl_malloc(M*sizeof(float),PAGESIZE); Wreal = (float *) mkl_malloc(D*sizeof(float),PAGESIZE); W = (float *) mkl_malloc(D*sizeof(float),PAGESIZE); X = (float *) mkl_malloc(M*D*sizeof(float),PAGESIZE); float *Ypred = (float*)mkl_malloc(M*sizeof(float),PAGESIZE); float *Ytmp = (float*)mkl_malloc(M*sizeof(float),PAGESIZE); float *I = (float*)mkl_malloc(D*D*sizeof(float),PAGESIZE); float *Z = (float*)mkl_malloc(M*D*sizeof(float),PAGESIZE); float *B = (float*)mkl_malloc(panels*D*sizeof(float),PAGESIZE); if (Y==NULL | Wreal==NULL | W==NULL | X==NULL | Ypred==NULL || Ytmp==NULL || Z==NULL || B==NULL || I== NULL){ printf("Memory allocation error.\n"); return 2; } initData(Wreal,W,X,Y, M, D,I); ///printf("panelSz=%d, panels=%d\n", panelSz, panels); for (nt=1; nt<=max_threads && nt<=panelSz; nt*=2){ omp_set_num_threads(nt);// set the number of openMP threads for (rep=0; rep<REPEATS; rep++){//repeat measurements double prepTime, gdTime, sInit; // preprocessing sInit=dsecnd(); //preprocessSeq(X, Y, Z, B, panelSz, panels, M, D, lamda); 
preprocessPar(X, Y, Z, B, panelSz, panels, M, D, lamda); prepTime = (dsecnd() - sInit); ///dump2("Z",Z,M,D); ///dump2("B",B,panels,D); // GD initW(W,D); ///dump1("W (initial)", W, D); sInit=dsecnd(); float err; float fixpoint = 0.0; for (t=0;t<T;t++){ for (p=0;p<panels;p++){ gd(&(X[p*panelSz*D]),&(Z[p*panelSz*D]), &(B[p*D]), panelSz, D, lamda, W, I); ///printf("(t=%d, p=%d) ",t,p); ///dump1("W", W, D); ///err=calErr(X, Ypred, Ytmp, Y, W, M, D); printf("finish one panels ............................ \n"); } } gdTime = (dsecnd() - sInit); err=calErr(X, Ypred, Ytmp, Y, W, M, D); fixpoint = err - prev_err; // print final err. time is in milliseconds printf("nt=%d\t ttlTime=%.5f\t prepTime=%.5f\t gdTime=%.5f\t error=%.5f\n", nt, (gdTime+prepTime)*1000, prepTime*1000, gdTime*1000, err); } } if (B) mkl_free(B); if (Z) mkl_free(Z); if (Ytmp) mkl_free(Ytmp); if (Ypred) mkl_free(Ypred); if (Y) mkl_free(Y); if (Wreal) mkl_free(Wreal); if (W) mkl_free(W); if (X) mkl_free(X); if (I) mkl_free(I); return 0; }
int main() { double *A, *B, *C; int m, n, p, i, j, k, r; double alpha, beta; double sum; double s_initial, s_elapsed; printf ("\n This example measures performance of rcomputing the real matrix product \n" " C=alpha*A*B+beta*C using a triple nested loop, where A, B, and C are \n" " matrices and alpha and beta are double precision scalars \n\n"); m = 2000, p = 200, n = 1000; printf (" Initializing data for matrix multiplication C=A*B for matrix \n" " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n); alpha = 1.0; beta = 0.0; printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n" " performance \n\n"); A = (double *)mkl_malloc( m*p*sizeof( double ), 64 ); B = (double *)mkl_malloc( p*n*sizeof( double ), 64 ); C = (double *)mkl_malloc( m*n*sizeof( double ), 64 ); if (A == NULL || B == NULL || C == NULL) { printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf (" Intializing matrix data \n\n"); for (i = 0; i < (m*p); i++) { A[i] = (double)(i+1); } for (i = 0; i < (p*n); i++) { B[i] = (double)(-i-1); } for (i = 0; i < (m*n); i++) { C[i] = 0.0; } printf (" Making the first run of matrix product using triple nested loop\n" " to get stable run time measurements \n\n"); for (i = 0; i < m; i++) { for (j = 0; j < n; j++) { sum = 0.0; for (k = 0; k < p; k++) sum += A[p*i+k] * B[n*k+j]; C[n*i+j] = sum; } } printf (" Measuring performance of matrix product using triple nested loop \n\n"); s_initial = dsecnd(); for (r = 0; r < LOOP_COUNT; r++) { for (i = 0; i < m; i++) { for (j = 0; j < n; j++) { sum = 0.0; for (k = 0; k < p; k++) sum += A[p*i+k] * B[n*k+j]; C[n*i+j] = sum; } } } s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT; printf (" == Matrix multiplication using triple nested loop completed == \n" " == at %.5f milliseconds == \n\n", (s_elapsed * 1000)); printf (" Deallocating memory \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); if (s_elapsed < 0.9/LOOP_COUNT) { 
s_elapsed=1.0/LOOP_COUNT/s_elapsed; i=(int)(s_elapsed*LOOP_COUNT)+1; printf(" It is highly recommended to define LOOP_COUNT for this example on your \n" " computer as %i to have total execution time about 1 second for reliability \n" " of measurements\n\n", i); } printf (" Example completed. \n\n"); return 0; }
/// Exported with C linkage: releases `x` with mkl_free.
extern "C" __declspec(dllexport) void free_array(void* x)
{
    mkl_free(x);
}
int bench_dgemm() { double *A, *B, *C; int m, n, k, i, j; double alpha, beta; double t; m = SIZE, k = SIZE, n = SIZE; DPRINTF(" Initializing data for matrix multiplication C=A*B for matrix \n" " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n); alpha = 1.0; beta = 0.0; DPRINTF(" Allocating memory for matrices aligned on 64-byte boundary for better \n" " performance \n\n"); A = (double *)mkl_malloc( m*k*sizeof( double ), 64 ); B = (double *)mkl_malloc( k*n*sizeof( double ), 64 ); C = (double *)mkl_malloc( m*n*sizeof( double ), 64 ); if (A == NULL || B == NULL || C == NULL) { printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } DPRINTF(" Intializing matrix data \n\n"); #pragma omp parallel for for (i = 0; i < (m*k); i++) { A[i] = (double)(i+1); } #pragma omp parallel for for (i = 0; i < (k*n); i++) { B[i] = (double)(-i-1); } #pragma omp parallel for for (i = 0; i < (m*n); i++) { C[i] = 0.0; } DPRINTF(" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n"); t=stoptime(); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, k, B, n, beta, C, n); t=stoptime()-t; printf("calculation time : %f\n",t); printf("gflops/s : %f\n",((2.0*m*n*k)*1E-9)/t); DPRINTF("\n Computations completed.\n\n"); DPRINTF(" Top left corner of matrix A: \n"); for (i=0; i<min(m,6); i++) { for (j=0; j<min(k,6); j++) { DPRINTF("%12.0f", A[j+i*k]); } DPRINTF("\n"); } DPRINTF("\n Top left corner of matrix B: \n"); for (i=0; i<min(k,6); i++) { for (j=0; j<min(n,6); j++) { DPRINTF("%12.0f", B[j+i*n]); } DPRINTF("\n"); } DPRINTF("\n Top left corner of matrix C: \n"); for (i=0; i<min(m,6); i++) { for (j=0; j<min(n,6); j++) { DPRINTF("%12.5G", C[j+i*n]); } DPRINTF("\n"); } DPRINTF("\n Deallocating memory \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); DPRINTF(" Example completed. \n\n"); return 0; }
/// Releases the `data` buffer with mkl_free.
~Matrix()
{
    mkl_free(data);
}
int main() { double *A, *B, *C; int m, n, p, i, j; double alpha, beta; printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n" " Intel® MKL function dgemm, where A, B, and C are matrices and \n" " alpha and beta are double precision scalars\n\n"); m = 2000, p = 200, n = 1000; printf (" Initializing data for matrix multiplication C=A*B for matrix \n" " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n); alpha = 1.0; beta = 0.0; printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n" " performance \n\n"); A = (double *)mkl_malloc( m*p*sizeof( double ), 64 ); B = (double *)mkl_malloc( p*n*sizeof( double ), 64 ); C = (double *)mkl_malloc( m*n*sizeof( double ), 64 ); if (A == NULL || B == NULL || C == NULL) { printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf (" Intializing matrix data \n\n"); for (i = 0; i < (m*p); i++) { A[i] = (double)(i+1); } for (i = 0; i < (p*n); i++) { B[i] = (double)(-i-1); } for (i = 0; i < (m*n); i++) { C[i] = 0.0; } printf (" Computing matrix product using Intel® MKL dgemm function via CBLAS interface \n\n"); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, p, alpha, A, p, B, n, beta, C, n); printf ("\n Computations completed.\n\n"); printf (" Top left corner of matrix A: \n"); for (i=0; i<min(m,6); i++) { for (j=0; j<min(p,6); j++) { printf ("%12.0f", A[i+j*p]); } printf ("\n"); } printf ("\n Top left corner of matrix B: \n"); for (i=0; i<min(p,6); i++) { for (j=0; j<min(n,6); j++) { printf ("%12.0f", B[j+i*n]); } printf ("\n"); } printf ("\n Top left corner of matrix C: \n"); for (i=0; i<min(m,6); i++) { for (j=0; j<min(n,6); j++) { printf ("%12.5G", C[j+i*n]); } printf ("\n"); } printf ("\n Deallocating memory \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); printf (" Example completed. \n\n"); return 0; }
/*
 * Releases the three matrix buffers with mkl_free, in the order A, B, C.
 */
void free_matrix(double* A, double* B, double *C)
{
    double* const matrices[3] = { A, B, C };
    for (int which = 0; which < 3; ++which) {
        mkl_free(matrices[which]);
    }
}
//============================================================generateProbabilityMap with MKL=====// void Registration::generateProbabilityMap4(double *x, int x_rows, int D, double *xPr, double *y, int y_rows, double sigma2, double outlier, double *P1, double *Pt1, double *Px) { // Initialize N, M, and D from input int N = x_rows; int M = y_rows; int P1_rows = y_rows; int P1_cols = 1; double ksig, outlier_tmp, sp; // Lookup table for the exponential double* expTable = (double*)mkl_malloc(10000 * sizeof(double), 64);//new double[1000]; for(int i=0;i < 10000;i++) expTable[i] = exp(-(double)i/1000); double* P = (double *)mkl_malloc( M*1*sizeof( double ), 64 ); double* temp_x = (double *)mkl_malloc( D*1*sizeof( double ), 64 ); // Set sizes of matrices P1,Pt and Pt1. Fill them with zeros. //P1 = (double *)mkl_malloc( M*1*sizeof( double ), 64 ); for(int i = 0; i < M*1; i++) P1[i] = 0.0; //Pt1 = (double *)mkl_malloc( N*1*sizeof( double ), 64 ); //fill_mkl_matrix(Pt1, M, 1, 0.0); //Px = (double *)mkl_malloc( M*D*sizeof( double ), 64 ); //fill_mkl_matrix(Px, M*D, 0.0); for(int i = 0; i < M*D; i++) Px[i] = 0.0; ksig = -2.0 * sigma2; outlier_tmp = (outlier * M * pow(-ksig*3.14159265358979,0.5*D) )/((1-outlier)*N); // Matrices used for main loop double* Mx1 = (double *)mkl_malloc( M*1*sizeof( double ), 64 ); for(int i = 0; i < M; i++) Mx1[i] = 1.0; double* Q = (double *)mkl_malloc( M*3*sizeof( double ), 64 ); for(int i = 0; i < M*3; i++) Q[i] = 0.0; double* F = (double *)mkl_malloc( M*1*sizeof( double ), 64 ); for(int i = 0; i < M*1; i++) F[i] = 0.0; double* tempM = (double *)mkl_malloc( 3*1*sizeof( double ), 64 ); for(int i = 0; i < 3; i++) tempM[i] = 1.0; double one = 1.0; double negone = -1.0; double zero = 0.0; double beta = 1.0; double alpha = 1.0; double* x_nth_row = (double *)mkl_malloc( D*sizeof( double ), 64 ); //x + n * D * sizeof(double); double* temp_array = (double *)mkl_malloc( 1*1*sizeof( double ), 64 ); int temp_matrix_size; // Main loop going over two point 
sets to calculate the probability map. for(int n = 0; n < N; n++) { //Q = Mx1 * x->get_n_rows(n,1) for(int i = 0; i<D; i++) x_nth_row[i] = x[n*D+i]; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, D, 1, one, Mx1, 1, x_nth_row, D, zero, Q, D); // Q = Q - y temp_matrix_size = y_rows * D; vdsub(&temp_matrix_size, Q, y, Q); //Q->apply(squarefunction) int Q_rows = y_rows; int Q_cols = D; temp_matrix_size = Q_rows*Q_cols; vdmul(&temp_matrix_size, Q, Q, Q); //*F = ( *Q * *tempM / ksig) beta = 1.0 / ksig; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, Q_rows, 1, Q_cols, beta, Q, Q_cols, tempM, 1, zero, F, 1); //F->apply(expfunction) int F_rows = Q_rows; int F_cols = 1; temp_matrix_size = F_rows * F_cols; // Calcuate exponential through the lookup table for(int i=0;i < Q_rows;i++) {/* if(F[i] < -10) F[i] = 0; else F[i] = expTable[-(int)floor(F[i]*1000)];*/ F[i] = exp(F[i]); } //vdExp(temp_matrix_size, F, F); //sp = (Mx1->transpose()* *F).get(0,0); //temp_array[0] = 0.0; cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, 1, 1, M, one, Mx1, 1, F, F_cols, zero, temp_array, 1); sp = temp_array[0]; sp += outlier_tmp; //*P = (*F/sp) * xPrb->get(n, 0) = F*(sp/xPrb->get(n,0)) double multiplier = 1/ sp * xPr[n]; for(int i = 0; i < F_rows; i++) { P[i] = F[i] * multiplier; } //Pt1->put(n,0,(1 - outlier_tmp/sp) * xPrb->get(n,0)) Pt1[n] = (1 - outlier_tmp/sp) * xPr[n]; //*P1 = *P1 + (*P) temp_matrix_size = M * 1; vdadd(&temp_matrix_size, P1, P, P1); //*Px = *Px + *P*x->get_n_rows(n,1) alpha = 1; beta = 1; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, D, 1, one, P, 1, x_nth_row, D, one, Px, D); } mkl_free_buffers(); //mkl_thread_free_buffers(); mkl_free(expTable); mkl_free(P); mkl_free(temp_x); mkl_free(Mx1); mkl_free(Q); mkl_free(F); mkl_free(tempM); mkl_free(x_nth_row); mkl_free(temp_array); return; }
/// Releases the harmonic_trap buffer with mkl_free.
PotentialData::~PotentialData()
{
    mkl_free(harmonic_trap);
}