int #if defined(_MSC_VER) __cdecl #endif // _MSC_VER main (int argc, const char *argv[]) { #ifdef CILK_TEST cilkTest(); #endif #if CLP_USE_OPENBLAS openblas_set_num_threads(CLP_USE_OPENBLAS); #endif #ifdef LAPACK_TEST //void openblas_set_num_threads(int num_threads); openblas_set_num_threads(1); if(argc<2){ printf("Error - need size of matrix for lapack test\n"); return 1; } int n=atoi(argv[1]); printf("n=%d\n",n); if(argc>2){ int nThreads=atoi(argv[2]); printf("Using %d threads\n",nThreads); openblas_set_num_threads(nThreads); } test_lapack(n); return 0; #endif #ifndef ABC_INHERIT ClpSimplex * models = new ClpSimplex[1]; #else AbcSimplex * models = new AbcSimplex[1]; #endif std::cout << "Coin LP version " << CLP_VERSION << ", build " << __DATE__ << std::endl; // Print command line if (argc > 1) { printf("command line - "); for (int i = 0; i < argc; i++) printf("%s ", argv[i]); printf("\n"); } ClpMain0(models); int returnCode = ClpMain1(argc, argv,models); delete [] models; return returnCode; }
/// Restores the thread counts that the constructor recorded, for each
/// threading backend that was compiled in (MKL, OpenMP, OpenBLAS).
DisableThreadingInBlock::~DisableThreadingInBlock()
{
#ifdef HAVE_MKL_H
  mkl_set_num_threads(mklNumThreads);
#endif

#if defined(_OPENMP)
  omp_set_num_threads(ompNumThreads);
#endif

#if defined(OPENBLAS_DISABLE_THREADS)
  openblas_set_num_threads(openblasNumThreads);
#endif
}
/// Records the current thread count of every compiled-in threading backend
/// (MKL, OpenMP, OpenBLAS) and then switches that backend to a single thread.
/// The recorded values default to 1 for backends that are not compiled in.
DisableThreadingInBlock::DisableThreadingInBlock()
  : mklNumThreads(1)
  , ompNumThreads(1)
  , openblasNumThreads(1)
{
#ifdef HAVE_MKL_H
  mklNumThreads = mkl_get_max_threads();
  mkl_set_num_threads(1);
#endif

#if defined(_OPENMP)
  ompNumThreads = omp_get_max_threads();
  omp_set_num_threads(1);
#endif

#if defined(OPENBLAS_DISABLE_THREADS)
  // NOTE(review): this stores the value returned by goto_get_num_procs(),
  // which may differ from the thread count that was configured before this
  // object was created - confirm that restoring it is the intended behaviour.
  openblasNumThreads = goto_get_num_procs();
  openblas_set_num_threads(1);
#endif

  // Touch every member so that builds without the corresponding backend do
  // not warn about unused private members.
  static_cast<void>(mklNumThreads);
  static_cast<void>(ompNumThreads);
  static_cast<void>(openblasNumThreads);
}
/*
 * Pointer-argument variant of openblas_set_num_threads(): reads the requested
 * thread count through num_threads and forwards it unchanged.
 * num_threads must point to a valid int.
 */
void openblas_set_num_threads_(int* num_threads)
{
  const int requested = *num_threads;
  openblas_set_num_threads(requested);
}
int main(int argc, char const *argv[]) { if (argc < 2) { printf("Not enough arguments\n"); return -1; } int test_method = atoi(argv[1]); openblas_set_num_threads(1); int m = 1024; int n = 1024; float *A = new float[m * n]; for (int i = 0; i < m * n; i++) { A[i] = rand() / RAND_MAX; } float *a = new float[n]; for (int i = 0; i < n; i++) { a[i] = rand() / RAND_MAX; } float *B = new float[m * n]; for (int i = 0; i < m * n; i++) { B[i] = rand() / RAND_MAX; } float *b = new float[n]; for (int i = 0; i < n; i++) { b[i] = rand() / RAND_MAX; } float *C = new float[m]; float *c = new float[m]; float *temp_a = new float[m]; float *temp_b = new float[m]; float *res = new float[m]; switch (test_method) { case 0: { omp_set_num_threads(3); double begTime = CycleTimer::currentSeconds(); #pragma omp parallel for for (int i=0; i<3; ++i) { switch(i) { case 0: { cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, A, n, a, 1, 1.0, temp_a, 1); break; } case 1: { cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, B, n, b, 1, 1.0, temp_b, 1); break; } case 2: { elem_mul(res, C, c, m); break; } } } for (int i=0; i<m; ++i) { res[i] += temp_a[i] + temp_b[i]; } double endTime = CycleTimer::currentSeconds(); printf("%f\n", (endTime - begTime)); break; } case 1: { double begTime = CycleTimer::currentSeconds(); cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, A, n, a, 1, 1.0, res, 1); cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, B, n, b, 1, 1.0, res, 1); elem_mul(res, C, c, m); double endTime = CycleTimer::currentSeconds(); printf("%f\n", (endTime - begTime)); break; } default: { printf("No matched test method\n"); return -1; } } delete [] A; delete [] B; delete [] a; delete [] b; delete [] C; delete [] c; delete [] temp_a; delete [] temp_b; delete [] res; return 0; }
//-------------------------------------------------------------------------- int main(int argc, char **argv) { // Temporary variables long int i, j, k, ii, idx = 1; double ElapsedTime; // Arguments int NumThreads = atoi(argv[idx++]); char *FileTrain = argv[idx++]; char *FileTest = argv[idx++]; int NumClasses = atoi(argv[idx++]); int d = atoi(argv[idx++]); int r = atoi(argv[idx++]); int Num_lambda = atoi(argv[idx++]); double *List_lambda = (double *)malloc(Num_lambda*sizeof(double)); for (ii = 0; ii < Num_lambda; ii++) { List_lambda[ii] = atof(argv[idx++]); } int Num_sigma = atoi(argv[idx++]); double *List_sigma = (double *)malloc(Num_sigma*sizeof(double)); for (i = 0; i < Num_sigma; i++) { List_sigma[i] = atof(argv[idx++]); } int MAXIT = atoi(argv[idx++]); double TOL = atof(argv[idx++]); bool verbose = atoi(argv[idx++]); // Threading #ifdef USE_OPENBLAS openblas_set_num_threads(NumThreads); #elif USE_ESSL #elif USE_OPENMP omp_set_num_threads(NumThreads); #else NumThreads = 1; // To avoid compiler warining of unused variable #endif PREPARE_CLOCK(1); START_CLOCK; // Read in X = Xtrain (n*d), y = ytrain (n*1), // and X0 = Xtest (m*d), y0 = ytest (m*1) DPointArray Xtrain; // read all data points from train DPointArray Xtest; // read all data points from test DVector ytrain; // Training labels DVector ytest; // Testing labels (ground truth) DVector ytest_predict; // Predictions if (ReadData(FileTrain, Xtrain, ytrain, d) == 0) { return -1; } if (ReadData(FileTest, Xtest, ytest, d) == 0) { return -1; } END_CLOCK; ElapsedTime = ELAPSED_TIME; printf("OneVsAll: time loading data = %g seconds\n", ElapsedTime); fflush(stdout); // For multiclass classification, need to convert a single vector // ytrain to a matrix Ytrain. The "predictions" are stored in the // corresponding matrix Ytest_predict. 
The vector ytest_predict is DMatrix Ytrain; ConvertYtrain(ytrain, Ytrain, NumClasses); int Seed = 0; // initialize seed as zero // Loop over List_lambda for (ii = 0; ii < Num_lambda; ii++) { double lambda = List_lambda[ii]; // Loop over List_sigma for (k = 0; k < Num_sigma; k++) { double sigma = List_sigma[k]; // Seed the RNG srandom(Seed); START_CLOCK; // Generate feature matrix Xdata_randbin given Xdata vector< vector< pair<int,double> > > instances_old, instances_new; long Xtrain_N = Xtrain.GetN(); for(i=0;i<Xtrain_N;i++){ instances_old.push_back(vector<pair<int,double> >()); for(j=0;j<d;j++){ int index = j+1; double *myXtrain = Xtrain.GetPointer(); double myXtrain_feature = myXtrain[j*Xtrain_N+i]; if (myXtrain_feature != 0) instances_old.back().push_back(pair<int,double>(index, myXtrain_feature)); } } long Xtest_N = Xtest.GetN(); for(i=0;i<Xtest_N;i++){ instances_old.push_back(vector<pair<int,double> >()); for(j=0;j<d;j++){ int index = j+1; double *myXtest = Xtest.GetPointer(); double myXtest_feature = myXtest[j*Xtest_N+i]; if (myXtest_feature != 0) instances_old.back().push_back(pair<int,double>(index, myXtest_feature)); } } END_CLOCK; printf("Train. RandBin: Time (in seconds) for converting data format: %g\n", ELAPSED_TIME);fflush(stdout); // add 0 feature for Enxu's code START_CLOCK; random_binning_feature(d+1, r, instances_old, instances_new, sigma); END_CLOCK; printf("Train. 
RandBin: Time (in seconds) for generating random binning features: %g\n", ELAPSED_TIME);fflush(stdout); START_CLOCK; SPointArray Xdata_randbin; // Generate random binning features long int nnz = r*(Xtrain_N + Xtest_N); long int dd = 0; for(i = 0; i < instances_new.size(); i++){ if(dd < instances_new[i][r-1].first) dd = instances_new[i][r-1].first; } Xdata_randbin.Init(Xtrain_N+Xtest_N, dd, nnz); long int ind = 0; long int *mystart = Xdata_randbin.GetPointerStart(); int *myidx = Xdata_randbin.GetPointerIdx(); double *myX = Xdata_randbin.GetPointerX(); for(i = 0; i < instances_new.size(); i++){ if (i == 0) mystart[i] = 0; else mystart[i] = mystart[i-1] + r; for(j = 0; j < instances_new[i].size(); j++){ myidx[ind] = instances_new[i][j].first-1; myX[ind] = instances_new[i][j].second; ind++; } } mystart[i] = nnz; // mystart has a length N+1 // generate random binning features for Xtrain and Xtest SPointArray Xtrain; // Training points SPointArray Xtest; // Testing points long Row_start = 0; Xdata_randbin.GetSubset(Row_start, Xtrain_N,Xtrain); Xdata_randbin.GetSubset(Xtrain_N,Xtest_N,Xtest); Xdata_randbin.ReleaseAllMemory(); END_CLOCK; printf("Train. 
RandBin: Time (in seconds) for converting data format back: %g\n", ELAPSED_TIME);fflush(stdout); printf("OneVsAll: n train = %ld, m test = %ld, r = %d, D = %ld, Gamma = %f, num threads = %d\n", Xtrain_N, Xtest_N, r, dd, sigma, NumThreads); fflush(stdout); // solve (Z'Z + lambdaI)w = Z'y, note that we never explicitly form // Z'Z since Z is a large sparse matrix N*dd START_CLOCK; int m = Ytrain.GetN(); // number of classes long N = Xtrain.GetN(); // number of training points long NN = Xtest.GetN(); // number of training points long M = Xtrain.GetD(); // dimension of randome binning features DMatrix Ytest_predict(NN,m); DMatrix W(M,m); SPointArray EYE; EYE.Init(M,M,M); mystart = EYE.GetPointerStart(); myidx = EYE.GetPointerIdx(); myX = EYE.GetPointerX(); for(i=0;i<M;i++){ mystart[i] = i; myidx[i] = i; myX[i] = 1; } mystart[i] = M+1; // mystart has a length N+1 for (i = 0; i < m; i++) { DVector w; w.Init(M); DVector ytrain, yy; Ytrain.GetColumn(i, ytrain); Xtrain.MatVec(ytrain, yy, TRANSPOSE); double NormRHS = yy.Norm2(); PCG pcg_solver; pcg_solver.Solve<SPointArray, SPointArray>(Xtrain, yy, w, EYE, MAXIT, TOL, 1); if (verbose) { int Iter = 0; const double *ResHistory = pcg_solver.GetResHistory(Iter); printf("RLCM::Train, PCG. iteration = %d, Relative residual = %g\n", Iter, ResHistory[Iter-1]/NormRHS);fflush(stdout); } pcg_solver.GetSolution(w); W.SetColumn(i, w); } END_CLOCK; printf("Train. RandBin: Time (in seconds) for solving linear system solution: %g\n", ELAPSED_TIME);fflush(stdout); // y = Xtest*W = z(x)'*w START_CLOCK; Xtest.MatMat(W,Ytest_predict,NORMAL,NORMAL); double accuracy = Performance(ytest, Ytest_predict, NumClasses); END_CLOCK; ElapsedTime = ELAPSED_TIME; printf("Test. RandBin: param = %g %g, perf = %g, time = %g\n", sigma, lambda, accuracy, ElapsedTime); fflush(stdout); }// End loop over List_sigma }// End loop over List_lambda // Clean up free(List_sigma); free(List_lambda); return 0; }
/*
 * Pointer-argument entry point that sets the OpenBLAS thread count: the
 * value is read through num_threads and forwarded to
 * openblas_set_num_threads(). num_threads must point to a valid int.
 * NOTE(review): NAME is presumably a macro that expands to the exported
 * symbol name - confirm in the including translation unit.
 */
void NAME(int* num_threads)
{
  const int requested = *num_threads;
  openblas_set_num_threads(requested);
}
/*
 * Distributed dense matrix-vector product  y := alpha * A * x + beta * y.
 *
 * Command line (parsed with getopt):
 *   -r          read the matrix with the "row line" reader
 *   -o <file>   output file name (defaults to "stdout")
 *   -a <value>  alpha (default 1.0)
 *   -b <value>  beta  (default 0.0)
 *   -t <n>      number of OpenBLAS threads (default 1)
 *   <matrix file> <vector file> [<initial y vector file>]
 *
 * Each process multiplies its local_M rows of the matrix by the input
 * vector; the partial results are gathered on every process and process 0
 * writes the final vector.
 *
 * numProcs / myid are the number of MPI processes and the rank of the
 * calling process. Returns 1 on success and 0 on a usage error or when an
 * input cannot be read or memory cannot be reserved.
 */
int DMxVMPI(int argc, char *argv[], int numProcs, int myid) {

	int ret_code = 1;
	int option;

	unsigned long *II;
	unsigned long *J;
	double *values;
	unsigned long M;
	unsigned long local_M;
	unsigned long N;
	unsigned long long nz;

	double *vectorValues;
	unsigned long M_Vector;
	unsigned long N_Vector;
	unsigned long long nz_vector;

	char *outputFileName = NULL;
	char *inputMatrixFile = NULL;
	char *inputVectorFile = NULL;
	char *outputVectorFile = NULL;

	int inputFormatRow = 0;

	double alpha = 1.0;
	double beta = 0.0;

	int numThreads = 1;

	int i, j;

	while ((option = getopt(argc, argv,"ro:a:b:t:")) >= 0) {
		switch (option) {
			case 'o' :
				// Release a name given by an earlier -o (free(NULL) is a no-op)
				free(outputFileName);
				outputFileName = (char *) malloc(sizeof(char)*strlen(optarg)+1);
				strcpy(outputFileName,optarg);
				break;
			case 'r':
				inputFormatRow = 1;
				break;
			case 'b':
				beta = atof(optarg);
				break;
			case 'a':
				alpha = atof(optarg);
				break;
			case 't':
				numThreads = atoi(optarg);
				break;
			default:
				break;
		}
	}

	// Exactly two or three positional arguments are accepted
	if ((optind + 3 != argc) && (optind + 2 != argc)) {
		if (myid == 0) {
			//fprintf(stderr,"[%s] Argc: %d, optind: %d\n",__func__, argc, optind);
			usageDMxVMPI();
		}
		return 0;
	}

	openblas_set_num_threads(numThreads);

	if(optind + 3 == argc) { //We have an output vector
		outputVectorFile = (char *)malloc(sizeof(char)*strlen(argv[optind+2])+1);
		strcpy(outputVectorFile,argv[optind+2]);
	}

	if(outputFileName == NULL) {
		// "stdout" needs 7 bytes including the terminating NUL; size the
		// buffer from the literal instead of a hand-counted constant.
		outputFileName = (char *) malloc(sizeof(char)*(strlen("stdout")+1));
		strcpy(outputFileName,"stdout");
	}

	inputMatrixFile = (char *)malloc(sizeof(char)*strlen(argv[optind])+1);
	inputVectorFile = (char *)malloc(sizeof(char)*strlen(argv[optind+1])+1);

	strcpy(inputMatrixFile,argv[optind]);
	strcpy(inputVectorFile,argv[optind+1]);

	//Read matrix
	if(inputFormatRow) {
		if(!readDenseCoordinateMatrixMPIRowLine(inputMatrixFile,&II,&J,&values,&M,&local_M,&N,&nz,myid, numProcs)){
			fprintf(stderr, "[%s] Can not read Matrix\n",__func__);
			return 0;
		}
	}
	else{
		if(!readDenseCoordinateMatrixMPI(inputMatrixFile,&II,&J,&values,&M,&local_M,&N,&nz,myid, numProcs)){
			fprintf(stderr, "[%s] Can not read Matrix\n",__func__);
			return 0;
		}
	}

	//Read input vector
	if(!readDenseVector(inputVectorFile, &vectorValues,&M_Vector,&N_Vector,&nz_vector)){
		fprintf(stderr, "[%s] Can not read Vector\n",__func__);
		return 0;
	}

	/*
	void cblas_dgemv(const enum CBLAS_ORDER order,
	                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
	                 const double alpha, const double *A, const int lda,
	                 const double *X, const int incX, const double beta,
	                 double *Y, const int incY);
	*/

	double *partial_result=(double *) calloc(local_M,sizeof(double));

	if(partial_result == NULL){
		fprintf(stderr,"[%s] Error reserving memory for partial result vector in processor %d\n",__func__,myid);
		return 0;
	}

	// MPI_Allgather stores local_M values from each of the numProcs
	// processes into y, so y must hold at least local_M * numProcs entries
	// (and never fewer than the N entries reserved before).
	size_t gathered_len = (size_t)local_M * (size_t)numProcs;
	size_t y_len = gathered_len > (size_t)N ? gathered_len : (size_t)N;

	double* y = (double*)calloc(y_len,sizeof(double));

	if(y == NULL){
		fprintf(stderr,"[%s] Error reserving memory for final result vector in processor %d\n",__func__,myid);
		return 0;
	}

	//Read output vector if any
	if(outputVectorFile != NULL) {

		// NOTE(review): readDenseVector receives &y and presumably replaces
		// the buffer allocated above - confirm ownership and that the vector
		// it returns is large enough for the gather below.
		if(!readDenseVector(outputVectorFile, &y,&M_Vector,&N_Vector,&nz_vector)){
			fprintf(stderr, "[%s] Can not read Vector %s\n",__func__, outputVectorFile);
			return 0;
		}

		// Seed the local part of the result with this process's slice of y
		for( i = (local_M * myid), j = 0; i< (local_M * myid + local_M) && j< local_M; i++, j++) {
			partial_result[j] = y [i];
		}
	}

	double t_real = realtime();

	//y := alpha * A * x + beta * y
	//cblas_dgemv(CblasRowMajor,CblasNoTrans,local_M,N,1.0,values,N,vectorValues,1,0.0,partial_result,1);
	cblas_dgemv(CblasRowMajor,CblasNoTrans,local_M,N,alpha,values,N,vectorValues,1,beta,partial_result,1);

	MPI_Allgather (partial_result,local_M,MPI_DOUBLE,y,local_M,MPI_DOUBLE,MPI_COMM_WORLD);

	MPI_Barrier(MPI_COMM_WORLD);

	fprintf(stderr, "\n[%s] Time spent in DMxV: %.6f sec\n", __func__, realtime() - t_real);

	if (myid == 0){
		writeDenseVector(outputFileName, y,M_Vector,N_Vector,nz_vector);
	}

	// Release the buffers allocated directly by this function. Buffers
	// handed back by the reader helpers are left alone because their
	// allocator is not visible here.
	free(partial_result);
	free(outputFileName);
	free(inputMatrixFile);
	free(inputVectorFile);
	free(outputVectorFile);

	return ret_code;
}
int main() { #if defined(REF_BLAS_OPENBLAS) openblas_set_num_threads(1); #endif #if defined(REF_BLAS_BLIS) omp_set_num_threads(1); #endif printf("\n"); printf("\n"); printf("\n"); printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n"); printf(" Copyright (C) 2014 by Technical University of Denmark. All rights reserved.\n"); printf("\n"); printf(" HPMPC is distributed in the hope that it will be useful,\n"); printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"); printf(" See the GNU Lesser General Public License for more details.\n"); printf("\n"); printf("\n"); printf("\n"); printf("Riccati solver performance test - double precision\n"); printf("\n"); // maximum frequency of the processor const float GHz_max = GHZ_MAX; printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max); printf("\n"); // maximum flops per cycle, double precision #if defined(TARGET_X64_AVX2) const float flops_max = 16; printf("Testing solvers for AVX & FMA3 instruction sets, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_X64_AVX) const float flops_max = 8; printf("Testing solvers for AVX instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3) const float flops_max = 4; printf("Testing solvers for SSE3 instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_CORTEX_A15) const float flops_max = 2; printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A15: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_CORTEX_A9) const float flops_max = 1; printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A9: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_CORTEX_A7) const 
float flops_max = 0.5; printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A7: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_X86_ATOM) const float flops_max = 1; printf("Testing solvers for SSE3 instruction set, 32 bit, optimized for Intel Atom: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_POWERPC_G2) const float flops_max = 1; printf("Testing solvers for POWERPC instruction set, 32 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_C99_4X4) const float flops_max = 2; printf("Testing reference solvers, 4x4 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_C99_4X4_PREFETCH) const float flops_max = 2; printf("Testing reference solvers, 4x4 kernel with register prefetch: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_C99_2X2) const float flops_max = 2; printf("Testing reference solvers, 2x2 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #endif FILE *f; f = fopen("./test_problems/results/test_blas.m", "w"); // a #if defined(TARGET_X64_AVX2) fprintf(f, "C = 'd_x64_avx2';\n"); fprintf(f, "\n"); #elif defined(TARGET_X64_AVX) fprintf(f, "C = 'd_x64_avx';\n"); fprintf(f, "\n"); #elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3) fprintf(f, "C = 'd_x64_sse3';\n"); fprintf(f, "\n"); #elif defined(TARGET_CORTEX_A9) fprintf(f, "C = 'd_ARM_cortex_A9';\n"); fprintf(f, "\n"); #elif defined(TARGET_CORTEX_A7) fprintf(f, "C = 'd_ARM_cortex_A7';\n"); fprintf(f, "\n"); #elif defined(TARGET_CORTEX_A15) fprintf(f, "C = 'd_ARM_cortex_A15';\n"); fprintf(f, "\n"); #elif defined(TARGET_X86_ATOM) fprintf(f, "C = 'd_x86_atom';\n"); fprintf(f, "\n"); #elif defined(TARGET_POWERPC_G2) fprintf(f, "C = 'd_PowerPC_G2';\n"); fprintf(f, "\n"); #elif defined(TARGET_C99_4X4) fprintf(f, "C = 'd_c99_4x4';\n"); fprintf(f, "\n"); #elif defined(TARGET_C99_4X4_PREFETCH) fprintf(f, "C = 'd_c99_4x4';\n"); fprintf(f, 
"\n"); #elif defined(TARGET_C99_2X2) fprintf(f, "C = 'd_c99_2x2';\n"); fprintf(f, "\n"); #endif fprintf(f, "A = [%f %f];\n", GHz_max, flops_max); fprintf(f, "\n"); fprintf(f, "B = [\n"); printf("\n"); printf("Tested solvers:\n"); printf("-sv : Riccati factorization and system solution (prediction step in IP methods)\n"); printf("-trs: system solution after a previous call to Riccati factorization (correction step in IP methods)\n"); printf("\n"); printf("\n"); #if defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX) || defined(TARGET_X64_SSE3) || defined(TARGET_X86_ATOM) || defined(TARGET_AMD_SSE3) /* printf("\nflush to zero on\n");*/ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!! #endif // to throw floating-point exception /*#ifndef __APPLE__*/ /* feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);*/ /*#endif*/ int ii, jj; const int bs = D_MR; //d_get_mr(); const int ncl = D_NCL; const int nal = bs*ncl; // number of doubles per cache line int nn[] = {4, 6, 8, 10, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300}; int nnrep[] = {10000, 10000, 10000, 10000, 10000, 4000, 4000, 2000, 2000, 1000, 1000, 400, 400, 400, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; int vnx[] = {8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 1024}; int vnrep[] = {100, 100, 100, 100, 100, 100, 50, 50, 50, 20, 10, 10}; int vN[] = {4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256}; int nx, nw, ny, ndN, N, nrep, Ns; 
int diag_R; int ll; // int ll_max = 77; int ll_max = 1; for(ll=0; ll<ll_max; ll++) { FILE* fid; double* yy; float* yy_temp; if(1) { fid = fopen("./test_problems/mhe_measure.dat", "r"); if(fid==NULL) exit(-1); //printf("\nhola\n"); int dummy_int = fscanf(fid, "%d %d %d %d", &nx, &nw, &ny, &Ns); //printf("\n%d %d %d %d\n", nx, nw, ny, Ns); yy_temp = (float*) malloc(ny*Ns*sizeof(float)); yy = (double*) malloc(ny*Ns*sizeof(double)); for(jj=0; jj<ny*Ns; jj++) { dummy_int = fscanf(fid, "%e", &yy_temp[jj]); yy[jj] = (double) yy_temp[jj]; //printf("\n%f", yy[jj]); } //printf("\n"); fclose(fid); #if 1 N = 15; //Ns-1; // NN; nrep = NREP;//nnrep[ll]; nx = 12;//nn[ll]; nw = 5;//nn[ll]; ny = 3; ndN = 0; //2; diag_R = 0; #else N = 10; //Ns-1; // NN; nrep = nnrep[ll]; nx = nn[ll]; nw = nn[ll]; ny = 3; ndN = 0; diag_R = 0; #endif //printf("\nnx = %d; nw = %d; ny = %d; ndN = %d; N = %d\n\n", nx, nw, ny, ndN, N); } else if(ll_max==1) { nx = NX; // number of states (it has to be even for the mass-spring system test problem) nw = NU; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem) ny = nx/2; // size of measurements vector N = NN; // horizon lenght nrep = NREP; } else { nx = nn[ll]; // number of states (it has to be even for the mass-spring system test problem) nw = 2; // number of inputs (controllers) (it has to be at least 1 and at most nx/2 for the mass-spring system test problem) ny = nx/2; // size of measurements vector N = 10; // horizon lenght nrep = nnrep[ll]; } int rep; const int nz = nx+ny; // TODO delete const int nwx = nw+nx; const int anz = nal*((nz+nal-1)/nal); const int anx = nal*((nx+nal-1)/nal); const int anw = nal*((nw+nal-1)/nal); const int any = nal*((ny+nal-1)/nal); const int pnz = bs*((nz+bs-1)/bs); const int pnx = bs*((nx+bs-1)/bs); const int pnw = bs*((nw+bs-1)/bs); const int pny = bs*((ny+bs-1)/bs); const int pnx2 = bs*((2*nx+bs-1)/bs); const int pnwx = bs*((nw+nx+bs-1)/bs); const int cnz = 
ncl*((nz+ncl-1)/ncl); const int cnx = ncl*((nx+ncl-1)/ncl); const int cnw = ncl*((nw+ncl-1)/ncl); const int cny = ncl*((ny+ncl-1)/ncl); const int cnx2 = 2*(ncl*((nx+ncl-1)/ncl)); const int cnwx = ncl*((nw+nx+ncl-1)/ncl); const int cnwx1 = ncl*((nw+nx+1+ncl-1)/ncl); const int cnf = cnz<cnx+ncl ? cnx+ncl : cnz; const int pad = (ncl-(nx+nw)%ncl)%ncl; // packing between AGL & P const int cnl = nx+nw+pad+cnx; const int pad2 = (ncl-(nx)%ncl)%ncl; // packing between AGL & P const int cnl2 = cnz<cnx+ncl ? nx+pad2+cnx+ncl : nx+pad2+cnz; /************************************************ * dynamical system ************************************************/ double *A; d_zeros(&A, nx, nx); // states update matrix double *B; d_zeros(&B, nx, nw); // inputs matrix double *b; d_zeros(&b, nx, 1); // states offset double *x0; d_zeros(&x0, nx, 1); // initial state double Ts = 0.5; // sampling time mass_spring_system(Ts, nx, nw, N, A, B, b, x0); for(jj=0; jj<nx; jj++) b[jj] = 0.0; for(jj=0; jj<nx; jj++) x0[jj] = 0.0; x0[0] = 3.5; x0[1] = 3.5; double *C; d_zeros(&C, ny, nx); // inputs matrix for(jj=0; jj<ny; jj++) C[jj*(ny+1)] = 1.0; // d_print_mat(nx, nx, A, nx); // d_print_mat(nx, nw, B, nx); // d_print_mat(ny, nx, C, ny); // d_print_mat(nx, 1, b, nx); // d_print_mat(nx, 1, x0, nx); /* packed into contiguous memory */ double *pA; d_zeros_align(&pA, pnx, cnx); d_cvt_mat2pmat(nx, nx, A, nx, 0, pA, cnx); double *pG; d_zeros_align(&pG, pnx, cnw); d_cvt_mat2pmat(nx, nw, B, nx, 0, pG, cnw); double *pC; d_zeros_align(&pC, pny, cnx); d_cvt_mat2pmat(ny, nx, C, ny, 0, pC, cnx); double *pCA; d_zeros_align(&pCA, pnz, cnx); d_cvt_mat2pmat(ny, nx, C, ny, 0, pCA, cnx); d_cvt_mat2pmat(nx, nx, A, nx, ny, pCA+(ny/bs)*bs+ny%bs, cnx); // d_print_pmat(nx, nx, bs, pA, cnx); // d_print_pmat(nx, nw, bs, pG, cnw); // d_print_pmat(ny, nx, bs, pC, cnx); /************************************************ * cost function ************************************************/ double *R; d_zeros(&R, nw, nw); for(jj=0; 
jj<nw; jj++) R[jj*(nw+1)] = 1.0; double *Q; d_zeros(&Q, ny, ny); for(jj=0; jj<ny; jj++) Q[jj*(ny+1)] = 1.0; double *Qx; d_zeros(&Qx, nx, nx); for(jj=0; jj<ny; jj++) for(ii=0; ii<ny; ii++) Qx[ii+nx*jj] = Q[ii+ny*jj]; double *L0; d_zeros(&L0, nx, nx); for(jj=0; jj<nx; jj++) L0[jj*(nx+1)] = 1.0; double *q; d_zeros_align(&q, any, 1); for(jj=0; jj<ny; jj++) q[jj] = 0.0; double *r; d_zeros_align(&r, anw, 1); for(jj=0; jj<nw; jj++) r[jj] = 1.0; double *f; d_zeros_align(&f, anx, 1); for(jj=0; jj<nx; jj++) f[jj] = jj;//1.0; //b[jj]; //1.0; /* packed into contiguous memory */ double *pR; d_zeros_align(&pR, pnw, cnw); d_cvt_mat2pmat(nw, nw, R, nw, 0, pR, cnw); double *pQ; d_zeros_align(&pQ, pny, cny); d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQ, cny); // d_print_pmat(nw, nw, bs, pQ, cnw); // d_print_pmat(ny, ny, bs, pR, cny); /************************************************ * compound quantities ************************************************/ double *pRG; d_zeros_align(&pRG, pnwx, cnw); d_cvt_mat2pmat(nw, nw, R, nw, 0, pRG, cnw); d_cvt_mat2pmat(nx, nw, B, nx, nw, pRG+(nw/bs)*bs*cnw+nw%bs, cnw); //d_print_pmat(nw+nx, nw, bs, pRG, cnw); double *pQA; d_zeros_align(&pQA, pnx2, cnx); d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQA, cnx); d_cvt_mat2pmat(nx, nx, A, nx, nx, pQA+(nx/bs)*bs*cnx+nx%bs, cnx); //d_print_pmat(2*nx, cnx, bs, pQA, cnx); //exit(1); /************************************************ * series of matrices ************************************************/ double *(hpA[N]); double *(hpCA[N]); double *(hpG[N]); double *(hpC[N+1]); double *(hpR[N]); double *(hpQ[N+1]); double *(hpLp[N+1]); double *(hdLp[N+1]); double *(hpLp2[N+1]); double *(hpLe[N+1]); double *(hq[N]); double *(hr[N+1]); double *(hf[N]); double *(hxe[N+1]); double *(hxp[N+1]); double *(hw[N]); double *(hy[N+1]); double *(hlam[N]); double *(hpRG[N]); double *(hpQA[N+1]); double *(hpGLr[N]); double *(hpALe[N+1]); double *(hrr[N]); double *(hqq[N+1]); double *(hff[N+1]); double *p_hrr; d_zeros_align(&p_hrr, anw, N); 
double *p_hqq; d_zeros_align(&p_hqq, anx, N+1); double *p_hff; d_zeros_align(&p_hff, anx, N+1); double *p_hxe; d_zeros_align(&p_hxe, anx, N+1); double *p_hxp; d_zeros_align(&p_hxp, anx, N+1); double *p_hw; d_zeros_align(&p_hw, anw, N); double *p_hy; d_zeros_align(&p_hy, any, N+1); double *p_hlam; d_zeros_align(&p_hlam, anx, N+1); double *(hq_res[N+1]); double *(hr_res[N]); double *(hf_res[N+1]); double *p_hq_res; d_zeros_align(&p_hq_res, anx, N+1); double *p_hr_res; d_zeros_align(&p_hr_res, anw, N); double *p_hf_res; d_zeros_align(&p_hf_res, anx, N+1); for(jj=0; jj<N; jj++) { hpA[jj] = pA; hpCA[jj] = pCA; hpG[jj] = pG; hpC[jj] = pC; hpR[jj] = pR; hpQ[jj] = pQ; d_zeros_align(&hpLp[jj], pnx, cnl); d_zeros_align(&hdLp[jj], anx, 1); d_zeros_align(&hpLp2[jj], pnz, cnl2); d_zeros_align(&hpLe[jj], pnz, cnf); hr[jj] = r; hq[jj] = q; hf[jj] = f; hpRG[jj] = pRG; hpQA[jj] = pQA; d_zeros_align(&hpGLr[jj], pnwx, cnw); d_zeros_align(&hpALe[jj], pnx2, cnx2); hrr[jj] = p_hrr+jj*anw; hqq[jj] = p_hqq+jj*anx; hff[jj] = p_hff+jj*anx; hxe[jj] = p_hxe+jj*anx; //d_zeros_align(&hxe[jj], anx, 1); hxp[jj] = p_hxp+jj*anx; //d_zeros_align(&hxp[jj], anx, 1); hw[jj] = p_hw+jj*anw; //d_zeros_align(&hw[jj], anw, 1); hy[jj] = p_hy+jj*any; //d_zeros_align(&hy[jj], any, 1); hlam[jj] = p_hlam+jj*anx; //d_zeros_align(&hlambda[jj], anx, 1); hq_res[jj] = p_hq_res+jj*anx; hr_res[jj] = p_hr_res+jj*anw; hf_res[jj] = p_hf_res+jj*anx; } hpC[N] = pC; hpQ[N] = pQ; d_zeros_align(&hpLp[N], pnx, cnl); d_zeros_align(&hdLp[N], anx, 1); d_zeros_align(&hpLp2[N], pnz, cnl2); d_zeros_align(&hpLe[N], pnz, cnf); hq[N] = q; // equality constraints on the states at the last stage double *D; d_zeros(&D, ndN, nx); for(ii=0; ii<ndN; ii++) D[ii*(ndN+1)] = 1; //D[0+ndN*0] = 1; //D[1+ndN*(nx-1)] = 1; double *d; d_zeros_align(&d, ndN, 1); for(ii=0; ii<ndN; ii++) d[ii] = ii; //d[0] = 1; //d[1] = 0; const int pnxdN = bs*((nx+ndN+bs-1)/bs); double *pCtQC; d_zeros_align(&pCtQC, pnxdN, cnx); d_cvt_mat2pmat(ny, ny, Q, ny, 0, pCtQC, 
cnx); d_cvt_mat2pmat(ndN, nx, D, ndN, nx, pCtQC+nx/bs*bs*cnx+nx%bs, cnx); //d_print_pmat(nx+ndN, nx, bs, pCtRC, cnx); hpQA[N] = pCtQC; // there is not A_N d_zeros_align(&hpALe[N], pnxdN, cnx2); // there is not A_N: pnx not pnx2 hqq[N] = p_hqq+N*anx; hff[N] = p_hff+N*anx; const int pndN = bs*((ndN+bs-1)/bs); const int cndN = ncl*((ndN+ncl-1)/ncl); double *Ld; d_zeros_align(&Ld, pndN, cndN); double *d_res; d_zeros_align(&d_res, pndN, 1); hxe[N] = p_hxe+N*anx; //d_zeros_align(&hxe[N], anx, 1); hxp[N] = p_hxp+N*anx; //d_zeros_align(&hxp[N], anx, 1); hy[N] = p_hy+N*any; //d_zeros_align(&hy[N], any, 1); hlam[N] = p_hlam+N*anx; //d_zeros_align(&hlambda[jj], anx, 1); hf_res[N] = p_hf_res+N*anx; hq_res[N] = p_hq_res+N*anx; // initialize hpLp[0] with the cholesky factorization of /Pi_p d_cvt_mat2pmat(nx, nx, L0, nx, 0, hpLp[0]+(nx+nw+pad)*bs, cnl); for(ii=0; ii<nx; ii++) hdLp[0][ii] = 1.0/L0[ii*(nx+1)]; d_cvt_mat2pmat(nx, nx, L0, nx, ny, hpLp2[0]+(ny/bs)*bs+ny%bs+(nx+pad2+ny)*bs, cnl2); dtrtr_l_lib(nx, ny, hpLp2[0]+(ny/bs)*bs*cnl2+ny%bs+(nx+pad2+ny)*bs, cnl2, hpLp2[0]+(nx+pad2+ncl)*bs, cnl2); //d_print_pmat(nx, cnl, bs, hpLp[0], cnl); //d_print_pmat(nz, cnl2, bs, hpLp2[0], cnl2); // buffer for L0 double *pL0; d_zeros_align(&pL0, pnx, cnx); d_cvt_mat2pmat(nx, nx, L0, nx, 0, pL0, cnx); // invert L0 in hpALe[0] dtrinv_lib(nx, pL0, cnx, hpALe[0], cnx2); double *pL0_inv; d_zeros_align(&pL0_inv, pnx, cnx); dtrinv_lib(nx, pL0, cnx, pL0_inv, cnx); //d_print_pmat(nx, nx, bs, pL0, cnx); //d_print_pmat(nx, nx, bs, pL0_inv, cnx); //d_print_pmat(pnx2, cnx2, bs, hpALe[0], cnx2); //exit(1); //double *work; d_zeros_align(&work, pny*cnx+pnz*cnz+anz+pnz*cnf+pnw*cnw, 1); double *work; d_zeros_align(&work, 2*pny*cnx+anz+pnw*cnw+pnx*cnx, 1); //printf("\nciao %d %d %d %d %d %d\n", pny, cnx, anz, pnw, cnw, pnx); double *work2; d_zeros_align(&work2, 2*pny*cnx+pnw*cnw+pnx*cnw+2*pnx*cnx+anz, 1); double *work3; d_zeros_align(&work3, pnx*cnl+anx, 1); double *work4; d_zeros_align(&work4, 
4*anx+2*(anx+anw), 1); // for(jj=0; jj<2*pny*cnx+anz+pnw*cnw+pnx*cnx; jj++) // work[jj] = -100.0; // measurements for(jj=0; jj<=N; jj++) for(ii=0; ii<ny; ii++) hy[jj][ii] = yy[jj*ny+ii]; //d_print_mat(ny, N+1, hy[0], any); // initial guess for(ii=0; ii<nx; ii++) x0[ii] = 0.0; for(ii=0; ii<nx; ii++) hxp[0][ii] = x0[ii]; // information filter - solution double *y_temp; d_zeros_align(&y_temp, any, 1); for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) hrr[ii][jj] = r[jj]; for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) hff[ii][jj] = f[jj]; for(jj=0; jj<ndN; jj++) hff[N][jj] = d[jj]; for(ii=0; ii<=N; ii++) { for(jj=0; jj<ny; jj++) y_temp[jj] = - q[jj]; //d_print_mat(1, ny, y_temp, 1); dsymv_lib(ny, ny, hpQ[ii], cny, hy[ii], y_temp, y_temp, -1); //d_print_mat(1, ny, y_temp, 1); dgemv_t_lib(ny, nx, hpC[ii], cnx, y_temp, hqq[ii], hqq[ii], 0); //d_print_mat(1, nx, hqq[ii], 1); //if(ii==9) //exit(1); } //exit(1); /************************************************ * new low-level mhe_if interface ************************************************/ int nrows = pnx>pnw ? 
2*pnx : pnx+pnw; int ncols = cnwx1; double *pQRAG; d_zeros_align(&pQRAG, nrows, ncols); if(nx>=nw) { d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQRAG, cnwx1); d_cvt_mat2pmat(nx, nx, A, nx, 0, pQRAG+pnx*cnwx1, cnwx1); d_cvt_mat2pmat(nw, nw, R, nw, 0, pQRAG+(pnx-pnw)*cnwx1+nx*bs, cnwx1); d_cvt_mat2pmat(nx, nw, B, nx, 0, pQRAG+pnx*cnwx1+nx*bs, cnwx1); //d_print_pmat(nrows, ncols, bs, pQRAG, ncols); if(nx>pnx-nx) d_cvt_mat2pmat(pnx-nx, nx, A+(nx-pnx+nx), nx, nx, pQRAG+nx/bs*bs*cnwx1+nx%bs, cnwx1); else d_cvt_mat2pmat(nx, nx, A, nx, nx, pQRAG+nx/bs*bs*cnwx1+nx%bs, cnwx1); if(nx>pnw-nw) d_cvt_mat2pmat(pnw-nw, nw, B+(nx-pnw+nw), nx, nw, pQRAG+(pnx-pnw+nw/bs*bs)*cnwx1+nw%bs+nx*bs, cnwx1); else d_cvt_mat2pmat(nx, nw, B, nx, nw, pQRAG+(pnx-pnw+nw/bs*bs)*cnwx1+nw%bs+nx*bs, cnwx1); //d_print_pmat(nrows, ncols, bs, pQRAG, ncols); } else { d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQRAG+(pnw-pnx)*cnwx1, cnwx1); d_cvt_mat2pmat(nx, nx, A, nx, 0, pQRAG+pnw*cnwx1, cnwx1); d_cvt_mat2pmat(nw, nw, R, nw, 0, pQRAG+nx*bs, cnwx1); d_cvt_mat2pmat(nx, nw, B, nx, 0, pQRAG+pnw*cnwx1+nx*bs, cnwx1); //d_print_pmat(nrows, ncols, bs, pQRAG, ncols); if(nx>pnx-nx) d_cvt_mat2pmat(pnx-nx, nx, A+(nx-pnx+nx), nx, nx, pQRAG+(pnw-pnx+nx/bs*bs)*cnwx1+nx%bs, cnwx1); else d_cvt_mat2pmat(nx, nx, A, nx, nx, pQRAG+(pnw-pnx+nx/bs*bs)*cnwx1+nx%bs, cnwx1); if(nx>pnw-nw) d_cvt_mat2pmat(pnw-nw, nw, B+(nx-pnw+nw), nx, nw, pQRAG+nw/bs*bs*cnwx1+nw%bs+nx*bs, cnwx1); else d_cvt_mat2pmat(nx, nw, B, nx, nw, pQRAG+nw/bs*bs*cnwx1+nw%bs+nx*bs, cnwx1); //d_print_pmat(nrows, ncols, bs, pQRAG, ncols); } double *pQD; d_zeros_align(&pQD, pnx+pndN, cnx); d_cvt_mat2pmat(ny, ny, Q, ny, 0, pQD, cnx); d_cvt_mat2pmat(ndN, nx, D, ndN, 0, pQD+pnx*cnx, cnx); //d_print_pmat(pnx+pndN, cnx, bs, pQD, cnx); if(ndN>pnx-nx) d_cvt_mat2pmat(pnx-nx, nx, D+(ndN-pnx+nx), ndN, nx, pQD+nx/bs*bs*cnx+nx%bs, cnx); else d_cvt_mat2pmat(ndN, nx, D, ndN, nx, pQD+nx/bs*bs*cnx+nx%bs, cnx); //d_print_pmat(pnx+pndN, cnx, bs, pQD, cnx); //exit(1); double *(hpQRAG[N+1]); double 
*(hpLAG[N+1]); double *(hpLe2[N+1]); for(ii=0; ii<N; ii++) { hpQRAG[ii] = pQRAG; d_zeros_align(&hpLAG[ii], nrows, ncols); d_zeros_align(&hpLe2[ii], pnx, cnx); } hpQRAG[N] = pQD; d_zeros_align(&hpLAG[N], pnx+pndN, cnx); d_zeros_align(&hpLe2[N], pnx, cnx); d_cvt_mat2pmat(nx, nx, L0, nx, 0, hpLe2[0], cnx); //d_print_pmat(nx, nx, bs, hpLe2[0], cnx); double **dummy; #if 0 struct timeval tv10, tv11, tv12; // double precision gettimeofday(&tv10, NULL); // start for(ii=0; ii<1; ii++) //for(ii=0; ii<nrep; ii++) { d_ric_trf_mhe_if(nx, nw, ndN, N, hpQRAG, diag_R, hpLe2, hpLAG, Ld, work3); //d_ric_trf_mhe_if(nx, nw, ndN, N, hpQA, hpRG, diag_R, hpALe, hpGLr, Ld, work3); } gettimeofday(&tv11, NULL); // stop for(ii=0; ii<1; ii++) //for(ii=0; ii<nrep; ii++) { d_ric_trs_mhe_if(nx, nw, ndN, N, hpLe2, hpLAG, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3); } gettimeofday(&tv12, NULL); // stop float time_trf_mhe_if_new = (float) (tv11.tv_sec-tv10.tv_sec)/(nrep+0.0)+(tv11.tv_usec-tv10.tv_usec)/(nrep*1e6); float time_trs_mhe_if_new = (float) (tv12.tv_sec-tv11.tv_sec)/(nrep+0.0)+(tv12.tv_usec-tv11.tv_usec)/(nrep*1e6); printf("\ntime = %e\t%e\n\n", time_trf_mhe_if_new, time_trs_mhe_if_new); //exit(1); #endif /************************************************ * reference code ************************************************/ double *(hA[N]); double *(hG[N]); double *(hQ[N+1]); double *(hR[N]); double *(hAGU[N]); double *(hUp[N+1]); double *(hUe[N+1]); double *(hUr[N]); double *Ud; double *work_ref; for(ii=0; ii<N; ii++) { hA[ii] = A; hG[ii] = B; hQ[ii] = Qx; hR[ii] = R; d_zeros(&hAGU[ii], nx, nx+nw); d_zeros(&hUp[ii], nx, nx); d_zeros(&hUe[ii], nx, nx); d_zeros(&hUr[ii], nw, nw); } hA[N] = D; hQ[N] = Qx; d_zeros(&hAGU[N], ndN, nx); d_zeros(&hUp[N], nx, nx); d_zeros(&hUe[N], nx, nx); d_zeros(&Ud, ndN, ndN); d_zeros(&work_ref, nx+nw, 1); for(ii=0; ii<nx*nx; ii++) hUp[0][ii] = L0[ii]; #if 0 printf("\nfactorization\n"); d_ric_trf_mhe_if_blas( nx, nw, ndN, N, hA, hG, hQ, hR, hAGU, hUp, hUe, hUr, 
Ud); printf("\nsolution\n"); d_ric_trs_mhe_if_blas( nx, nw, ndN, N, hAGU, hUp, hUe, hUr, Ud, hqq, hrr, hff, hxp, hxe, hw, hlam, work_ref); //d_print_mat(nx, nx, hUe[N], nx); //exit(1); #endif /************************************************ * high-level interface ************************************************/ #if 0 int kk; double *AA; d_zeros(&AA, nx, nx*N); //for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) for(ll=0; ll<nx; ll++) AA[ll+nx*jj+nx*nx*ii] = A[ll+nx*jj]; for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) for(kk=0; kk<nx; kk++) AA[jj+nx*kk+nx*nx*ii] = A[kk+nx*jj]; double *GG; d_zeros(&GG, nx, nw*N); //for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) for(ll=0; ll<nx; ll++) GG[ll+nx*jj+nx*nw*ii] = B[ll+nx*jj]; for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) for(kk=0; kk<nx; kk++) GG[jj+nw*kk+nx*nw*ii] = B[kk+nx*jj]; double *ff; d_zeros(&ff, nx, N); for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) ff[jj+nx*ii] = f[jj]; double *DD; d_zeros(&DD, ndN, nx); //for(jj=0; jj<nx; jj++) for(ll=0; ll<ndN; ll++) DD[ll+ndN*jj] = D[ll+ndN*jj]; for(jj=0; jj<nx; jj++) for(kk=0; kk<ndN; kk++) DD[jj+nx*kk] = D[kk+ndN*jj]; double *dd; d_zeros(&dd, ndN, 1); for(kk=0; kk<ndN; kk++) dd[kk] = d[kk]; double *RR; d_zeros(&RR, nw, nw*N); for(ii=0; ii<N; ii++) for(jj=0; jj<nw*nw; jj++) RR[jj+nw*nw*ii] = R[jj]; double *QQ; d_zeros(&QQ, nx, nx*N); for(ii=0; ii<N; ii++) { for(jj=0; jj<ny; jj++) for(kk=0; kk<ny; kk++) QQ[kk+nx*jj+nx*nx*ii] = Q[kk+ny*jj]; //for(jj=ny; jj<nx; jj++) QQ[jj+nx*jj+nx*nx*ii] = 1e-8; } double *Qf; d_zeros(&Qf, nx, nx); for(jj=0; jj<ny; jj++) for(kk=0; kk<ny; kk++) Qf[kk+nx*jj] = Q[kk+ny*jj]; double *rr; d_zeros(&rr, nw, N); for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) rr[jj+nw*ii] = r[jj]; double *qq; d_zeros(&qq, nx, N); for(ii=0; ii<N; ii++) for(jj=0; jj<ny; jj++) qq[jj+nx*ii] = q[jj]; double *yy_tmp; d_zeros_align(&yy_tmp, any, 1); for(ii=0; ii<N; ii++) { for(jj=0; jj<ny; jj++) yy_tmp[jj] = - q[jj]; dsymv_lib(ny, ny, hpQ[ii], cny, hy[ii], yy_tmp, -1); dgemv_t_lib(ny, nx, 
hpC[ii], cnx, yy_tmp, &qq[ii*nx], 0); } double *qf; d_zeros(&qf, nx, 1); // for(jj=0; jj<ny; jj++) qf[jj] = q[jj]; // if(ndN>0) // { for(jj=0; jj<ny; jj++) yy_tmp[jj] = - q[jj]; dsymv_lib(ny, ny, hpQ[N], cny, hy[N], yy_tmp, -1); dgemv_t_lib(ny, nx, hpC[N], cnx, yy_tmp, qf, 0); // } double *xx0; d_zeros(&xx0, nx, 1); double *LL0; d_zeros(&LL0, nx, nx); double *xxe; d_zeros(&xxe, nx, N+1); double *LLe; d_zeros(&LLe, nx, nx); double *ww; d_zeros(&ww, nw, N); double *llam; d_zeros(&llam, nx, N+1); double *work_high_level; d_zeros(&work_high_level, hpmpc_ric_mhe_if_dp_work_space(nx, nw, ny, ndN, N), 1); double *dummy0; struct timeval tv00, tv01; int error_code; printf("\nhigh-level\n"); // double precision gettimeofday(&tv00, NULL); // start for(ii=0; ii<nrep; ii++) { for(jj=0; jj<nx; jj++) xx0[jj] = x0[jj]; for(jj=0; jj<nx*nx; jj++) LL0[jj] = L0[jj]; //error_code = fortran_order_riccati_mhe_if( 'd', 2, nx, nw, 0, ndN, N, AA, GG, dummy, ff, DD, dd, RR, QQ, Qf, rr, qq, qf, dummy, xx0, LL0, xxe, LLe, ww, llam, work_high_level); error_code = c_order_riccati_mhe_if( 'd', 2, nx, nw, 0, ndN, N, AA, GG, dummy0, ff, DD, dd, RR, QQ, Qf, rr, qq, qf, dummy0, xx0, LL0, xxe, LLe, ww, llam, work_high_level); //if(error_code) // break; } gettimeofday(&tv01, NULL); // stop float time_mhe_if_high_level = (float) (tv01.tv_sec-tv00.tv_sec)/(nrep+0.0)+(tv01.tv_usec-tv00.tv_usec)/(nrep*1e6); printf("\nhigh-level interface for MHE_if\n\nerror_code: %d, time = %e\n\n", error_code, time_mhe_if_high_level); //d_print_mat(nx, N+1, xxe, nx); //d_print_mat(nw, N, ww, nw); free(AA); free(GG); free(ff); free(DD); free(dd); free(RR); free(QQ); free(Qf); free(rr); free(qq); free(qf); free(xx0); free(LL0); free(xxe); free(LLe); free(ww); free(llam); free(work_high_level); free(yy_tmp); //exit(1); #endif /************************************************ * call the solver ************************************************/ //d_print_mat(nx, nx, A, nx); //d_print_mat(nx, nw, B, nx); //d_ric_trf_mhe_test(nx, 
nw, ny, N, hpA, hpG, hpC, hpLp, hpQ, hpR, hpLe, work); d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, work); // estimation d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hw, hy, 0, hlam, work); #if 0 // print solution printf("\nx_e\n"); d_print_mat(nx, N+1, hxe[0], anx); #endif // smooth estimation d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hw, hy, 1, hlam, work); //d_print_pmat(nx, nx, bs, hpLp[N-1]+(nx+nw+pad)*bs, cnl); //d_print_pmat(nx, nx, bs, hpLp[N]+(nx+nw+pad)*bs, cnl); //d_print_pmat(nx, nx, bs, hpLe[N-1]+ncl*bs, cnf); //d_print_pmat(nx, nx, bs, hpLe[N]+ncl*bs, cnf); #if 1 printf("\nx_s\n"); //d_print_mat(nx, N+1, hxp[0], anx); d_print_mat(nw, N, hw[0], anw); d_print_mat(nx, N+1, hxe[0], anx); //d_print_mat(nx, N, hlam[0], anx); #endif // information filter - factorization //d_ric_trf_mhe_if(nx, nw, ndN, N, hpQA, hpRG, diag_R, hpALe, hpGLr, Ld, work3); d_ric_trf_mhe_if(nx, nw, ndN, N, hpQRAG, diag_R, hpLe2, hpLAG, Ld, work3); // information filter - solution //d_ric_trs_mhe_if(nx, nw, ndN, N, hpALe, hpGLr, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3); d_ric_trs_mhe_if(nx, nw, ndN, N, hpLe2, hpLAG, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3); //d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, hq, hr, hf, hxp, hxe, hw, hy, 1, hlam, work); //d_print_pmat(nx, nx, bs, hpALe[N-1], cnx2); //d_print_pmat(nx, nx, bs, hpALe[N], cnx2); //d_print_pmat(nx, nx, bs, hpALe[N-2]+cnx*bs, cnx2); //d_print_pmat(nx, nx, bs, hpALe[N-1]+cnx*bs, cnx2); //d_print_pmat(nx, nx, bs, hpALe[N]+cnx*bs, cnx2); //d_print_pmat(nx, nx, bs, hpRA[N], cnx); #if 1 printf("\nx_s_if\n"); //d_print_mat(nx, N+1, hxp[0], anx); d_print_mat(nw, N, hw[0], anw); d_print_mat(nx, N+1, hxe[0], anx); //d_print_mat(nx, N, hlam[0], anx); //exit(1); #endif //d_print_pmat(nw, nw, bs, hpQ[0], cnw); //d_print_pmat(nx, nw, bs, hpG[0], cnw); //d_print_mat(nw, 1, hq[0], 
nw); //d_print_mat(nw, 1, hw[0], nw); //d_print_mat(nx, 1, hlam[0], nx); //exit(3); #if 1 int nZ = nw+nx+1; int pnZ = (nw+nx+1+bs-1)/bs*bs; int cnZ = (nw+nx+1+ncl-1)/ncl*ncl; int cnL = cnZ>cnx+ncl ? cnZ : cnx+ncl; double *(hpRSQrq[N+1]); for(ii=0; ii<=N; ii++) { d_zeros_align(&hpRSQrq[ii], pnZ, cnZ); d_cvt_mat2pmat(nw, nw, R, nw, 0, hpRSQrq[ii], cnZ); d_cvt_mat2pmat(ny, ny, Q, ny, nw, hpRSQrq[ii]+nw/bs*bs*cnZ+nw%bs+nw*bs, cnZ); d_cvt_mat2pmat(1, nw, r, 1, nw+nx, hpRSQrq[ii]+(nw+nx)/bs*bs*cnZ+(nw+nx)%bs, cnZ); d_cvt_mat2pmat(1, nx, hqq[ii], 1, nw+nx, hpRSQrq[ii]+(nw+nx)/bs*bs*cnZ+(nw+nx)%bs+nw*bs, cnZ); //d_print_pmat(nZ, nZ, bs, hpRSQrq[ii], cnZ); } double *pP0; d_zeros_align(&pP0, pnx, cnx); d_cvt_mat2pmat(nx, nx, L0, nx, 0, pP0, cnx); //d_print_pmat(nx, nx, bs, pP0, cnx); dgead_lib(nx, nx, 1.0, 0, pP0, cnx, nw, hpRSQrq[0]+nw/bs*bs*cnZ+nw%bs+nw*bs, cnZ); //d_print_pmat(nZ, nZ, bs, hpRSQrq[0], cnZ); double *pBAbt; d_zeros_align(&pBAbt, pnZ, cnx); d_cvt_tran_mat2pmat(nx, nw, B, nx, 0, pBAbt, cnx); d_cvt_tran_mat2pmat(nx, nx, A, nx, nw, pBAbt+nw/bs*bs*cnx+nw%bs, cnx); d_cvt_mat2pmat(1, nx, f, 1, nw+nx, pBAbt+(nw+nx)/bs*bs*cnx+(nw+nx)%bs, cnx); //d_print_pmat(nZ, nx, bs, pBAbt, cnx); double *(hpBAbt[N]); for(ii=0; ii<N; ii++) { hpBAbt[ii] = pBAbt; } double *(hpLam[N+1]); for(ii=0; ii<=N; ii++) { d_zeros_align(&hpLam[ii], pnZ, cnL); } double *work_ric; d_zeros_align(&work_ric, pnZ, cnx); double *diag_ric; d_zeros_align(&diag_ric, pnZ, 1); double *hux_mat; d_zeros_align(&hux_mat, pnZ, N+1); double *(hux[N+1]); for(ii=0; ii<=N; ii++) { hux[ii] = hux_mat+ii*pnZ; } double **pdummy; d_back_ric_sv(N, nx, nw, hpBAbt, hpRSQrq, 0, pdummy, pdummy, 0, hux, hpLam, work_ric, diag_ric, 0, pdummy, 0, pdummy, 0, 0, 0, pdummy, pdummy, pdummy); d_print_mat(nw, N+1, hux_mat, pnZ); d_print_mat(nx, N+1, hux_mat+nw, pnZ); exit(1); #endif // compute residuals double *p0; d_zeros_align(&p0, anx, 1); double *x_temp; d_zeros_align(&x_temp, anx, 1); dtrmv_u_t_lib(nx, pL0_inv, cnx, x0, x_temp, 
0); dtrmv_u_n_lib(nx, pL0_inv, cnx, x_temp, p0, 0); d_res_mhe_if(nx, nw, ndN, N, hpQA, hpRG, pL0_inv, hqq, hrr, hff, p0, hxe, hw, hlam, hq_res, hr_res, hf_res, work4); // printf("\nprint residuals\n\n"); // d_print_mat(nx, N+1, hq_res[0], anx); // d_print_mat(nw, N, hr_res[0], anw); // d_print_mat(nx, N, hf_res[0], anx); // d_print_mat(ndN, 1, hf_res[0]+N*anx, anx); //return 0; //exit(1); if(0 && PRINTRES) { // print solution printf("\nx_p\n"); d_print_mat(nx, N+1, hxp[0], anx); printf("\nx_s\n"); d_print_mat(nx, N+1, hxe[0], anx); printf("\nw\n"); d_print_mat(nw, N+1, hw[0], anw); //printf("\nL_p\n"); //d_print_pmat(nx, nx, bs, hpLp[0]+(nx+nw+pad)*bs, cnl); //d_print_mat(1, nx, hdLp[0], 1); //d_print_pmat(nx, nx, bs, hpLp[1]+(nx+nw+pad)*bs, cnl); //d_print_mat(1, nx, hdLp[1], 1); //d_print_pmat(nx, nx, bs, hpLp[2]+(nx+nw+pad)*bs, cnl); //d_print_mat(1, nx, hdLp[2], 1); //d_print_pmat(nx, nx, bs, hpLp[N]+(nx+nw+pad)*bs, cnl); //d_print_mat(1, nx, hdLp[N], 1); //printf("\nL_p\n"); //d_print_pmat(nz, nz, bs, hpLp2[0]+(nx+pad2)*bs, cnl2); //d_print_pmat(nz, nz, bs, hpLp2[1]+(nx+pad2)*bs, cnl2); //d_print_pmat(nz, nz, bs, hpLp2[2]+(nx+pad2)*bs, cnl2); //printf("\nL_e\n"); //d_print_pmat(nz, nz, bs, hpLe[0], cnf); //d_print_pmat(nz, nz, bs, hpLe[1], cnf); //d_print_pmat(nz, nz, bs, hpLe[2], cnf); //d_print_pmat(nx, nx, bs, hpA[0], cnx); } // timing struct timeval tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8; // double precision gettimeofday(&tv0, NULL); // start // factorize for(rep=0; rep<nrep; rep++) { //d_ric_trf_mhe_test(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hpQ, hpR, hpLe, work); d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, work); } gettimeofday(&tv1, NULL); // start // solve for(rep=0; rep<nrep; rep++) { d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hw, hy, 1, hlam, work); } gettimeofday(&tv2, NULL); // start // factorize for(rep=0; rep<nrep; rep++) { //d_print_pmat(nx, nx, bs, 
hpLe[N]+(ncl)*bs, cnf); //d_print_pmat(nx, nx, bs, hpLp[N]+(nx+nw+pad)*bs, cnl); //d_ric_trf_mhe_test(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hpQ, hpR, hpLe, work); d_ric_trf_mhe_end(nx, nw, ny, N, hpCA, hpG, hpC, hpLp2, hpR, hpQ, hpLe, work2); } gettimeofday(&tv3, NULL); // start // solve for(rep=0; rep<nrep; rep++) { d_ric_trs_mhe_end(nx, nw, ny, N, hpA, hpG, hpC, hpLp2, hpR, hpQ, hpLe, hr, hq, hf, hxp, hxe, hy, work2); } gettimeofday(&tv4, NULL); // start // factorize information filter for(rep=0; rep<nrep; rep++) { //d_ric_trf_mhe_if(nx, nw, ndN, N, hpQA, hpRG, diag_R, hpALe, hpGLr, Ld, work3); d_ric_trf_mhe_if(nx, nw, ndN, N, hpQRAG, diag_R, hpLe2, hpLAG, Ld, work3); } gettimeofday(&tv5, NULL); // start // factorize information filter for(rep=0; rep<nrep; rep++) { //d_ric_trs_mhe_if(nx, nw, ndN, N, hpALe, hpGLr, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3); d_ric_trs_mhe_if(nx, nw, ndN, N, hpLe2, hpLAG, Ld, hqq, hrr, hff, hxp, hxe, hw, hlam, work3); } gettimeofday(&tv6, NULL); // start // factorize information filter for(rep=0; rep<nrep; rep++) { #if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_BLIS) || defined(REF_BLAS_NETLIB) //d_ric_trf_mhe_if_blas( nx, nw, ndN, N, hA, hG, hQ, hR, hAGU, hUp, hUe, hUr); d_ric_trf_mhe_if_blas( nx, nw, ndN, N, hA, hG, hQ, hR, hAGU, hUp, hUe, hUr, Ud); #endif } gettimeofday(&tv7, NULL); // start // solution information filter for(rep=0; rep<nrep; rep++) { #if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_BLIS) || defined(REF_BLAS_NETLIB) d_ric_trs_mhe_if_blas( nx, nw, ndN, N, hAGU, hUp, hUe, hUr, Ud, hqq, hrr, hff, hxp, hxe, hw, hlam, work_ref); #endif } gettimeofday(&tv8, NULL); // start float Gflops_max = flops_max * GHz_max; float time_trf = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6); float time_trs = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6); float time_trf_end = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6); float 
time_trs_end = (float) (tv4.tv_sec-tv3.tv_sec)/(nrep+0.0)+(tv4.tv_usec-tv3.tv_usec)/(nrep*1e6); float time_trf_if = (float) (tv5.tv_sec-tv4.tv_sec)/(nrep+0.0)+(tv5.tv_usec-tv4.tv_usec)/(nrep*1e6); float time_trs_if = (float) (tv6.tv_sec-tv5.tv_sec)/(nrep+0.0)+(tv6.tv_usec-tv5.tv_usec)/(nrep*1e6); float time_trf_if_blas = (float) (tv7.tv_sec-tv6.tv_sec)/(nrep+0.0)+(tv7.tv_usec-tv6.tv_usec)/(nrep*1e6); float time_trs_if_blas = (float) (tv8.tv_sec-tv7.tv_sec)/(nrep+0.0)+(tv8.tv_usec-tv7.tv_usec)/(nrep*1e6); float flop_trf_if = N*(10.0/3.0*nx*nx*nx+nx*nx*nw)+2.0/3.0*nx*nx*nx+ndN*nx*nx+ndN*ndN*nx+1.0/3.0*ndN*ndN*ndN; if(diag_R==0) flop_trf_if += N*(nx*nw*nw+1.0/3.0*nw*nw*nw); else flop_trf_if += N*(nx*nw+1.0/2.0*nw*nw); float Gflops_trf_if = flop_trf_if*1e-9/time_trf_if; float Gflops_trf_if_blas = flop_trf_if*1e-9/time_trf_if_blas; if(ll==0) { printf("\nnx\tnw\tny\tN\ttrf time\ttrs time\ttrf_e time\ttrs_e time\ttrf_if time\ttrf_if Gflops\ttrf_if percent\ttrs_if time\ttrf_if BLAS\tGflops\t\tpercent\t\ttrs_if BLAS\n\n"); // fprintf(f, "\nnx\tnu\tN\tsv time\t\tsv Gflops\tsv %%\t\ttrs time\ttrs Gflops\ttrs %%\n\n"); } printf("%d\t%d\t%d\t%d\t%e\t%e\t%e\t%e\t%e\t%f\t%f\t%e\t%e\t%f\t%f\t%e\n", nx, nw, ny, N, time_trf, time_trs, time_trf_end, time_trs_end, time_trf_if, Gflops_trf_if, 100*Gflops_trf_if/Gflops_max, time_trs_if, time_trf_if_blas, Gflops_trf_if_blas, 100*Gflops_trf_if_blas/Gflops_max, time_trs_if_blas); #if 0 return 0; // moving horizon test // window size N = 20; double *(hhxe[N+1]); double *(hhxp[N+1]); double *(hhw[N]); double *(hhy[N+1]); double *(hhlam[N]); double *p_hhxe; d_zeros_align(&p_hhxe, anx, N+1); double *p_hhxp; d_zeros_align(&p_hhxp, anx, N+1); double *p_hhw; d_zeros_align(&p_hhw, anw, N); double *p_hhlam; d_zeros_align(&p_hhlam, anx, N); // shift measurements and initial prediction for(ii=0; ii<N; ii++) { hhxe[ii] = p_hhxe+ii*anx; //d_zeros_align(&hxe[jj], anx, 1); hhxp[ii] = p_hhxp+ii*anx; //d_zeros_align(&hxp[jj], anx, 1); hhw[ii] = 
p_hhw+ii*anw; //d_zeros_align(&hw[jj], anw, 1); hhy[ii] = hy[ii]; //d_zeros_align(&hy[jj], any, 1); hhlam[ii] = p_hhlam+ii*anx; //d_zeros_align(&hlam[jj], anx, 1); } hhxe[N] = p_hhxe+N*anx; //d_zeros_align(&hxe[jj], anx, 1); hhxp[N] = p_hhxp+N*anx; //d_zeros_align(&hxp[jj], anx, 1); hhy[N] = hy[N]; //d_zeros_align(&hy[jj], any, 1); // shift initial prediction covariance //for(ii=0; ii<pnx*cnl; ii++) // hpLp[0][ii] = hpLp[1][ii]; d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, work); d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, hq, hr, hf, hhxp, hhxe, hhw, hhy, 1, hhlam, work); // zero data for(ii=0; ii<Ns*anx; ii++) hxe[0][ii] = 0.0; for(ii=anx; ii<Ns*anx; ii++) hxp[0][ii] = 0.0; for(ii=0; ii<(Ns-1)*anw; ii++) hw[0][ii] = 0.0; for(ii=0; ii<(Ns-1)*anx; ii++) hlam[0][ii] = 0.0; // save data for(ii=0; ii<(N+1); ii++) for(jj=0; jj<nx; jj++) hxe[ii][jj] = hhxe[ii][jj]; for(ii=0; ii<(N+1); ii++) for(jj=0; jj<nx; jj++) hxp[ii][jj] = hhxp[ii][jj]; for(ii=0; ii<N; ii++) for(jj=0; jj<nw; jj++) hw[ii][jj] = hhw[ii][jj]; //d_print_mat(nw, N, hw[0], anw); for(ii=0; ii<N; ii++) for(jj=0; jj<nx; jj++) hlam[ii][jj] = hhlam[ii][jj]; for(jj=1; jj<Ns-N; jj++) { //break; // shift measurements and initial prediction for(ii=0; ii<=N; ii++) { hhy[ii] = hy[ii+jj]; } // shift initial prediction and relative covariance for(ii=0; ii<nx; ii++) hhxp[0][ii] = hhxp[1][ii]; for(ii=0; ii<pnx*cnl; ii++) hpLp[0][ii] = hpLp[1][ii]; //d_print_mat(nx, N+1, hhxp[0], anx); //d_print_pmat(nx, nx, bs, hpLp[1]+(nx+nw+pad)*bs, cnl); //d_print_pmat(nz, nz, bs, hpLe[1], cnf); //d_print_pmat(nx, nx, bs, hpLp[2]+(nx+nw+pad)*bs, cnl); //d_print_pmat(nz, nz, bs, hpLe[2], cnf); d_ric_trf_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, work); d_ric_trs_mhe(nx, nw, ny, N, hpA, hpG, hpC, hpLp, hdLp, hpQ, hpR, hpLe, hq, hr, hf, hhxp, hhxe, hhw, hhy, 1, hhlam, work); //d_print_mat(nx, N+1, hhxp[0], anx); //d_print_pmat(nx, nx, bs, hpLp[0]+(nx+nw+pad)*bs, 
cnl); //d_print_pmat(nz, nz, bs, hpLe[0], cnf); //d_print_pmat(nx, nx, bs, hpLp[1]+(nx+nw+pad)*bs, cnl); //d_print_pmat(nz, nz, bs, hpLe[1], cnf); // save data for(ii=0; ii<nx; ii++) hxe[N+jj][ii] = hhxe[N][ii]; for(ii=0; ii<nx; ii++) hxp[N+jj][ii] = hhxp[N][ii]; if(jj<Ns-N-1) for(ii=0; ii<nw; ii++) hw[N+jj][ii] = hhw[N-1][ii]; if(jj<Ns-N-1) for(ii=0; ii<nx; ii++) hlam[N+jj][ii] = hhlam[N-1][ii]; //break; } // print solution if(PRINTRES) { printf("\nx_p\n"); d_print_mat(nx, Ns, hxp[0], anx); printf("\nx_e\n"); d_print_mat(nx, Ns, hxe[0], anx); //printf("\nL_e\n"); //d_print_pmat(nx, nx, bs, hpLp[Ns-1]+(nx+nw+pad)*bs, cnl); } #endif /************************************************ * return ************************************************/ free(A); free(B); free(C); free(b); free(D); free(d); free(x0); free(Q); free(Qx); free(R); free(q); free(r); free(f); free(L0); free(pA); free(pG); free(pC); free(pQ); free(pR); free(pQA); free(pRG); free(work); free(work2); free(work3); free(work4); free(p_hxe); free(p_hxp); free(p_hy); free(p_hw); free(p_hlam); //free(p_hhxe); //free(p_hhxp); //free(p_hhw); //free(p_hhlam); free(x_temp); free(y_temp); free(p0); free(p_hr_res); free(p_hq_res); free(p_hf_res); free(pL0_inv); free(hpLp[0]); free(hdLp[0]); free(hpLe[0]); for(jj=0; jj<N; jj++) { free(hpLp[jj+1]); free(hdLp[jj+1]); free(hpLe[jj+1]); free(hpGLr[jj]); free(hpALe[jj]); free(hpLp2[jj]); } free(hpALe[N]); free(pQRAG); free(pQD); for(ii=0; ii<N; ii++) { free(hpLAG[ii]); free(hpLe2[ii]); } free(hpLAG[N]); free(hpLe2[N]); for(ii=0; ii<N; ii++) { free(hAGU[ii]); free(hUp[ii]); free(hUe[ii]); free(hUr[ii]); } free(hUp[N]); free(hUe[N]); free(Ud); free(work_ref); } // increase size fprintf(f, "];\n"); fclose(f); return 0; }
void masterFunc (int argc, char ** argv) { /**************************************************************** * Step 1: Setup and Initialization * Load conf, init model, allocate mem, init params, init solver * Load cross-validation data ****************************************************************/ // Step 1.1: Load configuration if (argc < 2) { printf("argc %d\n", argc); exit(1); } string dirPath = argv[1]; boost::property_tree::ptree *confReader = new boost::property_tree::ptree(); boost::property_tree::ini_parser::read_ini(dirPath+"mpi.conf", *confReader); string section = "Master."; // int validBatchSize = confReader->get<int>(section + "validation_batch_size"); int nSendMax = confReader->get<int>(section + "max_iteration_number"); // Step 1.2 Initialize model section = "LSTM."; openblas_set_num_threads(1); int max_openmp_threads = confReader->get<int>(section + "max_threads"); omp_set_num_threads(max_openmp_threads); omp_set_nested(0); printf("MASTER openmp threads: max threads %d, nested %d\n", omp_get_max_threads(), omp_get_nested()); RecurrentNN *rnn = new RNNLSTM(confReader, section); int paramSize = rnn->m_paramSize; printf("paramSize: %d\n", paramSize); // Step 1.3: Allocate master memory float *params = new float[paramSize]; float *grad = new float[paramSize]; // Step 1.4: Initialize params rnn->initParams(params); // Step 1.5: Initialize SGD Solver section = "SGD."; sgdBase *sgdSolver = initSgdSolver(confReader, section, paramSize); printf("MASTER: finish step 1\n"); // Step 1.6: Load cross-validation data // section = "ValidationData."; // DataFactory *dataset = initDataFactory(confReader, section); // int numSample = dataset->getNumberOfData(); // int dataSize = dataset->getDataSize(); // int labelSize = dataset->getLabelSize(); // float *data = new float[validBatchSize * dataSize]; // float *label = new float[validBatchSize * labelSize]; /**************************************************************** * Step 2: Seed the slaves * (1) Broadcast 
paramSize to all slaves * (2) Send the same initial params with WORKTAG to all slaves ****************************************************************/ int nProc; MPI_Comm_size(MPI_COMM_WORLD, &nProc); int nSlave = nProc - 1; MPI_Bcast(¶mSize, 1, MPI_INT, ROOT, MPI_COMM_WORLD); int nSend = 0; int nRecv = 0; for (int rank = 1; rank < nProc; ++rank) { MPI_Send(params, paramSize, MPI_FLOAT, rank, WORKTAG, MPI_COMM_WORLD); nSend++; } printf("MASTER: finish step 2\n"); /**************************************************************** * Step 3: Paralleled training * Receive mini-batch grad from *ANY* slave * Update params based received grad * Re-send params to slave to process next mini-batch ****************************************************************/ MPI_Status status; // TEMP while loop condition while (nSend < nSendMax) { MPI_Recv(grad, paramSize, MPI_FLOAT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); nRecv++; sgdSolver->updateParams(params, grad, status.MPI_SOURCE); // Send updated params to corresponding slave MPI_Send(params, paramSize, MPI_FLOAT, status.MPI_SOURCE, WORKTAG, MPI_COMM_WORLD); nSend++; } printf("MASTER: finish step 3\n"); /**************************************************************** * Step 4: Stop the slaves ****************************************************************/ // Step 4.1: Receive all dispatched but irreceived grad result while (nRecv < nSend) { MPI_Recv(grad, paramSize, MPI_FLOAT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); sgdSolver->updateParams(params, grad, status.MPI_SOURCE); nRecv++; } // Step 4.2: Send STOPTAG to all slaves for (int rank = 1; rank < nProc; ++rank) { MPI_Send(&rank, 1, MPI_INT, rank, STOPTAG, MPI_COMM_WORLD); } printf("MASTER: finish step 4\n"); /**************************************************************** * Step 5: Save trained parameters and clear things ****************************************************************/ section = "Master."; string saveFilename = 
confReader->get<string>(section + "save_filename"); ofstream savefile (saveFilename.c_str(), ios::out|ios::binary); if (savefile.is_open()) { savefile.write ((char *)params, sizeof(float) * paramSize); savefile.close(); } else { printf("Failed to open savefile\n"); exit(1); } delete [] params; delete [] grad; delete confReader; delete sgdSolver; delete rnn; }
int main() { #if defined(REF_BLAS_OPENBLAS) openblas_set_num_threads(1); #endif #if defined(REF_BLAS_BLIS) omp_set_num_threads(1); #endif printf("\n"); printf("\n"); printf("\n"); printf(" HPMPC -- Library for High-Performance implementation of solvers for MPC.\n"); printf(" Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved.\n"); printf("\n"); printf(" HPMPC is distributed in the hope that it will be useful,\n"); printf(" but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); printf(" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"); printf(" See the GNU Lesser General Public License for more details.\n"); printf("\n"); printf("\n"); printf("\n"); printf("BLAS performance test - double precision\n"); printf("\n"); // maximum frequency of the processor const float GHz_max = GHZ_MAX; printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max); printf("\n"); // maximum flops per cycle, double precision #if defined(TARGET_X64_AVX2) const float flops_max = 16; printf("Testing BLAS version for AVX2 & FMA3 instruction sets, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_X64_AVX) const float flops_max = 8; printf("Testing BLAS version for AVX instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3) const float flops_max = 4; printf("Testing BLAS version for SSE3 instruction set, 64 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_CORTEX_A15) const float flops_max = 2; printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A15: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_CORTEX_A9) const float flops_max = 1; printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A9: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif 
defined(TARGET_CORTEX_A7) const float flops_max = 0.5; printf("Testing solvers for ARMv7a VFPv3 instruction set, oprimized for Cortex A7: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_X86_ATOM) const float flops_max = 1; printf("Testing BLAS version for SSE3 instruction set, 32 bit, optimized for Intel Atom: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_POWERPC_G2) const float flops_max = 1; printf("Testing BLAS version for POWERPC instruction set, 32 bit: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_C99_4X4) const float flops_max = 2; printf("Testing reference BLAS version, 4x4 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_C99_4X4_PREFETCH) const float flops_max = 2; printf("Testing reference BLAS version, 4x4 kernel with register prefetch: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #elif defined(TARGET_C99_2X2) const float flops_max = 2; printf("Testing reference BLAS version, 2x2 kernel: theoretical peak %5.1f Gflops\n", flops_max*GHz_max); #endif FILE *f; f = fopen("./test_problems/results/test_blas.m", "w"); // a #if defined(TARGET_X64_AVX2) fprintf(f, "C = 'd_x64_avx2';\n"); fprintf(f, "\n"); #elif defined(TARGET_X64_AVX) fprintf(f, "C = 'd_x64_avx';\n"); fprintf(f, "\n"); #elif defined(TARGET_X64_SSE3) || defined(TARGET_AMD_SSE3) fprintf(f, "C = 'd_x64_sse3';\n"); fprintf(f, "\n"); #elif defined(TARGET_CORTEX_A9) fprintf(f, "C = 'd_ARM_cortex_A9';\n"); fprintf(f, "\n"); #elif defined(TARGET_CORTEX_A7) fprintf(f, "C = 'd_ARM_cortex_A7';\n"); fprintf(f, "\n"); #elif defined(TARGET_CORTEX_A15) fprintf(f, "C = 'd_ARM_cortex_A15';\n"); fprintf(f, "\n"); #elif defined(TARGET_X86_ATOM) fprintf(f, "C = 'd_x86_atom';\n"); fprintf(f, "\n"); #elif defined(TARGET_POWERPC_G2) fprintf(f, "C = 'd_PowerPC_G2';\n"); fprintf(f, "\n"); #elif defined(TARGET_C99_4X4) fprintf(f, "C = 'd_c99_4x4';\n"); fprintf(f, "\n"); #elif 
defined(TARGET_C99_4X4_PREFETCH) fprintf(f, "C = 'd_c99_4x4';\n"); fprintf(f, "\n"); #elif defined(TARGET_C99_2X2) fprintf(f, "C = 'd_c99_2x2';\n"); fprintf(f, "\n"); #endif fprintf(f, "A = [%f %f];\n", GHz_max, flops_max); fprintf(f, "\n"); fprintf(f, "B = [\n"); int i, j, rep, ll; const int bsd = D_MR; //d_get_mr(); /* int info = 0;*/ printf("\nn\t kernel_dgemm\t dgemm\t\t dsyrk_dpotrf\t dtrmm\t\t dtrtr\t\t dgemv_n\t dgemv_t\t dtrmv_n\t dtrmv_t\t dtrsv_n\t dtrsv_t\t dsymv\t\t dgemv_nt\t\t dsyrk+dpotrf\t BLAS dgemm\t BLAS dgemv_n\t BLAS dgemv_t\n"); printf("\nn\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\t Gflops\t %%\n\n"); #if 1 int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700}; int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 4, 4, 4, 4, 4}; for(ll=0; ll<75; ll++) // for(ll=0; ll<115; ll++) // for(ll=0; ll<120; ll++) { int n = nn[ll]; int nrep = nnrep[ll]; #else int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; for(ll=0; ll<24; ll++) { int n = nn[ll]; int nrep = 40000; //nnrep[ll]; #endif #if defined(REF_BLAS_BLIS) f77_int n77 = n; #endif double *A; d_zeros(&A, n, n); double *B; d_zeros(&B, n, n); double *C; d_zeros(&C, n, n); double *M; d_zeros(&M, n, n); char c_n = 'n'; char c_t = 't'; int i_1 = 1; #if defined(REF_BLAS_BLIS) f77_int i77_1 = i_1; #endif double d_1 = 1; double d_0 = 0; for(i=0; i<n*n; i++) A[i] = i; for(i=0; i<n; i++) B[i*(n+1)] = 1; for(i=0; i<n*n; i++) M[i] = 1; int pnd = ((n+bsd-1)/bsd)*bsd; int cnd = ((n+D_NCL-1)/D_NCL)*D_NCL; int cnd2 = 2*((n+D_NCL-1)/D_NCL)*D_NCL; int pad = (D_NCL-n%D_NCL)%D_NCL; double *pA; d_zeros_align(&pA, pnd, cnd); double *pB; d_zeros_align(&pB, pnd, cnd); double *pC; d_zeros_align(&pC, pnd, cnd); double *pD; d_zeros_align(&pD, pnd, cnd); double *pE; d_zeros_align(&pE, pnd, cnd2); double *pF; d_zeros_align(&pF, 2*pnd, cnd); double *pL; d_zeros_align(&pL, pnd, cnd); double *pM; d_zeros_align(&pM, pnd, cnd); double *x; d_zeros_align(&x, pnd, 1); double *y; d_zeros_align(&y, pnd, 1); double *x2; d_zeros_align(&x2, pnd, 1); double *y2; d_zeros_align(&y2, pnd, 1); double *diag; d_zeros_align(&diag, pnd, 1); d_cvt_mat2pmat(n, n, A, n, 0, pA, cnd); d_cvt_mat2pmat(n, n, B, n, 0, pB, cnd); d_cvt_mat2pmat(n, n, B, n, 0, pD, cnd); d_cvt_mat2pmat(n, n, A, n, 0, pE, cnd2); d_cvt_mat2pmat(n, n, M, n, 0, pM, cnd); /* d_cvt_mat2pmat(n, n, B, n, 0, pE+n*bsd, pnd);*/ /* d_print_pmat(n, 2*n, bsd, pE, 2*pnd);*/ /* exit(2);*/ for(i=0; i<pnd*cnd; i++) pC[i] = -1; for(i=0; i<pnd; i++) x[i] = 1; for(i=0; i<pnd; i++) x2[i] = 1; double *dummy; /* timing */ struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16; /* warm up */ for(rep=0; rep<nrep; rep++) { 
dgemm_nt_lib(n, n, n, pA, cnd, pB, cnd, 1, pC, cnd, pC, cnd, 1, 1); } gettimeofday(&tvm1, NULL); // start for(rep=0; rep<nrep; rep++) { //dgemm_kernel_nt_lib(n, n, n, pA, cnd, pB, cnd, pC, cnd, pC, cnd, 0, 0, 0); dgemm_nn_lib(n, n, n, pA, cnd, pB, cnd, 0, pC, cnd, pC, cnd, 0, 0); } gettimeofday(&tv0, NULL); // start for(rep=0; rep<nrep; rep++) { dgemm_nt_lib(n, n, n, pA, cnd, pB, cnd, 0, pC, cnd, pC, cnd, 0, 0); } gettimeofday(&tv1, NULL); // stop for(rep=0; rep<nrep; rep++) { //dsyrk_dpotrf_lib(n, n, n, pA, cnd, 1, pD, cnd, pC, cnd, diag, 0); dsyrk_dpotrf_lib_new(n, n, n, pA, cnd, pA, cnd, 1, pD, cnd, pC, cnd, diag); } gettimeofday(&tv2, NULL); // stop for(rep=0; rep<nrep; rep++) { dtrmm_nt_u_lib(n, n, pA, cnd, pB, cnd, pC, cnd); } gettimeofday(&tv3, NULL); // stop for(rep=0; rep<nrep; rep++) { dtrtr_l_lib(n, 0, pA, cnd, pC, cnd); // triangualr matrix transpose //dgetr_lib(n, n, 0, pA, cnd, 0, pC, cnd); // general matrix transpose } gettimeofday(&tv4, NULL); // stop for(rep=0; rep<nrep; rep++) { dgemv_n_lib(n, n, pA, cnd, x, 0, y, y); } gettimeofday(&tv5, NULL); // stop for(rep=0; rep<nrep; rep++) { dgemv_t_lib(n, n, pA, cnd, x, 0, y, y); } gettimeofday(&tv6, NULL); // stop for(rep=0; rep<nrep; rep++) { dtrmv_u_n_lib(n, pA, cnd, x, 0, y); } gettimeofday(&tv7, NULL); // stop for(rep=0; rep<nrep; rep++) { dtrmv_u_t_lib(n, pA, cnd, x, 0, y); } gettimeofday(&tv8, NULL); // stop for(rep=0; rep<nrep; rep++) { dtrsv_n_lib(2*n, n, 1, pF, cnd, x); } gettimeofday(&tv9, NULL); // stop for(rep=0; rep<nrep; rep++) { dtrsv_t_lib(2*n, n, 1, pF, cnd, x); } gettimeofday(&tv10, NULL); // stop for(rep=0; rep<nrep; rep++) { dsymv_lib(n, n, pA, cnd, x, 0, y, y); } gettimeofday(&tv11, NULL); // stop for(rep=0; rep<nrep; rep++) { dgemv_nt_lib(n, n, pA, cnd, x, x2, 0, y, y2, y, y2); } gettimeofday(&tv12, NULL); // stop for(rep=0; rep<nrep; rep++) { dsyrk_nt_lib(n, n, n, pE, cnd2, pE, cnd2, 1, pD, cnd, pE+(n+pad)*bsd, cnd2); //dpotrf_lib(n, n, pE+(n+pad)*bsd, cnd2, pE+(n+pad)*bsd, cnd2, 
diag); dpotrf_lib_new(n, n, pE+(n+pad)*bsd, cnd2, pE+(n+pad)*bsd, cnd2, diag); //d_print_pmat(pnd, cnd2, bsd, pE, cnd2); //exit(1); //break; } gettimeofday(&tv13, NULL); // stop for(rep=0; rep<nrep; rep++) { #if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) dgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n); #endif #if defined(REF_BLAS_BLIS) dgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77); #endif } gettimeofday(&tv14, NULL); // stop for(rep=0; rep<nrep; rep++) { #if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) dgemv_(&c_n, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y, &i_1); #endif #if defined(REF_BLAS_BLIS) dgemv_(&c_n, &n77, &n77, &d_1, A, &n77, x2, &i77_1, &d_0, y, &i77_1); #endif } gettimeofday(&tv15, NULL); // stop for(rep=0; rep<nrep; rep++) { #if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) dgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y, &i_1); #endif #if defined(REF_BLAS_BLIS) dgemv_(&c_t, &n77, &n77, &d_1, A, &n77, x2, &i77_1, &d_0, y, &i77_1); #endif } gettimeofday(&tv16, NULL); // stop float Gflops_max = flops_max * GHz_max; float time_dgemm_kernel = (float) (tv0.tv_sec-tvm1.tv_sec)/(nrep+0.0)+(tv0.tv_usec-tvm1.tv_usec)/(nrep*1e6); float flop_dgemm_kernel = 2.0*n*n*n; float Gflops_dgemm_kernel = 1e-9*flop_dgemm_kernel/time_dgemm_kernel; float time_dgemm = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6); float flop_dgemm = 2.0*n*n*n; float Gflops_dgemm = 1e-9*flop_dgemm/time_dgemm; float time_dsyrk_dpotrf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6); float flop_dsyrk_dpotrf = 1.0*n*n*n + 1.0/3.0*n*n*n; float Gflops_dsyrk_dpotrf = 1e-9*flop_dsyrk_dpotrf/time_dsyrk_dpotrf; float time_dtrmm = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6); float flop_dtrmm = 1.0*n*n*n; float Gflops_dtrmm = 1e-9*flop_dtrmm/time_dtrmm; float time_dtrtr = (float) 
(tv4.tv_sec-tv3.tv_sec)/(nrep+0.0)+(tv4.tv_usec-tv3.tv_usec)/(nrep*1e6); float flop_dtrtr = 0.5*n*n; float Gflops_dtrtr = 1e-9*flop_dtrtr/time_dtrtr; float time_dgemv_n = (float) (tv5.tv_sec-tv4.tv_sec)/(nrep+0.0)+(tv5.tv_usec-tv4.tv_usec)/(nrep*1e6); float flop_dgemv_n = 2.0*n*n; float Gflops_dgemv_n = 1e-9*flop_dgemv_n/time_dgemv_n; float time_dgemv_t = (float) (tv6.tv_sec-tv5.tv_sec)/(nrep+0.0)+(tv6.tv_usec-tv5.tv_usec)/(nrep*1e6); float flop_dgemv_t = 2.0*n*n; float Gflops_dgemv_t = 1e-9*flop_dgemv_t/time_dgemv_t; float time_dtrmv_n = (float) (tv7.tv_sec-tv6.tv_sec)/(nrep+0.0)+(tv7.tv_usec-tv6.tv_usec)/(nrep*1e6); float flop_dtrmv_n = 1.0*n*n; float Gflops_dtrmv_n = 1e-9*flop_dtrmv_n/time_dtrmv_n; float time_dtrmv_t = (float) (tv8.tv_sec-tv7.tv_sec)/(nrep+0.0)+(tv8.tv_usec-tv7.tv_usec)/(nrep*1e6); float flop_dtrmv_t = 1.0*n*n; float Gflops_dtrmv_t = 1e-9*flop_dtrmv_t/time_dtrmv_t; float time_dtrsv_n = (float) (tv9.tv_sec-tv8.tv_sec)/(nrep+0.0)+(tv9.tv_usec-tv8.tv_usec)/(nrep*1e6); float flop_dtrsv_n = 3.0*n*n; float Gflops_dtrsv_n = 1e-9*flop_dtrsv_n/time_dtrsv_n; float time_dtrsv_t = (float) (tv10.tv_sec-tv9.tv_sec)/(nrep+0.0)+(tv10.tv_usec-tv9.tv_usec)/(nrep*1e6); float flop_dtrsv_t = 3.0*n*n; float Gflops_dtrsv_t = 1e-9*flop_dtrsv_t/time_dtrsv_t; float time_dsymv = (float) (tv11.tv_sec-tv10.tv_sec)/(nrep+0.0)+(tv11.tv_usec-tv10.tv_usec)/(nrep*1e6); float flop_dsymv = 2.0*n*n; float Gflops_dsymv = 1e-9*flop_dsymv/time_dsymv; float time_dgemv_nt = (float) (tv12.tv_sec-tv11.tv_sec)/(nrep+0.0)+(tv12.tv_usec-tv11.tv_usec)/(nrep*1e6); float flop_dgemv_nt = 4.0*n*n; float Gflops_dgemv_nt = 1e-9*flop_dgemv_nt/time_dgemv_nt; float time_dsyrk_dpotrf2 = (float) (tv13.tv_sec-tv12.tv_sec)/(nrep+0.0)+(tv13.tv_usec-tv12.tv_usec)/(nrep*1e6); float flop_dsyrk_dpotrf2 = 1.0*n*n*n + 1.0/3.0*n*n*n; float Gflops_dsyrk_dpotrf2 = 1e-9*flop_dsyrk_dpotrf2/time_dsyrk_dpotrf2; float time_dgemm_blas = (float) (tv14.tv_sec-tv13.tv_sec)/(nrep+0.0)+(tv14.tv_usec-tv13.tv_usec)/(nrep*1e6); 
float flop_dgemm_blas = 2.0*n*n*n; float Gflops_dgemm_blas = 1e-9*flop_dgemm_blas/time_dgemm_blas; float time_dgemv_n_blas = (float) (tv15.tv_sec-tv14.tv_sec)/(nrep+0.0)+(tv15.tv_usec-tv14.tv_usec)/(nrep*1e6); float flop_dgemv_n_blas = 2.0*n*n; float Gflops_dgemv_n_blas = 1e-9*flop_dgemv_n_blas/time_dgemv_n_blas; float time_dgemv_t_blas = (float) (tv16.tv_sec-tv15.tv_sec)/(nrep+0.0)+(tv16.tv_usec-tv15.tv_usec)/(nrep*1e6); float flop_dgemv_t_blas = 2.0*n*n; float Gflops_dgemv_t_blas = 1e-9*flop_dgemv_t_blas/time_dgemv_t_blas; printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_dgemm_kernel, 100.0*Gflops_dgemm_kernel/Gflops_max, Gflops_dgemm, 100.0*Gflops_dgemm/Gflops_max, Gflops_dsyrk_dpotrf, 100.0*Gflops_dsyrk_dpotrf/Gflops_max, Gflops_dtrmm, 100.0*Gflops_dtrmm/Gflops_max, Gflops_dtrtr, 100.0*Gflops_dtrtr/Gflops_max, Gflops_dgemv_n, 100.0*Gflops_dgemv_n/Gflops_max, Gflops_dgemv_t, 100.0*Gflops_dgemv_t/Gflops_max, Gflops_dtrmv_n, 100.0*Gflops_dtrmv_n/Gflops_max, Gflops_dtrmv_t, 100.0*Gflops_dtrmv_t/Gflops_max, Gflops_dtrsv_n, 100.0*Gflops_dtrsv_n/Gflops_max, Gflops_dtrsv_t, 100.0*Gflops_dtrsv_t/Gflops_max, Gflops_dsymv, 100.0*Gflops_dsymv/Gflops_max, Gflops_dgemv_nt, 100.0*Gflops_dgemv_nt/Gflops_max, Gflops_dsyrk_dpotrf2, 100.0*Gflops_dsyrk_dpotrf2/Gflops_max, Gflops_dgemm_blas, 100.0*Gflops_dgemm_blas/Gflops_max, Gflops_dgemv_n_blas, 100.0*Gflops_dgemv_n_blas/Gflops_max, Gflops_dgemv_t_blas, 100.0*Gflops_dgemv_t_blas/Gflops_max); fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_dgemm_kernel, 100.0*Gflops_dgemm_kernel/Gflops_max, Gflops_dgemm, 
100.0*Gflops_dgemm/Gflops_max, Gflops_dsyrk_dpotrf, 100.0*Gflops_dsyrk_dpotrf/Gflops_max, Gflops_dtrmm, 100.0*Gflops_dtrmm/Gflops_max, Gflops_dtrtr, 100.0*Gflops_dtrtr/Gflops_max, Gflops_dgemv_n, 100.0*Gflops_dgemv_n/Gflops_max, Gflops_dgemv_t, 100.0*Gflops_dgemv_t/Gflops_max, Gflops_dtrmv_n, 100.0*Gflops_dtrmv_n/Gflops_max, Gflops_dtrmv_t, 100.0*Gflops_dtrmv_t/Gflops_max, Gflops_dtrsv_n, 100.0*Gflops_dtrsv_n/Gflops_max, Gflops_dtrsv_t, 100.0*Gflops_dtrsv_t/Gflops_max, Gflops_dsymv, 100.0*Gflops_dsymv/Gflops_max, Gflops_dgemv_nt, 100.0*Gflops_dgemv_nt/Gflops_max, Gflops_dsyrk_dpotrf2, 100.0*Gflops_dsyrk_dpotrf2/Gflops_max, Gflops_dgemm_blas, 100.0*Gflops_dgemm_blas/Gflops_max, Gflops_dgemv_n_blas, 100.0*Gflops_dgemv_n_blas/Gflops_max, Gflops_dgemv_t_blas, 100.0*Gflops_dgemv_t_blas/Gflops_max); free(A); free(B); free(M); free(pA); free(pB); free(pC); free(pD); free(pE); free(pF); free(pL); free(pM); free(x); free(y); free(x2); free(y2); } printf("\n"); fprintf(f, "];\n"); fclose(f); return 0; }
int main(int argc, char const *argv[]) { if (argc < 4) { printf("Not enough arguments\n"); return -1; } int max_num_thread = atoi(argv[1]); int max_iter = atoi(argv[2]); int test_method = atoi(argv[3]); openblas_set_num_threads(max_num_thread); omp_set_num_threads(max_num_thread); int m = 1024; int n = 1024; float *A = new float[m * n]; for (int i = 0; i < m * n; i++) { A[i] = rand() / RAND_MAX; } float *b = new float[n]; for (int i = 0; i < n; i++) { b[i] = rand() / RAND_MAX; } float *Ab = new float[m]; switch (test_method) { case 0: { printf("Runing Matrix-Vector Multiplication by OpenMP (%d threads)\n", omp_get_max_threads()); double begTime = CycleTimer::currentSeconds(); for (int iter = 0; iter < max_iter; ++iter) { #pragma omp parallel for for (int i=0; i<m; ++i) { for (int j=0; j<n; ++j) { Ab[i] += A[i*n+j] * b[j]; } } } double endTime = CycleTimer::currentSeconds(); printf("%f\n", (endTime - begTime) / float(max_iter)); break; } case 1: { double begTime = CycleTimer::currentSeconds(); printf("Runing Matrix-Vector Multiplication by OpenBlas (%d threads)\n", omp_get_max_threads()); for (int iter = 0; iter < max_iter; ++iter) { cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, A, n, b, 1, 1.0, Ab, 1); } double endTime = CycleTimer::currentSeconds(); printf("%f\n", (endTime - begTime) / float(max_iter)); break; } case 2: { int block_size = (m + max_num_thread - 1)/ max_num_thread; double begTime = CycleTimer::currentSeconds(); printf("Runing Matrix-Vector Multiplication by OpenMP (%d threads) with OpenBlas\n", omp_get_max_threads()); for (int iter = 0; iter < max_iter; ++iter) { #pragma omp parallel for for (int i = 0; i < max_num_thread; ++i) { int actual_size = std::min(block_size, m-i*block_size); cblas_sgemv(CblasRowMajor, CblasNoTrans, actual_size, n, 1.0, A+i*block_size*n, n, b, 1, 1.0, Ab+i*block_size, 1); } } double endTime = CycleTimer::currentSeconds(); printf("%f\n", (endTime - begTime) / float(max_iter)); break; } default: printf("No matched test 
method\n"); break; } delete [] A; delete [] b; delete [] Ab; return 0; }