void Gemm(const int NN) { typedef Kokkos::Schedule<Kokkos::Static> ScheduleType; constexpr int VectorLength = DefaultVectorLength<value_type,typename HostSpaceType::memory_space>::value; const int N = NN/VectorLength; { std::string value_type_name; if (std::is_same<value_type,double>::value) value_type_name = "double"; if (std::is_same<value_type,Kokkos::complex<double> >::value) value_type_name = "Kokkos::complex<double>"; #if defined(__AVX512F__) std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #endif } const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize); const double tmax = 1.0e15; const int iter_begin = -10, iter_end = 100; Kokkos::Impl::Timer timer; Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> cref; Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> amat("amat", N*VectorLength, BlkSize, BlkSize), bmat("bmat", N*VectorLength, BlkSize, BlkSize); Kokkos::Random_XorShift64_Pool<HostSpaceType> random(13718); Kokkos::fill_random(amat, random, value_type(1.0)); Kokkos::fill_random(bmat, random, value_type(1.0)); typedef Vector<SIMD<value_type>,VectorLength> VectorType; Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType> amat_simd("amat_simd", N, BlkSize, BlkSize), bmat_simd("bmat_simd", N, BlkSize, BlkSize); Kokkos::parallel_for (Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength), KOKKOS_LAMBDA(const int k) { const int k0 = k/VectorLength, k1 = k%VectorLength; for (int i=0;i<BlkSize;++i) for (int j=0;j<BlkSize;++j) { amat_simd(k0, i, j)[k1] = amat(k, i, j); bmat_simd(k0, i, j)[k1] = bmat(k, i, j); } });
/*
 * Times a naive n x n matrix product C += A * B (n is fixed at 1000)
 * executed under an OpenMP "parallel for".
 *
 * Usage: <program> nthreads
 *   argv[1]  number of OpenMP threads to request (must be >= 1)
 *
 * Prints "nthreads, time: <nthreads> <seconds>" on success.
 * Returns 0 on success, 1 on a missing/invalid thread count or when a
 * matrix cannot be allocated.
 */
int main (int argc, char *argv[])
{
  /* Element accessors: element (I,J) lives at offset I*n + J.  The
     arguments are parenthesised so expression arguments index correctly. */
#define amat(I,J) a[(I)*n + (J)]
#define bmat(I,J) b[(I)*n + (J)]
#define cmat(I,J) c[(I)*n + (J)]

  int n, nthreads, i, j, k;
  double *a, *b, *c;
  double t0, t1;

  n = 1000;

  /* The thread count is mandatory; without this guard atoi() would be
     handed a null argv[1]. */
  if (argc < 2)
    {
      fprintf (stderr, "usage: %s nthreads\n", argc > 0 ? argv[0] : "matmul");
      return 1;
    }

  nthreads = atoi (argv[1]);
  if (nthreads < 1)
    {
      fprintf (stderr, "nthreads must be a positive integer\n");
      return 1;
    }
  omp_set_num_threads (nthreads);

  a = (double *) malloc ((size_t) n * n * sizeof (double));
  b = (double *) malloc ((size_t) n * n * sizeof (double));
  /* c is accumulated into, so it has to start from zero. */
  c = (double *) calloc ((size_t) n * n, sizeof (double));
  if (a == NULL || b == NULL || c == NULL)
    {
      fprintf (stderr, "cannot allocate three %d x %d matrices\n", n, n);
      free (a);
      free (b);
      free (c);
      return 1;
    }

  /* Give the operands defined contents; reading the raw malloc'd memory
     would be undefined behaviour. */
  for (i = 0; i < n * n; i++)
    {
      a[i] = 1.0;
      b[i] = 1.0;
    }

  t0 = omp_get_wtime();

  /* The outer j loop is split across threads; each thread therefore
     writes a disjoint set of cmat(i,j) entries. */
#pragma omp parallel for private (i, j, k)
  for (j=0; j<n; j++)
    for (i=0; i<n; i++)
      for (k=0; k<n; k++)
        cmat(i,j) = cmat(i,j) + amat(i,k) * bmat(k,j);

  t1 = omp_get_wtime();

  printf("nthreads, time: %d %6.2f\n", nthreads, t1-t0);

  free (a);
  free (b);
  free (c);
  return 0;
}
/**
 * Follows a regularization path of clustering solutions, merging clusters
 * whose representatives come within join_thresh of each other.
 *
 * For each lambda the inner loop repeatedly computes a descent direction
 * for every cluster, takes either a decreasing-size step or a line-search
 * step, and joins clusters flagged by check_clusters_thresh.  After the
 * inner loop ends (direction norm below opt_thresh, or a single cluster
 * left) lambda is multiplied by lambda_factor, or bisected back towards
 * the previous lambda when the target cluster count was overshot.
 *
 * Data layout: x, alpha, xbar and dir hold N*Px doubles, with entry
 * (point i, dimension k) stored at i + k*N.  Only the row of the first
 * point of each cluster is kept up to date during optimisation; the
 * other rows are filled in just before a solution is stored.
 *
 * @return a Results object allocated with new; the caller owns it.  It is
 *         returned on every exit path: only one cluster left, the target
 *         cluster count reached, or maxit exceeded a second time after
 *         one restart from x.
 */
Results* join_clusters2_restart
(double *x,//array/matrix of data
 SymNoDiag *W,//lower triangle of weight matrix
 unsigned int Px,//problem size
 double lambda,//starting point in regularization path
 double join_thresh, //tolerance for equality of points
 double opt_thresh, //tolerance for optimality
 double lambda_factor,//increase of lambda after optimality
 double smooth,//smoothing parameter
 int maxit,
 int linesearch_freq,//how often to do a linesearch? if 0, never. if
                     //n>0, do n-1 linesearch steps for every
                     //decreasing step size step. set this to 2 if
                     //unsure.
 int linesearch_points,//how many points to check along the gradient
                       //direction. set to 10 if unsure.
 int check_splits,
 int target_cluster,
 int verbose
 ){
  unsigned int N = W->N;
  //W->print();
  double old_lambda=0;
  std::vector<int> rows,rowsj;
  std::vector<int>::iterator ri,rj;
  std::list< std::vector<int> > clusters;
  std::list< std::vector<int> >::iterator it,cj;
  unsigned int i,k,j;
  int tried_restart;
  for(i=0;i<N;i++){
    rows.assign(1,i);
    clusters.push_back(rows);
  }

  //Work buffers are owned by vectors so that every return path below
  //releases them automatically. They used to be new[]-allocated and
  //released with scalar delete, which is undefined behaviour.
  const unsigned int len = N*Px;
  std::vector<double> old_alpha_buf(len),alpha_buf(len);
  std::vector<double> xbar_buf(len),dir_buf(len);
  //raw views used by the rest of the function and by the helpers;
  //guarded because &v[0] is not valid on an empty vector
  double *old_alpha = len ? &old_alpha_buf[0] : 0;
  double *alpha = len ? &alpha_buf[0] : 0;
  double *xbar = len ? &xbar_buf[0] : 0;
  double *dir = len ? &dir_buf[0] : 0;

  for(i=0;i<len;i++){
    alpha[i]=xbar[i]=x[i];
  }
  Matrix amat(alpha,N,Px),xmat(x,N,Px);
  SymNoDiag diffs(N);
  diffs.calc_diffs(clusters,amat,nrm2);
  //store initial trivial solution
  Results *results = new Results(N,Px,opt_thresh);
  if(target_cluster==0)results->add(alpha,0,0);
  double weight,diff,step;
  while(clusters.size()>1){
    double grad=opt_thresh;
    int iteration=1;
    tried_restart=0;
    //if we use the general (slower) algorithm for any weights, then
    //split the clusters to individual points
    if(check_splits){
      clusters.clear();
      //reassign original clusters
      for(i=0;i<N;i++){
        rows.assign(1,i);
        clusters.push_back(rows);
      }
      //recopy original xbar
      for(i=0;i<len;i++){
        xbar[i]=x[i];
      }
    }
    while(grad>=opt_thresh){
      //first calc gradients
      grad = 0;
      for(it=clusters.begin();it!=clusters.end();it++){
        rows = *it;
        i = rows[0];
        for(k=0;k<Px;k++){
          dir[i+k*N] = xbar[i+k*N] - alpha[i+k*N];
        }
        for(cj=clusters.begin();cj!=clusters.end();cj++){
          if(it!=cj){
            rowsj = *cj;
            j=rowsj[0];
            weight=0;
            diff = *diffs(i,j);
            if(diff!=0){
              if(smooth!=0){
                diff *= diff; //now squared l2 norm
                diff += smooth; //add smoothing parameter under sqrt
                diff = sqrt(diff);//put sqrt back
              }
              //total weight between every point of cluster i and
              //every point of cluster j
              for(ri=rows.begin();ri!=rows.end();ri++){
                for(rj=rowsj.begin();rj!=rowsj.end();rj++){
                  weight += W->getval(*ri,*rj);
                }
              }
              //weight *= lambda / diff / ((double)(N-1)) / ((double)rows.size());
              weight *= lambda / diff / ((double)rows.size());
              for(k=0;k<Px;k++){
                dir[i+k*N] += weight * (alpha[j+k*N]-alpha[i+k*N]);
              }
            }
          }
        }
        grad += nrm2(Array(dir+i,N,Px));
      }
      //store this iteration
      //results->add(alpha,lambda,grad);
      //then take a step
      if(linesearch_freq==0 || (iteration % linesearch_freq)==0 ){
        //Decreasing step size
        //TDH and pierre 18 jan 2011 try sqrt dec step size
        step=1/((double)iteration);
        //step=1/sqrt((double)iteration);
        if(verbose>=2)printf("grad %f step %f it %d\n",grad,step,iteration);
        take_step(clusters,alpha,dir,N,Px,step);
      }else{
        double cost_here,cost_step;
        //maps cost -> step size; being ordered by cost, begin() is
        //always the cheapest step tried so far
        std::map<double,double> cost_steps;
        std::map<double,double>::iterator step1,step2;
        for(i=0;i<len;i++)old_alpha[i]=alpha[i];//copy alpha
        //compare current cost to cost after stepping in gradient direction
        cost_here=cost_step=calc_cost(clusters,amat,xmat,W,diffs,lambda);
        step = 0;
        cost_steps.insert(std::pair<double,double>(cost_here,0));
        //keep taking unit steps until the cost rises above the start
        while(cost_step<=cost_here){
          take_step(clusters,alpha,dir,N,Px,1);
          step += 1;
          diffs.calc_diffs(clusters,amat,nrm2);
          cost_step=calc_cost(clusters,amat,xmat,W,diffs,lambda);
          if(verbose>=2)
            printf("cost %.10f step %f cost_here %f\n",cost_step,step,cost_here);
          cost_steps.insert(std::pair<double,double>(cost_step,step));
        }
        //then repeatedly try the midpoint of the two cheapest steps,
        //each time restarting from the saved alpha
        for(int cuts=0;cuts<linesearch_points;cuts++){
          step1=step2=cost_steps.begin();
          step2++;
          step = (step1->second + step2->second)/2;
          for(i=0;i<len;i++){
            alpha[i]=old_alpha[i];
          }
          take_step(clusters,alpha,dir,N,Px,step);
          diffs.calc_diffs(clusters,amat,nrm2);
          cost_step=calc_cost(clusters,amat,xmat,W,diffs,lambda);
          if(verbose>=2)printf("cost %.10f step %f %d\n",cost_step,step,cuts);
          cost_steps.insert(std::pair<double,double>(cost_step,step));
        }
        cost_steps.clear();
      }
      if(iteration++ > maxit){
        if(tried_restart){
          printf("max iteration %d exit\n",maxit);
          return results;
        }else{
          if(verbose>=1)printf("max iterations, trying restart from x\n");
          tried_restart=1;
          iteration=1;
          for(i=0;i<len;i++)alpha[i]=x[i];
        }
      }
      //calculate differences
      diffs.calc_diffs(clusters,amat,nrm2);
      //check for joins
      JoinPair tojoin;
      while(dojoin(tojoin=check_clusters_thresh(&clusters,diffs,join_thresh))){
        //if(verbose>=1)
        //  printf("join: %d %d\n",tojoin.first->front(),tojoin.second->front());
        int ni=tojoin.first->size();
        int nj=tojoin.second->size();
        i=tojoin.first->front();
        j=tojoin.second->front();
        tojoin.first->insert(tojoin.first->end(),
                             tojoin.second->begin(),
                             tojoin.second->end());
        //merged representative is the size-weighted mean of the two
        for(k=0;k<Px;k++){
          alpha[i+k*N] = (alpha[i+k*N]*ni + alpha[j+k*N]*nj)/(ni+nj);
          xbar[i+k*N] = (xbar[i+k*N]*ni + xbar[j+k*N]*nj)/(ni+nj);
        }
        clusters.erase(tojoin.second);
        iteration=1;
        if(clusters.size()>1){
          diffs.calc_diffs(clusters,amat,nrm2);//inefficient
        }else{
          grad=0;//so we can escape from the last optimization loop
        }
      }
    }//while(grad>=opt_thresh)
    if(verbose>=1)
      printf("solution iteration %d lambda %f nclusters %d\n",
             iteration,lambda,(int)clusters.size());
    if(target_cluster == 0){
      //for each cluster, there may be several points. we store the
      //alpha value just in the row of the first point. thus here we
      //copy this value to the other rows before copying the optimal
      //alpha to results.
      for(it=clusters.begin();it!=clusters.end();it++){
        rows = *it;
        if(rows.size()>1){
          for(i=1;i<rows.size();i++){
            for(k=0;k<Px;k++){
              alpha[rows[i]+k*N] = alpha[rows[0]+k*N];
            }
          }
        }
      }
      results->add(alpha,lambda,grad);
    }
    //haven't yet reached the target number of clusters, multiply
    //lambda by lambda_factor and continue along the path
    if((int)clusters.size()>target_cluster){
      old_lambda=lambda;
      lambda *= lambda_factor;
    }
    //if we have passed the target cluster number then decrease
    //lambda and go look for it!
    //NOTE(review): alpha and diffs are not reset here although the
    //clusters are; confirm that this is intended.
    if((int)clusters.size()<target_cluster){
      if(verbose>=1){
        printf("missed target %d, going back for it\n",target_cluster);
      }
      lambda = (lambda+old_lambda)/2;
      clusters.clear();
      //reassign original clusters
      for(i=0;i<N;i++){
        rows.assign(1,i);
        clusters.push_back(rows);
      }
      //recopy original xbar
      for(i=0;i<len;i++){
        xbar[i]=x[i];
      }
    }
    //this is the number of clusters that we were looking for,
    //save and quit!
    if((int)clusters.size()==target_cluster){
      for(it=clusters.begin();it!=clusters.end();it++){
        rows = *it;
        if(rows.size()>1){
          for(i=1;i<rows.size();i++){
            for(k=0;k<Px;k++){
              alpha[rows[i]+k*N] = alpha[rows[0]+k*N];
            }
          }
        }
      }
      results->add(alpha,lambda,grad);
      if(verbose>=1)printf("got target cluster %d exit\n",target_cluster);
      return results;
    }
  }
  //the work buffers clean themselves up when the vectors go out of scope
  return results;
}
int main(int argc, char* argv[]) { /* Preprocessor Definitions */ #define atempmat(I,J) Atemp[I + n*J] #define amat(I,J) A[I + n*J] #define bmat(I,J) B[I + n*J] #define cmat(I,J) C[I + n*J] //#define DEBUG 1 /* Variables */ int p; //total number of processors int k; //this processor's rank int i,j; //counter variables double start, finish; //used for timing matrix computations char hostname[20]; //hostname of this machine (for debug) double *Amatrix, *Bmatrix, *Cmatrix;//actual matrices (used by process 0) /* Initializations */ MPI_Init(NULL, NULL); MPI_Comm_size(MPI_COMM_WORLD, &p); MPI_Comm_rank(MPI_COMM_WORLD, &k); hostname[19] = '\0'; gethostname(hostname, 19); # ifdef DEBUG printf("Process %d of %d running on host %s\n", k, p, hostname); # endif //handle command line input num_threads = 1; if(argc == 1) n = 1024; else if (argc > 1) n = atoi(argv[1]); if (argc == 3) { num_threads = atoi(argv[2]); } if(argc > 3 || argc < 1) { printf(usage); return 1; } nc = n/p; A = (double *)malloc(n * nc * sizeof(double)); Atemp = (double *)malloc(n * nc * sizeof(double)); B = (double *)malloc(n * nc * sizeof(double)); C = (double *)malloc(n * nc * sizeof(double)); srand(0); for(i = 0; i < n*nc; i++) { C[i]=0; } /* Root process: generate and distribute data */ if(k == 0) { Amatrix = (double *)malloc(n * n * sizeof(double)); Bmatrix = (double *)malloc(n * n * sizeof(double)); Cmatrix = (double *)malloc(n * n * sizeof(double)); //Initialize data for(i = 0; i < n*n; i++) { Amatrix[i] = 0; Bmatrix[i] = 0; Cmatrix[i] = 0; //C[i] = 0; } #ifdef VALUES //for correctness testing printf("\n%d\n", n); if(k==0) { printf("A = B = \n"); for(i = 0; i < n; i++) { for(j = 0; j < n; j++) { Amatrix[i+n*j] = i+j; Bmatrix[i+n*j] = i+j; printf("%f ", Amatrix[i+n*j]); } printf("\n"); } } #endif } //Now distribute. 
Can do outside the root block MPI_Scatter(Amatrix, n*nc, MPI_DOUBLE, A, n*nc, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Scatter(Bmatrix, n*nc, MPI_DOUBLE, B, n*nc, MPI_DOUBLE, 0, MPI_COMM_WORLD); //pseudocode //Ck = Ck + Ak*Bkk //Atemp = Ak //j = k //for i = 1 to p-1 do // j=(j+1) mod p // send Atemp to left // receive Atemp from right // Ck = Ck + Atemp*Bjk //end /* Matrix Calculations */ MPI_Barrier(MPI_COMM_WORLD); //do barrier so that we synchronize for better time GET_TIME(start); //real code memcpy(Atemp,A,(n * nc * sizeof(double))); #ifdef VALUES if(k == 0) { printf("\n\n Ak = \n"); for(i = 0; i < n; i++) { for(j = 0; j < nc; j++) { printf("%f ", amat(i,j)); } printf("\n"); } } #endif calc_c(k,k); j = k; int send_to = k-1; int receive_from = k+1; if (receive_from == p) receive_from = 0; if (send_to == -1) { send_to = p-1; } MPI_Request rec_request; MPI_Request send_request; MPI_Status rec_status; MPI_Isend(Atemp, n*nc, MPI_DOUBLE, send_to, 0, MPI_COMM_WORLD, &send_request); MPI_Irecv(A, n*nc, MPI_DOUBLE, receive_from, 0, MPI_COMM_WORLD, &rec_request); for(i = 0; i < p-1; i++) { j = (j+1) % p; MPI_Wait(&rec_request, &rec_status); memcpy(Atemp,A,(n * nc * sizeof(double))); MPI_Isend(Atemp, n*nc, MPI_DOUBLE, send_to, 0, MPI_COMM_WORLD, &send_request); MPI_Irecv(A, n*nc, MPI_DOUBLE, receive_from, 0, MPI_COMM_WORLD, &rec_request); calc_c(j,k); } GET_TIME(finish); MPI_Barrier(MPI_COMM_WORLD); //output if(k == 0) printf("%f\n", finish-start); MPI_Gather(C, n*nc, MPI_DOUBLE, Cmatrix, n*nc, MPI_DOUBLE, 0, MPI_COMM_WORLD); #ifdef VALUES if(k == 0) { printf("\n\n C = \n"); for(i = 0; i < n; i++) { for(j = 0; j < n; j++) { printf("%f ", Cmatrix[i+n*j]); } printf("\n"); } } #endif MPI_Finalize(); return 0; } /* main */
/// Command entry point: renders a 1024x1024 depth map of an external
/// scene file as seen from a named camera and saves it as an EXR file.
///
/// Required flags (the command fails with MS::kFailure if any is absent):
///   -n   title used for the output file name
///   -sc  path of the scene file to load
///   -ca  name of the camera transform that defines eye space
///
/// The output is written next to the scene file as "<title>.1.exr" with
/// the frame number substituted by zGlobal::changeFrameNumber.
///
/// @param args  command arguments as passed by Maya
/// @return MS::kSuccess after saving; the parseArgs status if parsing
///         fails; MS::kFailure if a flag is missing or the scene cannot
///         be loaded.
MStatus depthMap::doIt( const MArgList& args )
{
	MStatus status = parseArgs( args );
	if( status != MS::kSuccess ) return status;

	MArgDatabase argData(syntax(), args);

	MAnimControl timeControl;
	MTime time = timeControl.currentTime();
	int frame =int(time.value());

	MString scene_name, camera_name, title;
	if (argData.isFlagSet("-n")) argData.getFlagArgument("-n", 0, title);
	else return MS::kFailure;

	if (argData.isFlagSet("-sc")) argData.getFlagArgument("-sc", 0, scene_name);
	else return MS::kFailure;

	if (argData.isFlagSet("-ca")) argData.getFlagArgument("-ca", 0, camera_name);
	else return MS::kFailure;

	// start from the identity so a missing camera falls back to default space
	for(int r=0; r<4; r++)
		for(int c=0; c<4; c++)
			m_eye[r][c] = (r == c) ? 1 : 0;

	// get eye space
	zWorks::getTypedPathByName(MFn::kTransform, camera_name, p_eye);
	MObject o_eye = p_eye.transform();
	if(o_eye.isNull()) MGlobal::displayWarning("Cannot find eye camera, use default space.");
	else zWorks::getTransformWorldNoScale(p_eye.partialPathName(), m_eye);

	// flip the sign of the first three entries of rows 0 and 2
	for(int c=0; c<3; c++) {
		m_eye[0][c] *=-1;
		m_eye[2][c] *=-1;
	}

	p_eye.extendToShape();
	MFnCamera feye(p_eye);
	double fov = feye.horizontalFieldOfView();

	string sscene = scene_name.asChar();
	zGlobal::changeFrameNumber(sscene, frame);
	MGlobal::displayInfo ( MString(" calculating ") + sscene.c_str());

	// Load the scene before allocating the depth buffer, and release the
	// scene object on failure, so the early return no longer leaks.
	FXMLScene* fscene = new FXMLScene();
	if(fscene->load(sscene.c_str()) != 1) {
		MGlobal::displayWarning(" cannot open scene, do nothing.");
		delete fscene;
		return MS::kFailure;
	}

	// depth buffer, every pixel pre-filled with a large initial value
	int map_w = 1024, map_h = 1024;
	float* data = new float[map_w * map_h];
	for(int i=0; i<map_w * map_h; i++) data[i] = 10e6;

	fscene->depthMap(data, map_w, map_h, fov, m_eye);

	// output path: directory of the scene file + "/<title>.1.exr"
	zGlobal::cutByLastSlash(sscene);
	sscene = sscene + "/" + title.asChar() + ".1.exr";
	zGlobal::changeFrameNumber(sscene, frame);
	//zGlobal::changeFilenameExtension(sscene, "exr");
	MGlobal::displayInfo ( MString(" saving ") + sscene.c_str());

	M44f amat(m_eye[0][0], m_eye[0][1], m_eye[0][2], m_eye[0][3],
		m_eye[1][0], m_eye[1][1], m_eye[1][2], m_eye[1][3],
		m_eye[2][0], m_eye[2][1], m_eye[2][2], m_eye[2][3],
		m_eye[3][0], m_eye[3][1], m_eye[3][2], m_eye[3][3] );
	ZFnEXR::saveCameraNZ(data, amat, fov, sscene.c_str(), map_w, map_h);

	delete[] data;
	delete fscene;

	return MS::kSuccess;
}
void LU(const int NN) { typedef Kokkos::Schedule<Kokkos::Static> ScheduleType; //typedef Kokkos::Schedule<Kokkos::Dynamic> ScheduleType; constexpr int VectorLength = DefaultVectorLength<value_type,typename HostSpaceType::memory_space>::value; const int N = NN/VectorLength; { std::string value_type_name; if (std::is_same<value_type,double>::value) value_type_name = "double"; if (std::is_same<value_type,Kokkos::complex<double> >::value) value_type_name = "Kokkos::complex<double>"; #if defined(__AVX512F__) std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #endif } const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize); const double tmax = 1.0e15; const int iter_begin = -10, iter_end = 100; Kokkos::Impl::Timer timer; /// /// Reference version using MKL DGETRF /// Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> aref; Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> amat("amat", N*VectorLength, BlkSize, BlkSize); Random<value_type> random; for (int k=0;k<N*VectorLength;++k) { // use tridiagonal matrices; for now we just check elementwise l/u factors // do not allow pivots for (int i=0;i<BlkSize;++i) { amat(k, i, i) = random.value() + 10.0; if ((i+1) < BlkSize) { amat(k, i, i+1) = random.value() + 1.0; amat(k, i+1, i) = random.value() + 1.0; } } } typedef Vector<SIMD<value_type>,VectorLength> VectorType; Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType> amat_simd("amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::Pack", Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength), KOKKOS_LAMBDA(const int k) 
{ const int k0 = k/VectorLength, k1 = k%VectorLength; for (int i=0;i<BlkSize;++i) for (int j=0;j<BlkSize;++j) { amat_simd(k0, i, j)[k1] = amat(k0*VectorLength+k1, i, j); } });