/**
 * Main program:
 * This program reads the following parameters from the console and
 * then computes the optical flow:
 * I_1          previous image to I0
 * I0           first image
 * I1           second image
 * I0_Smoothed  image used with function g
 * out          name of the output flow field
 * outOcc       name of the output occlusion map
 * nprocs       number of threads to use (OpenMP library)
 * tauEta       time step in the primal-dual scheme for the eta variable
 * tauChi       time step in the primal-dual scheme for the chi variable
 * lambda       data term weight parameter
 * alpha        length term weight parameter (in the occlusion region)
 * beta         negative divergence data term
 * theta        tightness parameter
 * nscales      number of scales in the pyramidal structure
 * zfactor      downsampling factor for creating the scales
 * nwarps       number of warps per scale
 * epsilon      stopping criterion threshold for the iterative process
 * verbose      switch on/off messages
 */
int main(int argc, char *argv[])
{
    if (argc < 3) {
        fprintf(stderr, "Usage: %s I_1 I0 I1 [I0_Smoothed out "
                //        0  1   2  3   4           5
                "outOcc nproc lambda alpha beta theta nscales zfactor nwarps epsilon "
                // 6     7     8      9     10   11    12      13      14     15
                "verbose ]\n", *argv);
                // 16
        return EXIT_FAILURE;
    }

    // Variable declaration
    double *u1 = NULL, *u2 = NULL;               // flow field
    double *chi = NULL;                          // occlusion map
    double *I_1 = NULL, *I0 = NULL, *I1 = NULL;  // previous (I_1), current (I0) and next (I1) image
    double *filtI0 = NULL;                       // filtered image used in function g
    int nx_1, ny_1, nx, ny, nx1, ny1, nxf, nyf;  // image sizes

    // read the parameters
    int i = 1;
    char *image_1_name       = argv[i]; i++; //1
    char *image1_name        = argv[i]; i++; //2
    char *image2_name        = argv[i]; i++; //3
    char *image1_Smooth_name = (argc > i) ? argv[i] : argv[2]; i++; //4 if there is no I0_Smoothed, I0 is used
    const char *outfile    = (argc > i) ? argv[i] : PAR_DEFAULT_OUTFLOW; i++; //5
    const char *outOccFile = (argc > i) ? argv[i] : PAR_DEFAULT_OUT_OCC; i++; //6
    int    nproc   = (argc > i) ? atoi(argv[i]) : PAR_DEFAULT_NPROC;   i++; //7
    double lambda  = (argc > i) ? atof(argv[i]) : PAR_DEFAULT_LAMBDA;  i++; //8
    double alpha   = (argc > i) ? atof(argv[i]) : PAR_DEFAULT_ALPHA;   i++; //9
    double betaW   = (argc > i) ? atof(argv[i]) : PAR_DEFAULT_BETA;    i++; //10
    double theta   = (argc > i) ? atof(argv[i]) : PAR_DEFAULT_THETA;   i++; //11
    int    nscales = (argc > i) ? atoi(argv[i]) : PAR_DEFAULT_NSCALES; i++; //12
    double zfactor = (argc > i) ? atof(argv[i]) : PAR_DEFAULT_ZFACTOR; i++; //13
    int    nwarps  = (argc > i) ? atoi(argv[i]) : PAR_DEFAULT_NWARPS;  i++; //14
    double epsilon = (argc > i) ? atof(argv[i]) : PAR_DEFAULT_EPSILON; i++; //15
    int    verbose = (argc > i) ? atoi(argv[i]) : PAR_DEFAULT_VERBOSE; i++; //16

    // check parameters
    if (nproc < 0) {
        nproc = PAR_DEFAULT_NPROC;
        fprintf(stderr, "warning: nproc changed to %d\n", nproc);
    }
    if (lambda <= 0) {
        lambda = PAR_DEFAULT_LAMBDA;
        fprintf(stderr, "warning: lambda changed to %g\n", lambda);
    }
    if (alpha <= 0) {
        alpha = PAR_DEFAULT_ALPHA;
        fprintf(stderr, "warning: alpha changed to %g\n", alpha);
    }
    if (betaW <= 0) {
        betaW = PAR_DEFAULT_BETA;
        fprintf(stderr, "warning: beta changed to %g\n", betaW);
    }
    if (theta <= 0) {
        theta = PAR_DEFAULT_THETA;
        if (verbose) fprintf(stderr, "warning: theta changed to %g\n", theta);
    }
    if (nscales <= 0) {
        nscales = PAR_DEFAULT_NSCALES;
        fprintf(stderr, "warning: nscales changed to %d\n", nscales);
    }
    if (zfactor <= 0 || zfactor >= 1) {
        zfactor = PAR_DEFAULT_ZFACTOR;
        fprintf(stderr, "warning: zfactor changed to %g\n", zfactor);
    }
    if (nwarps <= 0) {
        nwarps = PAR_DEFAULT_NWARPS;
        fprintf(stderr, "warning: nwarps changed to %d\n", nwarps);
    }
    if (epsilon <= 0) {
        epsilon = PAR_DEFAULT_EPSILON;
        fprintf(stderr, "warning: epsilon changed to %f\n", epsilon);
    }

#ifdef _OPENMP
    if (nproc > 0)
        omp_set_num_threads(nproc);
#endif // _OPENMP

    // read the input images
    I_1    = read_image(image_1_name, &nx_1, &ny_1);
    I0     = read_image(image1_name, &nx, &ny);
    I1     = read_image(image2_name, &nx1, &ny1);
    filtI0 = read_image(image1_Smooth_name, &nxf, &nyf);

    if (nx == nx_1 && nx == nx1 && nx == nxf &&
        ny == ny_1 && ny == ny1 && ny == nyf) {
        // Set the number of scales according to the size of the images.
        // The value N is computed so that the smallest image of the
        // pyramid is not smaller than 16x16.
        const int N = floor(log((float)MIN(nx, ny) / 16.0) / log(1. / zfactor)) + 1;
        if (N < nscales)
            nscales = N;

        if (verbose)
            fprintf(stderr,
                    " nproc=%d \n lambda=%f \n alpha=%f \n"
                    " beta=%f \n theta=%f \n nscales=%d \n zfactor=%f\n nwarps=%d \n epsilon=%g\n",
                    nproc, lambda, alpha, betaW, theta, nscales, zfactor, nwarps, epsilon);

        // allocate memory for the flow
        u1 = (double *)xmalloc(nx * ny * sizeof(double));
        u2 = (double *)xmalloc(nx * ny * sizeof(double));
        // and the occlusion map
        chi = (double *)xmalloc(nx * ny * sizeof(double));
        for (int i = 0; i < nx * ny; i++) {
            chi[i] = 0.0;
            u1[i]  = 0.0;
            u2[i]  = 0.0;
        }

        // compute the optical flow
        Dual_TVL1_optic_flow_multiscale(
            I_1, I0, I1, filtI0, u1, u2, chi, nx, ny,
            lambda, alpha, betaW, theta, nscales, zfactor, nwarps, epsilon, verbose);

        //write_flow(u1, u2, nx, ny); // <-- remove in the final version; kept for debugging purposes only

        // save the optical flow
        float *f = (float *)malloc(sizeof(float) * nx * ny * 2);
        for (int i = 0; i < nx * ny; i++) {
            f[2 * i]     = (float)u1[i]; //Avoid the cast!
            f[2 * i + 1] = (float)u2[i]; //Avoid the cast!
        }
        iio_save_image_float_vec((char *)outfile, f, nx, ny, 2);
        free(f);

        // save the occlusion map (scaled to [0,255])
        float *fOcc = (float *)malloc(sizeof(float) * nx * ny);
        for (int i = 0; i < nx * ny; i++)
            fOcc[i] = (float)chi[i] * 255; //Avoid the cast!
        iio_save_image_float((char *)outOccFile, fOcc, nx, ny);
        free(fOcc);
    }

    // delete allocated memory
    free(I_1);
    free(I0);
    free(I1);
    free(u1);
    free(u2);
    free(filtI0);
    free(chi);

    return EXIT_SUCCESS;
}
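/* A minimal, self-contained worked example (not part of the program above) of the
 * pyramid-depth formula it uses: the coarsest level has side
 * MIN(nx,ny) * zfactor^(N-1), and N is chosen so that this stays >= 16, i.e.
 * N = floor(log(MIN(nx,ny)/16) / log(1/zfactor)) + 1. The image size and
 * zfactor below are illustrative values only. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    const double zfactor = 0.5;          /* example downsampling factor */
    const int nx = 640, ny = 480;        /* example image size */
    const int minsize = nx < ny ? nx : ny;

    const int N = (int)floor(log((double)minsize / 16.0) / log(1.0 / zfactor)) + 1;
    printf("N = %d, coarsest side = %.1f px\n", N, minsize * pow(zfactor, N - 1));
    /* For min(nx,ny)=480 and zfactor=0.5: N = floor(log2(30)) + 1 = 5, coarsest side 30 px. */
    return 0;
}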
// METHOD THAT RECEIVES POINT CLOUDS (OPEN MP) std::vector<cluster> poseEstimationSV::poseEstimationCore_openmp(pcl::PointCloud<pcl::PointXYZ>::ConstPtr cloud) { Tic(); std::vector <std::vector < pose > > bestPosesAux; bestPosesAux.resize(omp_get_num_procs()); //int bestPoseAlpha; //int bestPosePoint; //int bestPoseVotes; Eigen::Vector3f scenePoint; Eigen::Vector3f sceneNormal; pcl::PointIndices normals_nan_indices; pcl::ExtractIndices<pcl::PointNormal> nan_extract; float alpha; unsigned int alphaBin,index; // Iterators //unsigned int sr; // scene reference point pcl::PointCloud<pcl::PointNormal>::iterator si; // scene paired point std::vector<pointPairSV>::iterator sameFeatureIt; // same key on hash table std::vector<boost::shared_ptr<pose> >::iterator bestPosesIt; Eigen::Vector4f feature; Eigen::Vector3f _pointTwoTransformed; std::cout<< "\tCloud size: " << cloud->size() << endl; ////////////////////////////////////////////// // Downsample point cloud using a voxelgrid // ////////////////////////////////////////////// pcl::PointCloud<pcl::PointXYZ>::Ptr cloudDownsampled(new pcl::PointCloud<pcl::PointXYZ> ()); // Create the filtering object pcl::VoxelGrid<pcl::PointXYZ> sor; sor.setInputCloud (cloud); sor.setLeafSize (model->distanceStep,model->distanceStep,model->distanceStep); sor.filter (*cloudDownsampled); std::cout<< "\tCloud size after downsampling: " << cloudDownsampled->size() << endl; // Compute point cloud normals (using cloud before downsampling information) std::cout<< "\tCompute normals... "; cloudNormals=model->computeSceneNormals(cloudDownsampled); std::cout<< "Done" << endl; /*boost::shared_ptr<pcl_visualization::PCLVisualizer> viewer2 = objectModel::viewportsVis(cloudFilteredNormals); while (!viewer2->wasStopped ()) { viewer2->spinOnce (100); boost::this_thread::sleep (boost::posix_time::microseconds (100000)); }*/ /*boost::shared_ptr<pcl_visualization::PCLVisualizer> viewer2 = objectModel::viewportsVis(model->modelCloud); while (!viewer2->wasStopped ()) { viewer2->spinOnce (100); boost::this_thread::sleep (boost::posix_time::microseconds (100000)); }*/ ////////////////////////////////////////////////////////////////////////////// // Filter again to remove spurious normals nans (and it's associated point) // ////////////////////////////////////////////////fa////////////////////////////// for (unsigned int i = 0; i < cloudNormals->points.size(); ++i) { if (isnan(cloudNormals->points[i].normal[0]) || isnan(cloudNormals->points[i].normal[1]) || isnan(cloudNormals->points[i].normal[2])) { normals_nan_indices.indices.push_back(i); } } nan_extract.setInputCloud(cloudNormals); nan_extract.setIndices(boost::make_shared<pcl::PointIndices> (normals_nan_indices)); nan_extract.setNegative(true); nan_extract.filter(*cloudWithNormalsDownSampled); std::cout<< "\tCloud size after removing NaN normals: " << cloudWithNormalsDownSampled->size() << endl; ///////////////////////////////////////////// // Extract reference points from the scene // ///////////////////////////////////////////// //pcl::RandomSample< pcl::PointCloud<pcl::PointNormal> > randomSampler; //randomSampler.setInputCloud(cloudWithNormalsDownSampled); // Create the filtering object int numberOfPoints=(int) (cloudWithNormalsDownSampled->size () )*referencePointsPercentage; int totalPoints=(int) (cloudWithNormalsDownSampled->size ()); std::cout << "\tUniform sample a set of " << numberOfPoints << "(" << referencePointsPercentage*100 << "%)... 
"; referencePointsIndices->indices.clear(); extractReferencePointsUniform(referencePointsPercentage,totalPoints); std::cout << "Done" << std::endl; //std::cout << referencePointsIndices->indices.size() << std::endl; ////////////// // Votation // ////////////// std::cout<< "\tVotation... "; omp_set_num_threads(omp_get_num_procs()); //omp_set_num_threads(1); //int iteration=0; bestPoses.clear(); #pragma omp parallel for private(alpha,alphaBin,alphaScene,sameFeatureIt,index,feature,si,_pointTwoTransformed) //reduction(+:iteration) //nowait for(unsigned int sr=0; sr < referencePointsIndices->indices.size(); ++sr) { //++iteration; //std::cout << "iteration: " << iteration << " thread:" << omp_get_thread_num() << std::endl; //printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads()); scenePoint=cloudWithNormalsDownSampled->points[referencePointsIndices->indices[sr]].getVector3fMap(); sceneNormal=cloudWithNormalsDownSampled->points[referencePointsIndices->indices[sr]].getNormalVector3fMap(); // Get transformation from scene frame to global frame Eigen::Vector3f cross=sceneNormal.cross (Eigen::Vector3f::UnitX ()). normalized(); Eigen::Affine3f rotationSceneToGlobal; if(isnan(cross[0])) { rotationSceneToGlobal=Eigen::AngleAxisf(0.0,Eigen::Vector3f::UnitX ()); } else rotationSceneToGlobal=Eigen::AngleAxisf(acosf (sceneNormal.dot (Eigen::Vector3f::UnitX ())),cross); Eigen::Affine3f transformSceneToGlobal = Eigen::Translation3f ( rotationSceneToGlobal* ((-1)*scenePoint)) * rotationSceneToGlobal; ////////////////////// // Choose best pose // ////////////////////// // Reset pose accumulator for(std::vector<std::vector<int> >::iterator accumulatorIt=accumulatorParallelAux[omp_get_thread_num()].begin();accumulatorIt < accumulatorParallelAux[omp_get_thread_num()].end(); ++accumulatorIt) { std::fill(accumulatorIt->begin(),accumulatorIt->end(),0); } //std::cout << std::endl; for(si=cloudWithNormalsDownSampled->begin(); si < cloudWithNormalsDownSampled->end();++si) { // if same point, skip point pair if( (cloudWithNormalsDownSampled->points[referencePointsIndices->indices[sr]].x==si->x) && (cloudWithNormalsDownSampled->points[referencePointsIndices->indices[sr]].y==si->y) && (cloudWithNormalsDownSampled->points[referencePointsIndices->indices[sr]].z==si->z)) { //std::cout << si->x << " " << si->y << " " << si->z << std::endl; continue; } // Compute PPF pointPairSV PPF=pointPairSV(cloudWithNormalsDownSampled->points[sr],*si, transformSceneToGlobal); // Compute index index=PPF.getHash(*si,model->distanceStepInverted); // If distance between point pairs is bigger than the maximum for this model, skip point pair if(index>pointPairSV::maxHash) { //std::cout << "DEBUG" << std::endl; continue; } // If there is no similar point pair features in the model, skip point pair and avoid computing the alpha if(model->hashTable[index].size()==0) continue; for(sameFeatureIt=model->hashTable[index].begin(); sameFeatureIt<model->hashTable[index].end(); ++sameFeatureIt) { // Vote on the reference point and angle (and object) alpha=sameFeatureIt->alpha-PPF.alpha; // alpha values between [-360,360] // alpha values should be between [-180,180] ANGLE_MAX = 2*PI if(alpha<(-PI)) alpha=ANGLE_MAX+alpha; else if(alpha>(PI)) alpha=alpha-ANGLE_MAX; //std::cout << "alpha after: " << alpha*RAD_TO_DEG << std::endl; //std::cout << "alpha after2: " << (alpha+PI)*RAD_TO_DEG << std::endl; alphaBin=static_cast<unsigned int> ( round((alpha+PI)*pointPair::angleStepInverted) ); // division is slower than 
multiplication //std::cout << "angle1: " << alphaBin << std::endl; /*alphaBin = static_cast<unsigned int> (floor (alpha) + floor (PI *poseAngleStepInverted)); std::cout << "angle2: " << alphaBin << std::endl;*/ //alphaBin=static_cast<unsigned int> ( floor(alpha*poseAngleStepInverted) + floor(PI*poseAngleStepInverted) ); if(alphaBin>=pointPair::angleBins) { alphaBin=0; //ROS_INFO("naoooo"); //exit(1); } //#pragma omp critical //{std::cout << index <<" "<<sameFeatureIt->id << " " << alphaBin << " " << omp_get_thread_num() << " " << accumulatorParallelAux[omp_get_thread_num()][sameFeatureIt->id][alphaBin] << std::endl;} accumulatorParallelAux[omp_get_thread_num()][sameFeatureIt->id][alphaBin]+=sameFeatureIt->weight; } } //ROS_INFO("DISTANCE:%f DISTANCE SQUARED:%f", model->maxModelDist, model->maxModel // Choose best pose (highest peak on the accumulator[peak with more votes]) int bestPoseAlpha=0; int bestPosePoint=0; int bestPoseVotes=0; for(size_t p=0; p < model->modelCloud->size(); ++p) { for(unsigned int a=0; a < pointPair::angleBins; ++a) { if(accumulatorParallelAux[omp_get_thread_num()][p][a]>bestPoseVotes) { bestPoseVotes=accumulatorParallelAux[omp_get_thread_num()][p][a]; bestPosePoint=p; bestPoseAlpha=a; } } } // A candidate pose was found if(bestPoseVotes!=0) { // Compute and store transformation from model to scene //boost::shared_ptr<pose> bestPose(new pose( bestPoseVotes,model->modelToScene(model->modelCloud->points[bestPosePoint],transformSceneToGlobal,static_cast<float>(bestPoseAlpha)*pointPair::angleStep-PI) )); bestPosesAux[omp_get_thread_num()].push_back(pose( bestPoseVotes,model->modelToScene(bestPosePoint,transformSceneToGlobal,static_cast<float>(bestPoseAlpha)*pointPair::angleStep-PI) )); //bestPoses.push_back(bestPose); //std::cout << bestPosesAux[omp_get_thread_num()].size() <<" " <<omp_get_thread_num()<< std::endl; } else { continue; } // Choose poses whose votes are a percentage above a given threshold of the best pose accumulatorParallelAux[omp_get_thread_num()][bestPosePoint][bestPoseAlpha]=0; // This is more efficient than having an if condition to verify if we are considering the best pose again for(size_t p=0; p < model->modelCloud->size(); ++p) { for(unsigned int a=0; a < pointPair::angleBins; ++a) { if(accumulatorParallelAux[omp_get_thread_num()][p][a]>=accumulatorPeakThreshold*bestPoseVotes) { // Compute and store transformation from model to scene //boost::shared_ptr<pose> bestPose(new pose( accumulatorParallelAux[omp_get_thread_num()][p][a],model->modelToScene(model->modelCloud->points[p],transformSceneToGlobal,static_cast<float>(a)*pointPair::angleStep-PI ) )); //bestPoses.push_back(bestPose); bestPosesAux[omp_get_thread_num()].push_back(pose( bestPoseVotes,model->modelToScene(bestPosePoint,transformSceneToGlobal,static_cast<float>(bestPoseAlpha)*pointPair::angleStep-PI) )); //std::cout << bestPosesAux[omp_get_thread_num()].size() <<" " <<omp_get_thread_num()<< std::endl; } } } } std::cout << "Done" << std::endl; for(int i=0; i<omp_get_num_procs(); ++i) { for(unsigned int j=0; j<bestPosesAux[i].size(); ++j) bestPoses.push_back(bestPosesAux[i][j]); } std::cout << "\thypothesis number: " << bestPoses.size() << std::endl << std::endl; if(bestPoses.size()==0) { clusters.clear(); return clusters; } ////////////////////// // Compute clusters // ////////////////////// Tac(); std::cout << "\tCompute clusters... "; Tic(); clusters=poseClustering(bestPoses); Tac(); std::cout << "Done" << std::endl; return clusters; }
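// A minimal sketch (illustrative only, not the poseEstimationSV API) of the
// per-thread bucket pattern used above: each thread appends its candidates to
// a vector indexed by its own thread id, so no locking is needed inside the
// parallel loop, and the buckets are merged serially afterwards, the way
// bestPosesAux is merged into bestPoses.
#include <omp.h>
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<std::vector<int> > buckets(omp_get_max_threads());

    #pragma omp parallel for
    for (int i = 0; i < 1000; ++i) {
        if (i % 7 == 0)                              // stand-in for "a candidate pose was found"
            buckets[omp_get_thread_num()].push_back(i);
    }

    std::vector<int> merged;                         // serial merge after the parallel region
    for (std::size_t t = 0; t < buckets.size(); ++t)
        merged.insert(merged.end(), buckets[t].begin(), buckets[t].end());

    std::printf("hypothesis number: %zu\n", merged.size());
    return 0;
}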
void parallel_lu(int argc, char **argv, double **matrix, int dim, int block_dim, int rank2print, int doSerial, int numThreads) { omp_set_num_threads(numThreads); int procs; int rank; MPI_Comm_size(MPI_COMM_WORLD, &procs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Status status; MPI_Request request; int num_rows = sqrt(procs); int num_cols = sqrt(procs); int dimSize[2] = {num_rows, num_cols}; int periodic[2] = {0, 0}; int myCoords[2]; MPI_Comm comm2D; MPI_Cart_create(MPI_COMM_WORLD, 2, dimSize, periodic, 0, &comm2D); int myRow, myCol; MPI_Cart_coords(comm2D, rank, 2, myCoords); myRow = myCoords[0]; myCol = myCoords[1]; //Determine the neighbour rank numbers int rightRank; int leftRank = rank; int botRank; int topRank = rank; MPI_Cart_shift(comm2D, 1, 1, &leftRank, &rightRank); MPI_Cart_shift(comm2D, 0, 1, &topRank, &botRank); double **L = create_zero_matrix(dim); double *LBuffSend = (double*) malloc (block_dim * sizeof(double)); double *LBuffRecv = (double*) malloc (block_dim * sizeof(double)); double *PBuffSend = (double*) malloc (block_dim * sizeof(double)); double *PBuffRecv = (double*) malloc (block_dim * sizeof(double)); int i,j,k; // initialize buffers for (i=0;i<block_dim;i++) { LBuffSend[i] = LBuffRecv[i] = PBuffSend[i] = PBuffRecv[i] = 0; } // initialize L diag for (i=0;i<dim;i++) { L[i][i] = 1.0; } int proc_per_row = dim/block_dim; int col_start = (rank*block_dim) % dim; int col_end = col_start+block_dim-1; int row_start = (rank/proc_per_row)*block_dim; int row_end = row_start+block_dim-1; if(rank==rank2print) { printf("Rank %i\n", rank); printf("myRow of proc:%i\n", myRow); printf("myCol of proc:%i\n", myCol); printf("Right rank is: %i\n",rightRank); printf("Left rank is: %i\n",leftRank); printf("Top rank is: %i\n",topRank); printf("Bottom rank is: %i\n",botRank); printf("Col start %i\n", col_start); printf("Col end %i\n", col_end); printf("Row start %i\n", row_start); printf("Row end %i\n", row_end); //print_matrix(dim,matrix); } //Main computation loop for(k=0;k<dim;k++) { bool kInMyRows = k >= row_start && k <= row_end; bool kInTopRows = k <= row_end-block_dim; bool kInBotRows = k >= row_start+block_dim; bool kInMyCols = k>=col_start && k<=col_end; bool kInLeftCols = k <= col_end-block_dim; bool kInRightCols = k >= col_start+block_dim; //Send & recieve pivot row //Recieve PBuffRec from top if(topRank >= 0 && kInTopRows && !kInRightCols) { MPI_Recv(PBuffRecv, block_dim, MPI_DOUBLE, topRank, 0, MPI_COMM_WORLD, &status); if(rank==rank2print) { printf("Received pivot row from rank %i for k = %i: ",topRank,k); print_vector(block_dim,PBuffRecv); } //Place PBuffRecv in correct place of matrix for(j=col_start;j<=col_end;j++) { if(j>=k) { matrix[k][j] = PBuffRecv[j-col_start]; } } } //send PBuffSend to bottom if(botRank >= 0 && !kInRightCols) { if(kInMyRows) { //pivot row is generated from this process //Assemble PBuffSend for(j=col_start;j<=col_end;j++) { if(j>=k) { PBuffSend[j-col_start] = matrix[k][j]; } } if(rank==rank2print) { printf("Sending pivot row to rank %i for k = %i (Creating): ",botRank,k); print_vector(block_dim,PBuffSend); } } else if(kInTopRows) { //pivot row is generated in a top process; just pass the recieved one along //Assemble PBuffSend for(j=col_start;j<=col_end;j++) { if(j>=k) { PBuffSend[j-col_start] = PBuffRecv[j-col_start]; } } if(rank==rank2print) { printf("Sending pivot row to rank %i for k = %i (Passing): ",botRank,k); print_vector(block_dim,PBuffSend); } } MPI_Isend(PBuffSend, block_dim, MPI_DOUBLE, botRank, 0, MPI_COMM_WORLD, &request); } //Calculate 
ratios if(kInMyCols) { for(i=row_start;i<=row_end;i++) { if (i>k) { L[i][k] = matrix[i][k]/matrix[k][k]; } } } //Wait for PBuffSend to be usable if(botRank >= 0 && kInMyRows) MPI_Wait(&request, &status); if(rank==rank2print) { printf("L:\n"); print_matrix_chunk(block_dim,row_start,col_start,L); } //Send & recieve ratios //Recieve LBuffRec from left if(leftRank >= 0 && kInLeftCols && !kInBotRows) { MPI_Recv(LBuffRecv, block_dim, MPI_DOUBLE, leftRank, 0, MPI_COMM_WORLD, &status); if(rank==rank2print) { printf("Recieved L from rank %i: ",leftRank); print_vector(block_dim,LBuffRecv); } //Place LBuffRecv in correct place of L[i][k] for(i=row_start;i<=row_end;i++) { if(i>k) { L[i][k] = LBuffRecv[i-row_start]; } } } //send LBuffSend to right if(rightRank >= 0 && !kInBotRows) { if(kInMyCols) { //ratio is generated from this process //Assemble LBuffSend for(i=row_start;i<=row_end;i++) { if(i>k) { LBuffSend[i-row_start] = L[i][k]; } } if(rank==rank2print) { printf("Sending L to rank %i for k = %i: (Creating)",rightRank,k); print_vector(block_dim,LBuffSend); } } else if(kInLeftCols) { //ratio is generated in a left process; just pass the recieved one along //Assemble LBuffSend for(i=row_start;i<=row_end;i++) { if(i>k) { LBuffSend[i-row_start] = LBuffRecv[i-row_start]; } } if(rank==rank2print) { printf("Sending L to rank %i for k = %i (Passing): ",rightRank,k); print_vector(block_dim,LBuffSend); } } MPI_Isend(LBuffSend, block_dim, MPI_DOUBLE, rightRank, 0, MPI_COMM_WORLD, &request); } //Compute upper triangular matrix #pragma omp parallel for private(j,i) firstprivate(k,col_start,col_end) for (j=col_start;j<=col_end;j++) { if (j>=k) { for (i=row_start;i<=row_end;i++) { if (i>k) { matrix[i][j] = matrix[i][j]-L[i][k]*matrix[k][j]; } } } } //Wait for LBuffSend to be usable if(rightRank >= 0 && kInMyCols) MPI_Wait(&request, &status); if(rank==rank2print) { printf("U:\n"); print_matrix_chunk(block_dim,row_start,col_start,matrix); } } /* double **L_chunk = create_zero_matrix(block_dim); double **U_chunk = create_zero_matrix(block_dim); // copy chunk data int r = 0; for(i=row_start;i<=row_end;i++) { int c = 0; for(j=col_start;j<=col_end;j++) { L_chunk[r][c] = L[i][j]; U_chunk[r][c] = matrix[i][j]; c++; } r++; }*/ if(rank2print == -1) { printf("Rank %i\n",rank); printf("L\n"); print_matrix_chunk(block_dim,row_start,col_start,L); //print_matrix(block_dim,L_chunk); printf("U\n"); print_matrix_chunk(block_dim,row_start,col_start,matrix); //print_matrix(block_dim,U_chunk); } /*if(rank != 0) { // send L and U chunks to process 0 MPI_Isend(L_chunk,block_dim*block_dim,MPI_DOUBLE,0,rank*,MPI_COMM_WORLD,&request); } else { // receive L and U chunks from all processes }*/ free_matrix(dim,L); free_matrix(dim,matrix); }
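/* A minimal sketch (illustrative only) of the 2-D Cartesian topology and
 * neighbour lookup that the LU code above relies on: build a sqrt(P) x sqrt(P)
 * process grid, then MPI_Cart_shift reports MPI_PROC_NULL for ranks beyond the
 * boundary (a negative value in common MPI implementations, which is what the
 * ">= 0" guards above test before sending or receiving). Assumes the number of
 * processes is a perfect square. */
#include <mpi.h>
#include <math.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int procs, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &procs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int side = (int)sqrt((double)procs);
    int dims[2] = {side, side}, periodic[2] = {0, 0}, coords[2];
    MPI_Comm comm2d;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periodic, 0, &comm2d);
    MPI_Cart_coords(comm2d, rank, 2, coords);

    int top, bot, left, right;
    MPI_Cart_shift(comm2d, 0, 1, &top, &bot);        /* shift along rows    */
    MPI_Cart_shift(comm2d, 1, 1, &left, &right);     /* shift along columns */
    printf("rank %d at (%d,%d): top=%d bot=%d left=%d right=%d\n",
           rank, coords[0], coords[1], top, bot, left, right);

    MPI_Comm_free(&comm2d);
    MPI_Finalize();
    return 0;
}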
int dt_init(int argc, char *argv[], const int init_gui) { // make everything go a lot faster. _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); #if !defined __APPLE__ && !defined __WIN32__ _dt_sigsegv_old_handler = signal(SIGSEGV,&_dt_sigsegv_handler); #endif #ifndef __SSE2__ fprintf(stderr, "[dt_init] unfortunately we depend on SSE2 instructions at this time.\n"); fprintf(stderr, "[dt_init] please contribute a backport patch (or buy a newer processor).\n"); return 1; #endif #ifdef M_MMAP_THRESHOLD mallopt(M_MMAP_THRESHOLD,128*1024) ; /* use mmap() for large allocations */ #endif // we have to have our share dir in XDG_DATA_DIRS, // otherwise GTK+ won't find our logo for the about screen (and maybe other things) { const gchar *xdg_data_dirs = g_getenv("XDG_DATA_DIRS"); gchar *new_xdg_data_dirs = NULL; gboolean set_env = TRUE; if(xdg_data_dirs != NULL && *xdg_data_dirs != '\0') { // check if DARKTABLE_SHAREDIR is already in there gboolean found = FALSE; gchar **tokens = g_strsplit(xdg_data_dirs, ":", 0); // xdg_data_dirs is neither NULL nor empty => tokens != NULL for(char **iter = tokens; *iter != NULL; iter++) if(!strcmp(DARKTABLE_SHAREDIR, *iter)) { found = TRUE; break; } g_strfreev(tokens); if(found) set_env = FALSE; else new_xdg_data_dirs = g_strjoin(":", DARKTABLE_SHAREDIR, xdg_data_dirs, NULL); } else new_xdg_data_dirs = g_strdup(DARKTABLE_SHAREDIR); if(set_env) g_setenv("XDG_DATA_DIRS", new_xdg_data_dirs, 1); g_free(new_xdg_data_dirs); } setlocale(LC_ALL, ""); bindtextdomain (GETTEXT_PACKAGE, DARKTABLE_LOCALEDIR); bind_textdomain_codeset (GETTEXT_PACKAGE, "UTF-8"); textdomain (GETTEXT_PACKAGE); // init all pointers to 0: memset(&darktable, 0, sizeof(darktable_t)); darktable.progname = argv[0]; // database gchar *dbfilename_from_command = NULL; char *datadir_from_command = NULL; char *moduledir_from_command = NULL; char *tmpdir_from_command = NULL; char *configdir_from_command = NULL; char *cachedir_from_command = NULL; darktable.num_openmp_threads = 1; #ifdef _OPENMP darktable.num_openmp_threads = omp_get_num_procs(); #endif darktable.unmuted = 0; GSList *images_to_load = NULL, *config_override = NULL; for(int k=1; k<argc; k++) { if(argv[k][0] == '-') { if(!strcmp(argv[k], "--help")) { return usage(argv[0]); } if(!strcmp(argv[k], "-h")) { return usage(argv[0]); } else if(!strcmp(argv[k], "--version")) { printf("this is "PACKAGE_STRING"\ncopyright (c) 2009-2014 johannes hanika\n"PACKAGE_BUGREPORT"\n"); return 1; } else if(!strcmp(argv[k], "--library")) { dbfilename_from_command = argv[++k]; } else if(!strcmp(argv[k], "--datadir")) { datadir_from_command = argv[++k]; } else if(!strcmp(argv[k], "--moduledir")) { moduledir_from_command = argv[++k]; } else if(!strcmp(argv[k], "--tmpdir")) { tmpdir_from_command = argv[++k]; } else if(!strcmp(argv[k], "--configdir")) { configdir_from_command = argv[++k]; } else if(!strcmp(argv[k], "--cachedir")) { cachedir_from_command = argv[++k]; } else if(!strcmp(argv[k], "--localedir")) { bindtextdomain (GETTEXT_PACKAGE, argv[++k]); } else if(argv[k][1] == 'd' && argc > k+1) { if(!strcmp(argv[k+1], "all")) darktable.unmuted = 0xffffffff; // enable all debug information else if(!strcmp(argv[k+1], "cache")) darktable.unmuted |= DT_DEBUG_CACHE; // enable debugging for lib/film/cache module else if(!strcmp(argv[k+1], "control")) darktable.unmuted |= DT_DEBUG_CONTROL; // enable debugging for scheduler module else if(!strcmp(argv[k+1], "dev")) darktable.unmuted |= DT_DEBUG_DEV; // develop module else if(!strcmp(argv[k+1], "fswatch")) darktable.unmuted |= 
DT_DEBUG_FSWATCH; // fswatch module else if(!strcmp(argv[k+1], "input")) darktable.unmuted |= DT_DEBUG_INPUT; // input devices else if(!strcmp(argv[k+1], "camctl")) darktable.unmuted |= DT_DEBUG_CAMCTL; // camera control module else if(!strcmp(argv[k+1], "perf")) darktable.unmuted |= DT_DEBUG_PERF; // performance measurements else if(!strcmp(argv[k+1], "pwstorage")) darktable.unmuted |= DT_DEBUG_PWSTORAGE; // pwstorage module else if(!strcmp(argv[k+1], "opencl")) darktable.unmuted |= DT_DEBUG_OPENCL; // gpu accel via opencl else if(!strcmp(argv[k+1], "sql")) darktable.unmuted |= DT_DEBUG_SQL; // SQLite3 queries else if(!strcmp(argv[k+1], "memory")) darktable.unmuted |= DT_DEBUG_MEMORY; // some stats on mem usage now and then. else if(!strcmp(argv[k+1], "lighttable")) darktable.unmuted |= DT_DEBUG_LIGHTTABLE; // lighttable related stuff. else if(!strcmp(argv[k+1], "nan")) darktable.unmuted |= DT_DEBUG_NAN; // check for NANs when processing the pipe. else if(!strcmp(argv[k+1], "masks")) darktable.unmuted |= DT_DEBUG_MASKS; // masks related stuff. else if(!strcmp(argv[k+1], "lua")) darktable.unmuted |= DT_DEBUG_LUA; // lua errors are reported on console else return usage(argv[0]); k ++; } else if(argv[k][1] == 't' && argc > k+1) { darktable.num_openmp_threads = CLAMP(atol(argv[k+1]), 1, 100); printf("[dt_init] using %d threads for openmp parallel sections\n", darktable.num_openmp_threads); k ++; } else if(!strcmp(argv[k], "--conf")) { gchar *keyval = g_strdup(argv[++k]), *c = keyval; while(*c != '=' && c < keyval + strlen(keyval)) c++; if(*c == '=' && *(c+1) != '\0') { *c++ = '\0'; dt_conf_string_entry_t *entry = (dt_conf_string_entry_t*)g_malloc(sizeof(dt_conf_string_entry_t)); entry->key = g_strdup(keyval); entry->value = g_strdup(c); config_override = g_slist_append(config_override, entry); } g_free(keyval); } } #ifndef MAC_INTEGRATION else { images_to_load = g_slist_append(images_to_load, argv[k]); } #endif } if(darktable.unmuted & DT_DEBUG_MEMORY) { fprintf(stderr, "[memory] at startup\n"); dt_print_mem_usage(); } #ifdef _OPENMP omp_set_num_threads(darktable.num_openmp_threads); #endif dt_loc_init_datadir(datadir_from_command); dt_loc_init_plugindir(moduledir_from_command); if(dt_loc_init_tmp_dir(tmpdir_from_command)) { printf(_("ERROR : invalid temporary directory : %s\n"),darktable.tmpdir); return usage(argv[0]); } dt_loc_init_user_config_dir(configdir_from_command); dt_loc_init_user_cache_dir(cachedir_from_command); #if !GLIB_CHECK_VERSION(2, 35, 0) g_type_init(); #endif // does not work, as gtk is not inited yet. // even if it were, it's a super bad idea to invoke gtk stuff from // a signal handler. /* check cput caps */ // dt_check_cpu(argc,argv); #ifdef HAVE_GEGL char geglpath[DT_MAX_PATH_LEN]; char datadir[DT_MAX_PATH_LEN]; dt_loc_get_datadir(datadir, DT_MAX_PATH_LEN); snprintf(geglpath, DT_MAX_PATH_LEN, "%s/gegl:/usr/lib/gegl-0.0", datadir); (void)setenv("GEGL_PATH", geglpath, 1); gegl_init(&argc, &argv); #endif #ifdef USE_LUA dt_lua_init_early(NULL); #endif // thread-safe init: dt_exif_init(); char datadir[DT_MAX_PATH_LEN]; dt_loc_get_user_config_dir (datadir,DT_MAX_PATH_LEN); char filename[DT_MAX_PATH_LEN]; snprintf(filename, DT_MAX_PATH_LEN, "%s/darktablerc", datadir); // initialize the config backend. this needs to be done first... 
darktable.conf = (dt_conf_t *)malloc(sizeof(dt_conf_t)); memset(darktable.conf, 0, sizeof(dt_conf_t)); dt_conf_init(darktable.conf, filename, config_override); g_slist_free_full(config_override, g_free); // set the interface language const gchar* lang = dt_conf_get_string("ui_last/gui_language"); if(lang != NULL && lang[0] != '\0') { if(setlocale(LC_ALL, lang) != NULL) gtk_disable_setlocale(); } // initialize the database darktable.db = dt_database_init(dbfilename_from_command); if(darktable.db == NULL) { printf("ERROR : cannot open database\n"); return 1; } else if(!dt_database_get_lock_acquired(darktable.db)) { // send the images to the other instance via dbus if(images_to_load) { GSList *p = images_to_load; // get a connection! GDBusConnection *connection = g_bus_get_sync(G_BUS_TYPE_SESSION,NULL, NULL); while (p != NULL) { // make the filename absolute ... gchar *filename = dt_make_path_absolute((gchar*)p->data); if(filename == NULL) continue; // ... and send it to the running instance of darktable g_dbus_connection_call_sync(connection, "org.darktable.service", "/darktable", "org.darktable.service.Remote", "Open", g_variant_new ("(s)", filename), NULL, G_DBUS_CALL_FLAGS_NONE, -1, NULL, NULL); p = g_slist_next(p); g_free(filename); } g_slist_free(images_to_load); g_object_unref(connection); } return 1; } // Initialize the signal system darktable.signals = dt_control_signal_init(); // Initialize the filesystem watcher darktable.fswatch=dt_fswatch_new(); #ifdef HAVE_GPHOTO2 // Initialize the camera control darktable.camctl=dt_camctl_new(); #endif // get max lighttable thumbnail size: darktable.thumbnail_width = CLAMPS(dt_conf_get_int("plugins/lighttable/thumbnail_width"), 200, 3000); darktable.thumbnail_height = CLAMPS(dt_conf_get_int("plugins/lighttable/thumbnail_height"), 200, 3000); // and make sure it can be mip-mapped all the way from mip4 to mip0 darktable.thumbnail_width /= 16; darktable.thumbnail_width *= 16; darktable.thumbnail_height /= 16; darktable.thumbnail_height *= 16; // Initialize the password storage engine darktable.pwstorage=dt_pwstorage_new(); // FIXME: move there into dt_database_t dt_pthread_mutex_init(&(darktable.db_insert), NULL); dt_pthread_mutex_init(&(darktable.plugin_threadsafe), NULL); dt_pthread_mutex_init(&(darktable.capabilities_threadsafe), NULL); darktable.control = (dt_control_t *)malloc(sizeof(dt_control_t)); memset(darktable.control, 0, sizeof(dt_control_t)); if(init_gui) { dt_control_init(darktable.control); } else { if(dbfilename_from_command && !strcmp(dbfilename_from_command, ":memory:")) dt_gui_presets_init(); // init preset db schema. 
darktable.control->running = 0; darktable.control->accelerators = NULL; dt_pthread_mutex_init(&darktable.control->run_mutex, NULL); } // initialize collection query darktable.collection_listeners = NULL; darktable.collection = dt_collection_new(NULL); /* initialize selection */ darktable.selection = dt_selection_new(); /* capabilities set to NULL */ darktable.capabilities = NULL; #ifdef HAVE_GRAPHICSMAGICK /* GraphicsMagick init */ InitializeMagick(darktable.progname); #endif darktable.opencl = (dt_opencl_t *)malloc(sizeof(dt_opencl_t)); memset(darktable.opencl, 0, sizeof(dt_opencl_t)); #ifdef HAVE_OPENCL dt_opencl_init(darktable.opencl, argc, argv); #endif darktable.blendop = (dt_blendop_t *)malloc(sizeof(dt_blendop_t)); memset(darktable.blendop, 0, sizeof(dt_blendop_t)); dt_develop_blend_init(darktable.blendop); darktable.points = (dt_points_t *)malloc(sizeof(dt_points_t)); memset(darktable.points, 0, sizeof(dt_points_t)); dt_points_init(darktable.points, dt_get_num_threads()); // must come before mipmap_cache, because that one will need to access // image dimensions stored in here: darktable.image_cache = (dt_image_cache_t *)malloc(sizeof(dt_image_cache_t)); memset(darktable.image_cache, 0, sizeof(dt_image_cache_t)); dt_image_cache_init(darktable.image_cache); darktable.mipmap_cache = (dt_mipmap_cache_t *)malloc(sizeof(dt_mipmap_cache_t)); memset(darktable.mipmap_cache, 0, sizeof(dt_mipmap_cache_t)); dt_mipmap_cache_init(darktable.mipmap_cache); // The GUI must be initialized before the views, because the init() // functions of the views depend on darktable.control->accels_* to register // their keyboard accelerators if(init_gui) { darktable.gui = (dt_gui_gtk_t *)malloc(sizeof(dt_gui_gtk_t)); memset(darktable.gui,0,sizeof(dt_gui_gtk_t)); if(dt_gui_gtk_init(darktable.gui, argc, argv)) return 1; dt_bauhaus_init(); } else darktable.gui = NULL; darktable.view_manager = (dt_view_manager_t *)malloc(sizeof(dt_view_manager_t)); memset(darktable.view_manager, 0, sizeof(dt_view_manager_t)); dt_view_manager_init(darktable.view_manager); // load the darkroom mode plugins once: dt_iop_load_modules_so(); if(init_gui) { darktable.lib = (dt_lib_t *)malloc(sizeof(dt_lib_t)); memset(darktable.lib, 0, sizeof(dt_lib_t)); dt_lib_init(darktable.lib); dt_control_load_config(darktable.control); } darktable.imageio = (dt_imageio_t *)malloc(sizeof(dt_imageio_t)); memset(darktable.imageio, 0, sizeof(dt_imageio_t)); dt_imageio_init(darktable.imageio); if(init_gui) { // Loading the keybindings char keyfile[DT_MAX_PATH_LEN]; // First dump the default keymapping snprintf(keyfile, DT_MAX_PATH_LEN, "%s/keyboardrc_default", datadir); gtk_accel_map_save(keyfile); // Removing extraneous semi-colons from the default keymap strip_semicolons_from_keymap(keyfile); // Then load any modified keys if available snprintf(keyfile, DT_MAX_PATH_LEN, "%s/keyboardrc", datadir); if(g_file_test(keyfile, G_FILE_TEST_EXISTS)) gtk_accel_map_load(keyfile); else gtk_accel_map_save(keyfile); // Save the default keymap if none is present // I doubt that connecting to dbus for darktable-cli makes sense darktable.dbus = dt_dbus_init(); // initialize undo struct darktable.undo = dt_undo_init(); // load image(s) specified on cmdline int id = 0; if(images_to_load) { // If only one image is listed, attempt to load it in darkroom gboolean load_in_dr = (g_slist_next(images_to_load) == NULL); GSList *p = images_to_load; while (p != NULL) { // don't put these function calls into MAX(), the macro will evaluate // it twice (and happily deadlock, in this 
particular case) int newid = dt_load_from_string((gchar*)p->data, load_in_dr); id = MAX(id, newid); p = g_slist_next(p); } if (!load_in_dr || id == 0) dt_ctl_switch_mode_to(DT_LIBRARY); g_slist_free(images_to_load); } else dt_ctl_switch_mode_to(DT_LIBRARY); } if(darktable.unmuted & DT_DEBUG_MEMORY) { fprintf(stderr, "[memory] after successful startup\n"); dt_print_mem_usage(); } dt_image_local_copy_synch(); /* init lua last, since it's user made stuff it must be in the real environment */ #ifdef USE_LUA dt_lua_init(darktable.lua_state.state,init_gui); #endif return 0; }
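/* A minimal, GLib-free sketch of the "--conf key=value" splitting done in
 * dt_init() above: walk to the first '=', terminate the key there, and treat
 * the rest as the value, ignoring malformed input. split_keyval() is a
 * hypothetical helper for illustration; the real code stores the pair in a
 * dt_conf_string_entry_t using g_strdup. */
#include <stdio.h>
#include <string.h>

static int split_keyval(char *arg, char **key, char **value)
{
    char *eq = strchr(arg, '=');
    if (eq == NULL || eq[1] == '\0') return 0;   /* no '=' or empty value: ignore */
    *eq = '\0';                                  /* terminate the key in place */
    *key = arg;
    *value = eq + 1;
    return 1;
}

int main(void)
{
    char arg[] = "plugins/lighttable/thumbnail_width=1440";  /* writable copy, as g_strdup gives */
    char *key, *value;
    if (split_keyval(arg, &key, &value))
        printf("key=%s value=%s\n", key, value);
    return 0;
}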
int main(int argc, char * argv[]) { Descr qry_descr = { {0} }; Descr tgt_descr = { {0} }; clock_t CPU_time_begin, CPU_time_end; int retval, qry_done, tgt_done; int db_ctr, db_effective_ctr; int user_defined_name; FILE * qry_fptr = NULL, * tgt_fptr = NULL, * digest = NULL; // Score score; //int compare(Descr *descr1, Descr *descr2, Score * score); int compare(Descr *descr1, Descr *descr2, Score * score, Score * score_hung); int read_cmd_file(char *filename); if (argc < 3) { fprintf(stderr, "Usage: %s <db file> <qry file> [<parameter file>].\n", argv[0]); exit(1); } if (!(qry_fptr = efopen(argv[2], "r"))) return 1; if (!(tgt_fptr = efopen(argv[1], "r"))) return 1; /* set defaults: */ set_default_options(); /* change them with the cmd file, if the cmd file given */ if (argc == 4) { if (read_cmd_file(argv[3])) return 1; } /* read in the table of integral values */ /* the array int_table in struct_table.c */ if (read_integral_table(options.path)) { fprintf(stderr, "In data file %s.\n\n", options.path); exit(1); } set_up_exp_table(); user_defined_name = options.outname[0]; /*********************************/ /* loop over the query database :*/ qry_done = 0; retval = -1; db_effective_ctr = 0; CPU_time_begin = clock(); while (!qry_done) { retval = get_next_descr(qry_fptr, &qry_descr); if (retval == 1) { continue; } else if (retval == -1) { qry_done = 1; continue; } /* digest file for larger scale comparisons */ if (!digest) { if (!user_defined_name) { sprintf(options.outname, "%s.struct_out", qry_descr.name); } // ************ added by Mile // output name in postprocessing consists of query and target name retval = get_next_descr(tgt_fptr, &tgt_descr); if (options.postprocess) { sprintf(options.outname, "%s_%s.struct_out", qry_descr.name, tgt_descr.name); } // ************* end by Mile digest = efopen(options.outname, "w"); if (!digest) exit(1); if (options.print_header) { fprintf(digest, "%% columns: \n"); fprintf(digest, "%% query, target: structure names\n"); fprintf(digest, "%% geom_z: z score for the orientational match \n"); fprintf(digest, "%% <dL>: average length mismatch for matched SSEs \n"); fprintf(digest, "%% T: total score assigned to matched SSEs \n"); fprintf(digest, "%% frac: T divided by the number of matched SSEs \n"); fprintf(digest, "%% GC_rmsd: RMSD btw geometric centers of matched SSEs (before postprocessing) \n"); fprintf(digest, "%% A: (after postprocessing) the alignment score \n"); fprintf(digest, "%% aln_L: (after postprocessing) the alignment length \n\n"); fprintf(digest, "%% %6s%6s %6s %6s %6s %6s %6s %6s %6s %6s \n", "query ", "target ", "geom_z", "<dL>", " T ", "frac", "GC_rmsd", "rmsd ", "A ", "aln_L "); } } else { /* otherwise write to the same old digest file */ } /* loop over the database :*/ // Added by Mile - using FOR instead of WHILE - parallelization int tgt_counter = 0; int i; int *retval_array; Descr *tgt_descr_array; rewind(tgt_fptr); tgt_done = 0; /* * Counting number of successful targets */ while (!tgt_done) { retval = get_next_descr(tgt_fptr, &tgt_descr); if (retval == 0 || retval == 1) { tgt_counter++; } else if (retval == -1) { tgt_done = 1; } } /* * Initialization of a Descr array (array of targets) - easy parallelization */ rewind(tgt_fptr); tgt_descr_array = (Descr *) calloc(tgt_counter, sizeof(Descr)); if (tgt_descr_array == NULL) { printf("malloc return NULL!\n"); } retval_array = (int *) calloc(tgt_counter, sizeof(int)); if (retval_array == NULL) { printf("malloc return NULL!\n"); } /* * Storing targets a returning values */ for(i = 0; i < 
tgt_counter; ++i) { retval = get_next_descr(tgt_fptr, &tgt_descr_array[i]); retval_array[i] = retval; } // Added by Mile - end rewind(tgt_fptr); // tgt_done = 0; db_ctr = 0; db_effective_ctr = 0; if (!user_defined_name) CPU_time_begin = clock(); retval = -1; /* while ( ! tgt_done) { */ // Start of parallelization if (options.postprocess) omp_set_num_threads(1); else omp_set_num_threads(6); #pragma omp parallel // num_threads(1) { #pragma omp for for (i = 0; i < tgt_counter; ++i) { // Added by Mile int retval = retval_array[i]; /* * Two scores one for Smith Waterman, another for Hungarian in database search phase */ Score score; Score score_hung; Descr tgt_descr = tgt_descr_array[i]; /* printf("%s %d\n", tgt_descr.name, retval); */ // Descr qry_descr = qry_descr; #pragma omp atomic db_ctr++; // atomic /* retval = get_next_descr(tgt_fptr, &tgt_descr); */ if (retval == 1) { continue; } else if (retval == -1) { // tgt_done = 1; printf("Error!!!!\n"); exit(1); // added by Mile } else { /* min number of elements */ int helix_overlap = (qry_descr.no_of_helices < tgt_descr.no_of_helices) ? qry_descr.no_of_helices : tgt_descr.no_of_helices; int strand_overlap = (qry_descr.no_of_strands < tgt_descr.no_of_strands) ? qry_descr.no_of_strands : tgt_descr.no_of_strands; double fraction_assigned; int query_size = qry_descr.no_of_strands + qry_descr.no_of_helices; int target_size = tgt_descr.no_of_strands + tgt_descr.no_of_helices; if (helix_overlap + strand_overlap >= options.min_no_SSEs) { #pragma omp atomic db_effective_ctr++; // atomic /* here is the core of the operation: */ retval = compare(&tgt_descr, &qry_descr, &score, &score_hung); if (retval) { printf(" error comparing db:%s query:%s \n", tgt_descr.name, qry_descr.name); exit(retval); } /* * Output score. 
Can be based: * - only on SW alignment during the database search * - only on Hungarian algorithm during the database search * - on combination depending on the postprocessing score */ switch (options.score_out) { case 0: // SW if (query_size > target_size) { fraction_assigned = score.total_assigned_score / target_size; } else { fraction_assigned = score.total_assigned_score / query_size; } retval = print_score(digest, &qry_descr, &tgt_descr, &score, fraction_assigned, 1); break; case 1: // Hungarian if (query_size > target_size) { fraction_assigned = score_hung.total_assigned_score / target_size; } else { fraction_assigned = score_hung.total_assigned_score / query_size; } retval = print_score(digest, &qry_descr, &tgt_descr, &score_hung, fraction_assigned, 1); break; case 2: // either SW or Hungarian depends on score if (score.total_assigned_score > score_hung.total_assigned_score) { if (query_size > target_size) { fraction_assigned = score.total_assigned_score / target_size; } else { fraction_assigned = score.total_assigned_score / query_size; } retval = print_score(digest, &qry_descr, &tgt_descr, &score, fraction_assigned, 1); } else { if (query_size > target_size) { fraction_assigned = score_hung.total_assigned_score / target_size; } else { fraction_assigned = score_hung.total_assigned_score / query_size; } retval = print_score(digest, &qry_descr, &tgt_descr, &score_hung, fraction_assigned, 1); } break; } if (retval) { printf("error in printing to output file\n"); exit(retval); } } else if (options.report_no_sse_overlap) { retval = print_score(digest, &qry_descr, &tgt_descr, &score, fraction_assigned, 0); if (retval) { printf("error in printing to output file\n"); exit(retval); } } } /* if (options.postprocess) tgt_done = 1; // for now, we postprocess only one pair of structures (not structure against database) */ // if (options.postprocess) break; // added by Mile tricky but I think it should work even without it } } // Added by Mile // Memory cleaning for(i = 0; i < tgt_counter; ++i) { descr_shutdown ( &tgt_descr_array[i] ); } free(tgt_descr_array); free(retval_array); // End added by Mile if (!user_defined_name && db_effective_ctr) { CPU_time_end = clock(); fprintf(digest, "done CPU: %10.3lf s\n", (double) (CPU_time_end - CPU_time_begin) / CLOCKS_PER_SEC); fflush(digest); } if (!user_defined_name) { fclose(digest); digest = NULL; } /* otherwise we keep writing into the saem digest file */ if (options.postprocess) qry_done = 1; /* for now, we postprocess only one pair of structures (not structure against database) */ } if (digest) { CPU_time_end = clock(); fprintf(digest, "done CPU: %10.3lf s\n", (double) (CPU_time_end - CPU_time_begin) / CLOCKS_PER_SEC); fflush(digest); } if (options.verbose) { printf("\n\nlooked at %d db entries.\n", db_effective_ctr); printf("the output written to %s.\n\n", options.outname); } /**************************************************/ /* housekeeping, good for tracking memory leaks */ if (digest) fclose(digest); // map_consistence(0, 0, NULL, NULL, NULL, NULL, NULL); // compare(NULL, NULL, NULL); descr_shutdown(&qry_descr); descr_shutdown(&tgt_descr); fclose(qry_fptr); fclose(tgt_fptr); return 0; }
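/* A minimal sketch of the parallelisation pattern adopted above: the target
 * descriptors are first read sequentially into an array, so the database scan
 * becomes a plain index-based "omp for" with atomic counters. compare_stub()
 * and the counts are placeholders, not the real compare()/Descr types. */
#include <omp.h>
#include <stdio.h>

static int compare_stub(int i) { return i % 3 == 0; }   /* stand-in for compare() */

int main(void)
{
    const int tgt_counter = 1000;       /* number of pre-loaded targets */
    int db_effective_ctr = 0;

    #pragma omp parallel for
    for (int i = 0; i < tgt_counter; ++i) {
        if (compare_stub(i)) {
            #pragma omp atomic
            db_effective_ctr++;
        }
    }
    printf("looked at %d db entries.\n", db_effective_ctr);
    return 0;
}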
// // benchmarking program // int main(int argc, char **argv) { if( find_option( argc, argv, "-h" ) >= 0 ) { printf( "Options:\n" ); printf( "-h to see this help\n" ); printf( "-n <int> to set number of particles\n" ); printf( "-o <filename> to specify the output file name\n" ); printf( "-s <filename> to specify a summary file name\n" ); printf( "-no turns off all correctness checks and particle output\n"); printf( "-p <int> to set the (maximum) number of threads used\n"); return 0; } const int n = read_int( argc, argv, "-n", 1000 ); const bool fast = (find_option( argc, argv, "-no" ) != -1); const char *savename = read_string( argc, argv, "-o", NULL ); const char *sumname = read_string( argc, argv, "-s", NULL ); const int num_threads_override = read_int( argc, argv, "-p", 0); FILE *fsave = ((!fast) && savename) ? fopen( savename, "w" ) : NULL; FILE *fsum = sumname ? fopen ( sumname, "a" ) : NULL; const double size = set_size( n ); // We need to set the size of a grid square so that the average number of // particles per grid square is constant. The simulation already ensures // that the average number of particles in an arbitrary region is constant // and proportional to the area. So this is just a constant. const double grid_square_size = sqrt(0.0005) + 0.000001; const int num_grid_squares_per_side = size / grid_square_size; printf("Using %d grid squares of side-length %f for %d particles.\n", num_grid_squares_per_side*num_grid_squares_per_side, grid_square_size, n); std::unique_ptr<std::vector<particle_t> > particles = init_particles(n); if (num_threads_override > 0) { omp_set_dynamic(0); // fixed number of threads omp_set_num_threads(num_threads_override); // assign number of threads } // // simulate a number of time steps // double simulation_time = read_timer( ); int max_num_threads = omp_get_max_threads(); int num_actual_threads; // User-defined reductions aren't available in the version of OMP we're // using. Instead, we accumulate per-thread stats in this global array // and reduce manually when we're done. Stats per_thread_stats[max_num_threads]; // Shared across threads. std::unique_ptr<OmpThreadsafeGrid> old_grid(new OmpThreadsafeGrid(size, num_grid_squares_per_side)); std::unique_ptr<OmpThreadsafeGrid> next_grid(new OmpThreadsafeGrid(size, num_grid_squares_per_side)); #pragma omp parallel { #pragma omp atomic write num_actual_threads = omp_get_num_threads(); //get number of actual threads int thread_idx = omp_get_thread_num(); Stats thread_stats; for (int step = 0; step < 1000; step++) { // If this is the first step, we must initialize the grid here // without respecting cache locality. Since we cannot use the existing // grid, we have to just divide the particles arbitrarily. This // means that the subsequent code for simulating forces and movement // will have almost no cache locality on the first iteration: Each thread // has picked up an arbitrary subset of the particles to insert into the // grid, and then the threads are responsible for simulating a different, // mostly-disjoint subset of the particles. On subsequent iterations, // only the particles that have moved will cause cache misses, so we // should have much better locality. If we want to really optimize, // it may be worth rethinking how we store particles and communicate among // threads. But at that point we might as well write distributed-memory // code. 
if (step == 0) { #pragma omp for for (int i = 0; i < n; i++) { next_grid->add((*particles)[i]); } } // Here we are building the grid that maps locations to sets of // particles. This step does O(n) work, so it is a bottleneck if done // serially. For performance comparisons, we have two versions of the // grid-formation code. The second simply forms the grid serially, in a // single arbitrary thread. The first is parallel and attempts // some cache locality. Each thread is responsible for re-inserting // the grid elements that previously lay in its subgrid. For that reason // we need to keep around the old grid while we are building the new one; // this is why we have old_grid and next_grid. // NOTE: We could instead re-insert each particle right after moving it. // This would be faster, but it would require us to think about // simultaneous parallel delete and add, while the current scheme needs // only support parallel add. (Deleting the entire grid at once is an // O(1) operation, so we can do it in one thread with a barrier.) // (The actual simulation operations are read-only on the grid structure // and write to each particle only once, so we can simply use two // barriers to protect them. #pragma omp single { old_grid.swap(next_grid); next_grid.reset(new OmpThreadsafeGrid(size, num_grid_squares_per_side)); } // Now insert each particle into the new grid. { std::unique_ptr<SimpleIterator<particle_t&> > particles_to_insert = old_grid->subgrid(thread_idx, num_actual_threads); while (particles_to_insert->hasNext()) { particle_t& p = particles_to_insert->next(); next_grid->add(p); } } // Now we compute forces for particles. Each thread handles its assigned // subgrid. We first need a barrier to ensure that everyone sees all // the particles in next_grid. #pragma omp barrier { std::unique_ptr<SimpleIterator<particle_t&> > particles_to_force = next_grid->subgrid(thread_idx, num_actual_threads); while (particles_to_force->hasNext()) { particle_t& p = particles_to_force->next(); p.ax = p.ay = 0; std::unique_ptr<SimpleIterator<particle_t&> > neighbors = next_grid->neighbor_iterator(p); while (neighbors->hasNext()) { particle_t& neighbor = neighbors->next(); apply_force(p, neighbor, thread_stats); } } } // The barrier here ensures that no particle is moved before it is used // in apply_force above. #pragma omp barrier // Now we move each particle. std::unique_ptr<SimpleIterator<particle_t&> > particles_to_move = next_grid->subgrid(thread_idx, num_actual_threads); while (particles_to_move->hasNext()) { particle_t& p = particles_to_move->next(); move(p); } // This barrier is probably unnecessary unless save() is going to happen. #pragma omp barrier if (!fast) { // // save if necessary // #pragma omp master if( fsave && (step%SAVEFREQ) == 0 ) { save( fsave, n, (*particles).data() ); } } // This barrier is probably unnecessary unless save() happened. #pragma omp barrier } #pragma omp critical per_thread_stats[thread_idx] = thread_stats; } simulation_time = read_timer( ) - simulation_time; // Could do a tree reduce here, but it seems unnecessary. 
Stats overall_stats;
  for (int thread_idx = 0; thread_idx < max_num_threads; thread_idx++) {
    overall_stats.aggregate_left(per_thread_stats[thread_idx]);
  }

  printf( "n = %d, threads = %d, simulation time = %g seconds", n, num_actual_threads, simulation_time);

  if (!fast)
  {
    //
    // - the minimum distance absmin between 2 particles during the run of the simulation
    // - A correct simulation will have particles stay at greater than 0.4 (of cutoff) with typical values between .7-.8
    // - A simulation where particles don't interact correctly will be less than 0.4 (of cutoff) with typical values between .01-.05
    //
    // - The average distance absavg is ~.95 when most particles are interacting correctly and ~.66 when no particles are interacting
    //
    printf( ", absmin = %lf, absavg = %lf", overall_stats.min, overall_stats.avg);
    if (overall_stats.min < 0.4) printf ("\nThe minimum distance is below 0.4 meaning that some particle is not interacting");
    if (overall_stats.avg < 0.8) printf ("\nThe average distance is below 0.8 meaning that most particles are not interacting");
  }
  printf("\n");

  //
  // Printing summary data
  //
  if( fsum )
    fprintf(fsum, "%d %d %g\n", n, num_actual_threads, simulation_time);

  //
  // Clearing space
  //
  if( fsum ) fclose( fsum );
  if( fsave ) fclose( fsave );

  return 0;
}
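// A minimal sketch of the manual reduction described above ("user-defined
// reductions aren't available..."): each thread accumulates into a local
// Stats, stores it in a slot indexed by its thread id, and the slots are
// combined serially after the parallel region. This Stats struct is a
// simplified stand-in for the benchmark's real one.
#include <omp.h>
#include <algorithm>
#include <cstdio>
#include <vector>

struct Stats {
    double min = 1e9;
    double sum = 0.0;
    long   n   = 0;
    void aggregate_left(const Stats &o) {       // fold another thread's slot into this one
        min = std::min(min, o.min);
        sum += o.sum;
        n   += o.n;
    }
};

int main()
{
    std::vector<Stats> per_thread(omp_get_max_threads());

    #pragma omp parallel
    {
        Stats local;
        #pragma omp for
        for (int i = 1; i <= 100000; ++i) {
            double d = 1.0 / i;                 // stand-in for a pairwise distance
            local.min = std::min(local.min, d);
            local.sum += d;
            local.n++;
        }
        per_thread[omp_get_thread_num()] = local;
    }

    Stats overall;
    for (const Stats &s : per_thread) overall.aggregate_left(s);
    std::printf("min = %g, avg = %g\n", overall.min, overall.sum / overall.n);
    return 0;
}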
int main(int argc, char ** argv) { int my_ID; /* Thread ID */ int vector_length; /* length of vector loop containing the branch */ int nfunc; /* number of functions used in INS_HEAVY option */ int rank; /* matrix rank used in INS_HEAVY option */ double branch_time, /* timing parameters */ no_branch_time; double ops; /* double precision representation of integer ops */ int iterations; /* number of times the branch loop is carried out */ int i, iter, aux; /* dummies */ char *branch_type; /* string defining branching type */ int btype; /* integer encoding branching type */ int total=0, total_ref; /* computed and stored verification values */ int nthread_input; /* thread parameters */ int nthread; int num_error=0; /* flag that signals that requested and obtained numbers of threads are the same */ /********************************************************************************** ** process and test input parameters **********************************************************************************/ printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("OpenMP Branching Bonanza\n"); if (argc != 5){ printf("Usage: %s <# threads> <# iterations> <vector length>", *argv); printf("<branching type>\n"); printf("branching type: vector_go, vector_stop, no_vector, ins_heavy\n"); exit(EXIT_FAILURE); } nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); exit(EXIT_FAILURE); } omp_set_num_threads(nthread_input); iterations = atoi(*++argv); if (iterations < 1 || iterations%2==1){ printf("ERROR: Iterations must be positive and even : %d \n", iterations); exit(EXIT_FAILURE); } vector_length = atoi(*++argv); if (vector_length < 1){ printf("ERROR: loop length must be >= 1 : %d \n",vector_length); exit(EXIT_FAILURE); } branch_type = *++argv; if (!strcmp(branch_type,"vector_stop")) btype = VECTOR_STOP; else if (!strcmp(branch_type,"vector_go" )) btype = VECTOR_GO; else if (!strcmp(branch_type,"no_vector" )) btype = NO_VECTOR; else if (!strcmp(branch_type,"ins_heavy" )) btype = INS_HEAVY; else { printf("Wrong branch type: %s; choose vector_stop, vector_go, ", branch_type); printf("no_vector, or ins_heavy\n"); exit(EXIT_FAILURE); } #pragma omp parallel private(i, my_ID, iter, aux, nfunc, rank) reduction(+:total) { int * RESTRICT vector; int * RESTRICT index; int factor = -1; #pragma omp master { nthread = omp_get_num_threads(); if (nthread != nthread_input) { num_error = 1; printf("ERROR: number of requested threads %d does not equal ", nthread_input); printf("number of spawned threads %d\n", nthread); } else { printf("Number of threads = %d\n", nthread_input); printf("Vector length = %d\n", vector_length); printf("Number of iterations = %d\n", iterations); printf("Branching type = %s\n", branch_type); #if RESTRICT_KEYWORD printf("No aliasing = on\n"); #else printf("No aliasing = off\n"); #endif } } bail_out(num_error); my_ID = omp_get_thread_num(); vector = prk_malloc(vector_length*2*sizeof(int)); if (!vector) { printf("ERROR: Thread %d failed to allocate space for vector\n", my_ID); num_error = 1; } bail_out(num_error); /* grab the second half of vector to store index array */ index = vector + vector_length; /* initialize the array with entries with varying signs; array "index" is only used to obfuscate the compiler (i.e. it won't vectorize a loop containing indirect referencing). It functions as the identity operator. 
*/ for (i=0; i<vector_length; i++) { vector[i] = 3 - (i&7); index[i] = i; } #pragma omp barrier #pragma omp master { branch_time = wtime(); } /* do actual branching */ switch (btype) { case VECTOR_STOP: /* condition vector[index[i]]>0 inhibits vectorization */ for (iter=0; iter<iterations; iter+=2) { #pragma vector always for (i=0; i<vector_length; i++) { aux = -(3 - (i&7)); if (vector[index[i]]>0) vector[i] -= 2*vector[i]; else vector[i] -= 2*aux; } #pragma vector always for (i=0; i<vector_length; i++) { aux = (3 - (i&7)); if (vector[index[i]]>0) vector[i] -= 2*vector[i]; else vector[i] -= 2*aux; } } break; case VECTOR_GO: /* condition aux>0 allows vectorization */ for (iter=0; iter<iterations; iter+=2) { #pragma vector always for (i=0; i<vector_length; i++) { aux = -(3 - (i&7)); if (aux>0) vector[i] -= 2*vector[i]; else vector[i] -= 2*aux; } #pragma vector always for (i=0; i<vector_length; i++) { aux = (3 - (i&7)); if (aux>0) vector[i] -= 2*vector[i]; else vector[i] -= 2*aux; } } break; case NO_VECTOR: /* condition aux>0 allows vectorization, but indirect indexing inbibits it */ for (iter=0; iter<iterations; iter+=2) { #pragma vector always for (i=0; i<vector_length; i++) { aux = -(3 - (i&7)); if (aux>0) vector[i] -= 2*vector[index[i]]; else vector[i] -= 2*aux; } #pragma vector always for (i=0; i<vector_length; i++) { aux = (3 - (i&7)); if (aux>0) vector[i] -= 2*vector[index[i]]; else vector[i] -= 2*aux; } } break; case INS_HEAVY: fill_vec(vector, vector_length, iterations, WITH_BRANCHES, &nfunc, &rank); } #pragma omp master { branch_time = wtime() - branch_time; if (btype == INS_HEAVY) { printf("Number of matrix functions = %d\n", nfunc); printf("Matrix order = %d\n", rank); } } /* do the whole thing once more, but now without branches */ #pragma omp barrier #pragma omp master { no_branch_time = wtime(); } /* do actual branching */ switch (btype) { case VECTOR_STOP: case VECTOR_GO: for (iter=0; iter<iterations; iter+=2) { #pragma vector always for (i=0; i<vector_length; i++) { aux = -(3-(i&7)); vector[i] -= (vector[i] + aux); } for (i=0; i<vector_length; i++) { aux = (3-(i&7)); vector[i] -= (vector[i] + aux); } } break; case NO_VECTOR: for (iter=0; iter<iterations; iter+=2) { #pragma vector always for (i=0; i<vector_length; i++) { aux = -(3-(i&7)); vector[i] -= (vector[index[i]]+aux); } #pragma vector always for (i=0; i<vector_length; i++) { aux = (3-(i&7)); vector[i] -= (vector[index[i]]+aux); } } break; case INS_HEAVY: fill_vec(vector, vector_length, iterations, WITHOUT_BRANCHES, &nfunc, &rank); } #pragma omp master { no_branch_time = wtime() - no_branch_time; ops = (double)vector_length * (double)iterations * (double)nthread; if (btype == INS_HEAVY) ops *= rank*(rank*19 + 6); else ops *= 4; } for (total = 0, i=0; i<vector_length; i++) total += vector[i]; } /* end of OPENMP parallel region */ /* compute verification values */ total_ref = ((vector_length%8)*(vector_length%8-8) + vector_length)/2*nthread; if (total == total_ref) { printf("Solution validates\n"); printf("Rate (Mops/s) with branches: %lf time (s): %lf\n", ops/(branch_time*1.e6), branch_time); printf("Rate (Mops/s) without branches: %lf time (s): %lf\n", ops/(no_branch_time*1.e6), no_branch_time); #if VERBOSE printf("Array sum = %d, reference value = %d\n", total, total_ref); #endif } else { printf("ERROR: array sum = %d, reference value = %d\n", total, total_ref); } exit(EXIT_SUCCESS); }
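/* A minimal sketch (illustrative only) of the timing pattern used above: a
 * barrier so all threads enter the timed section together, then only the
 * master thread reads the clock before and after the work. omp_get_wtime()
 * stands in for the kernel's wtime() helper, and the inner loops are a
 * placeholder for the branching kernel. */
#include <omp.h>
#include <stdio.h>

int main(void)
{
    double elapsed = 0.0;
    long total = 0;

    #pragma omp parallel reduction(+:total)
    {
        #pragma omp barrier
        #pragma omp master
        elapsed = omp_get_wtime();

        for (int iter = 0; iter < 1000; iter++)       /* placeholder work loop */
            for (int i = 0; i < 10000; i++)
                total += (i & 7) - 3;

        #pragma omp barrier
        #pragma omp master
        elapsed = omp_get_wtime() - elapsed;
    }
    printf("total = %ld, time (s): %lf\n", total, elapsed);
    return 0;
}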
int main(int argc, char* argv[]) { bool visualize = true; int threads = 8; int config = 0; real gravity = -9.81; //acceleration due to gravity real timestep = .01; //step size real time_to_run = 1; //length of simulation real current_time = 0; int num_steps = time_to_run / timestep; int max_iteration = 15; int tolerance = 0; //========================================================================================================= // Create system //========================================================================================================= ChSystemParallel * system_gpu = new ChSystemParallel; //========================================================================================================= // Populate the system with bodies/constraints/forces/etc. //========================================================================================================= ChVector<> lpos(0, 0, 0); ChQuaternion<> quat(1, 0, 0, 0); real container_width = 5; //width of area with particles real container_length = 25; //length of area that roller will go over real container_thickness = .25; //thickness of container walls real container_height = 2; //height of the outer walls real particle_radius = .58; // Create a material (will be used by both objects) ChSharedPtr<ChMaterialSurface> material; material = ChSharedPtr<ChMaterialSurface>(new ChMaterialSurface); material->SetFriction(0.4); // Create a ball ChSharedBodyPtr ball = ChSharedBodyPtr(new ChBody(new ChCollisionModelParallel)); InitObject(ball, 1, // mass ChVector<>(0, 10, 0), // position ChQuaternion<>(1, 0, 0, 0), // rotation material, // material true, // collide? false, // static? -15, -15); // collision family ball->SetPos_dt(ChVector<>(0,0,10)); AddCollisionGeometry(ball, SPHERE, particle_radius, lpos, quat); FinalizeObject(ball, (ChSystemParallel *) system_gpu); // Create a bin for the ball to fall into ChSharedBodyPtr bin = ChSharedBodyPtr(new ChBody(new ChCollisionModelParallel)); InitObject(bin, 1, // mass ChVector<>(0, 0, 0), // position ChQuaternion<>(1, 0, 0, 0), // rotation material, // material true, // collide? true, // static? 
-20, -20); // collision family AddCollisionGeometry(bin, BOX, ChVector<>(container_width, container_thickness, container_length), lpos, quat); AddCollisionGeometry(bin, BOX, Vector(container_thickness, container_height, container_length), Vector(-container_width + container_thickness, container_height, 0), quat); AddCollisionGeometry(bin, BOX, Vector(container_thickness, container_height, container_length), Vector(container_width - container_thickness, container_height, 0), quat); AddCollisionGeometry(bin, BOX, Vector(container_width, container_height, container_thickness), Vector(0, container_height, -container_length + container_thickness), quat); AddCollisionGeometry(bin, BOX, Vector(container_width, container_height, container_thickness), Vector(0, container_height, container_length - container_thickness), quat); FinalizeObject(bin, (ChSystemParallel *) system_gpu); //========================================================================================================= // Edit system settings //========================================================================================================= system_gpu->SetIntegrationType(ChSystem::INT_ANITESCU); system_gpu->SetParallelThreadNumber(threads); system_gpu->SetMaxiter(max_iteration); system_gpu->SetIterLCPmaxItersSpeed(max_iteration); system_gpu->SetTol(1e-3); system_gpu->SetTolSpeeds(1e-3); system_gpu->Set_G_acc(ChVector<>(0, gravity, 0)); system_gpu->SetStep(timestep); ((ChLcpSolverParallel *) (system_gpu->GetLcpSolverSpeed()))->SetMaxIteration(max_iteration); ((ChLcpSolverParallel *) (system_gpu->GetLcpSolverSpeed()))->SetTolerance(0); ((ChLcpSolverParallel *) (system_gpu->GetLcpSolverSpeed()))->SetCompliance(0, 0, 0); ((ChLcpSolverParallel *) (system_gpu->GetLcpSolverSpeed()))->SetContactRecoverySpeed(300); ((ChLcpSolverParallel *) (system_gpu->GetLcpSolverSpeed()))->SetSolverType(ACCELERATED_PROJECTED_GRADIENT_DESCENT); ((ChCollisionSystemParallel *) (system_gpu->GetCollisionSystem()))->SetCollisionEnvelope(particle_radius * .05); ((ChCollisionSystemParallel *) (system_gpu->GetCollisionSystem()))->setBinsPerAxis(R3(10, 10, 10)); ((ChCollisionSystemParallel *) (system_gpu->GetCollisionSystem()))->setBodyPerBin(100, 50); omp_set_num_threads(threads); //========================================================================================================= // Enter the time loop and render the simulation //========================================================================================================= if (visualize) { ChOpenGLManager * window_manager = new ChOpenGLManager(); ChOpenGL openGLView(window_manager, system_gpu, 800, 600, 0, 0, "Test_Solvers"); openGLView.render_camera->camera_pos = Vector(0, 5, -20); openGLView.render_camera->look_at = Vector(0, 0, 0); openGLView.SetCustomCallback(RunTimeStep); openGLView.StartSpinning(window_manager); window_manager->CallGlutMainLoop(); } return 0; }
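// Illustrative geometry helper (plain structs, not the Chrono API used above): the
// five-wall container above is a floor plus four side walls whose centres sit one
// wall-thickness inside the outer extents, mirroring the AddCollisionGeometry
// placements in the code.
#include <vector>

struct WallBox { double cx, cy, cz; double hx, hy, hz; }; // centre + half-extents

std::vector<WallBox> container_walls(double width, double length,
                                     double height, double thickness) {
  std::vector<WallBox> walls;
  walls.push_back({0, 0, 0, width, thickness, length});                            // floor
  walls.push_back({-(width - thickness), height, 0, thickness, height, length});   // left wall
  walls.push_back({ (width - thickness), height, 0, thickness, height, length});   // right wall
  walls.push_back({0, height, -(length - thickness), width, height, thickness});   // near wall
  walls.push_back({0, height,  (length - thickness), width, height, thickness});   // far wall
  return walls;
}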
int main() {
    omp_set_num_threads(35);
    //run_vanilla_nolemma(400,0.05);
    //run_vanilla(800,0.05);
    run_sampler(100000, 0.03, 100);
}

int main(int argc, char** argv) { int tid; int i, j; float interface_u; float FR[2]; float FL[2]; float speed; int index; int N_thread = N/2; // share work printf("1D SW Eqn Solver\n"); // Set the initial condition for (i = 0; i < N; i++) { if (i < 0.5*N) { P[0][i] = 1.0; // Water Depth P[1][i] = 0.0; // Water Speed } else { P[0][i] = 0.1; P[1][i] = 0.0; } // Compute U vector U[0][i] = P[0][i]; // Depth = mass of fluid U[1][i] = P[0][i]*P[1][i]; // Momentum of fluid } omp_set_num_threads(2); // Create 2 threads for this #pragma omp parallel private(tid, i, j, FL, FR, speed,index) shared(U, P, U_new, N_thread) { tid = omp_get_thread_num(); printf("Thread %d up and running\n", tid); for (j = 0; j < NO_STEPS; j++) { // Compute U_new in all cells (except the ends) for (index = 0; index < N_thread; index++) { i = tid*N_thread + index; if ((i > 0) && (i < (N-1))) { // Left Flux first - the flux across the surface between i-1 and i // Rusanov Flux speed = sqrtf(0.5*G*(P[0][i-1]+P[0][i])); FL[0] = 0.5*(P[0][i-1]*P[1][i-1] + P[0][i]*P[1][i]) - speed*(U[0][i] - U[0][i-1]); FL[1] = 0.5*( (P[0][i-1]*P[1][i-1]*P[1][i-1] + 0.5*G*P[0][i-1]*P[0][i-1]) + (P[0][i]*P[1][i]*P[1][i] + 0.5*G*P[0][i]*P[0][i]) ) - speed*(U[1][i] - U[1][i-1]); // Right Flux next - the flux across the surface between i and i+1 // Rusanov Flux speed = sqrtf(0.5*G*(P[0][i+1]+P[0][i])); FR[0] = 0.5*(P[0][i]*P[1][i] + P[0][i+1]*P[1][i+1]) - speed*(U[0][i+1] - U[0][i]); FR[1] = 0.5*( (P[0][i]*P[1][i]*P[1][i] + 0.5*G*P[0][i]*P[0][i]) + (P[0][i+1]*P[1][i+1]*P[1][i+1] + 0.5*G*P[0][i+1]*P[0][i+1]) ) - speed*(U[1][i+1] - U[1][i]); // Now, compute the new U value U_new[0][i] = U[0][i] - (DT/DX)*(FR[0]-FL[0]); U_new[1][i] = U[1][i] - (DT/DX)*(FR[1]-FL[1]); } // We cannot update P, yet. Next loop. } #pragma omp barrier // Update U and P now for (index = 0; index < N_thread; index++) { i = tid*N_thread + index; if ( (i > 0) && (i < (N-1)) ) { U[0][i] = U_new[0][i]; U[1][i] = U_new[1][i]; P[0][i] = U[0][i]; P[1][i] = U[1][i]/U[0][i]; } } #pragma omp barrier if (tid == 0) { // Correct ends using reflective conditions P[0][0] = P[0][1]; P[1][0] = -P[1][1]; P[0][N-1] = P[0][N-2]; P[1][N-1] = -P[1][N-2]; } #pragma omp barrier } } // end parallel section // Save the data Save_Results(); return 0; }
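// A minimal sketch of the Rusanov flux evaluated in the loops above, written for a
// single interface between a left and right cell. h is depth, u is velocity, g is
// gravity; the wave-speed estimate and the dissipation term mirror the expressions
// in the code (speed = sqrt(0.5*g*(h_L+h_R)), dissipation = speed*(U_R - U_L)).
// SWState/SWFlux are illustrative names, not from the solver above.
#include <cmath>

struct SWState { double h, u; };   // primitive variables: depth, velocity
struct SWFlux  { double f0, f1; }; // flux of the conserved pair (h, h*u)

SWFlux rusanov_flux(SWState L, SWState R, double g) {
  double s   = std::sqrt(0.5 * g * (L.h + R.h));                       // wave-speed estimate
  double fL0 = L.h * L.u, fL1 = L.h * L.u * L.u + 0.5 * g * L.h * L.h;
  double fR0 = R.h * R.u, fR1 = R.h * R.u * R.u + 0.5 * g * R.h * R.h;
  return { 0.5 * (fL0 + fR0) - s * (R.h - L.h),
           0.5 * (fL1 + fR1) - s * (R.h * R.u - L.h * L.u) };
}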
void FishModel::SetNumThreads(size_t n) { omp_set_num_threads(static_cast<int>(n)); /* OpenMP expects an int */ }
int main (int argc, char *argv[]) { void inidat(); float ***array; /* array for grid */ int taskid, /* this task's unique id */ numtasks, /* number of tasks */ averow,rows,offset,extra, /* for sending rows of data */ dest, source, /* to - from for message send-receive */ left,right, /* neighbor tasks */ msgtype, /* for message types */ rc,start,end, /* misc */ i,x,y,z,it,size,t_sqrt; /* loop variables */ MPI_Status status; MPI_Datatype dt,dt2; MPI_Request req, req2,req3,req4,req5; double t1,t2; /* First, find out my taskid and how many tasks are running */ MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&numtasks); MPI_Comm_rank(MPI_COMM_WORLD,&taskid); /*Set number of threads */ omp_set_num_threads(atoi(argv[1])); // Use n threads for all consecutive parallel regions omp_set_nested(1); if (taskid == 0) { //printf("Grid size: X= %d Y= %d Time steps= %d\n",NXPROB,NYPROB,STEPS); t1 = MPI_Wtime(); } i = 0; while(i*i < (NXPROB*NYPROB)/numtasks) i++; size = i; i = 0; while(i*i<numtasks) i++; t_sqrt = i; MPI_Type_contiguous(size+2,MPI_FLOAT, &dt); MPI_Type_commit(&dt); MPI_Type_vector(size+2,1,size+2,MPI_FLOAT,&dt2); MPI_Type_commit(&dt2); array = malloc(2*sizeof(float**)); for (i = 0;i<2;i++){ array[i] = malloc((2+size)*sizeof(float*)); array[i][0] = malloc(((2+size)*(2+size))*sizeof(float)); for (x = 1;x<2+size;x++){ array[i][x] = &(array[i][0][x*(2+size)]); } } for (z=0; z<2; z++){ for (x=0; x<2+size; x++){ for (y=0; y<2+size; y++){ array[z][x][y] = 0.0; } } } z = 0; inidat(NXPROB,NYPROB,array[z],size*(taskid/t_sqrt),size*(taskid%t_sqrt),size); for (i = 1; i <= STEPS; i++) { if (taskid/t_sqrt != 0) //not first row { MPI_Isend(array[z][1],1,dt,taskid-t_sqrt,100, MPI_COMM_WORLD, &req); MPI_Irecv(array[z][0],1,dt,taskid-t_sqrt,100, MPI_COMM_WORLD, &req2); } if (taskid/t_sqrt != t_sqrt-1) //not last row { MPI_Isend(array[z][size],1,dt,taskid+t_sqrt,100, MPI_COMM_WORLD, &req); MPI_Irecv(array[z][size+1],1,dt,taskid+t_sqrt,100, MPI_COMM_WORLD, &req3); } if(taskid%t_sqrt != 0) //not last column { MPI_Isend(&array[z][0][1],1,dt2,taskid-1,100, MPI_COMM_WORLD, &req); MPI_Irecv(&array[z][0][0],1,dt2,taskid-1,100, MPI_COMM_WORLD, &req4); } if(taskid%t_sqrt != t_sqrt-1) //not last column { MPI_Isend(&array[z][0][size],1,dt2,taskid+1,100, MPI_COMM_WORLD, &req); MPI_Irecv(&array[z][0][size+1],1,dt2,taskid+1,100, MPI_COMM_WORLD, &req5); } inner_update(size,array[z],array[1-z]); if (taskid/t_sqrt != 0) MPI_Wait(&req2,&status); if (taskid/t_sqrt != t_sqrt-1) MPI_Wait(&req3,&status); if(taskid%t_sqrt != 0) MPI_Wait(&req4,&status); if(taskid%t_sqrt != t_sqrt-1) MPI_Wait(&req5,&status); outer_update(size,taskid,t_sqrt,array[z],array[1-z]); z = 1-z; } if (taskid == 0){ t2 = MPI_Wtime(); printf("MPI_Wtime measured: %1.2f\n", t2-t1); } for (i = 0;i<2;i++){ free(array[i][0]); free(array[i]); } free(array); MPI_Type_free(&dt); MPI_Type_free(&dt2); MPI_Finalize(); }
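// Sketch of the column halo type built above with MPI_Type_vector: on an
// (n+2) x (n+2) row-major block, one column is n+2 floats separated by a stride of
// n+2 elements. make_column_type is an illustrative helper, not part of the code above.
#include <mpi.h>

MPI_Datatype make_column_type(int n) {
  MPI_Datatype col;
  MPI_Type_vector(n + 2, /*blocklength=*/1, /*stride=*/n + 2, MPI_FLOAT, &col);
  MPI_Type_commit(&col);
  return col;   // caller releases it with MPI_Type_free when done
}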
int main(int argc, char *argv[]) { //////////////////////////***Definitions***//////////////////////////////////////////////////////////////////// int nthreads=16,chunk=CHUNKSIZE; /*Input args/files */ FILE *fp1, /*Spectroscopic Galaxy File */ *fp2; /*Imaging Galaxy File */ char *Gxy_Spectro, *Gxy_Imaging; int N_Bins; /*Number of log bins */ double Start_Bin, /*Location of the edge of the smallest bin */ Max_Separation, /*Maximum rp Separation */ log_Bin_Size, /*Rp Bin Size in log*/ Minimum_Redshift=1000.0, /*Used to calculated maximum serapartion to filter pairs.*/ Maximum_Redshift=0; int Normalization_Choice; /*Which normalization should be used for the imaging catalogue 1= Di 2=Ri */ /* Spectroscopic Galaxy/Randoms Information */ int Spectro_Size=1E5; /*This is the assumed length of the galaxy file */ double *RA_s, /* Given */ *Dec_s, /* Given */ *Redshift_s, /*Given */ *Weight_s, /*The Fiber Collision or Completeness Weight of The Galaxy/Randoms */ *Distance_s; double *X_s,*Y_s,*Z_s; /*The cartesian elements to calculate cos_Theta*/ double area_tot=4*PI; // fprintf(stderr,"ASSUMMING SPHERE GEOMETRY FOR NORMALIZATION CHOICE!!!!!!!!!!!!!\n"); /* Imaging Galaxy/Randoms Information */ int Imaging_Size=4E5; /*This is the assumed length of the imaging file */ double *RA_i, /* Given */ *Dec_i; /* Given */ double *X_i,*Y_i,*Z_i; /* Wp calculation information */ double *DD, /*This is not an int because the counts will be weights. It is the shape Nbins X NJackknife */ Maximum_Dec_Separation, /*Filter by this dec difference */ Distance_to_Near_Z=1646., /*distance to inner redshift bin */ Distance_to_Far_Z=0., /*distance to inner redshift bin */ cos_Theta, rp; int bin; /*Random Counters and Such */ int i=0,j=0,k=0; int Ngal_s=0; /*Number of Galaxies/Randoms in the Spectro Sample */ int Ngal_i=0; /*Number of Galaxies/Randoms in the Imagin Sample */ /* void gridlink1D(int np,double rmin,double rmax,double rcell,double *z,int *ngrid,int **gridinit,int **gridlist); */ void gridlink1D_with_struct(int np,double dmin,double dmax,double rcell,double *x1,double *y1,double *z1,double *dec,int *ngrid,cellarray **lattice); struct timeval t0,t1; int nitems,nread; char buffer[MAXBUFSIZE]; /*Read in Args */ Gxy_Spectro=argv[1]; Gxy_Imaging=argv[2]; sscanf(argv[3],"%lf",&Start_Bin); sscanf(argv[4],"%lf",&Max_Separation); sscanf(argv[5],"%d",&N_Bins); sscanf(argv[6],"%d",&Normalization_Choice); if(argc > 6) sscanf(argv[7],"%lf",&area_tot) ; log_Bin_Size=(log10(Max_Separation)-log10(Start_Bin))/(N_Bins); //log_Bin_Size=(log10(Max_Separation)-log10(Start_Bin))/(N_Bins-1.); fprintf(stderr,"BOSS Wp > Log Bin size = %lf \n",log_Bin_Size); //////////////////////////////*Allocate the Arrays that are going to be used *////////////////////////////////////////////// /* #ifdef USE_BINLOOKUP */ /* int *binlookup=NULL; */ /* const int NBINLOOKUP=5e4; */ /* binlookup = my_calloc(sizeof(*binlookup),NBINLOOKUP+2); */ /* #ifdef AVOID_SQRT */ /* setup_squared_bin_lookup(sdss_data_file,&rmin,&rmax,&nbin,NBINLOOKUP,&rupp,binlookup); */ /* binfac=NBINLOOKUP/(rmax*rmax); */ /* #else */ /* setup_bin_lookup(sdss_data_file,&rmin,&rmax,&nbin,NBINLOOKUP,&rupp,binlookup); */ /* binfac=NBINLOOKUP/rmax; */ /* #endif */ /* #endif */ /*Spectro Arrays*/ //Variables in the file RA_s = my_calloc(sizeof(*RA_s),Spectro_Size); Dec_s = my_calloc(sizeof(*Dec_s),Spectro_Size); Redshift_s = my_calloc(sizeof(*Redshift_s),Spectro_Size); Weight_s = my_calloc(sizeof(*Weight_s),Spectro_Size); /////////////////////////////* [ READ IN THE GALAXY FILES AND 
CONVERT REDSHIFTS TO MPC ] *//////////////////////////////////// /*Read in Spectro Sample*/ gettimeofday(&t0,NULL); fp1 = my_fopen(Gxy_Spectro,"r") ; i=0; int flag=0,trash_d; nitems=5; /* while(fscanf(fp1,"%lf %lf %lf %lf %d",&RA_s[i],&Dec_s[i],&Redshift_s[i],&Weight_s[i],&Sector_s[i])!=EOF) { */ while(fgets(buffer,MAXBUFSIZE,fp1)!=NULL) { nread=sscanf(buffer,"%lf %lf %lf %lf %d",&RA_s[i],&Dec_s[i],&Redshift_s[i],&Weight_s[i],&trash_d); if (nread == nitems) { if(Redshift_s[i] > 10.0) { Redshift_s[i]/=SPEED_OF_LIGHT; flag=1; } if(Redshift_s[i] < 0) { fprintf(stderr,"BOSS Wp > Warning! Redshift = %lf, NR = %d. Setting to nearly 0.\n",Redshift_s[i],i); Redshift_s[i]=0.00001; } i++; if(i==Spectro_Size) { fprintf(stderr,"Increasing memory allocation for the spectroscopic sample\n"); Spectro_Size *= MEMORY_INCREASE_FAC; RA_s = my_realloc(RA_s,sizeof(*RA_s),Spectro_Size,"RA_s"); Dec_s = my_realloc(Dec_s,sizeof(*Dec_s),Spectro_Size,"Dec_s"); Redshift_s = my_realloc(Redshift_s,sizeof(*Redshift_s),Spectro_Size,"Redshift_s"); Weight_s = my_realloc(Weight_s,sizeof(*Weight_s),Spectro_Size,"Weight_s"); } } else { fprintf(stderr,"WARNING: In spectroscopic sample line %d did not contain %d elements...skipping line\n",i,nitems); } } Ngal_s=i; fclose(fp1); gettimeofday(&t1,NULL); if(flag!=0) fprintf(stderr,"BOSS Wp > Warning! You gave me cz instead of redshift!\n"); //Derived variables Distance_s = my_calloc(sizeof(*Distance_s),Ngal_s); X_s = my_calloc(sizeof(*X_s),Ngal_s); Y_s = my_calloc(sizeof(*Y_s),Ngal_s); Z_s = my_calloc(sizeof(*Z_s),Ngal_s); if(Ngal_s >= Spectro_Size) { fprintf(stderr,"BOSS Wp > Something Terrible Has Happened: SPECTROSCOPIC FILE TOO LONG!!!"); return EXIT_FAILURE; } fprintf(stderr,"BOSS Wp > There are %d Galaxies in the Spectro Sample. Time taken = %6.2lf sec\n",Ngal_s,ADD_DIFF_TIME(t0,t1)); /*Convert Redshift to Comoving Distance in MPC */ /* Here I am using Simpsons' Numerical Integration Rule To * convert the redshift of the galaxy into Megaparsecs. * The details of the integrals I am using is obviously * in Hogg's Distance Measures in Cosmology and you can * wikipedia Simpsons' Rule. I am assuming WMAP7 Cosmology * throughout. You can adjust all those parameters in the header. * I'm including an extra parameter (the equation of state of dark energy) * because I felt like it. 
*/ double mean_distance=0; /*GSL Numerical Integration Crap */ gsl_integration_workspace * w = gsl_integration_workspace_alloc (1000); double result, error,redshift_gsl; gsl_function F; F.function = &f; F.params = &redshift_gsl; for(i=0;i<Ngal_s;i++) { gsl_integration_qags (&F, 0, Redshift_s[i], 0, 1e-7, 1000, w, &result, &error); Distance_s[i]=result; if(Redshift_s[i] < Minimum_Redshift) { Distance_to_Near_Z=Distance_s[i]; Minimum_Redshift=Redshift_s[i]; } if(Redshift_s[i] > Maximum_Redshift){ Distance_to_Far_Z=Distance_s[i]; Maximum_Redshift=Redshift_s[i]; } mean_distance+=Distance_s[i]; } gsl_integration_workspace_free(w); fprintf(stderr,"BOSS Wp > Mean Distance = %lf\n",mean_distance/Ngal_s); fprintf(stderr,"BOSS Wp > The Distance to the closest redshift is %lf\n",Distance_to_Near_Z); fprintf(stderr,"BOSS Wp > The Distance to the furthest redshift %lf is %lf\n",Maximum_Redshift,Distance_to_Far_Z); double dist_range=(Distance_to_Far_Z - Distance_to_Near_Z); double Volume1=4./3.*PI*pow(Distance_to_Far_Z,3); double Volume2=4./3.*PI*pow(Distance_to_Near_Z,3); double percentage_area=area_tot/(4.*PI); double Volume=(Volume1-Volume2)*percentage_area; fprintf(stderr,"BOSS Wp > Spherical Volume =%lf\n",Volume); fprintf(stderr,"BOSS Wp > Number Density of Spectro Gal =%17.16f\n",Ngal_s/Volume); // fprintf(stderr,"The Maximum Separation you decided is %lf\n",Max_Separation); Maximum_Dec_Separation=asin(Max_Separation/(2*Distance_to_Near_Z))*2.*RAD_TO_DEG*1.00002; //The maximum separation that can happen and let's multiply it by 20% more fprintf(stderr,"BOSS Wp > Maximum Dec Separation is %lf\n",Maximum_Dec_Separation); /*Read in Imaging File*/ /*Imaging Arrays */ RA_i = my_calloc(sizeof(*RA_i),Imaging_Size); Dec_i = my_calloc(sizeof(*Dec_i),Imaging_Size); nitems=3; gettimeofday(&t0,NULL); fp2=my_fopen(Gxy_Imaging,"r") ; i=0; while(fgets(buffer,MAXBUFSIZE,fp2)!=NULL) { nread = sscanf(buffer,"%lf %lf %d",&RA_i[i],&Dec_i[i],&trash_d); if(nread == nitems) { i++; if(i==Imaging_Size) { fprintf(stderr,"Increasing memory allocation for the imaging sample\n"); Imaging_Size *= MEMORY_INCREASE_FAC; RA_i = my_realloc(RA_i,sizeof(*RA_i),Imaging_Size,"RA_i"); Dec_i = my_realloc(Dec_i,sizeof(*Dec_i),Imaging_Size,"Dec_i"); } } else { fprintf(stderr,"WARNING: line %d did not contain %d elements - skipping\n",i,nitems); } } fclose(fp2); gettimeofday(&t1,NULL); Ngal_i=i; if(Ngal_i >= Imaging_Size) { fprintf(stderr,"BOSS Wp > Something Terrible Has Happened: IMAGING FILE TOO LONG!!!\n"); return EXIT_FAILURE; } X_i = my_calloc(sizeof(*X_i),Ngal_i); Y_i = my_calloc(sizeof(*Y_i),Ngal_i); Z_i = my_calloc(sizeof(*Z_i),Ngal_i); fprintf(stderr,"BOSS Wp > There are %d Galaxies in the Imaging Sample. Time taken = %6.2lf sec\n",Ngal_i,ADD_DIFF_TIME(t0,t1)); for(i=0;i<Ngal_s;i++) { X_s[i]=sin((90-Dec_s[i]) * DEG_TO_RAD)*cos(RA_s[i] * DEG_TO_RAD) ; Y_s[i]=sin((90-Dec_s[i]) * DEG_TO_RAD)*sin(RA_s[i] * DEG_TO_RAD) ; Z_s[i]=cos((90-Dec_s[i]) * DEG_TO_RAD) ; } for(i=0;i<Ngal_i;i++){ X_i[i]=sin((90-Dec_i[i]) * DEG_TO_RAD)*cos(RA_i[i] * DEG_TO_RAD) ; Y_i[i]=sin((90-Dec_i[i]) * DEG_TO_RAD)*sin(RA_i[i] * DEG_TO_RAD) ; Z_i[i]=cos((90-Dec_i[i]) * DEG_TO_RAD) ; } /* *This is where the jackknife call is going to go. *It's going to take the map file,the number of jackknife samples and the observed sectors in the same order as the observed galaxies. *It will return the vector of jackknife ID's in the same order the sector list was given to it. *The jackknife ID corresponds to the *one* jackknife sample that galaxy doesn't belong in. 
*/ double number_density_of_imaging=Ngal_i/area_tot; double distance_squared=0.0,Normalization=0.0; if(Normalization_Choice==1) { for(i=0;i<Ngal_s;i++) { Normalization+=Weight_s[i]; } } else { for(i=0;i<Ngal_s;i++){ distance_squared+=1./SQR(Distance_s[i]); Normalization+=number_density_of_imaging*Weight_s[i]*1./SQR(Distance_s[i]); } // Normalization=number_density_of_imaging*1.204988; fprintf(stderr,"Distance Squared = %lf,Normalization =%lf\n",distance_squared,Normalization); } //gridlink the spectroscopic sample /*---Gridlink-variables----------------*/ int ngrid;/* *gridinit1D,*gridlist1D ; */ double dmin=-90,dmax=90.0;//min/max dec double inv_dmax_diff = 1.0/(dmax-dmin); cellarray *lattice; ngrid=0 ; /* gridlink1D(Ngal_i,dmin,dmax,Max_Separation,Dec_i,&ngrid,&gridinit1D,&gridlist1D) ; */ gridlink1D_with_struct(Ngal_i,dmin,dmax,Maximum_Dec_Separation,X_i,Y_i,Z_i,Dec_i,&ngrid,&lattice); fprintf(stderr,"gridlink1D done. ngrid= %d\n",ngrid) ; ////////////////////////////////////****Calculation of Wp****///////////////////////////////////////////////////////////////////////// // double rp_sqr=0.0; double max_sep_sqr = Max_Separation*Max_Separation; double start_bin_sqr = Start_Bin*Start_Bin; double inv_start_bin_sqr = 1.0/start_bin_sqr; double inv_log_bin_size = 1.0/log_Bin_Size; /* int icen,icell; */ /* double *x1,*y1,*z1,*dec; */ /* int *imaging; */ cellarray *cellstruct __attribute__((aligned(ALIGNMENT))); int xx=0; for(i=0;i<ngrid;i++) xx+= lattice[i].nelements; if(xx!=Ngal_i) { fprintf(stderr,"ERROR: xx=%d is not equal to Ngal_i=%d\n",xx,Ngal_i); exit(EXIT_FAILURE); } /*Wp Measurement Arrays */ DD = my_calloc(sizeof(*DD),N_Bins); double DD_threads[N_Bins][nthreads]; for(i=0;i<N_Bins;i++) { for(j=0;j<nthreads;j++) { DD_threads[i][j]=0.0; } } /* int ispectro=0,ii=0,p; */ gettimeofday(&t0,NULL); omp_set_num_threads(nthreads); int counter=0; int interrupted=0; init_my_progressbar(Ngal_s,&interrupted); /* #pragma omp parallel shared(Dec_s,Weight_s,X_s,Y_s,Z_s,chunk) private(cos_Theta,ispectro,icen,icell,rp_sqr,bin,x1,y1,z1,imaging,cellstruct) */ #pragma omp parallel default(none) shared(interrupted,stderr,counter,Ngal_s,Dec_s,Weight_s,X_s,Y_s,Z_s,chunk,ngrid,dmin,inv_dmax_diff,Maximum_Dec_Separation,Distance_s,inv_start_bin_sqr,max_sep_sqr,inv_log_bin_size,start_bin_sqr,DD_threads,lattice) { int tid = omp_get_thread_num(); #pragma omp for schedule(dynamic,chunk) for(int ispectro=0;ispectro<Ngal_s;ispectro++) { #pragma omp atomic counter++; if(tid==0){ my_progressbar(counter,&interrupted); } int icen = (int)(ngrid*(Dec_s[ispectro]-dmin)*inv_dmax_diff); if(icen<0) icen++; if(icen>=ngrid) icen = icen--; assert(icen >=0 && icen < ngrid && "icen needs to be in [0, ngrid)"); for(int ii=-BIN_REFINE_FACTOR;ii<=BIN_REFINE_FACTOR;ii++) { int icell = icen + ii ; /* for(icell=0;icell<ngrid;icell++) { */ // This makes no difference in the output - so the logic is correct if(icell>=0 && icell<ngrid) { /*---Loop-over-particles-in-each-cell-----------------*/ cellarray *cellstruct=&(lattice[icell]); double *x1 = cellstruct->x; double *y1 = cellstruct->y; double *z1 = cellstruct->z; double *dec = cellstruct->dec; int *imaging = cellstruct->index; for(int p=0;p<cellstruct->nelements;p++) { if(fabs(Dec_s[ispectro]-dec[p]) <= Maximum_Dec_Separation) { double cos_Theta=X_s[ispectro] * x1[p] + Y_s[ispectro] * y1[p] + Z_s[ispectro] * z1[p]; /* rp_sqr=4.0*Distance_s[ispectro]*Distance_s[ispectro]*(1.0 - cos_Theta)*0.5; /\* sin(arccos x) = sqrt(1-x^2) *\/ */ double 
rp_sqr=2.0*Distance_s[ispectro]*Distance_s[ispectro]*(1.0 - cos_Theta); /* sin(arccos x) = sqrt(1-x^2) */ if(rp_sqr < max_sep_sqr && rp_sqr >= start_bin_sqr) { int bin=(int)floor((0.5*log10(rp_sqr*inv_start_bin_sqr))*inv_log_bin_size); // bin=(int)floor((0.5*log10(rp_sqr*inv_start_bin_sqr))*inv_log_bin_size)-1; /* bin=(int)floor((log10(sqrt(rp_sqr)/Start_Bin))/log_Bin_Size); */ DD_threads[bin][tid]+=Weight_s[ispectro]; //Put the Count in the Keeping Track Bin// } } } } } } } finish_myprogressbar(&interrupted); for(i=0;i<N_Bins;i++) { for(j=0;j<nthreads;j++){ DD[i]+=DD_threads[i][j]; } } gettimeofday(&t1,NULL); fprintf(stderr,"Double loop time in main -> %6.2lf sec \n",ADD_DIFF_TIME(t0,t1)); /* #ifndef USE_AVX */ /* for(p=0;p<cellstruct->nelements;p++) { */ /* if(fabs(Dec_s[ispectro]-dec[p]) <= Maximum_Dec_Separation) { */ /* cos_Theta=X_s[ispectro] * x1[p] + Y_s[ispectro] * y1[p] + Z_s[ispectro] * z1[p]; */ /* rp_sqr=4.0*Distance_s[ispectro]*Distance_s[ispectro]*(1.0 - cos_Theta)*0.5; /\* sin(arccos x) = sqrt(1-x^2) *\/ */ /* if(rp_sqr < max_sep_sqr && rp_sqr >= start_bin_sqr) { */ /* bin=(int)floor((0.5*log10(rp_sqr*inv_start_bin_sqr))*inv_log_bin_size); */ /* /\* bin=(int)floor((log10(sqrt(rp_sqr)/Start_Bin))/log_Bin_Size); *\/ */ /* DD[bin][0]+=Weight_s[ispectro]; //Put the Count in the Keeping Track Bin// */ /* DD[bin][Jackknife_s[ispectro]+1]+=Weight_s[ispectro]; */ /* if(Jackknife_i[imaging[p]]!=Jackknife_s[ispectro]){ */ /* DD[bin][Jackknife_i[imaging[p]]+1]+=Weight_s[ispectro]; */ /* } */ /* } */ /* } */ /* } */ /* #else */ /* double dec_separation[NVECD]; */ /* double rp_sqr_array[NVECD],cos_theta_array[NVECD]; */ /* for(p=0;(p+NVECD)<cellstruct->nelements;p+=NVECD) { */ /* #pragma vector always */ /* for(int j=0;j<NVECD;j++) { */ /* dec_separation[j] = fabs(Dec_s[ispectro]-dec[p]); */ /* cos_theta_array[j] = X_s[ispectro] * x1[p+j] + Y_s[ispectro] * y1[p+j] + Z_s[ispectro] * z1[p+j]; */ /* rp_sqr_array[j] = 4.0*Distance_s[ispectro]*Distance_s[ispectro]*(1.0 - cos_theta_array[j])*0.5; /\* sin(arccos x) = sqrt(1-x^2) *\/ */ /* } */ /* #pragma novector */ /* for(int j=0;j<NVECD;j++) { */ /* rp_sqr = rp_sqr_array[j]; */ /* if(dec_separation[j] <= Maximum_Dec_Separation) { */ /* if(rp_sqr < max_sep_sqr && rp_sqr >= start_bin_sqr) { */ /* bin=(int)floor((0.5*log10(rp_sqr*inv_start_bin_sqr))*inv_log_bin_size); */ /* DD[bin][0]+=Weight_s[ispectro]; //Put the Count in the Keeping Track Bin// */ /* DD[bin][Jackknife_s[ispectro]+1]+=Weight_s[ispectro]; */ /* if(Jackknife_i[imaging[p+j]]!=Jackknife_s[ispectro]){ */ /* DD[bin][Jackknife_i[imaging[p+j]]+1]+=Weight_s[ispectro]; */ /* } */ /* } */ /* } */ /* } */ /* } */ /* //Now serially process the rest */ /* p = p > cellstruct->nelements ? 
p-NVECD:p; */ /* for(;p<cellstruct->nelements;p++){ /* if(fabs(Dec_s[ispectro]-dec[p]) <= Maximum_Dec_Separation) { */ /* cos_Theta=X_s[ispectro] * x1[p] + Y_s[ispectro] * y1[p] + Z_s[ispectro] * z1[p]; */ /* rp_sqr=4.0*Distance_s[ispectro]*Distance_s[ispectro]*(1.0 - cos_Theta)*0.5; /\* sin(arccos x) = sqrt(1-x^2) *\/ */ /* if(rp_sqr < max_sep_sqr && rp_sqr >= start_bin_sqr) { */ /* bin=(int)floor((0.5*log10(rp_sqr*inv_start_bin_sqr))*inv_log_bin_size); */ /* DD[bin][0]+=Weight_s[ispectro]; //Put the Count in the Keeping Track Bin// */ /* DD[bin][Jackknife_s[ispectro]+1]+=Weight_s[ispectro]; */ /* if(Jackknife_i[imaging[p]]!=Jackknife_s[ispectro]){ */ /* DD[bin][Jackknife_i[imaging[p]]+1]+=Weight_s[ispectro]; */ /* } */ /* } */ /* } */ /* } */ /* #endif */ /* for(int ispectro=0;ispectro<Ngal_s;ispectro++){ */ /* for(int imaging=0;imaging<Ngal_i;imaging++){ */ /* if(fabs(Dec_s[ispectro]-Dec_i[imaging]) <= Maximum_Dec_Separation){ */ /* cos_Theta=X_s[ispectro] * X_i[imaging] + Y_s[ispectro] * Y_i[imaging] + Z_s[ispectro] * Z_i[imaging]; */ /* //rp=2.0*Distance_s[ispectro]*SQRT((1.0 - cos_Theta)/2.); /\* sin(arccos x) = sqrt(1-x^2) *\/ */ /* rp_sqr=4.0*Distance_s[ispectro]*Distance_s[ispectro]*(1.0 - cos_Theta)*0.5; /\* sin(arccos x) = sqrt(1-x^2) *\/ */ /* //fprintf(stderr,"distance = %lf,cos_Theta=%lf,rp = %lf\n",Distance_s[ispectro],cos_Theta,rp); */ /* /\* if(rp < Max_Separation && rp>=Start_Bin){ *\/ */ /* if(rp_sqr < max_sep_sqr && rp_sqr >= start_bin_sqr) { */ /* /\* bin=(int)floor((log10(rp/Start_Bin))/log_Bin_Size); *\/ */ /* bin=(int)floor((0.5*log10(rp_sqr*inv_start_bin_sqr))*inv_log_bin_size); */ /* DD[bin][0]+=Weight_s[ispectro]; //Put the Count in the Keeping Track Bin// */ /* DD[bin][Jackknife_s[ispectro]+1]+=Weight_s[ispectro]; */ /* if(Jackknife_i[imaging]!=Jackknife_s[ispectro]){ */ /* // fprintf(fp3,"%d %lf %d %d %d %d \n",bin, rp,Jackknife_s[ispectro],Sector_s[ispectro],Jackknife_i[imaging],Sector_i[imaging]); */ /* DD[bin][Jackknife_i[imaging]+1]+=Weight_s[ispectro]; */ /* } */ /* } */ /* } */ /* } */ /* } */ for(i=0;i<N_Bins;i++) { // fprintf(stderr,"%lf %e %e %e ",pow(10,(log_Bin_Size*(i)+log10(Start_Bin))),DD[i][0]/(Normalization),Mean[i],Error[i]); fprintf(stdout,"%lf %e %lf\n",pow(10,(log_Bin_Size*(i)+log10(Start_Bin))),DD[i]/(Normalization),DD[i]); } /* Free ALL the arrays */ free(RA_i); free(Dec_i); free(X_s); free(Y_s); free(Z_s); free(X_i); free(Y_i); free(Z_i); free(RA_s); free(Dec_s); free(Redshift_s); free(Distance_s); free(Weight_s); free(DD); for(i=0;i<ngrid;i++) { free(lattice[i].x); free(lattice[i].y); free(lattice[i].z); free(lattice[i].dec); free(lattice[i].index); } free(lattice); return 0; }
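// A minimal sketch of the logarithmic binning used in the pair-counting loop above:
// with rp^2 already in hand, the bin index is floor(log10(rp/r_min)/dlog), computed
// without a sqrt as 0.5*log10(rp^2/r_min^2). Out-of-range separations return -1.
// Illustrative helper, not from the source.
#include <cmath>

int log_bin_index(double rp_sqr, double rmin, double rmax, int nbins) {
  double dlog = (std::log10(rmax) - std::log10(rmin)) / nbins;
  if (rp_sqr < rmin * rmin || rp_sqr >= rmax * rmax) return -1;
  return (int)std::floor(0.5 * std::log10(rp_sqr / (rmin * rmin)) / dlog);
}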
int _start(int argc, char *argv[], boost::shared_ptr<Logger> qLogger, const std::string& processpath) { bool bError = false; po::options_description desc("Program-Options"); desc.add_options() ("name", po::value<std::string>(), "layer name (string)") ("lod", po::value<int>(), "desired level of detail (integer)") ("extent", po::value< std::vector<int64> >()->multitoken(), "tile boundary (tx0 ty0 tx1 ty1) for elevation/image data") ("boundary", po::value<std::vector<double> >()->multitoken(), "WGS84 boundary for point data or mapnik rendering") ("force", "[optional] force creation. (Warning: if this layer already exists it will be deleted)") ("numthreads", po::value<int>(), "[optional] force number of threads") ("type", po::value<std::string>(), "[optional] layer type. This can be image, elevation, poi, point, geometry. image is default value.") ; po::variables_map vm; try { po::store(po::parse_command_line(argc, argv, desc), vm); po::notify(vm); } catch (const std::exception &ex) { bError = true; std::cout << "Error when parsing command line options:\n" << ex.what() << "\n\n"; std::cout << desc << "\n"; return 4; } std::string sLayerName; int nLod = 0; std::vector<int64> vecExtent; std::vector<double> vecBoundary; bool bForce = false; ELayerType eLayer = IMAGE_LAYER; if (!vm.count("name")) { qLogger->Error("layer name is not specified!"); bError = true; } else { sLayerName = vm["name"].as<std::string>(); if (sLayerName.length() == 0) { qLogger->Error("layer name is empty!"); bError = true; } } if (!vm.count("lod")) { if(vm["type"].as< std::string >() != "mapnik") { qLogger->Error("lod not specified!"); bError = true; } } else { nLod = vm["lod"].as<int>(); } if (vm.count("force")) { bForce = true; } if (vm.count("extent")) { vecExtent = vm["extent"].as< std::vector<int64> >(); } if (vm.count("boundary")) { vecBoundary = vm["boundary"].as< std::vector<double> >(); } if (vm.count("numthreads")) { int n = vm["numthreads"].as<int>(); if (n>0 && n<65) { std::ostringstream oss; oss << "Forcing number of threads to " << n; qLogger->Info(oss.str()); omp_set_num_threads(n); } } if (vm.count("type")) { std::string sLayerType = vm["type"].as< std::string >(); if (sLayerType == "image") { eLayer = IMAGE_LAYER; } else if (sLayerType == "imagepostprocessing") { eLayer = IMAGE_POSTPROCESSING_LAYER; } else if (sLayerType == "mapnik") { eLayer = MAPNIK_LAYER; } else if (sLayerType == "elevation") { eLayer = ELEVATION_LAYER; } else if (sLayerType == "poi") { eLayer = POI_LAYER; } else if (sLayerType == "point") { eLayer = POINT_LAYER; } else if (sLayerType == "geometry") { eLayer = GEOMETRY_LAYER; } else { bError = true; } } else { qLogger->Warn("It is highly recommended to use --type! Using default --type image"); } if (eLayer == POINT_LAYER) { if (vecBoundary.size() != 6 ) { qLogger->Error("boundary must be specified with 6 values (WGS84): lng0 lat0 elv0 lng1 lat1 elv1"); bError = true; } } else { if (vecExtent.size() != 4 ) { qLogger->Error("extent must be defined with 4 values (Tile Coords): x0 y0 x1 y1"); bError = true; } } if (bError) { qLogger->Error("Wrong parameters!"); std::ostringstream sstr; sstr << desc; qLogger->Info("\n" + sstr.str()); return ERROR_PARAMS; } std::string sLayerPath = FilenameUtils::DelimitPath(processpath) + sLayerName; qLogger->Info("Target directory: " + sLayerPath); if (FileSystem::DirExists(sLayerPath)) { if (!bForce) { qLogger->Error("Layer already exists!!"); qLogger->Error("the directory " + sLayerPath + " already exists. 
Please delete manually or choose another layer name or use the --force option"); return ERROR_LAYEREXISTS; } else { qLogger->Info("Force option detected. Deleting already existing layer... this may take a while"); if (!FileSystem::rm_all(sLayerPath)) { qLogger->Error("Can't delete old layer (file permission)."); return ERROR_DELETE_PERMISSION; } else { qLogger->Info("ok.. layer deleted."); } } } if (eLayer == IMAGE_LAYER) { return _createimagelayer(sLayerName, sLayerPath, nLod, vecExtent, qLogger, false); } if (eLayer == IMAGE_POSTPROCESSING_LAYER) { return _createimagelayer(sLayerName, sLayerPath, nLod, vecExtent, qLogger, true); } if (eLayer == MAPNIK_LAYER) { return _createmapniklayer(sLayerName, sLayerPath, vecBoundary, qLogger); } else if (eLayer == ELEVATION_LAYER) { return _createelevationlayer(sLayerName, sLayerPath, nLod, vecExtent, qLogger); } else if (eLayer == POINT_LAYER) { return _createpointlayer(sLayerName, sLayerPath, nLod, vecBoundary, qLogger); } else { return ERROR_UNSUPPORTED; } }
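// Sketch of the option-access pattern the parser above depends on: with
// boost::program_options, vm["name"].as<T>() is only safe once vm.count("name")
// confirms the option was supplied (the lod check above reads vm["type"] before the
// --type branch has been validated, which can throw if --type was omitted).
// Illustrative fragment assuming the same variables_map setup as above.
#include <boost/program_options.hpp>
#include <string>

namespace po = boost::program_options;

std::string layer_type_or_default(const po::variables_map& vm) {
  // Default to "image", matching the documented default above.
  return vm.count("type") ? vm["type"].as<std::string>() : std::string("image");
}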
int main(int argc, char* argv[]) { int threads = 8; if (argc > 1) { threads = (atoi(argv[1])); } omp_set_num_threads(threads); //========================================================================================================= ChSystemParallelDVI * system_gpu = new ChSystemParallelDVI; ChCollisionSystemParallel *mcollisionengine = new ChCollisionSystemParallel(); system_gpu->SetIntegrationType(ChSystem::INT_ANITESCU); //========================================================================================================= system_gpu->SetParallelThreadNumber(threads); system_gpu->SetMaxiter(max_iter); system_gpu->SetIterLCPmaxItersSpeed(max_iter); ((ChLcpSolverParallelDVI *) (system_gpu->GetLcpSolverSpeed()))->SetMaxIteration(max_iter); system_gpu->SetTol(.1); system_gpu->SetTolSpeeds(.1); ((ChLcpSolverParallelDVI *) (system_gpu->GetLcpSolverSpeed()))->SetTolerance(.1); ((ChLcpSolverParallelDVI *) (system_gpu->GetLcpSolverSpeed()))->SetCompliance(0); ((ChLcpSolverParallelDVI *) (system_gpu->GetLcpSolverSpeed()))->SetContactRecoverySpeed(10); ((ChLcpSolverParallelDVI *) (system_gpu->GetLcpSolverSpeed()))->SetSolverType(ACCELERATED_PROJECTED_GRADIENT_DESCENT); ((ChCollisionSystemParallel *) (system_gpu->GetCollisionSystem()))->SetCollisionEnvelope(particle_radius * .01); mcollisionengine->setBinsPerAxis(I3(50, 50, 50)); mcollisionengine->setBodyPerBin(100, 50); system_gpu->Set_G_acc(ChVector<>(0, gravity, 0)); system_gpu->SetStep(timestep); ((ChSystemParallel*) system_gpu)->SetAABB(R3(-6, -3, -12), R3(6, 6, 12)); //========================================================================================================= //cout << num_per_dir.x << " " << num_per_dir.y << " " << num_per_dir.z << " " << num_per_dir.x * num_per_dir.y * num_per_dir.z << endl; //addPerturbedLayer(R3(0, -5 +container_thickness-particle_radius.y, 0), ELLIPSOID, particle_radius, num_per_dir, R3(.01, .01, .01), 10, 1, system_gpu); //addHCPCube(num_per_dir.x, num_per_dir.y, num_per_dir.z, 1, particle_radius.x, 1, true, 0, -6 +container_thickness+particle_radius.y, 0, 0, system_gpu); //========================================================================================================= ChSharedBodyPtr L = ChSharedBodyPtr(new ChBody(new ChCollisionModelParallel)); ChSharedBodyPtr R = ChSharedBodyPtr(new ChBody(new ChCollisionModelParallel)); ChSharedBodyPtr F = ChSharedBodyPtr(new ChBody(new ChCollisionModelParallel)); ChSharedBodyPtr B = ChSharedBodyPtr(new ChBody(new ChCollisionModelParallel)); ChSharedBodyPtr Bottom = ChSharedBodyPtr(new ChBody(new ChCollisionModelParallel)); ChSharedBodyPtr Top = ChSharedBodyPtr(new ChBody(new ChCollisionModelParallel)); ChSharedBodyPtr Tube = ChSharedBodyPtr(new ChBody(new ChCollisionModelParallel)); ChSharedPtr<ChMaterialSurface> material; material = ChSharedPtr<ChMaterialSurface>(new ChMaterialSurface); material->SetFriction(.1); material->SetRollingFriction(0); material->SetSpinningFriction(0); material->SetCompliance(0); material->SetCohesion(-100); Quaternion q; q.Q_from_AngX(-.1); InitObject(L, 100000, Vector(-container_size.x + container_thickness, container_height - container_thickness, 0), Quaternion(1, 0, 0, 0), material, true, true, -20, -20); InitObject(R, 100000, Vector(container_size.x - container_thickness, container_height - container_thickness, 0), Quaternion(1, 0, 0, 0), material, true, true, -20, -20); InitObject(F, 100000, Vector(0, container_height - container_thickness, -container_size.z + container_thickness), Quaternion(1, 0, 0, 0), material, 
true, true, -20, -20); InitObject(B, 100000, Vector(0, container_height - container_thickness, container_size.z - container_thickness), Quaternion(1, 0, 0, 0), material, true, true, -20, -20); InitObject(Bottom, 100000, Vector(0, container_height - container_size.y / 1.5, 0), q, material, true, true, -20, -20); InitObject(Top, 100000, Vector(0, container_height + container_size.y, 0), Quaternion(1, 0, 0, 0), material, true, true, -20, -20); InitObject(Tube, 100000, Vector(container_size.x - container_thickness, container_height - container_thickness, 0), Quaternion(1, 0, 0, 0), material, true, true, -20, -20); AddCollisionGeometry(L, BOX, Vector(container_thickness, container_size.y, container_size.z), Vector(0, 0, 0), Quaternion(1, 0, 0, 0)); AddCollisionGeometry(R, BOX, Vector(container_thickness, container_size.y, container_size.z), Vector(0, 0, 0), Quaternion(1, 0, 0, 0)); AddCollisionGeometry(F, BOX, Vector(container_size.x, container_size.y, container_thickness), Vector(0, 0, 0), Quaternion(1, 0, 0, 0)); AddCollisionGeometry(B, BOX, Vector(container_size.x, container_size.y, container_thickness), Vector(0, 0, 0), Quaternion(1, 0, 0, 0)); AddCollisionGeometry(Bottom, BOX, Vector(container_size.x, container_thickness, container_size.z), Vector(0, 0, 0), Quaternion(1, 0, 0, 0)); AddCollisionGeometry(Top, BOX, Vector(container_size.x, container_thickness, container_size.z), Vector(0, 0, 0), Quaternion(1, 0, 0, 0)); AddCollisionGeometry(Tube, BOX, Vector(2, container_thickness / 6.0, 1), Vector(0, container_size.y / 2.0 + .6 + .4, 0), Quaternion(1, 0, 0, 0)); AddCollisionGeometry(Tube, BOX, Vector(2, container_thickness / 6.0, 1), Vector(0, container_size.y / 2.0 - .6 + .4, 0), Quaternion(1, 0, 0, 0)); AddCollisionGeometry(Tube, BOX, Vector(2, .6, container_thickness / 6.0), Vector(0, container_size.y / 2.0 + .4, -1), Quaternion(1, 0, 0, 0)); AddCollisionGeometry(Tube, BOX, Vector(2, .6, container_thickness / 6.0), Vector(0, container_size.y / 2.0 + .4, 1), Quaternion(1, 0, 0, 0)); FinalizeObject(L, (ChSystemParallel *) system_gpu); FinalizeObject(R, (ChSystemParallel *) system_gpu); FinalizeObject(F, (ChSystemParallel *) system_gpu); FinalizeObject(B, (ChSystemParallel *) system_gpu); FinalizeObject(Bottom, (ChSystemParallel *) system_gpu); //FinalizeObject(Top, (ChSystemParallel *) system_gpu); //FinalizeObject(Tube, (ChSystemParallel *) system_gpu); material_fiber = ChSharedPtr<ChMaterialSurface>(new ChMaterialSurface); material_fiber->SetFriction(.4); material_fiber->SetRollingFriction(1); material_fiber->SetSpinningFriction(1); material_fiber->SetCompliance(0); material_fiber->SetCohesion(0); //========================================================================================================= //Rendering specific stuff: ChOpenGLManager * window_manager = new ChOpenGLManager(); ChOpenGL openGLView(window_manager, system_gpu, 800, 600, 0, 0, "Test_Solvers"); //openGLView.render_camera->camera_position = glm::vec3(0, -5, -10); //openGLView.render_camera->camera_look_at = glm::vec3(0, -5, 0); //openGLView.render_camera->camera_scale = .1; openGLView.SetCustomCallback(RunTimeStep); openGLView.StartSpinning(window_manager); window_manager->CallGlutMainLoop(); //========================================================================================================= int file = 0; for (int i = 0; i < num_steps; i++) { system_gpu->DoStepDynamics(timestep); double TIME = system_gpu->GetChTime(); double STEP = system_gpu->GetTimerStep(); double BROD = system_gpu->GetTimerCollisionBroad(); 
double NARR = system_gpu->GetTimerCollisionNarrow(); double LCP = system_gpu->GetTimerLcp(); double UPDT = system_gpu->GetTimerUpdate(); double RESID = ((ChLcpSolverParallelDVI *) (system_gpu->GetLcpSolverSpeed()))->GetResidual(); int BODS = system_gpu->GetNbodies(); int CNTC = system_gpu->GetNcontacts(); int REQ_ITS = ((ChLcpSolverParallelDVI*) (system_gpu->GetLcpSolverSpeed()))->GetTotalIterations(); printf("%7.4f|%7.4f|%7.4f|%7.4f|%7.4f|%7.4f|%7d|%7d|%7d|%7.4f\n", TIME, STEP, BROD, NARR, LCP, UPDT, BODS, CNTC, REQ_ITS, RESID); int save_every = 1.0 / timestep / 60.0; //save data every n steps if (i % save_every == 0) { stringstream ss; cout << "Frame: " << file << endl; ss << "data/fiber/" << "/" << file << ".txt"; //DumpAllObjects(system_gpu, ss.str(), ",", true); DumpAllObjectsWithGeometryPovray(system_gpu, ss.str()); //output.ExportData(ss.str()); file++; } RunTimeStep(system_gpu, i); } //DumpObjects(system_gpu, "diagonal_impact_settled.txt", "\t"); }
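// Illustrative only: the "save data every n steps" cadence above is plain
// frames-per-second bookkeeping -- with a fixed physics timestep dt and a target of
// 60 saved frames per simulated second, a frame is written every (1/dt)/60 steps.
static int steps_per_frame(double dt, double fps) {
  return (int)(1.0 / dt / fps);   // e.g. dt = 1e-3, fps = 60 -> save every 16 steps
}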
void generate_errors_per_base(JSONWriter* pWriter, const BWTIndexSet& index_set) { int n_samples = 100000; size_t k = 25; double max_error_rate = 0.95; size_t min_overlap = 50; std::vector<size_t> position_count; std::vector<size_t> error_count; Timer timer("test", true); #if HAVE_OPENMP omp_set_num_threads(opt::numThreads); #pragma omp parallel for #endif for(int i = 0; i < n_samples; ++i) { std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT); KmerOverlaps::retrieveMatches(s, k, min_overlap, max_error_rate, 2, index_set); //KmerOverlaps::approximateMatch(s, min_overlap, max_error_rate, 2, 200, index_set); MultipleAlignment ma = KmerOverlaps::buildMultipleAlignment(s, k, min_overlap, max_error_rate, 2, index_set); // Skip when there is insufficient depth to classify errors size_t ma_rows = ma.getNumRows(); if(ma_rows <= 1) continue; size_t ma_cols = ma.getNumColumns(); size_t position = 0; for(size_t j = 0; j < ma_cols; ++j) { char s_symbol = ma.getSymbol(0, j); // Skip gaps if(s_symbol == '-' || s_symbol == '\0') continue; SymbolCountVector scv = ma.getSymbolCountVector(j); int s_symbol_count = 0; char max_symbol = 0; int max_count = 0; for(size_t k = 0; k < scv.size(); ++k) { if(scv[k].symbol == s_symbol) s_symbol_count = scv[k].count; if(scv[k].count > max_count) { max_count = scv[k].count; max_symbol = scv[k].symbol; } } //printf("P: %zu S: %c M: %c MC: %d\n", position, s_symbol, max_symbol, max_count); // Call an error at this position if the consensus symbol differs from the read // and the support for the read symbol is less than 4 and the consensus symbol // is strongly supported. bool is_error = s_symbol != max_symbol && s_symbol_count < 4 && max_count >= 3; #if HAVE_OPENMP #pragma omp critical #endif { if(position >= position_count.size()) { position_count.resize(position+1); error_count.resize(position+1); } position_count[position]++; error_count[position] += is_error; } position += 1; } } pWriter->String("ErrorsPerBase"); pWriter->StartObject(); pWriter->String("base_count"); pWriter->StartArray(); for(size_t i = 0; i < position_count.size(); ++i) pWriter->Int(position_count[i]); pWriter->EndArray(); pWriter->String("error_count"); pWriter->StartArray(); for(size_t i = 0; i < position_count.size(); ++i) pWriter->Int(error_count[i]); pWriter->EndArray(); pWriter->EndObject(); }
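// Minimal sketch of the accumulation pattern above: per-position counters that are
// grown and updated under one critical section, because different threads may report
// different (and growing) positions. count_position is an illustrative stand-in for
// the critical block in the loop above.
#include <vector>
#include <cstddef>

void count_position(std::vector<size_t>& totals, std::vector<size_t>& errors,
                    size_t position, bool is_error) {
  #pragma omp critical
  {
    if (position >= totals.size()) {
      totals.resize(position + 1);
      errors.resize(position + 1);
    }
    totals[position] += 1;
    errors[position] += is_error ? 1 : 0;
  }
}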
double integrateVegas(double * limits , int threads, double * params){ //Setting the number of threads omp_set_num_threads(threads); //How many iterations to perform int iterations =15; //Which iteration to start sampling more int switchIteration = 7; //How many points to sample in total int samples = 100000; //How many points to sample after grid set up int samplesAfter = 5000000; //How many intervals for each dimension int intervals = 10; //How many subIntervals int subIntervals = 1000; //Parameter alpha controls convergence rate double alpha = 0.5; int seed = 40847516; //double to store volume integrated over double volume = 1.0; for(int i=0; i<dimensions; i++){ volume*= (limits[(2*i)+1]-limits[2*i]); }; //Number of boxes int numBoxes = intervals; for(int i=1; i<dimensions; i++){ numBoxes *= intervals; } //CHANGE SEED WHEN YOU KNOW IT WORKS //Setting up one random number stream for each thread VSLStreamStatePtr * streams; streams = ( VSLStreamStatePtr * )_mm_malloc(sizeof(VSLStreamStatePtr)*threads,64); for(int i=0; i<threads; i++){ vslNewStream(&streams[i], VSL_BRNG_MT2203+i,seed); } //Arrays to store integral and uncertainty for each iteration double * integral = (double *)_mm_malloc(sizeof(double)*iterations,64); double * sigmas = (double *)_mm_malloc(sizeof(double)*iterations,64); for(int i=0; i<iterations; i++){ integral[i] = 0; sigmas[i] = 0; } //Points per each box int pointsPerBox = samples/numBoxes; //Array storing the box limits (stores x limits then y limits and so on) intervals+1 to store all limits double * boxLimits = (double *)_mm_malloc(sizeof(double)*(intervals+1)*dimensions,64); //Array to store average function values for each box double * heights = (double *)_mm_malloc(sizeof(double)*dimensions*intervals,64); //Array storing values of m double * mValues = (double *)_mm_malloc(sizeof(double)*intervals,64); //Array storing widths of sub boxes double * subWidths = (double *) _mm_malloc(sizeof(double)*intervals,64); //Getting initial limits for the boxes for(int i=0; i<dimensions; i++){ double boxWidth = (limits[(2*i)+1]-limits[2*i])/intervals; //0th iteration boxLimits[i*(intervals+1)] = limits[2*i]; for(int j=1; j<=intervals; j++){ int x = (i*(intervals+1))+j; boxLimits[x] = boxLimits[x-1]+boxWidth; } }; //Pointer to store random generated numbers double randomNums[dimensions]__attribute__((aligned(64))); int binNums[dimensions]__attribute__((aligned(64))); //Double to store p(x) denominator for monte carlo double prob; //Values to store integral and sigma for each thread so they can be reduced in OpenMp double integralTemp; double sigmaTemp; double heightsTemp[dimensions*intervals]__attribute__((aligned(64))); int threadNum; #pragma omp parallel default(none) private(sigmaTemp,integralTemp,binNums,randomNums,prob,threadNum,heightsTemp) shared(iterations,subIntervals,alpha,mValues,subWidths,streams,samples,boxLimits,intervals, integral, sigmas, heights, threads, volume, samplesAfter, switchIteration, params) { for(int iter=0; iter<iterations; iter++){ //Stepping up to more samples when grid calibrated if(iter==switchIteration){ samples = samplesAfter; } //Performing iterations for(int i=0; i<dimensions*intervals; i++){ heightsTemp[i] = 0; } integralTemp = 0; sigmaTemp = 0; //Getting chunk sizes for each thread threadNum = omp_get_thread_num(); int seg = ceil((double)samples/threads); int lower = seg*threadNum; int upper = seg*(threadNum+1); if(upper > samples){ upper = samples; }; //Spliting monte carlo up for(int i=0; i<seg; i++){ prob = 1; //Randomly choosing bins to 
sample from viRngUniform(VSL_RNG_METHOD_UNIFORM_STD,streams[threadNum],dimensions,binNums,0,intervals); vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD,streams[threadNum],dimensions,randomNums,0,1); //Getting samples from bins for(int j=0; j<dimensions; j++){ int x = ((intervals+1)*j)+binNums[j]; randomNums[j] *= (boxLimits[x+1]-boxLimits[x]); randomNums[j] += boxLimits[x]; prob *= 1.0/(intervals*(boxLimits[x+1]-boxLimits[x])); } //Performing evaluation of function and adding it to the total integral double eval = evaluate(randomNums,params); integralTemp += eval/prob; sigmaTemp += (eval*eval)/(prob*prob); //Calculating the values of f for bin resising for(int j=0; j<dimensions; j++){ int x = binNums[j]+(j*intervals); //May need to initialize heights // #pragma omp atomic // printf("heightsTemp before=%f\n",heightsTemp[x]); heightsTemp[x] += eval; // printf("heightsTemp=%f x=%d eval=%f thread=%d\n",heightsTemp[x],x,eval,omp_get_thread_num()); } } #pragma omp critical { integral[iter] += integralTemp; sigmas[iter] += sigmaTemp; for(int k=0; k<dimensions*intervals; k++){ // printf("heightTemp[k]=%f k=%d\n",heightsTemp[k],k); heights[k] += heightsTemp[k]; } } #pragma omp barrier #pragma omp single { //Calculating the values of sigma and the integral integral[iter] /= samples; sigmas[iter] /= samples; sigmas[iter] -= (integral[iter]*integral[iter]); sigmas[iter] /= (samples-1); // printf("integral=%f\n",integral[iter]); //Readjusting the box widths based on the heights //Creating array to store values of m and their sum int totalM=0; //Doing for each dimension seperately for(int i=0; i<dimensions; i++){ double sum = 0; //Getting the sum of f*delta x for(int j=0; j<intervals; j++){ int x = (i*(intervals))+j ; //May be bug with these indicies sum += heights[x]*(boxLimits[x+1+i]-boxLimits[x+i]); } //Performing the rescaling for(int j=0; j<intervals; j++){ int x = (i*(intervals))+j; double value = heights[x]*(boxLimits[x+1+i]-boxLimits[x+i]); mValues[j] = ceil(subIntervals*pow((value-1)*(1.0/log(value)),alpha)); subWidths[j] = (boxLimits[x+1+i]-boxLimits[x+i])/mValues[j]; totalM += mValues[j]; } int mPerInterval = totalM/intervals; int mValueIterator = 0; //Adjusting the intervals going from 1 to less than intervals to keep the edges at the limits for(int j=1; j<intervals; j++){ double width = 0; for(int y=0; y<mPerInterval; y++){ width += subWidths[mValueIterator]; mValues[mValueIterator]--; if(mValues[mValueIterator]==0){ mValueIterator++; } } //NEED TO SET BOX LIMITS NOW int x = j+(i*(intervals+1)); boxLimits[x] = boxLimits[x-1]+width; } //Setting mvalues etc. 
(reseting memory allocated before the dimensions loop to 0) totalM = 0; for(int k=0; k<intervals; k++){ subWidths[k] = 0; mValues[k] = 0; } } //Setting heights to zero for next iteration for(int i=0; i<intervals*dimensions; i++ ){ heights[i] = 0; } } } } //All iterations done //Free stuff _mm_free(subWidths); _mm_free(mValues); _mm_free(boxLimits); _mm_free(streams); _mm_free(heights); //Calculating the final value of the integral double denom = 0; double numerator =0; for(int i=7; i<iterations; i++){ numerator += integral[i]*((integral[i]*integral[i])/(sigmas[i]*sigmas[i])); denom += ((integral[i]*integral[i])/(sigmas[i]*sigmas[i])); // printf("integral=%f sigma=%f\n",integral[i],sigmas[i]); } double output = numerator/denom; //Calculating value of x^2 to check if result can be trusted double chisq = 0; for(int i=0; i<iterations; i++){ chisq += (((integral[i]-output)*(integral[i]-output))/(sigmas[i]*sigmas[i])); } if(chisq>iterations){ printf("Chisq value is %f, it should be not much greater than %d (iterations-1) Integral:%f Analytical Value=%f\n",chisq,iterations-1,output,normValue(params)); } _mm_free(integral); _mm_free(sigmas); return output; }
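// Sketch of the combination step at the end of integrateVegas above: per-iteration
// estimates I_i are combined with weights I_i^2/sigma_i^2 (the usual VEGAS weighting),
// and a chi-square over iterations flags inconsistent runs. Illustrative helper that
// takes the variances sigma_i^2 directly.
#include <cstddef>

double combine_vegas(const double* I, const double* sigma_sq, std::size_t n,
                     double* chisq_out) {
  double num = 0.0, den = 0.0;
  for (std::size_t i = 0; i < n; ++i) {
    double w = (I[i] * I[i]) / sigma_sq[i];
    num += I[i] * w;
    den += w;
  }
  double best = num / den;
  double chisq = 0.0;
  for (std::size_t i = 0; i < n; ++i)
    chisq += (I[i] - best) * (I[i] - best) / sigma_sq[i];
  if (chisq_out) *chisq_out = chisq;   // should not be much larger than n-1
  return best;
}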
// Measure genome repetitiveness using the rate of k-mers // that branch on both ends void generate_double_branch(JSONWriter* pWriter, const BWTIndexSet& index_set) { int n_samples = 50000; size_t min_coverage_to_test = 5; size_t min_coverage_for_branch = 3; double min_coverage_ratio = 0.5f; pWriter->String("DoubleBranch"); pWriter->StartArray(); for(size_t k = 16; k < 86; k += 5) { size_t num_branches = 0; size_t num_kmers = 0; #if HAVE_OPENMP omp_set_num_threads(opt::numThreads); #pragma omp parallel for #endif for(int i = 0; i < n_samples; ++i) { std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT); if(s.size() < k) continue; std::string kmer = s.substr(0, k); size_t count = BWTAlgorithms::countSequenceOccurrences(kmer, index_set); if(count >= min_coverage_to_test) { std::string right_extensions = get_valid_dbg_neighbors_coverage_and_ratio(kmer, index_set, min_coverage_for_branch, min_coverage_ratio, ED_SENSE); std::string left_extensions = get_valid_dbg_neighbors_coverage_and_ratio(kmer, index_set, min_coverage_for_branch, min_coverage_ratio, ED_ANTISENSE); #if HAVE_OPENMP #pragma omp critical #endif { num_branches += (left_extensions.size() > 1 && right_extensions.size() > 1); num_kmers += 1; } } } pWriter->StartObject(); pWriter->String("k"); pWriter->Int(k); pWriter->String("num_kmers"); pWriter->Int(num_kmers); pWriter->String("num_branches"); pWriter->Int(num_branches); pWriter->EndObject(); } pWriter->EndArray(); }
// An old main(), including a serial bottleneck. I've left it here for // now for benchmarking purposes. int bottlenecked_main(int argc, char **argv) { int numthreads; if( find_option( argc, argv, "-h" ) >= 0 ) { printf( "Options:\n" ); printf( "-h to see this help\n" ); printf( "-n <int> to set number of particles\n" ); printf( "-o <filename> to specify the output file name\n" ); printf( "-s <filename> to specify a summary file name\n" ); printf( "-no turns off all correctness checks and particle output\n"); printf( "-p <int> to set the (maximum) number of threads used\n"); return 0; } const int n = read_int( argc, argv, "-n", 1000 ); const bool fast = (find_option( argc, argv, "-no" ) != -1); const char *savename = read_string( argc, argv, "-o", NULL ); const char *sumname = read_string( argc, argv, "-s", NULL ); const int num_threads_override = read_int( argc, argv, "-p", 0); FILE *fsave = savename ? fopen( savename, "w" ) : NULL; FILE *fsum = sumname ? fopen ( sumname, "a" ) : NULL; const double size = set_size( n ); // We need to set the size of a grid square so that the average number of // particles per grid square is constant. The simulation already ensures // that the average number of particles in an arbitrary region is constant // and proportional to the area. So this is just a constant. const double grid_square_size = sqrt(0.0005) + 0.000001; const int num_grid_squares_per_side = size / grid_square_size; printf("Using %d grid squares of side-length %f for %d particles.\n", num_grid_squares_per_side*num_grid_squares_per_side, grid_square_size, n); std::unique_ptr<std::vector<particle_t> > particles = init_particles(n); if (num_threads_override > 0) { omp_set_dynamic(0); omp_set_num_threads(num_threads_override); } // // simulate a number of time steps // double simulation_time = read_timer( ); int max_num_threads = omp_get_max_threads(); // User-defined reductions aren't available in the version of OMP we're // using. Instead, we accumulate per-thread stats in this global array // and reduce manually when we're done. Stats per_thread_stats[max_num_threads]; // Shared across threads. std::unique_ptr<Grid> g(new Grid(size, num_grid_squares_per_side)); #pragma omp parallel { numthreads = omp_get_num_threads(); for (int step = 0; step < 1000; step++) { //TODO: Does this need to be declared private? int thread_idx; #pragma omp single g.reset(new Grid(size, num_grid_squares_per_side, *particles)); //TODO: Could improve data locality by blocking according to the block // structure of the grid. That would require keeping track, dynamically, // of the locations of each particle. It would be interesting to test // whether manually allocating sub-blocks (as in the distributed memory // code) to threads improves things further. #pragma omp for for (int i = 0; i < n; i++) { thread_idx = omp_get_thread_num(); particle_t& p = (*particles)[i]; p.ax = p.ay = 0; std::unique_ptr<SimpleIterator<particle_t&> > neighbors = (*g).neighbor_iterator(p); while (neighbors->hasNext()) { particle_t& neighbor = neighbors->next(); apply_force(p, neighbor, per_thread_stats[thread_idx]); } } // There is an implicit barrier here, which is important for correctness. // (Technically, some asynchrony could be allowed: A thread's sub-block // can be moved once it receives force messages from its neighboring // sub-blocks.) 
// // move particles // #pragma omp for for (int i = 0; i < n; i++) { move((*particles)[i]); } if (!fast) { // // save if necessary // #pragma omp master if( fsave && (step%SAVEFREQ) == 0 ) { save( fsave, n, (*particles).data() ); } } } } simulation_time = read_timer( ) - simulation_time; // Could do a tree reduce here, but it seems unnecessary. Stats overall_stats; for (int thread_idx = 0; thread_idx < max_num_threads; thread_idx++) { overall_stats.aggregate_left(per_thread_stats[thread_idx]); } printf( "n = %d,threads = %d, simulation time = %g seconds", n,numthreads, simulation_time); if (!fast) { // // -the minimum distance absmin between 2 particles during the run of the simulation // -A Correct simulation will have particles stay at greater than 0.4 (of cutoff) with typical values between .7-.8 // -A simulation were particles don't interact correctly will be less than 0.4 (of cutoff) with typical values between .01-.05 // // -The average distance absavg is ~.95 when most particles are interacting correctly and ~.66 when no particles are interacting // printf( ", absmin = %lf, absavg = %lf", overall_stats.min, overall_stats.avg); if (overall_stats.min < 0.4) printf ("\nThe minimum distance is below 0.4 meaning that some particle is not interacting"); if (overall_stats.avg < 0.8) printf ("\nThe average distance is below 0.8 meaning that most particles are not interacting"); } printf("\n"); // // Printing summary data // if( fsum) fprintf(fsum,"%d %d %g\n",n,numthreads,simulation_time); // // Clearing space // if( fsum ) fclose( fsum ); if( fsave ) fclose( fsave ); return 0; }
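// Sketch of the manual reduction used above (and one answer to the TODO about
// thread_idx): each thread reads omp_get_thread_num() into its own local reference
// and accumulates into its own slot of a per-thread array, which is folded serially
// afterwards. MyStats is an illustrative stand-in for the Stats type above.
#include <vector>
#include <algorithm>
#include <omp.h>

struct MyStats { double min = 1e9; double sum = 0.0; long n = 0; };

MyStats reduce_stats(const std::vector<double>& distances) {
  std::vector<MyStats> per_thread(omp_get_max_threads());
  #pragma omp parallel for
  for (long i = 0; i < (long)distances.size(); ++i) {
    MyStats& s = per_thread[omp_get_thread_num()];   // this thread's private slot
    s.min = std::min(s.min, distances[i]);
    s.sum += distances[i];
    s.n += 1;
  }
  MyStats total;                                     // serial fold, as in the code above
  for (const MyStats& s : per_thread) {
    total.min = std::min(total.min, s.min);
    total.sum += s.sum;
    total.n += s.n;
  }
  return total;
}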
// Generate random walk length
void generate_random_walk_length(JSONWriter* pWriter, const BWTIndexSet& index_set)
{
    int n_samples = 1000;
    size_t min_coverage = 5;
    double coverage_cutoff = 0.75;
    size_t max_length = 30000;

    // Create a bloom filter to mark
    // visited kmers. We do not allow a new
    // walk to start at one of these kmers
    size_t bf_overcommit = 20;
    BloomFilter* bloom_filter = new BloomFilter;
    bloom_filter->initialize(n_samples * max_length * bf_overcommit, 3);

    pWriter->String("RandomWalkLength");
    pWriter->StartArray();

    for(size_t k = 16; k < 86; k += 5)
    {
        pWriter->StartObject();
        pWriter->String("k");
        pWriter->Int(k);
        pWriter->String("walk_lengths");
        pWriter->StartArray();

#if HAVE_OPENMP
        omp_set_num_threads(opt::numThreads);
        #pragma omp parallel for
#endif
        for(int i = 0; i < n_samples; ++i)
        {
            size_t walk_length = 0;
            std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT);
            if(s.size() < k)
                continue;

            std::string kmer = s.substr(0, k);
            if(bloom_filter->test(kmer.c_str(), k) ||
               BWTAlgorithms::countSequenceOccurrences(kmer, index_set) < min_coverage)
                continue;
            bloom_filter->add(kmer.c_str(), k);

            while(walk_length < max_length)
            {
                std::string extensions = get_valid_dbg_neighbors_ratio(kmer, index_set, coverage_cutoff);
                if(!extensions.empty())
                {
                    kmer.erase(0, 1);
                    kmer.append(1, extensions[rand() % extensions.size()]);
                    walk_length += 1;
                    bloom_filter->add(kmer.c_str(), k);
                }
                else
                {
                    break;
                }
            }

#if HAVE_OPENMP
            #pragma omp critical
#endif
            pWriter->Int(walk_length);
        }

        pWriter->EndArray();
        pWriter->EndObject();
    }

    pWriter->EndArray();
    delete bloom_filter;
}
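// A minimal Bloom-filter sketch illustrating the add/test interface relied on
// above (a fixed-size bit array plus several hash functions; false positives
// are possible, false negatives are not). This is a hypothetical stand-in, not
// the BloomFilter class from the original code base.
#include <functional>
#include <string>
#include <vector>

class TinyBloom {
    std::vector<bool> bits_;
    size_t num_hashes_;
    size_t hash(const std::string& key, size_t seed) const {
        // Mix a seed into std::hash to emulate a family of hash functions.
        return std::hash<std::string>()(key) ^ (seed * 0x9e3779b97f4a7c15ULL);
    }
public:
    TinyBloom(size_t num_bits, size_t num_hashes)
        : bits_(num_bits, false), num_hashes_(num_hashes) {}
    void add(const std::string& key) {
        for (size_t s = 0; s < num_hashes_; ++s)
            bits_[hash(key, s) % bits_.size()] = true;
    }
    bool test(const std::string& key) const {
        for (size_t s = 0; s < num_hashes_; ++s)
            if (!bits_[hash(key, s) % bits_.size()])
                return false;   // definitely not inserted
        return true;            // "probably present"
    }
};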
int main(int argc, char **argv) { int N; int nThreads; int nColumns; int i,j,k; double *A,*Bi,*C,*Ci; int BiRows, BiColumns; CompressedMatrix *cBi; CompressedMatrix *cCi; double elapsed; char printDebug; //************ Check Input **************/ if(argc < 3){ printf("Usage: %s MaxtrixSize NumberOfThreads\n" , argv[0] ); exit(EXIT_FAILURE); } N = atoi(argv[1]); if( N <= 1){ printf("MatrixSize must be bigger than 1!"); exit(EXIT_FAILURE); } nThreads = atoi(argv[2]); if( nThreads <= 1){ printf("NumberOfThreads must be bigger than 1!"); exit(EXIT_FAILURE); } omp_set_num_threads(nThreads); omp_set_schedule(omp_sched_dynamic, N/10); MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_id); MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); nColumns = N / mpi_size; //For the moment depend on N being a multiple the number of MPI nodes //************ Prepare Matrix **************/ A = (double *) malloc( N*N * sizeof(double) ); if((A == NULL) ){ printf("Running out of memory!\n"); exit(EXIT_FAILURE); } // if(mpi_id != 0){ // MPI_Finalize(); // exit(0); // } if(mpi_id == 0) { printDebug = 0; if(printDebug) printf("[%d] Generating A ...",mpi_id); //Fill matrixes. Generate Identity like matrix for A and B , So C should result in an matrix with a single major diagonal for(i=0; i < N; i++ ){ for(j=0; j < N; j++){ A[i+N*j] = (i==j)?i:0.0; // //Sparse Matrix with 10% population // A[i+N*j] = rand()%10; // if(A[i+N*j] == 0) // A[i+N*j] = rand()%10; // else // A[i+N*j] = 0; } } // printMatrix(A, N, nColumns); // cA = compressMatrix(A, N, nColumns); // printCompressedMatrix(cA); // uncompressMatrix(cA, &Bi, &i, &j); // printMatrix(Bi, i, j); // // MPI_Finalize(); // exit(0); tick(); if(printDebug) printf("[%d] Broadcasting A ...",mpi_id); MPI_Bcast( A, N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD); if(printDebug) printf("[%d] Generating B ...",mpi_id); double* B; CompressedMatrix* cB; B = (double *) malloc( N*N * sizeof(double) ); for(i=0; i < N; i++ ){ for(j=0; j < N; j++){ B[j+N*i] = (i==j)?1.0:0.0; } } if(printDebug) printf("[%d] Compressing and distributing Bi ...",mpi_id); cB = compressMatrix(B, N, N); for(i=1; i < mpi_size; i++){ mpiSendCompressedMatrix(cB, i*nColumns, (i+1)*nColumns, i); } //Fake shorten cB free(B); cB->columns = nColumns; uncompressMatrix(cB, &Bi, &BiRows, &BiColumns); Ci = MatrixMultiply(A, N, N, Bi, nColumns); if(printDebug) printf("[%d] Ci = A x Bi ...", mpi_id); if(printDebug) printMatrix(Ci, N, nColumns); cCi = compressMatrix(Ci, N, nColumns); if(printDebug) printf("cCi ...\n"); if(printDebug) printCompressedMatrix(cCi); MPI_Barrier(MPI_COMM_WORLD); if(printDebug) printf("[%d] Receiving Ci fragments ...\n", mpi_id); CompressedMatrix** Cii; Cii = (CompressedMatrix**) malloc(sizeof(CompressedMatrix*) * mpi_size); if(Cii == NULL){ perror("malloc"); exit(EXIT_FAILURE); } Cii[0] = cCi; for(i=1; i < mpi_size; i++){ Cii[i] = mpiRecvCompressedMatrix(N,nColumns, i); } if(printDebug) printf("[%d] Joining Cii ...\n", mpi_id); CompressedMatrix *cC; cC = joinCompressedMatrices(Cii, mpi_size); if(printDebug) printCompressedMatrix(cC); elapsed = tack(); printf("[%d] C ...\n", mpi_id); uncompressMatrix(cC, &C, &i,&j); if(i <= 20){ printMatrix(C, i,j); } else { if(i < 1000){ printf("C is too big, only printing first diagonal %d.\n[",j); for(k=0; (k < i) && (k < j); k++){ printf("%3.2f ",C[k + k*j]); } printf("]\n"); } else { printf("C is just too big!"); } } printf("Took [%f] seconds!\n",elapsed); } else { printDebug = 0; if(printDebug) printf("[%d] Waiting for A ...",mpi_id); MPI_Bcast( A, N*N, MPI_DOUBLE, 
                  0, MPI_COMM_WORLD);
        if(printDebug) printf("[%d] Received A ...\n", mpi_id);
        if(printDebug) printMatrix(A, N, N);

        if(printDebug) printf("[%d] Waiting for Bi ...",mpi_id);
        cBi = mpiRecvCompressedMatrix(N, nColumns, 0);
        uncompressMatrix(cBi, &Bi, &BiRows, &BiColumns);
        if(printDebug) printf("[%d] Received Bi ...",mpi_id);
        if(printDebug) printMatrix(Bi,BiRows, BiColumns);

        assert( (BiRows == N) && "Number of rows in Bi is not right!");
        assert( (BiColumns == nColumns) && "Number of columns in Bi is not right!");

        Ci = MatrixMultiply(A, N, N, Bi, BiColumns);
        if(printDebug) printf("[%d] Ci = A x Bi ...", mpi_id);
        if(printDebug) printMatrix(Ci, N, nColumns);

        cCi = compressMatrix(Ci, N, nColumns);
        if(printDebug) printCompressedMatrix(cCi);

        MPI_Barrier(MPI_COMM_WORLD);

        if(printDebug) printf("[%d] Returning Ci ...\n", mpi_id);
        mpiSendCompressedMatrix(cCi, 0, nColumns, 0);
    }

    MPI_Finalize();

    // NxM = NxN * NxM
    exit(EXIT_SUCCESS);
}
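// A compact sketch of the column-block work split used in the program above:
// A is assumed already replicated on every rank (broadcast beforehand), each
// rank multiplies A by its own block of columns of B, and the column blocks of
// C are gathered back on rank 0. It assumes N is a multiple of the number of
// ranks, column-major storage (as in the A[i+N*j] indexing above), and skips
// the compression step of the original; function and parameter names are
// hypothetical.
#include <mpi.h>
#include <vector>

void column_block_multiply(const double* A, double* B, double* C, int N)
{
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int nCols = N / size;                       // columns owned by each rank

    std::vector<double> Bi(N * nCols), Ci(N * nCols, 0.0);
    // Column blocks are contiguous in column-major layout, so a plain scatter works.
    MPI_Scatter(B, N * nCols, MPI_DOUBLE, Bi.data(), N * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    for (int j = 0; j < nCols; ++j)             // Ci(:,j) = A * Bi(:,j)
        for (int k = 0; k < N; ++k)
            for (int i = 0; i < N; ++i)
                Ci[i + N * j] += A[i + N * k] * Bi[k + N * j];

    // Gather the column blocks of C on rank 0 (C may be NULL on other ranks).
    MPI_Gather(Ci.data(), N * nCols, MPI_DOUBLE, C, N * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}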
void generate_duplication_rate(JSONWriter* pJSONWriter, const BWTIndexSet& index_set) { int n_samples = 10000; size_t k = 50; size_t total_pairs = index_set.pBWT->getNumStrings() / 2; size_t num_pairs_checked = 0; size_t num_duplicates = 0; #if HAVE_OPENMP omp_set_num_threads(opt::numThreads); #pragma omp parallel for #endif for(int i = 0; i < n_samples; ++i) { // Choose a read pair int64_t source_pair_idx = rand() % total_pairs; std::string r1 = BWTAlgorithms::extractString(index_set.pBWT, source_pair_idx * 2); std::string r2 = BWTAlgorithms::extractString(index_set.pBWT, source_pair_idx * 2 + 1); // Get the interval for $k1/$k2 which corresponds to the // lexicographic rank of reads starting with those kmers std::string k1 = "$" + r1.substr(0, k); std::string k2 = "$" + r2.substr(0, k); BWTInterval i1 = BWTAlgorithms::findInterval(index_set.pBWT, k1); BWTInterval i2 = BWTAlgorithms::findInterval(index_set.pBWT, k2); std::vector<int64_t> pair_ids; for(int64_t j = i1.lower; j <= i1.upper; ++j) { int64_t read_id = index_set.pSSA->lookupLexoRank(j); if(read_id % 2 == 1) continue; int64_t pair_id = read_id % 2 == 0 ? read_id / 2 : (read_id - 1) / 2; if(pair_id != source_pair_idx) pair_ids.push_back(pair_id); } for(int64_t j = i2.lower; j <= i2.upper; ++j) { int64_t read_id = index_set.pSSA->lookupLexoRank(j); if(read_id % 2 == 0) continue; int64_t pair_id = read_id % 2 == 0 ? read_id / 2 : (read_id - 1) / 2; if(pair_id != source_pair_idx) pair_ids.push_back(pair_id); } std::sort(pair_ids.begin(), pair_ids.end()); std::vector<int64_t>::iterator iter = std::adjacent_find(pair_ids.begin(), pair_ids.end()); bool has_duplicate = iter != pair_ids.end(); #if HAVE_OPENMP #pragma omp critical #endif { num_pairs_checked += 1; num_duplicates += has_duplicate; } } pJSONWriter->String("PCRDuplicates"); pJSONWriter->StartObject(); pJSONWriter->String("num_duplicates"); pJSONWriter->Int(num_duplicates); pJSONWriter->String("num_pairs"); pJSONWriter->Int(num_pairs_checked); pJSONWriter->EndObject(); }
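// A small sketch of the duplicate test used above: collect the candidate pair
// ids, sort them, and use std::adjacent_find to check whether any id occurs
// more than once. The helper name is hypothetical.
#include <algorithm>
#include <cstdint>
#include <vector>

bool has_duplicate_pair(std::vector<int64_t> pair_ids)
{
    std::sort(pair_ids.begin(), pair_ids.end());
    return std::adjacent_find(pair_ids.begin(), pair_ids.end()) != pair_ids.end();
}
// e.g. has_duplicate_pair({7, 3, 7}) == true, has_duplicate_pair({1, 2, 3}) == false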
int main(int argc, char* argv[]) { ArgProcessor args(argc, argv); if(args.isArgSet("--help") || (!(args.isArgSet("--reads") && args.isArgSet("--kmers")))) { cerr << usage(args) << endl << endl; exit(1); } string reads_fasta_file = args.getStringVal("--reads"); string kmers_fasta_file = args.getStringVal("--kmers"); bool is_DS = (! args.isArgSet("--SS")); if(args.isArgSet("--kmer_size")) { KMER_SIZE = args.getIntVal("--kmer_size"); if(KMER_SIZE < 20) { cerr << "Error, min kmer size is 20"; exit(2); } } if(args.isArgSet("--monitor")) { IRKE_COMMON::MONITOR = args.getIntVal("--monitor"); } if(omp_get_max_threads() > MAX_THREADS) { omp_set_num_threads(MAX_THREADS); } KmerCounter kcounter (KMER_SIZE, is_DS); populate_kmer_counter(kcounter, kmers_fasta_file); Fasta_reader fasta_reader(reads_fasta_file); ofstream* filewriter = NULL; ofstream* covwriter = NULL; bool write_coverage_info = args.isArgSet("--capture_coverage_info"); while (true) { Fasta_entry fe = fasta_reader.getNext(); string sequence = fe.get_sequence(); if(sequence == "") break; string header = fe.get_header(); vector<unsigned int> kmer_coverage = compute_kmer_coverage(sequence, kcounter); unsigned int median_cov = median_coverage(kmer_coverage); float mean_cov = mean(kmer_coverage); float stdev = stDev(kmer_coverage); float pct_stdev_of_avg = stdev/mean_cov*100; stringstream stats_text; stats_text << median_cov << "\t" << mean_cov << "\t" << stdev << "\t" << pct_stdev_of_avg << "\t" << fe.get_accession(); if(write_coverage_info) { // add the coverage info stats_text << "\t"; for (int i = 0; i < kmer_coverage.size(); i++) { stats_text<< kmer_coverage[i]; if(i != kmer_coverage.size() - 1) { stats_text<< ","; } } } stats_text << endl; cout << stats_text.str(); if (mean_cov < 0) { cerr << "ERROR, cannot have negative coverage!!" << endl; exit(1); } } return(0); }
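// A minimal sketch of the per-read coverage statistics reported above (median,
// mean, standard deviation over the per-kmer coverage vector). These are
// generic stand-ins, not the project's mean()/stDev()/median_coverage()
// implementations.
#include <algorithm>
#include <cmath>
#include <vector>

static float mean_of(const std::vector<unsigned int>& v)
{
    if (v.empty()) return 0.0f;
    double s = 0;
    for (unsigned int x : v) s += x;
    return (float)(s / v.size());
}

static float stdev_of(const std::vector<unsigned int>& v)
{
    if (v.size() < 2) return 0.0f;
    double m = mean_of(v), ss = 0;
    for (unsigned int x : v) ss += (x - m) * (x - m);
    return (float)std::sqrt(ss / (v.size() - 1));   // sample standard deviation
}

static unsigned int median_of(std::vector<unsigned int> v)
{
    if (v.empty()) return 0;
    // Middle element (upper median for even-sized vectors).
    std::nth_element(v.begin(), v.begin() + v.size() / 2, v.end());
    return v[v.size() / 2];
}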
int main()
{
    unsigned overlap = (fftlen - chunklen) / 2;

    // Create data
    float *input = (float *) malloc(nsamp * ndms * sizeof(float));

    // LOAD FILE: FOR TESTING ONLY
    // FILE *fp = fopen("/home/lessju/Code/MDSM/src/prototypes/TestingCCode.dat", "rb");
    // printf("Read: %ld\n", fread(input, sizeof(float), nsamp, fp));
    // fclose(fp);

    // Initialise templating
    unsigned numDownFacts;
    for(numDownFacts = 0; numDownFacts < 12; numDownFacts++)
        if (downfactors[numDownFacts] > maxDownfact)
            break;

    // Allocate kernels
    fftwf_complex **kernels = (fftwf_complex **) malloc(numDownFacts * sizeof(fftwf_complex *));
    for(unsigned i = 0; i < numDownFacts; i++)
        kernels[i] = (fftwf_complex *) fftwf_malloc(fftlen / 2 * sizeof(fftwf_complex));

    // Create kernels
    for(unsigned i = 0; i < numDownFacts; i++)
        createFFTKernel(kernels[i], downfactors[i], fftlen);

    // Start timing
    struct timeval start, end;
    long mtime, seconds, useconds;
    gettimeofday(&start, NULL);

    // Set number of OpenMP threads
    omp_set_num_threads(threads);

    // Create candidate container
    std::vector<Candidate> **candidates = (std::vector<Candidate> **) malloc(threads * sizeof(std::vector<Candidate> *));

    unsigned nchunks = nsamp / chunklen;

    #pragma omp parallel \
        shared(kernels, input, ndms, nsamp, fftlen, chunklen, numDownFacts, tsamp, \
               overlap, downfactors, threshold, nchunks, candidates)
    {
        // Get thread details
        unsigned numThreads = omp_get_num_threads();
        unsigned threadId = omp_get_thread_num();

        // Allocate memory to be used in processing
        candidates[threadId] = new std::vector<Candidate>();
        float *chunk = (float *) fftwf_malloc(fftlen * sizeof(float));                                      // Store input chunk
        fftwf_complex *fftChunk = (fftwf_complex *) fftwf_malloc(fftlen / 2 * sizeof(fftwf_complex));       // Store FFT'ed input chunk
        fftwf_complex *convolvedChunk = (fftwf_complex *) fftwf_malloc(fftlen / 2 * sizeof(fftwf_complex)); // Store FFT'ed, convolved input chunk
        InitialCandidate *initialCands = (InitialCandidate *) malloc(fftlen * sizeof(InitialCandidate));    // Store initial candidate list

        // Create FFTW plans (these calls are not thread safe, place in critical section)
        fftwf_plan chunkPlan, convPlan;
        #pragma omp critical
        {
            chunkPlan = fftwf_plan_dft_r2c_1d(fftlen, chunk, fftChunk, FFTW_ESTIMATE);
            convPlan = fftwf_plan_dft_c2r_1d(fftlen, convolvedChunk, chunk, FFTW_ESTIMATE);
        }

        // Process all DM buffers associated with this thread
        for(unsigned j = 0; j < ndms / numThreads; j++)
        {
            unsigned d = ndms / numThreads * threadId + j;
            std::vector<Candidate> dmCandidates;

            // Process all data chunks
            for (unsigned c = 0; c < nchunks; c++)
            {
                int beg = d * nsamp + c * chunklen - overlap;

                if (c == 0)                 // First chunk, we need to insert 0s at the beginning
                {
                    memset(chunk, 0, overlap * sizeof(float));
                    memcpy(chunk + overlap, input, (fftlen - overlap) * sizeof(float));
                }
                else if (c == nchunks - 1)  // Last chunk, insert 0s at the end
                {
                    memset(chunk + fftlen - overlap, 0, overlap * sizeof(float));
                    memcpy(chunk, input + beg, (fftlen - overlap) * sizeof(float));
                }
                else
                    memcpy(chunk, input + beg, fftlen * sizeof(float));

                // Search non-downsampled data first
                for(unsigned i = overlap; i < chunklen; i++)
                    if (chunk[i] >= threshold)
                    {
                        Candidate newCand = { d, chunk[i], 25, c*chunklen+i, 1 };
                        dmCandidates.push_back(newCand);
                    }

                // FFT current chunk
                fftwf_execute(chunkPlan);

                // Loop over all downfactor levels
                for(unsigned s = 0; s < numDownFacts; s++)
                {
                    // Reset initial candidate list
                    memset(initialCands, 0, fftlen * sizeof(InitialCandidate));

                    // Perform convolution
                    convolve(fftChunk, kernels[s], convolvedChunk, chunk,
                             fftlen, convPlan);

                    // Threshold results and build preliminary candidate list
                    unsigned numCands = 0;
                    for(unsigned i = overlap; i < chunklen; i++)
                    {
                        if (chunk[i] >= threshold)
                        {
                            // printf("We have something %d %d \n", c, s);
                            initialCands[numCands].bin = i;
                            initialCands[numCands].value = chunk[i];
                            numCands++;
                        }
                    }

                    if (numCands != 0)
                    {
                        // Prune candidate list
                        pruneRelated(initialCands, downfactors[s], numCands);

                        // Store candidate list
                        for(unsigned k = 0; k < numCands; k++)
                            if (initialCands[k].value != 0)
                            {
                                Candidate newCand = { d, initialCands[k].value, 5, c * chunklen + k, downfactors[s] };
                                dmCandidates.push_back(newCand);
                            }
                    }
                }
            }

            // Remove redundant candidates across downsampling levels
            if (dmCandidates.size() > 0)
            {
                char *mask = (char *) malloc(dmCandidates.size() * sizeof(char));
                pruneRelatedDownfactors(dmCandidates, mask, numDownFacts);

                // Append to final candidate list (use a fresh index; j is the DM-loop counter)
                for(unsigned m = 0; m < dmCandidates.size(); m++)
                    if (mask[m])
                        candidates[threadId] -> push_back(dmCandidates[m]);

                free(mask);
            }
        }

        free(initialCands);
        fftwf_free(convolvedChunk);
        fftwf_free(fftChunk);
        fftwf_free(chunk);
    }

    gettimeofday(&end, NULL);
    seconds = end.tv_sec - start.tv_sec;
    useconds = end.tv_usec - start.tv_usec;
    mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5;
    printf("Processed everything in %ld ms\n", mtime);

    // Now write everything to disk...
    FILE *fp2 = fopen("output.dat", "w");
    for(unsigned i = 0; i < threads; i++)
        for(unsigned j = 0; j < candidates[i] -> size(); j++)
        {
            Candidate cand = candidates[i] -> at(j);
            fprintf(fp2, "%f,%f,%f,%ld,%d\n", cand.dm, cand.value, cand.time, cand.bin, cand.downfact);
        }
    fflush(fp2);
    fclose(fp2);

    return 0;
}
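// A sketch of what the convolve() step above is assumed to do: multiply the
// chunk spectrum by the pre-FFT'd boxcar kernel spectrum bin by bin, run the
// inverse (c2r) plan, and rescale by 1/fftlen because FFTW transforms are
// unnormalised. The parameter order mirrors the call above, but the function
// body and the half-spectrum length are assumptions, not the original code.
#include <fftw3.h>

void convolve_sketch(const fftwf_complex* spec, const fftwf_complex* kernel,
                     fftwf_complex* specOut, float* timeOut,
                     unsigned fftlen, fftwf_plan inversePlan)
{
    for (unsigned i = 0; i < fftlen / 2; ++i)
    {
        // Complex multiply: (a + bi)(c + di) = (ac - bd) + (ad + bc)i
        specOut[i][0] = spec[i][0] * kernel[i][0] - spec[i][1] * kernel[i][1];
        specOut[i][1] = spec[i][0] * kernel[i][1] + spec[i][1] * kernel[i][0];
    }

    // The plan must have been created for (specOut -> timeOut), as convPlan is above.
    fftwf_execute(inversePlan);

    for (unsigned i = 0; i < fftlen; ++i)
        timeOut[i] /= (float) fftlen;   // undo FFTW's scaling
}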
int kmeans(int iteration_n, int class_n, int data_n, Point* centroids, Point* data, int* partitioned, int num_threads, int local_size, int argc, char** argv) { // Count number of data in each class int* count = new int[class_n]; int max_threads = omp_get_max_threads(); Point* tempCentroids = new Point[max_threads * class_n]; int* tempCount = new int[max_threads * class_n]; // Iterate through number of interations omp_set_num_threads(num_threads); for (int i = 0; i < iteration_n; i++) { #pragma omp parallel { const int ithread = omp_get_thread_num(); #pragma omp single { memset(tempCentroids, 0, max_threads * class_n * sizeof(Point)); memset(tempCount, 0, max_threads * class_n * sizeof(int)); } // Assignment step #pragma omp for for (int data_i = 0; data_i < data_n; ++data_i) { float min_dist = FLT_MAX; for (int class_i = 0; class_i < class_n; class_i++) { float x = data[data_i].x - centroids[class_i].x; float y = data[data_i].y - centroids[class_i].y; float dist = x * x + y * y; if (dist < min_dist) { partitioned[data_i] = class_i; min_dist = dist; } } // Sum up and count data for each class int index = ithread * class_n + partitioned[data_i]; tempCentroids[index].x += data[data_i].x; tempCentroids[index].y += data[data_i].y; tempCount[index]++; } // Update step #pragma omp single { // Clear sum buffer and class count memset(centroids, 0, class_n * sizeof(Point)); memset(count, 0, class_n * sizeof(int)); } #pragma omp for for (int class_i = 0; class_i < class_n; ++class_i) { for (int t = 0; t < max_threads; ++t) { centroids[class_i].x += tempCentroids[t * class_n + class_i].x; centroids[class_i].y += tempCentroids[t * class_n + class_i].y; count[class_i] += tempCount[t * class_n + class_i]; } // Divide the sum with number of class for mean point centroids[class_i].x /= count[class_i]; centroids[class_i].y /= count[class_i]; } } } delete[] tempCount; delete[] tempCentroids; delete[] count; return 0; }
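// The kmeans loop above hand-rolls per-thread accumulation (one centroid/count
// slot per thread, combined in a second pass). On compilers with OpenMP 4.5+
// the same effect can be obtained with an array-section reduction; a minimal
// sketch under that assumption. The names accumulate/sum_x/sum_y/cnt are
// hypothetical, and Point is a stand-in for the project's Point type; the
// caller is expected to zero the output arrays first.
#include <omp.h>

struct Point { float x, y; };   // assumed layout, matching the fields used above

void accumulate(const Point* data, const int* partitioned, int data_n, int class_n,
                double* sum_x, double* sum_y, int* cnt)
{
    #pragma omp parallel for reduction(+: sum_x[:class_n], sum_y[:class_n], cnt[:class_n])
    for (int i = 0; i < data_n; ++i)
    {
        int c = partitioned[i];
        sum_x[c] += data[i].x;
        sum_y[c] += data[i].y;
        cnt[c]   += 1;
    }
}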
int main(int argc, char* argv[]) { if(argc < 2) { usage(argv[0]); } int num_threads = 2; if(argc > 2) { for(int i = 0; i < argc; i++) { if(strcmp(argv[i], "-t") == 0 && argc > i+1) { num_threads = atoi(argv[i+1]); } } } omp_set_num_threads(num_threads); // Open files FILE* input_file = fopen(argv[1], "r"); if(input_file == NULL) { usage(argv[0]); } // Read the matrix int dim = 0; fscanf(input_file, "%u\n", &dim); int mat[dim][dim]; int element = 0; for(int i=0; i<dim; i++) { for(int j=0; j<dim; j++) { if (j != (dim-1)) fscanf(input_file, "%d\t", &element); else fscanf(input_file, "%d\n",&element); mat[i][j] = element; } } #ifdef _PRINT_INFO // Print the matrix printf("Input matrix [%d]\n", dim); for(int i=0; i<dim; i++) { for(int j=0; j<dim; j++) { printf("%d\t", mat[i][j]); } printf("\n"); } #endif // Algorithm based on information obtained here: // http://stackoverflow.com/questions/2643908/getting-the-submatrix-with-maximum-sum long alg_start = get_usecs(); // Compute vertical prefix sum int ps[dim][dim]; for (int j=0; j<dim; j++) { ps[0][j] = mat[0][j]; for (int i=1; i<dim; i++) { ps[i][j] = ps[i-1][j] + mat[i][j]; } } #ifdef _PRINT_INFO // Print the matrix printf("Vertical prefix sum matrix [%d]\n", dim); for(int i=0; i<dim; i++) { for(int j=0; j<dim; j++) { printf("%d\t", ps[i][j]); } printf("\n"); } #endif int max_sum = mat[0][0]; int top = 0, left = 0, bottom = 0, right = 0; //Auxilliary variables int sum[dim]; int pos[dim]; int local_max; #pragma omp parallel for private(sum, pos, local_max) schedule(static, 10) for (int i=0; i<dim; i++) { for (int k=i; k<dim; k++) { // Kandane over all columns with the i..k rows clear(sum, dim); clear(pos, dim); local_max = 0; // We keep track of the position of the max value over each Kandane's execution // Notice that we do not keep track of the max value, but only its position sum[0] = ps[k][0] - (i==0 ? 0 : ps[i-1][0]); for (int j=1; j<dim; j++) { if (sum[j-1] > 0) { sum[j] = sum[j-1] + ps[k][j] - (i==0 ? 0 : ps[i-1][j]); pos[j] = pos[j-1]; } else { sum[j] = ps[k][j] - (i==0 ? 0 : ps[i-1][j]); pos[j] = j; } if (sum[j] > sum[local_max]) { local_max = j; } } //Kandane ends here #pragma omp critical if (sum[local_max] > max_sum) { // sum[local_max] is the new max value // the corresponding submatrix goes from rows i..k. // and from columns pos[local_max]..local_max max_sum = sum[local_max]; top = i; left = pos[local_max]; bottom = k; right = local_max; } } } // Compose the output matrix int outmat_row_dim = bottom - top + 1; int outmat_col_dim = right - left + 1; int outmat[outmat_row_dim][outmat_col_dim]; for(int i=top, k=0; i<=bottom; i++, k++) { for(int j=left, l=0; j<=right ; j++, l++) { outmat[k][l] = mat[i][j]; } } long alg_end = get_usecs(); // Print output matrix printf("Sub-matrix [%dX%d] with max sum = %d, top = %d, bottom = %d, left = %d, right = %d\n", outmat_row_dim, outmat_col_dim, max_sum, top, bottom, left, right); #ifdef _PRINT_INFO for(int i=0; i<outmat_row_dim; i++) { for(int j=0; j<outmat_col_dim; j++) { printf("%d\t", outmat[i][j]); } printf("\n"); } #endif printf("%s,arg(%s),%s,%f sec, threads: %d\n", argv[0], argv[1], "CHECK_NOT_PERFORMED", ((double)(alg_end-alg_start))/1000000, num_threads); // Release resources fclose(input_file); return 0; }
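// The core of the search above is Kadane's algorithm, run once per (i,k) row
// pair on the column sums ps[k][j] - ps[i-1][j]. A standalone 1-D version of
// that inner step, returning the best sum and its [left, right] column range
// (the function name and signature are illustrative):
int kadane(const int* a, int n, int* left, int* right)
{
    int best = a[0], cur = a[0], cur_left = 0;
    *left = *right = 0;
    for (int j = 1; j < n; ++j) {
        if (cur > 0) {                 // extend the current run
            cur += a[j];
        } else {                       // start a new run at j
            cur = a[j];
            cur_left = j;
        }
        if (cur > best) {
            best = cur;
            *left = cur_left;
            *right = j;
        }
    }
    return best;
}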
int main(int argc, char *argv[]) { HashFunction hf[] = { RSHash, JSHash, ELFHash, BKDRHash, SDBMHash, DJBHash, DEKHash, BPHash, FNVHash, APHash, hash_div_701, hash_div_899, hash_mult_700, hash_mult_900 }; word_list *wl; char *word; char *bv; double start, end, diff; size_t wl_size; size_t bv_size; size_t num_hf; size_t i, j; unsigned int hash; int misspelled; // Set Number of threads to 4 omp_set_num_threads(4); // if (argc != 2) { printf("Please give word to spell check\n"); exit(EXIT_FAILURE); } word = argv[1]; /* load the word list */ wl = create_word_list("word_list.txt"); if (!wl) { fprintf(stderr, "Could not read word list\n"); exit(EXIT_FAILURE); } wl_size = get_num_words(wl); start = omp_get_wtime(); /* create the bit vector */ bv_size = 100000000; num_hf = sizeof(hf) / sizeof(HashFunction); bv = calloc(bv_size, sizeof(char)); if (!bv) { destroy_word_list(wl); exit(EXIT_FAILURE); } for (j = 0; j < num_hf; j++) { #pragma omp parallel for private(hash) for (i = 0; i < wl_size; i++) { hash = hf[j] (get_word(wl, i)); hash %= bv_size; bv[hash] = 1; } } /* do the spell checking */ misspelled = 0; for (j = 0; j < num_hf; j++) { hash = hf[j] (word); hash %= bv_size; if (bv[hash] == 0) misspelled = 1; } end = omp_get_wtime(); diff = end - start; printf("Spell check time: %f\n", diff); /* tell the user the result */ if (misspelled) printf("Word %s is misspelled\n", word); else printf("Word %s is spelled correctly\n", word); free(bv); destroy_word_list(wl); return EXIT_SUCCESS; }
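// The bit vector plus multiple hash functions above form a Bloom filter. Its
// expected false-positive rate is roughly (1 - e^(-kn/m))^k for k hash
// functions, n inserted words and m bits. A quick estimate for the sizes used
// above (m = 100,000,000 bits, k = 14 hash functions); the word-list size n is
// an assumed placeholder, since the real count comes from get_num_words():
#include <math.h>
#include <stdio.h>

int main(void)
{
    double m = 100000000.0;   /* bv_size */
    double k = 14.0;          /* number of hash functions in hf[] */
    double n = 500000.0;      /* assumed word-list size */
    double p = pow(1.0 - exp(-k * n / m), k);
    printf("expected false-positive rate ~ %g\n", p);
    return 0;
}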
int main(int argc, char ** argv) { // enhanced usage, useful for testing if (argc != 1 && argc != 2 && argc != 7) { fprintf(stderr, "Usage: %s [[threads] yMin yMax xMin xMax dxy]\n", argv[0]); fprintf(stderr, "Either specify no args, or only threads, or all args.\n"); return -2; } // determine amount of threads if (argc > 1) omp_set_num_threads(atoi(argv[1])); else omp_set_num_threads(4); // set constants if supplied if (argc == 7) { yMin = atof(argv[2]); yMax = atof(argv[3]); xMin = atof(argv[4]); xMax = atof(argv[5]); dxy = atof(argv[6]); } double time; timer_start(); double cx, cy; double zx, zy, new_zx; unsigned char n; int nx, ny; // The Mandelbrot calculation is to iterate the equation // z = z*z + c, where z and c are complex numbers, z is initially // zero, and c is the coordinate of the point being tested. If // the magnitude of z remains less than 2 for ever, then the point // c is in the Mandelbrot set. We write out the number of iterations // before the magnitude of z exceeds 2, or UCHAR_MAX, whichever is // smaller. nx = 0; ny = 0; nx = (xMax - xMin) / dxy; ny = (yMax - yMin) / dxy; int i, j; unsigned char * buffer = malloc(nx * ny * sizeof(unsigned char)); if (buffer == NULL) { fprintf (stderr, "Couldn't malloc buffer!\n"); return EXIT_FAILURE; } // do the calculations parallel #pragma omp parallel for private(i, j, cx, zx, zy, n, new_zx, cy) for (i = 0; i < ny; i++) { cy = yMin - dxy + i * dxy; for (j = 0; j < nx; j++) { cx = xMin - dxy + j * dxy; zx = 0.0; zy = 0.0; n = 0; while ((zx*zx + zy*zy < 4.0) && (n != UCHAR_MAX)) { new_zx = zx*zx - zy*zy + cx; zy = 2.0*zx*zy + cy; zx = new_zx; n++; } buffer[i * nx + j] = n; } } time = timer_end(); fprintf (stderr, "Took %g seconds.\nNow writing file...\n", time); fwrite(buffer, sizeof(unsigned char), nx * ny, stdout); fprintf (stderr, "All done! To process the image: convert -depth 8 -size " \ "%dx%d gray:output out.jpg\n", nx, ny); return 0; }
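// A standalone version of the escape-time test in the inner loop above:
// iterate z = z*z + c from z = 0 and count iterations until |z| exceeds 2 or
// the iteration cap is reached; c is taken to be in the Mandelbrot set if the
// cap is hit. The helper name is illustrative.
#include <limits.h>

static unsigned char escape_count(double cx, double cy)
{
    double zx = 0.0, zy = 0.0;
    unsigned char n = 0;
    while (zx * zx + zy * zy < 4.0 && n != UCHAR_MAX) {
        double t = zx * zx - zy * zy + cx;   /* real part of z*z + c */
        zy = 2.0 * zx * zy + cy;             /* imaginary part */
        zx = t;
        n++;
    }
    return n;
}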
int main(int argc, char * argv[])
{
    if(argc != 11)
    {
        cout<<"usage: PLSACluster <inputfile> <indexmidfile> <indextagfile> <crossfolds> <numTopics> <numIters> <anneal> <numBlocks> <top-k words> <pos>"<<endl;
        cout<<"./PLSACluster data/inputtagsformat.txt data/indexmediaid.txt data/indextag.txt 10 200 200 100 8 50 0"<<endl;
        return 1;
    }

    char* inputfile=argv[1];        // input file
    char* indexmidfile=argv[2];     // mid inverted index table file
    char* indextagfile=argv[3];     // tag inverted index table file
    int crossfold=atoi(argv[4]);    // cross validation dataset 10(1:9)
    int numLS=atoi(argv[5]);        // topic number
    int numIters=atoi(argv[6]);     // iteration number
    int anneal=atoi(argv[7]);       // simulated annealing
    int numBlocks=atoi(argv[8]);    // block number
    int topk=atoi(argv[9]);         // number of tags in each topic
    int pos=atoi(argv[10]);

    int cpu_core_nums = omp_get_num_procs();
    omp_set_num_threads(cpu_core_nums);

    iPLSA * plsa;
    plsa=new iPLSA(inputfile,indexmidfile,indextagfile,crossfold, numLS, numIters, 1, 1, 0.552, anneal, 0.92, cpu_core_nums, numBlocks, pos);
    plsa->run();

    double ** p_d_z = plsa->get_p_d_z();
    double ** p_w_z = plsa->get_p_w_z();
    int document_num = plsa->numDocs();
    int topic_num = plsa->numCats();
    int word_num = plsa->numWords();

    int midcount = plsa->numDocs();
    vector<int> index2mid(midcount);
    vector<string> index2tag(word_num);
    ifstream in_inter(indexmidfile);
    ifstream in_inter2(indextagfile);
    loadmidinfo(in_inter,index2mid);
    loadtaginfo(in_inter2,index2tag);

    FILE *doc2topic_fp = fopen("doc2topic_distribution.txt","w");
    if(doc2topic_fp==NULL)
        return -1;
    for( int i = 0; i < document_num; ++i )
    {
        fprintf(doc2topic_fp, "%d ", index2mid[i]);
        for( int j = 1; j < topic_num; ++j )
        {
            fprintf(doc2topic_fp, "%f ", p_d_z[i][j]);
        }
        fprintf(doc2topic_fp, "\n");
    }
    fclose(doc2topic_fp);

    FILE *topic2word_fp = fopen("topic2word_distribution.txt","w");
    if(topic2word_fp==NULL)
        return -1;
    for( int i = 0; i < topic_num; ++i )
    {
        map<int,double> wMap;
        for( int w = 0; w<word_num; w++ )
        {
            wMap[w] = p_w_z[w][i];
        }
        vector< pair<int, double> > wVector;
        sortMapByValue(wMap,wVector);
        for( int w = 1; w<=topk; w++ )
        {
            fprintf(topic2word_fp, "%s:%f ",index2tag[wVector[w].first].c_str(), wVector[w].second);
        }
        fprintf(topic2word_fp, "\n");
    }
    fclose(topic2word_fp);

    return 0;
}
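// A sketch of what sortMapByValue() above is assumed to do: copy the
// (word index, probability) pairs into a vector and sort them by probability
// in descending order, so the leading entries are the top-k tags for a topic.
// The function name here is a hypothetical stand-in for the project's helper.
#include <algorithm>
#include <map>
#include <utility>
#include <vector>

void sort_map_by_value(const std::map<int, double>& m,
                       std::vector<std::pair<int, double> >& out)
{
    out.assign(m.begin(), m.end());
    std::sort(out.begin(), out.end(),
              [](const std::pair<int, double>& a, const std::pair<int, double>& b) {
                  return a.second > b.second;   // highest probability first
              });
}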
/* * Class: com_intel_analytics_bigdl_mkl_MKL * Method: setNumThreads * Signature: (I)V */ JNIEXPORT void JNICALL Java_com_intel_analytics_bigdl_mkl_MKL_setNumThreads (JNIEnv * env, jclass cls, jint num_threads) { omp_set_num_threads(num_threads); }