int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  double start_time, end_time;
  // Report available threads
  omp_set_nested(1);
  int maxThreads = omp_get_max_threads();
  printf("Available threads: %d\n", maxThreads);
  // Initialize the array.
  InitArray();
  // Print data if in debug mode.
  if (DEBUG) {
    printf("===== BEFORE QUICK SORT (OMP) =====\n\n");
    PrintArray();
    printf("===================================\n\n\n");
  }
  // Start timer.
  start_time = MPI_Wtime();
  // Split into maxThreads chunks and sort each chunk in parallel
  int subArraySize = ITEMS / maxThreads;
  int i;
  #pragma omp parallel for
  for (i = 0; i < maxThreads; i++) {
    QuickSort(v, i * subArraySize, (i * subArraySize) - 1 + subArraySize);
  }
  // Interleave the sorted chunks into `sorted` (transpose shuffle)
  int j;
  for (i = 0; i < ITEMS / maxThreads; i++) {
    for (j = 0; j < maxThreads; j++) {
      sorted[maxThreads * i + j] = v[subArraySize * j + i];
    }
  }
  // Sort each interleaved slice in parallel
  #pragma omp parallel for
  for (i = 0; i < subArraySize; i++) {
    QuickSort(sorted, i * maxThreads, i * maxThreads + maxThreads - 1);
  }
  // Stop timer.
  end_time = MPI_Wtime();
  // Print data if in debug mode.
  if (DEBUG) {
    printf("===== AFTER QUICK SORT (OMP) ======\n\n");
    PrintArray();
    printf("===================================\n\n");
  } else {
    printf("Lowest: %d\n", sorted[0]);
    printf("Highest: %d\n", sorted[ITEMS - 1]);
  }
  double time_taken = (end_time - start_time);
  printf("Execution time: %fs\n", time_taken);
  CleanMemory();
  MPI_Finalize();
  return 0;
}
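/*
 * Editor's sketch (not from the original): the program above sorts
 * maxThreads contiguous chunks in parallel and then re-sorts an interleaved
 * shuffle of them. The first phase in minimal, self-contained form, with
 * qsort() standing in for the snippet's QuickSort(); chunk_sort() and
 * cmp_int() are illustrative names, and nchunks is assumed to divide n.
 */
#include <stdlib.h>
#include <omp.h>

static int cmp_int(const void *a, const void *b) {
  int x = *(const int *)a, y = *(const int *)b;
  return (x > y) - (x < y);
}

static void chunk_sort(int *a, int n, int nchunks) {
  int chunk = n / nchunks;              /* one equal chunk per thread */
  #pragma omp parallel for
  for (int i = 0; i < nchunks; i++)
    qsort(a + i * chunk, (size_t)chunk, sizeof(int), cmp_int);
}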
static void createPreMarkers(RoadMapArray * rdmaps, PreGraph * preGraph, IDnum * chains) {
  IDnum sequenceIndex;
  IDnum referenceCount = rdmaps->referenceCount;
#ifndef _OPENMP
  Annotation *annot = rdmaps->annotations;
#endif

#ifdef _OPENMP
  int threads = omp_get_max_threads();
  if (threads > 8)
    threads = 8;
  #pragma omp parallel for num_threads(threads)
#endif
  for (sequenceIndex = 1; sequenceIndex <= referenceCount; sequenceIndex++) {
#ifdef _OPENMP
    Annotation *annot = getAnnotationInArray(rdmaps->annotations, annotationOffset[sequenceIndex - 1]);
#endif
    RoadMap *rdmap;
    Coordinate currentPosition, currentInternalPosition;
    IDnum currentPreNodeID, nextInternalPreNodeID;
    IDnum annotIndex, lastAnnotIndex;
    PreMarker * previous;

    if (sequenceIndex % 1000000 == 0)
      velvetLog("Connecting %li / %li\n", (long) sequenceIndex, (long) sequenceCount_pg(preGraph));
    rdmap = getRoadMapInArray(rdmaps, sequenceIndex - 1);
    annotIndex = 0;
    lastAnnotIndex = getAnnotationCount(rdmap);
    nextInternalPreNodeID = chooseNextInternalPreNode(chains[sequenceIndex] - 1, sequenceIndex, preGraph, chains);
    previous = NULL;
    currentPosition = 0;
    currentInternalPosition = 0;
    currentPreNodeID = 0;
    // Iterate up to the last annotation
    while (annotIndex < lastAnnotIndex || nextInternalPreNodeID != 0) {
      if (annotIndex == lastAnnotIndex || (nextInternalPreNodeID != 0 && currentInternalPosition < getPosition(annot))) {
#ifdef _OPENMP
        lockNode(nextInternalPreNodeID);
#endif
        previous = addPreMarker_pg(preGraph, nextInternalPreNodeID, sequenceIndex, &currentPosition, previous);
#ifdef _OPENMP
        unLockNode(nextInternalPreNodeID);
#endif
        currentPreNodeID = nextInternalPreNodeID;
        nextInternalPreNodeID = chooseNextInternalPreNode(currentPreNodeID, sequenceIndex, preGraph, chains);
        currentInternalPosition += getPreNodeLength_pg(currentPreNodeID, preGraph);
      } else {
        reConnectAnnotation(&currentPreNodeID, annot, &currentPosition, sequenceIndex, preGraph, &previous);
        annot = getNextAnnotation(annot);
        annotIndex++;
      }
    }
  }
}
int colvarproxy_lammps::smp_num_threads() { return omp_get_max_threads(); }
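/*
 * Editor's note: a common companion to a one-line wrapper like the one
 * above is a preprocessor guard so the same call site compiles in a
 * serial build. This is an illustrative variant, not part of the
 * colvarproxy_lammps API:
 */
#if defined(_OPENMP)
#include <omp.h>
static int smp_num_threads_or_one(void) { return omp_get_max_threads(); }
#else
static int smp_num_threads_or_one(void) { return 1; }  /* serial fallback */
#endif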
void lis_matvec_ccs(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[]) { LIS_INT i,j,js,je,jj; LIS_INT n,np; LIS_SCALAR t; #ifdef _OPENMP LIS_INT k,nprocs; LIS_SCALAR *w; #endif n = A->n; np = A->np; if( A->is_splited ) { for(i=0; i<n; i++) { y[i] = A->D->value[i]*x[i]; } for(i=0; i<np; i++) { js = A->L->ptr[i]; je = A->L->ptr[i+1]; t = x[i]; for(j=js;j<je;j++) { jj = A->L->index[j]; y[jj] += A->L->value[j] * t; } js = A->U->ptr[i]; je = A->U->ptr[i+1]; t = x[i]; for(j=js;j<je;j++) { jj = A->U->index[j]; y[jj] += A->U->value[j] * t; } } } else { #ifdef _OPENMP nprocs = omp_get_max_threads(); w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvec_ccs::w" ); #pragma omp parallel private(i,j,js,je,t,jj,k) { k = omp_get_thread_num(); #pragma omp for for(j=0;j<nprocs;j++) { memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) ); } #pragma omp for for(i=0; i<np; i++) { js = A->ptr[i]; je = A->ptr[i+1]; t = x[i]; for(j=js;j<je;j++) { jj = k*np+A->index[j]; w[jj] += A->value[j] * t; } } #pragma omp for for(i=0;i<n;i++) { t = 0.0; for(j=0;j<nprocs;j++) { t += w[j*np+i]; } y[i] = t; } } lis_free(w); #else for(i=0; i<n; i++) { y[i] = 0.0; } for(i=0; i<np; i++) { js = A->ptr[i]; je = A->ptr[i+1]; t = x[i]; for(j=js;j<je;j++) { jj = A->index[j]; y[jj] += A->value[j] * t; } } #endif } }
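/*
 * Editor's sketch: the OpenMP branch above sidesteps write races on y[] by
 * giving each thread a private length-np slice of the scratch array w and
 * reducing the slices afterwards. The same pattern in isolation (all names
 * here are hypothetical; indices in target[] are assumed to lie in [0,n)):
 */
#include <stdlib.h>
#include <omp.h>

void scatter_accumulate(int n, const double *contrib, const int *target, double *y) {
  int nthreads = omp_get_max_threads();
  double *w = (double *)calloc((size_t)nthreads * n, sizeof(double));
  #pragma omp parallel
  {
    double *mine = w + (size_t)omp_get_thread_num() * n;  /* private slice */
    #pragma omp for
    for (int i = 0; i < n; i++)
      mine[target[i]] += contrib[i];                      /* race-free */
  }
  for (int i = 0; i < n; i++) {                           /* reduce slices */
    double t = 0.0;
    for (int p = 0; p < nthreads; p++)
      t += w[(size_t)p * n + i];
    y[i] = t;
  }
  free(w);
}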
//MAIN
int main(int argc,char **argv){

  ///////////////////////////////
  //INITIALIZE MPI ENVIRONMENT //
  ///////////////////////////////

  MPI_Init(&argc, &argv);
  MPI_Barrier(MPI_COMM_WORLD);

  //SET MPI ID's AND NUMBER OF NODES
  MPI_Comm_rank(MPI_COMM_WORLD,&MPIBasic::ID);
  MPI_Comm_size(MPI_COMM_WORLD,&MPIBasic::NumberOfNodes);

  ////////////////////////////////////
  // INITIALIZE OPEN MP ENVIRONMENT //
  ////////////////////////////////////

  //CHECK THREAD COUNT
  std::cerr << "#NUMBER OF THREADS " << omp_get_max_threads() << std::endl;

  //INITIALIZE THREADED FFTW
  int FFTW3_THREAD_STATUS=fftw_init_threads();
  std::cerr << "#FFTW THREAD STATUS " << FFTW3_THREAD_STATUS << std::endl;
  if(FFTW3_THREAD_STATUS==1){
    fftw_plan_with_nthreads(omp_get_max_threads());
  }

  //////////////////////////////////
  //PROCESS COMMANDLINE ARGUMENTS //
  //////////////////////////////////

  INT NumberOfConfigurations=1;

  Konfig arguments(argc,argv);

  //GET NUMBER OF CONFIGURATIONS
  arguments.Getval("nconfs",NumberOfConfigurations);

  //////////////////////////
  // SET OUTPUT DIRECTORY //
  //////////////////////////

  char OutDir[256]="OUTPUT";
  arguments.Getval("o",OutDir);
  IO::SetOutputDirectory(OutDir);

#if IC_FLAG==LOAD_FLAG
  /////////////////////////
  // SET INPUT DIRECTORY //
  /////////////////////////

  char InDir[256]="INPUT";
  arguments.Getval("i",InDir);
  IO::SetInputDirectory(InDir);

  /////////////////////
  // SET INPUT FILES //
  /////////////////////

  // FOR LOADING FILES
  INT InputFileTime=0;
  INT InputFileID=1457712671;
  arguments.Getval("iT",InputFileTime);
  arguments.Getval("iID",InputFileID);
  IO::SetInputFile(InputFileTime,InputFileID);
#endif

  ////////////////////////////
  // DETERMINE LATTICE SIZE //
  ////////////////////////////

  INT NSites=-1;
  arguments.Getval("N",NSites);
  if(NSites>0){
    Lattice::N[0]=NSites; Lattice::N[1]=NSites; Lattice::N[2]=NSites;
    Lattice::Volume=NSites*NSites*NSites;
    std::cerr << "## LATTICE SIZE IS " << Lattice::N[0] << "x" << Lattice::N[1] << "x" << Lattice::N[2] << std::endl;
  }
  else{
    std::cerr << "## NUMBER OF SITES NOT SPECIFIED -- USING " << Lattice::N[0] << "x" << Lattice::N[1] << "x" << Lattice::N[2] << std::endl;
  }

  ///////////////////////////////
  // GET SIMULATION PARAMETERS //
  ///////////////////////////////

  DOUBLE InvTemp=-1;
  arguments.Getval("beta",InvTemp);
  if(InvTemp>0.0){
    LangevinDynamics::beta=InvTemp;
    std::cerr << "#beta=" << LangevinDynamics::beta << std::endl;
  }

  //////////////
  // SIMULATE //
  //////////////

  //COMMAND LINE OUTPUT
  std::cerr << "#GAUGE GROUP IS SU(" << Nc << ")" << std::endl;
  std::cerr << "#PRECISION IS " << MAX_DIGITS_PRECISION << " DIGITS" << std::endl;

  //INITIALIZE SIMULATION
  Simulation::Init();

  // SAMPLE DIFFERENT CONFIGURATIONS //
  for(INT n=0;n<NumberOfConfigurations;n++){

    //SET GLOBAL RANDOM NUMBER SEED//
    INT GLOBAL_RNG_SEED;
    if(MPIBasic::ID==0){
      GLOBAL_RNG_SEED=time(0);
      arguments.Getval("SEED",GLOBAL_RNG_SEED);
    }

    // BROADCAST GLOBAL RANDOM SEED //
    MPI_Bcast(&GLOBAL_RNG_SEED, 1, MPI_INT,0,MPI_COMM_WORLD);

    // PERFORM CLASSICAL STATISTICAL SIMULATION //
    Simulation::Run(GLOBAL_RNG_SEED+MPIBasic::ID);

    // COMMANDLINE NOTIFICATION //
    std::cerr << "#COMPLETED " << GLOBAL_RNG_SEED+MPIBasic::ID << std::endl;
  }

  //SYNCHRONIZE ALL MPI NODES
  MPI_Barrier(MPI_COMM_WORLD);

  //FINALIZE MPI
  MPI_Finalize();

  //EXIT
  exit(0);
}
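/*
 * Editor's note: the FFTW calls above follow the order FFTW's threading
 * documentation requires -- fftw_init_threads() once, then
 * fftw_plan_with_nthreads() before any plan is created. Minimal sketch
 * (setup_threaded_fftw() is an illustrative name):
 */
#include <fftw3.h>
#include <omp.h>

int setup_threaded_fftw(void) {
  if (fftw_init_threads() == 0)    /* returns 0 on failure */
    return 0;
  fftw_plan_with_nthreads(omp_get_max_threads());
  return 1;                        /* plans created after this use all threads */
}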
int main (int argc, char **argv) { int ret; int c; int pin2core = 0; // 1=> pin threads to single core int pin2range = 0; // 1=> pin threads to range of cores int cpn = 1; // default cores per node ret = MPI_Init (&argc, &argv); ret = MPI_Comm_rank (MPI_COMM_WORLD, &iam); shiftiam = iam; ret = MPI_Comm_size (MPI_COMM_WORLD, &nranks); pid = getpid (); nthreads = omp_get_max_threads (); core = malloc (nthreads * sizeof (int)); status = malloc (nthreads * sizeof (char)); tidarr = malloc (nthreads * sizeof (int)); fp = malloc (nthreads * sizeof (FILE *)); init_core (nthreads); while ((c = getopt (argc, argv, "hcrn:w:")) != -1) { switch (c) { case 'h': if (iam == 0) { printf ("Usage: %s [-c] to pin to single core\n" " [-r] to pin to range of cores\n" " [-n <num>] number of cores per node\n" " [-w <num> number of seconds between forced shifts", argv[0]); } return 0; break; case 'c': pin2core = 1; break; case 'r': pin2range = 1; break; case 'n': cpn = atoi (optarg); break; case 'w': shiftintvl = atoi (optarg); break; default: printf ("unknown option %c\n", c); return 1; break; } } if (pin2core) { if (iam == 0) { printf ("Pinning threads to individual cores\n"); } ret = set_affinity_ (&iam, &cpn, &nthreads, &pin2core); } else if (pin2range) { if (iam == 0) { printf ("Pinning threads to subsetted range of cores\n"); } ret = set_affinity_ (&iam, &cpn, &nthreads, &pin2core); } else { if (iam == 0) { printf ("No pinning\n"); } } ret = print_affinity_ (&iam); fill_tid_fp (); while (1) { // Loop for some time (default 10 seconds), printing any core attachment changes threaded_loop (); // Change affinity to guarantee all is working as expected if (pin2core || pin2range) { shiftiam = (shiftiam + 1) % nranks; printf ("shiftiam=%d\n", shiftiam); ret = set_affinity_ (&shiftiam, &cpn, &nthreads, &pin2core); } ret = print_affinity_ (&iam); print_all_statuses (); } }
//place_halos():
//
//Takes a list of halo masses (Nhalos, HaloMass), a list of particles
// (NTotPart,PartX,PartY,PartZ), some simulation parameters (L, mp), and
// user-defined parameters (Nlin,rho_ref,alpha,Malpha,Nalpha,seed)
//and returns a list of halo positions and radii (HaloX,HaloY,HaloZ,HaloR)
int place_halos(long Nend, float *HaloMass, long Nlin, long NTotPart, float *PartX, float *PartY, float *PartZ, float *PartVX, float *PartVY, float *PartVZ, float L, float rho_ref, long seed, float mp, int nthreads, double *alpha, double *fvel, double *Malpha, long Nalpha, float recalc_frac, float *HaloX, float *HaloY, float *HaloZ, float *HaloVX, float *HaloVY, float *HaloVZ, float *HaloR, long **ListOfPart, long *NPartPerCell){

  fprintf(stderr,"\tThis is place_halos.c\n");

  //Initialising -------------------------------------------------
  long i,j,k,lin_ijk, Nmin;
  long *count,trials;
  long ihalo, ipart,i_alpha;
  double invL = 1./L;
  float Mcell,Mhalo,Mchange;
  float R;
  time_t t0,tI,tII;
  int check;
  double mpart,fvel_i;
  double exponent;
  double TotProb;
  double prob_repicked = 0.0;
  double *MassLeft;
  double *CumulativeProb;
  long *ListOfHalos, *NHalosPerCellStart, *NHalosPerCellEnd;
  long Nhalos;
  int recalc;
  float diff;
  time_t t5;
#ifdef VERB
  time_t t1,t3,t4,t4_5;
#endif
  long n_recalc = 0;
  int use_vel = 1;

  if (HaloVX==NULL)
    use_vel = 0;

  NCells = Nlin;
  Lbox = L;
  t0 = time(NULL);
  NTotCells = NCells*NCells*NCells;

  //Allocate memory for the arrays
  MassLeft = (double *) calloc(NTotCells,sizeof(double));
  if(MassLeft == NULL) {
    fprintf(stderr,"\tplace_halos(): could not allocate %ld array for MassLeft[]\nABORTING",NTotCells);
    exit(-1);
  }
  NHalosPerCellStart = (long *) calloc(NTotCells,sizeof(long));
  if(NHalosPerCellStart == NULL) {
    fprintf(stderr,"\tplace_halos(): could not allocate %ld array for NHalosPerCell[]\nABORTING",NTotCells);
    exit(-1);
  }
  NHalosPerCellEnd = (long *) calloc(NTotCells,sizeof(long));
  if(NHalosPerCellEnd == NULL) {
    fprintf(stderr,"\tplace_halos(): could not allocate %ld array for NHalosPerCell[]\nABORTING",NTotCells);
    exit(-1);
  }
  count = (long *) calloc(NTotCells,sizeof(long));
  if(count == NULL) {
    fprintf(stderr,"\tplace_halos(): could not allocate %ld array for NTotCells[]\nABORTING",NTotCells);
    exit(-1);
  }
  CumulativeProb = (double *) calloc(NTotCells, sizeof(double));
  if(CumulativeProb == NULL) {
    fprintf(stderr,"\tplace_halos(): could not allocate %ld array for CumulativeProb[]\nABORTING",NTotCells);
    exit(-1);
  }

  if (nthreads<1){
    NTHREADS = omp_get_max_threads();
  }else{
    NTHREADS = nthreads;
  }

#ifdef NO_EXCLUSION
  int *already_chosen;
  already_chosen = (int*) calloc(NTotPart,sizeof(int));
  if(already_chosen == NULL) {
    fprintf(stderr,"\tplace_halos(): could not allocate %ld array for already_chosen[]\nABORTING",NTotPart);
    exit(-1);
  }
#endif

  //Initialise random numbers
#ifdef VERB
  fprintf(stderr,"\tinput seed: %ld. time0: %f.",seed, (float) t0);
#endif
  if (seed>=0){
    srand(seed);
#ifdef VERB
    fprintf(stderr,"\tUsed: %ld \n",seed);
#endif
  } else {
    srand(t0);
#ifdef VERB
    fprintf(stderr,"\tSeed Used: %ld \n",(long) t0);
#endif
  }

  mpart = (double) mp;
  Nmin = (long)ceil(HaloMass[Nend-1]*0.9/mpart);
  lcell = (float) L/NCells;

#ifdef VERB
  fprintf(stderr,"\n\tParticles and Halos placed in %ld^3 cells\n",NCells);
  fprintf(stderr,"\tBOX = %f lcell =%f rho_ref = %e invL %f\n",L,L/NCells,rho_ref,invL);
  fprintf(stderr,"\tNhalostart = %d,Nhalosend = %ld, NPart = %ld\n",0, Nend, NTotPart);
  fprintf(stderr,"\n\tMinimum mass= %e. Minimum part per halo = %ld. mpart %e\n",HaloMass[Nend-1],Nmin,mpart);
#endif

#ifdef DEBUG
  fprintf(stderr,"\n\tRAND_MAX=%d\n",RAND_MAX);
  fprintf(stderr,"\tX[0] = %f Y[0] = %f Z[0] = %f\n",PartX[0],PartY[0],PartZ[0]);
  fprintf(stderr,"\tX[1] = %f Y[1] = %f Z[1] = %f\n",PartX[1],PartY[1],PartZ[1]);
  fprintf(stderr,"\tM[0] = %e \n",HaloMass[0]);
  fprintf(stderr,"\tM[1] = %e \n",HaloMass[1]);
  fprintf(stderr,"\t ... \n");
  fprintf(stderr,"\tM[%ld] = %e \n",Nend-1,HaloMass[Nend-1]);
  fprintf(stderr,"\tX[%ld] = %f Y[%ld] = %f Z[%ld] = %f\n",Nend-1,PartX[Nend-1],Nend-1,PartY[Nend-1],Nend-1,PartZ[Nend-1]);
#endif

  int r = (int) (R_from_mass(HaloMass[0],rho_ref)/(L/NCells));
  if (L/NCells<R_from_mass(HaloMass[0],rho_ref)){
    fprintf(stderr,"WARNING: cell size is smaller than the radius of the biggest halo. Using r=%i. This may be problematic\n",r);
  }
#ifdef VERB
  fprintf(stderr,"\tR_max=%f, lcell=%f, r=%d\n",R_from_mass(HaloMass[0],rho_ref),(L/NCells),r);
  t1=time(NULL);
  diff = difftime(t1,t0);
  fprintf(stderr,"\ttime of initialisation %f\n",diff);
#endif
  // ------------------------------------------------- Initialised

  //Alloc Enough Memory
  Nhalos=0;
  for (i=0;i<NCells;i++){
    for (j=0;j<NCells;j++){
      for (k=0;k<NCells;k++){
        lin_ijk = k+j*NCells+i*NCells*NCells;
        NHalosPerCellStart[lin_ijk] = Nhalos;
        NHalosPerCellEnd[lin_ijk] = Nhalos;
        Nhalos += (long) floor(NPartPerCell[lin_ijk]/Nmin+1);
        MassLeft[lin_ijk] = (double) NPartPerCell[lin_ijk]*mpart;
#ifdef ULTRADEBUG
        if (lin_ijk<10 || lin_ijk > (NCells*NCells*NCells) - 10){
          fprintf(stderr,"\tAllocated %ld (longs) in ListOfPart(%ld=[%ld,%ld,%ld])\n",NPartPerCell[lin_ijk],lin_ijk,i,j,k);
        }
#endif
      }
    }
  }
  ListOfHalos = (long *) calloc(Nhalos,sizeof(long));
  if(ListOfHalos == NULL) {
    fprintf(stderr,"\tplace_halos(): could not allocate %ld array for ListOfHalos[]\nABORTING",Nhalos);
    exit(-1);
  }

#ifdef VERB
  // fprintf(stderr,"\tAllocated %ld (longs) in ListOfHalos\n",Nhalos);
  t3=time(NULL);
  diff = difftime(t3,t1);
  fprintf(stderr,"\t... memory allocated in %f\n",diff);
  fprintf(stderr,"\tComputing probabilities...\n");
#endif

#ifdef DEBUG
  fprintf(stderr,"\tMass_cell[0]=%e",MassLeft[0]);
  fprintf(stderr,"\t Mass Function\n");
  for (ihalo=0;ihalo<15;ihalo++){
    fprintf(stderr,"\thalo %ld: ",ihalo);
    fprintf(stderr,"M=%e\n",HaloMass[ihalo]);
  }
#endif

  //----------------------------------- Particles and haloes assigned to grid

  //Computing Cumulative Probability -----------------------------
  //find the right alpha
  Mhalo = HaloMass[0];
  i_alpha = 0;
  while(Mhalo<Malpha[i_alpha]) {
    i_alpha++;
    if (i_alpha==Nalpha){
      fprintf(stderr,"\tERROR: No M_alpha low enough found\n");
      fprintf(stderr,"\tERROR: N_alpha = %ld, Mh=%e, Ma= %e\n",Nalpha,Mhalo,Malpha[i_alpha-1]);
      exit(0);
    }
  }
  Mchange = Malpha[i_alpha];
  exponent = alpha[i_alpha];
  fvel_i = fvel[i_alpha];

  //compute the probability
#ifdef VERB
  fprintf(stderr,"\tUsing OMP with %d threads\n",NTHREADS);
  t4=time(NULL);
#endif
  TotProb = ComputeCumulative(exponent, mpart, MassLeft, CumulativeProb);
#ifdef VERB
  fprintf(stderr,"\n\tcase 0, TotProb=%e\n",TotProb);
  fprintf(stderr,"\tNumber of alphas: %ld\n",Nalpha);
  fprintf(stderr,"\tUsing alpha_%ld=%f for M>%e\n",i_alpha,exponent,Mchange);
  t4_5=time(NULL);
  diff = difftime(t4_5,t4);
  fprintf(stderr,"\tprobability computed in %f seconds\n",diff);
#endif
  // ----------------------------------------- Computed Probability

  //Actually placing the haloes-----------------------------------
#ifdef VERB
  fprintf(stderr,"\n\tPlacing Halos...\n\n");
#endif

  //Place one by one all the haloes (assumed to be ordered from the most massive to the least massive)
  for (ihalo=0;ihalo<Nend;ihalo++){
#ifdef DEBUG
    fprintf(stderr,"\n\t- Halo %ld ",ihalo);
#endif
#ifdef VERB
    if (ihalo%(Nend/10)==0 && ihalo>0){
      //TEMPORARY
      fprintf(stderr,"\t\tFRAC, TOTPROB: %e, %e",(pow(Mcell/mpart,exponent)/TotProb),TotProb);
      fprintf(stderr,"\t%ld%% done\n",(ihalo/(Nend/100)));
    }
#endif
    //Check whether or not a change of alpha is needed for this halo mass
    Mhalo = HaloMass[ihalo];
    recalc = 0;
    while (Mhalo < Mchange){ //if so, search for the right alpha and recompute probabilities
      i_alpha++;
      if (i_alpha==Nalpha){
        fprintf(stderr,"\tERROR: No M_alpha low enough found: %e <%e\n",Mhalo,Malpha[Nalpha-1]);
        exit(0);
      }
      Mchange = Malpha[i_alpha];
      exponent = alpha[i_alpha];
      fvel_i = fvel[i_alpha];
#ifdef VERB
      fprintf(stderr,"\n\tUsing alpha_%ld=%f and fvel=%f for M>%e\n",i_alpha,exponent,fvel_i,Mchange);
#endif
      recalc = 1;
    }
    // recalc if different alpha, OR there's a significant chance of choosing the same cell again.
    if(ihalo>0){
      if(prob_repicked>=recalc_frac){
        recalc = 1;
        n_recalc += 1;
        fprintf(stderr,"RECALCULATING: %ld, %e, ihalo=%ld\n",n_recalc,prob_repicked,ihalo);
      }
    }

    if (recalc==1){
      tI=time(NULL);
      fprintf(stderr,"\tcase 1, TotProb_bef=%e",TotProb);
      TotProb=ComputeCumulative(exponent, mpart, MassLeft, CumulativeProb);
      fprintf(stderr," TotProb_aft=%e ihalo=%ld\n\n",TotProb,ihalo);
      prob_repicked=0.0;
#ifdef VERB
      tII=time(NULL);
      diff = difftime(tII,tI);
      fprintf(stderr,"\tProbability recomputed in %f seconds\n",diff);
#endif
      recalc = 0;
    }

    do {
      //First, choose a cell
#ifndef RANKED
      trials=0;
      do{
        if (trials==MAXTRIALS){
          fprintf(stderr,"MAXTRIALS=%d times picked an empty cell, recomputing Probs...\n",MAXTRIALS);
          fprintf(stderr,"\n\tcase 2, TotProb_bef=%e",TotProb);
          TotProb=ComputeCumulative(exponent, mpart, MassLeft, CumulativeProb);
          fprintf(stderr," TotProb_aft=%e ihalo=%ld\n",TotProb,ihalo);
          prob_repicked = 0.0;
          trials=0;
        }
        lin_ijk = select_cell(TotProb, CumulativeProb);
        trials++;
      }while (MassLeft[lin_ijk]==0.);

      k=lin_ijk%(NCells);
      j=((lin_ijk-k)/NCells)%NCells;
      i=(lin_ijk-k-j*NCells)/(NCells*NCells);
#else
      //RANKED option: deprecated and not optimised
      lin_ijk=select_heaviest_cell(&i,&j,&k,MassLeft);
#endif
      trials=0;

      //Second, choose a particle in that cell
      do {
        ipart = select_part_beta_0(lin_ijk,ListOfPart, NPartPerCell);
        if (ipart<0){
          fprintf(stderr,"WARNING: Picked up a completely empty cell (ihalo %ld) lin_ijk=%ld \n",ihalo,lin_ijk);
          MassLeft[lin_ijk]=0.;
          check=1; //Choose another cell
          break;
        }
        HaloX[ihalo] = PartX[ipart];
        HaloY[ihalo] = PartY[ipart];
        HaloZ[ihalo] = PartZ[ipart];
#ifdef DEBUG
        fprintf(stderr,"HaloX=%f PartX=%f\n",HaloX[ihalo],PartX[ipart]);
#endif
        if (use_vel==1){
          HaloVX[ihalo] = fvel_i * PartVX[ipart];
          HaloVY[ihalo] = fvel_i * PartVY[ipart];
          HaloVZ[ihalo] = fvel_i * PartVZ[ipart];
        }
        R=R_from_mass(HaloMass[ihalo],rho_ref);
        HaloR[ihalo]= R;
#ifdef NO_EXCLUSION
        check = already_chosen[ipart];
        already_chosen[ipart]=1;
#else
        //Third, check that it is not overlapping a previous halo
        check = check_HaloR_in_mesh(ihalo,HaloX,HaloY,HaloZ,HaloR,i,j,k,ListOfHalos,NHalosPerCellStart,NHalosPerCellEnd,r);
#endif
        if (check==1){
#ifdef DEBUG
          fprintf(stderr,"Refused part : %ld\n",ipart);
#endif
          trials++;
        }
        if (trials == MAXTRIALS){
          //in order to avoid an infinite loop, we exit this loop after MAXTRIALS trials
#ifdef VERB
          fprintf(stderr,"MAXTRIALS=%d reached, removing cell [%ld,%ld,%ld]\n",MAXTRIALS,i,j,k);
#endif
          MassLeft[lin_ijk]=0.;
          fprintf(stderr,"\n\tcase 3, TotProb_bef=%e",TotProb);
          TotProb=ComputeCumulative(exponent, mpart, MassLeft, CumulativeProb);
          fprintf(stderr," TotProb_aft=%e ihalo=%ld, R=%f\n",TotProb,ihalo,R);
          prob_repicked=0.0;
          trials=0;
          break;
        }
      } while (check==1); //If the particle was excluded, try another one in the same cell
    } while(check==1); //if reached MAXTRIALS, select another cell
    //Particle chosen!

    //mass in cell before assignment
    Mcell=MassLeft[lin_ijk];
#ifndef MASS_OF_PARTS
    if (Mcell>HaloMass[ihalo])
      MassLeft[lin_ijk] -= Mhalo;
    else
      MassLeft[lin_ijk] = 0.;
#else
    exclude(ipart,R,PartX,PartY,PartZ,i,j,k);
#endif
    prob_repicked += pow(Mcell/mpart,exponent)/TotProb;

#ifdef DEBUG
    fprintf(stderr,"\tAfter: Mcell=%e, CProbCell=%e, TotProb=%e. , Mhalo=%e. CProb[last]=%e\n",MassLeft[lin_ijk],CumulativeProb[lin_ijk],TotProb,Mhalo,CumulativeProb[NTotCells-1]);
    fprintf(stderr,"\thalo %ld assigned to particle %ld at [%f,%f,%f]. R= %f, M= %e\n",ihalo,ipart,HaloX[ihalo],HaloY[ihalo],HaloZ[ihalo],R,Mhalo);
    fprintf(stderr,"HaloX=%f PartX=%f\n",HaloX[ihalo],PartX[ipart]);
#endif

    ListOfHalos[NHalosPerCellEnd[lin_ijk]]=ihalo;
    NHalosPerCellEnd[lin_ijk]++;
  } //for(ihalo=Nstart:Nend)
  //----------------------------------- Haloes Placed

  fprintf(stderr,"\t... placement Done!\n");
  fprintf(stderr,"\t\tTOTAL NUMBER OF RE-CALCULATIONS: %ld\n",n_recalc);

#ifdef VERB
  t5=time(NULL);
  diff = difftime(t5,t4_5);
  fprintf(stderr,"\ttime placing %f\n",diff);
  fprintf(stderr,"\tfreeing...\n");
#endif

  free(NHalosPerCellStart);
  free(NHalosPerCellEnd);
  free(count);
  free(CumulativeProb);
  free(MassLeft);
  free(ListOfHalos);

#ifdef VERB
  diff = difftime(t5,t0);
  fprintf(stderr,"\ttotal time in place_halos.c %f\n",diff);
  fprintf(stderr,"\tPlacement done!!!\n");
#endif

#ifdef MASS_OF_PARTS
  // free(excluded); free(Nexcluded);
#endif
  return 0;
}
main () {
  int i;

  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("this program should be run on multiple threads.\n");
    exit (0);
  }
  omp_set_dynamic (0);

  #pragma omp parallel
  {
    int j;

    #pragma omp for schedule(static,1) lastprivate (prvt)
    for (i=0; i<thds; i++) {
      for (j=0; j<ARRAYSIZ; j++) {
        prvt[j] = i+j;
      }
      barrier (thds);
      for (j=0; j<ARRAYSIZ; j++) {
        if (prvt[j] != i+j) {
          #pragma omp critical
          errors += 1;
        }
      }
      if (sizeof(prvt) != sizeof(int)*ARRAYSIZ) {
        #pragma omp critical
        errors += 1;
      }
      if (i==0) {
        waittime (1);
      }
      for (j=0; j<ARRAYSIZ; j++) {
        prvt[j] = i+j;
      }
    }

    for (j=0; j<ARRAYSIZ; j++) {
      if (prvt[j] != (thds-1)+j) {
        #pragma omp critical
        errors += 1;
      }
    }
  }

  #pragma omp parallel
  func (thds);

  func (1);

  if (errors == 0) {
    printf ("lastprivate 017 : SUCCESS\n");
    return 0;
  } else {
    printf ("lastprivate 017 : FAILED\n");
    return 1;
  }
}
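/*
 * Editor's note: the test above hinges on lastprivate copying the private
 * value from the sequentially-last iteration back to the original variable
 * when the loop ends. The semantic in its smallest form:
 */
#include <stdio.h>

int main(void) {
  int last = -1;
  #pragma omp parallel for lastprivate(last)
  for (int i = 0; i < 8; i++)
    last = i;                   /* each thread writes its own private copy */
  printf("last = %d\n", last);  /* prints 7: the value from iteration i == 7 */
  return 0;
}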
/*!******************************************************************* * \brief The main call * * \param argc The integer number of command line arguments * \param argv The character array of command line arguments *********************************************************************/ int main (int argc, char *argv[]) { int id = 0, n_elements = 1; // Initialize messenger mpi::messenger process_messenger (&argc, &argv); try { id = process_messenger.get_id (); n_elements = process_messenger.get_np (); io::parameters parameters = config (&argc, &argv, id); int m = parameters.get <int> ("grid.z.points") / n_elements + 1; m += (m - 1) % 2; std::vector <double> positions (n_elements + 1); for (int i = 0; i < n_elements + 1; ++i) { positions [i] = -parameters.get <double> ("grid.z.width") / 2.0 + parameters.get <double> ("grid.z.width") / n_elements * i; } int name = id; int n = parameters.get <int> ("grid.x.points"); grids::axis horizontal_axis (n, -parameters.get <double> ("grid.x.width") / 2.0, parameters.get <double> ("grid.x.width") / 2.0); grids::axis vertical_axis (m, positions [id], positions [id + 1], id == 0 ? 0 : 1, id == n_elements - 1 ? 0 : 1); TRACE ("Building data"); data::thermo_compositional_data data (&horizontal_axis, &vertical_axis, id, n_elements, parameters); TRACE ("Constructing element"); auto element = pisces::implemented_element::instance (parameters ["element"].as <std::string> (), horizontal_axis, vertical_axis, name, parameters, data, &process_messenger, 0x00); if (pisces::element::version () < versions::version ("0.6.0.0")) { INFO ("element.version < 0.6.0.0"); } else { INFO ("element.version not < 0.6.0.0"); } TRACE ("Element constructed."); clock_t cbegin, cend; std::chrono::time_point <std::chrono::system_clock> begin, end; cbegin = clock (); begin = std::chrono::system_clock::now (); int n_steps = data.n_steps; std::shared_ptr <io::input> virtual_input; while (n_steps < parameters.get <int> ("time.steps") && element->duration < parameters.get <double> ("time.stop")) { if (parameters.get <int> ("grid.rezone.check_every") > 0 && n_steps != 0 && n_elements > 1) { INFO ("Rezoning"); formats::virtual_file *virt = element->rezone_minimize_ts (&positions [0], parameters.get <double> ("grid.rezone.min_size"), parameters.get <double> ("grid.rezone.max_size"), parameters.get <int> ("grid.rezone.n_tries"), parameters.get <int> ("grid.rezone.iters_fixed_t"), parameters.get <double> ("grid.rezone.step_size"), parameters.get <double> ("grid.rezone.k"), parameters.get <double> ("grid.rezone.t_initial"), parameters.get <double> ("grid.rezone.mu_t"), parameters.get <double> ("grid.rezone.t_min")); if (virt) { formats::virtual_files ["main/virtual_file"] = *virt; grids::axis vertical_axis (m, positions [id], positions [id + 1], id == 0 ? 0 : 1, id == n_elements - 1 ? 0 : 1); virtual_input.reset (new io::formatted_input <formats::virtual_format> (formats::data_grid::two_d (n, m), "main/virtual_file")); data.setup (virtual_input); element = pisces::implemented_element::instance (parameters ["element"].as <std::string> (), horizontal_axis, vertical_axis, name, parameters, data, &process_messenger, 0x00); } } element->run (n_steps); } cend = clock (); end = std::chrono::system_clock::now (); std::chrono::duration <double> eb = end - begin; INFO ("Main complete. CPU Time: " << ((double) (cend - cbegin))/CLOCKS_PER_SEC << " Wall Time: " << (double) eb.count () << " Efficiency: " << (((double) (cend - cbegin))/CLOCKS_PER_SEC / (double) eb.count () / omp_get_max_threads () * 100.) 
<< "%"); } catch (std::exception &except) { FATAL ("Fatal error occurred. Check log."); FATAL (except.what ()); return 1; /* TODO Last check all should be somewhere not defined by the user */ } catch (int &except) { FATAL ("Fatal error occurred. Check log."); FATAL (except); return 1; /* TODO Last check all should be somewhere not defined by the user */ } catch (...) { FATAL ("Last ditch..."); return 1; } return 0; }
// host stub function void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int *range, ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { // Timing double t1, t2, c1, c2; int offs[4][3]; ops_arg args[4] = {arg0, arg1, arg2, arg3}; #ifdef CHECKPOINTING if (!ops_checkpointing_before(args, 4, range, 134)) return; #endif if (OPS_diags > 1) { ops_timing_realloc(134, "advec_mom_kernel2_y"); OPS_kernels[134].count++; ops_timers_core(&c1, &t1); } #ifdef OPS_MPI sub_block_list sb = OPS_sub_block_list[block->index]; #endif // compute locally allocated range for the sub-block int start[3]; int end[3]; int arg_idx[3]; #ifdef OPS_MPI if (!sb->owned) return; for (int n = 0; n < 3; n++) { start[n] = sb->decomp_disp[n]; end[n] = sb->decomp_disp[n] + sb->decomp_size[n]; if (start[n] >= range[2 * n]) { start[n] = 0; } else { start[n] = range[2 * n] - start[n]; } if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0) start[n] = range[2 * n]; if (end[n] >= range[2 * n + 1]) { end[n] = range[2 * n + 1] - sb->decomp_disp[n]; } else { end[n] = sb->decomp_size[n]; } if (sb->id_p[n] == MPI_PROC_NULL && (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n])) end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]); if (end[n] < start[n]) end[n] = start[n]; } #else for (int n = 0; n < 3; n++) { start[n] = range[2 * n]; end[n] = range[2 * n + 1]; } #endif #ifdef OPS_DEBUG ops_register_args(args, "advec_mom_kernel2_y"); #endif offs[0][0] = args[0].stencil->stride[0] * 1; // unit step in x dimension offs[0][1] = off3D(1, &start[0], &end[0], args[0].dat->size, args[0].stencil->stride) - offs[0][0]; offs[0][2] = off3D(2, &start[0], &end[0], args[0].dat->size, args[0].stencil->stride) - offs[0][1] - offs[0][0]; offs[1][0] = args[1].stencil->stride[0] * 1; // unit step in x dimension offs[1][1] = off3D(1, &start[0], &end[0], args[1].dat->size, args[1].stencil->stride) - offs[1][0]; offs[1][2] = off3D(2, &start[0], &end[0], args[1].dat->size, args[1].stencil->stride) - offs[1][1] - offs[1][0]; offs[2][0] = args[2].stencil->stride[0] * 1; // unit step in x dimension offs[2][1] = off3D(1, &start[0], &end[0], args[2].dat->size, args[2].stencil->stride) - offs[2][0]; offs[2][2] = off3D(2, &start[0], &end[0], args[2].dat->size, args[2].stencil->stride) - offs[2][1] - offs[2][0]; offs[3][0] = args[3].stencil->stride[0] * 1; // unit step in x dimension offs[3][1] = off3D(1, &start[0], &end[0], args[3].dat->size, args[3].stencil->stride) - offs[3][0]; offs[3][2] = off3D(2, &start[0], &end[0], args[3].dat->size, args[3].stencil->stride) - offs[3][1] - offs[3][0]; int off0_0 = offs[0][0]; int off0_1 = offs[0][1]; int off0_2 = offs[0][2]; int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); int off1_0 = offs[1][0]; int off1_1 = offs[1][1]; int off1_2 = offs[1][2]; int dat1 = (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); int off2_0 = offs[2][0]; int off2_1 = offs[2][1]; int off2_2 = offs[2][2]; int dat2 = (OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); int off3_0 = offs[3][0]; int off3_1 = offs[3][1]; int off3_2 = offs[3][2]; int dat3 = (OPS_soa ? 
args[3].dat->type_size : args[3].dat->elem_size); // Halo Exchanges ops_H_D_exchanges_host(args, 4); ops_halo_exchanges(args, 4, range); ops_H_D_exchanges_host(args, 4); #ifdef _OPENMP int nthreads = omp_get_max_threads(); #else int nthreads = 1; #endif xdim0 = args[0].dat->size[0]; ydim0 = args[0].dat->size[1]; xdim1 = args[1].dat->size[0]; ydim1 = args[1].dat->size[1]; xdim2 = args[2].dat->size[0]; ydim2 = args[2].dat->size[1]; xdim3 = args[3].dat->size[0]; ydim3 = args[3].dat->size[1]; if (OPS_diags > 1) { ops_timers_core(&c2, &t2); OPS_kernels[134].mpi_time += t2 - t1; } #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { int z_size = end[2] - start[2]; char *p_a[4]; int start_i = start[2] + ((z_size - 1) / nthreads + 1) * thr; int finish_i = start[2] + MIN(((z_size - 1) / nthreads + 1) * (thr + 1), z_size); // get address per thread int start0 = start[0]; int start1 = start[1]; int start2 = start_i; // set up initial pointers int d_m[OPS_MAX_DIM]; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; #endif int base0 = dat0 * 1 * (start0 * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); base0 = base0 + dat0 * args[0].dat->size[0] * (start1 * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); base0 = base0 + dat0 * args[0].dat->size[0] * args[0].dat->size[1] * (start2 * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); p_a[0] = (char *)args[0].data + base0; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; #endif int base1 = dat1 * 1 * (start0 * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); base1 = base1 + dat1 * args[1].dat->size[0] * (start1 * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); base1 = base1 + dat1 * args[1].dat->size[0] * args[1].dat->size[1] * (start2 * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); p_a[1] = (char *)args[1].data + base1; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; #endif int base2 = dat2 * 1 * (start0 * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); base2 = base2 + dat2 * args[2].dat->size[0] * (start1 * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); base2 = base2 + dat2 * args[2].dat->size[0] * args[2].dat->size[1] * (start2 * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); p_a[2] = (char *)args[2].data + base2; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; #endif int base3 = dat3 * 1 * (start0 * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); base3 = base3 + dat3 * args[3].dat->size[0] * (start1 * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); base3 = base3 + dat3 * args[3].dat->size[0] * args[3].dat->size[1] * (start2 * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); p_a[3] = (char *)args[3].data + base3; for (int n_z = start_i; n_z < finish_i; n_z++) { for (int n_y = start[1]; n_y < end[1]; n_y++) { for (int n_x = start[0]; n_x < start[0] + (end[0] - start[0]) / SIMD_VEC; n_x++) { // call kernel function, passing in pointers to 
data -vectorised #pragma simd for (int i = 0; i < SIMD_VEC; i++) { advec_mom_kernel2_y((double *)p_a[0] + i * 1 * 1, (const double *)p_a[1] + i * 1 * 1, (const double *)p_a[2] + i * 1 * 1, (const double *)p_a[3] + i * 1 * 1); } // shift pointers to data x direction p_a[0] = p_a[0] + (dat0 * off0_0) * SIMD_VEC; p_a[1] = p_a[1] + (dat1 * off1_0) * SIMD_VEC; p_a[2] = p_a[2] + (dat2 * off2_0) * SIMD_VEC; p_a[3] = p_a[3] + (dat3 * off3_0) * SIMD_VEC; } for (int n_x = start[0] + ((end[0] - start[0]) / SIMD_VEC) * SIMD_VEC; n_x < end[0]; n_x++) { // call kernel function, passing in pointers to data - remainder advec_mom_kernel2_y((double *)p_a[0], (const double *)p_a[1], (const double *)p_a[2], (const double *)p_a[3]); // shift pointers to data x direction p_a[0] = p_a[0] + (dat0 * off0_0); p_a[1] = p_a[1] + (dat1 * off1_0); p_a[2] = p_a[2] + (dat2 * off2_0); p_a[3] = p_a[3] + (dat3 * off3_0); } // shift pointers to data y direction p_a[0] = p_a[0] + (dat0 * off0_1); p_a[1] = p_a[1] + (dat1 * off1_1); p_a[2] = p_a[2] + (dat2 * off2_1); p_a[3] = p_a[3] + (dat3 * off3_1); } // shift pointers to data z direction p_a[0] = p_a[0] + (dat0 * off0_2); p_a[1] = p_a[1] + (dat1 * off1_2); p_a[2] = p_a[2] + (dat2 * off2_2); p_a[3] = p_a[3] + (dat3 * off3_2); } } if (OPS_diags > 1) { ops_timers_core(&c1, &t1); OPS_kernels[134].time += t1 - t2; } ops_set_dirtybit_host(args, 4); ops_set_halo_dirtybit3(&args[0], range); if (OPS_diags > 1) { // Update kernel record ops_timers_core(&c2, &t2); OPS_kernels[134].mpi_time += t2 - t1; OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg2); OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg3); } }
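/*
 * Editor's sketch: the wrapper above hands each thread a slab of the z range
 * using ceil-division, with MIN() clamping the final slab. The bounds
 * arithmetic on its own (thread_slab() is a hypothetical helper, same
 * formula as start_i/finish_i above):
 */
static void thread_slab(int lo, int hi, int thr, int nthreads, int *s, int *e) {
  int size = hi - lo;
  int chunk = (size - 1) / nthreads + 1;   /* ceil(size / nthreads) */
  *s = lo + chunk * thr;
  *e = lo + (chunk * (thr + 1) < size ? chunk * (thr + 1) : size);
  if (*s > *e)                             /* trailing threads get empty slabs */
    *s = *e;
}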
int main(int argc, char *argv[]) { int step, ie, iside, i, j, k; double mflops, tmax, nelt_tot = 0.0; char Class; logical ifmortar = false, verified; double t2, trecs[t_last+1]; char *t_names[t_last+1]; //-------------------------------------------------------------------- // Initialize NUMA control //-------------------------------------------------------------------- numa_initialize_env(NUMA_MIGRATE_EXISTING); //--------------------------------------------------------------------- // Read input file (if it exists), else take // defaults from parameters //--------------------------------------------------------------------- FILE *fp; if ((fp = fopen("timer.flag", "r")) != NULL) { timeron = true; t_names[t_total] = "total"; t_names[t_init] = "init"; t_names[t_convect] = "convect"; t_names[t_transfb_c] = "transfb_c"; t_names[t_diffusion] = "diffusion"; t_names[t_transf] = "transf"; t_names[t_transfb] = "transfb"; t_names[t_adaptation] = "adaptation"; t_names[t_transf2] = "transf+b"; t_names[t_add2] = "add2"; fclose(fp); } else { timeron = false; } printf("\n\n NAS Parallel Benchmarks (NPB3.3-OMP-C) - UA Benchmark\n\n"); if ((fp = fopen("inputua.data", "r")) != NULL) { int result; printf(" Reading from input file inputua.data\n"); result = fscanf(fp, "%d", &fre); while (fgetc(fp) != '\n'); result = fscanf(fp, "%d", &niter); while (fgetc(fp) != '\n'); result = fscanf(fp, "%d", &nmxh); while (fgetc(fp) != '\n'); result = fscanf(fp, "%lf", &alpha); Class = 'U'; fclose(fp); } else { printf(" No input file inputua.data. Using compiled defaults\n"); fre = FRE_DEFAULT; niter = NITER_DEFAULT; nmxh = NMXH_DEFAULT; alpha = ALPHA_DEFAULT; Class = CLASS_DEFAULT; } dlmin = pow(0.5, REFINE_MAX); dtime = 0.04*dlmin; printf(" Levels of refinement: %8d\n", REFINE_MAX); printf(" Adaptation frequency: %8d\n", fre); printf(" Time steps: %8d dt: %15.6E\n", niter, dtime); printf(" CG iterations: %8d\n", nmxh); printf(" Heat source radius: %8.4f\n", alpha); printf(" Number of available threads: %8d\n", omp_get_max_threads()); printf("\n"); top_constants(); for (i = 1; i <= t_last; i++) { timer_clear(i); } if (timeron) timer_start(t_init); // set up initial mesh (single element) and solution (all zero) create_initial_grid(); r_init_omp((double *)ta1, ntot, 0.0); nr_init_omp((int *)sje, 4*6*nelt, -1); init_locks(); // compute tables of coefficients and weights coef(); geom1(); // compute the discrete laplacian operators setdef(); // prepare for the preconditioner setpcmo_pre(); // refine initial mesh and do some preliminary work time = 0.0; mortar(); prepwork(); adaptation(&ifmortar, 0); if (timeron) timer_stop(t_init); timer_clear(1); time = 0.0; for (step = 0; step <= niter; step++) { if (step == 1) { // reset the solution and start the timer, keep track of total no elms r_init((double *)ta1, ntot, 0.0); time = 0.0; nelt_tot = 0.0; for (i = 1; i <= t_last; i++) { if (i != t_init) timer_clear(i); } timer_start(1); } // advance the convection step convect(ifmortar); if (timeron) timer_start(t_transf2); // prepare the intital guess for cg transf(tmort, (double *)ta1); // compute residual for diffusion term based on intital guess // compute the left hand side of equation, lapacian t #pragma omp parallel default(shared) private(ie,k,j,i) { #pragma omp for for (ie = 0; ie < nelt; ie++) { laplacian(ta2[ie], ta1[ie], size_e[ie]); } // compute the residual #pragma omp for for (ie = 0; ie < nelt; ie++) { for (k = 0; k < LX1; k++) { for (j = 0; j < LX1; j++) { for (i = 0; i < LX1; i++) { trhs[ie][k][j][i] = trhs[ie][k][j][i] 
- ta2[ie][k][j][i]; } } } } } //end parallel // get the residual on mortar transfb(rmor, (double *)trhs); if (timeron) timer_stop(t_transf2); // apply boundary condition: zero out the residual on domain boundaries // apply boundary conidtion to trhs #pragma omp parallel for default(shared) private(ie,iside) for (ie = 0; ie < nelt; ie++) { for (iside = 0; iside < NSIDES; iside++) { if (cbc[ie][iside] == 0) { facev(trhs[ie], iside, 0.0); } } } // apply boundary condition to rmor col2(rmor, tmmor, nmor); // call the conjugate gradient iterative solver diffusion(ifmortar); // add convection and diffusion if (timeron) timer_start(t_add2); add2((double *)ta1, (double *)t, ntot); if (timeron) timer_stop(t_add2); // perform mesh adaptation time = time + dtime; if ((step != 0) && (step/fre*fre == step)) { if (step != niter) { adaptation(&ifmortar, step); } } else { ifmortar = false; } nelt_tot = nelt_tot + (double)(nelt); } timer_stop(1); tmax = timer_read(1); verify(&Class, &verified); // compute millions of collocation points advanced per second. // diffusion: nmxh advancements, convection: 1 advancement mflops = nelt_tot*(double)(LX1*LX1*LX1*(nmxh+1))/(tmax*1.e6); print_results("UA", Class, REFINE_MAX, 0, 0, niter, tmax, mflops, " coll. point advanced", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, "(none)"); //--------------------------------------------------------------------- // More timers //--------------------------------------------------------------------- if (timeron) { for (i = 1; i <= t_last; i++) { trecs[i] = timer_read(i); } if (tmax == 0.0) tmax = 1.0; printf(" SECTION Time (secs)\n"); for (i = 1; i <= t_last; i++) { printf(" %-10s:%9.3f (%6.2f%%)\n", t_names[i], trecs[i], trecs[i]*100./tmax); if (i == t_transfb_c) { t2 = trecs[t_convect] - trecs[t_transfb_c]; printf(" --> %11s:%9.3f (%6.2f%%)\n", "sub-convect", t2, t2*100./tmax); } else if (i == t_transfb) { t2 = trecs[t_diffusion] - trecs[t_transf] - trecs[t_transfb]; printf(" --> %11s:%9.3f (%6.2f%%)\n", "sub-diffuse", t2, t2*100./tmax); } } } //-------------------------------------------------------------------- // Teardown NUMA control //-------------------------------------------------------------------- numa_shutdown(); return 0; }
arma_inline static int get() { #if defined(ARMA_USE_OPENMP) int n_threads = (std::min)(int(arma_config::mp_threads), int((std::max)(int(1), int(omp_get_max_threads())))); #else int n_threads = int(1); #endif return n_threads; }
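/*
 * Editor's note: the accessor above clamps the OpenMP-reported count to a
 * compile-time cap with a floor of 1, so callers always get a sane positive
 * value even in builds without OpenMP. The clamp in isolation (illustrative
 * C, not Armadillo's code):
 */
static int clamped_threads(int cap, int reported) {
  int n = (reported > 1) ? reported : 1;   /* floor of one thread */
  return (n < cap) ? n : cap;              /* cap at the configured limit */
}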
main () {
  int i, r;

  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("this program should be run on multiple threads.\n");
    exit (0);
  }
  omp_set_dynamic (0);

  rdct = shrd = 0;
  fprvt = MAGICNO;
  #pragma omp parallel for default(none) private(prvt) firstprivate(fprvt) lastprivate(lprvt) reduction(+:rdct) shared(shrd,thds,errors) schedule(static,1)
  for (i=0; i<thds; i++) {
    #pragma omp critical
    {
      shrd += 6*i;              /* shrd is shared, i is private */
    }
    tprvt = i;                  /* tprvt is threadprivate */
    prvt = 2*i;                 /* prvt is private */
    fprvt += 3*i;               /* fprvt is firstprivate */
    lprvt = 4*i;                /* lprvt is lastprivate */
    rdct += 5*i;                /* rdct is reduction(+) */
    waittime (1);

    if (prvt != 2*i) {          /* check private */
      #pragma omp critical
      errors += 1;
    }
    if (fprvt != MAGICNO + 3*i) {
      #pragma omp critical
      errors += 1;
    }
  }

  r = 0;
  for (i=0; i<thds; i++) {
    r += i;
  }
  if (rdct != r * 5) {
    errors += 1;
  }
  if (shrd != r * 6) {
    errors += 1;
  }
  if (lprvt != 4*(thds-1)) {
    errors += 1;
  }

  #pragma omp parallel for default(shared) schedule(static)
  for (i=0; i<thds; i++) {
    if (tprvt != i) {
      #pragma omp critical
      errors += 1;
    }
  }

  if (errors == 0) {
    printf ("default 005 : SUCCESS\n");
    return 0;
  } else {
    printf ("default 005 : FAILED\n");
    return 1;
  }
}
void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0, op_arg arg1 ){ int *arg1h = (int *)arg1.data; int nargs = 2; op_arg args[2]; args[0] = arg0; args[1] = arg1; int ninds = 1; int inds[2] = {0,-1}; if (OP_diags>2) { printf(" kernel routine with indirection: res_calc\n"); } // get plan #ifdef OP_PART_SIZE_0 int part_size = OP_PART_SIZE_0; #else int part_size = OP_part_size; #endif int set_size = op_mpi_halo_exchanges(set, nargs, args); // initialise timers double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0; op_timing_realloc(0); OP_kernels[0].name = name; OP_kernels[0].count += 1; // set number of threads #ifdef _OPENMP int nthreads = omp_get_max_threads( ); #else int nthreads = 1; #endif // allocate and initialise arrays for global reduction int arg1_l[1+64*64]; for (int thr=0; thr<nthreads; thr++) for (int d=0; d<1; d++) arg1_l[d+thr*64]=ZERO_int; if (set->size >0) { op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); op_timers_core(&cpu_t1, &wall_t1); // execute plan int block_offset = 0; for (int col=0; col < Plan->ncolors; col++) { if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args); int nblocks = Plan->ncolblk[col]; #pragma omp parallel for for (int blockIdx=0; blockIdx<nblocks; blockIdx++) op_x86_res_calc( blockIdx, (double *)arg0.data, Plan->ind_map, Plan->loc_map, &arg1_l[64*omp_get_thread_num()], Plan->ind_sizes, Plan->ind_offs, block_offset, Plan->blkmap, Plan->offset, Plan->nelems, Plan->nthrcol, Plan->thrcol, set_size); // combine reduction data if (col == Plan->ncolors_owned-1) { for (int thr=0; thr<nthreads; thr++) for(int d=0; d<1; d++) arg1h[d] += arg1_l[d+thr*64]; } block_offset += nblocks; } op_timing_realloc(0); OP_kernels[0].transfer += Plan->transfer; OP_kernels[0].transfer2 += Plan->transfer2; } // combine reduction data op_mpi_reduce(&arg1,arg1h); op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].time += wall_t2 - wall_t1; }
void op_par_loop_dotPV(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2 ){ double *arg2h = (double *)arg2.data; int nargs = 3; op_arg args[3]; args[0] = arg0; args[1] = arg1; args[2] = arg2; if (OP_diags>2) { printf(" kernel routine w/o indirection: dotPV\n"); } op_mpi_halo_exchanges(set, nargs, args); // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timers_core(&cpu_t1, &wall_t1); // set number of threads #ifdef _OPENMP int nthreads = omp_get_max_threads( ); #else int nthreads = 1; #endif // allocate and initialise arrays for global reduction double arg2_l[1+64*64]; for (int thr=0; thr<nthreads; thr++) for (int d=0; d<1; d++) arg2_l[d+thr*64]=ZERO_double; if (set->size >0) { // execute plan #pragma omp parallel for for (int thr=0; thr<nthreads; thr++) { int start = (set->size* thr )/nthreads; int finish = (set->size*(thr+1))/nthreads; op_x86_dotPV( (double *) arg0.data, (double *) arg1.data, arg2_l + thr*64, start, finish ); } } // combine reduction data for (int thr=0; thr<nthreads; thr++) for(int d=0; d<1; d++) arg2h[d] += arg2_l[d+thr*64]; op_mpi_reduce(&arg2,arg2h); op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); op_timing_realloc(4); OP_kernels[4].name = name; OP_kernels[4].count += 1; OP_kernels[4].time += wall_t2 - wall_t1; OP_kernels[4].transfer += (float)set->size * arg0.size; OP_kernels[4].transfer += (float)set->size * arg1.size; }
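/*
 * Editor's sketch: both OP2 wrappers above reserve a 64-element stride per
 * thread in the reduction scratch array (arg2_l[d + thr*64]) so each
 * thread's partial sums land on separate cache lines (false-sharing
 * padding). The pattern in isolation, with hypothetical names:
 */
#include <omp.h>

#define PAD 64                     /* doubles per slot: 512 B, > a cache line */

double padded_sum(const double *x, int n) {
  double scratch[64 * PAD] = {0};  /* room for up to 64 threads, as above */
  int nthreads = omp_get_max_threads();
  if (nthreads > 64)
    nthreads = 64;                 /* scratch holds 64 slots */
  #pragma omp parallel for
  for (int thr = 0; thr < nthreads; thr++) {
    int start = (int)(((long)n * thr) / nthreads);
    int finish = (int)(((long)n * (thr + 1)) / nthreads);
    for (int i = start; i < finish; i++)
      scratch[thr * PAD] += x[i];  /* each thread touches only its own slot */
  }
  double sum = 0.0;
  for (int thr = 0; thr < nthreads; thr++)
    sum += scratch[thr * PAD];
  return sum;
}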
static void john_run(void) { if (options.flags & FLG_TEST_CHK) exit_status = benchmark_all() ? 1 : 0; else if (options.flags & FLG_MAKECHR_CHK) do_makechars(&database, options.charset); else if (options.flags & FLG_CRACKING_CHK) { int remaining = database.password_count; if (!(options.flags & FLG_STDOUT)) { status_init(NULL, 1); log_init(LOG_NAME, options.loader.activepot, options.session); john_log_format(); if (idle_requested(database.format)) log_event("- Configured to use otherwise idle " "processor cycles only"); } tty_init(options.flags & FLG_STDIN_CHK); #if defined(HAVE_MPI) && defined(_OPENMP) if (database.format->params.flags & FMT_OMP && omp_get_max_threads() > 1 && mpi_p > 1) { if(cfg_get_bool(SECTION_OPTIONS, SUBSECTION_MPI, "MPIOMPmutex", 1)) { if(cfg_get_bool(SECTION_OPTIONS, SUBSECTION_MPI, "MPIOMPverbose", 1) && mpi_id == 0) fprintf(stderr, "MPI in use, disabling OMP (see doc/README.mpi)\n"); omp_set_num_threads(1); } else if(cfg_get_bool(SECTION_OPTIONS, SUBSECTION_MPI, "MPIOMPverbose", 1) && mpi_id == 0) fprintf(stderr, "Note: Running both MPI and OMP (see doc/README.mpi)\n"); } #endif if (options.flags & FLG_SINGLE_CHK) do_single_crack(&database); else if (options.flags & FLG_WORDLIST_CHK) do_wordlist_crack(&database, options.wordlist, (options.flags & FLG_RULES) != 0); else if (options.flags & FLG_INC_CHK) do_incremental_crack(&database, options.charset); else if (options.flags & FLG_MKV_CHK) do_markov_crack(&database, options.mkv_param); else if (options.flags & FLG_EXTERNAL_CHK) do_external_crack(&database); else if (options.flags & FLG_BATCH_CHK) do_batch_crack(&database); status_print(); tty_done(); if (database.password_count < remaining) { char *might = "Warning: passwords printed above might"; char *partial = " be partial"; char *not_all = " not be all those cracked"; switch (database.options->flags & (DB_SPLIT | DB_NODUP)) { case DB_SPLIT: #ifdef HAVE_MPI if (mpi_id == 0) #endif fprintf(stderr, "%s%s\n", might, partial); break; case DB_NODUP: #ifdef HAVE_MPI if (mpi_id == 0) #endif fprintf(stderr, "%s%s\n", might, not_all); break; case (DB_SPLIT | DB_NODUP): #ifdef HAVE_MPI if (mpi_id == 0) #endif fprintf(stderr, "%s%s and%s\n", might, partial, not_all); } #ifdef HAVE_MPI if (mpi_id == 0) #endif fputs("Use the \"--show\" option to display all of " "the cracked passwords reliably\n", stderr); } } }
int main(int argc, char *argv[]) { REAL_TYPE *A_gold, *B_gold, *A_gold2, *B_gold2; float *C_gold, *C0_gold, *C, *C2; int M, N, K; REAL_TYPE alpha, beta; int reps; libxsmm_spmdm_handle handle, handle2; libxsmm_CSR_sparseslice *A_sparse, *A_sparse2; int max_threads; /* Step 1: Read in args */ libxsmm_timer_tickint start, end; double flops, duration; char transA, transB, transC; int i, j, k; size_t l; /* Step 1: Initialize handle */ M = 0; N = 0; K = 0; alpha = (REAL_TYPE)1.0; beta = (REAL_TYPE)0.0; reps = 0; transA = 'N'; transB = 'N'; if (argc > 1 && !strncmp(argv[1], "-h", 3)) { printf("\nUsage: %s [M] [N] [K] [transA] [transB] [reps]\n\n", argv[0]); return EXIT_SUCCESS; } /* defaults */ M = 2048; N = 2048; K = 2048; transA = 'N'; transB = 'N'; transC = 'N'; reps = 100; /* reading new values from cli */ i = 1; if (argc > i) M = atoi(argv[i++]); if (argc > i) N = atoi(argv[i++]); if (argc > i) K = atoi(argv[i++]); if (argc > i) { transA = argv[i][0]; i++; } if (argc > i) { transB = argv[i][0]; i++; } if (argc > i) { transC = argv[i][0]; i++; } if (argc > i) reps = atoi(argv[i++]); /* Step 2: allocate data */ A_gold = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 ); B_gold = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 ); C_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); C0_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); C = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); /* Step 3: init data */ libxsmm_rng_set_seed(1); for (l = 0; l < (size_t)M * (size_t)K; ++l) { const double r64 = libxsmm_rng_f64(); const float r32 = (float)r64; #ifdef USE_BFLOAT const int r = *(const int*)(&r32); const libxsmm_bfloat16 val = (r >> 16); #else const float val = r32; #endif if (r64 > 0.85) A_gold[l] = val; else A_gold[l] = (REAL_TYPE)0.0; } for (l = 0; l < (size_t)K * (size_t)N; ++l) { const double r64 = libxsmm_rng_f64(); const float r32 = (float)r64; #ifdef USE_BFLOAT const int r = *(const int*)(&r32); const libxsmm_bfloat16 val = (r >> 16); #else const float val = r32; #endif B_gold[l] = val; } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C0_gold[l] = (float)libxsmm_rng_f64(); C_gold[l] = C0_gold[l]; } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C[l] = (float)C0_gold[l]; } flops = (double)M * (double)N * (double)K * 2.0; /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 4: Initialize LIBXSMM for these sizes - allocates handle and temporary space for the sparse data structure for A */ # if defined(_OPENMP) max_threads = omp_get_max_threads(); # else max_threads = 1; # endif start = libxsmm_timer_tick(); libxsmm_spmdm_init(M, N, K, max_threads, &handle, &A_sparse); end = libxsmm_timer_tick(); printf("Time for handle init = %f\n", libxsmm_timer_duration(start, end)); printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i -- forward pass\n", M, N, K, handle.bm, handle.bn, handle.bk, handle.mb, handle.nb, handle.kb, reps ); /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha input */ # ifdef USE_BFLOAT spmdm_exec_bfloat16(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # else spmdm_exec_fp32(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # endif /* Checks */ /* Compute a "gold" answer sequentially */ #if defined(_OPENMP) 
LIBXSMM_OMP_VAR(k); # pragma omp parallel for private(i, j, k) LIBXSMM_OPENMP_COLLAPSE(2) #endif for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { float sum = 0.0; float Cval; for (k = 0; k < K; ++k) { # ifdef USE_BFLOAT libxsmm_bfloat16 Atmp = A_gold[i*K+k]; int Atmp_int = Atmp; Atmp_int <<= 16; float Aval = *(float *)&Atmp_int; libxsmm_bfloat16 Btmp = B_gold[k*N+j]; int Btmp_int = Btmp; Btmp_int <<= 16; float Bval = *(float *)&Btmp_int; # else float Aval = A_gold[i*K + k]; float Bval = B_gold[k*N + j]; # endif sum += Aval * Bval; } Cval = sum; C_gold[i*N + j] = Cval + beta*C_gold[i*N + j]; } } /* LIBXSMM_FSYMBOL(sgemm)(&trans, &trans, &N, &M, &K, &alpha, B_gold, &N, A_gold, &K, &beta, C_gold, &N); */ /* Compute the max difference between gold and computed results. */ spmdm_check_c( &handle, C, C_gold ); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # else spmdm_exec_fp32( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); libxsmm_spmdm_destroy(&handle); /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 5: Initialize libxsmm for transpose A - allocates handle and temporary space for the sparse data structure for A */ transA = 'T'; transB = 'N'; transC = 'T'; libxsmm_spmdm_init(M, N, K, max_threads, &handle2, &A_sparse2); printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transA = Y, transC = Y -- weight update\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps ); A_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 ); C2 = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 ); for (i = 0; i < M; ++i) { for (j = 0; j < K; ++j) { A_gold2[j*M + i] = A_gold[i*K + j]; } } for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { C[j*M + i] = (float)C0_gold[i*N + j]; } } /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha inputs */ # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # endif for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { C2[i*N + j] = C[j*M + i]; } } /* Checks */ spmdm_check_c( &handle2, C2, C_gold); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); /*----------------------------------------------------------------------------------------------------------------------*/ /* Step 6: Test transpose B */ transA = 'N'; 
transB = 'T'; transC = 'N'; printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transB = Y -- backprop\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps ); B_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 ); for (i = 0; i < K; ++i) { for (j = 0; j < N; ++j) { B_gold2[j*K + i] = B_gold[i*N + j]; } } for (l = 0; l < (size_t)M * (size_t)N; ++l) { C[l] = (float)C0_gold[l]; } /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */ /* Currently ignores alpha */ /* TODO: fix alpha inputs */ # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # endif /* Checks */ spmdm_check_c( &handle2, C, C_gold); /* Timing loop starts */ start = libxsmm_timer_tick(); for (i = 0; i < reps; ++i) { # ifdef USE_BFLOAT spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # else spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2); # endif } end = libxsmm_timer_tick(); duration = libxsmm_timer_duration(start, end); printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps); libxsmm_spmdm_destroy(&handle2); libxsmm_free(A_gold); libxsmm_free(B_gold); libxsmm_free(C_gold); libxsmm_free(C); libxsmm_free(C2); libxsmm_free(C0_gold); libxsmm_free(B_gold2); libxsmm_free(A_gold2); return EXIT_SUCCESS; }
static void john_init(char *name, int argc, char **argv) { int show_usage = 0; int make_check = (argc == 2 && !strcmp(argv[1], "--make_check")); if (make_check) argv[1] = "--test=0"; CPU_detect_or_fallback(argv, make_check); status_init(NULL, 1); if (argc < 2 || (argc == 2 && (!strcasecmp(argv[1], "--help") || !strcasecmp(argv[1], "-h") || !strcasecmp(argv[1], "-help")))) { john_register_all(); /* for printing by opt_init() */ show_usage = 1; } opt_init(name, argc, argv, show_usage); /* * --list=? needs to be supported, because it has been supported in the released * john-1.7.9-jumbo-6 version, and it is used by the bash completion script. * --list=? is, however, no longer mentioned in doc/OPTIONS and in the usage * output. Instead, --list=help is. */ if (options.listconf && (!strcasecmp(options.listconf, "help") || !strcmp(options.listconf, "?"))) { john_list_options(); exit(0); } if (options.listconf && (!strcasecmp(options.listconf, "help:help") || !strcasecmp(options.listconf, "help:"))) { john_list_help_options(); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "help:format-methods")) { john_list_method_names(); exit(0); } if (options.listconf && !strncasecmp(options.listconf, "help:", 5)) { if (strcasecmp(options.listconf, "help:parameters") && strcasecmp(options.listconf, "help:list-data")) { fprintf(stderr, "%s is not a --list option that supports additional values.\nSupported options:\n", options.listconf+5); john_list_help_options(); exit(1); } } if (options.listconf && !strcasecmp(options.listconf, "hidden-options")) { puts("--help print usage summary, just like running the command"); puts(" without any parameters"); puts("--subformat=FORMAT pick a benchmark format for --format=crypt"); puts("--mkpc=N force a lower max. keys per crypt"); puts("--length=N force a lower max. length"); puts("--field-separator-char=C use 'C' instead of the ':' in input and pot files"); puts("--fix-state-delay=N performance tweak, see documentation"); puts("--log-stderr log to screen instead of file\n"); exit(0); } if (!make_check) { #if defined(_OPENMP) && OMP_FALLBACK #if defined(__DJGPP__) || defined(__CYGWIN32__) #error OMP_FALLBACK is incompatible with the current DOS and Win32 code #endif if (!getenv("JOHN_NO_OMP_FALLBACK") && omp_get_max_threads() <= 1) { #define OMP_FALLBACK_PATHNAME JOHN_SYSTEMWIDE_EXEC "/" OMP_FALLBACK_BINARY execv(OMP_FALLBACK_PATHNAME, argv); perror("execv: " OMP_FALLBACK_PATHNAME); } #endif path_init(argv); if (options.listconf && !strcasecmp(options.listconf, "build-info")) { puts("Version: " JOHN_VERSION); puts("Build: " JOHN_BLD _MP_VERSION); printf("Arch: %d-bit %s\n", ARCH_BITS, ARCH_LITTLE_ENDIAN ? "LE" : "BE"); #if JOHN_SYSTEMWIDE puts("System-wide exec: " JOHN_SYSTEMWIDE_EXEC); puts("System-wide home: " JOHN_SYSTEMWIDE_HOME); puts("Private home: " JOHN_PRIVATE_HOME); #endif printf("$JOHN is %s\n", path_expand("$JOHN/")); printf("Format interface version: %d\n", FMT_MAIN_VERSION); puts("Rec file version: " RECOVERY_V); puts("Charset file version: " CHARSET_V); printf("CHARSET_MIN: %d (0x%02x)\n", CHARSET_MIN, CHARSET_MIN); printf("CHARSET_MAX: %d (0x%02x)\n", CHARSET_MAX, CHARSET_MAX); printf("CHARSET_LENGTH: %d\n", CHARSET_LENGTH); printf("Max. Markov mode level: %d\n", MAX_MKV_LVL); printf("Max. 
Markov mode password length: %d\n", MAX_MKV_LEN); #ifdef __VERSION__ printf("Compiler version: %s\n", __VERSION__); #endif #ifdef __GNUC__ printf("gcc version: %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); #endif #ifdef __ICC printf("icc version: %d\n", __ICC); #endif #ifdef __clang_version__ printf("clang version: %s\n", __clang_version__); #endif #ifdef OPENSSL_VERSION_NUMBER // The man page suggests the type of OPENSSL_VERSION_NUMBER is long, // gcc insists it is int. printf("OpenSSL library version: %lx", (unsigned long)OPENSSL_VERSION_NUMBER); // FIXME: How do I detect a missing library? // Even if it is extremely unlikely that openssl is missing, // at least flush all output buffers... fflush(NULL); if ((unsigned long)OPENSSL_VERSION_NUMBER != (unsigned long)SSLeay()) printf("\t(loaded: %lx)", (unsigned long)SSLeay()); printf("\n"); #endif exit(0); } } if (options.listconf && !strcasecmp(options.listconf, "encodings")) { listEncodings(); exit(0); } #ifdef CL_VERSION_1_0 if (options.listconf && !strcasecmp(options.listconf, "opencl-devices")) { listOpenCLdevices(); exit(0); } #endif #ifdef HAVE_CUDA if (options.listconf && !strcasecmp(options.listconf, "cuda-devices")) { cuda_device_list(); exit(0); } #endif if (!make_check) { if (options.config) { path_init_ex(options.config); cfg_init(options.config, 1); cfg_init(CFG_FULL_NAME, 1); cfg_init(CFG_ALT_NAME, 0); } else { #if JOHN_SYSTEMWIDE cfg_init(CFG_PRIVATE_FULL_NAME, 1); cfg_init(CFG_PRIVATE_ALT_NAME, 1); #endif cfg_init(CFG_FULL_NAME, 1); cfg_init(CFG_ALT_NAME, 0); } } /* This is --crack-status. We toggle here, so if it's enabled in john.conf, we can disable it using the command line option */ if (cfg_get_bool(SECTION_OPTIONS, NULL, "CrackStatus", 0)) options.flags ^= FLG_CRKSTAT; initUnicode(UNICODE_UNICODE); /* Init the unicode system */ john_register_all(); /* maybe restricted to one format by options */ if ((options.subformat && !strcasecmp(options.subformat, "list")) || (options.listconf && !strcasecmp(options.listconf, "subformats"))) { dynamic_DISPLAY_ALL_FORMATS(); /* NOTE if we have other 'generics', like sha1, sha2, rc4, ... * then EACH of them should have a DISPLAY_ALL_FORMATS() * function and we can call them here. 
*/ exit(0); } if (options.listconf && !strcasecmp(options.listconf, "inc-modes")) { cfg_print_subsections("Incremental", NULL, NULL, 0); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "rules")) { cfg_print_subsections("List.Rules", NULL, NULL, 0); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "externals")) { cfg_print_subsections("List.External", NULL, NULL, 0); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "sections")) { cfg_print_section_names(0); exit(0); } if (options.listconf && !strncasecmp(options.listconf, "parameters", 10) && (options.listconf[10] == '=' || options.listconf[10] == ':') && options.listconf[11] != '\0') { cfg_print_section_params(&options.listconf[11], NULL); exit(0); } if (options.listconf && !strncasecmp(options.listconf, "list-data", 9) && (options.listconf[9] == '=' || options.listconf[9] == ':') && options.listconf[10] != '\0') { cfg_print_section_list_lines(&options.listconf[10], NULL); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "ext-filters")) { cfg_print_subsections("List.External", "filter", NULL, 0); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "ext-filters-only")) { cfg_print_subsections("List.External", "filter", "generate", 0); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "ext-modes")) { cfg_print_subsections("List.External", "generate", NULL, 0); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "formats")) { int column; struct fmt_main *format; int i, dynamics = 0; char **formats_list; i = 0; format = fmt_list; while ((format = format->next)) i++; formats_list = malloc(sizeof(char*) * (i + 2)); /* +2: the counting loop above skips the list head, and we append a NULL terminator */ i = 0; format = fmt_list; do { char *label = format->params.label; if (!strncmp(label, "dynamic", 7)) { if (dynamics++) continue; else label = "dynamic_n"; } formats_list[i++] = label; } while ((format = format->next)); formats_list[i] = NULL; column = 0; i = 0; do { int length; char *label = formats_list[i++]; length = strlen(label) + 2; column += length; if (column > 78) { printf("\n"); column = length; } printf("%s%s", label, formats_list[i] ? ", " : "\n"); } while (formats_list[i]); free(formats_list); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "format-details")) { struct fmt_main *format; format = fmt_list; do { int ntests = 0; if(format->params.tests) { while (format->params.tests[ntests++].ciphertext); ntests--; } printf("%s\t%d\t%d\t%d\t%08x\t%d\t%s\t%s\t%s\t%d\t%d\t%d\n", format->params.label, format->params.plaintext_length, format->params.min_keys_per_crypt, format->params.max_keys_per_crypt, format->params.flags, ntests, format->params.algorithm_name, format->params.format_name, format->params.benchmark_comment, format->params.benchmark_length, format->params.binary_size, ((format->params.flags & FMT_DYNAMIC) && format->params.salt_size) ? // salts are handled internally within the format. We want to know the 'real' salt size // dynamic will always set params.salt_size to 0 or sizeof a pointer. dynamic_real_salt_length(format) : format->params.salt_size); } while ((format = format->next)); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "format-all-details")) { struct fmt_main *format; format = fmt_list; do { int ntests = 0; if(format->params.tests) { while (format->params.tests[ntests++].ciphertext); ntests--; } /* * attributes should be printed in the same sequence * as with format-details, but human-readable */ printf("Format label \t%s\n", format->params.label); printf("Max. 
password length in bytes \t%d\n", format->params.plaintext_length); printf("Min. keys per crypt \t%d\n", format->params.min_keys_per_crypt); printf("Max. keys per crypt \t%d\n", format->params.max_keys_per_crypt); printf("Flags\n"); printf(" Case sensitive \t%s\n", (format->params.flags & FMT_CASE) ? "yes" : "no"); printf(" Supports 8-bit characters \t%s\n", (format->params.flags & FMT_8_BIT) ? "yes" : "no"); printf(" Converts 8859-1 to UTF-16/UCS-2\t%s\n", (format->params.flags & FMT_UNICODE) ? "yes" : "no"); printf(" Honours --encoding=NAME \t%s\n", (format->params.flags & FMT_UTF8) ? "yes" : "no"); printf(" False positives possible \t%s\n", (format->params.flags & FMT_NOT_EXACT) ? "yes" : "no"); printf(" Uses a bitslice implementation \t%s\n", (format->params.flags & FMT_BS) ? "yes" : "no"); printf(" The split() method unifies case\t%s\n", (format->params.flags & FMT_SPLIT_UNIFIES_CASE) ? "yes" : "no"); printf(" A $dynamic$ format \t%s\n", (format->params.flags & FMT_DYNAMIC) ? "yes" : "no"); #ifdef _OPENMP printf(" Parallelized with OpenMP \t%s\n", (format->params.flags & FMT_OMP) ? "yes" : "no"); #endif printf("Number of test cases for --test \t%d\n", ntests); printf("Algorithm name \t%s\n", format->params.algorithm_name); printf("Format name \t%s\n", format->params.format_name); printf("Benchmark comment \t%s\n", format->params.benchmark_comment); printf("Benchmark length \t%d\n", format->params.benchmark_length); printf("Binary size \t%d\n", format->params.binary_size); printf("Salt size \t%d\n", ((format->params.flags & FMT_DYNAMIC) && format->params.salt_size) ? // salts are handled internally within the format. We want to know the 'real' salt size // dynamic will always set params.salt_size to 0 or sizeof a pointer. dynamic_real_salt_length(format) : format->params.salt_size); printf("\n"); } while ((format = format->next)); exit(0); } if (options.listconf && !strncasecmp(options.listconf, "format-methods", 14)) { struct fmt_main *format; format = fmt_list; do { int ShowIt = 1, i; if (options.listconf[14] == '=' || options.listconf[14] == ':') { ShowIt = 0; if (!strcasecmp(&options.listconf[15], "set_key") || !strcasecmp(&options.listconf[15], "get_key") || !strcasecmp(&options.listconf[15], "crypt_all") || !strcasecmp(&options.listconf[15], "cmp_all") || !strcasecmp(&options.listconf[15], "cmp_one") || !strcasecmp(&options.listconf[15], "cmp_exact")) ShowIt = 1; else if (strcasecmp(&options.listconf[15], "init") && strcasecmp(&options.listconf[15], "prepare") && strcasecmp(&options.listconf[15], "valid") && strcasecmp(&options.listconf[15], "split") && strcasecmp(&options.listconf[15], "binary") && strcasecmp(&options.listconf[15], "clear_keys") && strcasecmp(&options.listconf[15], "salt") && strcasecmp(&options.listconf[15], "get_hash") && strcasecmp(&options.listconf[15], "get_hash[0]") && strcasecmp(&options.listconf[15], "get_hash[1]") && strcasecmp(&options.listconf[15], "get_hash[2]") && strcasecmp(&options.listconf[15], "get_hash[3]") && strcasecmp(&options.listconf[15], "get_hash[4]") && strcasecmp(&options.listconf[15], "get_hash[5]") && strcasecmp(&options.listconf[15], "set_salt") && strcasecmp(&options.listconf[15], "binary_hash") && strcasecmp(&options.listconf[15], "binary_hash[0]") && strcasecmp(&options.listconf[15], "binary_hash[1]") && strcasecmp(&options.listconf[15], "binary_hash[2]") && strcasecmp(&options.listconf[15], "binary_hash[3]") && strcasecmp(&options.listconf[15], "binary_hash[4]") && strcasecmp(&options.listconf[15], "binary_hash[5]") && 
strcasecmp(&options.listconf[15], "salt_hash")) { fprintf(stderr, "Error, invalid option (invalid method name) %s\n", options.listconf); fprintf(stderr, "Valid method names are:\n"); john_list_method_names(); exit(1); } if (format->methods.init != fmt_default_init && !strcasecmp(&options.listconf[15], "init")) ShowIt = 1; if (format->methods.prepare != fmt_default_prepare && !strcasecmp(&options.listconf[15], "prepare")) ShowIt = 1; if (format->methods.valid != fmt_default_valid && !strcasecmp(&options.listconf[15], "valid")) ShowIt = 1; if (format->methods.split != fmt_default_split && !strcasecmp(&options.listconf[15], "split")) ShowIt = 1; if (format->methods.binary != fmt_default_binary && !strcasecmp(&options.listconf[15], "binary")) ShowIt = 1; if (format->methods.salt != fmt_default_salt && !strcasecmp(&options.listconf[15], "salt")) ShowIt = 1; if (format->methods.clear_keys != fmt_default_clear_keys && !strcasecmp(&options.listconf[15], "clear_keys")) ShowIt = 1; for (i = 0; i < 6; ++i) { char Buf[20]; sprintf(Buf, "get_hash[%d]", i); if (format->methods.get_hash[i] && format->methods.get_hash[i] != fmt_default_get_hash && !strcasecmp(&options.listconf[15], Buf)) ShowIt = 1; } if (format->methods.get_hash[0] && format->methods.get_hash[0] != fmt_default_get_hash && !strcasecmp(&options.listconf[15], "get_hash")) ShowIt = 1; for (i = 0; i < 6; ++i) { char Buf[20]; sprintf(Buf, "binary_hash[%d]", i); if (format->methods.binary_hash[i] && format->methods.binary_hash[i] != fmt_default_binary_hash && !strcasecmp(&options.listconf[15], Buf)) ShowIt = 1; } if (format->methods.binary_hash[0] && format->methods.binary_hash[0] != fmt_default_binary_hash && !strcasecmp(&options.listconf[15], "binary_hash")) ShowIt = 1; if (format->methods.salt_hash != fmt_default_salt_hash && !strcasecmp(&options.listconf[15], "salt_hash")) ShowIt = 1; if (format->methods.set_salt != fmt_default_set_salt && !strcasecmp(&options.listconf[15], "set_salt")) ShowIt = 1; } if (ShowIt) { int i; printf("Methods overridden for: %s [%s] %s\n", format->params.label, format->params.algorithm_name, format->params.format_name); if (format->methods.init != fmt_default_init) printf("\tinit()\n"); if (format->methods.prepare != fmt_default_prepare) printf("\tprepare()\n"); if (format->methods.valid != fmt_default_valid) printf("\tvalid()\n"); if (format->methods.split != fmt_default_split) printf("\tsplit()\n"); if (format->methods.binary != fmt_default_binary) printf("\tbinary()\n"); if (format->methods.salt != fmt_default_salt) printf("\tsalt()\n"); for (i = 0; i < 6; ++i) if (format->methods.binary_hash[i] != fmt_default_binary_hash) { if (format->methods.binary_hash[i]) printf("\t\tbinary_hash[%d]()\n", i); else printf("\t\tbinary_hash[%d]() (NULL pointer)\n", i); } if (format->methods.salt_hash != fmt_default_salt_hash) printf("\tsalt_hash()\n"); if (format->methods.set_salt != fmt_default_set_salt) printf("\tset_salt()\n"); // there is no default for set_key(); it must be defined. printf("\tset_key()\n"); // there is no default for get_key(); it must be defined. printf("\tget_key()\n"); if (format->methods.clear_keys != fmt_default_clear_keys) printf("\tclear_keys()\n"); for (i = 0; i < 6; ++i) if (format->methods.get_hash[i] != fmt_default_get_hash) { if (format->methods.get_hash[i]) printf("\t\tget_hash[%d]()\n", i); else printf("\t\tget_hash[%d]() (NULL pointer)\n", i); } // there is no default for crypt_all(); it must be defined. printf("\tcrypt_all()\n"); // there is no default for cmp_all(); it must be defined. 
printf("\tcmp_all()\n"); // there is no default for cmp_one() it must be defined. printf("\tcmp_one()\n"); // there is no default for cmp_exact() it must be defined. printf("\tcmp_exact()\n"); printf("\n\n"); } } while ((format = format->next)); exit(0); } /* * Other --list=help:WHAT are processed earlier, but these require * a valid config: */ if (options.listconf && !strcasecmp(options.listconf, "help:parameters")) { cfg_print_section_names(1); exit(0); } if (options.listconf && !strcasecmp(options.listconf, "help:list-data")) { cfg_print_section_names(2); exit(0); } /* --list last resort: list subsections of any john.conf section name */ if (options.listconf) { //printf("Subsections of [%s]:\n", options.listconf); if (cfg_print_subsections(options.listconf, NULL, NULL, 1)) exit(0); else { fprintf(stderr, "Section [%s] not found.\n", options.listconf); /* Just in case the user specified an invalid value * like help or list... * print the same list as with --list=?, but exit(1) */ john_list_options(); exit(1); } } #ifdef CL_VERSION_1_0 if (!options.ocl_platform) { if ((options.ocl_platform = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, "Platform"))) platform_id = atoi(options.ocl_platform); else platform_id = -1; } if (!options.gpu_device) { if ((options.gpu_device = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, "Device"))) ocl_gpu_id = atoi(options.gpu_device); else ocl_gpu_id = -1; } if (platform_id == -1 || ocl_gpu_id == -1) opencl_find_gpu(&ocl_gpu_id, &platform_id); #endif common_init(); sig_init(); john_load(); if (options.encodingStr && options.encodingStr[0]) log_event("- %s input encoding enabled", options.encodingStr); }
int main(int argc, char **argv) { MPI_Init(&argc, &argv); int nprocs, rank; MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); int numthreads = omp_get_max_threads(); if (argc < 2) { printf("Usage:\n"); printf(" poisson n\n\n"); printf("Arguments:\n"); printf(" n: the problem size (must be a power of 2)\n"); MPI_Finalize(); return 1; } double time_start; if (rank == 0) { time_start = MPI_Wtime(); } // The number of grid points in each direction is n+1 // The number of degrees of freedom in each direction is n-1 = m int n = atoi(argv[1]); int m = n - 1; int nn = 4 * n; real h = 1.0 / n; // Splitting the matrix into columns: int exact = m/nprocs; int rem = m - (nprocs - 1)*exact; // Each process owns a strip of the matrix, of size m*exact (or m*rem for the last process). // We consider that each such strip is made of 'nprocs' blocks vertically. int block_col = exact; int block_uk = exact*exact; int rem_uk = exact*rem; // For the last strip, the number of columns is rem. Consequently: if (rank == nprocs-1){ block_col = rem; block_uk = rem*exact; rem_uk = rem*rem; } // Grid points real *grid = mk_1D_array(n+1, false); #pragma omp parallel for schedule(static) for (size_t i = 0; i < n+1; i++) { grid[i] = i * h; } // The diagonal of the eigenvalue matrix of T real *diag = mk_1D_array(m, false); #pragma omp parallel for schedule(static) for (size_t i = 0; i < m; i++) { diag[i] = 2.0 * (1.0 - cos((i+1) * PI / n)); } // Initialize the right hand side data // B is the column strip that the process owns. real **B = mk_2D_array(block_col, m, false); #pragma omp parallel for schedule(static) for (size_t i = 0; i < block_col; i++) { for (size_t j = 0; j < m; j++) { B[i][j] = h * h * rhs(grid[i+1+(rank*exact)], grid[j+1]); } } // For the Sine Transform: real **z = mk_2D_array(numthreads, nn, false); // Calculate Btilde^T = S^-1 * (S * B)^T #pragma omp parallel for schedule(static) for (size_t i = 0; i < block_col; i++) { fst_(B[i], &n, z[omp_get_thread_num()], &nn); } transpose(B, block_col, m, nprocs, block_uk, rem_uk, rank); #pragma omp parallel for schedule(static) for (size_t i = 0; i < block_col; i++) { fstinv_(B[i], &n, z[omp_get_thread_num()], &nn); } // Solve Lambda * Xtilde = Btilde #pragma omp parallel for schedule(static) for (size_t i = 0; i < block_col; i++) { for (size_t j = 0; j < m; j++) { B[i][j] = B[i][j] / (diag[i+(rank*exact)] + diag[j]); } } // Calculate X = S^-1 * (S * Xtilde^T) ^ T #pragma omp parallel for schedule(static) for (size_t i = 0; i < block_col; i++) { fst_(B[i], &n, z[omp_get_thread_num()], &nn); } transpose(B, block_col, m, nprocs, block_uk, rem_uk, rank); #pragma omp parallel for schedule(static) for (size_t i = 0; i < block_col; i++) { fstinv_(B[i], &n, z[omp_get_thread_num()], &nn); } // Calculate maximal value of solution double U_max = 0.0, e_max = 0.0, global_max, global_emax, error; for (size_t i = 0; i < block_col; i++){ for (size_t j = 0; j < m; j++){ error = fabs(B[i][j] - sin(PI*(i+1+(rank*exact))*h)*sin(2*PI*(j+1)*h)); U_max = U_max > B[i][j] ? U_max : B[i][j]; e_max = e_max > error ? 
e_max : error; } } // MPI_Max to find the true maximum: MPI_Reduce(&U_max, &global_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&e_max, &global_emax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); // Print the Global Maximum on process 0: if (rank == 0){ printf("Problem Size = %d\tNumprocs = %d\tNumthreads = %d\n", n, nprocs, numthreads); printf("U_max = %0.16f\t", global_max); printf("E_max = %0.16f\t", global_emax); double duration = MPI_Wtime() - time_start ; printf("Execution Time: %0.16f \n", duration); } MPI_Finalize(); return 0; }
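A sketch of the algebra the solver above implements, assuming the standard fast-diagonalization method for the 2-D Poisson problem (with S the discrete sine transform applied by fst_/fstinv_, and B = h^2 f the scaled right-hand side):

\[
\tilde{B} = S^{-1}\,(S B)^{T}, \qquad
\tilde{U}_{ij} = \frac{\tilde{B}_{ij}}{\lambda_i + \lambda_j}, \qquad
U = S^{-1}\,(S \tilde{U})^{T}, \qquad
\lambda_k = 2\Bigl(1 - \cos\frac{(k+1)\pi}{n}\Bigr),
\]

where the \(\lambda_k\) are exactly the entries of the diag array computed in the code, and the two transpose() calls realize the global transposes across MPI ranks.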
void PrefixJaccardScore<AttributeT>::run() { //this-> required to access members of base class, since this is a template class. if (!this->G.hasEdgeIds()) throw std::runtime_error("Error, edges need to be indexed first"); this->scoreData.clear(); this->scoreData.resize(this->G.upperEdgeIdBound()); struct RankedEdge { node u; AttributeT att; count rank; RankedEdge(node u, AttributeT att, count rank) : u(u), att(att), rank(rank) {}; bool operator<(const RankedEdge &other) const { return std::tie(rank, att, u) < std::tie(other.rank, other.att, other.u); }; bool operator>(const RankedEdge &other) const { return std::tie(rank, att, u) > std::tie(other.rank, other.att, other.u); }; }; std::vector<size_t> rankedEdgeBegin(G.upperNodeIdBound() + 1); std::vector<RankedEdge> rankedEdges; rankedEdges.reserve(2*G.numberOfEdges()); for (node u = 0; u < G.upperNodeIdBound(); ++u) { rankedEdgeBegin[u] = rankedEdges.size(); if (G.hasNode(u)) { G.forEdgesOf(u, [&](node, node v, edgeid eid) { rankedEdges.emplace_back(v, inAttribute[eid], 0); }); } } rankedEdgeBegin[G.upperNodeIdBound()] = rankedEdges.size(); this->G.balancedParallelForNodes([&](node u) { if (this->G.degree(u) == 0) return; const auto beginIt = rankedEdges.begin() + rankedEdgeBegin[u]; const auto endIt = rankedEdges.begin() + rankedEdgeBegin[u+1]; Aux::Parallel::sort(beginIt, endIt, std::greater<RankedEdge>()); AttributeT curVal = beginIt->att; count curRank = 0; count numEqual = 0; for (auto it = beginIt; it != endIt; ++it) { if (curVal != it->att) { curRank += numEqual; curVal = it->att; numEqual = 1; } else { ++numEqual; } it->rank = curRank; } }); std::vector<std::vector<bool>> uMarker(omp_get_max_threads(), std::vector<bool>(G.upperNodeIdBound(), false)); auto vMarker = uMarker; this->G.parallelForEdges([&](node u, node v, edgeid eid) { count curRank = 0; double bestJaccard = 0; auto tid = omp_get_thread_num(); auto uIt = rankedEdges.begin() + rankedEdgeBegin[u]; auto vIt = rankedEdges.begin() + rankedEdgeBegin[v]; const auto uEndIt = rankedEdges.begin() + rankedEdgeBegin[u+1]; const auto vEndIt = rankedEdges.begin() + rankedEdgeBegin[v+1]; count commonNeighbors = 0; count uNeighbors = 0; count vNeighbors = 0; while (uIt != uEndIt || vIt != vEndIt) { while (uIt != uEndIt && curRank == uIt->rank) { if (uIt->u == v) { ++uIt; continue; } if (vMarker[tid][uIt->u]) { vMarker[tid][uIt->u] = false; ++commonNeighbors; --vNeighbors; } else { uMarker[tid][uIt->u] = true; ++uNeighbors; } ++uIt; } while (vIt != vEndIt && curRank == vIt->rank) { if (vIt->u == u) { ++vIt; continue; } if (uMarker[tid][vIt->u]) { uMarker[tid][vIt->u] = false; ++commonNeighbors; --uNeighbors; } else { vMarker[tid][vIt->u] = true; ++vNeighbors; } ++vIt; } bestJaccard = std::max(bestJaccard, commonNeighbors * 1.0 / (uNeighbors + vNeighbors + commonNeighbors)); ++curRank; } G.forNeighborsOf(u, [&](node w) { uMarker[tid][w] = false; }); G.forNeighborsOf(v, [&](node w) { vMarker[tid][w] = false; }); this->scoreData[eid] = bestJaccard; }); this->hasRun = true; }
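/* A minimal sketch of the marker technique used by PrefixJaccardScore above
 * (hypothetical function and parameter names): common neighbors of two nodes
 * are counted in O(deg(u) + deg(v)) by flagging one adjacency list in a
 * reusable per-thread boolean array and scanning the other, then unflagging
 * so the array can be reused without reallocation. */
#include <stdbool.h>
#include <stddef.h>

static size_t common_neighbors(const int *nu, size_t du,
                               const int *nv, size_t dv,
                               bool *marker /* all false on entry; length > max node id */)
{
    size_t common = 0;
    for (size_t i = 0; i < du; i++) marker[nu[i]] = true;
    for (size_t i = 0; i < dv; i++) if (marker[nv[i]]) common++;
    for (size_t i = 0; i < du; i++) marker[nu[i]] = false; /* restore for the next call */
    return common;
}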
inline int OpenMPTarget::thread_pool_size( int depth ) { //return Impl::OpenMPTargetExec::pool_size(depth); return omp_get_max_threads(); }
int main( int argc, char **argv ) { int i; uint64_t p, threads, chunk_size; uint8_t *m; struct stat s; ssize_t rd, ts = 0; size_t page_size; struct sigaction new_action, old_action; struct utimbuf u; lzma_filter filters[LZMA_FILTERS_MAX + 1]; lzma_options_lzma lzma_options; page_size = sysconf(_SC_PAGE_SIZE); xzcmd = malloc(xzcmd_max); if (!xzcmd) { fprintf(stderr, "Failed to allocate %lu bytes for xz command.\n", xzcmd_max); return -1; } snprintf(xzcmd, xzcmd_max, XZ_BINARY); parse_args(argc, argv); lzma_lzma_preset(&lzma_options, opt_complevel); filters[0].id = LZMA_FILTER_LZMA2; filters[0].options = &lzma_options; filters[1].id = LZMA_VLI_UNKNOWN; for (i=0; i<files; i++) { int std_in = file[i][0] == '-' && file[i][1] == '\0'; #ifdef _OPENMP threads = omp_get_max_threads(); #else threads = 1; #endif if ( (rd=strlen(file[i])) >= 3 && !strncmp(&file[i][rd-3], ".xz", 3) ) { if (opt_verbose) { error(0, 0, "ignoring '%s', it seems to be already compressed", file[i]); } continue; } if ( !std_in ) { if ( stat(file[i], &s)) { error(EXIT_FAILURE, errno, "can't stat '%s'", file[i]); } } chunk_size = opt_context_size * lzma_options.dict_size; chunk_size = (chunk_size + page_size)&~(page_size-1); if ( opt_verbose ) { fprintf(stderr, "context size per thread: %"PRIu64" B\n", chunk_size); } if ( opt_threads && (threads > opt_threads || opt_force) ) { threads = opt_threads; } fo = stdout; if ( std_in ) { fi = stdin; } else { if ( !(fi=fopen(file[i], "rb")) ) { error(EXIT_FAILURE, errno, "can't open '%s' for reading", file[i]); } if ( !opt_stdout ) { snprintf(str, sizeof(str), "%s.xz", file[i]); if ( !(fo=fopen(str, "wb")) ) { error(EXIT_FAILURE, errno, "error creating target archive '%s'", str); } } } if ( opt_verbose ) { if ( fo != stdout ) { fprintf(stderr, "%s -> %"PRIu64"/%"PRIu64" thread%c: [", file[i], threads, (s.st_size+chunk_size-1)/chunk_size, threads != 1 ? 's' : ' '); } else { fprintf(stderr, "%"PRIu64" thread%c: [", threads, threads != 1 ? 's' : ' '); } fflush(stderr); } m = malloc(threads*chunk_size); new_action.sa_handler = term_handler; sigemptyset (&new_action.sa_mask); new_action.sa_flags = 0; sigaction(SIGINT, NULL, &old_action); if (old_action.sa_handler != SIG_IGN) sigaction(SIGINT, &new_action, NULL); sigaction(SIGHUP, NULL, &old_action); if (old_action.sa_handler != SIG_IGN) sigaction(SIGHUP, &new_action, NULL); sigaction(SIGTERM, NULL, &old_action); if (old_action.sa_handler != SIG_IGN) sigaction(SIGTERM, &new_action, NULL); ftemp = malloc(threads*sizeof(ftemp[0])); while ( !feof(fi) ) { size_t actrd; for (p=0; p<threads; p++) { ftemp[p] = tmpfile(); } for ( actrd=rd=0; !feof(fi) && !ferror(fi) && (uint64_t)rd < threads*chunk_size; rd += actrd) { actrd = fread(&m[rd], 1, threads*chunk_size-rd, fi); } if (ferror(fi)) { error(EXIT_FAILURE, errno, "error in reading input"); } #pragma omp parallel for private(p) num_threads(threads) for ( p=0; p<(rd+chunk_size-1)/chunk_size; p++ ) { off_t pt, len = rd-p*chunk_size >= chunk_size ? chunk_size : rd-p*chunk_size; uint8_t *mo; lzma_stream strm = LZMA_STREAM_INIT; lzma_ret ret; mo = malloc(BUFFSIZE); if ( lzma_stream_encoder(&strm, filters, LZMA_CHECK_CRC64) != LZMA_OK ) { error(EXIT_FAILURE, errno, "unable to initialize LZMA encoder"); } for (pt=0; pt<len; pt+=BUFFSIZE) { strm.next_in = &m[p*chunk_size+pt]; strm.avail_in = len-pt >= BUFFSIZE ? 
BUFFSIZE : len-pt; strm.next_out = mo; strm.avail_out = BUFFSIZE; do { ret = lzma_code(&strm, LZMA_RUN); if ( ret != LZMA_OK ) { error(EXIT_FAILURE, 0, "error in LZMA_RUN"); } if ( BUFFSIZE - strm.avail_out > 0 ) { if ( !fwrite(mo, 1, BUFFSIZE - strm.avail_out, ftemp[p]) ) { error(EXIT_FAILURE, errno, "writing to temp file failed"); } strm.next_out = mo; strm.avail_out = BUFFSIZE; } } while ( strm.avail_in ); } strm.next_out = mo; strm.avail_out = BUFFSIZE; do { ret = lzma_code(&strm, LZMA_FINISH); if ( ret != LZMA_OK && ret != LZMA_STREAM_END ) { error(EXIT_FAILURE, 0, "error in LZMA_FINISH"); } if ( BUFFSIZE - strm.avail_out > 0 ) { if ( !fwrite(mo, 1, BUFFSIZE - strm.avail_out, ftemp[p]) ) { error(EXIT_FAILURE, errno, "writing to temp file failed"); } strm.next_out = mo; strm.avail_out = BUFFSIZE; } } while ( ret == LZMA_OK ); lzma_end(&strm); free(mo); if ( opt_verbose ) { fprintf(stderr, "%"PRIu64" ", p); fflush(stderr); } } for ( p=0; p<threads; p++ ) { rewind(ftemp[p]); while ( (rd=fread(buf, 1, sizeof(buf), ftemp[p])) > 0 ) { if ( fwrite(buf, 1, rd, fo) != (size_t)rd ) { error(0, errno, "writing to archive failed"); if ( fo != stdout && unlink(str) ) { error(0, errno, "error deleting corrupted target archive %s", str); } exit(EXIT_FAILURE); } else ts += rd; } if (rd < 0) { error(0, errno, "reading from temporary file failed"); if ( fo != stdout && unlink(str) ) { error(0, errno, "error deleting corrupted target archive %s", str); } exit(EXIT_FAILURE); } if ( close_stream(ftemp[p]) ) { error(0, errno, "I/O error in temp file"); } } } if ( fi != stdin && close_stream(fi) ) { error(0, errno, "I/O error in input file"); } if ( opt_verbose ) { fprintf(stderr, "] "); } free(ftemp); if ( fo != stdout ) { if ( close_stream(fo) ) { error(0, errno, "I/O error in target archive"); } } else return 0; if ( chmod(str, s.st_mode) ) { error(0, errno, "warning: unable to change archive permissions"); } u.actime = s.st_atime; u.modtime = s.st_mtime; if ( utime(str, &u) ) { error(0, errno, "warning: unable to change archive timestamp"); } sigaction(SIGINT, &old_action, NULL); sigaction(SIGHUP, &old_action, NULL); sigaction(SIGTERM, &old_action, NULL); if ( opt_verbose ) { fprintf(stderr, "%"PRIu64" -> %zd %3.3f%%\n", s.st_size, ts, ts*100./s.st_size); } if ( !opt_keep && unlink(file[i]) ) { error(0, errno, "error deleting input file %s", file[i]); } } return 0; }
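/* The chunk sizing above, (chunk_size + page_size) & ~(page_size - 1), rounds
 * up to a page boundary but adds a whole extra page when chunk_size is
 * already aligned; the conventional round-up idiom, assuming a power-of-two
 * alignment, is: */
#include <stdint.h>

static uint64_t round_up(uint64_t x, uint64_t align) /* align: power of two */
{
    return (x + align - 1) & ~(align - 1);
}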
int fixedparamBKZ(mat_ZZ &L,int index,int beta,double prob, double alpha, int tourlim,int vl,int opt) { sharememalloc(); pruning_func::init_pruning_func(); BKZproperty BP; BP.enumlim[0] = 1000000; // limit of #processed nodes (/10^8) BP.enumlim[1] = 1; BP.beta[0] = beta; //blocksize BP.beta[1] = beta*0.6; //blocksize in preprocess strategy1 BP.tourlim = tourlim; BP.breakindex = -1; //multithread? if (opt & OPT_MULTITHREAD) { BP.multithread = true; BP.numthreads = omp_get_max_threads(); BP.MTlimit = BP.numthreads*10000000; cout << "setting # threads = " << BP.numthreads << endl; } else { BP.multithread = false; BP.numthreads = 1; //single-threaded; keeps enum_speed_bench(BP.numthreads) below well-defined BP.MTlimit = 10000000; } //optimize pruning function if (opt & OPT_OPTIMIZE_PRUNING_FUNCTION) { BP.optimizepf = true; BP.optimizepf_at_least = 1000; lattice_enum::enum_speed_bench(BP.numthreads); } //do preprocess? if (opt & (OPT_PREPROCESS | OPT_PREPROCESS2)) { BP.preprocess = true; BP.preprocess_at_least = 1; BP.preprocess_strategy = 1; if (opt & OPT_PREPROCESS2) BP.preprocess_strategy = 2; lattice_enum::enum_speed_bench(BP.numthreads); } else { BP.preprocess = false; } if (opt & OPT_EXTENDBLOCKSIZE) { BP.extend_blocksize=true; BP.extend_blocksizemult = 2; //extend blocksize while expected cost <= max cost of the current tour } //break at a specific index if (opt & OPT_FIRSTINDEX) { BP.breakindex = 1; } //skip quasi-converged blocks based on the Gaussian heuristic if (opt & OPT_GHBASEDSKIP) { BP.ghskip = true; BP.ghskipcoeff = 1.025; //if |b*i| < a*GH(L), skip the block } //output log file if (opt & OPT_TIMELOG) { BP.tlname="bkzlog.txt"; } BP.verboselevel = vl; BP.pruning_prob = prob; BP.init_radius = alpha; BP.init_mode = 'G'; //R=alpha*GH(L) BP.holdvecs = 16; //A heuristic strategy for finding short vectors if (opt & OPT_FIND_SHORT) { BP.process_entire_basis = true; BP.ec_boundalpha = 1.05; } cputime = 0; //CPU time in seconds start = clock(); return BKZmain(L,0,BP); }
void CudaDb::requestQuery(const KdRequest &r) { KdRequest request = r; if (request.type == RT_CPU) request.result = RequestResult(new std::vector<long>()); this->queue.push(request); switch (request.type) { #if USE_CUDA case RT_CUDA: case RT_CUDA_DP: case RT_CUDA_IM: request.numBlocks = this->numBlocks / this->devices.size(); for (size_t i = 0; i < this->devices.size(); i++) { request.ranges = (uint64_t*) this->fRange.data() + (request.numBlocks) * i * 2 * r.query->size; request.keys = (TripKey*) this->fBin.data() + (request.numBlocks) * i * KdBlock::MAX_RECORDS_PER_BLOCK * (r.query->size + 1); this->devices[i]->push(request); } break; case RT_CUDA_PARTIAL_IM: { KdBlock::QueryResult result = this->kdb->execute(*request.query); int numBlocks = result.blocks->size(); request.totalBlocks = this->numBlocks / this->devices.size(); int tmpct = 0; for (size_t i = 0; i < this->devices.size(); i++) { request.keys = (TripKey*) this->fBin.data() + (request.totalBlocks) * i * KdBlock::MAX_RECORDS_PER_BLOCK * (r.query->size + 1); request.ranges = new uint64_t[numBlocks]; int ctBlocks = 0; for (int k = 0; k < numBlocks; k++) { uint64_t blockId = result.blocks->at(k).second; blockId /= KdBlock::MAX_RECORDS_PER_BLOCK; int index = blockId / request.totalBlocks; if(index == i) { request.ranges[ctBlocks ++] = (blockId % request.totalBlocks); } } request.numBlocks = ctBlocks; this->devices[i]->push(request); tmpct += ctBlocks; } } break; case RT_CUDA_PARTIAL: { request.keys = (TripKey*) this->fBin.data(); KdBlock::QueryResult result = this->kdb->execute(*request.query); int numBlocks = result.blocks->size(); for (size_t i = 0; i < this->devices.size(); i++) { request.numBlocks = numBlocks / this->devices.size(); request.ranges = new uint64_t[request.numBlocks]; for (int k = 0; k < request.numBlocks; k++) { request.ranges[k] = result.blocks->at(request.numBlocks * i + k).second; } this->devices[i]->push(request); } } break; #endif case RT_CPU: { // hlog << "CPU execution" << endl; size_t EXTRA_BLOCKS_PER_LEAF = this->keySize; int noKeys = this->keySize - 1; int gsize = request.noRegions; KdBlock::QueryResult result = this->kdb->execute(*request.query); TripKey *keys = (TripKey*) fBin.data(); // printf("No. of blocks %zu \n", result.blocks->size()); uint64_t noBlocks = result.blocks->size(); int noThreads = omp_get_max_threads(); std::vector<ResultVec> res(noThreads); #pragma omp parallel for for (size_t i = 0; i < noBlocks; i++) { uint32_t count = result.blocks->at(i).first; uint64_t offset = result.blocks->at(i).second; for (uint32_t j = 0; j < count; j++) { uint64_t pos = (offset + j) * EXTRA_BLOCKS_PER_LEAF; TripKey * curKey = keys + pos; uint64_t index = * (curKey + noKeys); bool match = true; for(int k = 0;k < noKeys;k ++) { if(!request.query->isMatched(curKey,k)) { match = false; break; } } if(match) { for(int k = 0;k < gsize;k ++) { double x = uint2double(curKey[k * 2]); double y = uint2double(curKey[k * 2 + 1]); if(!Neighborhoods::isInside(request.regions[k].size(),&request.regions[k][0].first,x,y)) { match = false; break; } } } if(match) { res[omp_get_thread_num()].push_back(index); } } } for (int i = 0; i < noThreads; i++) { request.result->insert(request.result->end(),res[i].begin(),res[i].end()); } } break; default: fprintf(stderr, "Unhandled request type %d\n", request.type); break; } }
void test_solver(BfmSolver solver) { g5dParams parms; int Ls=16; double M5=1.8; double mq=0.0001; double wilson_lo = 0.05; double wilson_hi = 6.8; double shamir_lo = 0.025; double shamir_hi = 1.7; double ht_scale=1.7; double hw_scale=1.0; if ( solver != DWF ) { Printf("Should be testing HtCayleyTanh aka DWF\n"); exit(0); } parms.pDWF(mq,M5,Ls); multi1d<LatticeColorMatrix> u(4); HotSt(u); // ArchivGauge_t Header ; readArchiv(Header,u,"ckpoint_lat.3000"); multi1d<LatticeFermion> src(Ls); /* Rudy calculate some eigenvectors */ BfmWrapperParams BWP; BWP.BfmInverter = BfmInv_CG; BWP.BfmMatrix = BfmMat_M; BWP.BfmPrecision= Bfm64bit; BWP.MaxIter = 10000; BWP.RsdTarget.resize(1); BWP.RsdTarget[0]= 1.0e-9; BWP.Delta = 1.0e-4; BWP.BAP = parms; BfmWrapper bfm(BWP); bfmarg bfma; #if defined(QDP_USE_OMP_THREADS) bfma.Threads(omp_get_max_threads()); #else bfma.Threads(16); #endif bfma.Verbose(0); //Physics parameters bfmActionParams *bfmap = (bfmActionParams *) &bfma; *bfmap = bfm.invParam.BAP; // Algorithm & code control bfma.time_report_iter=-100; bfma.max_iter = bfm.invParam.MaxIter; bfma.residual = toDouble(bfm.invParam.RsdTarget[0]); int lx = QDP::Layout::subgridLattSize()[0]; int ly = QDP::Layout::subgridLattSize()[1]; int lz = QDP::Layout::subgridLattSize()[2]; int lt = QDP::Layout::subgridLattSize()[3]; //Geometry bfma.node_latt[0] = lx; bfma.node_latt[1] = ly; bfma.node_latt[2] = lz; bfma.node_latt[3] = lt; multi1d<int> procs = QDP::Layout::logicalSize(); for(int mu=0;mu<4;mu++){ if (procs[mu]>1) bfma.local_comm[mu] = 0; else bfma.local_comm[mu] = 1; } // Bfm object bfm_qdp<double> bfm_eig; bfm_eig.init(bfma); //Gauge field import bfm_eig.importGauge(u); //Subspace #define NumberGaussian (1) Fermion_t subspace[NumberGaussian]; Fermion_t check; Fermion_t mp; Fermion_t mmp; Fermion_t tmp_t; check = bfm_eig.allocFermion(); mp = bfm_eig.allocFermion(); mmp = bfm_eig.allocFermion(); tmp_t = bfm_eig.allocFermion(); bfm_eig.importFermion(src,check,1); QDPIO::cout << "Ls = "<<Ls<<endl; for(int g=0;g<NumberGaussian;g++){ for(int s=0;s<Ls;s++){ gaussian(src[s]); } subspace[g]=bfm_eig.allocFermion(); bfm_eig.importFermion(src,subspace[g],1); // Half parity gaussian if ( g==0) { bfm_eig.importFermion(src,check,1); } for(int s=0;s<Ls;s++){ src[s]=zero; } bfm_eig.exportFermion(src,subspace[g],1); QDPIO::cout << "Subspace norm " << norm2(src)<<endl; } for(int s=0;s<Ls;s++){ gaussian(src[s]); } QDPIO::cout << "Got here " << endl; // Handle< LinearOperatorArray<T> > linop =GetLinOp(u, parms); int block[5]; for(int i=0;i<5;i++) block[i]=4; QDPIO::cout << "Initialised dirac op"<<endl; BfmLittleDiracOperator ldop(Ls,NumberGaussian,block,subspace,&bfm_eig); int ns = ldop.SubspaceDimension(); QDPIO::cout << "subspace dimension is "<< ns<<endl; ns = ldop.SubspaceLocalDimension(); QDPIO::cout << "subspace dimension per node is "<< ns<<endl; std::vector<std::complex<double> > decomp(ns); ldop.ProjectToSubspace(check,decomp); if (QMP_is_primary_node()){ FILE * fp = fopen("coeff.dat","w"); for(int s=0;s<ns;s++){ fprintf(fp,"coeff %d %le %le\n",s,real(decomp[s]),imag(decomp[s])); } fclose(fp); } for(int s=0;s<ns;s++){ QDPIO::cout << "coeff "<<s<<" " << real(decomp[s]) << " " << imag(decomp[s])<<endl; } ldop.PromoteFromSubspace(decomp,mp); double n; #pragma omp parallel { omp_set_num_threads(bfm_eig.nthread); #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { bfm_eig.axpy(check,mp,check,-1); n = bfm_eig.norm(check); } } QDPIO::cout << "project/promote n2diff "<< n<<endl; QMP_barrier(); QDPIO::cout << "Computing little 
dirac matrix"<<endl; ldop.ComputeLittleMatrixColored(); QDPIO::cout << "Done"<<endl; std::vector<std::complex<double> > Aphi(ns); // phi^dag DdagD phi = |Dphi|^2 with phi a subspace vector // should be equal to Project/Apply/Promote + inner product #pragma omp parallel { #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { bfm_eig.Mprec(subspace[0],mp,tmp_t,0); } } QDPIO::cout << "Applied BFM matrix "<<endl; double n2; #pragma omp parallel { omp_set_num_threads(bfm_eig.nthread); #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { n2 = bfm_eig.norm(mp); } } QDPIO::cout << "Applied BFM matrix "<<n2<<endl; ldop.ProjectToSubspace(subspace[0],decomp); QDPIO::cout << "Projected to subspace "<<endl; ldop.Apply(decomp,Aphi); QDPIO::cout << "Applied A "<<endl; ldop.PromoteFromSubspace(Aphi,check); QDPIO::cout << "Promoted "<<endl; complex<double> inn; #pragma omp parallel { #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { inn = bfm_eig.inner(subspace[0],check); } } QDPIO::cout << "phi^dag Ddag D phi check " << n2 << " " <<real(inn) << imag(inn) <<endl; std::vector<std::complex<double> > AinvAphi(ns); ldop.ProjectToSubspace(subspace[0],decomp); ldop.Apply(decomp,Aphi); for(int s=0;s<ns;s++){ QDPIO::cout << "Aphi "<<s<<" " << real(Aphi[s]) <<" " << imag(Aphi[s])<<endl; } ldop.PromoteFromSubspace(Aphi,check); #pragma omp parallel { #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { bfm_eig.Mprec(subspace[0],mp,tmp_t,0); bfm_eig.Mprec(mp,mmp,tmp_t,1); } } ldop.ProjectToSubspace(mmp,decomp); ldop.PromoteFromSubspace(decomp,mmp); #pragma omp parallel { #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { bfm_eig.axpy(check,mmp,check,-1.0); n2 = bfm_eig.norm(check); } } QDPIO::cout << "PMdagMP check n2diff "<< n2<<endl; QMP_barrier(); QDPIO::cout << "Applying inverse"<<endl; ldop.ApplyInverse(Aphi,AinvAphi); QMP_barrier(); for(int s=0;s<ns;s++){ QDPIO::cout << "AinvAphi "<<s<<" " << real(AinvAphi[s]) << " " << imag(AinvAphi[s])<<endl; } ldop.PromoteFromSubspace(AinvAphi,check); #pragma omp parallel { #pragma omp for for(int t=0;t<bfm_eig.nthread;t++) { bfm_eig.axpy(check,subspace[0],check,-1.0); n2 = bfm_eig.norm(check); } } QDPIO::cout << "AinvA check n2diff "<< n2<<endl; }
int main ( int argc, char *argv[] ) { omp_set_num_threads(omp_get_max_threads()); //Check the number of input parameters if(argc<3) { printf("ERROR MISSING DIR PATH IN/OUT \n"); return 1; } //MPI vars int error = 0; // holds MPI error codes int nproc = 0; // total number of processes int myid = 0; // rank of this process //init MPI error = MPI_Init(&argc, &argv); //init MPI Comm error = MPI_Comm_size(MPI_COMM_WORLD, &nproc); error = MPI_Comm_rank(MPI_COMM_WORLD, &myid); //check directory char * dirIn; char * dirOut; char * istant; dirIn = (char *) malloc(500*sizeof(char )); dirOut = (char *) malloc(500*sizeof(char )); istant = (char *) malloc(100*sizeof(char )); strcpy(dirIn,argv[1]); strcpy(dirOut,argv[2]); strcat(dirIn,"/"); strcat(dirOut,"/"); //printf("I'm %d of %d\n",myid,nproc); // read the number of json files in the input dir int numFile = 0; numFile = readDirectoryNum(dirIn); //create file list structure char ** list; list = (char **) malloc(numFile*sizeof(char*)); for(int i=0;i<numFile;i++) list[i] = (char *) malloc(200*sizeof(char)); // read the file list in the input directory readDirectory(dirIn,list,numFile); if(myid == 0) { printf("\n"); printf("GILLESPIE HT v 1.0 \n"); printf("Gillespie algorithm high-throughput software\n"); printf("https://github.com/EricPascolo/GillespieHT\n"); printf("Created by Eric Pascolo (Sep 2014)\n"); printf("\n"); if(nproc>1) printf("\tParallel Run with %d slaves\n",nproc-1); else printf("\tSerial Run\n"); printf("\tThreads/Task : %d \n",omp_get_max_threads()); printf("\tInput directory : %s \n",dirIn); printf("\tOutput directory : %s \n",dirOut); printf("\tNumber of files: %d \n",numFile); printf("\nBEGIN Simulation at %s\n",getTime(istant)); printf("\n"); printf("\n"); printf("\tLIST FILE\n"); printf("\t---------\n"); for(int i=0;i<numFile;i++) printf("\t%5d %20s\n",i,list[i]); printf("\t---------\n\n"); } MPI_Barrier(MPI_COMM_WORLD); if(myid == 0) { Master(nproc,dirIn,dirOut,list,numFile); } else { Slave(myid,nproc,dirIn,dirOut,list,numFile); } MPI_Barrier(MPI_COMM_WORLD); if(myid == 0) { printf("\nEND Simulation at %s\n",getTime(istant)); } error = MPI_Finalize(); return 0; }
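/* Master() and Slave() are defined elsewhere; a generic sketch of the
 * on-demand master/worker dispatch such a pair typically implements
 * (hypothetical tags and protocol; -1 is used as the stop sentinel): */
#include <mpi.h>

#define TAG_REQ  1
#define TAG_WORK 2

static void master_loop(int nproc, int numFile)
{
    int next = 0, stopped = 0;
    while (stopped < nproc - 1) {
        int dummy, idx;
        MPI_Status st;
        MPI_Recv(&dummy, 1, MPI_INT, MPI_ANY_SOURCE, TAG_REQ, MPI_COMM_WORLD, &st);
        idx = (next < numFile) ? next++ : -1; /* -1 tells the worker to stop */
        if (idx < 0) stopped++;
        MPI_Send(&idx, 1, MPI_INT, st.MPI_SOURCE, TAG_WORK, MPI_COMM_WORLD);
    }
}

static void worker_loop(void)
{
    for (;;) {
        int dummy = 0, idx;
        MPI_Send(&dummy, 1, MPI_INT, 0, TAG_REQ, MPI_COMM_WORLD);
        MPI_Recv(&idx, 1, MPI_INT, 0, TAG_WORK, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        if (idx < 0) break;
        /* process list[idx] here */
    }
}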
// Threads each sequence and creates preArcs according to road map indications static void connectPreNodes(RoadMapArray * rdmaps, PreGraph * preGraph, IDnum * chains) { IDnum sequenceIndex; IDnum referenceCount = rdmaps->referenceCount; #ifdef _OPENMP annotationOffset = mallocOrExit(rdmaps->length + 1, Coordinate); annotationOffset[0] = 0; for (sequenceIndex = 1; sequenceIndex <= rdmaps->length; sequenceIndex++) annotationOffset[sequenceIndex] = annotationOffset[sequenceIndex - 1] + getAnnotationCount(getRoadMapInArray(rdmaps, sequenceIndex - 1)); #else Annotation *annot = rdmaps->annotations; #endif if (rdmaps->referenceCount > 0) allocatePreMarkerCountSpace_pg(preGraph); #ifdef _OPENMP int threads = omp_get_max_threads(); if (threads > 8) threads = 8; #pragma omp parallel for num_threads(threads) #endif for (sequenceIndex = 1; sequenceIndex <= sequenceCount_pg(preGraph); sequenceIndex++) { #ifdef _OPENMP Annotation *annot = getAnnotationInArray(rdmaps->annotations, annotationOffset[sequenceIndex - 1]); #endif RoadMap *rdmap; Coordinate currentPosition, currentInternalPosition; IDnum currentPreNodeID, nextInternalPreNodeID; IDnum annotIndex, lastAnnotIndex; boolean isReference; if (sequenceIndex % 1000000 == 0) velvetLog("Connecting %li / %li\n", (long) sequenceIndex, (long) sequenceCount_pg(preGraph)); rdmap = getRoadMapInArray(rdmaps, sequenceIndex - 1); annotIndex = 0; lastAnnotIndex = getAnnotationCount(rdmap); nextInternalPreNodeID = chooseNextInternalPreNode (chains[sequenceIndex] - 1, sequenceIndex, preGraph, chains); isReference = (sequenceIndex <= referenceCount); currentPosition = 0; currentInternalPosition = 0; currentPreNodeID = 0; // Recursion up to last annotation while (annotIndex < lastAnnotIndex || nextInternalPreNodeID != 0) { if (annotIndex == lastAnnotIndex || (nextInternalPreNodeID != 0 && currentInternalPosition < getPosition(annot))) { connectPreNodeToTheNext(&currentPreNodeID, nextInternalPreNodeID, &currentPosition, sequenceIndex, isReference, preGraph); nextInternalPreNodeID = chooseNextInternalPreNode (currentPreNodeID, sequenceIndex, preGraph, chains); currentInternalPosition += getPreNodeLength_pg(currentPreNodeID, preGraph); } else { connectAnnotation(&currentPreNodeID, annot, &currentPosition, sequenceIndex, isReference, preGraph); annot = getNextAnnotation(annot); annotIndex++; } } } if (rdmaps->referenceCount > 0) { allocatePreMarkerSpace_pg(preGraph); createPreMarkers(rdmaps, preGraph, chains); } #ifdef _OPENMP free(annotationOffset); annotationOffset = NULL; #endif }
int main(int argc, char *argv[]) { herr_t err = 0; int n_threads = omp_get_max_threads(); hid_t kernel_file_id = 0; hid_t levy_basis_file_id = 0; hid_t levy_basis_dataset_id = 0; hid_t levy_basis_dataspace_id = 0; hid_t output_file_id = 0; hid_t output_dataset_id = 0; hid_t output_dataspace_id = 0; hid_t memspace = 0; hsize_t n_k = 0; double *tmp = NULL; double *k_abscissa = NULL; double *k_ordinate = NULL; double *x1 = NULL; double *x2 = NULL; double *x3 = NULL; DEBUGPRINT("### Parsing arguments"); struct arguments args; initialise_arguments(&args); argp_parse (&argp, argc, argv, 0, 0, &args); #ifdef DEBUG print_arguments(&args); #endif DEBUGPRINT("### Reading kernel"); kernel_file_id = H5Fopen(args.kernel_file, H5F_ACC_RDONLY, H5P_DEFAULT); if (kernel_file_id <= 0) { printf("Error: Could not open \"%s\".\n", args.kernel_file); err = -1; goto cleanup; } err = H5LTget_dataset_info(kernel_file_id, "/abscissa", &n_k, NULL, NULL); if (err < 0) { printf("Error: Could not read dataset info.\n"); goto cleanup; } printf("n_k = %i\n", (int)n_k); k_abscissa = malloc(n_k * sizeof(double)); k_ordinate = malloc(n_k * sizeof(double)); err = H5LTread_dataset_double(kernel_file_id, "/abscissa", k_abscissa); err = H5LTread_dataset_double(kernel_file_id, "/ordinate", k_ordinate); DEBUGPRINT("### Reading Levy basis"); hsize_t dims[4]; hsize_t offset[4]; hsize_t count[4]; levy_basis_file_id = H5Fopen(args.levy_basis_file, H5F_ACC_RDONLY, H5P_DEFAULT); levy_basis_dataset_id = H5Dopen(levy_basis_file_id, "/levy_basis_realization", H5P_DEFAULT); levy_basis_dataspace_id = H5Dget_space(levy_basis_dataset_id); err = H5Sget_simple_extent_dims(levy_basis_dataspace_id, dims, NULL); if (dims[0] != dims[1] || dims[1] != dims[2]) { printf("Error: The three dimensions must be equal.\n"); err = -1; goto cleanup; } hsize_t dims_pad[3]; dims_pad[0] = dims[0]; dims_pad[1] = dims[1]; dims_pad[2] = 2 * (dims[2] / 2 + 1); hsize_t n_x = dims_pad[0] * dims_pad[1] * dims_pad[2]; x1 = malloc(n_x * sizeof(double)); x2 = malloc(n_x * sizeof(double)); x3 = malloc(n_x * sizeof(double)); double *x[] = {x1, x2, x3}; if (!x1 || !x2 || !x3) { printf("Error: Could not allocate memory for the Levy basis.\n"); err = -1; goto cleanup; } #pragma omp parallel for for (ptrdiff_t i = 0; i < (ptrdiff_t)n_x; i++) { x1[i] = 0.0; x2[i] = 0.0; x3[i] = 0.0; } /* Define memory dataspace */ memspace = H5Screate_simple(3, dims_pad, NULL); offset[0] = offset[1] = offset[2] = 0; count[0] = dims[0]; count[1] = dims[1]; count[2] = dims[2]; /* Define hyperslab in the memory dataspace */ err = H5Sselect_hyperslab(memspace, H5S_SELECT_SET, offset, NULL, count, NULL); for (int j = 0; j < 3; j++) { /* Define hyperslab in the file dataspace */ offset[3] = j; count[3] = 1; err = H5Sselect_hyperslab(levy_basis_dataspace_id, H5S_SELECT_SET, offset, NULL, count, NULL); /* Read data from hyperslab */ err = H5Dread(levy_basis_dataset_id, H5T_NATIVE_DOUBLE, memspace, levy_basis_dataspace_id, H5P_DEFAULT, x[j]); if (err < 0) { printf("Error: Could not read hyperslab.\n"); err = -1; goto cleanup; } } DEBUGPRINT("### Convolving"); double delta = 2.0 * M_PI / dims[0]; err = ambit_symmetric_odd_isotropic_circular_convolution_inplace( n_threads, n_k, k_abscissa, k_ordinate, dims[0], delta, x1, x2, x3); if (err) { printf("Error in ambit_symmetric_odd_isotropic_circular_convolution_inplace.\n"); goto cleanup; } DEBUGPRINT("### Writing output"); output_file_id = H5Fcreate(args.output_file, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); if (output_file_id < 0) { printf("Error: Could not open 
\"%s\".\n", args.output_file); err = -1; goto cleanup; } output_dataspace_id = H5Screate_simple(4, dims, NULL); output_dataset_id = H5Dcreate(output_file_id, "/simulation", H5T_NATIVE_DOUBLE, output_dataspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); for (int j = 0; j < 3; j++) { printf("j = %i\n", j); /* Define hyperslap in the file dataspace */ offset[3] = j; count[3] = 1; err = H5Sselect_hyperslab(output_dataspace_id, H5S_SELECT_SET, offset, NULL, count, NULL); /* Write data to hyperslab */ err = H5Dwrite(output_dataset_id, H5T_NATIVE_DOUBLE, memspace, output_dataspace_id, H5P_DEFAULT, x[j]); if (err < 0) { printf("Error: Could not write hyperslab.\n"); err = -1; goto cleanup; } } cleanup: if (memspace > 0) H5Sclose(memspace); if (output_dataspace_id > 0) H5Sclose(output_dataspace_id); if (output_dataset_id > 0) H5Dclose(output_dataset_id); if (output_file_id > 0) H5Fclose(output_file_id); if (levy_basis_dataspace_id > 0) H5Sclose(levy_basis_dataspace_id); if (levy_basis_dataset_id > 0) H5Dclose(levy_basis_dataset_id); if (levy_basis_file_id > 0) H5Fclose(levy_basis_file_id); if (kernel_file_id > 0) H5Fclose(kernel_file_id); free(tmp); free(k_abscissa); free(k_ordinate); free(x1); free(x2); free(x3); return err; }
double Gradient::computeGradient(dVector& vecGradient, Model* m, DataSet* X) { double ans = 0.0; #ifdef _OPENMP if( nbThreadsMP < 1 ) nbThreadsMP = omp_get_max_threads(); setMaxNumberThreads(nbThreadsMP); pInfEngine->setMaxNumberThreads(nbThreadsMP); pFeatureGen->setMaxNumberThreads(nbThreadsMP); #endif //Check the size of vecGradient int nbFeatures = pFeatureGen->getNumberOfFeatures(); if(vecGradient.getLength() != nbFeatures) vecGradient.create(nbFeatures); else vecGradient.set(0); //////////////////////////////////////////////////////////// // Start of parallel Region // Some weird stuff in gcc 4.1, with openmp 2.5 support // // Note 1: In OpenMP 2.5, the iteration variable in "for" must be // a signed integer variable type. In OpenMP 3.0 (_OPENMP>=200805), // it may also be an unsigned integer variable type, a pointer type, // or a constant-time random access iterator type. // // Note 2: schedule(static | dynamic): In the dynamic schedule, there // is no predictable order in which the loop items are assigned to // different threads. Each thread asks the OpenMP runtime library for // an iteration number, then handles it, then asks for the next one. // It is thus useful when different iterations in the loop may take // different time to execute. #pragma omp parallel default(none) \ shared(vecGradient, X, m, ans, nbFeatures, std::cout) { // code inside this region runs in parallel dVector g(nbFeatures, COLVECTOR, 0.0); #pragma omp for schedule(dynamic) reduction(+:ans) for(int i=0; i<(int)X->size(); i++) { DataSequence* x = X->at(i); if( m->isWeightSequence() && x->getWeightSequence() != 1.0) { dVector tmp(nbFeatures, COLVECTOR, 0.0); ans += computeGradient(tmp, m, x) * x->getWeightSequence(); tmp.multiply(x->getWeightSequence()); g.add(tmp); } else { ans += computeGradient(g, m, x); } } // We now put together the gradients // No two threads can execute a critical directive of the same name at the same time #pragma omp critical (reduce_sum) { vecGradient.add(g); } } // End of parallel Region //////////////////////////////////////////////////////////// vecGradient.negate(); // MaxMargin objective: min L = 0.5*\L2sigma*W*W + Loss() // MLE objective: min L = 0.5*1/(\L2sigma*\L2sigma)*W*W - log p(y|x) // Add the regularization term double scale = (m->isMaxMargin()) ? m->getRegL2Sigma() : 1/(double)(m->getRegL2Sigma()*m->getRegL2Sigma()); if( m->isMaxMargin() ) ans = (1/(double)X->size()) * ans; if(m->getRegL2Sigma()!=0.0f) { for(int f=0; f<nbFeatures; f++) vecGradient[f] += (*m->getWeights())[f]*scale; ans += 0.5*scale*m->getWeights()->l2Norm(false); } return ans; }
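/* A condensed sketch of the reduction pattern documented in the comments of
 * computeGradient above (hypothetical flat data layout standing in for the
 * DataSet/dVector types): the scalar objective goes through reduction(+:...),
 * each thread accumulates its own gradient vector, and the vectors are merged
 * one thread at a time under a named critical section. */
#include <omp.h>
#include <stdlib.h>

static double accumulate(double *grad, int nf, const double *item_g,
                         const double *item_f, int n)
{
    double ans = 0.0;
#pragma omp parallel
    {
        double *g = calloc(nf, sizeof *g); /* per-thread partial gradient */
#pragma omp for schedule(dynamic) reduction(+:ans)
        for (int i = 0; i < n; i++) {      /* dynamic: items have uneven cost */
            ans += item_f[i];
            for (int f = 0; f < nf; f++)
                g[f] += item_g[(size_t)i * nf + f];
        }
#pragma omp critical (reduce_sum)          /* serialize only the final merge */
        for (int f = 0; f < nf; f++)
            grad[f] += g[f];
        free(g);
    }
    return ans;
}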
/* // Update all running averages */ void rtGlobalUpdateTransfer(int top_level, MPI_Comm level_com) { int iomp, i, freq, field; int level, cell, *level_cells, num_level_cells, bottom_level = max_level_local(); float amin, amax; float *abc[2]; #ifdef _OPENMP int nomp = omp_get_max_threads(); #else int nomp = 1; #endif double s[nomp][rt_num_fields]; double s1, sw[nomp][rt_num_fields_per_freq]; start_time(WORK_TIMER); /* // Compute per-level averages */ for(level=top_level; level<=bottom_level; level++) { select_level(level,CELL_TYPE_LOCAL | CELL_TYPE_LEAF,&num_level_cells,&level_cells); if(num_level_cells == 0) continue; /* // Because the reduction variable cannot be an array in C, doing // reduction manually. Cannot re-arrange the loops because of the // cache access pattern. */ for(i=0; i<nomp; i++) { for(field=0; field<rt_num_fields; field++) s[i][field] = 0.0; } #pragma omp parallel for default(none), private(i,field,cell,iomp), shared(num_level_cells,level_cells,level,cell_vars,cell_child_oct,nomp,s) for(i=0; i<num_level_cells; i++) { cell = level_cells[i]; // No need to check for leaves, we selected only them! #ifdef _OPENMP iomp = omp_get_thread_num(); cart_assert(iomp>=0 && iomp<nomp); #else iomp = 0; #endif for(field=0; field<rt_num_fields; field++) { s[iomp][field] += cell_var(cell,rt_field_offset+field)*cell_volume[level]/num_root_cells; } } #ifdef _OPENMP for(i=1; i<nomp; i++) { for(field=0; field<rt_num_fields; field++) s[0][field] += s[i][field]; } #endif for(field=0; field<rt_num_fields; field++) { rtGlobalValueUpdate(&rtAvgRF[field],level,s[0][field]); } /* // Now do absorption - since we need to recompute the abs. coefficient, // loop over frequencies first */ abc[0] = cart_alloc(float,num_level_cells); #if (RT_CFI == 1) abc[1] = cart_alloc(float,num_level_cells); #else abc[1] = abc[0]; #endif for(freq=0; freq<rt_num_freqs; freq++) { /* // Average by weighting with the far field only */ rtComputeAbsLevel(level,num_level_cells,level_cells,freq,abc); linear_array_maxmin(num_level_cells,abc[1],&amax,&amin); rtGlobalValueUpdate(&rtMaxAC[freq],level,amax); /* // Because the reduction variable cannot be an array in C, doing // reduction manually. Cannot re-arrange the loops because of the // cache access pattern. */ for(i=0; i<nomp; i++) { for(field=0; field<rt_num_fields_per_freq; field++) sw[i][field] = 0.0; } s1 = 0.0; #pragma omp parallel for default(none), private(cell,i,iomp,field), shared(num_level_cells,level_cells,abc,level,cell_vars,freq,nomp,sw,units,constants), reduction(+:s1) for(i=0; i<num_level_cells; i++) { float facLLS; #ifdef RT_ADD_EXTERNAL_LLS float tauLLS; #endif /* RT_ADD_EXTERNAL_LLS */ cell = level_cells[i]; // No need to check for leaves, we selected only them! 
#ifdef _OPENMP iomp = omp_get_thread_num(); cart_assert(iomp>=0 && iomp<nomp); #else iomp = 0; #endif #ifdef RT_ADD_EXTERNAL_LLS tauLLS = 6.3e-18*units->number_density*units->length*cell_HI_density(cell)*cell_sobolev_length2(cell,level,NULL); facLLS = exp(-tauLLS); #else facLLS = 1.0; #endif /* RT_ADD_EXTERNAL_LLS */ for(field=0; field<rt_num_near_fields_per_freq; field++) { sw[iomp][field] += facLLS*cell_var(cell,rt_field_offset+rt_num_freqs*field+freq)*abc[1][i]*cell_volume[level]/num_root_cells; } for(field=rt_num_near_fields_per_freq; field<rt_num_fields_per_freq; field++) { sw[iomp][field] += cell_var(cell,rt_field_offset+rt_num_freqs*field+freq)*abc[1][i]*cell_volume[level]/num_root_cells; } s1 += abc[1][i]*cell_volume[level]/num_root_cells; } #ifdef _OPENMP for(i=1; i<nomp; i++) { for(field=0; field<rt_num_fields_per_freq; field++) { sw[0][field] += sw[i][field]; } } #endif rtGlobalValueUpdate(&rtAvgAC[freq],level,s1); for(field=0; field<rt_num_fields_per_freq; field++) rtGlobalValueUpdate(&rtAvgACxRF[rt_num_freqs*field+freq],level,sw[0][field]); } cart_free(abc[0]); #if (RT_CFI == 1) cart_free(abc[1]); #endif cart_free(level_cells); } end_time(WORK_TIMER); for(field=0; field<rt_num_fields; field++) { rtGlobalValueCommunicate(&rtAvgRF[field],MPI_SUM,level_com); rtGlobalValueCommunicate(&rtAvgACxRF[field],MPI_SUM,level_com); } for(freq=0; freq<rt_num_freqs; freq++) { rtGlobalValueCommunicate(&rtMaxAC[freq],MPI_MAX,level_com); rtGlobalValueCommunicate(&rtAvgAC[freq],MPI_SUM,level_com); } start_time(WORK_TIMER); /* // Weighted average */ for(freq=0; freq<rt_num_freqs; freq++) { float wACxRF = 0.0; float wRF = 0.0; for(field=0; field<rt_num_fields_per_freq-1; field++) { wACxRF += rtAvgACxRF[rt_num_freqs*field+freq].Value; wRF += rtAvgRF[rt_num_freqs*field+freq].Value; } if(wRF > 1.0e-35) { frtAbcLoc[freq] = wACxRF/wRF; } else { frtAbcLoc[freq] = rtAvgAC[freq].Value; } cart_assert(field == rt_num_fields_per_freq-1); if(rtAvgRF[rt_num_freqs*field+freq].Value > 1.0e-35) { frtAbcUni[freq] = rtAvgACxRF[rt_num_freqs*field+freq].Value/rtAvgRF[rt_num_freqs*field+freq].Value; } else { frtAbcUni[freq] = rtAvgAC[freq].Value; } frtAbcAvg[freq] = rtAvgAC[freq].Value; } end_time(WORK_TIMER); #ifdef RT_OUTPUT for(freq=0; freq<rt_num_freqs; freq++) { cart_debug("RT: Abc[%d] loc=%10.3e, uni=%10.3e, avg=%10.3le, max=%10.3le",freq,frtAbcLoc[freq],frtAbcUni[freq],rtAvgAC[freq].Value,rtMaxAC[freq].Value); } for(field=0; field<rt_num_fields; field++) { cart_debug("RT: field=%d: <rf>=%10.3e, <abc>=%10.3e",field,rtAvgRF[field].Value,(rtAvgRF[field].Value>0.0)?rtAvgACxRF[field].Value/rtAvgRF[field].Value:0.0); } #endif /* RT_OUTPUT */ /* // Maintain the unit average of the far field - should be called // by all run tasks only, to ensure the buffer consistency. 
*/ if(top_level == min_level) for(level=top_level; level<=bottom_level; level++) { select_level(level,CELL_TYPE_ANY,&num_level_cells,&level_cells); #pragma omp parallel for default(none), private(i,freq), shared(num_level_cells,level_cells,cell_vars,rtAvgRF) for(i=0; i<num_level_cells; i++) { for(freq=0; freq<rt_num_freqs; freq++) if(rtAvgRF[rt_far_freq_offset+freq].Value > 0.0) { cell_var(level_cells[i],rt_far_field_offset+freq) /= rtAvgRF[rt_far_freq_offset+freq].Value; } } cart_free(level_cells); for(freq=0; freq<rt_num_freqs; freq++) if(rtAvgRF[rt_far_freq_offset+freq].Value > 0.0) { rtAvgRF[rt_far_freq_offset+freq].buffer[i] /= rtAvgRF[rt_far_freq_offset+freq].Value; rtAvgACxRF[rt_far_freq_offset+freq].buffer[i] /= rtAvgRF[rt_far_freq_offset+freq].Value; } } #ifdef RT_SINGLE_SOURCE start_time(WORK_TIMER); cell = cell_find_position(rtSingleSourcePos); if(cell>-1 && cell_is_local(cell)) { level = cell_level(cell); } else { level = -1; } end_time(WORK_TIMER); start_time(COMMUNICATION_TIMER); /* // NG: I don't know why, but Bcast blocks here, hence using Allreduce */ MPI_Allreduce(&level,&rtSingleSourceLevel,1,MPI_INT,MPI_MAX,level_com); end_time(COMMUNICATION_TIMER); #endif /* RT_SINGLE_SOURCE */ }