/*
 * Times a two-phase, OpenMP-parallel sort of the global array `v`.
 *
 * Phase 1 cuts `v` into maxThreads chunks of ITEMS / maxThreads elements and
 * quick-sorts every chunk in parallel.  Phase 2 interleaves the chunks into
 * the global array `sorted` (element i of every chunk forms one group of
 * maxThreads neighbours) and quick-sorts every group in parallel.
 *
 * MPI is only used here for MPI_Wtime(); it is initialised on entry and
 * finalised before returning.
 *
 * NOTE(review): ITEMS / maxThreads looks like an integer division, so when
 * ITEMS is not a multiple of the thread count the trailing elements are
 * neither sorted nor copied into `sorted` -- confirm ITEMS is chosen
 * accordingly.
 *
 * Returns 0 on completion.
 */
int main(int argc, char **argv)
{
	MPI_Init(&argc, &argv);
	double start_time, end_time;

	// Enable nested parallelism before any parallel region is opened.
	omp_set_nested(1);

	int maxThreads = omp_get_max_threads();

	printf("Available threads: %d\n", maxThreads);

	// Initialize the array.
	InitArray();

	// Print data if in debug mode.
	if (DEBUG)
	{
		printf("===== BEFORE QUICK SORT (SEQ) =====\n\n");
		PrintArray();
		printf("===================================\n\n\n");
	}

	// Start timer.
	start_time = MPI_Wtime();

	// Phase 1: one contiguous chunk per available thread.
	int subArraySize = ITEMS / maxThreads;

	int i;
	#pragma omp parallel for
	for (i = 0; i < maxThreads; i++)
	{
		int first = i * subArraySize;

		QuickSort(v, first, first + subArraySize - 1);
	}

	// Phase 2: gather element i of every chunk into group i of `sorted` ...
	int j;
	for (i = 0; i < subArraySize; i++)
	{
		for (j = 0; j < maxThreads; j++)
		{
			sorted[maxThreads * i + j] = v[subArraySize * j + i];
		}
	}
	// ... and sort every group of maxThreads elements independently.
	#pragma omp parallel for
	for (i = 0; i < subArraySize; i++)
	{
		QuickSort(sorted, i * maxThreads, i * maxThreads + maxThreads - 1);
	}

	// Stop timer.
	end_time = MPI_Wtime();

	// Print data if in debug mode, otherwise only the two extremes.
	if (DEBUG)
	{
		printf("===== AFTER QUICK SORT (SEQ) ======\n\n");
		PrintArray();
		printf("===================================\n\n");
	}
	else
	{
		printf("Lowest: %d\n", sorted[0]);
		printf("Highest: %d\n", sorted[ITEMS - 1]);
	}

	double time_taken = (end_time - start_time);
	printf("Execution time: %fs\n", time_taken);
	CleanMemory();

	// Every MPI_Init must be matched by MPI_Finalize before the process exits.
	MPI_Finalize();
	return 0;
}
Exemple #2
0
/*
 * For every reference sequence (indices 1..rdmaps->referenceCount), walks the
 * sequence's roadmap annotations together with its chain of internal
 * pre-nodes and registers the corresponding pre-markers in preGraph.
 *
 * At each step the loop either adds a pre-marker on the next internal
 * pre-node (addPreMarker_pg) or reconnects the current annotation
 * (reConnectAnnotation), depending on which of the two comes first.
 *
 * Annotation cursor:
 *  - without OpenMP, `annot` is a single cursor declared outside the loop and
 *    advanced with getNextAnnotation() across all sequences in order;
 *  - with OpenMP, every iteration fetches its own starting annotation through
 *    the file-level annotationOffset[] table, and calls to addPreMarker_pg()
 *    are wrapped in lockNode()/unLockNode() on the target pre-node.
 *
 * rdmaps   - road maps and annotations of the sequences.
 * preGraph - pre-graph receiving the markers.
 * chains   - indexed by sequence ID; chains[sequenceIndex] - 1 seeds the
 *            first chooseNextInternalPreNode() call for that sequence.
 */
static void createPreMarkers(RoadMapArray * rdmaps, PreGraph * preGraph,
			    IDnum * chains)
{
	IDnum sequenceIndex;
	IDnum referenceCount = rdmaps->referenceCount;
#ifndef _OPENMP
	/* Sequential build: one cursor shared by all loop iterations. */
	Annotation *annot = rdmaps->annotations;
#endif

#ifdef _OPENMP
	/* Cap the team size at 8 threads. */
	int threads = omp_get_max_threads();
	if (threads > 8)
		threads = 8;

	#pragma omp parallel for num_threads(threads)
#endif
	for (sequenceIndex = 1;
	     sequenceIndex <= referenceCount;
	     sequenceIndex++) {
#ifdef _OPENMP
		/* Parallel build: per-iteration cursor located via annotationOffset[]. */
		Annotation *annot = getAnnotationInArray(rdmaps->annotations, annotationOffset[sequenceIndex - 1]);
#endif
		RoadMap *rdmap;
		Coordinate currentPosition, currentInternalPosition;
		IDnum currentPreNodeID, nextInternalPreNodeID;
		IDnum annotIndex, lastAnnotIndex;
		PreMarker * previous;

		/* Progress message every million sequences. */
		if (sequenceIndex % 1000000 == 0)
			velvetLog("Connecting %li / %li\n", (long) sequenceIndex,
			       (long) sequenceCount_pg(preGraph));

		rdmap = getRoadMapInArray(rdmaps, sequenceIndex - 1);
		annotIndex = 0;
		lastAnnotIndex = getAnnotationCount(rdmap);
		nextInternalPreNodeID = chooseNextInternalPreNode
		    (chains[sequenceIndex] - 1, sequenceIndex,
		     preGraph, chains);

		previous = NULL;
		currentPosition = 0;
		currentInternalPosition = 0;
		currentPreNodeID = 0;
		// Iterate until both the annotations and the internal pre-nodes are exhausted
		while (annotIndex < lastAnnotIndex
		       || nextInternalPreNodeID != 0) {
			/*
			 * Take the internal pre-node when no annotation is left, or
			 * when the internal position is still before the position of
			 * the current annotation.
			 */
			if (annotIndex == lastAnnotIndex
			    || (nextInternalPreNodeID != 0
				&& currentInternalPosition <
				getPosition(annot))) {
#ifdef _OPENMP
				lockNode(nextInternalPreNodeID);
#endif
				previous = addPreMarker_pg(preGraph, 
						nextInternalPreNodeID,
						sequenceIndex,
						&currentPosition,
						previous);
#ifdef _OPENMP
				unLockNode(nextInternalPreNodeID);
#endif
				/* Step to the following internal pre-node and account for the length just covered. */
				currentPreNodeID = nextInternalPreNodeID;
				nextInternalPreNodeID =
				    chooseNextInternalPreNode
				    (currentPreNodeID, sequenceIndex,
				     preGraph, chains);
				currentInternalPosition +=
				    getPreNodeLength_pg(currentPreNodeID,
							preGraph);

			} else {
				/* Otherwise consume the current annotation and advance the cursor. */
				reConnectAnnotation(&currentPreNodeID, annot,
						  &currentPosition,
						  sequenceIndex, 
						  preGraph,
						  &previous);
				annot = getNextAnnotation(annot);
				annotIndex++;
			}
		}
	}
}
// Reports the thread count returned by omp_get_max_threads().
int colvarproxy_lammps::smp_num_threads()
{
  const int max_threads = omp_get_max_threads();
  return max_threads;
}
/*
 * Matrix-vector product y = A * x for a matrix stored column by column:
 * for every column i, the entries ptr[i] .. ptr[i+1]-1 are scattered into
 * y[index[j]] scaled by x[i].
 *
 * A - matrix; A->n rows are written to y, A->np columns are traversed.
 * x - input vector (read at indices 0..np-1).
 * y - output vector (fully overwritten at indices 0..n-1).
 *
 * Three code paths:
 *  - A->is_splited: y starts as D*x (A->D used as a diagonal) and the
 *    A->L and A->U parts are then scattered in sequentially
 *    (presumably the strictly lower / upper parts -- confirm).
 *  - OpenMP build: each thread scatters into its own np-long slice of the
 *    scratch buffer w, and the slices are then summed into y.
 *  - otherwise: y is zeroed and the columns are scattered sequentially.
 *
 * NOTE(review): the result of lis_malloc() is used without a NULL check.
 */
void lis_matvec_ccs(LIS_MATRIX A, LIS_SCALAR x[], LIS_SCALAR y[])
{
	LIS_INT i,j,js,je,jj;
	LIS_INT n,np;
	LIS_SCALAR t;
	#ifdef _OPENMP
		LIS_INT k,nprocs;
		LIS_SCALAR *w;
	#endif

	n    = A->n;
	np   = A->np;
	if( A->is_splited )
	{
		/* Diagonal contribution initialises y. */
		for(i=0; i<n; i++)
		{
			y[i] = A->D->value[i]*x[i];
		}
		/* Column-wise scatter of the L and U parts (sequential: columns share rows of y). */
		for(i=0; i<np; i++)
		{
			js = A->L->ptr[i];
			je = A->L->ptr[i+1];
			t = x[i];
			for(j=js;j<je;j++)
			{
				jj  = A->L->index[j];
				y[jj] += A->L->value[j] * t;
			}
			js = A->U->ptr[i];
			je = A->U->ptr[i+1];
			t = x[i];
			for(j=js;j<je;j++)
			{
				jj  = A->U->index[j];
				y[jj] += A->U->value[j] * t;
			}
		}
	}
	else
	{
		#ifdef _OPENMP
			/* One np-long accumulation slice per possible thread. */
			nprocs = omp_get_max_threads();
			w = (LIS_SCALAR *)lis_malloc( nprocs*np*sizeof(LIS_SCALAR),"lis_matvec_ccs::w" );
			#pragma omp parallel private(i,j,js,je,t,jj,k)
			{
				k = omp_get_thread_num();
				/* Zero all slices. */
				#pragma omp for
				for(j=0;j<nprocs;j++)
				{
					memset( &w[j*np], 0, np*sizeof(LIS_SCALAR) );
				}
				/* Each thread scatters its share of the columns into slice k. */
				#pragma omp for 
				for(i=0; i<np; i++)
				{
					js = A->ptr[i];
					je = A->ptr[i+1];
					t = x[i];
					for(j=js;j<je;j++)
					{
						jj  = k*np+A->index[j];
						w[jj] += A->value[j] * t;
					}
				}
				/* Reduce the per-thread slices row by row into y. */
				#pragma omp for 
				for(i=0;i<n;i++)
				{
					t = 0.0;
					for(j=0;j<nprocs;j++)
					{
						t += w[j*np+i];
					}
					y[i] = t;
				}
			}
			lis_free(w);
		#else
			/* Sequential fallback: clear y, then scatter column by column. */
			for(i=0; i<n; i++)
			{
				y[i] = 0.0;
			}
			for(i=0; i<np; i++)
			{
				js = A->ptr[i];
				je = A->ptr[i+1];
				t = x[i];
				for(j=js;j<je;j++)
				{
					jj  = A->index[j];
					y[jj] += A->value[j] * t;
				}
			}
		#endif
	}
}
//MAIN
//
// Program entry point. In order, it:
//   1. initialises MPI and records the rank / number of nodes in MPIBasic,
//   2. initialises threaded FFTW with omp_get_max_threads() threads,
//   3. reads the command-line options (nconfs, o, N, beta, SEED and, when
//      IC_FLAG==LOAD_FLAG, i, iT, iID) through a Konfig object,
//   4. calls Simulation::Init() and then Simulation::Run() once per
//      requested configuration with a seed broadcast from rank 0,
//   5. synchronises, finalises MPI and exits with status 0.
//
// NOTE(review): each Getval() call is preceded by a default assignment, so
// Getval() presumably leaves its argument untouched when the option is
// absent -- confirm against Konfig.
int main(int argc,char **argv){
    
    ///////////////////////////////   
	//INITIALIZE MPI ENVIRONMENT //
    ///////////////////////////////
    
	MPI_Init(&argc, &argv);
	MPI_Barrier(MPI_COMM_WORLD);
    
    //SET MPI ID's AND NUMBER OF NODES
    MPI_Comm_rank(MPI_COMM_WORLD,&MPIBasic::ID);
    MPI_Comm_size(MPI_COMM_WORLD,&MPIBasic::NumberOfNodes);
    
    
    ////////////////////////////////////
    // INITIALIZE OPEN MP ENVIRONMENT //
    ////////////////////////////////////
    
    //CHECK THREAD COUNT
    std::cerr << "#NUMBER OF THREADS " << omp_get_max_threads() << std::endl;
    
    //INITIALIZE THREADED FFTW
    int FFTW3_THREAD_STATUS=fftw_init_threads();
    
    std::cerr << "#FFTW THREAD STATUS " << FFTW3_THREAD_STATUS << std::endl;
    
    //ONLY REQUEST MULTI-THREADED PLANS WHEN THE THREAD INITIALIZATION REPORTED STATUS 1
    if(FFTW3_THREAD_STATUS==1){
        
        fftw_plan_with_nthreads(omp_get_max_threads());
        
    }
    
    //////////////////////////////////
    //PROCESS COMMANDLINE ARGUMENTS //
    //////////////////////////////////
    
    //DEFAULT: A SINGLE CONFIGURATION
    INT NumberOfConfigurations=1;
    
	Konfig arguments(argc,argv);
	
    //GET NUMBER OF CONFIGURATIONS
    arguments.Getval("nconfs",NumberOfConfigurations);
    
    //////////////////////////
    // SET OUTPUT DIRECTORY //
    //////////////////////////
    
    char OutDir[256]="OUTPUT";
    
    arguments.Getval("o",OutDir);
    
    IO::SetOutputDirectory(OutDir);
    
    #if IC_FLAG==LOAD_FLAG

        /////////////////////////
        // SET INPUT DIRECTORY //
        /////////////////////////
    
        char InDir[256]="INPUT";
    
        arguments.Getval("i",InDir);
    
        IO::SetInputDirectory(InDir);
    
        /////////////////////
        // SET INPUT FILES //
        /////////////////////
    
        // FOR LOADING FILES: DEFAULT TIME AND ID OF THE INPUT FILE
        INT InputFileTime=0;
        INT InputFileID=1457712671;
    
        arguments.Getval("iT",InputFileTime);
        arguments.Getval("iID",InputFileID);
    
        IO::SetInputFile(InputFileTime,InputFileID);
    
    #endif
    
    
    ////////////////////////////
    // DETERMINE LATTICE SIZE //
    ////////////////////////////
    
    //NEGATIVE SENTINEL: KEEP THE CURRENT Lattice::N VALUES UNLESS "N" IS GIVEN
    INT NSites=-1;
    
    arguments.Getval("N",NSites);
    
    if(NSites>0){
        
        //CUBIC LATTICE: SAME EXTENT IN ALL THREE DIRECTIONS
        Lattice::N[0]=NSites;
        Lattice::N[1]=NSites;
        Lattice::N[2]=NSites;
        
        Lattice::Volume=NSites*NSites*NSites;
        
        std::cerr << "## LATTICE SIZE IS " << Lattice::N[0] << "x" << Lattice::N[1] << "x" << Lattice::N[2] << std::endl;

    }
    else{
        std::cerr << "## NUMBER OF SITES NOT SPECIFIED -- USING " << Lattice::N[0] << "x" << Lattice::N[1] << "x" << Lattice::N[2] << std::endl;
    }
    
    ///////////////////////////////
    // GET SIMULATION PARAMETERS //
    ///////////////////////////////
    
    //NEGATIVE SENTINEL: LangevinDynamics::beta IS ONLY OVERWRITTEN FOR A POSITIVE VALUE
    DOUBLE InvTemp=-1;

    arguments.Getval("beta",InvTemp);
    
    if(InvTemp>0.0){
        
        LangevinDynamics::beta=InvTemp;

        std::cerr << "#beta=" << LangevinDynamics::beta << std::endl;
        
    }
    
    //////////////
    // SIMULATE //
    //////////////
    
    
    //COMMAND LINE OUTPUT
    std::cerr << "#GAUGE GROUP IS SU(" << Nc << ")" << std::endl;
    
    std::cerr << "#PRECISION IS " << MAX_DIGITS_PRECISION << " DIGITS" << std::endl;
    
    //INITIALIZE SIMULATION
    Simulation::Init();
    
    // SAMPLE DIFFERENT CONFIGURATIONS //
    for(INT n=0;n<NumberOfConfigurations;n++){
        
        //SET GLOBAL RANDOM NUMBER SEED//
        //ONLY RANK 0 ASSIGNS IT HERE; THE OTHER RANKS RECEIVE IT VIA THE BROADCAST BELOW
        INT GLOBAL_RNG_SEED;
        
        if(MPIBasic::ID==0){
            
            //DEFAULT SEED IS THE CURRENT TIME, OPTIONALLY REPLACED BY THE "SEED" OPTION
            //NOTE(review): time(0) HAS ONE-SECOND RESOLUTION, AND A USER-SUPPLIED SEED IS REUSED FOR EVERY CONFIGURATION -- CONFIRM THIS IS INTENDED
            GLOBAL_RNG_SEED=time(0);
            
            arguments.Getval("SEED",GLOBAL_RNG_SEED);
        }
        
        // BROADCAST GLOBAL RANDOM SEED //
        //NOTE(review): MPI_INT ASSUMES THE INT TYPEDEF IS A PLAIN int -- CONFIRM
        MPI_Bcast(&GLOBAL_RNG_SEED, 1, MPI_INT,0,MPI_COMM_WORLD);
        
        // PERFORM CLASSICAL STATISTICAL SIMULATION //
        //THE RANK IS ADDED TO THE SEED SO THAT EVERY NODE USES A DIFFERENT VALUE
        Simulation::Run(GLOBAL_RNG_SEED+MPIBasic::ID);
        
        // COMMANDLINE NOTIFICATION //
        std::cerr << "#COMPLETED " << GLOBAL_RNG_SEED+MPIBasic::ID << std::endl;
        
    }
    
    
    //SYNCHRONIZE ALL MPI NODES
    MPI_Barrier(MPI_COMM_WORLD);
	
    //FINALIZE MPI
	MPI_Finalize();
    
    //EXIT
    exit(0);
    
}
Exemple #6
0
/*
 * Thread-affinity test driver.
 *
 * Initialises MPI, allocates the per-thread bookkeeping arrays (core, status,
 * tidarr, fp -- all file-level globals sized by omp_get_max_threads()),
 * optionally pins the threads, and then loops forever: every pass runs
 * threaded_loop(), shifts the affinity by one rank when pinning is active,
 * and prints the resulting affinity and statuses.
 *
 * Options:
 *   -h        print usage and exit
 *   -c        pin threads to a single core
 *   -r        pin threads to a range of cores
 *   -n <num>  cores per node (default 1)
 *   -w <num>  seconds between forced shifts
 *
 * Returns 0 after -h, 1 on an unknown option or allocation failure; otherwise
 * it never returns (the monitoring loop has no exit condition).
 */
int main (int argc, char **argv)
{
  int ret;
  int c;
  int pin2core = 0;    // 1=> pin threads to single core
  int pin2range = 0;   // 1=> pin threads to range of cores
  int cpn = 1;         // default cores per node

  ret = MPI_Init (&argc, &argv);
  ret = MPI_Comm_rank (MPI_COMM_WORLD, &iam);
  shiftiam = iam;
  ret = MPI_Comm_size (MPI_COMM_WORLD, &nranks);

  pid = getpid ();
  nthreads = omp_get_max_threads ();
  core   = malloc (nthreads * sizeof (int));
  status = malloc (nthreads * sizeof (char));
  tidarr = malloc (nthreads * sizeof (int));
  fp     = malloc (nthreads * sizeof (FILE *));
  if (core == NULL || status == NULL || tidarr == NULL || fp == NULL) {
    // The arrays are dereferenced immediately below, so fail loudly here.
    fprintf (stderr, "rank %d: could not allocate per-thread arrays for %d threads\n",
	     iam, nthreads);
    MPI_Abort (MPI_COMM_WORLD, 1);
    return 1;
  }
  init_core (nthreads);

  while ((c = getopt (argc, argv, "hcrn:w:")) != -1) {
    switch (c) {
    case 'h':
      if (iam == 0) {
	printf ("Usage: %s [-c] to pin to single core\n"
		"          [-r] to pin to range of cores\n"
		"          [-n <num>] number of cores per node\n"
		"          [-w <num>] number of seconds between forced shifts\n", argv[0]);
      }
      // MPI was initialised above, so it must be finalised on every exit path.
      MPI_Finalize ();
      return 0;
    case 'c':
      pin2core = 1;
      break;
    case 'r':
      pin2range = 1;
      break;
    case 'n':
      cpn = atoi (optarg);
      break;
    case 'w':
      shiftintvl = atoi (optarg);
      break;
    default:
      printf ("unknown option %c\n", c);
      MPI_Finalize ();
      return 1;
    }
  }

  if (pin2core) {
    if (iam == 0) {
      printf ("Pinning threads to individual cores\n");
    }
    ret = set_affinity_ (&iam, &cpn, &nthreads, &pin2core);
  } else if (pin2range) {
    if (iam == 0) {
      printf ("Pinning threads to subsetted range of cores\n");
    }
    // pin2core is 0 on this path; presumably that is how set_affinity_
    // distinguishes range pinning from single-core pinning -- confirm.
    ret = set_affinity_ (&iam, &cpn, &nthreads, &pin2core);
  } else {
    if (iam == 0) {
      printf ("No pinning\n");
    }
  }

  ret = print_affinity_ (&iam);
  fill_tid_fp ();

  while (1) {
    // Loop for some time (default 10 seconds), printing any core attachment changes
    threaded_loop ();
    // Change affinity to guarantee all is working as expected
    if (pin2core || pin2range) {
      shiftiam = (shiftiam + 1) % nranks;
      printf ("shiftiam=%d\n", shiftiam);
      ret = set_affinity_ (&shiftiam, &cpn, &nthreads, &pin2core);
    }
    ret = print_affinity_ (&iam);
    print_all_statuses ();
  }
}
Exemple #7
0
/*
 * place_halos():
 *
 * Places Nend halos, one after the other, on top of particles of a
 * simulation box that has been binned into Nlin^3 cells.
 *
 * For every halo a cell is drawn with select_cell() from the cumulative
 * probability built by ComputeCumulative() (or with select_heaviest_cell()
 * when RANKED is defined), a particle of that cell is drawn with
 * select_part_beta_0(), and the halo takes the position (and optionally the
 * velocity scaled by fvel) of that particle.  Unless NO_EXCLUSION is
 * defined, the candidate is rejected when check_HaloR_in_mesh() reports an
 * overlap with a previously placed halo.  After MAXTRIALS rejections the
 * cell is emptied, the probabilities are recomputed and another cell is
 * drawn.
 *
 * Input:
 *   Nend                   number of halos to place
 *   HaloMass               halo masses, assumed ordered from the most to the
 *                          least massive
 *   Nlin                   number of cells per box side
 *   NTotPart               total number of particles
 *   PartX, PartY, PartZ    particle positions
 *   PartVX, PartVY, PartVZ particle velocities (read only when HaloVX != NULL)
 *   L                      box side length
 *   rho_ref                reference density handed to R_from_mass()
 *   seed                   seed for srand(); a negative value selects the
 *                          current time instead
 *   mp                     particle mass
 *   nthreads               value stored in NTHREADS; values < 1 select
 *                          omp_get_max_threads()
 *   alpha, fvel, Malpha    tables of Nalpha entries: exponent and velocity
 *                          factor used for halo masses above Malpha[i]
 *   Nalpha                 length of the three tables above
 *   recalc_frac            the probabilities are recomputed once the
 *                          accumulated probability of re-picking an already
 *                          used cell reaches this value
 *   ListOfPart             per-cell particle lists (used by select_part_beta_0)
 *   NPartPerCell           number of particles per cell
 *
 * Output:
 *   HaloX, HaloY, HaloZ    halo positions
 *   HaloVX, HaloVY, HaloVZ halo velocities (skipped when HaloVX == NULL)
 *   HaloR                  halo radii
 *
 * Returns 0.  Terminates the process with a non-zero status when an
 * allocation fails or when no Malpha entry is low enough for a halo mass.
 */
int place_halos(long Nend, float *HaloMass, long Nlin, long NTotPart, float *PartX, 
		float *PartY, float *PartZ, float *PartVX, float *PartVY, float *PartVZ,
		float L, float rho_ref, long seed, float mp, int nthreads, double *alpha, double *fvel, double *Malpha,
		long Nalpha,float recalc_frac, float *HaloX, float *HaloY, float *HaloZ, float *HaloVX,
		float *HaloVY, float *HaloVZ,float *HaloR,long **ListOfPart, 
		long *NPartPerCell){


fprintf(stderr,"\tThis is place_halos.c\n");

//Initialising -------------------------------------------------
	long i,j,k,lin_ijk, Nmin;
	long trials;
	long ihalo, ipart,i_alpha;
	double invL = 1./L;
	float Mcell,Mhalo,Mchange; 
	float R;
	time_t t0,tI,tII;
	int check;

	double mpart,fvel_i;
	double exponent;
	double TotProb;
	double prob_repicked = 0.0;
	double *MassLeft;
	double *CumulativeProb; 
	long *ListOfHalos,  *NHalosPerCellStart, *NHalosPerCellEnd;
	long Nhalos;
	int recalc;



	float diff;
	time_t t5;
	#ifdef VERB
	time_t t1,t3,t4,t4_5;
	#endif

	long n_recalc =0;
	int use_vel=1;
	
	//Velocities are optional: a NULL HaloVX disables them
	if (HaloVX==NULL)
		use_vel=0;


	NCells = Nlin;
	Lbox = L;
	
	t0=time(NULL);
	NTotCells = NCells*NCells*NCells;

	
	//Allocate memory for the arrays 
	MassLeft = (double *) calloc(NTotCells,sizeof(double));
  	if(MassLeft == NULL) {
    		fprintf(stderr,"\tplace_halos(): could not allocate %ld array for MassLeft[]\nABORTING",NTotCells);
    		exit(-1);
	}	
	NHalosPerCellStart = (long *) calloc(NTotCells,sizeof(long));
  	if(NHalosPerCellStart == NULL) {
    		fprintf(stderr,"\tplace_halos(): could not allocate %ld array for NHalosPerCell[]\nABORTING",NTotCells);
    		exit(-1);
	}

  	NHalosPerCellEnd = (long *) calloc(NTotCells,sizeof(long));
  	if(NHalosPerCellEnd == NULL) {
    		fprintf(stderr,"\tplace_halos(): could not allocate %ld array for NHalosPerCell[]\nABORTING",NTotCells);
    		exit(-1);
	}

	CumulativeProb = (double *) calloc(NTotCells, sizeof(double));
  	if(CumulativeProb == NULL) {
    		fprintf(stderr,"\tplace_halos(): could not allocate %ld array for CumulativeProb[]\nABORTING",NTotCells);
    		exit(-1);
	}

  	if (nthreads<1){
  		NTHREADS = omp_get_max_threads();
  	}else{
  		NTHREADS = nthreads;
  	}
	#ifdef NO_EXCLUSION
	//One flag per particle: set once the particle hosts a halo
	int *already_chosen;
	already_chosen = (int*) calloc(NTotPart,sizeof(int));
 	if(already_chosen == NULL) {
                fprintf(stderr,"\tplace_halos(): could not allocate %ld array for already_chosen[]\nABORTING",NTotPart);
                exit(-1);
        }
	#endif

        //Initialise random numbers
	#ifdef VERB
        fprintf(stderr,"\tinput seed: %ld.    time0: %f.",seed, (float) t0);
	#endif
	if (seed>=0){
		srand(seed);
		#ifdef VERB
			fprintf(stderr,"\tUsed: %ld \n",seed);
		#endif
	}
	else {
		srand(t0);
#ifdef VERB
		//time_t is not guaranteed to be long: cast to match %ld
		fprintf(stderr,"\tSeed Used: %ld \n",(long) t0);
#endif
	}


	mpart = (double) mp;
	//Minimum number of particles of the lightest halo (with a 10% margin)
	Nmin = (long)ceil(HaloMass[Nend-1]*0.9/mpart);
	lcell = (float) L/NCells;

	#ifdef VERB
	fprintf(stderr,"\n\tParticles and Halos placed in %ld^3 cells\n",NCells);
	fprintf(stderr,"\tBOX = %f  lcell =%f   rho_ref = %e  invL %f\n",L,L/NCells,rho_ref,invL);
	fprintf(stderr,"\tNhalostart = %d,Nhalosend = %ld,  NPart = %ld\n",0, Nend, NTotPart);
	fprintf(stderr,"\n\tMinimmum mass= %e. Minimum part per halo = %ld. mpart %e\n",HaloMass[Nend-1],Nmin,mpart);
	#endif
	

	#ifdef DEBUG
	fprintf(stderr,"\n\tRAND_MAX=%d\n",RAND_MAX);
	fprintf(stderr,"\tX[0] = %f Y[0] = %f Z[0] = %f\n",PartX[0],PartY[0],PartZ[0]);
	fprintf(stderr,"\tX[1] = %f Y[1] = %f Z[1] = %f\n",PartX[1],PartY[1],PartZ[1]);
	fprintf(stderr,"\tM[0] = %e \n",HaloMass[0]);
	fprintf(stderr,"\tM[1] = %e \n",HaloMass[1]);
	fprintf(stderr,"\t    ... \n");
	fprintf(stderr,"\tM[%ld] = %e \n",Nend-1,HaloMass[Nend-1]);
	fprintf(stderr,"\tX[%ld] = %f Y[%ld] = %f Z[%ld] = %f\n",Nend-1,PartX[Nend-1],Nend-1,PartY[Nend-1],Nend-1,PartZ[Nend-1]);
	#endif	
	
	//Radius of the biggest halo in units of the cell size
	int r = (int) (R_from_mass(HaloMass[0],rho_ref)/(L/NCells));
	if (L/NCells<R_from_mass(HaloMass[0],rho_ref)){
		fprintf(stderr,"WARNING: cell size is smaller than the radius of the biggest halo. Using r=%i. This may be problematic\n",r);
	}

#ifdef VERB
	fprintf(stderr,"\tR_max=%f, lcell=%f, r=%d\n",R_from_mass(HaloMass[0],rho_ref),(L/NCells),r);
	t1=time(NULL);
 	diff = difftime(t1,t0);
	fprintf(stderr,"\ttime of initialisation %f\n",diff);
#endif
// ------------------------------------------------- Initialised

	//Reserve, for every cell, a slice of ListOfHalos big enough for
	//NPartPerCell/Nmin + 1 halos, and initialise the mass left in the cell
	Nhalos=0;
	for (i=0;i<NCells;i++){
	for (j=0;j<NCells;j++){
	for (k=0;k<NCells;k++){
		lin_ijk = k+j*NCells+i*NCells*NCells;
		NHalosPerCellStart[lin_ijk] = Nhalos;
		NHalosPerCellEnd[lin_ijk] = Nhalos;
		Nhalos += (long) floor(NPartPerCell[lin_ijk]/Nmin+1);
		MassLeft[lin_ijk] = (double) NPartPerCell[lin_ijk]*mpart;
#ifdef ULTRADEBUG
		if (lin_ijk<10 || lin_ijk > (NCells*NCells*NCells) - 10){
			fprintf(stderr,"\tAllocated %ld (longs) in ListOfPart(%ld=[%ld,%ld,%ld])\n",NPartPerCell[lin_ijk],lin_ijk,i,j,k);
		}
#endif		
	}	
	}
	}


	ListOfHalos = (long *) calloc(Nhalos,sizeof(long ));
	if(ListOfHalos == NULL) {
		fprintf(stderr,"\tplace_halos(): could not allocate %ld array for ListOfHalos[]\nABORTING",Nhalos);
		exit(-1);
	}

#ifdef VERB
//	fprintf(stderr,"\tAllocated %ld (longs) in ListOfHalos\n",Nhalos);
	t3=time(NULL);
 	diff = difftime(t3,t1);
	fprintf(stderr,"\t... memory allocated in %f\n",diff);
	fprintf(stderr,"\tComputing probabilities...\n");
#endif 

#ifdef DEBUG
        fprintf(stderr,"\tMass_cell[0]=%e",MassLeft[0]);
	fprintf(stderr,"\t Mass Function\n");
	//Never read past the end of HaloMass when fewer than 15 halos are given
	for (ihalo=0;ihalo<15 && ihalo<Nend;ihalo++){
		fprintf(stderr,"\thalo %ld: ",ihalo);
		fprintf(stderr,"M=%e\n",HaloMass[ihalo]);
	}
#endif

//----------------------------------- Particles and haloes assigned to grid



//Computing Cumulative Probability -----------------------------
	
	//find the right alpha
	Mhalo = HaloMass[0];
	i_alpha = 0;
	while(Mhalo<Malpha[i_alpha]) {
		i_alpha++;
		if (i_alpha==Nalpha){
			fprintf(stderr,"\tERROR: No M_alpha low enough found\n");
			fprintf(stderr,"\tERROR: N_alpha = %ld, Mh=%e, Ma= %e\n",Nalpha,Mhalo,Malpha[i_alpha-1]);
			//This is a failure: report it through the exit status
			exit(-1);
		}
	}	
	Mchange = Malpha[i_alpha];
	exponent = alpha[i_alpha];
	fvel_i = fvel[i_alpha];

	//compute the probability

#ifdef VERB
	fprintf(stderr,"\tUsing OMP with %d threads\n",NTHREADS);
	t4=time(NULL);
#endif
	TotProb = ComputeCumulative(exponent, mpart, MassLeft, CumulativeProb);	
#ifdef VERB
	fprintf(stderr,"\n\tcase 0, TotProb=%e\n",TotProb);
#endif


#ifdef VERB
        fprintf(stderr,"\tNumber of alphas: %ld\n",Nalpha);
        fprintf(stderr,"\tUsing alpha_%ld=%f for M>%e\n",i_alpha,exponent,Mchange);
	t4_5=time(NULL);
 	diff = difftime(t4_5,t4);
	fprintf(stderr,"\tprobabilty computed in %f secods\n",diff);
#endif
// ----------------------------------------- Computed Probability



//Actually placing the haloes----------------------------------- 
#ifdef VERB
	fprintf(stderr,"\n\tPlacing Halos...\n\n");
#endif

	//Place one by one all the haloes (assumed to be ordered from the most massive to the least massive)
	for (ihalo=0;ihalo<Nend;ihalo++){

		#ifdef DEBUG
		fprintf(stderr,"\n\t- Halo %ld ",ihalo);
		#endif
		#ifdef VERB
		//Progress report every 10% of the halos; guarded so that small
		//halo counts cannot lead to a division by zero
		if (Nend>=10 && ihalo>0 && ihalo%(Nend/10)==0){
			//TEMPORARY
			fprintf(stderr,"\t\tFRAC, TOTPROB: %e, %e",(pow(Mcell/mpart,exponent)/TotProb),TotProb);
			fprintf(stderr,"\t%ld%% done\n",(100*ihalo)/Nend);
		}
		#endif
		//Check whether or not, a change of alpha is needed for this halo mass 		
		Mhalo= HaloMass[ihalo];
		recalc = 0;
		while (Mhalo < Mchange){//if so search the right alpha, and recompute probabilities
			i_alpha++;		
			if (i_alpha==Nalpha){
				fprintf(stderr,"\tERROR: No M_alpha low enough found: %e <%e\n",Mhalo,Malpha[Nalpha-1]);
				//This is a failure: report it through the exit status
				exit(-1);
			}
			Mchange = Malpha[i_alpha];
			exponent = alpha[i_alpha];
			fvel_i = fvel[i_alpha];


		#ifdef VERB
        		fprintf(stderr,"\n\tUsing alpha_%ld=%f and fvel=%f for M>%e\n",i_alpha,exponent,fvel_i,Mchange);
		#endif
        	recalc = 1;
		}
		
		// recalc if different alpha, OR there's a significant chance of choosing the same cell again.
		if(ihalo>0){
		  if(prob_repicked>=recalc_frac){
			recalc = 1;
			n_recalc += 1;
			fprintf(stderr,"RECALCULATING: %ld, %e,    ihalo=%ld\n",n_recalc,prob_repicked,ihalo);

		  }
		}

		if (recalc==1){
			tI=time(NULL);
			fprintf(stderr,"\tcase 1, TotProb_bef=%e",TotProb);
			TotProb=ComputeCumulative(exponent, mpart, MassLeft, CumulativeProb);
			fprintf(stderr,"    TotProb_aft=%e       ihalo=%ld\n\n",TotProb,ihalo);

			prob_repicked=0.0;
#ifdef VERB
			tII=time(NULL);
			diff = difftime(tII,tI);
			fprintf(stderr,"\tProbabilty recomputed in %f secods\n",diff);
#endif

			recalc = 0;
		}


		do {	
		  //First, choose a cell	
		  #ifndef RANKED	
		  trials=0;
		  do{			
			if (trials==MAXTRIALS){
				fprintf(stderr,"MAXTRIALS=%d times picked an empty cell, recomputing Probs...\n",MAXTRIALS);
				fprintf(stderr,"\n\tcase 2, TotProb_bef=%e",TotProb);
				TotProb=ComputeCumulative(exponent, mpart, MassLeft, CumulativeProb);
				fprintf(stderr,"    TotProb_aft=%e       ihalo=%ld\n",TotProb,ihalo);
				prob_repicked = 0.0;
				trials=0;
				
			}
		  	lin_ijk = select_cell(TotProb, CumulativeProb);
			trials++;
		
		  }while (MassLeft[lin_ijk]==0.);



		  //Recover the 3D cell indices from the linear one
		  k=lin_ijk%(NCells);
		  j=((lin_ijk-k)/NCells)%NCells;
	  	  i=(lin_ijk-k-j*NCells)/(NCells*NCells);

		  #else //RANKED option: deprecated and not optimised
		  lin_ijk=select_heaviest_cell(&i,&j,&k,MassLeft);		  
		  #endif

		  trials=0;


		  //Second, choose a particle in that cell
		  do {

			ipart = select_part_beta_0(lin_ijk,ListOfPart, NPartPerCell);		
			if (ipart<0){
				fprintf(stderr,"WARNING: Picked up an completely empty cell (ihalo %ld) lin_ijk=%ld \n",ihalo,lin_ijk);
				MassLeft[lin_ijk]=0.;
				check=1;  //Choose another cell
				break;
			}

               		HaloX[ihalo] = PartX[ipart];
               		HaloY[ihalo] = PartY[ipart];
               		HaloZ[ihalo] = PartZ[ipart];
			#ifdef DEBUG
			fprintf(stderr,"HaloX=%f PartX=%f\n",HaloX[ihalo],PartX[ipart]);
			#endif

			if (use_vel==1){
               			HaloVX[ihalo] = fvel_i * PartVX[ipart];
               			HaloVY[ihalo] = fvel_i * PartVY[ipart];
               			HaloVZ[ihalo] = fvel_i * PartVZ[ipart];
			}
			R=R_from_mass(HaloMass[ihalo],rho_ref);
			HaloR[ihalo]= R;

			#ifdef NO_EXCLUSION
				//Reject the particle only when it already hosts a halo
			  	check = already_chosen[ipart];
				already_chosen[ipart]=1;
			#else
			//Third, check that is not overlapping a previous halo
			check = check_HaloR_in_mesh(ihalo,HaloX,HaloY,HaloZ,HaloR,i,j,k,ListOfHalos,NHalosPerCellStart,NHalosPerCellEnd,r);
			#endif
			

			if (check==1){
				#ifdef DEBUG
				fprintf(stderr,"Refused part : %ld\n",ipart);
				#endif
				trials++;
			}
			if (trials == MAXTRIALS){
				//in order to avoid infinite loop, we will exit this loop, after MAXTRIALS trials
				#ifdef VERB
				fprintf(stderr,"MAXTRIALS=%d reached, removing cell [%ld,%ld,%ld]\n",MAXTRIALS,i,j,k);
				#endif
				MassLeft[lin_ijk]=0.;
				fprintf(stderr,"\n\tcase 3, TotProb_bef=%e",TotProb);
				TotProb=ComputeCumulative(exponent, mpart, MassLeft, CumulativeProb);
				fprintf(stderr,"    TotProb_aft=%e       ihalo=%ld, R=%f\n",TotProb,ihalo,R);
				prob_repicked=0.0;
				trials=0;
				break;
			}
		  } while (check==1);//If the particle was excluded, try another one in the same cell

	        } while(check==1); //if reached MAXTRIALS, select another cell
		//Particle chosen!
		
		//mass in cell before assignment
                Mcell=MassLeft[lin_ijk];

		
		  #ifndef MASS_OF_PARTS 
		  //Remove the halo mass from the cell, never going below zero
                  if (Mcell>HaloMass[ihalo])
			MassLeft[lin_ijk] -= Mhalo; 
                  else
			MassLeft[lin_ijk] = 0.;
		  #else
			exclude(ipart,R,PartX,PartY,PartZ,i,j,k);
		  #endif

		//Accumulate the (stale) probability of drawing this cell again
		prob_repicked += pow(Mcell/mpart,exponent)/TotProb;


		#ifdef DEBUG
		fprintf(stderr,"\tAfter: Mcell=%e, CProbCell=%e, TotProb=%e.   , Mhalo=%e. CProb[last]=%e\n",MassLeft[lin_ijk],CumulativeProb[lin_ijk],TotProb,Mhalo,CumulativeProb[NTotCells-1]);
		#endif
		#ifdef DEBUG
		fprintf(stderr,"\thalo %ld assigned to particle %ld at [%f,%f,%f]. R= %f, M= %e\n",ihalo,ipart,HaloX[ihalo],HaloY[ihalo],HaloZ[ihalo],R,Mhalo);
		#endif
		#ifdef DEBUG
		fprintf(stderr,"HaloX=%f PartX=%f\n",HaloX[ihalo],PartX[ipart]);
		#endif

		//Register the halo in the slice of ListOfHalos owned by its cell
		ListOfHalos[NHalosPerCellEnd[lin_ijk]]=ihalo;
		NHalosPerCellEnd[lin_ijk]++;

	}//for(ihalo=Nstart:Nend)
//----------------------------------- Haloes Placed

	fprintf(stderr,"\t... placement Done!\n");
	fprintf(stderr,"\t\tTOTAL NUMBER OF RE-CALCULATIONS: %ld\n",n_recalc);

#ifdef VERB
	t5=time(NULL);
 	diff = difftime(t5,t4_5);
	fprintf(stderr,"\ttime placing %f\n",diff);
	fprintf(stderr,"\tfreeing...\n");
#endif

	free(NHalosPerCellStart);
	free(NHalosPerCellEnd);
        free(CumulativeProb);
	free(MassLeft);
        free(ListOfHalos);
#ifdef NO_EXCLUSION
	free(already_chosen);
#endif
#ifdef VERB
 	diff = difftime(t5,t0);
	fprintf(stderr,"\ttotal time in place_halos.c %f\n",diff);
	fprintf(stderr,"\tPlacement done!!!\n");
#endif

#ifdef MASS_OF_PARTS
//	free(excluded); free(Nexcluded);
#endif
	return 0;
}
/*
 * OpenMP conformance test "lastprivate 017": lastprivate applied to the
 * file-level array `prvt` on a worksharing loop.
 *
 * Inside the loop every iteration fills its private copy of prvt with i+j,
 * synchronises through barrier(thds), and verifies that the copy was not
 * disturbed and still has the size of the full array.  After the loop,
 * prvt must hold the values written by the sequentially last iteration
 * (i == thds-1).  Failures are counted in the file-level `errors`.
 *
 * The same checks are then repeated through func(), once from inside a
 * parallel region and once sequentially.
 *
 * Returns 0 when no error was counted, 1 otherwise.  Exits with status 0
 * without testing anything when only one thread is available.
 */
int main (void)
{
  int	i;


  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  /* Dynamic adjustment is disabled before any parallel region is opened. */
  omp_set_dynamic (0);


  #pragma omp parallel
  {
    int	j;

    #pragma omp for schedule(static,1) lastprivate (prvt)
    for (i=0; i<thds; i++) {
      for (j=0; j<ARRAYSIZ; j++) {
	prvt[j] = i+j;
      }
      /* Wait until the other iterations have written their copies too. */
      barrier (thds);
      for (j=0; j<ARRAYSIZ; j++) {
	if (prvt[j] != i+j) {
          #pragma omp critical
	  errors += 1;
	}
      }
      /* The private copy must still be a complete array. */
      if (sizeof(prvt) != sizeof(int)*ARRAYSIZ) {
        #pragma omp critical
	errors += 1;
      }
      /* Delay iteration 0 so that it finishes after the last iteration. */
      if (i==0) {
	waittime (1);
      }
      for (j=0; j<ARRAYSIZ; j++) {
	prvt[j] = i+j;
      }
    }

    /* lastprivate: prvt now holds the values of iteration thds-1. */
    for (j=0; j<ARRAYSIZ; j++) {
      if (prvt[j] != (thds-1)+j) {
        #pragma omp critical
	errors += 1;
      }
    }
  }


  #pragma omp parallel
  func (thds);


  func (1);


  if (errors == 0) {
    printf ("lastprivate 017 : SUCCESS\n");
    return 0;
  } else {
    printf ("lastprivate 017 : FAILED\n");
    return 1;
  }
}
Exemple #9
0
/*!*******************************************************************
 * \brief The main call
 * 
 * Builds one element per process: the vertical (z) extent is cut into
 * n_elements equal slabs, the process with index id takes the slab
 * [positions [id], positions [id + 1]], and the element is then run until
 * either the configured number of steps ("time.steps") or the configured
 * stop time ("time.stop") is reached. When rezoning is enabled and more
 * than one element exists, the element is rebuilt from a virtual file
 * whenever rezone_minimize_ts returns one.
 * 
 * \param argc The integer number of command line arguments
 * \param argv The character array of command line arguments
 * 
 * \return 0 on success, 1 if any exception reached this function
 *********************************************************************/
int main (int argc, char *argv[])
{
	int id = 0, n_elements = 1;

	// Initialize messenger
	mpi::messenger process_messenger (&argc, &argv);

	try {
		id = process_messenger.get_id ();
		n_elements = process_messenger.get_np ();
		
		io::parameters parameters = config (&argc, &argv, id);
		
		// Vertical points per element: an even share plus one, then bumped to the next odd number if even
		int m = parameters.get <int> ("grid.z.points") / n_elements + 1;
		m += (m - 1) % 2;

		// Equally spaced slab boundaries, centred on zero
		std::vector <double> positions (n_elements + 1);
		for (int i = 0; i < n_elements + 1; ++i) {
			positions [i] = -parameters.get <double> ("grid.z.width") / 2.0 + parameters.get <double> ("grid.z.width") / n_elements * i;
		}

		int name = id;

		int n = parameters.get <int> ("grid.x.points");

		// The last two vertical_axis arguments are 0 at the outer ends of the stack of elements and 1 at interior edges
		grids::axis horizontal_axis (n, -parameters.get <double> ("grid.x.width") / 2.0, parameters.get <double> ("grid.x.width") / 2.0);
		grids::axis vertical_axis (m, positions [id], positions [id + 1], id == 0 ? 0 : 1, id == n_elements - 1 ? 0 : 1);
		
		TRACE ("Building data");
		
		data::thermo_compositional_data data (&horizontal_axis, &vertical_axis, id, n_elements, parameters);
		
		TRACE ("Constructing element");
		
		auto element = pisces::implemented_element::instance (parameters ["element"].as <std::string> (), horizontal_axis, vertical_axis, name, parameters, data, &process_messenger, 0x00);
		
		if (pisces::element::version () < versions::version ("0.6.0.0")) {
			INFO ("element.version < 0.6.0.0");
		}
		else {
			INFO ("element.version not < 0.6.0.0");
		}

		TRACE ("Element constructed.");

		// Both CPU time (clock) and wall time (system_clock) are sampled for the efficiency report below
		clock_t cbegin, cend;
		std::chrono::time_point <std::chrono::system_clock> begin, end;

		cbegin = clock ();
		begin = std::chrono::system_clock::now ();

		// NOTE(review): n_steps is never modified in this function; element->run presumably advances it by reference -- confirm
		int n_steps = data.n_steps;
		std::shared_ptr <io::input> virtual_input;
		while (n_steps < parameters.get <int> ("time.steps") && element->duration < parameters.get <double> ("time.stop")) {
			if (parameters.get <int> ("grid.rezone.check_every") > 0 && n_steps != 0 && n_elements > 1) {
				INFO ("Rezoning");
				formats::virtual_file *virt = element->rezone_minimize_ts (&positions [0], parameters.get <double> ("grid.rezone.min_size"), parameters.get <double> ("grid.rezone.max_size"), parameters.get <int> ("grid.rezone.n_tries"), parameters.get <int> ("grid.rezone.iters_fixed_t"), parameters.get <double> ("grid.rezone.step_size"), parameters.get <double> ("grid.rezone.k"), parameters.get <double> ("grid.rezone.t_initial"), parameters.get <double> ("grid.rezone.mu_t"), parameters.get <double> ("grid.rezone.t_min"));
				
				// A non-null result means new boundaries were chosen: rebuild the element on the new grid
				if (virt) {
					formats::virtual_files ["main/virtual_file"] = *virt;
					// NOTE(review): this axis shadows the outer vertical_axis and is destroyed at the end of this block;
					// if the element keeps a reference to it, that reference dangles afterwards -- confirm
					grids::axis vertical_axis (m, positions [id], positions [id + 1], id == 0 ? 0 : 1, id == n_elements - 1 ? 0 : 1);
				
					virtual_input.reset (new io::formatted_input <formats::virtual_format> (formats::data_grid::two_d (n, m), "main/virtual_file"));
					data.setup (virtual_input);
				
					element = pisces::implemented_element::instance (parameters ["element"].as <std::string> (), horizontal_axis, vertical_axis, name, parameters, data, &process_messenger, 0x00);
				}
			}
			element->run (n_steps);
		}

		cend = clock ();
		end = std::chrono::system_clock::now ();

		std::chrono::duration <double> eb = end - begin;

		// Efficiency = CPU time / wall time / max OpenMP threads, as a percentage
		INFO ("Main complete. CPU Time: " << ((double) (cend - cbegin))/CLOCKS_PER_SEC << " Wall Time: " << (double) eb.count () << " Efficiency: " << (((double) (cend - cbegin))/CLOCKS_PER_SEC / (double) eb.count () / omp_get_max_threads () * 100.) << "%");
	} catch (std::exception &except) {
		FATAL ("Fatal error occurred. Check log.");
		FATAL (except.what ());
		return 1;

		/*
			TODO Last check all should be somewhere not defined by the user
		*/
	} catch (int &except) {
		FATAL ("Fatal error occurred. Check log.");
		FATAL (except);
		return 1;

		/*
			TODO Last check all should be somewhere not defined by the user
		*/
	} catch (...) {
		FATAL ("Last ditch...");
		return 1;
	}
	
	return 0;
}
// host stub function
// Runs advec_mom_kernel2_y over the locally owned part of `range`, splitting
// the z extent into one contiguous slab per OpenMP thread. arg0 is passed to
// the kernel as a writable pointer, arg1..arg3 as const pointers.
void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block,
                                      int dim, int *range, ops_arg arg0,
                                      ops_arg arg1, ops_arg arg2,
                                      ops_arg arg3) {

  // Timing
  double t1, t2, c1, c2;

  ops_arg args[4] = {arg0, arg1, arg2, arg3};
  int offs[4][3];

#ifdef CHECKPOINTING
  if (!ops_checkpointing_before(args, 4, range, 134))
    return;
#endif

  if (OPS_diags > 1) {
    ops_timing_realloc(134, "advec_mom_kernel2_y");
    OPS_kernels[134].count++;
    ops_timers_core(&c1, &t1);
  }

  // compute locally allocated range for the sub-block
  int start[3];
  int end[3];

#ifdef OPS_MPI
  sub_block_list sb = OPS_sub_block_list[block->index];
  if (!sb->owned)
    return;
  for (int n = 0; n < 3; n++) {
    const int lo = range[2 * n];
    const int hi = range[2 * n + 1];

    // lower bound relative to this sub-block's displacement
    start[n] = (sb->decomp_disp[n] >= lo) ? 0 : lo - sb->decomp_disp[n];
    if (sb->id_m[n] == MPI_PROC_NULL && lo < 0)
      start[n] = lo;

    // upper bound, clipped to the sub-block unless there is no neighbour
    end[n] = (sb->decomp_disp[n] + sb->decomp_size[n] >= hi)
                 ? hi - sb->decomp_disp[n]
                 : sb->decomp_size[n];
    if (sb->id_p[n] == MPI_PROC_NULL &&
        (hi > sb->decomp_disp[n] + sb->decomp_size[n]))
      end[n] += (hi - sb->decomp_disp[n] - sb->decomp_size[n]);

    // never let the range run backwards
    if (end[n] < start[n])
      end[n] = start[n];
  }
#else
  for (int n = 0; n < 3; n++) {
    start[n] = range[2 * n];
    end[n] = range[2 * n + 1];
  }
#endif
#ifdef OPS_DEBUG
  ops_register_args(args, "advec_mom_kernel2_y");
#endif

  // Per-argument element offsets for stepping in x, then to the next row (y)
  // and the next plane (z), plus the per-element size used to scale them.
  int elem_bytes[4];
  for (int a = 0; a < 4; a++) {
    offs[a][0] = args[a].stencil->stride[0] * 1; // unit step in x dimension
    offs[a][1] = off3D(1, &start[0], &end[0], args[a].dat->size,
                       args[a].stencil->stride) -
                 offs[a][0];
    offs[a][2] = off3D(2, &start[0], &end[0], args[a].dat->size,
                       args[a].stencil->stride) -
                 offs[a][1] - offs[a][0];
    elem_bytes[a] =
        (OPS_soa ? args[a].dat->type_size : args[a].dat->elem_size);
  }

  // Pointer increments (applied to char pointers) for each direction
  int step_x[4];
  int step_y[4];
  int step_z[4];
  for (int a = 0; a < 4; a++) {
    step_x[a] = elem_bytes[a] * offs[a][0];
    step_y[a] = elem_bytes[a] * offs[a][1];
    step_z[a] = elem_bytes[a] * offs[a][2];
  }

  // Halo Exchanges
  ops_H_D_exchanges_host(args, 4);
  ops_halo_exchanges(args, 4, range);
  ops_H_D_exchanges_host(args, 4);

#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // Publish the x/y extents of each dataset through the xdimN/ydimN variables
  xdim0 = args[0].dat->size[0];
  ydim0 = args[0].dat->size[1];
  xdim1 = args[1].dat->size[0];
  ydim1 = args[1].dat->size[1];
  xdim2 = args[2].dat->size[0];
  ydim2 = args[2].dat->size[1];
  xdim3 = args[3].dat->size[0];
  ydim3 = args[3].dat->size[1];

  if (OPS_diags > 1) {
    ops_timers_core(&c2, &t2);
    OPS_kernels[134].mpi_time += t2 - t1;
  }

#pragma omp parallel for
  for (int thr = 0; thr < nthreads; thr++) {

    // slab of z planes handled by this thread
    const int z_size = end[2] - start[2];
    const int planes_per_thr = (z_size - 1) / nthreads + 1;
    const int start_i = start[2] + planes_per_thr * thr;
    const int finish_i = start[2] + MIN(planes_per_thr * (thr + 1), z_size);

    // first point visited by this thread
    const int first[3] = {start[0], start[1], start_i};

    // set up initial pointers
    char *p_a[4];
    int d_m[OPS_MAX_DIM];
    for (int a = 0; a < 4; a++) {
      for (int d = 0; d < dim; d++) {
#ifdef OPS_MPI
        d_m[d] =
            args[a].dat->d_m[d] + OPS_sub_dat_list[args[a].dat->index]->d_im[d];
#else
        d_m[d] = args[a].dat->d_m[d];
#endif
      }

      int byte_off =
          elem_bytes[a] * 1 * (first[0] * args[a].stencil->stride[0] -
                               args[a].dat->base[0] - d_m[0]);
      byte_off = byte_off +
                 elem_bytes[a] * args[a].dat->size[0] *
                     (first[1] * args[a].stencil->stride[1] -
                      args[a].dat->base[1] - d_m[1]);
      byte_off = byte_off +
                 elem_bytes[a] * args[a].dat->size[0] * args[a].dat->size[1] *
                     (first[2] * args[a].stencil->stride[2] -
                      args[a].dat->base[2] - d_m[2]);
      p_a[a] = (char *)args[a].data + byte_off;
    }

    // number of full SIMD_VEC-wide groups in a row of x points
    const int n_vec = (end[0] - start[0]) / SIMD_VEC;

    for (int n_z = start_i; n_z < finish_i; n_z++) {
      for (int n_y = start[1]; n_y < end[1]; n_y++) {
        for (int v = 0; v < n_vec; v++) {
// call kernel function, passing in pointers to data -vectorised
#pragma simd
          for (int i = 0; i < SIMD_VEC; i++) {
            advec_mom_kernel2_y((double *)p_a[0] + i * 1 * 1,
                                (const double *)p_a[1] + i * 1 * 1,
                                (const double *)p_a[2] + i * 1 * 1,
                                (const double *)p_a[3] + i * 1 * 1);
          }

          // shift pointers to data x direction
          for (int a = 0; a < 4; a++)
            p_a[a] = p_a[a] + step_x[a] * SIMD_VEC;
        }

        for (int n_x = start[0] + n_vec * SIMD_VEC; n_x < end[0]; n_x++) {
          // call kernel function, passing in pointers to data - remainder
          advec_mom_kernel2_y((double *)p_a[0], (const double *)p_a[1],
                              (const double *)p_a[2], (const double *)p_a[3]);

          // shift pointers to data x direction
          for (int a = 0; a < 4; a++)
            p_a[a] = p_a[a] + step_x[a];
        }

        // shift pointers to data y direction
        for (int a = 0; a < 4; a++)
          p_a[a] = p_a[a] + step_y[a];
      }
      // shift pointers to data z direction
      for (int a = 0; a < 4; a++)
        p_a[a] = p_a[a] + step_z[a];
    }
  }

  if (OPS_diags > 1) {
    ops_timers_core(&c1, &t1);
    OPS_kernels[134].time += t1 - t2;
  }

  ops_set_dirtybit_host(args, 4);

  ops_set_halo_dirtybit3(&args[0], range);

  if (OPS_diags > 1) {
    // Update kernel record
    ops_timers_core(&c2, &t2);
    OPS_kernels[134].mpi_time += t2 - t1;
    OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0);
    OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1);
    OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg2);
    OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg3);
  }
}
/*
 * Driver for the UA benchmark: reads the run parameters (or falls back to the
 * compiled defaults), builds the initial mesh, runs niter+1 time steps of
 * convection + diffusion with periodic mesh adaptation, then verifies and
 * reports the results and, if "timer.flag" exists, the per-section timers.
 *
 * Returns 0 on normal completion and 1 when inputua.data exists but does not
 * contain the four expected parameters.
 */
int main(int argc, char *argv[])
{
  int step, ie, iside, i, j, k;
  double mflops, tmax, nelt_tot = 0.0;
  char Class;
  logical ifmortar = false, verified;

  double t2, trecs[t_last+1];
  char *t_names[t_last+1];

  //--------------------------------------------------------------------
  // Initialize NUMA control
  //--------------------------------------------------------------------
  numa_initialize_env(NUMA_MIGRATE_EXISTING);

  //---------------------------------------------------------------------
  // Read input file (if it exists), else take
  // defaults from parameters
  //---------------------------------------------------------------------
  FILE *fp;
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    timeron = true;
    t_names[t_total] = "total";
    t_names[t_init] = "init";
    t_names[t_convect] = "convect";
    t_names[t_transfb_c] = "transfb_c";
    t_names[t_diffusion] = "diffusion";
    t_names[t_transf] = "transf";
    t_names[t_transfb] = "transfb";
    t_names[t_adaptation] = "adaptation";
    t_names[t_transf2] = "transf+b";
    t_names[t_add2] = "add2";
    fclose(fp);
  } else {
    timeron = false;
  }

  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OMP-C) - UA Benchmark\n\n");

  if ((fp = fopen("inputua.data", "r")) != NULL) {
    int nread = 0;  // number of parameters parsed successfully
    int ch;         // int, not char, so EOF can be told apart from data

    printf(" Reading from input file inputua.data\n");

    // One value per line; anything after the value on a line is skipped.
    // The skip loops stop at EOF so a truncated file cannot hang the program.
    nread += (fscanf(fp, "%d", &fre) == 1);
    do { ch = fgetc(fp); } while (ch != '\n' && ch != EOF);
    nread += (fscanf(fp, "%d", &niter) == 1);
    do { ch = fgetc(fp); } while (ch != '\n' && ch != EOF);
    nread += (fscanf(fp, "%d", &nmxh) == 1);
    do { ch = fgetc(fp); } while (ch != '\n' && ch != EOF);
    nread += (fscanf(fp, "%lf", &alpha) == 1);
    Class = 'U';
    fclose(fp);

    if (nread != 4) {
      printf(" Error: inputua.data is incomplete or malformed\n");
      numa_shutdown();
      return 1;
    }
  } else {
    printf(" No input file inputua.data. Using compiled defaults\n");
    fre   = FRE_DEFAULT;
    niter = NITER_DEFAULT;
    nmxh  = NMXH_DEFAULT;
    alpha = ALPHA_DEFAULT;
    Class = CLASS_DEFAULT;
  }

  dlmin = pow(0.5, REFINE_MAX);
  dtime = 0.04*dlmin;

  printf(" Levels of refinement:        %8d\n", REFINE_MAX);
  printf(" Adaptation frequency:        %8d\n", fre);
  printf(" Time steps:                  %8d    dt: %15.6E\n", niter, dtime);
  printf(" CG iterations:               %8d\n", nmxh);
  printf(" Heat source radius:          %8.4f\n", alpha);
  printf(" Number of available threads: %8d\n", omp_get_max_threads());
  printf("\n");

  top_constants();

  for (i = 1; i <= t_last; i++) {
    timer_clear(i);
  }
  if (timeron) timer_start(t_init);

  // set up initial mesh (single element) and solution (all zero)
  create_initial_grid();

  r_init_omp((double *)ta1, ntot, 0.0);
  nr_init_omp((int *)sje, 4*6*nelt, -1);

  init_locks();

  // compute tables of coefficients and weights
  coef();
  geom1();

  // compute the discrete laplacian operators
  setdef();

  // prepare for the preconditioner
  setpcmo_pre();

  // refine initial mesh and do some preliminary work
  time = 0.0;
  mortar();
  prepwork();
  adaptation(&ifmortar, 0);
  if (timeron) timer_stop(t_init);

  timer_clear(1);

  time = 0.0;
  for (step = 0; step <= niter; step++) {
    if (step == 1) {
      // step 0 is an untimed warm-up: reset the solution and start the
      // timer here, and restart the count of processed elements
      r_init((double *)ta1, ntot, 0.0);

      time = 0.0;
      nelt_tot = 0.0;
      for (i = 1; i <= t_last; i++) {
        if (i != t_init) timer_clear(i);
      }
      timer_start(1);
    }

    // advance the convection step
    convect(ifmortar);

    if (timeron) timer_start(t_transf2);
    // prepare the initial guess for cg
    transf(tmort, (double *)ta1);

    // compute residual for diffusion term based on initial guess

    // compute the left hand side of equation, laplacian t
    #pragma omp parallel default(shared) private(ie,k,j,i)
    {
    #pragma omp for
    for (ie = 0; ie < nelt; ie++) {
      laplacian(ta2[ie], ta1[ie], size_e[ie]);
    }

    // compute the residual
    #pragma omp for
    for (ie = 0; ie < nelt; ie++) {
      for (k = 0; k < LX1; k++) {
        for (j = 0; j < LX1; j++) {
          for (i = 0; i < LX1; i++) {
            trhs[ie][k][j][i] = trhs[ie][k][j][i] - ta2[ie][k][j][i];
          }
        }
      }
    }
    } //end parallel

    // get the residual on mortar
    transfb(rmor, (double *)trhs);
    if (timeron) timer_stop(t_transf2);

    // apply boundary condition: zero out the residual on domain boundaries

    // apply boundary condition to trhs
    #pragma omp parallel for default(shared) private(ie,iside)
    for (ie = 0; ie < nelt; ie++) {
      for (iside = 0; iside < NSIDES; iside++) {
        if (cbc[ie][iside] == 0) {
          facev(trhs[ie], iside, 0.0);
        }
      }
    }
    // apply boundary condition to rmor
    col2(rmor, tmmor, nmor);

    // call the conjugate gradient iterative solver
    diffusion(ifmortar);

    // add convection and diffusion
    if (timeron) timer_start(t_add2);
    add2((double *)ta1, (double *)t, ntot);
    if (timeron) timer_stop(t_add2);

    // perform mesh adaptation every fre-th step, except on the last step
    time = time + dtime;
    if ((step != 0) && (step/fre*fre == step)) {
      if (step != niter) {
        adaptation(&ifmortar, step);
      }
    } else {
      ifmortar = false;
    }
    nelt_tot = nelt_tot + (double)(nelt);
  }

  timer_stop(1);
  tmax = timer_read(1);

  verify(&Class, &verified);

  // compute millions of collocation points advanced per second.
  // diffusion: nmxh advancements, convection: 1 advancement
  mflops = nelt_tot*(double)(LX1*LX1*LX1*(nmxh+1))/(tmax*1.e6);

  print_results("UA", Class, REFINE_MAX, 0, 0, niter,
                tmax, mflops, "    coll. point advanced",
                verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5,
                CS6, "(none)");

  //---------------------------------------------------------------------
  // More timers
  //---------------------------------------------------------------------
  if (timeron) {
    for (i = 1; i <= t_last; i++) {
      trecs[i] = timer_read(i);
    }
    if (tmax == 0.0) tmax = 1.0;  // avoid dividing by zero in the percentages

    printf("  SECTION     Time (secs)\n");
    for (i = 1; i <= t_last; i++) {
      printf("  %-10s:%9.3f  (%6.2f%%)\n",
          t_names[i], trecs[i], trecs[i]*100./tmax);
      if (i == t_transfb_c) {
        t2 = trecs[t_convect] - trecs[t_transfb_c];
        printf("    --> %11s:%9.3f  (%6.2f%%)\n",
            "sub-convect", t2, t2*100./tmax);
      } else if (i == t_transfb) {
        t2 = trecs[t_diffusion] - trecs[t_transf] - trecs[t_transfb];
        printf("    --> %11s:%9.3f  (%6.2f%%)\n",
            "sub-diffuse", t2, t2*100./tmax);
      }
    }
  }

  //--------------------------------------------------------------------
  // Teardown NUMA control
  //--------------------------------------------------------------------
  numa_shutdown();

  return 0;
}
Exemple #12
0
 //! Thread count to use: 1 when OpenMP support is disabled; otherwise
 //! omp_get_max_threads(), floored at 1 and capped at arma_config::mp_threads.
 arma_inline
 static
 int
 get()
   {
   #if defined(ARMA_USE_OPENMP)
     const int omp_limit = (std::max)(int(1), int(omp_get_max_threads()));
     const int cfg_limit = int(arma_config::mp_threads);
     
     return (std::min)(cfg_limit, omp_limit);
   #else
     return int(1);
   #endif
   }
/*
 * OpenMP data-sharing test "default 005".
 *
 * Runs one loop iteration per thread under default(none) with every
 * data-sharing attribute spelled out explicitly, then checks that private,
 * firstprivate, lastprivate, reduction, shared and threadprivate variables
 * each ended up with the value implied by their attribute.
 *
 * Returns 0 when no error was counted, 1 otherwise. Exits early with status 0
 * when only a single thread is available.
 */
int
main (void)
{
  int	i, r;


  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  /* keep the team size fixed so the per-thread expectations below hold */
  omp_set_dynamic (0);

  rdct = shrd = 0;
  fprvt = MAGICNO;
  #pragma omp parallel for default(none) private (prvt) firstprivate(fprvt) lastprivate(lprvt) reduction(+:rdct) shared(shrd,thds,errors) schedule (static,1)
  for (i=0; i<thds; i++) {
    #pragma omp critical
    {
      shrd  += 6*i;		      /* shrd is shared, i is private */
    }
    tprvt  = i;			      /* tprvt is threadprivate */
    prvt   = 2*i;		      /* prvt is private */
    fprvt += 3*i;		      /* fprvt is firstprivate */
    lprvt  = 4*i;		      /* lprvt is lastprivate */
    rdct  += 5*i;		      /* rdct is reduction(+) */
    waittime (1);

    if (prvt != 2*i) {		      /* check private */
      #pragma omp critical
      errors += 1;
    }
    if (fprvt != MAGICNO + 3*i) {     /* check firstprivate */
      #pragma omp critical
      errors += 1;
    }
  }

  /* r = 0 + 1 + ... + (thds-1), the sum of all iteration indices */
  r = 0;
  for (i=0; i<thds; i++)
    r += i;
  if (rdct != r * 5) {		      /* check reduction */
    errors += 1;
  }

  if (shrd != r * 6) {		      /* check shared */
    errors += 1;
  }

  if (lprvt != 4*(thds-1)) {	      /* check lastprivate */
    errors += 1;
  }

  /* check that each thread's threadprivate copy kept its value */
  #pragma omp parallel for default(shared) schedule (static)
  for (i=0; i<thds; i++) {
    if (tprvt != i) {
      #pragma omp critical
      errors += 1;
    }
  }


  if (errors == 0) {
    printf ("default 005 : SUCCESS\n");
    return 0;
  } else {
    printf ("default 005 : FAILED\n");
    return 1;
  }
}
// Host stub for the res_calc kernel (a loop with indirection).
//
// Fetches an execution plan for `set`, runs op_x86_res_calc over the plan's
// blocks colour by colour (the blocks of one colour run in parallel), and
// accumulates the per-thread partial results of the global reduction on arg1
// into arg1.data before handing it to op_mpi_reduce. Timing and transfer
// statistics are recorded in OP_kernels[0].
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  int *arg1h = (int *)arg1.data;

  int    nargs   = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  int    ninds   = 1;
  int    inds[2] = {0,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // arg1_l below has room for 64 per-thread slots (each padded to 64 ints);
  // cap the team size so a larger machine cannot index past the end of it
  if (nthreads > 64) nthreads = 64;

  // allocate and initialise arrays for global reduction

  int arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++) arg1_l[d+thr*64]=ZERO_int;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for num_threads(nthreads)
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++) {
        // slot of the calling thread in the reduction scratch array
#ifdef _OPENMP
        int thr = omp_get_thread_num();
#else
        int thr = 0;
#endif
        op_x86_res_calc( blockIdx,
           (double *)arg0.data,
           Plan->ind_map,
           Plan->loc_map,
           &arg1_l[64*thr],
           Plan->ind_sizes,
           Plan->ind_offs,
           block_offset,
           Plan->blkmap,
           Plan->offset,
           Plan->nelems,
           Plan->nthrcol,
           Plan->thrcol,
           set_size);
      }

      // combine reduction data once the last owned colour has been processed
      if (col == Plan->ncolors_owned-1) {
        for (int thr=0; thr<nthreads; thr++)
          for(int d=0; d<1; d++) arg1h[d] += arg1_l[d+thr*64];
      }

      block_offset += nblocks;
    }

    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;

  } else {
    // nothing to execute, but still take a start time so the elapsed time
    // added below is not measured from zero
    op_timers_core(&cpu_t1, &wall_t1);
  }


  // combine reduction data

  op_mpi_reduce(&arg1,arg1h);

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
Exemple #15
0
// Host stub for the dotPV kernel (a direct loop, no indirection).
//
// Splits `set` into one contiguous chunk per thread, runs op_x86_dotPV on
// each chunk with a private partial result, and adds the partials to
// arg2.data before handing it to op_mpi_reduce. Timing and transfer
// statistics are recorded in OP_kernels[4].
void op_par_loop_dotPV(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2 ){

  double *arg2h = (double *)arg2.data;

  int    nargs   = 3;
  op_arg args[3];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  dotPV\n");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // arg2_l below has room for 64 per-chunk slots (each padded to 64 doubles);
  // cap the number of chunks so a larger machine cannot index past its end
  if (nthreads > 64) nthreads = 64;

  // allocate and initialise arrays for global reduction

  double arg2_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++) arg2_l[d+thr*64]=ZERO_double;

  if (set->size >0) {

    // execute plan

#pragma omp parallel for
    for (int thr=0; thr<nthreads; thr++) {
      // chunk bounds; the product is formed in long long so that a large set
      // cannot overflow int before the division
      int start  = (int)(((long long)set->size* thr   )/nthreads);
      int finish = (int)(((long long)set->size*(thr+1))/nthreads);
      op_x86_dotPV( (double *) arg0.data,
                    (double *) arg1.data,
                    arg2_l + thr*64,
                    start, finish );
    }

  }


  // combine reduction data

  for (int thr=0; thr<nthreads; thr++)
    for(int d=0; d<1; d++) arg2h[d] += arg2_l[d+thr*64];

  op_mpi_reduce(&arg2,arg2h);

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(4);
  OP_kernels[4].name      = name;
  OP_kernels[4].count    += 1;
  OP_kernels[4].time     += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (float)set->size * arg0.size;
  OP_kernels[4].transfer += (float)set->size * arg1.size;
}
Exemple #16
0
/*
 * Dispatches on the mode selected in options.flags: self-test/benchmark,
 * charset generation, or one of the cracking modes. For cracking modes it
 * also sets up status/log/tty handling around the run and, if the number of
 * remaining passwords went down, prints a reminder about "--show".
 */
static void john_run(void)
{
	const char *might = "Warning: passwords printed above might";
	const char *partial = " be partial";
	const char *not_all = " not be all those cracked";
	int count_before;
	int print_here = 1;

	if (options.flags & FLG_TEST_CHK) {
		exit_status = benchmark_all() ? 1 : 0;
		return;
	}

	if (options.flags & FLG_MAKECHR_CHK) {
		do_makechars(&database, options.charset);
		return;
	}

	if (!(options.flags & FLG_CRACKING_CHK))
		return;

	count_before = database.password_count;

	if (!(options.flags & FLG_STDOUT)) {
		status_init(NULL, 1);
		log_init(LOG_NAME, options.loader.activepot, options.session);
		john_log_format();
		if (idle_requested(database.format))
			log_event("- Configured to use otherwise idle "
				"processor cycles only");
	}
	tty_init(options.flags & FLG_STDIN_CHK);

#if defined(HAVE_MPI) && defined(_OPENMP)
	/* An OpenMP-capable format running on several MPI processes at once */
	if ((database.format->params.flags & FMT_OMP) &&
	    omp_get_max_threads() > 1 && mpi_p > 1) {
		int omp_off = cfg_get_bool(SECTION_OPTIONS, SUBSECTION_MPI,
		                           "MPIOMPmutex", 1);
		int announce = cfg_get_bool(SECTION_OPTIONS, SUBSECTION_MPI,
		                            "MPIOMPverbose", 1) &&
		               mpi_id == 0;

		if (omp_off) {
			if (announce)
				fprintf(stderr, "MPI in use, disabling OMP (see doc/README.mpi)\n");
			omp_set_num_threads(1);
		} else if (announce) {
			fprintf(stderr, "Note: Running both MPI and OMP (see doc/README.mpi)\n");
		}
	}
#endif

	/* Run the first cracking mode whose flag is set */
	if (options.flags & FLG_SINGLE_CHK)
		do_single_crack(&database);
	else if (options.flags & FLG_WORDLIST_CHK)
		do_wordlist_crack(&database, options.wordlist,
			(options.flags & FLG_RULES) != 0);
	else if (options.flags & FLG_INC_CHK)
		do_incremental_crack(&database, options.charset);
	else if (options.flags & FLG_MKV_CHK)
		do_markov_crack(&database, options.mkv_param);
	else if (options.flags & FLG_EXTERNAL_CHK)
		do_external_crack(&database);
	else if (options.flags & FLG_BATCH_CHK)
		do_batch_crack(&database);

	status_print();
	tty_done();

	/* Nothing to add unless the password count dropped during the run */
	if (database.password_count >= count_before)
		return;

#ifdef HAVE_MPI
	/* Under MPI only the process with id 0 prints the notes below */
	print_here = (mpi_id == 0);
#endif
	if (!print_here)
		return;

	switch (database.options->flags & (DB_SPLIT | DB_NODUP)) {
	case DB_SPLIT:
		fprintf(stderr, "%s%s\n", might, partial);
		break;
	case DB_NODUP:
		fprintf(stderr, "%s%s\n", might, not_all);
		break;
	case (DB_SPLIT | DB_NODUP):
		fprintf(stderr, "%s%s and%s\n",
		    might, partial, not_all);
	}

	fputs("Use the \"--show\" option to display all of "
	    "the cracked passwords reliably\n", stderr);
}
Exemple #17
0
/*
 * Sample/benchmark driver for the sparse-dense matrix multiply (spmdm) API.
 *
 * Command line: [M] [N] [K] [transA] [transB] [transC] [reps], all optional
 * and positional. Builds a dense A in which only random values above 0.85 are
 * kept (the rest are zero), dense B and C, then runs and times three
 * variants: plain (forward pass), transposed A and C (weight update) and
 * transposed B (backprop). Each variant is compared against a reference
 * result computed with plain loops.
 */
int main(int argc, char *argv[])
{
  REAL_TYPE *A_gold, *B_gold, *A_gold2, *B_gold2;
  float *C_gold, *C0_gold, *C, *C2;

  int M, N, K;
  REAL_TYPE alpha, beta;
  int reps;

  libxsmm_spmdm_handle handle, handle2;
  libxsmm_CSR_sparseslice *A_sparse, *A_sparse2;
  int max_threads;

  libxsmm_timer_tickint start, end;
  double flops, duration;
  char transA, transB, transC;
  int i, j, k;
  size_t l;

  /* Step 1: Read in args */
  M = 0; N = 0; K = 0; alpha = (REAL_TYPE)1.0; beta = (REAL_TYPE)0.0; reps = 0; transA = 'N'; transB = 'N';

  if (argc > 1 && !strncmp(argv[1], "-h", 3)) {
    /* argument order must match the parsing below */
    printf("\nUsage: %s [M] [N] [K] [transA] [transB] [transC] [reps]\n\n", argv[0]);
    return EXIT_SUCCESS;
  }

  /* defaults */
  M = 2048;
  N = 2048;
  K = 2048;
  transA = 'N';
  transB = 'N';
  transC = 'N';
  reps = 100;

  /* reading new values from cli */
  i = 1;
  if (argc > i) M = atoi(argv[i++]);
  if (argc > i) N = atoi(argv[i++]);
  if (argc > i) K = atoi(argv[i++]);
  if (argc > i) { transA = argv[i][0]; i++; }
  if (argc > i) { transB = argv[i][0]; i++; }
  if (argc > i) { transC = argv[i][0]; i++; }
  if (argc > i) reps = atoi(argv[i++]);

  /* Step 2: allocate data (sizes are formed in size_t so that M*K etc. cannot overflow int) */
  A_gold  = (REAL_TYPE*)libxsmm_aligned_malloc( (size_t)M*(size_t)K*sizeof(REAL_TYPE), 64 );
  B_gold  = (REAL_TYPE*)libxsmm_aligned_malloc( (size_t)K*(size_t)N*sizeof(REAL_TYPE), 64 );
  C_gold  = (float*)libxsmm_aligned_malloc( (size_t)M*(size_t)N*sizeof(float), 64 );
  C0_gold = (float*)libxsmm_aligned_malloc( (size_t)M*(size_t)N*sizeof(float), 64 );
  C       = (float*)libxsmm_aligned_malloc( (size_t)M*(size_t)N*sizeof(float), 64 );

  /* Step 3: init data */
  libxsmm_rng_set_seed(1);
  for (l = 0; l < (size_t)M * (size_t)K; ++l) {
    const double r64 = libxsmm_rng_f64();
    const float r32 = (float)r64;
#ifdef USE_BFLOAT
    /* keep the upper 16 bits of the float's bit pattern */
    const int r = *(const int*)(&r32);
    const libxsmm_bfloat16 val = (r >> 16);
#else
    const float val = r32;
#endif
    /* A is sparse: only draws above 0.85 are kept as non-zeros */
    if (r64 > 0.85) A_gold[l] = val;
    else              A_gold[l] = (REAL_TYPE)0.0;
  }

  for (l = 0; l < (size_t)K * (size_t)N; ++l) {
    const double r64 = libxsmm_rng_f64();
    const float r32 = (float)r64;
#ifdef USE_BFLOAT
    const int r = *(const int*)(&r32);
    const libxsmm_bfloat16 val = (r >> 16);
#else
    const float val = r32;
#endif
    B_gold[l] = val;
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C0_gold[l] = (float)libxsmm_rng_f64();
    C_gold[l] = C0_gold[l];
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C[l] = (float)C0_gold[l];
  }
  flops = (double)M * (double)N * (double)K * 2.0;

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 4: Initialize LIBXSMM for these sizes - allocates handle and temporary space for the sparse data structure for A */
# if defined(_OPENMP)
  max_threads = omp_get_max_threads();
# else
  max_threads = 1;
# endif

  start = libxsmm_timer_tick();
  libxsmm_spmdm_init(M, N, K, max_threads, &handle, &A_sparse);
  end = libxsmm_timer_tick();
  printf("Time for handle init = %f\n", libxsmm_timer_duration(start, end));

  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i -- forward pass\n", M, N, K, handle.bm, handle.bn, handle.bk, handle.mb, handle.nb, handle.kb, reps );
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha input */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
# else
  spmdm_exec_fp32(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
# endif

  /* Checks */

  /* Compute a "gold" reference answer with plain loops (threaded over i/j when OpenMP is enabled) */
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(k);
# pragma omp parallel for private(i, j, k) LIBXSMM_OPENMP_COLLAPSE(2)
#endif
  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      float sum = 0.0;
      float Cval;
      for (k = 0; k < K; ++k) {
#       ifdef USE_BFLOAT
        /* widen each bfloat16 back to float by shifting it into the upper 16 bits */
        libxsmm_bfloat16 Atmp = A_gold[i*K+k];
        int Atmp_int  = Atmp; Atmp_int <<= 16;
        float Aval = *(float *)&Atmp_int;
        libxsmm_bfloat16 Btmp = B_gold[k*N+j];
        int Btmp_int  = Btmp; Btmp_int <<= 16;
        float Bval = *(float *)&Btmp_int;
#       else
        float Aval = A_gold[i*K + k];
        float Bval = B_gold[k*N + j];
#       endif
        sum += Aval * Bval;
      }
      Cval = sum;
      C_gold[i*N + j] = Cval + beta*C_gold[i*N + j];
    }
  }
  /* LIBXSMM_FSYMBOL(sgemm)(&trans, &trans, &N, &M, &K, &alpha, B_gold, &N, A_gold, &K, &beta, C_gold, &N); */

  /* Compute the max difference between gold and computed results. */
  spmdm_check_c( &handle, C, C_gold );

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
#   else
    spmdm_exec_fp32( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);
  libxsmm_spmdm_destroy(&handle);

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 5: Initialize libxsmm for transpose A - allocates handle and temporary space for the sparse data structure for A */
  transA = 'T'; transB = 'N'; transC = 'T';
  libxsmm_spmdm_init(M, N, K, max_threads, &handle2, &A_sparse2);
  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transA = Y, transC = Y -- weight update\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps );
  A_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( (size_t)M*(size_t)K*sizeof(REAL_TYPE), 64 );
  C2 = (float*)libxsmm_aligned_malloc( (size_t)M*(size_t)N*sizeof(float), 64 );

  /* A_gold2 = transpose of A_gold */
  for (i = 0; i < M; ++i) {
    for (j = 0; j < K; ++j) {
      A_gold2[j*M + i] = A_gold[i*K + j];
    }
  }
  /* C = transpose of the initial C */
  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      C[j*M + i] = (float)C0_gold[i*N + j];
    }
  }
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha inputs */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
# else
  spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
# endif

  /* transpose the result back so it can be compared against C_gold */
  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      C2[i*N + j] = C[j*M + i];
    }
  }
  /* Checks */
  spmdm_check_c( &handle2, C2, C_gold);

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
#   else
    spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 6: Test transpose B (reuses handle2) */
  transA = 'N'; transB = 'T'; transC = 'N';
  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transB = Y -- backprop\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps );
  B_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( (size_t)K*(size_t)N*sizeof(REAL_TYPE), 64 );

  /* B_gold2 = transpose of B_gold */
  for (i = 0; i < K; ++i) {
    for (j = 0; j < N; ++j) {
      B_gold2[j*K + i] = B_gold[i*N + j];
    }
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C[l] = (float)C0_gold[l];
  }
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha inputs */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
# else
  spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
# endif

  /* Checks */
  spmdm_check_c( &handle2, C, C_gold);

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
#   else
    spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);
  libxsmm_spmdm_destroy(&handle2);

  libxsmm_free(A_gold);
  libxsmm_free(B_gold);
  libxsmm_free(C_gold);
  libxsmm_free(C);
  libxsmm_free(C2);
  libxsmm_free(C0_gold);
  libxsmm_free(B_gold2);
  libxsmm_free(A_gold2);

  return EXIT_SUCCESS;
}
Exemple #18
0
/*
 * One-time start-up sequence.
 *
 * In order, this function:
 *   - rewrites a lone "--make_check" argument to "--test=0";
 *   - parses the command line through opt_init();
 *   - serves every --list=WHAT request (each one prints and exits);
 *   - loads the configuration files (skipped for --make_check);
 *   - registers the formats and finally calls common_init(), sig_init()
 *     and john_load().
 *
 * name: forwarded unchanged to opt_init().
 * argc: argument count as received by the caller.
 * argv: argument vector; argv[1] may be overwritten (see above) and the
 *       vector is also handed to CPU_detect_or_fallback(), path_init()
 *       and, for the OpenMP fallback, execv().
 *
 * The function returns normally only when no --list request was given.
 */
static void john_init(char *name, int argc, char **argv)
{
	int show_usage = 0;
	int make_check = (argc == 2 && !strcmp(argv[1], "--make_check"));
	if (make_check)
		argv[1] = "--test=0";

	CPU_detect_or_fallback(argv, make_check);

	status_init(NULL, 1);
	/* No arguments, or a single help switch: show the usage text. */
	if (argc < 2 ||
            (argc == 2 &&
             (!strcasecmp(argv[1], "--help") ||
              !strcasecmp(argv[1], "-h") ||
              !strcasecmp(argv[1], "-help"))))
	{
		john_register_all(); /* for printing by opt_init() */
		show_usage = 1;
	}
	opt_init(name, argc, argv, show_usage);

	/*
	 * --list=? needs to be supported, because it has been supported in the released
	 * john-1.7.9-jumbo-6 version, and it is used by the bash completion script.
	 * --list=? is, however, no longer mentioned in doc/OPTIONS and in the usage
	 * output. Instead, --list=help is.
	 */
	if (options.listconf &&
	    (!strcasecmp(options.listconf, "help") ||
	     !strcmp(options.listconf, "?")))
	{
		john_list_options();
		exit(0);
	}
	if (options.listconf &&
	    (!strcasecmp(options.listconf, "help:help") ||
	     !strcasecmp(options.listconf, "help:")))
	{
		john_list_help_options();
		exit(0);
	}
	if (options.listconf && !strcasecmp(options.listconf, "help:format-methods"))
	{
		john_list_method_names();
		exit(0);
	}
	/*
	 * Any other "help:WHAT" must be one of the two values that are
	 * handled further down (they need the config to be loaded first).
	 */
	if (options.listconf && !strncasecmp(options.listconf, "help:", 5))
	{
		if (strcasecmp(options.listconf, "help:parameters") &&
		    strcasecmp(options.listconf, "help:list-data"))
		{
			fprintf(stderr,
			        "%s is not a --list option that supports additional values.\nSupported options:\n",
			        options.listconf+5);
			john_list_help_options();
			exit(1);
		}
	}
	if (options.listconf && !strcasecmp(options.listconf, "hidden-options"))
	{
		puts("--help                    print usage summary, just like running the command");
		puts("                          without any parameters");
		puts("--subformat=FORMAT        pick a benchmark format for --format=crypt");
		puts("--mkpc=N                  force a lower max. keys per crypt");
		puts("--length=N                force a lower max. length");
		puts("--field-separator-char=C  use 'C' instead of the ':' in input and pot files");
		puts("--fix-state-delay=N       performance tweak, see documentation");
		puts("--log-stderr              log to screen instead of file\n");
		exit(0);
	}

	if (!make_check) {
#if defined(_OPENMP) && OMP_FALLBACK
#if defined(__DJGPP__) || defined(__CYGWIN32__)
#error OMP_FALLBACK is incompatible with the current DOS and Win32 code
#endif
		/*
		 * With at most one OpenMP thread available, re-exec the
		 * fallback binary unless the environment forbids it.  execv()
		 * only returns on failure, in which case we report the error
		 * and carry on with this binary.
		 */
		if (!getenv("JOHN_NO_OMP_FALLBACK") &&
		    omp_get_max_threads() <= 1) {
#define OMP_FALLBACK_PATHNAME JOHN_SYSTEMWIDE_EXEC "/" OMP_FALLBACK_BINARY
			execv(OMP_FALLBACK_PATHNAME, argv);
			perror("execv: " OMP_FALLBACK_PATHNAME);
		}
#endif

		path_init(argv);

		if (options.listconf && !strcasecmp(options.listconf,
		                                    "build-info"))
		{
			puts("Version: " JOHN_VERSION);
			puts("Build: " JOHN_BLD _MP_VERSION);
			printf("Arch: %d-bit %s\n", ARCH_BITS,
			       ARCH_LITTLE_ENDIAN ? "LE" : "BE");
#if JOHN_SYSTEMWIDE
			puts("System-wide exec: " JOHN_SYSTEMWIDE_EXEC);
			puts("System-wide home: " JOHN_SYSTEMWIDE_HOME);
			puts("Private home: " JOHN_PRIVATE_HOME);
#endif
			printf("$JOHN is %s\n", path_expand("$JOHN/"));
			printf("Format interface version: %d\n", FMT_MAIN_VERSION);
			puts("Rec file version: " RECOVERY_V);
			puts("Charset file version: " CHARSET_V);
			printf("CHARSET_MIN: %d (0x%02x)\n", CHARSET_MIN,
			       CHARSET_MIN);
			printf("CHARSET_MAX: %d (0x%02x)\n", CHARSET_MAX,
			       CHARSET_MAX);
			printf("CHARSET_LENGTH: %d\n", CHARSET_LENGTH);
			printf("Max. Markov mode level: %d\n", MAX_MKV_LVL);
			printf("Max. Markov mode password length: %d\n", MAX_MKV_LEN);
#ifdef __VERSION__
			printf("Compiler version: %s\n", __VERSION__);
#endif
#ifdef __GNUC__
			printf("gcc version: %d.%d.%d\n", __GNUC__,
			       __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#endif
#ifdef __ICC
			printf("icc version: %d\n", __ICC);
#endif
#ifdef __clang_version__
			printf("clang version: %s\n", __clang_version__);
#endif
#ifdef OPENSSL_VERSION_NUMBER
			// The man page suggests the type of OPENSSL_VERSION_NUMBER is long,
			// gcc insists it is int.
			printf("OpenSSL library version: %lx", (unsigned long)OPENSSL_VERSION_NUMBER);
			// FIXME: How do I detect a missing library?
			// Even if it is extremely unlikely that openssl is missing,
			// at least flush all output buffers...
			fflush(NULL);
			if ((unsigned long)OPENSSL_VERSION_NUMBER != (unsigned long)SSLeay())
				printf("\t(loaded: %lx)", (unsigned long)SSLeay());
			printf("\n");
#endif
			exit(0);
		}
	}

	if (options.listconf && !strcasecmp(options.listconf, "encodings"))
	{
		listEncodings();
		exit(0);
	}
#ifdef CL_VERSION_1_0
	if (options.listconf && !strcasecmp(options.listconf, "opencl-devices"))
	{
		listOpenCLdevices();
		exit(0);
	}
#endif
#ifdef HAVE_CUDA
	if (options.listconf && !strcasecmp(options.listconf, "cuda-devices"))
	{
		cuda_device_list();
		exit(0);
	}
#endif

	/* Load the configuration: an explicit --config file takes precedence. */
	if (!make_check) {
		if (options.config)
		{
			path_init_ex(options.config);
			cfg_init(options.config, 1);
			cfg_init(CFG_FULL_NAME, 1);
			cfg_init(CFG_ALT_NAME, 0);
		}
		else
		{
#if JOHN_SYSTEMWIDE
			cfg_init(CFG_PRIVATE_FULL_NAME, 1);
			cfg_init(CFG_PRIVATE_ALT_NAME, 1);
#endif
			cfg_init(CFG_FULL_NAME, 1);
			cfg_init(CFG_ALT_NAME, 0);
		}
	}

	/* This is --crack-status. We toggle here, so if it's enabled in
	   john.conf, we can disable it using the command line option */
	if (cfg_get_bool(SECTION_OPTIONS, NULL, "CrackStatus", 0))
		options.flags ^= FLG_CRKSTAT;

	initUnicode(UNICODE_UNICODE); /* Init the unicode system */

	john_register_all(); /* maybe restricted to one format by options */
	if ((options.subformat && !strcasecmp(options.subformat, "list")) ||
	    (options.listconf && !strcasecmp(options.listconf, "subformats")))
	{
		dynamic_DISPLAY_ALL_FORMATS();
		/* NOTE if we have other 'generics', like sha1, sha2, rc4, ...
		 * then EACH of them should have a DISPLAY_ALL_FORMATS()
		 * function and we can call them here. */
		exit(0);
	}

	if (options.listconf && !strcasecmp(options.listconf, "inc-modes"))
	{
		cfg_print_subsections("Incremental", NULL, NULL, 0);
		exit(0);
	}
	if (options.listconf && !strcasecmp(options.listconf, "rules"))
	{
		cfg_print_subsections("List.Rules", NULL, NULL, 0);
		exit(0);
	}
	if (options.listconf && !strcasecmp(options.listconf, "externals"))
	{
		cfg_print_subsections("List.External", NULL, NULL, 0);
		exit(0);
	}
	if (options.listconf && !strcasecmp(options.listconf, "sections"))
	{
		cfg_print_section_names(0);
		exit(0);
	}
	/* --list=parameters=SECTION (':' is accepted instead of '='). */
	if (options.listconf &&
	    !strncasecmp(options.listconf, "parameters", 10) &&
	    (options.listconf[10] == '=' || options.listconf[10] == ':') &&
	    options.listconf[11] != '\0')
	{
		cfg_print_section_params(&options.listconf[11], NULL);
		exit(0);
	}
	/* --list=list-data=SECTION (':' is accepted instead of '='). */
	if (options.listconf &&
	    !strncasecmp(options.listconf, "list-data", 9) &&
	    (options.listconf[9] == '=' || options.listconf[9] == ':') &&
	    options.listconf[10] != '\0')
	{
		cfg_print_section_list_lines(&options.listconf[10], NULL);
		exit(0);
	}
	if (options.listconf && !strcasecmp(options.listconf, "ext-filters"))
	{
		cfg_print_subsections("List.External", "filter", NULL, 0);
		exit(0);
	}
	if (options.listconf && !strcasecmp(options.listconf, "ext-filters-only"))
	{
		cfg_print_subsections("List.External", "filter", "generate", 0);
		exit(0);
	}
	if (options.listconf && !strcasecmp(options.listconf, "ext-modes"))
	{
		cfg_print_subsections("List.External", "generate", NULL, 0);
		exit(0);
	}

	/*
	 * --list=formats: print all format labels, comma separated and
	 * wrapped before column 78.  All labels starting with "dynamic"
	 * are collapsed into the single entry "dynamic_n".
	 */
	if (options.listconf &&
	    !strcasecmp(options.listconf, "formats")) {
		int column;
		struct fmt_main *format;
		int i, dynamics = 0;
		char **formats_list;

		/* Count every registered format, the list head included. */
		i = 0;
		for (format = fmt_list; format; format = format->next)
			i++;

		/*
		 * One slot per format plus one for the NULL terminator that
		 * is stored after the last label.
		 */
		formats_list = malloc(sizeof(*formats_list) * (i + 1));
		if (!formats_list) {
			fprintf(stderr, "Failed to allocate memory for the list of formats\n");
			exit(1);
		}

		i = 0;
		format = fmt_list;
		do {
			char *label = format->params.label;
			if (!strncmp(label, "dynamic", 7)) {
				/* Only the first dynamic format gets an entry. */
				if (dynamics++)
					continue;
				else
					label = "dynamic_n";
			}
			formats_list[i++] = label;
		} while ((format = format->next));
		formats_list[i] = NULL;

		column = 0;
		i = 0;
		do {
			int length;
			char *label = formats_list[i++];
			/* +2 accounts for the ", " separator. */
			length = strlen(label) + 2;
			column += length;
			if (column > 78) {
				printf("\n");
				column = length;
			}
			printf("%s%s", label, formats_list[i] ? ", " : "\n");
		} while (formats_list[i]);
		free(formats_list);
		exit(0);
	}
	/* --list=format-details: one tab separated line per format. */
	if (options.listconf &&
	    !strcasecmp(options.listconf, "format-details")) {
		struct fmt_main *format;
		format = fmt_list;
		do {
			int ntests = 0;

			/* Count the test vectors up to the NULL ciphertext sentinel. */
			if(format->params.tests) {
				while (format->params.tests[ntests++].ciphertext);
				ntests--;
			}
			printf("%s\t%d\t%d\t%d\t%08x\t%d\t%s\t%s\t%s\t%d\t%d\t%d\n",
			       format->params.label,
			       format->params.plaintext_length,
			       format->params.min_keys_per_crypt,
			       format->params.max_keys_per_crypt,
			       format->params.flags,
			       ntests,
			       format->params.algorithm_name,
			       format->params.format_name,
			       format->params.benchmark_comment,
			       format->params.benchmark_length,
			       format->params.binary_size,
			       ((format->params.flags & FMT_DYNAMIC) && format->params.salt_size) ?
			       // salts are handled internally within the format. We want to know the 'real' salt size
			       // dynamic will always set params.salt_size to 0 or sizeof a pointer.
			       dynamic_real_salt_length(format) : format->params.salt_size);
		} while ((format = format->next));
		exit(0);
	}
	/* --list=format-all-details: the same data, one labelled line per attribute. */
	if (options.listconf &&
	    !strcasecmp(options.listconf, "format-all-details")) {
		struct fmt_main *format;
		format = fmt_list;
		do {
			int ntests = 0;

			/* Count the test vectors up to the NULL ciphertext sentinel. */
			if(format->params.tests) {
				while (format->params.tests[ntests++].ciphertext);
				ntests--;
			}
			/*
			 * attributes should be printed in the same sequence
			 * as with format-details, but human-readable
			 */
			printf("Format label                    \t%s\n", format->params.label);
			printf("Max. password length in bytes   \t%d\n", format->params.plaintext_length);
			printf("Min. keys per crypt             \t%d\n", format->params.min_keys_per_crypt);
			printf("Max. keys per crypt             \t%d\n", format->params.max_keys_per_crypt);
			printf("Flags\n");
			printf(" Case sensitive                 \t%s\n", (format->params.flags & FMT_CASE) ? "yes" : "no");
			printf(" Supports 8-bit characters      \t%s\n", (format->params.flags & FMT_8_BIT) ? "yes" : "no");
			printf(" Converts 8859-1 to UTF-16/UCS-2\t%s\n", (format->params.flags & FMT_UNICODE) ? "yes" : "no");
			printf(" Honours --encoding=NAME        \t%s\n", (format->params.flags & FMT_UTF8) ? "yes" : "no");
			printf(" False positives possible       \t%s\n", (format->params.flags & FMT_NOT_EXACT) ? "yes" : "no");
			printf(" Uses a bitslice implementation \t%s\n", (format->params.flags & FMT_BS) ? "yes" : "no");
			printf(" The split() method unifies case\t%s\n", (format->params.flags & FMT_SPLIT_UNIFIES_CASE) ? "yes" : "no");
			printf(" A $dynamic$ format             \t%s\n", (format->params.flags & FMT_DYNAMIC) ? "yes" : "no");
#ifdef _OPENMP
			printf(" Parallelized with OpenMP       \t%s\n", (format->params.flags & FMT_OMP) ? "yes" : "no");
#endif
			printf("Number of test cases for --test \t%d\n", ntests);
			printf("Algorithm name                  \t%s\n", format->params.algorithm_name);
			printf("Format name                     \t%s\n", format->params.format_name);
			printf("Benchmark comment               \t%s\n", format->params.benchmark_comment);
			printf("Benchmark length                \t%d\n", format->params.benchmark_length);
			printf("Binary size                     \t%d\n", format->params.binary_size);
			printf("Salt size                       \t%d\n",
			       ((format->params.flags & FMT_DYNAMIC) && format->params.salt_size) ?
			       // salts are handled internally within the format. We want to know the 'real' salt size
			       // dynamic will always set params.salt_size to 0 or sizeof a pointer.
			       dynamic_real_salt_length(format) : format->params.salt_size);
			printf("\n");
		} while ((format = format->next));
		exit(0);
	}
	/*
	 * --list=format-methods[=METHOD]: for each format, print which
	 * methods differ from the fmt_default_* implementations.  With a
	 * METHOD given, only formats overriding that method are shown.
	 */
	if (options.listconf &&
	    !strncasecmp(options.listconf, "format-methods", 14)) {
		struct fmt_main *format;
		format = fmt_list;
		do {
			int ShowIt = 1, i;
			if (options.listconf[14] == '=' || options.listconf[14] == ':') {
				/* The method name following the '=' or ':'. */
				const char *method = &options.listconf[15];

				ShowIt = 0;
				/* These have no default, so every format defines them. */
				if (!strcasecmp(method, "set_key")   ||
					!strcasecmp(method, "get_key")   ||
					!strcasecmp(method, "crypt_all") ||
					!strcasecmp(method, "cmp_all")   ||
					!strcasecmp(method, "cmp_one")  ||
					!strcasecmp(method, "cmp_exact"))
					ShowIt = 1;
				/* Anything that is not a known method name is an error. */
				else if (strcasecmp(method, "init") && strcasecmp(method, "prepare") &&
					strcasecmp(method, "valid") && strcasecmp(method, "split") &&
					strcasecmp(method, "binary") && strcasecmp(method, "clear_keys") &&
					strcasecmp(method, "salt") && strcasecmp(method, "get_hash") &&
					strcasecmp(method, "get_hash[0]") && strcasecmp(method, "get_hash[1]") &&
					strcasecmp(method, "get_hash[2]") && strcasecmp(method, "get_hash[3]") &&
					strcasecmp(method, "get_hash[4]") && strcasecmp(method, "get_hash[5]") &&
					strcasecmp(method, "set_salt") && strcasecmp(method, "binary_hash") &&
					strcasecmp(method, "binary_hash[0]") && strcasecmp(method, "binary_hash[1]") &&
					strcasecmp(method, "binary_hash[2]") && strcasecmp(method, "binary_hash[3]") &&
					strcasecmp(method, "binary_hash[4]") && strcasecmp(method, "binary_hash[5]") &&
					strcasecmp(method, "salt_hash"))
				{
					fprintf(stderr, "Error, invalid option (invalid method name) %s\n", options.listconf);
					fprintf(stderr, "Valid method names are:\n");
					john_list_method_names();
					exit(1);
				}
				if (format->methods.init != fmt_default_init && !strcasecmp(method, "init"))
					ShowIt = 1;
				if (format->methods.prepare != fmt_default_prepare && !strcasecmp(method, "prepare"))
					ShowIt = 1;
				if (format->methods.valid != fmt_default_valid && !strcasecmp(method, "valid"))
					ShowIt = 1;
				if (format->methods.split != fmt_default_split && !strcasecmp(method, "split"))
					ShowIt = 1;
				if (format->methods.binary != fmt_default_binary && !strcasecmp(method, "binary"))
					ShowIt = 1;
				if (format->methods.salt != fmt_default_salt && !strcasecmp(method, "salt"))
					ShowIt = 1;
				if (format->methods.clear_keys != fmt_default_clear_keys && !strcasecmp(method, "clear_keys"))
					ShowIt = 1;
				for (i = 0; i < 6; ++i) {
					char Buf[20];
					sprintf(Buf, "get_hash[%d]", i);
					if (format->methods.get_hash[i] && format->methods.get_hash[i] != fmt_default_get_hash && !strcasecmp(method, Buf))
						ShowIt = 1;
				}
				/* A bare "get_hash" is treated like "get_hash[0]". */
				if (format->methods.get_hash[0] && format->methods.get_hash[0] != fmt_default_get_hash && !strcasecmp(method, "get_hash"))
					ShowIt = 1;

				for (i = 0; i < 6; ++i) {
					char Buf[20];
					sprintf(Buf, "binary_hash[%d]", i);
					if (format->methods.binary_hash[i] && format->methods.binary_hash[i] != fmt_default_binary_hash && !strcasecmp(method, Buf))
						ShowIt = 1;
				}
				/* A bare "binary_hash" is treated like "binary_hash[0]". */
				if (format->methods.binary_hash[0] && format->methods.binary_hash[0] != fmt_default_binary_hash && !strcasecmp(method, "binary_hash"))
					ShowIt = 1;
				if (format->methods.salt_hash != fmt_default_salt_hash && !strcasecmp(method, "salt_hash"))
					ShowIt = 1;
				if (format->methods.set_salt != fmt_default_set_salt && !strcasecmp(method, "set_salt"))
					ShowIt = 1;
			}
			if (ShowIt) {
				printf("Methods overridden for:   %s [%s] %s\n", format->params.label, format->params.algorithm_name, format->params.format_name);
				if (format->methods.init != fmt_default_init)
					printf("\tinit()\n");
				if (format->methods.prepare != fmt_default_prepare)
					printf("\tprepare()\n");
				if (format->methods.valid != fmt_default_valid)
					printf("\tvalid()\n");
				if (format->methods.split != fmt_default_split)
					printf("\tsplit()\n");
				if (format->methods.binary != fmt_default_binary)
					printf("\tbinary()\n");
				if (format->methods.salt != fmt_default_salt)
					printf("\tsalt()\n");
				for (i = 0; i < 6; ++i)
					if (format->methods.binary_hash[i] != fmt_default_binary_hash) {
						if (format->methods.binary_hash[i])
							printf("\t\tbinary_hash[%d]()\n", i);
						else
							printf("\t\tbinary_hash[%d]()  (NULL pointer)\n", i);
					}
				if (format->methods.salt_hash != fmt_default_salt_hash)
					printf("\tsalt_hash()\n");
				if (format->methods.set_salt != fmt_default_set_salt)
					printf("\tset_salt()\n");
				// there is no default for set_key() it must be defined.
				printf("\tset_key()\n");
				// there is no default for get_key() it must be defined.
				printf("\tget_key()\n");
				if (format->methods.clear_keys != fmt_default_clear_keys)
					printf("\tclear_keys()\n");
				for (i = 0; i < 6; ++i)
					if (format->methods.get_hash[i] != fmt_default_get_hash) {
						if (format->methods.get_hash[i])
							printf("\t\tget_hash[%d]()\n", i);
						else
							printf("\t\tget_hash[%d]()  (NULL pointer)\n", i);
					}
				// there is no default for crypt_all() it must be defined.
				printf("\tcrypt_all()\n");
				// there is no default for cmp_all() it must be defined.
				printf("\tcmp_all()\n");
				// there is no default for cmp_one() it must be defined.
				printf("\tcmp_one()\n");
				// there is no default for cmp_exact() it must be defined.
				printf("\tcmp_exact()\n");
				printf("\n\n");
			}
		} while ((format = format->next));
		exit(0);
	}
	/*
	 * Other --list=help:WHAT are processed earlier, but these require
	 * a valid config:
	 */
	if (options.listconf && !strcasecmp(options.listconf, "help:parameters"))
	{
		cfg_print_section_names(1);
		exit(0);
	}
	if (options.listconf && !strcasecmp(options.listconf, "help:list-data"))
	{
		cfg_print_section_names(2);
		exit(0);
	}

	/* --list last resort: list subsections of any john.conf section name */
	if (options.listconf)
	{
		//printf("Subsections of [%s]:\n", options.listconf);
		if (cfg_print_subsections(options.listconf, NULL, NULL, 1))
			exit(0);
		else {
			fprintf(stderr, "Section [%s] not found.\n", options.listconf);
			/* Just in case the user specified an invalid value
			 * like help or list...
			 * print the same list as with --list=?, but exit(1)
			 */
			john_list_options();
			exit(1);
		}
	}

#ifdef CL_VERSION_1_0
	/*
	 * Platform/device not given on the command line: take them from
	 * the config, and auto-detect when either is still unset (-1).
	 */
	if (!options.ocl_platform) {
		if ((options.ocl_platform =
		     cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, "Platform")))
			platform_id = atoi(options.ocl_platform);
		else
			platform_id = -1;
	}
	if (!options.gpu_device) {
		if ((options.gpu_device =
		     cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, "Device")))
			ocl_gpu_id = atoi(options.gpu_device);
		else
			ocl_gpu_id = -1;
	}
	if (platform_id == -1 || ocl_gpu_id == -1)
		opencl_find_gpu(&ocl_gpu_id, &platform_id);
#endif

	common_init();
	sig_init();

	john_load();

	if (options.encodingStr && options.encodingStr[0])
		log_event("- %s input encoding enabled", options.encodingStr);
}
Exemple #19
0
/*
 * MPI + OpenMP driver for the Poisson solver.
 *
 * Usage: poisson n     (n: the problem size, must be a power of 2)
 *
 * The m = n-1 columns of the right hand side are distributed over the MPI
 * ranks as column strips; every rank transforms its strip with fst_() /
 * fstinv_(), exchanges data through transpose(), scales by the eigenvalues
 * and transforms back.  Rank 0 finally prints the maximum of the solution,
 * the maximum error and the elapsed wall-clock time.
 *
 * Returns 0 on success and 1 when the problem size argument is missing.
 *
 * NOTE(review): the buffers obtained from mk_1D_array()/mk_2D_array() are
 * never released; their allocation layout is not visible here, so no free
 * calls are added.
 */
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int nprocs, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int numthreads = omp_get_max_threads();

    // Without the size argument argv[1] is NULL, so stop right after
    // printing the usage text instead of handing NULL to atoi().
    if (argc < 2) {
        if (rank == 0) {
            printf("Usage:\n");
            printf("  poisson n\n\n");
            printf("Arguments:\n");
            printf("  n: the problem size (must be a power of 2)\n");
        }
        MPI_Finalize();
        return 1;
    }

    // Only rank 0 reports the timing, so only rank 0 starts the clock.
    double time_start = 0.0;
    if (rank == 0) {
        time_start = MPI_Wtime();
    }

    // The number of grid points in each direction is n+1
    // The number of degrees of freedom in each direction is n-1 = m
    int n = atoi(argv[1]);
    int m = n - 1;
    int nn = 4 * n;   // length of each per-thread work buffer in z
    real h = 1.0 / n;

    // Splitting the matrix into columns:
    // every rank gets 'exact' columns, the last rank gets the rest ('rem').
    int exact = m/nprocs;
    int rem = m - (nprocs - 1)*exact;
    // Each process owns a strip matrix of size m*exact or m*rem.
    // We consider each such matrix to be made of 'nprocs' blocks vertically.
    int block_col = exact;
    int block_uk = exact*exact;
    int rem_uk = exact*rem;
    // For the last such strip, number of columns is rem. Consequently:
    if (rank == nprocs-1){
        block_col = rem;
        block_uk = rem*exact;
        rem_uk = rem*rem;
    }

    // Grid points
    real *grid = mk_1D_array(n+1, false);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < n+1; i++) {
        grid[i] = i * h;
    }

    // The diagonal of the eigenvalue matrix of T
    real *diag = mk_1D_array(m, false);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < m; i++) {
        diag[i] = 2.0 * (1.0 - cos((i+1) * PI / n));
    }

    // Initialize the right hand side data
    // B is the column strip that the process owns; rank*exact is the
    // global index of its first column.
    real **B = mk_2D_array(block_col, m, false);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        for (size_t j = 0; j < m; j++) {
            B[i][j] = h * h * rhs(grid[i+1+(rank*exact)], grid[j+1]);
        }
    }

    // For the Sine Transform: one work buffer of length nn per thread,
    // selected with omp_get_thread_num() inside the parallel loops.
    real **z = mk_2D_array(numthreads, nn, false);

    // Calculate Btilde^T = S^-1 * (S * B)^T
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        fst_(B[i], &n, z[omp_get_thread_num()], &nn);
    }
    transpose(B, block_col, m, nprocs, block_uk, rem_uk, rank);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        fstinv_(B[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Solve Lambda * Xtilde = Btilde
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        for (size_t j = 0; j < m; j++) {
            B[i][j] = B[i][j] / (diag[i+(rank*exact)] + diag[j]);
        }
    }

    // Calculate X = S^-1 * (S * Xtilde^T) ^ T
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        fst_(B[i], &n, z[omp_get_thread_num()], &nn);
    }
    transpose(B, block_col, m, nprocs, block_uk, rem_uk, rank);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < block_col; i++) {
        fstinv_(B[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Calculate the local maximum of the solution and of the error.
    // The reference value is sin(PI*x)*sin(2*PI*y); presumably this is the
    // analytic solution matching rhs() -- confirm against rhs().
    double U_max = 0.0, e_max = 0.0, global_max, global_emax, error;
    for (size_t i = 0; i < block_col; i++){
        for (size_t j = 0; j < m; j++){
            error = fabs(B[i][j] - sin(PI*(i+1+(rank*exact))*h)*sin(2*PI*(j+1)*h));
            U_max = U_max > B[i][j] ? U_max : B[i][j];
            e_max = e_max > error ? e_max : error;
        }
    }

    // MPI_Max to find the true maximum (result is valid on rank 0 only):
    MPI_Reduce(&U_max, &global_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&e_max, &global_emax, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    // Print the Global Maximum on process 0:
    if (rank == 0){
        printf("Problem Size = %d\tNumprocs = %d\tNumthreads = %d\n", n, nprocs, numthreads);
        printf("U_max = %0.16f\t", global_max);
        printf("E_max = %0.16f\t", global_emax);
        double duration = MPI_Wtime() - time_start ;
        printf("Execution Time: %0.16f \n", duration);
    }

    MPI_Finalize();
    return 0;
}
void PrefixJaccardScore<AttributeT>::run() {
	// Computes, for every edge {u, v}, the best Jaccard value over growing
	// prefixes of the attribute-ranked neighborhoods of u and v, and stores
	// it in scoreData[eid].
	// Note: this-> is needed to reach base class members from a template class.
	if (!this->G.hasEdgeIds()) {
		throw std::runtime_error("Error, edges need to be indexed first");
	}

	this->scoreData.clear();
	this->scoreData.resize(this->G.upperEdgeIdBound());

	// One adjacency entry: the neighbor u, the attribute of the connecting
	// edge and the rank of that attribute within the owning neighborhood.
	struct RankedEdge {
		node u;
		AttributeT att;
		count rank;

		RankedEdge(node u, AttributeT att, count rank) : u(u), att(att), rank(rank) {}

		bool operator<(const RankedEdge &other) const {
			return std::tie(rank, att, u) < std::tie(other.rank, other.att, other.u);
		}

		bool operator>(const RankedEdge &other) const {
			return std::tie(rank, att, u) > std::tie(other.rank, other.att, other.u);
		}
	};

	const auto nodeBound = G.upperNodeIdBound();

	// Flat adjacency storage: the entries of node x occupy
	// [firstEntry[x], firstEntry[x+1]) inside entries.
	std::vector<size_t> firstEntry(nodeBound + 1);
	std::vector<RankedEdge> entries;
	entries.reserve(2*G.numberOfEdges());

	for (node x = 0; x < nodeBound; ++x) {
		firstEntry[x] = entries.size();
		if (!G.hasNode(x)) {
			continue;
		}
		G.forEdgesOf(x, [&](node, node y, edgeid eid) {
			entries.emplace_back(y, inAttribute[eid], 0);
		});
	}
	firstEntry[nodeBound] = entries.size();

	// Sort each neighborhood in descending order and assign ranks; entries
	// with equal attribute share the rank of the first entry of their group.
	this->G.balancedParallelForNodes([&](node x) {
		if (this->G.degree(x) == 0) return;

		const auto first = entries.begin() + firstEntry[x];
		const auto last = entries.begin() + firstEntry[x+1];

		Aux::Parallel::sort(first, last, std::greater<RankedEdge>());

		AttributeT groupValue = first->att;
		count groupRank = 0;
		count groupSize = 0;
		for (auto cur = first; cur != last; ++cur) {
			if (groupValue != cur->att) {
				// A new attribute value starts: its rank is the number of
				// entries that precede it.
				groupRank += groupSize;
				groupValue = cur->att;
				groupSize = 1;
			} else {
				++groupSize;
			}

			cur->rank = groupRank;
		}
	});

	// Per-thread scratch bit sets, indexed by node id.
	std::vector<std::vector<bool>> uMarker(omp_get_max_threads(), std::vector<bool>(nodeBound, false));
	std::vector<std::vector<bool>> vMarker(uMarker);

	this->G.parallelForEdges([&](node u, node v, edgeid eid) {
		const auto tid = omp_get_thread_num();
		auto &seenFromU = uMarker[tid];
		auto &seenFromV = vMarker[tid];

		auto uCur = entries.begin() + firstEntry[u];
		auto vCur = entries.begin() + firstEntry[v];
		const auto uLast = entries.begin() + firstEntry[u+1];
		const auto vLast = entries.begin() + firstEntry[v+1];

		double best = 0;
		count shared = 0; // neighbors seen from both endpoints
		count onlyU = 0;  // neighbors seen so far only from u
		count onlyV = 0;  // neighbors seen so far only from v

		// Walk both neighborhoods rank by rank and evaluate the Jaccard
		// value after each rank has been consumed.
		for (count level = 0; uCur != uLast || vCur != vLast; ++level) {
			for (; uCur != uLast && level == uCur->rank; ++uCur) {
				const node w = uCur->u;
				if (w == v) {
					continue; // the edge {u, v} itself is ignored
				}

				if (seenFromV[w]) {
					seenFromV[w] = false;
					++shared;
					--onlyV;
				} else {
					seenFromU[w] = true;
					++onlyU;
				}
			}

			for (; vCur != vLast && level == vCur->rank; ++vCur) {
				const node w = vCur->u;
				if (w == u) {
					continue; // the edge {u, v} itself is ignored
				}

				if (seenFromU[w]) {
					seenFromU[w] = false;
					++shared;
					--onlyU;
				} else {
					seenFromV[w] = true;
					++onlyV;
				}
			}

			best = std::max(best, shared * 1.0 / (onlyU + onlyV + shared));
		}

		// Reset the scratch bits so the next edge on this thread starts clean.
		G.forNeighborsOf(u, [&](node w) {
			seenFromU[w] = false;
		});

		G.forNeighborsOf(v, [&](node w) {
			seenFromV[w] = false;
		});

		this->scoreData[eid] = best;
	});

	this->hasRun = true;
}
inline
int OpenMPTarget::thread_pool_size( int depth )
{
  // Reports the value of omp_get_max_threads(). The 'depth' argument is
  // not consulted; a previous variant forwarded it to
  // Impl::OpenMPTargetExec::pool_size(depth) instead.
  const int pool_size = omp_get_max_threads();
  return pool_size;
}
Exemple #22
0
Fichier : pxz.c Projet : ip1981/pxz
int main( int argc, char **argv ) {
	int i;
	uint64_t p, threads, chunk_size;
	uint8_t *m;
	struct stat s;
	ssize_t rd, ts = 0;
	size_t page_size;
	struct sigaction new_action, old_action;
	struct utimbuf u;
	lzma_filter filters[LZMA_FILTERS_MAX + 1];
	lzma_options_lzma lzma_options;
	
	page_size = sysconf(_SC_PAGE_SIZE);
	xzcmd = malloc(xzcmd_max);
	if (!xzcmd) {
		fprintf(stderr, "Failed to allocate %lu bytes for xz command.\n", xzcmd_max);
		return -1;
	}
	snprintf(xzcmd, xzcmd_max, XZ_BINARY);
	
	parse_args(argc, argv);

	lzma_lzma_preset(&lzma_options, opt_complevel);

	filters[0].id = LZMA_FILTER_LZMA2;
	filters[0].options = &lzma_options;
	filters[1].id = LZMA_VLI_UNKNOWN;

	
	for (i=0; i<files; i++) {
		int std_in = file[i][0] == '-' && file[i][1] == '\0';
#ifdef _OPENMP
		threads = omp_get_max_threads();
#else
		threads = 1;
#endif
		if ( (rd=strlen(file[i])) >= 3 && !strncmp(&file[i][rd-3], ".xz", 3) ) {
			if (opt_verbose) {
				error(EXIT_FAILURE, 0, "ignoring '%s', it seems to be already compressed", file[i]);
			}
			continue;
		}
		
		if ( !std_in ) {
			if ( stat(file[i], &s)) {
				error(EXIT_FAILURE, errno, "can't stat '%s'", file[i]);
			}
		}
		
		chunk_size = opt_context_size * lzma_options.dict_size;
		chunk_size = (chunk_size + page_size)&~(page_size-1);
		
		if ( opt_verbose ) {
			fprintf(stderr, "context size per thread: %"PRIu64" B\n", chunk_size);
		}
		
		if ( opt_threads && (threads > opt_threads || opt_force) ) {
			threads = opt_threads;
		}
		
		fo = stdout;
		if ( std_in ) {
			fi = stdin;
		} else {
			if ( !(fi=fopen(file[i], "rb")) ) {
				error(EXIT_FAILURE, errno, "can't open '%s' for reading", file[i]);
			}
			if ( !opt_stdout ) {
				snprintf(str, sizeof(str), "%s.xz", file[i]);
				if ( !(fo=fopen(str, "wb")) ) {
					error(EXIT_FAILURE, errno, "error creating target archive '%s'", str);
				}
			}
		}
		
		if ( opt_verbose ) {
			if ( fo != stdout ) {
				fprintf(stderr, "%s -> %"PRIu64"/%"PRIu64" thread%c: [", file[i], threads, (s.st_size+chunk_size-1)/chunk_size, threads != 1 ? 's' : ' ');
			} else {
				fprintf(stderr, "%"PRIu64" thread%c: [", threads, threads != 1 ? 's' : ' ');
			}
			fflush(stderr);
		}
		
		m  = malloc(threads*chunk_size);
		
		new_action.sa_handler = term_handler;
		sigemptyset (&new_action.sa_mask);
		new_action.sa_flags = 0;
		
		sigaction(SIGINT, NULL, &old_action);
		if (old_action.sa_handler != SIG_IGN) sigaction(SIGINT, &new_action, NULL);
		sigaction(SIGHUP, NULL, &old_action);
		if (old_action.sa_handler != SIG_IGN) sigaction(SIGHUP, &new_action, NULL);
		sigaction(SIGTERM, NULL, &old_action);
		if (old_action.sa_handler != SIG_IGN) sigaction(SIGTERM, &new_action, NULL);
		
		ftemp = malloc(threads*sizeof(ftemp[0]));
		
		while ( !feof(fi) ) {
			size_t actrd;
			
			for (p=0; p<threads; p++) {
				ftemp[p] = tmpfile();
			}
			
			for ( actrd=rd=0; !feof(fi) && !ferror(fi) && (uint64_t)rd < threads*chunk_size; rd += actrd) {
				actrd = fread(&m[rd], 1, threads*chunk_size-actrd, fi);
			}
			if (ferror(fi)) {
				error(EXIT_FAILURE, errno, "error in reading input");
			}

#pragma omp parallel for private(p) num_threads(threads)
			for ( p=0; p<(rd+chunk_size-1)/chunk_size; p++ ) {
				off_t pt, len = rd-p*chunk_size >= chunk_size ? chunk_size : rd-p*chunk_size;
				uint8_t *mo;
				lzma_stream strm = LZMA_STREAM_INIT;
				lzma_ret ret;
				
				mo = malloc(BUFFSIZE);
				
				if ( lzma_stream_encoder(&strm, filters, LZMA_CHECK_CRC64) != LZMA_OK ) {
					error(EXIT_FAILURE, errno, "unable to initialize LZMA encoder");
				}
				
				for (pt=0; pt<len; pt+=BUFFSIZE) {
					strm.next_in = &m[p*chunk_size+pt];
					strm.avail_in = len-pt >= BUFFSIZE ? BUFFSIZE : len-pt;
					strm.next_out = mo;
					strm.avail_out = BUFFSIZE;
					do {
						ret = lzma_code(&strm, LZMA_RUN);
						if ( ret != LZMA_OK ) {
							error(EXIT_FAILURE, 0, "error in LZMA_RUN");
						}
						if ( BUFFSIZE - strm.avail_out > 0 ) {
							if ( !fwrite(mo, 1, BUFFSIZE - strm.avail_out, ftemp[p]) ) {
								error(EXIT_FAILURE, errno, "writing to temp file failed");
							}
							strm.next_out = mo;
							strm.avail_out = BUFFSIZE;
						}
					} while ( strm.avail_in );
				}
				
				strm.next_out = mo;
				strm.avail_out = BUFFSIZE;
				do {
					ret = lzma_code(&strm, LZMA_FINISH);
					if ( ret != LZMA_OK && ret != LZMA_STREAM_END ) {
						error(EXIT_FAILURE, 0, "error in LZMA_FINISH");
					}
					if ( BUFFSIZE - strm.avail_out > 0 ) {
						if ( !fwrite(mo, 1, BUFFSIZE - strm.avail_out, ftemp[p]) ) {
							error(EXIT_FAILURE, errno, "writing to temp file failed");
						}
						strm.next_out = mo;
						strm.avail_out = BUFFSIZE;
					}
				} while ( ret == LZMA_OK );
				lzma_end(&strm);
				
				free(mo);
				
				if ( opt_verbose ) {
					fprintf(stderr, "%"PRIu64" ", p);
					fflush(stderr);
				}
			}
			
			for ( p=0; p<threads; p++ ) {
				rewind(ftemp[p]);
				while ( (rd=fread(buf, 1, sizeof(buf), ftemp[p])) > 0 ) {
					if ( fwrite(buf, 1, rd, fo) != (size_t)rd ) {
						error(0, errno, "writing to archive failed");
						if ( fo != stdout && unlink(str) ) {
							error(0, errno, "error deleting corrupted target archive %s", str);
						}
						exit(EXIT_FAILURE);
					} else ts += rd;
				}
				if (rd < 0) {
					error(0, errno, "reading from temporary file failed");
					if ( fo != stdout && unlink(str) ) {
						error(0, errno, "error deleting corrupted target archive %s", str);
					}
					exit(EXIT_FAILURE);
				}
				if ( close_stream(ftemp[p]) ) {
					error(0, errno, "I/O error in temp file");
				}
			}
		}
		
		if ( fi != stdin && close_stream(fi) ) {
			error(0, errno, "I/O error in input file");
		}
		
		if ( opt_verbose ) {
			fprintf(stderr, "] ");
		}

		free(ftemp);
		
		if ( fo != stdout ) {
			if ( close_stream(fo) ) {
				error(0, errno, "I/O error in target archive");
			}
		} else return 0;
		
		if ( chmod(str, s.st_mode) ) {
			error(0, errno, "warning: unable to change archive permissions");
		}

		u.actime = s.st_atime;
		u.modtime = s.st_mtime;
		
		if ( utime(str, &u) ) {
			error(0, errno, "warning: unable to change archive timestamp");
		}
		
		sigaction(SIGINT, &old_action, NULL);
		sigaction(SIGHUP, &old_action, NULL);
		sigaction(SIGTERM, &old_action, NULL);
		
		if ( opt_verbose ) {
			fprintf(stderr, "%"PRIu64" -> %zd %3.3f%%\n", s.st_size, ts, ts*100./s.st_size);
		}
		
		if ( !opt_keep && unlink(file[i]) ) {
			error(0, errno, "error deleting input file %s", file[i]);
		}
	}
	
	return 0;
}
Exemple #23
0
    // Configure a BKZproperty from the caller's arguments and option bit
    // mask, then hand the basis to BKZmain.
    //
    //   L        lattice basis passed through to BKZmain
    //   index    not used by this function
    //   beta     block size; 0.6*beta is stored as the second block size
    //   prob     stored as the pruning probability
    //   alpha    stored as the initial radius (init_mode 'G')
    //   tourlim  stored as the tour limit
    //   vl       stored as the verbosity level
    //   opt      bitwise OR of OPT_* flags selecting the optional features
    //
    // Returns whatever BKZmain(L,0,...) returns.
    int fixedparamBKZ(mat_ZZ &L,int index,int beta,double prob, double alpha, int tourlim,int vl,int opt) {

        sharememalloc();
        pruning_func::init_pruning_func();

        BKZproperty params;

        // Settings that do not depend on the option mask.
        params.enumlim[0] = 1000000;    // limit of #processed nodes (/10^8)
        params.enumlim[1] = 1;
        params.beta[0] = beta;          // main block size
        params.beta[1] = beta*0.6;      // block size used by preprocess strategy 1
        params.tourlim = tourlim;
        params.breakindex = -1;
        params.verboselevel = vl;
        params.pruning_prob = prob;
        params.init_radius = alpha;
        params.init_mode = 'G';         // R = alpha*GH(L)
        params.holdvecs = 16;

        // Threading: numthreads is only assigned on the multithreaded path.
        const bool useThreads = (opt & OPT_MULTITHREAD) != 0;
        params.multithread = useThreads;
        if (useThreads) {
            params.numthreads = omp_get_max_threads();
            params.MTlimit = params.numthreads*10000000;
            cout << "setting # threads = " << params.numthreads << endl;
        } else {
            params.MTlimit = 10000000;
        }

        // Pruning-function optimisation (runs the enumeration benchmark).
        if (opt & OPT_OPTIMIZE_PRUNING_FUNCTION) {
            params.optimizepf = true;
            params.optimizepf_at_least = 1000;
            lattice_enum::enum_speed_bench(params.numthreads);
        }

        // Preprocessing: strategy 2 when OPT_PREPROCESS2 is set, otherwise 1.
        if (opt & (OPT_PREPROCESS | OPT_PREPROCESS2)) {
            params.preprocess = true;
            params.preprocess_at_least = 1;
            params.preprocess_strategy = (opt & OPT_PREPROCESS2) ? 2 : 1;
            lattice_enum::enum_speed_bench(params.numthreads);
        } else {
            params.preprocess = false;
        }

        // Extend the block size while expected cost <= max cost of the current tour.
        if (opt & OPT_EXTENDBLOCKSIZE) {
            params.extend_blocksize = true;
            params.extend_blocksizemult = 2;
        }

        // Break at index 1.
        if (opt & OPT_FIRSTINDEX) {
            params.breakindex = 1;
        }

        // Skip a block if |b*i| < a*GH(L).
        if (opt & OPT_GHBASEDSKIP) {
            params.ghskip = true;
            params.ghskipcoeff = 1.025;
        }

        // Write a time log to a file.
        if (opt & OPT_TIMELOG) {
            params.tlname = "bkzlog.txt";
        }

        // Heuristic strategy for finding short vectors.
        if (opt & OPT_FIND_SHORT) {
            params.process_entire_basis = true;
            params.ec_boundalpha = 1.05;
        }

        // Reset the timing state (defined outside this function).
        cputime = 0;    // CPU time in seconds
        start = clock();

        return BKZmain(L,0,params);
    }
Exemple #24
0
void CudaDb::requestQuery(const KdRequest &r) {
    KdRequest request = r;

    if (request.type == RT_CPU)
        request.result = RequestResult(new std::vector<long>());
    this->queue.push(request);

    switch (request.type) {
#if USE_CUDA
    case RT_CUDA:
    case RT_CUDA_DP:
    case RT_CUDA_IM:
        request.numBlocks = this->numBlocks / this->devices.size();
        for (size_t i = 0; i < this->devices.size(); i++) {
            request.ranges = (uint64_t*) this->fRange.data() + (request.numBlocks) * i * 2 * r.query->size;
            request.keys = (TripKey*) this->fBin.data() + (request.numBlocks) * i * KdBlock::MAX_RECORDS_PER_BLOCK * (r.query->size + 1);
            this->devices[i]->push(request);
        }
        break;

    case RT_CUDA_PARTIAL_IM: {
        KdBlock::QueryResult result = this->kdb->execute(*request.query);
        int numBlocks = result.blocks->size();
        request.totalBlocks = this->numBlocks / this->devices.size();
        int tmpct = 0;
        for (size_t i = 0; i < this->devices.size(); i++) {
            request.keys = (TripKey*) this->fBin.data() + (request.totalBlocks) * i * KdBlock::MAX_RECORDS_PER_BLOCK * (r.query->size + 1);
            request.ranges = new uint64_t[numBlocks];
            int ctBlocks = 0;
            for (int k = 0; k < numBlocks; k++) {
                uint64_t blockId = result.blocks->at(k).second;
                blockId /= KdBlock::MAX_RECORDS_PER_BLOCK;
                int index = blockId / request.totalBlocks;
                if(index == i) {
                    request.ranges[ctBlocks ++] = (blockId % request.totalBlocks);
                }
            }
            request.numBlocks = ctBlocks;
            this->devices[i]->push(request);
            tmpct += ctBlocks;
        }
    }
        break;

    case RT_CUDA_PARTIAL: {
        request.keys = (TripKey*) this->fBin.data();
        KdBlock::QueryResult result = this->kdb->execute(*request.query);
        int numBlocks = result.blocks->size();
        for (size_t i = 0; i < this->devices.size(); i++) {
            request.numBlocks = numBlocks / this->devices.size();
            request.ranges = new uint64_t[request.numBlocks];
            for (int k = 0; k < request.numBlocks; k++) {
                request.ranges[k] = result.blocks->at(request.numBlocks * i + k).second;
            }
            this->devices[i]->push(request);
        }
    }
        break;
#endif

    case RT_CPU: {
        //			hlog << "CPU execution" << endl;
        size_t EXTRA_BLOCKS_PER_LEAF = this->keySize;
        int noKeys = this->keySize - 1;
        int gsize = request.noRegions;

        KdBlock::QueryResult result = this->kdb->execute(*request.query);
        TripKey *keys = (TripKey*) fBin.data();
        //			printf("No. of blocks %zu \n", result.blocks->size());

        uint64_t noBlocks = result.blocks->size();


        int noThreads = omp_get_max_threads();
        std::vector<ResultVec> res(noThreads);


#pragma omp parallel for
        for (size_t i = 0; i < noBlocks; i++) {
            uint32_t count = result.blocks->at(i).first;
            uint64_t offset = result.blocks->at(i).second;
            for (uint32_t j = 0; j < count; j++) {
                uint64_t pos = (offset + j) * EXTRA_BLOCKS_PER_LEAF;
                TripKey * curKey = keys + pos;
                uint64_t index = * (curKey + noKeys);
                bool match = true;
                for(int k = 0;k < noKeys;k ++) {
                    if(!request.query->isMatched(curKey,k)) {
                        match = false;
                        break;
                    }
                }
                if(match) {
                    for(int k = 0;k < gsize;k ++) {
                        double x = uint2double(curKey[k * 2]);
                        double y = uint2double(curKey[k * 2 + 1]);
                        if(!Neighborhoods::isInside(request.regions[k].size(),&request.regions[k][0].first,x,y)) {
                            match = false;
                            break;
                        }
                    }
                }
                if(match) {
                    res[omp_get_thread_num()].push_back(index);
                }
            }
        }
        for (size_t i = 0; i < noThreads; i++) {
            request.result->insert(request.result->end(),res[i].begin(),res[i].end());
        }
    }
        break;

    default:
        fprintf(stderr, "Unhandled request type %d\n", request.state);
        break;
    }
}
Exemple #25
0
/**
 * Consistency test for BfmLittleDiracOperator on a random ("hot") gauge
 * field with a one-vector Gaussian subspace.
 *
 * The test prints, through QDPIO::cout, a series of diagnostics:
 *   - the squared-norm difference after a project/promote round trip,
 *   - phi^dag Ddag D phi computed directly and through the little operator,
 *   - the difference between P Mdag M P and the little operator,
 *   - the difference after applying the little operator and its inverse.
 * On the primary node the projection coefficients are also written to
 * "coeff.dat".
 *
 * @param solver  must be DWF; any other value prints a message and exits
 *                the process with status 0.
 */
void test_solver(BfmSolver solver)
{

  g5dParams parms;

  int Ls=16;
  double M5=1.8;
  double mq=0.0001;

  if ( solver != DWF ) { 
    // Emit the diagnostic before leaving; it used to sit after exit() and
    // could never be printed.
    Printf("Should be testing HtCayleyTanh aka DWF\n");
    exit(0);
  }
  parms.pDWF(mq,M5,Ls);

  multi1d<LatticeColorMatrix> u(4);
  HotSt(u);
  //  ArchivGauge_t Header ; readArchiv(Header,u,"ckpoint_lat.3000");  

  multi1d<LatticeFermion> src(Ls);

  // Solver wrapper parameters; only MaxIter, RsdTarget and BAP are read
  // back below when the bfm argument structure is filled in.
  BfmWrapperParams BWP;
  BWP.BfmInverter = BfmInv_CG; 
  BWP.BfmMatrix   = BfmMat_M;
  BWP.BfmPrecision= Bfm64bit;
  BWP.MaxIter     = 10000;
  BWP.RsdTarget.resize(1);
  BWP.RsdTarget[0]= 1.0e-9;
  BWP.Delta = 1.0e-4;
  BWP.BAP = parms;
  BfmWrapper bfm(BWP);

    bfmarg bfma;
#if defined(QDP_USE_OMP_THREADS)
    bfma.Threads(omp_get_max_threads());
#else
    bfma.Threads(16);
#endif
    bfma.Verbose(0);

    //Physics parameters
    bfmActionParams *bfmap = (bfmActionParams *) &bfma;
    *bfmap = bfm.invParam.BAP;
    
    // Algorithm & code control
    bfma.time_report_iter=-100;
    bfma.max_iter     = bfm.invParam.MaxIter;
    bfma.residual     = toDouble(bfm.invParam.RsdTarget[0]);

  int lx = QDP::Layout::subgridLattSize()[0];
  int ly = QDP::Layout::subgridLattSize()[1];
  int lz = QDP::Layout::subgridLattSize()[2];
  int lt = QDP::Layout::subgridLattSize()[3];
    //Geometry
    bfma.node_latt[0] = lx;
    bfma.node_latt[1] = ly;
    bfma.node_latt[2] = lz;
    bfma.node_latt[3] = lt;
    
    // A direction is flagged as local when only one node spans it.
    multi1d<int> procs = QDP::Layout::logicalSize();
    for(int mu=0;mu<4;mu++){
      if (procs[mu]>1) bfma.local_comm[mu] = 0;
      else             bfma.local_comm[mu] = 1;
    }
    
    // Bfm object
    bfm_qdp<double> bfm_eig; 
    bfm_eig.init(bfma);

    //Gauge field import
    bfm_eig.importGauge(u);

    //Subspace
#define NumberGaussian (1)
  Fermion_t subspace[NumberGaussian];
  Fermion_t check;
  Fermion_t mp;
  Fermion_t mmp;
  Fermion_t tmp_t;
  check = bfm_eig.allocFermion();
     mp = bfm_eig.allocFermion();
    mmp = bfm_eig.allocFermion();
  tmp_t = bfm_eig.allocFermion();
  bfm_eig.importFermion(src,check,1);

  QDPIO::cout << "Ls = "<<Ls<<endl;
  for(int g=0;g<NumberGaussian;g++){
    for(int s=0;s<Ls;s++){
      gaussian(src[s]);
    }
    subspace[g]=bfm_eig.allocFermion();
    bfm_eig.importFermion(src,subspace[g],1); // Half parity gaussian
    if ( g==0) {
      bfm_eig.importFermion(src,check,1);
    }
    // Clear src and export the subspace vector back into it so that the
    // printed norm is that of the imported vector.
    for(int s=0;s<Ls;s++){
      src[s]=zero;
    }
    bfm_eig.exportFermion(src,subspace[g],1);
    QDPIO::cout << "Subspace norm " << norm2(src)<<endl;
  }
  for(int s=0;s<Ls;s++){
    gaussian(src[s]);
  }
  QDPIO::cout << "Got here " << endl;

  //  Handle< LinearOperatorArray<T> > linop =GetLinOp(u, parms);
  int block[5];
  for(int i=0;i<5;i++) block[i]=4;

  QDPIO::cout << "Initialised dirac op"<<endl;
  BfmLittleDiracOperator ldop(Ls,NumberGaussian,block,subspace,&bfm_eig);

  int ns = ldop.SubspaceDimension();
  QDPIO::cout << "subspace dimension is "<< ns<<endl;
  ns = ldop.SubspaceLocalDimension();
  QDPIO::cout << "subspace dimension per node is "<< ns<<endl;

  std::vector<std::complex<double> > decomp(ns);
  ldop.ProjectToSubspace(check,decomp);
  if (QMP_is_primary_node()){
    FILE * fp = fopen("coeff.dat","w");
    if ( fp != NULL ) {
      for(int s=0;s<ns;s++){
        fprintf(fp,"coeff %d %le %le\n",s,real(decomp[s]),imag(decomp[s]));
      }
      fclose(fp);
    } else {
      // The coefficient dump is only a convenience; carry on without it
      // instead of writing through a null stream.
      QDPIO::cout << "Could not open coeff.dat for writing"<<endl;
    }
  }
  for(int s=0;s<ns;s++){
    QDPIO::cout << "coeff "<<s<<" " << real(decomp[s]) << " " << imag(decomp[s])<<endl;
  }
  ldop.PromoteFromSubspace(decomp,mp);

  // NOTE(review): in the parallel regions below every thread stores into
  // the same shared scalar (n, n2, inn). This presumably relies on the bfm
  // routines returning the same value on every thread -- confirm.
  double n;
#pragma omp parallel 
  {
    omp_set_num_threads(bfm_eig.nthread);
#pragma omp for 
    for(int t=0;t<bfm_eig.nthread;t++) {
      bfm_eig.axpy(check,mp,check,-1);
      n = bfm_eig.norm(check);
    }
  }
  QDPIO::cout << "project/promote n2diff "<< n<<endl;
  QMP_barrier();

QDPIO::cout << "Computing little dirac matrix"<<endl;
  ldop.ComputeLittleMatrixColored();

  QDPIO::cout << "Done"<<endl;

  std::vector<std::complex<double> > Aphi(ns);
  //        phi^dag DdagD phi = |Dphi|^2 with phi a subspace vector
  //        should be equal to Project/Apply/Promote + inner product

#pragma omp parallel 
  {
#pragma omp for 
    for(int t=0;t<bfm_eig.nthread;t++) {
      bfm_eig.Mprec(subspace[0],mp,tmp_t,0);
    }
  }

  QDPIO::cout << "Applied BFM matrix "<<endl;

  double n2;
#pragma omp parallel 
  {
    omp_set_num_threads(bfm_eig.nthread);
#pragma omp for 
    for(int t=0;t<bfm_eig.nthread;t++) {
      n2 = bfm_eig.norm(mp);
    }
  }

  QDPIO::cout << "Applied BFM matrix "<<n2<<endl;

  ldop.ProjectToSubspace(subspace[0],decomp);
  QDPIO::cout << "Projected to subspace "<<endl;
  ldop.Apply(decomp,Aphi);
  QDPIO::cout << "Applied A "<<endl;
  ldop.PromoteFromSubspace(Aphi,check);
  QDPIO::cout << "Promoted "<<endl;

  complex<double> inn;
#pragma omp parallel 
  {
#pragma omp for 
    for(int t=0;t<bfm_eig.nthread;t++) {
      inn = bfm_eig.inner(subspace[0],check);
    }
  }

  QDPIO::cout << "phi^dag Ddag D phi check " << n2 << " " <<real(inn) << imag(inn) <<endl;

  std::vector<std::complex<double> > AinvAphi(ns);
  ldop.ProjectToSubspace(subspace[0],decomp);
  ldop.Apply(decomp,Aphi);
  for(int s=0;s<ns;s++){
    QDPIO::cout << "Aphi "<<s<<" " << real(Aphi[s]) <<" " << imag(Aphi[s])<<endl;
  }
  ldop.PromoteFromSubspace(Aphi,check);

  // Apply Mprec twice (flags 0 then 1) to the subspace vector, project the
  // result and compare with the little-operator result held in check.
#pragma omp parallel 
  {
#pragma omp for 
    for(int t=0;t<bfm_eig.nthread;t++) {
      bfm_eig.Mprec(subspace[0],mp,tmp_t,0);
      bfm_eig.Mprec(mp,mmp,tmp_t,1);
    }
  }
  ldop.ProjectToSubspace(mmp,decomp);
  ldop.PromoteFromSubspace(decomp,mmp);
#pragma omp parallel 
  {
#pragma omp for 
    for(int t=0;t<bfm_eig.nthread;t++) {
      bfm_eig.axpy(check,mmp,check,-1.0);
      n2 = bfm_eig.norm(check);
    }
  }
  QDPIO::cout << "PMdagMP check n2diff "<< n2<<endl;


  QMP_barrier();
  QDPIO::cout << "Applying inverse"<<endl;
  ldop.ApplyInverse(Aphi,AinvAphi);
  QMP_barrier();
  for(int s=0;s<ns;s++){
    QDPIO::cout << "AinvAphi "<<s<<" " << real(AinvAphi[s]) << " " << imag(AinvAphi[s])<<endl;
  }
  ldop.PromoteFromSubspace(AinvAphi,check);

#pragma omp parallel 
  {
#pragma omp for 
    for(int t=0;t<bfm_eig.nthread;t++) {
      bfm_eig.axpy(check,subspace[0],check,-1.0);
      n2 = bfm_eig.norm(check);
    }
  }
  QDPIO::cout << "AinvA check n2diff "<< n2<<endl;
  

}
Exemple #26
0
int main ( int argc, char *argv[] )
{
  
  omp_set_num_threads(omp_get_max_threads());
  
  //Control number of input parameter
  if(argc<3)
  {
    printf("ERROR MISSING DIR PATH IN/OUT \n");
    return 1;
  }
  
  //MPI vars
  int error = 0; // mi restituisce gli errori mpi
  int nproc = 0; // numero processori totali 
  int myid  = 0; // id singolo processore
  
  //init MPI
  error = MPI_Init(&argc, &argv);
  
  //init MPI Comm
  error = MPI_Comm_size(MPI_COMM_WORLD, &nproc);
  error = MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  
  //check directory
  char * dirIn;
  char * dirOut;
  char * istant;
  dirIn = (char *) malloc(500*sizeof(char ));
  dirOut = (char *) malloc(500*sizeof(char ));
  istant = (char *) malloc(100*sizeof(char ));
  strcpy(dirIn,argv[1]);
  strcpy(dirOut,argv[2]);
  strcat(dirIn,"/");
  strcat(dirOut,"/");
  
  
  //printf("I'm %d of %d\n",myid,nproc);
  
  // read number of json file in input dir
  int numFile = 0;
  numFile = readDirectoryNum(dirIn);
  
  //create file list structure
  char ** list;
  list = (char **) malloc(numFile*sizeof(char*));
  for(int i=0;i<numFile;i++)
    list[i] = (char *) malloc(200*sizeof(char));
  
  // read list file in input directory
  readDirectory(dirIn,list,numFile);
  
  if(myid == 0)
  {
    printf("\n");
    printf("GILLESPIE HT v 1.0 \n");
    printf("Gillespie algo high throughput software\n");
    printf("https://github.com/EricPascolo/GillespieHT\n");
    printf("Created by Eric Pascolo (set 2014)\n");
    printf("\n");
    
    if(nproc>1)
      printf("\tParallel Run with %d slave\n",nproc-1);
    else
      printf("\tSerial Run\n");
    
    printf("\tThreads/Task : %d \n",omp_get_max_threads());
    printf("\tInput directory : %s \n",dirIn);
    printf("\tOutput directory : %s \n",dirOut);
    printf("\tNumber of file: %d \n",numFile);
    printf("\nBEGIN Simulation at %s\n",getTime(istant));
    printf("\n");
    printf("\n");
    printf("\tLIST FILE\n");
    printf("\t---------\n");
    for(int i=0;i<numFile;i++)
      printf("\t%5d %20s\n",i,list[i]);
    printf("\t---------\n\n");
  }
  
  MPI_Barrier(MPI_COMM_WORLD);
  
  if(myid == 0)
  {
    Master(nproc,dirIn,dirOut,list,numFile);
  }
  else
  {
    Slave(myid,nproc,dirIn,dirOut,list,numFile);
  }
  
  MPI_Barrier(MPI_COMM_WORLD);
  
  if(myid == 0)
  {
    printf("\nEND Simulation at %s\n",getTime(istant));
  }
  
  error = MPI_Finalize();
  return 0;
  
  
}
Exemple #27
0
// Threads each sequence through the pre-graph and creates preArcs according
// to the road map annotations.
//
// For every sequence 1..sequenceCount_pg(preGraph) the loop interleaves two
// sources of pre-nodes: the annotations of that sequence's road map and the
// "internal" pre-nodes returned by chooseNextInternalPreNode(), connecting
// each in turn. When rdmaps->referenceCount > 0, pre-marker space is
// allocated before the loop and the pre-markers are created after it.
//
// rdmaps   - road maps and their annotations
// preGraph - pre-graph being connected
// chains   - per-sequence array, indexed by sequence ID, forwarded to
//            chooseNextInternalPreNode()
static void connectPreNodes(RoadMapArray * rdmaps, PreGraph * preGraph,
			    IDnum * chains)
{
	IDnum sequenceIndex;
	IDnum referenceCount = rdmaps->referenceCount;
#ifdef _OPENMP
	// OpenMP build: iterations cannot share a running annotation cursor, so
	// build a prefix sum of annotation counts; entry i is the index of the
	// first annotation of sequence i + 1. annotationOffset is declared
	// outside this function.
	annotationOffset = mallocOrExit(rdmaps->length + 1, Coordinate);
	annotationOffset[0] = 0;
	for (sequenceIndex = 1; sequenceIndex <= rdmaps->length; sequenceIndex++)
		annotationOffset[sequenceIndex] = annotationOffset[sequenceIndex - 1] +
						  getAnnotationCount(getRoadMapInArray(rdmaps, sequenceIndex - 1));
#else
	// Serial build: a single cursor that keeps advancing from one sequence
	// to the next across loop iterations.
	Annotation *annot = rdmaps->annotations;
#endif

	if (rdmaps->referenceCount > 0) 
		allocatePreMarkerCountSpace_pg(preGraph);

#ifdef _OPENMP
	// Use at most 8 threads for this loop.
	int threads = omp_get_max_threads();
	if (threads > 8)
		threads = 8;

	#pragma omp parallel for num_threads(threads)
#endif
	for (sequenceIndex = 1;
	     sequenceIndex <= sequenceCount_pg(preGraph);
	     sequenceIndex++) {
#ifdef _OPENMP
		// Per-iteration cursor starting at this sequence's first annotation.
		Annotation *annot = getAnnotationInArray(rdmaps->annotations, annotationOffset[sequenceIndex - 1]);
#endif
		RoadMap *rdmap;
		Coordinate currentPosition, currentInternalPosition;
		IDnum currentPreNodeID, nextInternalPreNodeID;
		IDnum annotIndex, lastAnnotIndex;
		boolean isReference;

		// Progress message every million sequences.
		if (sequenceIndex % 1000000 == 0)
			velvetLog("Connecting %li / %li\n", (long) sequenceIndex,
			       (long) sequenceCount_pg(preGraph));

		rdmap = getRoadMapInArray(rdmaps, sequenceIndex - 1);
		annotIndex = 0;
		lastAnnotIndex = getAnnotationCount(rdmap);
		nextInternalPreNodeID = chooseNextInternalPreNode
		    (chains[sequenceIndex] - 1, sequenceIndex,
		     preGraph, chains);
		// The first referenceCount sequences are treated as references.
		isReference = (sequenceIndex <= referenceCount);

		currentPosition = 0;
		currentInternalPosition = 0;
		currentPreNodeID = 0;
		// Loop until both the annotations and the internal pre-nodes of
		// this sequence are exhausted
		while (annotIndex < lastAnnotIndex
		       || nextInternalPreNodeID != 0) {
			// Take the internal pre-node when no annotation is left, or
			// when the internal position is still before the position of
			// the current annotation; otherwise take the annotation.
			if (annotIndex == lastAnnotIndex
			    || (nextInternalPreNodeID != 0
				&& currentInternalPosition <
				getPosition(annot))) {
				connectPreNodeToTheNext(&currentPreNodeID,
							nextInternalPreNodeID,
							&currentPosition,
							sequenceIndex,
							isReference,
							preGraph);
				nextInternalPreNodeID =
				    chooseNextInternalPreNode
				    (currentPreNodeID, sequenceIndex,
				     preGraph, chains);
				currentInternalPosition +=
				    getPreNodeLength_pg(currentPreNodeID,
							preGraph);

			} else {
				connectAnnotation(&currentPreNodeID, annot,
						  &currentPosition,
						  sequenceIndex, isReference,
						  preGraph);
				annot = getNextAnnotation(annot);
				annotIndex++;
			}
		}
	}

	if (rdmaps->referenceCount > 0) {
		allocatePreMarkerSpace_pg(preGraph);
		createPreMarkers(rdmaps, preGraph, chains);	
	}

#ifdef _OPENMP
	// The offset table is only needed while connecting; release it and
	// reset the pointer.
	free(annotationOffset);
	annotationOffset = NULL;
#endif
}
/*
 * Driver: read a kernel and a Levy-basis realization from HDF5 files,
 * convolve the three components in place, and write the result to a new
 * HDF5 file.
 *
 * The kernel file must provide "/abscissa" and "/ordinate"; the Levy-basis
 * file must provide a rank-4 dataset "/levy_basis_realization" whose first
 * three extents are equal. The fourth index selects the component (0..2).
 * The output dataset "/simulation" has the same extents as the input.
 *
 * Returns 0 on success and a non-zero value on failure. Every exit path
 * goes through the cleanup label, which closes the HDF5 handles that were
 * opened and frees the buffers.
 */
int main(int argc, char *argv[]) {
    herr_t err = 0;
    
    int n_threads = omp_get_max_threads();

    hid_t kernel_file_id = 0;
    hid_t levy_basis_file_id = 0;
    hid_t levy_basis_dataset_id = 0;
    hid_t levy_basis_dataspace_id = 0;
    hid_t output_file_id      = 0;
    hid_t output_dataset_id   = 0;
    hid_t output_dataspace_id = 0;
    hid_t memspace = 0;

    hsize_t n_k = 0;
    double *k_abscissa = NULL;
    double *k_ordinate = NULL;
    double *x1 = NULL;
    double *x2 = NULL;
    double *x3 = NULL;
    
    DEBUGPRINT("### Parsing arguments");
    struct arguments args;
    initialise_arguments(&args);
    argp_parse (&argp, argc, argv, 0, 0, &args);
#ifdef DEBUG
    print_arguments(&args);
#endif
    
    DEBUGPRINT("### Reading kernel");
    kernel_file_id = H5Fopen(args.kernel_file, H5F_ACC_RDONLY, H5P_DEFAULT);
    if (kernel_file_id <= 0) {
        printf("Error: Could not open \"%s\".\n", args.kernel_file);
        err = -1;
        goto cleanup;
    }
    
    err = H5LTget_dataset_info(kernel_file_id, "/abscissa", &n_k, NULL, NULL);
    if (err < 0) {
        printf("Error: Could not read dataset info.\n");
        goto cleanup;
    }
    printf("n_k = %i\n", (int)n_k);
    k_abscissa = malloc(n_k * sizeof *k_abscissa);
    k_ordinate = malloc(n_k * sizeof *k_ordinate);
    /* malloc(0) may legitimately return NULL, so only treat NULL as a
       failure when something was actually requested. */
    if (n_k > 0 && (!k_abscissa || !k_ordinate)) {
        printf("Error: Could not allocate memory for the kernel.\n");
        err = -1;
        goto cleanup;
    }
    err = H5LTread_dataset_double(kernel_file_id, "/abscissa", k_abscissa);
    if (err >= 0) {
        err = H5LTread_dataset_double(kernel_file_id, "/ordinate", k_ordinate);
    }
    if (err < 0) {
        printf("Error: Could not read the kernel datasets.\n");
        goto cleanup;
    }
    
    DEBUGPRINT("### Reading Levy basis");
    hsize_t dims[4];
    hsize_t offset[4];
    hsize_t count[4];
    levy_basis_file_id      = H5Fopen(args.levy_basis_file, H5F_ACC_RDONLY, H5P_DEFAULT);
    if (levy_basis_file_id < 0) {
        printf("Error: Could not open \"%s\".\n", args.levy_basis_file);
        err = -1;
        goto cleanup;
    }
    levy_basis_dataset_id   = H5Dopen(levy_basis_file_id, "/levy_basis_realization", H5P_DEFAULT);
    if (levy_basis_dataset_id < 0) {
        printf("Error: Could not open the Levy basis dataset.\n");
        err = -1;
        goto cleanup;
    }
    levy_basis_dataspace_id = H5Dget_space(levy_basis_dataset_id);
    if (levy_basis_dataspace_id < 0) {
        printf("Error: Could not get the Levy basis dataspace.\n");
        err = -1;
        goto cleanup;
    }
    /* dims[] has room for exactly four extents; reject any other rank
       before H5Sget_simple_extent_dims writes into it. */
    if (H5Sget_simple_extent_ndims(levy_basis_dataspace_id) != 4) {
        printf("Error: The Levy basis dataset must have rank 4.\n");
        err = -1;
        goto cleanup;
    }
    err = H5Sget_simple_extent_dims(levy_basis_dataspace_id, dims, NULL);
    if (err < 0) {
        printf("Error: Could not read the Levy basis dimensions.\n");
        goto cleanup;
    }
    
    if (dims[0] != dims[1] || dims[1] != dims[2]) {
        printf("Error: The three dimensions must be equal.\n");
        err = -1;
        goto cleanup;
    }

    /* The in-memory arrays pad the last dimension to 2*(n/2+1) doubles. */
    hsize_t dims_pad[3];
    dims_pad[0] = dims[0];
    dims_pad[1] = dims[1];
    dims_pad[2] = 2 * (dims[2] / 2 + 1);

    hsize_t n_x = dims_pad[0] * dims_pad[1] * dims_pad[2];
    x1 = malloc(n_x * sizeof *x1);
    x2 = malloc(n_x * sizeof *x2);
    x3 = malloc(n_x * sizeof *x3);

    double *x[] = {x1, x2, x3};
    if (!x1 || !x2 || !x3) {
        printf("Error: Could not allocate memory for the Levy basis.\n");
        err = -1;
        goto cleanup;
    }

    /* Zero everything, including the padding that the reads below do not
       touch. The bound is cast so both sides of the comparison are signed. */
#pragma omp parallel for 
    for (ptrdiff_t i = 0; i < (ptrdiff_t)n_x; i++) {
        x1[i] = 0.0;
        x2[i] = 0.0;
        x3[i] = 0.0;
    }    

    /* Define memory dataspace */
    memspace = H5Screate_simple(3, dims_pad, NULL);
    if (memspace < 0) {
        printf("Error: Could not create the memory dataspace.\n");
        err = -1;
        goto cleanup;
    }
    offset[0] = offset[1] = offset[2] = 0;
    count[0] = dims[0];
    count[1] = dims[1];
    count[2] = dims[2];
    /* Define hyperslab in the memory dataspace */
    err = H5Sselect_hyperslab(memspace, H5S_SELECT_SET, offset, NULL, count, NULL);
    if (err < 0) {
        printf("Error: Could not select the memory hyperslab.\n");
        goto cleanup;
    }
    
    for (int j = 0; j < 3; j++) {
        /* Define hyperslab in the file dataspace: component j */
        offset[3] = j;
        count[3] = 1;
        err = H5Sselect_hyperslab(levy_basis_dataspace_id, H5S_SELECT_SET, offset, NULL, count, NULL);
        if (err < 0) {
            printf("Error: Could not read hyperslab.\n");
            err = -1;
            goto cleanup;
        }
        /* Read data from hyperslab */
        err = H5Dread(levy_basis_dataset_id, H5T_NATIVE_DOUBLE, 
                      memspace, levy_basis_dataspace_id,
                      H5P_DEFAULT, x[j]);
        if (err < 0) {
            printf("Error: Could not read hyperslab.\n");
            err = -1;
            goto cleanup;
        }
    }
    

    DEBUGPRINT("### Convolving");
    double delta = 2.0 * M_PI / dims[0];
    err = ambit_symmetric_odd_isotropic_circular_convolution_inplace(
        n_threads, n_k, k_abscissa, k_ordinate, dims[0], delta, x1, x2, x3);
    if (err) {
        printf("Error in ambit_symmetric_odd_isotropic_circular_convolution_inplace.\n");
        goto cleanup;
    }

    DEBUGPRINT("### Writing output");
    output_file_id = H5Fcreate(args.output_file, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
    if (output_file_id < 0) {
        printf("Error: Could not open \"%s\".\n", args.output_file);
        err = -1;
        goto cleanup;
    }
    
    output_dataspace_id = H5Screate_simple(4, dims, NULL);
    if (output_dataspace_id < 0) {
        printf("Error: Could not create the output dataspace.\n");
        err = -1;
        goto cleanup;
    }
    output_dataset_id   = H5Dcreate(output_file_id, "/simulation", H5T_NATIVE_DOUBLE, output_dataspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
    if (output_dataset_id < 0) {
        printf("Error: Could not create the output dataset.\n");
        err = -1;
        goto cleanup;
    }
    for (int j = 0; j < 3; j++) {
        printf("j = %i\n", j);
        /* Define hyperslab in the file dataspace: component j */
        offset[3] = j;
        count[3] = 1;
        err = H5Sselect_hyperslab(output_dataspace_id, H5S_SELECT_SET, offset, NULL, count, NULL);
        if (err < 0) {
            printf("Error: Could not write hyperslab.\n");
            err = -1;
            goto cleanup;
        }
        /* Write data to hyperslab */
        err = H5Dwrite(output_dataset_id, H5T_NATIVE_DOUBLE, 
                       memspace, output_dataspace_id,
                       H5P_DEFAULT, x[j]);
        if (err < 0) {
            printf("Error: Could not write hyperslab.\n");
            err = -1;
            goto cleanup;
        }
    }
    
  cleanup:
    /* Handles that were never opened are 0 and failed opens are negative;
       both are skipped by the > 0 tests. */
    if (memspace > 0) H5Sclose(memspace);
    if (output_dataspace_id > 0) H5Sclose(output_dataspace_id);
    if (output_dataset_id > 0) H5Dclose(output_dataset_id);
    if (output_file_id > 0) H5Fclose(output_file_id);
    if (levy_basis_dataspace_id > 0) H5Sclose(levy_basis_dataspace_id);
    if (levy_basis_dataset_id > 0) H5Dclose(levy_basis_dataset_id);
    if (levy_basis_file_id > 0) H5Fclose(levy_basis_file_id);
    if (kernel_file_id > 0) H5Fclose(kernel_file_id);
    free(k_abscissa);
    free(k_ordinate);
    free(x1);
    free(x2);
    free(x3);    
    return err;
}
Exemple #29
0
// Accumulate the gradient and the objective value over every sequence of a
// data set, then add the L2 regularisation term.
//
//   vecGradient  output; resized (or zeroed) to the number of features and
//                filled with the negated, regularised gradient sum
//   m            model supplying the weights and regularisation settings
//   X            data set whose sequences are processed
//
// Returns the accumulated objective value including the regularisation term.
double Gradient::computeGradient(dVector& vecGradient, Model* m, DataSet* X)
{
  double ans = 0.0;

#ifdef _OPENMP
  // Fall back to the OpenMP maximum when no thread count has been set, and
  // propagate the value to the helper objects.
  if( nbThreadsMP < 1 )
  {
    nbThreadsMP = omp_get_max_threads();
  }
  setMaxNumberThreads(nbThreadsMP);
  pInfEngine->setMaxNumberThreads(nbThreadsMP);
  pFeatureGen->setMaxNumberThreads(nbThreadsMP);
#endif

  // Make sure the output vector has one zero-valued slot per feature.
  int nbFeatures = pFeatureGen->getNumberOfFeatures();
  if(vecGradient.getLength() == nbFeatures)
  {
    vecGradient.set(0);
  }
  else
  {
    vecGradient.create(nbFeatures);
  }

  // ---- parallel region -------------------------------------------------
  // The loop index stays a signed int (required by OpenMP 2.5). The dynamic
  // schedule hands iterations out one at a time, which suits sequences
  // whose processing time varies.
#pragma omp parallel default(none) \
  shared(vecGradient, X, m, ans, nbFeatures, std::cout)
  {
    // Thread-local gradient accumulator.
    dVector g(nbFeatures, COLVECTOR, 0.0);

#pragma omp for schedule(dynamic) reduction(+:ans)
    for(int i=0; (int)i<X->size(); i++) {
      DataSequence* x = X->at(i);
      if( !m->isWeightSequence() || x->getWeightSequence() == 1.0 ) {
        // Unweighted sequence: accumulate straight into g.
        ans += computeGradient(g, m, x);
      }
      else {
        // Weighted sequence: compute into a scratch vector, scale both the
        // value and the gradient by the weight, then accumulate.
        dVector tmp(nbFeatures, COLVECTOR, 0.0);
        ans += computeGradient(tmp, m, x) * x->getWeightSequence();
        tmp.multiply(x->getWeightSequence());
        g.add(tmp);
      }
    }

    // Merge the thread-local accumulators one thread at a time.
#pragma omp critical (reduce_sum)
    {
      vecGradient.add(g);
    }
  }
  // ---- end of parallel region ------------------------------------------
  vecGradient.negate();

  // MaxMargin objective: min L = 0.5*\L2sigma*W*W + Loss()
  // MLE objective: min L = 0.5*1/(\L2sigma*\L2sigma)*W*W - log p(y|x)
  double scale;
  if( m->isMaxMargin() )
  {
    scale = m->getRegL2Sigma();
    // The max-margin value is averaged over the data set.
    ans = (1/(double)X->size()) * ans;
  }
  else
  {
    scale = 1/(double)(m->getRegL2Sigma()*m->getRegL2Sigma());
  }

  // Without regularisation there is nothing more to add.
  if(m->getRegL2Sigma()==0.0f)
  {
    return ans;
  }

  for(int f=0; f<nbFeatures; f++)
  {
    vecGradient[f] += (*m->getWeights())[f]*scale;
  }
  ans += 0.5*scale*m->getWeights()->l2Norm(false);

  return ans;
}
Exemple #30
0
/*
//  Update all running averages
//
//  For every level from top_level down to max_level_local() this
//  routine:
//    1. sums cell_var(cell,rt_field_offset+field), weighted by
//       cell_volume[level]/num_root_cells, over the local leaf cells
//       and stores the per-field sums via rtGlobalValueUpdate(rtAvgRF);
//    2. for each frequency recomputes the absorption coefficient with
//       rtComputeAbsLevel() and stores its maximum (rtMaxAC), its
//       volume-weighted sum (rtAvgAC) and the field-weighted sums
//       (rtAvgACxRF).
//  The per-level values are then combined across level_com with
//  rtGlobalValueCommunicate(), and frtAbcLoc/frtAbcUni/frtAbcAvg are
//  derived from the combined values.
//
//  When top_level == min_level the far field cell variables are also
//  divided by their global average.
//
//  Parameters:
//    top_level - first level to process
//    level_com - communicator handed to rtGlobalValueCommunicate() and
//                (with RT_SINGLE_SOURCE) to MPI_Allreduce()
*/
void rtGlobalUpdateTransfer(int top_level, MPI_Comm level_com)
{
  int iomp, i, freq, field;
  int level, cell, *level_cells, num_level_cells, bottom_level = max_level_local();
  float amin, amax;
  float *abc[2];
#ifdef _OPENMP
  int nomp = omp_get_max_threads();
#else
  int nomp = 1;
#endif
  /* One row of partial sums per thread; row 0 receives the final totals. */
  double s[nomp][rt_num_fields];
  double s1, sw[nomp][rt_num_fields_per_freq];

  start_time(WORK_TIMER);

  /*
  //  Compute per-level averages
  */
  for(level=top_level; level<=bottom_level; level++)
    {
      select_level(level,CELL_TYPE_LOCAL | CELL_TYPE_LEAF,&num_level_cells,&level_cells);
      if(num_level_cells == 0) continue;

      /*
      //  Because the reduction variable cannot be an array in C, doing
      //  reduction manually. Cannot re-arrange the loops because of the
      //  cache access pattern.
      */
      for(i=0; i<nomp; i++)
        {
          for(field=0; field<rt_num_fields; field++) s[i][field] = 0.0;
        }

#pragma omp parallel for default(none), private(i,field,cell,iomp), shared(num_level_cells,level_cells,level,cell_vars,cell_child_oct,nomp,s)
      for(i=0; i<num_level_cells; i++)
        {
          cell = level_cells[i]; // No need to check for leaves, we selected only them!

#ifdef _OPENMP
          iomp = omp_get_thread_num();
          cart_assert(iomp>=0 && iomp<nomp);
#else
          iomp = 0;
#endif

          for(field=0; field<rt_num_fields; field++)
            {
              s[iomp][field] += cell_var(cell,rt_field_offset+field)*cell_volume[level]/num_root_cells;
            }
        }

#ifdef _OPENMP
      /* Fold the per-thread partial sums into row 0. */
      for(i=1; i<nomp; i++)
        {
          for(field=0; field<rt_num_fields; field++) s[0][field] += s[i][field];
        }
#endif

      for(field=0; field<rt_num_fields; field++)
        {
          rtGlobalValueUpdate(&rtAvgRF[field],level,s[0][field]);
        }

      /*
      //  Now do absorption - since we need to recompute the abs. coefficient,
      //  loop over frequencies first
      */
      abc[0] = cart_alloc(float,num_level_cells);
#if (RT_CFI == 1)
      abc[1] = cart_alloc(float,num_level_cells);
#else
      abc[1] = abc[0];
#endif

      for(freq=0; freq<rt_num_freqs; freq++)
        {
          /*
          //  Average by weighting with the far field only
          */
          rtComputeAbsLevel(level,num_level_cells,level_cells,freq,abc);

          linear_array_maxmin(num_level_cells,abc[1],&amax,&amin);
          rtGlobalValueUpdate(&rtMaxAC[freq],level,amax);

          /*
          //  Because the reduction variable cannot be an array in C, doing
          //  reduction manually. Cannot re-arrange the loops because of the
          //  cache access pattern.
          */
          for(i=0; i<nomp; i++)
            {
              for(field=0; field<rt_num_fields_per_freq; field++) sw[i][field] = 0.0;
            }

          /*
          //  The OpenMP reduction adds the per-thread sums to the value s1
          //  already holds (and the serial build simply accumulates into it),
          //  so it must be reset for every (level,freq) pair.
          */
          s1 = 0.0;

#pragma omp parallel for default(none), private(cell,i,iomp,field), shared(num_level_cells,level_cells,abc,level,cell_vars,freq,nomp,sw,units,constants), reduction(+:s1)
          for(i=0; i<num_level_cells; i++)
            {
              float facLLS;
#ifdef RT_ADD_EXTERNAL_LLS
              float tauLLS;
#endif /* RT_ADD_EXTERNAL_LLS */

              cell = level_cells[i]; // No need to check for leaves, we selected only them!

#ifdef _OPENMP
              iomp = omp_get_thread_num();
              cart_assert(iomp>=0 && iomp<nomp);
#else
              iomp = 0;
#endif

#ifdef RT_ADD_EXTERNAL_LLS
              tauLLS = 6.3e-18*units->number_density*units->length*cell_HI_density(cell)*cell_sobolev_length2(cell,level,NULL);
              facLLS = exp(-tauLLS);
#else
              facLLS = 1.0;
#endif /* RT_ADD_EXTERNAL_LLS */

              /* Only the first rt_num_near_fields_per_freq fields are attenuated by facLLS. */
              for(field=0; field<rt_num_near_fields_per_freq; field++)
                {
                  sw[iomp][field] += facLLS*cell_var(cell,rt_field_offset+rt_num_freqs*field+freq)*abc[1][i]*cell_volume[level]/num_root_cells;
                }

              for(field=rt_num_near_fields_per_freq; field<rt_num_fields_per_freq; field++)
                {
                  sw[iomp][field] += cell_var(cell,rt_field_offset+rt_num_freqs*field+freq)*abc[1][i]*cell_volume[level]/num_root_cells;
                }

              s1 += abc[1][i]*cell_volume[level]/num_root_cells;
            }

#ifdef _OPENMP
          /* Fold the per-thread partial sums into row 0. */
          for(i=1; i<nomp; i++)
            {
              for(field=0; field<rt_num_fields_per_freq; field++)
                {
                  sw[0][field] += sw[i][field];
                }
            }
#endif

          rtGlobalValueUpdate(&rtAvgAC[freq],level,s1);
          for(field=0; field<rt_num_fields_per_freq; field++) rtGlobalValueUpdate(&rtAvgACxRF[rt_num_freqs*field+freq],level,sw[0][field]);
        }

      cart_free(abc[0]);
#if (RT_CFI == 1)
      /* abc[1] is a separate allocation only in this configuration. */
      cart_free(abc[1]);
#endif

      cart_free(level_cells);
    }

  end_time(WORK_TIMER);

  /*
  //  Combine the per-level values over level_com
  */
  for(field=0; field<rt_num_fields; field++)
    {
      rtGlobalValueCommunicate(&rtAvgRF[field],MPI_SUM,level_com);
      rtGlobalValueCommunicate(&rtAvgACxRF[field],MPI_SUM,level_com);
    }

  for(freq=0; freq<rt_num_freqs; freq++)
    {
      rtGlobalValueCommunicate(&rtMaxAC[freq],MPI_MAX,level_com);
      rtGlobalValueCommunicate(&rtAvgAC[freq],MPI_SUM,level_com);
    }

  start_time(WORK_TIMER);

  /*
  //  Weighted average
  */
  for(freq=0; freq<rt_num_freqs; freq++)
    {

      float wACxRF = 0.0;
      float wRF = 0.0;

      /* All fields of this frequency except the last one. */
      for(field=0; field<rt_num_fields_per_freq-1; field++)
        {
          wACxRF += rtAvgACxRF[rt_num_freqs*field+freq].Value;
          wRF += rtAvgRF[rt_num_freqs*field+freq].Value;
        }

      if(wRF > 1.0e-35)
        {
          frtAbcLoc[freq] = wACxRF/wRF;
        }
      else
        {
          frtAbcLoc[freq] = rtAvgAC[freq].Value;
        }

      /* The loop above leaves field at the last per-frequency field, used below. */
      cart_assert(field == rt_num_fields_per_freq-1);

      if(rtAvgRF[rt_num_freqs*field+freq].Value > 1.0e-35)
        {
          frtAbcUni[freq] = rtAvgACxRF[rt_num_freqs*field+freq].Value/rtAvgRF[rt_num_freqs*field+freq].Value;
        }
      else
        {
          frtAbcUni[freq] = rtAvgAC[freq].Value;
        }

      frtAbcAvg[freq] = rtAvgAC[freq].Value;
    }

  end_time(WORK_TIMER);


#ifdef RT_OUTPUT
  for(freq=0; freq<rt_num_freqs; freq++)
    {
      cart_debug("RT: Abc[%d] loc=%10.3e, uni=%10.3e, avg=%10.3le, max=%10.3le",freq,frtAbcLoc[freq],frtAbcUni[freq],rtAvgAC[freq].Value,rtMaxAC[freq].Value);
    }
  for(field=0; field<rt_num_fields; field++)
    {
      cart_debug("RT: field=%d: <rf>=%10.3e, <abc>=%10.3e",field,rtAvgRF[field].Value,(rtAvgRF[field].Value>0.0)?rtAvgACxRF[field].Value/rtAvgRF[field].Value:0.0);
    }
#endif /* RT_OUTPUT */

  /*
  //  Maintain the unit average of the far field - should be called
  //  by all run tasks only, to ensure the buffer consistency.
  */
  if(top_level == min_level)
    {
      for(level=top_level; level<=bottom_level; level++)
        {
          select_level(level,CELL_TYPE_ANY,&num_level_cells,&level_cells);

#pragma omp parallel for default(none), private(i,freq), shared(num_level_cells,level_cells,cell_vars,rtAvgRF)
          for(i=0; i<num_level_cells; i++)
            {
              for(freq=0; freq<rt_num_freqs; freq++) if(rtAvgRF[rt_far_freq_offset+freq].Value > 0.0)
                {
                  cell_var(level_cells[i],rt_far_field_offset+freq) /= rtAvgRF[rt_far_freq_offset+freq].Value;
                }
            }

          cart_free(level_cells);

          /*
          //  NOTE(review): i is the index of the cell loop above, which has
          //  already finished, so here it does not identify a level or a
          //  buffer slot. Presumably buffer[level] was intended - confirm
          //  against the buffer layout used by rtGlobalValueUpdate() before
          //  changing. Behavior is left as is.
          */
          for(freq=0; freq<rt_num_freqs; freq++) if(rtAvgRF[rt_far_freq_offset+freq].Value > 0.0)
            {
              rtAvgRF[rt_far_freq_offset+freq].buffer[i] /= rtAvgRF[rt_far_freq_offset+freq].Value;
              rtAvgACxRF[rt_far_freq_offset+freq].buffer[i] /= rtAvgRF[rt_far_freq_offset+freq].Value;
            }
        }
    }

#ifdef RT_SINGLE_SOURCE
  start_time(WORK_TIMER);
  cell = cell_find_position(rtSingleSourcePos);
  if(cell>-1 && cell_is_local(cell))
    {
      level = cell_level(cell);
    }
  else
    {
      /* Tasks that do not own the source cell contribute -1 to the MPI_MAX below. */
      level = -1;
    }
  end_time(WORK_TIMER);
 
  start_time(COMMUNICATION_TIMER);
  /*
  //  NG: I don't know why, but Bcast blocks here, hence using Allreduce
  */
  MPI_Allreduce(&level,&rtSingleSourceLevel,1,MPI_INT,MPI_MAX,level_com);
  end_time(COMMUNICATION_TIMER);
#endif /* RT_SINGLE_SOURCE */
}