Beispiel #1
0
void bspfft_test()
{
    void bspfft( double * x, int n, int p, int s, int sign, double * w0,
                 double * w, double * tw, int *rho_np, int *rho_p );
    void bspfft_init( int n, int p, int s, double * w0,
                      double * w, double * tw, int *rho_np, int *rho_p );
    int k1_init( int n, int p );

    int p, s, n, q, np, k1, j, jglob, it, *rho_np, *rho_p;
    double time0, time1, time2, ffttime, nflops,
           max_error, error_re, error_im, error,
           *Error, *x, *w0, *w, *tw;

    bsp_begin( P );
    p = bsp_nprocs();
    s = bsp_pid();

    bsp_push_reg( &n, SZINT );
    Error = vecallocd( p );
    bsp_push_reg( Error, p * SZDBL );
    bsp_sync();

    if ( s == 0 )
    {
        printf( "Please enter length n: \n" );

#ifdef _WIN32
        scanf_s( "%d", &n );
#else
        scanf( "%d", &n );
#endif

        if ( n < 2 * p )
        {
            bsp_abort( "Error in input: n < 2p" );
        }

        for ( q = 1; q < p; q++ )
        {
            bsp_put( q, &n, &n, 0, SZINT );
        }
    }

    bsp_sync();

    if ( s == 0 )
    {
        printf( "FFT of vector of length %d using %d processors\n", n, p );
        printf( "performing %d forward and %d backward transforms\n",
                NITERS, NITERS );
    }

    /* Allocate, register,  and initialize vectors */
    np = n / p;
    x = vecallocd( 2 * np );
    bsp_push_reg( x, 2 * np * SZDBL );
    k1 = k1_init( n, p );
    w0 = vecallocd( k1 );
    w =  vecallocd( np );
    tw = vecallocd( 2 * np + p );
    rho_np = vecalloci( np );
    rho_p =  vecalloci( p );

    for ( j = 0; j < np; j++ )
    {
        jglob = j * p + s;
        x[2 * j] = ( double )jglob;
        x[2 * j + 1] = 1.0;
    }

    bsp_sync();
    time0 = bsp_time();

    /* Initialize the weight and bit reversal tables */
    for ( it = 0; it < NITERS; it++ )
    {
        bspfft_init( n, p, s, w0, w, tw, rho_np, rho_p );
    }

    bsp_sync();
    time1 = bsp_time();

    /* Perform the FFTs */
    for ( it = 0; it < NITERS; it++ )
    {
        bspfft( x, n, p, s, 1, w0, w, tw, rho_np, rho_p );
        bspfft( x, n, p, s, -1, w0, w, tw, rho_np, rho_p );
    }

    bsp_sync();
    time2 = bsp_time();

    /* Compute the accuracy */
    max_error = 0.0;

    for ( j = 0; j < np; j++ )
    {
        jglob = j * p + s;
        error_re = fabs( x[2 * j] - ( double )jglob );
        error_im = fabs( x[2 * j + 1] - 1.0 );
        error = sqrt( error_re * error_re + error_im * error_im );

        if ( error > max_error )
        {
            max_error = error;
        }
    }

    bsp_put( 0, &max_error, Error, s * SZDBL, SZDBL );
    bsp_sync();

    if ( s == 0 )
    {
        max_error = 0.0;

        for ( q = 0; q < p; q++ )
        {
            if ( Error[q] > max_error )
            {
                max_error = Error[q];
            }
        }
    }

    for ( j = 0; j < NPRINT && j < np; j++ )
    {
        jglob = j * p + s;
        printf( "proc=%d j=%d Re= %f Im= %f \n", s, jglob, x[2 * j], x[2 * j + 1] );
    }

    fflush( stdout );
    bsp_sync();

    if ( s == 0 )
    {
        printf( "Time per initialization = %lf sec \n",
                ( time1 - time0 ) / NITERS );
        ffttime = ( time2 - time1 ) / ( 2.0 * NITERS );
        printf( "Time per FFT = %lf sec \n", ffttime );
        nflops = 5 * n * log( ( double )n ) / log( 2.0 ) + 2 * n;
        printf( "Computing rate in FFT = %lf Mflop/s \n",
                nflops / ( MEGA * ffttime ) );
        printf( "Absolute error= %e \n", max_error );
        printf( "Relative error= %e \n\n", max_error / n );
    }


    bsp_pop_reg( x );
    bsp_pop_reg( Error );
    bsp_pop_reg( &n );
    bsp_sync();

    vecfreei( rho_p );
    vecfreei( rho_np );
    vecfreed( tw );
    vecfreed( w );
    vecfreed( w0 );
    vecfreed( x );
    vecfreed( Error );
    bsp_end();

} /* end bspfft_test */
Beispiel #2
0
/*
 * Generates a random sparse N x N matrix in triplet form (row index, column
 * index, value) plus a random test vector, and writes both out through
 * outputMondriaanMatrix() and outputMathematicaMatrix().
 *
 * Usage: prog N [mu] [sparsity]
 *   N        matrix dimension (required, must be positive)
 *   mu       scalar handed to addDiagonal() (default 2.5)
 *   sparsity target nonzero density, must be > 0 (default 0.2)
 *
 * Steps:
 *   1. walk the matrix with random strides, emitting every diagonal cell hit
 *      and every visited cell with x < y;
 *   2. grow the arrays and let addDiagonal() supply the missing diagonals;
 *   3. grow the arrays again and let addTranspose() mirror the entries;
 *   4. run checkStrictDiagonallyDominant() and write the result.
 *
 * Exit codes: -1 usage error, -2 bad argument, 44 out of memory,
 * 666 generated more entries than the target count allows.
 */
int main (int argc, char** argv) {

    // aim for a nonzero density given by sparsity:
    sparsity = 0.2; // nz = sparsity*100% of the size of the matrix

    /*
     * we say 'aim' here, since of course initially exactly
     *    nz = sparsity * N^2
     * nonzeroes will be generated at random spots, but because
     * the matrix must be symmetric and diagonally positive, the
     * actual number of nonzeroes will probably not be exactly
     * the projected number.
     */

    // read the desired size of the matrix from command line
    if (argc < 2) {
        printf("Usage: %s N [mu] [sparsity]\n", argv[0]);
        exit(-1);
    }

    if(sscanf(argv[1], "%d", &N) != 1) {
        printf("couldn't read command-line argument for N. must be an integer.\n");
        exit(-2);
    }
    double mu;
    mu = 2.5; //default scalar for making matrix diagonal-dominant

    // maybe the user supplied a different mu
    if(argc > 2 && sscanf(argv[2], "%lf", &mu) != 1) {
        exit(-2);
    }
    // maybe the user supplied a different sparsity
    if(argc > 3 && sscanf(argv[3], "%lf", &sparsity) != 1) {
        exit(-2);
    }

    // a non-positive N gives negative allocation sizes; a non-positive
    // sparsity makes the stride computation below divide by zero.
    if(N <= 0 || !(sparsity > 0.0)) {
        printf("N must be positive and sparsity must be greater than 0.\n");
        exit(-2);
    }

    int nz = sparsity*N*N;
    int* xs;
    int* ys;
    double* vals;

    fprintf(stderr,"Generating matrix. N=%d, density=%lf, target nz=%d, ",
            N, sparsity, nz);
    fprintf(stderr, "mu = %lf\n", mu);

    // seed the random generator.
    srandom((unsigned)time(NULL));

    xs = vecalloci(nz);
    ys = vecalloci(nz);
    vals = vecallocd(nz);

    // diag_done[i] records whether diagonal entry i was generated below.
    bool* diag_done;
    diag_done = malloc(N * sizeof *diag_done);
    if(diag_done == NULL) {
        printf("out of memory!");
        exit(44);
    }

    int i;
    for(i = 0; i<N; i++) {
        diag_done[i] = false;
    }

    int nz_generated;

    int x,y;
    nz_generated = 0;
    double fake_transpose;
    x=0;y=0;
    while(x<N && y<N) { //don't escape matrix bounds.

        if (nz_generated % 1000000 == 0) {
            fprintf(stderr,"progress: %f%%\r", (double)nz_generated/(double)(nz/2.0)*100.0);
        }

        // diagonal cells are always generated; off-diagonal cells only
        // when x < y. everything else is skipped.
        if(x<=y) {

            // xs/ys/vals hold exactly nz entries, so index nz is already
            // out of bounds. this guards both kinds of entry.
            if(nz_generated >= nz) {
                // this should NEVER happen, although all that's
                // stopping it from happening is our ran() being
                // well-behaved...........
                printf("EEK! something went wrong!!\n");
                exit(666);
            }

            xs[nz_generated]= x;
            ys[nz_generated]= y;

            if(x==y) {
                vals[nz_generated]=ran()*2.0-1.0;
                diag_done[x] = true;

#ifdef DEBUG
                fprintf(stderr,"generated A[//][%d]=%lf\n"
                                                      , ys[nz_generated]
                                                      , vals[nz_generated]);
#endif
            } else {
                // simulate the distribution of values which
                // would occur if we do A+A^T afterwards.
                if (ran() < sparsity) {
                    fake_transpose    = ran()*2.0-1.0;
                    vals[nz_generated]= ran()*2.0-1.0 + fake_transpose;
                } else {
                    vals[nz_generated]= ran()*2.0-1.0;
                }
#ifdef DEBUG
                fprintf(stderr,"generated A[%d][%d]=%lf\n", xs[nz_generated]
                                                      , ys[nz_generated]
                                                      , vals[nz_generated]);
#endif
            }

            nz_generated++;
        }

        // advance by a random stride scaled by 1/sparsity, wrapping into
        // the following row(s) when x runs past the end.
        x += 1/sparsity * (ran() + 0.5);
        if( x >= N ) {
            y += x/N;
            x  = x%N;
        }

    }

    fprintf(stderr, "generated initial randoms\n");

    int diagonals_present = 0;
    for(i=0; i<nz_generated; i++) {
        if(xs[i]==ys[i])
            diagonals_present++;
    }

    fprintf(stderr,"generated %d nzeros, array was %d big.\n", nz_generated, nz);


#ifdef DEBUG
    fprintf(stderr, "found %d diagonal(s), still need %d more.\n", diagonals_present, (N-diagonals_present));
#endif

    // add the missing diagonals, and add mu to each diagonal.
    int newsize = nz_generated + (N - diagonals_present);
    fprintf(stderr,"reallocating values array to %lluM \n", (unsigned long long)(SZDBL+2*SZINT)*newsize/1048576);
    int* diag_i;
    int* diag_j;
    double* diag_val;

    // results go into fresh pointers so a failed realloc is detectable;
    // xs/ys/vals must not be used after this point.
    diag_i   = realloc(xs  ,SZINT*newsize);
    diag_j   = realloc(ys  ,SZINT*newsize);
    diag_val = realloc(vals,SZDBL*newsize);

    if(diag_i == NULL ||
            diag_j == NULL ||
            diag_val == NULL)
    {
        printf("out of memory!");
        exit(44);
    }

    addDiagonal(mu, diag_i, diag_j, diag_val, nz_generated, newsize, diag_done);
    nz_generated=newsize;
#ifdef DEBUG
    for(i=0;i<newsize;i++) {
        fprintf(stderr,"after addDiagonal A[%d][%d]=%lf\n", diag_i[i],diag_j[i], diag_val[i]);
    }

    fprintf(stderr, "Going to make symmetric now... (nz_generated = %d)\n", nz_generated);
#endif

    // now we explicitly fill the array with the
    // upper triangle values

    // things must be symmetric, but they aren't, yet
    // ... here's a good place to do the transposing thing.

    newsize = nz_generated * 2 - N; //number of real nonzeros, don't
                                    // count diagonals twice.
    int *new_i;
    int *new_j;
    double *new_v;

    new_i = realloc(diag_i  ,SZINT*newsize);
    new_j = realloc(diag_j  ,SZINT*newsize);
    new_v = realloc(diag_val,SZDBL*newsize);

    // all three results are checked (new_j used to be skipped).
    if(new_i == NULL ||
            new_j == NULL ||
            new_v == NULL)
    {
        printf("out of memory (2)!");
        exit(44);
    }
    diag_i = new_i;
    diag_j = new_j;
    diag_val = new_v;
    addTranspose(newsize,diag_i,diag_j,diag_val,
                              nz_generated);

#ifdef DEBUG
    for(i=0;i<newsize;i++)
        // to make diags stand out.
        if(diag_i[i]==diag_j[i])
            fprintf(stderr,"after transpose A[%d][%d]=%lf \\\\\n", diag_i[i],diag_j[i], diag_val[i]);
        else
            fprintf(stderr,"after transpose A[%d][%d]=%lf\n", diag_i[i],diag_j[i], diag_val[i]);
#endif

    checkStrictDiagonallyDominant(diag_i,diag_j,diag_val, newsize);

    // now quickly generate a test-vector to solve against:

    double *vec = vecallocd(N);
    for(i=0;i<N;i++)
        vec[i]=ran();

    fprintf(stderr,"Left with %d nonzeroes; nonzero density = %lf (desired=%lf)\n", newsize, newsize/((double)N*N), sparsity);
    fprintf(stderr,"========== OUTPUTTING ... ==========\n");

    outputMondriaanMatrix(newsize, diag_i, diag_j, diag_val, vec);
    outputMathematicaMatrix(newsize, diag_i, diag_j, diag_val, vec);

    free(diag_done);
    free(vec);
    free(diag_i);
    free(diag_j);
    free(diag_val);

    return 0;
}
Beispiel #3
0
/*
 * All-pairs shortest paths on an N x N matrix by repeated min-plus
 * "squaring", with the matrix distributed over the processors in contiguous
 * blocks of rows.
 *
 * Every processor obtains the full matrix from gen_graph(N, 0.05), keeps its
 * own block of lsize = nloc(p, s, N) rows in lrow, and frees the full copy.
 * In each squaring pass, for every global column lj, the processors fetch
 * that column from all row blocks into lcol and update
 *     linter[li][lj] = min(1000, min over lk of lrow[li][lk] + lcol[lk]).
 * The pass result replaces lrow and the path length bound j is doubled until
 * it reaches N-1. Processor 0 prints the elapsed time.
 *
 * The value 1000 is the starting value of every minimum; it appears to stand
 * for "no path" -- TODO confirm against gen_graph.
 */
void mainloop(){

    int i,j,v,t,g,p,s;
    int li,lj,lk,startrow,best,cand;
    int lsize,*lsize_m,*lrow,*lcol,*linter,*startrow_m;

    int* init = gen_graph(N, 0.05);

    bsp_begin(bsp_nprocs());
    p = bsp_nprocs();
    s = bsp_pid();

    /**********Initialization***************/

    /*******Comp. Superstep 0******/

    lsize = nloc(p,s,N);            //number of rows held by this processor
    lrow = vecalloci(lsize*N);      //local row block, row-major
    lcol = vecalloci(N);            //one full global column, refetched per lj
    startrow_m = vecalloci(p);      //first global row of every processor
    lsize_m = vecalloci(p);         //row count of every processor
    linter = vecalloci(lsize*N);    //result of the current squaring pass

    bsp_push_reg(startrow_m,p*SZINT);
    bsp_push_reg(lsize_m,p*SZINT);
    bsp_push_reg(lrow,lsize*N*SZINT);

    /*
     * First global row of this block. The column gather below concatenates
     * the row blocks in processor order, so the block of processor s must
     * start after the rows of processors 0..s-1. Using pid*lsize instead is
     * only right when every earlier processor has the same block size.
     * Assumes the nloc() sizes of all processors add up to N -- TODO confirm.
     */
    startrow = 0;
    for (g = 0; g < s; g++) {
        startrow += nloc(p,g,N);
    }

    //Distribute data according to the row block distribution
    for (li = 0; li < lsize; li++) {
        for (lj = 0; lj < N; lj++) {
            lrow[N*li+lj] = init[N*(startrow+li)+lj];
        }
    }
    vecfreei(init); //the full matrix is no longer needed

    for (i = 0; i < p; i++) {
        startrow_m[i] = 0;
        lsize_m[i] = 0;
    }

    bsp_sync();
    /*******End Comp. Superstep 0******/


    /*********Comm. Superstep 1********/
    //Tell every processor our starting row and row count.
    //(startrow_m is filled here but not read again in this function.)
    for (g = 0; g < p; g++) {
        bsp_put(g,&startrow,startrow_m,s*SZINT,SZINT);
        bsp_put(g,&lsize,lsize_m,s*SZINT,SZINT);
    }
    /*********End Comm. Superstep 1*****/
    bsp_sync();
    /**********End Initialization***************/

    double time0= bsp_time();
    /*********Repeated Squaring loop start*************/
    j=1;
    while ((N-1) > j) {

        /****Comp. Superstep j0****/
        for (i = 0; i < N*lsize; i++) {
            linter[i] = 1000;
        }
        for (i = 0; i < N; i++) {
            lcol[i] = 0;
        }
        bsp_sync();
        /****End Comp. Superstep j0****/

        for (lj = 0; lj < N; lj++) {
            /***Comm. SuperStep jlj0*******/
            //Fetch global column lj: entry lj of every row of every block,
            //stored in processor order so that lcol[t] is global row t.
            t=0;
            for (g = 0; g < p; g++) {
                for (v = 0; v < lsize_m[g]; v++) {
                    bsp_get(g,lrow,(lj+v*N)*SZINT,&lcol[t],SZINT);
                    t++;
                }
            }
            bsp_sync();
            /***End Comm. SuperStep jlj0***/

            /***Comp. SuperStep jlj1*******/
            //Min-plus product of each local row with column lj.
            for (li = 0; li < lsize; li++) {
                best = linter[N*li+lj];
                for (lk = 0; lk < N; lk++) {
                    cand = lrow[N*li+lk] + lcol[lk];
                    if (cand < best) {
                        best = cand;
                    }
                }
                linter[N*li+lj] = best;
            }
            bsp_sync();
            /***End Comp. SuperStep jlj1***/
        }

        /****Comp. Superstep j1****/
        //lrow may only change once all columns of this pass were fetched.
        memcpy(lrow,linter,N*lsize*SZINT);
        j=2*j;
        bsp_sync();
        /****End Comp. Superstep j1****/
    }
    /*********Repeated Squaring loop end*************/
    double time1= bsp_time();
    bsp_sync();

    /*********report the time*********/
    if (s == 0) {
        printf( " \n Block Row Distr (need to know basis) calculation of APSP took: %f seconds \n", time1-time0 );
    }

    //Clean up
    bsp_pop_reg(startrow_m);
    bsp_pop_reg(lsize_m);
    bsp_pop_reg(lrow);

    vecfreei(lrow);
    vecfreei(lcol);
    vecfreei(startrow_m);
    vecfreei(lsize_m);
    vecfreei(linter);

    bsp_end();
}
Beispiel #4
0
/* Sorts a[0..n-1] with compare_int and removes duplicates in place.
 * Returns the number of distinct values, which end up in a[0..result-1]. */
static int sort_unique_ints(int *a, int n)
{
    int in, out;

    if (n <= 0) {
        return 0;
    }
    qsort(a, n, sizeof *a, compare_int);
    out = 1;
    for (in = 1; in < n; in++) {
        if (a[in] != a[out-1]) {
            a[out++] = a[in];
        }
    }
    return out;
}

/*
 * All-pairs shortest paths on an N x N matrix by repeated min-plus
 * "squaring", with the matrix distributed cyclically by column: processor t
 * stores every column j with j % p == t, all N rows of it, row by row.
 * Global element (i, j) therefore lives on processor j % p at local index
 * i * (columns held by that processor) + j / p.
 *
 * In each pass every processor fetches the full global rows and columns that
 * its local elements need (ltrow, ltcol) and sets each local element to
 *     min(1000, min over k of row[k] + col[k]).
 * The path length bound j is doubled until it reaches N-1. Processor 0
 * prints the elapsed time.
 *
 * Only the 1D column-cyclic layout is used. The earlier 2D processor
 * numbering sized the buffers as nlr*nlc while the fill loop below stores
 * N*nlc elements, which overflowed them for square processor counts.
 *
 * Assumes nloc(p, t, N) equals the number of columns j in [0, N) with
 * j % p == t -- TODO confirm against nloc's definition.
 * The value 1000 is the starting value of every minimum; it appears to stand
 * for "no path" -- TODO confirm against gen_graph.
 */
void mainloop(){

    int nlc,p,t,g,i,j,k,li,lsize,tsize0,tsize1,owner,offset,rpos,cpos,best,cand;
    int *lpart,*linter,*gindx,*lcol,*lrow,*lsrow,*lscol,*ltrow,*ltcol,*lwidth;

    int* init = gen_graph(N, 0.05);

    bsp_begin(bsp_nprocs());

    /**********Initialization SuperStep 0***************/

    p = bsp_nprocs();
    t = bsp_pid();

    nlc = nloc(p,t,N);            //number of local columns
    lsize = N*nlc;                //all N rows of every local column

    lpart = vecalloci(lsize);     //local part of the matrix
    linter = vecalloci(lsize);    //result of the current squaring pass
    gindx = vecalloci(lsize);     //global linear index N*i+j of each local element
    lcol  = vecalloci(lsize);     //global column index of each local element
    lrow  = vecalloci(lsize);     //global row index of each local element
    lwidth = vecalloci(p);        //number of columns held by every processor
    bsp_push_reg(lpart,lsize*SZINT);

    for (g = 0; g < p; g++) {
        lwidth[g] = nloc(p,g,N);
    }

    //Distribute the data
    li=0;
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if ((j % p) == t) {
                lpart[li] = init[N*i+j];
                lrow[li] = i;
                lcol[li] = j;
                gindx[li] = N*i+j;
                li++;
            }
        }
    }
    vecfreei(init); //the full matrix is no longer needed

    //Sorted lists of the distinct global rows and columns we hold elements of
    tsize0 = sort_unique_ints(lrow, lsize);
    tsize1 = sort_unique_ints(lcol, lsize);

    lscol  = vecalloci(tsize1);
    lsrow  = vecalloci(tsize0);
    ltcol  = vecalloci(N*tsize1); //full contents of each needed column
    ltrow  = vecalloci(N*tsize0); //full contents of each needed row

    for (i = 0; i < tsize0; i++) {
        lsrow[i] = lrow[i];
    }
    for (i = 0; i < tsize1; i++) {
        lscol[i] = lcol[i];
    }

    vecfreei(lcol); //not needed from this point on
    vecfreei(lrow); //we use lscol, lsrow, ltrow, ltcol

    bsp_sync();
    /**********End Initialization SuperStep 0***************/

    double time0= bsp_time();
    /*********Repeated Squaring loop start*************/
    j=1;
    while ((N-1) > j) {

        /*************Comm. SuperStep j0*************/
        //Fetch every needed column: element (k, lscol[i]) for all rows k.
        for (i = 0; i < tsize1; i++) {
            owner = lscol[i] % p;
            for (k = 0; k < N; k++) {
                offset = k*lwidth[owner] + lscol[i]/p;
                bsp_get(owner,lpart,offset*SZINT,&ltcol[N*i+k],SZINT);
            }
        }

        //Fetch every needed row: element (lsrow[i], k) for all columns k.
        for (i = 0; i < tsize0; i++) {
            for (k = 0; k < N; k++) {
                owner = k % p;
                offset = lsrow[i]*lwidth[owner] + k/p;
                bsp_get(owner,lpart,offset*SZINT,&ltrow[N*i+k],SZINT);
            }
        }
        bsp_sync();
        /*************End Comm. SuperStep j0*************/

        /*************Comp. SuperStep j1*************/
        for (i = 0; i < lsize; i++) {

            int gcol = gindx[i] % N; //global column index of this element
            int grow = gindx[i]/N;   //global row index of this element

            //Locate the fetched row and column. The element size must be
            //sizeof(int), not the size of the pointer. Both lookups succeed
            //by construction: lsrow/lscol were built from these elements.
            int *rp = bsearch(&grow, lsrow, tsize0, sizeof *lsrow, compare_int);
            int *cp = bsearch(&gcol, lscol, tsize1, sizeof *lscol, compare_int);
            rpos = rp - lsrow;
            cpos = cp - lscol;

            //this is where the update is done
            best = 1000;
            for (k = 0; k < N; k++) {
                cand = ltrow[N*rpos + k] + ltcol[N*cpos + k];
                if (cand < best) {
                    best = cand;
                }
            }
            linter[i] = best;
        }

        memcpy(lpart,linter,lsize*SZINT);
        j = 2*j;
        bsp_sync();
        /*************End Comp. SuperStep j1*************/

    }
    /*********Repeated Squaring loop end*************/
    double time1= bsp_time();
    bsp_sync();

    /*********report the time*********/
    if (t == 0) {
        printf( " \n Block Cyclic Distr  calculation of APSP took: %f seconds \n", time1-time0 );
    }
    printf("\n ");

    //clean up
    bsp_pop_reg(lpart);
    vecfreei(lpart);
    vecfreei(linter);
    vecfreei(lscol);
    vecfreei(lsrow);
    vecfreei(ltcol);
    vecfreei(ltrow);
    vecfreei(gindx);
    vecfreei(lwidth);

    bsp_end();
}
Beispiel #5
0
/*
 * Parallel merge sort of the integers stored in the text file "sort".
 *
 * The file holds the element count n followed by the elements. Processor 0
 * reads the count and the first (n/p)*p elements, then hands n/p of them to
 * every processor. Each processor sorts its share with mergeSort(); the
 * shares are then combined in Log2(p) rounds. In round j every processor
 * s < p/2^j fetches the sorted run of processor s + p/2^j and merges it into
 * its own array with merge2(). Processor 0 reports the elapsed time.
 *
 * If the file cannot be opened or read, processor 0 prints a message to
 * stderr and broadcasts a negative n; all processors then leave the BSP
 * program together and the process exits with status 1. Exiting on
 * processor 0 alone would leave the others waiting in bsp_sync().
 */
void bspParSort(){

  int Log2(int x);
  void mergeSort(int x, int *temp1);
  void merge2(int *arr1, int *arr2, int size);

  int *localArr; /* local array in each processor */
  int *staging = NULL; /* processor 0 only: all elements read from the file */
  int *temp; /* receives the partner's run during merging */
  int i,j; /* index variables */
  int n_divide_p; /* Avoid multiple computation */
  int n; /* Number of elements to be sorted; negative means input failure */
  int szLocalArray; /* Size of local array */
  int rounds; /* Number of merge rounds */
  double time0, time1; /* Time */
  FILE *ifp = 0; /* Reader to read sequence of numbers to be sorted */

  bsp_begin(P);
  int p= bsp_nprocs(); /* Number of processors obtained */
  int s= bsp_pid();    /* Processor number */

  //Read and validate the whole input before anything is distributed
  if(s==0){
    int inputOk = 0;
    n = -1;
    ifp = fopen("sort","r");
    if(ifp == NULL){
      fprintf(stderr, "Can't open input file!\n");
    } else {
      if(fscanf(ifp, "%i", &n) == 1 && n >= 0){
        int total = (n/p)*p; /* elements actually distributed */
        staging = malloc(sizeof *staging * (size_t)(total > 0 ? total : 1));
        if(staging != NULL){
          for(i=0; i<total; i++){
            if(fscanf(ifp, "%i", &staging[i]) != 1){
              break;
            }
          }
          inputOk = (i == total);
        }
      }
      fclose(ifp);
      if(!inputOk){
        fprintf(stderr, "Can't read input file!\n");
      }
    }
    if(!inputOk){
      free(staging);
      staging = NULL;
      n = -1;
    }
  }

  // Make sure every processor knows n (or that reading it failed)
  bsp_push_reg(&n,sizeof(int));
  bsp_sync();
  bsp_get(0,&n,0,&n,sizeof(int));
  bsp_sync();
  bsp_pop_reg(&n);

  if(n < 0){
    bsp_end();
    exit(1);
  }

  //Setup distribution
  n_divide_p = n/p;
  /* A processor's array must be able to hold the largest run it ever merges. */
  szLocalArray = n/pow(2,ceil(Log2(s+1)));
  localArr = vecalloci(szLocalArray);
  bsp_push_reg(localArr,sizeof(int)*szLocalArray);

  if(s==0){
    printf("Distribution start\n"); fflush(stdout);
  }

  bsp_sync();
  if(s==0){
    //first share stays on proc 0
    for(i=0; i< n_divide_p; i++){
      localArr[i]=staging[i];
    }
    //one put per processor for the remaining shares
    if(n_divide_p > 0){
      for(i=1; i< p; i++){
        bsp_put(i,staging+(size_t)i*n_divide_p,localArr,0,n_divide_p*sizeof(int));
      }
    }
  }
  bsp_sync();
  free(staging); /* NULL everywhere except on processor 0 */
  if(s==0){
    printf("Distribution done\n"); fflush(stdout);
  }

  //Distribution done and we can start time measurement
  if(s==0){
    printf("Time start\n"); fflush(stdout);
  }
  time0 = bsp_time();

  //Locally sort each array
  if(s==0){
    printf("Local sort\n"); fflush(stdout);
  }
  mergeSort(n_divide_p, localArr);
  bsp_sync();

  //Merging
  rounds = Log2(p);
  size_t tempCount = (size_t)pow(2,rounds)*(size_t)n_divide_p;
  temp = malloc(sizeof *temp * tempCount);
  if(temp == NULL && tempCount > 0){
    fprintf(stderr, "Out of memory!\n");
    exit(1);
  }
  for(j=1;j<rounds+1;j++){
    int partnerGap = p/pow(2,j);       /* distance to the processor we merge with */
    int count = pow(2,j-1)*n_divide_p; /* length of each run in this round */
    int active = s<p/pow(2,j);         /* does this processor merge in round j? */

    //Fetch the partner's whole run in one transfer
    if(active && count > 0){
      bsp_get(s+partnerGap,localArr,0,temp,count*sizeof(int));
    }
    bsp_sync();

    if(active){
      merge2(localArr, temp, count);
    }

    bsp_sync();
    if(s==0){
      printf("Round %i out of %i rounds of merging done (on proc 0)\n",j,rounds); fflush(stdout);
    }
  }
  if(s==0){
    printf("Sorting done\n"); fflush(stdout);
  }
  bsp_sync();

  //Parallel algorithm ends
  time1 = bsp_time();
  if(s==0){
    printf("Time stop\n"); fflush(stdout);
  }

  //Report time to user
  if(s==0){
    printf("Sorting took %.6lf seconds.\n", time1-time0); fflush(stdout);
  }

  //Clean up
  free(temp);
  bsp_pop_reg(localArr); free(localArr);

  bsp_end();
} /* End bspParSort */