void bspfft_test() { void bspfft( double * x, int n, int p, int s, int sign, double * w0, double * w, double * tw, int *rho_np, int *rho_p ); void bspfft_init( int n, int p, int s, double * w0, double * w, double * tw, int *rho_np, int *rho_p ); int k1_init( int n, int p ); int p, s, n, q, np, k1, j, jglob, it, *rho_np, *rho_p; double time0, time1, time2, ffttime, nflops, max_error, error_re, error_im, error, *Error, *x, *w0, *w, *tw; bsp_begin( P ); p = bsp_nprocs(); s = bsp_pid(); bsp_push_reg( &n, SZINT ); Error = vecallocd( p ); bsp_push_reg( Error, p * SZDBL ); bsp_sync(); if ( s == 0 ) { printf( "Please enter length n: \n" ); #ifdef _WIN32 scanf_s( "%d", &n ); #else scanf( "%d", &n ); #endif if ( n < 2 * p ) { bsp_abort( "Error in input: n < 2p" ); } for ( q = 1; q < p; q++ ) { bsp_put( q, &n, &n, 0, SZINT ); } } bsp_sync(); if ( s == 0 ) { printf( "FFT of vector of length %d using %d processors\n", n, p ); printf( "performing %d forward and %d backward transforms\n", NITERS, NITERS ); } /* Allocate, register, and initialize vectors */ np = n / p; x = vecallocd( 2 * np ); bsp_push_reg( x, 2 * np * SZDBL ); k1 = k1_init( n, p ); w0 = vecallocd( k1 ); w = vecallocd( np ); tw = vecallocd( 2 * np + p ); rho_np = vecalloci( np ); rho_p = vecalloci( p ); for ( j = 0; j < np; j++ ) { jglob = j * p + s; x[2 * j] = ( double )jglob; x[2 * j + 1] = 1.0; } bsp_sync(); time0 = bsp_time(); /* Initialize the weight and bit reversal tables */ for ( it = 0; it < NITERS; it++ ) { bspfft_init( n, p, s, w0, w, tw, rho_np, rho_p ); } bsp_sync(); time1 = bsp_time(); /* Perform the FFTs */ for ( it = 0; it < NITERS; it++ ) { bspfft( x, n, p, s, 1, w0, w, tw, rho_np, rho_p ); bspfft( x, n, p, s, -1, w0, w, tw, rho_np, rho_p ); } bsp_sync(); time2 = bsp_time(); /* Compute the accuracy */ max_error = 0.0; for ( j = 0; j < np; j++ ) { jglob = j * p + s; error_re = fabs( x[2 * j] - ( double )jglob ); error_im = fabs( x[2 * j + 1] - 1.0 ); error = sqrt( error_re * error_re + error_im * 
error_im ); if ( error > max_error ) { max_error = error; } } bsp_put( 0, &max_error, Error, s * SZDBL, SZDBL ); bsp_sync(); if ( s == 0 ) { max_error = 0.0; for ( q = 0; q < p; q++ ) { if ( Error[q] > max_error ) { max_error = Error[q]; } } } for ( j = 0; j < NPRINT && j < np; j++ ) { jglob = j * p + s; printf( "proc=%d j=%d Re= %f Im= %f \n", s, jglob, x[2 * j], x[2 * j + 1] ); } fflush( stdout ); bsp_sync(); if ( s == 0 ) { printf( "Time per initialization = %lf sec \n", ( time1 - time0 ) / NITERS ); ffttime = ( time2 - time1 ) / ( 2.0 * NITERS ); printf( "Time per FFT = %lf sec \n", ffttime ); nflops = 5 * n * log( ( double )n ) / log( 2.0 ) + 2 * n; printf( "Computing rate in FFT = %lf Mflop/s \n", nflops / ( MEGA * ffttime ) ); printf( "Absolute error= %e \n", max_error ); printf( "Relative error= %e \n\n", max_error / n ); } bsp_pop_reg( x ); bsp_pop_reg( Error ); bsp_pop_reg( &n ); bsp_sync(); vecfreei( rho_p ); vecfreei( rho_np ); vecfreed( tw ); vecfreed( w ); vecfreed( w0 ); vecfreed( x ); vecfreed( Error ); bsp_end(); } /* end bspfft_test */
int main (int argc, char** argv) { // aim for a nonzero density given by sparsity: sparsity = 0.2; // nz = sparsity*100% of the size of the matrix /* * we say 'aim' here, since of course initially exactly * nz = sparsity * N^2 * nonzeroes will be generated at random spots, but because * the matrix must be symmetric and diagonally positive, the * actual number of nonzeroes will probably not be exactly * the projected number. */ // read the desired size of the matrix from command line if (argc < 2) { printf("Usage: %s N [mu] [sparsity]\n", argv[0]); exit(-1); } if(sscanf(argv[1], "%d", &N) != 1) { printf("couldn't read command-line argument for N. must be an integer.\n"); exit(-2); } double mu; mu = 2.5; //default scalar for making matrix diagonal-dominant // maybe the user supplied a different mu if(argc > 2 && sscanf(argv[2], "%lf", &mu) != 1) { exit(-2); } // maybe the user supplied a different sparsity if(argc > 3 && sscanf(argv[3], "%lf", &sparsity) != 1) { exit(-2); } int nz = sparsity*N*N; int* xs; int* ys; double* vals; fprintf(stderr,"Generating matrix. N=%d, density=%lf, target nz=%d, ", N, sparsity, nz); fprintf(stderr, "mu = %lf\n", mu); // seed the random generator. srandom((unsigned)time(NULL)); xs = vecalloci(nz); ys = vecalloci(nz); vals = vecallocd(nz); bool* diag_done; diag_done = malloc(N*sizeof(bool)); int i; for(i = 0; i<N; i++) { diag_done[i] = false; } int nz_generated; int x,y; nz_generated = 0; double fake_transpose; x=0;y=0; while(x<N && y<N) { //don't escape matrix bounds. if (nz_generated % 1000000 == 0) { fprintf(stderr,"progress: %f%%\r", (double)nz_generated/(double)(nz/2.0)*100.0); } if(x==y) { //diagonal, so always generate. xs[nz_generated]=x; ys[nz_generated]=y; vals[nz_generated]=ran()*2.0-1.0; diag_done[x] = true; #ifdef DEBUG fprintf(stderr,"generated A[//][%d]=%lf\n" , ys[nz_generated] , vals[nz_generated]); #endif nz_generated++; } else { // not a diagonal. 
only add if in // lower triangular half if(x<y) { if(nz_generated > nz) { // this should NEVER happen, although all that's // stopping it from happening is our ran() being // well-behaved........... printf("EEK! something went wrong!!\n"); exit(666); } xs[nz_generated]= x; ys[nz_generated]= y; // simulate the distribution of values which // would occur if we do A+A^T afterwards. if (ran() < sparsity) { fake_transpose = ran()*2.0-1.0; vals[nz_generated]= ran()*2.0-1.0 + fake_transpose; } else { vals[nz_generated]= ran()*2.0-1.0; } #ifdef DEBUG fprintf(stderr,"generated A[%d][%d]=%lf\n", xs[nz_generated] , ys[nz_generated] , vals[nz_generated]); #endif nz_generated++; } } x += 1/sparsity * (ran() + 0.5); if( x >= N ) { y += x/N; x = x%N; } } fprintf(stderr, "generated initial randoms\n"); int diagonals_present = 0; for(i=0; i<nz_generated; i++) { if(xs[i]==ys[i]) diagonals_present++; } fprintf(stderr,"generated %d nzeros, array was %d big.\n", nz_generated, nz); #ifdef DEBUG fprintf(stderr, "found %d diagonal(s), still need %d more.\n", diagonals_present, (N-diagonals_present)); #endif // add the missing diagonals, and add mu to each diagonal. int newsize = nz_generated + (N - diagonals_present); fprintf(stderr,"reallocating values array to %lluM \n", (unsigned long long)(SZDBL+2*SZINT)*newsize/1048576); int* diag_i; int* diag_j; double* diag_val; diag_i = realloc(xs ,SZINT*newsize); diag_j = realloc(ys ,SZINT*newsize); diag_val = realloc(vals,SZDBL*newsize); if(diag_i == NULL || diag_j == NULL || diag_val == NULL) { printf("out of memory!"); exit(44); } addDiagonal(mu, diag_i, diag_j, diag_val, nz_generated, newsize, diag_done); nz_generated=newsize; #ifdef DEBUG for(i=0;i<newsize;i++) { fprintf(stderr,"after addDiagonal A[%d][%d]=%lf\n", diag_i[i],diag_j[i], diag_val[i]); } fprintf(stderr, "Going to make symmetric now... 
(nz_generated = %d)\n", nz_generated); #endif // now we explicitly fill the array with the // upper triangle values // things must be symmetric, but they aren't, yet // ... here's a good place to do the transposing thing. newsize = nz_generated * 2 - N; //number of real nonzeros, don't // count diagonals twice. int *new_i; int *new_j; double *new_v; new_i = realloc(diag_i ,SZINT*newsize); new_j = realloc(diag_j ,SZINT*newsize); new_v = realloc(diag_val,SZDBL*newsize); if(new_i == NULL || new_i == NULL || new_v == NULL) { printf("out of memory (2)!"); exit(44); } diag_i = new_i; diag_j = new_j; diag_val = new_v; addTranspose(newsize,diag_i,diag_j,diag_val, nz_generated); #ifdef DEBUG for(i=0;i<newsize;i++) // to make diags stand out. if(diag_i[i]==diag_j[i]) fprintf(stderr,"after transpose A[%d][%d]=%lf \\\\\n", diag_i[i],diag_j[i], diag_val[i]); else fprintf(stderr,"after transpose A[%d][%d]=%lf\n", diag_i[i],diag_j[i], diag_val[i]); #endif checkStrictDiagonallyDominant(diag_i,diag_j,diag_val, newsize); // now quickly generate a test-vector to solve against: double *vec = vecallocd(N); for(i=0;i<N;i++) vec[i]=ran(); fprintf(stderr,"Left with %d nonzeroes; nonzero density = %lf (desired=%lf)\n", newsize, newsize/((double)N*N), sparsity); fprintf(stderr,"========== OUTPUTTING ... ==========\n"); outputMondriaanMatrix(newsize, diag_i, diag_j, diag_val, vec); outputMathematicaMatrix(newsize, diag_i, diag_j, diag_val, vec); free(diag_done); free(vec); free(diag_i); free(diag_j); free(diag_val); return 0; }
void mainloop(){ //int init[N*N] = {0,3,8,1000,-4, 1000,0,1000,1,7,1000,4,0,1000,1000, //2,1000,-5,0,1000,1000,1000,1000,6,0}; int i,j,k,l,v,t,lsize,*lsize_m,*lrow,*lcol, *linit, *linter,*startrow_m; int li,lj,lk,startrow, endrow,g; int* init = gen_graph(N, 0.05); bsp_begin(bsp_nprocs()); /**********Initialization***************/ /*******Comp. Superstep 0******/ lsize = nloc(bsp_nprocs(),bsp_pid(), N); //Get the number of rows of processor s lrow = vecalloci(lsize*N); //The main storing array of processor s lcol = vecalloci(N); //array to hold the column for the matrix squaring startrow_m = vecalloci(bsp_nprocs()); //array to hold all processors starting global row lsize_m = vecalloci(bsp_nprocs()); //array to hold the number of rows of all processors linter = vecalloci(lsize*N); //Intermidiate array used for the matrix "multiplication" bsp_push_reg(startrow_m,bsp_nprocs()*SZINT); bsp_push_reg(lsize_m,bsp_nprocs()*SZINT); bsp_push_reg(lrow,lsize*N*SZINT); /****Get the first and last global row of processor s***/ if(bsp_pid() == (bsp_nprocs() - 1)){ startrow = (N - lsize); endrow = N; }else{ startrow = bsp_pid()*lsize; endrow = bsp_pid()*lsize + lsize; } //Distribute Data, according row block distribution li=0; for ( i= startrow; i < endrow; i++) { lj=0; for(j=0; j < N; j++) { lrow[N*li+lj] = init[N*i+j]; lj++; } li++; } vecfreei(init); //out of the shared enviroment //initialize arrays for ( i=0; i<bsp_nprocs(); i++) { startrow_m[i] = 0; lsize_m[i] = 0; } bsp_sync(); /*******End Comp. Superstep 0******/ /*********Comm. Superstep 1********/ //Communicate the global starting rows of all processors for(g=0; g<bsp_nprocs();g++){ bsp_put(g,&startrow,&startrow_m[0],bsp_pid()*SZINT,SZINT); bsp_put(g,&lsize,&lsize_m[0],bsp_pid()*SZINT,SZINT); } /*********End Comm. Superstep 1*****/ bsp_sync(); /**********End Initialization***************/ double time0= bsp_time(); /*********Repeated Squaring loop start*************/ j=1; while ((N-1) > j) { /****Comp. 
Superstep j0****/ //initialize arrays for ( i=0; i<N*lsize; i++) { linter[i] = 1000; } for ( i=0; i<N; i++) { lcol[i] = 0; } bsp_sync(); /****End Comp. Superstep j0****/ for ( lj=0; lj < N; lj++) { /***Comm. SuperStep jlj0*******/ //get global column lj t=0; for(g=0; g < bsp_nprocs();g++){ for(v=0; v<lsize_m[g]; v++){ bsp_get(g,&lrow[0],(lj+v*N)*SZINT,&lcol[t],SZINT); t++; } } bsp_sync(); /***End Comm. SuperStep jlj0***/ /***Comp. SuperStep jlj1*******/ //update the values that use global column lj for ( li = 0; li < lsize; li++){ for ( lk=0; lk < N; lk++) { linter[N*li+lj] = fmin(linter[N*li+lj], lrow[N*li+lk]+lcol[lk]); } } bsp_sync(); /***End Comp. SuperStep jlj1***/ } /****Comp. Superstep j1****/ memcpy(lrow,linter,N*lsize*SZINT); j=2*j; bsp_sync(); /****End Comp. Superstep j1****/ } /*********Repeated Squaring loop end*************/ double time1= bsp_time(); bsp_sync(); /*********display matrices and time*********/ if(bsp_pid()==0){ printf( " \n Block Row Distr (need to know basis) calculation of APSP took: %f seconds \n", time1-time0 ); } /*for(g = 0; g < bsp_nprocs(); g++){ if(bsp_pid()==g){ printf("\n i am proc %d and i have APSP Mat \n",bsp_pid()); for(k=0;k<lsize;k++) { printf("\n"); for(l=0;l<N;l++){ printf("\t %d",lrow[N*k+l]); } printf("\n \n "); } } bsp_sync(); }*/ //Clean up bsp_pop_reg(startrow_m); bsp_pop_reg(lsize_m); bsp_pop_reg(lrow); vecfreei(lrow); vecfreei(lcol); vecfreei(startrow_m); vecfreei(lsize_m); vecfreei(linter); bsp_end(); }
void mainloop(){ //int init[N*N] = {0,3,8,1000,-4, 1000,0,1000,1,7,1000,4,0,1000,1000, //2,1000,-5,0,1000,1000,1000,1000,6,0}; int nlr,nlc,s,t,i,j,k,l,li,lsize,tsize0, tsize1,tempp,tempoff,rpos,cpos, *lpart,*linter,*gindx,*lcol,*lrow,*lsrow, *lscol, *ltrow, *ltcol, *temp; int* init = gen_graph(N, 0.05); bsp_begin(bsp_nprocs()); /**********Initialization SuperStep 0***************/ //Compute global row and column indeces for each element int pm = sqrt(bsp_nprocs()); int pn = (bsp_nprocs())/pm; /* Compute 2D processor numbering from 1D numbering with failsafe if the number of processors are not enough, back to simple 1D cyclic distribution */ if ( pn != pm ){ pn = bsp_nprocs(); pm = 1; t = bsp_pid(); s = 0; }else{ s= bsp_pid()%pm; /* 0 <= s < pm */ t= bsp_pid()/pn; /* 0 <= t < pn */ } nlr= nloc(pm,s,N); /* number of local rows */ nlc= nloc(pn,t,N); /* number of local columns */ lsize = nlr*nlc; //interpret 2D size to array size lpart = vecalloci(lsize); //Initialize local part of processor s linter = vecalloci(lsize); //Intermidiate array used for the matrix "multiplication" gindx = vecalloci(lsize); //Array to store the global indeces of the local elements lcol = vecalloci(lsize); //Array to store the glocal column index lrow = vecalloci(lsize); //Array to store the glocal row index bsp_push_reg(lpart,lsize*SZINT); //Distribute the Data li=0; for ( i= 0; i < N; i++){ for ( j= 0; j < N; j++){ if ((j % pn) == t){ lpart[li] = init[N*i+j]; lrow[li] = i; lcol[li] = j; gindx[li] = N*i+j; li++; } } } /*for ( i= 0; i < N*N; i++) { if(bsp_pid() == (i % bsp_nprocs())){ lpart[li] = init[i]; lrow[li] = i/N; lcol[li] = i % N; gindx[li] = i; li++; } }*/ vecfreei(init);//out of the shared space tsize0 = tsize1 =lsize; temp = lrow; //find unique global rows for processor s for(i=0;i<tsize0;i++){ for(j=0;j<tsize0;j++){ if(i==j){ continue; } else if(*(temp+i)==*(temp+j)){ k=j; tsize0--; while(k < tsize0){ *(temp+k)=*(temp+k+1); k++; } j=0; } } } temp = lcol; //find unique global 
column for processor s for(i=0;i<tsize1;i++){ for(j=0;j<tsize1;j++){ if(i==j){ continue; } else if(*(temp+i)==*(temp+j)){ k=j; tsize1--; while(k < tsize1){ *(temp+k)=*(temp+k+1); k++; } j=0; } } } //keep unique global rows and columns in arrays //initialize arrays to hold the elements of those rows and columns(ltcol, ltrow) lscol = vecalloci(tsize1); lsrow = vecalloci(tsize0); ltcol = vecalloci(N*tsize1); ltrow = vecalloci(N*tsize0); for(i=0;i < tsize0;i++){ lsrow[i] = lrow[i]; } for(i=0;i < tsize1;i++){ lscol[i] = lcol[i]; } vecfreei(lcol);//not needed from this point on vecfreei(lrow);//we use lscol, lsrow, ltrow, ltcol //sort arrays qsort (lsrow, tsize0, sizeof(int), compare_int); qsort (lscol, tsize1, sizeof(int), compare_int); bsp_sync(); /**********End Initialization SuperStep 0***************/ double time0= bsp_time(); /*********Repeated Squaring loop start*************/ j=1; while ((N-1) > j) { /*************Comm. SuperStep j0*************/ for(i=0;i < tsize1;i++){ for(k=0; k<N;k++){ tempp=((N*k+lscol[i]) % bsp_nprocs()); tempoff = ((double)(N*k+lscol[i])/(double)bsp_nprocs()); bsp_get(tempp, &lpart[0],tempoff*SZINT, <col[N*i+k],SZINT); } } for(i=0;i < tsize0;i++){ for(k=0; k<N;k++){ tempp=((N*lsrow[i]+k) % bsp_nprocs()); tempoff = ((double)(N*lsrow[i]+k)/(double)bsp_nprocs()); bsp_get(tempp, &lpart[0],tempoff*SZINT, <row[N*i+k],SZINT); } } bsp_sync(); /*************End Comm. SuperStep j0*************/ /*************Comp. 
SuperStep j1*************/ for ( i=0; i<lsize; i++) { int gcol = gindx[i] % N; //get global col indx of current element int grow = gindx[i]/N; //get global row indx of current element linter[i]=1000;//initiliaze array //find appropriate indx of the global rows and columns to perform "multiplication" /*for ( l=0; l < tsize0;l++){ if(grow == lsrow[l]){ rpos =l; break; } }*/ int *rp = bsearch (&grow, lsrow, tsize0, sizeof (lsrow),compare_int); rpos = rp - lsrow; int *cp = bsearch (&gcol, lscol, tsize1, sizeof (lscol),compare_int); cpos = cp - lscol; /*for ( l=0; l < tsize1;l++){ if(gcol == lscol[l]){ cpos =l; break; } }*/ //this is where the update is done for(k=0;k<N;k++){ linter[i] = fmin(linter[i], ltrow[N*rpos + k]+ltcol[N*cpos + k]); } } memcpy(lpart,linter,lsize*SZINT); j = 2*j; bsp_sync(); /*************End Comp. SuperStep j1*************/ } /*********Repeated Squaring loop end*************/ double time1= bsp_time(); bsp_sync(); /*********display matrices and time*********/ if(bsp_pid()==0){ printf( " \n Block Cyclic Distr calculation of APSP took: %f seconds \n", time1-time0 ); } /*printf("\n The array is, proc %d \n ", bsp_pid()); for(i=0;i < lsize;i++){ printf(" %d",lpart[i]); }*/ printf("\n "); //clean up bsp_pop_reg(lpart); vecfreei(lpart); vecfreei(linter); vecfreei(lscol); vecfreei(lsrow); vecfreei(ltcol); vecfreei(ltrow); vecfreei(gindx); bsp_end(); }
/*
 * bspParSort -- SPMD body of a parallel merge sort.
 *
 * Processor 0 reads the element count n and then the numbers from the file
 * "sort", handing n/p of them to every processor. Each processor sorts its
 * part with mergeSort. In round j (j = 1 .. Log2(p)) every processor s with
 * s < p/2^j fetches the sorted run of processor s + p/2^j and combines it
 * with its own run through merge2. Processor 0 prints progress messages and
 * the elapsed time.
 *
 * Takes no arguments and returns nothing. The BSP computation is aborted
 * when the input file cannot be opened or read, or when the merge buffer
 * cannot be allocated.
 */
void bspParSort(){
    int Log2(int x);
    void mergeSort(int x, int *temp1);
    void merge2(int *arr1, int *arr2, int size);

    int *localArr;       /* local array in each processor */
    int i,j,k;           /* index variables */
    int n_divide_p;      /* Avoid multiple computation */
    int n;               /* Number of elements to be sorted */
    int szLocalArray;    /* Size of local array */
    int rounds;          /* Number of merge rounds, Log2(p) */
    double time0, time1; /* Time */
    FILE *ifp = 0;       /* Reader to read sequence of numbers to be sorted */

    bsp_begin(P);
    int p = bsp_nprocs(); /* Number of processors obtained */
    int s = bsp_pid();    /* Processor number */

    //Get number of elements to be sorted
    if (s == 0) {
        ifp = fopen("sort","r");
        if (ifp == NULL) {
            // bsp_abort stops all processors; a plain exit() here would run
            // on processor 0 only
            bsp_abort("Can't open input file!\n");
        }
        if (fscanf(ifp, "%i", &n) != 1) {
            bsp_abort("Can't read the number of elements from the input file!\n");
        }
    }

    // Make sure every processor knows everything
    bsp_push_reg(&n,sizeof(int));
    bsp_sync();
    bsp_get(0,&n,0,&n,sizeof(int));
    bsp_sync();
    bsp_pop_reg(&n);

    //Setup distribution
    n_divide_p = n/p;
    // room for the run this processor holds after its last merge round
    szLocalArray = n/pow(2,ceil(Log2(s+1)));
    localArr = vecalloci(szLocalArray);
    bsp_push_reg(localArr,sizeof(int)*szLocalArray);
    if (s == 0) {
        printf("Distribution start\n");
        fflush(stdout);
    }
    bsp_sync();

    int value;
    if (s == 0) {
        //allocate to array on proc 0
        for (i = 0; i < n_divide_p; i++) {
            if (fscanf(ifp, "%i", &value) != 1) {
                bsp_abort("Input file holds fewer numbers than announced!\n");
            }
            localArr[i] = value;
        }
        //Send to arrays on other processors
        for (i = 1; i < p; i++) {
            for (j = 0; j < n_divide_p; j++) {
                if (fscanf(ifp, "%i", &value) != 1) {
                    bsp_abort("Input file holds fewer numbers than announced!\n");
                }
                bsp_put(i,&value,localArr,j*sizeof(int),sizeof(int));
            }
        }
        fclose(ifp);
    }
    bsp_sync();
    if (s == 0) {
        printf("Distribution done\n");
        fflush(stdout);
    }

    //Distribution done and we can start time measurement
    if (s == 0) {
        printf("Time start\n");
        fflush(stdout);
    }
    time0 = bsp_time();

    //Locally sort each array
    if (s == 0) {
        printf("Local sort\n");
        fflush(stdout);
    }
    mergeSort(n_divide_p, localArr);
    bsp_sync();

    //Merging
    rounds = Log2(p);
    size_t temp_bytes = sizeof(int)*pow(2,rounds)*n_divide_p;
    int *temp = malloc(temp_bytes);
    if (temp == NULL && temp_bytes > 0) {
        bsp_abort("Out of memory for the merge buffer!\n");
    }

    for (j = 1; j < rounds+1; j++) {
        // loop-invariant per round, so computed once instead of on every
        // loop test; kept in double so the values match the inline pow()
        // expressions exactly
        double active = p/pow(2,j);               /* processors that merge */
        double run_len = pow(2,j-1)*n_divide_p;   /* length of one run */

        if (s < active) {
            for (k = 0; k < run_len; k++) {
                bsp_get(s+active,localArr,k*sizeof(int),&(temp[k]),sizeof(int));
            }
        }
        bsp_sync();
        if (s < active) {
            merge2(localArr, temp, run_len);
        }
        bsp_sync();
        if (s == 0) {
            printf("Round %i out of %i rounds of merging done (on proc 0)\n",j,rounds);
            fflush(stdout);
        }
    }
    if (s == 0) {
        printf("Sorting done\n");
        fflush(stdout);
    }
    bsp_sync();

    //Parallel algorithm ends
    time1 = bsp_time();
    if (s == 0) {
        printf("Time stop\n");
        fflush(stdout);
    }

    //Report time to user
    if (s == 0) {
        printf("Sorting took %.6lf seconds.\n", time1-time0);
        fflush(stdout);
    }

    //Clean up
    free(temp);
    bsp_pop_reg(localArr);
    free(localArr);
    bsp_end();
} /* End bspParSort */