/*
 * Collective maximum of the n-element array A over p processors.
 *
 * Each processor scans its own block of ceil(n/p) consecutive elements and
 * sends the block maximum to processor 0. Processor 0 reduces the p
 * candidates and every processor then fetches the result from it.
 *
 * A  array addressed with global indices; all n elements must be readable
 *    on the calling processor.
 * s  id of the calling processor.
 *
 * n and p are taken from the enclosing scope; n must be at least 1.
 * The function synchronises several times, so every processor has to call it.
 * Returns the maximum element of A.
 */
int parallelMax(int* A, int s) {
    int blocksize, localMax, i, limit;
    int* maxs;

    blocksize = ceil((double)n/p);              /* block size */
    i = blocksize * s;                          /* first index of my block */
    limit = MIN(n, blocksize * (s+1));          /* one past my last index */

    if (i < limit) {
        localMax = A[i];
        for (i += 1; i < limit; i++) {
            localMax = MAX(localMax, A[i]);
        }
    } else {
        /* My block is empty (n is not a multiple of blocksize). A[0] is a
           neutral candidate: it can never exceed the true maximum, and it
           avoids reading A[blocksize*s] past the end of the array. */
        localMax = A[0];
    }

    maxs = (int*) malloc(sizeof(int) * p);
    bsp_push_reg(maxs, sizeof(int) * p);        /* make maxs visible globally */
    bsp_sync();

    bsp_put(0, &localMax, maxs, s * sizeof(int), sizeof(int));  /* send localMax to P0 */
    bsp_sync();

    if (s == 0) {
        localMax = maxs[0];
        for (i = 1; i < p; i++) {
            localMax = MAX(localMax, maxs[i]);
        }
    }

    bsp_push_reg(&localMax, sizeof(int));       /* make localMax visible globally */
    bsp_sync();

    bsp_get(0, &localMax, 0, &localMax, sizeof(int));   /* each processor gets the max */
    bsp_sync();

    /* Undo both registrations and release the scratch buffer so repeated
       calls do not accumulate registrations or leak memory. */
    bsp_pop_reg(&localMax);
    bsp_pop_reg(maxs);
    free(maxs);

    return localMax;
}
/*
 * Registers two single-byte variables in two consecutive supersteps, then
 * lets processor 0 write 'y' into both of them on processor 3 with
 * bsp_hpput.
 */
int main() {
    bsp_begin();

    int pid = bsp_pid();
    char first = 0;
    char second = 0;
    char payload = 0;

    /* One registration per superstep. */
    bsp_push_reg(&first, sizeof(char));
    bsp_sync();
    bsp_push_reg(&second, sizeof(char));
    bsp_sync();

    if (pid == 0) {
        char *targets[2];
        int k;

        targets[0] = &first;
        targets[1] = &second;
        payload = 'y';

        /* Same two puts as before, issued in the same order. */
        for (k = 0; k < 2; k++) {
            bsp_hpput(3, &payload, targets[k], 0, sizeof(char));
        }
    }

    bsp_end();
    return 0;
}
double bspip(int p, int s, int n, double *x, double *y){
    /* Inner product of vectors x and y of global length n>=0.
       Every processor multiplies its nloc(p,s,n) local components, sends
       the partial sum to slot s of a p-element buffer on every processor,
       and then adds up the p partial sums it received. All processors
       return the same value. */

    int nloc(int p, int s, int n);

    double partial = 0.0;   /* contribution of the local components */
    double total = 0.0;     /* sum over all processors */
    double *partials;       /* one slot per processor */
    int k, proc;

    partials = vecallocd(p);
    bsp_push_reg(partials, p*SZDBL);
    bsp_sync();

    for (k = 0; k < nloc(p,s,n); k++)
        partial += x[k]*y[k];

    /* Broadcast my partial sum into slot s on every processor. */
    for (proc = 0; proc < p; proc++)
        bsp_put(proc, &partial, partials, s*SZDBL, SZDBL);
    bsp_sync();

    for (proc = 0; proc < p; proc++)
        total += partials[proc];

    bsp_pop_reg(partials);
    vecfreed(partials);

    return total;

} /* end bspip */
void Simulation::report() { size_t total_p = bsp_nprocs(); size_t s = bsp_pid(); double *densities = new double[total_p](); double current_density = 0; bsp_push_reg(densities,total_p * sizeof(double)); bsp_sync(); for (auto node : d_domain->nodes) current_density += density(d_domain->set, node); // send density to each processor for (size_t t = 0; t < total_p; t++) bsp_put(t, ¤t_density, densities, s * sizeof(double), sizeof(double)); bsp_sync(); // now calculate the total density double total_density = 0; for (size_t t = 0; t < total_p; t++) total_density += densities[t]; bsp_pop_reg(densities); if (s == 0) std::cout << "Total density: " << total_density << '\n'; delete[] densities; }
void withPut() { bsp_begin( 4 ); std::map<char, int*> m; for(int i = 0; i < 4; ++i) { if(i == bsp_pid()) { // Init m['m'] = new int[3]; memset(m['m'], 0, sizeof(int)*3); m['s'] = new int[3]; memset(m['s'], 0, sizeof(int)*3); if(0 == bsp_pid() % 2) { m['s'][0] = bsp_pid()*5+1; m['s'][1] = bsp_pid()*10+1; m['s'][2] = bsp_pid()*15+1; //std::cout << "proc " << bsp_pid() << " unregistered " // << m['m'] << std::endl << std::flush; } //if(1 == bsp_pid() % 2) { std::cout << "proc " << bsp_pid() << " registering " << m['s'] << std::endl << std::flush; bsp_push_reg(m['s'], 3*sizeof(int)); //} } bsp_sync(); } bsp_sync(); for(int i = 0; i < 4; ++i) { if(i == bsp_pid()) { if(0 == i % 2) { std::cout << "proc " << bsp_pid() << " puts to proc " << bsp_pid() + 1 << "data from " << m['s'] << " to " << m['s'] << std::endl << std::flush; bsp_put(bsp_pid() + 1, m['s'], m['s'], 0, 3 * sizeof(int)); } } bsp_sync(); } bsp_sync(); // print values for(int i = 0; i < 4; ++i) { if(i == bsp_pid()) { bsp_pop_reg(m['s']); std::cout << "Proc {" << bsp_pid() << "} contains" << std::endl << std::flush; for(int i = 0; i < 3; ++i) std::cout << m['s'][i] << " "; std::cout << std::endl << std::flush; } bsp_sync(); } bsp_end(); }
/*
 * Error-path test for registration and bsp_hpput.
 * Core 2 is made to perform two registrations inside one superstep, core 1
 * puts to an address that was never registered, and core 0 overwrites the
 * test string on core 1. The "expect:" comments record the anticipated
 * output and are kept verbatim.
 */
int main() {
    bsp_begin();

    int var = bsp_pid();
    int* unregistered_var = (int*)0x7000;

    char teststr[] = "Default test string!";
    char goodstr[] = "Replacement string.";

    const int core = bsp_pid();

    bsp_push_reg(&var, sizeof(int));
    if (core != 2)
        bsp_sync();

    bsp_push_reg(teststr, sizeof(int));
    // Only core 2 will do both registrations in the same sync
    if (core == 2)
        bsp_sync();
    // expect: ($02: BSP ERROR: multiple bsp_push_reg calls within one sync)

    switch (core) {
    case 1:
        bsp_hpput(0, &var, &var, 0, sizeof(int));
        bsp_hpput(0, &var, unregistered_var, 0, sizeof(int)); // Error
        // expect: ($01: BSP ERROR: could not find bsp var 0x7000)
        break;
    case 0:
        bsp_hpput(1, goodstr, teststr, 0, 19 * sizeof(char));
        break;
    default:
        break;
    }
    bsp_sync();

    if (core == 0)
        ebsp_message("%d", var);
    // expect: ($00: 1)
    bsp_sync();

    if (core == 1)
        ebsp_message(teststr);
    // expect: ($01: Replacement string.!)

    bsp_end();
    return 0;
}
void DLargestCommonSubSequence::distributedInit() { // allocate storage for rows before int rowsToExportPerProc = (0 == chunkStride_%n_) ? chunkStride_/n_ : chunkStride_/n_+1; //std::cout << "rowsToExportPerProc " // << rowsToExportPerProc << std::endl; for(int i = 0; i < rowsToExportPerProc; ++i) { Row curr; curr.resize(chunkLength_); over_.push_back(curr); bsp_push_reg(over_.back().data(), sizeof(int)*chunkLength_); //std::cout << "Proc " << id_ << // " Last Row index " << i << " addr: "; // std::cout << over_.back().data() // << std::endl << std::flush; } for(int i = 0; i < chunkStride_; ++i) { for(int j = 0; j < chunkStride_; ++j) { if(id_ != i % n_) continue; // allocate L L_[getCPair(i,j)] = new int*[chunkLength_]; for(int k = 0; k < chunkLength_; ++k) { L_[getCPair(i,j)][k] = new int [chunkLength_]; memset(L_[getCPair(i,j)][k], 0, chunkLength_*sizeof(int)); } //allocate chunks for lastRow if(i > 0) { L_[getCPair(i-1,j)] = new int*[chunkLength_]; for(int k = 0; k < chunkLength_; ++k) { if(k < chunkLength_-1) { L_[getCPair(i-1,j)][k] = NULL; } else { L_[getCPair(i-1,j)][k] = new int[chunkLength_]; memset(L_[getCPair(i-1,j)][k], 0, chunkLength_*sizeof(int)); } } } } } }
void bspinprod(){ double bspip(int p, int s, int n, double *x, double *y); int nloc(int p, int s, int n); double *x, alpha, time0, time1; int p, s, n, nl, i, iglob; bsp_begin(P); p= bsp_nprocs(); /* p = number of processors obtained */ s= bsp_pid(); /* s = processor number */ if (s==0){ printf("Please enter n:\n"); fflush(stdout); scanf("%d",&n); if(n<0) bsp_abort("Error in input: n is negative"); } bsp_push_reg(&n,SZINT); bsp_sync(); bsp_get(0,&n,0,&n,SZINT); bsp_sync(); bsp_pop_reg(&n); nl= nloc(p,s,n); x= vecallocd(nl); for (i=0; i<nl; i++){ iglob= i*p+s; x[i]= iglob+1; } bsp_sync(); time0=bsp_time(); alpha= bspip(p,s,n,x,x); bsp_sync(); time1=bsp_time(); printf("Processor %d: sum of squares up to %d*%d is %.lf\n", s,n,n,alpha); fflush(stdout); if (s==0){ printf("This took only %.6lf seconds.\n", time1-time0); fflush(stdout); } vecfreed(x); bsp_end(); } /* end bspinprod */
void spmd() { bsp_begin( 4 ); /// Init std::vector<unsigned int> a; std::vector<unsigned int> b; a.resize(3); if(0 == bsp_pid() % 2) { a = {bsp_pid()*5+1, bsp_pid()*10+1, bsp_pid()*15+1}; b = {bsp_pid()*5+1, bsp_pid()*10+1, bsp_pid()*15+1}; } bsp_sync(); bsp_push_reg (b.data (), b.size () * sizeof(unsigned int)); bsp_sync (); // getting values of even into odd if(1 == bsp_pid() % 2) { bsp_get(0, b.data(), 1* sizeof(unsigned int), a.data(), 2 * sizeof(unsigned int)); } bsp_sync(); // print values for(int i = 0; i < 4; ++i) { if(i == bsp_pid()) { std::cout << "Proc {" << bsp_pid() << "} contains" << std::endl << std::flush; for(std::vector<unsigned int>::const_iterator it = a.cbegin(); it != a.cend(); ++it) std::cout << *it << " "; std::cout << std::endl << std::flush; } bsp_sync(); } bsp_pop_reg (b.data()); bsp_end(); }
void withMaps() { bsp_begin( 4 ); // Init std::map<char, int*> n; std::map<char, std::vector<unsigned int> > m; std::vector<unsigned int> a; a.resize(3); m['m'] = a; bsp_push_reg(m['m'].data(), m['m'].capacity()*sizeof(unsigned int)); if(0 == bsp_pid() % 2) { m['m'] = {bsp_pid()*5+1, bsp_pid()*10+1, bsp_pid()*15+1}; } bsp_sync(); // getting values of even into odd if(1 == bsp_pid() % 2) { bsp_get(bsp_pid() - 1, m['m'].data(), 0, m['m'].data(), 3 * sizeof(unsigned int)); } bsp_sync(); // print values for(int i = 0; i < 4; ++i) { if(i == bsp_pid()) { std::cout << "Test " << n['n'] <<std::endl; std::cout << "Proc {" << bsp_pid() << "} contains" << std::endl << std::flush; for(std::vector<unsigned int>::const_iterator it = m['m'].cbegin(); it != m['m'].cend(); ++it) std::cout << *it << " "; std::cout << std::endl << std::flush; } bsp_sync(); } bsp_pop_reg(m['m'].data()); bsp_end(); }
//initialisation function for ip
//
// Allocates one double per process and registers the buffer for
// communication. The registration is queued by bsp_push_reg here; this
// function does not synchronise itself.
//
// ip_buffer: out-parameter that receives the newly allocated buffer. The
//            caller owns it and has to deregister and free it.
void ip_init( double **ip_buffer ) {
	const size_t size = bsp_nprocs() * sizeof(double);

	*ip_buffer = (double *) malloc( size );
	if( *ip_buffer == NULL ) {
		// Registering a NULL pointer with a non-zero size would only fail
		// later and less clearly, so stop here.
		bsp_abort( "ip_init: could not allocate %lu bytes\n", (unsigned long) size );
	}

	bsp_push_reg( *ip_buffer, size );
}
void countSort() { int s, i, j, min, max, blocksize, start, limit, localCount, index; double time, time0, time1; int *A, *C, *localB, *sizes; bsp_begin(p); //Begin Parallel s = bsp_pid(); //Current Processor Number A = (int*)malloc(sizeof(int) * n); if(s == 0) { for(i = 0; i < n; i++) { //Generating the array A[i] = rand() % 100; } } bsp_sync(); //sync time0 = bsp_time(); bsp_push_reg(A, sizeof(int) * n); //push the array bsp_sync(); //sync bsp_get(0, A, 0, A, sizeof(int)* n); //Get the array bsp_sync(); //sync min = parallelMin(A, s); //find min of A max = parallelMax(A, s); //find max of A blocksize = ceil((double)(max - min + 1)/p); //get block size start = blocksize * s; //start index in C limit = MIN(max - min + 1, blocksize * (s+1)); //end index in C C = (int*) malloc(sizeof(int) * blocksize); //init C to 0 for(i = 0; i < blocksize; i++) { C[i] = 0; } localCount = 0; for(i = 0; i < n;i++) { //fill C for values in Range[start, limit[ int tmp = A[i] - min; if(start <= tmp && tmp < limit) { C[tmp - start] += 1; localCount++; } } if(localCount > 0) { localB = (int*)malloc(sizeof(int) * localCount); int tmp = limit - start; int j = 0; for(i = 0; i < tmp; i++) { //Generate localB from C if(C[i] > 0) { localB[j] = i + start + min; C[i] = C[i] - 1; i--; j++; } } } sizes = (int*) malloc(sizeof(int) * p); bsp_push_reg(sizes, sizeof(int) * p); bsp_sync(); //sync bsp_put(0, &localCount, sizes, s * sizeof(int), sizeof(int)); //send localCount to P0 bsp_sync(); //sync index = 0; bsp_push_reg(&index, sizeof(int)); bsp_sync(); //sync if(s == 0) { //Processor 0 sends start index to all processors int tmp = 0; for(i = 0; i < p; i++) { bsp_put(i, &tmp, &index, 0, sizeof(int)); tmp += sizes[i]; } } bsp_sync(); //sync bsp_put(0, localB, A, index * sizeof(int), sizeof(int) * localCount); //put localB in its place in A bsp_sync(); //A now contains the final sorted array time1 = bsp_time(); time = time1 - time0; if(s == 0) { //printing Result printf("Number of processors: %d \t 
Input Size: %d \t Time Taken: %.8f", p, n, time); } bsp_end(); }
/*
 * Parallel merge sort.
 *
 * Processor 0 reads the element count n and then the elements from the file
 * "sort", and hands n/p elements to every processor. Each processor sorts
 * its part with mergeSort. In each of the Log2(p) merge rounds the lower
 * processors fetch the run held by a partner processor and merge it into
 * their own with merge2. Processor 0 prints progress and the elapsed time.
 *
 * Changes from the previous version: a missing input file or an unreadable
 * element count now ends the whole computation through bsp_abort. Calling
 * exit(1) on processor 0 alone left the other processors waiting in
 * bsp_sync.
 */
void bspParSort(){

    int Log2(int x);
    void mergeSort(int x, int *temp1);
    void merge2(int *arr1, int *arr2, int size);

    int *localArr;      /* local array in each processor */
    int i,j,k;          /* index variables */
    int n_divide_p;     /* Avoid multiple computation */
    int n;              /* Number of elements to be sorted */
    int szLocalArray;   /* Size of local array */
    double time0, time1;    /* Time */
    FILE *ifp = 0;      /* Reader to read sequence of numbers to be sorted */

    bsp_begin(P);
    int p= bsp_nprocs(); /* Number of processors obtained */
    int s= bsp_pid();    /* Processor number */

    //Get number of elements to be sorted
    if(s==0){
        ifp = fopen("sort","r");
        if(ifp == NULL){
            bsp_abort("Can't open input file!\n");
        }
        if(fscanf(ifp, "%i", &n) != 1){
            bsp_abort("Can't read the number of elements from the input file!\n");
        }
    }

    // Make sure every processor knows everything
    bsp_push_reg(&n,sizeof(int));
    bsp_sync();

    bsp_get(0,&n,0,&n,sizeof(int));
    bsp_sync();
    bsp_pop_reg(&n);

    //Setup distribution
    n_divide_p = n/p;
    /* Low-numbered processors get room for the merged runs they will hold;
       processor 0 gets room for all n elements. */
    szLocalArray = n/pow(2,ceil(Log2(s+1)));
    localArr = vecalloci(szLocalArray);
    bsp_push_reg(localArr,sizeof(int)*szLocalArray);

    if(s==0){
        printf("Distribution start\n"); fflush(stdout);
    }

    bsp_sync();

    int value;
    if(s==0){
        //allocate to array on proc 0
        for(i=0; i< n_divide_p; i++){
            fscanf(ifp, "%i", &value);
            localArr[i]=value;
        }
        //Send to arrays on other processors
        for(i=1; i< p; i++){
            for(j=0;j<n_divide_p;j++){
                fscanf(ifp, "%i", &value);
                /* bsp_put copies value at call time, so it can be reused */
                bsp_put(i,&value,localArr,j*sizeof(int),sizeof(int));
            }
        }
        fclose(ifp);
    }
    bsp_sync();

    if(s==0){
        printf("Distribution done\n"); fflush(stdout);
    }

    //Distribution done and we can start time measurement
    if(s==0){
        printf("Time start\n"); fflush(stdout);
    }
    time0 = bsp_time();

    //Locally sort each array
    if(s==0){
        printf("Local sort\n"); fflush(stdout);
    }
    mergeSort(n_divide_p, localArr);
    bsp_sync();

    //Merging
    int *temp = malloc(sizeof(int)*pow(2,Log2(p))*n_divide_p);
    for(j=1;j<Log2(p)+1;j++){
        /* In round j the processors below p/2^j fetch the current run,
           2^(j-1) * n/p elements, of the processor p/2^j above them. */
        if(s<p/pow(2,j)){
            for(k=0;k<pow(2,j-1)*n_divide_p;k++){
                bsp_get(s+(p/pow(2,j)),localArr,k*sizeof(int),&(temp[k]),sizeof(int));
            }
        }
        bsp_sync();

        if(s<p/pow(2,j)){
            merge2(localArr, temp, n_divide_p*pow(2,j-1));
        }
        bsp_sync();

        if(s==0){
            printf("Round %i out of %i rounds of merging done (on proc 0)\n",j,Log2(p)); fflush(stdout);
        }
    }

    if(s==0){
        printf("Sorting done\n"); fflush(stdout);
    }
    bsp_sync();

    //Print sorted array - expensive if sample is big
    /*
    if(s==0){
        printf("Sorted sequence is:\n");
        for(i=0; i<szLocalArray; i++){
            printf("%i ",localArr[i]); fflush(stdout);
        }
        printf("\n"); fflush(stdout);
    }
    */

    //Parallel algorithm ends
    time1 = bsp_time();
    if(s==0){
        printf("Time stop\n"); fflush(stdout);
    }

    //Report time to user
    if(s==0){
        printf("Sorting took %.6lf seconds.\n", time1-time0); fflush(stdout);
    }

    //Clean up
    free(temp);
    bsp_pop_reg(localArr);
    free(localArr);

    bsp_end();
} /* End bspParSort */
void parallel_part() { int i, j; srand(1452764); //Matrix initilization float **matrix = (float**)calloc(N+2, sizeof(float*)); for (i=0; i<N+2; i++) { matrix[i] = (float*)calloc(N+2, sizeof(float)); } for (i=0; i<N+2; i++) { for (j=0; j<N+2; j++) { matrix[i][j] = (float)rand()/(float)RAND_MAX; //printf("row %d, coloum %d, element: %f\n", i, j, matrix[i][j]); } } //Parallel part bsp_begin(bsp_nprocs()); int pid, x, y, done; pid=x=y=done=0; int sqroot = (int)(sqrt(bsp_nprocs())); int size = (int)(N/sqroot); //side float Ai_jm1, Aim1_j, Ai_jp1, Aip1_j; Ai_jm1 = Aim1_j = Ai_jp1 = Aip1_j = 0.0; float temp, diff, convergence, total_diff; temp = convergence = 0.0; float *diffs = (float*)calloc(bsp_nprocs(), sizeof(float)); int counter= 0; //(N/sqrt(p)) is an integer assurance if ( N%sqroot!=0) { bsp_abort("N/sqrt(p) is not an integer.\nProgram Aborted.\n"); } //Initiliaze a piece of martix in decomposition float **sub_martix = (float**)calloc(size, sizeof(float*)); for (i=0; i<size; i++) { sub_martix[i] = (float*) calloc(size, sizeof(float)); } //Initiliaze borders float *upper = (float*)calloc(size, sizeof(float)); float *lower = (float*)calloc(size, sizeof(float)); float *left = (float*)calloc(size, sizeof(float)); float *right = (float*)calloc(size, sizeof(float)); float *overlap = (float*)calloc(size, sizeof(float)); bsp_push_reg(&diff, sizeof(float)); bsp_push_reg(upper, size*sizeof(float)); bsp_push_reg(lower, size*sizeof(float)); bsp_push_reg(left, size*sizeof(float)); bsp_push_reg(right, size*sizeof(float)); //Make each matrix and border available globally for (i=0; i<size; i++) { bsp_push_reg(sub_martix[i], size*sizeof(float)); } bsp_sync(); /*Processor 0 distributes the data*/ if (bsp_pid()==0) { for (pid = 0; pid<bsp_nprocs(); pid++) { //Determine which part of the original matrix x = pid/sqroot; y = pid%sqroot; //Then the processor 0 copy the data to each processor for (i=0; i<size; i++) { for (j=0; j<size; j++) { sub_martix[i][j] = 
matrix[x*size+i+1][y*size+j+1]; } } if (pid!=0) { for (i=0; i<size; i++) { bsp_put(pid, sub_martix[i], sub_martix[i], 0, size*sizeof(float)); } } } } bsp_sync(); if (bsp_pid()==0) { for (pid=0; pid<bsp_nprocs(); pid++) { x=pid/sqroot; x=pid%sqroot; //if the part is in 1st row if (x==0) { for (i=0; i<size; i++) { upper[i] = matrix[0][y*size+1+i]; } } //if the part is in leftmost column if (y==0) { for (i=0; i<size; i++) { left[i] = matrix[x*size+1+i][0]; } } //if the part is in last row if (x==sqroot-1) { for (i=0; i<size; i++) { lower[i] = matrix[N+1][y*size+1+i]; } } //if the part is in rightmost column if (y==1) { for (i=0; i<size; i++) { right[i] = matrix[x*size+1+i][N+1]; } } if (pid!=0) { bsp_put(pid, upper, upper, 0, size*sizeof(float)); bsp_put(pid, lower, lower, 0, size*sizeof(float)); bsp_put(pid, left, left, 0, size*sizeof(float)); bsp_put(pid, right, right, 0, size*sizeof(float)); } } } bsp_sync(); /* Computation */ while (!done) { pid = bsp_pid(); diff=0.0; total_diff=0.0; x = pid/sqroot; y = pid%sqroot; //printf("Now %d th round:", ++counter); if (x<sqroot-1) { for (i=0; i<size; i++) { overlap[i] = sub_martix[size-1][i]; } bsp_put(bsp_pid()+sqroot, overlap, upper, 0, size*sizeof(float)); } if (y<sqroot-1) { for (i=0; i<size; i++) { overlap[i]=sub_martix[i][size-1]; } bsp_put(bsp_pid()+1, overlap, left, 0, size*sizeof(float)); } if (x>0) { for (i=0; i<size; i++) { overlap[i]=sub_martix[0][i]; } bsp_put(bsp_pid()-sqroot, overlap, lower, 0, size*sizeof(float)); } if (y>0) { for (i=0; i<size; i++) { overlap[i]=sub_martix[i][0]; } bsp_put(bsp_pid()-1, overlap, right, 0, size*sizeof(float)); } bsp_sync(); for (i=0; i<size; i++) { for (j=0; j<size; j++) { temp = sub_martix[i][j]; if (i-1<0) { Aim1_j=upper[j]; } else { Aim1_j=sub_martix[i-1][j]; } if (i+1>size-1) { Aip1_j=lower[j]; } else { Aip1_j=sub_martix[i+1][j]; } if (j-1<0) { if (y!=0) { Ai_jm1 = left[size-1]; } else { Ai_jm1 = left[i]; } } else { Ai_jm1 = sub_martix[i][j-1]; } if (j+1>size-1) { if 
(y!=sqroot-1) { Ai_jp1 = right[0]; } else { Ai_jp1 = right[i]; } } else { Ai_jp1 = sub_martix[i][j+1]; } sub_martix[i][j] = 0.2*(sub_martix[i][j] + Ai_jm1 + Aim1_j + Ai_jp1 + Aip1_j); //printf("data is %f\n", sub_martix[i][j]); diff += fabs(sub_martix[i][j]-temp); } } //printf("Result from pid: %d: difference= %f \n", bsp_pid(), diff); bsp_sync(); for (i=0; i<bsp_nprocs(); i++) { bsp_get(i, &diff, 0, &diffs[i], sizeof(float)); } bsp_sync(); for (i=0; i<bsp_nprocs(); i++) { total_diff += diffs[i]; } bsp_sync(); convergence = (total_diff)/(float)(N*N); //printf("Current Convergence is %f\n", convergence); if (convergence<TOL) { done = 1; } bsp_sync(); } for (i=0; i<size; i++) { bsp_pop_reg(sub_martix[i]); } bsp_pop_reg(&diff); bsp_pop_reg(lower); bsp_pop_reg(upper); bsp_pop_reg(left); bsp_pop_reg(right); bsp_sync(); for (i=0; i<size; i++) { free(sub_martix[i]); } free(sub_martix); free(diffs); free(lower); free(upper); free(left); free(right); free(overlap); bsp_sync(); bsp_end(); for (i=0; i<N+2; i++) { free(matrix[i]); } free(matrix); }
/*
 * Test and timing driver for the parallel FFT routines bspfft and
 * bspfft_init.
 *
 * Processor 0 reads the vector length n and puts it to the other
 * processors. Every processor fills its n/p local complex components
 * (stored as interleaved pairs x[2j], x[2j+1]) with (global index, 1.0).
 * The table initialisation is run NITERS times, followed by NITERS
 * transform pairs (sign 1, then sign -1). Afterwards the vector is compared
 * with the initial values, and processor 0 reports timings, the computing
 * rate and the maximum error over all processors.
 *
 * NOTE(review): the return value of scanf/scanf_s is not checked, so n is
 * used uninitialised if the input cannot be parsed.
 */
void bspfft_test() {
    void bspfft( double * x, int n, int p, int s, int sign, double * w0, double * w, double * tw, int *rho_np, int *rho_p );
    void bspfft_init( int n, int p, int s, double * w0, double * w, double * tw, int *rho_np, int *rho_p );
    int k1_init( int n, int p );
    int p, s, n, q, np, k1, j, jglob, it, *rho_np, *rho_p;
    double time0, time1, time2, ffttime, nflops, max_error, error_re, error_im, error, *Error, *x, *w0, *w, *tw;

    bsp_begin( P );
    p = bsp_nprocs();
    s = bsp_pid();

    /* Register n (to broadcast it) and Error (to gather per-processor errors) */
    bsp_push_reg( &n, SZINT );
    Error = vecallocd( p );
    bsp_push_reg( Error, p * SZDBL );
    bsp_sync();

    if ( s == 0 ) {
        printf( "Please enter length n: \n" );
#ifdef _WIN32
        scanf_s( "%d", &n );
#else
        scanf( "%d", &n );
#endif

        if ( n < 2 * p ) {
            bsp_abort( "Error in input: n < 2p" );
        }

        /* Send n to every other processor */
        for ( q = 1; q < p; q++ ) {
            bsp_put( q, &n, &n, 0, SZINT );
        }
    }

    bsp_sync();

    if ( s == 0 ) {
        printf( "FFT of vector of length %d using %d processors\n", n, p );
        printf( "performing %d forward and %d backward transforms\n", NITERS, NITERS );
    }

    /* Allocate, register, and initialize vectors */
    np = n / p;
    x = vecallocd( 2 * np );
    bsp_push_reg( x, 2 * np * SZDBL );
    k1 = k1_init( n, p );
    w0 = vecallocd( k1 );
    w = vecallocd( np );
    tw = vecallocd( 2 * np + p );
    rho_np = vecalloci( np );
    rho_p = vecalloci( p );

    /* Local component j has global index j*p+s; value = (jglob, 1.0) */
    for ( j = 0; j < np; j++ ) {
        jglob = j * p + s;
        x[2 * j] = ( double )jglob;
        x[2 * j + 1] = 1.0;
    }

    bsp_sync();
    time0 = bsp_time();

    /* Initialize the weight and bit reversal tables */
    for ( it = 0; it < NITERS; it++ ) {
        bspfft_init( n, p, s, w0, w, tw, rho_np, rho_p );
    }

    bsp_sync();
    time1 = bsp_time();

    /* Perform the FFTs */
    for ( it = 0; it < NITERS; it++ ) {
        bspfft( x, n, p, s, 1, w0, w, tw, rho_np, rho_p );
        bspfft( x, n, p, s, -1, w0, w, tw, rho_np, rho_p );
    }

    bsp_sync();
    time2 = bsp_time();

    /* Compute the accuracy */
    max_error = 0.0;

    for ( j = 0; j < np; j++ ) {
        jglob = j * p + s;
        error_re = fabs( x[2 * j] - ( double )jglob );
        error_im = fabs( x[2 * j + 1] - 1.0 );
        error = sqrt( error_re * error_re + error_im * error_im );

        if ( error > max_error ) {
            max_error = error;
        }
    }

    /* Gather the local maxima in slot s of Error on processor 0 */
    bsp_put( 0, &max_error, Error, s * SZDBL, SZDBL );
    bsp_sync();

    if ( s == 0 ) {
        max_error = 0.0;

        for ( q = 0; q < p; q++ ) {
            if ( Error[q] > max_error ) {
                max_error = Error[q];
            }
        }
    }

    /* Every processor prints its first NPRINT (at most np) components */
    for ( j = 0; j < NPRINT && j < np; j++ ) {
        jglob = j * p + s;
        printf( "proc=%d j=%d Re= %f Im= %f \n", s, jglob, x[2 * j], x[2 * j + 1] );
    }

    fflush( stdout );
    bsp_sync();

    if ( s == 0 ) {
        printf( "Time per initialization = %lf sec \n", ( time1 - time0 ) / NITERS );
        /* Each iteration performed two transforms */
        ffttime = ( time2 - time1 ) / ( 2.0 * NITERS );
        printf( "Time per FFT = %lf sec \n", ffttime );
        /* Flop count used for the rate: 5 n log2(n) + 2 n */
        nflops = 5 * n * log( ( double )n ) / log( 2.0 ) + 2 * n;
        printf( "Computing rate in FFT = %lf Mflop/s \n", nflops / ( MEGA * ffttime ) );
        printf( "Absolute error= %e \n", max_error );
        printf( "Relative error= %e \n\n", max_error / n );
    }

    bsp_pop_reg( x );
    bsp_pop_reg( Error );
    bsp_pop_reg( &n );
    bsp_sync();

    vecfreei( rho_p );
    vecfreei( rho_np );
    vecfreed( tw );
    vecfreed( w );
    vecfreed( w0 );
    vecfreed( x );
    vecfreed( Error );
    bsp_end();

} /* end bspfft_test */
/*
 * BSP benchmark: measures the computing rate r and, by timing h-relations
 * for h = 0..MAXH and fitting two straight lines with leastsquares,
 * estimates the parameters g and l reported in the final summary.
 *
 * NOTE(review): r is only assigned on processor 0 and only when
 * mintime > 0; if that never happens it is used uninitialised in t[h] and
 * in the final report.
 */
void bspbench(){
    void leastsquares(int h0, int h1, double *t, double *g, double *l);
    int p, s, s1, iter, i, n, h, destproc[MAXH], destindex[MAXH];
    double alpha, beta, x[MAXN], y[MAXN], z[MAXN], src[MAXH], *dest,
           time0, time1, time, *Time, mintime, maxtime,
           nflops, r, g0, l0, g, l, t[MAXH+1];

    /**** Determine p ****/
    bsp_begin(P);
    p= bsp_nprocs(); /* p = number of processors obtained */
    s= bsp_pid();    /* s = processor number */

    /* Time: one slot per processor, gathered on processor 0.
       dest: target area of the timed puts. */
    Time= vecallocd(p); bsp_push_reg(Time,p*SZDBL);
    dest= vecallocd(2*MAXH+p); bsp_push_reg(dest,(2*MAXH+p)*SZDBL);
    bsp_sync();

    /**** Determine r ****/
    for (n=1; n <= MAXN; n *= 2){
        /* Initialize scalars and vectors */
        alpha= 1.0/3.0;
        beta= 4.0/9.0;
        for (i=0; i<n; i++){
            z[i]= y[i]= x[i]= (double)i;
        }
        /* Measure time of 2*NITERS DAXPY operations of length n */
        time0=bsp_time();
        for (iter=0; iter<NITERS; iter++){
            for (i=0; i<n; i++)
                y[i] += alpha*x[i];
            for (i=0; i<n; i++)
                z[i] -= beta*x[i];
        }
        time1= bsp_time();
        time= time1-time0;
        bsp_put(0,&time,Time,s*SZDBL,SZDBL);
        bsp_sync();

        /* Processor 0 determines minimum, maximum, average
           computing rate */
        if (s==0){
            mintime= maxtime= Time[0];
            for(s1=1; s1<p; s1++){
                mintime= MIN(mintime,Time[s1]);
                maxtime= MAX(maxtime,Time[s1]);
            }
            if (mintime>0.0){
                /* Compute r = average computing rate in flop/s */
                nflops= 4*NITERS*n;
                r= 0.0;
                for(s1=0; s1<p; s1++)
                    r += nflops/Time[s1];
                r /= p;
                printf("n= %5d min= %7.3lf max= %7.3lf av= %7.3lf Mflop/s ",
                       n, nflops/(maxtime*MEGA),nflops/ (mintime*MEGA), r/MEGA);
                fflush(stdout);
                /* Output for fooling benchmark-detecting compilers */
                printf(" fool=%7.1lf\n",y[n-1]+z[n-1]);
            } else
                printf("minimum time is 0\n");
            fflush(stdout);
        }
    }

    /**** Determine g and l ****/
    for (h=0; h<=MAXH; h++){
        /* Initialize communication pattern */
        for (i=0; i<h; i++){
            src[i]= (double)i;
            if (p==1){
                destproc[i]=0;
                destindex[i]=i;
            } else {
                /* destination processor is one of the p-1 others */
                destproc[i]= (s+1 + i%(p-1)) %p;
                /* destination index is in my own part of dest */
                destindex[i]= s + (i/(p-1))*p;
            }
        }

        /* Measure time of NITERS h-relations */
        bsp_sync();
        time0= bsp_time();
        for (iter=0; iter<NITERS; iter++){
            for (i=0; i<h; i++)
                bsp_put(destproc[i],&src[i],dest,destindex[i]*SZDBL, SZDBL);
            bsp_sync();
        }
        time1= bsp_time();
        time= time1-time0;

        /* Compute time of one h-relation, converted to flops via r */
        if (s==0){
            t[h]= (time*r)/NITERS;
            printf("Time of %5d-relation= %lf sec= %8.0lf flops\n", h, time/NITERS, t[h]);
            fflush(stdout);
        }
    }

    if (s==0){
        printf("size of double = %d bytes\n",(int)SZDBL);
        /* Separate fits for the ranges h = 0..p and h = p..MAXH */
        leastsquares(0,p,t,&g0,&l0);
        printf("Range h=0 to p : g= %.1lf, l= %.1lf\n",g0,l0);
        leastsquares(p,MAXH,t,&g,&l);
        printf("Range h=p to HMAX: g= %.1lf, l= %.1lf\n",g,l);

        printf("The bottom line for this BSP computer is:\n");
        printf("p= %d, r= %.3lf Mflop/s, g= %.1lf, l= %.1lf\n", p,r/MEGA,g,l);
        fflush(stdout);
    }
    bsp_pop_reg(dest); vecfreed(dest);
    bsp_pop_reg(Time); vecfreed(Time);
    bsp_end();

} /* end bspbench */
void bspbench(){ void leastsquares(int h0, int h1, double *t, double *g, double *l); int p, s, s1, iter, i, n, h, destproc[MAXH], destindex[MAXH]; double alpha, beta, x[MAXN], y[MAXN], z[MAXN], src[MAXH], *dest, time0, time1, time, *Time, mintime, maxtime, nflops, r, g0, l0, g, l, t[MAXH+1]; size_t pin[100]; // Determine p // start: new code for pinning for (i=0; i< tnode->length; i++) pin[i] = tnode->sons[i]->index; mcbsp_set_pinning( pin, tnode->length ); bsp_begin(tnode->length); // end: new code for pinning p= bsp_nprocs(); // p = number of processors obtained s= bsp_pid(); // s = processor number Time= vecallocd(p); bsp_push_reg(Time,p*SZDBL); dest= vecallocd(2*(MAXH+p)); bsp_push_reg(dest,(2*(MAXH+p))*SZDBL); bsp_sync(); // Determine r for (n=1; n < MAXN; n *= 2){ // Initialize scalars and vectors alpha= 1.0/3.0; beta= 4.0/9.0; for (i=0; i<n; i++){ z[i]= y[i]= x[i]= (double)i; } // Measure time of 2*NITERS DAXPY operations of length n time0=bsp_time(); for (iter=0; iter<NITERS; iter++){ for (i=0; i<n; i++) y[i] += alpha*x[i]; for (i=0; i<n; i++) z[i] -= beta*x[i]; } time1= bsp_time(); time= time1-time0; bsp_put(0,&time,Time,s*SZDBL,SZDBL); bsp_sync(); // Processor 0 determines minimum, maximum, average computing rate if (s==0){ mintime= maxtime= Time[0]; for(s1=1; s1<p; s1++){ mintime= MIN(mintime,Time[s1]); maxtime= MAX(maxtime,Time[s1]); } if (mintime>0.0){ // Compute r = average computing rate in flop/s nflops= 4*NITERS*n; r= 0.0; for(s1=0; s1<p; s1++) r += nflops/Time[s1]; r /= p; //printf("n= %5d min= %7.3lf max= %7.3lf av= %7.3lf Mflop/s ", // n, nflops/(maxtime*MEGA),nflops/(mintime*MEGA), r/MEGA); //fflush(stdout); // Output for fooling benchmark-detecting compilers printf( "", y[n-1]+z[n-1] ); } } } // Determine g and l for (h=0; h<=MAXH; h++){ // Initialize communication pattern for (i=0; i<h; i++){ src[i]= (double)i; if (p==1){ destproc[i]=0; destindex[i]=i; } else { // destination processor is one of the p-1 others destproc[i]= (s+1 + i%(p-1)) %p; 
// destination index is in my own part of dest destindex[i]= s + (i/(p-1))*p; } } for (i=0; i<h; i++){ src[i]= (double)i; if (p==1){ destproc[i]=0; destindex[i]=i; } else { // destination processor is one of the p-1 others destproc[i]= (s+1 + i%(p-1)) %p; // destination index is in my own part of dest destindex[i]= s + (i/(p-1))*p; } } // Measure time of NITERS h-relations bsp_sync(); time0= bsp_time(); for (iter=0; iter<NITERS; iter++){ for (i=0; i<h; i++) { //bsp_get(0, dest, destindex[i]*SZDBL, &src[i] , SZDBL); //bsp_get(destproc[i], dest, destindex[i]*SZDBL, &src[i] , SZDBL); bsp_put(destproc[i], &src[i] , dest , destindex[i]*SZDBL, SZDBL); } //if (s == 0) // bsp_get(0, dest, destindex[i]*SZDBL, &src[i] , SZDBL); bsp_sync(); } time1= bsp_time(); time= time1-time0; // Compute time of one h-relation if (s==0){ t[h]= (time*r)/NITERS; //#define SEHLOC_BENCH_VERBOSE #ifdef SEHLOC_BENCH_VERBOSE char strnodes[256]; sprintf(strnodes, ""); for (i=0; i<tnode->length; i++) { sprintf(strnodes, "%s %d", strnodes, tnode->sons[i]->index); } printf("SEH# Level%d %5d %lf %8.0lf\n", tnode->level, h, time/NITERS, t[h]); fflush(stdout); #endif } } if (s==0){ leastsquares(0,p,t,&g0,&l0); printf("Range h=0 to p : g= %.1lf, l= %.1lf\n",g0,l0); leastsquares(p,MAXH,t,&g,&l); g=(g>0)? g: g0*2; printf("Range h=p to HMAX: g= %.1lf, l= %.1lf\n",g,l); //printf("plot# %d %.1lf %.1lf\n",tnode->level, g,l); printf("The bottom line for this MultiBSP component is:\n"); printf("<p= %d, r= %.3lf Mflop/s, g= %.1lf, l= %.1lf>\n", p,r/MEGA,g,l); fflush(stdout); } bsp_pop_reg(dest); vecfreed(dest); bsp_pop_reg(Time); vecfreed(Time); bsp_end(); } /* end bspbench */
void spmd( void ) { //parallel over three processes bsp_begin( 3 ); //test bsp_push_reg (results in next superstep) size_t localInt; bsp_push_reg( &localInt, sizeof( size_t ) ); checkLocalIntAddress[ bsp_pid() ] = &localInt; //check pid/nprocs, both using primitives as well as manually checkPcount[ bsp_pid() ] = (size_t)(bsp_nprocs()); pthread_mutex_lock( &test_mutex ); check++; checkP[ bsp_pid() ] = true; pthread_mutex_unlock( &test_mutex ); //nobody should be at superstep 0 if( superstep == 1 ) superstepOK = false; //test barrier synchronisation bsp_sync(); //note someone is at superstep 1 superstep = 1; //check bsp_time if( bsp_time() <= 0 ) bsp_abort( "FAILURE \t bsp_time returned 0 or less!\n" ); //set up a pop_reg, but should only take effect after the next sync //(testing the push_reg after this statement thus provides a free test) bsp_pop_reg( &localInt ); struct mcbsp_thread_data * const data = pthread_getspecific( mcbsp_internal_thread_data ); if( data->localsToRemove.top != 1 || data->localsToRemove.cap != 16 || *((void**)(data->localsToRemove.array)) != (void*)&localInt ) { fprintf( stderr, "FAILURE \t bsp_pop_reg did not push entry on the to-remove stack (%p != %p)!\n", *((void**)(data->localsToRemove.array)), (void*)&localInt ); mcbsp_util_fatal(); } //check push_reg for( unsigned char i=0; i<3; ++i ) { if( checkLocalIntAddress[ i ] != mcbsp_util_address_table_get( &(data->init->global2local), 0, i )->address ) { fprintf( stderr, "FAILURE \t bsp_push_reg did not register correct address!\n" ); mcbsp_util_fatal(); } } bsp_sync(); //check pop_reg for( unsigned char i=0; i<3; ++i ) { if( mcbsp_util_address_table_get( &(data->init->global2local), 0, i ) != NULL || data->localC != 0 ) { fprintf( stderr, "FAILURE \t bsp_pop_reg did not de-register correctly (entry=%p)!\n", mcbsp_util_address_table_get( &(data->init->global2local), 0, i )->address ); mcbsp_util_fatal(); } //localInt = *(size_t*)mcbsp_util_stack_pop( &(data->removedGlobals) ); } bsp_sync(); 
//going to test communication primitives on the following area size_t commTest[ 3 ]; commTest[ 0 ] = commTest[ 1 ] = ((size_t)bsp_pid()); commTest[ 2 ] = (size_t)(bsp_nprocs()); bsp_push_reg( &commTest, 3 * sizeof( size_t ) ); //make push valid bsp_sync(); //after this put, commTest[ 0 ] should equal bsp_pid, commTest[ 1, 2 ] should equal bsp_pid-1 mod bsp_nprocs bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &commTest, &commTest, sizeof( size_t ), 2*sizeof( size_t) ); commTest[ 2 ] = ULONG_MAX; //this should not influence the result after sync. //test behind-the-scenes const struct mcbsp_util_stack queue = data->queues[ (bsp_pid() + 1) % bsp_nprocs() ]; size_t predicted_cap = predictCap( sizeof( struct mcbsp_message ) + 2 * sizeof( size_t) ); if( queue.cap != predicted_cap || queue.top != sizeof( struct mcbsp_message ) + 2 * sizeof( size_t) || queue.size != sizeof( struct mcbsp_message ) ) { fprintf( stderr, "FAILURE \t bsp_put did not adapt the communication queue as expected!\n(cap = %ld, top = %ld, size = %ld)\n", (size_t)queue.cap, (size_t)queue.top, (size_t)queue.size ); mcbsp_util_fatal(); } const struct mcbsp_message request = *((struct mcbsp_message*) ((char*)queue.array + queue.top - sizeof( struct mcbsp_message )) ); if( request.length != 2 * sizeof( size_t) ) { fprintf( stderr, "FAILURE \t bsp_put did not push a request of the expected length!\n(length = %ld)\n", (size_t)request.length ); mcbsp_util_fatal(); } const size_t * const chk_array = (size_t*) ((char*)queue.array + queue.top - sizeof( struct mcbsp_message ) - 2 * sizeof( size_t )); if( chk_array[ 0 ] != ((size_t)bsp_pid()) || chk_array[ 1 ] != ((size_t)bsp_pid()) ) { fprintf( stderr, "FAILURE \t bsp_put did not push an expected communication request!\n" ); mcbsp_util_fatal(); } //note there is no easy way to check request.destination; the top-level BSP test will handle that one bsp_sync(); //test for the above expectation after bsp_put, namely //commTest[ 0 ] should equal bsp_pid, commTest[ 1, 2 ] 
should equal bsp_pid-1 mod bsp_nprocs if( commTest[ 0 ] != ((size_t)bsp_pid()) || commTest[ 1 ] != (size_t)((bsp_pid()+bsp_nprocs()-1)%bsp_nprocs()) || commTest[ 2 ] != (size_t)((bsp_pid()+bsp_nprocs()-1)%bsp_nprocs()) ) { fprintf( stderr, "FAILURE \t array after bsp_put is not as expected! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] ); mcbsp_util_fatal(); } //do a get on the next processor on the last element of commTest bsp_get( (bsp_pid() + 1) % bsp_nprocs(), &commTest, 2 * sizeof( size_t ), &(commTest[ 2 ]), sizeof( size_t ) ); //fill the expected value after the get to test non-buffering commTest[ 2 ] = ((size_t)bsp_pid()); //communicate bsp_sync(); //commTest[ 0 ] should equal bsp_pid, commTest[ 1 ] should equal bsp_pid-1, commTest[ 2 ] should be bsp_pid+1 if( commTest[ 0 ] != ((size_t)bsp_pid()) || commTest[ 1 ] != (size_t)((bsp_pid()+bsp_nprocs() - 1)%bsp_nprocs()) ) { fprintf( stderr, "FAILURE \t start of array after bsp_get changed! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] ); mcbsp_util_fatal(); } if( commTest[ 2 ] != (size_t)((bsp_pid()+bsp_nprocs() + 1)%bsp_nprocs()) ) { fprintf( stderr, "FAILURE \t last element of array after bsp_get erroneous! 
(%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] ); mcbsp_util_fatal(); } bsp_sync(); //test direct_get functionality size_t commTest2[ 3 ]; commTest2[ 0 ] = commTest[ 0 ]; //get commTest[1] from right neighbour bsp_direct_get( (bsp_pid() + 1) % bsp_nprocs(), &commTest, sizeof( size_t ), &(commTest2[ 1 ]), sizeof( size_t ) ); //get commTest[2] from left neighbour bsp_direct_get( (bsp_pid() + bsp_nprocs() - 1) % bsp_nprocs(), &commTest, 2 * sizeof( size_t ), &(commTest2[ 2 ]), sizeof( size_t ) ); //now everything should equal bsp_pid if( commTest2[ 0 ] != ((size_t)bsp_pid()) || commTest2[ 1 ] != ((size_t)bsp_pid()) || commTest2[ 2 ] != ((size_t)bsp_pid()) ) { fprintf( stderr, "FAILURE \t direct_get does not function properly! (%d: [%ld %ld %ld])\n", bsp_pid(), commTest2[ 0 ], commTest2[ 1 ], commTest2[ 2 ] ); mcbsp_util_fatal(); } //now test single BSMP message bsp_send( (bsp_pid() + 1) % bsp_nprocs(), NULL, &commTest, sizeof( size_t ) ); //check messages const struct mcbsp_util_stack queue1 = data->queues[ (bsp_pid() + 1) % bsp_nprocs() ]; const size_t new_predicted_cap = predictCap( sizeof( struct mcbsp_message ) + sizeof( size_t ) ); predicted_cap = predicted_cap > new_predicted_cap ? 
predicted_cap : new_predicted_cap; if( queue1.cap != predicted_cap || queue1.size != sizeof( struct mcbsp_message ) || queue1.top != sizeof( struct mcbsp_message ) + sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t bsp_send did not adapt the communication queue as expected!\n(cap = %ld, size = %ld, top = %ld; prediction was %ld, %ld, %ld)\n", (size_t)queue1.cap, (size_t)queue1.size, (size_t)queue1.top, (size_t)predicted_cap, (size_t)(sizeof( struct mcbsp_message )), (size_t)(sizeof( struct mcbsp_message ) + sizeof( size_t )) ); mcbsp_util_fatal(); } const struct mcbsp_message request2 = *(struct mcbsp_message*) ((char*)queue1.array + queue1.top - sizeof( struct mcbsp_message )); if( request2.destination != NULL || request2.length != sizeof( size_t ) || // assumes tagSize = 0 *(size_t *)queue1.array != ((size_t)bsp_pid()) ) { fprintf( stderr, "FAILURE \t bsp_send did not push the expected communication request!\n(top = %ld, destination = %p, length = %ld, payload = %ld\n", (size_t)queue1.top, request2.destination, (size_t)request2.length, *(size_t *)queue1.array ); mcbsp_util_fatal(); } bsp_sync(); //inspect incoming BSMP queue (assuming tagSize = 0) predicted_cap = predictCap( sizeof( size_t ) + sizeof( size_t ) ); if( data->bsmp.cap != predicted_cap || data->bsmp.top != sizeof( size_t ) + sizeof( size_t ) || data->bsmp.size != sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t BSMP queue after superstep with sends is not as expected!\n(cap = %ld, top = %ld, size = %ld; prediction was %ld, %ld, %ld)\n", (size_t)data->bsmp.cap, (size_t)data->bsmp.top, (size_t)data->bsmp.size, (size_t)predicted_cap, (size_t)(8 + sizeof( size_t )), (size_t)(data->bsmp.size) ); mcbsp_util_fatal(); } if( *(size_t*)(data->bsmp.array) != (size_t)((bsp_pid() + bsp_nprocs() - 1) % bsp_nprocs()) ) { fprintf( stderr, "FAILURE \t Value in BSMP queue is not correct!\n" ); mcbsp_util_fatal(); } //inspect using primitives MCBSP_NUMMSG_TYPE packets; MCBSP_BYTESIZE_TYPE packetSize; bsp_qsize( 
&packets, &packetSize ); if( packets != 1 || packetSize != sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t bsp_qsize does not function correctly!\n" ); mcbsp_util_fatal(); } bsp_move( &commTest, sizeof( size_t ) ); if( commTest[ 0 ] != (size_t)(( bsp_pid() + bsp_nprocs() - 1 ) % bsp_nprocs()) ) { fprintf( stderr, "FAILURE \t bsp_move does not function correctly!\n" ); mcbsp_util_fatal(); } //check set_tagsize MCBSP_BYTESIZE_TYPE tsz = sizeof( size_t ); bsp_set_tagsize( &tsz ); if( tsz != 0 ) { fprintf( stderr, "FAILURE \t return value of bsp_set_tagsize is incorrect!\n" ); mcbsp_util_fatal(); } bsp_sync(); //check set_tagsize if( data->init->tagSize != sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t bsp_set_tagsize failed!\n" ); mcbsp_util_fatal(); } commTest[ 0 ] = ((size_t)bsp_pid()); commTest[ 1 ] = 3; commTest[ 2 ] = 8 + ((size_t)bsp_pid()); for( unsigned char i = 0; i < bsp_nprocs(); ++i ) { bsp_send( i, commTest, &(commTest[1]), 2 * sizeof( size_t ) ); char * const test = (char*)(data->queues[ (size_t)i ].array) + data->queues[ (size_t)i ].top - sizeof( struct mcbsp_message ) - sizeof( size_t ); if( *(size_t*)test != *commTest ) { fprintf( stderr, "FAILURE \t BSMP tag did not get pushed correctly (reads %ld instead of %ld)!\n", *(size_t*)test, *commTest ); mcbsp_util_fatal(); } } bsp_sync(); MCBSP_BYTESIZE_TYPE status; size_t tag; for( unsigned char i = 0; i < bsp_nprocs(); ++i ) { bsp_get_tag( &status, &tag ); if( tag >= ((size_t)bsp_nprocs()) || status != 2 * sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t error in BSMP tag handling! (tag=%ld, status=%ld)\n", tag, (size_t)status ); mcbsp_util_fatal(); } size_t *p_tag, *msg; if( bsp_hpmove( (void**)&p_tag, (void**)&msg ) != 2 * sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t bsp_hpmove does not return correct payload length." 
); } if( msg[ 0 ] != 3 || *p_tag != tag ) { fprintf( stderr, "FAILURE \t bsp_hpmove does not contain correct message (tag=%ld, payload = %ld) which should be (%ld, 3).\n", *p_tag, msg[ 0 ], tag ); mcbsp_util_fatal(); } commTest[ tag ] = msg[ 1 ]; } for( unsigned short int i = 0; i < bsp_nprocs(); ++i ) { if( commTest[ i ] != (unsigned int)(8 + i) ) { fprintf( stderr, "FAILURE \t error in bsp_tag / bsp_(hp)move combination!\n" ); mcbsp_util_fatal(); } } bsp_sync(); #ifdef MCBSP_ALLOW_MULTIPLE_REGS //test multiple regs double mreg[17]; bsp_push_reg( &(mreg[0]), 7*sizeof( double ) ); bsp_sync(); double mregs = 1.3; bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 6 * sizeof( double ), sizeof( double ) ); bsp_push_reg( &(mreg[0]), 17*sizeof( double ) ); bsp_sync(); bsp_push_reg( &(mreg[0]), 13*sizeof( double ) ); bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 16 * sizeof( double ), sizeof( double ) ); bsp_sync(); if( mreg[ 6 ] != mreg[ 16 ] || mreg[ 6 ] != mregs ) { fprintf( stderr, "FAILURE \t error in bsp_put + multiple bsp_push_reg calls (%f,%f,%f,...,%f,%f)\n", mreg[ 5 ], mreg[ 6 ], mreg[ 7 ], mreg[ 15 ], mreg[ 16 ] ); mcbsp_util_fatal(); } bsp_pop_reg( &(mreg[0]) ); bsp_pop_reg( &(mreg[0]) ); bsp_sync(); bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 2 * sizeof( double ), sizeof( double ) ); bsp_sync(); if( mreg[ 2 ] != mregs ) { fprintf( stderr, "FAILURE \t error in bsp_put + multiple bsp_push_reg + multiple bsp_pop_reg calls\n" ); mcbsp_util_fatal(); } #endif bsp_end(); }
/** * \brief Main function. */ int main(int argc, char **argv) { char *s,*t; int size,sizes,sizet; int i,j,k,P; int cond; int *simi,res,Paux; int *a,*b; FILE *f,*f2; fpos_t filepos; int my_rank,set; struct timeval ini, fi; struct timezone tz; bsp_begin(atoi(argv[1])); size = atoi(argv[1]); f=fopen(argv[2],"r"); if (f==NULL) Exit("Error: File %s not found\n",argv[2]); fscanf(f,"%d",&sizes); if (sizes%size != 0) Exit("Error: The sequences have to have multiple of " "processes quantity size"); f2=fopen(argv[3],"r"); if (f2==NULL) Exit("Error: File %s not found\n",argv[3]); fscanf(f2,"%d",&sizet); if (bsp_pid() == 0) if (sizet%size != 0) Exit("Error: The sequences have to have multiple of " "processes quantity size"); P = atoi(argv[4]); if (bsp_pid() == 0) printf("align %d %s %s %d\n",size,argv[2],argv[3],P); sizes /= size; sizet /= size; s = (char*) malloc (sizes*sizeof(char)); t = (char*) malloc (sizet*sizeof(char)); if (s == NULL || t == NULL) Exit("No memory\n"); a = (int*)malloc ((sizet+1)*sizeof(int)); b = (int*)malloc ((sizes+1)*sizeof(int)); if (a == NULL || b == NULL) Exit("No memory\n"); if (bsp_pid() == size-1) { simi = (int*) malloc(P*sizeof(int)); if (simi == NULL) Exit("No memory\n"); } Paux = 0; bsp_push_reg(s,sizes*sizeof(char)); bsp_push_reg(b,(sizes+1)*sizeof(int)); bsp_push_reg(&filepos,sizeof(long int)); bsp_push_reg(&i,sizeof(int)); bsp_sync(); gettimeofday(&ini,&tz); for (k = 0; k < P*size + size -1; k++) { if (k >= bsp_pid() && k <= P*size+bsp_pid()-1) cond = 1; else cond = 0; set = 0; if (cond==1 && (k-bsp_pid())%size == 0)/*start of a reading*/ { if (bsp_pid() == 0 && k < size); else if (bsp_pid() == 0) { bsp_get(size-1,&filepos,0,&filepos,sizeof(long int)); } else { bsp_get(bsp_pid()-1,&filepos,0,&filepos,sizeof(long int)); } set = 1; } bsp_sync(); if (cond==1 && (k-bsp_pid())%size == 0)/*start of a reading*/ { if (set == 1) fsetpos(f2,&filepos); for (i = 0; i < sizet; i++) { fscanf(f2,"%c",&t[i]); if (t[i] == 'A' ||t[i] == 'T' ||t[i] == 'C' 
||t[i] == 'G'); else { if (t[i] == EOF) Exit("Error: End of file reached without" "read all sequence in %s\n",argv[3]); i--; } } fgetpos(f2,&filepos); for (i = 0; i <= sizet; i++) a[i] = (i+bsp_pid()*sizet)*gap; } if (cond==1) { if (bsp_pid() == 0) { for (i = 0; i < sizes; i++) { fscanf(f,"%c",&s[i]); if (s[i] == 'A' ||s[i] == 'T' ||s[i] == 'C' ||s[i] == 'G'); else { if (s[i] == EOF) Exit("Error: End of file reached without" "read all sequence in %s\n",argv[2]); i--; } } for (j = 0; j <= sizes; j++) b[j] = (j + (k%size)*sizes)*gap; } res = Similarity (s, sizes, t, sizet, a, b); if (bsp_pid() == size-1 && (k-bsp_pid()+1)%size == 0) { simi[Paux++] = res; } } if (cond) { if (bsp_pid() != size -1) { bsp_put(bsp_pid()+1,s,s,0,sizes*sizeof(char)); bsp_put(bsp_pid()+1,b,b,0,(sizes+1)*sizeof(int)); } } bsp_sync(); } gettimeofday(&fi,&tz); printf("process %d ended\n",bsp_pid()); fclose(f); fclose(f2); if (bsp_pid() == size-1) { printf("Similarities: "); for (i = 0; i < P; i++) printf("%d ",simi[i]); printf("\n"); } if (bsp_pid() == 0) { printf("Computation time: %f\n", (fi.tv_sec - ini.tv_sec + (double)(fi.tv_usec - ini.tv_usec)/1000000)/60); } bsp_pop_reg(&filepos); bsp_pop_reg(b); bsp_pop_reg(s); bsp_sync(); return 0; }
void bspsieve(){ double time0, time1; ulong *x; // local list of candidates ulong *ks; //place for proc0 to store intermediate ks ulong n, nl, i, iglob; int s, p; ulong k; // the current largest sure-prime n = N+1; // copy global N and increase by 1. (only proc 1 knows this) // this is so the maximum array idx == N bsp_begin(P); p= bsp_nprocs(); /* p = number of processors obtained */ printf("Now we have %d processors.\n", p); s= bsp_pid(); /* s = processor number */ if (s==0){ if(n<0) bsp_abort("Error in input: n is negative"); ks = vecalloculi(p); } bsp_push_reg(&n,SZULL); bsp_sync(); bsp_get(0,&n,0,&n,SZULL); //everyone reads N from proc 0 bsp_sync(); bsp_pop_reg(&n); nl= blockSize(p,s,n); // how big must s block be? printf("P(%d) tries to alloc vec of %lld ulongs", s, nl); printf(", size would be = %lld Mb\n", nl*SZULL/1024/1024); x= vecalloculi(nl); for (i=0; i<nl; i++){ // start by assuming everything is prime, except 1 iglob= globalIdx(p,s,n,i); x[i]= iglob; } if(s==0) x[1]=0; bsp_sync(); time0=bsp_time(); k = 2; // begin work while( k*k <= n ) { bspmarkmultiples(p,s,n,k,x); k = nextPrime(p,s,n,k,x); bsp_push_reg(&k, SZULL); bsp_sync(); if(s==0) { ks[0] = k; // my k for(i=1;i<p; i++) { bsp_get(i, &k, 0, &ks[i], SZULL); } } bsp_sync(); if(s==0) { k = findMinimum(p,ks); } bsp_sync(); //broadcast minimum bsp_get(0,&k,0,&k,SZULL); bsp_sync(); bsp_pop_reg(&k); } // end work bsp_sync(); time1=bsp_time(); ulong primes= 0; //printf("Processor %lld primes: \n", s); for(i = 0; i < blockSize(p,s,n); i++) if( x[i] != 0) primes++; //do not print primes, just count them. printf("proc %d finds %lld primes.\n", s, primes); fflush(stdout); if (s==0){ printf("This took only %.6lf seconds.\n", time1-time0); fflush(stdout); vecfreeuli(ks); } vecfreeuli(x); bsp_end(); } /* end bspsieve */
/*
 * Put/get test on registered variables.
 *
 * Three variables (a, b and the 16-element array c) are registered one per
 * superstep. Every core s then writes its own id with bsp_hpput into a on
 * core (s+1)%p, into b on core (s+2)%p and into slot s of c on every core,
 * and the results are reported with EBSP_MSG_ORDERED. The second half reads
 * a, b and c[4] back from other cores with bsp_hpget.
 *
 * NOTE(review): the "// test:" and "// expect_for_pid:" comments look like
 * input for the test harness, so they are kept exactly as they were.
 */
int main() {
    bsp_begin();

    int s = bsp_pid();
    int p = bsp_nprocs();

    /* one registration per superstep */
    int a = 0;
    bsp_push_reg(&a, sizeof(int));
    bsp_sync();

    int b = 0;
    bsp_push_reg(&b, sizeof(int));
    bsp_sync();

    int c[16] = {0};
    bsp_push_reg(&c, 16 * sizeof(int));
    bsp_sync();

    // first we test puts
    int data = s;
    bsp_hpput((s + 1) % p, &data, &a, 0, sizeof(int));
    bsp_hpput((s + 2) % p, &data, &b, 0, sizeof(int));
    /* every core writes its id into slot s of c on all cores */
    for (int t = 0; t < p; ++t) {
        bsp_hpput(t, &data, &c, sizeof(int) * s, sizeof(int));
    }
    ebsp_barrier();

    // test: can set and get tagsize from core
    EBSP_MSG_ORDERED("%i", a);
    // expect_for_pid: ((pid - 1) % 16)

    // test: can put register multiple vars, and put multiple times
    EBSP_MSG_ORDERED("%i", b);
    // expect_for_pid: ((pid - 2) % 16)

    // test: support for larger variables
    EBSP_MSG_ORDERED("%i", c[5]);
    // expect_for_pid: ("5")

    // next we test gets
    int core_num_next = 0;
    int core_num_next_next = 0;
    /* a on core s+1 and b on core s+2 were both written by this core above */
    bsp_hpget((s + 1) % p, &a, 0, &core_num_next, sizeof(int));
    bsp_hpget((s + 2) % p, &b, 0, &core_num_next_next, sizeof(int));
    /* element 4 of c on core s+3 */
    bsp_hpget((s + 3) % p, &c, 4 * sizeof(int), &data, sizeof(int));
    ebsp_barrier();

    // test: can set and get tagsize from core
    EBSP_MSG_ORDERED("%i", core_num_next);
    // expect_for_pid: (pid)

    // test: can put register multiple vars, and put multiple times
    EBSP_MSG_ORDERED("%i", core_num_next_next);
    // expect_for_pid: (pid)

    // test: support for larger variables
    EBSP_MSG_ORDERED("%i", data);
    // expect_for_pid: ("4")

    bsp_end();
    return 0;
}
void mainloop(){ //int init[N*N] = {0,3,8,1000,-4, 1000,0,1000,1,7,1000,4,0,1000,1000, //2,1000,-5,0,1000,1000,1000,1000,6,0}; int nlr,nlc,s,t,i,j,k,l,li,lsize,tsize0, tsize1,tempp,tempoff,rpos,cpos, *lpart,*linter,*gindx,*lcol,*lrow,*lsrow, *lscol, *ltrow, *ltcol, *temp; int* init = gen_graph(N, 0.05); bsp_begin(bsp_nprocs()); /**********Initialization SuperStep 0***************/ //Compute global row and column indeces for each element int pm = sqrt(bsp_nprocs()); int pn = (bsp_nprocs())/pm; /* Compute 2D processor numbering from 1D numbering with failsafe if the number of processors are not enough, back to simple 1D cyclic distribution */ if ( pn != pm ){ pn = bsp_nprocs(); pm = 1; t = bsp_pid(); s = 0; }else{ s= bsp_pid()%pm; /* 0 <= s < pm */ t= bsp_pid()/pn; /* 0 <= t < pn */ } nlr= nloc(pm,s,N); /* number of local rows */ nlc= nloc(pn,t,N); /* number of local columns */ lsize = nlr*nlc; //interpret 2D size to array size lpart = vecalloci(lsize); //Initialize local part of processor s linter = vecalloci(lsize); //Intermidiate array used for the matrix "multiplication" gindx = vecalloci(lsize); //Array to store the global indeces of the local elements lcol = vecalloci(lsize); //Array to store the glocal column index lrow = vecalloci(lsize); //Array to store the glocal row index bsp_push_reg(lpart,lsize*SZINT); //Distribute the Data li=0; for ( i= 0; i < N; i++){ for ( j= 0; j < N; j++){ if ((j % pn) == t){ lpart[li] = init[N*i+j]; lrow[li] = i; lcol[li] = j; gindx[li] = N*i+j; li++; } } } /*for ( i= 0; i < N*N; i++) { if(bsp_pid() == (i % bsp_nprocs())){ lpart[li] = init[i]; lrow[li] = i/N; lcol[li] = i % N; gindx[li] = i; li++; } }*/ vecfreei(init);//out of the shared space tsize0 = tsize1 =lsize; temp = lrow; //find unique global rows for processor s for(i=0;i<tsize0;i++){ for(j=0;j<tsize0;j++){ if(i==j){ continue; } else if(*(temp+i)==*(temp+j)){ k=j; tsize0--; while(k < tsize0){ *(temp+k)=*(temp+k+1); k++; } j=0; } } } temp = lcol; //find unique global 
column for processor s for(i=0;i<tsize1;i++){ for(j=0;j<tsize1;j++){ if(i==j){ continue; } else if(*(temp+i)==*(temp+j)){ k=j; tsize1--; while(k < tsize1){ *(temp+k)=*(temp+k+1); k++; } j=0; } } } //keep unique global rows and columns in arrays //initialize arrays to hold the elements of those rows and columns(ltcol, ltrow) lscol = vecalloci(tsize1); lsrow = vecalloci(tsize0); ltcol = vecalloci(N*tsize1); ltrow = vecalloci(N*tsize0); for(i=0;i < tsize0;i++){ lsrow[i] = lrow[i]; } for(i=0;i < tsize1;i++){ lscol[i] = lcol[i]; } vecfreei(lcol);//not needed from this point on vecfreei(lrow);//we use lscol, lsrow, ltrow, ltcol //sort arrays qsort (lsrow, tsize0, sizeof(int), compare_int); qsort (lscol, tsize1, sizeof(int), compare_int); bsp_sync(); /**********End Initialization SuperStep 0***************/ double time0= bsp_time(); /*********Repeated Squaring loop start*************/ j=1; while ((N-1) > j) { /*************Comm. SuperStep j0*************/ for(i=0;i < tsize1;i++){ for(k=0; k<N;k++){ tempp=((N*k+lscol[i]) % bsp_nprocs()); tempoff = ((double)(N*k+lscol[i])/(double)bsp_nprocs()); bsp_get(tempp, &lpart[0],tempoff*SZINT, <col[N*i+k],SZINT); } } for(i=0;i < tsize0;i++){ for(k=0; k<N;k++){ tempp=((N*lsrow[i]+k) % bsp_nprocs()); tempoff = ((double)(N*lsrow[i]+k)/(double)bsp_nprocs()); bsp_get(tempp, &lpart[0],tempoff*SZINT, <row[N*i+k],SZINT); } } bsp_sync(); /*************End Comm. SuperStep j0*************/ /*************Comp. 
SuperStep j1*************/ for ( i=0; i<lsize; i++) { int gcol = gindx[i] % N; //get global col indx of current element int grow = gindx[i]/N; //get global row indx of current element linter[i]=1000;//initiliaze array //find appropriate indx of the global rows and columns to perform "multiplication" /*for ( l=0; l < tsize0;l++){ if(grow == lsrow[l]){ rpos =l; break; } }*/ int *rp = bsearch (&grow, lsrow, tsize0, sizeof (lsrow),compare_int); rpos = rp - lsrow; int *cp = bsearch (&gcol, lscol, tsize1, sizeof (lscol),compare_int); cpos = cp - lscol; /*for ( l=0; l < tsize1;l++){ if(gcol == lscol[l]){ cpos =l; break; } }*/ //this is where the update is done for(k=0;k<N;k++){ linter[i] = fmin(linter[i], ltrow[N*rpos + k]+ltcol[N*cpos + k]); } } memcpy(lpart,linter,lsize*SZINT); j = 2*j; bsp_sync(); /*************End Comp. SuperStep j1*************/ } /*********Repeated Squaring loop end*************/ double time1= bsp_time(); bsp_sync(); /*********display matrices and time*********/ if(bsp_pid()==0){ printf( " \n Block Cyclic Distr calculation of APSP took: %f seconds \n", time1-time0 ); } /*printf("\n The array is, proc %d \n ", bsp_pid()); for(i=0;i < lsize;i++){ printf(" %d",lpart[i]); }*/ printf("\n "); //clean up bsp_pop_reg(lpart); vecfreei(lpart); vecfreei(linter); vecfreei(lscol); vecfreei(lsrow); vecfreei(ltcol); vecfreei(ltrow); vecfreei(gindx); bsp_end(); }
void mainloop(){ //int init[N*N] = {0,3,8,1000,-4, 1000,0,1000,1,7,1000,4,0,1000,1000, //2,1000,-5,0,1000,1000,1000,1000,6,0}; int i,j,k,l,v,t,lsize,*lsize_m,*lrow,*lcol, *linit, *linter,*startrow_m; int li,lj,lk,startrow, endrow,g; int* init = gen_graph(N, 0.05); bsp_begin(bsp_nprocs()); /**********Initialization***************/ /*******Comp. Superstep 0******/ lsize = nloc(bsp_nprocs(),bsp_pid(), N); //Get the number of rows of processor s lrow = vecalloci(lsize*N); //The main storing array of processor s lcol = vecalloci(N); //array to hold the column for the matrix squaring startrow_m = vecalloci(bsp_nprocs()); //array to hold all processors starting global row lsize_m = vecalloci(bsp_nprocs()); //array to hold the number of rows of all processors linter = vecalloci(lsize*N); //Intermidiate array used for the matrix "multiplication" bsp_push_reg(startrow_m,bsp_nprocs()*SZINT); bsp_push_reg(lsize_m,bsp_nprocs()*SZINT); bsp_push_reg(lrow,lsize*N*SZINT); /****Get the first and last global row of processor s***/ if(bsp_pid() == (bsp_nprocs() - 1)){ startrow = (N - lsize); endrow = N; }else{ startrow = bsp_pid()*lsize; endrow = bsp_pid()*lsize + lsize; } //Distribute Data, according row block distribution li=0; for ( i= startrow; i < endrow; i++) { lj=0; for(j=0; j < N; j++) { lrow[N*li+lj] = init[N*i+j]; lj++; } li++; } vecfreei(init); //out of the shared enviroment //initialize arrays for ( i=0; i<bsp_nprocs(); i++) { startrow_m[i] = 0; lsize_m[i] = 0; } bsp_sync(); /*******End Comp. Superstep 0******/ /*********Comm. Superstep 1********/ //Communicate the global starting rows of all processors for(g=0; g<bsp_nprocs();g++){ bsp_put(g,&startrow,&startrow_m[0],bsp_pid()*SZINT,SZINT); bsp_put(g,&lsize,&lsize_m[0],bsp_pid()*SZINT,SZINT); } /*********End Comm. Superstep 1*****/ bsp_sync(); /**********End Initialization***************/ double time0= bsp_time(); /*********Repeated Squaring loop start*************/ j=1; while ((N-1) > j) { /****Comp. 
Superstep j0****/ //initialize arrays for ( i=0; i<N*lsize; i++) { linter[i] = 1000; } for ( i=0; i<N; i++) { lcol[i] = 0; } bsp_sync(); /****End Comp. Superstep j0****/ for ( lj=0; lj < N; lj++) { /***Comm. SuperStep jlj0*******/ //get global column lj t=0; for(g=0; g < bsp_nprocs();g++){ for(v=0; v<lsize_m[g]; v++){ bsp_get(g,&lrow[0],(lj+v*N)*SZINT,&lcol[t],SZINT); t++; } } bsp_sync(); /***End Comm. SuperStep jlj0***/ /***Comp. SuperStep jlj1*******/ //update the values that use global column lj for ( li = 0; li < lsize; li++){ for ( lk=0; lk < N; lk++) { linter[N*li+lj] = fmin(linter[N*li+lj], lrow[N*li+lk]+lcol[lk]); } } bsp_sync(); /***End Comp. SuperStep jlj1***/ } /****Comp. Superstep j1****/ memcpy(lrow,linter,N*lsize*SZINT); j=2*j; bsp_sync(); /****End Comp. Superstep j1****/ } /*********Repeated Squaring loop end*************/ double time1= bsp_time(); bsp_sync(); /*********display matrices and time*********/ if(bsp_pid()==0){ printf( " \n Block Row Distr (need to know basis) calculation of APSP took: %f seconds \n", time1-time0 ); } /*for(g = 0; g < bsp_nprocs(); g++){ if(bsp_pid()==g){ printf("\n i am proc %d and i have APSP Mat \n",bsp_pid()); for(k=0;k<lsize;k++) { printf("\n"); for(l=0;l<N;l++){ printf("\t %d",lrow[N*k+l]); } printf("\n \n "); } } bsp_sync(); }*/ //Clean up bsp_pop_reg(startrow_m); bsp_pop_reg(lsize_m); bsp_pop_reg(lrow); vecfreei(lrow); vecfreei(lcol); vecfreei(startrow_m); vecfreei(lsize_m); vecfreei(linter); bsp_end(); }