/*
 * Compute the inner product of vectors x and y of length n >= 0.
 *
 * p  number of processors taking part
 * s  number of the calling processor
 * n  global vector length
 * x, y  arrays of which the first nloc(p,s,n) entries are read
 *
 * Every processor must call this function: it contains two bsp_sync()
 * calls. Each processor puts its partial sum into slot s of a registered
 * array on all p processors, so every caller returns the same total.
 */
double bspip(int p, int s, int n, double *x, double *y)
{
    int nloc(int p, int s, int n);

    double inprod, *Inprod, alpha;
    int i, t, nl;

    /* One slot per processor for the partial sums. */
    Inprod = vecallocd(p);
    bsp_push_reg(Inprod, p*SZDBL);
    bsp_sync();

    /* The local length does not change inside the loop; evaluate it once. */
    nl = nloc(p, s, n);
    inprod = 0.0;
    for (i = 0; i < nl; i++) {
        inprod += x[i]*y[i];
    }

    /* Send the partial sum to slot s on every processor, including this one. */
    for (t = 0; t < p; t++) {
        bsp_put(t, &inprod, Inprod, s*SZDBL, SZDBL);
    }
    bsp_sync();

    alpha = 0.0;
    for (t = 0; t < p; t++) {
        alpha += Inprod[t];
    }

    bsp_pop_reg(Inprod);
    vecfreed(Inprod);

    return alpha;
} /* end bspip */
void bspinprod(){ double bspip(int p, int s, int n, double *x, double *y); int nloc(int p, int s, int n); double *x, alpha, time0, time1; int p, s, n, nl, i, iglob; bsp_begin(P); p= bsp_nprocs(); /* p = number of processors obtained */ s= bsp_pid(); /* s = processor number */ if (s==0){ printf("Please enter n:\n"); fflush(stdout); scanf("%d",&n); if(n<0) bsp_abort("Error in input: n is negative"); } bsp_push_reg(&n,SZINT); bsp_sync(); bsp_get(0,&n,0,&n,SZINT); bsp_sync(); bsp_pop_reg(&n); nl= nloc(p,s,n); x= vecallocd(nl); for (i=0; i<nl; i++){ iglob= i*p+s; x[i]= iglob+1; } bsp_sync(); time0=bsp_time(); alpha= bspip(p,s,n,x,x); bsp_sync(); time1=bsp_time(); printf("Processor %d: sum of squares up to %d*%d is %.lf\n", s,n,n,alpha); fflush(stdout); if (s==0){ printf("This took only %.6lf seconds.\n", time1-time0); fflush(stdout); } vecfreed(x); bsp_end(); } /* end bspinprod */
/*
 * This parallel function initializes all the tables used in the FFT.
 *
 * The local length is nloc(N,t,n1). The function fills:
 *   rho_np  via bitrev_init(local length, ...)
 *   rho_p   via bitrev_init(N, ...)
 *   w0      via ufft_init(k1, ...), with k1 = k1_init(n1, N, local length)
 *   w       via ufft_init(local length, ...)
 *   tw      one block of 2*(local length) doubles per value
 *           c = k1, k1*len, k1*len^2, ... while c <= N
 *
 * The parameter s is not used by this function.
 * NOTE(review): N and t look like the processor count and processor number
 * along one dimension - confirm against the caller.
 */
void bspfft1d_init(int n1, int N, int s, int t, double *w0, double *w,
                   double *tw, int *rho_np, int *rho_p)
{
    int local_len = nloc(N, t, n1);
    int first_k, stage, block;

    bitrev_init(local_len, rho_np);
    bitrev_init(N, rho_p);

    first_k = k1_init(n1, N, local_len);
    ufft_init(first_k, w0);
    ufft_init(local_len, w);

    /* One twiddle block per stage; consecutive blocks are stored back to
       back in tw. */
    block = 0;
    stage = first_k;
    while (stage <= N) {
        double frac = (t % stage) / (double)stage;

        twiddle_init(local_len, frac, rho_np, tw + 2*block*local_len);
        block++;
        stage *= local_len;
    }
} /* end bspfft1d_init */
static int hwloc_look_osf(struct hwloc_backend *backend) { struct hwloc_topology *topology = backend->topology; cpu_cursor_t cursor; unsigned nbnodes; radid_t radid, radid2; radset_t radset, radset2; cpuid_t cpuid; cpuset_t cpuset; struct hwloc_obj *obj; unsigned distance; if (topology->levels[0][0]->cpuset) /* somebody discovered things */ return 0; hwloc_alloc_obj_cpusets(topology->levels[0][0]); nbnodes = rad_get_num(); cpusetcreate(&cpuset); radsetcreate(&radset); radsetcreate(&radset2); { hwloc_obj_t *nodes = calloc(nbnodes, sizeof(hwloc_obj_t)); unsigned *indexes = calloc(nbnodes, sizeof(unsigned)); float *distances = calloc(nbnodes*nbnodes, sizeof(float)); unsigned nfound; numa_attr_t attr; attr.nattr_type = R_RAD; attr.nattr_descr.rd_radset = radset; attr.nattr_flags = 0; for (radid = 0; radid < (radid_t) nbnodes; radid++) { rademptyset(radset); radaddset(radset, radid); cpuemptyset(cpuset); if (rad_get_cpus(radid, cpuset)==-1) { fprintf(stderr,"rad_get_cpus(%d) failed: %s\n",radid,strerror(errno)); continue; } indexes[radid] = radid; nodes[radid] = obj = hwloc_alloc_setup_object(HWLOC_OBJ_NODE, radid); obj->cpuset = hwloc_bitmap_alloc(); obj->memory.local_memory = rad_get_physmem(radid) * hwloc_getpagesize(); obj->memory.page_types_len = 2; obj->memory.page_types = malloc(2*sizeof(*obj->memory.page_types)); memset(obj->memory.page_types, 0, 2*sizeof(*obj->memory.page_types)); obj->memory.page_types[0].size = hwloc_getpagesize(); #ifdef HAVE__SC_LARGE_PAGESIZE obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE); #endif cursor = SET_CURSOR_INIT; while((cpuid = cpu_foreach(cpuset, 0, &cursor)) != CPU_NONE) hwloc_bitmap_set(obj->cpuset, cpuid); hwloc_debug_1arg_bitmap("node %d has cpuset %s\n", radid, obj->cpuset); hwloc_insert_object_by_cpuset(topology, obj); nfound = 0; for (radid2 = 0; radid2 < (radid_t) nbnodes; radid2++) distances[radid*nbnodes+radid2] = RAD_DIST_REMOTE; for (distance = RAD_DIST_LOCAL; distance < RAD_DIST_REMOTE; distance++) { 
attr.nattr_distance = distance; /* get set of NUMA nodes at distance <= DISTANCE */ if (nloc(&attr, radset2)) { fprintf(stderr,"nloc failed: %s\n", strerror(errno)); continue; } cursor = SET_CURSOR_INIT; while ((radid2 = rad_foreach(radset2, 0, &cursor)) != RAD_NONE) { if (distances[radid*nbnodes+radid2] == RAD_DIST_REMOTE) { distances[radid*nbnodes+radid2] = (float) distance; nfound++; } } if (nfound == nbnodes) /* Finished finding distances, no need to go up to RAD_DIST_REMOTE */ break; } } hwloc_distances_set(topology, HWLOC_OBJ_NODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */); } radsetdestroy(&radset2); radsetdestroy(&radset); cpusetdestroy(&cpuset); /* add PU objects */ hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology)); hwloc_obj_add_info(topology->levels[0][0], "Backend", "OSF"); if (topology->is_thissystem) hwloc_add_uname_info(topology); return 1; }
void mainloop(){ //int init[N*N] = {0,3,8,1000,-4, 1000,0,1000,1,7,1000,4,0,1000,1000, //2,1000,-5,0,1000,1000,1000,1000,6,0}; int nlr,nlc,s,t,i,j,k,l,li,lsize,tsize0, tsize1,tempp,tempoff,rpos,cpos, *lpart,*linter,*gindx,*lcol,*lrow,*lsrow, *lscol, *ltrow, *ltcol, *temp; int* init = gen_graph(N, 0.05); bsp_begin(bsp_nprocs()); /**********Initialization SuperStep 0***************/ //Compute global row and column indeces for each element int pm = sqrt(bsp_nprocs()); int pn = (bsp_nprocs())/pm; /* Compute 2D processor numbering from 1D numbering with failsafe if the number of processors are not enough, back to simple 1D cyclic distribution */ if ( pn != pm ){ pn = bsp_nprocs(); pm = 1; t = bsp_pid(); s = 0; }else{ s= bsp_pid()%pm; /* 0 <= s < pm */ t= bsp_pid()/pn; /* 0 <= t < pn */ } nlr= nloc(pm,s,N); /* number of local rows */ nlc= nloc(pn,t,N); /* number of local columns */ lsize = nlr*nlc; //interpret 2D size to array size lpart = vecalloci(lsize); //Initialize local part of processor s linter = vecalloci(lsize); //Intermidiate array used for the matrix "multiplication" gindx = vecalloci(lsize); //Array to store the global indeces of the local elements lcol = vecalloci(lsize); //Array to store the glocal column index lrow = vecalloci(lsize); //Array to store the glocal row index bsp_push_reg(lpart,lsize*SZINT); //Distribute the Data li=0; for ( i= 0; i < N; i++){ for ( j= 0; j < N; j++){ if ((j % pn) == t){ lpart[li] = init[N*i+j]; lrow[li] = i; lcol[li] = j; gindx[li] = N*i+j; li++; } } } /*for ( i= 0; i < N*N; i++) { if(bsp_pid() == (i % bsp_nprocs())){ lpart[li] = init[i]; lrow[li] = i/N; lcol[li] = i % N; gindx[li] = i; li++; } }*/ vecfreei(init);//out of the shared space tsize0 = tsize1 =lsize; temp = lrow; //find unique global rows for processor s for(i=0;i<tsize0;i++){ for(j=0;j<tsize0;j++){ if(i==j){ continue; } else if(*(temp+i)==*(temp+j)){ k=j; tsize0--; while(k < tsize0){ *(temp+k)=*(temp+k+1); k++; } j=0; } } } temp = lcol; //find unique global 
column for processor s for(i=0;i<tsize1;i++){ for(j=0;j<tsize1;j++){ if(i==j){ continue; } else if(*(temp+i)==*(temp+j)){ k=j; tsize1--; while(k < tsize1){ *(temp+k)=*(temp+k+1); k++; } j=0; } } } //keep unique global rows and columns in arrays //initialize arrays to hold the elements of those rows and columns(ltcol, ltrow) lscol = vecalloci(tsize1); lsrow = vecalloci(tsize0); ltcol = vecalloci(N*tsize1); ltrow = vecalloci(N*tsize0); for(i=0;i < tsize0;i++){ lsrow[i] = lrow[i]; } for(i=0;i < tsize1;i++){ lscol[i] = lcol[i]; } vecfreei(lcol);//not needed from this point on vecfreei(lrow);//we use lscol, lsrow, ltrow, ltcol //sort arrays qsort (lsrow, tsize0, sizeof(int), compare_int); qsort (lscol, tsize1, sizeof(int), compare_int); bsp_sync(); /**********End Initialization SuperStep 0***************/ double time0= bsp_time(); /*********Repeated Squaring loop start*************/ j=1; while ((N-1) > j) { /*************Comm. SuperStep j0*************/ for(i=0;i < tsize1;i++){ for(k=0; k<N;k++){ tempp=((N*k+lscol[i]) % bsp_nprocs()); tempoff = ((double)(N*k+lscol[i])/(double)bsp_nprocs()); bsp_get(tempp, &lpart[0],tempoff*SZINT, <col[N*i+k],SZINT); } } for(i=0;i < tsize0;i++){ for(k=0; k<N;k++){ tempp=((N*lsrow[i]+k) % bsp_nprocs()); tempoff = ((double)(N*lsrow[i]+k)/(double)bsp_nprocs()); bsp_get(tempp, &lpart[0],tempoff*SZINT, <row[N*i+k],SZINT); } } bsp_sync(); /*************End Comm. SuperStep j0*************/ /*************Comp. 
SuperStep j1*************/ for ( i=0; i<lsize; i++) { int gcol = gindx[i] % N; //get global col indx of current element int grow = gindx[i]/N; //get global row indx of current element linter[i]=1000;//initiliaze array //find appropriate indx of the global rows and columns to perform "multiplication" /*for ( l=0; l < tsize0;l++){ if(grow == lsrow[l]){ rpos =l; break; } }*/ int *rp = bsearch (&grow, lsrow, tsize0, sizeof (lsrow),compare_int); rpos = rp - lsrow; int *cp = bsearch (&gcol, lscol, tsize1, sizeof (lscol),compare_int); cpos = cp - lscol; /*for ( l=0; l < tsize1;l++){ if(gcol == lscol[l]){ cpos =l; break; } }*/ //this is where the update is done for(k=0;k<N;k++){ linter[i] = fmin(linter[i], ltrow[N*rpos + k]+ltcol[N*cpos + k]); } } memcpy(lpart,linter,lsize*SZINT); j = 2*j; bsp_sync(); /*************End Comp. SuperStep j1*************/ } /*********Repeated Squaring loop end*************/ double time1= bsp_time(); bsp_sync(); /*********display matrices and time*********/ if(bsp_pid()==0){ printf( " \n Block Cyclic Distr calculation of APSP took: %f seconds \n", time1-time0 ); } /*printf("\n The array is, proc %d \n ", bsp_pid()); for(i=0;i < lsize;i++){ printf(" %d",lpart[i]); }*/ printf("\n "); //clean up bsp_pop_reg(lpart); vecfreei(lpart); vecfreei(linter); vecfreei(lscol); vecfreei(lsrow); vecfreei(ltcol); vecfreei(ltrow); vecfreei(gindx); bsp_end(); }
void mainloop(){ //int init[N*N] = {0,3,8,1000,-4, 1000,0,1000,1,7,1000,4,0,1000,1000, //2,1000,-5,0,1000,1000,1000,1000,6,0}; int i,j,k,l,v,t,lsize,*lsize_m,*lrow,*lcol, *linit, *linter,*startrow_m; int li,lj,lk,startrow, endrow,g; int* init = gen_graph(N, 0.05); bsp_begin(bsp_nprocs()); /**********Initialization***************/ /*******Comp. Superstep 0******/ lsize = nloc(bsp_nprocs(),bsp_pid(), N); //Get the number of rows of processor s lrow = vecalloci(lsize*N); //The main storing array of processor s lcol = vecalloci(N); //array to hold the column for the matrix squaring startrow_m = vecalloci(bsp_nprocs()); //array to hold all processors starting global row lsize_m = vecalloci(bsp_nprocs()); //array to hold the number of rows of all processors linter = vecalloci(lsize*N); //Intermidiate array used for the matrix "multiplication" bsp_push_reg(startrow_m,bsp_nprocs()*SZINT); bsp_push_reg(lsize_m,bsp_nprocs()*SZINT); bsp_push_reg(lrow,lsize*N*SZINT); /****Get the first and last global row of processor s***/ if(bsp_pid() == (bsp_nprocs() - 1)){ startrow = (N - lsize); endrow = N; }else{ startrow = bsp_pid()*lsize; endrow = bsp_pid()*lsize + lsize; } //Distribute Data, according row block distribution li=0; for ( i= startrow; i < endrow; i++) { lj=0; for(j=0; j < N; j++) { lrow[N*li+lj] = init[N*i+j]; lj++; } li++; } vecfreei(init); //out of the shared enviroment //initialize arrays for ( i=0; i<bsp_nprocs(); i++) { startrow_m[i] = 0; lsize_m[i] = 0; } bsp_sync(); /*******End Comp. Superstep 0******/ /*********Comm. Superstep 1********/ //Communicate the global starting rows of all processors for(g=0; g<bsp_nprocs();g++){ bsp_put(g,&startrow,&startrow_m[0],bsp_pid()*SZINT,SZINT); bsp_put(g,&lsize,&lsize_m[0],bsp_pid()*SZINT,SZINT); } /*********End Comm. Superstep 1*****/ bsp_sync(); /**********End Initialization***************/ double time0= bsp_time(); /*********Repeated Squaring loop start*************/ j=1; while ((N-1) > j) { /****Comp. 
Superstep j0****/ //initialize arrays for ( i=0; i<N*lsize; i++) { linter[i] = 1000; } for ( i=0; i<N; i++) { lcol[i] = 0; } bsp_sync(); /****End Comp. Superstep j0****/ for ( lj=0; lj < N; lj++) { /***Comm. SuperStep jlj0*******/ //get global column lj t=0; for(g=0; g < bsp_nprocs();g++){ for(v=0; v<lsize_m[g]; v++){ bsp_get(g,&lrow[0],(lj+v*N)*SZINT,&lcol[t],SZINT); t++; } } bsp_sync(); /***End Comm. SuperStep jlj0***/ /***Comp. SuperStep jlj1*******/ //update the values that use global column lj for ( li = 0; li < lsize; li++){ for ( lk=0; lk < N; lk++) { linter[N*li+lj] = fmin(linter[N*li+lj], lrow[N*li+lk]+lcol[lk]); } } bsp_sync(); /***End Comp. SuperStep jlj1***/ } /****Comp. Superstep j1****/ memcpy(lrow,linter,N*lsize*SZINT); j=2*j; bsp_sync(); /****End Comp. Superstep j1****/ } /*********Repeated Squaring loop end*************/ double time1= bsp_time(); bsp_sync(); /*********display matrices and time*********/ if(bsp_pid()==0){ printf( " \n Block Row Distr (need to know basis) calculation of APSP took: %f seconds \n", time1-time0 ); } /*for(g = 0; g < bsp_nprocs(); g++){ if(bsp_pid()==g){ printf("\n i am proc %d and i have APSP Mat \n",bsp_pid()); for(k=0;k<lsize;k++) { printf("\n"); for(l=0;l<N;l++){ printf("\t %d",lrow[N*k+l]); } printf("\n \n "); } } bsp_sync(); }*/ //Clean up bsp_pop_reg(startrow_m); bsp_pop_reg(lsize_m); bsp_pop_reg(lrow); vecfreei(lrow); vecfreei(lcol); vecfreei(startrow_m); vecfreei(lsize_m); vecfreei(linter); bsp_end(); }