void checkStrictDiagonallyDominant(int* i, int* j, double* v, int nz) { // steps: // first sum all rows // then find diagonals // check each diagonal against the summed rows. int c; double * rowtotal; rowtotal = vecallocd(N); double * diagonals; diagonals = vecallocd(N); for(c = 0; c< N; c++) { rowtotal[c] = 0; diagonals[c] = 0; } for(c = 0; c< nz; c++) { // find diagonals: if(i[c] == j[c]){ diagonals[i[c]] = v[c]; } else { rowtotal[i[c]] += fabs(v[c]); } } // foreach diag, check. for(c=0; c<N; c++) { if ( !(fabs(diagonals[c]) > rowtotal[c]) ) { fprintf(stderr, "PROBLEM: diagonal > rowtotal doesn't hold: \n" " diagonals[%d] = %lf\n" " rowtotal[%d] = %lf\n", c, fabs(diagonals[c]), c, rowtotal[c] ); fprintf(stderr, "increase mu? sometimes just running again is enough.\n"); exit(5); } } free(rowtotal); free(diagonals); }
/*
 * Compute the inner product of vectors x and y of length n>=0.
 *
 * p is the number of processors and s the calling processor's number.
 * Only the first nloc(p,s,n) entries of x and y are read locally.
 * Each processor writes its partial sum into slot s of the registered
 * array Inprod on every processor; after the sync every processor adds
 * up all p slots and returns the same total.
 *
 * Must be called by all processors: it contains two bsp_sync() calls.
 */
double bspip(int p, int s, int n, double *x, double *y){
    int nloc(int p, int s, int n);
    double inprod, *Inprod, alpha;
    int i, t, nl;

    Inprod= vecallocd(p);
    bsp_push_reg(Inprod,p*SZDBL);
    bsp_sync();

    /* the local length does not change inside the loop: compute it once */
    nl= nloc(p,s,n);
    inprod= 0.0;
    for (i=0; i<nl; i++){
        inprod += x[i]*y[i];
    }

    /* broadcast my partial sum into slot s on every processor */
    for (t=0; t<p; t++){
        bsp_put(t,&inprod,Inprod,s*SZDBL,SZDBL);
    }
    bsp_sync();

    alpha= 0.0;
    for (t=0; t<p; t++){
        alpha += Inprod[t];
    }
    bsp_pop_reg(Inprod);
    vecfreed(Inprod);

    return alpha;
} /* end bspip */
/*
 * Redistribute the local part of a complex vector (stored as interleaved
 * real/imaginary doubles in x, `length` complex entries) by packing it into
 * packets and bsp_put-ting each packet into the registered array pm on the
 * destination processor.
 *
 *   i       index of the vector inside pm; the packet is written at complex
 *           offset i*length plus the offset inside that vector
 *   M, N    used to turn the computed destination into a processor number:
 *           col == 0 -> s + M*dest, otherwise -> N*s + dest
 *   s, t    s enters the processor number as shown above; t (or rho_p[t]
 *           when rev is nonzero) determines the source position
 *   c0, c1  old and new cycle parameters; packets hold length/(c1/c0)
 *           complex entries, but at least one
 *
 * No bsp_sync() is performed here.
 */
void bspredistr(double *x, int i, int length, int M, int N, int s, int t, int c0, int c1,char rev, int *rho_p, double *pm, int col){
    const int ratio = c1/c0;
    const int size = MAX(length/ratio,1);
    const int npackets = length/size;
    double *packet = vecallocd(2*size);

    /* rho_p is consulted only for the reversed variant */
    const int origin = rev ? rho_p[t] : t;
    const int j0 = origin%c0;
    const int j2 = origin/c0;

    int j, r;

    for (j = 0; j < npackets; j++) {
        const int jglob = j2*c0*length + j*c0 + j0;
        const int dest = (jglob/(c1*length))*c1 + jglob%c1;
        int destproc;
        int destindex;

        if (col == 0) {
            destproc = s+M*dest;
        } else {
            destproc = N*s+dest;
        }

        /*
         * The address of vector i inside pm on the destination is not
         * known, so address relative to the start of pm: skip i whole
         * vectors and then move to the position inside the vector.
         */
        destindex = i*length + (jglob%(c1*length))/c1;

        /* gather every ratio-th complex entry, starting at j */
        for (r = 0; r < size; r++) {
            const int src = 2*(j+r*ratio);
            packet[2*r] = x[src];
            packet[2*r+1] = x[src+1];
        }

        bsp_put(destproc,packet,pm,destindex*2*SZDBL,size*2*SZDBL);
    }

    vecfreed(packet);
} /* end bspredistr */
/*
 * Matlab entry point.
 *
 *   prhs[0]  the matrix, converted with ConvertMatlabToMondriaan
 *   prhs[1]  priority vector with (rows + columns) entries, 1-based
 *   plhs[0]  the Ac part returned by overpaint
 *   plhs[1]  the Ar part returned by overpaint
 *
 * NOTE(review): nrhs/nlhs and the length of prhs[1] are not validated here;
 * calling with fewer than two inputs or a short vector reads out of bounds.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
    struct sparsematrix* input = ConvertMatlabToMondriaan(prhs[0]);

    /* one priority entry per row and per column */
    int length = input->m+input->n;

    /* owned by Matlab: must NOT be freed here */
    double* raw_priorities = mxGetPr(prhs[1]);

    /* own copy as long*, shifted from 1-based to 0-based numbering */
    long* vec = double_array_to_long(raw_priorities,length);
    int pos = 0;
    while (pos < length) {
        vec[pos]--;
        pos++;
    }

    /* make sure a (zeroed) value array exists */
    if (input->ReValue == NULL) {
        int e;
        input->ReValue = vecallocd(input->NrNzElts);
        for (e = 0; e < input->NrNzElts; e++) {
            input->ReValue[e] = 0.0;
        }
    }

    /* Matlab stores by ascending column; switch to ascending rows */
    struct sparsematrixplus sorted = reorder_row_incr(input);
    struct sparsematrix by_rows = sorted.matrix;

    /* split the nonzeros */
    struct twomatrices two = overpaint(&by_rows,vec);
    struct sparsematrix col_part = two.Ac;
    struct sparsematrix row_part = two.Ar;

    /* hand both parts back to Matlab */
    plhs[0] = ConvertMondriaanToMatlab(&col_part);
    plhs[1] = ConvertMondriaanToMatlab(&row_part);

    /* release everything allocated on the C side */
    vecfreel(vec);
    MMDeleteSparseMatrix(&by_rows);
    MMDeleteSparseMatrix(&two.Ar);
    MMDeleteSparseMatrix(&two.Ac);
    vecfreel(sorted.perm);
    MMDeleteSparseMatrix(input);
}
void bspinprod(){ double bspip(int p, int s, int n, double *x, double *y); int nloc(int p, int s, int n); double *x, alpha, time0, time1; int p, s, n, nl, i, iglob; bsp_begin(P); p= bsp_nprocs(); /* p = number of processors obtained */ s= bsp_pid(); /* s = processor number */ if (s==0){ printf("Please enter n:\n"); fflush(stdout); scanf("%d",&n); if(n<0) bsp_abort("Error in input: n is negative"); } bsp_push_reg(&n,SZINT); bsp_sync(); bsp_get(0,&n,0,&n,SZINT); bsp_sync(); bsp_pop_reg(&n); nl= nloc(p,s,n); x= vecallocd(nl); for (i=0; i<nl; i++){ iglob= i*p+s; x[i]= iglob+1; } bsp_sync(); time0=bsp_time(); alpha= bspip(p,s,n,x,x); bsp_sync(); time1=bsp_time(); printf("Processor %d: sum of squares up to %d*%d is %.lf\n", s,n,n,alpha); fflush(stdout); if (s==0){ printf("This took only %.6lf seconds.\n", time1-time0); fflush(stdout); } vecfreed(x); bsp_end(); } /* end bspinprod */
/* * methods that reorders the nonzeros of a given matrix such that the columns are in ascending order */ struct sparsematrixplus reorder_col_incr(struct sparsematrix* matrix){ /* allocating memory */ long length = matrix->NrNzElts; long* I = vecallocl(length); long* J = vecallocl(length); double* Val = vecallocd(length); int k,l; /* creating a temporary array for storing the values to be sorted (rows) */ long* tempArray = vecallocl(length); for(k=0;k<length;k++) tempArray[k] = matrix->j[k]; /* sorting tempArray with Counting Sort and getting back the permutation indices */ long* indices = CSortVec(tempArray,length,matrix->n); /* creation of the vectors of the permuted rows, columns, value */ for(l=0;l<length;l++){ k = indices[l]; I[l] = matrix->i[k]; J[l] = matrix->j[k]; Val[l] = matrix->ReValue[k]; } /* creating the matrix part of the */ struct sparsematrix newmatrix; MMSparseMatrixInit(&newmatrix); newmatrix.m = matrix->m; newmatrix.n = matrix->n; newmatrix.i = I; newmatrix.j = J; newmatrix.ReValue = Val; newmatrix.NrNzElts = length; /* removing the temporary array */ vecfreel(tempArray); /* creating the final output */ struct sparsematrixplus output; output.matrix = newmatrix; output.perm = indices; return output; }
int main(){ /* reading the matrix from file */ FILE* File; struct sparsematrix matrix; File = fopen("../../matrices/cre_b.mtx", "r"); /* File = fopen("../../matrices/tbdlinux.mtx", "r"); */ if (!MMReadSparseMatrix(File, &matrix)) printf("Unable to read input matrix!\n"); fclose(File); /* creating explicitly a particular priority vector vec */ int m = matrix.m; int n = matrix.n; if(matrix.ReValue == NULL){ matrix.ReValue = vecallocd(matrix.NrNzElts); int j; for(j=0;j<matrix.NrNzElts;j++) matrix.ReValue[j] = 0.0; } /*long* vec = random_permutation(m+n); */ int i; long* vec = vecallocl(m+n); for(i=0;i<m+n;i++) vec[i]=i; /* explicit computation of Ar and Ac with the overpaint method */ struct sparsematrixplus m2plus = reorder_row_incr(&matrix); struct sparsematrix matrix2 = m2plus.matrix; struct twomatrices two = overpaint(&matrix2,vec); printf("---------------\n"); /*print_matrix(two.Ar); print_matrix(two.Ac); */ vecfreel(vec); vecfreel(m2plus.perm); MMDeleteSparseMatrix(&matrix); MMDeleteSparseMatrix(&matrix2); MMDeleteSparseMatrix(&two.Ar); MMDeleteSparseMatrix(&two.Ac); return 0; }
/* from partitioned matrix, obtaining subpart * id = 1 or 2 */ struct sparsematrix assignMatrix(struct sparsematrix* matrix, int id){ struct sparsematrix output; MMSparseMatrixInit(&output); output.m = matrix->m; output.n = matrix->n; output.NrNzElts = matrix->Pstart[id]-matrix->Pstart[id-1]; output.i = vecallocl(output.NrNzElts); output.j = vecallocl(output.NrNzElts); output.ReValue = vecallocd(output.NrNzElts); int start = matrix->Pstart[id-1]; int k; for(k=0;k<output.NrNzElts;k++){ output.i[k] = matrix->i[start+k]; output.j[k] = matrix->j[start+k]; output.ReValue[k] = 1.0*id; } return output; }
struct sparsematrix partition_to_matrix(struct sparsematrix* A){ struct sparsematrix A1 = assignMatrix(A,1); struct sparsematrix A2 = assignMatrix(A,2); struct sparsematrix B; MMSparseMatrixInit(&B); B.m = A->m; B.n = A->n; B.NrNzElts = A->NrNzElts; B.i = vecallocl(B.NrNzElts); B.j = vecallocl(B.NrNzElts); B.ReValue = vecallocd(B.NrNzElts); int index_B = 0, i; for(i=0;i<A1.NrNzElts;i++){ B.i[index_B] = A1.i[i]; B.j[index_B] = A1.j[i]; B.ReValue[index_B++] = 1.0; } for(i=0;i<A2.NrNzElts;i++){ B.i[index_B] = A2.i[i]; B.j[index_B] = A2.j[i]; B.ReValue[index_B++] = 2.0; } MMDeleteSparseMatrix(&A1); MMDeleteSparseMatrix(&A2); struct sparsematrixplus plus = reorder_row_incr(&B); MMDeleteSparseMatrix(&B); vecfreel(plus.perm); return plus.matrix; }
/*
 * SPMD benchmark routine. It runs in three stages:
 *   1. "Determine r": for vector lengths n = 1, 2, 4, ... <= MAXN every
 *      processor times 2*NITERS DAXPY loops; processor 0 collects the times
 *      and prints min/max/average rates. r keeps the average rate (flop/s)
 *      of the last n for which the minimum time was positive.
 *   2. "Determine g and l": for h = 0..MAXH every processor times NITERS
 *      supersteps in which it issues h single-double bsp_put calls;
 *      processor 0 stores the time of one such superstep, scaled by r,
 *      in t[h].
 *   3. Processor 0 calls leastsquares on t[0..p] and on t[p..MAXH] and
 *      prints the resulting g and l values.
 *
 * NOTE(review): r is assigned only inside `if (mintime>0.0)` on processor 0;
 * if that branch is never taken, stage 2 reads r uninitialised.
 */
void bspbench(){
    void leastsquares(int h0, int h1, double *t, double *g, double *l);
    int p, s, s1, iter, i, n, h, destproc[MAXH], destindex[MAXH];
    double alpha, beta, x[MAXN], y[MAXN], z[MAXN], src[MAXH], *dest,
           time0, time1, time, *Time, mintime, maxtime,
           nflops, r, g0, l0, g, l, t[MAXH+1];

    /**** Determine p ****/
    bsp_begin(P);
    p= bsp_nprocs(); /* p = number of processors obtained */
    s= bsp_pid();    /* s = processor number */

    /* Time: one slot per processor, filled on processor 0 */
    Time= vecallocd(p); bsp_push_reg(Time,p*SZDBL);
    /* dest: target array of the h-relation puts */
    dest= vecallocd(2*MAXH+p); bsp_push_reg(dest,(2*MAXH+p)*SZDBL);
    bsp_sync();

    /**** Determine r ****/
    for (n=1; n <= MAXN; n *= 2){
        /* Initialize scalars and vectors */
        alpha= 1.0/3.0;
        beta= 4.0/9.0;
        for (i=0; i<n; i++){
            z[i]= y[i]= x[i]= (double)i;
        }
        /* Measure time of 2*NITERS DAXPY operations of length n */
        time0=bsp_time();
        for (iter=0; iter<NITERS; iter++){
            for (i=0; i<n; i++)
                y[i] += alpha*x[i];
            for (i=0; i<n; i++)
                z[i] -= beta*x[i];
        }
        time1= bsp_time();
        time= time1-time0;
        /* send my time to slot s on processor 0 */
        bsp_put(0,&time,Time,s*SZDBL,SZDBL);
        bsp_sync();

        /* Processor 0 determines minimum, maximum, average computing rate */
        if (s==0){
            mintime= maxtime= Time[0];
            for(s1=1; s1<p; s1++){
                mintime= MIN(mintime,Time[s1]);
                maxtime= MAX(maxtime,Time[s1]);
            }
            if (mintime>0.0){
                /* Compute r = average computing rate in flop/s */
                nflops= 4*NITERS*n;
                r= 0.0;
                for(s1=0; s1<p; s1++)
                    r += nflops/Time[s1];
                r /= p;
                printf("n= %5d min= %7.3lf max= %7.3lf av= %7.3lf Mflop/s ",
                       n, nflops/(maxtime*MEGA),nflops/ (mintime*MEGA), r/MEGA);
                fflush(stdout);
                /* Output for fooling benchmark-detecting compilers */
                printf(" fool=%7.1lf\n",y[n-1]+z[n-1]);
            } else
                printf("minimum time is 0\n");
            /* executed on both paths: it is not part of the else branch */
            fflush(stdout);
        }
    }

    /**** Determine g and l ****/
    for (h=0; h<=MAXH; h++){
        /* Initialize communication pattern */
        for (i=0; i<h; i++){
            src[i]= (double)i;
            if (p==1){
                destproc[i]=0;
                destindex[i]=i;
            } else {
                /* destination processor is one of the p-1 others */
                destproc[i]= (s+1 + i%(p-1)) %p;
                /* destination index is in my own part of dest */
                destindex[i]= s + (i/(p-1))*p;
            }
        }

        /* Measure time of NITERS h-relations */
        bsp_sync();
        time0= bsp_time();
        for (iter=0; iter<NITERS; iter++){
            for (i=0; i<h; i++)
                bsp_put(destproc[i],&src[i],dest,destindex[i]*SZDBL,
                        SZDBL);
            bsp_sync();
        }
        time1= bsp_time();
        time= time1-time0;

        /* Compute time of one h-relation */
        if (s==0){
            /* seconds per superstep times flop/s: the time expressed in flops */
            t[h]= (time*r)/NITERS;
            printf("Time of %5d-relation= %lf sec= %8.0lf flops\n",
                   h, time/NITERS, t[h]);
            fflush(stdout);
        }
    }

    if (s==0){
        printf("size of double = %d bytes\n",(int)SZDBL);
        /* fit over h = 0..p */
        leastsquares(0,p,t,&g0,&l0);
        printf("Range h=0 to p : g= %.1lf, l= %.1lf\n",g0,l0);
        /* fit over h = p..MAXH; these are the values reported as g and l */
        leastsquares(p,MAXH,t,&g,&l);
        printf("Range h=p to HMAX: g= %.1lf, l= %.1lf\n",g,l);

        printf("The bottom line for this BSP computer is:\n");
        printf("p= %d, r= %.3lf Mflop/s, g= %.1lf, l= %.1lf\n",
               p,r/MEGA,g,l);
        fflush(stdout);
    }
    bsp_pop_reg(dest); vecfreed(dest);
    bsp_pop_reg(Time); vecfreed(Time);

    bsp_end();
} /* end bspbench */
/*
 * SPMD test driver for bspfft.
 *
 * Processor 0 reads the vector length n and puts it to all other
 * processors. Every processor then fills its n/p local complex entries with
 * (jglob, 1.0), where jglob = j*p + s, runs NITERS table initialisations
 * and NITERS forward+backward transform pairs, and compares the result with
 * the initial data. Processor 0 prints timings, a computing rate and the
 * largest error.
 *
 * NOTE(review): the return value of scanf/scanf_s is not checked, so n is
 * indeterminate if the input is not an integer.
 */
void bspfft_test() {
    void bspfft( double * x, int n, int p, int s, int sign, double * w0, double * w, double * tw, int *rho_np, int *rho_p );
    void bspfft_init( int n, int p, int s, double * w0, double * w, double * tw, int *rho_np, int *rho_p );
    int k1_init( int n, int p );

    int p, s, n, q, np, k1, j, jglob, it, *rho_np, *rho_p;
    double time0, time1, time2, ffttime, nflops, max_error, error_re, error_im, error, *Error, *x, *w0, *w, *tw;

    bsp_begin( P );
    p = bsp_nprocs();
    s = bsp_pid();

    /* n is registered so processor 0 can put it; Error collects one value per processor */
    bsp_push_reg( &n, SZINT );
    Error = vecallocd( p );
    bsp_push_reg( Error, p * SZDBL );
    bsp_sync();

    if ( s == 0 ) {
        printf( "Please enter length n: \n" );
#ifdef _WIN32
        scanf_s( "%d", &n );
#else
        scanf( "%d", &n );
#endif
        if ( n < 2 * p ) {
            bsp_abort( "Error in input: n < 2p" );
        }
        /* send n to every other processor */
        for ( q = 1; q < p; q++ ) {
            bsp_put( q, &n, &n, 0, SZINT );
        }
    }
    bsp_sync();

    if ( s == 0 ) {
        printf( "FFT of vector of length %d using %d processors\n", n, p );
        printf( "performing %d forward and %d backward transforms\n", NITERS, NITERS );
    }

    /* Allocate, register, and initialize vectors */
    np = n / p;
    /* two doubles (real, imaginary) per local complex entry */
    x = vecallocd( 2 * np );
    bsp_push_reg( x, 2 * np * SZDBL );
    k1 = k1_init( n, p );
    w0 = vecallocd( k1 );
    w = vecallocd( np );
    tw = vecallocd( 2 * np + p );
    rho_np = vecalloci( np );
    rho_p = vecalloci( p );

    for ( j = 0; j < np; j++ ) {
        jglob = j * p + s;
        x[2 * j] = ( double )jglob;
        x[2 * j + 1] = 1.0;
    }
    bsp_sync();
    time0 = bsp_time();

    /* Initialize the weight and bit reversal tables */
    for ( it = 0; it < NITERS; it++ ) {
        bspfft_init( n, p, s, w0, w, tw, rho_np, rho_p );
    }
    bsp_sync();
    time1 = bsp_time();

    /* Perform the FFTs */
    for ( it = 0; it < NITERS; it++ ) {
        bspfft( x, n, p, s, 1, w0, w, tw, rho_np, rho_p );
        bspfft( x, n, p, s, -1, w0, w, tw, rho_np, rho_p );
    }
    bsp_sync();
    time2 = bsp_time();

    /* Compute the accuracy */
    /* local maximum of |x[j] - (jglob + 1.0*i)| over the local entries */
    max_error = 0.0;
    for ( j = 0; j < np; j++ ) {
        jglob = j * p + s;
        error_re = fabs( x[2 * j] - ( double )jglob );
        error_im = fabs( x[2 * j + 1] - 1.0 );
        error = sqrt( error_re * error_re + error_im * error_im );
        if ( error > max_error ) {
            max_error = error;
        }
    }
    /* gather the local maxima in Error on processor 0 */
    bsp_put( 0, &max_error, Error, s * SZDBL, SZDBL );
    bsp_sync();

    if ( s == 0 ) {
        max_error = 0.0;
        for ( q = 0; q < p; q++ ) {
            if ( Error[q] > max_error ) {
                max_error = Error[q];
            }
        }
    }

    /* every processor prints its first (at most NPRINT) local entries */
    for ( j = 0; j < NPRINT && j < np; j++ ) {
        jglob = j * p + s;
        printf( "proc=%d j=%d Re= %f Im= %f \n", s, jglob, x[2 * j], x[2 * j + 1] );
    }
    fflush( stdout );
    bsp_sync();

    if ( s == 0 ) {
        printf( "Time per initialization = %lf sec \n", ( time1 - time0 ) / NITERS );
        /* each iteration performed two transforms */
        ffttime = ( time2 - time1 ) / ( 2.0 * NITERS );
        printf( "Time per FFT = %lf sec \n", ffttime );
        /* flop count used for the rate: 5 n log2(n) + 2 n */
        nflops = 5 * n * log( ( double )n ) / log( 2.0 ) + 2 * n;
        printf( "Computing rate in FFT = %lf Mflop/s \n", nflops / ( MEGA * ffttime ) );
        printf( "Absolute error= %e \n", max_error );
        printf( "Relative error= %e \n\n", max_error / n );
    }

    bsp_pop_reg( x );
    bsp_pop_reg( Error );
    bsp_pop_reg( &n );
    bsp_sync();

    vecfreei( rho_p );
    vecfreei( rho_np );
    vecfreed( tw );
    vecfreed( w );
    vecfreed( w0 );
    vecfreed( x );
    vecfreed( Error );
    bsp_end();
} /* end bspfft_test */
/* * method that splits the two parts of A which have value "first" * and value "second", assigning them respectively to Ar and Ac */ struct twomatrices split_matrix(struct sparsematrix* A, double first, double second){ int k; /* initialization of the counters */ int max1=0; int max2=0; /* initial sweep of the matrix to see how long should be the vectors*/ for(k=0;k<A->NrNzElts;k++) (A->ReValue[k] == second) ? max2++ : max1++; /* initialization of the vectors */ long *i1 = vecallocl(max1); long *j1 = vecallocl(max1); double *v1 = vecallocd(max1); double *c1 = vecallocd(max1); long *i2 = vecallocl(max2); long *j2 = vecallocl(max2); double *v2 = vecallocd(max2); double *c2 = vecallocd(max2); /* population of the vectors */ int index1=0; int index2=0; for(k=0;k<(A->NrNzElts);k++){ if (A->ReValue[k] == second ){ i2[index2] = A->i[k]; j2[index2] = A->j[k]; v2[index2] = second; c2[index2] = 0.0; index2++; } else { i1[index1] = A->i[k]; j1[index1] = A->j[k]; v1[index1] = first; c1[index1] = 0.0; index1++; } } /* construction of the output */ struct sparsematrix A1, A2; MMSparseMatrixInit(&A1); MMSparseMatrixInit(&A2); A1.m = A->m; A1.n = A->n; A1.NrNzElts = max1; A1.i = i1; A1.j = j1; A1.ReValue = v1; A1.ImValue = c1; A2.NrNzElts = max2; A2.m = A->m; A2.n = A->n; A2.i = i2; A2.j = j2; A2.ReValue = v2; A2.ImValue = c2; struct twomatrices output; output.Ar = A1; output.Ac = A2; return output; }
int main (int argc, char** argv) { // aim for a nonzero density given by sparsity: sparsity = 0.2; // nz = sparsity*100% of the size of the matrix /* * we say 'aim' here, since of course initially exactly * nz = sparsity * N^2 * nonzeroes will be generated at random spots, but because * the matrix must be symmetric and diagonally positive, the * actual number of nonzeroes will probably not be exactly * the projected number. */ // read the desired size of the matrix from command line if (argc < 2) { printf("Usage: %s N [mu] [sparsity]\n", argv[0]); exit(-1); } if(sscanf(argv[1], "%d", &N) != 1) { printf("couldn't read command-line argument for N. must be an integer.\n"); exit(-2); } double mu; mu = 2.5; //default scalar for making matrix diagonal-dominant // maybe the user supplied a different mu if(argc > 2 && sscanf(argv[2], "%lf", &mu) != 1) { exit(-2); } // maybe the user supplied a different sparsity if(argc > 3 && sscanf(argv[3], "%lf", &sparsity) != 1) { exit(-2); } int nz = sparsity*N*N; int* xs; int* ys; double* vals; fprintf(stderr,"Generating matrix. N=%d, density=%lf, target nz=%d, ", N, sparsity, nz); fprintf(stderr, "mu = %lf\n", mu); // seed the random generator. srandom((unsigned)time(NULL)); xs = vecalloci(nz); ys = vecalloci(nz); vals = vecallocd(nz); bool* diag_done; diag_done = malloc(N*sizeof(bool)); int i; for(i = 0; i<N; i++) { diag_done[i] = false; } int nz_generated; int x,y; nz_generated = 0; double fake_transpose; x=0;y=0; while(x<N && y<N) { //don't escape matrix bounds. if (nz_generated % 1000000 == 0) { fprintf(stderr,"progress: %f%%\r", (double)nz_generated/(double)(nz/2.0)*100.0); } if(x==y) { //diagonal, so always generate. xs[nz_generated]=x; ys[nz_generated]=y; vals[nz_generated]=ran()*2.0-1.0; diag_done[x] = true; #ifdef DEBUG fprintf(stderr,"generated A[//][%d]=%lf\n" , ys[nz_generated] , vals[nz_generated]); #endif nz_generated++; } else { // not a diagonal. 
only add if in // lower triangular half if(x<y) { if(nz_generated > nz) { // this should NEVER happen, although all that's // stopping it from happening is our ran() being // well-behaved........... printf("EEK! something went wrong!!\n"); exit(666); } xs[nz_generated]= x; ys[nz_generated]= y; // simulate the distribution of values which // would occur if we do A+A^T afterwards. if (ran() < sparsity) { fake_transpose = ran()*2.0-1.0; vals[nz_generated]= ran()*2.0-1.0 + fake_transpose; } else { vals[nz_generated]= ran()*2.0-1.0; } #ifdef DEBUG fprintf(stderr,"generated A[%d][%d]=%lf\n", xs[nz_generated] , ys[nz_generated] , vals[nz_generated]); #endif nz_generated++; } } x += 1/sparsity * (ran() + 0.5); if( x >= N ) { y += x/N; x = x%N; } } fprintf(stderr, "generated initial randoms\n"); int diagonals_present = 0; for(i=0; i<nz_generated; i++) { if(xs[i]==ys[i]) diagonals_present++; } fprintf(stderr,"generated %d nzeros, array was %d big.\n", nz_generated, nz); #ifdef DEBUG fprintf(stderr, "found %d diagonal(s), still need %d more.\n", diagonals_present, (N-diagonals_present)); #endif // add the missing diagonals, and add mu to each diagonal. int newsize = nz_generated + (N - diagonals_present); fprintf(stderr,"reallocating values array to %lluM \n", (unsigned long long)(SZDBL+2*SZINT)*newsize/1048576); int* diag_i; int* diag_j; double* diag_val; diag_i = realloc(xs ,SZINT*newsize); diag_j = realloc(ys ,SZINT*newsize); diag_val = realloc(vals,SZDBL*newsize); if(diag_i == NULL || diag_j == NULL || diag_val == NULL) { printf("out of memory!"); exit(44); } addDiagonal(mu, diag_i, diag_j, diag_val, nz_generated, newsize, diag_done); nz_generated=newsize; #ifdef DEBUG for(i=0;i<newsize;i++) { fprintf(stderr,"after addDiagonal A[%d][%d]=%lf\n", diag_i[i],diag_j[i], diag_val[i]); } fprintf(stderr, "Going to make symmetric now... 
(nz_generated = %d)\n", nz_generated); #endif // now we explicitly fill the array with the // upper triangle values // things must be symmetric, but they aren't, yet // ... here's a good place to do the transposing thing. newsize = nz_generated * 2 - N; //number of real nonzeros, don't // count diagonals twice. int *new_i; int *new_j; double *new_v; new_i = realloc(diag_i ,SZINT*newsize); new_j = realloc(diag_j ,SZINT*newsize); new_v = realloc(diag_val,SZDBL*newsize); if(new_i == NULL || new_i == NULL || new_v == NULL) { printf("out of memory (2)!"); exit(44); } diag_i = new_i; diag_j = new_j; diag_val = new_v; addTranspose(newsize,diag_i,diag_j,diag_val, nz_generated); #ifdef DEBUG for(i=0;i<newsize;i++) // to make diags stand out. if(diag_i[i]==diag_j[i]) fprintf(stderr,"after transpose A[%d][%d]=%lf \\\\\n", diag_i[i],diag_j[i], diag_val[i]); else fprintf(stderr,"after transpose A[%d][%d]=%lf\n", diag_i[i],diag_j[i], diag_val[i]); #endif checkStrictDiagonallyDominant(diag_i,diag_j,diag_val, newsize); // now quickly generate a test-vector to solve against: double *vec = vecallocd(N); for(i=0;i<N;i++) vec[i]=ran(); fprintf(stderr,"Left with %d nonzeroes; nonzero density = %lf (desired=%lf)\n", newsize, newsize/((double)N*N), sparsity); fprintf(stderr,"========== OUTPUTTING ... ==========\n"); outputMondriaanMatrix(newsize, diag_i, diag_j, diag_val, vec); outputMathematicaMatrix(newsize, diag_i, diag_j, diag_val, vec); free(diag_done); free(vec); free(diag_i); free(diag_j); free(diag_val); return 0; }
void bspbench(){ void leastsquares(int h0, int h1, double *t, double *g, double *l); int p, s, s1, iter, i, n, h, destproc[MAXH], destindex[MAXH]; double alpha, beta, x[MAXN], y[MAXN], z[MAXN], src[MAXH], *dest, time0, time1, time, *Time, mintime, maxtime, nflops, r, g0, l0, g, l, t[MAXH+1]; size_t pin[100]; // Determine p // start: new code for pinning for (i=0; i< tnode->length; i++) pin[i] = tnode->sons[i]->index; mcbsp_set_pinning( pin, tnode->length ); bsp_begin(tnode->length); // end: new code for pinning p= bsp_nprocs(); // p = number of processors obtained s= bsp_pid(); // s = processor number Time= vecallocd(p); bsp_push_reg(Time,p*SZDBL); dest= vecallocd(2*(MAXH+p)); bsp_push_reg(dest,(2*(MAXH+p))*SZDBL); bsp_sync(); // Determine r for (n=1; n < MAXN; n *= 2){ // Initialize scalars and vectors alpha= 1.0/3.0; beta= 4.0/9.0; for (i=0; i<n; i++){ z[i]= y[i]= x[i]= (double)i; } // Measure time of 2*NITERS DAXPY operations of length n time0=bsp_time(); for (iter=0; iter<NITERS; iter++){ for (i=0; i<n; i++) y[i] += alpha*x[i]; for (i=0; i<n; i++) z[i] -= beta*x[i]; } time1= bsp_time(); time= time1-time0; bsp_put(0,&time,Time,s*SZDBL,SZDBL); bsp_sync(); // Processor 0 determines minimum, maximum, average computing rate if (s==0){ mintime= maxtime= Time[0]; for(s1=1; s1<p; s1++){ mintime= MIN(mintime,Time[s1]); maxtime= MAX(maxtime,Time[s1]); } if (mintime>0.0){ // Compute r = average computing rate in flop/s nflops= 4*NITERS*n; r= 0.0; for(s1=0; s1<p; s1++) r += nflops/Time[s1]; r /= p; //printf("n= %5d min= %7.3lf max= %7.3lf av= %7.3lf Mflop/s ", // n, nflops/(maxtime*MEGA),nflops/(mintime*MEGA), r/MEGA); //fflush(stdout); // Output for fooling benchmark-detecting compilers printf( "", y[n-1]+z[n-1] ); } } } // Determine g and l for (h=0; h<=MAXH; h++){ // Initialize communication pattern for (i=0; i<h; i++){ src[i]= (double)i; if (p==1){ destproc[i]=0; destindex[i]=i; } else { // destination processor is one of the p-1 others destproc[i]= (s+1 + i%(p-1)) %p; 
// destination index is in my own part of dest destindex[i]= s + (i/(p-1))*p; } } for (i=0; i<h; i++){ src[i]= (double)i; if (p==1){ destproc[i]=0; destindex[i]=i; } else { // destination processor is one of the p-1 others destproc[i]= (s+1 + i%(p-1)) %p; // destination index is in my own part of dest destindex[i]= s + (i/(p-1))*p; } } // Measure time of NITERS h-relations bsp_sync(); time0= bsp_time(); for (iter=0; iter<NITERS; iter++){ for (i=0; i<h; i++) { //bsp_get(0, dest, destindex[i]*SZDBL, &src[i] , SZDBL); //bsp_get(destproc[i], dest, destindex[i]*SZDBL, &src[i] , SZDBL); bsp_put(destproc[i], &src[i] , dest , destindex[i]*SZDBL, SZDBL); } //if (s == 0) // bsp_get(0, dest, destindex[i]*SZDBL, &src[i] , SZDBL); bsp_sync(); } time1= bsp_time(); time= time1-time0; // Compute time of one h-relation if (s==0){ t[h]= (time*r)/NITERS; //#define SEHLOC_BENCH_VERBOSE #ifdef SEHLOC_BENCH_VERBOSE char strnodes[256]; sprintf(strnodes, ""); for (i=0; i<tnode->length; i++) { sprintf(strnodes, "%s %d", strnodes, tnode->sons[i]->index); } printf("SEH# Level%d %5d %lf %8.0lf\n", tnode->level, h, time/NITERS, t[h]); fflush(stdout); #endif } } if (s==0){ leastsquares(0,p,t,&g0,&l0); printf("Range h=0 to p : g= %.1lf, l= %.1lf\n",g0,l0); leastsquares(p,MAXH,t,&g,&l); g=(g>0)? g: g0*2; printf("Range h=p to HMAX: g= %.1lf, l= %.1lf\n",g,l); //printf("plot# %d %.1lf %.1lf\n",tnode->level, g,l); printf("The bottom line for this MultiBSP component is:\n"); printf("<p= %d, r= %.3lf Mflop/s, g= %.1lf, l= %.1lf>\n", p,r/MEGA,g,l); fflush(stdout); } bsp_pop_reg(dest); vecfreed(dest); bsp_pop_reg(Time); vecfreed(Time); bsp_end(); } /* end bspbench */
/* * function that assigns the nonzeros of matrix either to Ar or Ac */ struct twomatrices localview(struct sparsematrix* matrix){ /* dividing between A1 and A2 */ struct twomatrices A = split_matrix(matrix,1.0,2.0); struct sparsematrix* A1 = &(A.Ar); struct sparsematrix* A2 = &(A.Ac); /* explicit saving of m,n for brevity */ int m = matrix->m; int n = matrix->n; /* * building the bookkeeping vectors * nzXr = nonzeros in the rows of AX * nzXc = nonzeros in the columns of AX * nzr,nzc = nonzeros in row/col of matrix */ long* nz1r = nnz(A1->i, A1->NrNzElts, m); long* nz2r = nnz(A2->i, A2->NrNzElts, m); long* nzr = nnz(matrix->i, matrix->NrNzElts, m); long* nz1c = nnz(A1->j, A1->NrNzElts, n); long* nz2c = nnz(A2->j, A2->NrNzElts, n); long* nzc = nnz(matrix->j, matrix->NrNzElts, n); /* storing the number of nonzeros that have to be assigned */ int len = matrix->NrNzElts; /* * initialization of the new vectors to be populated * assuming everything is assigned to one and the other stays empty * the max size is matrix.NrNzElts (len) */ long* ir = vecallocl(len); long* jr = vecallocl(len); long* ic = vecallocl(len); long* jc = vecallocl(len); /* counters for filling of ir,jr and ic,jc */ int index_r = 0; int index_c = 0; int i,j,k; k = 0; while(len>0){ /* TODO k randomly chosen between 0 and len */ /* k = randi(len); */ /* computing explicitly row and column of the k-th element of the matrix */ i = matrix->i[k]; j = matrix->j[k]; /* computing whether i,j are split */ int rowsplit = (nz1r[i] && nz2r[i]); int colsplit = (nz1c[j] && nz2c[j]); /* actual assignment of the nonzero */ if (!xor(rowsplit,colsplit)){ if (nzr[i]<nzc[j]){ ir[index_r] = i; jr[index_r] = j; index_r++; } else { ic[index_c] = i; jc[index_c] = j; index_c++; } } else { if (rowsplit) { ic[index_c] = i; jc[index_c] = j; index_c++; } else { ir[index_r] = i; jr[index_r] = j; index_r++; } } /* * putting the last element that could be chosen instead of the * k-th one, and we reduce the interval for randi by 1 */ 
/* matrix.i[k] = matrix.i[len-1]; matrix.j[k] = matrix.j[len-1]; */ k++; len--; } /* creation of vectors of the right size */ long* ir_n = vecallocl(index_r); long* jr_n = vecallocl(index_r); long* ic_n = vecallocl(index_c); long* jc_n = vecallocl(index_c); /* copying only the filled part */ memcpy(ir_n,ir,index_r*SZLONG); memcpy(jr_n,jr,index_r*SZLONG); memcpy(ic_n,ic,index_c*SZLONG); memcpy(jc_n,jc,index_c*SZLONG); /* creating the (dummy) values for the nonzeros */ double* val_r = vecallocd(index_r); double* valc_r = vecallocd(index_r); double* val_c = vecallocd(index_c); double* valc_c = vecallocd(index_c); for(k=0;k<index_r;k++){ val_r[k] = 1.0; valc_r[k] = 0.0; } for(k=0;k<index_c;k++){ val_c[k] = 1.0; valc_c[k] = 0.0; } /* explicit creation of the final matrices */ struct sparsematrix Ar; MMSparseMatrixInit(&Ar); Ar.NrNzElts = index_r; Ar.m = m; Ar.n = n; Ar.i = ir_n; Ar.j = jr_n; Ar.ReValue = val_r; Ar.ImValue = valc_r; struct sparsematrix Ac; MMSparseMatrixInit(&Ac); Ac.NrNzElts = index_c; Ac.i = ic_n; Ac.j = jc_n; Ac.m = m; Ac.n = n; Ac.ReValue = val_c; Ac.ImValue = valc_c; /* freeing memory from unnecessary arrays */ vecfreel(ir); vecfreel(jr); vecfreel(ic); vecfreel(jc); vecfreel(nz1c); vecfreel(nz2c); vecfreel(nzc); vecfreel(nz1r); vecfreel(nz2r); vecfreel(nzr); MMDeleteSparseMatrix(A1); MMDeleteSparseMatrix(A2); /* explicit construction of the output */ struct twomatrices output; output.Ar = Ar; output.Ac = Ac; return output; }