/*
 * Final step of the run: undo the partitioning of C, stop the clock,
 * print the statistics and, when CHECK_OUTPUT is defined, verify the result.
 *
 * The end timestamp is taken after C_handle has been unpartitioned and
 * unregistered, so that teardown is part of the reported time.
 */
void terminate(void)
{
    fprintf(stderr, "unpartition !!\n");

    starpu_data_unpartition(C_handle, 0);
    starpu_data_unregister(C_handle);

    gettimeofday(&end, NULL);

    /*
     * Elapsed time in microseconds.  The differences are converted to
     * double before scaling: multiplying the seconds difference by 1000000
     * in integer arithmetic overflows on platforms where the timeval
     * fields are 32 bits wide.
     */
    double timing = (double)(end.tv_sec - start.tv_sec) * 1000000.0
                  + (double)(end.tv_usec - start.tv_usec);

    display_stats(timing);

#ifdef CHECK_OUTPUT
    /* check results */
    /* compute C = C - AB */
    SGEMM("N", "N", ydim, xdim, zdim, -1.0f, A, ydim, B, zdim, 1.0f, C, ydim);

    /* make sure C = 0: the sum of absolute values must stay below the
     * tolerance of 0.001 per element */
    float err = SASUM(xdim*ydim, C, 1);

    if (err < xdim*ydim*0.001) {
        fprintf(stderr, "Results are OK\n");
    }
    else {
        fprintf(stderr, "There were errors ... err = %f\n", err);
    }
#endif // CHECK_OUTPUT
}
/*
 * slacon_ -- single-precision counterpart of dlacon_.
 *
 * Reverse-communication estimator: the routine never touches the matrix
 * itself.  Instead it returns to the caller with a request in *kase and
 * expects to be called again once x has been overwritten accordingly.
 *
 *   n     (input)         length of v, x and isgn.
 *   v     (workspace)     receives a copy of x at the points where the
 *                         estimate is updated.
 *   x     (input/output)  on an intermediate return the caller overwrites
 *                         it with A*x (*kase == 1) or transpose(A)*x
 *                         (*kase == 2) and calls again.
 *   isgn  (workspace)     sign pattern of x remembered between calls.
 *   est   (output)        the current estimate.
 *   kase  (input/output)  0 on the first call; 1 or 2 on intermediate
 *                         returns; 0 again on the final return.
 *
 * Always returns 0.  Progress between calls is kept in static locals, so
 * only one estimation can be in flight at a time.
 */
int slacon_(int *n, float *v, float *x, int *isgn, float *est, int *kase)
{
    /* Constants (the stride is passed by address to the BLAS-style helpers). */
    int   unit_stride = 1;
    float f_zero = 0.0;
    float f_one = 1.0;

    /* State carried from one call to the next. */
    static int   pass;             /* iteration counter of the main loop   */
    static int   stage, prev_peak; /* resume point; previous peak index    */
    static float flip, prev_est;   /* alternating sign; previous estimate  */
    static int   k, peak;          /* loop index; index of largest |x[k]|  */
    float candidate;

#ifdef _CRAY
    extern int ISAMAX(int *, float *, int *);
    extern float SASUM(int *, float *, int *);
    extern int SCOPY(int *, float *, int *, float *, int *);
#else
    extern int isamax_(int *, float *, int *);
    extern float sasum_(int *, float *, int *);
    extern int scopy_(int *, float *, int *, float *, int *);
#endif

#define d_sign(a, b) (b >= 0 ? fabs(a) : -fabs(a)) /* Copy sign */
#define i_dnnt(a) ( a>=0 ? floor(a+.5) : -floor(.5-a) ) /* Round to nearest integer */

    /* First call: start from the uniform vector and request A*x. */
    if ( *kase == 0 ) {
        k = 0;
        while (k < *n) {
            x[k] = 1. / (float) (*n);
            ++k;
        }
        *kase = 1;
        stage = 1;
        return 0;
    }

    /*
     * Resume at the point recorded by the previous call.  Any value other
     * than 2..5 (including 1) continues with the stage-1 code directly below.
     */
    if (stage == 2) goto after_first_atx;
    if (stage == 3) goto after_ax;
    if (stage == 4) goto after_atx;
    if (stage == 5) goto after_alt_ax;

    /* Stage 1, first iteration: x has been overwritten by A*x. */
    if (*n == 1) {
        v[0] = x[0];
        *est = fabs(v[0]);
        /* Nothing left to iterate on: finish immediately. */
        goto finished;
    }
#ifdef _CRAY
    *est = SASUM(n, x, &unit_stride);
#else
    *est = sasum_(n, x, &unit_stride);
#endif

    /* Replace x by its sign vector and remember the pattern. */
    for (k = 0; k < *n; ++k) {
        x[k] = d_sign(f_one, x[k]);
        isgn[k] = i_dnnt(x[k]);
    }
    *kase = 2;
    stage = 2;
    return 0;

    /* Stage 2, first iteration: x has been overwritten by transpose(A)*x. */
after_first_atx:
#ifdef _CRAY
    peak = ISAMAX(n, &x[0], &unit_stride);
#else
    peak = isamax_(n, &x[0], &unit_stride);
#endif
    /* Shift to a 0-based index (the helper presumably reports a 1-based
     * position -- confirm against the BLAS in use). */
    --peak;
    pass = 2;

    /* Main loop, passes 2, 3, ...: probe with the unit vector e_peak. */
next_unit_vector:
    for (k = 0; k < *n; ++k)
        x[k] = f_zero;
    x[peak] = f_one;
    *kase = 1;
    stage = 3;
    return 0;

    /* Stage 3: x has been overwritten by A*x. */
after_ax:
#ifdef _CRAY
    SCOPY(n, x, &unit_stride, v, &unit_stride);
#else
    scopy_(n, x, &unit_stride, v, &unit_stride);
#endif
    prev_est = *est;
#ifdef _CRAY
    *est = SASUM(n, v, &unit_stride);
#else
    *est = sasum_(n, v, &unit_stride);
#endif
    for (k = 0; k < *n; ++k)
        if (i_dnnt(d_sign(f_one, x[k])) != isgn[k])
            goto signs_changed;
    /* Same sign vector as before: the algorithm has converged. */
    goto final_stage;

signs_changed:
    /* Cycling test: stop iterating if the estimate did not grow. */
    if (*est <= prev_est)
        goto final_stage;

    for (k = 0; k < *n; ++k) {
        x[k] = d_sign(f_one, x[k]);
        isgn[k] = i_dnnt(x[k]);
    }
    *kase = 2;
    stage = 4;
    return 0;

    /* Stage 4: x has been overwritten by transpose(A)*x. */
after_atx:
    prev_peak = peak;
#ifdef _CRAY
    peak = ISAMAX(n, &x[0], &unit_stride);
#else
    peak = isamax_(n, &x[0], &unit_stride);
#endif
    --peak;
    /* Go round again only while the peak moved and fewer than 5 passes ran. */
    if (x[prev_peak] != fabs(x[peak]) && pass < 5) {
        ++pass;
        goto next_unit_vector;
    }

    /* Iteration complete.  Final stage: try an alternating-sign vector. */
final_stage:
    flip = 1.;
    for (k = 1; k <= *n; ++k) {
        x[k-1] = flip * ((float)(k - 1) / (float)(*n - 1) + 1.);
        flip = -flip;
    }
    *kase = 1;
    stage = 5;
    return 0;

    /* Stage 5: x has been overwritten by A*x. */
after_alt_ax:
#ifdef _CRAY
    candidate = SASUM(n, x, &unit_stride) / (float)(*n * 3) * 2.;
#else
    candidate = sasum_(n, x, &unit_stride) / (float)(*n * 3) * 2.;
#endif
    /* Keep the alternative only if it beats the current estimate. */
    if (candidate > *est) {
#ifdef _CRAY
        SCOPY(n, &x[0], &unit_stride, &v[0], &unit_stride);
#else
        scopy_(n, &x[0], &unit_stride, &v[0], &unit_stride);
#endif
        *est = candidate;
    }

finished:
    *kase = 0;
    return 0;

} /* slacon_ */
/*
 * Purpose
 * =======
 *
 * DLACON estimates the 1-norm of a square matrix A.
 * Reverse communication is used for evaluating matrix-vector products.
 *
 * Arguments
 * =========
 *
 * N      (input) INT
 *        The order of the matrix.  N >= 1.
 *
 * V      (workspace) DOUBLE PRECISION array, dimension (N)
 *        On the final return, V = A*W, where EST = norm(V)/norm(W)
 *        (W is not returned).
 *
 * X      (input/output) DOUBLE PRECISION array, dimension (N)
 *        On an intermediate return, X should be overwritten by
 *              A * X,   if KASE=1,
 *              A' * X,  if KASE=2,
 *        and DLACON must be re-called with all the other parameters
 *        unchanged.
 *
 * ISGN   (workspace) INT array, dimension (N)
 *
 * EST    (output) DOUBLE PRECISION
 *        An estimate (a lower bound) for norm(A).
 *
 * KASE   (input/output) INT
 *        On the initial call to DLACON, KASE should be 0.
 *        On an intermediate return, KASE will be 1 or 2, indicating
 *        whether X should be overwritten by A * X or A' * X.
 *        On the final return from DLACON, KASE will again be 0.
 *
 * Return value: always 0.
 *
 * Further Details
 * ===============
 *
 * Contributed by Nick Higham, University of Manchester.
 * Originally named CONEST, dated March 16, 1988.
 *
 * Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of
 * a real or complex matrix, with applications to condition estimation",
 * ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988.
 *
 * Progress between calls is kept in static locals, so only one estimation
 * can be in flight at a time.
 */
int dlacon_(int *n, double *v, double *x, int *isgn, double *est, int *kase)
{
    /* Constants (the stride is passed by address to the BLAS-style helpers). */
    int    unit_stride = 1;
    double d_zero = 0.0;
    double d_one = 1.0;

    /* State carried from one call to the next. */
    static int    pass;             /* iteration counter of the main loop   */
    static int    stage, prev_peak; /* resume point; previous peak index    */
    static double flip, prev_est;   /* alternating sign; previous estimate  */
    static int    k, peak;          /* loop index; index of largest |x[k]|  */
    double candidate;

#ifdef _CRAY
    extern int ISAMAX(int *, double *, int *);
    extern double SASUM(int *, double *, int *);
    extern int SCOPY(int *, double *, int *, double *, int *);
#else
    extern int idamax_(int *, double *, int *);
    extern double dasum_(int *, double *, int *);
    extern int dcopy_(int *, double *, int *, double *, int *);
#endif

#define d_sign(a, b) (b >= 0 ? fabs(a) : -fabs(a)) /* Copy sign */
#define i_dnnt(a) ( a>=0 ? floor(a+.5) : -floor(.5-a) ) /* Round to nearest integer */

    /* First call: start from the uniform vector and request A*x. */
    if ( *kase == 0 ) {
        k = 0;
        while (k < *n) {
            x[k] = 1. / (double) (*n);
            ++k;
        }
        *kase = 1;
        stage = 1;
        return 0;
    }

    /*
     * Resume at the point recorded by the previous call.  Any value other
     * than 2..5 (including 1) continues with the stage-1 code directly below.
     */
    if (stage == 2) goto after_first_atx;
    if (stage == 3) goto after_ax;
    if (stage == 4) goto after_atx;
    if (stage == 5) goto after_alt_ax;

    /* Stage 1, first iteration: x has been overwritten by A*x. */
    if (*n == 1) {
        v[0] = x[0];
        *est = fabs(v[0]);
        /* Nothing left to iterate on: finish immediately. */
        goto finished;
    }
#ifdef _CRAY
    *est = SASUM(n, x, &unit_stride);
#else
    *est = dasum_(n, x, &unit_stride);
#endif

    /* Replace x by its sign vector and remember the pattern. */
    for (k = 0; k < *n; ++k) {
        x[k] = d_sign(d_one, x[k]);
        isgn[k] = i_dnnt(x[k]);
    }
    *kase = 2;
    stage = 2;
    return 0;

    /* Stage 2, first iteration: x has been overwritten by transpose(A)*x. */
after_first_atx:
#ifdef _CRAY
    peak = ISAMAX(n, &x[0], &unit_stride);
#else
    peak = idamax_(n, &x[0], &unit_stride);
#endif
    /* Shift to a 0-based index (the helper presumably reports a 1-based
     * position -- confirm against the BLAS in use). */
    --peak;
    pass = 2;

    /* Main loop, passes 2, 3, ...: probe with the unit vector e_peak. */
next_unit_vector:
    for (k = 0; k < *n; ++k)
        x[k] = d_zero;
    x[peak] = d_one;
    *kase = 1;
    stage = 3;
    return 0;

    /* Stage 3: x has been overwritten by A*x. */
after_ax:
#ifdef _CRAY
    SCOPY(n, x, &unit_stride, v, &unit_stride);
#else
    dcopy_(n, x, &unit_stride, v, &unit_stride);
#endif
    prev_est = *est;
#ifdef _CRAY
    *est = SASUM(n, v, &unit_stride);
#else
    *est = dasum_(n, v, &unit_stride);
#endif
    for (k = 0; k < *n; ++k)
        if (i_dnnt(d_sign(d_one, x[k])) != isgn[k])
            goto signs_changed;
    /* Same sign vector as before: the algorithm has converged. */
    goto final_stage;

signs_changed:
    /* Cycling test: stop iterating if the estimate did not grow. */
    if (*est <= prev_est)
        goto final_stage;

    for (k = 0; k < *n; ++k) {
        x[k] = d_sign(d_one, x[k]);
        isgn[k] = i_dnnt(x[k]);
    }
    *kase = 2;
    stage = 4;
    return 0;

    /* Stage 4: x has been overwritten by transpose(A)*x. */
after_atx:
    prev_peak = peak;
#ifdef _CRAY
    peak = ISAMAX(n, &x[0], &unit_stride);
#else
    peak = idamax_(n, &x[0], &unit_stride);
#endif
    --peak;
    /* Go round again only while the peak moved and fewer than 5 passes ran. */
    if (x[prev_peak] != fabs(x[peak]) && pass < 5) {
        ++pass;
        goto next_unit_vector;
    }

    /* Iteration complete.  Final stage: try an alternating-sign vector. */
final_stage:
    flip = 1.;
    for (k = 1; k <= *n; ++k) {
        x[k-1] = flip * ((double)(k - 1) / (double)(*n - 1) + 1.);
        flip = -flip;
    }
    *kase = 1;
    stage = 5;
    return 0;

    /* Stage 5: x has been overwritten by A*x. */
after_alt_ax:
#ifdef _CRAY
    candidate = SASUM(n, x, &unit_stride) / (double)(*n * 3) * 2.;
#else
    candidate = dasum_(n, x, &unit_stride) / (double)(*n * 3) * 2.;
#endif
    /* Keep the alternative only if it beats the current estimate. */
    if (candidate > *est) {
#ifdef _CRAY
        SCOPY(n, &x[0], &unit_stride, &v[0], &unit_stride);
#else
        dcopy_(n, &x[0], &unit_stride, &v[0], &unit_stride);
#endif
        *est = candidate;
    }

finished:
    *kase = 0;
    return 0;

} /* dlacon_ */
int main( int argc, char *argv[] ) { unsigned int numspes = 1, i; unsigned int numblocks = 8, blocksize = 4; unsigned int type = 1; // There are arguments if ( argc > 1 ) { // The first argument is present if ( argc > 1 ) { numblocks = atoi( argv[1] ); } if ( argc > 2 ) { blocksize = atoi( argv[2] ); } if ( argc > 3 ) { type = atoi( argv[3] ); } if ( argc > 4 ) { numspes = atoi( argv[4] ); } } else { printf( "Usage pputest <numblocks> <blocksize^2> <type> <num spes>\n" ); printf( "type: 1: sdot, 2: sdotv, 3: snrm2, 4: snrm2v\n" ); return -1; } init( numspes ); paddingx = 0; paddingy = 1; unsigned int size, numelements = numblocks*blocksize*blocksize; printf( "Testing BLAS()\n" ); printf( "Num SPEs:\t%u\n", speThreads ); printf( "------------------\n" ); printf( "Vector size: \t\t%u\n", blocksize*blocksize*numblocks-paddingx ); printf( "Num blocks:\t\t%u\n", numblocks ); printf( "Num elements pr block: \t%u\n", blocksize*blocksize ); printf( "1 block in bytes: \t%u\n", blocksize*blocksize*4 ); printf( "Num elements:\t\t%u\n", numblocks*blocksize*blocksize ); printf( "Total size in MB: \t%f\n", (double)(numblocks*blocksize*blocksize*4)/(1024*1024) ); printf( "Total size in MB: \t%f\n", (double)(numblocks*blocksize*blocksize*4)/(1024*1024)*2 ); printf( "------------------\n" ); // PyArrayObject PyArrayObject pyobj1; PyArrayObject pyobj2; PyArrayObject pyscalar1; sMakeMatrix( numblocks, 1, blocksize, 1.0f, &pyobj1 ); sMakeMatrix( numblocks, 1, blocksize, 1.0f, &pyobj2 ); sMakeMatrix( 1, 1, 4, 2.334f, &pyscalar1 ); // printf( "First block address=%#x\n", pyobj1.blockData[0] ); // printf( "Sum is %f\n", SumBlock( pyobj1.blockData[0], blocksize ) ); unsigned int *shader; double time; switch( type ) { case 1: shader = blas_1_sdot; printf( "Calling SDOT();\n" ); size = (numblocks*blocksize*blocksize*4) * 2; time = SDOT( &pyobj1, &pyobj2, blas_1_sdot_size, shader ); break; case 2: shader = blas_1_sdotv; printf( "Calling SDOTvvvv();\n" ); size = 
(numblocks*blocksize*blocksize*4) * 2; time = SDOT( &pyobj1, &pyobj2, blas_1_sdotv_size, shader ); break; case 3: shader = snrm2; printf( "Calling SNRM2();\n" ); size = (numblocks*blocksize*blocksize*4); time = SNRM2( &pyobj1, snrm2_size, shader ); break; case 4: shader = snrm2v; printf( "Calling SNRM2vvvvv();\n" ); size = (numblocks*blocksize*blocksize*4); time = SNRM2( &pyobj1, snrm2v_size, shader ); break; case 5: shader = blas_1_sscal; printf( "Calling SSCAL();\n" ); size = (numblocks*blocksize*blocksize*4); time = SSCAL( &pyobj1, &pyscalar1, blas_1_sscal_size, shader ); break; case 7: shader = blas_1_sasum; printf( "Calling SASUM();\n" ); size = (numblocks*blocksize*blocksize*4); time = SASUM( &pyobj1, blas_1_sasum_size, shader ); break; case 8: shader = blas_1_sasumv; printf( "Calling SASUMv();\n" ); size = (numblocks*blocksize*blocksize*4); time = SASUM( &pyobj1, blas_1_sasumv_size, shader ); break; case 9: shader = blas_1_isamaxv; printf( "Calling ISAMAXv();\n" ); size = (numblocks*blocksize*blocksize*4); //ISAMAX( &pyobj1,blas_1_isamaxv_size, shader ); break; case 102: shader = blas_1_sdotv; printf( "Calling SDOT2vvvv();\n" ); size = (numblocks*blocksize*blocksize*4) * 2; time = SDOT2( &pyobj1, &pyobj2, blas_1_sdotv_size, shader ); break; } unsigned int state = 0; for ( i = 0 ; i < speThreads ; i++ ) { spe_in_mbox_write ( speData[i].spe_ctx, &state, 1, SPE_MBOX_ALL_BLOCKING ); } // Wait for all the SPE threads to complete. for ( i = 0 ; i < speThreads ; i++ ) { CompleteSPEThreads( &speData[i] ); } double GB = (double)size / (1024*1024*1024); printf( "Time: %f\n", time ); printf( "Size: %u\n", size ); printf( "GigaBytes: %f\n", GB ); printf( "GB/s: %f\n", GB/time ); printf( "GFlops: %f\n", ( numelements/time ) / ( 1024*1024*1024 ) ); return 1; }