void time_gpu(int TA, int TB, int m, int k, int n) { int iter = 10; float *a = random_matrix(m,k); float *b = random_matrix(k,n); int lda = (!TA)?k:m; int ldb = (!TB)?n:k; float *c = random_matrix(m,n); float *a_cl = cuda_make_array(a, m*k); float *b_cl = cuda_make_array(b, k*n); float *c_cl = cuda_make_array(c, m*n); int i; clock_t start = clock(), end; for(i = 0; i<iter; ++i){ gemm_gpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n); cudaThreadSynchronize(); } double flop = ((double)m)*n*(2.*k + 2.)*iter; double gflop = flop/pow(10., 9); end = clock(); double seconds = sec(end-start); printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds); cuda_free(a_cl); cuda_free(b_cl); cuda_free(c_cl); free(a); free(b); free(c); }
void time_gpu_random_matrix(int TA, int TB, int m, int k, int n) { float *a; if(!TA) a = random_matrix(m,k); else a = random_matrix(k,m); int lda = (!TA)?k:m; float *b; if(!TB) b = random_matrix(k,n); else b = random_matrix(n,k); int ldb = (!TB)?n:k; float *c = random_matrix(m,n); int i; clock_t start = clock(), end; for(i = 0; i<32; ++i){ gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); } end = clock(); printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC); free(a); free(b); free(c); }
void test_gpu_accuracy(int TA, int TB, int m, int k, int n) { srand(0); float *a; if(!TA) a = random_matrix(m,k); else a = random_matrix(k,m); int lda = (!TA)?k:m; float *b; if(!TB) b = random_matrix(k,n); else b = random_matrix(n,k); int ldb = (!TB)?n:k; float *c = random_matrix(m,n); float *c_gpu = random_matrix(m,n); memset(c, 0, m*n*sizeof(float)); memset(c_gpu, 0, m*n*sizeof(float)); int i; //pm(m,k,b); gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n); //printf("GPU\n"); //pm(m, n, c_gpu); gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); //printf("\n\nCPU\n"); //pm(m, n, c); double sse = 0; for(i = 0; i < m*n; ++i) { //printf("%f %f\n", c[i], c_gpu[i]); sse += pow(c[i]-c_gpu[i], 2); } printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g SSE\n",m,k,k,n, TA, TB, sse/(m*n)); free(a); free(b); free(c); free(c_gpu); }