/**
 * Benchmark driver for mat_mul_vec().
 *
 * Builds a random dim x dim matrix and a random vector of length dim, then
 * times one call of mat_mul_vec() inside an OpenMP parallel region for every
 * team size from 1 to MAX_THREADS. Results are written to stdout as CSV with
 * the header "dim,nthreads,time".
 *
 * Usage: ./matmul dim
 *
 * @param argc  must be 2
 * @param argv  argv[1] is the matrix dimension, a positive decimal integer
 * @return 0 on success; the process exits with status 1 on a bad command line
 */
int main(int argc, char* argv[])
{
    if (argc != 2) {
        fprintf(stderr, "Usage: ./matmul dim\n");
        exit(1);
    }

    /*
     * Parse the dimension strictly: atoi() cannot report errors, so input
     * such as "abc", "12x", "0" or "-5" would otherwise be handed to the
     * allocation helpers as a bogus size.
     */
    char *end = NULL;
    long parsed = strtol(argv[1], &end, 10);
    int dim = (int)parsed;
    if (end == argv[1] || *end != '\0' || parsed <= 0 || (long)dim != parsed) {
        fprintf(stderr, "dim must be a positive integer, got \"%s\"\n", argv[1]);
        fprintf(stderr, "Usage: ./matmul dim\n");
        exit(1);
    }

    /*
     * NOTE(review): a and b are never released. How rand_mat()/rand_vec()
     * allocate is not visible here, so no free() is attempted.
     */
    double **a = rand_mat(dim, dim);
    double *b = rand_vec(dim);

    /* Turn off dynamic team sizing so num_threads(nthreads) is honoured. */
    omp_set_dynamic(0);

    printf("dim,nthreads,time\n");

    for (int nthreads = 1; nthreads <= MAX_THREADS; nthreads++) {
        double start_time = omp_get_wtime();

        /*
         * Every thread of the team executes this call.
         * NOTE(review): presumably mat_mul_vec() shares the work internally
         * (e.g. an orphaned "omp for") - confirm, otherwise each thread
         * repeats the full product.
         */
        #pragma omp parallel num_threads(nthreads)
        {
            mat_mul_vec(a, b, dim, dim);
        }

        double total_time = omp_get_wtime() - start_time;
        printf("%d,%d,%f\n", dim, nthreads, total_time);
    }

    return 0;
}
/* Convert two clock() readings into elapsed milliseconds. */
static float clock_diff_ms(clock_t begin, clock_t end)
{
    float ticks = (float)(end - begin);
    return ticks / (float)(CLOCKS_PER_SEC / 1000.0);
}

/**
 * Timing and validation driver for the matrix-multiply variants.
 *
 * Creates two random 1000 x 1000 float matrices, times mat_mult_v1,
 * mat_mult_v2 and mat_mult_v3 on the host and mat_mult_gpu_v1 on device
 * copies of the inputs, then prints the mean absolute difference between
 * the GPU result and the mat_mult_v1 result.
 *
 * @return 0 always
 */
int main()
{
    /** initialize A **/
    printf("init A\n");
    float *A = NULL;
    size_t nr_A = 1000;
    size_t nc_A = 1000;
    rand_mat(&A, nr_A, nc_A);

    /** initialize B **/
    printf("init B\n");
    float *B = NULL;
    size_t nr_B = 1000;
    size_t nc_B = 1000;
    rand_mat(&B, nr_B, nc_B);

    /** initialize C **/
    printf("init C\n");
    float *C = NULL;
    size_t nr_C = 0;
    size_t nc_C = 0;

    /* timing variables */
    clock_t begin;
    clock_t end;

    /** test mat_mult_v1: its output is kept as the reference result **/
    float *C_true = NULL;
    printf("test mat_mult_v1\n");
    begin = clock();
    mat_mult_v1(A, nr_A, nc_A, B, nr_B, nc_B, &C_true, &nr_C, &nc_C);
    end = clock();
    printf("took %f ms\n", clock_diff_ms(begin, end));

    /** test mat_mult_v2 **/
    printf("test mat_mult_v2\n");
    begin = clock();
    mat_mult_v2(A, nr_A, nc_A, B, nr_B, nc_B, &C, &nr_C, &nc_C);
    end = clock();
    printf("took %f ms\n", clock_diff_ms(begin, end));
    free(C);
    C = NULL; /* &C is handed to the next variant; do not pass a dangling pointer */

    /** test mat_mult_v3 **/
    printf("test mat_mult_v3\n");
    begin = clock();
    mat_mult_v3(A, nr_A, nc_A, B, nr_B, nc_B, &C, &nr_C, &nc_C);
    end = clock();
    printf("took %f ms\n", clock_diff_ms(begin, end));
    free(C);
    C = NULL;

    /** test mat_mult_gpu_v1 **/
    float *gA;
    cu_safe_falloc(&gA, nr_A*nc_A);
    memcpy_htod(gA, A, nr_A*nc_A);

    float *gB;
    cu_safe_falloc(&gB, nr_B*nc_B);
    memcpy_htod(gB, B, nr_B*nc_B);

    float *gC = NULL;
    printf("test mat_mult_gpu_v1\n");
    /*
     * NOTE(review): clock() reports host processor time. If mat_mult_gpu_v1
     * returns before the device work has finished, this interval does not
     * cover the whole multiplication - confirm that it synchronizes.
     */
    begin = clock();
    mat_mult_gpu_v1(gA, nr_A, nc_A, gB, nr_B, nc_B, &gC, &nr_C, &nc_C);
    end = clock();
    printf("took %f ms\n", clock_diff_ms(begin, end));

    /* Copy the device result back and compare it with the reference. */
    size_t n_elems = nr_C*nc_C;
    float *C_validate = (float*) safe_calloc(n_elems, sizeof(float));
    memcpy_dtoh(C_validate, gC, n_elems);

    /* Accumulate in double: a float sum over ~10^6 terms loses precision. */
    double sum_diff = 0.0;
    for (size_t i = 0; i < n_elems; i++) {
        sum_diff += fabs(C_validate[i] - C_true[i]);
    }
    sum_diff = sum_diff / (double)n_elems;
    printf("diff: %f\n", sum_diff);

    /* free memory */
    cu_free(gA);
    cu_free(gB);
    cu_free(gC);

    /*
     * Host buffers. C_true is released the same way the outputs of
     * mat_mult_v2/v3 are released above; C_validate comes from safe_calloc
     * (assumed to wrap calloc - TODO confirm).
     * NOTE(review): A and B are still not released because the allocator
     * used by rand_mat() is not visible here.
     */
    free(C_validate);
    free(C_true);

    return 0;
}
/**
 * Benchmark driver comparing mat_mul() with mat_mul_blocked().
 *
 * Reads M and N from the command line, fills A and B with random values,
 * and times one call of mat_mul() after an untimed first call. When built
 * with BLCK defined it does the same for mat_mul_blocked() and then compares
 * the two results element-wise; if the mean relative difference exceeds
 * 1e-12 the per-element differences are written to "diffs.out".
 *
 * @param argc  must be 3
 * @param argv  argv[1] = M, argv[2] = N, both positive integers
 * @return 0 on success, 1 on a bad command line
 */
int main(int argc, char *argv[])
{
    int nargs = 3;
    if (argc != nargs) {
        usage(argv);
        return 1;
    }

    int M = atoi(argv[1]);
    int N = atoi(argv[2]);
    /*
     * atoi() yields 0 for non-numeric input, and a negative value would be
     * converted to a huge size_t in the allocation sizes below.
     */
    if (M <= 0 || N <= 0) {
        usage(argv);
        return 1;
    }

    /* rand_mat is called with (M, N) for A and (N, M) for B; both hold M*N doubles. */
    double *A = alloc(sizeof(double)*M*N);
    double *B = alloc(sizeof(double)*M*N);
    rand_mat(A, M, N);
    rand_mat(B, N, M);

    double *C = alloc(sizeof(double)*M*M);
    zero_mat(C, M, M);

    /* Untimed first call (presumably a warm-up) before the measured one. */
    mat_mul(C, M, N, A, B);
    {
        /* The value from stop_watch(t0) is printed below as elapsed seconds. */
        double t0 = stop_watch(0);
        mat_mul(C, M, N, A, B);
        t0 = stop_watch(t0);
        double beta_fp = 0 /* _TODO_A_ calculate beta_fp from timing t0 */;
        printf(" ORIG: M = %d, N = %d,", M, N);
        printf(" took: %4.2e sec,", t0);
        printf(" P = %4.2e Mflop/s\n", beta_fp);
    }

#ifdef BLCK
    double *Cb = alloc(sizeof(double)*M*M);
    zero_mat(Cb, M, M);

    /* Same untimed-then-timed sequence for the blocked variant. */
    mat_mul_blocked(Cb, M, N, A, B);
    {
        double t0 = stop_watch(0);
        mat_mul_blocked(Cb, M, N, A, B);
        t0 = stop_watch(t0);
        double beta_fp = 0 /* _TODO_A_ calculate beta_fp from timing t0 */;
        printf(" BLCK: M = %d, N = %d,", M, N);
        printf(" took: %4.2e sec,", t0);
        printf(" P = %4.2e Mflop/s, BM = %d, BN = %d\n", beta_fp, BM, BN);
    }
#endif

#ifdef BLCK
    double eps = 1e-12;
    double diff = 0;
    /*
     * Sum of element-wise relative differences.
     * NOTE(review): an entry of C that is exactly zero makes its term
     * inf or NaN, and a NaN sum never compares greater than eps.
     */
    for (int i = 0; i < M*M; i++) {
        diff += fabs((C[i] - Cb[i])/C[i]);
    }

    /*
     * If the difference between the flat and blocked result is larger
     * than eps, complain to stdout and write the per-element relative
     * differences to file "diffs.out".
     */
    diff /= (double)M*M;
    if (diff > eps) {
        printf(" Non zero diff: %e\n", diff);
        FILE *fp = fopen("diffs.out", "w");
        if (fp == NULL) {
            /* Writing through a NULL stream is undefined; report and move on. */
            perror("diffs.out");
        } else {
            for (int i = 0; i < M*M; i++) {
                fprintf(fp, "%e\n", fabs((C[i]-Cb[i])/C[i]));
            }
            fclose(fp);
        }
    }
#endif

    free(A);
    free(B);
    free(C);
#ifdef BLCK
    free(Cb);
#endif
    return 0;
}