/*
 * Benchmark driver: reads a matrix size N from stdin, builds N x N inputs
 * with initial(), runs a reference pipeline (cublas_gemm followed by the
 * *_cpu vector helpers) and a GPU pipeline (gpu_cublas1), then prints the
 * value returned by error(D_cpu, D_gpu, N2) and both wall-clock timings.
 *
 * Returns 0 on success, 1 when the input is invalid or a host allocation
 * fails.
 */
int main()
{
    int N, N2;
    double *A, *B, *C_cpu, *C_gpu, *D_cpu, *D_gpu, t1, t2, cpu_time, gpu_time;
    double r_cpu, *r_gpu, nrmC_cpu, *nrmC_gpu;

    printf(" \n Input matrix size N x N, N = ");
    /* Reject unparsable or non-positive sizes instead of using an
     * uninitialised N. */
    if (scanf("%d", &N) != 1 || N <= 0) {
        fprintf(stderr, " invalid matrix size \n");
        return 1;
    }
    printf(" N = %d \n \n", N);
    N2 = N*N;

    A        = (double *) malloc(N2*sizeof(double));
    B        = (double *) malloc(N2*sizeof(double));
    C_cpu    = (double *) malloc(N2*sizeof(double));
    C_gpu    = (double *) malloc(N2*sizeof(double));
    D_cpu    = (double *) malloc(N2*sizeof(double));
    D_gpu    = (double *) malloc(N2*sizeof(double));
    r_gpu    = (double *) malloc(1*sizeof(double));
    nrmC_gpu = (double *) malloc(1*sizeof(double));

    /* Any failed allocation aborts the run; free(NULL) is a no-op, so the
     * cleanup can be unconditional. */
    if (!A || !B || !C_cpu || !C_gpu || !D_cpu || !D_gpu || !r_gpu || !nrmC_gpu) {
        fprintf(stderr, " host memory allocation failed \n");
        free(A); free(B); free(C_cpu); free(C_gpu);
        free(D_cpu); free(D_gpu); free(r_gpu); free(nrmC_gpu);
        return 1;
    }

    initial(A, B, N);

    /* ---- reference path: GEMM inside a structured data region, then the
     * host-side vector helpers ---- */
    t1 = clock();
    #pragma acc data copyin(A[0:N2], B[0:N2]) copyout(C_cpu[0:N2])
    {
        cublas_gemm(A, B, C_cpu, N);
    }
    r_cpu = dot_cpu(C_cpu, B, N2);
    axpy_cpu(-1.0*r_cpu, B, C_cpu, N2);
    nrmC_cpu = norm_cpu(C_cpu, N2);
    copy_cpu(C_cpu, D_cpu, N2);
    scal_cpu(1.0/nrmC_cpu, D_cpu, N2);
    t2 = clock();
    cpu_time = 1.0*(t2 - t1)/CLOCKS_PER_SEC;

    /* ---- GPU path ---- */
    t1 = clock();
    /* "enter data" is an unstructured (standalone) directive; the braces
     * below are an ordinary compound statement, not a data region. */
    #pragma acc enter data copyin(A[0:N2], B[0:N2]) create(C_gpu[0:N2], r_gpu[0], nrmC_gpu[0], D_gpu[0:N2])
    {
        gpu_cublas1(A, B, C_gpu, D_gpu, r_gpu, nrmC_gpu, N, N2);
    }
    #pragma acc update host(D_gpu[0:N2])
    t2 = clock();
    gpu_time = 1.0*(t2 - t1)/CLOCKS_PER_SEC;

    /* Pair the unstructured "enter data" with its "exit data" so the device
     * copies are released before the host buffers are freed. Kept outside
     * the timed interval. */
    #pragma acc exit data delete(A[0:N2], B[0:N2], C_gpu[0:N2], r_gpu[0], nrmC_gpu[0], D_gpu[0:N2])

    printf(" gpu part success \n");
    printf(" \n error = %f \n", error(D_cpu, D_gpu, N2));
    printf(" gpu time = %f, cpu times = %f \n", gpu_time, cpu_time);

    free(A); free(B); free(C_cpu); free(C_gpu);
    free(D_cpu); free(D_gpu); free(r_gpu); free(nrmC_gpu);

    return 0;
}
/*
 * Spherical linear interpolation between two n-element vectors.
 *
 *   start, end : input vectors (read only)
 *   s          : interpolation parameter; the weights used are
 *                sin((1-s)*omega)/sin(omega) and sin(s*omega)/sin(omega)
 *   n          : number of elements
 *   out        : receives the interpolated vector, rescaled by 1/mag_array(out, n)
 *
 * NOTE(review): the angle is taken as acos of the raw dot product, so the
 * inputs are presumably expected to be unit length - confirm with callers.
 */
void slerp(float *start, float *end, float s, int n, float *out)
{
    /* Rounding can push the dot product slightly outside [-1, 1], which
     * would make acos() return NaN; clamp it first. */
    float cos_omega = dot_cpu(n, start, 1, end, 1);
    if (cos_omega > 1) cos_omega = 1;
    if (cos_omega < -1) cos_omega = -1;

    float omega = acos(cos_omega);
    float so = sin(omega);

    fill_cpu(n, 0, out, 1);
    if (fabs(so) < 1e-6) {
        /* (Nearly) parallel or anti-parallel inputs: the slerp weights
         * divide by ~0, so fall back to plain linear interpolation. */
        axpy_cpu(n, 1 - s, start, 1, out, 1);
        axpy_cpu(n, s, end, 1, out, 1);
    } else {
        axpy_cpu(n, sin((1-s)*omega)/so, start, 1, out, 1);
        axpy_cpu(n, sin(s*omega)/so, end, 1, out, 1);
    }

    /* Skip the rescale for a zero-magnitude result instead of scaling by 1/0. */
    float mag = mag_array(out, n);
    if (mag > 0) scale_array(out, n, 1./mag);
}
/*
 * Debug helper: pulls n floats from x_gpu into a temporary host buffer via
 * cuda_pull_array(), subtracts x from them, and prints
 * "Error <s>: <sqrt(err / n)>" where err is dot_cpu() of the difference
 * with itself.
 *
 *   x_gpu : buffer handed to cuda_pull_array() as the source
 *   x     : host reference values
 *   n     : number of elements
 *   s     : label printed in the message
 *
 * Returns err (the un-normalised dot product, not the printed value).
 * Returns 0 when n == 0 and -1 when the temporary buffer cannot be
 * allocated.
 */
float cuda_compare(float *x_gpu, float *x, size_t n, char *s)
{
    /* Nothing to compare; also avoids the 0/0 in the printed value. */
    if (n == 0) {
        printf("Error %s: %f\n", s, 0.0);
        return 0;
    }

    /* Explicit cast keeps this valid when the file is compiled as C++/CUDA. */
    float *tmp = (float *)calloc(n, sizeof(float));
    if (!tmp) {
        fprintf(stderr, "cuda_compare: failed to allocate %lu floats for %s\n",
                (unsigned long)n, s);
        return -1;
    }

    cuda_pull_array(x_gpu, tmp, n);
    //int i;
    //for(i = 0; i < n; ++i) printf("%f %f\n", tmp[i], x[i]);

    /* tmp <- tmp - x, then err <- dot(tmp, tmp)
     * (assuming BLAS-style axpy/dot helpers - TODO confirm). */
    axpy_cpu(n, -1, x, 1, tmp, 1);
    float err = dot_cpu(n, tmp, 1, tmp, 1);
    printf("Error %s: %f\n", s, sqrt(err / n));

    free(tmp);
    return err;
}
/*
 * GPU forward pass of the cost layer.
 *
 * Does nothing when state.truth is null. Otherwise:
 *   1. for MASKED cost, runs mask_ongpu() over state.input using state.truth;
 *   2. copies state.truth into l.delta_gpu and applies axpy_ongpu() with
 *      factor -1 and state.input (presumably delta = truth - input, assuming
 *      BLAS-style helpers - TODO confirm);
 *   3. pulls l.delta_gpu into l.delta and stores dot_cpu(delta, delta)
 *      in *l.output.
 */
void forward_cost_layer_gpu(cost_layer l, network_state state)
{
    if (!state.truth) return;

    /* Element count shared by every call below. */
    const int count = l.batch * l.inputs;

    if (l.cost_type == MASKED) {
        mask_ongpu(count, state.input, state.truth);
    }

    copy_ongpu(count, state.truth, 1, l.delta_gpu, 1);
    axpy_ongpu(count, -1, state.input, 1, l.delta_gpu, 1);

    /* The final reduction runs on the host copy of delta. */
    cuda_pull_array(l.delta_gpu, l.delta, count);
    *(l.output) = dot_cpu(count, l.delta, 1, l.delta, 1);
}
/*
 * CPU forward pass of the cost layer.
 *
 * Does nothing when state.truth is null. For MASKED cost, every input whose
 * truth entry equals 0 is overwritten with 0 first. Then state.truth is
 * copied into l.delta, axpy_cpu() is applied with factor -1 and state.input
 * (presumably delta = truth - input, assuming BLAS-style helpers - TODO
 * confirm), and dot_cpu(delta, delta) is stored in *l.output.
 */
void forward_cost_layer(cost_layer l, network_state state)
{
    if (!state.truth) return;

    /* Element count shared by the loop and the helper calls. */
    const int count = l.batch * l.inputs;

    if (l.cost_type == MASKED) {
        int idx;
        for (idx = 0; idx < count; ++idx) {
            if (state.truth[idx] == 0) {
                state.input[idx] = 0;
            }
        }
    }

    copy_cpu(count, state.truth, 1, l.delta, 1);
    axpy_cpu(count, -1, state.input, 1, l.delta, 1);
    *(l.output) = dot_cpu(count, l.delta, 1, l.delta, 1);
    //printf("cost: %f\n", *l.output);
}
/*
 * GPU forward pass of the cost layer (variant with SECRET_NUM masking and
 * SMOOTH cost).
 *
 * Does nothing when state.truth is null. Otherwise:
 *   1. for MASKED cost, runs mask_ongpu() over state.input with SECRET_NUM
 *      and state.truth;
 *   2. fills l.delta_gpu either through smooth_l1_gpu() (SMOOTH cost) or by
 *      copying state.truth and applying axpy_ongpu() with factor -1 and
 *      state.input (presumably truth - input, assuming BLAS-style helpers -
 *      TODO confirm);
 *   3. pulls l.delta_gpu into l.delta and stores dot_cpu(delta, delta)
 *      in *l.output.
 */
void forward_cost_layer_gpu(cost_layer l, network_state state)
{
    if (!state.truth) return;

    /* Element count shared by every call below. */
    const int count = l.batch * l.inputs;

    if (l.cost_type == MASKED) {
        mask_ongpu(count, state.input, SECRET_NUM, state.truth);
    }

    if (l.cost_type != SMOOTH) {
        copy_ongpu(count, state.truth, 1, l.delta_gpu, 1);
        axpy_ongpu(count, -1, state.input, 1, l.delta_gpu, 1);
    } else {
        smooth_l1_gpu(count, state.input, state.truth, l.delta_gpu);
    }

    /* The final reduction runs on the host copy of delta. */
    cuda_pull_array(l.delta_gpu, l.delta, count);
    *(l.output) = dot_cpu(count, l.delta, 1, l.delta, 1);
}
/*
 * CPU forward pass of the cost layer (variant with SECRET_NUM masking and
 * SMOOTH cost).
 *
 * Does nothing when state.truth is null. For MASKED cost, every input whose
 * truth entry equals SECRET_NUM is overwritten with SECRET_NUM first.
 * l.delta is then filled either through smooth_l1_cpu() (SMOOTH cost) or by
 * copying state.truth and applying axpy_cpu() with factor -1 and
 * state.input (presumably truth - input, assuming BLAS-style helpers - TODO
 * confirm). Finally dot_cpu(delta, delta) is stored in *l.output.
 */
void forward_cost_layer(cost_layer l, network_state state)
{
    if (!state.truth) return;

    /* Element count shared by the loop and the helper calls. */
    const int count = l.batch * l.inputs;

    if (l.cost_type == MASKED) {
        int idx;
        for (idx = 0; idx < count; ++idx) {
            if (state.truth[idx] == SECRET_NUM) {
                state.input[idx] = SECRET_NUM;
            }
        }
    }

    if (l.cost_type != SMOOTH) {
        copy_cpu(count, state.truth, 1, l.delta, 1);
        axpy_cpu(count, -1, state.input, 1, l.delta, 1);
    } else {
        smooth_l1_cpu(count, state.input, state.truth, l.delta);
    }

    *(l.output) = dot_cpu(count, l.delta, 1, l.delta, 1);
    //printf("cost: %f\n", *l.output);
}