void backward_connected_layer_gpu(connected_layer l, network_state state)
{
    int i;
    /* Clip the incoming gradient, then scale it by the activation derivative. */
    constrain_ongpu(l.outputs*l.batch, 5, l.delta_gpu, 1);
    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
    /* Accumulate bias gradients, one axpy per batch element. */
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
    }

    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, state);
    }

    /* Weight gradient: weight_updates (outputs x inputs) += delta^T (outputs x batch) * input (batch x inputs). */
    int m = l.outputs;
    int k = l.batch;
    int n = l.inputs;
    float *a = l.delta_gpu;
    float *b = state.input;
    float *c = l.weight_updates_gpu;
    gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n);

    /* Input gradient: state.delta (batch x inputs) += delta (batch x outputs) * weights (outputs x inputs). */
    m = l.batch;
    k = l.outputs;
    n = l.inputs;
    a = l.delta_gpu;
    b = l.weights_gpu;
    c = state.delta;
    if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
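/* The two GEMMs above follow the row-major convention used throughout this code:
 * gemm_ongpu(TA,TB,M,N,K,ALPHA,A,lda,B,ldb,BETA,C,ldc) updates C (M rows, N columns)
 * to ALPHA*op(A)*op(B) + BETA*C. Below is a minimal standalone CPU sketch of the
 * TA=1 weight-gradient step; the function and variable names are illustrative,
 * not Darknet's, and it is meant only to show the indexing. */
#include <stdio.h>

/* weight_updates[o][i] += sum_b delta[b][o] * input[b][i], all row-major. */
static void weight_grad_ref(int batch, int inputs, int outputs,
                            const float *delta,         /* batch x outputs   */
                            const float *input,         /* batch x inputs    */
                            float *weight_updates)      /* outputs x inputs  */
{
    for (int o = 0; o < outputs; ++o) {
        for (int i = 0; i < inputs; ++i) {
            float sum = 0;
            for (int b = 0; b < batch; ++b) {
                sum += delta[b*outputs + o] * input[b*inputs + i];
            }
            weight_updates[o*inputs + i] += sum;
        }
    }
}

int main(void)
{
    /* Tiny example: batch=2, inputs=3, outputs=2. */
    float delta[] = {1, 2,   3, 4};
    float input[] = {1, 0, 1,   0, 1, 1};
    float dw[6] = {0};
    weight_grad_ref(2, 3, 2, delta, input, dw);
    for (int o = 0; o < 2; ++o) {
        for (int i = 0; i < 3; ++i) printf("%g ", dw[o*3 + i]);
        printf("\n");
    }
    return 0;
}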
void forward_local_layer_gpu(const local_layer l, network_state state)
{
    int out_h = local_out_height(l);
    int out_w = local_out_width(l);
    int i, j;
    int locations = out_h * out_w;

    /* Initialize the output with the biases (every output element of a local layer has its own bias). */
    for(i = 0; i < l.batch; ++i){
        copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1, state.st_handle.stream);
    }

    for(i = 0; i < l.batch; ++i){
        /* Unfold the input image into columns, one column per output location. */
        float *input = state.input + i*l.w*l.h*l.c;
        im2col_ongpu(input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image_gpu, state.st_handle.stream);
        float *output = l.output_gpu + i*l.outputs;
        /* Unlike a convolution, each location has its own filter bank,
         * so the filters are applied with one small GEMM per location. */
        for(j = 0; j < locations; ++j){
            float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
            float *b = l.col_image_gpu + j;
            float *c = output + j;

            int m = l.n;
            int n = 1;
            int k = l.size*l.size*l.c;

            //printf("reached forward_local_layer_gpu\n");
            gemm_ongpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations, state.st_handle);
        }
    }
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, state.st_handle.stream);
}
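/* The per-location GEMMs above rely on the column layout produced by im2col:
 * the column buffer is a (channels*ksize*ksize) x (out_h*out_w) row-major matrix,
 * so patch element r of output location j sits at col[r*locations + j], which is
 * why b is indexed with offset j and leading dimension `locations`. A standalone
 * CPU sketch of that layout follows; it illustrates the indexing and is not the
 * actual im2col_ongpu kernel. */
#include <stdio.h>

static void im2col_ref(const float *im, int channels, int height, int width,
                       int ksize, int stride, int pad, float *data_col)
{
    int out_h = (height + 2*pad - ksize)/stride + 1;
    int out_w = (width  + 2*pad - ksize)/stride + 1;
    int locations = out_h*out_w;
    int rows = channels*ksize*ksize;

    for (int r = 0; r < rows; ++r) {
        int w_off = r % ksize;
        int h_off = (r / ksize) % ksize;
        int c_im  = r / ksize / ksize;
        for (int h = 0; h < out_h; ++h) {
            for (int w = 0; w < out_w; ++w) {
                int im_row = h*stride + h_off - pad;
                int im_col = w*stride + w_off - pad;
                float val = 0;   /* zero padding outside the image */
                if (im_row >= 0 && im_row < height && im_col >= 0 && im_col < width)
                    val = im[(c_im*height + im_row)*width + im_col];
                data_col[r*locations + h*out_w + w] = val;
            }
        }
    }
}

int main(void)
{
    /* 1-channel 4x4 image, 3x3 patches, stride 1, no padding -> 2x2 = 4 locations. */
    float im[16];
    for (int i = 0; i < 16; ++i) im[i] = (float)i;
    float col[9*4];
    im2col_ref(im, 1, 4, 4, 3, 1, 0, col);
    for (int r = 0; r < 9; ++r) {
        for (int j = 0; j < 4; ++j) printf("%4.0f", col[r*4 + j]);
        printf("\n");
    }
    return 0;
}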
void time_ongpu(int TA, int TB, int m, int k, int n)
{
    int iter = 10;
    float *a = random_matrix(m,k);
    float *b = random_matrix(k,n);

    int lda = (!TA)?k:m;
    int ldb = (!TB)?n:k;

    float *c = random_matrix(m,n);

    float *a_cl = cuda_make_array(a, m*k);
    float *b_cl = cuda_make_array(b, k*n);
    float *c_cl = cuda_make_array(c, m*n);

    int i;
    clock_t start = clock(), end;
    for(i = 0; i < iter; ++i){
        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
        cudaThreadSynchronize();
    }
    /* Count multiply-adds plus the accumulation into C, then report GFLOPS. */
    double flop = ((double)m)*n*(2.*k + 2.)*iter;
    double gflop = flop/pow(10., 9);
    end = clock();
    double seconds = sec(end-start);
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",
           m,k,k,n, TA, TB, seconds, gflop/seconds);
    cuda_free(a_cl);
    cuda_free(b_cl);
    cuda_free(c_cl);
    free(a);
    free(b);
    free(c);
}
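/* A rough usage sketch for the benchmark above. The driver function here is
 * hypothetical (not part of the original code); time_ongpu and its transpose
 * flags are taken from the function as written. For m=n=k=1024 and iter=10 the
 * formula above counts roughly 2.1e10 floating-point operations per run. */
void run_gemm_benchmarks(void)
{
    time_ongpu(0, 0, 1024, 1024, 1024);   /* C = A * B    */
    time_ongpu(0, 1, 1024, 1024, 1024);   /* C = A * B^T  */
    time_ongpu(1, 0, 1024, 1024, 1024);   /* C = A^T * B  */
}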
void backward_local_layer_gpu(local_layer l, network_state state)
{
    int i, j;
    int locations = l.out_w*l.out_h;

    /* Scale the incoming gradient by the activation derivative,
     * then accumulate bias gradients per batch element. */
    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu, state.st_handle.stream);
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1, state.st_handle.stream);
    }

    for(i = 0; i < l.batch; ++i){
        float *input = state.input + i*l.w*l.h*l.c;
        im2col_ongpu(input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image_gpu, state.st_handle.stream);

        /* Weight gradients: one small outer-product GEMM per output location. */
        for(j = 0; j < locations; ++j){
            float *a = l.delta_gpu + i*l.outputs + j;
            float *b = l.col_image_gpu + j;
            float *c = l.weight_updates_gpu + j*l.size*l.size*l.c*l.n;
            int m = l.n;
            int n = l.size*l.size*l.c;
            int k = 1;

            //printf("reached backward_local_layer_gpu, first call\n");
            gemm_ongpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n, state.st_handle);
        }

        if(state.delta){
            /* Input gradients: back-project through the per-location filters,
             * then fold the column buffer back into image layout with col2im. */
            for(j = 0; j < locations; ++j){
                float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
                float *b = l.delta_gpu + i*l.outputs + j;
                float *c = l.col_image_gpu + j;

                int m = l.size*l.size*l.c;
                int n = 1;
                int k = l.n;

                //printf("reached backward_local_layer_gpu, second call\n");
                gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations, state.st_handle);
            }
            col2im_ongpu(l.col_image_gpu, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w, state.st_handle.stream);
        }
    }
}
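/* col2im is the adjoint of im2col: it scatters each column entry back to its
 * source pixel and sums where patches overlap, which is what turns the
 * per-location gradients in col_image_gpu into an image-shaped input gradient.
 * Below is a CPU sketch of that operation using the same column layout as the
 * im2col_ref sketch earlier; it is illustrative, not the col2im_ongpu kernel,
 * and is meant to be read alongside that sketch. */
void col2im_ref(const float *data_col, int channels, int height, int width,
                int ksize, int stride, int pad, float *im)
{
    int out_h = (height + 2*pad - ksize)/stride + 1;
    int out_w = (width  + 2*pad - ksize)/stride + 1;
    int locations = out_h*out_w;
    int rows = channels*ksize*ksize;

    for (int r = 0; r < rows; ++r) {
        int w_off = r % ksize;
        int h_off = (r / ksize) % ksize;
        int c_im  = r / ksize / ksize;
        for (int h = 0; h < out_h; ++h) {
            for (int w = 0; w < out_w; ++w) {
                int im_row = h*stride + h_off - pad;
                int im_col = w*stride + w_off - pad;
                /* Entries that came from zero padding fall outside the image and are dropped. */
                if (im_row >= 0 && im_row < height && im_col >= 0 && im_col < width)
                    im[(c_im*height + im_row)*width + im_col] += data_col[r*locations + h*out_w + w];
            }
        }
    }
}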
void backward_local_layer_gpu(local_layer l, network_state state)
{
    int i, j;
    int locations = l.out_w*l.out_h;

    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
    }

    for(i = 0; i < l.batch; ++i){
        float *input = state.input + i*l.w*l.h*l.c;
        im2col_ongpu(input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image_gpu);

        for(j = 0; j < locations; ++j){
            float *a = l.delta_gpu + i*l.outputs + j;
            float *b = l.col_image_gpu + j;
            float *c = l.filter_updates_gpu + j*l.size*l.size*l.c*l.n;
            int m = l.n;
            int n = l.size*l.size*l.c;
            int k = 1;

            gemm_ongpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
        }

        if(state.delta){
            for(j = 0; j < locations; ++j){
                float *a = l.filters_gpu + j*l.size*l.size*l.c*l.n;
                float *b = l.delta_gpu + i*l.outputs + j;
                float *c = l.col_image_gpu + j;

                int m = l.size*l.size*l.c;
                int n = 1;
                int k = l.n;

                gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
            }
            col2im_ongpu(l.col_image_gpu, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
        }
    }
}
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
              float *A, int lda,
              float *B, int ldb,
              float BETA,
              float *C, int ldc)
{
    /* Host-side convenience wrapper: copy the operands to the GPU,
     * run the GEMM there, and pull the result back into C. */
    float *A_gpu = cuda_make_array(A, (TA ? lda*K : lda*M));
    float *B_gpu = cuda_make_array(B, (TB ? ldb*N : ldb*K));
    float *C_gpu = cuda_make_array(C, ldc*M);

    gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);

    cuda_pull_array(C_gpu, C, ldc*M);
    cuda_free(A_gpu);
    cuda_free(B_gpu);
    cuda_free(C_gpu);
}
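/* All of the GEMM calls in this file share one convention: C (M x N, row-major,
 * leading dimension ldc) becomes ALPHA*op(A)*op(B) + BETA*C, where op transposes
 * its argument when the corresponding flag is set. The standalone CPU reference
 * below is a sketch of that convention for checking shapes and leading dimensions;
 * it is not Darknet's own CPU GEMM. */
#include <stdio.h>

/* When TA is set, A is stored K x M with leading dim lda; otherwise M x K.
 * When TB is set, B is stored N x K with leading dim ldb; otherwise K x N. */
static void gemm_ref(int TA, int TB, int M, int N, int K, float ALPHA,
                     const float *A, int lda,
                     const float *B, int ldb,
                     float BETA,
                     float *C, int ldc)
{
    for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
            float sum = 0;
            for (int k = 0; k < K; ++k) {
                float a = TA ? A[k*lda + m] : A[m*lda + k];
                float b = TB ? B[n*ldb + k] : B[k*ldb + n];
                sum += a*b;
            }
            C[m*ldc + n] = ALPHA*sum + BETA*C[m*ldc + n];
        }
    }
}

int main(void)
{
    /* 2x3 times 3x2 with no transposes: C = A*B. */
    float A[] = {1, 2, 3,
                 4, 5, 6};
    float B[] = {1, 0,
                 0, 1,
                 1, 1};
    float C[4] = {0};
    gemm_ref(0, 0, 2, 2, 3, 1, A, 3, B, 2, 0, C, 2);
    printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  /* expect 4 5 / 10 11 */
    return 0;
}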
void forward_connected_layer_gpu(connected_layer l, network_state state)
{
    int i;
    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);

    /* output (batch x outputs) = input (batch x inputs) * weights^T (inputs x outputs). */
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights_gpu;
    float *c = l.output_gpu;
    gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);

    if(l.batch_normalize){
        forward_batchnorm_layer_gpu(l, state);
    }
    /* Add biases, then apply the activation function. */
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
    }
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
void forward_connected_layer_gpu(connected_layer l, network_state state)
{
    int i;
    for(i = 0; i < l.batch; ++i){
        copy_ongpu_offset(l.outputs, l.biases_gpu, 0, 1, l.output_gpu, i*l.outputs, 1);
    }
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights_gpu;
    float *c = l.output_gpu;
    gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
    /*
    cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
    float avg = mean_array(l.output, l.outputs*l.batch);
    printf("%f\n", avg);
    */
}
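/* In both variants of the forward pass the GEMM call with TB=1 dots each row of
 * the weight matrix (outputs x inputs) against the input row, so
 * output[b][o] = sum_i input[b][i]*weights[o][i] + bias[o]. The standalone CPU
 * sketch below shows that computation; names are illustrative, not Darknet's CPU path. */
#include <stdio.h>

static void connected_forward_ref(int batch, int inputs, int outputs,
                                  const float *input,    /* batch x inputs    */
                                  const float *weights,  /* outputs x inputs  */
                                  const float *biases,   /* outputs           */
                                  float *output)         /* batch x outputs   */
{
    for (int b = 0; b < batch; ++b) {
        for (int o = 0; o < outputs; ++o) {
            float sum = biases[o];
            for (int i = 0; i < inputs; ++i) {
                sum += input[b*inputs + i] * weights[o*inputs + i];
            }
            output[b*outputs + o] = sum;
        }
    }
}

int main(void)
{
    /* batch=1, inputs=3, outputs=2. */
    float input[]   = {1, 2, 3};
    float weights[] = {1, 0, 1,    /* output 0 */
                       0, 1, 1};   /* output 1 */
    float biases[]  = {0.5f, -0.5f};
    float output[2];
    connected_forward_ref(1, 3, 2, input, weights, biases, output);
    printf("%g %g\n", output[0], output[1]);  /* expect 4.5 4.5 */
    return 0;
}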