/* Forward pass of a locally-connected layer on the GPU.
 * Unlike a convolutional layer, every spatial output location has its own
 * unshared weight block, so one small GEMM is issued per location. */
void forward_local_layer_gpu(const local_layer l, network_state state)
{
    int out_h = local_out_height(l);
    int out_w = local_out_width(l);
    int i, j;
    int locations = out_h * out_w;

    /* Seed each batch item's output with the biases; the per-location
     * GEMMs below accumulate on top of them (beta = 1). */
    for(i = 0; i < l.batch; ++i)
    {
        copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1, state.st_handle.stream);
    }
    for(i = 0; i < l.batch; ++i)
    {
        float *input = state.input + i*l.w*l.h*l.c;
        /* Unfold the input so each location's receptive-field patch becomes
         * a column of col_image_gpu. */
        im2col_ongpu(input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image_gpu, state.st_handle.stream);
        float *output = l.output_gpu + i*l.outputs;
        for(j = 0; j < locations; ++j)
        {
            /* a: this location's private weight block.
             * b/c: strided views into the unfolded input / the output,
             * hence ldb = ldc = locations below. */
            float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
            float *b = l.col_image_gpu + j;
            float *c = output + j;

            int m = l.n;                /* output filters at this location */
            int n = 1;                  /* one spatial location per GEMM */
            int k = l.size*l.size*l.c;  /* patch length */

            gemm_ongpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations, state.st_handle);
        }
    }
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, state.st_handle.stream);
}
/* GPU forward pass for a compact (split-merging) layer: combines l.index
 * equally sized slices of the input into one output block according to
 * l.method, then applies the activation function. */
void forward_compact_layer_gpu(const layer l, network_state state)
{
    int b, s;
    for (b = 0; b < l.batch; b++)
    {
        float *in  = state.input  + b*l.inputs;
        float *out = l.output_gpu + b*l.outputs;

        switch (l.method)
        {
            case 0: /* add: sum all splits element-wise */
                copy_ongpu(l.outputs, in, 1, out, 1);
                for (s = 1; s < l.index; s++)
                {
                    axpy_ongpu(l.outputs, 1, in + s*l.outputs, 1, out, 1);
                }
                break;
            case 1: /* sub: first split minus the remaining splits */
                copy_ongpu(l.outputs, in, 1, out, 1);
                for (s = 1; s < l.index; s++)
                {
                    axpy_ongpu(l.outputs, -1, in + s*l.outputs, 1, out, 1);
                }
                break;
            case 2: /* max: element-wise maximum, recording winner indexes */
                compact_forward_max_gpu(l.w, l.h, l.c, l.index, in, out, l.indexes_gpu);
                break;
            case 10: /* padd variant (see compact_forward_padd_gpu) */
                compact_forward_padd_gpu(l.w, l.h, l.c, in, out);
                break;
            case 12: /* pmax variant with winner indexes (see compact_forward_pmax_gpu) */
                compact_forward_pmax_gpu(l.w, l.h, l.c, in, out, l.indexes_gpu);
                break;
        }
    }
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
/* Forward pass of a fully connected layer on the GPU:
 * output = activation(batchnorm?(input * W^T) + bias).
 *
 * NOTE(review): another definition of forward_connected_layer_gpu appears
 * later in this source — a duplicate symbol if both are compiled into the
 * same translation unit; confirm which one is intended. */
void forward_connected_layer_gpu(connected_layer l, network_state state)
{
    int i;
    int m = l.batch;    /* rows: batch items */
    int k = l.inputs;   /* shared dim: input features */
    int n = l.outputs;  /* cols: output neurons */
    float *a = state.input;
    float *b = l.weights_gpu;
    float *c = l.output_gpu;
    /* beta = 0 makes the GEMM overwrite C without reading it, so the
     * previous fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1) pass was
     * a redundant kernel launch and has been dropped. */
    gemm_ongpu(0,1,m,n,k,1,a,k,b,k,0,c,n);
    if(l.batch_normalize){
        forward_batchnorm_layer_gpu(l, state);
    }
    /* Add the bias after (optional) batch normalization. */
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
    }
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
/* Forward pass of a fully connected layer on the GPU (older variant:
 * no batch norm; the biases are copied into the output first and the
 * GEMM then accumulates on top of them with beta = 1).
 *
 * NOTE(review): another definition of forward_connected_layer_gpu appears
 * earlier in this source — a duplicate symbol if both are compiled into
 * the same translation unit; confirm which one is intended. */
void forward_connected_layer_gpu(connected_layer l, network_state state)
{
    int i;
    /* Pre-load each batch item's output row with the biases. */
    for(i = 0; i < l.batch; ++i){
        copy_ongpu_offset(l.outputs, l.biases_gpu, 0, 1, l.output_gpu, i*l.outputs, 1);
    }
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float * a = state.input;
    float * b = l.weights_gpu;
    float * c = l.output_gpu;
    /* output = input * weights^T + output (beta = 1 preserves the biases). */
    gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
/* GPU forward pass for a shortcut (residual) layer: copies the incoming
 * activations into the output, adds in the output of the layer referenced
 * by l.index, then applies the activation function. */
void forward_shortcut_layer_gpu(const layer l, network_state state)
{
    int count = l.outputs * l.batch;
    float *from = state.net.layers[l.index].output_gpu;

    copy_ongpu(count, state.input, 1, l.output_gpu, 1);
    shortcut_gpu(l.batch, l.w, l.h, l.c, from, l.out_w, l.out_h, l.out_c, l.output_gpu);
    activate_array_ongpu(l.output_gpu, count, l.activation);
}