/*
 * Backward pass of a fully connected layer, with optional batch
 * normalization.
 *
 * Performed in this order:
 *   1. l.delta is passed through gradient_array() with the layer activation.
 *   2. Each batch row of l.delta (l.outputs floats) is added into
 *      l.bias_updates.
 *   3. If l.batch_normalize is set, the scale / mean / variance / normalize
 *      delta helpers are run over l.delta.
 *   4. One gemm call accumulates into l.weight_updates from l.delta and
 *      state.input; a second accumulates into state.delta from l.delta and
 *      l.weights, and is skipped when state.delta is NULL.
 *
 * NOTE(review): gemm is assumed to follow the BLAS-like argument order
 * (TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc) - confirm.
 */
void backward_connected_layer(connected_layer l, network_state state)
{
    const int batch = l.batch;
    const int n_out = l.outputs;
    const int n_in = l.inputs;
    int row;

    gradient_array(l.output, n_out*batch, l.activation, l.delta);

    for (row = 0; row < batch; ++row) {
        axpy_cpu(n_out, 1, l.delta + row*n_out, 1, l.bias_updates, 1);
    }

    if (l.batch_normalize) {
        backward_scale_cpu(l.x_norm, l.delta, batch, n_out, 1, l.scale_updates);
        scale_bias(l.delta, l.scales, batch, n_out, 1);
        mean_delta_cpu(l.delta, l.variance, batch, n_out, 1, l.mean_delta);
        variance_delta_cpu(l.x, l.delta, l.mean, l.variance, batch, n_out, 1,
                           l.variance_delta);
        normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta,
                            l.variance_delta, batch, n_out, 1, l.delta);
    }

    /* Weight gradient: first operand flagged as transposed. */
    gemm(1, 0, n_out, n_in, batch, 1,
         l.delta, n_out,
         state.input, n_in,
         1,
         l.weight_updates, n_in);

    /* Gradient for the previous layer, only if it asked for one. */
    if (state.delta) {
        gemm(0, 0, batch, n_in, n_out, 1,
             l.delta, n_out,
             l.weights, n_in,
             1,
             state.delta, n_in);
    }
}
/*
 * Backward pass of the detection layer: walks every (batch, location) cell
 * and adds the layer's delta into state.delta.
 *
 * Two cursors advance through the buffers: in_pos indexes state.input and
 * state.delta, out_pos indexes l.delta and l.output. Per cell:
 *   - l.joint:      one input value is consumed and used as `scale`
 *                   (out_pos does not move);
 *   - l.objectness: one delta is subtracted from state.delta;
 *   - l.background: one delta (times scale, which is 1 here) is added;
 *   - l.classes class deltas are added, multiplied by `scale`, while
 *     input*delta is summed into latent_delta;
 *   - when l.background is set and l.objectness is not, gradient_array() is
 *     applied with LOGISTIC to the next l.coords entries;
 *   - l.coords coordinate deltas are added unscaled;
 *   - with l.joint, latent_delta is added to the state.delta slot located
 *     l.coords + l.classes + l.joint entries before the cursor.
 */
void backward_detection_layer(const detection_layer l, network_state state)
{
    const int locations = get_detection_layer_locations(l);
    const int cells = l.batch*locations;
    int in_pos = 0;
    int out_pos = 0;
    int cell;

    for (cell = 0; cell < cells; ++cell) {
        float scale = 1;
        float latent_delta = 0;
        int k;

        if (l.joint) {
            scale = state.input[in_pos];
            ++in_pos;
        } else if (l.objectness) {
            state.delta[in_pos] += -l.delta[out_pos];
            ++in_pos;
            ++out_pos;
        } else if (l.background) {
            state.delta[in_pos] += scale*l.delta[out_pos];
            ++in_pos;
            ++out_pos;
        }

        for (k = 0; k < l.classes; ++k) {
            latent_delta += state.input[in_pos]*l.delta[out_pos];
            state.delta[in_pos] += scale*l.delta[out_pos];
            ++in_pos;
            ++out_pos;
        }

        if (!l.objectness && l.background) {
            gradient_array(l.output + out_pos, l.coords, LOGISTIC, l.delta + out_pos);
        }

        for (k = 0; k < l.coords; ++k) {
            state.delta[in_pos] += l.delta[out_pos];
            ++in_pos;
            ++out_pos;
        }

        if (l.joint) {
            state.delta[in_pos - l.coords - l.classes - l.joint] += latent_delta;
        }
    }
}
/*
 * Backward pass of a fully connected layer (no batch normalization).
 *
 * 1. Applies gradient_array() with the layer activation to l.delta.
 * 2. Adds each batch row of l.delta (l.outputs floats) into l.bias_updates.
 * 3. Calls gemm to accumulate into l.weight_updates from l.delta and
 *    state.input.
 * 4. If state.delta is non-NULL, calls gemm to accumulate into state.delta
 *    from l.delta and l.weights.
 *
 * NOTE(review): gemm is assumed to take (TA, TB, M, N, K, ALPHA, A, lda,
 * B, ldb, BETA, C, ldc) - confirm against its declaration.
 */
void backward_connected_layer(connected_layer l, network_state state)
{
    float *row_delta = l.delta;
    int remaining;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    /* Sum the per-sample deltas into the bias gradient. */
    for (remaining = l.batch; remaining > 0; --remaining) {
        axpy_cpu(l.outputs, 1, row_delta, 1, l.bias_updates, 1);
        row_delta += l.outputs;
    }

    gemm(1, 0,
         l.outputs, l.inputs, l.batch,
         1,
         l.delta, l.outputs,
         state.input, l.inputs,
         1,
         l.weight_updates, l.inputs);

    if (!state.delta) {
        return;
    }

    gemm(0, 0,
         l.batch, l.inputs, l.outputs,
         1,
         l.delta, l.outputs,
         l.weights, l.inputs,
         1,
         state.delta, l.inputs);
}
/*
 * Backward pass of a convolutional layer (network_state variant).
 *
 * After applying the activation gradient to l.delta and calling
 * backward_bias(), each batch sample is processed in turn:
 *   - the sample's input is unrolled into l.col_image with im2col_cpu();
 *   - gemm accumulates into l.filter_updates from the sample's slice of
 *     l.delta and l.col_image;
 *   - if state.delta is non-NULL, gemm overwrites l.col_image (last scalar
 *     argument 0) from l.filters and the delta slice, and col2im_cpu()
 *     scatters that into the sample's slice of state.delta.
 *
 * l.col_image is reused as scratch for both directions, so the order of the
 * two gemm calls within an iteration matters.
 */
void backward_convolutional_layer(convolutional_layer l, network_state state)
{
    const int filters = l.n;
    const int patch = l.size*l.size*l.c;
    const int out_h = convolutional_out_height(l);
    const int out_w = convolutional_out_width(l);
    const int spatial = out_h*out_w;
    int sample;

    gradient_array(l.output, filters*spatial*l.batch, l.activation, l.delta);
    backward_bias(l.bias_updates, l.delta, l.batch, l.n, spatial);

    for (sample = 0; sample < l.batch; ++sample) {
        float *sample_delta = l.delta + sample*filters*spatial;
        float *sample_input = state.input + sample*l.c*l.h*l.w;

        im2col_cpu(sample_input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image);
        gemm(0, 1, filters, patch, spatial, 1,
             sample_delta, spatial,
             l.col_image, spatial,
             1,
             l.filter_updates, patch);

        if (!state.delta) {
            continue;
        }

        gemm(1, 0, patch, spatial, filters, 1,
             l.filters, patch,
             sample_delta, spatial,
             0,
             l.col_image, spatial);

        col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad,
                   state.delta + sample*l.c*l.h*l.w);
    }
}
/*
 * Backward pass of the compact layer.
 *
 * Applies the activation gradient to l.delta, then for every batch sample
 * adds that sample's l.outputs-wide delta row into each of l.index
 * consecutive l.outputs-wide slices of the sample's row in state.delta
 * (rows of state.delta are l.inputs floats apart).
 */
void backward_compact_layer(const layer l, network_state state)
{
    int sample;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    for (sample = 0; sample < l.batch; ++sample) {
        const int delta_off = sample*l.outputs;
        const int input_off = sample*l.inputs;
        int slice = 0;

        while (slice < l.index) {
            axpy_cpu(l.outputs, 1,
                     l.delta + delta_off, 1,
                     state.delta + input_off + slice*l.outputs, 1);
            ++slice;
        }
    }
}
/*
 * Backward pass of a locally connected layer: every output location
 * (l.out_w*l.out_h of them) has its own filter bank of
 * l.size*l.size*l.c*l.n floats.
 *
 * Order of work:
 *   1. activation gradient applied to l.delta;
 *   2. each sample's delta row added into l.bias_updates;
 *   3. per sample: im2col of the input into l.col_image, then one gemm per
 *      location accumulating into that location's slice of
 *      l.filter_updates;
 *   4. per sample, only when state.delta is non-NULL: one gemm per location
 *      overwriting column `loc` of l.col_image from that location's filters
 *      and delta, followed by col2im_cpu() into the sample's slice of
 *      state.delta.
 *
 * Deltas and l.col_image are addressed with a stride of `locations`, i.e.
 * one column per output location.
 */
void backward_local_layer(local_layer l, network_state state)
{
    const int locations = l.out_w*l.out_h;
    const int patch = l.size*l.size*l.c;
    int sample;
    int loc;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    for (sample = 0; sample < l.batch; ++sample) {
        axpy_cpu(l.outputs, 1, l.delta + sample*l.outputs, 1, l.bias_updates, 1);
    }

    for (sample = 0; sample < l.batch; ++sample) {
        float *sample_delta = l.delta + sample*l.outputs;

        im2col_cpu(state.input + sample*l.w*l.h*l.c,
                   l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image);

        /* Filter gradients, one location at a time. */
        for (loc = 0; loc < locations; ++loc) {
            gemm(0, 1, l.n, patch, 1, 1,
                 sample_delta + loc, locations,
                 l.col_image + loc, locations,
                 1,
                 l.filter_updates + loc*patch*l.n, patch);
        }

        if (!state.delta) {
            continue;
        }

        /* Input gradients: rebuild the column buffer, then fold it back. */
        for (loc = 0; loc < locations; ++loc) {
            gemm(1, 0, patch, 1, l.n, 1,
                 l.filters + loc*patch*l.n, patch,
                 sample_delta + loc, locations,
                 0,
                 l.col_image + loc, locations);
        }

        col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad,
                   state.delta + sample*l.c*l.h*l.w);
    }
}
/*
 * Backward pass of a deconvolutional layer (network_state variant).
 *
 * 1. Applies the activation gradient to l.delta.
 * 2. Runs backward_batchnorm_layer() when l.batch_normalize is set,
 *    otherwise backward_bias().
 * 3. For every batch sample:
 *      - unrolls the sample's slice of l.delta into state.workspace with
 *        im2col_cpu() (padding argument 0);
 *      - accumulates into l.weight_updates via gemm, scaled by
 *        alpha = 1/l.batch;
 *      - if state.delta is non-NULL, accumulates into the sample's slice of
 *        state.delta via gemm from l.weights and state.workspace.
 *
 * The weight-gradient gemm reads an (l.c) x (l.h*l.w) matrix from the input
 * with leading dimension l.h*l.w, so consecutive samples in state.input are
 * l.c*l.h*l.w floats apart - the same stride used for state.delta below.
 *
 * NOTE(review): gemm is assumed to take (TA, TB, M, N, K, ALPHA, A, lda,
 * B, ldb, BETA, C, ldc) - confirm against its declaration.
 */
void backward_deconvolutional_layer(layer l, network_state state)
{
    float alpha = 1./l.batch;
    int out_h = deconvolutional_out_height(l);
    int out_w = deconvolutional_out_width(l);
    int size = out_h*out_w;
    int i;

    gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta);

    if (l.batch_normalize) {
        backward_batchnorm_layer(l, state);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h);
    }

    for (i = 0; i < l.batch; ++i) {
        const int in_channels = l.c;
        const int kernel_elems = l.size*l.size*l.n;
        const int in_spatial = l.h*l.w;

        /* One sample of the input is in_channels * in_spatial floats. */
        float *sample_input = state.input + i*in_channels*in_spatial;

        im2col_cpu(l.delta + i*l.n*size, l.n, out_h, out_w,
                   l.size, l.stride, 0, state.workspace);

        gemm(0, 1, in_channels, kernel_elems, in_spatial, alpha,
             sample_input, in_spatial,
             state.workspace, in_spatial,
             1,
             l.weight_updates, kernel_elems);

        if (state.delta) {
            gemm(0, 0, in_channels, in_spatial, kernel_elems, 1,
                 l.weights, kernel_elems,
                 state.workspace, in_spatial,
                 1,
                 state.delta + i*in_spatial*in_channels, in_spatial);
        }
    }
}
/*
 * Backward pass of a deconvolutional layer (network variant).
 *
 * 1. Applies the activation gradient to l.delta.
 * 2. Runs backward_batchnorm_layer() when l.batch_normalize is set,
 *    otherwise backward_bias().
 * 3. For every batch sample:
 *      - unrolls the sample's slice of l.delta into net.workspace with
 *        im2col_cpu();
 *      - accumulates into l.weight_updates via gemm_cpu from the sample's
 *        input and net.workspace;
 *      - if net.delta is non-NULL, accumulates into the sample's slice of
 *        net.delta via gemm_cpu from l.weights and net.workspace.
 *
 * net.delta is not cleared here: both gemm_cpu calls pass 1 as the scalar
 * preceding the output matrix. A memset of l.batch*l.h*l.w*l.c floats on
 * net.delta used to sit before the loop and was left disabled.
 *
 * NOTE(review): gemm_cpu is assumed to take (TA, TB, M, N, K, ALPHA, A, lda,
 * B, ldb, BETA, C, ldc) - confirm against its declaration.
 */
void backward_deconvolutional_layer(layer l, network net)
{
    int i;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if (l.batch_normalize) {
        backward_batchnorm_layer(l, net);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h);
    }

    for (i = 0; i < l.batch; ++i) {
        const int in_channels = l.c;
        const int kernel_elems = l.size*l.size*l.n;
        const int in_spatial = l.h*l.w;

        float *sample_input = net.input + i*in_channels*in_spatial;

        im2col_cpu(l.delta + i*l.outputs, l.out_c, l.out_h, l.out_w,
                   l.size, l.stride, l.pad, net.workspace);

        gemm_cpu(0, 1, in_channels, kernel_elems, in_spatial, 1,
                 sample_input, in_spatial,
                 net.workspace, in_spatial,
                 1,
                 l.weight_updates, kernel_elems);

        if (net.delta) {
            gemm_cpu(0, 0, in_channels, in_spatial, kernel_elems, 1,
                     l.weights, kernel_elems,
                     net.workspace, in_spatial,
                     1,
                     net.delta + i*in_spatial*in_channels, in_spatial);
        }
    }
}
/*
 * Backward pass of a grouped convolutional layer (network variant).
 *
 * After the activation gradient is applied to l.delta, either
 * backward_batchnorm_layer() or backward_bias() is run depending on
 * l.batch_normalize. Then, for each (sample, group) pair:
 *   - the group's input channels (l.c/l.groups of them) are unrolled into
 *     net.workspace with im2col_cpu();
 *   - gemm accumulates into the group's slice of l.weight_updates;
 *   - if net.delta is non-NULL, gemm overwrites net.workspace (last scalar
 *     argument 0) from the group's weights and delta slice, and
 *     col2im_cpu() scatters it into the matching slice of net.delta.
 *
 * Offsets keep the original left-to-right integer arithmetic
 * (multiply before dividing by l.groups).
 */
void backward_convolutional_layer(convolutional_layer l, network net)
{
    const int m = l.n/l.groups;
    const int n = l.size*l.size*l.c/l.groups;
    const int k = l.out_w*l.out_h;
    int sample;
    int group;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if (l.batch_normalize) {
        backward_batchnorm_layer(l, net);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
    }

    for (sample = 0; sample < l.batch; ++sample) {
        for (group = 0; group < l.groups; ++group) {
            const int slot = sample*l.groups + group;
            float *slot_delta = l.delta + slot*m*k;
            float *slot_input = net.input + slot*l.c/l.groups*l.h*l.w;

            im2col_cpu(slot_input, l.c/l.groups, l.h, l.w,
                       l.size, l.stride, l.pad, net.workspace);

            gemm(0, 1, m, n, k, 1,
                 slot_delta, k,
                 net.workspace, k,
                 1,
                 l.weight_updates + group*l.nweights/l.groups, n);

            if (!net.delta) {
                continue;
            }

            gemm(1, 0, n, k, m, 1,
                 l.weights + group*l.nweights/l.groups, n,
                 slot_delta, k,
                 0,
                 net.workspace, k);

            col2im_cpu(net.workspace, l.c/l.groups, l.h, l.w,
                       l.size, l.stride, l.pad,
                       net.delta + slot*l.c/l.groups*l.h*l.w);
        }
    }
}
/*
 * Backward pass of a shortcut layer.
 *
 * Applies the activation gradient to l.delta, adds l.delta into net.delta,
 * and then hands l.delta to shortcut_cpu() together with the delta buffer
 * of the layer at net.layers[l.index].
 */
void backward_shortcut_layer(const layer l, network net)
{
    const int total = l.outputs*l.batch;
    float *linked_delta;

    gradient_array(l.output, total, l.activation, l.delta);
    axpy_cpu(total, 1, l.delta, 1, net.delta, 1);

    linked_delta = net.layers[l.index].delta;
    shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta,
                 l.w, l.h, l.c, linked_delta);
}
/*
 * Back-propagates one gate delta through its pair of connected layers.
 *
 * gate_delta (n floats) is copied into w->delta and w is run backward with
 * prev_state as input and prev_state_delta as the delta to fill; the same
 * gate_delta is then copied into u->delta and u is run backward against the
 * step input x / x_delta. The scratch network *s is updated in place.
 */
static void lstm_backward_gate_pair(layer *w, layer *u, float *gate_delta, int n,
                                    float *prev_state, float *prev_state_delta,
                                    float *x, float *x_delta, network *s)
{
    copy_cpu(n, gate_delta, 1, w->delta, 1);
    s->input = prev_state;
    s->delta = prev_state_delta;
    backward_connected_layer(*w, *s);

    copy_cpu(n, gate_delta, 1, u->delta, 1);
    s->input = x;
    s->delta = x_delta;
    backward_connected_layer(*u, *s);
}

/*
 * Backward pass of the LSTM layer over all l.steps time steps, walking from
 * the last step to the first.
 *
 * Local copies of the eight connected sub-layers (wf/wi/wg/wo and
 * uf/ui/ug/uo) and the layer/network buffers are first advanced to the last
 * step. At every step the function:
 *   - snapshots the current and previous cell/output into l.c_cpu,
 *     l.prev_cell_cpu, l.h_cpu and l.prev_state_cpu (the "previous" copies
 *     are skipped at step 0);
 *   - points l.dh_cpu at the previous step's delta (0 at step 0);
 *   - rebuilds the four gate activations from the sub-layer outputs;
 *   - derives the per-gate deltas in l.temp_cpu / l.temp2_cpu and pushes each
 *     through its W and U layer, in the order o, g, i, f;
 *   - stores the delta carried to the earlier step in l.dc_cpu;
 *   - steps every buffer and sub-layer back by one time step.
 *
 * NOTE(review): mul_cpu(n, X, 1, Y, 1) is presumed to multiply Y by X
 * element-wise in place - confirm against its definition.
 */
void backward_lstm_layer(layer l, network state)
{
    network s = { 0 };
    s.train = state.train;

    const int n = l.outputs * l.batch;     /* floats per step of output-sized buffers */
    const int in_n = l.inputs * l.batch;   /* floats per step of input-sized buffers  */
    int i;
    int g;

    layer wf = *(l.wf);
    layer wi = *(l.wi);
    layer wg = *(l.wg);
    layer wo = *(l.wo);
    layer uf = *(l.uf);
    layer ui = *(l.ui);
    layer ug = *(l.ug);
    layer uo = *(l.uo);

    /* Same order as the original call sequence. */
    layer *sub[] = { &wf, &wi, &wg, &wo, &uf, &ui, &ug, &uo };
    const int n_sub = (int)(sizeof sub / sizeof sub[0]);

    for (g = 0; g < n_sub; ++g) {
        increment_layer(sub[g], l.steps - 1);
    }

    state.input += in_n * (l.steps - 1);
    if (state.delta) {
        state.delta += in_n * (l.steps - 1);
    }

    l.output += n * (l.steps - 1);
    l.cell_cpu += n * (l.steps - 1);
    l.delta += n * (l.steps - 1);

    for (i = l.steps - 1; i >= 0; --i) {
        /* Snapshot cell and hidden state for this step and the one before. */
        if (i != 0) {
            copy_cpu(n, l.cell_cpu - n, 1, l.prev_cell_cpu, 1);
        }
        copy_cpu(n, l.cell_cpu, 1, l.c_cpu, 1);
        if (i != 0) {
            copy_cpu(n, l.output - n, 1, l.prev_state_cpu, 1);
        }
        copy_cpu(n, l.output, 1, l.h_cpu, 1);

        if (i == 0) {
            l.dh_cpu = 0;
        } else {
            l.dh_cpu = l.delta - n;
        }

        /* Gate pre-activations: W output plus U output. */
        copy_cpu(n, wf.output, 1, l.f_cpu, 1);
        axpy_cpu(n, 1, uf.output, 1, l.f_cpu, 1);

        copy_cpu(n, wi.output, 1, l.i_cpu, 1);
        axpy_cpu(n, 1, ui.output, 1, l.i_cpu, 1);

        copy_cpu(n, wg.output, 1, l.g_cpu, 1);
        axpy_cpu(n, 1, ug.output, 1, l.g_cpu, 1);

        copy_cpu(n, wo.output, 1, l.o_cpu, 1);
        axpy_cpu(n, 1, uo.output, 1, l.o_cpu, 1);

        activate_array(l.f_cpu, n, LOGISTIC);
        activate_array(l.i_cpu, n, LOGISTIC);
        activate_array(l.g_cpu, n, TANH);
        activate_array(l.o_cpu, n, LOGISTIC);

        /* temp3 holds this step's incoming delta. */
        copy_cpu(n, l.delta, 1, l.temp3_cpu, 1);

        /* temp2: delta * o through the TANH gradient at tanh(c), plus dc. */
        copy_cpu(n, l.c_cpu, 1, l.temp_cpu, 1);
        activate_array(l.temp_cpu, n, TANH);

        copy_cpu(n, l.temp3_cpu, 1, l.temp2_cpu, 1);
        mul_cpu(n, l.o_cpu, 1, l.temp2_cpu, 1);

        gradient_array(l.temp_cpu, n, TANH, l.temp2_cpu);
        axpy_cpu(n, 1, l.dc_cpu, 1, l.temp2_cpu, 1);

        /* Output gate. */
        copy_cpu(n, l.c_cpu, 1, l.temp_cpu, 1);
        activate_array(l.temp_cpu, n, TANH);
        mul_cpu(n, l.temp3_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.o_cpu, n, LOGISTIC, l.temp_cpu);
        lstm_backward_gate_pair(&wo, &uo, l.temp_cpu, n,
                                l.prev_state_cpu, l.dh_cpu,
                                state.input, state.delta, &s);

        /* Candidate (g) gate. */
        copy_cpu(n, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(n, l.i_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.g_cpu, n, TANH, l.temp_cpu);
        lstm_backward_gate_pair(&wg, &ug, l.temp_cpu, n,
                                l.prev_state_cpu, l.dh_cpu,
                                state.input, state.delta, &s);

        /* Input gate. */
        copy_cpu(n, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(n, l.g_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.i_cpu, n, LOGISTIC, l.temp_cpu);
        lstm_backward_gate_pair(&wi, &ui, l.temp_cpu, n,
                                l.prev_state_cpu, l.dh_cpu,
                                state.input, state.delta, &s);

        /* Forget gate. */
        copy_cpu(n, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(n, l.prev_cell_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.f_cpu, n, LOGISTIC, l.temp_cpu);
        lstm_backward_gate_pair(&wf, &uf, l.temp_cpu, n,
                                l.prev_state_cpu, l.dh_cpu,
                                state.input, state.delta, &s);

        /* Delta carried into the earlier step: temp2 * f. */
        copy_cpu(n, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(n, l.f_cpu, 1, l.temp_cpu, 1);
        copy_cpu(n, l.temp_cpu, 1, l.dc_cpu, 1);

        /* Step every cursor back by one time step. */
        state.input -= in_n;
        if (state.delta) {
            state.delta -= in_n;
        }
        l.output -= n;
        l.cell_cpu -= n;
        l.delta -= n;

        for (g = 0; g < n_sub; ++g) {
            increment_layer(sub[g], -1);
        }
    }
}
/*
 * Forward pass of the detection layer, plus loss/delta computation when
 * state.train is set.
 *
 * Output layout per batch sample (offset b*l.inputs), as used by the index
 * arithmetic below, with locations = l.side*l.side:
 *   [0, locations*l.classes)                 class scores, l.classes per cell
 *   [.., + locations*l.n)                    one confidence value per predictor
 *   [locations*(l.classes+l.n), ..)          l.coords box values per predictor
 *
 * Truth layout per cell ((b*locations + i)*(1 + l.coords + l.classes)):
 *   [0] object flag, [1 .. l.classes] class targets, then the box.
 *
 * Inference part: copies state.input to l.output; with l.softmax, applies
 * softmax_array() per cell to the class scores and LOGISTIC activation to
 * the remaining values.
 *
 * Training part: zeroes l.delta and *l.cost, then for every cell
 *   - pushes every predictor's confidence toward 0 (l.noobject_scale);
 *   - for cells whose truth flag is non-zero: sets class deltas, picks the
 *     responsible predictor (highest IoU, or lowest rmse while no IoU > 0
 *     has been seen; overridden by l.forced), replaces its confidence target
 *     by 1 (or by the IoU when l.rescore), and sets the box deltas
 *     (sqrt of truth w/h when l.sqrt).
 * With l.softmax the LOGISTIC gradient is applied to the non-class deltas.
 * When l.b_debug is set, running averages are printed.
 *
 * If the search leaves no predictor selected (best_index still -1, e.g. all
 * IoU/rmse comparisons failed), predictor 0 is used so that l.output and
 * l.delta are never indexed with a negative predictor number.
 */
void forward_detection_layer(const detection_layer l, network_state state)
{
    int locations = l.side*l.side;
    int i, j;
    int b;

    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));

    if (l.softmax) {
        for (b = 0; b < l.batch; ++b) {
            int index = b*l.inputs;
            for (i = 0; i < locations; ++i) {
                int offset = i*l.classes;
                softmax_array(l.output + index + offset, l.classes, 1,
                              l.output + index + offset);
            }
            int offset = locations*l.classes;
            activate_array(l.output + index + offset, locations*l.n*(1+l.coords), LOGISTIC);
        }
    }

    if (!state.train) {
        return;
    }

    float avg_iou = 0;
    float avg_cat = 0;
    float avg_allcat = 0;
    float avg_obj = 0;
    float avg_anyobj = 0;
    int count = 0;
    *(l.cost) = 0;
    int size = l.inputs * l.batch;
    memset(l.delta, 0, size * sizeof(float));

    for (b = 0; b < l.batch; ++b) {
        int index = b*l.inputs;

        for (i = 0; i < locations; ++i) {
            int truth_index = (b*locations + i)*(1+l.coords+l.classes);
            int is_obj = state.truth[truth_index];

            /* Default: every predictor in the cell is treated as "no object". */
            for (j = 0; j < l.n; ++j) {
                int p_index = index + locations*l.classes + i*l.n + j;
                l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]);
                *(l.cost) += l.noobject_scale*pow(l.output[p_index], 2);
                avg_anyobj += l.output[p_index];
            }

            int best_index = -1;
            float best_iou = 0;
            float best_rmse = 20;

            if (!is_obj) {
                continue;
            }

            int class_index = index + i*l.classes;
            for (j = 0; j < l.classes; ++j) {
                l.delta[class_index+j] = l.class_scale * (state.truth[truth_index+1+j] - l.output[class_index+j]);
                *(l.cost) += l.class_scale * pow(state.truth[truth_index+1+j] - l.output[class_index+j], 2);
                if (state.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
                avg_allcat += l.output[class_index+j];
            }

            box truth = float_to_box(state.truth + truth_index + 1 + l.classes);
            truth.x /= l.side;
            truth.y /= l.side;

            /* Pick the predictor responsible for this object. */
            for (j = 0; j < l.n; ++j) {
                int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
                box out = float_to_box(l.output + box_index);
                out.x /= l.side;
                out.y /= l.side;

                if (l.sqrt) {
                    out.w = out.w*out.w;
                    out.h = out.h*out.h;
                }

                float iou = box_iou(out, truth);
                float rmse = box_rmse(out, truth);
                if (best_iou > 0 || iou > 0) {
                    if (iou > best_iou) {
                        best_iou = iou;
                        best_index = j;
                    }
                } else {
                    if (rmse < best_rmse) {
                        best_rmse = rmse;
                        best_index = j;
                    }
                }
            }

            if (l.forced) {
                if (truth.w*truth.h < .1) {
                    best_index = 1;
                } else {
                    best_index = 0;
                }
            }

            /* No predictor won the search: fall back to the first one
             * instead of indexing the buffers with -1. */
            if (best_index < 0) {
                best_index = 0;
            }

            int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
            int tbox_index = truth_index + 1 + l.classes;

            box out = float_to_box(l.output + box_index);
            out.x /= l.side;
            out.y /= l.side;
            if (l.sqrt) {
                out.w = out.w*out.w;
                out.h = out.h*out.h;
            }
            float iou = box_iou(out, truth);

            /* Swap the "no object" cost of the chosen predictor for the
             * "object" cost. */
            int p_index = index + locations*l.classes + i*l.n + best_index;
            *(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2);
            *(l.cost) += l.object_scale * pow(1-l.output[p_index], 2);
            avg_obj += l.output[p_index];
            l.delta[p_index] = l.object_scale * (1.-l.output[p_index]);

            if (l.rescore) {
                l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
            }

            l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]);
            l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]);
            l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]);
            l.delta[box_index+3] = l.coord_scale*(state.truth[tbox_index + 3] - l.output[box_index + 3]);
            if (l.sqrt) {
                l.delta[box_index+2] = l.coord_scale*(sqrt(state.truth[tbox_index + 2]) - l.output[box_index + 2]);
                l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]);
            }

            *(l.cost) += pow(1-iou, 2);
            avg_iou += iou;
            ++count;
        }

        if (l.softmax) {
            gradient_array(l.output + index + locations*l.classes, locations*l.n*(1+l.coords),
                           LOGISTIC, l.delta + index + locations*l.classes);
        }
    }

    if (l.b_debug) {
        printf("Detection Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n",
               avg_iou/count, avg_cat/count, avg_allcat/(count*l.classes), avg_obj/count,
               avg_anyobj/(l.batch*locations*l.n), count);
    }
}
/*
 * Backward pass of a standalone activation layer: applies the activation
 * gradient to l.delta in place, then copies the result (l.outputs*l.batch
 * floats) over net.delta.
 */
void backward_activation_layer(layer l, network net)
{
    const int total = l.outputs * l.batch;

    gradient_array(l.output, total, l.activation, l.delta);
    copy_cpu(total, l.delta, 1, net.delta, 1);
}