void forward_convolutional_layer(convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int i;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
    /*
    if(l.binary){
        binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
        binarize_filters2(l.filters, l.n, l.c*l.size*l.size, l.cfilters, l.scales);
        swap_binary(&l);
    }
    */

    if(l.binary){
        int m = l.n;
        int k = l.size*l.size*l.c;
        int n = out_h*out_w;

        // binarized filters are packed in l.cfilters; the per-filter
        // scales are applied after the binary GEMM via scale_bias
        char  *a = l.cfilters;
        float *b = state.workspace;
        float *c = l.output;

        for(i = 0; i < l.batch; ++i){
            im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b);
            gemm_bin(m, n, k, 1, a, k, b, n, c, n);
            c += n*m;
            state.input += l.c*l.h*l.w;
        }
        scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
        activate_array(l.output, m*n*l.batch, l.activation);
        return;
    }

    int m = l.n;
    int k = l.size*l.size*l.c;
    int n = out_h*out_w;

    float *a = l.filters;
    float *b = state.workspace;
    float *c = l.output;

    for(i = 0; i < l.batch; ++i){
        im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b);
        gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
        c += n*m;
        state.input += l.c*l.h*l.w;
    }

    if(l.batch_normalize){
        forward_batchnorm_layer(l, state);
    }
    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    activate_array(l.output, m*n*l.batch, l.activation);
}
void forward_convolutional_layer(const convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int i;

    bias_output(l.output, l.biases, l.batch, l.n, out_h*out_w);

    int m = l.n;
    int k = l.size*l.size*l.c;
    int n = out_h*out_w;
    float *a = l.filters;
    float *b = l.col_image;
    float *c = l.output;

    for(i = 0; i < l.batch; ++i){
        im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b);
        gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
        c += n*m;
        state.input += l.c*l.h*l.w;
    }
    activate_array(l.output, m*n*l.batch, l.activation);
}
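/*
 * Editor's note: the forward_convolutional_layer variants in this file all
 * implement convolution as im2col followed by one GEMM per image. The
 * dimension mapping, assuming darknet's row-major
 * gemm(TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc):
 *
 *     M = l.n                    number of filters (output channels)
 *     K = l.c * l.size * l.size  unrolled receptive-field volume
 *     N = out_h * out_w          output locations per image
 *
 * im2col_cpu writes a K x N matrix whose column j holds the input patch under
 * output location j, so C = A*B produces all M output maps at once. For
 * reference, the direct computation of one output element is sketched in the
 * hypothetical helper below (not part of the original source; it assumes the
 * window lies fully inside the input, i.e. no padding).
 */
static float conv_one_output(const float *input, const float *filter,
        int c, int h, int w, int size, int stride, int y, int x)
{
    float sum = 0;
    int ch, fy, fx;
    for(ch = 0; ch < c; ++ch){
        for(fy = 0; fy < size; ++fy){
            for(fx = 0; fx < size; ++fx){
                sum += filter[(ch*size + fy)*size + fx]
                     * input[(ch*h + y*stride + fy)*w + x*stride + fx];
            }
        }
    }
    return sum;
}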
void forward_connected_layer(connected_layer l, network_state state)
{
    int i;
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights;
    float *c = l.output;
    gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);

    if(l.batch_normalize){
        if(state.train){
            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);

            // exponential moving average of the batch statistics (momentum .05)
            scal_cpu(l.outputs, .95, l.rolling_mean, 1);
            axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
            scal_cpu(l.outputs, .95, l.rolling_variance, 1);
            axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);

            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
        } else {
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
        }
        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
    }
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
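/*
 * Editor's note: the inline batch-norm block above keeps running statistics
 * with momentum 0.05 via the scal_cpu/axpy_cpu pairs, i.e.
 *
 *     rolling = 0.95 * rolling + 0.05 * batch_statistic
 *
 * l.x (pre-normalization) and l.x_norm (post-normalization) are stashed for
 * the backward pass; at inference time the rolling estimates replace the
 * per-batch mean and variance.
 */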
void forward_local_layer(const local_layer l, network_state state)
{
    int out_h = local_out_height(l);
    int out_w = local_out_width(l);
    int i, j;
    int locations = out_h*out_w;

    for(i = 0; i < l.batch; ++i){
        copy_cpu(l.outputs, l.biases, 1, l.output + i*l.outputs, 1);
    }

    for(i = 0; i < l.batch; ++i){
        float *input = state.input + i*l.w*l.h*l.c;
        im2col_cpu(input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image);
        float *output = l.output + i*l.outputs;

        // unlike a convolution, each spatial location j has its own filter
        // bank, so one small GEMM runs per location
        for(j = 0; j < locations; ++j){
            float *a = l.weights + j*l.size*l.size*l.c*l.n;
            float *b = l.col_image + j;
            float *c = output + j;

            int m = l.n;
            int n = 1;
            int k = l.size*l.size*l.c;

            gemm(0, 0, m, n, k, 1, a, k, b, locations, 1, c, locations);
        }
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
void forward_deconvolutional_layer(const layer l, network net)
{
    int i;
    int m = l.size*l.size*l.n;
    int n = l.h*l.w;
    int k = l.c;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    for(i = 0; i < l.batch; ++i){
        real_t *a = l.weights;
        real_t *b = net.input + i*l.c*l.h*l.w;
        real_t *c = net.workspace;

        gemm_cpu(1, 0, m, n, k, 1, a, m, b, n, 0, c, n);
        col2im_cpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride,
                l.pad, l.output + i*l.outputs);
    }
    if (l.batch_normalize) {
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_w*l.out_h);
    }
    activate_array(l.output, l.batch*l.n*l.out_w*l.out_h, l.activation);
}
void forward_deconvolutional_layer(const layer l, network_state state)
{
    int i;
    int out_h = l.out_h;
    int out_w = l.out_w;
    int size = out_h*out_w;

    int m = l.size*l.size*l.n;
    int n = l.h*l.w;
    int k = l.c;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    for(i = 0; i < l.batch; ++i){
        float *a = l.weights;
        float *b = state.input + i*l.c*l.h*l.w;
        float *c = state.workspace;

        // transposed convolution: multiply by W^T so each input pixel expands
        // into a size*size*n column, then scatter the columns into the output
        gemm(1, 0, m, n, k, 1, a, m, b, n, 0, c, n);
        col2im_cpu(c, l.n, out_h, out_w, l.size, l.stride, 0, l.output + i*l.n*size);
    }
    if(l.batch_normalize){
        forward_batchnorm_layer(l, state);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
    }
    activate_array(l.output, l.batch*l.n*size, l.activation);
}
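/*
 * Editor's note: both forward_deconvolutional_layer variants run the
 * convolution pipeline in reverse. Where convolution is im2col then
 * gemm(0,0,...), deconvolution is gemm(1,0,...) against the transposed weight
 * matrix, producing a (l.size*l.size*l.n) x (l.h*l.w) column matrix that
 * col2im_cpu scatter-adds into the upsampled l.out_h x l.out_w output. The
 * two variants differ in one detail worth noting: the first passes l.pad to
 * col2im_cpu, the second hard-codes a padding of 0.
 */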
void forward_connected_layer(connected_layer l, network_state state)
{
    int i;
    for(i = 0; i < l.batch; ++i){
        copy_cpu(l.outputs, l.biases, 1, l.output + i*l.outputs, 1);
    }
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights;
    float *c = l.output;
    gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
void forward_compact_layer(const layer l, network_state state)
{
    int i, b;
    for(b = 0; b < l.batch; ++b){
        // copy the first split of the input
        copy_cpu(l.outputs, state.input + b*l.inputs, 1, l.output + b*l.outputs, 1);
        // accumulate the other splits
        for(i = 1; i < l.index; ++i){
            axpy_cpu(l.outputs, 1, state.input + b*l.inputs + i*l.outputs, 1,
                    l.output + b*l.outputs, 1);
        }
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
void forward_convolutional_layer(convolutional_layer l, network net)
{
    int i, j;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    if(l.xnor){
        binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
        swap_binary(&l);
        binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
        net.input = l.binary_input;
    }

    int m = l.n/l.groups;
    int k = l.size*l.size*l.c/l.groups;
    int n = l.out_w*l.out_h;
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.weights + j*l.nweights/l.groups;
            float *b = net.workspace;
            float *c = l.output + (i*l.groups + j)*n*m;

            im2col_cpu(net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w,
                    l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
        }
    }

    if(l.batch_normalize){
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
    }

    activate_array(l.output, l.outputs*l.batch, l.activation);
    if(l.binary || l.xnor) swap_binary(&l);
}
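/*
 * Editor's note: on the XNOR path above, binarize_weights reduces each filter
 * to a single scale (the mean absolute value of its weights) plus a sign per
 * weight; swap_binary then exchanges l.weights and l.binary_weights so the
 * GEMM runs on the binarized copy, and the final swap_binary restores the
 * real-valued weights. For reference, a sketch of binarize_weights, believed
 * to match mainline darknet (requires math.h for fabs):
 */
void binarize_weights(float *weights, int n, int size, float *binary)
{
    int i, f;
    for(f = 0; f < n; ++f){
        // per-filter scale: mean absolute weight
        float mean = 0;
        for(i = 0; i < size; ++i){
            mean += fabs(weights[f*size + i]);
        }
        mean = mean / size;
        // keep only the sign of each weight, scaled by the filter mean
        for(i = 0; i < size; ++i){
            binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
        }
    }
}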
void forward_shortcut_layer(const layer l, network net)
{
    copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
    shortcut_cpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output,
            l.out_w, l.out_h, l.out_c, l.output);
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
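/*
 * Editor's note: this is the residual-connection pattern. l.output starts as
 * a copy of the current input and shortcut_cpu adds in the output of the
 * earlier layer net.layers[l.index], i.e. output = activation(input + from),
 * presumably sampling or striding when the two shapes (l.w, l.h, l.c versus
 * l.out_w, l.out_h, l.out_c) disagree, as in mainline darknet.
 */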
void backward_lstm_layer(layer l, network state)
{
    network s = { 0 };
    s.train = state.train;
    int i;
    layer wf = *(l.wf);
    layer wi = *(l.wi);
    layer wg = *(l.wg);
    layer wo = *(l.wo);
    layer uf = *(l.uf);
    layer ui = *(l.ui);
    layer ug = *(l.ug);
    layer uo = *(l.uo);

    // start at the last time step and walk backwards
    increment_layer(&wf, l.steps - 1);
    increment_layer(&wi, l.steps - 1);
    increment_layer(&wg, l.steps - 1);
    increment_layer(&wo, l.steps - 1);

    increment_layer(&uf, l.steps - 1);
    increment_layer(&ui, l.steps - 1);
    increment_layer(&ug, l.steps - 1);
    increment_layer(&uo, l.steps - 1);

    state.input += l.inputs * l.batch * (l.steps - 1);
    if (state.delta) state.delta += l.inputs * l.batch * (l.steps - 1);

    l.output += l.outputs * l.batch * (l.steps - 1);
    l.cell_cpu += l.outputs * l.batch * (l.steps - 1);
    l.delta += l.outputs * l.batch * (l.steps - 1);

    for (i = l.steps - 1; i >= 0; --i) {
        if (i != 0) copy_cpu(l.outputs * l.batch, l.cell_cpu - l.outputs * l.batch, 1, l.prev_cell_cpu, 1);
        copy_cpu(l.outputs * l.batch, l.cell_cpu, 1, l.c_cpu, 1);
        if (i != 0) copy_cpu(l.outputs * l.batch, l.output - l.outputs * l.batch, 1, l.prev_state_cpu, 1);
        copy_cpu(l.outputs * l.batch, l.output, 1, l.h_cpu, 1);

        l.dh_cpu = (i == 0) ? 0 : l.delta - l.outputs * l.batch;

        // rebuild the gate activations for this step
        copy_cpu(l.outputs * l.batch, wf.output, 1, l.f_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, uf.output, 1, l.f_cpu, 1);

        copy_cpu(l.outputs * l.batch, wi.output, 1, l.i_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, ui.output, 1, l.i_cpu, 1);

        copy_cpu(l.outputs * l.batch, wg.output, 1, l.g_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, ug.output, 1, l.g_cpu, 1);

        copy_cpu(l.outputs * l.batch, wo.output, 1, l.o_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, uo.output, 1, l.o_cpu, 1);

        activate_array(l.f_cpu, l.outputs * l.batch, LOGISTIC);
        activate_array(l.i_cpu, l.outputs * l.batch, LOGISTIC);
        activate_array(l.g_cpu, l.outputs * l.batch, TANH);
        activate_array(l.o_cpu, l.outputs * l.batch, LOGISTIC);

        copy_cpu(l.outputs * l.batch, l.delta, 1, l.temp3_cpu, 1);

        // temp2 = dc_t = dh_t * o_t * (1 - tanh^2(c_t)) + dc from the later step
        copy_cpu(l.outputs * l.batch, l.c_cpu, 1, l.temp_cpu, 1);
        activate_array(l.temp_cpu, l.outputs * l.batch, TANH);

        copy_cpu(l.outputs * l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.o_cpu, 1, l.temp2_cpu, 1);

        gradient_array(l.temp_cpu, l.outputs * l.batch, TANH, l.temp2_cpu);
        axpy_cpu(l.outputs * l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1);

        // output gate
        copy_cpu(l.outputs * l.batch, l.c_cpu, 1, l.temp_cpu, 1);
        activate_array(l.temp_cpu, l.outputs * l.batch, TANH);
        mul_cpu(l.outputs * l.batch, l.temp3_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.o_cpu, l.outputs * l.batch, LOGISTIC, l.temp_cpu);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, wo.delta, 1);
        s.input = l.prev_state_cpu;
        s.delta = l.dh_cpu;
        backward_connected_layer(wo, s);

        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, uo.delta, 1);
        s.input = state.input;
        s.delta = state.delta;
        backward_connected_layer(uo, s);

        // candidate (g) gate
        copy_cpu(l.outputs * l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.i_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.g_cpu, l.outputs * l.batch, TANH, l.temp_cpu);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, wg.delta, 1);
        s.input = l.prev_state_cpu;
        s.delta = l.dh_cpu;
        backward_connected_layer(wg, s);

        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, ug.delta, 1);
        s.input = state.input;
        s.delta = state.delta;
        backward_connected_layer(ug, s);

        // input gate
        copy_cpu(l.outputs * l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.g_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.i_cpu, l.outputs * l.batch, LOGISTIC, l.temp_cpu);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, wi.delta, 1);
        s.input = l.prev_state_cpu;
        s.delta = l.dh_cpu;
        backward_connected_layer(wi, s);

        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, ui.delta, 1);
        s.input = state.input;
        s.delta = state.delta;
        backward_connected_layer(ui, s);

        // forget gate
        copy_cpu(l.outputs * l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.prev_cell_cpu, 1, l.temp_cpu, 1);
        gradient_array(l.f_cpu, l.outputs * l.batch, LOGISTIC, l.temp_cpu);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, wf.delta, 1);
        s.input = l.prev_state_cpu;
        s.delta = l.dh_cpu;
        backward_connected_layer(wf, s);

        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, uf.delta, 1);
        s.input = state.input;
        s.delta = state.delta;
        backward_connected_layer(uf, s);

        // propagate the cell gradient to the previous step: dc_{t-1} = dc_t * f_t
        copy_cpu(l.outputs * l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.f_cpu, 1, l.temp_cpu, 1);
        copy_cpu(l.outputs * l.batch, l.temp_cpu, 1, l.dc_cpu, 1);

        state.input -= l.inputs * l.batch;
        if (state.delta) state.delta -= l.inputs * l.batch;
        l.output -= l.outputs * l.batch;
        l.cell_cpu -= l.outputs * l.batch;
        l.delta -= l.outputs * l.batch;

        increment_layer(&wf, -1);
        increment_layer(&wi, -1);
        increment_layer(&wg, -1);
        increment_layer(&wo, -1);

        increment_layer(&uf, -1);
        increment_layer(&ui, -1);
        increment_layer(&ug, -1);
        increment_layer(&uo, -1);
    }
}
void forward_lstm_layer(layer l, network state)
{
    network s = { 0 };
    s.train = state.train;
    int i;
    layer wf = *(l.wf);
    layer wi = *(l.wi);
    layer wg = *(l.wg);
    layer wo = *(l.wo);
    layer uf = *(l.uf);
    layer ui = *(l.ui);
    layer ug = *(l.ug);
    layer uo = *(l.uo);

    fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1);

    fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1);
    if (state.train) {
        fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
    }

    for (i = 0; i < l.steps; ++i) {
        // gate contributions from the previous hidden state...
        s.input = l.h_cpu;
        forward_connected_layer(wf, s);
        forward_connected_layer(wi, s);
        forward_connected_layer(wg, s);
        forward_connected_layer(wo, s);

        // ...and from the current input
        s.input = state.input;
        forward_connected_layer(uf, s);
        forward_connected_layer(ui, s);
        forward_connected_layer(ug, s);
        forward_connected_layer(uo, s);

        copy_cpu(l.outputs * l.batch, wf.output, 1, l.f_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, uf.output, 1, l.f_cpu, 1);

        copy_cpu(l.outputs * l.batch, wi.output, 1, l.i_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, ui.output, 1, l.i_cpu, 1);

        copy_cpu(l.outputs * l.batch, wg.output, 1, l.g_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, ug.output, 1, l.g_cpu, 1);

        copy_cpu(l.outputs * l.batch, wo.output, 1, l.o_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, uo.output, 1, l.o_cpu, 1);

        activate_array(l.f_cpu, l.outputs * l.batch, LOGISTIC);
        activate_array(l.i_cpu, l.outputs * l.batch, LOGISTIC);
        activate_array(l.g_cpu, l.outputs * l.batch, TANH);
        activate_array(l.o_cpu, l.outputs * l.batch, LOGISTIC);

        // c = f*c + i*g
        copy_cpu(l.outputs * l.batch, l.i_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.g_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.f_cpu, 1, l.c_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1);

        // h = o * tanh(c)
        copy_cpu(l.outputs * l.batch, l.c_cpu, 1, l.h_cpu, 1);
        activate_array(l.h_cpu, l.outputs * l.batch, TANH);
        mul_cpu(l.outputs * l.batch, l.o_cpu, 1, l.h_cpu, 1);

        copy_cpu(l.outputs * l.batch, l.c_cpu, 1, l.cell_cpu, 1);
        copy_cpu(l.outputs * l.batch, l.h_cpu, 1, l.output, 1);

        state.input += l.inputs * l.batch;
        l.output += l.outputs * l.batch;
        l.cell_cpu += l.outputs * l.batch;

        increment_layer(&wf, 1);
        increment_layer(&wi, 1);
        increment_layer(&wg, 1);
        increment_layer(&wo, 1);

        increment_layer(&uf, 1);
        increment_layer(&ui, 1);
        increment_layer(&ug, 1);
        increment_layer(&uo, 1);
    }
}
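/*
 * Editor's note: forward_lstm_layer implements the standard LSTM recurrence;
 * the W* connected layers consume the previous hidden state h_{t-1} (l.h_cpu)
 * and the U* layers consume the current input x_t:
 *
 *     f_t = sigmoid(Wf h_{t-1} + Uf x_t)     forget gate
 *     i_t = sigmoid(Wi h_{t-1} + Ui x_t)     input gate
 *     g_t =    tanh(Wg h_{t-1} + Ug x_t)     candidate cell
 *     o_t = sigmoid(Wo h_{t-1} + Uo x_t)     output gate
 *     c_t = f_t . c_{t-1} + i_t . g_t        (elementwise products)
 *     h_t = o_t . tanh(c_t)
 *
 * backward_lstm_layer above walks the same steps in reverse, rebuilding each
 * gate's activation and routing deltas through the eight connected layers.
 */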
void forward_detection_layer(const detection_layer l, network_state state)
{
    int in_i = 0;
    int out_i = 0;
    int locations = get_detection_layer_locations(l);
    int i, j;
    for(i = 0; i < l.batch*locations; ++i){
        int mask = (!state.truth || state.truth[out_i + (l.background || l.objectness) + l.classes + 2]);
        float scale = 1;
        if(l.joint) scale = state.input[in_i++];
        else if(l.objectness){
            l.output[out_i++] = 1 - state.input[in_i++];
            scale = mask;
        }
        else if(l.background) l.output[out_i++] = scale*state.input[in_i++];

        for(j = 0; j < l.classes; ++j){
            l.output[out_i++] = scale*state.input[in_i++];
        }
        if(l.objectness){

        }else if(l.background){
            softmax_array(l.output + out_i - l.classes - l.background, l.classes + l.background,
                    l.output + out_i - l.classes - l.background);
            activate_array(state.input + in_i, l.coords, LOGISTIC);
        }
        for(j = 0; j < l.coords; ++j){
            l.output[out_i++] = mask*state.input[in_i++];
        }
    }

    float avg_iou = 0;
    int count = 0;
    if(l.does_cost && state.train){
        *(l.cost) = 0;
        int size = get_detection_layer_output_size(l) * l.batch;
        memset(l.delta, 0, size * sizeof(float));
        for (i = 0; i < l.batch*locations; ++i) {
            int classes = l.objectness + l.classes;
            int offset = i*(classes + l.coords);
            for (j = offset; j < offset + classes; ++j) {
                *(l.cost) += pow(state.truth[j] - l.output[j], 2);
                l.delta[j] = state.truth[j] - l.output[j];
            }

            // after the loop, j points at the first coordinate of this location
            box truth;
            truth.x = state.truth[j+0]/7;
            truth.y = state.truth[j+1]/7;
            truth.w = pow(state.truth[j+2], 2);
            truth.h = pow(state.truth[j+3], 2);
            box out;
            out.x = l.output[j+0]/7;
            out.y = l.output[j+1]/7;
            out.w = pow(l.output[j+2], 2);
            out.h = pow(l.output[j+3], 2);
            if(!(truth.w*truth.h)) continue;

            float iou = box_iou(out, truth);
            avg_iou += iou;
            ++count;
            dbox delta = diou(out, truth);
            l.delta[j+0] = 10 * delta.dx/7;
            l.delta[j+1] = 10 * delta.dy/7;
            l.delta[j+2] = 10 * delta.dw * 2 * sqrt(out.w);
            l.delta[j+3] = 10 * delta.dh * 2 * sqrt(out.h);

            *(l.cost) += pow((1-iou), 2);
            // note: these assignments overwrite the IOU-based deltas above
            l.delta[j+0] = 4 * (state.truth[j+0] - l.output[j+0]);
            l.delta[j+1] = 4 * (state.truth[j+1] - l.output[j+1]);
            l.delta[j+2] = 4 * (state.truth[j+2] - l.output[j+2]);
            l.delta[j+3] = 4 * (state.truth[j+3] - l.output[j+3]);

            if(l.rescore){
                for (j = offset; j < offset + classes; ++j) {
                    if(state.truth[j]) state.truth[j] = iou;
                    l.delta[j] = state.truth[j] - l.output[j];
                }
            }
        }
        printf("Avg IOU: %f\n", avg_iou/count);
    }
}
void forward_activation_layer(layer l, network net)
{
    copy_cpu(l.outputs * l.batch, net.input, 1, l.output, 1);
    activate_array(l.output, l.outputs * l.batch, l.activation);
}
void forward_convolutional_layer(const convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int i;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    int m = l.n;
    int k = l.size*l.size*l.c;
    int n = out_h*out_w;
    float *a = l.filters;
    float *b = l.col_image;
    float *c = l.output;

    //printf("the l.size is %i \n", l.size);
    //printf("the m,k,n is %i,%i,%i \n", m,k,n);
    for(i = 0; i < l.batch; ++i){
        im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b);
        gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
        c += n*m;
        state.input += l.c*l.h*l.w;
    }

    // added by fanghao: naive direct convolution, kept disabled for reference
    /*
    int ii, jj, kk, mm, pp, tt;
    int lcc = l.c;
    int lhh = l.h;
    int lww = l.w;
    int kernel = l.size;
    int pad;
    if(l.pad) pad = l.size/2;
    else pad = l.pad;
    lhh += 2*pad;
    lww += 2*pad;
    float *dataP;
    dataP = (float *)calloc(lcc*lhh*lww, sizeof(float));
    //printf("the l.h is %i \n", l.h);
    //printf("the l.w is %i \n", l.w);
    //printf("the lhh is %i \n", lhh);
    //printf("the lww is %i \n", lww);
    //printf("the pad is %i \n", pad);
    for(ii = 0; ii < lcc; ii++)
        for(jj = pad; jj < lhh-pad; jj++)
            for(kk = pad; kk < lww-pad; kk++)
                dataP[ii*lhh*lww + jj*lww + kk] =
                    state.input[ii*(lhh - 2*pad)*(lww - 2*pad) + (jj - pad)*(lww - 2*pad) + kk - pad];

    for(ii = 0; ii < m; ii++)
        for(jj = 0; jj < out_h; jj++)
            for(kk = 0; kk < out_w; kk++){
                float tempAcc = 0.0;
                for(mm = 0; mm < lcc; mm++)
                    for(pp = 0; pp < kernel; pp++)
                        for(tt = 0; tt < kernel; tt++)
                            tempAcc += a[ii*lcc*kernel*kernel + mm*kernel*kernel + pp*kernel + tt]
                                     * dataP[mm*lhh*lww + (l.stride*jj + pp)*lww + l.stride*kk + tt];
                c[ii*out_h*out_w + jj*out_w + kk] = tempAcc;
            }
    //c += n*m;
    //state.input += l.c*l.h*l.w;
    */

    if(l.batch_normalize){
        if(state.train){
            mean_cpu(l.output, l.batch, l.n, l.out_h*l.out_w, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.n, l.out_h*l.out_w, l.variance);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.n, l.out_h*l.out_w);
        } else {
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.n, l.out_h*l.out_w);
        }
        scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
    }
    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    activate_array(l.output, m*n*l.batch, l.activation);
}
void forward_detection_layer(const detection_layer l, network_state state)
{
    int locations = l.side*l.side;
    int i, j;
    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
    int b;
    if (l.softmax){
        for(b = 0; b < l.batch; ++b){
            int index = b*l.inputs;
            for (i = 0; i < locations; ++i) {
                int offset = i*l.classes;
                softmax_array(l.output + index + offset, l.classes, 1,
                        l.output + index + offset);
            }
            int offset = locations*l.classes;
            activate_array(l.output + index + offset, locations*l.n*(1+l.coords), LOGISTIC);
        }
    }

    if(state.train){
        float avg_iou = 0;
        float avg_cat = 0;
        float avg_allcat = 0;
        float avg_obj = 0;
        float avg_anyobj = 0;
        int count = 0;
        *(l.cost) = 0;
        int size = l.inputs * l.batch;
        memset(l.delta, 0, size * sizeof(float));
        for (b = 0; b < l.batch; ++b){
            int index = b*l.inputs;
            for (i = 0; i < locations; ++i) {
                int truth_index = (b*locations + i)*(1+l.coords+l.classes);
                int is_obj = state.truth[truth_index];

                // default: penalize every predictor's confidence toward 0
                for (j = 0; j < l.n; ++j) {
                    int p_index = index + locations*l.classes + i*l.n + j;
                    l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]);
                    *(l.cost) += l.noobject_scale*pow(l.output[p_index], 2);
                    avg_anyobj += l.output[p_index];
                }

                int best_index = -1;
                float best_iou = 0;
                float best_rmse = 20;

                if (!is_obj){
                    continue;
                }

                int class_index = index + i*l.classes;
                for(j = 0; j < l.classes; ++j) {
                    l.delta[class_index+j] = l.class_scale * (state.truth[truth_index+1+j] - l.output[class_index+j]);
                    *(l.cost) += l.class_scale * pow(state.truth[truth_index+1+j] - l.output[class_index+j], 2);
                    if(state.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
                    avg_allcat += l.output[class_index+j];
                }

                box truth = float_to_box(state.truth + truth_index + 1 + l.classes);
                truth.x /= l.side;
                truth.y /= l.side;

                // pick the responsible predictor: best IOU, falling back to
                // best RMSE when no predictor overlaps the truth box
                for(j = 0; j < l.n; ++j){
                    int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
                    box out = float_to_box(l.output + box_index);
                    out.x /= l.side;
                    out.y /= l.side;

                    if (l.sqrt){
                        out.w = out.w*out.w;
                        out.h = out.h*out.h;
                    }

                    float iou = box_iou(out, truth);
                    //iou = 0;
                    float rmse = box_rmse(out, truth);
                    if(best_iou > 0 || iou > 0){
                        if(iou > best_iou){
                            best_iou = iou;
                            best_index = j;
                        }
                    }else{
                        if(rmse < best_rmse){
                            best_rmse = rmse;
                            best_index = j;
                        }
                    }
                }

                if(l.forced){
                    if(truth.w*truth.h < .1){
                        best_index = 1;
                    }else{
                        best_index = 0;
                    }
                }

                int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
                int tbox_index = truth_index + 1 + l.classes;

                box out = float_to_box(l.output + box_index);
                out.x /= l.side;
                out.y /= l.side;
                if (l.sqrt) {
                    out.w = out.w*out.w;
                    out.h = out.h*out.h;
                }
                float iou = box_iou(out, truth);

                //printf("%d,", best_index);
                int p_index = index + locations*l.classes + i*l.n + best_index;
                *(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2);
                *(l.cost) += l.object_scale * pow(1-l.output[p_index], 2);
                avg_obj += l.output[p_index];
                l.delta[p_index] = l.object_scale * (1. - l.output[p_index]);

                if(l.rescore){
                    l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
                }

                l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]);
                l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]);
                l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]);
                l.delta[box_index+3] = l.coord_scale*(state.truth[tbox_index + 3] - l.output[box_index + 3]);
                if(l.sqrt){
                    l.delta[box_index+2] = l.coord_scale*(sqrt(state.truth[tbox_index + 2]) - l.output[box_index + 2]);
                    l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]);
                }

                *(l.cost) += pow(1-iou, 2);
                avg_iou += iou;
                ++count;
            }
            if(l.softmax){
                gradient_array(l.output + index + locations*l.classes, locations*l.n*(1+l.coords),
                        LOGISTIC, l.delta + index + locations*l.classes);
            }
        }
        if (l.b_debug) {
            printf("Detection Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n",
                    avg_iou/count, avg_cat/count, avg_allcat/(count*l.classes),
                    avg_obj/count, avg_anyobj/(l.batch*locations*l.n), count);
        }
    }
}
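/*
 * Editor's note: this second forward_detection_layer is the YOLOv1 training
 * loss. Each of the l.side*l.side grid cells predicts l.classes class scores,
 * l.n box confidences and l.n*l.coords coordinates; noobject_scale,
 * object_scale, class_scale and coord_scale weight the squared-error terms.
 * Only the responsible predictor (best IOU against the truth box, or best
 * RMSE when nothing overlaps) receives objectness and coordinate gradients,
 * and l.rescore switches the confidence target from 1 to the measured IOU.
 */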
void forward_iseg_layer(const layer l, network net)
{
    double time = what_time_is_it_now();
    int i, b, j, k;
    int ids = l.extra;
    memcpy(l.output, net.input, l.outputs * l.batch * sizeof(real_t));
    memset(l.delta, 0, l.outputs * l.batch * sizeof(real_t));

#ifndef GPU
    for (b = 0; b < l.batch; ++b) {
        int index = b * l.outputs;
        activate_array(l.output + index, l.classes * l.w * l.h, LOGISTIC);
    }
#endif

    for (b = 0; b < l.batch; ++b) {
        // a priori, each pixel has no class
        for (i = 0; i < l.classes; ++i) {
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + i * l.w * l.h + k;
                l.delta[index] = 0 - l.output[index];
            }
        }

        // a priori, embedding should be small magnitude
        for (i = 0; i < ids; ++i) {
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + (i + l.classes) * l.w * l.h + k;
                l.delta[index] = .1 * (0 - l.output[index]);
            }
        }

        memset(l.counts, 0, 90 * sizeof(int));
        for (i = 0; i < 90; ++i) {
            fill_cpu(ids, 0, l.sums[i], 1);

            int c = net.truth[b * l.truths + i * (l.w * l.h + 1)];
            if (c < 0) break;
            // add up metric embeddings for each instance
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + c * l.w * l.h + k;
                real_t v = net.truth[b * l.truths + i * (l.w * l.h + 1) + 1 + k];
                if (v) {
                    l.delta[index] = v - l.output[index];
                    axpy_cpu(ids, 1, l.output + b * l.outputs + l.classes * l.w * l.h + k,
                            l.w * l.h, l.sums[i], 1);
                    ++l.counts[i];
                }
            }
        }

        real_t *mse = calloc(90, sizeof(real_t));
        for (i = 0; i < 90; ++i) {
            int c = net.truth[b * l.truths + i * (l.w * l.h + 1)];
            if (c < 0) break;
            for (k = 0; k < l.w * l.h; ++k) {
                real_t v = net.truth[b * l.truths + i * (l.w * l.h + 1) + 1 + k];
                if (v) {
                    int z;
                    real_t sum = 0;
                    for (z = 0; z < ids; ++z) {
                        int index = b * l.outputs + (l.classes + z) * l.w * l.h + k;
                        sum += pow(l.sums[i][z] / l.counts[i] - l.output[index], 2);
                    }
                    mse[i] += sum;
                }
            }
            mse[i] /= l.counts[i];
        }

        // calculate the average embedding for each instance
        for (i = 0; i < 90; ++i) {
            if (!l.counts[i]) continue;
            scal_cpu(ids, 1.f / l.counts[i], l.sums[i], 1);
            if (b == 0 && net.gpu_index == 0) {
                printf("%4d, %6.3f, ", l.counts[i], mse[i]);
                for (j = 0; j < ids; ++j) {
                    printf("%6.3f,", l.sums[i][j]);
                }
                printf("\n");
            }
        }
        free(mse);

        // calculate the embedding loss: pull pixels toward their own instance
        // mean, push them away from every other instance's mean
        for (i = 0; i < 90; ++i) {
            if (!l.counts[i]) continue;
            for (k = 0; k < l.w * l.h; ++k) {
                real_t v = net.truth[b * l.truths + i * (l.w * l.h + 1) + 1 + k];
                if (v) {
                    for (j = 0; j < 90; ++j) {
                        if (!l.counts[j]) continue;
                        int z;
                        for (z = 0; z < ids; ++z) {
                            int index = b * l.outputs + (l.classes + z) * l.w * l.h + k;
                            real_t diff = l.sums[j][z] - l.output[index];
                            if (j == i) l.delta[index] +=  diff < 0 ? -.1 : .1;
                            else        l.delta[index] += -(diff < 0 ? -.1 : .1);
                        }
                    }
                }
            }
        }

        for (i = 0; i < ids; ++i) {
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + (i + l.classes) * l.w * l.h + k;
                l.delta[index] *= .01;
            }
        }
    }

    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    printf("took %lf sec\n", what_time_is_it_now() - time);
}
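/*
 * Editor's note: the iseg layer trains instance-segmentation embeddings. For
 * each of up to 90 ground-truth instances it accumulates the ids-dimensional
 * embedding over the instance's pixels (l.sums, l.counts) and normalizes to
 * the instance mean; each labeled pixel is then pulled toward its own
 * instance mean and pushed away from every other instance's mean with a
 * fixed +/-0.1 pseudo-gradient, scaled down by 0.01. The per-instance mse
 * printout is diagnostic only.
 */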