void forward_connected_layer(connected_layer l, network_state state)
{
    int i;
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights;
    float *c = l.output;
    /* output = input * weights^T */
    gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
    if(l.batch_normalize){
        if(state.train){
            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);

            /* exponential moving averages of the batch statistics, used at inference */
            scal_cpu(l.outputs, .95, l.rolling_mean, 1);
            axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
            scal_cpu(l.outputs, .95, l.rolling_variance, 1);
            axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);

            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
        } else {
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
        }
        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
    }
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
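/*
 * For clarity, a naive reference of the gemm(0,1,...) call above: with TA=0
 * and TB=1, darknet's gemm computes C = ALPHA*A*B^T + BETA*C in row-major
 * layout, so every batch row of the input is dotted against every weight row.
 * This sketch is for illustration only, not darknet's actual gemm.
 */
void gemm_nt_ref(int M, int N, int K, float ALPHA,
                 float *A, int lda, float *B, int ldb, float *C, int ldc)
{
    int i, j, k;
    for(i = 0; i < M; ++i){
        for(j = 0; j < N; ++j){
            float sum = 0;
            for(k = 0; k < K; ++k){
                sum += A[i*lda + k] * B[j*ldb + k];
            }
            C[i*ldc + j] += ALPHA*sum;   /* BETA == 1 in the call above */
        }
    }
}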
void test_cifar_multi(char *filename, char *weightfile)
{
    network net = parse_network_cfg(filename);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);
    srand(time(0));

    float avg_acc = 0;
    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");

    int i;
    for(i = 0; i < test.X.rows; ++i){
        image im = float_to_image(32, 32, 3, test.X.vals[i]);

        float pred[10] = {0};

        float *p = network_predict(net, im.data);
        axpy_cpu(10, 1, p, 1, pred, 1);
        /* average predictions with the horizontally flipped image */
        flip_image(im);
        p = network_predict(net, im.data);
        axpy_cpu(10, 1, p, 1, pred, 1);

        int index = max_index(pred, 10);
        int class = max_index(test.y.vals[i], 10);
        if(index == class) avg_acc += 1;
        free_image(im);
        printf("%4d: %.2f%%\n", i, 100.*avg_acc/(i+1));
    }
}
void forward_batchnorm_layer(layer l, network_state state)
{
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    if(l.type == CONNECTED){
        l.out_c = l.outputs;
        l.out_h = l.out_w = 1;
    }
    if(state.train){
        mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
        variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);

        /* rolling statistics for inference: 0.99 * old + 0.01 * new */
        scal_cpu(l.out_c, .99, l.rolling_mean, 1);
        axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
        scal_cpu(l.out_c, .99, l.rolling_variance, 1);
        axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);

        copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
        normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
    } else {
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
    }
    scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
    add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w);
}
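/*
 * Reference sketch of what the normalize_cpu calls above do per channel:
 * x = (x - mean[f]) / (sqrt(variance[f]) + eps). The exact epsilon value is
 * an assumption here; some small constant keeps the division stable.
 * Illustration only, not darknet's actual normalize_cpu.
 */
void normalize_ref(float *x, float *mean, float *variance, int batch, int filters, int spatial)
{
    int b, f, i;
    for(b = 0; b < batch; ++b){
        for(f = 0; f < filters; ++f){
            for(i = 0; i < spatial; ++i){
                int index = b*filters*spatial + f*spatial + i;
                x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
            }
        }
    }
}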
void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay)
{
    axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

    axpy_cpu(l.inputs*l.outputs, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
}
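/*
 * A minimal scalar sketch (hypothetical helper, not part of the source) of
 * what the three weight calls above do per element: the accumulated gradient
 * first absorbs the L2 weight-decay term, the weight takes a step of size
 * learning_rate/batch, and what remains is damped by momentum so it carries
 * into the next update. Biases get the same treatment minus the decay term.
 */
void update_one_weight(float *w, float *dw, int batch, float lr, float momentum, float decay)
{
    *dw += -decay*batch*(*w);   /* axpy_cpu(..., -decay*batch, weights, ...) */
    *w  += (lr/batch)*(*dw);    /* axpy_cpu(..., lr/batch, weight_updates, ...) */
    *dw *= momentum;            /* scal_cpu(..., momentum, ...) */
}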
void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay)
{
    int size = l.size*l.size*l.c*l.n;
    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    axpy_cpu(size, -decay*batch, l.filters, 1, l.filter_updates, 1);
    axpy_cpu(size, learning_rate/batch, l.filter_updates, 1, l.filters, 1);
    scal_cpu(size, momentum, l.filter_updates, 1);
}
void slerp(float *start, float *end, float s, int n, float *out)
{
    /* spherical linear interpolation between unit vectors:
       out = sin((1-s)*omega)/sin(omega) * start + sin(s*omega)/sin(omega) * end,
       with omega the angle between start and end */
    float omega = acos(dot_cpu(n, start, 1, end, 1));
    float so = sin(omega);
    fill_cpu(n, 0, out, 1);
    axpy_cpu(n, sin((1-s)*omega)/so, start, 1, out, 1);
    axpy_cpu(n, sin(s*omega)/so, end, 1, out, 1);

    float mag = mag_array(out, n);
    scale_array(out, n, 1./mag);
}
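/*
 * Minimal usage sketch for slerp() above (hypothetical; assumes the blas
 * helpers it calls are linked in and stdio.h is included). Spherical
 * interpolation keeps the midpoint on the unit circle, which plain linear
 * interpolation followed by renormalization only approximates.
 */
void slerp_demo()
{
    float a[2] = {1, 0};        /* unit vector along x */
    float b[2] = {0, 1};        /* unit vector along y */
    float mid[2];
    slerp(a, b, .5, 2, mid);    /* expect roughly (0.7071, 0.7071) */
    printf("%f %f\n", mid[0], mid[1]);
}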
void update_local_layer(local_layer l, int batch, float learning_rate, float momentum, float decay)
{
    int locations = l.out_w * l.out_h;
    int size = l.size * l.size * l.c * l.n * locations;
    axpy_cpu(l.outputs, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

    axpy_cpu(size, -decay * batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(size, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(size, momentum, l.weight_updates, 1);
}
void backward_rnn_layer(layer l, network_state state)
{
    network_state s = { 0 };
    s.train = state.train;
    int i;
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);

    /* start at the last time step and walk backwards through the sequence */
    increment_layer(&input_layer, l.steps - 1);
    increment_layer(&self_layer, l.steps - 1);
    increment_layer(&output_layer, l.steps - 1);

    l.state += l.hidden * l.batch * l.steps;
    for (i = l.steps - 1; i >= 0; --i) {
        /* reconstruct the hidden state that was fed to the output layer at step i */
        copy_cpu(l.hidden * l.batch, input_layer.output, 1, l.state, 1);
        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);

        s.input = l.state;
        s.delta = self_layer.delta;
        backward_connected_layer(output_layer, s);

        l.state -= l.hidden * l.batch;
        /*
        if(i > 0){
            copy_cpu(l.hidden * l.batch, input_layer.output - l.hidden*l.batch, 1, l.state, 1);
            axpy_cpu(l.hidden * l.batch, 1, self_layer.output - l.hidden*l.batch, 1, l.state, 1);
        }else{
            fill_cpu(l.hidden * l.batch, 0, l.state, 1);
        }
        */

        s.input = l.state;
        s.delta = self_layer.delta - l.hidden * l.batch;
        if (i == 0) s.delta = 0;
        backward_connected_layer(self_layer, s);

        copy_cpu(l.hidden * l.batch, self_layer.delta, 1, input_layer.delta, 1);
        if (i > 0 && l.shortcut) axpy_cpu(l.hidden * l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden * l.batch, 1);
        s.input = state.input + i * l.inputs * l.batch;
        if (state.delta) s.delta = state.delta + i * l.inputs * l.batch;
        else s.delta = 0;
        backward_connected_layer(input_layer, s);

        increment_layer(&input_layer, -1);
        increment_layer(&self_layer, -1);
        increment_layer(&output_layer, -1);
    }
}
void merge_updates(layer l, layer base)
{
    if (l.type == CONVOLUTIONAL) {
        axpy_cpu(l.n, 1, l.bias_updates, 1, base.bias_updates, 1);
        axpy_cpu(l.n*l.size*l.size*l.c, 1, l.weight_updates, 1, base.weight_updates, 1);
        if (l.scale_updates) {
            axpy_cpu(l.n, 1, l.scale_updates, 1, base.scale_updates, 1);
        }
    } else if(l.type == CONNECTED) {
        axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.bias_updates, 1);
        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weight_updates, 1);
    }
}
void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay)
{
    int size = l.size*l.size*l.c*l.n;
    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    if(l.scales){
        axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, momentum, l.scale_updates, 1);
    }

    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(size, momentum, l.weight_updates, 1);
}
void backward_connected_layer(connected_layer l, network_state state)
{
    int i;
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
    }
    int m = l.outputs;
    int k = l.batch;
    int n = l.inputs;
    float *a = l.delta;
    float *b = state.input;
    float *c = l.weight_updates;
    /* weight_updates += delta^T * input */
    gemm(1,0,m,n,k,1,a,m,b,n,1,c,n);

    m = l.batch;
    k = l.outputs;
    n = l.inputs;

    a = l.delta;
    b = l.weights;
    c = state.delta;

    /* propagate error to the previous layer: state.delta += delta * weights */
    if(c) gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
void read_and_add_into(int fd, float *a, int n)
{
    float *buff = calloc(n, sizeof(float));
    read_all(fd, (char*) buff, n*sizeof(float));
    axpy_cpu(n, 1, buff, 1, a, 1);
    free(buff);
}
void backward_connected_layer(connected_layer l, network_state state)
{
    int i;
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
    }
    if(l.batch_normalize){
        backward_scale_cpu(l.x_norm, l.delta, l.batch, l.outputs, 1, l.scale_updates);

        scale_bias(l.delta, l.scales, l.batch, l.outputs, 1);

        mean_delta_cpu(l.delta, l.variance, l.batch, l.outputs, 1, l.mean_delta);
        variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.outputs, 1, l.variance_delta);
        normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.outputs, 1, l.delta);
    }

    int m = l.outputs;
    int k = l.batch;
    int n = l.inputs;
    float *a = l.delta;
    float *b = state.input;
    float *c = l.weight_updates;
    /* weight_updates += delta^T * input */
    gemm(1,0,m,n,k,1,a,m,b,n,1,c,n);

    m = l.batch;
    k = l.outputs;
    n = l.inputs;

    a = l.delta;
    b = l.weights;
    c = state.delta;

    /* propagate error to the previous layer: state.delta += delta * weights */
    if(c) gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
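/*
 * Companion reference for the gemm(1,0,...) call above: with TA=1, TB=0 the
 * product is C = ALPHA*A^T*B + BETA*C, which here accumulates
 * weight_updates[o][i] += sum over the batch of delta[b][o] * input[b][i].
 * Illustration only, not darknet's actual gemm.
 */
void gemm_tn_ref(int M, int N, int K, float ALPHA,
                 float *A, int lda, float *B, int ldb, float *C, int ldc)
{
    int i, j, k;
    for(i = 0; i < M; ++i){
        for(j = 0; j < N; ++j){
            float sum = 0;
            for(k = 0; k < K; ++k){
                sum += A[k*lda + i] * B[k*ldb + j];
            }
            C[i*ldc + j] += ALPHA*sum;   /* BETA == 1 in the call above */
        }
    }
}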
void predict_move(network net, float *board, float *move, int multi)
{
    float *output = network_predict(net, board);
    copy_cpu(19*19, output, 1, move, 1);
    int i;
    if(multi){
        image bim = float_to_image(19, 19, 1, board);
        /* average predictions over the 8 symmetries of the Go board */
        for(i = 1; i < 8; ++i){
            rotate_image_cw(bim, i);
            if(i >= 4) flip_image(bim);

            float *output = network_predict(net, board);
            image oim = float_to_image(19, 19, 1, output);

            /* map the prediction back into the original orientation */
            if(i >= 4) flip_image(oim);
            rotate_image_cw(oim, -i);

            axpy_cpu(19*19, 1, output, 1, move, 1);

            /* undo the board transform for the next iteration */
            if(i >= 4) flip_image(bim);
            rotate_image_cw(bim, -i);
        }
        scal_cpu(19*19, 1./8., move, 1);
    }
    /* never suggest occupied points */
    for(i = 0; i < 19*19; ++i){
        if(board[i]) move[i] = 0;
    }
}
void test_mnist_multi(char *filename, char *weightfile)
{
    network net = parse_network_cfg(filename);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);
    srand(time(0));

    float avg_acc = 0;
    data test;
    test = load_mnist_data("data/mnist/t10k-images.idx3-ubyte", "data/mnist/t10k-labels.idx1-ubyte", 10000);

    int i;
    for(i = 0; i < test.X.rows; ++i){
        image im = float_to_image(28, 28, 1, test.X.vals[i]);

        float pred[10] = {0};

        float *p = network_predict(net, im.data);
        axpy_cpu(10, 1, p, 1, pred, 1);
        // flip_image(im);
        /* average predictions over small rotations of the digit */
        image im1 = rotate_image(im, -2.0*3.1415926/180.0);
        image im2 = rotate_image(im, 2.0*3.1415926/180.0);
        image im3 = rotate_image(im, -3.0*3.1415926/180.0);
        image im4 = rotate_image(im, 3.0*3.1415926/180.0);

        p = network_predict(net, im1.data);
        axpy_cpu(10, 1, p, 1, pred, 1);
        p = network_predict(net, im2.data);
        axpy_cpu(10, 1, p, 1, pred, 1);
        p = network_predict(net, im3.data);
        axpy_cpu(10, 1, p, 1, pred, 1);
        p = network_predict(net, im4.data);
        axpy_cpu(10, 1, p, 1, pred, 1);

        int index = max_index(pred, 10);
        int class = max_index(test.y.vals[i], 10);
        if(index == class) avg_acc += 1;
        free_image(im);
        free_image(im1);
        free_image(im2);
        free_image(im3);
        free_image(im4);
        printf("%4d: %.2f%%\n", i, 100.*avg_acc/(i+1));
    }
    /* final accuracy over the whole test set; i == test.X.rows here, so divide
       by i rather than i+1 as inside the loop */
    printf("%4d: %.2f%%\n", i, 100.*avg_acc/i);
}
void average(int argc, char *argv[])
{
    char *cfgfile = argv[2];
    char *outfile = argv[3];
    gpu_index = -1;
    network net = parse_network_cfg(cfgfile);
    network sum = parse_network_cfg(cfgfile);

    char *weightfile = argv[4];
    load_weights(&sum, weightfile);

    int i, j;
    int n = argc - 5;
    for(i = 0; i < n; ++i){
        weightfile = argv[i+5];
        load_weights(&net, weightfile);
        for(j = 0; j < net.n; ++j){
            layer l = net.layers[j];
            layer out = sum.layers[j];
            if(l.type == CONVOLUTIONAL){
                int num = l.n*l.c*l.size*l.size;
                axpy_cpu(l.n, 1, l.biases, 1, out.biases, 1);
                axpy_cpu(num, 1, l.filters, 1, out.filters, 1);
            }
            if(l.type == CONNECTED){
                axpy_cpu(l.outputs, 1, l.biases, 1, out.biases, 1);
                axpy_cpu(l.outputs*l.inputs, 1, l.weights, 1, out.weights, 1);
            }
        }
    }
    n = n+1;
    for(j = 0; j < net.n; ++j){
        layer l = sum.layers[j];
        if(l.type == CONVOLUTIONAL){
            int num = l.n*l.c*l.size*l.size;
            scal_cpu(l.n, 1./n, l.biases, 1);
            scal_cpu(num, 1./n, l.filters, 1);
        }
        if(l.type == CONNECTED){
            scal_cpu(l.outputs, 1./n, l.biases, 1);
            scal_cpu(l.outputs*l.inputs, 1./n, l.weights, 1);
        }
    }
    save_weights(sum, outfile);
}
int main()
{
    int N, N2;
    printf(" \n Input matrix size N x N, N = ");
    scanf("%d", &N);
    printf(" N = %d \n \n", N);
    N2 = N*N;

    double *A, *B, *C_cpu, *C_gpu, *D_cpu, *D_gpu, t1, t2, cpu_time, gpu_time;
    double r_cpu, *r_gpu, nrmC_cpu, *nrmC_gpu;

    A = (double *) malloc(N2*sizeof(double));
    B = (double *) malloc(N2*sizeof(double));
    C_cpu = (double *) malloc(N2*sizeof(double));
    C_gpu = (double *) malloc(N2*sizeof(double));
    D_cpu = (double *) malloc(N2*sizeof(double));
    D_gpu = (double *) malloc(N2*sizeof(double));
    r_gpu = (double *) malloc(1*sizeof(double));
    nrmC_gpu = (double *) malloc(1*sizeof(double));

    initial(A, B, N);

    t1 = clock();
    #pragma acc data copyin(A[0:N2], B[0:N2]) copyout(C_cpu[0:N2])
    {
        cublas_gemm(A, B, C_cpu, N);
    }
    /* note: this example's dot_cpu/axpy_cpu/copy_cpu/scal_cpu take
       (scalar/vector, vector, length) arguments with no strides, a different
       convention from the darknet-style BLAS calls elsewhere in this section */
    r_cpu = dot_cpu(C_cpu, B, N2);
    axpy_cpu(-1.0*r_cpu, B, C_cpu, N2);
    nrmC_cpu = norm_cpu(C_cpu, N2);
    copy_cpu(C_cpu, D_cpu, N2);
    scal_cpu(1.0/nrmC_cpu, D_cpu, N2);
    t2 = clock();
    cpu_time = 1.0*(t2 - t1)/CLOCKS_PER_SEC;

    t1 = clock();
    #pragma acc enter data copyin(A[0:N2], B[0:N2]) create(C_gpu[0:N2], r_gpu[0], nrmC_gpu[0], D_gpu[0:N2])
    {
        gpu_cublas1(A, B, C_gpu, D_gpu, r_gpu, nrmC_gpu, N, N2);
    }
    #pragma acc update host(D_gpu[0:N2])
    t2 = clock();
    gpu_time = 1.0*(t2 - t1)/CLOCKS_PER_SEC;
    printf(" gpu part success \n");

    printf(" \n error = %f \n", error(D_cpu, D_gpu, N2));
    printf(" gpu time = %f, cpu times = %f \n", gpu_time, cpu_time);

    return 0;
}
void update_convolutional_layer(convolutional_layer l, update_args a)
{
    float learning_rate = a.learning_rate*l.learning_rate_scale;
    float momentum = a.momentum;
    float decay = a.decay;
    int batch = a.batch;

    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    if(l.scales){
        axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, momentum, l.scale_updates, 1);
    }

    axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.nweights, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.nweights, momentum, l.weight_updates, 1);
}
void update_deconvolutional_layer(layer l, update_args a)
{
    real_t learning_rate = a.learning_rate * l.learning_rate_scale;
    real_t momentum = a.momentum;
    real_t decay = a.decay;
    int batch = a.batch;
    int size = l.size * l.size * l.c * l.n;

    axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    if (l.scales) {
        axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, momentum, l.scale_updates, 1);
    }

    axpy_cpu(size, -decay * batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(size, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(size, momentum, l.weight_updates, 1);
}
float cuda_compare(float *x_gpu, float *x, size_t n, char *s)
{
    float *tmp = calloc(n, sizeof(float));
    cuda_pull_array(x_gpu, tmp, n);
    //int i;
    //for(i = 0; i < n; ++i) printf("%f %f\n", tmp[i], x[i]);
    axpy_cpu(n, -1, x, 1, tmp, 1);
    float err = dot_cpu(n, tmp, 1, tmp, 1);   /* sum of squared differences */
    printf("Error %s: %f\n", s, sqrt(err/n)); /* printed as RMS error */
    free(tmp);
    return err;
}
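/*
 * Hypothetical usage sketch for cuda_compare() above: check a layer's GPU
 * forward result against the CPU result. The output_gpu/output fields are
 * assumed to be the usual darknet layer buffers; the call returns the summed
 * squared difference and prints the RMS error.
 */
void check_layer_output(layer l, int batch)
{
    cuda_compare(l.output_gpu, l.output, (size_t)l.outputs*batch, "layer output");
}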
void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size)
{
    scale_image(recon, 2);
    translate_image(recon, -1);

    image delta = make_image(recon.w, recon.h, recon.c);

    network_state state = {0};
#ifdef GPU
    state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
    state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
    state.truth = cuda_make_array(features, get_network_output_size(net));

    forward_network_gpu(net, state);
    backward_network_gpu(net, state);

    cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);

    cuda_free(state.input);
    cuda_free(state.delta);
    cuda_free(state.truth);
#else
    state.input = recon.data;
    state.delta = delta.data;
    state.truth = features;

    forward_network(net, state);
    backward_network(net, state);
#endif

    axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1);
    smooth(recon, update, lambda, smooth_size);

    axpy_cpu(recon.w*recon.h*recon.c, rate, update.data, 1, recon.data, 1);
    scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);

    translate_image(recon, 1);
    scale_image(recon, .5);
    constrain_image(recon);
    free_image(delta);
}
void forward_rnn_layer(layer l, network_state state)
{
    network_state s = { 0 };
    s.train = state.train;
    int i;
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);

    fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
    fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
    fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
    if (state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);

    for (i = 0; i < l.steps; ++i) {
        s.input = state.input;
        forward_connected_layer(input_layer, s);

        s.input = l.state;
        forward_connected_layer(self_layer, s);

        /* new hidden state = input transform + recurrent transform
           (plus the previous state when shortcut connections are enabled) */
        float *old_state = l.state;
        if (state.train) l.state += l.hidden * l.batch;
        if (l.shortcut) {
            copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);
        } else {
            fill_cpu(l.hidden * l.batch, 0, l.state, 1);
        }
        axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1);
        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);

        s.input = l.state;
        forward_connected_layer(output_layer, s);

        state.input += l.inputs * l.batch;
        increment_layer(&input_layer, 1);
        increment_layer(&self_layer, 1);
        increment_layer(&output_layer, 1);
    }
}
void backward_compact_layer(const layer l, network_state state)
{
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    int i, b;
    for (b = 0; b < l.batch; b++) {
        /* the same output delta flows back into every input split */
        for (i = 0; i < l.index; i++) {
            axpy_cpu(l.outputs, 1, l.delta + b*l.outputs, 1, state.delta + b*l.inputs + i*l.outputs, 1);
        }
    }
}
void average(int argc, char *argv[])
{
    char *cfgfile = argv[2];
    char *outfile = argv[3];
    gpu_index = -1;
    network *net = parse_network_cfg(cfgfile);
    network *sum = parse_network_cfg(cfgfile);

    char *weightfile = argv[4];
    load_weights(sum, weightfile);

    int i, j;
    int n = argc - 5;
    for(i = 0; i < n; ++i){
        weightfile = argv[i+5];
        load_weights(net, weightfile);
        for(j = 0; j < net->n; ++j){
            layer l = net->layers[j];
            layer out = sum->layers[j];
            if(l.type == CONVOLUTIONAL){
                int num = l.n*l.c*l.size*l.size;
                axpy_cpu(l.n, 1, l.biases, 1, out.biases, 1);
                axpy_cpu(num, 1, l.weights, 1, out.weights, 1);
                if(l.batch_normalize){
                    axpy_cpu(l.n, 1, l.scales, 1, out.scales, 1);
                    axpy_cpu(l.n, 1, l.rolling_mean, 1, out.rolling_mean, 1);
                    axpy_cpu(l.n, 1, l.rolling_variance, 1, out.rolling_variance, 1);
                }
            }
            if(l.type == CONNECTED){
                axpy_cpu(l.outputs, 1, l.biases, 1, out.biases, 1);
                axpy_cpu(l.outputs*l.inputs, 1, l.weights, 1, out.weights, 1);
            }
        }
    }
    n = n+1;
    for(j = 0; j < net->n; ++j){
        layer l = sum->layers[j];
        if(l.type == CONVOLUTIONAL){
            int num = l.n*l.c*l.size*l.size;
            scal_cpu(l.n, 1./n, l.biases, 1);
            scal_cpu(num, 1./n, l.weights, 1);
            if(l.batch_normalize){
                scal_cpu(l.n, 1./n, l.scales, 1);
                scal_cpu(l.n, 1./n, l.rolling_mean, 1);
                scal_cpu(l.n, 1./n, l.rolling_variance, 1);
            }
        }
        if(l.type == CONNECTED){
            scal_cpu(l.outputs, 1./n, l.biases, 1);
            scal_cpu(l.outputs*l.inputs, 1./n, l.weights, 1);
        }
    }
    save_weights(sum, outfile);
}
void backward_route_layer(const route_layer l, network_state state)
{
    int i, j;
    int offset = 0;
    for(i = 0; i < l.n; ++i){
        int index = l.input_layers[i];
        float *delta = state.net.layers[index].delta;
        int input_size = l.input_sizes[i];
        for(j = 0; j < l.batch; ++j){
            axpy_cpu(input_size, 1, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1);
        }
        offset += input_size;
    }
}
void forward_cost_layer(cost_layer l, network_state state)
{
    if (!state.truth) return;
    if(l.cost_type == MASKED){
        int i;
        for(i = 0; i < l.batch*l.inputs; ++i){
            if(state.truth[i] == 0) state.input[i] = 0;
        }
    }
    /* delta = truth - input; cost = sum of squared errors */
    copy_cpu(l.batch*l.inputs, state.truth, 1, l.delta, 1);
    axpy_cpu(l.batch*l.inputs, -1, state.input, 1, l.delta, 1);
    *(l.output) = dot_cpu(l.batch*l.inputs, l.delta, 1, l.delta, 1);
    //printf("cost: %f\n", *l.output);
}
void forward_compact_layer(const layer l, network_state state)
{
    int i, b;
    for (b = 0; b < l.batch; b++) {
        // copy first section
        copy_cpu(l.outputs, state.input + b*l.inputs, 1, l.output + b*l.outputs, 1);
        // add other splits
        for (i = 1; i < l.index; i++) {
            axpy_cpu(l.outputs, 1, state.input + b*l.inputs + i*l.outputs, 1, l.output + b*l.outputs, 1);
        }
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
void forward_cost_layer(cost_layer l, network_state state)
{
    if (!state.truth) return;
    if(l.cost_type == MASKED){
        int i;
        for(i = 0; i < l.batch*l.inputs; ++i){
            if(state.truth[i] == SECRET_NUM) state.input[i] = SECRET_NUM;
        }
    }
    if(l.cost_type == SMOOTH){
        smooth_l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta);
    } else {
        /* default: squared error, delta = truth - input */
        copy_cpu(l.batch*l.inputs, state.truth, 1, l.delta, 1);
        axpy_cpu(l.batch*l.inputs, -1, state.input, 1, l.delta, 1);
    }
    *(l.output) = dot_cpu(l.batch*l.inputs, l.delta, 1, l.delta, 1);
    //printf("cost: %f\n", *l.output);
}
void backward_local_layer(local_layer l, network_state state)
{
    int i, j;
    int locations = l.out_w*l.out_h;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
    }

    for(i = 0; i < l.batch; ++i){
        float *input = state.input + i*l.w*l.h*l.c;
        im2col_cpu(input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image);

        /* each spatial location has its own filter bank, so one small gemm per location */
        for(j = 0; j < locations; ++j){
            float *a = l.delta + i*l.outputs + j;
            float *b = l.col_image + j;
            float *c = l.filter_updates + j*l.size*l.size*l.c*l.n;
            int m = l.n;
            int n = l.size*l.size*l.c;
            int k = 1;

            gemm(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
        }

        if(state.delta){
            for(j = 0; j < locations; ++j){
                float *a = l.filters + j*l.size*l.size*l.c*l.n;
                float *b = l.delta + i*l.outputs + j;
                float *c = l.col_image + j;

                int m = l.size*l.size*l.c;
                int n = 1;
                int k = l.n;

                gemm(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
            }

            col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta + i*l.c*l.h*l.w);
        }
    }
}
void backward_shortcut_layer(const layer l, network net)
{
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, net.delta, 1);
    shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, net.layers[l.index].delta);
}