void forward_batchnorm_layer(layer l, network_state state)
{
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    if(l.type == CONNECTED){
        l.out_c = l.outputs;
        l.out_h = l.out_w = 1;
    }
    if(state.train){
        mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
        variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);

        /* Exponential moving average of the batch statistics:
           rolling = .99*rolling + .01*batch_stat */
        scal_cpu(l.out_c, .99, l.rolling_mean, 1);
        axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
        scal_cpu(l.out_c, .99, l.rolling_variance, 1);
        axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);

        copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
        normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
    } else {
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
    }
    scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
    add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w);
}
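Nearly every snippet here leans on the same handful of BLAS-style vector helpers. For reference, a minimal single-threaded sketch consistent with the (N, ALPHA, X, INCX, ...) call signatures used throughout (in darknet the real implementations live in src/blas.c; the stride arguments are almost always 1 here):

/* Minimal sketches of the darknet-style vector helpers,
   matching the call sites in the snippets above and below. */
void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    for(int i = 0; i < N; ++i) X[i*INCX] *= ALPHA;            /* X = ALPHA*X */
}

void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    for(int i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX];  /* Y += ALPHA*X */
}

void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    for(int i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX];         /* Y = X */
}

void fill_cpu(int N, float ALPHA, float *X, int INCX)
{
    for(int i = 0; i < N; ++i) X[i*INCX] = ALPHA;             /* X = ALPHA */
}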
void oneoff(char *cfgfile, char *weightfile, char *outfile)
{
    gpu_index = -1;
    network *net = parse_network_cfg(cfgfile);
    int oldn = net->layers[net->n - 2].n;
    int c = net->layers[net->n - 2].c;
    /* Dampen the existing weights and zero the biases before grafting. */
    scal_cpu(oldn*c, .1, net->layers[net->n - 2].weights, 1);
    scal_cpu(oldn, 0, net->layers[net->n - 2].biases, 1);
    net->layers[net->n - 2].n = 11921;
    net->layers[net->n - 2].biases += 5;
    net->layers[net->n - 2].weights += 5*c;
    if(weightfile){
        load_weights(net, weightfile);
    }
    net->layers[net->n - 2].biases -= 5;
    net->layers[net->n - 2].weights -= 5*c;
    net->layers[net->n - 2].n = oldn;
    printf("%d\n", oldn);
    layer l = net->layers[net->n - 2];
    /* Replicate the first third of the biases/weights into the other two thirds. */
    copy_cpu(l.n/3, l.biases, 1, l.biases + l.n/3, 1);
    copy_cpu(l.n/3, l.biases, 1, l.biases + 2*l.n/3, 1);
    copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + l.n/3*l.c, 1);
    copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + 2*l.n/3*l.c, 1);
    *net->seen = 0;
    save_weights(net, outfile);
    free_network(net);
}
void forward_connected_layer(connected_layer l, network_state state)
{
    int i;
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights;
    float *c = l.output;
    /* output = input * weights^T */
    gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
    if(l.batch_normalize){
        if(state.train){
            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);

            /* rolling = .95*rolling + .05*batch_stat */
            scal_cpu(l.outputs, .95, l.rolling_mean, 1);
            axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
            scal_cpu(l.outputs, .95, l.rolling_variance, 1);
            axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);

            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
        } else {
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
        }
        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
    }
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
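The gemm(0,1,...) call above multiplies the input batch by the transposed weight matrix. A naive sketch of this specific case (TA=0, TB=1, ALPHA=BETA=1, row-major), for readers without darknet's gemm.c at hand:

/* Naive sketch of gemm(0, 1, M, N, K, 1, A, K, B, K, 1, C, N):
   C[M x N] += A[M x K] * B[N x K]^T. The output was zeroed by
   fill_cpu beforehand, so the accumulate is effectively an assign. */
void gemm_nt_sketch(int M, int N, int K, const float *A, const float *B, float *C)
{
    for(int i = 0; i < M; ++i){
        for(int j = 0; j < N; ++j){
            float sum = 0;
            for(int k = 0; k < K; ++k){
                sum += A[i*K + k] * B[j*K + k]; /* row j of B acts as a column */
            }
            C[i*N + j] += sum;
        }
    }
}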
void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay)
{
    axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

    axpy_cpu(l.inputs*l.outputs, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
}
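The update_* functions here and below all repeat one pattern: SGD with momentum and L2 weight decay, expressed as three BLAS calls over the weight buffers. Written out per element (a sketch; sgd_step is a hypothetical name, not a darknet function):

/* Per-element effect of the three weight-update BLAS calls above
   (w = one weight, dw = its accumulated gradient). */
void sgd_step(float *w, float *dw, int batch, float learning_rate, float momentum, float decay)
{
    *dw -= decay * batch * (*w);          /* fold L2 weight decay into the gradient */
    *w  += (learning_rate / batch) * *dw; /* step with the batch-averaged gradient  */
    *dw *= momentum;                      /* keep a momentum fraction for next time */
}

The bias path is the same minus the decay term, which is why it uses only two calls.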
void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay)
{
    int size = l.size*l.size*l.c*l.n;
    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    axpy_cpu(size, -decay*batch, l.filters, 1, l.filter_updates, 1);
    axpy_cpu(size, learning_rate/batch, l.filter_updates, 1, l.filters, 1);
    scal_cpu(size, momentum, l.filter_updates, 1);
}
void update_local_layer(local_layer l, int batch, float learning_rate, float momentum, float decay)
{
    int locations = l.out_w * l.out_h;
    int size = l.size * l.size * l.c * l.n * locations;
    axpy_cpu(l.outputs, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

    axpy_cpu(size, -decay * batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(size, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(size, momentum, l.weight_updates, 1);
}
void scale_weights(layer l, float s)
{
    if (l.type == CONVOLUTIONAL) {
        scal_cpu(l.n, s, l.biases, 1);
        scal_cpu(l.n*l.size*l.size*l.c, s, l.weights, 1);
        if (l.scales) {
            scal_cpu(l.n, s, l.scales, 1);
        }
    } else if (l.type == CONNECTED) {
        scal_cpu(l.outputs, s, l.biases, 1);
        scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
    }
}
void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay)
{
    int size = l.size*l.size*l.c*l.n;
    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    if(l.scales){
        axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, momentum, l.scale_updates, 1);
    }

    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(size, momentum, l.weight_updates, 1);
}
void predict_move(network net, float *board, float *move, int multi)
{
    float *output = network_predict(net, board);
    copy_cpu(19*19, output, 1, move, 1);
    int i;
    if(multi){
        /* Average predictions over the 8 symmetries of the board
           (4 rotations x optional flip); undo each transform on the
           output before accumulating, and on the board afterwards. */
        image bim = float_to_image(19, 19, 1, board);
        for(i = 1; i < 8; ++i){
            rotate_image_cw(bim, i);
            if(i >= 4) flip_image(bim);

            float *output = network_predict(net, board);
            image oim = float_to_image(19, 19, 1, output);
            if(i >= 4) flip_image(oim);
            rotate_image_cw(oim, -i);

            axpy_cpu(19*19, 1, output, 1, move, 1);

            if(i >= 4) flip_image(bim);
            rotate_image_cw(bim, -i);
        }
        scal_cpu(19*19, 1./8., move, 1);
    }
    /* Never propose a move on an occupied point. */
    for(i = 0; i < 19*19; ++i){
        if(board[i]) move[i] = 0;
    }
}
static void average(int argc, char *argv[])
{
    char *cfgfile = argv[2];
    char *outfile = argv[3];
    gpu_index = -1;
    network net = parse_network_cfg(cfgfile);
    network sum = parse_network_cfg(cfgfile);

    char *weightfile = argv[4];
    load_weights(&sum, weightfile);

    int i, j;
    int n = argc - 5;
    for(i = 0; i < n; ++i){
        weightfile = argv[i+5];
        load_weights(&net, weightfile);
        for(j = 0; j < net.n; ++j){
            layer_t l = net.layers[j];
            layer_t out = sum.layers[j];
            if(l.type == CONVOLUTIONAL){
                int num = l.n*l.c*l.size*l.size;
                fltadd(out.biases, l.biases, l.n);
                fltadd(out.filters, l.filters, num);
            }
            if(l.type == CONNECTED){
                fltadd(out.biases, l.biases, l.outputs);
                fltadd(out.weights, l.weights, l.outputs * l.inputs);
            }
        }
    }
    /* n weight files were summed on top of the first; divide by n+1. */
    n = n+1;
    for(j = 0; j < net.n; ++j){
        layer_t l = sum.layers[j];
        if(l.type == CONVOLUTIONAL){
            int num = l.n*l.c*l.size*l.size;
            scal_cpu(l.n, 1./n, l.biases, 1);
            scal_cpu(num, 1./n, l.filters, 1);
        }
        if(l.type == CONNECTED){
            scal_cpu(l.outputs, 1./n, l.biases, 1);
            scal_cpu(l.outputs*l.inputs, 1./n, l.weights, 1);
        }
    }
    save_weights(sum, outfile);
}
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main()
{
    int N, N2;
    printf(" \n Input matrix size N x N, N = ");
    scanf("%d", &N);
    printf(" N = %d \n \n", N);
    N2 = N*N;

    double *A, *B, *C_cpu, *C_gpu, *D_cpu, *D_gpu, t1, t2, cpu_time, gpu_time;
    double r_cpu, *r_gpu, nrmC_cpu, *nrmC_gpu;

    A = (double *) malloc(N2*sizeof(double));
    B = (double *) malloc(N2*sizeof(double));
    C_cpu = (double *) malloc(N2*sizeof(double));
    C_gpu = (double *) malloc(N2*sizeof(double));
    D_cpu = (double *) malloc(N2*sizeof(double));
    D_gpu = (double *) malloc(N2*sizeof(double));
    r_gpu = (double *) malloc(1*sizeof(double));
    nrmC_gpu = (double *) malloc(1*sizeof(double));

    initial(A, B, N);

    t1 = clock();
    #pragma acc data copyin(A[0:N2], B[0:N2]) copyout(C_cpu[0:N2])
    {
        cublas_gemm(A, B, C_cpu, N);
    }
    /* C = A*B; project out the B component, then normalize into D. */
    r_cpu = dot_cpu(C_cpu, B, N2);
    axpy_cpu(-1.0*r_cpu, B, C_cpu, N2);
    nrmC_cpu = norm_cpu(C_cpu, N2);
    copy_cpu(C_cpu, D_cpu, N2);
    scal_cpu(1.0/nrmC_cpu, D_cpu, N2);
    t2 = clock();
    cpu_time = 1.0*(t2 - t1)/CLOCKS_PER_SEC;

    t1 = clock();
    #pragma acc enter data copyin(A[0:N2], B[0:N2]) create(C_gpu[0:N2], r_gpu[0], nrmC_gpu[0], D_gpu[0:N2])
    {
        gpu_cublas1(A, B, C_gpu, D_gpu, r_gpu, nrmC_gpu, N, N2);
    }
    #pragma acc update host(D_gpu[0:N2])
    t2 = clock();
    gpu_time = 1.0*(t2 - t1)/CLOCKS_PER_SEC;

    printf(" gpu part success \n");
    printf(" \n error = %f \n", error(D_cpu, D_gpu, N2));
    printf(" gpu time = %f, cpu time = %f \n", gpu_time, cpu_time);
    return 0;
}
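Note that this standalone OpenACC/cuBLAS benchmark is not darknet code: its helpers take the vector length last and assume unit stride (dot_cpu(x, y, n), scal_cpu(alpha, x, n), and so on), unlike the BLAS-style signatures used everywhere else in this section. A minimal sketch of those helpers, inferred from the call sites above (the argument order here is an assumption):

#include <math.h>

/* Sketches of this benchmark's helper convention: length last, unit stride. */
double dot_cpu(const double *x, const double *y, int n)
{
    double s = 0;
    for(int i = 0; i < n; ++i) s += x[i]*y[i];   /* <x, y> */
    return s;
}

void axpy_cpu(double alpha, const double *x, double *y, int n)
{
    for(int i = 0; i < n; ++i) y[i] += alpha*x[i];  /* y += alpha*x */
}

double norm_cpu(const double *x, int n)
{
    return sqrt(dot_cpu(x, x, n));               /* ||x||_2 */
}

void copy_cpu(const double *x, double *y, int n)
{
    for(int i = 0; i < n; ++i) y[i] = x[i];      /* y = x */
}

void scal_cpu(double alpha, double *x, int n)
{
    for(int i = 0; i < n; ++i) x[i] *= alpha;    /* x = alpha*x */
}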
void update_deconvolutional_layer(layer l, update_args a)
{
    real_t learning_rate = a.learning_rate * l.learning_rate_scale;
    real_t momentum = a.momentum;
    real_t decay = a.decay;
    int batch = a.batch;
    int size = l.size * l.size * l.c * l.n;

    axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    if (l.scales) {
        axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, momentum, l.scale_updates, 1);
    }

    axpy_cpu(size, -decay * batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(size, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(size, momentum, l.weight_updates, 1);
}
void update_convolutional_layer(convolutional_layer l, update_args a)
{
    float learning_rate = a.learning_rate*l.learning_rate_scale;
    float momentum = a.momentum;
    float decay = a.decay;
    int batch = a.batch;

    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    if(l.scales){
        axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, momentum, l.scale_updates, 1);
    }

    axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.nweights, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.nweights, momentum, l.weight_updates, 1);
}
void average(int argc, char *argv[])
{
    char *cfgfile = argv[2];
    char *outfile = argv[3];
    gpu_index = -1;
    network *net = parse_network_cfg(cfgfile);
    network *sum = parse_network_cfg(cfgfile);

    char *weightfile = argv[4];
    load_weights(sum, weightfile);

    int i, j;
    int n = argc - 5;
    for(i = 0; i < n; ++i){
        weightfile = argv[i+5];
        load_weights(net, weightfile);
        for(j = 0; j < net->n; ++j){
            layer l = net->layers[j];
            layer out = sum->layers[j];
            if(l.type == CONVOLUTIONAL){
                int num = l.n*l.c*l.size*l.size;
                axpy_cpu(l.n, 1, l.biases, 1, out.biases, 1);
                axpy_cpu(num, 1, l.weights, 1, out.weights, 1);
                if(l.batch_normalize){
                    axpy_cpu(l.n, 1, l.scales, 1, out.scales, 1);
                    axpy_cpu(l.n, 1, l.rolling_mean, 1, out.rolling_mean, 1);
                    axpy_cpu(l.n, 1, l.rolling_variance, 1, out.rolling_variance, 1);
                }
            }
            if(l.type == CONNECTED){
                axpy_cpu(l.outputs, 1, l.biases, 1, out.biases, 1);
                axpy_cpu(l.outputs*l.inputs, 1, l.weights, 1, out.weights, 1);
            }
        }
    }
    /* n weight files were summed on top of the first; divide by n+1. */
    n = n+1;
    for(j = 0; j < net->n; ++j){
        layer l = sum->layers[j];
        if(l.type == CONVOLUTIONAL){
            int num = l.n*l.c*l.size*l.size;
            scal_cpu(l.n, 1./n, l.biases, 1);
            scal_cpu(num, 1./n, l.weights, 1);
            if(l.batch_normalize){
                scal_cpu(l.n, 1./n, l.scales, 1);
                scal_cpu(l.n, 1./n, l.rolling_mean, 1);
                scal_cpu(l.n, 1./n, l.rolling_variance, 1);
            }
        }
        if(l.type == CONNECTED){
            scal_cpu(l.outputs, 1./n, l.biases, 1);
            scal_cpu(l.outputs*l.inputs, 1./n, l.weights, 1);
        }
    }
    save_weights(sum, outfile);
}
void forward_network(network net, network_state state)
{
    state.workspace = net.workspace;
    int i;
    for(i = 0; i < net.n; ++i){
        state.index = i;
        layer l = net.layers[i];
        if(l.delta){
            scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
        }
        if(l.type == CONVOLUTIONAL){
            forward_convolutional_layer(l, state);
        } else if(l.type == DECONVOLUTIONAL){
            forward_deconvolutional_layer(l, state);
        } else if(l.type == ACTIVE){
            forward_activation_layer(l, state);
        } else if(l.type == LOCAL){
            forward_local_layer(l, state);
        } else if(l.type == NORMALIZATION){
            forward_normalization_layer(l, state);
        } else if(l.type == BATCHNORM){
            forward_batchnorm_layer(l, state);
        } else if(l.type == DETECTION){
            forward_detection_layer(l, state);
        } else if(l.type == CONNECTED){
            forward_connected_layer(l, state);
        } else if(l.type == RNN){
            forward_rnn_layer(l, state);
        } else if(l.type == GRU){
            forward_gru_layer(l, state);
        } else if(l.type == CRNN){
            forward_crnn_layer(l, state);
        } else if(l.type == CROP){
            forward_crop_layer(l, state);
        } else if(l.type == COST){
            forward_cost_layer(l, state);
        } else if(l.type == SOFTMAX){
            forward_softmax_layer(l, state);
        } else if(l.type == MAXPOOL){
            forward_maxpool_layer(l, state);
        } else if(l.type == AVGPOOL){
            forward_avgpool_layer(l, state);
        } else if(l.type == DROPOUT){
            forward_dropout_layer(l, state);
        } else if(l.type == ROUTE){
            forward_route_layer(l, net);
        } else if(l.type == SHORTCUT){
            forward_shortcut_layer(l, state);
        }
        state.input = l.output;
    }
}
void forward_network(network net, network_state state)
{
    state.workspace = net.workspace;
    int i;
    for(i = 0; i < net.n; ++i){
        state.index = i;
        layer l = net.layers[i];
        if(l.delta){
            scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
        }
        l.forward(l, state);
        state.input = l.output;
    }
}
void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters)
{
    int iter = 0;
    for (iter = 0; iter < iters; ++iter) {
        image delta = make_image(recon.w, recon.h, recon.c);

        NETWORK_STATE(state);
#ifdef GPU
        state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
        state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
        state.truth = cuda_make_array(features, get_network_output_size(net));

        forward_network_gpu(net, state);
        backward_network_gpu(net, state);

        cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);

        cuda_free(state.input);
        cuda_free(state.delta);
        cuda_free(state.truth);
#else
        state.input = recon.data;
        state.delta = delta.data;
        state.truth = features;

        forward_network(net, state);
        backward_network(net, state);
#endif

        fltadd(update.data, delta.data, recon.w * recon.h * recon.c);
        smooth(recon, update, lambda, smooth_size);

        fltaddmul(recon.data, update.data, recon.w * recon.h * recon.c, rate);
        scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);

        //float mag = mag_array(recon.data, recon.w*recon.h*recon.c);
        //scal_cpu(recon.w*recon.h*recon.c, 600/mag, recon.data, 1);

        constrain_image(recon);
        free_image(delta);
    }
}
void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size)
{
    /* Map pixels from [0,1] to [-1,1] for the network. */
    scale_image(recon, 2);
    translate_image(recon, -1);

    image delta = make_image(recon.w, recon.h, recon.c);

    network_state state = {0};
#ifdef GPU
    state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
    state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
    state.truth = cuda_make_array(features, get_network_output_size(net));

    forward_network_gpu(net, state);
    backward_network_gpu(net, state);

    cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);

    cuda_free(state.input);
    cuda_free(state.delta);
    cuda_free(state.truth);
#else
    state.input = recon.data;
    state.delta = delta.data;
    state.truth = features;

    forward_network(net, state);
    backward_network(net, state);
#endif

    /* Momentum update of the reconstruction:
       update += delta; recon += rate*update; update *= momentum. */
    axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1);
    smooth(recon, update, lambda, smooth_size);
    axpy_cpu(recon.w*recon.h*recon.c, rate, update.data, 1, recon.data, 1);
    scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);

    /* Map back to [0,1]. */
    translate_image(recon, 1);
    scale_image(recon, .5);
    constrain_image(recon);
    free_image(delta);
}
void forward_network(network net, network_state state)
{
    int i;
    for(i = 0; i < net.n; ++i){
        layer l = net.layers[i];
        if(l.delta){
            scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
        }
        if(l.type == CONVOLUTIONAL){
            forward_convolutional_layer(l, state);
        } else if(l.type == DECONVOLUTIONAL){
            forward_deconvolutional_layer(l, state);
        } else if(l.type == NORMALIZATION){
            forward_normalization_layer(l, state);
        } else if(l.type == DETECTION){
            forward_detection_layer(l, state);
        } else if(l.type == CONNECTED){
            forward_connected_layer(l, state);
        } else if(l.type == CROP){
            forward_crop_layer(l, state);
        } else if(l.type == COST){
            forward_cost_layer(l, state);
        } else if(l.type == SOFTMAX){
            forward_softmax_layer(l, state);
        } else if(l.type == MAXPOOL){
            forward_maxpool_layer(l, state);
        } else if(l.type == AVGPOOL){
            forward_avgpool_layer(l, state);
        } else if(l.type == DROPOUT){
            forward_dropout_layer(l, state);
        } else if(l.type == ROUTE){
            forward_route_layer(l, net);
        }
        state.input = l.output;
    }
}
layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam)
{
    int i;
    layer l = { 0 };
    l.type = DECONVOLUTIONAL;

    l.h = h;
    l.w = w;
    l.c = c;
    l.n = n;
    l.batch = batch;
    l.stride = stride;
    l.size = size;

    l.nweights = c * n * size * size;
    l.nbiases = n;

    l.weights = calloc(c * n * size * size, sizeof(real_t));
    l.weight_updates = calloc(c * n * size * size, sizeof(real_t));

    l.biases = calloc(n, sizeof(real_t));
    l.bias_updates = calloc(n, sizeof(real_t));
    //real_t scale = n/(size*size*c);
    //printf("scale: %f\n", scale);
    real_t scale = .02;
    for (i = 0; i < c * n * size * size; ++i)
        l.weights[i] = scale * rand_normal();
    //bilinear_init(l);
    for (i = 0; i < n; ++i) {
        l.biases[i] = 0;
    }
    l.pad = padding;

    l.out_h = (l.h - 1) * l.stride + l.size - 2 * l.pad;
    l.out_w = (l.w - 1) * l.stride + l.size - 2 * l.pad;
    l.out_c = n;
    l.outputs = l.out_w * l.out_h * l.out_c;
    l.inputs = l.w * l.h * l.c;

    scal_cpu(l.nweights, (real_t) l.out_w * l.out_h / (l.w * l.h), l.weights, 1);

    l.output = calloc(l.batch * l.outputs, sizeof(real_t));
    l.delta = calloc(l.batch * l.outputs, sizeof(real_t));

    l.forward = forward_deconvolutional_layer;
    l.backward = backward_deconvolutional_layer;
    l.update = update_deconvolutional_layer;

    l.batch_normalize = batch_normalize;

    if (batch_normalize) {
        l.scales = calloc(n, sizeof(real_t));
        l.scale_updates = calloc(n, sizeof(real_t));
        for (i = 0; i < n; ++i) {
            l.scales[i] = 1;
        }

        l.mean = calloc(n, sizeof(real_t));
        l.variance = calloc(n, sizeof(real_t));

        l.mean_delta = calloc(n, sizeof(real_t));
        l.variance_delta = calloc(n, sizeof(real_t));

        l.rolling_mean = calloc(n, sizeof(real_t));
        l.rolling_variance = calloc(n, sizeof(real_t));
        l.x = calloc(l.batch * l.outputs, sizeof(real_t));
        l.x_norm = calloc(l.batch * l.outputs, sizeof(real_t));
    }
    if (adam) {
        l.m = calloc(c * n * size * size, sizeof(real_t));
        l.v = calloc(c * n * size * size, sizeof(real_t));
        l.bias_m = calloc(n, sizeof(real_t));
        l.scale_m = calloc(n, sizeof(real_t));
        l.bias_v = calloc(n, sizeof(real_t));
        l.scale_v = calloc(n, sizeof(real_t));
    }

#ifdef GPU
    l.forward_gpu = forward_deconvolutional_layer_gpu;
    l.backward_gpu = backward_deconvolutional_layer_gpu;
    l.update_gpu = update_deconvolutional_layer_gpu;

    if(gpu_index >= 0){
        if (adam) {
            l.m_gpu = cuda_make_array(l.m, c*n*size*size);
            l.v_gpu = cuda_make_array(l.v, c*n*size*size);
            l.bias_m_gpu = cuda_make_array(l.bias_m, n);
            l.bias_v_gpu = cuda_make_array(l.bias_v, n);
            l.scale_m_gpu = cuda_make_array(l.scale_m, n);
            l.scale_v_gpu = cuda_make_array(l.scale_v, n);
        }
        l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
        l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);

        l.biases_gpu = cuda_make_array(l.biases, n);
        l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);

        l.delta_gpu = cuda_make_array(l.delta, l.batch*l.out_h*l.out_w*n);
        l.output_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n);

        if(batch_normalize){
            l.mean_gpu = cuda_make_array(0, n);
            l.variance_gpu = cuda_make_array(0, n);

            l.rolling_mean_gpu = cuda_make_array(0, n);
            l.rolling_variance_gpu = cuda_make_array(0, n);

            l.mean_delta_gpu = cuda_make_array(0, n);
            l.variance_delta_gpu = cuda_make_array(0, n);

            l.scales_gpu = cuda_make_array(l.scales, n);
            l.scale_updates_gpu = cuda_make_array(0, n);

            l.x_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n);
            l.x_norm_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n);
        }
    }
#ifdef CUDNN
    cudnnCreateTensorDescriptor(&l.dstTensorDesc);
    cudnnCreateTensorDescriptor(&l.normTensorDesc);
    cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
    cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
#endif
#endif

    l.activation = activation;
    l.workspace_size = get_workspace_size(l);

    fprintf(stderr, "deconv%5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);

    return l;
}
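The out_h/out_w computation above is the standard transposed-convolution shape formula. As a quick sanity check (a standalone sketch; deconv_out_size is a hypothetical helper, not part of the codebase):

/* Transposed-convolution output size, as computed in make_deconvolutional_layer:
   out = (in - 1)*stride + size - 2*pad.
   E.g. in=4, stride=2, size=4, pad=1  ->  out = 3*2 + 4 - 2 = 8. */
int deconv_out_size(int in, int stride, int size, int pad)
{
    return (in - 1)*stride + size - 2*pad;
}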
void forward_iseg_layer(const layer l, network net)
{
    double time = what_time_is_it_now();
    int i, b, j, k;
    int ids = l.extra;
    memcpy(l.output, net.input, l.outputs * l.batch * sizeof(real_t));
    memset(l.delta, 0, l.outputs * l.batch * sizeof(real_t));

#ifndef GPU
    for (b = 0; b < l.batch; ++b) {
        int index = b * l.outputs;
        activate_array(l.output + index, l.classes * l.w * l.h, LOGISTIC);
    }
#endif

    for (b = 0; b < l.batch; ++b) {
        // a priori, each pixel has no class
        for (i = 0; i < l.classes; ++i) {
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + i * l.w * l.h + k;
                l.delta[index] = 0 - l.output[index];
            }
        }

        // a priori, embedding should be small magnitude
        for (i = 0; i < ids; ++i) {
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + (i + l.classes) * l.w * l.h + k;
                l.delta[index] = .1 * (0 - l.output[index]);
            }
        }

        memset(l.counts, 0, 90 * sizeof(int));
        for (i = 0; i < 90; ++i) {
            fill_cpu(ids, 0, l.sums[i], 1);

            int c = net.truth[b * l.truths + i * (l.w * l.h + 1)];
            if (c < 0) break;
            // add up metric embeddings for each instance
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + c * l.w * l.h + k;
                real_t v = net.truth[b * l.truths + i * (l.w * l.h + 1) + 1 + k];
                if (v) {
                    l.delta[index] = v - l.output[index];
                    axpy_cpu(ids, 1, l.output + b * l.outputs + l.classes * l.w * l.h + k, l.w * l.h, l.sums[i], 1);
                    ++l.counts[i];
                }
            }
        }

        real_t *mse = calloc(90, sizeof(real_t));
        for (i = 0; i < 90; ++i) {
            int c = net.truth[b * l.truths + i * (l.w * l.h + 1)];
            if (c < 0) break;
            for (k = 0; k < l.w * l.h; ++k) {
                real_t v = net.truth[b * l.truths + i * (l.w * l.h + 1) + 1 + k];
                if (v) {
                    int z;
                    real_t sum = 0;
                    for (z = 0; z < ids; ++z) {
                        int index = b * l.outputs + (l.classes + z) * l.w * l.h + k;
                        sum += pow(l.sums[i][z] / l.counts[i] - l.output[index], 2);
                    }
                    mse[i] += sum;
                }
            }
            mse[i] /= l.counts[i];
        }

        // Calculate average embedding
        for (i = 0; i < 90; ++i) {
            if (!l.counts[i]) continue;
            scal_cpu(ids, 1.f / l.counts[i], l.sums[i], 1);
            if (b == 0 && net.gpu_index == 0) {
                printf("%4d, %6.3f, ", l.counts[i], mse[i]);
                for (j = 0; j < ids; ++j) {
                    printf("%6.3f,", l.sums[i][j]);
                }
                printf("\n");
            }
        }
        free(mse);

        // Calculate embedding loss
        for (i = 0; i < 90; ++i) {
            if (!l.counts[i]) continue;
            for (k = 0; k < l.w * l.h; ++k) {
                real_t v = net.truth[b * l.truths + i * (l.w * l.h + 1) + 1 + k];
                if (v) {
                    for (j = 0; j < 90; ++j) {
                        if (!l.counts[j]) continue;
                        int z;
                        for (z = 0; z < ids; ++z) {
                            int index = b * l.outputs + (l.classes + z) * l.w * l.h + k;
                            real_t diff = l.sums[j][z] - l.output[index];
                            if (j == i) l.delta[index] += diff < 0 ? -.1 : .1;
                            else        l.delta[index] += -(diff < 0 ? -.1 : .1);
                        }
                    }
                }
            }
        }

        for (i = 0; i < ids; ++i) {
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + (i + l.classes) * l.w * l.h + k;
                l.delta[index] *= .01;
            }
        }
    }

    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    printf("took %lf sec\n", what_time_is_it_now() - time);
}
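The final cost above is the squared Euclidean norm of the delta buffer. A minimal sketch of the mag_array helper consistent with that use (darknet defines it in utils.c; this version assumes float elements):

/* Euclidean norm of an array; pow(mag_array(d, n), 2) in the layer above
   is therefore the squared L2 norm of the delta buffer. */
#include <math.h>

float mag_array(float *a, int n)
{
    float sum = 0;
    for(int i = 0; i < n; ++i) sum += a[i]*a[i];
    return sqrtf(sum);
}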
void test_go(char *filename, char *weightfile, int multi)
{
    network net = parse_network_cfg(filename);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    srand(time(0));
    set_batch_network(&net, 1);
    float *board = calloc(19*19, sizeof(float));
    float *move = calloc(19*19, sizeof(float));
    int color = 1;
    while(1){
        float *output = network_predict(net, board);
        fltcpy(move, output, 19 * 19);
        int i;
        if(multi){
            /* Average predictions over the 8 board symmetries. */
            image bim = float_to_image(19, 19, 1, board);
            for(i = 1; i < 8; ++i){
                rotate_image_cw(bim, i);
                if(i >= 4) flip_image(bim);

                float *output = network_predict(net, board);
                image oim = float_to_image(19, 19, 1, output);
                if(i >= 4) flip_image(oim);
                rotate_image_cw(oim, -i);

                fltadd(move, output, 19 * 19);

                if(i >= 4) flip_image(bim);
                rotate_image_cw(bim, -i);
            }
            scal_cpu(19*19, 1./8., move, 1);
        }
        /* Never propose a move on an occupied point. */
        for(i = 0; i < 19*19; ++i){
            if(board[i]) move[i] = 0;
        }

        int indexes[nind];
        int row, col;
        top_k(move, 19*19, nind, indexes);
        print_board(board, color, indexes);
        for(i = 0; i < nind; ++i){
            int index = indexes[i];
            row = index / 19;
            col = index % 19;
            printf("%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted) ? 19 - row : row+1, move[index]*100);
        }

        if(color == 1) printf("\u25EF Enter move: ");
        else           printf("\u25C9 Enter move: ");

        char c;
        char *line = fgetl(stdin);
        int picked = 1;
        int dnum = sscanf(line, "%d", &picked);
        int cnum = sscanf(line, "%c", &c);
        if (strlen(line) == 0 || dnum) {
            /* Empty line or a number: play one of the suggested moves. */
            --picked;
            if (picked < nind){
                int index = indexes[picked];
                row = index / 19;
                col = index % 19;
                board[row*19 + col] = 1;
            }
        } else if (cnum){
            if (c <= 'T' && c >= 'A'){
                /* Coordinate like "D 4": play at that point. */
                int num = sscanf(line, "%c %d", &c, &row);
                row = (inverted) ? 19 - row : row-1;
                col = c - 'A';
                if (col > 7 && noi) col -= 1;
                if (num == 2) board[row*19 + col] = 1;
            } else if (c == 'p') {
                // Pass
            } else if (c == 'b' || c == 'w'){
                /* Place a black or white stone directly. */
                char g;
                int num = sscanf(line, "%c %c %d", &g, &c, &row);
                row = (inverted) ? 19 - row : row-1;
                col = c - 'A';
                if (col > 7 && noi) col -= 1;
                if (num == 3) board[row*19 + col] = (g == 'b') ? color : -color;
            } else if (c == 'c'){
                /* Clear a point. */
                char g;
                int num = sscanf(line, "%c %c %d", &g, &c, &row);
                row = (inverted) ? 19 - row : row-1;
                col = c - 'A';
                if (col > 7 && noi) col -= 1;
                if (num == 3) board[row*19 + col] = 0;
            }
        }
        free(line);
        update_board(board);
        flip_board(board);
        color = -color;
    }
}